2025年12月20日剩余内容提交
This commit is contained in:
parent
5e9de851e3
commit
17dbd562f3
79
rag-python/README-部署指南.md
Normal file
79
rag-python/README-部署指南.md
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
# RAG 知识库服务 - 部署指南
|
||||
|
||||
## 环境要求
|
||||
|
||||
- Python 3.11(必须是 3.11,不能用 3.14)
|
||||
- Ollama(本地大模型服务)
|
||||
|
||||
## 一、安装 Python 3.11
|
||||
|
||||
下载地址:https://www.python.org/downloads/release/python-3119/
|
||||
|
||||
安装时勾选 "Add Python to PATH"
|
||||
|
||||
## 二、安装 Ollama
|
||||
|
||||
下载地址:https://ollama.com/download
|
||||
|
||||
安装后运行以下命令下载模型:
|
||||
```bash
|
||||
ollama pull nomic-embed-text
|
||||
ollama pull qwen2.5:7b
|
||||
```
|
||||
|
||||
## 三、安装依赖
|
||||
|
||||
在 `rag-python` 目录下运行:
|
||||
```bash
|
||||
py -3.11 -m pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## 四、使用方法
|
||||
|
||||
### 1. 添加文档
|
||||
|
||||
把要索引的文档放到 `knowledge_docs` 文件夹中
|
||||
|
||||
支持的格式:`.txt` `.md` `.pdf` `.docx`
|
||||
|
||||
### 2. 建立索引
|
||||
|
||||
```bash
|
||||
py -3.11 batch_index.py
|
||||
```
|
||||
|
||||
注意:扫描版 PDF 需要 OCR 识别,速度较慢(每页约 5-10 秒)
|
||||
|
||||
### 3. 启动服务
|
||||
|
||||
```bash
|
||||
py -3.11 app.py
|
||||
```
|
||||
|
||||
服务默认运行在 http://localhost:5000
|
||||
|
||||
## 五、常见问题
|
||||
|
||||
### Q: 提示缺少模块?
|
||||
```bash
|
||||
py -3.11 -m pip install 模块名
|
||||
```
|
||||
|
||||
### Q: OCR 识别很慢?
|
||||
扫描版 PDF 需要逐页识别,272 页大约需要 20-30 分钟。有 GPU 会快很多。
|
||||
|
||||
### Q: 如何测试服务?
|
||||
```bash
|
||||
curl "http://localhost:5000/api/knowledge/search?query=测试"
|
||||
```
|
||||
|
||||
## 六、目录结构
|
||||
|
||||
```
|
||||
rag-python/
|
||||
├── knowledge_docs/ # 放入要索引的文档
|
||||
├── index_data/ # 生成的索引文件(自动创建)
|
||||
├── batch_index.py # 批量索引脚本
|
||||
├── app.py # Web 服务入口
|
||||
└── requirements.txt # 依赖列表
|
||||
```
|
||||
|
|
@ -1,11 +1,51 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
文档解析器 - 支持多种文件格式
|
||||
文档解析器 - 支持多种文件格式,包括扫描版PDF的OCR识别
|
||||
支持 EasyOCR(推荐)、PaddleOCR 和 Tesseract
|
||||
"""
|
||||
import os
|
||||
import chardet
|
||||
from config import SUPPORTED_EXTENSIONS
|
||||
|
||||
# Tesseract OCR executable path (default Windows install) - fallback engine only
TESSERACT_PATH = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# OCR engine selection: 'easyocr' (recommended), 'paddle' or 'tesseract'
OCR_ENGINE = 'easyocr'

# EasyOCR reader singleton (lazily initialised by get_easy_ocr)
_easy_ocr = None

# PaddleOCR singleton (lazily initialised by get_paddle_ocr)
_paddle_ocr = None
|
||||
|
||||
def get_easy_ocr():
    """Return the process-wide EasyOCR reader, creating it on first use.

    Returns None (without caching anything) when the easyocr package is
    missing, so a later call can retry once the dependency is installed.
    """
    global _easy_ocr
    if _easy_ocr is not None:
        return _easy_ocr
    try:
        import easyocr
        # Simplified Chinese + English, CPU only.
        _easy_ocr = easyocr.Reader(['ch_sim', 'en'], gpu=False)
        print(" EasyOCR 初始化成功")
    except ImportError:
        print(" EasyOCR 未安装,请运行: pip install easyocr")
        return None
    return _easy_ocr
|
||||
|
||||
def get_paddle_ocr():
    """Return the process-wide PaddleOCR instance, creating it on first use.

    Returns None (without caching anything) when paddleocr is not
    installed, so a later call can retry.
    """
    global _paddle_ocr
    if _paddle_ocr is not None:
        return _paddle_ocr
    try:
        from paddleocr import PaddleOCR
        _paddle_ocr = PaddleOCR(use_angle_cls=True, lang='ch', show_log=False)
        print(" PaddleOCR 初始化成功")
    except ImportError:
        print(" PaddleOCR 未安装")
        return None
    return _paddle_ocr
|
||||
|
||||
def detect_encoding(file_path):
|
||||
"""检测文件编码"""
|
||||
with open(file_path, 'rb') as f:
|
||||
|
|
@ -27,14 +67,154 @@ def parse_md(file_path):
|
|||
"""解析Markdown文件"""
|
||||
return parse_txt(file_path)
|
||||
|
||||
def parse_pdf(file_path):
|
||||
"""解析PDF文件(支持大文件)"""
|
||||
def parse_pdf_with_ocr(file_path):
    """Parse a PDF with PyMuPDF, OCR-ing pages that look scanned.

    Each page is first read with PyMuPDF's text extractor; a page that
    yields fewer than 50 characters is assumed to be a scanned image,
    rendered at 2x zoom and pushed through the engine selected by
    OCR_ENGINE. Falls back to parse_pdf_basic() when PyMuPDF or a
    co-dependency is missing, or when the file cannot be processed.

    :param file_path: path of the PDF file to parse
    :return: extracted text joined with newlines (may be '')
    """
    try:
        import fitz  # PyMuPDF
        from PIL import Image  # noqa: F401 - fail fast if Pillow missing (OCR helpers need it)
        import io  # noqa: F401

        file_size = os.path.getsize(file_path)
        file_size_mb = file_size / (1024 * 1024)
        print(f" PDF文件大小: {file_size_mb:.1f} MB")

        doc = fitz.open(file_path)
        text_parts = []
        ocr_used = False
        try:
            total_pages = len(doc)
            print(f" PDF总页数: {total_pages}")

            for i, page in enumerate(doc):
                if (i + 1) % 20 == 0 or i == 0:
                    print(f" 解析进度: {i + 1}/{total_pages} 页")

                try:
                    # First try the embedded text layer.
                    text = page.get_text()

                    # Very little text usually means a scanned page -> OCR.
                    if len(text.strip()) < 50:
                        if not ocr_used:
                            print(" 检测到扫描版PDF,启用OCR识别...")
                            ocr_used = True

                        # Render the page to an image; 2x zoom improves OCR accuracy.
                        mat = fitz.Matrix(2, 2)
                        pix = page.get_pixmap(matrix=mat)
                        img_data = pix.tobytes("png")

                        if OCR_ENGINE == 'easyocr':
                            text = ocr_with_easyocr(img_data)
                        elif OCR_ENGINE == 'paddle':
                            text = ocr_with_paddle(img_data)
                        else:
                            text = ocr_with_tesseract(img_data)

                    if text and text.strip():
                        text_parts.append(text)

                except Exception as e:
                    print(f" 警告: 第 {i + 1} 页解析失败: {e}")
                    continue
        finally:
            # BUGFIX: always release the document handle - previously an
            # unexpected error escaping the loop was swallowed by the
            # fallback handlers below with the document still open.
            doc.close()

        total_chars = len(''.join(text_parts))
        print(f" PDF解析完成,提取文本 {total_chars} 字符" + (" (使用OCR)" if ocr_used else ""))
        return "\n".join(text_parts)

    except ImportError as e:
        print(f" 缺少依赖: {e}")
        print(f" 请运行: pip install pymupdf easyocr pillow")
        return parse_pdf_basic(file_path)
    except Exception as e:
        print(f" PyMuPDF解析失败: {e},尝试基础解析...")
        return parse_pdf_basic(file_path)
|
||||
|
||||
def ocr_with_easyocr(img_data):
    """Run EasyOCR over an image byte string and return the recognised text.

    :param img_data: raw image bytes (PNG render of a PDF page)
    :return: newline-joined recognised lines, or '' on any failure
    """
    try:
        from PIL import Image
        import io
        import numpy as np

        reader = get_easy_ocr()
        if reader is None:
            return ""

        # EasyOCR expects a numpy array, so decode the bytes via Pillow.
        pixels = np.array(Image.open(io.BytesIO(img_data)))

        # readtext() yields (bbox, text, confidence) triples; keep the text.
        lines = []
        for detection in reader.readtext(pixels):
            lines.append(detection[1])
        return '\n'.join(lines)
    except Exception as e:
        print(f" EasyOCR 识别失败: {e}")
        return ""
|
||||
|
||||
def ocr_with_paddle(img_data):
    """Run PaddleOCR over an image byte string and return the recognised text.

    :param img_data: raw image bytes (PNG render of a PDF page)
    :return: newline-joined recognised lines, or '' on any failure
    """
    try:
        from PIL import Image
        import io
        import numpy as np

        engine = get_paddle_ocr()
        if engine is None:
            return ""

        pixels = np.array(Image.open(io.BytesIO(img_data)))
        result = engine.ocr(pixels, cls=True)

        # result[0] is a list of [bbox, (text, confidence)] entries.
        if not result or not result[0]:
            return ""
        lines = [entry[1][0] for entry in result[0] if entry and len(entry) >= 2]
        return '\n'.join(lines)
    except Exception as e:
        print(f" PaddleOCR 识别失败: {e}")
        return ""
|
||||
|
||||
def ocr_with_tesseract(img_data):
    """Run Tesseract OCR (fallback engine) over an image byte string.

    Points pytesseract at TESSERACT_PATH when that Windows install
    exists; otherwise relies on tesseract being on PATH.

    :param img_data: raw image bytes (PNG render of a PDF page)
    :return: recognised text, or '' on any failure
    """
    try:
        import pytesseract
        from PIL import Image
        import io

        if os.path.exists(TESSERACT_PATH):
            pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

        image = Image.open(io.BytesIO(img_data))
        # Recognise simplified Chinese plus English in one pass.
        return pytesseract.image_to_string(image, lang='chi_sim+eng')
    except Exception as e:
        print(f" Tesseract 识别失败: {e}")
        return ""
|
||||
|
||||
def parse_pdf_basic(file_path):
|
||||
"""基础PDF解析(使用PyPDF2,不支持扫描版)"""
|
||||
try:
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
file_size = os.path.getsize(file_path)
|
||||
file_size_mb = file_size / (1024 * 1024)
|
||||
print(f" PDF文件大小: {file_size_mb:.1f} MB")
|
||||
print(f" PDF文件大小: {file_size_mb:.1f} MB (基础模式)")
|
||||
|
||||
reader = PdfReader(file_path)
|
||||
total_pages = len(reader.pages)
|
||||
|
|
@ -58,6 +238,10 @@ def parse_pdf(file_path):
|
|||
print(f"解析PDF文件失败 {file_path}: {e}")
|
||||
return ""
|
||||
|
||||
def parse_pdf(file_path):
    """Parse a PDF file (delegates to the OCR-capable PyMuPDF pipeline)."""
    return parse_pdf_with_ocr(file_path)
|
||||
|
||||
def parse_docx(file_path):
|
||||
"""解析Word文档"""
|
||||
try:
|
||||
|
|
@ -95,3 +279,17 @@ def is_supported_file(filename):
|
|||
"""检查文件是否支持"""
|
||||
ext = os.path.splitext(filename)[1].lower()
|
||||
return ext in SUPPORTED_EXTENSIONS
|
||||
|
||||
def check_ocr_available():
    """Probe whether Tesseract OCR can be used in this environment.

    Imports pytesseract and queries the tesseract binary version,
    printing the outcome either way.

    :return: True when Tesseract responds, False otherwise
    """
    try:
        import pytesseract

        # Prefer the known Windows install location when present.
        if os.path.exists(TESSERACT_PATH):
            pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

        print(f"Tesseract OCR 版本: {pytesseract.get_tesseract_version()}")
        return True
    except Exception as e:
        print(f"OCR不可用: {e}")
        print("请安装 Tesseract OCR: https://github.com/UB-Mannheim/tesseract/wiki")
        return False
|
||||
|
|
|
|||
165
rag-python/merge_index.py
Normal file
165
rag-python/merge_index.py
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
索引合并脚本 - 用于合并多人处理的知识库索引
|
||||
将多个 index_data 文件夹合并成一个统一的索引
|
||||
|
||||
使用方法:
|
||||
1. 将各人处理好的 index_data 文件夹重命名后放到 to_merge/ 目录
|
||||
例如: to_merge/index_data_张三/, to_merge/index_data_李四/
|
||||
2. 运行: python merge_index.py
|
||||
3. 合并后的索引会保存到 index_data/ 目录
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
|
||||
# 添加当前目录到路径
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from config import INDEX_DIR, EMBEDDING_MODEL, OLLAMA_URL
|
||||
from vector_store import vector_store
|
||||
|
||||
# 待合并的文件夹
|
||||
MERGE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'to_merge')
|
||||
|
||||
def load_documents_from_folder(folder_path):
    """Load the document chunks stored in a folder's documents.json.

    :param folder_path: directory expected to contain documents.json
    :return: the 'documents' list from the JSON payload, or [] when the
             file is missing or unreadable
    """
    docs_file = os.path.join(folder_path, 'documents.json')
    if not os.path.exists(docs_file):
        print(f" 警告: {folder_path} 中没有 documents.json")
        return []

    try:
        with open(docs_file, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
    except Exception as e:
        print(f" 错误: 读取 {docs_file} 失败: {e}")
        return []
    return payload.get('documents', [])
|
||||
|
||||
def merge_indexes():
    """Merge per-person index_data folders from to_merge/ into one index.

    Workflow:
      1. scan MERGE_DIR for subfolders containing a documents.json
      2. collect all document chunks, de-duplicating whole FILES that
         appear in more than one person's folder (first folder wins)
      3. wipe INDEX_DIR and re-embed every chunk through vector_store
      4. save the rebuilt index and print summary statistics

    Interactive: asks for confirmation before overwriting INDEX_DIR.
    """
    print("=" * 60)
    print("索引合并工具")
    print("=" * 60)

    # First run: create the staging directory and explain what goes in it.
    if not os.path.exists(MERGE_DIR):
        os.makedirs(MERGE_DIR)
        print(f"已创建合并目录: {MERGE_DIR}")
        print(f"请将各人处理好的 index_data 文件夹放到此目录中")
        print(f"例如: {MERGE_DIR}/index_data_张三/")
        return

    # Only subfolders that actually contain a documents.json are mergeable.
    folders = []
    for name in os.listdir(MERGE_DIR):
        folder_path = os.path.join(MERGE_DIR, name)
        if os.path.isdir(folder_path):
            docs_file = os.path.join(folder_path, 'documents.json')
            if os.path.exists(docs_file):
                folders.append((name, folder_path))

    if not folders:
        print(f"在 {MERGE_DIR} 中没有找到有效的索引文件夹")
        print(f"请确保文件夹中包含 documents.json 文件")
        return

    print(f"找到 {len(folders)} 个待合并的索引:")
    for name, path in folders:
        print(f" - {name}")
    print()

    # Destructive operation - require explicit confirmation.
    confirm = input("是否开始合并?这将覆盖现有的 index_data (y/n): ").strip().lower()
    if confirm != 'y':
        print("已取消")
        return

    # Collect every chunk. De-duplication is per FILE across folders:
    # if two people indexed the same PDF, only the first folder's chunks
    # are kept. (BUGFIX: the old code keyed on filename per *chunk*,
    # which silently dropped all but the first chunk of every file.)
    all_documents = []
    all_files = set()

    print()
    print("正在收集文档...")
    for name, folder_path in folders:
        print(f" 处理: {name}")
        docs = load_documents_from_folder(folder_path)

        new_files = set()
        for doc in docs:
            filename = doc.get('filename', '')
            if not filename:
                continue  # chunks with no source filename are unusable
            if filename in all_files and filename not in new_files:
                continue  # file already merged from an earlier folder
            new_files.add(filename)
            all_documents.append(doc)
        all_files |= new_files

        print(f" 文档数: {len(docs)}, 累计: {len(all_documents)}")

    print()
    print(f"共收集 {len(all_documents)} 个文档块")
    print(f"来自 {len(all_files)} 个不同文件")
    print()

    # Rebuild from scratch: the merged index replaces whatever exists.
    print("清空现有索引...")
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)
    os.makedirs(INDEX_DIR)

    print("重建向量索引...")
    print("这可能需要一些时间,请耐心等待...")
    print()

    # Reset the in-memory store before re-adding everything.
    vector_store.documents = []
    vector_store.index = None

    # Group chunks by source file so metadata can be computed per file.
    file_docs = {}
    for doc in all_documents:
        filename = doc.get('filename', 'unknown')
        file_docs.setdefault(filename, []).append(doc)

    total_files = len(file_docs)
    processed = 0

    for filename, docs in file_docs.items():
        processed += 1
        # BUGFIX: restore the {filename} placeholder that had been
        # replaced by a literal "(unknown)" in the progress message.
        print(f"[{processed}/{total_files}] 索引: {filename} ({len(docs)} 块)")

        chunks = [doc.get('content', '') for doc in docs]
        metadata = {
            'filename': filename,
            'file_path': docs[0].get('file_path', ''),
            'char_count': sum(len(c) for c in chunks)
        }

        try:
            vector_store.add_documents(chunks, metadata)
        except Exception as e:
            print(f" 错误: {e}")

    print()
    print("保存索引...")
    vector_store.save_index()

    stats = vector_store.get_stats()
    print()
    print("=" * 60)
    print("合并完成!")
    print("=" * 60)
    print(f"总文件数: {stats['total_files']}")
    print(f"总文本块: {stats['total_chunks']}")
    print(f"索引目录: {INDEX_DIR}")
    print()
    print("提示: 合并完成后可以删除 to_merge/ 目录中的文件")


if __name__ == '__main__':
    merge_indexes()
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
# RAG 知识库服务依赖(使用本地 Ollama)
|
||||
# RAG Knowledge Base Dependencies
|
||||
flask>=2.0.0
|
||||
flask-cors>=4.0.0
|
||||
faiss-cpu>=1.7.0
|
||||
|
|
@ -9,3 +9,11 @@ python-docx>=0.8.0
|
|||
chardet>=5.0.0
|
||||
jieba>=0.42.0
|
||||
requests>=2.28.0
|
||||
|
||||
# PDF OCR Support
|
||||
pymupdf>=1.23.0
|
||||
pillow>=10.0.0
|
||||
pycryptodome>=3.19.0
|
||||
|
||||
# EasyOCR
|
||||
easyocr
|
||||
|
|
|
|||
|
|
@ -249,10 +249,13 @@ class VectorStore:
|
|||
for i, idx in enumerate(indices[0]):
|
||||
if idx < len(self.documents) and idx >= 0:
|
||||
doc = self.documents[idx]
|
||||
metadata = doc.get('metadata', {})
|
||||
results.append({
|
||||
'content': doc['content'],
|
||||
'score': float(scores[0][i]),
|
||||
'metadata': doc.get('metadata', {})
|
||||
'filename': metadata.get('filename', '未知来源'),
|
||||
'similarity': float(scores[0][i]),
|
||||
'metadata': metadata
|
||||
})
|
||||
|
||||
return results
|
||||
|
|
|
|||
124
rag-python/知识库处理说明.md
Normal file
124
rag-python/知识库处理说明.md
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
# 知识库多人协作处理方案
|
||||
|
||||
## 一、分工方式
|
||||
|
||||
将 PDF 文件按数量或类型分配给不同人员处理:
|
||||
|
||||
| 人员 | 负责文件 | 预计时间 |
|
||||
|-----|---------|---------|
|
||||
| 人员A | 心理测量类 (10个PDF) | 2-3小时 |
|
||||
| 人员B | 心理治疗类 (10个PDF) | 2-3小时 |
|
||||
| 人员C | 心理学基础 (10个PDF) | 2-3小时 |
|
||||
|
||||
## 二、每个人需要的文件
|
||||
|
||||
将以下文件打包发给每个处理人员:
|
||||
|
||||
```
|
||||
rag-python-处理包/
|
||||
├── batch_index.py # 索引脚本
|
||||
├── config.py # 配置文件
|
||||
├── document_parser.py # 文档解析
|
||||
├── text_splitter.py # 文本分块
|
||||
├── vector_store.py # 向量存储
|
||||
├── knowledge_docs/ # 空目录,用于放PDF
|
||||
├── index_data/ # 空目录,存放结果
|
||||
└── requirements.txt # Python依赖
|
||||
```
|
||||
|
||||
## 三、处理人员操作步骤
|
||||
|
||||
### 1. 环境准备
|
||||
|
||||
```bash
|
||||
# 安装 Python 依赖
|
||||
pip install -r requirements.txt
|
||||
|
||||
# 安装 Tesseract OCR(用于扫描版PDF)
|
||||
# Windows: 下载安装 https://github.com/UB-Mannheim/tesseract/wiki
|
||||
# 安装时勾选中文语言包
|
||||
|
||||
# 安装 Ollama 并下载嵌入模型
|
||||
ollama pull nomic-embed-text
|
||||
```
|
||||
|
||||
### 2. 放入 PDF 文件
|
||||
|
||||
将分配的 PDF 文件放入 `knowledge_docs/` 目录
|
||||
|
||||
### 3. 执行索引
|
||||
|
||||
```bash
|
||||
python batch_index.py
|
||||
```
|
||||
|
||||
等待处理完成,会显示:
|
||||
- 处理进度
|
||||
- 每个文件的字符数和向量块数
|
||||
- 总耗时
|
||||
|
||||
### 4. 返回结果
|
||||
|
||||
处理完成后,将 `index_data/` 文件夹打包发回:
|
||||
- 重命名为 `index_data_姓名/`
|
||||
- 包含 `documents.json` 和 `faiss.index` 两个文件
|
||||
|
||||
## 四、合并索引(汇总人员操作)
|
||||
|
||||
### 1. 收集所有人的结果
|
||||
|
||||
将各人返回的 `index_data_xxx/` 文件夹放到 `to_merge/` 目录:
|
||||
|
||||
```
|
||||
rag-python/
|
||||
├── to_merge/
|
||||
│ ├── index_data_张三/
|
||||
│ │ ├── documents.json
|
||||
│ │ └── faiss.index
|
||||
│ ├── index_data_李四/
|
||||
│ │ ├── documents.json
|
||||
│ │ └── faiss.index
|
||||
│ └── index_data_王五/
|
||||
│ ├── documents.json
|
||||
│ └── faiss.index
|
||||
└── merge_index.py
|
||||
```
|
||||
|
||||
### 2. 执行合并
|
||||
|
||||
```bash
|
||||
python merge_index.py
|
||||
```
|
||||
|
||||
### 3. 验证结果
|
||||
|
||||
```bash
|
||||
python app.py
|
||||
# 访问 http://localhost:5000/api/stats 查看统计
|
||||
```
|
||||
|
||||
## 五、注意事项
|
||||
|
||||
1. **Ollama 必须运行** - 所有处理人员的电脑都需要运行 Ollama
|
||||
2. **模型要一致** - 都使用 `nomic-embed-text` 模型
|
||||
3. **避免重复文件** - 不同人员处理的 PDF 不要重复
|
||||
4. **大文件耐心等待** - 200MB 的 PDF 可能需要 30-60 分钟
|
||||
|
||||
## 六、常见问题
|
||||
|
||||
### Q: 处理中断了怎么办?
|
||||
A: 删除 `index_data/` 目录,重新运行 `batch_index.py`
|
||||
|
||||
### Q: 某个 PDF 处理失败怎么办?
|
||||
A: 检查 PDF 是否损坏,或尝试用其他工具转换格式
|
||||
|
||||
### Q: 合并后发现有重复怎么办?
|
||||
A: 合并脚本会自动去重(按文件名判断)
|
||||
|
||||
## 七、预估时间
|
||||
|
||||
| PDF 类型 | 大小 | 预估时间 |
|
||||
|---------|------|---------|
|
||||
| 文字版 PDF | 10MB | 1-2 分钟 |
|
||||
| 扫描版 PDF | 10MB | 5-10 分钟 |
|
||||
| 大型扫描版 | 200MB | 30-60 分钟 |
|
||||
|
|
@ -1,10 +1,12 @@
|
|||
package com.ddnai.system.rag.service;
|
||||
|
||||
import com.ddnai.system.rag.client.ChromaDBClient;
|
||||
import com.ddnai.system.rag.client.PythonRagClient;
|
||||
import com.ddnai.system.rag.config.RagProperties;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.IOException;
|
||||
|
|
@ -13,7 +15,7 @@ import java.util.stream.Collectors;
|
|||
|
||||
/**
|
||||
* RAG检索服务
|
||||
* 从ChromaDB检索相关文档片段
|
||||
* 优先使用Python RAG服务,如果不可用则回退到ChromaDB
|
||||
*
|
||||
* @author ddnai
|
||||
*/
|
||||
|
|
@ -22,17 +24,21 @@ public class RetrievalService {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(RetrievalService.class);
|
||||
|
||||
@Autowired
|
||||
@Autowired(required = false)
|
||||
private ChromaDBClient chromaDBClient;
|
||||
|
||||
@Autowired
|
||||
@Autowired(required = false)
|
||||
private EmbeddingService embeddingService;
|
||||
|
||||
@Autowired
|
||||
private RagProperties ragProperties;
|
||||
|
||||
@Autowired
|
||||
private PythonRagClient pythonRagClient;
|
||||
|
||||
/**
|
||||
* 语义检索相关文档
|
||||
* 优先使用Python RAG服务
|
||||
*
|
||||
* @param query 查询文本
|
||||
* @param topK 返回结果数量
|
||||
|
|
@ -45,6 +51,33 @@ public class RetrievalService {
|
|||
throw new IllegalArgumentException("Query cannot be null or empty");
|
||||
}
|
||||
|
||||
// 优先使用Python RAG服务
|
||||
if (pythonRagClient != null && pythonRagClient.isAvailable()) {
|
||||
log.info("Using Python RAG service for retrieval");
|
||||
try {
|
||||
List<Map<String, Object>> results = pythonRagClient.search(query, topK);
|
||||
log.info("Python RAG retrieved {} documents for query", results.size());
|
||||
// 打印每个结果的详细信息用于调试
|
||||
for (int i = 0; i < results.size(); i++) {
|
||||
Map<String, Object> result = results.get(i);
|
||||
log.info("Result {}: filename={}, content_length={}, keys={}",
|
||||
i + 1,
|
||||
result.get("filename"),
|
||||
result.get("content") != null ? result.get("content").toString().length() : 0,
|
||||
result.keySet());
|
||||
}
|
||||
return results;
|
||||
} catch (Exception e) {
|
||||
log.warn("Python RAG service failed, falling back to ChromaDB: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
// 回退到ChromaDB
|
||||
if (chromaDBClient == null || embeddingService == null) {
|
||||
log.warn("Neither Python RAG nor ChromaDB is available");
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
// 1. 向量化查询
|
||||
float[] queryEmbedding = embeddingService.embedText(query);
|
||||
|
||||
|
|
@ -57,7 +90,7 @@ public class RetrievalService {
|
|||
// 4. 过滤低相似度结果
|
||||
results = filterBySimilarity(results, threshold);
|
||||
|
||||
log.info("Retrieved {} documents for query (topK={}, threshold={})", results.size(), topK, threshold);
|
||||
log.info("ChromaDB retrieved {} documents for query (topK={}, threshold={})", results.size(), topK, threshold);
|
||||
return results;
|
||||
}
|
||||
|
||||
|
|
@ -75,6 +108,30 @@ public class RetrievalService {
|
|||
throw new IllegalArgumentException("Query cannot be null or empty");
|
||||
}
|
||||
|
||||
// 优先使用Python RAG服务(Python服务暂不支持分类过滤,直接搜索)
|
||||
if (pythonRagClient != null && pythonRagClient.isAvailable()) {
|
||||
log.info("Using Python RAG service for retrieval with filter");
|
||||
try {
|
||||
List<Map<String, Object>> results = pythonRagClient.search(query, topK);
|
||||
// 手动过滤分类
|
||||
if (category != null && !category.isEmpty()) {
|
||||
results = results.stream()
|
||||
.filter(r -> category.equals(r.get("category")))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
log.info("Python RAG retrieved {} documents with category filter '{}'", results.size(), category);
|
||||
return results;
|
||||
} catch (Exception e) {
|
||||
log.warn("Python RAG service failed, falling back to ChromaDB: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
// 回退到ChromaDB
|
||||
if (chromaDBClient == null || embeddingService == null) {
|
||||
log.warn("Neither Python RAG nor ChromaDB is available");
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
// 1. 向量化查询
|
||||
float[] queryEmbedding = embeddingService.embedText(query);
|
||||
|
||||
|
|
@ -94,7 +151,7 @@ public class RetrievalService {
|
|||
double threshold = ragProperties.getRetrieval().getSimilarityThreshold();
|
||||
results = filterBySimilarity(results, threshold);
|
||||
|
||||
log.info("Retrieved {} documents for query with category filter '{}' (topK={})",
|
||||
log.info("ChromaDB retrieved {} documents with category filter '{}' (topK={})",
|
||||
results.size(), category, topK);
|
||||
return results;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -140,6 +140,7 @@ public class PsyAssessmentReportServiceImpl implements IPsyAssessmentReportServi
|
|||
// 5. 计算每个因子的得分
|
||||
List<PsyFactorScore> factorScores = new ArrayList<>();
|
||||
BigDecimal totalScore = BigDecimal.ZERO;
|
||||
|
||||
for (PsyFactor factor : factors)
|
||||
{
|
||||
List<PsyFactorRule> rules = factorRulesMap.get(factor.getFactorId());
|
||||
|
|
|
|||
|
|
@ -38,7 +38,14 @@ PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
|
|||
<include refid="selectInterpretationVo"/>
|
||||
where
|
||||
(scale_id = #{scaleId} OR scale_id IS NULL)
|
||||
AND (factor_id = #{factorId} OR factor_id IS NULL)
|
||||
<choose>
|
||||
<when test="factorId != null">
|
||||
AND factor_id = #{factorId}
|
||||
</when>
|
||||
<otherwise>
|
||||
AND factor_id IS NULL
|
||||
</otherwise>
|
||||
</choose>
|
||||
AND #{score} >= score_range_min
|
||||
AND #{score} <= score_range_max
|
||||
order by sort_order
|
||||
|
|
|
|||
|
|
@ -419,21 +419,9 @@ export default {
|
|||
const API_URL = 'https://api.moonshot.cn/v1/chat/completions';
|
||||
const API_KEY = 'sk-U9fdriPxwBcrpWW0Ite3N0eVtX7VxnqqqYUIBAdWd1hgEA9m';
|
||||
const MODEL = 'moonshot-v1-32k';
|
||||
// 备用:Ollama本地大模型
|
||||
// const API_URL = 'http://192.168.0.106:11434/api/chat';
|
||||
// const API_KEY = ''; // 本地模型不需要API Key
|
||||
// const MODEL = 'deepseek-r1:32b';
|
||||
|
||||
// 构建系统提示词
|
||||
const SYSTEM_PROMPT = [
|
||||
'你是专业心理测评报告分析师,请根据用户提供的报告内容进行深度分析。要求:',
|
||||
'1. 提取报告的核心信息和关键指标;',
|
||||
'2. 分析测评结果的含义和可能的影响;',
|
||||
'3. 提供专业、客观、易懂的分析解读(500-800字);',
|
||||
'4. 使用结构化的格式输出,包含:核心结论、详细分析、建议、总体结论四个部分;',
|
||||
'5. 仅输出分析结果,不添加额外建议、问候语或思考过程;',
|
||||
'6. 使用HTML格式输出,使用<h3>标签作为小标题,<p>标签作为段落。'
|
||||
].join('\n');
|
||||
// RAG知识库服务地址
|
||||
const RAG_API_URL = 'http://localhost:5000/api/rag-analyze';
|
||||
|
||||
// 构建完整的提示词
|
||||
const reportContent = this.reportForm.reportContent || '';
|
||||
|
|
@ -443,9 +431,45 @@ export default {
|
|||
// 提取纯文本内容(去除HTML标签)
|
||||
const textContent = reportContent.replace(/<[^>]*>/g, '').substring(0, 3000);
|
||||
|
||||
const userPrompt = `重要:请直接输出结果,不要包含任何思考过程、<think>标签或<think>标签。\n\n报告标题:${reportTitle}\n报告类型:${reportType}\n报告内容:${textContent}`;
|
||||
|
||||
try {
|
||||
// 1. 先调用RAG服务获取知识库上下文
|
||||
let knowledgeContext = '';
|
||||
let ragSources = [];
|
||||
try {
|
||||
const ragResponse = await axios.post(RAG_API_URL, {
|
||||
reportContent: reportContent,
|
||||
reportTitle: reportTitle
|
||||
}, { timeout: 10000 });
|
||||
|
||||
if (ragResponse.data && ragResponse.data.success) {
|
||||
knowledgeContext = ragResponse.data.data.knowledgeContext || '';
|
||||
ragSources = ragResponse.data.data.sources || [];
|
||||
console.log('RAG检索成功,获取到', ragSources.length, '条参考资料');
|
||||
}
|
||||
} catch (ragErr) {
|
||||
console.warn('RAG服务调用失败,将不使用知识库增强:', ragErr.message);
|
||||
// RAG失败不影响AI分析,继续执行
|
||||
}
|
||||
|
||||
// 2. 构建系统提示词(包含知识库上下文)
|
||||
let SYSTEM_PROMPT = [
|
||||
'你是专业心理测评报告分析师,请根据用户提供的报告内容进行深度分析。要求:',
|
||||
'1. 提取报告的核心信息和关键指标;',
|
||||
'2. 分析测评结果的含义和可能的影响;',
|
||||
'3. 提供专业、客观、易懂的分析解读(500-800字);',
|
||||
'4. 使用结构化的格式输出,包含:核心结论、详细分析、建议、总体结论四个部分;',
|
||||
'5. 仅输出分析结果,不添加额外建议、问候语或思考过程;',
|
||||
'6. 使用HTML格式输出,使用<h3>标签作为小标题,<p>标签作为段落。'
|
||||
].join('\n');
|
||||
|
||||
// 如果有知识库上下文,添加到系统提示词中
|
||||
if (knowledgeContext) {
|
||||
SYSTEM_PROMPT += '\n\n【专业知识库参考资料】\n' + knowledgeContext + '\n\n请结合以上专业资料进行分析,使分析更加专业和有深度。';
|
||||
}
|
||||
|
||||
const userPrompt = `重要:请直接输出结果,不要包含任何思考过程、<think>标签或<think>标签。\n\n报告标题:${reportTitle}\n报告类型:${reportType}\n报告内容:${textContent}`;
|
||||
|
||||
// 3. 调用AI API
|
||||
const { data } = await axios.post(API_URL, {
|
||||
model: MODEL,
|
||||
messages: [
|
||||
|
|
@ -453,7 +477,7 @@ export default {
|
|||
{ role: 'user', content: userPrompt }
|
||||
],
|
||||
temperature: 0.2,
|
||||
max_tokens: 1000,
|
||||
max_tokens: 1500,
|
||||
stream: false
|
||||
}, {
|
||||
headers: {
|
||||
|
|
@ -484,7 +508,20 @@ export default {
|
|||
}
|
||||
|
||||
// 格式化结果,确保HTML格式正确
|
||||
this.aiResult = this.formatAIResult(rawResponse);
|
||||
let formattedResult = this.formatAIResult(rawResponse);
|
||||
|
||||
// 4. 如果有知识库来源,添加参考资料说明
|
||||
if (ragSources && ragSources.length > 0) {
|
||||
formattedResult += '<div class="rag-sources" style="margin-top: 20px; padding: 15px; background: #f5f7fa; border-radius: 8px;">';
|
||||
formattedResult += '<h4 style="margin: 0 0 10px 0; color: #409EFF;"><i class="el-icon-document"></i> 参考知识库资料</h4>';
|
||||
formattedResult += '<ul style="margin: 0; padding-left: 20px; color: #666;">';
|
||||
ragSources.forEach((source, index) => {
|
||||
formattedResult += `<li style="margin-bottom: 5px;"><strong>${source.filename}</strong></li>`;
|
||||
});
|
||||
formattedResult += '</ul></div>';
|
||||
}
|
||||
|
||||
this.aiResult = formattedResult;
|
||||
|
||||
// 保存AI分析结果到数据库
|
||||
try {
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user