2025年12月20日剩余内容提交
This commit is contained in:
parent
5e9de851e3
commit
17dbd562f3
79
rag-python/README-部署指南.md
Normal file
79
rag-python/README-部署指南.md
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
# RAG 知识库服务 - 部署指南
|
||||
|
||||
## 环境要求
|
||||
|
||||
- Python 3.11(必须是 3.11,不能用 3.14)
|
||||
- Ollama(本地大模型服务)
|
||||
|
||||
## 一、安装 Python 3.11
|
||||
|
||||
下载地址:https://www.python.org/downloads/release/python-3119/
|
||||
|
||||
安装时勾选 "Add Python to PATH"
|
||||
|
||||
## 二、安装 Ollama
|
||||
|
||||
下载地址:https://ollama.com/download
|
||||
|
||||
安装后运行以下命令下载模型:
|
||||
```bash
|
||||
ollama pull nomic-embed-text
|
||||
ollama pull qwen2.5:7b
|
||||
```
|
||||
|
||||
## 三、安装依赖
|
||||
|
||||
在 `rag-python` 目录下运行:
|
||||
```bash
|
||||
py -3.11 -m pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## 四、使用方法
|
||||
|
||||
### 1. 添加文档
|
||||
|
||||
把要索引的文档放到 `knowledge_docs` 文件夹中
|
||||
|
||||
支持的格式:`.txt` `.md` `.pdf` `.docx`
|
||||
|
||||
### 2. 建立索引
|
||||
|
||||
```bash
|
||||
py -3.11 batch_index.py
|
||||
```
|
||||
|
||||
注意:扫描版 PDF 需要 OCR 识别,速度较慢(每页约 5-10 秒)
|
||||
|
||||
### 3. 启动服务
|
||||
|
||||
```bash
|
||||
py -3.11 app.py
|
||||
```
|
||||
|
||||
服务默认运行在 http://localhost:5000
|
||||
|
||||
## 五、常见问题
|
||||
|
||||
### Q: 提示缺少模块?
|
||||
```bash
|
||||
py -3.11 -m pip install 模块名
|
||||
```
|
||||
|
||||
### Q: OCR 识别很慢?
|
||||
扫描版 PDF 需要逐页识别,272 页大约需要 20-30 分钟。有 GPU 会快很多。
|
||||
|
||||
### Q: 如何测试服务?
|
||||
```bash
|
||||
curl "http://localhost:5000/api/knowledge/search?query=测试"
|
||||
```
|
||||
|
||||
## 六、目录结构
|
||||
|
||||
```
|
||||
rag-python/
|
||||
├── knowledge_docs/ # 放入要索引的文档
|
||||
├── index_data/ # 生成的索引文件(自动创建)
|
||||
├── batch_index.py # 批量索引脚本
|
||||
├── app.py # Web 服务入口
|
||||
└── requirements.txt # 依赖列表
|
||||
```
|
||||
|
|
@ -1,11 +1,51 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
文档解析器 - 支持多种文件格式
|
||||
文档解析器 - 支持多种文件格式,包括扫描版PDF的OCR识别
|
||||
支持 EasyOCR(推荐)、PaddleOCR 和 Tesseract
|
||||
"""
|
||||
import os
|
||||
import chardet
|
||||
from config import SUPPORTED_EXTENSIONS
|
||||
|
||||
# Tesseract OCR executable path (default Windows install) - fallback engine only
TESSERACT_PATH = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# OCR engine selection: 'easyocr' (recommended), 'paddle' or 'tesseract'
OCR_ENGINE = 'easyocr'

# EasyOCR reader singleton (lazily initialised by get_easy_ocr)
_easy_ocr = None

# PaddleOCR singleton (lazily initialised by get_paddle_ocr)
_paddle_ocr = None
|
||||
|
||||
def get_easy_ocr():
    """Return the process-wide EasyOCR reader, creating it on first use.

    Returns None (without caching anything) when the easyocr package is
    missing, so a later call can retry once the dependency is installed.
    """
    global _easy_ocr
    if _easy_ocr is not None:
        return _easy_ocr
    try:
        import easyocr
        # Simplified Chinese + English, CPU only.
        _easy_ocr = easyocr.Reader(['ch_sim', 'en'], gpu=False)
        print(" EasyOCR 初始化成功")
    except ImportError:
        print(" EasyOCR 未安装,请运行: pip install easyocr")
        return None
    return _easy_ocr
|
||||
|
||||
def get_paddle_ocr():
    """Return the process-wide PaddleOCR instance, creating it on first use.

    Returns None (without caching anything) when paddleocr is not
    installed, so a later call can retry.
    """
    global _paddle_ocr
    if _paddle_ocr is not None:
        return _paddle_ocr
    try:
        from paddleocr import PaddleOCR
        _paddle_ocr = PaddleOCR(use_angle_cls=True, lang='ch', show_log=False)
        print(" PaddleOCR 初始化成功")
    except ImportError:
        print(" PaddleOCR 未安装")
        return None
    return _paddle_ocr
|
||||
|
||||
def detect_encoding(file_path):
|
||||
"""检测文件编码"""
|
||||
with open(file_path, 'rb') as f:
|
||||
|
|
@ -27,14 +67,154 @@ def parse_md(file_path):
|
|||
"""解析Markdown文件"""
|
||||
return parse_txt(file_path)
|
||||
|
||||
def parse_pdf(file_path):
|
||||
"""解析PDF文件(支持大文件)"""
|
||||
def parse_pdf_with_ocr(file_path):
    """Parse a PDF with PyMuPDF, OCR-ing pages that look scanned.

    Each page is first read with PyMuPDF's text extractor; a page that
    yields fewer than 50 characters is assumed to be a scanned image,
    rendered at 2x zoom and pushed through the engine selected by
    OCR_ENGINE. Falls back to parse_pdf_basic() when PyMuPDF or a
    co-dependency is missing, or when the file cannot be processed.

    :param file_path: path of the PDF file to parse
    :return: extracted text joined with newlines (may be '')
    """
    try:
        import fitz  # PyMuPDF
        from PIL import Image  # noqa: F401 - fail fast if Pillow missing (OCR helpers need it)
        import io  # noqa: F401

        file_size = os.path.getsize(file_path)
        file_size_mb = file_size / (1024 * 1024)
        print(f" PDF文件大小: {file_size_mb:.1f} MB")

        doc = fitz.open(file_path)
        text_parts = []
        ocr_used = False
        try:
            total_pages = len(doc)
            print(f" PDF总页数: {total_pages}")

            for i, page in enumerate(doc):
                if (i + 1) % 20 == 0 or i == 0:
                    print(f" 解析进度: {i + 1}/{total_pages} 页")

                try:
                    # First try the embedded text layer.
                    text = page.get_text()

                    # Very little text usually means a scanned page -> OCR.
                    if len(text.strip()) < 50:
                        if not ocr_used:
                            print(" 检测到扫描版PDF,启用OCR识别...")
                            ocr_used = True

                        # Render the page to an image; 2x zoom improves OCR accuracy.
                        mat = fitz.Matrix(2, 2)
                        pix = page.get_pixmap(matrix=mat)
                        img_data = pix.tobytes("png")

                        if OCR_ENGINE == 'easyocr':
                            text = ocr_with_easyocr(img_data)
                        elif OCR_ENGINE == 'paddle':
                            text = ocr_with_paddle(img_data)
                        else:
                            text = ocr_with_tesseract(img_data)

                    if text and text.strip():
                        text_parts.append(text)

                except Exception as e:
                    print(f" 警告: 第 {i + 1} 页解析失败: {e}")
                    continue
        finally:
            # BUGFIX: always release the document handle - previously an
            # unexpected error escaping the loop was swallowed by the
            # fallback handlers below with the document still open.
            doc.close()

        total_chars = len(''.join(text_parts))
        print(f" PDF解析完成,提取文本 {total_chars} 字符" + (" (使用OCR)" if ocr_used else ""))
        return "\n".join(text_parts)

    except ImportError as e:
        print(f" 缺少依赖: {e}")
        print(f" 请运行: pip install pymupdf easyocr pillow")
        return parse_pdf_basic(file_path)
    except Exception as e:
        print(f" PyMuPDF解析失败: {e},尝试基础解析...")
        return parse_pdf_basic(file_path)
|
||||
|
||||
def ocr_with_easyocr(img_data):
    """Run EasyOCR over an image byte string and return the recognised text.

    :param img_data: raw image bytes (PNG render of a PDF page)
    :return: newline-joined recognised lines, or '' on any failure
    """
    try:
        from PIL import Image
        import io
        import numpy as np

        reader = get_easy_ocr()
        if reader is None:
            return ""

        # EasyOCR expects a numpy array, so decode the bytes via Pillow.
        pixels = np.array(Image.open(io.BytesIO(img_data)))

        # readtext() yields (bbox, text, confidence) triples; keep the text.
        lines = []
        for detection in reader.readtext(pixels):
            lines.append(detection[1])
        return '\n'.join(lines)
    except Exception as e:
        print(f" EasyOCR 识别失败: {e}")
        return ""
|
||||
|
||||
def ocr_with_paddle(img_data):
    """Run PaddleOCR over an image byte string and return the recognised text.

    :param img_data: raw image bytes (PNG render of a PDF page)
    :return: newline-joined recognised lines, or '' on any failure
    """
    try:
        from PIL import Image
        import io
        import numpy as np

        engine = get_paddle_ocr()
        if engine is None:
            return ""

        pixels = np.array(Image.open(io.BytesIO(img_data)))
        result = engine.ocr(pixels, cls=True)

        # result[0] is a list of [bbox, (text, confidence)] entries.
        if not result or not result[0]:
            return ""
        lines = [entry[1][0] for entry in result[0] if entry and len(entry) >= 2]
        return '\n'.join(lines)
    except Exception as e:
        print(f" PaddleOCR 识别失败: {e}")
        return ""
|
||||
|
||||
def ocr_with_tesseract(img_data):
    """Run Tesseract OCR (fallback engine) over an image byte string.

    Points pytesseract at TESSERACT_PATH when that Windows install
    exists; otherwise relies on tesseract being on PATH.

    :param img_data: raw image bytes (PNG render of a PDF page)
    :return: recognised text, or '' on any failure
    """
    try:
        import pytesseract
        from PIL import Image
        import io

        if os.path.exists(TESSERACT_PATH):
            pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

        image = Image.open(io.BytesIO(img_data))
        # Recognise simplified Chinese plus English in one pass.
        return pytesseract.image_to_string(image, lang='chi_sim+eng')
    except Exception as e:
        print(f" Tesseract 识别失败: {e}")
        return ""
|
||||
|
||||
def parse_pdf_basic(file_path):
|
||||
"""基础PDF解析(使用PyPDF2,不支持扫描版)"""
|
||||
try:
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
file_size = os.path.getsize(file_path)
|
||||
file_size_mb = file_size / (1024 * 1024)
|
||||
print(f" PDF文件大小: {file_size_mb:.1f} MB")
|
||||
print(f" PDF文件大小: {file_size_mb:.1f} MB (基础模式)")
|
||||
|
||||
reader = PdfReader(file_path)
|
||||
total_pages = len(reader.pages)
|
||||
|
|
@ -58,6 +238,10 @@ def parse_pdf(file_path):
|
|||
print(f"解析PDF文件失败 {file_path}: {e}")
|
||||
return ""
|
||||
|
||||
def parse_pdf(file_path):
    """Parse a PDF file (delegates to the OCR-capable PyMuPDF pipeline)."""
    return parse_pdf_with_ocr(file_path)
|
||||
|
||||
def parse_docx(file_path):
|
||||
"""解析Word文档"""
|
||||
try:
|
||||
|
|
@ -95,3 +279,17 @@ def is_supported_file(filename):
|
|||
"""检查文件是否支持"""
|
||||
ext = os.path.splitext(filename)[1].lower()
|
||||
return ext in SUPPORTED_EXTENSIONS
|
||||
|
||||
def check_ocr_available():
    """Probe whether Tesseract OCR can be used in this environment.

    Imports pytesseract and queries the tesseract binary version,
    printing the outcome either way.

    :return: True when Tesseract responds, False otherwise
    """
    try:
        import pytesseract

        # Prefer the known Windows install location when present.
        if os.path.exists(TESSERACT_PATH):
            pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

        print(f"Tesseract OCR 版本: {pytesseract.get_tesseract_version()}")
        return True
    except Exception as e:
        print(f"OCR不可用: {e}")
        print("请安装 Tesseract OCR: https://github.com/UB-Mannheim/tesseract/wiki")
        return False
|
||||
|
|
|
|||
165
rag-python/merge_index.py
Normal file
165
rag-python/merge_index.py
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
索引合并脚本 - 用于合并多人处理的知识库索引
|
||||
将多个 index_data 文件夹合并成一个统一的索引
|
||||
|
||||
使用方法:
|
||||
1. 将各人处理好的 index_data 文件夹重命名后放到 to_merge/ 目录
|
||||
例如: to_merge/index_data_张三/, to_merge/index_data_李四/
|
||||
2. 运行: python merge_index.py
|
||||
3. 合并后的索引会保存到 index_data/ 目录
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
|
||||
# 添加当前目录到路径
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from config import INDEX_DIR, EMBEDDING_MODEL, OLLAMA_URL
|
||||
from vector_store import vector_store
|
||||
|
||||
# 待合并的文件夹
|
||||
MERGE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'to_merge')
|
||||
|
||||
def load_documents_from_folder(folder_path):
    """Load the document chunks stored in a folder's documents.json.

    :param folder_path: directory expected to contain documents.json
    :return: the 'documents' list from the JSON payload, or [] when the
             file is missing or unreadable
    """
    docs_file = os.path.join(folder_path, 'documents.json')
    if not os.path.exists(docs_file):
        print(f" 警告: {folder_path} 中没有 documents.json")
        return []

    try:
        with open(docs_file, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
    except Exception as e:
        print(f" 错误: 读取 {docs_file} 失败: {e}")
        return []
    return payload.get('documents', [])
|
||||
|
||||
def merge_indexes():
    """Merge per-person index_data folders from to_merge/ into one index.

    Workflow:
      1. scan MERGE_DIR for subfolders containing a documents.json
      2. collect all document chunks, de-duplicating whole FILES that
         appear in more than one person's folder (first folder wins)
      3. wipe INDEX_DIR and re-embed every chunk through vector_store
      4. save the rebuilt index and print summary statistics

    Interactive: asks for confirmation before overwriting INDEX_DIR.
    """
    print("=" * 60)
    print("索引合并工具")
    print("=" * 60)

    # First run: create the staging directory and explain what goes in it.
    if not os.path.exists(MERGE_DIR):
        os.makedirs(MERGE_DIR)
        print(f"已创建合并目录: {MERGE_DIR}")
        print(f"请将各人处理好的 index_data 文件夹放到此目录中")
        print(f"例如: {MERGE_DIR}/index_data_张三/")
        return

    # Only subfolders that actually contain a documents.json are mergeable.
    folders = []
    for name in os.listdir(MERGE_DIR):
        folder_path = os.path.join(MERGE_DIR, name)
        if os.path.isdir(folder_path):
            docs_file = os.path.join(folder_path, 'documents.json')
            if os.path.exists(docs_file):
                folders.append((name, folder_path))

    if not folders:
        print(f"在 {MERGE_DIR} 中没有找到有效的索引文件夹")
        print(f"请确保文件夹中包含 documents.json 文件")
        return

    print(f"找到 {len(folders)} 个待合并的索引:")
    for name, path in folders:
        print(f" - {name}")
    print()

    # Destructive operation - require explicit confirmation.
    confirm = input("是否开始合并?这将覆盖现有的 index_data (y/n): ").strip().lower()
    if confirm != 'y':
        print("已取消")
        return

    # Collect every chunk. De-duplication is per FILE across folders:
    # if two people indexed the same PDF, only the first folder's chunks
    # are kept. (BUGFIX: the old code keyed on filename per *chunk*,
    # which silently dropped all but the first chunk of every file.)
    all_documents = []
    all_files = set()

    print()
    print("正在收集文档...")
    for name, folder_path in folders:
        print(f" 处理: {name}")
        docs = load_documents_from_folder(folder_path)

        new_files = set()
        for doc in docs:
            filename = doc.get('filename', '')
            if not filename:
                continue  # chunks with no source filename are unusable
            if filename in all_files and filename not in new_files:
                continue  # file already merged from an earlier folder
            new_files.add(filename)
            all_documents.append(doc)
        all_files |= new_files

        print(f" 文档数: {len(docs)}, 累计: {len(all_documents)}")

    print()
    print(f"共收集 {len(all_documents)} 个文档块")
    print(f"来自 {len(all_files)} 个不同文件")
    print()

    # Rebuild from scratch: the merged index replaces whatever exists.
    print("清空现有索引...")
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)
    os.makedirs(INDEX_DIR)

    print("重建向量索引...")
    print("这可能需要一些时间,请耐心等待...")
    print()

    # Reset the in-memory store before re-adding everything.
    vector_store.documents = []
    vector_store.index = None

    # Group chunks by source file so metadata can be computed per file.
    file_docs = {}
    for doc in all_documents:
        filename = doc.get('filename', 'unknown')
        file_docs.setdefault(filename, []).append(doc)

    total_files = len(file_docs)
    processed = 0

    for filename, docs in file_docs.items():
        processed += 1
        # BUGFIX: restore the {filename} placeholder that had been
        # replaced by a literal "(unknown)" in the progress message.
        print(f"[{processed}/{total_files}] 索引: {filename} ({len(docs)} 块)")

        chunks = [doc.get('content', '') for doc in docs]
        metadata = {
            'filename': filename,
            'file_path': docs[0].get('file_path', ''),
            'char_count': sum(len(c) for c in chunks)
        }

        try:
            vector_store.add_documents(chunks, metadata)
        except Exception as e:
            print(f" 错误: {e}")

    print()
    print("保存索引...")
    vector_store.save_index()

    stats = vector_store.get_stats()
    print()
    print("=" * 60)
    print("合并完成!")
    print("=" * 60)
    print(f"总文件数: {stats['total_files']}")
    print(f"总文本块: {stats['total_chunks']}")
    print(f"索引目录: {INDEX_DIR}")
    print()
    print("提示: 合并完成后可以删除 to_merge/ 目录中的文件")


if __name__ == '__main__':
    merge_indexes()
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
# RAG 知识库服务依赖(使用本地 Ollama)
|
||||
# RAG Knowledge Base Dependencies
|
||||
flask>=2.0.0
|
||||
flask-cors>=4.0.0
|
||||
faiss-cpu>=1.7.0
|
||||
|
|
@ -9,3 +9,11 @@ python-docx>=0.8.0
|
|||
chardet>=5.0.0
|
||||
jieba>=0.42.0
|
||||
requests>=2.28.0
|
||||
|
||||
# PDF OCR Support
|
||||
pymupdf>=1.23.0
|
||||
pillow>=10.0.0
|
||||
pycryptodome>=3.19.0
|
||||
|
||||
# EasyOCR
|
||||
easyocr
|
||||
|
|
|
|||
|
|
@ -249,10 +249,13 @@ class VectorStore:
|
|||
for i, idx in enumerate(indices[0]):
|
||||
if idx < len(self.documents) and idx >= 0:
|
||||
doc = self.documents[idx]
|
||||
metadata = doc.get('metadata', {})
|
||||
results.append({
|
||||
'content': doc['content'],
|
||||
'score': float(scores[0][i]),
|
||||
'metadata': doc.get('metadata', {})
|
||||
'filename': metadata.get('filename', '未知来源'),
|
||||
'similarity': float(scores[0][i]),
|
||||
'metadata': metadata
|
||||
})
|
||||
|
||||
return results
|
||||
|
|
|
|||
124
rag-python/知识库处理说明.md
Normal file
124
rag-python/知识库处理说明.md
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
# 知识库多人协作处理方案
|
||||
|
||||
## 一、分工方式
|
||||
|
||||
将 PDF 文件按数量或类型分配给不同人员处理:
|
||||
|
||||
| 人员 | 负责文件 | 预计时间 |
|
||||
|-----|---------|---------|
|
||||
| 人员A | 心理测量类 (10个PDF) | 2-3小时 |
|
||||
| 人员B | 心理治疗类 (10个PDF) | 2-3小时 |
|
||||
| 人员C | 心理学基础 (10个PDF) | 2-3小时 |
|
||||
|
||||
## 二、每个人需要的文件
|
||||
|
||||
将以下文件打包发给每个处理人员:
|
||||
|
||||
```
|
||||
rag-python-处理包/
|
||||
├── batch_index.py # 索引脚本
|
||||
├── config.py # 配置文件
|
||||
├── document_parser.py # 文档解析
|
||||
├── text_splitter.py # 文本分块
|
||||
├── vector_store.py # 向量存储
|
||||
├── knowledge_docs/ # 空目录,用于放PDF
|
||||
├── index_data/ # 空目录,存放结果
|
||||
└── requirements.txt # Python依赖
|
||||
```
|
||||
|
||||
## 三、处理人员操作步骤
|
||||
|
||||
### 1. 环境准备
|
||||
|
||||
```bash
|
||||
# 安装 Python 依赖
|
||||
pip install -r requirements.txt
|
||||
|
||||
# 安装 Tesseract OCR(用于扫描版PDF)
|
||||
# Windows: 下载安装 https://github.com/UB-Mannheim/tesseract/wiki
|
||||
# 安装时勾选中文语言包
|
||||
|
||||
# 安装 Ollama 并下载嵌入模型
|
||||
ollama pull nomic-embed-text
|
||||
```
|
||||
|
||||
### 2. 放入 PDF 文件
|
||||
|
||||
将分配的 PDF 文件放入 `knowledge_docs/` 目录
|
||||
|
||||
### 3. 执行索引
|
||||
|
||||
```bash
|
||||
python batch_index.py
|
||||
```
|
||||
|
||||
等待处理完成,会显示:
|
||||
- 处理进度
|
||||
- 每个文件的字符数和向量块数
|
||||
- 总耗时
|
||||
|
||||
### 4. 返回结果
|
||||
|
||||
处理完成后,将 `index_data/` 文件夹打包发回:
|
||||
- 重命名为 `index_data_姓名/`
|
||||
- 包含 `documents.json` 和 `faiss.index` 两个文件
|
||||
|
||||
## 四、合并索引(汇总人员操作)
|
||||
|
||||
### 1. 收集所有人的结果
|
||||
|
||||
将各人返回的 `index_data_xxx/` 文件夹放到 `to_merge/` 目录:
|
||||
|
||||
```
|
||||
rag-python/
|
||||
├── to_merge/
|
||||
│ ├── index_data_张三/
|
||||
│ │ ├── documents.json
|
||||
│ │ └── faiss.index
|
||||
│ ├── index_data_李四/
|
||||
│ │ ├── documents.json
|
||||
│ │ └── faiss.index
|
||||
│ └── index_data_王五/
|
||||
│ ├── documents.json
|
||||
│ └── faiss.index
|
||||
└── merge_index.py
|
||||
```
|
||||
|
||||
### 2. 执行合并
|
||||
|
||||
```bash
|
||||
python merge_index.py
|
||||
```
|
||||
|
||||
### 3. 验证结果
|
||||
|
||||
```bash
|
||||
python app.py
|
||||
# 访问 http://localhost:5000/api/stats 查看统计
|
||||
```
|
||||
|
||||
## 五、注意事项
|
||||
|
||||
1. **Ollama 必须运行** - 所有处理人员的电脑都需要运行 Ollama
|
||||
2. **模型要一致** - 都使用 `nomic-embed-text` 模型
|
||||
3. **避免重复文件** - 不同人员处理的 PDF 不要重复
|
||||
4. **大文件耐心等待** - 200MB 的 PDF 可能需要 30-60 分钟
|
||||
|
||||
## 六、常见问题
|
||||
|
||||
### Q: 处理中断了怎么办?
|
||||
A: 删除 `index_data/` 目录,重新运行 `batch_index.py`
|
||||
|
||||
### Q: 某个 PDF 处理失败怎么办?
|
||||
A: 检查 PDF 是否损坏,或尝试用其他工具转换格式
|
||||
|
||||
### Q: 合并后发现有重复怎么办?
|
||||
A: 合并脚本会自动去重(按文件名判断)
|
||||
|
||||
## 七、预估时间
|
||||
|
||||
| PDF 类型 | 大小 | 预估时间 |
|
||||
|---------|------|---------|
|
||||
| 文字版 PDF | 10MB | 1-2 分钟 |
|
||||
| 扫描版 PDF | 10MB | 5-10 分钟 |
|
||||
| 大型扫描版 | 200MB | 30-60 分钟 |
|
||||
|
|
@ -1,10 +1,12 @@
|
|||
package com.ddnai.system.rag.service;
|
||||
|
||||
import com.ddnai.system.rag.client.ChromaDBClient;
|
||||
import com.ddnai.system.rag.client.PythonRagClient;
|
||||
import com.ddnai.system.rag.config.RagProperties;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.IOException;
|
||||
|
|
@ -13,7 +15,7 @@ import java.util.stream.Collectors;
|
|||
|
||||
/**
|
||||
* RAG检索服务
|
||||
* 从ChromaDB检索相关文档片段
|
||||
* 优先使用Python RAG服务,如果不可用则回退到ChromaDB
|
||||
*
|
||||
* @author ddnai
|
||||
*/
|
||||
|
|
@ -22,17 +24,21 @@ public class RetrievalService {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(RetrievalService.class);
|
||||
|
||||
@Autowired
|
||||
@Autowired(required = false)
|
||||
private ChromaDBClient chromaDBClient;
|
||||
|
||||
@Autowired
|
||||
@Autowired(required = false)
|
||||
private EmbeddingService embeddingService;
|
||||
|
||||
@Autowired
|
||||
private RagProperties ragProperties;
|
||||
|
||||
@Autowired
|
||||
private PythonRagClient pythonRagClient;
|
||||
|
||||
/**
|
||||
* 语义检索相关文档
|
||||
* 优先使用Python RAG服务
|
||||
*
|
||||
* @param query 查询文本
|
||||
* @param topK 返回结果数量
|
||||
|
|
@ -45,6 +51,33 @@ public class RetrievalService {
|
|||
throw new IllegalArgumentException("Query cannot be null or empty");
|
||||
}
|
||||
|
||||
// 优先使用Python RAG服务
|
||||
if (pythonRagClient != null && pythonRagClient.isAvailable()) {
|
||||
log.info("Using Python RAG service for retrieval");
|
||||
try {
|
||||
List<Map<String, Object>> results = pythonRagClient.search(query, topK);
|
||||
log.info("Python RAG retrieved {} documents for query", results.size());
|
||||
// 打印每个结果的详细信息用于调试
|
||||
for (int i = 0; i < results.size(); i++) {
|
||||
Map<String, Object> result = results.get(i);
|
||||
log.info("Result {}: filename={}, content_length={}, keys={}",
|
||||
i + 1,
|
||||
result.get("filename"),
|
||||
result.get("content") != null ? result.get("content").toString().length() : 0,
|
||||
result.keySet());
|
||||
}
|
||||
return results;
|
||||
} catch (Exception e) {
|
||||
log.warn("Python RAG service failed, falling back to ChromaDB: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
// 回退到ChromaDB
|
||||
if (chromaDBClient == null || embeddingService == null) {
|
||||
log.warn("Neither Python RAG nor ChromaDB is available");
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
// 1. 向量化查询
|
||||
float[] queryEmbedding = embeddingService.embedText(query);
|
||||
|
||||
|
|
@ -57,7 +90,7 @@ public class RetrievalService {
|
|||
// 4. 过滤低相似度结果
|
||||
results = filterBySimilarity(results, threshold);
|
||||
|
||||
log.info("Retrieved {} documents for query (topK={}, threshold={})", results.size(), topK, threshold);
|
||||
log.info("ChromaDB retrieved {} documents for query (topK={}, threshold={})", results.size(), topK, threshold);
|
||||
return results;
|
||||
}
|
||||
|
||||
|
|
@ -75,6 +108,30 @@ public class RetrievalService {
|
|||
throw new IllegalArgumentException("Query cannot be null or empty");
|
||||
}
|
||||
|
||||
// 优先使用Python RAG服务(Python服务暂不支持分类过滤,直接搜索)
|
||||
if (pythonRagClient != null && pythonRagClient.isAvailable()) {
|
||||
log.info("Using Python RAG service for retrieval with filter");
|
||||
try {
|
||||
List<Map<String, Object>> results = pythonRagClient.search(query, topK);
|
||||
// 手动过滤分类
|
||||
if (category != null && !category.isEmpty()) {
|
||||
results = results.stream()
|
||||
.filter(r -> category.equals(r.get("category")))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
log.info("Python RAG retrieved {} documents with category filter '{}'", results.size(), category);
|
||||
return results;
|
||||
} catch (Exception e) {
|
||||
log.warn("Python RAG service failed, falling back to ChromaDB: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
// 回退到ChromaDB
|
||||
if (chromaDBClient == null || embeddingService == null) {
|
||||
log.warn("Neither Python RAG nor ChromaDB is available");
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
// 1. 向量化查询
|
||||
float[] queryEmbedding = embeddingService.embedText(query);
|
||||
|
||||
|
|
@ -94,7 +151,7 @@ public class RetrievalService {
|
|||
double threshold = ragProperties.getRetrieval().getSimilarityThreshold();
|
||||
results = filterBySimilarity(results, threshold);
|
||||
|
||||
log.info("Retrieved {} documents for query with category filter '{}' (topK={})",
|
||||
log.info("ChromaDB retrieved {} documents with category filter '{}' (topK={})",
|
||||
results.size(), category, topK);
|
||||
return results;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -140,6 +140,7 @@ public class PsyAssessmentReportServiceImpl implements IPsyAssessmentReportServi
|
|||
// 5. 计算每个因子的得分
|
||||
List<PsyFactorScore> factorScores = new ArrayList<>();
|
||||
BigDecimal totalScore = BigDecimal.ZERO;
|
||||
|
||||
for (PsyFactor factor : factors)
|
||||
{
|
||||
List<PsyFactorRule> rules = factorRulesMap.get(factor.getFactorId());
|
||||
|
|
|
|||
|
|
@ -38,7 +38,14 @@ PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
|
|||
<include refid="selectInterpretationVo"/>
|
||||
where
|
||||
(scale_id = #{scaleId} OR scale_id IS NULL)
|
||||
AND (factor_id = #{factorId} OR factor_id IS NULL)
|
||||
<choose>
|
||||
<when test="factorId != null">
|
||||
AND factor_id = #{factorId}
|
||||
</when>
|
||||
<otherwise>
|
||||
AND factor_id IS NULL
|
||||
</otherwise>
|
||||
</choose>
|
||||
AND #{score} >= score_range_min
|
||||
AND #{score} <= score_range_max
|
||||
order by sort_order
|
||||
|
|
|
|||
|
|
@ -419,21 +419,9 @@ export default {
|
|||
const API_URL = 'https://api.moonshot.cn/v1/chat/completions';
|
||||
const API_KEY = 'sk-U9fdriPxwBcrpWW0Ite3N0eVtX7VxnqqqYUIBAdWd1hgEA9m';
|
||||
const MODEL = 'moonshot-v1-32k';
|
||||
// 备用:Ollama本地大模型
|
||||
// const API_URL = 'http://192.168.0.106:11434/api/chat';
|
||||
// const API_KEY = ''; // 本地模型不需要API Key
|
||||
// const MODEL = 'deepseek-r1:32b';
|
||||
|
||||
// 构建系统提示词
|
||||
const SYSTEM_PROMPT = [
|
||||
'你是专业心理测评报告分析师,请根据用户提供的报告内容进行深度分析。要求:',
|
||||
'1. 提取报告的核心信息和关键指标;',
|
||||
'2. 分析测评结果的含义和可能的影响;',
|
||||
'3. 提供专业、客观、易懂的分析解读(500-800字);',
|
||||
'4. 使用结构化的格式输出,包含:核心结论、详细分析、建议、总体结论四个部分;',
|
||||
'5. 仅输出分析结果,不添加额外建议、问候语或思考过程;',
|
||||
'6. 使用HTML格式输出,使用<h3>标签作为小标题,<p>标签作为段落。'
|
||||
].join('\n');
|
||||
// RAG知识库服务地址
|
||||
const RAG_API_URL = 'http://localhost:5000/api/rag-analyze';
|
||||
|
||||
// 构建完整的提示词
|
||||
const reportContent = this.reportForm.reportContent || '';
|
||||
|
|
@ -443,9 +431,45 @@ export default {
|
|||
// 提取纯文本内容(去除HTML标签)
|
||||
const textContent = reportContent.replace(/<[^>]*>/g, '').substring(0, 3000);
|
||||
|
||||
const userPrompt = `重要:请直接输出结果,不要包含任何思考过程、<think>标签或<think>标签。\n\n报告标题:${reportTitle}\n报告类型:${reportType}\n报告内容:${textContent}`;
|
||||
|
||||
try {
|
||||
// 1. 先调用RAG服务获取知识库上下文
|
||||
let knowledgeContext = '';
|
||||
let ragSources = [];
|
||||
try {
|
||||
const ragResponse = await axios.post(RAG_API_URL, {
|
||||
reportContent: reportContent,
|
||||
reportTitle: reportTitle
|
||||
}, { timeout: 10000 });
|
||||
|
||||
if (ragResponse.data && ragResponse.data.success) {
|
||||
knowledgeContext = ragResponse.data.data.knowledgeContext || '';
|
||||
ragSources = ragResponse.data.data.sources || [];
|
||||
console.log('RAG检索成功,获取到', ragSources.length, '条参考资料');
|
||||
}
|
||||
} catch (ragErr) {
|
||||
console.warn('RAG服务调用失败,将不使用知识库增强:', ragErr.message);
|
||||
// RAG失败不影响AI分析,继续执行
|
||||
}
|
||||
|
||||
// 2. 构建系统提示词(包含知识库上下文)
|
||||
let SYSTEM_PROMPT = [
|
||||
'你是专业心理测评报告分析师,请根据用户提供的报告内容进行深度分析。要求:',
|
||||
'1. 提取报告的核心信息和关键指标;',
|
||||
'2. 分析测评结果的含义和可能的影响;',
|
||||
'3. 提供专业、客观、易懂的分析解读(500-800字);',
|
||||
'4. 使用结构化的格式输出,包含:核心结论、详细分析、建议、总体结论四个部分;',
|
||||
'5. 仅输出分析结果,不添加额外建议、问候语或思考过程;',
|
||||
'6. 使用HTML格式输出,使用<h3>标签作为小标题,<p>标签作为段落。'
|
||||
].join('\n');
|
||||
|
||||
// 如果有知识库上下文,添加到系统提示词中
|
||||
if (knowledgeContext) {
|
||||
SYSTEM_PROMPT += '\n\n【专业知识库参考资料】\n' + knowledgeContext + '\n\n请结合以上专业资料进行分析,使分析更加专业和有深度。';
|
||||
}
|
||||
|
||||
const userPrompt = `重要:请直接输出结果,不要包含任何思考过程、<think>标签或<think>标签。\n\n报告标题:${reportTitle}\n报告类型:${reportType}\n报告内容:${textContent}`;
|
||||
|
||||
// 3. 调用AI API
|
||||
const { data } = await axios.post(API_URL, {
|
||||
model: MODEL,
|
||||
messages: [
|
||||
|
|
@ -453,7 +477,7 @@ export default {
|
|||
{ role: 'user', content: userPrompt }
|
||||
],
|
||||
temperature: 0.2,
|
||||
max_tokens: 1000,
|
||||
max_tokens: 1500,
|
||||
stream: false
|
||||
}, {
|
||||
headers: {
|
||||
|
|
@ -484,7 +508,20 @@ export default {
|
|||
}
|
||||
|
||||
// 格式化结果,确保HTML格式正确
|
||||
this.aiResult = this.formatAIResult(rawResponse);
|
||||
let formattedResult = this.formatAIResult(rawResponse);
|
||||
|
||||
// 4. 如果有知识库来源,添加参考资料说明
|
||||
if (ragSources && ragSources.length > 0) {
|
||||
formattedResult += '<div class="rag-sources" style="margin-top: 20px; padding: 15px; background: #f5f7fa; border-radius: 8px;">';
|
||||
formattedResult += '<h4 style="margin: 0 0 10px 0; color: #409EFF;"><i class="el-icon-document"></i> 参考知识库资料</h4>';
|
||||
formattedResult += '<ul style="margin: 0; padding-left: 20px; color: #666;">';
|
||||
ragSources.forEach((source, index) => {
|
||||
formattedResult += `<li style="margin-bottom: 5px;"><strong>${source.filename}</strong></li>`;
|
||||
});
|
||||
formattedResult += '</ul></div>';
|
||||
}
|
||||
|
||||
this.aiResult = formattedResult;
|
||||
|
||||
// 保存AI分析结果到数据库
|
||||
try {
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user