@app.route('/api/documents', methods=['GET'])
def list_documents():
    """Return every indexed document wrapped in the service's JSON envelope.

    Returns:
        ``{'success': True, 'data': [...]}`` on success, or
        ``{'success': False, 'error': <message>}`` with HTTP 500 when the
        knowledge service (or serialisation) raises.
    """
    try:
        docs = knowledge_service.list_documents()
        response = jsonify({'success': True, 'data': docs})
    except Exception as exc:
        response = jsonify({'success': False, 'error': str(exc)}), 500
    return response
# The URL rule must declare the <filename> variable: the view function takes a
# required ``filename`` argument, so a rule without it can never call the view
# successfully.
@app.route('/api/documents/<filename>', methods=['DELETE'])
def delete_document(filename):
    """Delete a document's index entries and its file from the knowledge folder.

    Args:
        filename: Name of the document, taken from the last URL segment.

    Returns:
        ``{'success': <bool>, 'data': <service result>}`` mirroring the
        ``success`` flag reported by the knowledge service, or
        ``{'success': False, 'error': <message>}`` with HTTP 500 if the
        service raises.
    """
    try:
        result = knowledge_service.delete_document(filename)
        return jsonify({
            'success': result['success'],
            'data': result
        })
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500
def init_service():
    """Bootstrap the RAG service before the Flask app starts serving.

    Loads the existing index, indexes any files not yet indexed, then starts
    the folder watcher and stores it in the module-level ``file_watcher``.
    """
    global file_watcher

    banner = "=" * 50
    for line in (banner, "RAG 知识库服务启动中...", banner):
        print(line)

    # Load the persisted index first so the scan only picks up new files.
    knowledge_service.init()
    knowledge_service.scan_and_index_folder()

    # Publish the watcher globally before starting it, as callers expect.
    watcher = FileWatcher(knowledge_service)
    file_watcher = watcher
    watcher.start()

    for line in (
        banner,
        f"服务已启动: http://{HOST}:{PORT}",
        f"知识库文件夹: {KNOWLEDGE_DIR}",
        banner,
    ):
        print(line)
def format_time(seconds):
    """Render a duration given in seconds as a short one-decimal string.

    Durations under a minute are shown in seconds, under an hour in minutes,
    and anything else in hours.
    """
    # (exclusive upper bound in seconds, divisor, unit suffix)
    scales = ((60, 1, "秒"), (3600, 60, "分钟"))
    for upper_bound, divisor, unit in scales:
        if seconds < upper_bound:
            return f"{seconds / divisor:.1f}{unit}"
    return f"{seconds / 3600:.1f}小时"
fail_count = 0 + + for i, (filename, file_path, file_size) in enumerate(files_to_process): + print() + print(f"[{i+1}/{len(files_to_process)}] 处理: {filename}") + print("-" * 40) + + file_start = time.time() + + try: + # 解析文档 + print("解析文档...") + content = parse_document(file_path) + + if not content or not content.strip(): + print(f" 警告: 文档内容为空,跳过") + fail_count += 1 + continue + + char_count = len(content) + print(f" 提取文本: {char_count} 字符") + print(f" 预计处理时间: {estimate_time(char_count)}") + + # 分块 + print("分块处理...") + chunks = split_text(content) + print(f" 生成 {len(chunks)} 个文本块") + + # 向量化 + print("向量化处理...") + metadata = { + 'filename': filename, + 'file_path': file_path, + 'char_count': char_count + } + + added = vector_store.add_documents(chunks, metadata) + + file_time = time.time() - file_start + print(f" 完成! 耗时: {format_time(file_time)}") + success_count += 1 + + except Exception as e: + print(f" 错误: {e}") + fail_count += 1 + + # 总结 + total_time = time.time() - total_start + print() + print("=" * 60) + print("处理完成!") + print("=" * 60) + print(f"成功: {success_count} 个文件") + print(f"失败: {fail_count} 个文件") + print(f"总耗时: {format_time(total_time)}") + + # 显示最终统计 + final_stats = vector_store.get_stats() + print(f"索引总文件数: {final_stats['total_files']}") + print(f"索引总文本块: {final_stats['total_chunks']}") + +if __name__ == '__main__': + batch_index() diff --git a/rag-python/config.py b/rag-python/config.py new file mode 100644 index 00000000..6bdff42d --- /dev/null +++ b/rag-python/config.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +""" +RAG 服务配置文件 +支持与 jar 包同级目录部署 +使用本地 Ollama 进行向量化 +""" +import os +import sys + +# 服务配置 +HOST = "0.0.0.0" +PORT = 5000 + +# Ollama 配置(使用本地 Ollama 生成向量) +OLLAMA_URL = "http://localhost:11434" +OLLAMA_EMBED_MODEL = "nomic-embed-text" # 你已经下载的嵌入模型 + +# 获取程序运行目录(支持打包后部署) +# 当与 jar 包同级部署时,BASE_DIR 就是 rag-python 文件夹 +if getattr(sys, 'frozen', False): + # 如果是打包后的 exe + BASE_DIR = os.path.dirname(sys.executable) +else: + # 正常 Python 运行 + 
def detect_encoding(file_path):
    """Guess the text encoding of ``file_path`` from its first 10,000 bytes.

    The raw chardet guess is widened to a superset codec where one exists,
    because only a prefix of the file is sampled and ``parse_txt`` opens the
    file with ``errors='ignore'``: a too-narrow guess would silently drop
    every character outside it.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A codec name usable with ``open(..., encoding=...)``; ``'utf-8'`` when
        chardet cannot decide.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read(10000)

    detected = chardet.detect(raw_data).get('encoding')
    if not detected:
        return 'utf-8'

    normalized = detected.lower()
    # A pure-ASCII sample says nothing about the rest of the file; UTF-8
    # decodes ASCII identically and keeps any later non-ASCII text.
    if normalized == 'ascii':
        return 'utf-8'
    # GB18030 is a superset of GB2312/GBK, so it decodes everything they do
    # plus characters the narrower guess would lose.
    if normalized in ('gb2312', 'gbk'):
        return 'gb18030'
    return detected
def parse_document(file_path):
    """Extract plain text from ``file_path`` using the parser for its extension.

    Returns an empty string (after printing a notice) when the extension is
    not in ``SUPPORTED_EXTENSIONS``.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix not in SUPPORTED_EXTENSIONS:
        print(f"不支持的文件类型: {suffix}")
        return ""

    if suffix == '.pdf':
        handler = parse_pdf
    elif suffix in ('.docx', '.doc'):
        handler = parse_docx
    elif suffix == '.md':
        handler = parse_md
    else:
        # '.txt' and any other supported extension fall back to plain text.
        handler = parse_txt
    return handler(file_path)
def _debounce_process(self, path, action):
    """Schedule ``action`` for ``path`` once it has been quiet for ``self.debounce_seconds``.

    Every call records a fresh pending entry and starts its own timer thread.
    A timer only acts if its entry is still the one registered for the path;
    if a newer event replaced it, the newer event's timer does the work. This
    makes the delay count from the *last* event rather than the first.

    Args:
        path: Filesystem path the event refers to.
        action: ``'add'``, ``'delete'`` or ``'modify'``; passed to ``_do_process``.
    """
    filename = os.path.basename(path)

    # Files being handled by the upload endpoint are indexed there.
    if is_uploading(filename):
        print(f"[文件监控] 跳过正在上传的文件: {filename}")
        return

    entry = {
        'action': action,
        'time': time.time()
    }
    with self._lock:
        self.pending_files[path] = entry

    def delayed_process():
        time.sleep(self.debounce_seconds)
        with self._lock:
            # Identity check: a later event for the same path stored a new
            # entry, so this timer is stale and must not fire early.
            if self.pending_files.get(path) is not entry:
                return
            del self.pending_files[path]
            # Processing stays under the lock, as before, so that index
            # updates triggered by the watcher remain serialized.
            if not is_uploading(os.path.basename(path)):
                self._do_process(path, entry['action'])

    threading.Thread(target=delayed_process, daemon=True).start()
class FileWatcher:
    """Owns the watchdog observer that monitors the knowledge folder."""

    def __init__(self, knowledge_service):
        self.knowledge_service = knowledge_service
        self.observer = None   # created lazily by start()
        self.running = False

    def start(self):
        """Begin watching ``KNOWLEDGE_DIR`` recursively; no-op if already running."""
        if self.running:
            return

        print(f"[文件监控] 开始监控文件夹: {KNOWLEDGE_DIR}")

        event_handler = KnowledgeFileHandler(self.knowledge_service)
        observer = self.observer = Observer()
        observer.schedule(event_handler, KNOWLEDGE_DIR, recursive=True)
        observer.start()
        self.running = True

    def stop(self):
        """Stop the observer (if one was started) and wait for its thread to exit."""
        active = self.observer
        if active:
            active.stop()
            active.join()
        self.running = False
        print("[文件监控] 已停止")
def scan_and_index_folder(self):
    """Walk the knowledge folder and index every supported file not yet indexed.

    Used at start-up and for manual scans.

    Returns:
        dict: ``{'scanned': <new files found>, 'indexed': <files indexed OK>}``.
    """
    print(f"开始扫描知识库文件夹: {KNOWLEDGE_DIR}")

    # Names already present in the index.
    stats = self.vector_store.get_stats()
    indexed_files = set(stats.get('files', []))

    new_files = []
    for root, _dirs, files in os.walk(KNOWLEDGE_DIR):
        for filename in files:
            if not is_supported_file(filename):
                continue
            file_path = os.path.join(root, filename)
            rel_path = os.path.relpath(file_path, KNOWLEDGE_DIR)

            # add_document() is called with the bare filename below, and that
            # is what ends up in the index. Comparing only rel_path made
            # files in sub-folders look new on every scan and re-indexed
            # them. rel_path is still checked so any entry stored under a
            # relative path is honoured as well.
            if filename in indexed_files or rel_path in indexed_files:
                continue
            new_files.append((filename, file_path, rel_path))

    indexed_count = 0
    for filename, file_path, rel_path in new_files:
        try:
            result = self.add_document(file_path, filename)
            if result['success']:
                indexed_count += 1
                print(f"  已索引: {rel_path}")
            else:
                # Previously a non-exception failure produced no output at all.
                print(f"  索引失败 {rel_path}: {result.get('error')}")
        except Exception as e:
            print(f"  索引失败 {rel_path}: {e}")

    print(f"扫描完成,新索引 {indexed_count} 个文件")
    return {
        'scanned': len(new_files),
        'indexed': indexed_count
    }
def upload_and_index(self, file_storage, copy_to_knowledge=True):
    """Save an uploaded file, index it, and optionally keep it in the knowledge folder.

    Args:
        file_storage: Flask ``FileStorage`` object for the upload.
        copy_to_knowledge: When true, copy the file into ``KNOWLEDGE_DIR``
            after it has been indexed successfully.

    Returns:
        dict: The ``add_document`` result (plus ``saved_to`` when copied), or
        ``{'success': False, 'error': ...}`` for an invalid or unsupported name.
    """
    import threading
    from file_watcher import mark_uploading, unmark_uploading

    # The filename is chosen by the client. Keep only the last path component
    # (treating backslashes as separators too) so that names such as
    # "../../x.txt" cannot escape UPLOAD_DIR / KNOWLEDGE_DIR.
    raw_name = file_storage.filename or ''
    filename = os.path.basename(raw_name.replace('\\', '/'))
    if not filename or filename in ('.', '..'):
        return {'success': False, 'error': '文件名无效'}

    if not is_supported_file(filename):
        return {'success': False, 'error': '不支持的文件类型'}

    # Tell the file watcher to ignore events for this name while we work.
    mark_uploading(filename)
    temp_path = os.path.join(UPLOAD_DIR, filename)

    try:
        # save() is inside the try so a failed write still clears the mark.
        file_storage.save(temp_path)

        result = self.add_document(temp_path, filename)

        if result['success'] and copy_to_knowledge:
            dest_path = os.path.join(KNOWLEDGE_DIR, filename)
            shutil.copy2(temp_path, dest_path)
            result['saved_to'] = dest_path

        return result
    finally:
        if os.path.exists(temp_path):
            os.remove(temp_path)

        # Clear the mark a little later so the watcher's debounced events for
        # the copied file are still ignored.
        timer = threading.Timer(5, unmark_uploading, args=(filename,))
        timer.daemon = True
        timer.start()
def _overlap_tail(chunk, overlap):
    """Return the last ``overlap`` characters of ``chunk``, or "" when none should be carried over."""
    # ``chunk[-0:]`` is the whole string, so a zero overlap needs its own case.
    # A chunk no longer than the overlap is not repeated in full either.
    if overlap <= 0 or len(chunk) <= overlap:
        return ""
    return chunk[-overlap:]


def split_text(text, chunk_size=None, chunk_overlap=None):
    """Split ``text`` into chunks of at most ``chunk_size`` characters.

    Paragraphs (separated by blank lines) are packed together, joined by a
    newline. A paragraph longer than ``chunk_size`` is split on sentence
    punctuation, and a sentence that is still too long is cut into
    fixed-size pieces. When a chunk is closed, the next one starts with the
    previous chunk's last ``chunk_overlap`` characters if that still fits
    within ``chunk_size``.

    Args:
        text: Text to split.
        chunk_size: Maximum characters per chunk; defaults to ``CHUNK_SIZE``.
        chunk_overlap: Characters repeated between consecutive chunks;
            defaults to ``CHUNK_OVERLAP``. Values <= 0 disable the overlap.

    Returns:
        list[str]: Non-empty, stripped chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = CHUNK_OVERLAP
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    if not text or not text.strip():
        return []

    # Normalise whitespace: collapse runs of blank lines and of spaces.
    text = text.strip()
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)

    chunks = []

    def flush(chunk):
        cleaned = chunk.strip()
        if cleaned:
            chunks.append(cleaned)

    current = ""
    for para in re.split(r'\n\n+', text):
        para = para.strip()
        if not para:
            continue

        if len(para) > chunk_size:
            # Close whatever was being built; the long paragraph starts fresh.
            flush(current)
            current = ""

            # The capturing group keeps each delimiter so it can be glued
            # back onto the sentence it ends.
            parts = re.split(r'([。！？.!?])', para)
            for i in range(0, len(parts), 2):
                sentence = parts[i] + (parts[i + 1] if i + 1 < len(parts) else "")
                # Hard-split sentences that alone exceed chunk_size, so no
                # chunk is ever longer than the limit.
                for start in range(0, len(sentence), chunk_size):
                    piece = sentence[start:start + chunk_size]
                    if len(current) + len(piece) <= chunk_size:
                        current += piece
                        continue
                    flush(current)
                    # Overlap comes from the chunk just closed, not from the
                    # new sentence.
                    tail = _overlap_tail(current, chunk_overlap)
                    current = tail + piece if len(tail) + len(piece) <= chunk_size else piece
        else:
            joined = current + "\n" + para if current else para
            if len(joined) <= chunk_size:
                current = joined
                continue
            flush(current)
            tail = _overlap_tail(current, chunk_overlap)
            candidate = tail + "\n" + para
            current = candidate if tail and len(candidate) <= chunk_size else para

    flush(current)
    return chunks
def _embed_with_ollama(self, text, retry_count=3):
    """Request an embedding for ``text`` from Ollama, retrying on failure.

    Args:
        text: Text to embed. Non-string or empty input is replaced by the
            placeholder ``"empty"``; input longer than 1000 characters is
            truncated.
        retry_count: Number of attempts (at least one is always made).

    Returns:
        list[float]: The embedding vector (never empty).

    Raises:
        Exception: The error from the last failed attempt, including a
        ``ValueError`` when the response carries no embedding.
    """
    import time
    import urllib.request
    import urllib.error

    url = f"{OLLAMA_URL}/api/embeddings"

    # Strip NUL characters first, then fall back to a placeholder so the
    # request never carries an empty prompt.
    text = text.replace('\x00', '') if isinstance(text, str) else ''
    if not text:
        text = "empty"

    # Conservative character cap to stay inside the model's context window.
    max_length = 1000
    if len(text) > max_length:
        text = text[:max_length]

    payload = {
        "model": OLLAMA_EMBED_MODEL,
        "prompt": text
    }
    # The body is identical for every attempt, so encode it once.
    data = json.dumps(payload, ensure_ascii=False).encode('utf-8')

    # Guarantee one attempt; otherwise ``raise last_error`` would raise None.
    attempts = max(1, retry_count)
    last_error = None
    for attempt in range(attempts):
        try:
            req = urllib.request.Request(
                url,
                data=data,
                headers={'Content-Type': 'application/json; charset=utf-8'},
                method='POST'
            )
            with urllib.request.urlopen(req, timeout=120) as response:
                result = json.loads(response.read().decode('utf-8'))

            embedding = result.get("embedding")
            if not embedding:
                # An empty vector would otherwise flow into the index code
                # and be mistaken for a change of vector dimension.
                raise ValueError(f"Ollama 返回了空向量: {str(result)[:200]}")
            return embedding

        except urllib.error.HTTPError as e:
            last_error = e
            error_body = e.read().decode('utf-8', errors='replace') if e.fp else 'N/A'
            print(f"Ollama HTTP 错误 (尝试 {attempt+1}/{attempts}): {e.code} {e.reason}")
            print(f"响应内容: {error_body[:500]}")
            print(f"请求文本长度: {len(text)}")
        except Exception as e:
            last_error = e
            print(f"Ollama 嵌入失败 (尝试 {attempt+1}/{attempts}): {e}")

        # Linear back-off between attempts: 2s, 4s, ...
        if attempt < attempts - 1:
            wait_time = (attempt + 1) * 2
            print(f"等待 {wait_time} 秒后重试...")
            time.sleep(wait_time)

    raise last_error


def _embed_batch(self, texts, delay=1.0):
    """Embed ``texts`` one by one and return the vectors in the same order.

    Args:
        texts: Sequence of strings to embed.
        delay: Seconds to pause between consecutive requests so the Ollama
            server is not flooded; ``0`` disables the pause.

    Returns:
        list[list[float]]: One embedding per input text.
    """
    import time

    embeddings = []
    total = len(texts)
    for i, text in enumerate(texts):
        # Debug output: progress plus a short preview of the chunk.
        print(f"  生成向量 {i+1}/{total}...")
        print(f"    文本长度: {len(text)}, 前50字符: {repr(text[:50])}")
        embeddings.append(self._embed_with_ollama(text))
        if delay > 0 and i < total - 1:
            time.sleep(delay)
    return embeddings
def add_documents(self, chunks, metadata=None):
    """Embed ``chunks`` and append them to the FAISS index and document list.

    Args:
        chunks: List of text chunks to add.
        metadata: Dict attached to every chunk (copied per chunk).

    Returns:
        int: Number of chunks added (0 for empty input).

    Raises:
        ValueError: If the embeddings are empty or ragged, or if their
            dimension differs from a non-empty existing index.
    """
    if not chunks:
        return 0

    self._load_faiss()
    self._init_index()

    print(f"正在为 {len(chunks)} 个文本块生成向量...")
    embeddings = self._embed_batch(chunks)

    # All vectors must share one non-zero dimension before they go into numpy.
    dims = {len(vec) for vec in embeddings}
    if len(dims) != 1 or 0 in dims:
        raise ValueError(f"向量维度不一致或为空: {sorted(dims)}")
    new_dim = dims.pop()

    # Compare with the index's real dimension, not only self.dimension, which
    # keeps its constructor default until this method updates it.
    if new_dim != self.index.d:
        if self.index.ntotal > 0:
            # Replacing a populated index would drop its vectors while
            # self.documents keeps the old entries, so row i would no longer
            # be document i.
            raise ValueError(
                f"向量维度 {new_dim} 与现有索引维度 {self.index.d} 不一致,请先重建索引"
            )
        self.index = self.faiss.IndexFlatIP(new_dim)
        print(f"更新向量维度为: {new_dim}")
    self.dimension = new_dim

    # L2-normalise so inner product equals cosine similarity.
    embeddings_np = np.array(embeddings).astype('float32')
    norms = np.linalg.norm(embeddings_np, axis=1, keepdims=True)
    embeddings_np = embeddings_np / (norms + 1e-10)

    # Index rows and self.documents grow in lock-step; search() relies on it.
    start_idx = len(self.documents)
    self.index.add(embeddings_np)

    for i, chunk in enumerate(chunks):
        self.documents.append({
            'id': start_idx + i,
            'content': chunk,
            # Per-chunk copy so entries do not share one mutable dict.
            'metadata': dict(metadata or {})
        })

    self.save_index()

    return len(chunks)
def delete_by_filename(self, filename):
    """Remove every chunk whose metadata ``filename`` equals ``filename``.

    The index is rebuilt from the vectors it already stores, so no embedding
    requests are made. Re-embedding is only a fallback for when the index and
    the document list are out of step. The new index replaces the old one
    only after it has been built, so a failure leaves the store untouched.

    Args:
        filename: Filename whose chunks should be removed.

    Returns:
        int: Number of chunks removed.
    """
    if not self.documents:
        return 0

    keep_positions = [
        pos for pos, doc in enumerate(self.documents)
        if doc.get('metadata', {}).get('filename') != filename
    ]
    deleted_count = len(self.documents) - len(keep_positions)
    if deleted_count == 0:
        return 0

    self._load_faiss()
    remaining_docs = [self.documents[pos] for pos in keep_positions]

    if not remaining_docs:
        new_index = self.faiss.IndexFlatIP(self.dimension)
    else:
        if self.index is not None and self.index.ntotal == len(self.documents):
            # Row i of the index is documents[i] (add_documents appends both
            # together), so the kept rows can be copied out directly. They
            # were normalised when first added.
            stored = self.index.reconstruct_n(0, self.index.ntotal)
            vectors = np.ascontiguousarray(stored[keep_positions], dtype='float32')
        else:
            # Index and documents disagree: regenerate vectors from the text.
            embeddings = self._embed_batch([doc['content'] for doc in remaining_docs])
            vectors = np.array(embeddings).astype('float32')
            norms = np.linalg.norm(vectors, axis=1, keepdims=True)
            vectors = vectors / (norms + 1e-10)
        new_index = self.faiss.IndexFlatIP(vectors.shape[1])
        new_index.add(vectors)

    # Swap in the rebuilt state and renumber ids to match the new row order.
    self.index = new_index
    self.documents = [
        {
            'id': i,
            'content': doc['content'],
            'metadata': doc.get('metadata', {})
        }
        for i, doc in enumerate(remaining_docs)
    ]
    self.save_index()

    return deleted_count
com.ddnai.common.core.controller.BaseController; +import com.ddnai.common.core.domain.AjaxResult; +import com.ddnai.system.rag.client.PythonRagClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.security.access.prepost.PreAuthorize; +import org.springframework.web.bind.annotation.*; +import org.springframework.web.multipart.MultipartFile; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * RAG 知识库管理Controller (调用Python服务) + * + * @author ddnai + */ +@RestController +@RequestMapping("/psychology/rag") +public class PsyRagController extends BaseController { + + private static final Logger log = LoggerFactory.getLogger(PsyRagController.class); + + @Autowired + private PythonRagClient pythonRagClient; + + /** + * 检查服务状态 + */ + @GetMapping("/status") + public AjaxResult checkStatus() { + boolean available = pythonRagClient.isAvailable(); + Map result = new HashMap<>(); + result.put("available", available); + result.put("message", available ? 
"Python RAG 服务运行中" : "Python RAG 服务未启动"); + + if (available) { + result.put("stats", pythonRagClient.getStats()); + } + + return AjaxResult.success(result); + } + + /** + * 上传文档 + */ + @PreAuthorize("@ss.hasPermi('psychology:knowledge:upload')") + @PostMapping("/upload") + public AjaxResult uploadDocument(@RequestParam("file") MultipartFile file) { + try { + if (!pythonRagClient.isAvailable()) { + return AjaxResult.error("Python RAG 服务未启动,请先启动 rag-service"); + } + + Map result = pythonRagClient.uploadDocument(file); + + if (Boolean.TRUE.equals(result.get("success"))) { + return AjaxResult.success("文档上传成功", result.get("data")); + } else { + return AjaxResult.error("上传失败: " + result.get("error")); + } + } catch (Exception e) { + log.error("文档上传失败", e); + return AjaxResult.error("文档上传失败:" + e.getMessage()); + } + } + + /** + * 获取文档列表 + */ + @PreAuthorize("@ss.hasPermi('psychology:knowledge:list')") + @GetMapping("/documents") + public AjaxResult listDocuments() { + try { + if (!pythonRagClient.isAvailable()) { + return AjaxResult.error("Python RAG 服务未启动"); + } + + List> documents = pythonRagClient.listDocuments(); + return AjaxResult.success(documents); + } catch (Exception e) { + log.error("获取文档列表失败", e); + return AjaxResult.error("获取文档列表失败:" + e.getMessage()); + } + } + + /** + * 删除文档 + */ + @PreAuthorize("@ss.hasPermi('psychology:knowledge:remove')") + @DeleteMapping("/documents/{filename}") + public AjaxResult deleteDocument(@PathVariable String filename) { + try { + if (!pythonRagClient.isAvailable()) { + return AjaxResult.error("Python RAG 服务未启动"); + } + + Map result = pythonRagClient.deleteDocument(filename); + + if (Boolean.TRUE.equals(result.get("success"))) { + return AjaxResult.success("删除成功"); + } else { + return AjaxResult.error("删除失败: " + result.get("error")); + } + } catch (Exception e) { + log.error("删除文档失败", e); + return AjaxResult.error("删除文档失败:" + e.getMessage()); + } + } + + /** + * 搜索文档 + */ + 
@PreAuthorize("@ss.hasPermi('psychology:knowledge:list')") + @PostMapping("/search") + public AjaxResult search(@RequestBody Map params) { + try { + if (!pythonRagClient.isAvailable()) { + return AjaxResult.error("Python RAG 服务未启动"); + } + + String query = (String) params.get("query"); + Integer topK = params.get("topK") != null ? (Integer) params.get("topK") : 5; + + if (query == null || query.trim().isEmpty()) { + return AjaxResult.error("查询内容不能为空"); + } + + List> results = pythonRagClient.search(query, topK); + return AjaxResult.success(results); + } catch (Exception e) { + log.error("搜索失败", e); + return AjaxResult.error("搜索失败:" + e.getMessage()); + } + } + + /** + * 获取统计信息 + */ + @PreAuthorize("@ss.hasPermi('psychology:knowledge:list')") + @GetMapping("/stats") + public AjaxResult getStats() { + try { + if (!pythonRagClient.isAvailable()) { + Map result = new HashMap<>(); + result.put("available", false); + result.put("message", "Python RAG 服务未启动"); + return AjaxResult.success(result); + } + + Map stats = pythonRagClient.getStats(); + stats.put("available", true); + return AjaxResult.success(stats); + } catch (Exception e) { + log.error("获取统计信息失败", e); + return AjaxResult.error("获取统计信息失败:" + e.getMessage()); + } + } + + /** + * 重建索引 + */ + @PreAuthorize("@ss.hasPermi('psychology:knowledge:rebuild')") + @PostMapping("/rebuild") + public AjaxResult rebuildIndex() { + try { + if (!pythonRagClient.isAvailable()) { + return AjaxResult.error("Python RAG 服务未启动"); + } + + Map result = pythonRagClient.rebuildIndex(); + + if (Boolean.TRUE.equals(result.get("success"))) { + return AjaxResult.success("索引重建成功", result.get("data")); + } else { + return AjaxResult.error("重建失败: " + result.get("error")); + } + } catch (Exception e) { + log.error("重建索引失败", e); + return AjaxResult.error("重建索引失败:" + e.getMessage()); + } + } + + /** + * 扫描文件夹 + */ + @PreAuthorize("@ss.hasPermi('psychology:knowledge:upload')") + @PostMapping("/scan") + public AjaxResult scanFolder() { + try { + if 
(!pythonRagClient.isAvailable()) { + return AjaxResult.error("Python RAG 服务未启动"); + } + + Map result = pythonRagClient.scanFolder(); + + if (Boolean.TRUE.equals(result.get("success"))) { + return AjaxResult.success("扫描完成", result.get("data")); + } else { + return AjaxResult.error("扫描失败: " + result.get("error")); + } + } catch (Exception e) { + log.error("扫描文件夹失败", e); + return AjaxResult.error("扫描文件夹失败:" + e.getMessage()); + } + } +} diff --git a/ry-xinli-admin/src/main/resources/application.yml b/ry-xinli-admin/src/main/resources/application.yml index 9cb65887..babcae05 100644 --- a/ry-xinli-admin/src/main/resources/application.yml +++ b/ry-xinli-admin/src/main/resources/application.yml @@ -143,40 +143,38 @@ xss: # RAG知识库配置 rag: - # 使用模式: openai(外部API)、ollama(本地) 或 hybrid(混合模式) - mode: hybrid + # Python RAG 服务配置(主要使用这个) + python: + url: http://localhost:5000 + enabled: true - # OpenAI兼容API配置(用于文本生成) + # 禁用 Java 端的 RAG 功能,全部由 Python 服务处理 + # 使用模式: disabled(禁用Java端)、python(仅Python) + mode: disabled + + # OpenAI兼容API配置(仅用于AI报告生成,不用于RAG) openai: - # Kimi API (Moonshot) - 你现有的API base-url: https://api.moonshot.cn/v1 - # 你的Kimi API Key api-key: sk-U9fdriPxwBcrpWW0Ite3N0eVtX7VxnqqqYUIBAdWd1hgEA9m - # 嵌入模型(混合模式下不使用,由Ollama提供) - embed-model: BAAI/bge-large-zh-v1.5 - # 生成模型(用于AI报告生成) + embed-model: none generate-model: moonshot-v1-32k - # 连接超时时间(秒) - connect-timeout: 30 - # 读取超时时间(秒) - read-timeout: 300 + connect-timeout: 10 + read-timeout: 60 - # Ollama配置(用于本地嵌入) + # Ollama配置(禁用) ollama: url: http://localhost:11434 - # 嵌入模型(已下载) - embed-model: nomic-embed-text - # 生成模型(混合模式下不使用,由OpenAI API提供) - generate-model: deepseek-r1:32b - # 连接超时时间(秒) - connect-timeout: 30 - # 读取超时时间(秒) - read-timeout: 300 + embed-model: none + generate-model: none + connect-timeout: 5 + read-timeout: 30 + enabled: false - # ChromaDB配置(本地部署,可选) + # ChromaDB配置(禁用) chromadb: url: http://localhost:8000 collection: psychology_knowledge + enabled: false # 存储配置 storage: @@ -184,9 +182,9 @@ rag: log-path: 
D:/wwwroot/RAG/logs chroma-data-path: D:/wwwroot/RAG/data/chroma_db - # 文件监听配置 + # 文件监听配置(禁用) file-watcher: - enabled: false # 默认关闭,避免自动处理 + enabled: false watch-path: D:/wwwroot/RAG/uploads scan-interval: 10 diff --git a/ry-xinli-admin/src/main/resources/banner.txt b/ry-xinli-admin/src/main/resources/banner.txt new file mode 100644 index 00000000..528be7f7 --- /dev/null +++ b/ry-xinli-admin/src/main/resources/banner.txt @@ -0,0 +1,3 @@ +Application Version: ${ruoyi.version} +Spring Boot Version: ${spring-boot.version} +// AI心理健康测评系统 永不宕机 永无BUG // diff --git a/ry-xinli-system/src/main/java/com/ddnai/system/rag/client/ChromaDBClient.java b/ry-xinli-system/src/main/java/com/ddnai/system/rag/client/ChromaDBClient.java index 62359252..6e7c0c67 100644 --- a/ry-xinli-system/src/main/java/com/ddnai/system/rag/client/ChromaDBClient.java +++ b/ry-xinli-system/src/main/java/com/ddnai/system/rag/client/ChromaDBClient.java @@ -40,22 +40,22 @@ public class ChromaDBClient { this.baseUrl = ragProperties.getChromadb().getUrl(); this.collectionName = ragProperties.getChromadb().getCollection(); - // 创建OkHttpClient实例 + // 创建OkHttpClient实例,使用较短的超时时间 this.httpClient = new OkHttpClient.Builder() - .connectTimeout(ragProperties.getChromadb().getConnectTimeout(), TimeUnit.SECONDS) - .readTimeout(ragProperties.getChromadb().getReadTimeout(), TimeUnit.SECONDS) - .writeTimeout(ragProperties.getChromadb().getConnectTimeout(), TimeUnit.SECONDS) - .retryOnConnectionFailure(true) + .connectTimeout(5, TimeUnit.SECONDS) + .readTimeout(10, TimeUnit.SECONDS) + .writeTimeout(5, TimeUnit.SECONDS) + .retryOnConnectionFailure(false) .build(); log.info("ChromaDBClient initialized with base URL: {}, collection: {}", baseUrl, collectionName); - // 尝试创建集合(如果不存在) - try { - ensureCollectionExists(); - } catch (IOException e) { - log.warn("Failed to ensure collection exists: {}", e.getMessage()); - } + // 不在启动时尝试连接,延迟到首次使用时 + // try { + // ensureCollectionExists(); + // } catch (IOException e) { + // 
log.warn("Failed to ensure collection exists: {}", e.getMessage()); + // } } /** diff --git a/ry-xinli-system/src/main/java/com/ddnai/system/rag/client/OllamaClient.java b/ry-xinli-system/src/main/java/com/ddnai/system/rag/client/OllamaClient.java index 97bc96d7..229d3dad 100644 --- a/ry-xinli-system/src/main/java/com/ddnai/system/rag/client/OllamaClient.java +++ b/ry-xinli-system/src/main/java/com/ddnai/system/rag/client/OllamaClient.java @@ -41,12 +41,12 @@ public class OllamaClient { public void init() { this.baseUrl = ragProperties.getOllama().getUrl(); - // 创建OkHttpClient实例 + // 创建OkHttpClient实例,使用较短的超时时间避免启动阻塞 this.httpClient = new OkHttpClient.Builder() - .connectTimeout(ragProperties.getOllama().getConnectTimeout(), TimeUnit.SECONDS) - .readTimeout(ragProperties.getOllama().getReadTimeout(), TimeUnit.SECONDS) - .writeTimeout(ragProperties.getOllama().getConnectTimeout(), TimeUnit.SECONDS) - .retryOnConnectionFailure(true) + .connectTimeout(5, TimeUnit.SECONDS) + .readTimeout(30, TimeUnit.SECONDS) + .writeTimeout(10, TimeUnit.SECONDS) + .retryOnConnectionFailure(false) .build(); log.info("OllamaClient initialized with base URL: {}", baseUrl); diff --git a/ry-xinli-system/src/main/java/com/ddnai/system/rag/client/PythonRagClient.java b/ry-xinli-system/src/main/java/com/ddnai/system/rag/client/PythonRagClient.java new file mode 100644 index 00000000..2aa26cb7 --- /dev/null +++ b/ry-xinli-system/src/main/java/com/ddnai/system/rag/client/PythonRagClient.java @@ -0,0 +1,239 @@ +package com.ddnai.system.rag.client; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.core.io.ByteArrayResource; +import org.springframework.http.*; +import org.springframework.stereotype.Component; +import org.springframework.util.LinkedMultiValueMap; +import 
org.springframework.util.MultiValueMap; +import org.springframework.web.client.RestTemplate; +import org.springframework.web.multipart.MultipartFile; + +import javax.annotation.PostConstruct; +import java.util.*; + +/** + * Python RAG 服务客户端 + * 调用独立的 Python 知识库服务 + */ +@Component +public class PythonRagClient { + + private static final Logger log = LoggerFactory.getLogger(PythonRagClient.class); + + @Value("${rag.python.url:http://localhost:5000}") + private String pythonServiceUrl; + + @Value("${rag.python.enabled:true}") + private boolean enabled; + + private final RestTemplate restTemplate; + private final ObjectMapper objectMapper; + + public PythonRagClient() { + this.restTemplate = new RestTemplate(); + this.objectMapper = new ObjectMapper(); + } + + @PostConstruct + public void init() { + log.info("Python RAG Client initialized, URL: {}, Enabled: {}", pythonServiceUrl, enabled); + } + + /** + * 检查服务是否可用 + */ + public boolean isAvailable() { + if (!enabled) { + return false; + } + try { + String url = pythonServiceUrl + "/api/health"; + ResponseEntity response = restTemplate.getForEntity(url, String.class); + return response.getStatusCode() == HttpStatus.OK; + } catch (Exception e) { + log.warn("Python RAG service not available: {}", e.getMessage()); + return false; + } + } + + /** + * 上传文档 + */ + public Map uploadDocument(MultipartFile file) { + try { + String url = pythonServiceUrl + "/api/documents/upload"; + + HttpHeaders headers = new HttpHeaders(); + headers.setContentType(MediaType.MULTIPART_FORM_DATA); + + MultiValueMap body = new LinkedMultiValueMap<>(); + body.add("file", new ByteArrayResource(file.getBytes()) { + @Override + public String getFilename() { + return file.getOriginalFilename(); + } + }); + + HttpEntity> requestEntity = new HttpEntity<>(body, headers); + ResponseEntity response = restTemplate.postForEntity(url, requestEntity, String.class); + + return parseResponse(response.getBody()); + } catch (Exception e) { + log.error("Failed to 
upload document to Python service: {}", e.getMessage()); + return errorResult("上传失败: " + e.getMessage()); + } + } + + /** + * 获取文档列表 + */ + public List> listDocuments() { + try { + String url = pythonServiceUrl + "/api/documents"; + ResponseEntity response = restTemplate.getForEntity(url, String.class); + + Map result = parseResponse(response.getBody()); + if (Boolean.TRUE.equals(result.get("success"))) { + Object data = result.get("data"); + if (data instanceof List) { + return (List>) data; + } + } + return new ArrayList<>(); + } catch (Exception e) { + log.error("Failed to list documents: {}", e.getMessage()); + return new ArrayList<>(); + } + } + + /** + * 删除文档 + */ + public Map deleteDocument(String filename) { + try { + String url = pythonServiceUrl + "/api/documents/" + filename; + restTemplate.delete(url); + return successResult("删除成功"); + } catch (Exception e) { + log.error("Failed to delete document: {}", e.getMessage()); + return errorResult("删除失败: " + e.getMessage()); + } + } + + /** + * 搜索文档 + */ + public List> search(String query, int topK) { + try { + String url = pythonServiceUrl + "/api/search"; + + HttpHeaders headers = new HttpHeaders(); + headers.setContentType(MediaType.APPLICATION_JSON); + + Map body = new HashMap<>(); + body.put("query", query); + body.put("top_k", topK); + + HttpEntity> requestEntity = new HttpEntity<>(body, headers); + ResponseEntity response = restTemplate.postForEntity(url, requestEntity, String.class); + + Map result = parseResponse(response.getBody()); + if (Boolean.TRUE.equals(result.get("success"))) { + Object data = result.get("data"); + if (data instanceof List) { + return (List>) data; + } + } + return new ArrayList<>(); + } catch (Exception e) { + log.error("Failed to search: {}", e.getMessage()); + return new ArrayList<>(); + } + } + + /** + * 获取统计信息 + */ + public Map getStats() { + try { + String url = pythonServiceUrl + "/api/stats"; + ResponseEntity response = restTemplate.getForEntity(url, String.class); + + 
Map result = parseResponse(response.getBody()); + if (Boolean.TRUE.equals(result.get("success"))) { + return (Map) result.get("data"); + } + return new HashMap<>(); + } catch (Exception e) { + log.error("Failed to get stats: {}", e.getMessage()); + return new HashMap<>(); + } + } + + /** + * 重建索引 + */ + public Map rebuildIndex() { + try { + String url = pythonServiceUrl + "/api/rebuild"; + + HttpHeaders headers = new HttpHeaders(); + headers.setContentType(MediaType.APPLICATION_JSON); + + HttpEntity requestEntity = new HttpEntity<>("{}", headers); + ResponseEntity response = restTemplate.postForEntity(url, requestEntity, String.class); + + return parseResponse(response.getBody()); + } catch (Exception e) { + log.error("Failed to rebuild index: {}", e.getMessage()); + return errorResult("重建索引失败: " + e.getMessage()); + } + } + + /** + * 扫描文件夹 + */ + public Map scanFolder() { + try { + String url = pythonServiceUrl + "/api/scan"; + + HttpHeaders headers = new HttpHeaders(); + headers.setContentType(MediaType.APPLICATION_JSON); + + HttpEntity requestEntity = new HttpEntity<>("{}", headers); + ResponseEntity response = restTemplate.postForEntity(url, requestEntity, String.class); + + return parseResponse(response.getBody()); + } catch (Exception e) { + log.error("Failed to scan folder: {}", e.getMessage()); + return errorResult("扫描失败: " + e.getMessage()); + } + } + + private Map parseResponse(String json) { + try { + return objectMapper.readValue(json, Map.class); + } catch (Exception e) { + return errorResult("解析响应失败"); + } + } + + private Map successResult(String message) { + Map result = new HashMap<>(); + result.put("success", true); + result.put("message", message); + return result; + } + + private Map errorResult(String error) { + Map result = new HashMap<>(); + result.put("success", false); + result.put("error", error); + return result; + } +} diff --git a/ry-xinli-system/src/main/java/com/ddnai/system/rag/service/AIServiceAdapter.java 
b/ry-xinli-system/src/main/java/com/ddnai/system/rag/service/AIServiceAdapter.java index f51bfd6b..ea67339a 100644 --- a/ry-xinli-system/src/main/java/com/ddnai/system/rag/service/AIServiceAdapter.java +++ b/ry-xinli-system/src/main/java/com/ddnai/system/rag/service/AIServiceAdapter.java @@ -51,6 +51,15 @@ public class AIServiceAdapter { @PostConstruct public void init() { + // 检查是否禁用 Java 端 RAG + if ("disabled".equalsIgnoreCase(mode) || "python".equalsIgnoreCase(mode)) { + log.info("RAG服务模式: 已禁用Java端RAG,使用Python服务"); + useOpenAI = false; + useSimpleStore = true; + isHybridMode = false; + return; + } + isHybridMode = "hybrid".equalsIgnoreCase(mode); useOpenAI = "openai".equalsIgnoreCase(mode) || isHybridMode; useSimpleStore = true; // 默认使用简单存储 diff --git a/ry-xinli-system/src/main/java/com/ddnai/system/rag/service/KnowledgeService.java b/ry-xinli-system/src/main/java/com/ddnai/system/rag/service/KnowledgeService.java index 022099e3..86a93a7f 100644 --- a/ry-xinli-system/src/main/java/com/ddnai/system/rag/service/KnowledgeService.java +++ b/ry-xinli-system/src/main/java/com/ddnai/system/rag/service/KnowledgeService.java @@ -95,7 +95,7 @@ public class KnowledgeService { List chunks = textSplitter.split(text); log.info("Split document into {} chunks", chunks.size()); - // 5. 向量化 - 添加异常处理 + // 5. 向量化 List embeddings = null; boolean vectorizationSuccess = false; @@ -105,7 +105,7 @@ public class KnowledgeService { log.info("Successfully generated {} embeddings", embeddings.size()); vectorizationSuccess = true; } catch (Exception e) { - log.error("Failed to generate embeddings, document will be saved without vectors: {}", e.getMessage(), e); + log.error("Failed to generate embeddings: {}", e.getMessage()); // 继续处理,文档仍然会被保存,只是没有向量 } @@ -132,17 +132,17 @@ public class KnowledgeService { ids.add(docId + "_chunk_" + i); } - // 7. 存储到ChromaDB - 只有向量化成功才存储 + // 7. 
// ========== Python RAG service API ==========

/**
 * Ask the backend whether the Python RAG service is reachable.
 */
export function checkRagStatus() {
  const options = { url: '/psychology/rag/status', method: 'get' }
  return request(options)
}

/**
 * Fetch the list of documents held by the RAG service.
 */
export function listRagDocuments() {
  const options = { url: '/psychology/rag/documents', method: 'get' }
  return request(options)
}

/**
 * Delete one RAG document.
 * @param filename document file name; URL-encoded before being placed in the path
 */
export function delRagDocument(filename) {
  const encodedName = encodeURIComponent(filename)
  return request({
    url: `/psychology/rag/documents/${encodedName}`,
    method: 'delete'
  })
}

/**
 * Search the RAG knowledge base.
 * @param query search text
 * @param topK  number of hits to request (defaults to 5)
 */
export function ragSearch(query, topK = 5) {
  const payload = { query: query, topK: topK }
  return request({
    url: '/psychology/rag/search',
    method: 'post',
    data: payload
  })
}

/**
 * Fetch RAG statistics.
 */
export function getRagStats() {
  const options = { url: '/psychology/rag/stats', method: 'get' }
  return request(options)
}

/**
 * Trigger a rebuild of the RAG index.
 */
export function rebuildRagIndex() {
  const options = { url: '/psychology/rag/rebuild', method: 'post' }
  return request(options)
}

/**
 * Trigger a scan of the RAG service's document folder.
 */
export function scanRagFolder() {
  const options = { url: '/psychology/rag/scan', method: 'post' }
  return request(options)
}