加入AI分析知识库

This commit is contained in:
xiao12feng@outlook.com 2025-12-20 12:08:33 +08:00
parent ae39c44fa9
commit 0f490298f3
22 changed files with 2127 additions and 209 deletions

19
.gitignore vendored
View File

@ -60,11 +60,30 @@ rag-python/knowledge_docs/
rag-python/uploads/
rag-python/__pycache__/
rag-python/*.pyc
rag-python/*.bat
######################################################################
# Android App Build Output
xinli-App/build/
xinli-App/app/build/
xinli-App/.gradle/
xinli-App/local.properties
xinli-App/*.keystore
xinli-App/*.jks
# Android App 临时文档和脚本
xinli-App/*.md
xinli-App/*.bat
xinli-App/*.txt
######################################################################
# PaddleSpeech - 大型第三方库
PaddleSpeech-develop/
######################################################################
# 项目介绍目录(本地文档)
项目介绍/
######################################################################
# 临时文档和调试文件(根目录下的中文 md/sql/bat 文件)
/*.md

158
rag-python/app.py Normal file
View File

@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
"""
RAG 知识库服务 - Flask API
支持与 jar 包同级目录部署
"""
import os
import sys
from flask import Flask, request, jsonify
from flask_cors import CORS
from config import HOST, PORT, KNOWLEDGE_DIR, BASE_DIR
from knowledge_service import knowledge_service
from file_watcher import FileWatcher
# Flask application instance; all routes below are registered against it.
app = Flask(__name__)
CORS(app)  # enable cross-origin requests so external frontends can call this API
# Filesystem watcher singleton; created and started in init_service().
file_watcher = None
@app.route('/api/health', methods=['GET'])
def health_check():
    """Health probe: report service identity and the key directories in use."""
    payload = {
        'status': 'ok',
        'service': 'RAG Knowledge Service',
        'knowledge_dir': KNOWLEDGE_DIR,
        'base_dir': BASE_DIR
    }
    return jsonify(payload)
@app.route('/api/documents', methods=['GET'])
def list_documents():
    """Return every indexed document as a JSON list."""
    try:
        docs = knowledge_service.list_documents()
        return jsonify({'success': True, 'data': docs})
    except Exception as exc:
        # Surface unexpected failures as a structured 500.
        return jsonify({'success': False, 'error': str(exc)}), 500
@app.route('/api/documents/upload', methods=['POST'])
def upload_document():
    """Accept a multipart file upload and index it into the knowledge base."""
    try:
        # Guard clauses: a 'file' part must exist and carry a name.
        uploaded = request.files.get('file')
        if uploaded is None:
            return jsonify({'success': False, 'error': '没有上传文件'}), 400
        if uploaded.filename == '':
            return jsonify({'success': False, 'error': '文件名为空'}), 400
        result = knowledge_service.upload_and_index(uploaded)
        if not result['success']:
            return jsonify({'success': False, 'error': result.get('error')}), 400
        return jsonify({'success': True, 'data': result})
    except Exception as exc:
        return jsonify({'success': False, 'error': str(exc)}), 500
@app.route('/api/documents/<filename>', methods=['DELETE'])
def delete_document(filename):
    """Remove a document (by file name) from both the index and disk."""
    try:
        outcome = knowledge_service.delete_document(filename)
        return jsonify({'success': outcome['success'], 'data': outcome})
    except Exception as exc:
        return jsonify({'success': False, 'error': str(exc)}), 500
@app.route('/api/search', methods=['POST'])
def search():
    """Semantic search over the knowledge base.

    Expects a JSON body: {"query": str, "top_k": int (optional, default 5)}.
    Returns 400 when the body is missing/not JSON or the query is empty.
    """
    try:
        # Bug fix: request.get_json() returns None for a missing or non-JSON
        # body, which made data.get(...) raise AttributeError and answer 500.
        # silent=True suppresses Flask's implicit abort so we can reply with
        # our own 400 error shape instead.
        data = request.get_json(silent=True) or {}
        query = data.get('query', '')
        top_k = data.get('top_k', 5)
        if not query:
            return jsonify({'success': False, 'error': '查询内容不能为空'}), 400
        results = knowledge_service.search(query, top_k)
        return jsonify({
            'success': True,
            'data': results
        })
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/api/stats', methods=['GET'])
def get_stats():
    """Expose knowledge-base statistics (chunk and file counts)."""
    try:
        stats = knowledge_service.get_stats()
        return jsonify({'success': True, 'data': stats})
    except Exception as exc:
        return jsonify({'success': False, 'error': str(exc)}), 500
@app.route('/api/rebuild', methods=['POST'])
def rebuild_index():
    """Drop the vector index and rebuild it from the knowledge folder."""
    try:
        outcome = knowledge_service.rebuild_index()
        return jsonify({'success': True, 'data': outcome})
    except Exception as exc:
        return jsonify({'success': False, 'error': str(exc)}), 500
@app.route('/api/scan', methods=['POST'])
def scan_folder():
    """Scan the knowledge folder and index any files not yet in the index."""
    try:
        outcome = knowledge_service.scan_and_index_folder()
        return jsonify({'success': True, 'data': outcome})
    except Exception as exc:
        return jsonify({'success': False, 'error': str(exc)}), 500
def init_service():
    """Bootstrap the service: load the index, scan for new files, start watching."""
    banner = "=" * 50
    print(banner)
    print("RAG 知识库服务启动中...")
    print(banner)
    # Load any index persisted from a previous run.
    knowledge_service.init()
    # Index files that appeared in the knowledge folder while we were down.
    knowledge_service.scan_and_index_folder()
    # Watch the folder so future additions/removals are indexed automatically.
    global file_watcher
    file_watcher = FileWatcher(knowledge_service)
    file_watcher.start()
    print(banner)
    print(f"服务已启动: http://{HOST}:{PORT}")
    print(f"知识库文件夹: {KNOWLEDGE_DIR}")
    print(banner)
if __name__ == '__main__':
    init_service()
    # threaded=True lets Flask serve concurrent requests; debug stays off for deployment.
    app.run(host=HOST, port=PORT, debug=False, threaded=True)

155
rag-python/batch_index.py Normal file
View File

@ -0,0 +1,155 @@
# -*- coding: utf-8 -*-
"""
批量索引脚本 - 用于处理大文件
直接运行此脚本来索引 knowledge_docs 目录中的所有文件
使用方法
1. PDF 文件放入 rag-python/knowledge_docs/ 目录
2. 运行: python batch_index.py
"""
import os
import sys
import time
# 添加当前目录到路径
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from config import KNOWLEDGE_DIR, CHUNK_SIZE
from document_parser import parse_document, is_supported_file
from text_splitter import split_text
from vector_store import vector_store
def format_time(seconds):
    """Render a duration in seconds as a human-readable Chinese string.

    Seconds below one minute, minutes below one hour, hours otherwise.
    Bug fix: the seconds branch previously omitted its unit suffix ("秒"),
    unlike the "分钟"/"小时" branches.
    """
    if seconds < 60:
        return f"{seconds:.1f}秒"
    elif seconds < 3600:
        return f"{seconds/60:.1f}分钟"
    else:
        return f"{seconds/3600:.1f}小时"
def estimate_time(char_count):
    """Rough ETA string for embedding a document of *char_count* characters."""
    # Empirically: one chunk of CHUNK_SIZE characters takes about 1.5 s to embed.
    num_chunks = char_count / CHUNK_SIZE
    return format_time(num_chunks * 1.5)
def batch_index():
    """Interactively index every new supported file under KNOWLEDGE_DIR.

    Loads the existing index, lists files not yet indexed, asks the operator
    for confirmation, then parses/chunks/embeds each file and prints a summary.
    Bug fix: two progress messages had their `{filename}` placeholder garbled.
    """
    print("=" * 60)
    print("批量索引工具")
    print("=" * 60)
    print(f"知识库目录: {KNOWLEDGE_DIR}")
    print(f"分块大小: {CHUNK_SIZE} 字符")
    print()
    # Load whatever is already indexed so only new files are processed.
    print("加载现有索引...")
    vector_store.load_index()
    stats = vector_store.get_stats()
    indexed_files = set(stats.get('files', []))
    print(f"已索引文件: {len(indexed_files)}")
    print()
    # Collect supported files that are not in the index yet.
    files_to_process = []
    for filename in os.listdir(KNOWLEDGE_DIR):
        file_path = os.path.join(KNOWLEDGE_DIR, filename)
        if os.path.isfile(file_path) and is_supported_file(filename):
            if filename not in indexed_files:
                file_size = os.path.getsize(file_path)
                files_to_process.append((filename, file_path, file_size))
    if not files_to_process:
        print("没有新文件需要索引。")
        print(f"如需重新索引,请先删除 index_data 目录中的文件。")
        return
    # Show the pending work before asking for confirmation.
    print(f"发现 {len(files_to_process)} 个新文件:")
    total_size = 0
    for filename, _, size in files_to_process:
        size_mb = size / (1024 * 1024)
        total_size += size
        print(f" - {filename} ({size_mb:.1f} MB)")
    print(f"\n总大小: {total_size / (1024 * 1024):.1f} MB")
    print()
    # Confirm before doing potentially hours of embedding work.
    confirm = input("是否开始处理?(y/n): ").strip().lower()
    if confirm != 'y':
        print("已取消。")
        return
    print()
    print("=" * 60)
    print("开始处理...")
    print("=" * 60)
    total_start = time.time()
    success_count = 0
    fail_count = 0
    for i, (filename, file_path, file_size) in enumerate(files_to_process):
        print()
        print(f"[{i+1}/{len(files_to_process)}] 处理: {filename}")
        print("-" * 40)
        file_start = time.time()
        try:
            # Extract plain text from the document.
            print("解析文档...")
            content = parse_document(file_path)
            if not content or not content.strip():
                print(f" 警告: 文档内容为空,跳过")
                fail_count += 1
                continue
            char_count = len(content)
            print(f" 提取文本: {char_count} 字符")
            print(f" 预计处理时间: {estimate_time(char_count)}")
            # Split into embedding-sized chunks.
            print("分块处理...")
            chunks = split_text(content)
            print(f" 生成 {len(chunks)} 个文本块")
            # Embed and add to the index (the store auto-saves).
            print("向量化处理...")
            metadata = {
                'filename': filename,
                'file_path': file_path,
                'char_count': char_count
            }
            added = vector_store.add_documents(chunks, metadata)
            file_time = time.time() - file_start
            print(f" 完成! 耗时: {format_time(file_time)}")
            success_count += 1
        except Exception as e:
            # One bad file must not abort the whole batch.
            print(f" 错误: {e}")
            fail_count += 1
    # Summary.
    total_time = time.time() - total_start
    print()
    print("=" * 60)
    print("处理完成!")
    print("=" * 60)
    print(f"成功: {success_count} 个文件")
    print(f"失败: {fail_count} 个文件")
    print(f"总耗时: {format_time(total_time)}")
    # Final index statistics.
    final_stats = vector_store.get_stats()
    print(f"索引总文件数: {final_stats['total_files']}")
    print(f"索引总文本块: {final_stats['total_chunks']}")
if __name__ == '__main__':
    # Run interactively: scans KNOWLEDGE_DIR and indexes new files after confirmation.
    batch_index()

51
rag-python/config.py Normal file
View File

@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
"""
RAG service configuration.

Designed to be deployed next to the jar package; embedding vectors are
generated by a locally running Ollama instance.
"""
import os
import sys
# Server binding
HOST = "0.0.0.0"
PORT = 5000
# Ollama settings (a local Ollama instance generates the embeddings)
OLLAMA_URL = "http://localhost:11434"
OLLAMA_EMBED_MODEL = "nomic-embed-text"  # embedding model assumed to be pulled already
# Resolve the program's base directory (works for frozen/bundled deployments too).
# When deployed beside the jar, BASE_DIR is the rag-python folder.
if getattr(sys, 'frozen', False):
    # Running as a bundled executable (e.g. PyInstaller)
    BASE_DIR = os.path.dirname(sys.executable)
else:
    # Plain Python execution
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Folder layout - everything lives under the rag-python directory
KNOWLEDGE_DIR = os.path.join(BASE_DIR, "knowledge_docs")  # knowledge-base documents
INDEX_DIR = os.path.join(BASE_DIR, "index_data")  # persisted index files
UPLOAD_DIR = os.path.join(BASE_DIR, "uploads")  # temporary upload staging area
# Make sure all directories exist
for dir_path in [KNOWLEDGE_DIR, INDEX_DIR, UPLOAD_DIR]:
    os.makedirs(dir_path, exist_ok=True)
# Supported file types
SUPPORTED_EXTENSIONS = {'.txt', '.md', '.pdf', '.docx', '.doc'}
# Text chunking (kept small to fit nomic-embed-text's ~2048-token context window)
CHUNK_SIZE = 300  # characters per chunk
CHUNK_OVERLAP = 30  # overlapping characters between adjacent chunks
# Retrieval
TOP_K = 5  # number of most relevant chunks to return
# Print the effective configuration at import time
print(f"[配置] 程序目录: {BASE_DIR}")
print(f"[配置] 知识库目录: {KNOWLEDGE_DIR}")
print(f"[配置] 索引目录: {INDEX_DIR}")
print(f"[配置] Ollama地址: {OLLAMA_URL}")
print(f"[配置] 嵌入模型: {OLLAMA_EMBED_MODEL}")

View File

@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
"""
文档解析器 - 支持多种文件格式
"""
import os
import chardet
from config import SUPPORTED_EXTENSIONS
def detect_encoding(file_path):
    """Guess a file's text encoding by sniffing its first 10 KB with chardet."""
    with open(file_path, 'rb') as fh:
        sample = fh.read(10000)
    detected = chardet.detect(sample)
    # Fall back to UTF-8 when chardet cannot decide.
    return detected['encoding'] or 'utf-8'
def parse_txt(file_path):
    """Read a plain-text file using its detected encoding; return '' on failure."""
    codec = detect_encoding(file_path)
    try:
        with open(file_path, 'r', encoding=codec, errors='ignore') as fh:
            return fh.read()
    except Exception as e:
        print(f"解析TXT文件失败 {file_path}: {e}")
        return ""
def parse_md(file_path):
    """Parse a Markdown file (treated as plain text)."""
    return parse_txt(file_path)
def parse_pdf(file_path):
    """Parse a PDF (large files supported) into plain text.

    Pages that fail to extract are skipped with a warning; returns "" when
    the document cannot be opened at all.
    """
    try:
        from PyPDF2 import PdfReader
        file_size = os.path.getsize(file_path)
        file_size_mb = file_size / (1024 * 1024)
        print(f" PDF文件大小: {file_size_mb:.1f} MB")
        reader = PdfReader(file_path)
        total_pages = len(reader.pages)
        print(f" PDF总页数: {total_pages}")
        text_parts = []
        for i, page in enumerate(reader.pages):
            # Progress feedback on the first page and every 50 pages after.
            if (i + 1) % 50 == 0 or i == 0:
                print(f" 解析进度: {i + 1}/{total_pages}")
            try:
                text = page.extract_text()
                if text:
                    text_parts.append(text)
            except Exception as e:
                print(f" 警告: 第 {i + 1} 页解析失败: {e}")
                continue
        # Join once so the reported character count matches the text actually
        # returned (the old code counted ''.join but returned '\n'.join,
        # under-reporting by the number of page separators).
        full_text = "\n".join(text_parts)
        print(f" PDF解析完成,提取文本 {len(full_text)} 字符")
        return full_text
    except Exception as e:
        print(f"解析PDF文件失败 {file_path}: {e}")
        return ""
def parse_docx(file_path):
    """Extract the non-empty paragraphs of a Word document, newline-joined."""
    try:
        from docx import Document
        paragraphs = [p.text for p in Document(file_path).paragraphs if p.text.strip()]
        return "\n".join(paragraphs)
    except Exception as e:
        print(f"解析DOCX文件失败 {file_path}: {e}")
        return ""
def parse_document(file_path):
    """Dispatch to the parser matching the file extension; '' if unsupported."""
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix not in SUPPORTED_EXTENSIONS:
        print(f"不支持的文件类型: {suffix}")
        return ""
    dispatch = {
        '.txt': parse_txt,
        '.md': parse_md,
        '.pdf': parse_pdf,
        '.docx': parse_docx,
        '.doc': parse_docx,
    }
    # Default to the plain-text parser for any supported-but-unmapped suffix.
    handler = dispatch.get(suffix, parse_txt)
    return handler(file_path)
def is_supported_file(filename):
    """True when the file's extension is one this parser knows how to handle."""
    return os.path.splitext(filename)[1].lower() in SUPPORTED_EXTENSIONS

172
rag-python/file_watcher.py Normal file
View File

@ -0,0 +1,172 @@
# -*- coding: utf-8 -*-
"""
文件监控服务 - 监控知识库文件夹的变化
"""
import os
import time
import threading
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from config import KNOWLEDGE_DIR
from document_parser import is_supported_file
# Global state: names of files currently being uploaded through the API.
# Lets the watchdog handler skip filesystem events generated by our own
# upload path instead of double-indexing them.
uploading_files = set()
uploading_lock = threading.Lock()
def mark_uploading(filename):
    """Flag *filename* as mid-upload so watcher events for it are ignored."""
    with uploading_lock:
        uploading_files.add(filename)
def unmark_uploading(filename):
    """Clear the mid-upload flag for *filename*."""
    with uploading_lock:
        uploading_files.discard(filename)
def is_uploading(filename):
    """Return True while *filename* is being uploaded through the API."""
    with uploading_lock:
        return filename in uploading_files
class KnowledgeFileHandler(FileSystemEventHandler):
    """Watchdog handler that keeps the vector index in sync with the folder.

    Events are debounced for a few seconds so rapid successive events for the
    same file collapse into one action, and files currently being uploaded
    through the API are skipped so the watcher never races the upload path.
    Bug fix: every log message had its `{filename}` placeholder garbled.
    """

    def __init__(self, knowledge_service):
        self.knowledge_service = knowledge_service
        self.pending_files = {}  # debounce bookkeeping: path -> pending action
        self.debounce_seconds = 3  # wait this long for the file to settle
        self._lock = threading.Lock()

    def _should_process(self, path):
        """Return True when *path* is a supported file we are allowed to touch."""
        if not os.path.isfile(path):
            return False
        filename = os.path.basename(path)
        # Skip files the API is uploading right now.
        if is_uploading(filename):
            return False
        return is_supported_file(filename)

    def _debounce_process(self, path, action):
        """Queue *action* for *path*, executing it after the debounce window."""
        filename = os.path.basename(path)
        # Skip files the API is uploading right now.
        if is_uploading(filename):
            print(f"[文件监控] 跳过正在上传的文件: {filename}")
            return
        with self._lock:
            self.pending_files[path] = {
                'action': action,
                'time': time.time()
            }

        # Process after a delay so bursts of events collapse into one action.
        def delayed_process():
            time.sleep(self.debounce_seconds)
            with self._lock:
                if path in self.pending_files:
                    info = self.pending_files.pop(path)
                    # Re-check the upload flag right before acting.
                    if not is_uploading(os.path.basename(path)):
                        self._do_process(path, info['action'])
        threading.Thread(target=delayed_process, daemon=True).start()

    def _do_process(self, path, action):
        """Apply *action* ('add' / 'delete' / 'modify') to the index for *path*."""
        filename = os.path.basename(path)
        # Final guard against racing an in-flight upload.
        if is_uploading(filename):
            return
        try:
            if action == 'add':
                # Skip files that are already present in the index.
                stats = self.knowledge_service.get_stats()
                if filename in stats.get('files', []):
                    print(f"[文件监控] 文件已索引,跳过: {filename}")
                    return
                print(f"[文件监控] 检测到新文件: {filename}")
                result = self.knowledge_service.add_document(path, filename)
                if result['success']:
                    print(f"[文件监控] 已索引: {filename}, {result['chunks']} 个文本块")
                else:
                    print(f"[文件监控] 索引失败: {filename}, {result.get('error')}")
            elif action == 'delete':
                # Only delete from the index when the file is really gone.
                if os.path.exists(path):
                    print(f"[文件监控] 文件仍存在,跳过删除: {filename}")
                    return
                print(f"[文件监控] 检测到文件删除: {filename}")
                self.knowledge_service.delete_document(filename)
                print(f"[文件监控] 已从索引删除: {filename}")
            elif action == 'modify':
                # Modify events are skipped: uploads already index the file and
                # re-indexing on every touch would thrash the embedder.
                print(f"[文件监控] 检测到文件修改,跳过: {filename}")
        except Exception as e:
            print(f"[文件监控] 处理失败 {filename}: {e}")

    def on_created(self, event):
        if not event.is_directory and self._should_process(event.src_path):
            self._debounce_process(event.src_path, 'add')

    def on_deleted(self, event):
        if not event.is_directory:
            filename = os.path.basename(event.src_path)
            if is_supported_file(filename) and not is_uploading(filename):
                self._debounce_process(event.src_path, 'delete')

    def on_modified(self, event):
        # Intentionally disabled to avoid clashing with the upload flow.
        pass

    def on_moved(self, event):
        if not event.is_directory:
            src_filename = os.path.basename(event.src_path)
            # A move out of the folder is a delete...
            if is_supported_file(src_filename) and not is_uploading(src_filename):
                self._debounce_process(event.src_path, 'delete')
            # ...and a move into it is an add.
            if self._should_process(event.dest_path):
                self._debounce_process(event.dest_path, 'add')
class FileWatcher:
    """Wraps a watchdog Observer that keeps the index in sync with KNOWLEDGE_DIR."""

    def __init__(self, knowledge_service):
        self.knowledge_service = knowledge_service
        self.observer = None
        self.running = False

    def start(self):
        """Begin watching the knowledge folder; a no-op when already running."""
        if self.running:
            return
        print(f"[文件监控] 开始监控文件夹: {KNOWLEDGE_DIR}")
        event_handler = KnowledgeFileHandler(self.knowledge_service)
        watcher = Observer()
        watcher.schedule(event_handler, KNOWLEDGE_DIR, recursive=True)
        watcher.start()
        self.observer = watcher
        self.running = True

    def stop(self):
        """Stop the observer thread and wait for it to exit."""
        if self.observer:
            self.observer.stop()
            self.observer.join()
            self.running = False
            print("[文件监控] 已停止")

View File

@ -0,0 +1,244 @@
# -*- coding: utf-8 -*-
"""
知识库服务 - 管理文档的添加删除和检索
"""
import os
import shutil
from datetime import datetime
from config import KNOWLEDGE_DIR, UPLOAD_DIR
from document_parser import parse_document, is_supported_file
from text_splitter import split_text
from vector_store import vector_store
class KnowledgeService:
    """Facade over parsing, chunking and the vector store.

    Owns the document lifecycle: scanning the knowledge folder, adding,
    uploading, deleting and searching documents.
    Bug fix: the parse progress message had its `{filename}` placeholder garbled.
    """

    def __init__(self):
        self.vector_store = vector_store

    def init(self):
        """Load any previously persisted index from disk."""
        self.vector_store.load_index()

    def scan_and_index_folder(self):
        """Scan KNOWLEDGE_DIR recursively and index documents not yet indexed.

        Used at startup and by the manual rescan endpoint.

        Returns:
            dict with 'scanned' (new files found) and 'indexed' (successes).
        """
        print(f"开始扫描知识库文件夹: {KNOWLEDGE_DIR}")
        # Files already present in the index.
        stats = self.vector_store.get_stats()
        indexed_files = set(stats.get('files', []))
        # Walk the folder collecting supported files we have not indexed yet.
        new_files = []
        for root, dirs, files in os.walk(KNOWLEDGE_DIR):
            for filename in files:
                if is_supported_file(filename):
                    file_path = os.path.join(root, filename)
                    rel_path = os.path.relpath(file_path, KNOWLEDGE_DIR)
                    if rel_path not in indexed_files:
                        new_files.append((filename, file_path, rel_path))
        # Index each newcomer, tolerating individual failures.
        indexed_count = 0
        for filename, file_path, rel_path in new_files:
            try:
                result = self.add_document(file_path, filename)
                if result['success']:
                    indexed_count += 1
                    print(f" 已索引: {rel_path}")
            except Exception as e:
                print(f" 索引失败 {rel_path}: {e}")
        print(f"扫描完成,新索引 {indexed_count} 个文件")
        return {
            'scanned': len(new_files),
            'indexed': indexed_count
        }

    def add_document(self, file_path, filename=None):
        """Parse, chunk and index a single document.

        Args:
            file_path: path of the file to index.
            filename: display name; defaults to the path's basename.

        Returns:
            dict with 'success' plus, on success, 'chunks', 'char_count' and
            'filename'; on failure, 'error'.
        """
        if filename is None:
            filename = os.path.basename(file_path)
        if not os.path.exists(file_path):
            return {'success': False, 'error': '文件不存在'}
        if not is_supported_file(filename):
            return {'success': False, 'error': '不支持的文件类型'}
        # Extract plain text from the document.
        print(f"正在解析文档: {filename}")
        content = parse_document(file_path)
        if not content or not content.strip():
            return {'success': False, 'error': '文档内容为空'}
        # Split into embedding-sized chunks.
        chunks = split_text(content)
        if not chunks:
            return {'success': False, 'error': '文档分块失败'}
        # Metadata stored alongside every chunk.
        metadata = {
            'filename': filename,
            'file_path': file_path,
            'indexed_at': datetime.now().isoformat(),
            'char_count': len(content)
        }
        # Persist into the vector store (embeds and saves the index).
        added = self.vector_store.add_documents(chunks, metadata)
        return {
            'success': True,
            'filename': filename,
            'chunks': added,
            'char_count': len(content)
        }

    def upload_and_index(self, file_storage, copy_to_knowledge=True):
        """Save an uploaded file, index it, and optionally keep a copy.

        Args:
            file_storage: Flask FileStorage from the upload request.
            copy_to_knowledge: copy the file into KNOWLEDGE_DIR on success.

        Returns:
            The add_document() result, plus 'saved_to' when copied.
        """
        from file_watcher import mark_uploading, unmark_uploading
        filename = file_storage.filename
        if not is_supported_file(filename):
            return {'success': False, 'error': '不支持的文件类型'}
        # Flag the file as in-flight so the folder watcher ignores its events.
        mark_uploading(filename)
        # Stage the upload in the temp directory first.
        temp_path = os.path.join(UPLOAD_DIR, filename)
        file_storage.save(temp_path)
        try:
            # Index from the staged copy.
            result = self.add_document(temp_path, filename)
            if result['success'] and copy_to_knowledge:
                # Keep a permanent copy in the knowledge folder.
                dest_path = os.path.join(KNOWLEDGE_DIR, filename)
                shutil.copy2(temp_path, dest_path)
                result['saved_to'] = dest_path
            return result
        finally:
            # Remove the staged copy.
            if os.path.exists(temp_path):
                os.remove(temp_path)
            # Clear the upload flag only after the watcher has had time to
            # observe (and skip) the filesystem events we just generated.
            import threading

            def delayed_unmark():
                import time
                time.sleep(5)
                unmark_uploading(filename)
            threading.Thread(target=delayed_unmark, daemon=True).start()

    def delete_document(self, filename):
        """Delete a document from the index and from the knowledge folder.

        Args:
            filename: name of the file to delete.

        Returns:
            dict with 'success', 'chunks_deleted' and 'file_deleted'.
        """
        # Drop the document's chunks from the vector store.
        deleted = self.vector_store.delete_by_filename(filename)
        # Remove the physical file if it still exists.
        file_path = os.path.join(KNOWLEDGE_DIR, filename)
        file_deleted = False
        if os.path.exists(file_path):
            os.remove(file_path)
            file_deleted = True
        return {
            'success': deleted > 0 or file_deleted,
            'chunks_deleted': deleted,
            'file_deleted': file_deleted
        }

    def search(self, query, top_k=5):
        """Return the *top_k* chunks most similar to *query*."""
        results = self.vector_store.search(query, top_k)
        return results

    def get_stats(self):
        """Return index-wide statistics from the vector store."""
        return self.vector_store.get_stats()

    def list_documents(self):
        """List every indexed document with its chunk count and on-disk size."""
        stats = self.vector_store.get_stats()
        files = stats.get('files', [])
        documents = []
        for filename in files:
            # Number of chunks belonging to this file.
            chunk_count = sum(1 for doc in self.vector_store.documents
                              if doc.get('metadata', {}).get('filename') == filename)
            # File details (size 0 when the file is gone but still indexed).
            file_path = os.path.join(KNOWLEDGE_DIR, filename)
            file_size = os.path.getsize(file_path) if os.path.exists(file_path) else 0
            documents.append({
                'filename': filename,
                'chunks': chunk_count,
                'size': file_size,
                'exists': os.path.exists(file_path)
            })
        return documents

    def rebuild_index(self):
        """Wipe the index and re-index everything under KNOWLEDGE_DIR."""
        print("开始重建索引...")
        # Drop everything currently indexed.
        self.vector_store.clear()
        # Re-scan the folder from scratch.
        result = self.scan_and_index_folder()
        return {
            'success': True,
            'indexed': result['indexed']
        }
# Module-level singleton shared by the Flask routes and the file watcher.
knowledge_service = KnowledgeService()

View File

@ -0,0 +1,11 @@
# RAG 知识库服务依赖(使用本地 Ollama)
flask>=2.0.0
flask-cors>=4.0.0
faiss-cpu>=1.7.0
numpy>=1.21.0
watchdog>=3.0.0
pypdf2>=3.0.0
python-docx>=0.8.0
chardet>=5.0.0
jieba>=0.42.0
requests>=2.28.0

View File

@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""
文本分块器 - 将长文本分割成小块
"""
import re
from config import CHUNK_SIZE, CHUNK_OVERLAP
def split_text(text, chunk_size=None, chunk_overlap=None):
    """Split *text* into chunks of at most *chunk_size* characters.

    Paragraphs are packed together while they fit; an over-long paragraph is
    further split on sentence-ending punctuation. Consecutive chunks share a
    *chunk_overlap*-character tail so context is not lost at boundaries.

    Args:
        text: the text to split.
        chunk_size: maximum characters per chunk (defaults to CHUNK_SIZE).
        chunk_overlap: overlap between adjacent chunks (defaults to
            CHUNK_OVERLAP).

    Returns:
        List of non-empty chunk strings; [] for blank input.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = CHUNK_OVERLAP
    if not text or not text.strip():
        return []
    # Normalize whitespace: collapse 3+ newlines and runs of spaces.
    text = text.strip()
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)
    # Work paragraph by paragraph.
    paragraphs = re.split(r'\n\n+', text)
    chunks = []
    current_chunk = ""
    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        if len(para) > chunk_size:
            # Paragraph alone exceeds the chunk size: flush what we have and
            # split the paragraph on sentence punctuation instead.
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = ""
            # Capturing split keeps punctuation tokens at odd indices, so the
            # loop below re-attaches each terminator to its sentence.
            sentences = re.split(r'([。!?.!?])', para)
            temp_chunk = ""
            for i in range(0, len(sentences), 2):
                sentence = sentences[i]
                if i + 1 < len(sentences):
                    sentence += sentences[i + 1]
                if len(temp_chunk) + len(sentence) <= chunk_size:
                    temp_chunk += sentence
                else:
                    if temp_chunk:
                        chunks.append(temp_chunk.strip())
                    # Bug fix: seed the next chunk with the tail of the chunk
                    # just emitted (the old code used sentence[-overlap:],
                    # duplicating the new sentence's own tail and carrying no
                    # real overlap). Mirrors the paragraph-level branch below.
                    temp_chunk = temp_chunk[-chunk_overlap:] + sentence if len(temp_chunk) > chunk_overlap else sentence
            if temp_chunk:
                current_chunk = temp_chunk
        else:
            # Paragraph fits: append to the current chunk when possible.
            if len(current_chunk) + len(para) + 1 <= chunk_size:
                current_chunk += ("\n" if current_chunk else "") + para
            else:
                # Start a new chunk, carrying an overlap tail for context.
                if current_chunk:
                    chunks.append(current_chunk.strip())
                if len(current_chunk) > chunk_overlap:
                    current_chunk = current_chunk[-chunk_overlap:] + "\n" + para
                else:
                    current_chunk = para
    # Flush the trailing chunk.
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

326
rag-python/vector_store.py Normal file
View File

@ -0,0 +1,326 @@
# -*- coding: utf-8 -*-
"""
向量存储 - 使用 Ollama 生成向量FAISS 进行索引和检索
"""
import os
import json
import numpy as np
import requests
from config import INDEX_DIR, OLLAMA_URL, OLLAMA_EMBED_MODEL, TOP_K
class VectorStore:
    """FAISS index over Ollama-generated embeddings, persisted to INDEX_DIR.

    self.documents holds each chunk's text and metadata; row i of the FAISS
    index corresponds to self.documents[i]. Vectors are L2-normalized before
    insertion so inner-product search behaves as cosine similarity.
    Bug fix: removed an unreachable duplicate loop in _embed_batch.
    """

    def __init__(self):
        self.index = None    # FAISS index (created lazily)
        self.documents = []  # chunk text + metadata, parallel to index rows
        self.dimension = 768  # nomic-embed-text vector dimension
        self.index_file = os.path.join(INDEX_DIR, "faiss.index")
        self.docs_file = os.path.join(INDEX_DIR, "documents.json")
        self.faiss = None    # lazily imported faiss module

    def _load_faiss(self):
        """Import faiss on first use so module import stays cheap."""
        if self.faiss is None:
            import faiss
            self.faiss = faiss

    def _embed_with_ollama(self, text, retry_count=3):
        """Embed *text* via the local Ollama /api/embeddings endpoint.

        Retries up to *retry_count* times with linear backoff; raises the
        last error when every attempt fails.
        """
        import time
        import urllib.request
        import urllib.error
        url = f"{OLLAMA_URL}/api/embeddings"
        # Guard against empty or non-string input.
        if not text or not isinstance(text, str):
            text = "empty"
        # Strip characters that break JSON transport.
        text = text.replace('\x00', '')
        # Truncate: nomic-embed-text's context is ~2048 tokens; at roughly
        # 1.5 Chinese characters per token, 1000 chars is a safe ceiling.
        max_length = 1000
        if len(text) > max_length:
            text = text[:max_length]
        payload = {
            "model": OLLAMA_EMBED_MODEL,
            "prompt": text
        }
        last_error = None
        for attempt in range(retry_count):
            try:
                # urllib instead of requests to sidestep encoding quirks.
                data = json.dumps(payload, ensure_ascii=False).encode('utf-8')
                req = urllib.request.Request(
                    url,
                    data=data,
                    headers={'Content-Type': 'application/json; charset=utf-8'},
                    method='POST'
                )
                with urllib.request.urlopen(req, timeout=120) as response:
                    result = json.loads(response.read().decode('utf-8'))
                    return result.get("embedding", [])
            except urllib.error.HTTPError as e:
                last_error = e
                error_body = e.read().decode('utf-8') if e.fp else 'N/A'
                print(f"Ollama HTTP 错误 (尝试 {attempt+1}/{retry_count}): {e.code} {e.reason}")
                print(f"响应内容: {error_body[:500]}")
                print(f"请求文本长度: {len(text)}")
                if attempt < retry_count - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"等待 {wait_time} 秒后重试...")
                    time.sleep(wait_time)
            except Exception as e:
                last_error = e
                print(f"Ollama 嵌入失败 (尝试 {attempt+1}/{retry_count}): {e}")
                if attempt < retry_count - 1:
                    wait_time = (attempt + 1) * 2
                    print(f"等待 {wait_time} 秒后重试...")
                    time.sleep(wait_time)
        raise last_error

    def _embed_batch(self, texts):
        """Embed each text in order, pacing requests one second apart.

        Bug fix: the original had a second, unreachable copy of this loop
        after the return statement; it has been removed.
        """
        import time
        embeddings = []
        for i, text in enumerate(texts):
            # Debug output helps diagnose embedding failures on odd inputs.
            print(f" 生成向量 {i+1}/{len(texts)}...")
            print(f" 文本长度: {len(text)}, 前50字符: {repr(text[:50])}")
            embeddings.append(self._embed_with_ollama(text))
            # Small delay so Ollama is not hammered.
            if i < len(texts) - 1:
                time.sleep(1.0)
        return embeddings

    def _init_index(self):
        """Create an empty inner-product FAISS index if none exists yet."""
        self._load_faiss()
        if self.index is None:
            self.index = self.faiss.IndexFlatIP(self.dimension)

    def load_index(self):
        """Load the persisted index and documents; start fresh on failure.

        Returns:
            True when an existing index was loaded, False otherwise.
        """
        self._load_faiss()
        if os.path.exists(self.index_file) and os.path.exists(self.docs_file):
            try:
                print("正在加载已有索引...")
                # FAISS cannot open non-ASCII (e.g. Chinese) paths on Windows,
                # so copy to an ASCII temp path first.
                import tempfile
                import shutil
                try:
                    # Create (and close) the temp file first so the copy and
                    # read happen against a closed path.
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.index') as tmp:
                        tmp_path = tmp.name
                    shutil.copy2(self.index_file, tmp_path)
                    self.index = self.faiss.read_index(tmp_path)
                    os.unlink(tmp_path)
                except Exception as e:
                    print(f"临时文件方式失败,尝试直接读取: {e}")
                    self.index = self.faiss.read_index(self.index_file)
                with open(self.docs_file, 'r', encoding='utf-8') as f:
                    self.documents = json.load(f)
                print(f"索引加载完成,共 {len(self.documents)} 个文档块")
                return True
            except Exception as e:
                print(f"加载索引失败: {e}")
                self._init_index()
                self.documents = []
                return False
        else:
            print("未找到已有索引,创建新索引")
            self._init_index()
            self.documents = []
            return False

    def save_index(self):
        """Persist the FAISS index and document metadata to disk."""
        self._load_faiss()
        if self.index is not None:
            # Make sure the target directory exists.
            os.makedirs(os.path.dirname(self.index_file), exist_ok=True)
            # Same Windows non-ASCII path workaround as load_index(): write to
            # an ASCII temp path, then move into place.
            import tempfile
            import shutil
            try:
                with tempfile.NamedTemporaryFile(delete=False, suffix='.index') as tmp:
                    tmp_path = tmp.name
                self.faiss.write_index(self.index, tmp_path)
                shutil.move(tmp_path, self.index_file)
            except Exception as e:
                print(f"临时文件方式失败,尝试直接写入: {e}")
                self.faiss.write_index(self.index, self.index_file)
            with open(self.docs_file, 'w', encoding='utf-8') as f:
                json.dump(self.documents, f, ensure_ascii=False, indent=2)
            print(f"索引已保存,共 {len(self.documents)} 个文档块")

    def add_documents(self, chunks, metadata=None):
        """Embed *chunks* and append them (with *metadata*) to the index.

        Returns:
            Number of chunks added (0 for empty input). The index is saved
            immediately so a crash never loses embedded work.
        """
        if not chunks:
            return 0
        self._load_faiss()
        self._init_index()
        # Embed every chunk via Ollama.
        print(f"正在为 {len(chunks)} 个文本块生成向量...")
        embeddings = self._embed_batch(chunks)
        # Adapt to the model's actual output dimension on first contact.
        if embeddings and len(embeddings[0]) != self.dimension:
            self.dimension = len(embeddings[0])
            self.index = self.faiss.IndexFlatIP(self.dimension)
            print(f"更新向量维度为: {self.dimension}")
        # L2-normalize so inner product equals cosine similarity.
        embeddings_np = np.array(embeddings).astype('float32')
        norms = np.linalg.norm(embeddings_np, axis=1, keepdims=True)
        embeddings_np = embeddings_np / (norms + 1e-10)
        # Append the vectors to FAISS and the parallel documents list.
        start_idx = len(self.documents)
        self.index.add(embeddings_np)
        for i, chunk in enumerate(chunks):
            self.documents.append({
                'id': start_idx + i,
                'content': chunk,
                'metadata': metadata or {}
            })
        # Persist immediately.
        self.save_index()
        return len(chunks)

    def search(self, query, top_k=None):
        """Return up to *top_k* chunks most similar to *query*."""
        if top_k is None:
            top_k = TOP_K
        if self.index is None or self.index.ntotal == 0:
            return []
        self._load_faiss()
        # Embed and normalize the query the same way as stored chunks.
        query_embedding = self._embed_with_ollama(query)
        query_np = np.array([query_embedding]).astype('float32')
        norm = np.linalg.norm(query_np)
        query_np = query_np / (norm + 1e-10)
        # FAISS inner-product search.
        k = min(top_k, self.index.ntotal)
        scores, indices = self.index.search(query_np, k)
        # Map row indices back to documents (-1 marks absent results).
        results = []
        for i, idx in enumerate(indices[0]):
            if idx < len(self.documents) and idx >= 0:
                doc = self.documents[idx]
                results.append({
                    'content': doc['content'],
                    'score': float(scores[0][i]),
                    'metadata': doc.get('metadata', {})
                })
        return results

    def delete_by_filename(self, filename):
        """Remove all chunks of *filename*, rebuilding the index from survivors.

        NOTE: survivors are re-embedded from scratch (IndexFlatIP supports no
        row deletion), which can be slow for large indexes.

        Returns:
            Number of chunks removed.
        """
        if not self.documents:
            return 0
        self._load_faiss()
        # Partition chunks into keep / drop.
        remaining_docs = []
        deleted_count = 0
        for doc in self.documents:
            if doc.get('metadata', {}).get('filename') != filename:
                remaining_docs.append(doc)
            else:
                deleted_count += 1
        if deleted_count > 0:
            # Rebuild the index from the surviving chunks.
            self.documents = []
            self.index = self.faiss.IndexFlatIP(self.dimension)
            if remaining_docs:
                chunks = [doc['content'] for doc in remaining_docs]
                metadatas = [doc.get('metadata', {}) for doc in remaining_docs]
                embeddings = self._embed_batch(chunks)
                embeddings_np = np.array(embeddings).astype('float32')
                norms = np.linalg.norm(embeddings_np, axis=1, keepdims=True)
                embeddings_np = embeddings_np / (norms + 1e-10)
                self.index.add(embeddings_np)
                for i, (chunk, meta) in enumerate(zip(chunks, metadatas)):
                    self.documents.append({
                        'id': i,
                        'content': chunk,
                        'metadata': meta
                    })
            self.save_index()
        return deleted_count

    def clear(self):
        """Drop every vector and document, persisting the empty state."""
        self._load_faiss()
        self.index = self.faiss.IndexFlatIP(self.dimension)
        self.documents = []
        self.save_index()
        print("索引已清空")

    def get_stats(self):
        """Summarize the index: chunk count, file count, and file names."""
        files = set()
        for doc in self.documents:
            filename = doc.get('metadata', {}).get('filename')
            if filename:
                files.add(filename)
        return {
            'total_chunks': len(self.documents),
            'total_files': len(files),
            'files': list(files)
        }
# Module-level singleton shared by the knowledge service and batch indexer.
vector_store = VectorStore()

View File

@ -0,0 +1,211 @@
package com.ddnai.web.controller.psychology;
import com.ddnai.common.core.controller.BaseController;
import com.ddnai.common.core.domain.AjaxResult;
import com.ddnai.system.rag.client.PythonRagClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* RAG 知识库管理Controller (调用Python服务)
*
* @author ddnai
*/
@RestController
@RequestMapping("/psychology/rag")
public class PsyRagController extends BaseController {
private static final Logger log = LoggerFactory.getLogger(PsyRagController.class);
@Autowired
private PythonRagClient pythonRagClient;
/**
 * Report whether the Python RAG service is reachable, plus its stats when it is.
 */
@GetMapping("/status")
public AjaxResult checkStatus() {
    Map<String, Object> result = new HashMap<>();
    boolean available = pythonRagClient.isAvailable();
    result.put("available", available);
    result.put("message", available ? "Python RAG 服务运行中" : "Python RAG 服务未启动");
    if (available) {
        result.put("stats", pythonRagClient.getStats());
    }
    return AjaxResult.success(result);
}
/**
 * Upload a document to the Python RAG service for indexing.
 */
@PreAuthorize("@ss.hasPermi('psychology:knowledge:upload')")
@PostMapping("/upload")
public AjaxResult uploadDocument(@RequestParam("file") MultipartFile file) {
    try {
        if (!pythonRagClient.isAvailable()) {
            return AjaxResult.error("Python RAG 服务未启动,请先启动 rag-service");
        }
        Map<String, Object> response = pythonRagClient.uploadDocument(file);
        // The Python service reports success/error in its JSON envelope.
        if (!Boolean.TRUE.equals(response.get("success"))) {
            return AjaxResult.error("上传失败: " + response.get("error"));
        }
        return AjaxResult.success("文档上传成功", response.get("data"));
    } catch (Exception e) {
        log.error("文档上传失败", e);
        return AjaxResult.error("文档上传失败:" + e.getMessage());
    }
}
/**
 * List every document currently indexed by the Python RAG service.
 */
@PreAuthorize("@ss.hasPermi('psychology:knowledge:list')")
@GetMapping("/documents")
public AjaxResult listDocuments() {
    try {
        if (!pythonRagClient.isAvailable()) {
            return AjaxResult.error("Python RAG 服务未启动");
        }
        List<Map<String, Object>> docs = pythonRagClient.listDocuments();
        return AjaxResult.success(docs);
    } catch (Exception e) {
        log.error("获取文档列表失败", e);
        return AjaxResult.error("获取文档列表失败:" + e.getMessage());
    }
}
/**
 * Delete a document (by file name) from the knowledge base.
 *
 * Bug fix: the mapping's URI template variable was garbled; it must be
 * "{filename}" so Spring binds it to the @PathVariable parameter.
 */
@PreAuthorize("@ss.hasPermi('psychology:knowledge:remove')")
@DeleteMapping("/documents/{filename}")
public AjaxResult deleteDocument(@PathVariable String filename) {
    try {
        if (!pythonRagClient.isAvailable()) {
            return AjaxResult.error("Python RAG 服务未启动");
        }
        Map<String, Object> result = pythonRagClient.deleteDocument(filename);
        if (Boolean.TRUE.equals(result.get("success"))) {
            return AjaxResult.success("删除成功");
        } else {
            return AjaxResult.error("删除失败: " + result.get("error"));
        }
    } catch (Exception e) {
        log.error("删除文档失败", e);
        return AjaxResult.error("删除文档失败:" + e.getMessage());
    }
}
/**
 * Semantic search over the knowledge base via the Python RAG service.
 */
@PreAuthorize("@ss.hasPermi('psychology:knowledge:list')")
@PostMapping("/search")
public AjaxResult search(@RequestBody Map<String, Object> params) {
    try {
        if (!pythonRagClient.isAvailable()) {
            return AjaxResult.error("Python RAG 服务未启动");
        }
        // Extract the query and an optional topK (defaults to 5 hits).
        String query = (String) params.get("query");
        Integer topK = params.get("topK") != null ? (Integer) params.get("topK") : 5;
        if (query == null || query.trim().isEmpty()) {
            return AjaxResult.error("查询内容不能为空");
        }
        List<Map<String, Object>> matches = pythonRagClient.search(query, topK);
        return AjaxResult.success(matches);
    } catch (Exception e) {
        log.error("搜索失败", e);
        return AjaxResult.error("搜索失败:" + e.getMessage());
    }
}
/**
 * Report knowledge-base statistics plus an "available" flag.
 *
 * Unlike the other endpoints, an unreachable Python service is NOT an
 * error here: the UI still needs a 200 payload to render the offline state.
 *
 * @return AjaxResult whose data always contains an "available" boolean
 */
@PreAuthorize("@ss.hasPermi('psychology:knowledge:list')")
@GetMapping("/stats")
public AjaxResult getStats() {
    try {
        if (!pythonRagClient.isAvailable()) {
            Map<String, Object> offline = new HashMap<>();
            offline.put("available", false);
            offline.put("message", "Python RAG 服务未启动");
            return AjaxResult.success(offline);
        }
        Map<String, Object> stats = pythonRagClient.getStats();
        stats.put("available", true);
        return AjaxResult.success(stats);
    } catch (Exception e) {
        log.error("获取统计信息失败", e);
        return AjaxResult.error("获取统计信息失败:" + e.getMessage());
    }
}
/**
 * Ask the Python RAG service to rebuild its entire vector index.
 *
 * @return AjaxResult with the rebuild summary on success
 */
@PreAuthorize("@ss.hasPermi('psychology:knowledge:rebuild')")
@PostMapping("/rebuild")
public AjaxResult rebuildIndex() {
    try {
        if (!pythonRagClient.isAvailable()) {
            return AjaxResult.error("Python RAG 服务未启动");
        }
        Map<String, Object> resp = pythonRagClient.rebuildIndex();
        if (!Boolean.TRUE.equals(resp.get("success"))) {
            return AjaxResult.error("重建失败: " + resp.get("error"));
        }
        return AjaxResult.success("索引重建成功", resp.get("data"));
    } catch (Exception e) {
        log.error("重建索引失败", e);
        return AjaxResult.error("重建索引失败:" + e.getMessage());
    }
}
/**
 * Trigger a scan of the knowledge_docs folder so files dropped there
 * by hand get indexed.
 *
 * @return AjaxResult with the scan summary (e.g. number of newly indexed files)
 */
@PreAuthorize("@ss.hasPermi('psychology:knowledge:upload')")
@PostMapping("/scan")
public AjaxResult scanFolder() {
    try {
        if (!pythonRagClient.isAvailable()) {
            return AjaxResult.error("Python RAG 服务未启动");
        }
        Map<String, Object> resp = pythonRagClient.scanFolder();
        if (!Boolean.TRUE.equals(resp.get("success"))) {
            return AjaxResult.error("扫描失败: " + resp.get("error"));
        }
        return AjaxResult.success("扫描完成", resp.get("data"));
    } catch (Exception e) {
        log.error("扫描文件夹失败", e);
        return AjaxResult.error("扫描文件夹失败:" + e.getMessage());
    }
}
}

View File

@ -143,40 +143,38 @@ xss:
# RAG知识库配置
rag:
# 使用模式: openai(外部API)、ollama(本地) 或 hybrid(混合模式)
mode: hybrid
# Python RAG 服务配置(主要使用这个)
python:
url: http://localhost:5000
enabled: true
# OpenAI兼容API配置用于文本生成
# 禁用 Java 端的 RAG 功能,全部由 Python 服务处理
# 使用模式: disabled(禁用Java端)、python(仅Python)
mode: disabled
# OpenAI兼容API配置仅用于AI报告生成不用于RAG
openai:
# Kimi API (Moonshot) - 你现有的API
base-url: https://api.moonshot.cn/v1
# 你的Kimi API Key
api-key: sk-U9fdriPxwBcrpWW0Ite3N0eVtX7VxnqqqYUIBAdWd1hgEA9m
# 嵌入模型混合模式下不使用由Ollama提供
embed-model: BAAI/bge-large-zh-v1.5
# 生成模型用于AI报告生成
embed-model: none
generate-model: moonshot-v1-32k
# 连接超时时间(秒)
connect-timeout: 30
# 读取超时时间(秒)
read-timeout: 300
connect-timeout: 10
read-timeout: 60
# Ollama配置于本地嵌入
# Ollama配置禁用
ollama:
url: http://localhost:11434
# 嵌入模型(已下载)
embed-model: nomic-embed-text
# 生成模型混合模式下不使用由OpenAI API提供
generate-model: deepseek-r1:32b
# 连接超时时间(秒)
connect-timeout: 30
# 读取超时时间(秒)
read-timeout: 300
embed-model: none
generate-model: none
connect-timeout: 5
read-timeout: 30
enabled: false
# ChromaDB配置本地部署,可选
# ChromaDB配置禁用
chromadb:
url: http://localhost:8000
collection: psychology_knowledge
enabled: false
# 存储配置
storage:
@ -184,9 +182,9 @@ rag:
log-path: D:/wwwroot/RAG/logs
chroma-data-path: D:/wwwroot/RAG/data/chroma_db
# 文件监听配置
# 文件监听配置(禁用)
file-watcher:
enabled: false # 默认关闭,避免自动处理
enabled: false
watch-path: D:/wwwroot/RAG/uploads
scan-interval: 10

View File

@ -0,0 +1,3 @@
Application Version: ${ruoyi.version}
Spring Boot Version: ${spring-boot.version}
// AI心理健康测评系统 永不宕机 永无BUG //

View File

@ -40,22 +40,22 @@ public class ChromaDBClient {
this.baseUrl = ragProperties.getChromadb().getUrl();
this.collectionName = ragProperties.getChromadb().getCollection();
// 创建OkHttpClient实例
// 创建OkHttpClient实例使用较短的超时时间
this.httpClient = new OkHttpClient.Builder()
.connectTimeout(ragProperties.getChromadb().getConnectTimeout(), TimeUnit.SECONDS)
.readTimeout(ragProperties.getChromadb().getReadTimeout(), TimeUnit.SECONDS)
.writeTimeout(ragProperties.getChromadb().getConnectTimeout(), TimeUnit.SECONDS)
.retryOnConnectionFailure(true)
.connectTimeout(5, TimeUnit.SECONDS)
.readTimeout(10, TimeUnit.SECONDS)
.writeTimeout(5, TimeUnit.SECONDS)
.retryOnConnectionFailure(false)
.build();
log.info("ChromaDBClient initialized with base URL: {}, collection: {}", baseUrl, collectionName);
// 尝试创建集合如果不存在
try {
ensureCollectionExists();
} catch (IOException e) {
log.warn("Failed to ensure collection exists: {}", e.getMessage());
}
// 不在启动时尝试连接延迟到首次使用时
// try {
// ensureCollectionExists();
// } catch (IOException e) {
// log.warn("Failed to ensure collection exists: {}", e.getMessage());
// }
}
/**

View File

@ -41,12 +41,12 @@ public class OllamaClient {
public void init() {
this.baseUrl = ragProperties.getOllama().getUrl();
// 创建OkHttpClient实例
// 创建OkHttpClient实例使用较短的超时时间避免启动阻塞
this.httpClient = new OkHttpClient.Builder()
.connectTimeout(ragProperties.getOllama().getConnectTimeout(), TimeUnit.SECONDS)
.readTimeout(ragProperties.getOllama().getReadTimeout(), TimeUnit.SECONDS)
.writeTimeout(ragProperties.getOllama().getConnectTimeout(), TimeUnit.SECONDS)
.retryOnConnectionFailure(true)
.connectTimeout(5, TimeUnit.SECONDS)
.readTimeout(30, TimeUnit.SECONDS)
.writeTimeout(10, TimeUnit.SECONDS)
.retryOnConnectionFailure(false)
.build();
log.info("OllamaClient initialized with base URL: {}", baseUrl);

View File

@ -0,0 +1,239 @@
package com.ddnai.system.rag.client;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.http.*;
import org.springframework.stereotype.Component;
import org.springframework.util.LinkedMultiValueMap;
import org.springframework.util.MultiValueMap;
import org.springframework.web.client.RestTemplate;
import org.springframework.web.multipart.MultipartFile;
import javax.annotation.PostConstruct;
import java.util.*;
/**
 * HTTP client for the standalone Python RAG knowledge-base service
 * (the Flask app, default http://localhost:5000).
 *
 * Contract visible in this class: every public method swallows its own
 * exceptions and returns a neutral value (false / empty list / empty map /
 * error map) instead of propagating, so callers never see transport failures.
 */
@Component
public class PythonRagClient {

    private static final Logger log = LoggerFactory.getLogger(PythonRagClient.class);

    // Base URL of the Python service; overridable via rag.python.url.
    @Value("${rag.python.url:http://localhost:5000}")
    private String pythonServiceUrl;

    // Master switch: when false, isAvailable() always reports false,
    // which effectively makes every dependent endpoint refuse to run.
    @Value("${rag.python.enabled:true}")
    private boolean enabled;

    private final RestTemplate restTemplate;
    private final ObjectMapper objectMapper;

    public PythonRagClient() {
        // NOTE(review): a default RestTemplate has no connect/read timeouts,
        // so a hung Python service can block callers indefinitely — confirm
        // this is acceptable or configure a timeout-bearing request factory.
        this.restTemplate = new RestTemplate();
        this.objectMapper = new ObjectMapper();
    }

    @PostConstruct
    public void init() {
        log.info("Python RAG Client initialized, URL: {}, Enabled: {}", pythonServiceUrl, enabled);
    }

    /**
     * Probe GET /api/health on the Python service.
     *
     * @return true only when the client is enabled and the health endpoint answers 200
     */
    public boolean isAvailable() {
        if (!enabled) {
            return false;
        }
        try {
            String url = pythonServiceUrl + "/api/health";
            ResponseEntity<String> response = restTemplate.getForEntity(url, String.class);
            return response.getStatusCode() == HttpStatus.OK;
        } catch (Exception e) {
            // Connection refused / timeout simply means "not available".
            log.warn("Python RAG service not available: {}", e.getMessage());
            return false;
        }
    }

    /**
     * Upload a document via POST /api/documents/upload (multipart/form-data).
     *
     * @param file the document to index
     * @return the parsed JSON response, or an error map on any failure
     */
    public Map<String, Object> uploadDocument(MultipartFile file) {
        try {
            String url = pythonServiceUrl + "/api/documents/upload";
            HttpHeaders headers = new HttpHeaders();
            headers.setContentType(MediaType.MULTIPART_FORM_DATA);
            MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
            // ByteArrayResource reports no filename by default; override it so
            // the multipart part carries the original file name to Flask.
            body.add("file", new ByteArrayResource(file.getBytes()) {
                @Override
                public String getFilename() {
                    return file.getOriginalFilename();
                }
            });
            HttpEntity<MultiValueMap<String, Object>> requestEntity = new HttpEntity<>(body, headers);
            ResponseEntity<String> response = restTemplate.postForEntity(url, requestEntity, String.class);
            return parseResponse(response.getBody());
        } catch (Exception e) {
            log.error("Failed to upload document to Python service: {}", e.getMessage());
            return errorResult("上传失败: " + e.getMessage());
        }
    }

    /**
     * Fetch the indexed-document list via GET /api/documents.
     *
     * @return the "data" list from the response, or an empty list on any failure
     */
    public List<Map<String, Object>> listDocuments() {
        try {
            String url = pythonServiceUrl + "/api/documents";
            ResponseEntity<String> response = restTemplate.getForEntity(url, String.class);
            Map<String, Object> result = parseResponse(response.getBody());
            if (Boolean.TRUE.equals(result.get("success"))) {
                Object data = result.get("data");
                if (data instanceof List) {
                    // Unchecked cast: trusted to be a list of JSON objects from our own service.
                    return (List<Map<String, Object>>) data;
                }
            }
            return new ArrayList<>();
        } catch (Exception e) {
            log.error("Failed to list documents: {}", e.getMessage());
            return new ArrayList<>();
        }
    }

    /**
     * Delete one document via DELETE /api/documents/{filename}.
     *
     * NOTE(review): filename is concatenated into the URI unencoded; names
     * containing spaces, '%', '{' or '}' may break RestTemplate's URI-template
     * expansion — confirm callers pass safe names or add encoding here.
     *
     * @param filename document file name on the Python side
     * @return a success map, or an error map when the call throws
     */
    public Map<String, Object> deleteDocument(String filename) {
        try {
            String url = pythonServiceUrl + "/api/documents/" + filename;
            restTemplate.delete(url);
            return successResult("删除成功");
        } catch (Exception e) {
            log.error("Failed to delete document: {}", e.getMessage());
            return errorResult("删除失败: " + e.getMessage());
        }
    }

    /**
     * Semantic search via POST /api/search.
     *
     * @param query free-text query
     * @param topK  number of hits requested (sent as snake_case "top_k")
     * @return the "data" hit list, or an empty list on any failure
     */
    public List<Map<String, Object>> search(String query, int topK) {
        try {
            String url = pythonServiceUrl + "/api/search";
            HttpHeaders headers = new HttpHeaders();
            headers.setContentType(MediaType.APPLICATION_JSON);
            Map<String, Object> body = new HashMap<>();
            body.put("query", query);
            // The Python API expects snake_case "top_k", not "topK".
            body.put("top_k", topK);
            HttpEntity<Map<String, Object>> requestEntity = new HttpEntity<>(body, headers);
            ResponseEntity<String> response = restTemplate.postForEntity(url, requestEntity, String.class);
            Map<String, Object> result = parseResponse(response.getBody());
            if (Boolean.TRUE.equals(result.get("success"))) {
                Object data = result.get("data");
                if (data instanceof List) {
                    return (List<Map<String, Object>>) data;
                }
            }
            return new ArrayList<>();
        } catch (Exception e) {
            log.error("Failed to search: {}", e.getMessage());
            return new ArrayList<>();
        }
    }

    /**
     * Fetch index statistics via GET /api/stats.
     *
     * @return the "data" map from the response, or an empty map on any failure
     */
    public Map<String, Object> getStats() {
        try {
            String url = pythonServiceUrl + "/api/stats";
            ResponseEntity<String> response = restTemplate.getForEntity(url, String.class);
            Map<String, Object> result = parseResponse(response.getBody());
            if (Boolean.TRUE.equals(result.get("success"))) {
                // Unchecked cast: "data" is expected to be a JSON object; would
                // throw (and be logged below) if the service returned a list.
                return (Map<String, Object>) result.get("data");
            }
            return new HashMap<>();
        } catch (Exception e) {
            log.error("Failed to get stats: {}", e.getMessage());
            return new HashMap<>();
        }
    }

    /**
     * Trigger a full index rebuild via POST /api/rebuild (empty JSON body).
     *
     * @return the parsed JSON response, or an error map on failure
     */
    public Map<String, Object> rebuildIndex() {
        try {
            String url = pythonServiceUrl + "/api/rebuild";
            HttpHeaders headers = new HttpHeaders();
            headers.setContentType(MediaType.APPLICATION_JSON);
            HttpEntity<String> requestEntity = new HttpEntity<>("{}", headers);
            ResponseEntity<String> response = restTemplate.postForEntity(url, requestEntity, String.class);
            return parseResponse(response.getBody());
        } catch (Exception e) {
            log.error("Failed to rebuild index: {}", e.getMessage());
            return errorResult("重建索引失败: " + e.getMessage());
        }
    }

    /**
     * Trigger a knowledge-folder scan via POST /api/scan (empty JSON body).
     *
     * @return the parsed JSON response, or an error map on failure
     */
    public Map<String, Object> scanFolder() {
        try {
            String url = pythonServiceUrl + "/api/scan";
            HttpHeaders headers = new HttpHeaders();
            headers.setContentType(MediaType.APPLICATION_JSON);
            HttpEntity<String> requestEntity = new HttpEntity<>("{}", headers);
            ResponseEntity<String> response = restTemplate.postForEntity(url, requestEntity, String.class);
            return parseResponse(response.getBody());
        } catch (Exception e) {
            log.error("Failed to scan folder: {}", e.getMessage());
            return errorResult("扫描失败: " + e.getMessage());
        }
    }

    // Parse a raw JSON body into a map; a null/unparseable body yields an
    // error map rather than an exception.
    private Map<String, Object> parseResponse(String json) {
        try {
            return objectMapper.readValue(json, Map.class);
        } catch (Exception e) {
            return errorResult("解析响应失败");
        }
    }

    // Build the canonical {success: true, message: ...} payload.
    private Map<String, Object> successResult(String message) {
        Map<String, Object> result = new HashMap<>();
        result.put("success", true);
        result.put("message", message);
        return result;
    }

    // Build the canonical {success: false, error: ...} payload.
    private Map<String, Object> errorResult(String error) {
        Map<String, Object> result = new HashMap<>();
        result.put("success", false);
        result.put("error", error);
        return result;
    }
}

View File

@ -51,6 +51,15 @@ public class AIServiceAdapter {
@PostConstruct
public void init() {
// 检查是否禁用 Java RAG
if ("disabled".equalsIgnoreCase(mode) || "python".equalsIgnoreCase(mode)) {
log.info("RAG服务模式: 已禁用Java端RAG使用Python服务");
useOpenAI = false;
useSimpleStore = true;
isHybridMode = false;
return;
}
isHybridMode = "hybrid".equalsIgnoreCase(mode);
useOpenAI = "openai".equalsIgnoreCase(mode) || isHybridMode;
useSimpleStore = true; // 默认使用简单存储

View File

@ -95,7 +95,7 @@ public class KnowledgeService {
List<String> chunks = textSplitter.split(text);
log.info("Split document into {} chunks", chunks.size());
// 5. 向量化 - 添加异常处理
// 5. 向量化
List<float[]> embeddings = null;
boolean vectorizationSuccess = false;
@ -105,7 +105,7 @@ public class KnowledgeService {
log.info("Successfully generated {} embeddings", embeddings.size());
vectorizationSuccess = true;
} catch (Exception e) {
log.error("Failed to generate embeddings, document will be saved without vectors: {}", e.getMessage(), e);
log.error("Failed to generate embeddings: {}", e.getMessage());
// 继续处理文档仍然会被保存只是没有向量
}
@ -132,17 +132,17 @@ public class KnowledgeService {
ids.add(docId + "_chunk_" + i);
}
// 7. 存储到ChromaDB - 只有向量化成功才存储
// 7. 存储向量 - 使用AIServiceAdapter自动选择SimpleVectorStore或ChromaDB
if (vectorizationSuccess && embeddings != null) {
try {
chromaDBClient.addDocuments(chunks, embeddings, metadatas, ids);
log.info("Stored {} chunks to ChromaDB", chunks.size());
aiServiceAdapter.addDocuments(chunks, embeddings, metadatas, ids);
log.info("Stored {} chunks to vector store", chunks.size());
} catch (Exception e) {
log.warn("Failed to store to ChromaDB: {}", e.getMessage());
log.warn("Failed to store to vector store: {}", e.getMessage());
// 继续处理文档信息仍然会被保存
}
} else {
log.info("Skipped ChromaDB storage (vectorization failed or disabled)");
log.info("Skipped vector storage (vectorization failed)");
}
// 8. 更新文档索引

View File

@ -0,0 +1,2 @@
User-agent: *
Disallow: /

View File

@ -1,5 +1,66 @@
import request from '@/utils/request'
// ========== Python RAG 服务 API ==========
// Check whether the Python RAG service is up (response includes stats when it is).
export function checkRagStatus() {
  const options = { url: '/psychology/rag/status', method: 'get' }
  return request(options)
}
// Fetch the list of documents indexed in the RAG knowledge base.
export function listRagDocuments() {
  const options = { url: '/psychology/rag/documents', method: 'get' }
  return request(options)
}
// Delete one indexed document by file name (encoded for the URL path).
export function delRagDocument(filename) {
  const target = '/psychology/rag/documents/' + encodeURIComponent(filename)
  return request({ url: target, method: 'delete' })
}
// Run a semantic search against the knowledge base; topK defaults to 5 hits.
export function ragSearch(query, topK = 5) {
  const payload = { query, topK }
  return request({ url: '/psychology/rag/search', method: 'post', data: payload })
}
// Fetch knowledge-base statistics (file/chunk counts etc.).
export function getRagStats() {
  const options = { url: '/psychology/rag/stats', method: 'get' }
  return request(options)
}
// Trigger a full rebuild of the RAG vector index.
export function rebuildRagIndex() {
  const options = { url: '/psychology/rag/rebuild', method: 'post' }
  return request(options)
}
// Ask the service to scan the knowledge_docs folder for new files to index.
export function scanRagFolder() {
  const options = { url: '/psychology/rag/scan', method: 'post' }
  return request(options)
}
// ========== 原有 API (保留兼容) ==========
// 查询知识库文档列表
export function listDocuments(query) {
return request({

View File

@ -1155,6 +1155,14 @@ export default {
.options-container {
margin-top: 20px;
width: 100%;
}
/* 确保radio-group和checkbox-group占满宽度 */
.options-container .el-radio-group,
.options-container .el-checkbox-group {
width: 100%;
display: block;
}
.option-item {
@ -1167,6 +1175,9 @@ export default {
gap: 15px;
border-radius: 4px;
margin-bottom: 2px;
width: 100%;
box-sizing: border-box;
min-height: 48px;
}
.option-item:hover {
@ -1209,7 +1220,27 @@ export default {
.option-content {
flex: 1;
pointer-events: none;
width: 100%;
min-width: 0;
}
/* 让el-radio和el-checkbox占满整行整行可点击 */
.option-content .el-radio,
.option-content .el-checkbox {
display: flex;
width: 100%;
margin-right: 0;
padding-right: 20px;
white-space: normal;
align-items: flex-start;
}
.option-content .el-radio__label,
.option-content .el-checkbox__label {
flex: 1;
white-space: normal;
line-height: 1.5;
padding-left: 10px;
}
.option-tts-btn {

View File

@ -1,5 +1,17 @@
<template>
<div class="app-container">
<!-- 服务状态提示 -->
<el-alert
v-if="!serviceAvailable"
title="Python RAG 服务未启动"
type="warning"
description="请先运行 rag-service/start.bat 启动知识库服务"
show-icon
:closable="false"
style="margin-bottom: 20px;"
>
</el-alert>
<el-row :gutter="20">
<!-- 统计卡片 -->
<el-col :span="6">
@ -7,7 +19,7 @@
<div class="stat-content">
<i class="el-icon-document stat-icon"></i>
<div class="stat-info">
<div class="stat-value">{{ statistics.documentCount || 0 }}</div>
<div class="stat-value">{{ statistics.total_files || 0 }}</div>
<div class="stat-label">文档数量</div>
</div>
</div>
@ -18,7 +30,7 @@
<div class="stat-content">
<i class="el-icon-files stat-icon"></i>
<div class="stat-info">
<div class="stat-value">{{ statistics.chunkCount || 0 }}</div>
<div class="stat-value">{{ statistics.total_chunks || 0 }}</div>
<div class="stat-label">知识片段</div>
</div>
</div>
@ -27,10 +39,10 @@
<el-col :span="6">
<el-card class="stat-card">
<div class="stat-content">
<i class="el-icon-connection stat-icon"></i>
<i class="el-icon-connection stat-icon" :style="{ color: serviceAvailable ? '#67C23A' : '#F56C6C' }"></i>
<div class="stat-info">
<div class="stat-value">{{ statistics.aiStatus || '未知' }}</div>
<div class="stat-label">AI服务</div>
<div class="stat-value">{{ serviceAvailable ? '运行中' : '未启动' }}</div>
<div class="stat-label">RAG服务</div>
</div>
</div>
</el-card>
@ -38,10 +50,10 @@
<el-col :span="6">
<el-card class="stat-card">
<div class="stat-content">
<i class="el-icon-data-line stat-icon"></i>
<i class="el-icon-folder-opened stat-icon"></i>
<div class="stat-info">
<div class="stat-value">{{ statistics.vectorStatus || '未知' }}</div>
<div class="stat-label">向量存储</div>
<div class="stat-value" style="font-size: 14px;">knowledge_docs</div>
<div class="stat-label">知识库文件夹</div>
</div>
</div>
</el-card>
@ -56,8 +68,19 @@
icon="el-icon-upload"
size="mini"
@click="handleUpload"
:disabled="!serviceAvailable"
>上传文档</el-button>
</el-col>
<el-col :span="1.5">
<el-button
type="warning"
icon="el-icon-folder"
size="mini"
@click="handleScan"
:loading="scanLoading"
:disabled="!serviceAvailable"
>扫描文件夹</el-button>
</el-col>
<el-col :span="1.5">
<el-button
type="success"
@ -65,50 +88,65 @@
size="mini"
@click="handleRebuild"
:loading="rebuildLoading"
:disabled="!serviceAvailable"
>重建索引</el-button>
</el-col>
<el-col :span="1.5">
<el-button
type="danger"
icon="el-icon-delete"
size="mini"
@click="handleClearAll"
>清空知识库</el-button>
</el-col>
<el-col :span="1.5">
<el-button
type="info"
icon="el-icon-refresh"
size="mini"
@click="getList"
>刷新</el-button>
@click="checkStatus"
>刷新状态</el-button>
</el-col>
</el-row>
<!-- 搜索测试 -->
<el-card style="margin-top: 20px;" v-if="serviceAvailable">
<div slot="header">
<span>知识库搜索测试</span>
</div>
<el-row :gutter="10">
<el-col :span="18">
<el-input
v-model="searchQuery"
placeholder="输入问题测试知识库检索..."
@keyup.enter.native="handleSearch"
></el-input>
</el-col>
<el-col :span="6">
<el-button type="primary" @click="handleSearch" :loading="searchLoading">搜索</el-button>
</el-col>
</el-row>
<div v-if="searchResults.length > 0" style="margin-top: 15px;">
<div v-for="(result, index) in searchResults" :key="index" class="search-result">
<div class="result-header">
<span class="result-score">相关度: {{ (result.score * 100).toFixed(1) }}%</span>
<span class="result-source">来源: {{ getFilename(result) }}</span>
</div>
<div class="result-content">{{ result.content }}</div>
</div>
</div>
</el-card>
<!-- 文档列表 -->
<el-table v-loading="loading" :data="documentList" style="margin-top: 20px;">
<el-table-column label="文档名称" prop="filename" min-width="200" />
<el-table-column label="分类" prop="category" width="120">
<el-table-column label="片段数" prop="chunks" width="100" align="center" />
<el-table-column label="文件大小" prop="size" width="120" align="center">
<template slot-scope="scope">
<el-tag v-if="scope.row.category" size="small">{{ scope.row.category }}</el-tag>
<span v-else>-</span>
{{ formatFileSize(scope.row.size) }}
</template>
</el-table-column>
<el-table-column label="片段数" prop="chunkCount" width="100" align="center" />
<el-table-column label="文件大小" prop="fileSize" width="120" align="center">
<el-table-column label="状态" width="100" align="center">
<template slot-scope="scope">
{{ formatFileSize(scope.row.fileSize) }}
<el-tag :type="scope.row.exists ? 'success' : 'danger'" size="small">
{{ scope.row.exists ? '正常' : '文件丢失' }}
</el-tag>
</template>
</el-table-column>
<el-table-column label="上传时间" prop="uploadTime" width="180" align="center" />
<el-table-column label="操作" align="center" width="180" class-name="small-padding fixed-width">
<el-table-column label="操作" align="center" width="120" class-name="small-padding fixed-width">
<template slot-scope="scope">
<el-button
size="mini"
type="text"
icon="el-icon-view"
@click="handleView(scope.row)"
>查看</el-button>
<el-button
size="mini"
type="text"
@ -120,32 +158,12 @@
</el-table-column>
</el-table>
<pagination
v-show="total>0"
:total="total"
:page.sync="queryParams.pageNum"
:limit.sync="queryParams.pageSize"
@pagination="getList"
/>
<!-- 上传对话框 -->
<el-dialog title="上传文档" :visible.sync="uploadDialogVisible" width="600px" append-to-body>
<el-form ref="uploadForm" :model="uploadForm" label-width="100px">
<el-form-item label="文档分类">
<el-select v-model="uploadForm.category" placeholder="请选择分类" style="width: 100%;">
<el-option label="测评标准" value="assessment" />
<el-option label="理论知识" value="theory" />
<el-option label="案例分析" value="case" />
<el-option label="干预方法" value="intervention" />
<el-option label="其他" value="other" />
</el-select>
</el-form-item>
<el-form-item label="选择文件">
<el-upload
ref="upload"
:action="uploadUrl"
:headers="uploadHeaders"
:data="uploadForm"
:on-success="handleUploadSuccess"
:on-error="handleUploadError"
:before-upload="beforeUpload"
@ -153,33 +171,18 @@
:auto-upload="false"
accept=".txt,.pdf,.docx,.md"
drag
multiple
>
<i class="el-icon-upload"></i>
<div class="el-upload__text">将文件拖到此处<em>点击上传</em></div>
<div class="el-upload__tip" slot="tip">
支持格式txtpdfdocxmd单个文件不超过10MB
支持格式txtpdfdocxmd单个文件不超过10MB<br>
提示也可以直接将文件放入 rag-service/knowledge_docs 文件夹
</div>
</el-upload>
</el-form-item>
</el-form>
<div slot="footer" class="dialog-footer">
<el-button @click="uploadDialogVisible = false"> </el-button>
<el-button type="primary" @click="submitUpload" :loading="uploadLoading"> </el-button>
</div>
</el-dialog>
<!-- 查看对话框 -->
<el-dialog title="文档详情" :visible.sync="viewDialogVisible" width="800px" append-to-body>
<el-descriptions :column="2" border v-if="currentDocument">
<el-descriptions-item label="文档名称">{{ currentDocument.filename }}</el-descriptions-item>
<el-descriptions-item label="文档ID">{{ currentDocument.id }}</el-descriptions-item>
<el-descriptions-item label="分类">{{ currentDocument.category || '-' }}</el-descriptions-item>
<el-descriptions-item label="片段数">{{ currentDocument.chunkCount }}</el-descriptions-item>
<el-descriptions-item label="文件大小">{{ formatFileSize(currentDocument.fileSize) }}</el-descriptions-item>
<el-descriptions-item label="上传时间">{{ currentDocument.uploadTime }}</el-descriptions-item>
</el-descriptions>
<div slot="footer" class="dialog-footer">
<el-button @click="viewDialogVisible = false"> </el-button>
<el-button type="primary" @click="submitUpload" :loading="uploadLoading"> </el-button>
</div>
</el-dialog>
</div>
@ -187,84 +190,99 @@
<script>
import { getToken } from "@/utils/auth";
import { listDocuments, getDocument, delDocument, searchDocuments, rebuildIndex, getStatistics, clearKnowledge } from "@/api/psychology/knowledge";
import { checkRagStatus, listRagDocuments, delRagDocument, ragSearch, getRagStats, rebuildRagIndex, scanRagFolder } from "@/api/psychology/knowledge";
export default {
name: "KnowledgeManagement",
data() {
return {
//
serviceAvailable: false,
//
loading: false,
rebuildLoading: false,
scanLoading: false,
uploadLoading: false,
searchLoading: false,
//
statistics: {
documentCount: 0,
chunkCount: 0,
aiStatus: '检查中...',
vectorStatus: '检查中...'
total_files: 0,
total_chunks: 0
},
//
documentList: [],
total: 0,
//
queryParams: {
pageNum: 1,
pageSize: 10,
category: undefined
},
//
searchQuery: '',
searchResults: [],
//
uploadDialogVisible: false,
uploadForm: {
category: 'other'
},
uploadUrl: process.env.VUE_APP_BASE_API + "/psychology/knowledge/upload",
uploadUrl: process.env.VUE_APP_BASE_API + "/psychology/rag/upload",
uploadHeaders: {
Authorization: "Bearer " + getToken()
},
fileList: [],
//
viewDialogVisible: false,
currentDocument: null
fileList: []
};
},
created() {
this.getStatistics();
this.getList();
this.checkStatus();
},
methods: {
/** 检查服务状态 */
checkStatus() {
this.loading = true;
checkRagStatus().then(response => {
if (response.code === 200 && response.data) {
this.serviceAvailable = response.data.available || false;
if (response.data.stats) {
this.statistics = response.data.stats;
}
if (this.serviceAvailable) {
this.getList();
}
}
this.loading = false;
}).catch(() => {
this.serviceAvailable = false;
this.loading = false;
});
},
/** 查询文档列表 */
getList() {
if (!this.serviceAvailable) return;
this.loading = true;
listDocuments(this.queryParams).then(response => {
this.documentList = response.rows || [];
this.total = response.total || 0;
listRagDocuments().then(response => {
if (response.code === 200) {
this.documentList = response.data || [];
}
this.loading = false;
}).catch(() => {
this.loading = false;
});
},
/** 获取统计信息 */
getStatistics() {
getStatistics().then(response => {
if (response.code === 200) {
this.statistics = response.data;
/** 搜索 */
handleSearch() {
if (!this.searchQuery.trim()) {
this.$message.warning('请输入搜索内容');
return;
}
}).catch(error => {
console.error('获取统计信息失败:', error);
//
this.statistics = {
documentCount: 0,
chunkCount: 0,
aiStatus: '服务未启动',
vectorStatus: '服务未启动'
};
this.searchLoading = true;
ragSearch(this.searchQuery, 5).then(response => {
if (response.code === 200) {
this.searchResults = response.data || [];
if (this.searchResults.length === 0) {
this.$message.info('未找到相关内容');
}
}
this.searchLoading = false;
}).catch(() => {
this.searchLoading = false;
});
},
/** 上传按钮操作 */
handleUpload() {
this.uploadDialogVisible = true;
this.uploadForm = { category: 'other' };
this.fileList = [];
},
/** 上传前校验 */
@ -291,10 +309,9 @@ export default {
handleUploadSuccess(response, file, fileList) {
this.uploadLoading = false;
if (response.code === 200) {
this.$message.success('文档上传成功,已自动加入知识库');
this.$message.success('文档上传成功');
this.uploadDialogVisible = false;
this.getList();
this.getStatistics();
this.checkStatus();
} else {
this.$message.error(response.msg || '上传失败');
}
@ -302,22 +319,37 @@ export default {
/** 上传失败回调 */
handleUploadError(err, file, fileList) {
this.uploadLoading = false;
this.$message.error('上传失败:' + err);
this.$message.error('上传失败');
},
/** 扫描文件夹 */
handleScan() {
this.scanLoading = true;
scanRagFolder().then(response => {
this.scanLoading = false;
if (response.code === 200) {
const data = response.data || {};
this.$message.success(`扫描完成,新索引 ${data.indexed || 0} 个文件`);
this.checkStatus();
} else {
this.$message.error(response.msg || '扫描失败');
}
}).catch(() => {
this.scanLoading = false;
});
},
/** 重建索引 */
handleRebuild() {
this.$confirm('重建索引将重新处理所有文档,可能需要较长时间,是否继续?', '提示', {
this.$confirm('重建索引将重新处理所有文档,是否继续?', '提示', {
confirmButtonText: '确定',
cancelButtonText: '取消',
type: 'warning'
}).then(() => {
this.rebuildLoading = true;
rebuildIndex().then(response => {
rebuildRagIndex().then(response => {
this.rebuildLoading = false;
if (response.code === 200) {
this.$message.success('索引重建成功');
this.getList();
this.getStatistics();
this.checkStatus();
} else {
this.$message.error(response.msg || '重建失败');
}
@ -326,29 +358,6 @@ export default {
});
});
},
/** 清空知识库 */
handleClearAll() {
this.$confirm('此操作将清空所有知识库数据,是否继续?', '警告', {
confirmButtonText: '确定',
cancelButtonText: '取消',
type: 'error'
}).then(() => {
clearKnowledge().then(response => {
if (response.code === 200) {
this.$message.success('知识库已清空');
this.getList();
this.getStatistics();
} else {
this.$message.error(response.msg || '清空失败');
}
});
});
},
/** 查看详情 */
handleView(row) {
this.currentDocument = row;
this.viewDialogVisible = true;
},
/** 删除文档 */
handleDelete(row) {
this.$confirm('确认删除文档 "' + row.filename + '" 吗?', '提示', {
@ -356,11 +365,10 @@ export default {
cancelButtonText: '取消',
type: 'warning'
}).then(() => {
delDocument(row.id).then(response => {
delRagDocument(row.filename).then(response => {
if (response.code === 200) {
this.$message.success('删除成功');
this.getList();
this.getStatistics();
this.checkStatus();
} else {
this.$message.error(response.msg || '删除失败');
}
@ -373,6 +381,13 @@ export default {
if (bytes < 1024) return bytes + ' B';
if (bytes < 1024 * 1024) return (bytes / 1024).toFixed(2) + ' KB';
return (bytes / 1024 / 1024).toFixed(2) + ' MB';
},
/** 获取文件名 */
getFilename(result) {
if (result && result.metadata && result.metadata.filename) {
return result.metadata.filename;
}
return '未知';
}
}
};
@ -409,4 +424,33 @@ export default {
color: #909399;
margin-top: 5px;
}
.search-result {
background: #f5f7fa;
border-radius: 4px;
padding: 12px;
margin-bottom: 10px;
}
.result-header {
display: flex;
justify-content: space-between;
margin-bottom: 8px;
font-size: 12px;
}
.result-score {
color: #67C23A;
font-weight: bold;
}
.result-source {
color: #909399;
}
.result-content {
color: #606266;
line-height: 1.6;
white-space: pre-wrap;
}
</style>