# -*- coding: utf-8 -*-
"""
Index-merge script: merges knowledge-base indexes processed by multiple people.

Combines several ``index_data`` folders into a single unified index.

Usage:
    1. Rename each person's processed ``index_data`` folder and place it under
       the ``to_merge/`` directory, e.g. ``to_merge/index_data_张三/``,
       ``to_merge/index_data_李四/``.
    2. Run: ``python merge_index.py``
    3. The merged index is written to the ``index_data/`` directory.
"""
import os
import sys
import json
import shutil

# Make sibling modules (config, vector_store) importable when run as a script.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from config import INDEX_DIR, EMBEDDING_MODEL, OLLAMA_URL
from vector_store import vector_store

# Directory that holds the per-person index folders awaiting merge.
MERGE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'to_merge')


def load_documents_from_folder(folder_path):
    """Load document records from a folder's ``documents.json``.

    Args:
        folder_path: Path to one contributed index folder.

    Returns:
        The list under the ``"documents"`` key, or an empty list when the
        file is missing or unreadable (best-effort: errors are printed,
        never raised).
    """
    docs_file = os.path.join(folder_path, 'documents.json')
    if not os.path.exists(docs_file):
        print(f" 警告: {folder_path} 中没有 documents.json")
        return []
    try:
        with open(docs_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return data.get('documents', [])
    except Exception as e:
        # Keep going with the remaining folders even if one is corrupt.
        print(f" 错误: 读取 {docs_file} 失败: {e}")
        return []


def merge_indexes():
    """Merge all index folders found in ``to_merge/`` into ``INDEX_DIR``.

    Interactive: asks for confirmation before wiping the existing index,
    then deduplicates documents by filename, rebuilds the vector index
    file-by-file via ``vector_store``, and saves the result.
    """
    print("=" * 60)
    print("索引合并工具")
    print("=" * 60)

    # First run: create the staging directory and tell the user what to do.
    if not os.path.exists(MERGE_DIR):
        os.makedirs(MERGE_DIR)
        print(f"已创建合并目录: {MERGE_DIR}")
        print(f"请将各人处理好的 index_data 文件夹放到此目录中")
        print(f"例如: {MERGE_DIR}/index_data_张三/")
        return

    # Collect candidate folders: only directories containing documents.json.
    folders = []
    for name in os.listdir(MERGE_DIR):
        folder_path = os.path.join(MERGE_DIR, name)
        if os.path.isdir(folder_path):
            docs_file = os.path.join(folder_path, 'documents.json')
            if os.path.exists(docs_file):
                folders.append((name, folder_path))

    if not folders:
        print(f"在 {MERGE_DIR} 中没有找到有效的索引文件夹")
        print(f"请确保文件夹中包含 documents.json 文件")
        return

    print(f"找到 {len(folders)} 个待合并的索引:")
    for name, path in folders:
        print(f" - {name}")
    print()

    # Destructive operation ahead — require explicit confirmation.
    confirm = input("是否开始合并?这将覆盖现有的 index_data (y/n): ").strip().lower()
    if confirm != 'y':
        print("已取消")
        return

    # Gather documents across all folders, deduplicating by filename:
    # the first folder that contributes a file wins.
    all_documents = []
    all_files = set()
    print()
    print("正在收集文档...")
    for name, folder_path in folders:
        print(f" 处理: {name}")
        docs = load_documents_from_folder(folder_path)
        for doc in docs:
            filename = doc.get('filename', '')
            # Skip duplicate files already taken from an earlier folder.
            if filename and filename not in all_files:
                all_files.add(filename)
                all_documents.append(doc)
        print(f" 文档数: {len(docs)}, 累计: {len(all_documents)}")

    print()
    print(f"共收集 {len(all_documents)} 个文档块")
    print(f"来自 {len(all_files)} 个不同文件")
    print()

    # Wipe the existing index; it is rebuilt from scratch below.
    print("清空现有索引...")
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)
    os.makedirs(INDEX_DIR)

    print("重建向量索引...")
    print("这可能需要一些时间,请耐心等待...")
    print()

    # Reset the in-memory vector store before re-adding everything.
    vector_store.documents = []
    vector_store.index = None

    # Group document chunks by their source file so each file is
    # indexed as one add_documents() call.
    file_docs = {}
    for doc in all_documents:
        filename = doc.get('filename', 'unknown')
        if filename not in file_docs:
            file_docs[filename] = []
        file_docs[filename].append(doc)

    total_files = len(file_docs)
    processed = 0
    for filename, docs in file_docs.items():
        processed += 1
        # BUGFIX: the progress line printed the literal text "(unknown)"
        # instead of interpolating the current filename.
        print(f"[{processed}/{total_files}] 索引: {filename} ({len(docs)} 块)")
        # Extract the raw text chunks for this file.
        chunks = [doc.get('content', '') for doc in docs]
        metadata = {
            'filename': filename,
            'file_path': docs[0].get('file_path', ''),
            'char_count': sum(len(c) for c in chunks)
        }
        try:
            vector_store.add_documents(chunks, metadata)
        except Exception as e:
            # One failed file should not abort the whole merge.
            print(f" 错误: {e}")

    # Persist the rebuilt index.
    print()
    print("保存索引...")
    vector_store.save_index()

    # Report final statistics.
    stats = vector_store.get_stats()
    print()
    print("=" * 60)
    print("合并完成!")
    print("=" * 60)
    print(f"总文件数: {stats['total_files']}")
    print(f"总文本块: {stats['total_chunks']}")
    print(f"索引目录: {INDEX_DIR}")
    print()
    print("提示: 合并完成后可以删除 to_merge/ 目录中的文件")


if __name__ == '__main__':
    merge_indexes()