156 lines
4.6 KiB
Python
156 lines
4.6 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
批量索引脚本 - 用于处理大文件
|
|||
|
|
直接运行此脚本来索引 knowledge_docs 目录中的所有文件
|
|||
|
|
|
|||
|
|
使用方法:
|
|||
|
|
1. 将 PDF 文件放入 rag-python/knowledge_docs/ 目录
|
|||
|
|
2. 运行: python batch_index.py
|
|||
|
|
"""
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import time
|
|||
|
|
|
|||
|
|
# 添加当前目录到路径
|
|||
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|||
|
|
|
|||
|
|
from config import KNOWLEDGE_DIR, CHUNK_SIZE
|
|||
|
|
from document_parser import parse_document, is_supported_file
|
|||
|
|
from text_splitter import split_text
|
|||
|
|
from vector_store import vector_store
|
|||
|
|
|
|||
|
|
def format_time(seconds):
    """Render a duration (in seconds) as a human-readable string.

    Uses seconds below one minute, minutes below one hour, and hours
    otherwise, always with one decimal place.
    """
    minute, hour = 60, 3600
    if seconds >= hour:
        return f"{seconds/3600:.1f}小时"
    if seconds >= minute:
        return f"{seconds/60:.1f}分钟"
    return f"{seconds:.1f}秒"
|
|||
|
|
|
|||
|
|
def estimate_time(char_count, seconds_per_chunk=1.5):
    """Estimate the wall-clock time needed to index a document.

    Args:
        char_count: Number of characters of extracted text.
        seconds_per_chunk: Empirical embedding cost per chunk; defaults
            to the previously hard-coded ~1.5 s observed in practice.

    Returns:
        A human-readable duration string produced by format_time().
    """
    # One chunk per CHUNK_SIZE characters; fractional chunks are kept
    # for a smoother estimate rather than rounding up.
    chunks = char_count / CHUNK_SIZE
    return format_time(chunks * seconds_per_chunk)
|
|||
|
|
|
|||
|
|
def batch_index():
    """Index every new supported file found under KNOWLEDGE_DIR.

    Workflow: load the existing index, list supported files that are not
    yet indexed, ask the user for confirmation, then parse, chunk, embed
    and store each file while printing per-file and overall progress.
    One failing file does not abort the batch.
    """
    print("=" * 60)
    print("批量索引工具")
    print("=" * 60)
    print(f"知识库目录: {KNOWLEDGE_DIR}")
    print(f"分块大小: {CHUNK_SIZE} 字符")
    print()

    # Load the existing index so already-processed files are skipped.
    print("加载现有索引...")
    vector_store.load_index()
    stats = vector_store.get_stats()
    indexed_files = set(stats.get('files', []))
    print(f"已索引文件: {len(indexed_files)} 个")
    print()

    # Scan the knowledge directory for supported files not yet indexed.
    files_to_process = []
    for filename in os.listdir(KNOWLEDGE_DIR):
        file_path = os.path.join(KNOWLEDGE_DIR, filename)
        if os.path.isfile(file_path) and is_supported_file(filename):
            if filename not in indexed_files:
                file_size = os.path.getsize(file_path)
                files_to_process.append((filename, file_path, file_size))

    if not files_to_process:
        print("没有新文件需要索引。")
        print("如需重新索引,请先删除 index_data 目录中的文件。")
        return

    # Show the pending files and their sizes before asking to proceed.
    print(f"发现 {len(files_to_process)} 个新文件:")
    total_size = 0
    for filename, _, size in files_to_process:
        size_mb = size / (1024 * 1024)
        total_size += size
        # BUGFIX: previously printed a literal "(unknown)" placeholder
        # instead of interpolating the filename.
        print(f"  - {filename} ({size_mb:.1f} MB)")

    print(f"\n总大小: {total_size / (1024 * 1024):.1f} MB")
    print()

    # Interactive confirmation before a potentially long-running job.
    confirm = input("是否开始处理?(y/n): ").strip().lower()
    if confirm != 'y':
        print("已取消。")
        return

    print()
    print("=" * 60)
    print("开始处理...")
    print("=" * 60)

    total_start = time.time()
    success_count = 0
    fail_count = 0

    for i, (filename, file_path, _) in enumerate(files_to_process):
        print()
        # BUGFIX: previously printed "(unknown)" instead of the filename.
        print(f"[{i+1}/{len(files_to_process)}] 处理: {filename}")
        print("-" * 40)

        file_start = time.time()

        try:
            # Extract raw text from the document.
            print("解析文档...")
            content = parse_document(file_path)

            if not content or not content.strip():
                print(" 警告: 文档内容为空,跳过")
                fail_count += 1
                continue

            char_count = len(content)
            print(f" 提取文本: {char_count} 字符")
            print(f" 预计处理时间: {estimate_time(char_count)}")

            # Split into chunks sized for the embedding model.
            print("分块处理...")
            chunks = split_text(content)
            print(f" 生成 {len(chunks)} 个文本块")

            # Embed and store the chunks along with file metadata.
            print("向量化处理...")
            metadata = {
                'filename': filename,
                'file_path': file_path,
                'char_count': char_count
            }

            vector_store.add_documents(chunks, metadata)

            file_time = time.time() - file_start
            print(f" 完成! 耗时: {format_time(file_time)}")
            success_count += 1

        except Exception as e:
            # Deliberate broad catch: one bad file (corrupt PDF, parse
            # error) must not abort the remaining files in the batch.
            print(f" 错误: {e}")
            fail_count += 1

    # Overall summary.
    total_time = time.time() - total_start
    print()
    print("=" * 60)
    print("处理完成!")
    print("=" * 60)
    print(f"成功: {success_count} 个文件")
    print(f"失败: {fail_count} 个文件")
    print(f"总耗时: {format_time(total_time)}")

    # Final index statistics after this run.
    final_stats = vector_store.get_stats()
    print(f"索引总文件数: {final_stats['total_files']}")
    print(f"索引总文本块: {final_stats['total_chunks']}")
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Entry point when run directly: python batch_index.py
    batch_index()
|