156 lines
4.6 KiB
Python
156 lines
4.6 KiB
Python
# -*- coding: utf-8 -*-
"""
批量索引脚本 - 用于处理大文件
直接运行此脚本来索引 knowledge_docs 目录中的所有文件

使用方法:
1. 将 PDF 文件放入 rag-python/knowledge_docs/ 目录
2. 运行: python batch_index.py
"""
|
||
import os
import sys
import time

# Make the script's sibling modules importable when run directly.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from config import KNOWLEDGE_DIR, CHUNK_SIZE
from document_parser import parse_document, is_supported_file
from text_splitter import split_text
from vector_store import vector_store
|
||
|
||
def format_time(seconds):
    """Render a duration in seconds as a short human-readable string.

    Uses seconds below one minute, minutes below one hour, and hours
    otherwise, always with one decimal place.
    """
    # Guard clauses from the smallest unit upward.
    if seconds < 60:
        return f"{seconds:.1f}秒"
    if seconds < 3600:
        in_minutes = seconds / 60
        return f"{in_minutes:.1f}分钟"
    in_hours = seconds / 3600
    return f"{in_hours:.1f}小时"
|
||
|
||
def estimate_time(char_count, *, seconds_per_chunk=1.5):
    """Estimate the wall-clock time needed to index a document.

    The document is split into CHUNK_SIZE-character chunks and each
    chunk is assumed to take ``seconds_per_chunk`` seconds to embed
    (the default 1.5 s is an empirical figure for this pipeline —
    TODO confirm against current embedding backend).

    Args:
        char_count: number of characters of extracted text.
        seconds_per_chunk: assumed per-chunk embedding time in seconds
            (keyword-only; default keeps the original behavior).

    Returns:
        A human-readable duration string produced by ``format_time``.
    """
    chunks = char_count / CHUNK_SIZE
    return format_time(chunks * seconds_per_chunk)
|
||
|
||
def batch_index():
    """Index every new supported file under KNOWLEDGE_DIR into the vector store.

    Loads the existing index, skips files already indexed, shows the
    pending work and asks for interactive confirmation, then for each
    file: parses it, splits the text into chunks, embeds the chunks via
    ``vector_store.add_documents`` and reports per-file and total timing.

    Side effects: reads the knowledge directory, prompts on stdin,
    prints progress to stdout, mutates the shared ``vector_store``.
    """
    print("=" * 60)
    print("批量索引工具")
    print("=" * 60)
    print(f"知识库目录: {KNOWLEDGE_DIR}")
    print(f"分块大小: {CHUNK_SIZE} 字符")
    print()

    # Load the existing index so already-processed files are skipped.
    print("加载现有索引...")
    vector_store.load_index()
    stats = vector_store.get_stats()
    indexed_files = set(stats.get('files', []))
    print(f"已索引文件: {len(indexed_files)} 个")
    print()

    # Scan the knowledge directory for new, supported files.
    files_to_process = []
    for filename in os.listdir(KNOWLEDGE_DIR):
        file_path = os.path.join(KNOWLEDGE_DIR, filename)
        if os.path.isfile(file_path) and is_supported_file(filename):
            if filename not in indexed_files:
                file_size = os.path.getsize(file_path)
                files_to_process.append((filename, file_path, file_size))

    if not files_to_process:
        print("没有新文件需要索引。")
        print("如需重新索引,请先删除 index_data 目录中的文件。")
        return

    # Summarize the pending work before asking for confirmation.
    print(f"发现 {len(files_to_process)} 个新文件:")
    total_size = 0
    for filename, _, size in files_to_process:
        size_mb = size / (1024 * 1024)
        total_size += size
        # BUGFIX: the file name was missing from this listing line.
        print(f"  - {filename} ({size_mb:.1f} MB)")

    print(f"\n总大小: {total_size / (1024 * 1024):.1f} MB")
    print()

    # Interactive confirmation gate — anything other than 'y' aborts.
    confirm = input("是否开始处理?(y/n): ").strip().lower()
    if confirm != 'y':
        print("已取消。")
        return

    print()
    print("=" * 60)
    print("开始处理...")
    print("=" * 60)

    total_start = time.time()
    success_count = 0
    fail_count = 0

    for i, (filename, file_path, file_size) in enumerate(files_to_process):
        print()
        # BUGFIX: the file name was missing from the progress header.
        print(f"[{i+1}/{len(files_to_process)}] 处理: {filename}")
        print("-" * 40)

        file_start = time.time()

        try:
            # Extract plain text from the document.
            print("解析文档...")
            content = parse_document(file_path)

            if not content or not content.strip():
                print(f"  警告: 文档内容为空,跳过")
                fail_count += 1
                continue

            char_count = len(content)
            print(f"  提取文本: {char_count} 字符")
            print(f"  预计处理时间: {estimate_time(char_count)}")

            # Split the text into embedding-sized chunks.
            print("分块处理...")
            chunks = split_text(content)
            print(f"  生成 {len(chunks)} 个文本块")

            # Embed and persist the chunks with file-level metadata.
            print("向量化处理...")
            metadata = {
                'filename': filename,
                'file_path': file_path,
                'char_count': char_count
            }

            vector_store.add_documents(chunks, metadata)

            file_time = time.time() - file_start
            print(f"  完成! 耗时: {format_time(file_time)}")
            success_count += 1

        except Exception as e:
            # Best-effort batch: report the failure and continue with
            # the next file instead of aborting the whole run.
            print(f"  错误: {e}")
            fail_count += 1

    # Final summary.
    total_time = time.time() - total_start
    print()
    print("=" * 60)
    print("处理完成!")
    print("=" * 60)
    print(f"成功: {success_count} 个文件")
    print(f"失败: {fail_count} 个文件")
    print(f"总耗时: {format_time(total_time)}")

    # Report index-wide statistics after this run.
    final_stats = vector_store.get_stats()
    print(f"索引总文件数: {final_stats['total_files']}")
    print(f"索引总文本块: {final_stats['total_chunks']}")
|
||
|
||
# Script entry point: run the batch indexer when executed directly.
if __name__ == "__main__":
    batch_index()