156 lines
4.6 KiB
Python
156 lines
4.6 KiB
Python
# -*- coding: utf-8 -*-
"""
批量索引脚本 - 用于处理大文件
直接运行此脚本来索引 knowledge_docs 目录中的所有文件

使用方法:
1. 将 PDF 文件放入 rag-python/knowledge_docs/ 目录
2. 运行: python batch_index.py
"""
|
||
import os
import sys
import time

# Make the script's sibling modules importable when run directly.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from config import KNOWLEDGE_DIR, CHUNK_SIZE
from document_parser import parse_document, is_supported_file
from text_splitter import split_text
from vector_store import vector_store
|
||
|
||
def format_time(seconds):
    """Render a duration in seconds as a short human-readable string.

    Uses seconds below one minute, minutes below one hour, and hours
    otherwise, always with one decimal place.
    """
    # Guard clauses from the smallest unit upward.
    if seconds < 60:
        return f"{seconds:.1f}秒"
    if seconds < 3600:
        in_minutes = seconds / 60
        return f"{in_minutes:.1f}分钟"
    in_hours = seconds / 3600
    return f"{in_hours:.1f}小时"
|
||
|
||
def estimate_time(char_count, *, seconds_per_chunk=1.5):
    """Estimate the wall-clock time needed to index a document.

    The document is split into CHUNK_SIZE-character chunks and each
    chunk is assumed to take ``seconds_per_chunk`` seconds to embed
    (the default 1.5 s is an empirical figure for this pipeline —
    TODO confirm against current embedding backend).

    Args:
        char_count: number of characters of extracted text.
        seconds_per_chunk: assumed per-chunk embedding time in seconds
            (keyword-only; default keeps the original behavior).

    Returns:
        A human-readable duration string produced by ``format_time``.
    """
    chunks = char_count / CHUNK_SIZE
    return format_time(chunks * seconds_per_chunk)
|
||
|
||
def batch_index():
    """Index every new supported file under KNOWLEDGE_DIR into the vector store.

    Loads the existing index, skips files already indexed, shows the
    pending work and asks for interactive confirmation, then for each
    file: parses it, splits the text into chunks, embeds the chunks via
    ``vector_store.add_documents`` and reports per-file and total timing.

    Side effects: reads the knowledge directory, prompts on stdin,
    prints progress to stdout, mutates the shared ``vector_store``.
    """
    print("=" * 60)
    print("批量索引工具")
    print("=" * 60)
    print(f"知识库目录: {KNOWLEDGE_DIR}")
    print(f"分块大小: {CHUNK_SIZE} 字符")
    print()

    # Load the existing index so already-processed files are skipped.
    print("加载现有索引...")
    vector_store.load_index()
    stats = vector_store.get_stats()
    indexed_files = set(stats.get('files', []))
    print(f"已索引文件: {len(indexed_files)} 个")
    print()

    # Scan the knowledge directory for new, supported files.
    files_to_process = []
    for filename in os.listdir(KNOWLEDGE_DIR):
        file_path = os.path.join(KNOWLEDGE_DIR, filename)
        if os.path.isfile(file_path) and is_supported_file(filename):
            if filename not in indexed_files:
                file_size = os.path.getsize(file_path)
                files_to_process.append((filename, file_path, file_size))

    if not files_to_process:
        print("没有新文件需要索引。")
        print("如需重新索引,请先删除 index_data 目录中的文件。")
        return

    # Summarize the pending work before asking for confirmation.
    print(f"发现 {len(files_to_process)} 个新文件:")
    total_size = 0
    for filename, _, size in files_to_process:
        size_mb = size / (1024 * 1024)
        total_size += size
        # BUGFIX: the file name was missing from this listing line.
        print(f"  - {filename} ({size_mb:.1f} MB)")

    print(f"\n总大小: {total_size / (1024 * 1024):.1f} MB")
    print()

    # Interactive confirmation gate — anything other than 'y' aborts.
    confirm = input("是否开始处理?(y/n): ").strip().lower()
    if confirm != 'y':
        print("已取消。")
        return

    print()
    print("=" * 60)
    print("开始处理...")
    print("=" * 60)

    total_start = time.time()
    success_count = 0
    fail_count = 0

    for i, (filename, file_path, file_size) in enumerate(files_to_process):
        print()
        # BUGFIX: the file name was missing from the progress header.
        print(f"[{i+1}/{len(files_to_process)}] 处理: {filename}")
        print("-" * 40)

        file_start = time.time()

        try:
            # Extract plain text from the document.
            print("解析文档...")
            content = parse_document(file_path)

            if not content or not content.strip():
                print(f"  警告: 文档内容为空,跳过")
                fail_count += 1
                continue

            char_count = len(content)
            print(f"  提取文本: {char_count} 字符")
            print(f"  预计处理时间: {estimate_time(char_count)}")

            # Split the text into embedding-sized chunks.
            print("分块处理...")
            chunks = split_text(content)
            print(f"  生成 {len(chunks)} 个文本块")

            # Embed and persist the chunks with file-level metadata.
            print("向量化处理...")
            metadata = {
                'filename': filename,
                'file_path': file_path,
                'char_count': char_count
            }

            vector_store.add_documents(chunks, metadata)

            file_time = time.time() - file_start
            print(f"  完成! 耗时: {format_time(file_time)}")
            success_count += 1

        except Exception as e:
            # Best-effort batch: report the failure and continue with
            # the next file instead of aborting the whole run.
            print(f"  错误: {e}")
            fail_count += 1

    # Final summary.
    total_time = time.time() - total_start
    print()
    print("=" * 60)
    print("处理完成!")
    print("=" * 60)
    print(f"成功: {success_count} 个文件")
    print(f"失败: {fail_count} 个文件")
    print(f"总耗时: {format_time(total_time)}")

    # Report index-wide statistics after this run.
    final_stats = vector_store.get_stats()
    print(f"索引总文件数: {final_stats['total_files']}")
    print(f"索引总文本块: {final_stats['total_chunks']}")
|
||
|
||
# Script entry point: run the batch indexer when executed directly.
if __name__ == "__main__":
    batch_index()