# -*- coding: utf-8 -*-
"""
Document parser - supports multiple file formats.
"""
import os

import chardet

from config import SUPPORTED_EXTENSIONS


def detect_encoding(file_path):
    """Guess the text encoding of a file.

    Only the first 10000 bytes of the file are handed to ``chardet``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The encoding name reported by ``chardet.detect``, or ``'utf-8'``
        when it reports no encoding.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read(10000)
    result = chardet.detect(raw_data)
    return result['encoding'] or 'utf-8'


def parse_txt(file_path):
    """Read a plain-text file and return its content.

    The encoding is guessed with ``detect_encoding``; bytes that cannot be
    decoded are dropped (``errors='ignore'``).

    Args:
        file_path: Path of the text file.

    Returns:
        The decoded file content, or ``""`` if detection or reading fails
        (the failure is printed, not raised).
    """
    try:
        # Detection happens inside the try so that a missing or unreadable
        # file is reported and yields "" like the other parsers, instead of
        # raising from this function.
        encoding = detect_encoding(file_path)
        # The guess is based on a leading sample only. ASCII is a strict
        # subset of UTF-8, so widening changes nothing for pure-ASCII files
        # while keeping non-ASCII characters that appear after the sample
        # from being silently discarded by errors='ignore'.
        if encoding.lower() == 'ascii':
            encoding = 'utf-8'
        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
            return f.read()
    except Exception as e:
        print(f"解析TXT文件失败 {file_path}: {e}")
        return ""


def parse_md(file_path):
    """Read a Markdown file; it is handled exactly like plain text.

    Args:
        file_path: Path of the Markdown file.

    Returns:
        Whatever ``parse_txt`` returns for the same path.
    """
    return parse_txt(file_path)


def parse_pdf(file_path):
    """Extract the text of a PDF file page by page.

    Prints the file size, the page count, progress on the first page and on
    every 50th page, and the number of extracted characters. A page whose
    extraction raises is reported and skipped; pages that yield no text are
    omitted.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The text of all pages joined with newlines, or ``""`` if the file
        cannot be processed at all (the failure is printed, not raised).
    """
    try:
        # Imported lazily so the module can be loaded without PyPDF2.
        from PyPDF2 import PdfReader

        file_size = os.path.getsize(file_path)
        file_size_mb = file_size / (1024 * 1024)
        print(f"  PDF文件大小: {file_size_mb:.1f} MB")

        reader = PdfReader(file_path)
        total_pages = len(reader.pages)
        print(f"  PDF总页数: {total_pages}")

        text_parts = []
        for i, page in enumerate(reader.pages):
            if (i + 1) % 50 == 0 or i == 0:
                print(f"  解析进度: {i + 1}/{total_pages} 页")
            try:
                text = page.extract_text()
            except Exception as e:
                # One bad page must not abort the whole document.
                print(f"  警告: 第 {i + 1} 页解析失败: {e}")
            else:
                if text:
                    text_parts.append(text)

        # Same count as len(''.join(text_parts)) without building a
        # throwaway copy of the whole document.
        char_count = sum(len(part) for part in text_parts)
        print(f"  PDF解析完成,提取文本 {char_count} 字符")
        return "\n".join(text_parts)
    except Exception as e:
        print(f"解析PDF文件失败 {file_path}: {e}")
        return ""


def parse_docx(file_path):
    """Extract the paragraph text of a Word document.

    Only ``doc.paragraphs`` is read; paragraphs that are empty or
    whitespace-only are skipped.

    Args:
        file_path: Path of the Word document.

    Returns:
        The kept paragraphs joined with newlines, or ``""`` if the document
        cannot be processed (the failure is printed, not raised).
    """
    try:
        # Imported lazily so the module can be loaded without python-docx.
        from docx import Document

        doc = Document(file_path)
        text_parts = [para.text for para in doc.paragraphs if para.text.strip()]
        return "\n".join(text_parts)
    except Exception as e:
        print(f"解析DOCX文件失败 {file_path}: {e}")
        return ""


def parse_document(file_path):
    """Parse a document with the parser that matches its file extension.

    Args:
        file_path: Path of the document; the extension is compared
            case-insensitively.

    Returns:
        The extracted text, or ``""`` when the extension is not in
        ``SUPPORTED_EXTENSIONS`` or the selected parser fails.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext not in SUPPORTED_EXTENSIONS:
        print(f"不支持的文件类型: {ext}")
        return ""

    parsers = {
        '.txt': parse_txt,
        '.md': parse_md,
        '.pdf': parse_pdf,
        # NOTE(review): legacy .doc is routed to the .docx parser;
        # python-docx presumably cannot open that format, in which case
        # parse_docx prints the error and returns "" - confirm.
        '.docx': parse_docx,
        '.doc': parse_docx,
    }

    # Supported extensions without a dedicated parser are read as text.
    parser = parsers.get(ext, parse_txt)
    return parser(file_path)


def is_supported_file(filename):
    """Tell whether a file name has a supported extension.

    Args:
        filename: File name or path; the extension is compared
            case-insensitively.

    Returns:
        ``True`` if the extension is in ``SUPPORTED_EXTENSIONS``.
    """
    ext = os.path.splitext(filename)[1].lower()
    return ext in SUPPORTED_EXTENSIONS