98 lines
2.8 KiB
Python
98 lines
2.8 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
Document parser - supports multiple file formats.
|
||
"""
|
||
import os
|
||
import chardet
|
||
from config import SUPPORTED_EXTENSIONS
|
||
|
||
def detect_encoding(file_path):
    """Guess the text encoding of a file from its leading bytes.

    At most the first 10000 bytes are read in binary mode and handed to
    ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The encoding name reported by chardet. ``'utf-8'`` is returned
        instead when chardet reports no encoding or reports plain ASCII.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as f:
        sample = f.read(10000)

    encoding = chardet.detect(sample)['encoding']

    # Only a prefix of the file is sampled. If that prefix happens to be pure
    # ASCII, decoding the whole file as ASCII would drop every non-ASCII
    # character that appears later. UTF-8 decodes ASCII bytes identically, so
    # it is the safer answer in that case.
    if not encoding or encoding.lower() == 'ascii':
        return 'utf-8'
    return encoding
|
||
|
||
def parse_txt(file_path):
    """Read a plain-text file and return its contents as a string.

    The encoding is obtained from ``detect_encoding`` and the file is decoded
    with ``errors='ignore'``, so undecodable bytes are dropped rather than
    raising.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents, or ``""`` if detection or reading fails
        (the failure is printed).
    """
    try:
        # Encoding detection opens the file as well, so it belongs inside the
        # try: a missing or unreadable file is then reported and turned into
        # "" like every other failure, instead of escaping as an exception.
        encoding = detect_encoding(file_path)
        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
            return f.read()
    except Exception as e:
        print(f"解析TXT文件失败 {file_path}: {e}")
        return ""
|
||
|
||
def parse_md(file_path):
    """Parse a Markdown file.

    Markdown is treated as plain text: the call is forwarded to ``parse_txt``
    and its result is returned unchanged.

    Args:
        file_path: Path of the Markdown file.

    Returns:
        Whatever ``parse_txt`` returns for the same path.
    """
    markdown_text = parse_txt(file_path)
    return markdown_text
|
||
|
||
def parse_pdf(file_path):
    """Extract the text of a PDF file page by page.

    The file size and page count are printed first. Progress is printed for
    the first page and for every 50th page. A page whose extraction raises is
    reported with a warning and skipped; pages that yield no text are skipped
    silently.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The extracted page texts joined with newlines, or ``""`` if the file
        cannot be processed at all (the failure is printed).
    """
    try:
        from PyPDF2 import PdfReader

        size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f" PDF文件大小: {size_mb:.1f} MB")

        reader = PdfReader(file_path)
        total_pages = len(reader.pages)
        print(f" PDF总页数: {total_pages}")

        text_parts = []
        for page_no, page in enumerate(reader.pages, start=1):
            if page_no % 50 == 0 or page_no == 1:
                print(f" 解析进度: {page_no}/{total_pages} 页")
            try:
                text = page.extract_text()
                if text:
                    text_parts.append(text)
            except Exception as e:
                # One broken page must not abort the whole document.
                print(f" 警告: 第 {page_no} 页解析失败: {e}")
                continue

        # Count characters without building a second full copy of the text;
        # the newline separators added below are not part of the count.
        extracted_chars = sum(len(part) for part in text_parts)
        print(f" PDF解析完成,提取文本 {extracted_chars} 字符")
        return "\n".join(text_parts)
    except Exception as e:
        print(f"解析PDF文件失败 {file_path}: {e}")
        return ""
|
||
|
||
def parse_docx(file_path):
    """Extract the paragraph text of a Word document.

    Paragraphs whose text is empty or whitespace-only are left out; the
    remaining paragraph texts are joined with newlines.

    Args:
        file_path: Path of the document to open with ``docx.Document``.

    Returns:
        The joined paragraph text, or ``""`` if the document cannot be
        processed (the failure is printed).
    """
    try:
        from docx import Document

        document = Document(file_path)
        non_blank = [p.text for p in document.paragraphs if p.text.strip()]
        return "\n".join(non_blank)
    except Exception as e:
        print(f"解析DOCX文件失败 {file_path}: {e}")
        return ""
|
||
|
||
def parse_document(file_path):
    """Parse a document with the parser that matches its file extension.

    The extension is lower-cased before lookup. Extensions that are not in
    ``SUPPORTED_EXTENSIONS`` are reported and produce ``""``. A supported
    extension with no dedicated parser falls back to ``parse_txt``.

    Args:
        file_path: Path of the document to parse.

    Returns:
        The text returned by the selected parser, or ``""`` for an
        unsupported extension.
    """
    _, suffix = os.path.splitext(file_path)
    ext = suffix.lower()

    # Guard clause: reject anything the configuration does not list.
    if ext not in SUPPORTED_EXTENSIONS:
        print(f"不支持的文件类型: {ext}")
        return ""

    # NOTE(review): '.doc' is routed to the DOCX parser; confirm that the
    # docx library can actually read legacy .doc files.
    dispatch = {
        '.txt': parse_txt,
        '.md': parse_md,
        '.pdf': parse_pdf,
        '.docx': parse_docx,
        '.doc': parse_docx,
    }

    handler = dispatch[ext] if ext in dispatch else parse_txt
    return handler(file_path)
|
||
|
||
def is_supported_file(filename):
    """Report whether a file name has a supported extension.

    Args:
        filename: File name or path; only its extension is examined.

    Returns:
        True if the lower-cased extension is in ``SUPPORTED_EXTENSIONS``,
        otherwise False.
    """
    _, suffix = os.path.splitext(filename)
    return suffix.lower() in SUPPORTED_EXTENSIONS
|