xinli/rag-python/document_parser.py

296 lines
9.2 KiB
Python
Raw Permalink Normal View History

2025-12-20 12:08:33 +08:00
# -*- coding: utf-8 -*-
"""
2025-12-20 18:33:07 +08:00
Document parser - supports multiple file formats, including OCR for scanned PDFs.
Supported OCR engines: EasyOCR (recommended), PaddleOCR, and Tesseract.
2025-12-20 12:08:33 +08:00
"""
import os
import chardet
from config import SUPPORTED_EXTENSIONS
2025-12-20 18:33:07 +08:00
# Path to the Tesseract OCR executable (Windows) - fallback option
TESSERACT_PATH = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# OCR engine selection: 'easyocr' (recommended), 'paddle' or 'tesseract'
OCR_ENGINE = 'easyocr'
# EasyOCR instance (lazily initialized)
_easy_ocr = None
# PaddleOCR instance (lazily initialized)
_paddle_ocr = None
def get_easy_ocr():
    """Return the shared EasyOCR reader, building it on first use.

    The reader is cached in the module-level ``_easy_ocr`` variable and is
    created for Simplified Chinese and English with ``gpu=False``.

    Returns:
        The cached reader, or ``None`` when an ``ImportError`` occurs while
        importing or constructing it (a hint is printed and nothing is
        cached, so the next call tries again).
    """
    global _easy_ocr
    if _easy_ocr is not None:
        return _easy_ocr
    try:
        import easyocr

        # Languages: Simplified Chinese and English.
        reader = easyocr.Reader(['ch_sim', 'en'], gpu=False)
        _easy_ocr = reader
        print(" EasyOCR 初始化成功")
    except ImportError:
        print(" EasyOCR 未安装,请运行: pip install easyocr")
        return None
    return _easy_ocr
def get_paddle_ocr():
    """Return the shared PaddleOCR engine, building it on first use.

    The engine is cached in the module-level ``_paddle_ocr`` variable and is
    created with ``use_angle_cls=True``, ``lang='ch'`` and ``show_log=False``.

    Returns:
        The cached engine, or ``None`` when an ``ImportError`` occurs while
        importing or constructing it (a notice is printed and nothing is
        cached, so the next call tries again).
    """
    global _paddle_ocr
    if _paddle_ocr is not None:
        return _paddle_ocr
    try:
        from paddleocr import PaddleOCR

        engine = PaddleOCR(use_angle_cls=True, lang='ch', show_log=False)
        _paddle_ocr = engine
        print(" PaddleOCR 初始化成功")
    except ImportError:
        print(" PaddleOCR 未安装")
        return None
    return _paddle_ocr
2025-12-20 12:08:33 +08:00
def detect_encoding(file_path):
    """Guess the text encoding of a file.

    Only the first 10000 bytes of the file are sampled and handed to
    ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The encoding name reported by chardet, or ``'utf-8'`` when chardet
        reports none.
    """
    with open(file_path, 'rb') as handle:
        sample = handle.read(10000)
    guessed = chardet.detect(sample)['encoding']
    if guessed:
        return guessed
    return 'utf-8'
def parse_txt(file_path):
    """Read a plain-text file and return its contents.

    The encoding is guessed with ``detect_encoding``; bytes that cannot be
    decoded are skipped (``errors='ignore'``).

    Args:
        file_path: Path of the text file.

    Returns:
        The decoded file contents, or ``""`` when the file cannot be read
        (the failure is printed, never raised).
    """
    try:
        # Detection happens inside the try so that a missing or unreadable
        # file yields "" like every other failure instead of raising.
        encoding = detect_encoding(file_path)
        # The detector only samples the head of the file, so an 'ascii'
        # verdict says nothing about later bytes. UTF-8 decodes pure ASCII
        # identically, while 'ascii' + errors='ignore' would silently drop
        # every non-ASCII character further down.
        if encoding.lower() == 'ascii':
            encoding = 'utf-8'
        try:
            with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
                return f.read()
        except LookupError:
            # The detector named a codec Python does not know; retry as
            # UTF-8 rather than discarding the whole file.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
    except Exception as e:
        print(f"解析TXT文件失败 {file_path}: {e}")
        return ""
def parse_md(file_path):
    """Return the raw text of a Markdown file.

    The file is read exactly like a plain-text file by delegating to
    ``parse_txt``; no Markdown-specific processing happens here.
    """
    markdown_source = parse_txt(file_path)
    return markdown_source
2025-12-20 18:33:07 +08:00
def _run_configured_ocr(img_data):
    """Send PNG bytes to the OCR backend selected by ``OCR_ENGINE``."""
    if OCR_ENGINE == 'easyocr':
        return ocr_with_easyocr(img_data)
    if OCR_ENGINE == 'paddle':
        return ocr_with_paddle(img_data)
    return ocr_with_tesseract(img_data)


def parse_pdf_with_ocr(file_path):
    """Extract text from a PDF with PyMuPDF, using OCR for scanned pages.

    Each page's text layer is read first. When that text is shorter than
    50 characters after stripping whitespace, the page is rendered to a PNG
    at 2x scale and passed to the OCR engine chosen by ``OCR_ENGINE``. If
    OCR yields nothing, the (short) text layer is kept instead of being
    discarded. Pages that raise are reported and skipped.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The non-empty page texts joined by newlines. If PyMuPDF is missing
        or parsing fails as a whole, the result of
        ``parse_pdf_basic(file_path)`` is returned instead.
    """
    try:
        import fitz  # PyMuPDF

        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f" PDF文件大小: {file_size_mb:.1f} MB")
        doc = fitz.open(file_path)
        try:
            total_pages = len(doc)
            print(f" PDF总页数: {total_pages}")
            # 2x zoom for better OCR accuracy; identical for every page.
            zoom = fitz.Matrix(2, 2)
            text_parts = []
            ocr_used = False
            for i, page in enumerate(doc):
                if (i + 1) % 20 == 0 or i == 0:
                    print(f" 解析进度: {i + 1}/{total_pages}")
                try:
                    # Try the embedded text layer first.
                    text = page.get_text()
                    # Very little text: possibly a scanned page, so try OCR.
                    if len(text.strip()) < 50:
                        if not ocr_used:
                            print(f" 检测到扫描版PDF启用OCR识别...")
                            ocr_used = True
                        img_data = page.get_pixmap(matrix=zoom).tobytes("png")
                        ocr_text = _run_configured_ocr(img_data)
                        # Only replace the text layer when OCR found
                        # something; otherwise short pages would be lost.
                        if ocr_text and ocr_text.strip():
                            text = ocr_text
                    if text and text.strip():
                        text_parts.append(text)
                except Exception as e:
                    print(f" 警告: 第 {i + 1} 页解析失败: {e}")
                    continue
        finally:
            # Release the document even if iteration fails part-way.
            doc.close()
        total_chars = sum(len(part) for part in text_parts)
        print(f" PDF解析完成提取文本 {total_chars} 字符" + (" (使用OCR)" if ocr_used else ""))
        return "\n".join(text_parts)
    except ImportError as e:
        print(f" 缺少依赖: {e}")
        print(f" 请运行: pip install pymupdf easyocr pillow")
        return parse_pdf_basic(file_path)
    except Exception as e:
        print(f" PyMuPDF解析失败: {e},尝试基础解析...")
        return parse_pdf_basic(file_path)
def ocr_with_easyocr(img_data):
    """Recognize text in encoded image bytes with EasyOCR (recommended).

    Args:
        img_data: Encoded image bytes readable by ``PIL.Image.open``.

    Returns:
        Element 1 of every ``readtext`` result entry, joined by newlines.
        Returns ``""`` when the reader is unavailable or any error occurs
        (the error is printed).
    """
    try:
        from PIL import Image
        import io
        import numpy as np

        reader = get_easy_ocr()
        if reader is None:
            return ""
        # Decode the bytes and hand EasyOCR a numpy array.
        pixels = np.array(Image.open(io.BytesIO(img_data)))
        detections = reader.readtext(pixels)
        return '\n'.join(entry[1] for entry in detections)
    except Exception as err:
        print(f" EasyOCR 识别失败: {err}")
        return ""
def ocr_with_paddle(img_data):
    """Recognize text in encoded image bytes with PaddleOCR.

    Args:
        img_data: Encoded image bytes readable by ``PIL.Image.open``.

    Returns:
        ``entry[1][0]`` of every usable entry in the first result group,
        joined by newlines. Returns ``""`` when the engine is unavailable,
        nothing was recognized, or any error occurs (the error is printed).
    """
    try:
        from PIL import Image
        import io
        import numpy as np

        engine = get_paddle_ocr()
        if engine is None:
            return ""
        # Decode the bytes and hand PaddleOCR a numpy array.
        pixels = np.array(Image.open(io.BytesIO(img_data)))
        outcome = engine.ocr(pixels, cls=True)
        # Nothing recognized: either no result or an empty first group.
        if not outcome or not outcome[0]:
            return ""
        recognized = [
            entry[1][0]
            for entry in outcome[0]
            if entry and len(entry) >= 2
        ]
        return '\n'.join(recognized)
    except Exception as err:
        print(f" PaddleOCR 识别失败: {err}")
        return ""
def ocr_with_tesseract(img_data):
    """Recognize text in encoded image bytes with Tesseract (fallback).

    Args:
        img_data: Encoded image bytes readable by ``PIL.Image.open``.

    Returns:
        The string produced by ``pytesseract.image_to_string`` with the
        ``chi_sim+eng`` language setting, or ``""`` when any error occurs
        (the error is printed).
    """
    try:
        import pytesseract
        from PIL import Image
        import io

        # Point pytesseract at the configured binary only if it exists.
        if os.path.exists(TESSERACT_PATH):
            pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
        picture = Image.open(io.BytesIO(img_data))
        # Simplified Chinese + English.
        return pytesseract.image_to_string(picture, lang='chi_sim+eng')
    except Exception as err:
        print(f" Tesseract 识别失败: {err}")
        return ""
def parse_pdf_basic(file_path):
    """Extract text from a PDF with PyPDF2 (no OCR, so no scanned PDFs).

    Progress is printed for the first page and every 50th page. Pages whose
    extraction raises are reported and skipped.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The non-empty page texts joined by newlines, or ``""`` when the
        file cannot be processed at all (the error is printed).
    """
    try:
        from PyPDF2 import PdfReader

        size_in_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f" PDF文件大小: {size_in_mb:.1f} MB (基础模式)")
        pages = PdfReader(file_path).pages
        page_count = len(pages)
        print(f" PDF总页数: {page_count}")
        collected = []
        for page_no, page in enumerate(pages, start=1):
            if page_no == 1 or page_no % 50 == 0:
                print(f" 解析进度: {page_no}/{page_count}")
            try:
                page_text = page.extract_text()
            except Exception as err:
                print(f" 警告: 第 {page_no} 页解析失败: {err}")
                continue
            if page_text:
                collected.append(page_text)
        char_count = len(''.join(collected))
        print(f" PDF解析完成提取文本 {char_count} 字符")
        return "\n".join(collected)
    except Exception as err:
        print(f"解析PDF文件失败 {file_path}: {err}")
        return ""
2025-12-20 18:33:07 +08:00
def parse_pdf(file_path):
    """Parse a PDF file, preferring the OCR-capable pipeline.

    Delegates to ``parse_pdf_with_ocr`` and returns whatever it returns.
    """
    extracted = parse_pdf_with_ocr(file_path)
    return extracted
2025-12-20 12:08:33 +08:00
def parse_docx(file_path):
    """Extract paragraph text from a Word document via python-docx.

    Only ``doc.paragraphs`` is read; paragraphs that are empty after
    stripping whitespace are left out.

    Args:
        file_path: Path of the Word document.

    Returns:
        The remaining paragraph texts joined by newlines, or ``""`` when
        the document cannot be read (the error is printed).
    """
    try:
        from docx import Document

        paragraphs = Document(file_path).paragraphs
        return "\n".join(p.text for p in paragraphs if p.text.strip())
    except Exception as err:
        print(f"解析DOCX文件失败 {file_path}: {err}")
        return ""
def parse_document(file_path):
    """Parse a document with the parser matching its file extension.

    The extension is compared case-insensitively. Extensions missing from
    ``SUPPORTED_EXTENSIONS`` are reported and produce ``""``. ``.pdf`` goes
    to ``parse_pdf``, ``.docx`` and ``.doc`` go to ``parse_docx``, ``.md``
    goes to ``parse_md``; every other supported extension (including
    ``.txt``) is read as plain text.

    Args:
        file_path: Path of the document.

    Returns:
        The text returned by the selected parser, or ``""`` for an
        unsupported extension.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix not in SUPPORTED_EXTENSIONS:
        print(f"不支持的文件类型: {suffix}")
        return ""
    if suffix == '.pdf':
        handler = parse_pdf
    elif suffix in ('.docx', '.doc'):
        handler = parse_docx
    elif suffix == '.md':
        handler = parse_md
    else:
        # '.txt' and any other supported extension fall back to plain text.
        handler = parse_txt
    return handler(file_path)
def is_supported_file(filename):
    """Tell whether a file name has a supported extension.

    The extension is lower-cased before it is looked up in
    ``SUPPORTED_EXTENSIONS``.
    """
    _, suffix = os.path.splitext(filename)
    return suffix.lower() in SUPPORTED_EXTENSIONS
2025-12-20 18:33:07 +08:00
def check_ocr_available():
    """Check whether Tesseract OCR can be used.

    NOTE: only Tesseract (via pytesseract) is probed here, regardless of
    the value of ``OCR_ENGINE``.

    Returns:
        ``True`` when pytesseract imports and reports a Tesseract version
        (which is printed); ``False`` otherwise, after printing the error
        and an installation hint.
    """
    try:
        import pytesseract

        # Point pytesseract at the configured binary only if it exists.
        if os.path.exists(TESSERACT_PATH):
            pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
        detected_version = pytesseract.get_tesseract_version()
        print(f"Tesseract OCR 版本: {detected_version}")
    except Exception as err:
        print(f"OCR不可用: {err}")
        print("请安装 Tesseract OCR: https://github.com/UB-Mannheim/tesseract/wiki")
        return False
    return True