Initial commit: 胡汉三千年项目 - 北朝宇宙理论体系
This commit is contained in:
55
ai-tools/scripts/ocr_extractor.py
Normal file
55
ai-tools/scripts/ocr_extractor.py
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
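OCR text extraction tool.

Requires: pip install pytesseract pillow
Also requires the Tesseract engine itself, with the chi_sim and eng
language data installed (OCR is run with lang='chi_sim+eng').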
|
||||
"""
|
||||
|
||||
try:
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
import os
|
||||
|
||||
def extract_text_from_image(image_path):
    """Extract the text contained in a single image using Tesseract OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text with leading/trailing whitespace stripped.
        This function never raises: if opening the image or running OCR
        fails for any reason, the string "OCR 失败: <error>" is returned
        instead, so callers always receive a string.
    """
    try:
        # Open through a context manager so the underlying file handle is
        # released deterministically once OCR is done, rather than being
        # left for the garbage collector.
        with Image.open(image_path) as image:
            # Recognize both Simplified Chinese and English text.
            text = pytesseract.image_to_string(image, lang='chi_sim+eng')
            return text.strip()
    except Exception as e:
        # Deliberate best-effort contract: report the failure as text so a
        # batch run can continue with the remaining images.
        return f"OCR 失败: {e}"
|
||||
|
||||
def batch_ocr_images(image_dir, output_file="ocr_results.md"):
    """Run OCR on every PNG image in a directory and write a Markdown report.

    Images are processed in sorted filename order. Each image gets its own
    "## <filename>" section: non-empty results from
    extract_text_from_image (including its failure message) are placed in
    a fenced code block, and empty results get a placeholder line.

    Args:
        image_dir: Directory to scan for PNG files (not recursive).
        output_file: Path of the Markdown report to write. Any existing
            file is overwritten. Defaults to "ocr_results.md".

    Returns:
        None. Progress and the final output path are printed to stdout.
    """
    # Compare the extension case-insensitively so that files such as
    # "SCAN.PNG" are not silently skipped.
    png_files = sorted(
        name for name in os.listdir(image_dir)
        if name.lower().endswith('.png')
    )

    results = []
    for filename in png_files:
        image_path = os.path.join(image_dir, filename)
        print(f"正在 OCR: {filename}")

        text = extract_text_from_image(image_path)

        if text:
            results.append(f"## {filename}\n\n```\n{text}\n```\n\n---\n")
        else:
            # OCR succeeded but found nothing to report for this image.
            results.append(f"## {filename}\n\n*无文字内容*\n\n---\n")

    # Write the whole report in one go, always as UTF-8 so Chinese text
    # is stored correctly regardless of the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# OCR 文字提取结果\n\n")
        f.writelines(results)

    print(f"OCR 完成,结果保存到: {output_file}")
|
||||
|
||||
except ImportError:
|
||||
print("需要安装 OCR 依赖:")
|
||||
print("pip install pytesseract pillow")
|
||||
print("还需要安装 tesseract 引擎")
|
||||
Reference in New Issue
Block a user