#!/usr/bin/env python3
"""OCR text extraction tool.

Runs Tesseract OCR (Simplified Chinese + English) over the PNG images in a
directory and writes the recognised text to a Markdown report.

Requires: pip install pytesseract pillow
The tesseract engine itself must also be installed.
"""

try:
    import pytesseract
    from PIL import Image
    import os

    def extract_text_from_image(image_path: str) -> str:
        """Return the text recognised in the image at *image_path*.

        Args:
            image_path: Path of the image file to run OCR on.

        Returns:
            The recognised text with surrounding whitespace stripped. If
            anything goes wrong (unreadable file, OCR failure, ...) the
            exception is not propagated; a string starting with
            ``"OCR 失败: "`` followed by the error is returned instead.
        """
        try:
            # Use the image as a context manager so the underlying file is
            # closed promptly; a batch run would otherwise keep one handle
            # open per image until garbage collection.
            with Image.open(image_path) as image:
                text = pytesseract.image_to_string(image, lang='chi_sim+eng')
            return text.strip()
        except Exception as e:
            # Deliberate best effort: one bad image must not abort a batch,
            # so the failure is reported in-band as the "text".
            return f"OCR 失败: {e}"

    def batch_ocr_images(image_dir: str, output_file: str = "ocr_results.md") -> None:
        """Run OCR on every ``.png`` file in *image_dir* and save a report.

        Files are processed in sorted name order. Each file gets its own
        Markdown section holding either the recognised text in a fenced code
        block or a placeholder when no text was found.

        Args:
            image_dir: Directory to scan (not recursive) for ``.png`` files.
            output_file: Path of the Markdown report; it is overwritten.
        """
        png_files = sorted(
            name for name in os.listdir(image_dir) if name.endswith('.png')
        )

        results = []
        for filename in png_files:
            image_path = os.path.join(image_dir, filename)
            print(f"正在 OCR: {filename}")

            text = extract_text_from_image(image_path)
            section = (
                f"## {filename}\n\n```\n{text}\n```\n\n---\n"
                if text
                else f"## {filename}\n\n*无文字内容*\n\n---\n"
            )
            results.append(section)

        # Write the report: a title followed by one section per image.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("# OCR 文字提取结果\n\n")
            f.writelines(results)

        print(f"OCR 完成,结果保存到: {output_file}")

except ImportError:
    # The OCR dependencies are optional: when they are missing, the functions
    # above are not defined and installation instructions are printed.
    print("需要安装 OCR 依赖:")
    print("pip install pytesseract pillow")
    print("还需要安装 tesseract 引擎")