huhan3000/ai-tools/scripts/ocr_extractor.py

#!/usr/bin/env python3
"""
OCR 文字提取工具
需要安装: pip install pytesseract pillow
"""

try:
    import pytesseract
    from PIL import Image
    import os

    def extract_text_from_image(image_path):
        """Extract the text contained in an image using Tesseract OCR.

        Recognition is run with the ``chi_sim+eng`` language setting
        (simplified Chinese plus English).

        Args:
            image_path: Path of the image file to read.

        Returns:
            The recognized text with leading/trailing whitespace stripped.
            This function never raises: if opening the image or running OCR
            fails, a string of the form ``"OCR 失败: <error>"`` is returned
            instead.
        """
        try:
            # Open the image as a context manager so the underlying file
            # handle is released deterministically, even when OCR raises.
            # Without this, a batch run keeps handles open until the
            # garbage collector happens to reclaim the Image objects.
            with Image.open(image_path) as image:
                text = pytesseract.image_to_string(image, lang='chi_sim+eng')
                return text.strip()

        except Exception as e:
            # Deliberate best-effort: callers embed this message in their
            # report rather than aborting the whole run on one bad file.
            return f"OCR 失败: {e}"

    def batch_ocr_images(image_dir, output_file="ocr_results.md"):
        """Run OCR on every PNG in a directory and write a Markdown report.

        Files are processed in sorted filename order. Each image gets its own
        ``## <filename>`` section containing either the recognized text in a
        fenced code block or a "no text" placeholder when nothing was
        recognized.

        Args:
            image_dir: Directory that is scanned (non-recursively) for PNG
                files.
            output_file: Path of the Markdown report to create or overwrite.
                Defaults to ``ocr_results.md`` in the current directory.

        Returns:
            None. The report is written to ``output_file`` and progress is
            printed to stdout.

        Raises:
            OSError: If ``image_dir`` cannot be listed or ``output_file``
                cannot be written.
        """
        # Match the extension case-insensitively so that files such as
        # "SCAN.PNG" (common from cameras and Windows tools) are not
        # silently skipped.
        png_files = sorted(
            name for name in os.listdir(image_dir)
            if name.lower().endswith('.png')
        )

        results = []
        for filename in png_files:
            image_path = os.path.join(image_dir, filename)
            print(f"正在 OCR: {filename}")

            text = extract_text_from_image(image_path)

            if text:
                results.append(f"## {filename}\n\n```\n{text}\n```\n\n---\n")
            else:
                # An empty OCR result still gets a section so the report
                # lists every image that was processed.
                results.append(f"## {filename}\n\n*无文字内容*\n\n---\n")

        # Write the report in one go, after all images have been processed.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("# OCR 文字提取结果\n\n")
            f.writelines(results)

        print(f"OCR 完成，结果保存到: {output_file}")

except ImportError:
    print("需要安装 OCR 依赖:")
    print("pip install pytesseract pillow")
    print("还需要安装 tesseract 引擎")