Initial commit: 胡汉三千年项目 - 北朝宇宙理论体系

2025-10-15 07:01:04 +00:00
commit 3b21c65457
2566 changed files with 1867622 additions and 0 deletions
--- a/ai-tools/scripts/ocr_extractor.py
+++ b/ai-tools/scripts/ocr_extractor.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""
+OCR 文字提取工具
+需要安装: pip install pytesseract pillow
+"""
+
+try:
+    import pytesseract
+    from PIL import Image
+    import os
+    
+    def extract_text_from_image(image_path):
+        """从图片中提取文字"""
+        try:
+            # 打开图片
+            image = Image.open(image_path)
+            
+            # 使用 OCR 提取文字
+            text = pytesseract.image_to_string(image, lang='chi_sim+eng')
+            
+            return text.strip()
+            
+        except Exception as e:
+            return f"OCR 失败: {e}"
+    
+    def batch_ocr_images(image_dir, output_file="ocr_results.md"):
+        """批量 OCR 图片"""
+        results = []
+        
+        # 获取所有 PNG 图片
+        png_files = [f for f in os.listdir(image_dir) if f.endswith('.png')]
+        png_files.sort()
+        
+        for filename in png_files:
+            image_path = os.path.join(image_dir, filename)
+            print(f"正在 OCR: {filename}")
+            
+            text = extract_text_from_image(image_path)
+            
+            if text:
+                results.append(f"## {filename}\n\n```\n{text}\n```\n\n---\n")
+            else:
+                results.append(f"## {filename}\n\n*无文字内容*\n\n---\n")
+        
+        # 保存结果
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write("# OCR 文字提取结果\n\n")
+            f.writelines(results)
+        
+        print(f"OCR 完成，结果保存到: {output_file}")
+
+except ImportError:  # imports above failed, so neither function was defined
+    print("需要安装 OCR 依赖:")
+    print("pip install pytesseract pillow")
+    print("还需要安装 tesseract 引擎")