huhan3000/tools/ai-tools/scripts/batch_convert.py

88 lines
2.8 KiB
Python

#!/usr/bin/env python3
"""
批量转换PDF文件为Markdown格式并提取图片
"""
import os
import subprocess
import sys
def convert_pdf_to_markdown(
    pdf_file: str,
    output_dir: str = "converted",
    image_dir: str = "images",
) -> bool:
    """Convert one PDF to Markdown and extract its embedded images.

    Three external commands are invoked in order:

    1. ``markitdown <pdf_file> -o <output_dir>/<name>.md``
    2. ``pdfimages <pdf_file> <image_dir>/<name>/image``
    3. ``convert <x>.ppm <x>.png`` for every ``.ppm`` file found in the
       image sub-directory afterwards (presumably ImageMagick's ``convert``
       -- confirm against the deployment environment).

    Progress and error messages are printed to stdout.

    Args:
        pdf_file: Path of the PDF to process.
        output_dir: Directory that receives the Markdown file. It is created
            when missing.
        image_dir: Parent directory of the per-PDF image sub-directory,
            which is named after the PDF without its extension.

    Returns:
        True when the Markdown conversion and the image extraction both
        succeeded, False otherwise. A failing PPM -> PNG conversion is
        reported but does not turn the result into False.
    """
    # File name without directory and extension, used to name the outputs.
    base_name = os.path.splitext(os.path.basename(pdf_file))[0]
    output_file = os.path.join(output_dir, f"{base_name}.md")
    image_subdir = os.path.join(image_dir, base_name)
    os.makedirs(image_subdir, exist_ok=True)
    print(f"正在处理: {pdf_file}")

    # Step 1: PDF -> Markdown.
    # The broad ``except`` is deliberate: this function is the per-file
    # boundary of a batch job, so one bad file must not abort the whole run.
    try:
        # Create the destination here instead of relying on the caller, so
        # non-default ``output_dir`` values work. An empty string means the
        # current directory, which os.makedirs would reject.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # errors="replace" keeps undecodable bytes in the tool's output from
        # raising UnicodeDecodeError and masking a successful conversion.
        result = subprocess.run(
            ["markitdown", pdf_file, "-o", output_file],
            capture_output=True,
            text=True,
            errors="replace",
        )
        if result.returncode != 0:
            print(f"转换失败: {pdf_file}")
            print(f"错误: {result.stderr}")
            return False
        print(f"✓ Markdown转换完成: {output_file}")
    except Exception as e:
        print(f"转换异常: {pdf_file} - {e}")
        return False

    # Step 2: extract images, then convert every extracted PPM to PNG.
    try:
        result = subprocess.run(
            ["pdfimages", pdf_file, os.path.join(image_subdir, "image")],
            capture_output=True,
            text=True,
            errors="replace",
        )
        if result.returncode != 0:
            print(f"图片提取失败: {pdf_file}")
            print(f"错误: {result.stderr}")
            return False

        ppm_files = sorted(
            f for f in os.listdir(image_subdir) if f.endswith(".ppm")
        )
        converted = 0
        for ppm_file in ppm_files:
            # splitext swaps only the trailing extension; str.replace would
            # also rewrite a ".ppm" occurring earlier in the name.
            png_file = os.path.splitext(ppm_file)[0] + ".png"
            proc = subprocess.run(
                [
                    "convert",
                    os.path.join(image_subdir, ppm_file),
                    os.path.join(image_subdir, png_file),
                ],
                capture_output=True,
            )
            # Report failures instead of silently counting them as converted.
            if proc.returncode == 0:
                converted += 1
            else:
                print(f"图片转换失败: {ppm_file}")
        if ppm_files:
            print(f"✓ 图片转换完成: {converted}张图片")
        print(f"✓ 处理完成: {pdf_file}")
        return True
    except Exception as e:
        print(f"图片处理异常: {pdf_file} - {e}")
        return False
def main() -> None:
    """Convert every PDF found in ``documents/pdfs`` and print a summary.

    The PDF files directly inside ``documents/pdfs`` (relative to the current
    working directory) are processed in file-name order with
    ``convert_pdf_to_markdown`` using its default output locations. The
    ``converted`` and ``images`` directories are created beforehand. A
    separator line is printed after each file and a success count at the end.

    Raises:
        SystemExit: With status 1 when ``documents/pdfs`` cannot be listed.
    """
    pdf_dir = "documents/pdfs"
    try:
        entries = os.listdir(pdf_dir)
    except OSError as e:
        # Fail with a readable message and a non-zero status rather than a
        # raw traceback when the input directory is missing or unreadable.
        print(f"无法读取PDF目录: {pdf_dir} - {e}", file=sys.stderr)
        sys.exit(1)

    # Match the extension case-insensitively so "*.PDF" is not skipped.
    pdf_files = sorted(f for f in entries if f.lower().endswith(".pdf"))
    print(f"找到 {len(pdf_files)} 个PDF文件")

    os.makedirs("converted", exist_ok=True)
    os.makedirs("images", exist_ok=True)

    success_count = 0
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        if convert_pdf_to_markdown(pdf_path):
            success_count += 1
        print("-" * 50)
    print(f"\n处理完成!成功转换 {success_count}/{len(pdf_files)} 个文件")
# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()