#!/usr/bin/env python3 """ 批量转换PDF文件为Markdown格式并提取图片 """ import os import subprocess import sys def convert_pdf_to_markdown(pdf_file, output_dir="converted", image_dir="images"): """转换单个PDF文件""" # 获取文件名(不含扩展名和路径) base_name = os.path.splitext(os.path.basename(pdf_file))[0] # 创建输出文件名 output_file = os.path.join(output_dir, f"{base_name}.md") # 创建图片目录 image_subdir = os.path.join(image_dir, base_name) os.makedirs(image_subdir, exist_ok=True) print(f"正在处理: {pdf_file}") # 转换PDF为Markdown try: cmd = ["markitdown", pdf_file, "-o", output_file] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: print(f"转换失败: {pdf_file}") print(f"错误: {result.stderr}") return False print(f"✓ Markdown转换完成: {output_file}") except Exception as e: print(f"转换异常: {pdf_file} - {e}") return False # 提取图片 try: cmd = ["pdfimages", pdf_file, os.path.join(image_subdir, "image")] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: print(f"图片提取失败: {pdf_file}") print(f"错误: {result.stderr}") return False # 转换PPM为PNG ppm_files = [f for f in os.listdir(image_subdir) if f.endswith('.ppm')] if ppm_files: for ppm_file in ppm_files: png_file = ppm_file.replace('.ppm', '.png') cmd = ["convert", os.path.join(image_subdir, ppm_file), os.path.join(image_subdir, png_file)] subprocess.run(cmd, capture_output=True) print(f"✓ 图片转换完成: {len(ppm_files)}张图片") print(f"✓ 处理完成: {pdf_file}") return True except Exception as e: print(f"图片处理异常: {pdf_file} - {e}") return False def main(): """主函数""" # 获取所有PDF文件 pdf_dir = "documents/pdfs" pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith('.pdf')] pdf_files.sort() # 按文件名排序 print(f"找到 {len(pdf_files)} 个PDF文件") # 创建输出目录 os.makedirs("converted", exist_ok=True) os.makedirs("images", exist_ok=True) success_count = 0 for pdf_file in pdf_files: pdf_path = os.path.join(pdf_dir, pdf_file) if convert_pdf_to_markdown(pdf_path): success_count += 1 print("-" * 50) print(f"\n处理完成!成功转换 {success_count}/{len(pdf_files)} 个文件") if __name__ == "__main__": main()