88 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			88 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Python
		
	
	
	
| #!/usr/bin/env python3
 | |
| """
 | |
| 批量转换PDF文件为Markdown格式并提取图片
 | |
| """
 | |
| 
 | |
| import os
 | |
| import subprocess
 | |
| import sys
 | |
| 
 | |
def _run_tool(cmd):
    """Run an external command and return its ``CompletedProcess``.

    Output is captured as text; undecodable bytes are replaced so that odd
    tool output cannot turn a successful run into a decoding exception.
    """
    return subprocess.run(cmd, capture_output=True, text=True, errors="replace")


def _convert_ppm_to_png(image_subdir):
    """Convert every ``.ppm`` file in *image_subdir* to PNG with ``convert``.

    Returns:
        A ``(converted, total)`` tuple: how many conversions exited with
        status 0 and how many ``.ppm`` files were found.
    """
    # NOTE(review): pdfimages may also emit .pbm for monochrome images; those
    # are left untouched here, as in the original implementation - confirm
    # whether they should be converted too.
    ppm_files = sorted(f for f in os.listdir(image_subdir) if f.endswith(".ppm"))
    converted = 0
    for ppm_file in ppm_files:
        # Swap only the extension; str.replace would also rewrite a ".ppm"
        # occurring in the middle of the file name.
        png_file = os.path.splitext(ppm_file)[0] + ".png"
        result = _run_tool([
            "convert",
            os.path.join(image_subdir, ppm_file),
            os.path.join(image_subdir, png_file),
        ])
        if result.returncode == 0:
            converted += 1
        else:
            print(f"图片转换失败: {ppm_file}")
            print(f"错误: {result.stderr}")
    return converted, len(ppm_files)


def convert_pdf_to_markdown(pdf_file, output_dir="converted", image_dir="images"):
    """Convert a single PDF to Markdown and extract its images.

    Runs the external ``markitdown`` command to write
    ``<output_dir>/<name>.md``, then ``pdfimages`` to dump the PDF's images
    into ``<image_dir>/<name>/`` and ``convert`` to turn each extracted
    ``.ppm`` file into a ``.png``.

    Args:
        pdf_file: Path to the source PDF.
        output_dir: Directory for the Markdown file; created if missing.
        image_dir: Parent directory of the per-PDF image folder; created if
            missing.

    Returns:
        True when both ``markitdown`` and ``pdfimages`` succeed, False when
        either exits non-zero or a tool cannot be launched. Failures of
        individual PPM-to-PNG conversions are reported on stdout but do not
        change the return value.
    """
    # File name without directory and extension.
    base_name = os.path.splitext(os.path.basename(pdf_file))[0]
    output_file = os.path.join(output_dir, f"{base_name}.md")
    image_subdir = os.path.join(image_dir, base_name)

    # Create both destinations here so the function also works when called
    # with non-default directories that the caller has not prepared.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    os.makedirs(image_subdir, exist_ok=True)

    print(f"正在处理: {pdf_file}")

    # Convert the PDF to Markdown.
    try:
        result = _run_tool(["markitdown", pdf_file, "-o", output_file])
    except (OSError, ValueError) as e:
        # OSError: tool missing / not executable; ValueError: invalid argument.
        print(f"转换异常: {pdf_file} - {e}")
        return False
    if result.returncode != 0:
        print(f"转换失败: {pdf_file}")
        print(f"错误: {result.stderr}")
        return False
    print(f"✓ Markdown转换完成: {output_file}")

    # Extract the images, then convert PPM output to PNG.
    try:
        result = _run_tool(["pdfimages", pdf_file, os.path.join(image_subdir, "image")])
        if result.returncode != 0:
            print(f"图片提取失败: {pdf_file}")
            print(f"错误: {result.stderr}")
            return False

        converted, total = _convert_ppm_to_png(image_subdir)
    except (OSError, ValueError) as e:
        print(f"图片处理异常: {pdf_file} - {e}")
        return False

    if total:
        # Report the number actually converted, not the number attempted.
        print(f"✓ 图片转换完成: {converted}张图片")

    print(f"✓ 处理完成: {pdf_file}")
    return True
 | |
| 
 | |
def main():
    """Convert every PDF in ``documents/pdfs`` and print a summary.

    Lists the PDF files (suffix matched case-insensitively) in name order,
    ensures the ``converted`` and ``images`` output directories exist, runs
    ``convert_pdf_to_markdown`` on each file and prints how many succeeded.
    If the input directory cannot be read, a message is printed and the
    function returns without creating any output directories.
    """
    pdf_dir = "documents/pdfs"
    try:
        entries = os.listdir(pdf_dir)
    except OSError as e:
        # Missing or unreadable input directory: report instead of crashing.
        print(f"无法读取PDF目录: {pdf_dir} - {e}")
        return

    # Match ".pdf" case-insensitively so files such as "REPORT.PDF" are kept.
    pdf_files = sorted(f for f in entries if f.lower().endswith(".pdf"))

    print(f"找到 {len(pdf_files)} 个PDF文件")

    # Create the output directories.
    os.makedirs("converted", exist_ok=True)
    os.makedirs("images", exist_ok=True)

    success_count = 0
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        if convert_pdf_to_markdown(pdf_path):
            success_count += 1
        print("-" * 50)

    print(f"\n处理完成!成功转换 {success_count}/{len(pdf_files)} 个文件")
 | |
| 
 | |
| if __name__ == "__main__":  # Run the batch conversion only when executed as a script, not on import.
 | |
|     main()
 |