88 lines
2.8 KiB
Python
88 lines
2.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Batch-convert PDF files to Markdown and extract their embedded images.
|
|
"""
|
|
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
|
|
# Failures expected when launching an external tool or reading its output:
# OSError covers a missing executable or unreadable directory,
# SubprocessError covers subprocess-level failures, and ValueError covers
# malformed arguments or undecodable output.
_TOOL_ERRORS = (OSError, subprocess.SubprocessError, ValueError)


def _run_tool(cmd):
    """Run an external command and return its ``CompletedProcess``.

    Output is captured as text. Undecodable bytes are replaced instead of
    raising, so a stray byte in stderr cannot hide the tool's exit status.
    """
    return subprocess.run(cmd, capture_output=True, text=True, errors="replace")


def _convert_ppm_to_png(image_subdir):
    """Convert every ``.ppm`` file in *image_subdir* to PNG using ``convert``.

    Returns the number of images that were converted successfully. A failed
    conversion is reported on stdout and skipped; it does not abort the rest.
    """
    converted = 0
    for ppm_file in sorted(f for f in os.listdir(image_subdir) if f.endswith(".ppm")):
        # splitext swaps only the trailing extension; str.replace would also
        # rewrite a ".ppm" that happens to appear earlier in the file name.
        png_file = os.path.splitext(ppm_file)[0] + ".png"
        result = _run_tool([
            "convert",
            os.path.join(image_subdir, ppm_file),
            os.path.join(image_subdir, png_file),
        ])
        if result.returncode == 0:
            converted += 1
        else:
            print(f"图片转换失败: {ppm_file}")
            print(f"错误: {result.stderr}")
    return converted


def convert_pdf_to_markdown(pdf_file, output_dir="converted", image_dir="images"):
    """Convert a single PDF to Markdown and extract its images.

    The Markdown is written by the ``markitdown`` command to
    ``<output_dir>/<name>.md``. Images are dumped by ``pdfimages`` into
    ``<image_dir>/<name>/`` and any resulting ``.ppm`` files are additionally
    converted to PNG with ``convert``. Progress and errors are printed.

    Args:
        pdf_file: Path of the PDF to process.
        output_dir: Directory for the generated Markdown file; created if
            it does not exist.
        image_dir: Parent directory for the per-PDF image sub-directory;
            created if it does not exist.

    Returns:
        True when both ``markitdown`` and ``pdfimages`` succeed, False when
        either fails or cannot be started. Individual PPM-to-PNG failures
        are reported but do not change the result.
    """
    # File name without directory or extension, reused for all outputs.
    base_name = os.path.splitext(os.path.basename(pdf_file))[0]
    output_file = os.path.join(output_dir, f"{base_name}.md")
    image_subdir = os.path.join(image_dir, base_name)

    # Create the Markdown directory here too, so the function also works
    # when called without main() or with a non-default output_dir.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    os.makedirs(image_subdir, exist_ok=True)

    print(f"正在处理: {pdf_file}")

    # Step 1: PDF -> Markdown.
    try:
        result = _run_tool(["markitdown", pdf_file, "-o", output_file])
    except _TOOL_ERRORS as e:
        print(f"转换异常: {pdf_file} - {e}")
        return False
    if result.returncode != 0:
        print(f"转换失败: {pdf_file}")
        print(f"错误: {result.stderr}")
        return False
    print(f"✓ Markdown转换完成: {output_file}")

    # Step 2: extract images, then convert PPM output to PNG.
    try:
        result = _run_tool(["pdfimages", pdf_file, os.path.join(image_subdir, "image")])
        if result.returncode != 0:
            print(f"图片提取失败: {pdf_file}")
            print(f"错误: {result.stderr}")
            return False
        converted = _convert_ppm_to_png(image_subdir)
    except _TOOL_ERRORS as e:
        print(f"图片处理异常: {pdf_file} - {e}")
        return False

    if converted:
        print(f"✓ 图片转换完成: {converted}张图片")

    print(f"✓ 处理完成: {pdf_file}")
    return True
|
|
|
|
def main(pdf_dir="documents/pdfs", output_dir="converted", image_dir="images"):
    """Convert every PDF in *pdf_dir* and print a summary.

    Args:
        pdf_dir: Directory scanned (non-recursively) for PDF files.
        output_dir: Directory that receives the generated Markdown files.
        image_dir: Directory that receives the per-PDF image folders.

    Files are processed in sorted name order. If *pdf_dir* does not exist a
    message is printed and nothing else happens.
    """
    if not os.path.isdir(pdf_dir):
        # Report a missing input directory instead of crashing in listdir.
        print(f"未找到PDF目录: {pdf_dir}")
        return

    # Match the extension case-insensitively (".PDF" is common) and skip
    # directories that merely have a name ending in ".pdf".
    pdf_files = sorted(
        name
        for name in os.listdir(pdf_dir)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(pdf_dir, name))
    )

    print(f"找到 {len(pdf_files)} 个PDF文件")

    # Make sure both output roots exist before processing starts.
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    success_count = 0
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        if convert_pdf_to_markdown(pdf_path, output_dir, image_dir):
            success_count += 1
        print("-" * 50)

    print(f"\n处理完成!成功转换 {success_count}/{len(pdf_files)} 个文件")


# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|