feat: manually mirror opencoze's code from bytedance
Change-Id: I09a73aadda978ad9511264a756b2ce51f5761adf
152
backend/infra/impl/document/parser/builtin/parse_pdf.py
Normal file
@@ -0,0 +1,152 @@
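# Standalone worker script: reads a PDF from stdin, extracts text, images,
# and tables with pdfplumber, and writes the result as JSON to file
# descriptor 3 (the request options arrive as JSON on descriptor 4).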
import io
import json
import os
import sys
import base64

from typing import Literal, Optional
import pdfplumber
from PIL import Image, ImageChops
from pdfminer.pdfcolor import (
    LITERAL_DEVICE_CMYK,
)
from pdfminer.pdftypes import (
    LITERALS_DCT_DECODE,
    LITERALS_FLATE_DECODE,
)
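

# Overlap ratio of two boxes: intersection area over the smaller box's area.
# Returns 0 when either box is degenerate (zero area).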
def bbox_overlap(bbox1, bbox2):
    x0_1, y0_1, x1_1, y1_1 = bbox1
    x0_2, y0_2, x1_2, y1_2 = bbox2

    x_overlap = max(0, min(x1_1, x1_2) - max(x0_1, x0_2))
    y_overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))

    overlap_area = x_overlap * y_overlap

    bbox1_area = (x1_1 - x0_1) * (y1_1 - y0_1)
    bbox2_area = (x1_2 - x0_2) * (y1_2 - y0_2)
    if bbox1_area == 0 or bbox2_area == 0:
        return 0

    return overlap_area / min(bbox1_area, bbox2_area)
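

# Treat an extracted table as real only if it has at least 2 rows and
# 2 columns; smaller grids are likely stray rules or single lines of text.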
def is_structured_table(table):
    if not table:
        return False
    row_count = len(table)
    col_count = max(len(row) for row in table)
    return row_count >= 2 and col_count >= 2
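

# Walk the PDF page by page with pdfplumber, collecting text, image, and
# table items tagged with their page number and bounding box, then sort by
# reading order and filter out duplicates.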
def extract_pdf_content(pdf_data: bytes, extract_images: bool, extract_tables: bool, filter_pages: Optional[list]):
    with pdfplumber.open(io.BytesIO(pdf_data)) as pdf:
        content = []

        for page_num, page in enumerate(pdf.pages):
            if filter_pages is not None and page_num + 1 in filter_pages:
                print(f"Skip page {page_num + 1}...")
                continue
            print(f"Processing page {page_num + 1}...")
            text = page.extract_text(x_tolerance=2)
            content.append({
                'type': 'text',
                'content': text,
                'page': page_num + 1,
                'bbox': page.bbox
            })
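
            # Decode embedded images: DCT (JPEG) streams are written out
            # unchanged, except CMYK JPEGs, which are inverted and re-encoded
            # as PNG; single-Flate bitmaps are rebuilt with PIL and saved as
            # PNG; anything else is passed through raw. Images that fail to
            # decode are skipped via the except below.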
            if extract_images:
                images = page.images
                for img_index, img in enumerate(images):
                    try:
                        filters = img['stream'].get_filters()
                        data = img['stream'].get_data()
                        buffered = io.BytesIO()

                        if filters[-1][0] in LITERALS_DCT_DECODE:
                            if LITERAL_DEVICE_CMYK in img['colorspace']:
                                i = Image.open(io.BytesIO(data))
                                i = ImageChops.invert(i)
                                i = i.convert("RGB")
                                i.save(buffered, format="PNG")
                            else:
                                buffered.write(data)

                        elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
                            width, height = img['srcsize']
                            channels = len(img['stream'].get_data()) / width / height / (img['bits'] / 8)
                            mode: Literal["1", "L", "RGB", "CMYK"]
                            if img['bits'] == 1:
                                mode = "1"
                            elif img['bits'] == 8 and channels == 1:
                                mode = "L"
                            elif img['bits'] == 8 and channels == 3:
                                mode = "RGB"
                            elif img['bits'] == 8 and channels == 4:
                                mode = "CMYK"
                            i = Image.frombytes(mode, img['srcsize'], data, "raw")
                            i.save(buffered, format="PNG")
                        else:
                            buffered.write(data)
                        content.append({
                            'type': 'image',
                            'content': base64.b64encode(buffered.getvalue()).decode('utf-8'),
                            'page': page_num + 1,
                            'bbox': (img['x0'], img['top'], img['x1'], img['bottom'])
                        })
                    except Exception as err:
                        print(f"Skipping an unsupported image on page {page_num + 1}, error message: {err}")
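
            # pdfplumber's extract_tables() returns cell text only, so each
            # table is recorded with the whole page's bbox rather than its own.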
            if extract_tables:
                tables = page.extract_tables()
                for table in tables:
                    content.append({
                        'type': 'table',
                        'table': table,
                        'page': page_num + 1,
                        'bbox': page.bbox
                    })

        content.sort(key=lambda x: (x['page'], x['bbox'][1], x['bbox'][0]))
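
        # De-duplicate: keep structured tables unconditionally, drop
        # unstructured ones, and drop any other item whose bbox overlaps an
        # already-kept text item by more than 80%.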
        filtered_content = []
        for item in content:
            if item['type'] == 'table':
                if is_structured_table(item['table']):
                    filtered_content.append(item)
                continue
            overlap_found = False
            for existing_item in filtered_content:
                if existing_item['type'] == 'text' and bbox_overlap(item['bbox'], existing_item['bbox']) > 0.8:
                    overlap_found = True
                    break
            if overlap_found:
                continue
            filtered_content.append(item)

        return filtered_content
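

# Entry point: PDF bytes come in on stdin; the JSON request
# ({"extract_images": ..., "extract_tables": ..., "filter_pages": ...}) is
# read from fd 4 and the JSON result is written to fd 3, both presumably
# inherited from the parent process. stdout stays free for progress logs.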
if __name__ == "__main__":
    w = os.fdopen(3, "wb")
    r = os.fdopen(4, "rb")
    pdf_data = sys.stdin.buffer.read()
    print(f"Read {len(pdf_data)} bytes of PDF data")

    try:
        req = json.load(r)
        ei, et, fp = req['extract_images'], req['extract_tables'], req['filter_pages']
        extracted_content = extract_pdf_content(pdf_data, ei, et, fp)
        print(f"Extracted {len(extracted_content)} items")
        result = json.dumps({"content": extracted_content}, ensure_ascii=False)
        w.write(result.encode('utf-8'))
        w.flush()
        w.close()
        print("Pdf parse done")
    except Exception as e:
        print("Pdf parse error", e)
        w.write(json.dumps({"error": str(e)}).encode('utf-8'))
        w.flush()
        w.close()