feat: manually mirror opencoze's code from bytedance

Change-Id: I09a73aadda978ad9511264a756b2ce51f5761adf
2025-07-20 17:36:12 +08:00
commit 890153324f
14811 changed files with 1923430 additions and 0 deletions
--- a/backend/infra/impl/document/parser/builtin/parse_docx.py
+++ b/backend/infra/impl/document/parser/builtin/parse_docx.py
@@ -0,0 +1,172 @@
+import io
+import os
+import json
+import sys
+import base64
+import logging
+import time
+from abc import ABC
+from typing import List, IO
+
+from docx import ImagePart
+from docx.oxml import CT_P, CT_Tbl
+from docx.table import Table
+from docx.text.paragraph import Paragraph
+from docx import Document
+from PIL import Image
+
+logger = logging.getLogger(__name__)
+
+class DocxLoader(ABC):
+    def __init__(
+            self,
+            file_content: IO[bytes],
+            extract_images: bool = True,
+            extract_tables: bool = True,
+    ):
+        self.file_content = file_content
+        self.extract_images = extract_images
+        self.extract_tables = extract_tables
+
+    def load(self) -> List[dict]:
+        result = []
+        doc = Document(self.file_content)
+        it = iter(doc.element.body)
+        text = ""
+
+        for part in it:
+            blocks = self.parse_part(part, doc)
+            if blocks is None or len(blocks) == 0:
+                continue
+            for block in blocks:
+                if self.extract_images and isinstance(block, list):
+                    for b in block:
+                        image = io.BytesIO()
+                        try:
+                            Image.open(io.BytesIO(b.image.blob)).save(image, format="png")
+                        except Exception as e:
+                            logging.error(f"load image failed, time={time.asctime()}, err:{e}")
+                            raise RuntimeError("ExtractImageError")
+
+                        if len(text) > 0:
+                            result.append(
+                                {
+                                    "content": text,
+                                    "type": "text",
+                                }
+                            )
+                            text = ""
+
+                        result.append(
+                            {
+                                "content": base64.b64encode(image.getvalue()).decode('utf-8'),
+                                "type": "image",
+                            }
+                        )
+
+                if isinstance(block, Paragraph):
+                    text += block.text
+
+                if self.extract_tables and isinstance(block, Table):
+                    rows = block.rows
+                    if len(text) > 0:
+                        result.append(
+                            {
+                                "content": text,
+                                "type": "text",
+                            }
+                        )
+                        text = ""
+                    table = self.convert_table(rows)
+                    result.append(
+                        {
+                            "table": table,
+                            "type": "table",
+                        }
+                    )
+            if text:
+                text += "\n\n"
+        if len(text) > 0:
+            result.append(
+                {
+                    "content": text,
+                    "type": "text",
+                }
+            )
+
+        return result
+
+    def parse_part(self, block, doc: Document):
+        if isinstance(block, CT_P):
+            blocks = []
+            para = Paragraph(block, doc)
+            image_part = self.get_image_part(para, doc)
+            if image_part and para.text:
+                blocks.extend(self.parse_run(para))
+            elif image_part:
+                blocks.append(image_part)
+            elif para.text:
+                blocks.append(para)
+            return blocks
+        elif isinstance(block, CT_Tbl):
+            return [Table(block, doc)]
+
+    def parse_run(self, para: Paragraph):
+        runs = para.runs
+        paras = []
+        if runs is None or len(runs) == 0:
+            return paras
+        for run in runs:
+            if run is None or run.element is None:
+                continue
+            p = Paragraph(run.element, para)
+            image_part = self.get_image_part(p, para)
+            if image_part:
+                paras.append(image_part)
+            else:
+                paras.append(p)
+        return paras
+
+    @staticmethod
+    def get_image_part(graph: Paragraph, doc: Document):
+        images = graph._element.xpath(".//pic:pic")
+        image_parts = []
+        for image in images:
+            for img_id in image.xpath(".//a:blip/@r:embed"):
+                part = doc.part.related_parts[img_id]
+                if isinstance(part, ImagePart):
+                    image_parts.append(part)
+        return image_parts
+
+    @staticmethod
+    def convert_table(rows) -> List[List[str]]:
+        resp_rows = []
+        for i, row in enumerate(rows):
+            resp_row = []
+            for j, cell in enumerate(row.cells):
+                resp_row.append(cell.text if cell is not None else '')
+            resp_rows.append(resp_row)
+
+        return resp_rows
+
+
+if __name__ == "__main__":
+    w = os.fdopen(3, "wb", )
+    r = os.fdopen(4, "rb", )
+
+    try:
+        req = json.load(r)
+        ei, et = req['extract_images'], req['extract_tables']
+        loader = DocxLoader(file_content=io.BytesIO(sys.stdin.buffer.read()), extract_images=ei, extract_tables=et)
+        resp = loader.load()
+        print(f"Extracted {len(resp)} items")
+        result = json.dumps({"content": resp}, ensure_ascii=False)
+        w.write(str.encode(result))
+        w.flush()
+        w.close()
+        print("Docx parse done")
+    except Exception as e:
+        print("Docx parse error", e)
+        w.write(str.encode(json.dumps({"error": str(e)})))
+        w.flush()
+        w.close()