feat(infra): integrate PaddleOCR's PP-StructureV3 as a document parser backend (#714)

This commit is contained in:
Lin Manhui
2025-08-13 16:37:42 +08:00
committed by GitHub
parent 708a6ed0c0
commit 6b60c07c22
30 changed files with 657 additions and 174 deletions

View File

@@ -73,15 +73,15 @@ func (p *pyPDFTableIterator) NextRow() (row []string, end bool, err error) {
return row, false, nil
}
func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR, pyPath, scriptPath string) parseFn {
func ParseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR, pyPath, scriptPath string) ParseFn {
return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
pr, pw, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("[parseByPython] create rpipe failed, %w", err)
return nil, fmt.Errorf("[ParseByPython] create rpipe failed, %w", err)
}
r, w, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("[parseByPython] create pipe failed: %w", err)
return nil, fmt.Errorf("[ParseByPython] create pipe failed: %w", err)
}
options := parser.GetCommonOptions(&parser.Options{ExtraMeta: map[string]any{}}, opts...)
@@ -91,13 +91,13 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR
FilterPages: config.ParsingStrategy.FilterPages,
})
if err != nil {
return nil, fmt.Errorf("[parseByPython] create parse request failed, %w", err)
return nil, fmt.Errorf("[ParseByPython] create parse request failed, %w", err)
}
if _, err = pw.Write(reqb); err != nil {
return nil, fmt.Errorf("[parseByPython] write parse request bytes failed, %w", err)
return nil, fmt.Errorf("[ParseByPython] write parse request bytes failed, %w", err)
}
if err = pw.Close(); err != nil {
return nil, fmt.Errorf("[parseByPython] close write request pipe failed, %w", err)
return nil, fmt.Errorf("[ParseByPython] close write request pipe failed, %w", err)
}
cmd := exec.Command(pyPath, scriptPath)
@@ -105,31 +105,31 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR
cmd.Stdout = os.Stdout
cmd.ExtraFiles = []*os.File{w, pr}
if err = cmd.Start(); err != nil {
return nil, fmt.Errorf("[parseByPython] failed to start Python script: %w", err)
return nil, fmt.Errorf("[ParseByPython] failed to start Python script: %w", err)
}
if err = w.Close(); err != nil {
return nil, fmt.Errorf("[parseByPython] failed to close write pipe: %w", err)
return nil, fmt.Errorf("[ParseByPython] failed to close write pipe: %w", err)
}
result := &pyParseResult{}
if err = json.NewDecoder(r).Decode(result); err != nil {
return nil, fmt.Errorf("[parseByPython] failed to decode result: %w", err)
return nil, fmt.Errorf("[ParseByPython] failed to decode result: %w", err)
}
if err = cmd.Wait(); err != nil {
return nil, fmt.Errorf("[parseByPython] cmd wait err: %w", err)
return nil, fmt.Errorf("[ParseByPython] cmd wait err: %w", err)
}
if result.Error != "" {
return nil, fmt.Errorf("[parseByPython] python execution failed: %s", result.Error)
return nil, fmt.Errorf("[ParseByPython] python execution failed: %s", result.Error)
}
for i, item := range result.Content {
switch item.Type {
case contentTypeText:
partDocs, err := chunkCustom(ctx, item.Content, config, opts...)
partDocs, err := ChunkCustom(ctx, item.Content, config, opts...)
if err != nil {
return nil, fmt.Errorf("[parseByPython] chunk text failed, %w", err)
return nil, fmt.Errorf("[ParseByPython] chunk text failed, %w", err)
}
docs = append(docs, partDocs...)
case contentTypeImage:
@@ -138,9 +138,9 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR
}
image, err := base64.StdEncoding.DecodeString(item.Content)
if err != nil {
return nil, fmt.Errorf("[parseByPython] decode image failed, %w", err)
return nil, fmt.Errorf("[ParseByPython] decode image failed, %w", err)
}
imgSrc, err := putImageObject(ctx, storage, "png", getCreatorIDFromExtraMeta(options.ExtraMeta), image)
imgSrc, err := PutImageObject(ctx, storage, "png", GetCreatorIDFromExtraMeta(options.ExtraMeta), image)
if err != nil {
return nil, err
}
@@ -148,7 +148,7 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR
if config.ParsingStrategy.ImageOCR && ocr != nil {
texts, err := ocr.FromBase64(ctx, item.Content)
if err != nil {
return nil, fmt.Errorf("[parseByPython] FromBase64 failed, %w", err)
return nil, fmt.Errorf("[ParseByPython] FromBase64 failed, %w", err)
}
label += strings.Join(texts, "\n")
}
@@ -181,15 +181,15 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR
ChunkingStrategy: config.ChunkingStrategy,
}, opts...)
if err != nil {
return nil, fmt.Errorf("[parseByPython] parse table failed, %w", err)
return nil, fmt.Errorf("[ParseByPython] parse table failed, %w", err)
}
fmtTableDocs, err := formatTablesInDocument(rawTableDocs)
if err != nil {
return nil, fmt.Errorf("[parseByPython] format table failed, %w", err)
return nil, fmt.Errorf("[ParseByPython] format table failed, %w", err)
}
docs = append(docs, fmtTableDocs...)
default:
return nil, fmt.Errorf("[parseByPython] invalid content type: %s", item.Type)
return nil, fmt.Errorf("[ParseByPython] invalid content type: %s", item.Type)
}
}