feat(infra): integrate PaddleOCR's PP-StructureV3 as a document parser backend (#714)

This commit is contained in:
Lin Manhui
2025-08-13 16:37:42 +08:00
committed by GitHub
parent 708a6ed0c0
commit 6b60c07c22
30 changed files with 657 additions and 174 deletions

View File

@@ -88,7 +88,7 @@ func TestParseXLSX(t *testing.T) {
ChunkingStrategy: nil,
}
-pfn := parseXLSX(config)
+pfn := ParseXLSX(config)
docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{
"document_id": int64(123),
"knowledge_id": int64(456),
@@ -159,7 +159,7 @@ func TestParseXLSXConvertColumnType(t *testing.T) {
ChunkingStrategy: nil,
}
-pfn := parseXLSX(config)
+pfn := ParseXLSX(config)
docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{
"document_id": int64(123),
"knowledge_id": int64(456),