From 6b60c07c221a4e847a4ead89fa2153b0dbdcd1c5 Mon Sep 17 00:00:00 2001 From: Lin Manhui Date: Wed, 13 Aug 2025 16:37:42 +0800 Subject: [PATCH] feat(infra): integrate PaddleOCR's PP-StructureV3 as a document parser backend (#714) --- backend/application/application.go | 18 +- .../application/base/appinfra/app_infra.go | 67 ++++ backend/application/knowledge/init.go | 75 ++-- backend/domain/knowledge/service/knowledge.go | 89 +++-- .../infra/contract/document/parser/manager.go | 1 + .../paddleocr_ocr.go => ppocr/ppocr.go} | 12 +- .../document/parser/builtin/chunk_custom.go | 4 +- .../parser/builtin/chunk_custom_test.go | 2 +- .../impl/document/parser/builtin/image.go | 2 +- .../impl/document/parser/builtin/manager.go | 26 +- .../impl/document/parser/builtin/parse_csv.go | 2 +- .../document/parser/builtin/parse_csv_test.go | 8 +- .../document/parser/builtin/parse_image.go | 6 +- .../document/parser/builtin/parse_json.go | 4 +- .../parser/builtin/parse_json_maps.go | 2 +- .../parser/builtin/parse_json_maps_test.go | 2 +- .../parser/builtin/parse_json_test.go | 4 +- .../document/parser/builtin/parse_markdown.go | 8 +- .../parser/builtin/parse_markdown_test.go | 2 +- .../document/parser/builtin/parse_text.go | 6 +- .../document/parser/builtin/parse_xlsx.go | 2 +- .../parser/builtin/parse_xlsx_test.go | 4 +- .../impl/document/parser/builtin/parser.go | 10 +- .../parser/builtin/py_parser_protocol.go | 38 +- .../impl/document/parser/builtin/util.go | 2 +- .../document/parser/ppstructure/manager.go | 91 +++++ .../document/parser/ppstructure/parser.go | 324 ++++++++++++++++++ backend/types/consts/consts.go | 8 + docker/.env.debug.example | 6 + docker/.env.example | 6 + 30 files changed, 657 insertions(+), 174 deletions(-) rename backend/infra/impl/document/ocr/{veocr/paddleocr_ocr.go => ppocr/ppocr.go} (95%) create mode 100644 backend/infra/impl/document/parser/ppstructure/manager.go create mode 100644 backend/infra/impl/document/parser/ppstructure/parser.go diff --git 
a/backend/application/application.go b/backend/application/application.go index cc806af2..74f6a697 100644 --- a/backend/application/application.go +++ b/backend/application/application.go @@ -254,14 +254,16 @@ func (b *basicServices) toPluginServiceComponents() *plugin.ServiceComponents { func (b *basicServices) toKnowledgeServiceComponents(memoryService *memory.MemoryApplicationServices) *knowledge.ServiceComponents { return &knowledge.ServiceComponents{ - DB: b.infra.DB, - IDGenSVC: b.infra.IDGenSVC, - Storage: b.infra.TOSClient, - RDB: memoryService.RDBDomainSVC, - ImageX: b.infra.ImageXClient, - ES: b.infra.ESClient, - EventBus: b.eventbus.resourceEventBus, - CacheCli: b.infra.CacheCli, + DB: b.infra.DB, + IDGenSVC: b.infra.IDGenSVC, + Storage: b.infra.TOSClient, + RDB: memoryService.RDBDomainSVC, + ImageX: b.infra.ImageXClient, + ES: b.infra.ESClient, + EventBus: b.eventbus.resourceEventBus, + CacheCli: b.infra.CacheCli, + OCR: b.infra.OCR, + ParserManager: b.infra.ParserManager, } } diff --git a/backend/application/base/appinfra/app_infra.go b/backend/application/base/appinfra/app_infra.go index 23409fb1..0d78420f 100644 --- a/backend/application/base/appinfra/app_infra.go +++ b/backend/application/base/appinfra/app_infra.go @@ -19,25 +19,37 @@ package appinfra import ( "context" "fmt" + "net/http" "os" "strconv" "strings" "gorm.io/gorm" + "github.com/volcengine/volc-sdk-golang/service/visual" + + "github.com/coze-dev/coze-studio/backend/application/internal" "github.com/coze-dev/coze-studio/backend/infra/contract/cache" + "github.com/coze-dev/coze-studio/backend/infra/contract/chatmodel" "github.com/coze-dev/coze-studio/backend/infra/contract/coderunner" + "github.com/coze-dev/coze-studio/backend/infra/contract/document/ocr" + "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser" "github.com/coze-dev/coze-studio/backend/infra/contract/imagex" "github.com/coze-dev/coze-studio/backend/infra/contract/modelmgr" 
"github.com/coze-dev/coze-studio/backend/infra/impl/cache/redis" "github.com/coze-dev/coze-studio/backend/infra/impl/coderunner/direct" "github.com/coze-dev/coze-studio/backend/infra/impl/coderunner/sandbox" + "github.com/coze-dev/coze-studio/backend/infra/impl/document/ocr/ppocr" + "github.com/coze-dev/coze-studio/backend/infra/impl/document/ocr/veocr" + builtinParser "github.com/coze-dev/coze-studio/backend/infra/impl/document/parser/builtin" + "github.com/coze-dev/coze-studio/backend/infra/impl/document/parser/ppstructure" "github.com/coze-dev/coze-studio/backend/infra/impl/es" "github.com/coze-dev/coze-studio/backend/infra/impl/eventbus" "github.com/coze-dev/coze-studio/backend/infra/impl/idgen" "github.com/coze-dev/coze-studio/backend/infra/impl/imagex/veimagex" "github.com/coze-dev/coze-studio/backend/infra/impl/mysql" "github.com/coze-dev/coze-studio/backend/infra/impl/storage" + "github.com/coze-dev/coze-studio/backend/pkg/logs" "github.com/coze-dev/coze-studio/backend/types/consts" ) @@ -52,6 +64,8 @@ type AppDependencies struct { AppEventProducer eventbus.Producer ModelMgr modelmgr.Manager CodeRunner coderunner.Runner + OCR ocr.OCR + ParserManager parser.Manager } func Init(ctx context.Context) (*AppDependencies, error) { @@ -102,6 +116,14 @@ func Init(ctx context.Context) (*AppDependencies, error) { deps.CodeRunner = initCodeRunner() + deps.OCR = initOCR() + + imageAnnotationModel, _, err := internal.GetBuiltinChatModel(ctx, "IA_") + if err != nil { + return nil, err + } + deps.ParserManager, err = initParserManager(deps.TOSClient, deps.OCR, imageAnnotationModel) + return deps, nil } @@ -183,3 +205,48 @@ func initCodeRunner() coderunner.Runner { return direct.NewRunner() } } + +func initOCR() ocr.OCR { + var ocr ocr.OCR + switch os.Getenv(consts.OCRType) { + case "ve": + ocrAK := os.Getenv(consts.VeOCRAK) + ocrSK := os.Getenv(consts.VeOCRSK) + if ocrAK == "" || ocrSK == "" { + logs.Warnf("[ve_ocr] ak / sk not configured, ocr might not work well") + } + 
inst := visual.NewInstance() + inst.Client.SetAccessKey(ocrAK) + inst.Client.SetSecretKey(ocrSK) + ocr = veocr.NewOCR(&veocr.Config{Client: inst}) + case "paddleocr": + url := os.Getenv(consts.PPOCRAPIURL) + client := &http.Client{} + ocr = ppocr.NewOCR(&ppocr.Config{Client: client, URL: url}) + default: + // accept ocr not configured + } + + return ocr +} + +func initParserManager(storage storage.Storage, ocr ocr.OCR, imageAnnotationModel chatmodel.BaseChatModel) (parser.Manager, error) { + var parserManager parser.Manager + parserType := os.Getenv(consts.ParserType) + switch parserType { + case "builtin": + parserManager = builtinParser.NewManager(storage, ocr, imageAnnotationModel) + case "paddleocr": + url := os.Getenv(consts.PPStructureAPIURL) + client := &http.Client{} + apiConfig := &ppstructure.APIConfig{ + Client: client, + URL: url, + } + parserManager = ppstructure.NewManager(apiConfig, ocr, storage, imageAnnotationModel) + default: + return nil, fmt.Errorf("unexpected document parser type, type=%s", parserType) + } + + return parserManager, nil +} diff --git a/backend/application/knowledge/init.go b/backend/application/knowledge/init.go index ac763177..c7ea4b32 100644 --- a/backend/application/knowledge/init.go +++ b/backend/application/knowledge/init.go @@ -20,7 +20,6 @@ import ( "context" "encoding/json" "fmt" - netHTTP "net/http" "os" "path/filepath" "strconv" @@ -33,7 +32,6 @@ import ( "github.com/cloudwego/eino/schema" "github.com/milvus-io/milvus/client/v2/milvusclient" "github.com/volcengine/volc-sdk-golang/service/vikingdb" - "github.com/volcengine/volc-sdk-golang/service/visual" "gorm.io/gorm" "github.com/coze-dev/coze-studio/backend/application/internal" @@ -42,6 +40,7 @@ import ( "github.com/coze-dev/coze-studio/backend/infra/contract/cache" "github.com/coze-dev/coze-studio/backend/infra/contract/document/nl2sql" "github.com/coze-dev/coze-studio/backend/infra/contract/document/ocr" + 
"github.com/coze-dev/coze-studio/backend/infra/contract/document/parser" "github.com/coze-dev/coze-studio/backend/infra/contract/document/searchstore" "github.com/coze-dev/coze-studio/backend/infra/contract/embedding" "github.com/coze-dev/coze-studio/backend/infra/contract/es" @@ -52,8 +51,6 @@ import ( "github.com/coze-dev/coze-studio/backend/infra/contract/storage" chatmodelImpl "github.com/coze-dev/coze-studio/backend/infra/impl/chatmodel" builtinNL2SQL "github.com/coze-dev/coze-studio/backend/infra/impl/document/nl2sql/builtin" - "github.com/coze-dev/coze-studio/backend/infra/impl/document/ocr/veocr" - builtinParser "github.com/coze-dev/coze-studio/backend/infra/impl/document/parser/builtin" "github.com/coze-dev/coze-studio/backend/infra/impl/document/rerank/rrf" sses "github.com/coze-dev/coze-studio/backend/infra/impl/document/searchstore/elasticsearch" ssmilvus "github.com/coze-dev/coze-studio/backend/infra/impl/document/searchstore/milvus" @@ -70,14 +67,16 @@ import ( ) type ServiceComponents struct { - DB *gorm.DB - IDGenSVC idgen.IDGenerator - Storage storage.Storage - RDB rdb.RDB - ImageX imagex.ImageX - ES es.Client - EventBus search.ResourceEventBus - CacheCli cache.Cmdable + DB *gorm.DB + IDGenSVC idgen.IDGenerator + Storage storage.Storage + RDB rdb.RDB + ImageX imagex.ImageX + ES es.Client + EventBus search.ResourceEventBus + CacheCli cache.Cmdable + OCR ocr.OCR + ParserManager parser.Manager } func InitService(c *ServiceComponents) (*KnowledgeApplicationService, error) { @@ -102,26 +101,6 @@ func InitService(c *ServiceComponents) (*KnowledgeApplicationService, error) { } sManagers = append(sManagers, mgr) - var ocrImpl ocr.OCR - switch os.Getenv("OCR_TYPE") { - case "ve": - ocrAK := os.Getenv("VE_OCR_AK") - ocrSK := os.Getenv("VE_OCR_SK") - if ocrAK == "" || ocrSK == "" { - logs.Warnf("[ve_ocr] ak / sk not configured, ocr might not work well") - } - inst := visual.NewInstance() - inst.Client.SetAccessKey(ocrAK) - inst.Client.SetSecretKey(ocrSK) - 
ocrImpl = veocr.NewOCR(&veocr.Config{Client: inst}) - case "paddleocr": - ppocrURL := os.Getenv("PADDLEOCR_OCR_API_URL") - client := &netHTTP.Client{} - ocrImpl = veocr.NewPPOCR(&veocr.PPOCRConfig{Client: client, URL: ppocrURL}) - default: - // accept ocr not configured - } - root, err := os.Getwd() if err != nil { logs.Warnf("[InitConfig] Failed to get current working directory: %v", err) @@ -158,26 +137,20 @@ func InitService(c *ServiceComponents) (*KnowledgeApplicationService, error) { } } - imageAnnoChatModel, configured, err := internal.GetBuiltinChatModel(ctx, "IA_") - if err != nil { - return nil, err - } - knowledgeDomainSVC, knowledgeEventHandler := knowledgeImpl.NewKnowledgeSVC(&knowledgeImpl.KnowledgeSVCConfig{ - DB: c.DB, - IDGen: c.IDGenSVC, - RDB: c.RDB, - Producer: knowledgeProducer, - SearchStoreManagers: sManagers, - ParseManager: builtinParser.NewManager(c.Storage, ocrImpl, imageAnnoChatModel), // default builtin - Storage: c.Storage, - Rewriter: rewriter, - Reranker: rrf.NewRRFReranker(0), // default rrf - NL2Sql: n2s, - OCR: ocrImpl, - CacheCli: c.CacheCli, - IsAutoAnnotationSupported: configured, - ModelFactory: chatmodelImpl.NewDefaultFactory(), + DB: c.DB, + IDGen: c.IDGenSVC, + RDB: c.RDB, + Producer: knowledgeProducer, + SearchStoreManagers: sManagers, + ParseManager: c.ParserManager, + Storage: c.Storage, + Rewriter: rewriter, + Reranker: rrf.NewRRFReranker(0), // default rrf + NL2Sql: n2s, + OCR: c.OCR, + CacheCli: c.CacheCli, + ModelFactory: chatmodelImpl.NewDefaultFactory(), }) if err = eventbus.DefaultSVC().RegisterConsumer(nameServer, consts.RMQTopicKnowledge, consts.RMQConsumeGroupKnowledge, knowledgeEventHandler); err != nil { diff --git a/backend/domain/knowledge/service/knowledge.go b/backend/domain/knowledge/service/knowledge.go index 8a7c1a25..cf14a065 100644 --- a/backend/domain/knowledge/service/knowledge.go +++ b/backend/domain/knowledge/service/knowledge.go @@ -70,23 +70,22 @@ import ( func NewKnowledgeSVC(config 
*KnowledgeSVCConfig) (Knowledge, eventbus.ConsumerHandler) { svc := &knowledgeSVC{ - knowledgeRepo: repository.NewKnowledgeDAO(config.DB), - documentRepo: repository.NewKnowledgeDocumentDAO(config.DB), - sliceRepo: repository.NewKnowledgeDocumentSliceDAO(config.DB), - reviewRepo: repository.NewKnowledgeDocumentReviewDAO(config.DB), - idgen: config.IDGen, - rdb: config.RDB, - producer: config.Producer, - searchStoreManagers: config.SearchStoreManagers, - parseManager: config.ParseManager, - storage: config.Storage, - reranker: config.Reranker, - rewriter: config.Rewriter, - nl2Sql: config.NL2Sql, - enableCompactTable: ptr.FromOrDefault(config.EnableCompactTable, true), - cacheCli: config.CacheCli, - isAutoAnnotationSupported: config.IsAutoAnnotationSupported, - modelFactory: config.ModelFactory, + knowledgeRepo: repository.NewKnowledgeDAO(config.DB), + documentRepo: repository.NewKnowledgeDocumentDAO(config.DB), + sliceRepo: repository.NewKnowledgeDocumentSliceDAO(config.DB), + reviewRepo: repository.NewKnowledgeDocumentReviewDAO(config.DB), + idgen: config.IDGen, + rdb: config.RDB, + producer: config.Producer, + searchStoreManagers: config.SearchStoreManagers, + parseManager: config.ParseManager, + storage: config.Storage, + reranker: config.Reranker, + rewriter: config.Rewriter, + nl2Sql: config.NL2Sql, + enableCompactTable: ptr.FromOrDefault(config.EnableCompactTable, true), + cacheCli: config.CacheCli, + modelFactory: config.ModelFactory, } if svc.reranker == nil { svc.reranker = rrf.NewRRFReranker(0) @@ -99,21 +98,20 @@ func NewKnowledgeSVC(config *KnowledgeSVCConfig) (Knowledge, eventbus.ConsumerHa } type KnowledgeSVCConfig struct { - DB *gorm.DB // required - IDGen idgen.IDGenerator // required - RDB rdb.RDB // Required: Form storage - Producer eventbus.Producer // Required: Document indexing process goes through mq asynchronous processing - SearchStoreManagers []searchstore.Manager // Required: Vector/Full Text - ParseManager parser.Manager // Optional: 
document segmentation and processing capability, default builtin parser - Storage storage.Storage // required: oss - ModelFactory chatmodel.Factory // Required: Model factory - Rewriter messages2query.MessagesToQuery // Optional: Do not overwrite when not configured - Reranker rerank.Reranker // Optional: default rrf when not configured - NL2Sql nl2sql.NL2SQL // Optional: Not supported by default when not configured - EnableCompactTable *bool // Optional: Table data compression, default true - OCR ocr.OCR // Optional: ocr, ocr function is not available when not provided - CacheCli cache.Cmdable // Optional: cache implementation - IsAutoAnnotationSupported bool // Does it support automatic image labeling? + DB *gorm.DB // required + IDGen idgen.IDGenerator // required + RDB rdb.RDB // Required: Form storage + Producer eventbus.Producer // Required: Document indexing process goes through mq asynchronous processing + SearchStoreManagers []searchstore.Manager // Required: Vector/Full Text + ParseManager parser.Manager // Optional: document segmentation and processing capability, default builtin parser + Storage storage.Storage // required: oss + ModelFactory chatmodel.Factory // Required: Model factory + Rewriter messages2query.MessagesToQuery // Optional: Do not overwrite when not configured + Reranker rerank.Reranker // Optional: default rrf when not configured + NL2Sql nl2sql.NL2SQL // Optional: Not supported by default when not configured + EnableCompactTable *bool // Optional: Table data compression, default true + OCR ocr.OCR // Optional: ocr, ocr function is not available when not provided + CacheCli cache.Cmdable // Optional: cache implementation } type knowledgeSVC struct { @@ -123,18 +121,17 @@ type knowledgeSVC struct { reviewRepo repository.KnowledgeDocumentReviewRepo modelFactory chatmodel.Factory - idgen idgen.IDGenerator - rdb rdb.RDB - producer eventbus.Producer - searchStoreManagers []searchstore.Manager - parseManager parser.Manager - rewriter 
messages2query.MessagesToQuery - reranker rerank.Reranker - storage storage.Storage - nl2Sql nl2sql.NL2SQL - cacheCli cache.Cmdable - enableCompactTable bool // Table data compression - isAutoAnnotationSupported bool // Does it support automatic image labeling? + idgen idgen.IDGenerator + rdb rdb.RDB + producer eventbus.Producer + searchStoreManagers []searchstore.Manager + parseManager parser.Manager + rewriter messages2query.MessagesToQuery + reranker rerank.Reranker + storage storage.Storage + nl2Sql nl2sql.NL2SQL + cacheCli cache.Cmdable + enableCompactTable bool // Table data compression } func (k *knowledgeSVC) CreateKnowledge(ctx context.Context, request *CreateKnowledgeRequest) (response *CreateKnowledgeResponse, err error) { @@ -318,7 +315,7 @@ func (k *knowledgeSVC) checkRequest(request *CreateDocumentRequest) error { } for i := range request.Documents { if request.Documents[i].Type == knowledgeModel.DocumentTypeImage && ptr.From(request.Documents[i].ParsingStrategy.CaptionType) == parser.ImageAnnotationTypeModel { - if !k.isAutoAnnotationSupported { + if !k.parseManager.IsAutoAnnotationSupported() { return errors.New("auto caption type is not supported") } } @@ -1411,7 +1408,7 @@ func (k *knowledgeSVC) ExtractPhotoCaption(ctx context.Context, request *Extract if request == nil { return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty")) } - if !k.isAutoAnnotationSupported { + if !k.parseManager.IsAutoAnnotationSupported() { return nil, errorx.New(errno.ErrKnowledgeAutoAnnotationNotSupportedCode, errorx.KV("msg", "auto annotation is not supported")) } docInfo, err := k.documentRepo.GetByID(ctx, request.DocumentID) diff --git a/backend/infra/contract/document/parser/manager.go b/backend/infra/contract/document/parser/manager.go index 4ce37dee..8c073168 100644 --- a/backend/infra/contract/document/parser/manager.go +++ b/backend/infra/contract/document/parser/manager.go @@ -23,6 +23,7 @@ import ( type Manager interface 
{ GetParser(config *Config) (Parser, error) + IsAutoAnnotationSupported() bool } type Config struct { diff --git a/backend/infra/impl/document/ocr/veocr/paddleocr_ocr.go b/backend/infra/impl/document/ocr/ppocr/ppocr.go similarity index 95% rename from backend/infra/impl/document/ocr/veocr/paddleocr_ocr.go rename to backend/infra/impl/document/ocr/ppocr/ppocr.go index 3a4cef09..468f9b24 100644 --- a/backend/infra/impl/document/ocr/veocr/paddleocr_ocr.go +++ b/backend/infra/impl/document/ocr/ppocr/ppocr.go @@ -14,7 +14,7 @@ * limitations under the License. */ -package veocr +package ppocr import ( "bytes" @@ -28,7 +28,7 @@ import ( "github.com/coze-dev/coze-studio/backend/types/errno" ) -type PPOCRConfig struct { +type Config struct { Client *http.Client URL string @@ -44,12 +44,12 @@ type PPOCRConfig struct { TextRecScoreThresh *float64 } -func NewPPOCR(config *PPOCRConfig) ocr.OCR { +func NewOCR(config *Config) ocr.OCR { return &ppocrImpl{config} } type ppocrImpl struct { - config *PPOCRConfig + config *Config } type ppocrResponse struct { @@ -133,6 +133,10 @@ func (o *ppocrImpl) makeRequest(reqBody map[string]interface{}) ([]string, error } defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode) + } + respBody, err := io.ReadAll(resp.Body) if err != nil { return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode) diff --git a/backend/infra/impl/document/parser/builtin/chunk_custom.go b/backend/infra/impl/document/parser/builtin/chunk_custom.go index 6d67c6e7..2a56cc8f 100644 --- a/backend/infra/impl/document/parser/builtin/chunk_custom.go +++ b/backend/infra/impl/document/parser/builtin/chunk_custom.go @@ -34,10 +34,10 @@ var ( emailRegex = regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`) ) -func chunkCustom(_ context.Context, text string, config *contract.Config, opts ...parser.Option) (docs []*schema.Document, err error) { +func ChunkCustom(_ 
context.Context, text string, config *contract.Config, opts ...parser.Option) (docs []*schema.Document, err error) { cs := config.ChunkingStrategy if cs.Overlap >= cs.ChunkSize { - return nil, fmt.Errorf("[chunkCustom] invalid param, overlap >= chunk_size") + return nil, fmt.Errorf("[ChunkCustom] invalid param, overlap >= chunk_size") } var ( diff --git a/backend/infra/impl/document/parser/builtin/chunk_custom_test.go b/backend/infra/impl/document/parser/builtin/chunk_custom_test.go index 1064a666..7ecae8a8 100644 --- a/backend/infra/impl/document/parser/builtin/chunk_custom_test.go +++ b/backend/infra/impl/document/parser/builtin/chunk_custom_test.go @@ -39,7 +39,7 @@ func TestChunkCustom(t *testing.T) { TrimURLAndEmail: true, } - slices, err := chunkCustom(ctx, text, &parser.Config{ChunkingStrategy: cs}) + slices, err := ChunkCustom(ctx, text, &parser.Config{ChunkingStrategy: cs}) assert.NoError(t, err) assert.Len(t, slices, 10) diff --git a/backend/infra/impl/document/parser/builtin/image.go b/backend/infra/impl/document/parser/builtin/image.go index b3411a20..516ada61 100644 --- a/backend/infra/impl/document/parser/builtin/image.go +++ b/backend/infra/impl/document/parser/builtin/image.go @@ -24,7 +24,7 @@ import ( "github.com/coze-dev/coze-studio/backend/infra/contract/storage" ) -func putImageObject(ctx context.Context, st storage.Storage, imgExt string, uid int64, img []byte) (format string, err error) { +func PutImageObject(ctx context.Context, st storage.Storage, imgExt string, uid int64, img []byte) (format string, err error) { secret := createSecret(uid, imgExt) fileName := fmt.Sprintf("%d_%d_%s.%s", uid, time.Now().UnixNano(), secret, imgExt) objectName := fmt.Sprintf("%s/%s", knowledgePrefix, fileName) diff --git a/backend/infra/impl/document/parser/builtin/manager.go b/backend/infra/impl/document/parser/builtin/manager.go index 1b33b4ac..a7a3a983 100644 --- a/backend/infra/impl/document/parser/builtin/manager.go +++ 
b/backend/infra/impl/document/parser/builtin/manager.go @@ -41,7 +41,7 @@ type manager struct { } func (m *manager) GetParser(config *parser.Config) (parser.Parser, error) { - var pFn parseFn + var pFn ParseFn if config.ParsingStrategy.HeaderLine == 0 && config.ParsingStrategy.DataStartLine == 0 { config.ParsingStrategy.DataStartLine = 1 @@ -52,26 +52,30 @@ func (m *manager) GetParser(config *parser.Config) (parser.Parser, error) { switch config.FileExtension { case parser.FileExtensionPDF: - pFn = parseByPython(config, m.storage, m.ocr, goutil.GetPython3Path(), goutil.GetPythonFilePath("parse_pdf.py")) + pFn = ParseByPython(config, m.storage, m.ocr, goutil.GetPython3Path(), goutil.GetPythonFilePath("parse_pdf.py")) case parser.FileExtensionTXT: - pFn = parseText(config) + pFn = ParseText(config) case parser.FileExtensionMarkdown: - pFn = parseMarkdown(config, m.storage, m.ocr) + pFn = ParseMarkdown(config, m.storage, m.ocr) case parser.FileExtensionDocx: - pFn = parseByPython(config, m.storage, m.ocr, goutil.GetPython3Path(), goutil.GetPythonFilePath("parse_docx.py")) + pFn = ParseByPython(config, m.storage, m.ocr, goutil.GetPython3Path(), goutil.GetPythonFilePath("parse_docx.py")) case parser.FileExtensionCSV: - pFn = parseCSV(config) + pFn = ParseCSV(config) case parser.FileExtensionXLSX: - pFn = parseXLSX(config) + pFn = ParseXLSX(config) case parser.FileExtensionJSON: - pFn = parseJSON(config) + pFn = ParseJSON(config) case parser.FileExtensionJsonMaps: - pFn = parseJSONMaps(config) + pFn = ParseJSONMaps(config) case parser.FileExtensionJPG, parser.FileExtensionJPEG, parser.FileExtensionPNG: - pFn = parseImage(config, m.model) + pFn = ParseImage(config, m.model) default: return nil, fmt.Errorf("[Parse] document type not support, type=%s", config.FileExtension) } - return &p{parseFn: pFn}, nil + return &Parser{ParseFn: pFn}, nil +} + +func (m *manager) IsAutoAnnotationSupported() bool { + return m.model != nil } diff --git 
a/backend/infra/impl/document/parser/builtin/parse_csv.go b/backend/infra/impl/document/parser/builtin/parse_csv.go index 55ccc759..e69237f9 100644 --- a/backend/infra/impl/document/parser/builtin/parse_csv.go +++ b/backend/infra/impl/document/parser/builtin/parse_csv.go @@ -29,7 +29,7 @@ import ( contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser" ) -func parseCSV(config *contract.Config) parseFn { +func ParseCSV(config *contract.Config) ParseFn { return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) { iter := &csvIterator{csv.NewReader(utfbom.SkipOnly(reader))} return parseByRowIterator(iter, config, opts...) diff --git a/backend/infra/impl/document/parser/builtin/parse_csv_test.go b/backend/infra/impl/document/parser/builtin/parse_csv_test.go index 434f7b03..3c9ab4c2 100644 --- a/backend/infra/impl/document/parser/builtin/parse_csv_test.go +++ b/backend/infra/impl/document/parser/builtin/parse_csv_test.go @@ -47,7 +47,7 @@ func TestParseCSV(t *testing.T) { }, ChunkingStrategy: nil, } - p1 := parseCSV(c1) + p1 := ParseCSV(c1) docs, err := p1(ctx, r1, parser.WithExtraMeta(map[string]any{ "document_id": int64(123), "knowledge_id": int64(456), @@ -112,7 +112,7 @@ func TestParseCSV(t *testing.T) { }, ChunkingStrategy: nil, } - p2 := parseCSV(c2) + p2 := ParseCSV(c2) docs, err = p2(ctx, r2, parser.WithExtraMeta(map[string]any{ "document_id": int64(123), "knowledge_id": int64(456), @@ -131,7 +131,7 @@ func TestParseCSVBadCases(t *testing.T) { b, err := io.ReadAll(f) assert.NoError(t, err) - pfn := parseCSV(&contract.Config{ + pfn := ParseCSV(&contract.Config{ FileExtension: "csv", ParsingStrategy: &contract.ParsingStrategy{ ExtractImage: true, @@ -154,7 +154,7 @@ func TestParseCSVBadCases(t *testing.T) { cols, err := document.GetDocumentColumns(resp[0]) assert.NoError(t, err) cols[5].Nullable = false - npfn := parseCSV(&contract.Config{ + npfn := ParseCSV(&contract.Config{ 
FileExtension: "csv", ParsingStrategy: &contract.ParsingStrategy{ ExtractImage: true, diff --git a/backend/infra/impl/document/parser/builtin/parse_image.go b/backend/infra/impl/document/parser/builtin/parse_image.go index 42620708..9cffb66a 100644 --- a/backend/infra/impl/document/parser/builtin/parse_image.go +++ b/backend/infra/impl/document/parser/builtin/parse_image.go @@ -31,7 +31,7 @@ import ( "github.com/coze-dev/coze-studio/backend/types/errno" ) -func parseImage(config *contract.Config, model chatmodel.BaseChatModel) parseFn { +func ParseImage(config *contract.Config, model chatmodel.BaseChatModel) ParseFn { return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) { options := parser.GetCommonOptions(&parser.Options{}, opts...) doc := &schema.Document{ @@ -76,14 +76,14 @@ func parseImage(config *contract.Config, model chatmodel.BaseChatModel) parseFn output, err := model.Generate(ctx, []*schema.Message{input}) if err != nil { - return nil, fmt.Errorf("[parseImage] model generate failed: %w", err) + return nil, fmt.Errorf("[ParseImage] model generate failed: %w", err) } doc.Content = output.Content case contract.ImageAnnotationTypeManual: // do nothing default: - return nil, fmt.Errorf("[parseImage] unknown image annotation type=%d", config.ParsingStrategy.ImageAnnotationType) + return nil, fmt.Errorf("[ParseImage] unknown image annotation type=%d", config.ParsingStrategy.ImageAnnotationType) } return []*schema.Document{doc}, nil diff --git a/backend/infra/impl/document/parser/builtin/parse_json.go b/backend/infra/impl/document/parser/builtin/parse_json.go index e031701c..25fa9332 100644 --- a/backend/infra/impl/document/parser/builtin/parse_json.go +++ b/backend/infra/impl/document/parser/builtin/parse_json.go @@ -28,7 +28,7 @@ import ( contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser" ) -func parseJSON(config *contract.Config) parseFn { +func ParseJSON(config 
*contract.Config) ParseFn { return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) { b, err := io.ReadAll(reader) if err != nil { @@ -41,7 +41,7 @@ func parseJSON(config *contract.Config) parseFn { } if len(rawSlices) == 0 { - return nil, fmt.Errorf("[parseJSON] json data is empty") + return nil, fmt.Errorf("[ParseJSON] json data is empty") } var header []string diff --git a/backend/infra/impl/document/parser/builtin/parse_json_maps.go b/backend/infra/impl/document/parser/builtin/parse_json_maps.go index cd5502cb..55d7df7a 100644 --- a/backend/infra/impl/document/parser/builtin/parse_json_maps.go +++ b/backend/infra/impl/document/parser/builtin/parse_json_maps.go @@ -29,7 +29,7 @@ import ( contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser" ) -func parseJSONMaps(config *contract.Config) parseFn { +func ParseJSONMaps(config *contract.Config) ParseFn { return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) { b, err := io.ReadAll(reader) if err != nil { diff --git a/backend/infra/impl/document/parser/builtin/parse_json_maps_test.go b/backend/infra/impl/document/parser/builtin/parse_json_maps_test.go index 520cf146..1668b761 100644 --- a/backend/infra/impl/document/parser/builtin/parse_json_maps_test.go +++ b/backend/infra/impl/document/parser/builtin/parse_json_maps_test.go @@ -85,7 +85,7 @@ func TestParseTableCustomContent(t *testing.T) { }, } - pfn := parseJSONMaps(config) + pfn := ParseJSONMaps(config) docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{ "document_id": int64(123), "knowledge_id": int64(456), diff --git a/backend/infra/impl/document/parser/builtin/parse_json_test.go b/backend/infra/impl/document/parser/builtin/parse_json_test.go index 99d34bd3..47a605db 100644 --- a/backend/infra/impl/document/parser/builtin/parse_json_test.go +++ b/backend/infra/impl/document/parser/builtin/parse_json_test.go @@ 
-55,7 +55,7 @@ func TestParseJSON(t *testing.T) { }, ChunkingStrategy: nil, } - pfn := parseJSON(config) + pfn := ParseJSON(config) docs, err := pfn(context.Background(), reader, parser.WithExtraMeta(map[string]any{ "document_id": int64(123), "knowledge_id": int64(456), @@ -121,7 +121,7 @@ func TestParseJSONWithSchema(t *testing.T) { }, }, } - pfn := parseJSON(config) + pfn := ParseJSON(config) docs, err := pfn(context.Background(), reader, parser.WithExtraMeta(map[string]any{ "document_id": int64(123), "knowledge_id": int64(456), diff --git a/backend/infra/impl/document/parser/builtin/parse_markdown.go b/backend/infra/impl/document/parser/builtin/parse_markdown.go index 5f247c0d..f734d5e8 100644 --- a/backend/infra/impl/document/parser/builtin/parse_markdown.go +++ b/backend/infra/impl/document/parser/builtin/parse_markdown.go @@ -38,7 +38,7 @@ import ( "github.com/coze-dev/coze-studio/backend/pkg/logs" ) -func parseMarkdown(config *contract.Config, storage storage.Storage, ocr ocr.OCR) parseFn { +func ParseMarkdown(config *contract.Config, storage storage.Storage, ocr ocr.OCR) ParseFn { return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) { options := parser.GetCommonOptions(&parser.Options{}, opts...) 
mdParser := goldmark.DefaultParser() @@ -52,7 +52,7 @@ func parseMarkdown(config *contract.Config, storage storage.Storage, ocr ocr.OCR ps := config.ParsingStrategy if cs.ChunkType != contract.ChunkTypeCustom && cs.ChunkType != contract.ChunkTypeDefault { - return nil, fmt.Errorf("[parseMarkdown] chunk type not support, chunk type=%d", cs.ChunkType) + return nil, fmt.Errorf("[ParseMarkdown] chunk type not support, chunk type=%d", cs.ChunkType) } var ( @@ -173,7 +173,7 @@ func parseMarkdown(config *contract.Config, storage storage.Storage, ocr ocr.OCR return ast.WalkStop, fmt.Errorf("failed to download image: %w", err) } - imgSrc, err := putImageObject(ctx, storage, ext, getCreatorIDFromExtraMeta(options.ExtraMeta), img) + imgSrc, err := PutImageObject(ctx, storage, ext, GetCreatorIDFromExtraMeta(options.ExtraMeta), img) if err != nil { return ast.WalkStop, err } @@ -198,7 +198,7 @@ func parseMarkdown(config *contract.Config, storage storage.Storage, ocr ocr.OCR pushSlice() } } else { - logs.CtxInfof(ctx, "[parseMarkdown] not a valid image url, skip, got=%s", imageURL) + logs.CtxInfof(ctx, "[ParseMarkdown] not a valid image url, skip, got=%s", imageURL) } } } diff --git a/backend/infra/impl/document/parser/builtin/parse_markdown_test.go b/backend/infra/impl/document/parser/builtin/parse_markdown_test.go index 984d19fe..cdf93302 100644 --- a/backend/infra/impl/document/parser/builtin/parse_markdown_test.go +++ b/backend/infra/impl/document/parser/builtin/parse_markdown_test.go @@ -37,7 +37,7 @@ func TestParseMarkdown(t *testing.T) { mockStorage := ms.NewMockStorage(ctrl) mockStorage.EXPECT().PutObject(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() - pfn := parseMarkdown(&contract.Config{ + pfn := ParseMarkdown(&contract.Config{ FileExtension: contract.FileExtensionMarkdown, ParsingStrategy: &contract.ParsingStrategy{ ExtractImage: true, diff --git a/backend/infra/impl/document/parser/builtin/parse_text.go 
b/backend/infra/impl/document/parser/builtin/parse_text.go index 691a9c1a..b7ccfe59 100644 --- a/backend/infra/impl/document/parser/builtin/parse_text.go +++ b/backend/infra/impl/document/parser/builtin/parse_text.go @@ -27,7 +27,7 @@ import ( contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser" ) -func parseText(config *contract.Config) parseFn { +func ParseText(config *contract.Config) ParseFn { return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) { content, err := io.ReadAll(reader) if err != nil { @@ -36,9 +36,9 @@ func parseText(config *contract.Config) parseFn { switch config.ChunkingStrategy.ChunkType { case contract.ChunkTypeCustom, contract.ChunkTypeDefault: - docs, err = chunkCustom(ctx, string(content), config, opts...) + docs, err = ChunkCustom(ctx, string(content), config, opts...) default: - return nil, fmt.Errorf("[parseText] chunk type not support, type=%d", config.ChunkingStrategy.ChunkType) + return nil, fmt.Errorf("[ParseText] chunk type not support, type=%d", config.ChunkingStrategy.ChunkType) } if err != nil { return nil, err diff --git a/backend/infra/impl/document/parser/builtin/parse_xlsx.go b/backend/infra/impl/document/parser/builtin/parse_xlsx.go index 2642d323..e6d2761d 100644 --- a/backend/infra/impl/document/parser/builtin/parse_xlsx.go +++ b/backend/infra/impl/document/parser/builtin/parse_xlsx.go @@ -27,7 +27,7 @@ import ( contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser" ) -func parseXLSX(config *contract.Config) parseFn { +func ParseXLSX(config *contract.Config) ParseFn { return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) { f, err := excelize.OpenReader(reader) if err != nil { diff --git a/backend/infra/impl/document/parser/builtin/parse_xlsx_test.go b/backend/infra/impl/document/parser/builtin/parse_xlsx_test.go index 6aca2355..0238291e 100644 --- 
a/backend/infra/impl/document/parser/builtin/parse_xlsx_test.go +++ b/backend/infra/impl/document/parser/builtin/parse_xlsx_test.go @@ -88,7 +88,7 @@ func TestParseXLSX(t *testing.T) { ChunkingStrategy: nil, } - pfn := parseXLSX(config) + pfn := ParseXLSX(config) docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{ "document_id": int64(123), "knowledge_id": int64(456), @@ -159,7 +159,7 @@ func TestParseXLSXConvertColumnType(t *testing.T) { ChunkingStrategy: nil, } - pfn := parseXLSX(config) + pfn := ParseXLSX(config) docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{ "document_id": int64(123), "knowledge_id": int64(456), diff --git a/backend/infra/impl/document/parser/builtin/parser.go b/backend/infra/impl/document/parser/builtin/parser.go index f73f7786..db5f0fd5 100644 --- a/backend/infra/impl/document/parser/builtin/parser.go +++ b/backend/infra/impl/document/parser/builtin/parser.go @@ -24,12 +24,12 @@ import ( "github.com/cloudwego/eino/schema" ) -type p struct { - parseFn +type Parser struct { + ParseFn } -func (p p) Parse(ctx context.Context, reader io.Reader, opts ...parser.Option) ([]*schema.Document, error) { - return p.parseFn(ctx, reader, opts...) +func (p Parser) Parse(ctx context.Context, reader io.Reader, opts ...parser.Option) ([]*schema.Document, error) { + return p.ParseFn(ctx, reader, opts...) 
} -type parseFn func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) +type ParseFn func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) diff --git a/backend/infra/impl/document/parser/builtin/py_parser_protocol.go b/backend/infra/impl/document/parser/builtin/py_parser_protocol.go index 460c7bd0..3977533a 100644 --- a/backend/infra/impl/document/parser/builtin/py_parser_protocol.go +++ b/backend/infra/impl/document/parser/builtin/py_parser_protocol.go @@ -73,15 +73,15 @@ func (p *pyPDFTableIterator) NextRow() (row []string, end bool, err error) { return row, false, nil } -func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR, pyPath, scriptPath string) parseFn { +func ParseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR, pyPath, scriptPath string) ParseFn { return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) { pr, pw, err := os.Pipe() if err != nil { - return nil, fmt.Errorf("[parseByPython] create rpipe failed, %w", err) + return nil, fmt.Errorf("[ParseByPython] create rpipe failed, %w", err) } r, w, err := os.Pipe() if err != nil { - return nil, fmt.Errorf("[parseByPython] create pipe failed: %w", err) + return nil, fmt.Errorf("[ParseByPython] create pipe failed: %w", err) } options := parser.GetCommonOptions(&parser.Options{ExtraMeta: map[string]any{}}, opts...) 
@@ -91,13 +91,13 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR FilterPages: config.ParsingStrategy.FilterPages, }) if err != nil { - return nil, fmt.Errorf("[parseByPython] create parse request failed, %w", err) + return nil, fmt.Errorf("[ParseByPython] create parse request failed, %w", err) } if _, err = pw.Write(reqb); err != nil { - return nil, fmt.Errorf("[parseByPython] write parse request bytes failed, %w", err) + return nil, fmt.Errorf("[ParseByPython] write parse request bytes failed, %w", err) } if err = pw.Close(); err != nil { - return nil, fmt.Errorf("[parseByPython] close write request pipe failed, %w", err) + return nil, fmt.Errorf("[ParseByPython] close write request pipe failed, %w", err) } cmd := exec.Command(pyPath, scriptPath) @@ -105,31 +105,31 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR cmd.Stdout = os.Stdout cmd.ExtraFiles = []*os.File{w, pr} if err = cmd.Start(); err != nil { - return nil, fmt.Errorf("[parseByPython] failed to start Python script: %w", err) + return nil, fmt.Errorf("[ParseByPython] failed to start Python script: %w", err) } if err = w.Close(); err != nil { - return nil, fmt.Errorf("[parseByPython] failed to close write pipe: %w", err) + return nil, fmt.Errorf("[ParseByPython] failed to close write pipe: %w", err) } result := &pyParseResult{} if err = json.NewDecoder(r).Decode(result); err != nil { - return nil, fmt.Errorf("[parseByPython] failed to decode result: %w", err) + return nil, fmt.Errorf("[ParseByPython] failed to decode result: %w", err) } if err = cmd.Wait(); err != nil { - return nil, fmt.Errorf("[parseByPython] cmd wait err: %w", err) + return nil, fmt.Errorf("[ParseByPython] cmd wait err: %w", err) } if result.Error != "" { - return nil, fmt.Errorf("[parseByPython] python execution failed: %s", result.Error) + return nil, fmt.Errorf("[ParseByPython] python execution failed: %s", result.Error) } for i, item := range result.Content { 
switch item.Type { case contentTypeText: - partDocs, err := chunkCustom(ctx, item.Content, config, opts...) + partDocs, err := ChunkCustom(ctx, item.Content, config, opts...) if err != nil { - return nil, fmt.Errorf("[parseByPython] chunk text failed, %w", err) + return nil, fmt.Errorf("[ParseByPython] chunk text failed, %w", err) } docs = append(docs, partDocs...) case contentTypeImage: @@ -138,9 +138,9 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR } image, err := base64.StdEncoding.DecodeString(item.Content) if err != nil { - return nil, fmt.Errorf("[parseByPython] decode image failed, %w", err) + return nil, fmt.Errorf("[ParseByPython] decode image failed, %w", err) } - imgSrc, err := putImageObject(ctx, storage, "png", getCreatorIDFromExtraMeta(options.ExtraMeta), image) + imgSrc, err := PutImageObject(ctx, storage, "png", GetCreatorIDFromExtraMeta(options.ExtraMeta), image) if err != nil { return nil, err } @@ -148,7 +148,7 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR if config.ParsingStrategy.ImageOCR && ocr != nil { texts, err := ocr.FromBase64(ctx, item.Content) if err != nil { - return nil, fmt.Errorf("[parseByPython] FromBase64 failed, %w", err) + return nil, fmt.Errorf("[ParseByPython] FromBase64 failed, %w", err) } label += strings.Join(texts, "\n") } @@ -181,15 +181,15 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR ChunkingStrategy: config.ChunkingStrategy, }, opts...) if err != nil { - return nil, fmt.Errorf("[parseByPython] parse table failed, %w", err) + return nil, fmt.Errorf("[ParseByPython] parse table failed, %w", err) } fmtTableDocs, err := formatTablesInDocument(rawTableDocs) if err != nil { - return nil, fmt.Errorf("[parseByPython] format table failed, %w", err) + return nil, fmt.Errorf("[ParseByPython] format table failed, %w", err) } docs = append(docs, fmtTableDocs...) 
default: - return nil, fmt.Errorf("[parseByPython] invalid content type: %s", item.Type) + return nil, fmt.Errorf("[ParseByPython] invalid content type: %s", item.Type) } } diff --git a/backend/infra/impl/document/parser/builtin/util.go b/backend/infra/impl/document/parser/builtin/util.go index 924d7e96..40425345 100644 --- a/backend/infra/impl/document/parser/builtin/util.go +++ b/backend/infra/impl/document/parser/builtin/util.go @@ -61,7 +61,7 @@ func getExtension(uri string) string { return "" } -func getCreatorIDFromExtraMeta(extraMeta map[string]any) int64 { +func GetCreatorIDFromExtraMeta(extraMeta map[string]any) int64 { if extraMeta == nil { return 0 } diff --git a/backend/infra/impl/document/parser/ppstructure/manager.go b/backend/infra/impl/document/parser/ppstructure/manager.go new file mode 100644 index 00000000..46c1f940 --- /dev/null +++ b/backend/infra/impl/document/parser/ppstructure/manager.go @@ -0,0 +1,91 @@ +/* + * Copyright 2025 coze-dev Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package ppstructure
+
+import (
+	"fmt"
+
+	"github.com/coze-dev/coze-studio/backend/infra/contract/chatmodel"
+	"github.com/coze-dev/coze-studio/backend/infra/contract/document/ocr"
+	"github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
+	"github.com/coze-dev/coze-studio/backend/infra/contract/storage"
+	"github.com/coze-dev/coze-studio/backend/infra/impl/document/parser/builtin"
+	"github.com/coze-dev/coze-studio/backend/pkg/goutil"
+)
+
+// NewManager builds a parser.Manager that hands PDF documents to the
+// PP-StructureV3 service described by apiConfig and delegates every other
+// supported file extension to the builtin parsers.
+func NewManager(apiConfig *APIConfig, ocr ocr.OCR, storage storage.Storage, imageAnnotationModel chatmodel.BaseChatModel) parser.Manager {
+	mgr := &manager{
+		imageAnnotationModel: imageAnnotationModel,
+		storage:              storage,
+		ocr:                  ocr,
+		apiConfig:            apiConfig,
+	}
+	return mgr
+}
+
+// manager selects a document parser according to the file extension.
+type manager struct {
+	// apiConfig is forwarded to the PP-StructureV3 parser used for PDFs.
+	apiConfig *APIConfig
+	// ocr is forwarded to the PDF, markdown and docx parsers.
+	ocr ocr.OCR
+	// storage is forwarded to the PDF, markdown and docx parsers.
+	storage storage.Storage
+	// imageAnnotationModel is forwarded to the builtin image parser; it may be nil.
+	imageAnnotationModel chatmodel.BaseChatModel
+}
+
+// GetParser returns the parser responsible for config.FileExtension.
+//
+// When both HeaderLine and DataStartLine are zero, DataStartLine is set to 1
+// on the caller's ParsingStrategy. Otherwise HeaderLine must be strictly
+// smaller than DataStartLine, or an error is returned. An unsupported file
+// extension also yields an error.
+func (m *manager) GetParser(config *parser.Config) (parser.Parser, error) {
+	strategy := config.ParsingStrategy
+	switch {
+	case strategy.HeaderLine == 0 && strategy.DataStartLine == 0:
+		// Nothing configured: fall back to "data starts on line 1".
+		strategy.DataStartLine = 1
+	case strategy.HeaderLine >= strategy.DataStartLine:
+		return nil, fmt.Errorf("[GetParser] invalid header line and data start line, header=%d, data_start=%d",
+			strategy.HeaderLine, strategy.DataStartLine)
+	}
+
+	var fn builtin.ParseFn
+	switch config.FileExtension {
+	case parser.FileExtensionPDF:
+		// PDFs are the only type sent to PP-StructureV3; fileType 0 is what
+		// the request carries for them.
+		return &ppstructureParser{
+			parserConfig: config,
+			apiConfig:    m.apiConfig,
+			fileType:     0,
+			ocr:          m.ocr,
+			storage:      m.storage,
+		}, nil
+	case parser.FileExtensionTXT:
+		fn = builtin.ParseText(config)
+	case parser.FileExtensionMarkdown:
+		fn = builtin.ParseMarkdown(config, m.storage, m.ocr)
+	case parser.FileExtensionDocx:
+		fn = builtin.ParseByPython(config, m.storage, m.ocr, goutil.GetPython3Path(), goutil.GetPythonFilePath("parse_docx.py"))
+	case parser.FileExtensionCSV:
+		fn = builtin.ParseCSV(config)
+	case parser.FileExtensionXLSX:
+		fn = builtin.ParseXLSX(config)
+	case parser.FileExtensionJSON:
+		fn = builtin.ParseJSON(config)
+	case parser.FileExtensionJsonMaps:
+		fn = builtin.ParseJSONMaps(config)
+	case parser.FileExtensionJPG, parser.FileExtensionJPEG, parser.FileExtensionPNG:
+		fn = builtin.ParseImage(config, m.imageAnnotationModel)
+	default:
+		return nil, fmt.Errorf("[Parse] document type not support, type=%s", config.FileExtension)
+	}
+
+	// Every non-PDF branch ends up wrapped in the builtin adapter.
+	return &builtin.Parser{ParseFn: fn}, nil
+}
+
+// IsAutoAnnotationSupported reports whether an image annotation model was
+// supplied to NewManager.
+func (m *manager) IsAutoAnnotationSupported() bool {
+	return m.imageAnnotationModel != nil
+}
diff --git a/backend/infra/impl/document/parser/ppstructure/parser.go b/backend/infra/impl/document/parser/ppstructure/parser.go
new file mode 100644
index 00000000..6fe62525
--- /dev/null
+++ b/backend/infra/impl/document/parser/ppstructure/parser.go
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2025 coze-dev Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ppstructure
+
+import (
+	"bytes"
+	"context"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"regexp"
+	"sort"
+	"strings"
+
+	"github.com/cloudwego/eino/components/document/parser"
+	"github.com/cloudwego/eino/schema"
+
+	"github.com/coze-dev/coze-studio/backend/infra/contract/document/ocr"
+	contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
+	"github.com/coze-dev/coze-studio/backend/infra/contract/storage"
+	"github.com/coze-dev/coze-studio/backend/infra/impl/document/parser/builtin"
+)
+
+// imagePlaceholder is the HTML comment substituted for every <img> tag in the
+// markdown returned by the service; the images themselves are appended as
+// separate documents.
+const imagePlaceholder = "<!-- image -->"
+
+// imgTagPattern matches an HTML <img ...> tag, case-insensitively. It is
+// compiled once instead of once per page.
+var imgTagPattern = regexp.MustCompile(`(?i)<img[^>]*>`)
+
+// ppstructureParser parses a document by posting it to a PP-StructureV3
+// layout-parsing HTTP endpoint and chunking the markdown that comes back.
+type ppstructureParser struct {
+	parserConfig *contract.Config // parsing and chunking strategies
+	apiConfig    *APIConfig       // endpoint, HTTP client and optional request fields
+	fileType     int              // sent verbatim as the request's "fileType" field
+	ocr          ocr.OCR          // optional; adds OCR text to extracted images
+	storage      storage.Storage  // destination for extracted images
+}
+
+// APIConfig describes how to reach the PP-StructureV3 service. Every pointer
+// field is optional: a nil pointer leaves the corresponding key out of the
+// request body.
+type APIConfig struct {
+	// Client performs the request; http.DefaultClient is used when it is nil.
+	Client *http.Client
+	// URL is the endpoint the request is POSTed to.
+	URL string
+
+	// see: https://paddlepaddle.github.io/PaddleX/latest/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.html#3
+	UseDocOrientationClassify        *bool
+	UseDocUnwarping                  *bool
+	UseTextlineOrientation           *bool
+	UseSealRecognition               *bool
+	UseFormulaRecognition            *bool
+	UseChartRecognition              *bool
+	UseRegionDetection               *bool
+	LayoutThreshold                  *float64
+	LayoutNms                        *bool
+	LayoutUnclipRatio                *float64
+	LayoutMergeBboxesMode            *string
+	TextDetLimitSideLen              *int64
+	TextDetLimitType                 *string
+	TextDetThresh                    *float64
+	TextDetBoxThresh                 *float64
+	TextDetUnclipRatio               *float64
+	TextRecScoreThresh               *float64
+	SealDetLimitSideLen              *int64
+	SealDetLimitType                 *string
+	SealDetThresh                    *float64
+	SealDetBoxThresh                 *float64
+	SealDetUnclipRatio               *float64
+	SealRecScoreThresh               *float64
+	UseWiredTableCellsTransToHtml    *bool
+	UseWirelessTableCellsTransToHtml *bool
+	UseTableOrientationClassify      *bool
+	UseOcrResultsWithTableCells      *bool
+	UseE2eWiredTableRecModel         *bool
+	UseE2eWirelessTableRecModel      *bool
+}
+
+// ppstructureResponse is the top-level JSON object returned by the service.
+type ppstructureResponse struct {
+	Result *ppstructureInferResult `json:"result"`
+}
+
+// ppstructureInferResult holds one entry per parsed page.
+type ppstructureInferResult struct {
+	LayoutParsingResults []*ppstructureInnerResult `json:"layoutParsingResults"`
+}
+
+// ppstructureInnerResult is the parsing result of a single page.
+type ppstructureInnerResult struct {
+	Markdown *ppstructureMarkdown `json:"markdown"`
+}
+
+// ppstructureMarkdown is the markdown rendition of a page.
+type ppstructureMarkdown struct {
+	Text *string `json:"text"`
+	// Images values are base64-encoded image bytes (Parse decodes them as
+	// such); the keys are presumably image paths referenced by Text.
+	Images  map[string]string `json:"images"`
+	IsStart *bool             `json:"isStart"`
+	IsEnd   *bool             `json:"isEnd"`
+}
+
+// Parse sends the whole content of reader to the PP-StructureV3 service and
+// converts the result into documents.
+//
+// For every returned page that is not listed in ParsingStrategy.FilterPages
+// (1-based), <img> tags in the markdown are replaced by a placeholder comment
+// and the text is chunked with builtin.ChunkCustom. When
+// ParsingStrategy.ExtractImage is set, each image of the page is uploaded to
+// storage and appended as its own document, followed by its OCR text when
+// ParsingStrategy.ImageOCR is set and an OCR backend is available.
+//
+// The request is bound to ctx, so cancelling ctx aborts the HTTP call. Errors
+// are returned for I/O failures, a non-200 response, a malformed response and
+// any chunking, storage or OCR failure.
+func (p *ppstructureParser) Parse(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
+	// TODO(Bobholamovic): Current chunking strategy is rather naive; we should
+	// implement a more sophisticated one that at least takes tables and text
+	// extracted from the images into consideration.
+	options := parser.GetCommonOptions(&parser.Options{ExtraMeta: map[string]any{}}, opts...)
+	strategy := p.parserConfig.ParsingStrategy
+
+	fileBytes, err := io.ReadAll(reader)
+	if err != nil {
+		return nil, fmt.Errorf("[Parse] failed to read the file bytes, %w", err)
+	}
+
+	pages, err := p.requestLayoutParsing(ctx, fileBytes)
+	if err != nil {
+		return nil, err
+	}
+
+	for i, item := range pages {
+		pageNo := i + 1
+		// Pages are validated before filtering, so a malformed page is
+		// reported even when it would have been skipped.
+		if item == nil || item.Markdown == nil || item.Markdown.Text == nil {
+			return nil, fmt.Errorf("[Parse] invalid response, page %d has no markdown text", pageNo)
+		}
+
+		skip := false
+		for _, v := range strategy.FilterPages {
+			if pageNo == v {
+				skip = true
+				break
+			}
+		}
+		if skip {
+			continue
+		}
+
+		// Convert the image in markdown to comments, as the image content will be added later.
+		// TODO(Bobholamovic): Add image numbering
+		text := imgTagPattern.ReplaceAllLiteralString(*item.Markdown.Text, imagePlaceholder)
+
+		partDocs, err := builtin.ChunkCustom(ctx, text, p.parserConfig, opts...)
+		if err != nil {
+			return nil, fmt.Errorf("[Parse] chunk text failed, %w", err)
+		}
+		docs = append(docs, partDocs...)
+
+		if !strategy.ExtractImage {
+			continue
+		}
+		if item.Markdown.Images == nil {
+			return nil, fmt.Errorf("[Parse] invalid response, page %d has no images field", pageNo)
+		}
+		imageDocs, err := p.imageDocuments(ctx, item.Markdown.Images, options.ExtraMeta)
+		if err != nil {
+			return nil, err
+		}
+		docs = append(docs, imageDocs...)
+	}
+
+	return docs, nil
+}
+
+// requestLayoutParsing posts fileBytes (base64-encoded) to the configured
+// endpoint and returns the per-page results of a successful response.
+func (p *ppstructureParser) requestLayoutParsing(ctx context.Context, fileBytes []byte) ([]*ppstructureInnerResult, error) {
+	strategy := p.parserConfig.ParsingStrategy
+	reqBody := p.newRequestBody(base64.StdEncoding.EncodeToString(fileBytes), p.fileType, strategy.ExtractImage, strategy.ExtractTable)
+
+	bodyBytes, err := json.Marshal(reqBody)
+	if err != nil {
+		return nil, fmt.Errorf("[Parse] failed to serialize the request body, %w", err)
+	}
+
+	// Bind the request to ctx so that cancellation and deadlines propagate.
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.apiConfig.URL, bytes.NewReader(bodyBytes))
+	if err != nil {
+		return nil, fmt.Errorf("[Parse] failed to create a new request, %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	client := p.apiConfig.Client
+	if client == nil {
+		client = http.DefaultClient
+	}
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("[Parse] request failed, %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		// There is no underlying error to wrap here; report the status instead.
+		return nil, fmt.Errorf("[Parse] request failed, unexpected status: %s", resp.Status)
+	}
+
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("[Parse] failed to read the response body, %w", err)
+	}
+
+	var res ppstructureResponse
+	if err := json.Unmarshal(respBody, &res); err != nil {
+		return nil, fmt.Errorf("[Parse] failed to deserialize the response body, %w", err)
+	}
+	if res.Result == nil || res.Result.LayoutParsingResults == nil {
+		return nil, fmt.Errorf("[Parse] failed to get the layout parsing result, response has no layoutParsingResults")
+	}
+
+	return res.Result.LayoutParsingResults, nil
+}
+
+// imageDocuments uploads every base64-encoded image in images and returns one
+// document per image, each carrying a copy of extraMeta as metadata. Images
+// are processed in sorted key order so the output is deterministic.
+func (p *ppstructureParser) imageDocuments(ctx context.Context, images map[string]string, extraMeta map[string]any) ([]*schema.Document, error) {
+	names := make([]string, 0, len(images))
+	for name := range images {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+
+	creatorID := builtin.GetCreatorIDFromExtraMeta(extraMeta)
+	withOCR := p.parserConfig.ParsingStrategy.ImageOCR && p.ocr != nil
+
+	docs := make([]*schema.Document, 0, len(names))
+	for _, name := range names {
+		encoded := images[name]
+
+		image, err := base64.StdEncoding.DecodeString(encoded)
+		if err != nil {
+			return nil, fmt.Errorf("[Parse] failed to decode an image, %w", err)
+		}
+
+		imgSrc, err := builtin.PutImageObject(ctx, p.storage, "png", creatorID, image)
+		if err != nil {
+			return nil, err
+		}
+		label := fmt.Sprintf("\n%s", imgSrc)
+
+		if withOCR {
+			texts, err := p.ocr.FromBase64(ctx, encoded)
+			if err != nil {
+				return nil, fmt.Errorf("[Parse] FromBase64 failed, %w", err)
+			}
+			label += strings.Join(texts, "\n")
+		}
+
+		// Each document gets its own metadata map so later mutation of one
+		// does not leak into the others.
+		meta := make(map[string]any, len(extraMeta))
+		for k, v := range extraMeta {
+			meta[k] = v
+		}
+		docs = append(docs, &schema.Document{Content: label, MetaData: meta})
+	}
+
+	return docs, nil
+}
+
+// setOptional stores *value under key when value is non-nil and leaves
+// payload untouched otherwise.
+func setOptional[T any](payload map[string]interface{}, key string, value *T) {
+	if value != nil {
+		payload[key] = *value
+	}
+}
+
+// newRequestBody assembles the JSON payload for the service: the four
+// mandatory fields plus every APIConfig option that has been set.
+func (p *ppstructureParser) newRequestBody(file string, fileType int, extractImage bool, extractTable bool) map[string]interface{} {
+	payload := map[string]interface{}{
+		"file":                file,
+		"fileType":            fileType,
+		"useTableRecognition": extractTable,
+		"visualize":           extractImage,
+	}
+
+	cfg := p.apiConfig
+	setOptional(payload, "useDocOrientationClassify", cfg.UseDocOrientationClassify)
+	setOptional(payload, "useDocUnwarping", cfg.UseDocUnwarping)
+	setOptional(payload, "useTextlineOrientation", cfg.UseTextlineOrientation)
+	setOptional(payload, "useSealRecognition", cfg.UseSealRecognition)
+	setOptional(payload, "useFormulaRecognition", cfg.UseFormulaRecognition)
+	setOptional(payload, "useChartRecognition", cfg.UseChartRecognition)
+	setOptional(payload, "useRegionDetection", cfg.UseRegionDetection)
+	setOptional(payload, "layoutThreshold", cfg.LayoutThreshold)
+	setOptional(payload, "layoutNms", cfg.LayoutNms)
+	setOptional(payload, "layoutUnclipRatio", cfg.LayoutUnclipRatio)
+	setOptional(payload, "layoutMergeBboxesMode", cfg.LayoutMergeBboxesMode)
+	setOptional(payload, "textDetLimitSideLen", cfg.TextDetLimitSideLen)
+	setOptional(payload, "textDetLimitType", cfg.TextDetLimitType)
+	setOptional(payload, "textDetThresh", cfg.TextDetThresh)
+	setOptional(payload, "textDetBoxThresh", cfg.TextDetBoxThresh)
+	setOptional(payload, "textDetUnclipRatio", cfg.TextDetUnclipRatio)
+	setOptional(payload, "textRecScoreThresh", cfg.TextRecScoreThresh)
+	setOptional(payload, "sealDetLimitSideLen", cfg.SealDetLimitSideLen)
+	setOptional(payload, "sealDetLimitType", cfg.SealDetLimitType)
+	setOptional(payload, "sealDetThresh", cfg.SealDetThresh)
+	setOptional(payload, "sealDetBoxThresh", cfg.SealDetBoxThresh)
+	setOptional(payload, "sealDetUnclipRatio", cfg.SealDetUnclipRatio)
+	setOptional(payload, "sealRecScoreThresh", cfg.SealRecScoreThresh)
+	setOptional(payload, "useWiredTableCellsTransToHtml", cfg.UseWiredTableCellsTransToHtml)
+	setOptional(payload, "useWirelessTableCellsTransToHtml", cfg.UseWirelessTableCellsTransToHtml)
+	setOptional(payload, "useTableOrientationClassify", cfg.UseTableOrientationClassify)
+	setOptional(payload, "useOcrResultsWithTableCells", cfg.UseOcrResultsWithTableCells)
+	setOptional(payload, "useE2eWiredTableRecModel", cfg.UseE2eWiredTableRecModel)
+	setOptional(payload, "useE2eWirelessTableRecModel", cfg.UseE2eWirelessTableRecModel)
+
+	return payload
+}
diff --git a/backend/types/consts/consts.go b/backend/types/consts/consts.go
index c034337e..5f103017 100644
--- a/backend/types/consts/consts.go
+++ b/backend/types/consts/consts.go
@@ -87,6 +87,14 @@ const (
 	UseSSL      = "USE_SSL"
 	SSLCertFile = "SSL_CERT_FILE"
 	SSLKeyFile  = "SSL_KEY_FILE"
+
+	OCRType     = "OCR_TYPE"
+	VeOCRAK     = "VE_OCR_AK"
+	VeOCRSK     = "VE_OCR_SK"
+	PPOCRAPIURL = "PADDLEOCR_OCR_API_URL"
+
+	ParserType        = "PARSER_TYPE"
+	PPStructureAPIURL = "PADDLEOCR_STRUCTURE_API_URL"
 )
 
 const (
diff --git a/docker/.env.debug.example b/docker/.env.debug.example
index a585ee4b..f383ade8 100644
--- a/docker/.env.debug.example
+++ b/docker/.env.debug.example
@@ -140,6 +140,12 @@ export VE_OCR_SK=""
 # paddleocr ocr
 export PADDLEOCR_OCR_API_URL=""
 
+# Settings for Document Parser
+# Supported parser types: `builtin`, `paddleocr`
+export PARSER_TYPE="builtin"
+# paddleocr structure
+export PADDLEOCR_STRUCTURE_API_URL=""
+
 # Settings for Model
 # Model for agent & workflow
 # add suffix number to add different models
diff --git a/docker/.env.example b/docker/.env.example
index 29671f70..ab7e5a45 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -137,6 +137,12 @@ export VE_OCR_SK=""
 # paddleocr ocr
export PADDLEOCR_OCR_API_URL="" +# Settings for Document Parser +# Supported parser types: `builtin`, `paddleocr` +export PARSER_TYPE="builtin" +# paddleocr structure +export PADDLEOCR_STRUCTURE_API_URL="" + # Settings for Model # Model for agent & workflow # add suffix number to add different models