feat(infra): integrate PaddleOCR's PP-StructureV3 as a document parser backend (#714)

Lin Manhui 2025-08-13 16:37:42 +08:00 committed by GitHub
parent 708a6ed0c0
commit 6b60c07c22
30 changed files with 657 additions and 174 deletions

View File

@@ -254,14 +254,16 @@ func (b *basicServices) toPluginServiceComponents() *plugin.ServiceComponents {
func (b *basicServices) toKnowledgeServiceComponents(memoryService *memory.MemoryApplicationServices) *knowledge.ServiceComponents {
    return &knowledge.ServiceComponents{
        DB:            b.infra.DB,
        IDGenSVC:      b.infra.IDGenSVC,
        Storage:       b.infra.TOSClient,
        RDB:           memoryService.RDBDomainSVC,
        ImageX:        b.infra.ImageXClient,
        ES:            b.infra.ESClient,
        EventBus:      b.eventbus.resourceEventBus,
        CacheCli:      b.infra.CacheCli,
+       OCR:           b.infra.OCR,
+       ParserManager: b.infra.ParserManager,
    }
}

View File

@@ -19,25 +19,37 @@ package appinfra
import (
    "context"
    "fmt"
+   "net/http"
    "os"
    "strconv"
    "strings"

    "gorm.io/gorm"

+   "github.com/volcengine/volc-sdk-golang/service/visual"
+
+   "github.com/coze-dev/coze-studio/backend/application/internal"
    "github.com/coze-dev/coze-studio/backend/infra/contract/cache"
+   "github.com/coze-dev/coze-studio/backend/infra/contract/chatmodel"
    "github.com/coze-dev/coze-studio/backend/infra/contract/coderunner"
+   "github.com/coze-dev/coze-studio/backend/infra/contract/document/ocr"
+   "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
    "github.com/coze-dev/coze-studio/backend/infra/contract/imagex"
    "github.com/coze-dev/coze-studio/backend/infra/contract/modelmgr"
    "github.com/coze-dev/coze-studio/backend/infra/impl/cache/redis"
    "github.com/coze-dev/coze-studio/backend/infra/impl/coderunner/direct"
    "github.com/coze-dev/coze-studio/backend/infra/impl/coderunner/sandbox"
+   "github.com/coze-dev/coze-studio/backend/infra/impl/document/ocr/ppocr"
+   "github.com/coze-dev/coze-studio/backend/infra/impl/document/ocr/veocr"
+   builtinParser "github.com/coze-dev/coze-studio/backend/infra/impl/document/parser/builtin"
+   "github.com/coze-dev/coze-studio/backend/infra/impl/document/parser/ppstructure"
    "github.com/coze-dev/coze-studio/backend/infra/impl/es"
    "github.com/coze-dev/coze-studio/backend/infra/impl/eventbus"
    "github.com/coze-dev/coze-studio/backend/infra/impl/idgen"
    "github.com/coze-dev/coze-studio/backend/infra/impl/imagex/veimagex"
    "github.com/coze-dev/coze-studio/backend/infra/impl/mysql"
    "github.com/coze-dev/coze-studio/backend/infra/impl/storage"
+   "github.com/coze-dev/coze-studio/backend/pkg/logs"
    "github.com/coze-dev/coze-studio/backend/types/consts"
)
@@ -52,6 +64,8 @@ type AppDependencies struct {
    AppEventProducer eventbus.Producer
    ModelMgr         modelmgr.Manager
    CodeRunner       coderunner.Runner
+   OCR              ocr.OCR
+   ParserManager    parser.Manager
}

func Init(ctx context.Context) (*AppDependencies, error) {
@@ -102,6 +116,14 @@ func Init(ctx context.Context) (*AppDependencies, error) {
    deps.CodeRunner = initCodeRunner()

+   deps.OCR = initOCR()
+
+   imageAnnotationModel, _, err := internal.GetBuiltinChatModel(ctx, "IA_")
+   if err != nil {
+       return nil, err
+   }
+
+   deps.ParserManager, err = initParserManager(deps.TOSClient, deps.OCR, imageAnnotationModel)

    return deps, nil
}

@@ -183,3 +205,48 @@ func initCodeRunner() coderunner.Runner {
        return direct.NewRunner()
    }
}
func initOCR() ocr.OCR {
    var ocr ocr.OCR
    switch os.Getenv(consts.OCRType) {
    case "ve":
        ocrAK := os.Getenv(consts.VeOCRAK)
        ocrSK := os.Getenv(consts.VeOCRSK)
        if ocrAK == "" || ocrSK == "" {
            logs.Warnf("[ve_ocr] ak / sk not configured, ocr might not work well")
        }
        inst := visual.NewInstance()
        inst.Client.SetAccessKey(ocrAK)
        inst.Client.SetSecretKey(ocrSK)
        ocr = veocr.NewOCR(&veocr.Config{Client: inst})
    case "paddleocr":
        url := os.Getenv(consts.PPOCRAPIURL)
        client := &http.Client{}
        ocr = ppocr.NewOCR(&ppocr.Config{Client: client, URL: url})
    default:
        // accept ocr not configured
    }
    return ocr
}
func initParserManager(storage storage.Storage, ocr ocr.OCR, imageAnnotationModel chatmodel.BaseChatModel) (parser.Manager, error) {
    var parserManager parser.Manager
    parserType := os.Getenv(consts.ParserType)
    switch parserType {
    case "builtin":
        parserManager = builtinParser.NewManager(storage, ocr, imageAnnotationModel)
    case "paddleocr":
        url := os.Getenv(consts.PPStructureAPIURL)
        client := &http.Client{}
        apiConfig := &ppstructure.APIConfig{
            Client: client,
            URL:    url,
        }
        parserManager = ppstructure.NewManager(apiConfig, ocr, storage, imageAnnotationModel)
    default:
        return nil, fmt.Errorf("unexpected document parser type, type=%s", parserType)
    }
    return parserManager, nil
}
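
Not part of the diff — a minimal sketch of how downstream code consumes the parser.Manager wired up above, whichever backend PARSER_TYPE selects. The Config/ParsingStrategy/ChunkingStrategy field names are taken from other hunks in this commit; the chunk sizes, helper name, and package are assumptions for illustration.

package example

import (
    "context"
    "os"

    parser "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
)

// parsePDF obtains a PDF parser from the configured manager and turns the file
// into schema.Document slices ready for indexing.
func parsePDF(ctx context.Context, mgr parser.Manager, path string) error {
    f, err := os.Open(path)
    if err != nil {
        return err
    }
    defer f.Close()

    // With PARSER_TYPE=paddleocr this returns the PP-StructureV3-backed parser;
    // with PARSER_TYPE=builtin it returns the Python PDF pipeline instead.
    p, err := mgr.GetParser(&parser.Config{
        FileExtension: parser.FileExtensionPDF,
        ParsingStrategy: &parser.ParsingStrategy{
            ExtractImage: true,
            ExtractTable: true,
        },
        ChunkingStrategy: &parser.ChunkingStrategy{
            ChunkType: parser.ChunkTypeDefault,
            ChunkSize: 800, // assumed values; Overlap must stay below ChunkSize
            Overlap:   100,
        },
    })
    if err != nil {
        return err
    }

    docs, err := p.Parse(ctx, f)
    if err != nil {
        return err
    }
    _ = docs // hand off to the knowledge indexing pipeline
    return nil
}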

View File

@@ -20,7 +20,6 @@ import (
    "context"
    "encoding/json"
    "fmt"
-   netHTTP "net/http"
    "os"
    "path/filepath"
    "strconv"
@@ -33,7 +32,6 @@ import (
    "github.com/cloudwego/eino/schema"
    "github.com/milvus-io/milvus/client/v2/milvusclient"
    "github.com/volcengine/volc-sdk-golang/service/vikingdb"
-   "github.com/volcengine/volc-sdk-golang/service/visual"
    "gorm.io/gorm"

    "github.com/coze-dev/coze-studio/backend/application/internal"
@@ -42,6 +40,7 @@ import (
    "github.com/coze-dev/coze-studio/backend/infra/contract/cache"
    "github.com/coze-dev/coze-studio/backend/infra/contract/document/nl2sql"
    "github.com/coze-dev/coze-studio/backend/infra/contract/document/ocr"
+   "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
    "github.com/coze-dev/coze-studio/backend/infra/contract/document/searchstore"
    "github.com/coze-dev/coze-studio/backend/infra/contract/embedding"
    "github.com/coze-dev/coze-studio/backend/infra/contract/es"
@@ -52,8 +51,6 @@ import (
    "github.com/coze-dev/coze-studio/backend/infra/contract/storage"
    chatmodelImpl "github.com/coze-dev/coze-studio/backend/infra/impl/chatmodel"
    builtinNL2SQL "github.com/coze-dev/coze-studio/backend/infra/impl/document/nl2sql/builtin"
-   "github.com/coze-dev/coze-studio/backend/infra/impl/document/ocr/veocr"
-   builtinParser "github.com/coze-dev/coze-studio/backend/infra/impl/document/parser/builtin"
    "github.com/coze-dev/coze-studio/backend/infra/impl/document/rerank/rrf"
    sses "github.com/coze-dev/coze-studio/backend/infra/impl/document/searchstore/elasticsearch"
    ssmilvus "github.com/coze-dev/coze-studio/backend/infra/impl/document/searchstore/milvus"
@@ -70,14 +67,16 @@ import (
)

type ServiceComponents struct {
    DB            *gorm.DB
    IDGenSVC      idgen.IDGenerator
    Storage       storage.Storage
    RDB           rdb.RDB
    ImageX        imagex.ImageX
    ES            es.Client
    EventBus      search.ResourceEventBus
    CacheCli      cache.Cmdable
+   OCR           ocr.OCR
+   ParserManager parser.Manager
}
func InitService(c *ServiceComponents) (*KnowledgeApplicationService, error) {
@@ -102,26 +101,6 @@ func InitService(c *ServiceComponents) (*KnowledgeApplicationService, error) {
        }
        sManagers = append(sManagers, mgr)

-   var ocrImpl ocr.OCR
-   switch os.Getenv("OCR_TYPE") {
-   case "ve":
-       ocrAK := os.Getenv("VE_OCR_AK")
-       ocrSK := os.Getenv("VE_OCR_SK")
-       if ocrAK == "" || ocrSK == "" {
-           logs.Warnf("[ve_ocr] ak / sk not configured, ocr might not work well")
-       }
-       inst := visual.NewInstance()
-       inst.Client.SetAccessKey(ocrAK)
-       inst.Client.SetSecretKey(ocrSK)
-       ocrImpl = veocr.NewOCR(&veocr.Config{Client: inst})
-   case "paddleocr":
-       ppocrURL := os.Getenv("PADDLEOCR_OCR_API_URL")
-       client := &netHTTP.Client{}
-       ocrImpl = veocr.NewPPOCR(&veocr.PPOCRConfig{Client: client, URL: ppocrURL})
-   default:
-       // accept ocr not configured
-   }

    root, err := os.Getwd()
    if err != nil {
        logs.Warnf("[InitConfig] Failed to get current working directory: %v", err)
@@ -158,26 +137,20 @@ func InitService(c *ServiceComponents) (*KnowledgeApplicationService, error) {
        }
    }

-   imageAnnoChatModel, configured, err := internal.GetBuiltinChatModel(ctx, "IA_")
-   if err != nil {
-       return nil, err
-   }
-
    knowledgeDomainSVC, knowledgeEventHandler := knowledgeImpl.NewKnowledgeSVC(&knowledgeImpl.KnowledgeSVCConfig{
        DB:                  c.DB,
        IDGen:               c.IDGenSVC,
        RDB:                 c.RDB,
        Producer:            knowledgeProducer,
        SearchStoreManagers: sManagers,
-       ParseManager:        builtinParser.NewManager(c.Storage, ocrImpl, imageAnnoChatModel), // default builtin
+       ParseManager:        c.ParserManager,
        Storage:             c.Storage,
        Rewriter:            rewriter,
        Reranker:            rrf.NewRRFReranker(0), // default rrf
        NL2Sql:              n2s,
-       OCR:                 ocrImpl,
+       OCR:                 c.OCR,
        CacheCli:            c.CacheCli,
-       IsAutoAnnotationSupported: configured,
        ModelFactory:        chatmodelImpl.NewDefaultFactory(),
    })

    if err = eventbus.DefaultSVC().RegisterConsumer(nameServer, consts.RMQTopicKnowledge, consts.RMQConsumeGroupKnowledge, knowledgeEventHandler); err != nil {

View File

@@ -70,23 +70,22 @@ import (
func NewKnowledgeSVC(config *KnowledgeSVCConfig) (Knowledge, eventbus.ConsumerHandler) {
    svc := &knowledgeSVC{
        knowledgeRepo:       repository.NewKnowledgeDAO(config.DB),
        documentRepo:        repository.NewKnowledgeDocumentDAO(config.DB),
        sliceRepo:           repository.NewKnowledgeDocumentSliceDAO(config.DB),
        reviewRepo:          repository.NewKnowledgeDocumentReviewDAO(config.DB),
        idgen:               config.IDGen,
        rdb:                 config.RDB,
        producer:            config.Producer,
        searchStoreManagers: config.SearchStoreManagers,
        parseManager:        config.ParseManager,
        storage:             config.Storage,
        reranker:            config.Reranker,
        rewriter:            config.Rewriter,
        nl2Sql:              config.NL2Sql,
        enableCompactTable:  ptr.FromOrDefault(config.EnableCompactTable, true),
        cacheCli:            config.CacheCli,
-       isAutoAnnotationSupported: config.IsAutoAnnotationSupported,
        modelFactory:        config.ModelFactory,
    }

    if svc.reranker == nil {
        svc.reranker = rrf.NewRRFReranker(0)
@@ -99,21 +98,20 @@ func NewKnowledgeSVC(config *KnowledgeSVCConfig) (Knowledge, eventbus.ConsumerHa
}

type KnowledgeSVCConfig struct {
    DB                  *gorm.DB              // required
    IDGen               idgen.IDGenerator     // required
    RDB                 rdb.RDB               // Required: Form storage
    Producer            eventbus.Producer     // Required: Document indexing process goes through mq asynchronous processing
    SearchStoreManagers []searchstore.Manager // Required: Vector/Full Text
    ParseManager        parser.Manager        // Optional: document segmentation and processing capability, default builtin parser
    Storage             storage.Storage       // required: oss
    ModelFactory        chatmodel.Factory     // Required: Model factory
    Rewriter            messages2query.MessagesToQuery // Optional: Do not overwrite when not configured
    Reranker            rerank.Reranker       // Optional: default rrf when not configured
    NL2Sql              nl2sql.NL2SQL         // Optional: Not supported by default when not configured
    EnableCompactTable  *bool                 // Optional: Table data compression, default true
    OCR                 ocr.OCR               // Optional: ocr, ocr function is not available when not provided
    CacheCli            cache.Cmdable         // Optional: cache implementation
-   IsAutoAnnotationSupported bool             // Does it support automatic image labeling?
}
type knowledgeSVC struct {
@@ -123,18 +121,17 @@ type knowledgeSVC struct {
    reviewRepo          repository.KnowledgeDocumentReviewRepo
    modelFactory        chatmodel.Factory
    idgen               idgen.IDGenerator
    rdb                 rdb.RDB
    producer            eventbus.Producer
    searchStoreManagers []searchstore.Manager
    parseManager        parser.Manager
    rewriter            messages2query.MessagesToQuery
    reranker            rerank.Reranker
    storage             storage.Storage
    nl2Sql              nl2sql.NL2SQL
    cacheCli            cache.Cmdable
    enableCompactTable  bool // Table data compression
-   isAutoAnnotationSupported bool // Does it support automatic image labeling?
}
func (k *knowledgeSVC) CreateKnowledge(ctx context.Context, request *CreateKnowledgeRequest) (response *CreateKnowledgeResponse, err error) {
@@ -318,7 +315,7 @@ func (k *knowledgeSVC) checkRequest(request *CreateDocumentRequest) error {
    }
    for i := range request.Documents {
        if request.Documents[i].Type == knowledgeModel.DocumentTypeImage && ptr.From(request.Documents[i].ParsingStrategy.CaptionType) == parser.ImageAnnotationTypeModel {
-           if !k.isAutoAnnotationSupported {
+           if !k.parseManager.IsAutoAnnotationSupported() {
                return errors.New("auto caption type is not supported")
            }
        }
@@ -1411,7 +1408,7 @@ func (k *knowledgeSVC) ExtractPhotoCaption(ctx context.Context, request *Extract
    if request == nil {
        return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
    }
-   if !k.isAutoAnnotationSupported {
+   if !k.parseManager.IsAutoAnnotationSupported() {
        return nil, errorx.New(errno.ErrKnowledgeAutoAnnotationNotSupportedCode, errorx.KV("msg", "auto annotation is not supported"))
    }
    docInfo, err := k.documentRepo.GetByID(ctx, request.DocumentID)

View File

@@ -23,6 +23,7 @@ import (
type Manager interface {
    GetParser(config *Config) (Parser, error)
+   IsAutoAnnotationSupported() bool
}

type Config struct {
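
For illustration only (not in the commit): any other Manager implementation now has to satisfy the extended interface as well. A minimal stub, written as if it lived in this contract package, delegates parser construction while reporting that model-based captioning is unavailable:

// noAnnotationManager wraps another Manager but never offers auto annotation,
// e.g. when no image-annotation chat model is configured.
type noAnnotationManager struct {
    inner Manager
}

func (m *noAnnotationManager) GetParser(config *Config) (Parser, error) {
    return m.inner.GetParser(config)
}

func (m *noAnnotationManager) IsAutoAnnotationSupported() bool {
    return false // no chat model wired in, so auto captioning is rejected upstream
}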

View File

@@ -14,7 +14,7 @@
 * limitations under the License.
 */

-package veocr
+package ppocr

import (
    "bytes"
@@ -28,7 +28,7 @@ import (
    "github.com/coze-dev/coze-studio/backend/types/errno"
)

-type PPOCRConfig struct {
+type Config struct {
    Client *http.Client
    URL    string
@@ -44,12 +44,12 @@ type PPOCRConfig struct {
    TextRecScoreThresh *float64
}

-func NewPPOCR(config *PPOCRConfig) ocr.OCR {
+func NewOCR(config *Config) ocr.OCR {
    return &ppocrImpl{config}
}

type ppocrImpl struct {
-   config *PPOCRConfig
+   config *Config
}

type ppocrResponse struct {
@@ -133,6 +133,10 @@ func (o *ppocrImpl) makeRequest(reqBody map[string]interface{}) ([]string, error
    }
    defer resp.Body.Close()

+   if resp.StatusCode != http.StatusOK {
+       return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
+   }
+
    respBody, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
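
For reference (not part of the commit): with the rename, callers construct the PaddleOCR-backed client through ppocr.NewOCR instead of veocr.NewPPOCR. A minimal usage sketch — the package name and endpoint URL are assumed:

package example

import (
    "context"
    "fmt"
    "net/http"

    "github.com/coze-dev/coze-studio/backend/infra/impl/document/ocr/ppocr"
)

// recognize sends a base64-encoded image to the PaddleOCR service and prints
// the recognized text lines returned through the ocr.OCR contract.
func recognize(ctx context.Context, imageB64 string) error {
    cli := ppocr.NewOCR(&ppocr.Config{
        Client: &http.Client{},
        URL:    "http://127.0.0.1:8080/ocr", // assumed PaddleOCR serving endpoint
    })
    texts, err := cli.FromBase64(ctx, imageB64)
    if err != nil {
        return err
    }
    for _, t := range texts {
        fmt.Println(t)
    }
    return nil
}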

View File

@@ -34,10 +34,10 @@ var (
    emailRegex = regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`)
)

-func chunkCustom(_ context.Context, text string, config *contract.Config, opts ...parser.Option) (docs []*schema.Document, err error) {
+func ChunkCustom(_ context.Context, text string, config *contract.Config, opts ...parser.Option) (docs []*schema.Document, err error) {
    cs := config.ChunkingStrategy
    if cs.Overlap >= cs.ChunkSize {
-       return nil, fmt.Errorf("[chunkCustom] invalid param, overlap >= chunk_size")
+       return nil, fmt.Errorf("[ChunkCustom] invalid param, overlap >= chunk_size")
    }

    var (

View File

@@ -39,7 +39,7 @@ func TestChunkCustom(t *testing.T) {
        TrimURLAndEmail: true,
    }

-   slices, err := chunkCustom(ctx, text, &parser.Config{ChunkingStrategy: cs})
+   slices, err := ChunkCustom(ctx, text, &parser.Config{ChunkingStrategy: cs})
    assert.NoError(t, err)
    assert.Len(t, slices, 10)

View File

@@ -24,7 +24,7 @@ import (
    "github.com/coze-dev/coze-studio/backend/infra/contract/storage"
)

-func putImageObject(ctx context.Context, st storage.Storage, imgExt string, uid int64, img []byte) (format string, err error) {
+func PutImageObject(ctx context.Context, st storage.Storage, imgExt string, uid int64, img []byte) (format string, err error) {
    secret := createSecret(uid, imgExt)
    fileName := fmt.Sprintf("%d_%d_%s.%s", uid, time.Now().UnixNano(), secret, imgExt)
    objectName := fmt.Sprintf("%s/%s", knowledgePrefix, fileName)

View File

@@ -41,7 +41,7 @@ type manager struct {
}

func (m *manager) GetParser(config *parser.Config) (parser.Parser, error) {
-   var pFn parseFn
+   var pFn ParseFn

    if config.ParsingStrategy.HeaderLine == 0 && config.ParsingStrategy.DataStartLine == 0 {
        config.ParsingStrategy.DataStartLine = 1
@@ -52,26 +52,30 @@ func (m *manager) GetParser(config *parser.Config) (parser.Parser, error) {
    switch config.FileExtension {
    case parser.FileExtensionPDF:
-       pFn = parseByPython(config, m.storage, m.ocr, goutil.GetPython3Path(), goutil.GetPythonFilePath("parse_pdf.py"))
+       pFn = ParseByPython(config, m.storage, m.ocr, goutil.GetPython3Path(), goutil.GetPythonFilePath("parse_pdf.py"))
    case parser.FileExtensionTXT:
-       pFn = parseText(config)
+       pFn = ParseText(config)
    case parser.FileExtensionMarkdown:
-       pFn = parseMarkdown(config, m.storage, m.ocr)
+       pFn = ParseMarkdown(config, m.storage, m.ocr)
    case parser.FileExtensionDocx:
-       pFn = parseByPython(config, m.storage, m.ocr, goutil.GetPython3Path(), goutil.GetPythonFilePath("parse_docx.py"))
+       pFn = ParseByPython(config, m.storage, m.ocr, goutil.GetPython3Path(), goutil.GetPythonFilePath("parse_docx.py"))
    case parser.FileExtensionCSV:
-       pFn = parseCSV(config)
+       pFn = ParseCSV(config)
    case parser.FileExtensionXLSX:
-       pFn = parseXLSX(config)
+       pFn = ParseXLSX(config)
    case parser.FileExtensionJSON:
-       pFn = parseJSON(config)
+       pFn = ParseJSON(config)
    case parser.FileExtensionJsonMaps:
-       pFn = parseJSONMaps(config)
+       pFn = ParseJSONMaps(config)
    case parser.FileExtensionJPG, parser.FileExtensionJPEG, parser.FileExtensionPNG:
-       pFn = parseImage(config, m.model)
+       pFn = ParseImage(config, m.model)
    default:
        return nil, fmt.Errorf("[Parse] document type not support, type=%s", config.FileExtension)
    }

-   return &p{parseFn: pFn}, nil
+   return &Parser{ParseFn: pFn}, nil
+}
+
+func (m *manager) IsAutoAnnotationSupported() bool {
+   return m.model != nil
}

View File

@@ -29,7 +29,7 @@ import (
    contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
)

-func parseCSV(config *contract.Config) parseFn {
+func ParseCSV(config *contract.Config) ParseFn {
    return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
        iter := &csvIterator{csv.NewReader(utfbom.SkipOnly(reader))}
        return parseByRowIterator(iter, config, opts...)

View File

@@ -47,7 +47,7 @@ func TestParseCSV(t *testing.T) {
        },
        ChunkingStrategy: nil,
    }
-   p1 := parseCSV(c1)
+   p1 := ParseCSV(c1)
    docs, err := p1(ctx, r1, parser.WithExtraMeta(map[string]any{
        "document_id":  int64(123),
        "knowledge_id": int64(456),
@@ -112,7 +112,7 @@ func TestParseCSV(t *testing.T) {
        },
        ChunkingStrategy: nil,
    }
-   p2 := parseCSV(c2)
+   p2 := ParseCSV(c2)
    docs, err = p2(ctx, r2, parser.WithExtraMeta(map[string]any{
        "document_id":  int64(123),
        "knowledge_id": int64(456),
@@ -131,7 +131,7 @@ func TestParseCSVBadCases(t *testing.T) {
    b, err := io.ReadAll(f)
    assert.NoError(t, err)

-   pfn := parseCSV(&contract.Config{
+   pfn := ParseCSV(&contract.Config{
        FileExtension: "csv",
        ParsingStrategy: &contract.ParsingStrategy{
            ExtractImage: true,
@@ -154,7 +154,7 @@ func TestParseCSVBadCases(t *testing.T) {
    cols, err := document.GetDocumentColumns(resp[0])
    assert.NoError(t, err)
    cols[5].Nullable = false
-   npfn := parseCSV(&contract.Config{
+   npfn := ParseCSV(&contract.Config{
        FileExtension: "csv",
        ParsingStrategy: &contract.ParsingStrategy{
            ExtractImage: true,
View File

@@ -31,7 +31,7 @@ import (
    "github.com/coze-dev/coze-studio/backend/types/errno"
)

-func parseImage(config *contract.Config, model chatmodel.BaseChatModel) parseFn {
+func ParseImage(config *contract.Config, model chatmodel.BaseChatModel) ParseFn {
    return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
        options := parser.GetCommonOptions(&parser.Options{}, opts...)
        doc := &schema.Document{
@@ -76,14 +76,14 @@ func parseImage(config *contract.Config, model chatmodel.BaseChatModel) parseFn
        output, err := model.Generate(ctx, []*schema.Message{input})
        if err != nil {
-           return nil, fmt.Errorf("[parseImage] model generate failed: %w", err)
+           return nil, fmt.Errorf("[ParseImage] model generate failed: %w", err)
        }
        doc.Content = output.Content
    case contract.ImageAnnotationTypeManual:
        // do nothing
    default:
-       return nil, fmt.Errorf("[parseImage] unknown image annotation type=%d", config.ParsingStrategy.ImageAnnotationType)
+       return nil, fmt.Errorf("[ParseImage] unknown image annotation type=%d", config.ParsingStrategy.ImageAnnotationType)
    }

    return []*schema.Document{doc}, nil

View File

@@ -28,7 +28,7 @@ import (
    contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
)

-func parseJSON(config *contract.Config) parseFn {
+func ParseJSON(config *contract.Config) ParseFn {
    return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
        b, err := io.ReadAll(reader)
        if err != nil {
@@ -41,7 +41,7 @@ func parseJSON(config *contract.Config) parseFn {
        }

        if len(rawSlices) == 0 {
-           return nil, fmt.Errorf("[parseJSON] json data is empty")
+           return nil, fmt.Errorf("[ParseJSON] json data is empty")
        }

        var header []string

View File

@@ -29,7 +29,7 @@ import (
    contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
)

-func parseJSONMaps(config *contract.Config) parseFn {
+func ParseJSONMaps(config *contract.Config) ParseFn {
    return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
        b, err := io.ReadAll(reader)
        if err != nil {

View File

@@ -85,7 +85,7 @@ func TestParseTableCustomContent(t *testing.T) {
        },
    }

-   pfn := parseJSONMaps(config)
+   pfn := ParseJSONMaps(config)
    docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{
        "document_id":  int64(123),
        "knowledge_id": int64(456),

View File

@@ -55,7 +55,7 @@ func TestParseJSON(t *testing.T) {
        },
        ChunkingStrategy: nil,
    }
-   pfn := parseJSON(config)
+   pfn := ParseJSON(config)
    docs, err := pfn(context.Background(), reader, parser.WithExtraMeta(map[string]any{
        "document_id":  int64(123),
        "knowledge_id": int64(456),
@@ -121,7 +121,7 @@ func TestParseJSONWithSchema(t *testing.T) {
            },
        },
    }
-   pfn := parseJSON(config)
+   pfn := ParseJSON(config)
    docs, err := pfn(context.Background(), reader, parser.WithExtraMeta(map[string]any{
        "document_id":  int64(123),
        "knowledge_id": int64(456),

View File

@@ -38,7 +38,7 @@ import (
    "github.com/coze-dev/coze-studio/backend/pkg/logs"
)

-func parseMarkdown(config *contract.Config, storage storage.Storage, ocr ocr.OCR) parseFn {
+func ParseMarkdown(config *contract.Config, storage storage.Storage, ocr ocr.OCR) ParseFn {
    return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
        options := parser.GetCommonOptions(&parser.Options{}, opts...)
        mdParser := goldmark.DefaultParser()
@@ -52,7 +52,7 @@ func parseMarkdown(config *contract.Config, storage storage.Storage, ocr ocr.OCR
        ps := config.ParsingStrategy

        if cs.ChunkType != contract.ChunkTypeCustom && cs.ChunkType != contract.ChunkTypeDefault {
-           return nil, fmt.Errorf("[parseMarkdown] chunk type not support, chunk type=%d", cs.ChunkType)
+           return nil, fmt.Errorf("[ParseMarkdown] chunk type not support, chunk type=%d", cs.ChunkType)
        }

        var (
@@ -173,7 +173,7 @@ func parseMarkdown(config *contract.Config, storage storage.Storage, ocr ocr.OCR
                        return ast.WalkStop, fmt.Errorf("failed to download image: %w", err)
                    }

-                   imgSrc, err := putImageObject(ctx, storage, ext, getCreatorIDFromExtraMeta(options.ExtraMeta), img)
+                   imgSrc, err := PutImageObject(ctx, storage, ext, GetCreatorIDFromExtraMeta(options.ExtraMeta), img)
                    if err != nil {
                        return ast.WalkStop, err
                    }
@@ -198,7 +198,7 @@ func parseMarkdown(config *contract.Config, storage storage.Storage, ocr ocr.OCR
                        pushSlice()
                    }
                } else {
-                   logs.CtxInfof(ctx, "[parseMarkdown] not a valid image url, skip, got=%s", imageURL)
+                   logs.CtxInfof(ctx, "[ParseMarkdown] not a valid image url, skip, got=%s", imageURL)
                }
            }
        }

View File

@@ -37,7 +37,7 @@ func TestParseMarkdown(t *testing.T) {
    mockStorage := ms.NewMockStorage(ctrl)
    mockStorage.EXPECT().PutObject(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes()

-   pfn := parseMarkdown(&contract.Config{
+   pfn := ParseMarkdown(&contract.Config{
        FileExtension: contract.FileExtensionMarkdown,
        ParsingStrategy: &contract.ParsingStrategy{
            ExtractImage: true,

View File

@@ -27,7 +27,7 @@ import (
    contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
)

-func parseText(config *contract.Config) parseFn {
+func ParseText(config *contract.Config) ParseFn {
    return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
        content, err := io.ReadAll(reader)
        if err != nil {
@@ -36,9 +36,9 @@ func parseText(config *contract.Config) parseFn {

        switch config.ChunkingStrategy.ChunkType {
        case contract.ChunkTypeCustom, contract.ChunkTypeDefault:
-           docs, err = chunkCustom(ctx, string(content), config, opts...)
+           docs, err = ChunkCustom(ctx, string(content), config, opts...)
        default:
-           return nil, fmt.Errorf("[parseText] chunk type not support, type=%d", config.ChunkingStrategy.ChunkType)
+           return nil, fmt.Errorf("[ParseText] chunk type not support, type=%d", config.ChunkingStrategy.ChunkType)
        }
        if err != nil {
            return nil, err

View File

@@ -27,7 +27,7 @@ import (
    contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
)

-func parseXLSX(config *contract.Config) parseFn {
+func ParseXLSX(config *contract.Config) ParseFn {
    return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
        f, err := excelize.OpenReader(reader)
        if err != nil {

View File

@@ -88,7 +88,7 @@ func TestParseXLSX(t *testing.T) {
        ChunkingStrategy: nil,
    }

-   pfn := parseXLSX(config)
+   pfn := ParseXLSX(config)
    docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{
        "document_id":  int64(123),
        "knowledge_id": int64(456),
@@ -159,7 +159,7 @@ func TestParseXLSXConvertColumnType(t *testing.T) {
        ChunkingStrategy: nil,
    }

-   pfn := parseXLSX(config)
+   pfn := ParseXLSX(config)
    docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{
        "document_id":  int64(123),
        "knowledge_id": int64(456),

View File

@@ -24,12 +24,12 @@ import (
    "github.com/cloudwego/eino/schema"
)

-type p struct {
-   parseFn parseFn
+type Parser struct {
+   ParseFn ParseFn
}

-func (p p) Parse(ctx context.Context, reader io.Reader, opts ...parser.Option) ([]*schema.Document, error) {
-   return p.parseFn(ctx, reader, opts...)
+func (p Parser) Parse(ctx context.Context, reader io.Reader, opts ...parser.Option) ([]*schema.Document, error) {
+   return p.ParseFn(ctx, reader, opts...)
}

-type parseFn func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error)
+type ParseFn func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error)

View File

@@ -73,15 +73,15 @@ func (p *pyPDFTableIterator) NextRow() (row []string, end bool, err error) {
    return row, false, nil
}

-func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR, pyPath, scriptPath string) parseFn {
+func ParseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR, pyPath, scriptPath string) ParseFn {
    return func(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
        pr, pw, err := os.Pipe()
        if err != nil {
-           return nil, fmt.Errorf("[parseByPython] create rpipe failed, %w", err)
+           return nil, fmt.Errorf("[ParseByPython] create rpipe failed, %w", err)
        }
        r, w, err := os.Pipe()
        if err != nil {
-           return nil, fmt.Errorf("[parseByPython] create pipe failed: %w", err)
+           return nil, fmt.Errorf("[ParseByPython] create pipe failed: %w", err)
        }

        options := parser.GetCommonOptions(&parser.Options{ExtraMeta: map[string]any{}}, opts...)
@@ -91,13 +91,13 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR
            FilterPages: config.ParsingStrategy.FilterPages,
        })
        if err != nil {
-           return nil, fmt.Errorf("[parseByPython] create parse request failed, %w", err)
+           return nil, fmt.Errorf("[ParseByPython] create parse request failed, %w", err)
        }
        if _, err = pw.Write(reqb); err != nil {
-           return nil, fmt.Errorf("[parseByPython] write parse request bytes failed, %w", err)
+           return nil, fmt.Errorf("[ParseByPython] write parse request bytes failed, %w", err)
        }
        if err = pw.Close(); err != nil {
-           return nil, fmt.Errorf("[parseByPython] close write request pipe failed, %w", err)
+           return nil, fmt.Errorf("[ParseByPython] close write request pipe failed, %w", err)
        }

        cmd := exec.Command(pyPath, scriptPath)
@@ -105,31 +105,31 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR
        cmd.Stdout = os.Stdout
        cmd.ExtraFiles = []*os.File{w, pr}
        if err = cmd.Start(); err != nil {
-           return nil, fmt.Errorf("[parseByPython] failed to start Python script: %w", err)
+           return nil, fmt.Errorf("[ParseByPython] failed to start Python script: %w", err)
        }
        if err = w.Close(); err != nil {
-           return nil, fmt.Errorf("[parseByPython] failed to close write pipe: %w", err)
+           return nil, fmt.Errorf("[ParseByPython] failed to close write pipe: %w", err)
        }

        result := &pyParseResult{}
        if err = json.NewDecoder(r).Decode(result); err != nil {
-           return nil, fmt.Errorf("[parseByPython] failed to decode result: %w", err)
+           return nil, fmt.Errorf("[ParseByPython] failed to decode result: %w", err)
        }
        if err = cmd.Wait(); err != nil {
-           return nil, fmt.Errorf("[parseByPython] cmd wait err: %w", err)
+           return nil, fmt.Errorf("[ParseByPython] cmd wait err: %w", err)
        }
        if result.Error != "" {
-           return nil, fmt.Errorf("[parseByPython] python execution failed: %s", result.Error)
+           return nil, fmt.Errorf("[ParseByPython] python execution failed: %s", result.Error)
        }

        for i, item := range result.Content {
            switch item.Type {
            case contentTypeText:
-               partDocs, err := chunkCustom(ctx, item.Content, config, opts...)
+               partDocs, err := ChunkCustom(ctx, item.Content, config, opts...)
                if err != nil {
-                   return nil, fmt.Errorf("[parseByPython] chunk text failed, %w", err)
+                   return nil, fmt.Errorf("[ParseByPython] chunk text failed, %w", err)
                }
                docs = append(docs, partDocs...)
            case contentTypeImage:
@@ -138,9 +138,9 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR
                }
                image, err := base64.StdEncoding.DecodeString(item.Content)
                if err != nil {
-                   return nil, fmt.Errorf("[parseByPython] decode image failed, %w", err)
+                   return nil, fmt.Errorf("[ParseByPython] decode image failed, %w", err)
                }
-               imgSrc, err := putImageObject(ctx, storage, "png", getCreatorIDFromExtraMeta(options.ExtraMeta), image)
+               imgSrc, err := PutImageObject(ctx, storage, "png", GetCreatorIDFromExtraMeta(options.ExtraMeta), image)
                if err != nil {
                    return nil, err
                }
@@ -148,7 +148,7 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR
                if config.ParsingStrategy.ImageOCR && ocr != nil {
                    texts, err := ocr.FromBase64(ctx, item.Content)
                    if err != nil {
-                       return nil, fmt.Errorf("[parseByPython] FromBase64 failed, %w", err)
+                       return nil, fmt.Errorf("[ParseByPython] FromBase64 failed, %w", err)
                    }
                    label += strings.Join(texts, "\n")
                }
@@ -181,15 +181,15 @@ func parseByPython(config *contract.Config, storage storage.Storage, ocr ocr.OCR
                    ChunkingStrategy: config.ChunkingStrategy,
                }, opts...)
                if err != nil {
-                   return nil, fmt.Errorf("[parseByPython] parse table failed, %w", err)
+                   return nil, fmt.Errorf("[ParseByPython] parse table failed, %w", err)
                }
                fmtTableDocs, err := formatTablesInDocument(rawTableDocs)
                if err != nil {
-                   return nil, fmt.Errorf("[parseByPython] format table failed, %w", err)
+                   return nil, fmt.Errorf("[ParseByPython] format table failed, %w", err)
                }
                docs = append(docs, fmtTableDocs...)
            default:
-               return nil, fmt.Errorf("[parseByPython] invalid content type: %s", item.Type)
+               return nil, fmt.Errorf("[ParseByPython] invalid content type: %s", item.Type)
            }
        }

View File

@@ -61,7 +61,7 @@ func getExtension(uri string) string {
    return ""
}

-func getCreatorIDFromExtraMeta(extraMeta map[string]any) int64 {
+func GetCreatorIDFromExtraMeta(extraMeta map[string]any) int64 {
    if extraMeta == nil {
        return 0
    }

View File

@@ -0,0 +1,91 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ppstructure

import (
    "fmt"

    "github.com/coze-dev/coze-studio/backend/infra/contract/chatmodel"
    "github.com/coze-dev/coze-studio/backend/infra/contract/document/ocr"
    "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
    "github.com/coze-dev/coze-studio/backend/infra/contract/storage"
    "github.com/coze-dev/coze-studio/backend/infra/impl/document/parser/builtin"
    "github.com/coze-dev/coze-studio/backend/pkg/goutil"
)

func NewManager(apiConfig *APIConfig, ocr ocr.OCR, storage storage.Storage, imageAnnotationModel chatmodel.BaseChatModel) parser.Manager {
    return &manager{
        apiConfig:            apiConfig,
        ocr:                  ocr,
        storage:              storage,
        imageAnnotationModel: imageAnnotationModel,
    }
}

type manager struct {
    apiConfig            *APIConfig
    ocr                  ocr.OCR
    storage              storage.Storage
    imageAnnotationModel chatmodel.BaseChatModel
}

func (m *manager) GetParser(config *parser.Config) (parser.Parser, error) {
    if config.ParsingStrategy.HeaderLine == 0 && config.ParsingStrategy.DataStartLine == 0 {
        config.ParsingStrategy.DataStartLine = 1
    } else if config.ParsingStrategy.HeaderLine >= config.ParsingStrategy.DataStartLine {
        return nil, fmt.Errorf("[GetParser] invalid header line and data start line, header=%d, data_start=%d",
            config.ParsingStrategy.HeaderLine, config.ParsingStrategy.DataStartLine)
    }

    var pFn builtin.ParseFn
    switch config.FileExtension {
    case parser.FileExtensionPDF:
        fileType := 0
        return &ppstructureParser{config, m.apiConfig, fileType, m.ocr, m.storage}, nil
    case parser.FileExtensionTXT:
        pFn = builtin.ParseText(config)
        return &builtin.Parser{ParseFn: pFn}, nil
    case parser.FileExtensionMarkdown:
        pFn = builtin.ParseMarkdown(config, m.storage, m.ocr)
        return &builtin.Parser{ParseFn: pFn}, nil
    case parser.FileExtensionDocx:
        pFn = builtin.ParseByPython(config, m.storage, m.ocr, goutil.GetPython3Path(), goutil.GetPythonFilePath("parse_docx.py"))
        return &builtin.Parser{ParseFn: pFn}, nil
    case parser.FileExtensionCSV:
        pFn = builtin.ParseCSV(config)
        return &builtin.Parser{ParseFn: pFn}, nil
    case parser.FileExtensionXLSX:
        pFn = builtin.ParseXLSX(config)
        return &builtin.Parser{ParseFn: pFn}, nil
    case parser.FileExtensionJSON:
        pFn = builtin.ParseJSON(config)
        return &builtin.Parser{ParseFn: pFn}, nil
    case parser.FileExtensionJsonMaps:
        pFn = builtin.ParseJSONMaps(config)
        return &builtin.Parser{ParseFn: pFn}, nil
    case parser.FileExtensionJPG, parser.FileExtensionJPEG, parser.FileExtensionPNG:
        pFn = builtin.ParseImage(config, m.imageAnnotationModel)
        return &builtin.Parser{ParseFn: pFn}, nil
    default:
        return nil, fmt.Errorf("[Parse] document type not support, type=%s", config.FileExtension)
    }
}

func (m *manager) IsAutoAnnotationSupported() bool {
    return m.imageAnnotationModel != nil
}

View File

@@ -0,0 +1,324 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ppstructure

import (
    "bytes"
    "context"
    "encoding/base64"
    "encoding/json"
    "fmt"
    "io"
    "net/http"
    "regexp"
    "strings"

    "github.com/cloudwego/eino/components/document/parser"
    "github.com/cloudwego/eino/schema"

    "github.com/coze-dev/coze-studio/backend/infra/contract/document/ocr"
    contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
    "github.com/coze-dev/coze-studio/backend/infra/contract/storage"
    "github.com/coze-dev/coze-studio/backend/infra/impl/document/parser/builtin"
)

type ppstructureParser struct {
    parserConfig *contract.Config
    apiConfig    *APIConfig
    fileType     int
    ocr          ocr.OCR
    storage      storage.Storage
}

type APIConfig struct {
    Client *http.Client
    URL    string

    // see: https://paddlepaddle.github.io/PaddleX/latest/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.html#3
    UseDocOrientationClassify        *bool
    UseDocUnwarping                  *bool
    UseTextlineOrientation           *bool
    UseSealRecognition               *bool
    UseFormulaRecognition            *bool
    UseChartRecognition              *bool
    UseRegionDetection               *bool
    LayoutThreshold                  *float64
    LayoutNms                        *bool
    LayoutUnclipRatio                *float64
    LayoutMergeBboxesMode            *string
    TextDetLimitSideLen              *int64
    TextDetLimitType                 *string
    TextDetThresh                    *float64
    TextDetBoxThresh                 *float64
    TextDetUnclipRatio               *float64
    TextRecScoreThresh               *float64
    SealDetLimitSideLen              *int64
    SealDetLimitType                 *string
    SealDetThresh                    *float64
    SealDetBoxThresh                 *float64
    SealDetUnclipRatio               *float64
    SealRecScoreThresh               *float64
    UseWiredTableCellsTransToHtml    *bool
    UseWirelessTableCellsTransToHtml *bool
    UseTableOrientationClassify      *bool
    UseOcrResultsWithTableCells      *bool
    UseE2eWiredTableRecModel         *bool
    UseE2eWirelessTableRecModel      *bool
}

type ppstructureResponse struct {
    Result *ppstructureInferResult `json:"result"`
}

type ppstructureInferResult struct {
    LayoutParsingResults []*ppstructureInnerResult `json:"layoutParsingResults"`
}

type ppstructureInnerResult struct {
    Markdown *ppstructureMarkdown `json:"markdown"`
}

type ppstructureMarkdown struct {
    Text    *string           `json:"text"`
    Images  map[string]string `json:"images"`
    IsStart *bool             `json:"isStart"`
    IsEnd   *bool             `json:"isEnd"`
}

func (p *ppstructureParser) Parse(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
    // TODO(Bobholamovic): Current chunking strategy is rather naive; we should
    // implement a more sophisticated one that at least takes tables and text
    // extracted from the images into consideration.
    options := parser.GetCommonOptions(&parser.Options{ExtraMeta: map[string]any{}}, opts...)

    fileBytes, err := io.ReadAll(reader)
    if err != nil {
        return nil, fmt.Errorf("[Parse] failed to read the file bytes, %w", err)
    }
    b64 := base64.StdEncoding.EncodeToString(fileBytes)

    reqBody := p.newRequestBody(b64, p.fileType, p.parserConfig.ParsingStrategy.ExtractImage, p.parserConfig.ParsingStrategy.ExtractTable)
    bodyBytes, err := json.Marshal(reqBody)
    if err != nil {
        return nil, fmt.Errorf("[Parse] failed to serialize the request body, %w", err)
    }

    req, err := http.NewRequest("POST", p.apiConfig.URL, bytes.NewReader(bodyBytes))
    if err != nil {
        return nil, fmt.Errorf("[Parse] failed to create a new request, %w", err)
    }
    req.Header.Set("Content-Type", "application/json")

    resp, err := p.apiConfig.Client.Do(req)
    if err != nil {
        return nil, fmt.Errorf("[Parse] request failed, %w", err)
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("[Parse] request failed, %w", err)
    }

    respBody, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("[Parse] failed to read the response body, %w", err)
    }
    var res ppstructureResponse
    if err := json.Unmarshal(respBody, &res); err != nil {
        return nil, fmt.Errorf("[Parse] failed to deserialize the response body, %w", err)
    }
    if res.Result == nil ||
        res.Result.LayoutParsingResults == nil {
        return nil, fmt.Errorf("[Parse] failed to get the layout parsing result, %w", err)
    }

    for i, item := range res.Result.LayoutParsingResults {
        if item.Markdown == nil {
            return nil, fmt.Errorf("[Parse] invalid response, %w", err)
        }
        if item.Markdown.Text == nil {
            return nil, fmt.Errorf("[Parse] invalid response, %w", err)
        }

        should_skip := false
        for _, v := range p.parserConfig.ParsingStrategy.FilterPages {
            if i+1 == v {
                should_skip = true
                break
            }
        }
        if should_skip {
            continue
        }

        text := *item.Markdown.Text
        // Convert the image in markdown to comments, as the image content will be added later.
        pattern := `(?i)<img[^>]*>`
        re := regexp.MustCompile(pattern)
        // TODO(Bobholamovic): Add image numbering
        text = re.ReplaceAllStringFunc(text, func(matched string) string {
            return "<!-- image -->"
        })

        partDocs, err := builtin.ChunkCustom(ctx, text, p.parserConfig, opts...)
        if err != nil {
            return nil, fmt.Errorf("[Parse] chunk text failed, %w", err)
        }
        docs = append(docs, partDocs...)

        if p.parserConfig.ParsingStrategy.ExtractImage {
            if item.Markdown.Images == nil {
                return nil, fmt.Errorf("[Parse] invalid response, %w", err)
            }
            for _, v := range item.Markdown.Images {
                image, err := base64.StdEncoding.DecodeString(v)
                if err != nil {
                    return nil, fmt.Errorf("[Parse] failed to decode an image, %w", err)
                }
                imgSrc, err := builtin.PutImageObject(ctx, p.storage, "png", builtin.GetCreatorIDFromExtraMeta(options.ExtraMeta), image)
                if err != nil {
                    return nil, err
                }
                label := fmt.Sprintf("\n%s", imgSrc)
                if p.parserConfig.ParsingStrategy.ImageOCR && p.ocr != nil {
                    texts, err := p.ocr.FromBase64(ctx, v)
                    if err != nil {
                        return nil, fmt.Errorf("[Parse] FromBase64 failed, %w", err)
                    }
                    label += strings.Join(texts, "\n")
                }
                doc := &schema.Document{
                    Content:  label,
                    MetaData: map[string]any{},
                }
                for k, v := range options.ExtraMeta {
                    doc.MetaData[k] = v
                }
                docs = append(docs, doc)
            }
        }
    }

    return docs, nil
}

func (p *ppstructureParser) newRequestBody(file string, fileType int, extractImage bool, extractTable bool) map[string]interface{} {
    payload := map[string]interface{}{
        "file":                file,
        "fileType":            fileType,
        "useTableRecognition": extractTable,
        "visualize":           extractImage,
    }
    if p.apiConfig.UseDocOrientationClassify != nil {
        payload["useDocOrientationClassify"] = *p.apiConfig.UseDocOrientationClassify
    }
    if p.apiConfig.UseDocUnwarping != nil {
        payload["useDocUnwarping"] = *p.apiConfig.UseDocUnwarping
    }
    if p.apiConfig.UseTextlineOrientation != nil {
        payload["useTextlineOrientation"] = *p.apiConfig.UseTextlineOrientation
    }
    if p.apiConfig.UseSealRecognition != nil {
        payload["useSealRecognition"] = *p.apiConfig.UseSealRecognition
    }
    if p.apiConfig.UseFormulaRecognition != nil {
        payload["useFormulaRecognition"] = *p.apiConfig.UseFormulaRecognition
    }
    if p.apiConfig.UseChartRecognition != nil {
        payload["useChartRecognition"] = *p.apiConfig.UseChartRecognition
    }
    if p.apiConfig.UseRegionDetection != nil {
        payload["useRegionDetection"] = *p.apiConfig.UseRegionDetection
    }
    if p.apiConfig.LayoutThreshold != nil {
        payload["layoutThreshold"] = *p.apiConfig.LayoutThreshold
    }
    if p.apiConfig.LayoutNms != nil {
        payload["layoutNms"] = *p.apiConfig.LayoutNms
    }
    if p.apiConfig.LayoutUnclipRatio != nil {
        payload["layoutUnclipRatio"] = *p.apiConfig.LayoutUnclipRatio
    }
    if p.apiConfig.LayoutMergeBboxesMode != nil {
        payload["layoutMergeBboxesMode"] = *p.apiConfig.LayoutMergeBboxesMode
    }
    if p.apiConfig.TextDetLimitSideLen != nil {
        payload["textDetLimitSideLen"] = *p.apiConfig.TextDetLimitSideLen
    }
    if p.apiConfig.TextDetLimitType != nil {
        payload["textDetLimitType"] = *p.apiConfig.TextDetLimitType
    }
    if p.apiConfig.TextDetThresh != nil {
        payload["textDetThresh"] = *p.apiConfig.TextDetThresh
    }
    if p.apiConfig.TextDetBoxThresh != nil {
        payload["textDetBoxThresh"] = *p.apiConfig.TextDetBoxThresh
    }
    if p.apiConfig.TextDetUnclipRatio != nil {
        payload["textDetUnclipRatio"] = *p.apiConfig.TextDetUnclipRatio
    }
    if p.apiConfig.TextRecScoreThresh != nil {
        payload["textRecScoreThresh"] = *p.apiConfig.TextRecScoreThresh
    }
    if p.apiConfig.SealDetLimitSideLen != nil {
        payload["sealDetLimitSideLen"] = *p.apiConfig.SealDetLimitSideLen
    }
    if p.apiConfig.SealDetLimitType != nil {
        payload["sealDetLimitType"] = *p.apiConfig.SealDetLimitType
    }
    if p.apiConfig.SealDetThresh != nil {
        payload["sealDetThresh"] = *p.apiConfig.SealDetThresh
    }
    if p.apiConfig.SealDetBoxThresh != nil {
        payload["sealDetBoxThresh"] = *p.apiConfig.SealDetBoxThresh
    }
    if p.apiConfig.SealDetUnclipRatio != nil {
        payload["sealDetUnclipRatio"] = *p.apiConfig.SealDetUnclipRatio
    }
    if p.apiConfig.SealRecScoreThresh != nil {
        payload["sealRecScoreThresh"] = *p.apiConfig.SealRecScoreThresh
    }
    if p.apiConfig.UseWiredTableCellsTransToHtml != nil {
        payload["useWiredTableCellsTransToHtml"] = *p.apiConfig.UseWiredTableCellsTransToHtml
    }
    if p.apiConfig.UseWirelessTableCellsTransToHtml != nil {
        payload["useWirelessTableCellsTransToHtml"] = *p.apiConfig.UseWirelessTableCellsTransToHtml
    }
    if p.apiConfig.UseTableOrientationClassify != nil {
        payload["useTableOrientationClassify"] = *p.apiConfig.UseTableOrientationClassify
    }
    if p.apiConfig.UseOcrResultsWithTableCells != nil {
        payload["useOcrResultsWithTableCells"] = *p.apiConfig.UseOcrResultsWithTableCells
    }
    if p.apiConfig.UseE2eWiredTableRecModel != nil {
        payload["useE2eWiredTableRecModel"] = *p.apiConfig.UseE2eWiredTableRecModel
    }
    if p.apiConfig.UseE2eWirelessTableRecModel != nil {
        payload["useE2eWirelessTableRecModel"] = *p.apiConfig.UseE2eWirelessTableRecModel
    }
    return payload
}
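
For reference (not part of the commit): the parser above talks to a PP-StructureV3 serving instance over plain JSON. A standalone sketch of that exchange, mirroring the keys built in newRequestBody and the fields decoded into ppstructureResponse — the endpoint path, host, and port are assumptions:

package main

import (
    "bytes"
    "encoding/base64"
    "encoding/json"
    "fmt"
    "net/http"
    "os"
)

func main() {
    pdf, err := os.ReadFile("sample.pdf")
    if err != nil {
        panic(err)
    }

    // Request body: base64 file content plus the switches set by newRequestBody.
    body, _ := json.Marshal(map[string]any{
        "file":                base64.StdEncoding.EncodeToString(pdf),
        "fileType":            0, // 0 = PDF, matching the fileType used for FileExtensionPDF above
        "useTableRecognition": true,
        "visualize":           true,
    })

    resp, err := http.Post("http://127.0.0.1:8080/layout-parsing", "application/json", bytes.NewReader(body)) // assumed serving endpoint
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    // Response shape read by the parser: result.layoutParsingResults[i].markdown.{text,images}.
    var out struct {
        Result struct {
            LayoutParsingResults []struct {
                Markdown struct {
                    Text   string            `json:"text"`
                    Images map[string]string `json:"images"`
                } `json:"markdown"`
            } `json:"layoutParsingResults"`
        } `json:"result"`
    }
    if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
        panic(err)
    }
    for _, page := range out.Result.LayoutParsingResults {
        fmt.Println(page.Markdown.Text) // one markdown blob per parsed page
    }
}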

View File

@@ -87,6 +87,14 @@ const (
    UseSSL      = "USE_SSL"
    SSLCertFile = "SSL_CERT_FILE"
    SSLKeyFile  = "SSL_KEY_FILE"
+
+   OCRType     = "OCR_TYPE"
+   VeOCRAK     = "VE_OCR_AK"
+   VeOCRSK     = "VE_OCR_SK"
+   PPOCRAPIURL = "PADDLEOCR_OCR_API_URL"
+
+   ParserType        = "PARSER_TYPE"
+   PPStructureAPIURL = "PADDLEOCR_STRUCTURE_API_URL"
)

const (

View File

@@ -140,6 +140,12 @@ export VE_OCR_SK=""
# paddleocr ocr
export PADDLEOCR_OCR_API_URL=""

+# Settings for Document Parser
+# Supported parser types: `builtin`, `paddleocr`
+export PARSER_TYPE="builtin"
+# paddleocr structure
+export PADDLEOCR_STRUCTURE_API_URL=""
+
# Settings for Model
# Model for agent & workflow
# add suffix number to add different models

View File

@@ -137,6 +137,12 @@ export VE_OCR_SK=""
# paddleocr ocr
export PADDLEOCR_OCR_API_URL=""

+# Settings for Document Parser
+# Supported parser types: `builtin`, `paddleocr`
+export PARSER_TYPE="builtin"
+# paddleocr structure
+export PADDLEOCR_STRUCTURE_API_URL=""
+
# Settings for Model
# Model for agent & workflow
# add suffix number to add different models