From b19ae505f07add077d64f6f683f384ab2959811b Mon Sep 17 00:00:00 2001
From: Lin Manhui
Date: Mon, 11 Aug 2025 12:00:41 +0800
Subject: [PATCH] feat(infra): add PaddleOCR as a new OCR type (#668)

---
 backend/application/knowledge/init.go         |   5 +
 .../impl/document/ocr/veocr/paddleocr_ocr.go  | 182 ++++++++++++++++++
 docker/.env.debug.example                     |   4 +-
 docker/.env.example                           |   4 +-
 4 files changed, 193 insertions(+), 2 deletions(-)
 create mode 100644 backend/infra/impl/document/ocr/veocr/paddleocr_ocr.go

diff --git a/backend/application/knowledge/init.go b/backend/application/knowledge/init.go
index de7b710f..ac763177 100644
--- a/backend/application/knowledge/init.go
+++ b/backend/application/knowledge/init.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	netHTTP "net/http"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -113,6 +114,10 @@ func InitService(c *ServiceComponents) (*KnowledgeApplicationService, error) {
 		inst.Client.SetAccessKey(ocrAK)
 		inst.Client.SetSecretKey(ocrSK)
 		ocrImpl = veocr.NewOCR(&veocr.Config{Client: inst})
+	case "paddleocr":
+		ppocrURL := os.Getenv("PADDLEOCR_OCR_API_URL")
+		client := &netHTTP.Client{}
+		ocrImpl = veocr.NewPPOCR(&veocr.PPOCRConfig{Client: client, URL: ppocrURL})
 	default:
 		// accept ocr not configured
 	}
diff --git a/backend/infra/impl/document/ocr/veocr/paddleocr_ocr.go b/backend/infra/impl/document/ocr/veocr/paddleocr_ocr.go
new file mode 100644
index 00000000..3a4cef09
--- /dev/null
+++ b/backend/infra/impl/document/ocr/veocr/paddleocr_ocr.go
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2025 coze-dev Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package veocr
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+
+	"github.com/coze-dev/coze-studio/backend/infra/contract/document/ocr"
+	"github.com/coze-dev/coze-studio/backend/pkg/errorx"
+	"github.com/coze-dev/coze-studio/backend/types/errno"
+)
+
+// PPOCRConfig holds the settings for the PaddleOCR-backed OCR implementation.
+//
+// Client sends the HTTP requests and URL is the endpoint they are posted to.
+// The pointer fields are optional request parameters; a nil pointer leaves
+// the parameter out of the request, except for the three Use* switches, which
+// are sent as false when nil.
+type PPOCRConfig struct {
+	Client *http.Client
+	URL    string
+
+	// see: https://paddlepaddle.github.io/PaddleX/latest/pipeline_usage/tutorials/ocr_pipelines/OCR.html#3
+	UseDocOrientationClassify *bool
+	UseDocUnwarping           *bool
+	UseTextlineOrientation    *bool
+	TextDetLimitSideLen       *int
+	TextDetLimitType          *string
+	TextDetThresh             *float64
+	TextDetBoxThresh          *float64
+	TextDetUnclipRatio        *float64
+	TextRecScoreThresh        *float64
+}
+
+// NewPPOCR returns an ocr.OCR that sends recognition requests to the
+// PaddleOCR endpoint described by config.
+func NewPPOCR(config *PPOCRConfig) ocr.OCR {
+	return &ppocrImpl{config}
+}
+
+// ppocrImpl implements ocr.OCR on top of a PaddleOCR HTTP endpoint.
+type ppocrImpl struct {
+	config *PPOCRConfig
+}
+
+// ppocrResponse is the subset of the response body that is decoded.
+type ppocrResponse struct {
+	Result *ppocrInferResult `json:"result"`
+}
+
+type ppocrInferResult struct {
+	OCRResults []*ppocrInnerResult `json:"ocrResults"`
+}
+
+type ppocrInnerResult struct {
+	PrunedResult *ppocrPrunedResult `json:"prunedResult"`
+}
+
+type ppocrPrunedResult struct {
+	RecTexts []string `json:"rec_texts"`
+}
+
+// FromBase64 runs OCR on a base64-encoded file and returns the recognized
+// texts.
+func (o *ppocrImpl) FromBase64(ctx context.Context, b64 string) ([]string, error) {
+	return o.makeRequest(ctx, o.newRequestBody(b64))
+}
+
+// FromURL runs OCR on the file at url and returns the recognized texts.
+func (o *ppocrImpl) FromURL(ctx context.Context, url string) ([]string, error) {
+	return o.makeRequest(ctx, o.newRequestBody(url))
+}
+
+// newRequestBody builds the JSON payload for one request. file is passed
+// through unchanged as the "file" field, so it may be either base64 content
+// or a URL.
+func (o *ppocrImpl) newRequestBody(file string) map[string]interface{} {
+	cfg := o.config
+	payload := map[string]interface{}{
+		"file":      file,
+		"fileType":  1,
+		"visualize": false,
+	}
+
+	// The optional preprocessing switches are always sent and default to
+	// false when not configured.
+	payload["useDocOrientationClassify"] = cfg.UseDocOrientationClassify != nil && *cfg.UseDocOrientationClassify
+	payload["useDocUnwarping"] = cfg.UseDocUnwarping != nil && *cfg.UseDocUnwarping
+	payload["useTextlineOrientation"] = cfg.UseTextlineOrientation != nil && *cfg.UseTextlineOrientation
+
+	// Tuning parameters are only sent when explicitly configured.
+	if cfg.TextDetLimitSideLen != nil {
+		payload["textDetLimitSideLen"] = *cfg.TextDetLimitSideLen
+	}
+	if cfg.TextDetLimitType != nil {
+		payload["textDetLimitType"] = *cfg.TextDetLimitType
+	}
+	if cfg.TextDetThresh != nil {
+		payload["textDetThresh"] = *cfg.TextDetThresh
+	}
+	if cfg.TextDetBoxThresh != nil {
+		payload["textDetBoxThresh"] = *cfg.TextDetBoxThresh
+	}
+	if cfg.TextDetUnclipRatio != nil {
+		payload["textDetUnclipRatio"] = *cfg.TextDetUnclipRatio
+	}
+	if cfg.TextRecScoreThresh != nil {
+		payload["textRecScoreThresh"] = *cfg.TextRecScoreThresh
+	}
+	return payload
+}
+
+// makeRequest posts reqBody as JSON to the configured URL and returns the
+// recognized texts of the single OCR result in the response. The request is
+// bound to ctx. Every failure is wrapped with ErrKnowledgeNonRetryableCode.
+func (o *ppocrImpl) makeRequest(ctx context.Context, reqBody map[string]interface{}) ([]string, error) {
+	bodyBytes, err := json.Marshal(reqBody)
+	if err != nil {
+		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, o.config.URL, bytes.NewReader(bodyBytes))
+	if err != nil {
+		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := o.config.Client.Do(req)
+	if err != nil {
+		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
+	}
+	defer resp.Body.Close()
+
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
+	}
+
+	// A non-200 reply is reported by status instead of as a decode failure.
+	if resp.StatusCode != http.StatusOK {
+		err = fmt.Errorf("paddleocr: unexpected HTTP status %d", resp.StatusCode)
+		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
+	}
+
+	var res ppocrResponse
+	if err := json.Unmarshal(respBody, &res); err != nil {
+		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
+	}
+
+	if res.Result == nil ||
+		len(res.Result.OCRResults) != 1 ||
+		res.Result.OCRResults[0] == nil ||
+		res.Result.OCRResults[0].PrunedResult == nil ||
+		res.Result.OCRResults[0].PrunedResult.RecTexts == nil {
+		// err is nil on this path, so a dedicated error is created rather than
+		// wrapping it.
+		err = errors.New("paddleocr: response does not contain exactly one OCR result with rec_texts")
+		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
+	}
+
+	return res.Result.OCRResults[0].PrunedResult.RecTexts, nil
+}
diff --git a/docker/.env.debug.example b/docker/.env.debug.example
index aff24f6b..a585ee4b 100644
--- a/docker/.env.debug.example
+++ b/docker/.env.debug.example
@@ -132,11 +132,13 @@ export HTTP_EMBEDDING_DIMS=1024 # (string, required) http embedding dimensions
 # Settings for OCR
 # If you want to use the OCR-related functions in the knowledge base feature,You need to set up the OCR configuration.
 # Currently, Coze Studio has built-in Volcano OCR.
-# ocr_type: default type `ve`
+# Supported OCR types: `ve`, `paddleocr`
 export OCR_TYPE="ve"
 # ve ocr
 export VE_OCR_AK=""
 export VE_OCR_SK=""
+# paddleocr ocr
+export PADDLEOCR_OCR_API_URL=""
 
 # Settings for Model
 # Model for agent & workflow
diff --git a/docker/.env.example b/docker/.env.example
index d3c23206..29671f70 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -129,11 +129,13 @@ export HTTP_EMBEDDING_DIMS=1024 # (string, required) http embedding dimensions
 # Settings for OCR
 # If you want to use the OCR-related functions in the knowledge base feature,You need to set up the OCR configuration.
 # Currently, Coze Studio has built-in Volcano OCR.
-# ocr_type: default type `ve`
+# Supported OCR types: `ve`, `paddleocr`
 export OCR_TYPE="ve"
 # ve ocr
 export VE_OCR_AK=""
 export VE_OCR_SK=""
+# paddleocr ocr
+export PADDLEOCR_OCR_API_URL=""
 
 # Settings for Model
 # Model for agent & workflow