feat(infra): add PaddleOCR as a new OCR type (#668)

This commit is contained in:
Lin Manhui 2025-08-11 12:00:41 +08:00 committed by GitHub
parent a44e566bda
commit b19ae505f0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 167 additions and 2 deletions

View File

@ -20,6 +20,7 @@ import (
"context" "context"
"encoding/json" "encoding/json"
"fmt" "fmt"
netHTTP "net/http"
"os" "os"
"path/filepath" "path/filepath"
"strconv" "strconv"
@ -113,6 +114,10 @@ func InitService(c *ServiceComponents) (*KnowledgeApplicationService, error) {
inst.Client.SetAccessKey(ocrAK) inst.Client.SetAccessKey(ocrAK)
inst.Client.SetSecretKey(ocrSK) inst.Client.SetSecretKey(ocrSK)
ocrImpl = veocr.NewOCR(&veocr.Config{Client: inst}) ocrImpl = veocr.NewOCR(&veocr.Config{Client: inst})
case "paddleocr":
ppocrURL := os.Getenv("PADDLEOCR_OCR_API_URL")
client := &netHTTP.Client{}
ocrImpl = veocr.NewPPOCR(&veocr.PPOCRConfig{Client: client, URL: ppocrURL})
default: default:
// accept ocr not configured // accept ocr not configured
} }

View File

@ -0,0 +1,156 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package veocr
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"

	"github.com/coze-dev/coze-studio/backend/infra/contract/document/ocr"
	"github.com/coze-dev/coze-studio/backend/pkg/errorx"
	"github.com/coze-dev/coze-studio/backend/types/errno"
)
// PPOCRConfig configures the PaddleOCR-backed OCR implementation created by
// NewPPOCR.
type PPOCRConfig struct {
// Client is the HTTP client used to send requests to the OCR service.
Client *http.Client
// URL is the endpoint that OCR requests are POSTed to.
URL string
// see: https://paddlepaddle.github.io/PaddleX/latest/pipeline_usage/tutorials/ocr_pipelines/OCR.html#3
// Optional pre-processing switches. A nil value is sent to the service as
// false.
UseDocOrientationClassify *bool
UseDocUnwarping *bool
UseTextlineOrientation *bool
// Optional detection/recognition tuning parameters. A nil value leaves the
// parameter out of the request.
TextDetLimitSideLen *int
TextDetLimitType *string
TextDetThresh *float64
TextDetBoxThresh *float64
TextDetUnclipRatio *float64
TextRecScoreThresh *float64
}
// NewPPOCR returns an ocr.OCR backed by a PaddleOCR HTTP service. The given
// config is retained by reference, not copied.
func NewPPOCR(config *PPOCRConfig) ocr.OCR {
	impl := &ppocrImpl{config: config}
	return impl
}
// ppocrImpl is the concrete type returned by NewPPOCR. It holds the
// configuration used to build and send every OCR request.
type ppocrImpl struct {
config *PPOCRConfig
}
// ppocrResponse is the top-level JSON object decoded from the OCR service's
// response body.
type ppocrResponse struct {
Result *ppocrInferResult `json:"result"`
}
// ppocrInferResult is the "result" object of a response; it carries the list
// of per-input OCR results.
type ppocrInferResult struct {
OCRResults []*ppocrInnerResult `json:"ocrResults"`
}
// ppocrInnerResult is a single entry of "ocrResults".
type ppocrInnerResult struct {
PrunedResult *ppocrPrunedResult `json:"prunedResult"`
}
// ppocrPrunedResult is the "prunedResult" object of an OCR result. Only the
// recognized text lines are decoded; other fields in the JSON are ignored.
type ppocrPrunedResult struct {
RecTexts []string `json:"rec_texts"`
}
// FromBase64 runs OCR on a base64-encoded file and returns the recognized
// text lines. The request is bound to ctx, so cancelling ctx aborts it.
func (o *ppocrImpl) FromBase64(ctx context.Context, b64 string) ([]string, error) {
	return o.makeRequest(ctx, o.newRequestBody(b64))
}

// FromURL runs OCR on the file located at url and returns the recognized text
// lines. The request is bound to ctx, so cancelling ctx aborts it.
func (o *ppocrImpl) FromURL(ctx context.Context, url string) ([]string, error) {
	return o.makeRequest(ctx, o.newRequestBody(url))
}

// newRequestBody builds the JSON payload for one OCR call. file is passed
// through unchanged as the "file" field, so it may be either base64 content
// or a URL. The three pre-processing switches are always sent and default to
// false; every other optional parameter is sent only when set in the config.
func (o *ppocrImpl) newRequestBody(file string) map[string]interface{} {
	cfg := o.config

	payload := map[string]interface{}{
		"file": file,
		// NOTE(review): 1 presumably selects image input in the PaddleX
		// serving API; confirm against the linked pipeline documentation.
		"fileType":                  1,
		"visualize":                 false,
		"useDocOrientationClassify": false,
		"useDocUnwarping":           false,
		"useTextlineOrientation":    false,
	}

	if cfg.UseDocOrientationClassify != nil {
		payload["useDocOrientationClassify"] = *cfg.UseDocOrientationClassify
	}
	if cfg.UseDocUnwarping != nil {
		payload["useDocUnwarping"] = *cfg.UseDocUnwarping
	}
	if cfg.UseTextlineOrientation != nil {
		payload["useTextlineOrientation"] = *cfg.UseTextlineOrientation
	}
	if cfg.TextDetLimitSideLen != nil {
		payload["textDetLimitSideLen"] = *cfg.TextDetLimitSideLen
	}
	if cfg.TextDetLimitType != nil {
		payload["textDetLimitType"] = *cfg.TextDetLimitType
	}
	if cfg.TextDetThresh != nil {
		payload["textDetThresh"] = *cfg.TextDetThresh
	}
	// Previously this option was declared on the config but never forwarded.
	if cfg.TextDetBoxThresh != nil {
		payload["textDetBoxThresh"] = *cfg.TextDetBoxThresh
	}
	if cfg.TextDetUnclipRatio != nil {
		payload["textDetUnclipRatio"] = *cfg.TextDetUnclipRatio
	}
	if cfg.TextRecScoreThresh != nil {
		payload["textRecScoreThresh"] = *cfg.TextRecScoreThresh
	}
	return payload
}

// makeRequest POSTs reqBody as JSON to the configured URL and returns the
// recognized text lines from the single OCR result in the response.
//
// Every failure (marshalling, transport, non-2xx status, undecodable body, or
// a response that does not contain exactly one result with rec_texts) is
// returned wrapped with errno.ErrKnowledgeNonRetryableCode.
func (o *ppocrImpl) makeRequest(ctx context.Context, reqBody map[string]interface{}) ([]string, error) {
	bodyBytes, err := json.Marshal(reqBody)
	if err != nil {
		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
	}

	// Bind the request to ctx so caller cancellation and deadlines apply.
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, o.config.URL, bytes.NewReader(bodyBytes))
	if err != nil {
		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
	}
	req.Header.Set("Content-Type", "application/json")

	// Fall back to the default client instead of panicking on a nil Client.
	client := o.config.Client
	if client == nil {
		client = http.DefaultClient
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
	}
	defer resp.Body.Close()

	// Read the body even on failure so the connection can be reused.
	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
	}

	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		statusErr := fmt.Errorf("paddleocr: unexpected response status %q", resp.Status)
		return nil, errorx.WrapByCode(statusErr, errno.ErrKnowledgeNonRetryableCode)
	}

	var res ppocrResponse
	if err := json.Unmarshal(respBody, &res); err != nil {
		return nil, errorx.WrapByCode(err, errno.ErrKnowledgeNonRetryableCode)
	}

	if res.Result == nil ||
		len(res.Result.OCRResults) != 1 ||
		res.Result.OCRResults[0] == nil ||
		res.Result.OCRResults[0].PrunedResult == nil ||
		res.Result.OCRResults[0].PrunedResult.RecTexts == nil {
		// Build a real error here: the outer err is nil at this point, so
		// wrapping it would lose the failure.
		shapeErr := fmt.Errorf("paddleocr: malformed response: expected exactly one OCR result with rec_texts")
		return nil, errorx.WrapByCode(shapeErr, errno.ErrKnowledgeNonRetryableCode)
	}
	return res.Result.OCRResults[0].PrunedResult.RecTexts, nil
}

View File

@ -132,11 +132,13 @@ export HTTP_EMBEDDING_DIMS=1024 # (string, required) http embedding dimensions
# Settings for OCR # Settings for OCR
# If you want to use the OCR-related functions in the knowledge base feature, you need to set up the OCR configuration. # If you want to use the OCR-related functions in the knowledge base feature, you need to set up the OCR configuration.
# Currently, Coze Studio has built-in Volcano OCR. # Currently, Coze Studio has built-in Volcano OCR.
# ocr_type: default type `ve` # Supported OCR types: `ve`, `paddleocr`
export OCR_TYPE="ve" export OCR_TYPE="ve"
# ve ocr # ve ocr
export VE_OCR_AK="" export VE_OCR_AK=""
export VE_OCR_SK="" export VE_OCR_SK=""
# paddleocr ocr
export PADDLEOCR_OCR_API_URL=""
# Settings for Model # Settings for Model
# Model for agent & workflow # Model for agent & workflow

View File

@ -129,11 +129,13 @@ export HTTP_EMBEDDING_DIMS=1024 # (string, required) http embedding dimensions
# Settings for OCR # Settings for OCR
# If you want to use the OCR-related functions in the knowledge base feature, you need to set up the OCR configuration. # If you want to use the OCR-related functions in the knowledge base feature, you need to set up the OCR configuration.
# Currently, Coze Studio has built-in Volcano OCR. # Currently, Coze Studio has built-in Volcano OCR.
# ocr_type: default type `ve` # Supported OCR types: `ve`, `paddleocr`
export OCR_TYPE="ve" export OCR_TYPE="ve"
# ve ocr # ve ocr
export VE_OCR_AK="" export VE_OCR_AK=""
export VE_OCR_SK="" export VE_OCR_SK=""
# paddleocr ocr
export PADDLEOCR_OCR_API_URL=""
# Settings for Model # Settings for Model
# Model for agent & workflow # Model for agent & workflow