feat: manually mirror opencoze's code from bytedance
Change-Id: I09a73aadda978ad9511264a756b2ce51f5761adf
This commit is contained in:
111
backend/domain/knowledge/internal/convert/parser.go
Normal file
111
backend/domain/knowledge/internal/convert/parser.go
Normal file
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
* Copyright 2025 coze-dev Authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package convert
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
|
||||
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/consts"
|
||||
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
|
||||
"github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
|
||||
"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr"
|
||||
)
|
||||
|
||||
func DocumentToParseConfig(doc *entity.Document) *parser.Config {
|
||||
return ToParseConfig(doc.FileExtension, doc.ParsingStrategy, doc.ChunkingStrategy, doc.IsAppend, doc.TableInfo.Columns)
|
||||
}
|
||||
|
||||
func ToParseConfig(fileExtension parser.FileExtension, ps *entity.ParsingStrategy, cs *entity.ChunkingStrategy, isAppend bool, columns []*entity.TableColumn) *parser.Config {
|
||||
if ps == nil {
|
||||
ps = &entity.ParsingStrategy{HeaderLine: 0, DataStartLine: 1}
|
||||
}
|
||||
|
||||
p := &parser.ParsingStrategy{
|
||||
ExtractImage: ps.ExtractImage,
|
||||
ExtractTable: ps.ExtractTable,
|
||||
ImageOCR: ps.ImageOCR,
|
||||
FilterPages: ps.FilterPages,
|
||||
SheetID: ptr.Of(int(ps.SheetID)),
|
||||
HeaderLine: ps.HeaderLine,
|
||||
DataStartLine: ps.DataStartLine,
|
||||
RowsCount: ps.RowsCount,
|
||||
IsAppend: isAppend,
|
||||
Columns: convColumns(columns),
|
||||
IgnoreColumnTypeErr: true, // default true
|
||||
ImageAnnotationType: ptr.From(ptr.From(ps).CaptionType),
|
||||
}
|
||||
|
||||
var c *parser.ChunkingStrategy
|
||||
if cs != nil {
|
||||
c = &parser.ChunkingStrategy{
|
||||
ChunkType: cs.ChunkType,
|
||||
ChunkSize: cs.ChunkSize,
|
||||
Separator: cs.Separator,
|
||||
Overlap: cs.Overlap,
|
||||
TrimSpace: cs.TrimSpace,
|
||||
TrimURLAndEmail: cs.TrimURLAndEmail,
|
||||
MaxDepth: cs.MaxDepth,
|
||||
SaveTitle: cs.SaveTitle,
|
||||
}
|
||||
}
|
||||
|
||||
return &parser.Config{
|
||||
FileExtension: fileExtension,
|
||||
ParsingStrategy: p,
|
||||
ChunkingStrategy: c,
|
||||
}
|
||||
}
|
||||
|
||||
func convColumns(src []*entity.TableColumn) []*document.Column {
|
||||
resp := make([]*document.Column, 0, len(src))
|
||||
for _, c := range src {
|
||||
if c.Name == consts.RDBFieldID {
|
||||
continue
|
||||
}
|
||||
dc := &document.Column{
|
||||
ID: c.ID,
|
||||
Name: c.Name,
|
||||
Type: c.Type,
|
||||
Description: c.Description,
|
||||
Nullable: !c.Indexing,
|
||||
IsPrimary: false,
|
||||
Sequence: int(c.Sequence),
|
||||
}
|
||||
resp = append(resp, dc)
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
func Type2DefaultVal(typ document.TableColumnType) any {
|
||||
switch typ {
|
||||
case document.TableColumnTypeString:
|
||||
return ""
|
||||
case document.TableColumnTypeInteger:
|
||||
return 0
|
||||
case document.TableColumnTypeTime:
|
||||
return time.Time{}
|
||||
case document.TableColumnTypeNumber:
|
||||
return 0.0
|
||||
case document.TableColumnTypeBoolean:
|
||||
return false
|
||||
case document.TableColumnTypeImage:
|
||||
return []byte{}
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user