coze-studio/backend/domain/knowledge/internal/convert/parser.go

112 lines
3.2 KiB
Go

/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package convert
import (
"time"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/consts"
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr"
)
func DocumentToParseConfig(doc *entity.Document) *parser.Config {
return ToParseConfig(doc.FileExtension, doc.ParsingStrategy, doc.ChunkingStrategy, doc.IsAppend, doc.TableInfo.Columns)
}
func ToParseConfig(fileExtension parser.FileExtension, ps *entity.ParsingStrategy, cs *entity.ChunkingStrategy, isAppend bool, columns []*entity.TableColumn) *parser.Config {
if ps == nil {
ps = &entity.ParsingStrategy{HeaderLine: 0, DataStartLine: 1}
}
p := &parser.ParsingStrategy{
ExtractImage: ps.ExtractImage,
ExtractTable: ps.ExtractTable,
ImageOCR: ps.ImageOCR,
FilterPages: ps.FilterPages,
SheetID: ptr.Of(int(ps.SheetID)),
HeaderLine: ps.HeaderLine,
DataStartLine: ps.DataStartLine,
RowsCount: ps.RowsCount,
IsAppend: isAppend,
Columns: convColumns(columns),
IgnoreColumnTypeErr: true, // default true
ImageAnnotationType: ptr.From(ptr.From(ps).CaptionType),
}
var c *parser.ChunkingStrategy
if cs != nil {
c = &parser.ChunkingStrategy{
ChunkType: cs.ChunkType,
ChunkSize: cs.ChunkSize,
Separator: cs.Separator,
Overlap: cs.Overlap,
TrimSpace: cs.TrimSpace,
TrimURLAndEmail: cs.TrimURLAndEmail,
MaxDepth: cs.MaxDepth,
SaveTitle: cs.SaveTitle,
}
}
return &parser.Config{
FileExtension: fileExtension,
ParsingStrategy: p,
ChunkingStrategy: c,
}
}
func convColumns(src []*entity.TableColumn) []*document.Column {
resp := make([]*document.Column, 0, len(src))
for _, c := range src {
if c.Name == consts.RDBFieldID {
continue
}
dc := &document.Column{
ID: c.ID,
Name: c.Name,
Type: c.Type,
Description: c.Description,
Nullable: !c.Indexing,
IsPrimary: false,
Sequence: int(c.Sequence),
}
resp = append(resp, dc)
}
return resp
}
func Type2DefaultVal(typ document.TableColumnType) any {
switch typ {
case document.TableColumnTypeString:
return ""
case document.TableColumnTypeInteger:
return 0
case document.TableColumnTypeTime:
return time.Time{}
case document.TableColumnTypeNumber:
return 0.0
case document.TableColumnTypeBoolean:
return false
case document.TableColumnTypeImage:
return []byte{}
default:
return ""
}
}