feat: manually mirror opencoze's code from bytedance

Change-Id: I09a73aadda978ad9511264a756b2ce51f5761adf
This commit is contained in:
fanlv
2025-07-20 17:36:12 +08:00
commit 890153324f
14811 changed files with 1923430 additions and 0 deletions

View File

@@ -0,0 +1,111 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package convert
import (
"time"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/consts"
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr"
)
// DocumentToParseConfig builds the parser configuration for doc by forwarding
// its file extension, parsing/chunking strategies, append flag and table
// columns to ToParseConfig.
func DocumentToParseConfig(doc *entity.Document) *parser.Config {
	columns := doc.TableInfo.Columns
	return ToParseConfig(
		doc.FileExtension,
		doc.ParsingStrategy,
		doc.ChunkingStrategy,
		doc.IsAppend,
		columns,
	)
}
// ToParseConfig assembles a parser.Config from domain-level settings.
//
// When ps is nil a default parsing strategy is used (header on line 0, data
// starting on line 1). When cs is nil the resulting config carries a nil
// chunking strategy. Column type errors are always ignored, and columns are
// converted through convColumns.
func ToParseConfig(fileExtension parser.FileExtension, ps *entity.ParsingStrategy, cs *entity.ChunkingStrategy, isAppend bool, columns []*entity.TableColumn) *parser.Config {
	// Fall back to the default table layout when the caller supplied none.
	if ps == nil {
		ps = &entity.ParsingStrategy{DataStartLine: 1}
	}

	// The chunking strategy is optional; it stays nil unless cs is provided.
	var chunking *parser.ChunkingStrategy
	if cs != nil {
		chunking = &parser.ChunkingStrategy{
			ChunkType:       cs.ChunkType,
			ChunkSize:       cs.ChunkSize,
			Separator:       cs.Separator,
			Overlap:         cs.Overlap,
			TrimSpace:       cs.TrimSpace,
			TrimURLAndEmail: cs.TrimURLAndEmail,
			MaxDepth:        cs.MaxDepth,
			SaveTitle:       cs.SaveTitle,
		}
	}

	parsing := &parser.ParsingStrategy{
		ExtractImage:        ps.ExtractImage,
		ExtractTable:        ps.ExtractTable,
		ImageOCR:            ps.ImageOCR,
		FilterPages:         ps.FilterPages,
		SheetID:             ptr.Of(int(ps.SheetID)),
		HeaderLine:          ps.HeaderLine,
		DataStartLine:       ps.DataStartLine,
		RowsCount:           ps.RowsCount,
		IsAppend:            isAppend,
		Columns:             convColumns(columns),
		IgnoreColumnTypeErr: true, // default true
		ImageAnnotationType: ptr.From(ptr.From(ps).CaptionType),
	}

	return &parser.Config{
		FileExtension:    fileExtension,
		ParsingStrategy:  parsing,
		ChunkingStrategy: chunking,
	}
}
// convColumns converts domain table columns into parser-facing columns.
//
// The column named consts.RDBFieldID is left out. A converted column is
// nullable exactly when the source column is not indexed, and is never marked
// as primary.
func convColumns(src []*entity.TableColumn) []*document.Column {
	converted := make([]*document.Column, 0, len(src))
	for i := range src {
		col := src[i]
		if col.Name != consts.RDBFieldID {
			converted = append(converted, &document.Column{
				ID:          col.ID,
				Name:        col.Name,
				Type:        col.Type,
				Description: col.Description,
				Nullable:    !col.Indexing,
				IsPrimary:   false,
				Sequence:    int(col.Sequence),
			})
		}
	}
	return converted
}
// Type2DefaultVal returns the zero-like default value for a table column type:
// false for booleans, int 0 for integers, float64 0 for numbers, the zero
// time.Time for times, an empty byte slice for images, and the empty string
// for strings and any other type.
func Type2DefaultVal(typ document.TableColumnType) any {
	switch typ {
	case document.TableColumnTypeBoolean:
		return false
	case document.TableColumnTypeInteger:
		return 0
	case document.TableColumnTypeNumber:
		return 0.0
	case document.TableColumnTypeTime:
		return time.Time{}
	case document.TableColumnTypeImage:
		return []byte{}
	}
	// String columns and unrecognised types share the empty-string default.
	return ""
}

View File

@@ -0,0 +1,39 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package convert
import (
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
rdbEntity "github.com/coze-dev/coze-studio/backend/infra/contract/rdb/entity"
)
// ConvertColumnType maps a document table column type to the RDB data type
// used to store it. Boolean, integer, number and time columns get dedicated
// RDB types; string, image and any other column type are stored as text.
func ConvertColumnType(columnType document.TableColumnType) rdbEntity.DataType {
	switch columnType {
	case document.TableColumnTypeTime:
		return rdbEntity.TypeTimestamp
	case document.TableColumnTypeNumber:
		return rdbEntity.TypeDouble
	case document.TableColumnTypeInteger:
		return rdbEntity.TypeBigInt
	case document.TableColumnTypeBoolean:
		return rdbEntity.TypeBoolean
	}
	// Strings, images and unrecognised types all fall back to text.
	return rdbEntity.TypeText
}

View File

@@ -0,0 +1,158 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package convert
import (
"fmt"
"reflect"
"time"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/consts"
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr"
)
// timeFormat is the layout ParseAnyData passes to time.Parse when a time
// column value arrives as []byte.
const timeFormat = "2006-01-02 15:04:05"
// TransformColumnType resolves the column type to use when a column currently
// typed src is combined with type dst.
//
// Rules, in order of precedence:
//   - if either side is unknown, the other side wins;
//   - if dst is string, or both sides agree, or src is integer and dst is
//     number, the result is dst;
//   - any other combination degrades to string.
func TransformColumnType(src, dst document.TableColumnType) document.TableColumnType {
	switch {
	case src == document.TableColumnTypeUnknown:
		return dst
	case dst == document.TableColumnTypeUnknown:
		return src
	case dst == document.TableColumnTypeString,
		src == dst,
		src == document.TableColumnTypeInteger && dst == document.TableColumnTypeNumber:
		return dst
	default:
		return document.TableColumnTypeString
	}
}
// columnPrefix is the fmt layout that turns a column ID into its RDB field name.
const columnPrefix = "c_%d"

// ColumnIDToRDBField returns the RDB field name for the column with the given
// ID, i.e. the decimal ID prefixed with "c_".
func ColumnIDToRDBField(colID int64) string {
	field := fmt.Sprintf(columnPrefix, colID)
	return field
}
// ParseAnyData converts a raw value into a document.ColumnData whose typed
// field matches col.Type.
//
// The returned ColumnData always carries the column's ID, name and type. When
// data is nil no value field is set and no error is returned. Otherwise the
// accepted Go types per column type are:
//   - string:  string, []byte
//   - integer: any signed or unsigned integer kind (including uintptr)
//   - time:    time.Time, or []byte formatted according to timeFormat
//   - number:  float32, float64
//   - boolean: bool, or any integer kind (true when the value is >= 1)
//   - image:   string, []byte
//
// An error (and a nil result) is returned when data has a Go type that is not
// accepted for the column type, when a []byte time value cannot be parsed, or
// when the column type itself is not supported.
func ParseAnyData(col *entity.TableColumn, data any) (*document.ColumnData, error) {
	resp := &document.ColumnData{
		ColumnID:   col.ID,
		ColumnName: col.Name,
		Type:       col.Type,
	}
	if data == nil {
		return resp, nil
	}

	switch col.Type {
	case document.TableColumnTypeString:
		switch v := data.(type) {
		case string:
			resp.ValString = ptr.Of(v)
		case []byte:
			resp.ValString = ptr.Of(string(v))
		default:
			return nil, fmt.Errorf("[ParseAnyData] type assertion failed")
		}

	case document.TableColumnTypeInteger:
		switch data.(type) {
		case int, int8, int16, int32, int64:
			resp.ValInteger = ptr.Of(reflect.ValueOf(data).Int())
		case uint, uint8, uint16, uint32, uint64, uintptr:
			// Unsigned values above math.MaxInt64 wrap around in this conversion.
			resp.ValInteger = ptr.Of(int64(reflect.ValueOf(data).Uint()))
		default:
			return nil, fmt.Errorf("[ParseAnyData] type assertion failed")
		}

	case document.TableColumnTypeTime:
		switch v := data.(type) {
		case time.Time:
			resp.ValTime = ptr.Of(v)
		case []byte:
			t, err := time.Parse(timeFormat, string(v))
			if err != nil {
				return nil, fmt.Errorf("[ParseAnyData] format time failed, %w", err)
			}
			resp.ValTime = &t
		default:
			return nil, fmt.Errorf("[ParseAnyData] type assertion failed")
		}

	case document.TableColumnTypeNumber:
		switch data.(type) {
		case float32, float64:
			resp.ValNumber = ptr.Of(reflect.ValueOf(data).Float())
		default:
			return nil, fmt.Errorf("[ParseAnyData] type assertion failed")
		}

	case document.TableColumnTypeBoolean:
		switch v := data.(type) {
		case bool:
			resp.ValBoolean = ptr.Of(v)
		case int, int8, int16, int32, int64:
			resp.ValBoolean = ptr.Of(reflect.ValueOf(data).Int() >= 1)
		case uint, uint8, uint16, uint32, uint64, uintptr:
			// Unsigned kinds must be read with Uint(); reflect.Value.Int()
			// panics on them. Only the boolean field is populated for a
			// boolean column.
			resp.ValBoolean = ptr.Of(reflect.ValueOf(data).Uint() >= 1)
		default:
			return nil, fmt.Errorf("[ParseAnyData] type assertion failed")
		}

	case document.TableColumnTypeImage:
		switch v := data.(type) {
		case string:
			resp.ValImage = ptr.Of(v)
		case []byte:
			resp.ValImage = ptr.Of(string(v))
		default:
			return nil, fmt.Errorf("[ParseAnyData] type assertion failed")
		}

	default:
		return nil, fmt.Errorf("[ParseAnyData] column type not support, type=%d", col.Type)
	}

	return resp, nil
}
// FilterColumnsRDBID returns cols without the column named consts.RDBFieldID.
//
// At most one column is removed: the last one in cols carrying that name.
// When no column matches, cols itself is returned. When a column is removed
// the result is a newly allocated slice, so the caller's slice and its
// backing array are left untouched.
func FilterColumnsRDBID(cols []*entity.TableColumn) []*entity.TableColumn {
	for i := len(cols) - 1; i >= 0; i-- {
		if cols[i].Name != consts.RDBFieldID {
			continue
		}
		// Copy instead of append(cols[:i], cols[i+1:]...), which would shift
		// elements inside the backing array shared with the caller.
		filtered := make([]*entity.TableColumn, 0, len(cols)-1)
		filtered = append(filtered, cols[:i]...)
		return append(filtered, cols[i+1:]...)
	}
	return cols
}
// ColumnIDMapping indexes cols by column ID. If several columns share an ID,
// the one appearing last in cols is the one kept in the map.
func ColumnIDMapping(cols []*entity.TableColumn) map[int64]*entity.TableColumn {
	byID := make(map[int64]*entity.TableColumn, len(cols))
	for _, c := range cols {
		byID[c.ID] = c
	}
	return byID
}

View File

@@ -0,0 +1,267 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package convert
import (
"fmt"
"testing"
"time"
. "github.com/bytedance/mockey"
"github.com/smartystreets/goconvey/convey"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr"
)
// TestParseAnyData exercises ParseAnyData for every supported column type,
// covering the accepted Go input types as well as the failure path for an
// input of the wrong Go type, plus the nil-data and unsupported-column-type
// cases.
func TestParseAnyData(t *testing.T) {
	PatchConvey("test ParseAnyData", t, func() {
		// nil data yields a ColumnData carrying only the column metadata.
		PatchConvey("test data is nil", func() {
			col := &entity.TableColumn{
				ID:   123,
				Name: "test",
				Type: document.TableColumnTypeString,
			}
			resp, err := ParseAnyData(col, nil)
			convey.So(err, convey.ShouldBeNil)
			convey.So(resp, convey.ShouldEqual, &document.ColumnData{
				ColumnID:   col.ID,
				ColumnName: col.Name,
				Type:       col.Type,
			})
		})

		PatchConvey("test unsupported type", func() {
			col := &entity.TableColumn{
				ID:   123,
				Name: "test",
				Type: 10001,
			}
			data := 123
			resp, err := ParseAnyData(col, data)
			convey.So(err, convey.ShouldBeError, fmt.Errorf("[ParseAnyData] column type not support, type=%d", col.Type))
			convey.So(resp, convey.ShouldBeNil)
		})

		PatchConvey("test string", func() {
			col := &entity.TableColumn{
				ID:   123,
				Name: "test",
				Type: document.TableColumnTypeString,
			}

			PatchConvey("test string", func() {
				data := "hello"
				resp, err := ParseAnyData(col, data)
				convey.So(err, convey.ShouldBeNil)
				convey.So(resp, convey.ShouldEqual, &document.ColumnData{
					ColumnID:   col.ID,
					ColumnName: col.Name,
					Type:       col.Type,
					ValString:  ptr.Of(data),
				})
			})

			PatchConvey("test []byte", func() {
				data := "hello"
				resp, err := ParseAnyData(col, []byte(data))
				convey.So(err, convey.ShouldBeNil)
				convey.So(resp, convey.ShouldEqual, &document.ColumnData{
					ColumnID:   col.ID,
					ColumnName: col.Name,
					Type:       col.Type,
					ValString:  ptr.Of(data),
				})
			})

			PatchConvey("test failed", func() {
				data := 123
				resp, err := ParseAnyData(col, data)
				convey.So(err, convey.ShouldBeError, fmt.Errorf("[ParseAnyData] type assertion failed"))
				convey.So(resp, convey.ShouldBeNil)
			})
		})

		PatchConvey("test integer", func() {
			col := &entity.TableColumn{
				ID:   123,
				Name: "test",
				Type: document.TableColumnTypeInteger,
			}

			PatchConvey("test int", func() {
				allData := []any{1, int8(1), int16(1), int32(1), int64(1)}
				for _, data := range allData {
					resp, err := ParseAnyData(col, data)
					convey.So(err, convey.ShouldBeNil)
					convey.So(resp, convey.ShouldEqual, &document.ColumnData{
						ColumnID:   col.ID,
						ColumnName: col.Name,
						Type:       col.Type,
						ValInteger: ptr.Of(int64(1)),
					})
				}
			})

			PatchConvey("test uint", func() {
				allData := []any{uint(1), uint8(1), uint16(1), uint32(1), uint64(1), uintptr(1)}
				for _, data := range allData {
					resp, err := ParseAnyData(col, data)
					convey.So(err, convey.ShouldBeNil)
					convey.So(resp, convey.ShouldEqual, &document.ColumnData{
						ColumnID:   col.ID,
						ColumnName: col.Name,
						Type:       col.Type,
						ValInteger: ptr.Of(int64(1)),
					})
				}
			})

			PatchConvey("test failed", func() {
				data := "hello"
				resp, err := ParseAnyData(col, data)
				convey.So(err, convey.ShouldBeError, fmt.Errorf("[ParseAnyData] type assertion failed"))
				convey.So(resp, convey.ShouldBeNil)
			})
		})

		PatchConvey("test time", func() {
			col := &entity.TableColumn{
				ID:   123,
				Name: "test",
				Type: document.TableColumnTypeTime,
			}

			PatchConvey("test time", func() {
				data := time.Now()
				resp, err := ParseAnyData(col, data)
				convey.So(err, convey.ShouldBeNil)
				convey.So(resp, convey.ShouldEqual, &document.ColumnData{
					ColumnID:   col.ID,
					ColumnName: col.Name,
					Type:       col.Type,
					ValTime:    ptr.Of(data),
				})
			})

			// A []byte value is parsed with the "2006-01-02 15:04:05" layout;
			// time.Parse yields UTC when the input has no zone indicator.
			PatchConvey("test []byte", func() {
				resp, err := ParseAnyData(col, []byte("2025-07-20 17:36:12"))
				convey.So(err, convey.ShouldBeNil)
				convey.So(resp, convey.ShouldNotBeNil)
				convey.So(resp.ValTime, convey.ShouldNotBeNil)
				want := time.Date(2025, time.July, 20, 17, 36, 12, 0, time.UTC)
				convey.So(resp.ValTime.Equal(want), convey.ShouldBeTrue)
			})

			PatchConvey("test []byte with invalid layout", func() {
				resp, err := ParseAnyData(col, []byte("hello"))
				convey.So(err, convey.ShouldNotBeNil)
				convey.So(resp, convey.ShouldBeNil)
			})

			PatchConvey("test failed", func() {
				data := "hello"
				resp, err := ParseAnyData(col, data)
				convey.So(err, convey.ShouldBeError, fmt.Errorf("[ParseAnyData] type assertion failed"))
				convey.So(resp, convey.ShouldBeNil)
			})
		})

		PatchConvey("test number", func() {
			col := &entity.TableColumn{
				ID:   123,
				Name: "test",
				Type: document.TableColumnTypeNumber,
			}

			PatchConvey("test float", func() {
				allData := []any{float32(1), 1.0}
				for _, data := range allData {
					resp, err := ParseAnyData(col, data)
					convey.So(err, convey.ShouldBeNil)
					convey.So(resp, convey.ShouldEqual, &document.ColumnData{
						ColumnID:   col.ID,
						ColumnName: col.Name,
						Type:       col.Type,
						ValNumber:  ptr.Of(float64(1)),
					})
				}
			})

			PatchConvey("test failed", func() {
				data := "hello"
				resp, err := ParseAnyData(col, data)
				convey.So(err, convey.ShouldBeError, fmt.Errorf("[ParseAnyData] type assertion failed"))
				convey.So(resp, convey.ShouldBeNil)
			})
		})

		PatchConvey("test boolean", func() {
			col := &entity.TableColumn{
				ID:   123,
				Name: "test",
				Type: document.TableColumnTypeBoolean,
			}

			PatchConvey("test bool", func() {
				resp, err := ParseAnyData(col, true)
				convey.So(err, convey.ShouldBeNil)
				convey.So(resp, convey.ShouldEqual, &document.ColumnData{
					ColumnID:   col.ID,
					ColumnName: col.Name,
					Type:       col.Type,
					ValBoolean: ptr.Of(true),
				})
			})

			// Signed integers are accepted as booleans: values >= 1 are true,
			// everything else is false.
			PatchConvey("test int", func() {
				cases := []struct {
					data any
					want bool
				}{
					{data: 1, want: true},
					{data: int8(2), want: true},
					{data: int16(0), want: false},
					{data: int32(-1), want: false},
					{data: int64(1), want: true},
				}
				for _, tc := range cases {
					resp, err := ParseAnyData(col, tc.data)
					convey.So(err, convey.ShouldBeNil)
					convey.So(resp, convey.ShouldNotBeNil)
					convey.So(resp.ValBoolean, convey.ShouldNotBeNil)
					convey.So(*resp.ValBoolean, convey.ShouldEqual, tc.want)
				}
			})

			PatchConvey("test failed", func() {
				data := "hello"
				resp, err := ParseAnyData(col, data)
				convey.So(err, convey.ShouldBeError, fmt.Errorf("[ParseAnyData] type assertion failed"))
				convey.So(resp, convey.ShouldBeNil)
			})
		})

		PatchConvey("test image", func() {
			col := &entity.TableColumn{
				ID:   123,
				Name: "test",
				Type: document.TableColumnTypeImage,
			}

			PatchConvey("test string", func() {
				data := "hello"
				resp, err := ParseAnyData(col, data)
				convey.So(err, convey.ShouldBeNil)
				convey.So(resp, convey.ShouldEqual, &document.ColumnData{
					ColumnID:   col.ID,
					ColumnName: col.Name,
					Type:       col.Type,
					ValImage:   ptr.Of(data),
				})
			})

			PatchConvey("test []byte", func() {
				data := "hello"
				resp, err := ParseAnyData(col, []byte(data))
				convey.So(err, convey.ShouldBeNil)
				convey.So(resp, convey.ShouldEqual, &document.ColumnData{
					ColumnID:   col.ID,
					ColumnName: col.Name,
					Type:       col.Type,
					ValImage:   ptr.Of(data),
				})
			})

			PatchConvey("test failed", func() {
				data := 123
				resp, err := ParseAnyData(col, data)
				convey.So(err, convey.ShouldBeError, fmt.Errorf("[ParseAnyData] type assertion failed"))
				convey.So(resp, convey.ShouldBeNil)
			})
		})
	})
}