feat: manually mirror opencoze's code from bytedance

Change-Id: I09a73aadda978ad9511264a756b2ce51f5761adf
This commit is contained in:
fanlv
2025-07-20 17:36:12 +08:00
commit 890153324f
14811 changed files with 1923430 additions and 0 deletions

View File

@@ -0,0 +1,126 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package document
import (
"fmt"
"github.com/cloudwego/eino/schema"
)
// Keys under which this package stores table-related values in
// schema.Document.MetaData. The trailing comment on each key names the Go
// type that the matching accessor stores or asserts for that entry.
const (
MetaDataKeyColumns = "table_columns" // val: []*Column
MetaDataKeyColumnData = "table_column_data" // val: []*ColumnData
MetaDataKeyColumnsOnly = "table_columns_only" // val: struct{}, which means table has no data, only header.
MetaDataKeyCreatorID = "creator_id" // val: int64
MetaDataKeyExternalStorage = "external_storage" // val: map[string]any
)
// GetDocumentColumns reads the column definitions stored in doc's metadata
// under MetaDataKeyColumns.
//
// It returns an "invalid document" error when doc or its metadata map is nil,
// and an "invalid document columns" error when the entry is absent or is not
// a []*Column.
func GetDocumentColumns(doc *schema.Document) ([]*Column, error) {
	if doc == nil {
		return nil, fmt.Errorf("invalid document")
	}
	if doc.MetaData == nil {
		return nil, fmt.Errorf("invalid document")
	}

	if cols, isColumns := doc.MetaData[MetaDataKeyColumns].([]*Column); isColumns {
		return cols, nil
	}
	return nil, fmt.Errorf("invalid document columns")
}
// WithDocumentColumns stores columns in doc's metadata under
// MetaDataKeyColumns and returns the same document so calls can be chained.
//
// The metadata map is allocated on demand: assigning into a nil map panics,
// and a document may reach this function before any metadata has been set.
// doc itself must not be nil.
func WithDocumentColumns(doc *schema.Document, columns []*Column) *schema.Document {
	if doc.MetaData == nil {
		doc.MetaData = make(map[string]any)
	}
	doc.MetaData[MetaDataKeyColumns] = columns
	return doc
}
// GetDocumentColumnData reads the per-column values stored in doc's metadata
// under MetaDataKeyColumnData.
//
// It returns an "invalid document" error when doc or its metadata map is nil,
// and an "invalid document column data" error when the entry is absent or is
// not a []*ColumnData.
func GetDocumentColumnData(doc *schema.Document) ([]*ColumnData, error) {
	if doc == nil {
		return nil, fmt.Errorf("invalid document")
	}
	if doc.MetaData == nil {
		return nil, fmt.Errorf("invalid document")
	}

	if cells, isColumnData := doc.MetaData[MetaDataKeyColumnData].([]*ColumnData); isColumnData {
		return cells, nil
	}
	return nil, fmt.Errorf("invalid document column data")
}
// WithDocumentColumnData stores data in doc's metadata under
// MetaDataKeyColumnData and returns the same document so calls can be chained.
//
// The metadata map is allocated on demand: assigning into a nil map panics,
// and a document may reach this function before any metadata has been set.
// doc itself must not be nil.
func WithDocumentColumnData(doc *schema.Document, data []*ColumnData) *schema.Document {
	if doc.MetaData == nil {
		doc.MetaData = make(map[string]any)
	}
	doc.MetaData[MetaDataKeyColumnData] = data
	return doc
}
// WithDocumentColumnsOnly marks doc as carrying only a table header (no data
// rows) by storing an empty struct under MetaDataKeyColumnsOnly, and returns
// the same document so calls can be chained.
//
// The metadata map is allocated on demand: assigning into a nil map panics,
// and a document may reach this function before any metadata has been set.
// doc itself must not be nil.
func WithDocumentColumnsOnly(doc *schema.Document) *schema.Document {
	if doc.MetaData == nil {
		doc.MetaData = make(map[string]any)
	}
	doc.MetaData[MetaDataKeyColumnsOnly] = struct{}{}
	return doc
}
// GetDocumentColumnsOnly reports whether doc carries the columns-only marker,
// i.e. whether its metadata holds a struct{} value under
// MetaDataKeyColumnsOnly.
//
// It returns an "invalid document" error when doc or its metadata map is nil.
// A missing marker, or a value of any other type, yields (false, nil).
func GetDocumentColumnsOnly(doc *schema.Document) (bool, error) {
	if doc == nil {
		return false, fmt.Errorf("invalid document")
	}
	if doc.MetaData == nil {
		return false, fmt.Errorf("invalid document")
	}

	marker, present := doc.MetaData[MetaDataKeyColumnsOnly]
	if !present {
		return false, nil
	}
	_, isMarker := marker.(struct{})
	return isMarker, nil
}
// GetDocumentsColumnsOnly applies GetDocumentColumnsOnly to a document list.
//
// Only a list of exactly one document is inspected; any other length yields
// (false, nil) without looking at the documents.
func GetDocumentsColumnsOnly(docs []*schema.Document) (bool, error) {
	if len(docs) == 1 {
		return GetDocumentColumnsOnly(docs[0])
	}
	return false, nil
}
// GetDocumentCreatorID reads the creator id stored in doc's metadata under
// MetaDataKeyCreatorID.
//
// It returns an "invalid document" error when doc or its metadata map is nil,
// and an "invalid document creator id" error when the entry is absent or is
// not an int64.
func GetDocumentCreatorID(doc *schema.Document) (int64, error) {
	if doc == nil {
		return 0, fmt.Errorf("invalid document")
	}
	if doc.MetaData == nil {
		return 0, fmt.Errorf("invalid document")
	}

	if id, isInt64 := doc.MetaData[MetaDataKeyCreatorID].(int64); isInt64 {
		return id, nil
	}
	return 0, fmt.Errorf("invalid document creator id")
}
// WithDocumentCreatorID stores creatorID in doc's metadata under
// MetaDataKeyCreatorID and returns the same document so calls can be chained.
//
// The metadata map is allocated on demand: assigning into a nil map panics,
// and a document may reach this function before any metadata has been set.
// doc itself must not be nil.
func WithDocumentCreatorID(doc *schema.Document, creatorID int64) *schema.Document {
	if doc.MetaData == nil {
		doc.MetaData = make(map[string]any)
	}
	doc.MetaData[MetaDataKeyCreatorID] = creatorID
	return doc
}
// GetDocumentExternalStorage reads the external-storage map stored in doc's
// metadata under MetaDataKeyExternalStorage.
//
// It returns an "invalid document" error when doc or its metadata map is nil,
// and an "invalid document external storage" error when the entry is absent
// or is not a map[string]any.
func GetDocumentExternalStorage(doc *schema.Document) (map[string]any, error) {
	if doc == nil {
		return nil, fmt.Errorf("invalid document")
	}
	if doc.MetaData == nil {
		return nil, fmt.Errorf("invalid document")
	}

	if storage, isMap := doc.MetaData[MetaDataKeyExternalStorage].(map[string]any); isMap {
		return storage, nil
	}
	return nil, fmt.Errorf("invalid document external storage")
}
// WithDocumentExternalStorage stores externalStorage in doc's metadata under
// MetaDataKeyExternalStorage and returns the same document so calls can be
// chained. The map is stored as-is, not copied.
//
// The metadata map is allocated on demand: assigning into a nil map panics,
// and a document may reach this function before any metadata has been set.
// doc itself must not be nil.
func WithDocumentExternalStorage(doc *schema.Document, externalStorage map[string]any) *schema.Document {
	if doc.MetaData == nil {
		doc.MetaData = make(map[string]any)
	}
	doc.MetaData[MetaDataKeyExternalStorage] = externalStorage
	return doc
}

View File

@@ -0,0 +1,23 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package imageunderstand
import "context"
// ImageUnderstand turns an image into text.
type ImageUnderstand interface {
// ImageUnderstand receives the raw bytes of one image and returns the
// textual content produced for it, or an error.
// NOTE(review): accepted image formats are not visible here — confirm with the implementations.
ImageUnderstand(ctx context.Context, image []byte) (content string, err error)
}

View File

@@ -0,0 +1,29 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl2sql
import (
"context"
"github.com/cloudwego/eino/schema"
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
)
// NL2SQL converts a conversation into a SQL statement.
type NL2SQL interface {
// NL2SQL returns a SQL string built from messages and the schemas of the
// given tables. opts are per-call options of this package's Option type.
// NOTE(review): the SQL dialect of the result is not visible here — confirm with the implementations.
NL2SQL(ctx context.Context, messages []*schema.Message, tables []*document.TableSchema, opts ...Option) (sql string, err error)
}

View File

@@ -0,0 +1,31 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl2sql
import "github.com/coze-dev/coze-studio/backend/infra/contract/chatmodel"
// Option mutates an Options value; it is the per-call option type accepted
// by the NL2SQL interface.
type Option func(o *Options)
// Options collects the settings that Option functions can set.
type Options struct {
// ChatModel is the chat model set through WithChatModel.
ChatModel chatmodel.BaseChatModel
}
// WithChatModel returns an Option that sets Options.ChatModel to cm.
func WithChatModel(cm chatmodel.BaseChatModel) Option {
	setModel := func(opts *Options) {
		opts.ChatModel = cm
	}
	return setModel
}

View File

@@ -0,0 +1,24 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ocr
import "context"
// OCR returns text recognised from an image.
type OCR interface {
// FromBase64 returns the texts recognised in the input given as a base64 string.
// NOTE(review): presumably b64 is a base64-encoded image — confirm the expected encoding with the implementations.
FromBase64(ctx context.Context, b64 string) (texts []string, err error)
// FromURL returns the texts recognised in the input located at url.
FromURL(ctx context.Context, url string) (texts []string, err error)
}

View File

@@ -0,0 +1,128 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parser
import (
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
"github.com/coze-dev/coze-studio/backend/pkg/lang/sets"
)
// Manager hands out parsers.
type Manager interface {
// GetParser returns a Parser for the given configuration, or an error.
GetParser(config *Config) (Parser, error)
}
// Config describes the parser requested from a Manager.
type Config struct {
// FileExtension identifies the kind of file to parse.
FileExtension FileExtension
// ParsingStrategy holds the parsing options; see ParsingStrategy.
ParsingStrategy *ParsingStrategy
// ChunkingStrategy holds the chunking options; see ChunkingStrategy.
ChunkingStrategy *ChunkingStrategy
}
// ParsingStrategy holds the options applied when a document is parsed before
// indexing. Fields are grouped by the kind of source they apply to (Doc,
// Sheet, Image). Fields tagged `json:"-"` are excluded from JSON encoding.
type ParsingStrategy struct {
// Doc
ExtractImage bool `json:"extract_image"` // extract image elements
ExtractTable bool `json:"extract_table"` // extract table elements
ImageOCR bool `json:"image_ocr"` // run OCR on images
FilterPages []int `json:"filter_pages"` // page filter; the first page is 1
// Sheet
SheetID *int `json:"sheet_id"` // xlsx sheet id
HeaderLine int `json:"header_line"` // header row
DataStartLine int `json:"data_start_line"` // first data row
RowsCount int `json:"rows_count"` // number of data rows to read
IsAppend bool `json:"-"` // row insertion (append)
Columns []*document.Column `json:"-"` // header columns the sheet is aligned to
IgnoreColumnTypeErr bool `json:"-"` // when true, mismatches between a column's type and its value are ignored and the value is left empty
// Image
ImageAnnotationType ImageAnnotationType `json:"image_annotation_type"` // how image content is annotated
}
// ChunkingStrategy holds the options that control how parsed content is split
// into chunks. Which group of fields applies depends on ChunkType.
type ChunkingStrategy struct {
ChunkType ChunkType `json:"chunk_type"`
// custom config
ChunkSize int64 `json:"chunk_size"` // maximum chunk length
Separator string `json:"separator"` // chunk separator
Overlap int64 `json:"overlap"` // chunk overlap ratio
TrimSpace bool `json:"trim_space"`
TrimURLAndEmail bool `json:"trim_url_and_email"`
// leveled config
MaxDepth int64 `json:"max_depth"` // maximum level when chunking by hierarchy
SaveTitle bool `json:"save_title"` // keep the hierarchy titles
}
// ChunkType selects the chunking mode used by ChunkingStrategy.
type ChunkType int64
// Supported chunking modes.
const (
ChunkTypeDefault ChunkType = 0 // automatic chunking
ChunkTypeCustom ChunkType = 1 // chunking by custom rules
ChunkTypeLeveled ChunkType = 2 // hierarchical chunking
)
// ImageAnnotationType selects how image content is annotated.
type ImageAnnotationType int64
// Supported annotation modes.
const (
ImageAnnotationTypeModel ImageAnnotationType = 0 // annotated automatically by a model
ImageAnnotationTypeManual ImageAnnotationType = 1 // annotated manually
)
// FileExtension is a file suffix, written without the leading dot.
type FileExtension string
// File extensions known to this package, grouped by content kind.
const (
// document
FileExtensionPDF FileExtension = "pdf"
FileExtensionTXT FileExtension = "txt"
FileExtensionDoc FileExtension = "doc"
FileExtensionDocx FileExtension = "docx"
FileExtensionMarkdown FileExtension = "md"
// sheet
FileExtensionCSV FileExtension = "csv"
FileExtensionXLSX FileExtension = "xlsx"
FileExtensionJSON FileExtension = "json"
FileExtensionJsonMaps FileExtension = "json_maps" // json of []map[string]string
// image
FileExtensionJPG FileExtension = "jpg"
FileExtensionJPEG FileExtension = "jpeg"
FileExtensionPNG FileExtension = "png"
)
// ValidateFileExtension checks fileSuffix against fileExtensionSet.
//
// On a match it returns the suffix converted to FileExtension and true;
// otherwise it returns the empty extension and false. The comparison is an
// exact map lookup, so it is case-sensitive and expects no leading dot.
func ValidateFileExtension(fileSuffix string) (ext FileExtension, support bool) {
	candidate := FileExtension(fileSuffix)
	if _, known := fileExtensionSet[candidate]; known {
		return candidate, true
	}
	return "", false
}
// fileExtensionSet lists every extension that ValidateFileExtension accepts.
// Keep it in sync with the FileExtension constants: it includes
// FileExtensionXLSX so that xlsx sheets (see ParsingStrategy.SheetID) are
// validated like the other sheet formats.
var fileExtensionSet = sets.Set[FileExtension]{
	// document
	FileExtensionPDF:      {},
	FileExtensionTXT:      {},
	FileExtensionDoc:      {},
	FileExtensionDocx:     {},
	FileExtensionMarkdown: {},
	// sheet
	FileExtensionCSV:      {},
	FileExtensionXLSX:     {},
	FileExtensionJSON:     {},
	FileExtensionJsonMaps: {},
	// image
	FileExtensionJPG:  {},
	FileExtensionJPEG: {},
	FileExtensionPNG:  {},
}

View File

@@ -0,0 +1,21 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parser
import "github.com/cloudwego/eino/components/document/parser"
// Parser is an alias for parser.Parser from eino's components/document/parser
// package; it is the type returned by Manager.GetParser.
type Parser = parser.Parser

View File

@@ -0,0 +1,26 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package progressbar
import "context"
// ProgressBar is the interface for the progress bar.
type ProgressBar interface {
// AddN adds n to the recorded progress.
// NOTE(review): the unit of n is not visible here — confirm with the implementations.
AddN(n int) error
// ReportError records err on the progress bar.
ReportError(err error) error
// GetProgress returns the current percentage, the remaining seconds and an
// error message.
// NOTE(review): presumably errMsg is empty when no error was reported — confirm.
GetProgress(ctx context.Context) (percent int, remainSec int, errMsg string)
}

View File

@@ -0,0 +1,43 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package rerank
import (
"context"
"github.com/cloudwego/eino/schema"
)
// Reranker reorders retrieved data for a query.
type Reranker interface {
// Rerank processes req and returns the sorted result, or an error.
Rerank(ctx context.Context, req *Request) (*Response, error)
}
// Request is the input of Reranker.Rerank.
type Request struct {
// Query is the query the data is ranked against.
Query string
// Data holds the candidates as a list of lists.
// NOTE(review): presumably one inner slice per retrieval source — confirm with callers.
Data [][]*Data
// TopN optionally limits the number of results; nil means it is unset.
TopN *int64
}
// Response is the output of Reranker.Rerank.
type Response struct {
SortedData []*Data // highest score first
// TokenUsage is optional; nil means it is unset.
TokenUsage *int64
}
// Data pairs a document with its score.
type Data struct {
Document *schema.Document
Score float64
}

View File

@@ -0,0 +1,54 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package searchstore
import "fmt"
// DSL is one node of a filter expression: an operator applied to a field and
// a value. For composite operators the value may itself be a list of nodes.
type DSL struct {
	Op    Op
	Field string
	Value interface{} // builtin types / []*DSL
}

// Op names the operator carried by a DSL node.
type Op string

// Operators available to DSL nodes.
const (
	OpEq   Op = "eq"
	OpNe   Op = "ne"
	OpLike Op = "like"
	OpIn   Op = "in"
	OpAnd  Op = "and"
	OpOr   Op = "or"
)

// DSL wraps the receiver in a single-entry map under the "dsl" key, the shape
// that LoadDSL reads back.
func (d *DSL) DSL() map[string]any {
	wrapped := make(map[string]any, 1)
	wrapped["dsl"] = d
	return wrapped
}

// LoadDSL extracts the *DSL stored under the "dsl" key of src.
//
// A nil src yields (nil, nil). A non-nil src whose "dsl" entry is missing or
// is not a *DSL yields a "load dsl failed" error.
func LoadDSL(src map[string]any) (*DSL, error) {
	if src == nil {
		return nil, nil
	}

	if loaded, isDSL := src["dsl"].(*DSL); isDSL {
		return loaded, nil
	}
	return nil, fmt.Errorf("load dsl failed")
}

View File

@@ -0,0 +1,82 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package searchstore
import (
"context"
)
// Manager creates, drops and opens search-store collections.
type Manager interface {
// Create creates the collection described by req.
Create(ctx context.Context, req *CreateRequest) error
// Drop drops the collection named in req.
Drop(ctx context.Context, req *DropRequest) error
// GetType reports the SearchStoreType of this manager.
GetType() SearchStoreType
// GetSearchStore returns the SearchStore for collectionName, or an error.
GetSearchStore(ctx context.Context, collectionName string) (SearchStore, error)
}
// CreateRequest is the input of Manager.Create.
type CreateRequest struct {
CollectionName string
// Fields lists the field definitions of the collection.
Fields []*Field
// CollectionMeta carries string key/value metadata for the collection.
CollectionMeta map[string]string
}
// DropRequest is the input of Manager.Drop.
type DropRequest struct {
CollectionName string
}
// GetSearchStoreRequest names a collection.
// NOTE(review): Manager.GetSearchStore takes the collection name directly, so
// this type is not referenced by the code visible here — confirm it is still used.
type GetSearchStoreRequest struct {
CollectionName string
}
// Field describes one field of a search-store collection.
type Field struct {
Name FieldName
Type FieldType
Description string
Nullable bool
IsPrimary bool
// NOTE(review): presumably Indexing marks the field as indexed — confirm with the store implementations.
Indexing bool
}
// SearchStoreType identifies the kind of search store, as reported by
// Manager.GetType.
type SearchStoreType string
// Known search store kinds.
const (
TypeVectorStore SearchStoreType = "vector"
TypeTextStore SearchStoreType = "text"
)
// FieldName is an alias for string used to name collection fields.
type FieldName = string
// Built-in field names; the trailing comment gives the value type of each field.
const (
FieldID FieldName = "id" // int64
FieldCreatorID FieldName = "creator_id" // int64
FieldTextContent FieldName = "text_content" // string
)
// FieldType identifies the data type of a collection field.
type FieldType int64
// Known field types. FieldTypeUnknown is the zero value.
const (
FieldTypeUnknown FieldType = 0
FieldTypeInt64 FieldType = 1
FieldTypeText FieldType = 2
FieldTypeDenseVector FieldType = 3
FieldTypeSparseVector FieldType = 4
)

View File

@@ -0,0 +1,87 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package searchstore
import (
"github.com/cloudwego/eino/components/indexer"
"github.com/cloudwego/eino/components/retriever"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/progressbar"
)
// IndexerOptions collects the implementation-specific indexer options set by
// WithIndexerPartitionKey, WithPartition, WithIndexingFields and
// WithProgressBar.
type IndexerOptions struct {
PartitionKey *string
Partition *string // storage partition mapping
IndexingFields []string
ProgressBar progressbar.ProgressBar
}
// RetrieverOptions collects the implementation-specific retriever options set
// by WithMultiMatch, WithRetrieverPartitionKey and WithPartitions.
type RetrieverOptions struct {
MultiMatch *MultiMatch // multi-field query
PartitionKey *string
Partitions []string // query partition mapping
}
// MultiMatch describes one query applied to several fields.
type MultiMatch struct {
Fields []string
Query string
}
// WithIndexerPartitionKey returns an indexer option that points
// IndexerOptions.PartitionKey at key.
func WithIndexerPartitionKey(key string) indexer.Option {
	apply := func(opts *IndexerOptions) {
		opts.PartitionKey = &key
	}
	return indexer.WrapImplSpecificOptFn(apply)
}
// WithPartition returns an indexer option that points
// IndexerOptions.Partition at partition.
func WithPartition(partition string) indexer.Option {
	apply := func(opts *IndexerOptions) {
		opts.Partition = &partition
	}
	return indexer.WrapImplSpecificOptFn(apply)
}
// WithIndexingFields returns an indexer option that sets
// IndexerOptions.IndexingFields to fields. The slice is stored as-is, not
// copied.
func WithIndexingFields(fields []string) indexer.Option {
	apply := func(opts *IndexerOptions) {
		opts.IndexingFields = fields
	}
	return indexer.WrapImplSpecificOptFn(apply)
}
// WithProgressBar returns an indexer option that sets
// IndexerOptions.ProgressBar to progressBar.
func WithProgressBar(progressBar progressbar.ProgressBar) indexer.Option {
	apply := func(opts *IndexerOptions) {
		opts.ProgressBar = progressBar
	}
	return indexer.WrapImplSpecificOptFn(apply)
}
// WithMultiMatch returns a retriever option that sets
// RetrieverOptions.MultiMatch to a new MultiMatch built from fields and query.
// A fresh MultiMatch is allocated each time the option is applied.
func WithMultiMatch(fields []string, query string) retriever.Option {
	apply := func(opts *RetrieverOptions) {
		mm := new(MultiMatch)
		mm.Fields, mm.Query = fields, query
		opts.MultiMatch = mm
	}
	return retriever.WrapImplSpecificOptFn(apply)
}
// WithRetrieverPartitionKey returns a retriever option that points
// RetrieverOptions.PartitionKey at key.
func WithRetrieverPartitionKey(key string) retriever.Option {
	apply := func(opts *RetrieverOptions) {
		opts.PartitionKey = &key
	}
	return retriever.WrapImplSpecificOptFn(apply)
}
// WithPartitions returns a retriever option that sets
// RetrieverOptions.Partitions to partitions. The slice is stored as-is, not
// copied.
func WithPartitions(partitions []string) retriever.Option {
	apply := func(opts *RetrieverOptions) {
		opts.Partitions = partitions
	}
	return retriever.WrapImplSpecificOptFn(apply)
}

View File

@@ -0,0 +1,32 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package searchstore
import (
"context"
"github.com/cloudwego/eino/components/indexer"
"github.com/cloudwego/eino/components/retriever"
)
// SearchStore combines eino's indexer.Indexer and retriever.Retriever with
// deletion by id.
type SearchStore interface {
indexer.Indexer
retriever.Retriever
// Delete removes the entries with the given ids.
Delete(ctx context.Context, ids []string) error
}

View File

@@ -0,0 +1,155 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package document
import (
"strconv"
"time"
"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr"
)
// TableSchema describes a table: its name, a comment and its columns.
type TableSchema struct {
Name string
Comment string
Columns []*Column
}
// Column describes one column of a table.
type Column struct {
ID int64
Name string
Type TableColumnType
Description string
Nullable bool
IsPrimary bool
Sequence int // sort order number
}
// TableColumnType identifies the data type of a table column.
type TableColumnType int

// Known column types. TableColumnTypeUnknown is the zero value.
const (
	TableColumnTypeUnknown TableColumnType = 0
	TableColumnTypeString  TableColumnType = 1
	TableColumnTypeInteger TableColumnType = 2
	TableColumnTypeTime    TableColumnType = 3
	TableColumnTypeNumber  TableColumnType = 4
	TableColumnTypeBoolean TableColumnType = 5
	TableColumnTypeImage   TableColumnType = 6
)

// String returns the type name used for the column type ("varchar",
// "bigint", "timestamp", "double", "boolean", "image"). Any value outside
// the known types, including TableColumnTypeUnknown, yields "unknown".
func (t TableColumnType) String() string {
	// Indexed by the column type's numeric value.
	names := [...]string{
		TableColumnTypeUnknown: "unknown",
		TableColumnTypeString:  "varchar",
		TableColumnTypeInteger: "bigint",
		TableColumnTypeTime:    "timestamp",
		TableColumnTypeNumber:  "double",
		TableColumnTypeBoolean: "boolean",
		TableColumnTypeImage:   "image",
	}

	if t < 0 || int(t) >= len(names) {
		return "unknown"
	}
	return names[t]
}
// ColumnData holds the value of one table cell together with the column it
// belongs to. Type decides which of the Val* fields the accessor methods
// read; each Val* field is a pointer and may be nil.
type ColumnData struct {
ColumnID int64
ColumnName string
Type TableColumnType
ValString *string
ValInteger *int64
ValTime *time.Time
ValNumber *float64
ValBoolean *bool
ValImage *string // base64 / url
}
// GetValue returns the Val* pointer selected by d.Type, or nil for a type
// that has no matching field.
//
// The pointer is returned as-is inside the interface, so for a known type
// whose field is nil the result is a non-nil interface holding a nil pointer;
// comparing the result with nil does not detect an unset value.
func (d *ColumnData) GetValue() interface{} {
	var selected interface{}

	switch d.Type {
	case TableColumnTypeString:
		selected = d.ValString
	case TableColumnTypeInteger:
		selected = d.ValInteger
	case TableColumnTypeTime:
		selected = d.ValTime
	case TableColumnTypeNumber:
		selected = d.ValNumber
	case TableColumnTypeBoolean:
		selected = d.ValBoolean
	case TableColumnTypeImage:
		selected = d.ValImage
	}

	return selected
}
// GetStringValue formats the value selected by d.Type as a string.
//
// Integers are formatted in base 10, times with the time.DateTime layout,
// numbers in 'f' format with 20 decimals, and booleans as "true"/"false".
// String and image values are returned unchanged; unrecognised types read
// ValString. Unlike GetNullableStringValue, no nil check is made before
// dereferencing through ptr.From.
// NOTE(review): presumably ptr.From yields the zero value for a nil pointer,
// so an unset integer would format as "0" — confirm against the ptr package.
func (d *ColumnData) GetStringValue() string {
	var text string

	switch d.Type {
	case TableColumnTypeInteger:
		text = strconv.FormatInt(ptr.From(d.ValInteger), 10)
	case TableColumnTypeTime:
		text = ptr.From(d.ValTime).Format(time.DateTime)
	case TableColumnTypeNumber:
		text = strconv.FormatFloat(ptr.From(d.ValNumber), 'f', 20, 64)
	case TableColumnTypeBoolean:
		text = strconv.FormatBool(ptr.From(d.ValBoolean))
	case TableColumnTypeImage:
		text = ptr.From(d.ValImage)
	default:
		// TableColumnTypeString and every unrecognised type read ValString.
		text = ptr.From(d.ValString)
	}

	return text
}
// GetNullableStringValue formats the value selected by d.Type as a string,
// returning "" when the selected integer, time, number, boolean or image
// field is nil.
//
// Non-nil values are formatted exactly as in GetStringValue: base-10
// integers, time.DateTime layout, 'f' format with 20 decimals, and
// "true"/"false". String values and unrecognised types read ValString through
// ptr.From without an explicit nil check.
func (d *ColumnData) GetNullableStringValue() string {
	switch d.Type {
	case TableColumnTypeInteger:
		if d.ValInteger != nil {
			return strconv.FormatInt(ptr.From(d.ValInteger), 10)
		}
		return ""
	case TableColumnTypeTime:
		if d.ValTime != nil {
			return ptr.From(d.ValTime).Format(time.DateTime)
		}
		return ""
	case TableColumnTypeNumber:
		if d.ValNumber != nil {
			return strconv.FormatFloat(ptr.From(d.ValNumber), 'f', 20, 64)
		}
		return ""
	case TableColumnTypeBoolean:
		if d.ValBoolean != nil {
			return strconv.FormatBool(ptr.From(d.ValBoolean))
		}
		return ""
	case TableColumnTypeImage:
		if d.ValImage != nil {
			return ptr.From(d.ValImage)
		}
		return ""
	}

	// TableColumnTypeString and every unrecognised type read ValString.
	return ptr.From(d.ValString)
}