feat: manually mirror opencoze's code from bytedance

Change-Id: I09a73aadda978ad9511264a756b2ce51f5761adf
This commit is contained in:
fanlv
2025-07-20 17:36:12 +08:00
commit 890153324f
14811 changed files with 1923430 additions and 0 deletions

View File

@@ -0,0 +1,246 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package impl
import (
"context"
"time"
"github.com/bytedance/sonic"
"github.com/coze-dev/coze-studio/backend/api/model/crossdomain/knowledge"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/consts"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/convert"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/dal/model"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/events"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/repository"
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
"github.com/coze-dev/coze-studio/backend/infra/contract/eventbus"
"github.com/coze-dev/coze-studio/backend/infra/contract/idgen"
"github.com/coze-dev/coze-studio/backend/infra/contract/rdb"
rdbEntity "github.com/coze-dev/coze-studio/backend/infra/contract/rdb/entity"
"github.com/coze-dev/coze-studio/backend/infra/contract/storage"
"github.com/coze-dev/coze-studio/backend/pkg/errorx"
"github.com/coze-dev/coze-studio/backend/pkg/logs"
"github.com/coze-dev/coze-studio/backend/types/errno"
)
// baseDocProcessor holds the request-scoped inputs and the infrastructure
// handles shared by the document processors in this package. The
// source-specific processors embed it and override individual steps.
type baseDocProcessor struct {
	// Request-scoped inputs.
	ctx            context.Context
	UserID         int64
	SpaceID        int64
	Documents      []*entity.Document
	documentSource *entity.DocumentSource

	// Models that get persisted to the DB.
	// TableName is filled in by createTable with the physical table name;
	// docModels is (re)built by BuildDBModel.
	TableName string
	docModels []*model.KnowledgeDocument

	// Infrastructure dependencies.
	storage       storage.Storage
	knowledgeRepo repository.KnowledgeRepo
	documentRepo  repository.KnowledgeDocumentRepo
	sliceRepo     repository.KnowledgeDocumentSliceRepo
	idgen         idgen.IDGenerator
	rdb           rdb.RDB
	producer      eventbus.Producer
	parseManager  parser.Manager
}
// BeforeCreate is a no-op in the base processor and always returns nil.
// Processors that embed baseDocProcessor override it when they have work to
// do before the DB models are built.
func (p *baseDocProcessor) BeforeCreate() error {
	// Placeholder: pulling data from the data source would happen here.
	return nil
}
// BuildDBModel generates one ID per entry of p.Documents and rebuilds
// p.docModels with a model.KnowledgeDocument for each of them. The generated
// ID is also written back to the matching p.Documents entry.
//
// It returns an ErrKnowledgeIDGenCode error when ID generation fails or yields
// fewer IDs than there are documents.
func (p *baseDocProcessor) BuildDBModel() error {
	p.docModels = make([]*model.KnowledgeDocument, 0, len(p.Documents))
	ids, err := p.idgen.GenMultiIDs(p.ctx, len(p.Documents))
	if err != nil {
		logs.CtxErrorf(p.ctx, "gen ids failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeIDGenCode)
	}
	// Indexing ids[i] below would panic if the generator came back short.
	if len(ids) < len(p.Documents) {
		logs.CtxErrorf(p.ctx, "gen ids failed, want %d ids, got %d", len(p.Documents), len(ids))
		return errorx.New(errno.ErrKnowledgeIDGenCode)
	}

	// Read the clock once so CreatedAt and UpdatedAt are identical on every
	// row instead of drifting between two separate time.Now() calls.
	now := time.Now().UnixMilli()
	for i, doc := range p.Documents {
		docModel := &model.KnowledgeDocument{
			ID:            ids[i],
			KnowledgeID:   doc.KnowledgeID,
			Name:          doc.Name,
			FileExtension: string(doc.FileExtension),
			URI:           doc.URI,
			DocumentType:  int32(doc.Type),
			CreatorID:     p.UserID,
			SpaceID:       p.SpaceID,
			SourceType:    int32(doc.Source),
			Status:        int32(knowledge.KnowledgeStatusInit),
			ParseRule: &model.DocumentParseRule{
				ParsingStrategy:  doc.ParsingStrategy,
				ChunkingStrategy: doc.ChunkingStrategy,
			},
			CreatedAt: now,
			UpdatedAt: now,
		}
		// Hand the generated ID back to the caller-visible entity.
		doc.ID = docModel.ID
		p.docModels = append(p.docModels, docModel)
	}
	return nil
}
// InsertDBModel persists p.docModels and bumps the owning knowledge's
// updated_at inside a single transaction.
//
// Unless the request is a table append (see isTableAppend), createTable runs
// first. If anything after that fails — opening the transaction, either
// write, or a recovered panic — the transaction is rolled back and the table
// recorded in p.TableName is dropped again so it is not left orphaned.
//
// The named result err is inspected and, on panic, overwritten by the
// deferred function.
func (p *baseDocProcessor) InsertDBModel() (err error) {
	ctx := p.ctx
	if !isTableAppend(p.Documents) {
		err = p.createTable()
		if err != nil {
			logs.CtxErrorf(ctx, "create table failed, err: %v", err)
			return errorx.New(errno.ErrKnowledgeCrossDomainCode, errorx.KV("msg", err.Error()))
		}
	}

	// dropCreatedTable undoes createTable. It is best-effort: a failure is
	// logged and does not replace the error that triggered the cleanup.
	dropCreatedTable := func() {
		if p.TableName == "" {
			return
		}
		if deleteErr := p.deleteTable(); deleteErr != nil {
			logs.CtxErrorf(ctx, "delete table failed, err: %v", deleteErr)
		}
	}

	tx, err := p.knowledgeRepo.InitTx()
	if err != nil {
		logs.CtxErrorf(ctx, "init tx failed, err: %v", err)
		// The cleanup defer below is not registered yet, so drop the table here.
		dropCreatedTable()
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	defer func() {
		if e := recover(); e != nil {
			logs.CtxErrorf(ctx, "panic: %v", e)
			err = errorx.New(errno.ErrKnowledgeSystemCode, errorx.KVf("msg", "panic: %v", e))
			tx.Rollback()
			dropCreatedTable()
			return
		}
		if err != nil {
			logs.CtxErrorf(ctx, "InsertDBModel err: %v", err)
			tx.Rollback()
			dropCreatedTable()
			return
		}
		// NOTE(review): the outcome of Commit (and of Rollback above) is not
		// inspected; confirm against the transaction type whether a failed
		// commit should be surfaced through err.
		tx.Commit()
	}()

	err = p.documentRepo.CreateWithTx(ctx, tx, p.docModels)
	if err != nil {
		logs.CtxErrorf(ctx, "create document failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	err = p.knowledgeRepo.UpdateWithTx(ctx, tx, p.Documents[0].KnowledgeID, map[string]interface{}{
		"updated_at": time.Now().UnixMilli(),
	})
	if err != nil {
		logs.CtxErrorf(ctx, "update knowledge failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	return nil
}
// createTable provisions the physical table behind a table-type document.
// It only acts when p.Documents holds exactly one document of type
// knowledge.DocumentTypeTable; in every other case it returns nil untouched.
//
// On success it assigns generated IDs to the document's columns, appends a
// primary-key column to the document's TableInfo, stores the physical table
// name in p.TableName and on the document, and fills p.docModels[0].TableInfo.
func (p *baseDocProcessor) createTable() error {
	if len(p.Documents) != 1 || p.Documents[0].Type != knowledge.DocumentTypeTable {
		return nil
	}

	// Table-type knowledge base: build the column list for the new table.
	doc := p.Documents[0]
	userColumns := doc.TableInfo.Columns
	// One ID per user column plus one for the primary-key column added below.
	columnIDs, err := p.idgen.GenMultiIDs(p.ctx, len(userColumns)+1)
	if err != nil {
		logs.CtxErrorf(p.ctx, "gen ids failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeIDGenCode)
	}

	rdbColumns := make([]*rdbEntity.Column, 0, len(userColumns)+1)
	for i := range userColumns {
		id := columnIDs[i]
		userColumns[i].ID = id
		rdbColumns = append(rdbColumns, &rdbEntity.Column{
			Name:     convert.ColumnIDToRDBField(id),
			DataType: convert.ConvertColumnType(userColumns[i].Type),
			NotNull:  userColumns[i].Indexing,
		})
	}

	// Every table gets an extra primary-key ID column, both in the document's
	// logical column list and in the physical schema.
	doc.TableInfo.Columns = append(doc.TableInfo.Columns, &entity.TableColumn{
		ID:          columnIDs[len(columnIDs)-1],
		Name:        consts.RDBFieldID,
		Type:        document.TableColumnTypeInteger,
		Description: "主键ID",
		Indexing:    false,
		Sequence:    -1,
	})
	rdbColumns = append(rdbColumns, &rdbEntity.Column{
		Name:     consts.RDBFieldID,
		DataType: rdbEntity.TypeBigInt,
		NotNull:  true,
	})

	// Create the data table with the ID column as its primary key.
	primaryKey := &rdbEntity.Index{
		Name:    "pk",
		Type:    rdbEntity.PrimaryKey,
		Columns: []string{consts.RDBFieldID},
	}
	resp, err := p.rdb.CreateTable(p.ctx, &rdb.CreateTableRequest{
		Table: &rdbEntity.Table{
			Columns: rdbColumns,
			Indexes: []*rdbEntity.Index{primaryKey},
		},
	})
	if err != nil {
		logs.CtxErrorf(p.ctx, "create table failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeCrossDomainCode, errorx.KV("msg", err.Error()))
	}

	p.TableName = resp.Table.Name
	doc.TableInfo.PhysicalTableName = p.TableName
	p.docModels[0].TableInfo = &entity.TableInfo{
		VirtualTableName:  doc.Name,
		PhysicalTableName: p.TableName,
		TableDesc:         doc.Description,
		Columns:           doc.TableInfo.Columns,
	}
	return nil
}
// deleteTable drops the table named by p.TableName. Like createTable, it only
// acts when p.Documents holds exactly one table-type document; otherwise it
// returns nil without touching the database.
func (p *baseDocProcessor) deleteTable() error {
	if len(p.Documents) != 1 || p.Documents[0].Type != knowledge.DocumentTypeTable {
		return nil
	}

	req := &rdb.DropTableRequest{
		TableName: p.TableName,
		IfExists:  false,
	}
	if _, err := p.rdb.DropTable(p.ctx, req); err != nil {
		logs.CtxErrorf(p.ctx, "[deleteTable] drop table failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeCrossDomainCode, errorx.KV("msg", err.Error()))
	}
	return nil
}
// Indexing serializes an index-documents event for p.Documents, keyed by the
// first document's KnowledgeID, and sends it through the producer.
//
// It returns ErrKnowledgeParseJSONCode when the event cannot be marshalled and
// ErrKnowledgeMQSendFailCode when the send fails.
func (p *baseDocProcessor) Indexing() error {
	knowledgeID := p.Documents[0].KnowledgeID
	payload, err := sonic.Marshal(events.NewIndexDocumentsEvent(knowledgeID, p.Documents))
	if err != nil {
		return errorx.New(errno.ErrKnowledgeParseJSONCode, errorx.KV("msg", err.Error()))
	}

	err = p.producer.Send(p.ctx, payload)
	if err != nil {
		logs.CtxErrorf(p.ctx, "send message failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeMQSendFailCode, errorx.KV("msg", err.Error()))
	}
	return nil
}
// GetResp returns the processor's Documents slice as-is (no copy is made).
func (p *baseDocProcessor) GetResp() []*entity.Document {
	docs := p.Documents
	return docs
}

View File

@@ -0,0 +1,41 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package impl
import (
	"github.com/coze-dev/coze-studio/backend/pkg/errorx"
	"github.com/coze-dev/coze-studio/backend/pkg/logs"
	"github.com/coze-dev/coze-studio/backend/types/errno"
)
// customDocProcessor creates documents from custom content entered by the
// user. It embeds baseDocProcessor and overrides BeforeCreate.
type customDocProcessor struct {
	baseDocProcessor
}
// BeforeCreate uploads the raw content of every document that has any to
// object storage and records the resulting URI on the document. Documents
// with an empty RawContent are left untouched.
//
// For each uploaded document the FileExtension is derived from the document
// type via getFormatType. A storage failure aborts the loop and is reported
// as ErrKnowledgePutObjectFailCode, matching the table processor's upload
// path in this package.
func (c *customDocProcessor) BeforeCreate() error {
	for _, doc := range c.Documents {
		if doc.RawContent == "" {
			continue
		}
		doc.FileExtension = getFormatType(doc.Type)
		uri := getTosUri(c.UserID, string(doc.FileExtension))
		if err := c.storage.PutObject(c.ctx, uri, []byte(doc.RawContent)); err != nil {
			logs.CtxErrorf(c.ctx, "put object failed, err: %v", err)
			return errorx.New(errno.ErrKnowledgePutObjectFailCode, errorx.KV("msg", err.Error()))
		}
		// Only publish the URI once the object has actually been stored.
		doc.URI = uri
	}
	return nil
}

View File

@@ -0,0 +1,108 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package impl
import (
"github.com/coze-dev/coze-studio/backend/api/model/crossdomain/knowledge"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
"github.com/coze-dev/coze-studio/backend/pkg/errorx"
"github.com/coze-dev/coze-studio/backend/pkg/logs"
"github.com/coze-dev/coze-studio/backend/types/errno"
)
// customTableProcessor creates documents for user-defined tables. It embeds
// baseDocProcessor and overrides the steps that differ for tables.
type customTableProcessor struct {
	baseDocProcessor
}
// BeforeCreate prepares a table-append request; for anything else it returns
// nil immediately.
//
// In the append case it looks up the documents of the target knowledge base,
// copies the first result's ID and TableInfo onto c.Documents[0], and, when
// RawContent is non-empty, uploads that content to object storage and records
// the URI on the document.
func (c *customTableProcessor) BeforeCreate() error {
	if !isTableAppend(c.Documents) {
		return nil
	}

	target := c.Documents[0]
	where := &entity.WhereDocumentOpt{KnowledgeIDs: []int64{target.KnowledgeID}, SelectAll: true}
	existing, _, err := c.documentRepo.FindDocumentByCondition(c.ctx, where)
	if err != nil {
		logs.CtxErrorf(c.ctx, "find document failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if len(existing) == 0 {
		logs.CtxErrorf(c.ctx, "table doc not found")
		return errorx.New(errno.ErrKnowledgeDocumentNotExistCode, errorx.KV("msg", "doc not found"))
	}

	target.ID = existing[0].ID
	if existing[0].TableInfo == nil {
		logs.CtxErrorf(c.ctx, "table info not found")
		return errorx.New(errno.ErrKnowledgeTableInfoNotExistCode, errorx.KVf("msg", "table info not found, doc_id: %d", existing[0].ID))
	}
	target.TableInfo = *existing[0].TableInfo

	// Append scenario: persist the user-supplied rows, if there are any.
	if target.RawContent == "" {
		return nil
	}
	target.FileExtension = getFormatType(target.Type)
	uri := getTosUri(c.UserID, string(target.FileExtension))
	if putErr := c.storage.PutObject(c.ctx, uri, []byte(target.RawContent)); putErr != nil {
		logs.CtxErrorf(c.ctx, "put object failed, err: %v", putErr)
		return errorx.New(errno.ErrKnowledgePutObjectFailCode, errorx.KV("msg", putErr.Error()))
	}
	target.URI = uri
	return nil
}
// BuildDBModel builds the DB models for a newly created custom table. It does
// nothing when there are no documents, when the first document is not a
// table, or when the first document is an append request.
//
// Otherwise it delegates to baseDocProcessor.BuildDBModel and then overrides
// DocumentType and Status on every model it produced.
func (c *customTableProcessor) BuildDBModel() error {
	if len(c.Documents) == 0 || c.Documents[0].Type != knowledge.DocumentTypeTable {
		return nil
	}
	if c.Documents[0].IsAppend {
		// Append scenario: the table already exists, so nothing is built.
		// Either the user supplies some custom rows, or uploads another sheet
		// whose rows get appended to the existing table.
		return nil
	}

	if err := c.baseDocProcessor.BuildDBModel(); err != nil {
		return err
	}
	// This creation path carries no data, so the status is set directly here.
	// NOTE(review): the original comment said the status becomes "available",
	// yet the value written is DocumentStatusInit — confirm which is intended.
	for _, m := range c.docModels {
		m.DocumentType = 1 // presumably the table document type — TODO confirm
		m.Status = int32(entity.DocumentStatusInit)
	}
	return nil
}
// InsertDBModel handles the two custom-table cases: a fresh table goes
// through baseDocProcessor.InsertDBModel, while an append only switches the
// existing document's status to DocumentStatusUploading.
func (c *customTableProcessor) InsertDBModel() error {
	if !isTableAppend(c.Documents) {
		return c.baseDocProcessor.InsertDBModel()
	}

	// Append scenario: mark the document as being processed.
	docID := c.Documents[0].ID
	if err := c.documentRepo.SetStatus(c.ctx, docID, int32(entity.DocumentStatusUploading), ""); err != nil {
		logs.CtxErrorf(c.ctx, "document set status err:%v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	return nil
}
// Indexing triggers baseDocProcessor.Indexing only for table-append requests.
// For any other request it returns nil without sending anything.
func (c *customTableProcessor) Indexing() error {
	if !isTableAppend(c.Documents) {
		return nil
	}

	if err := c.baseDocProcessor.Indexing(); err != nil {
		logs.CtxErrorf(c.ctx, "document indexing err:%v", err)
		return err
	}
	return nil
}

View File

@@ -0,0 +1,87 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package impl
import (
"context"
"github.com/coze-dev/coze-studio/backend/api/model/crossdomain/knowledge"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/processor"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/repository"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
"github.com/coze-dev/coze-studio/backend/infra/contract/eventbus"
"github.com/coze-dev/coze-studio/backend/infra/contract/idgen"
"github.com/coze-dev/coze-studio/backend/infra/contract/rdb"
"github.com/coze-dev/coze-studio/backend/infra/contract/storage"
)
// DocProcessorConfig bundles everything NewDocProcessor needs: the request
// inputs and the infrastructure dependencies that are copied onto the
// processor it builds.
type DocProcessorConfig struct {
	// Request inputs. DocumentSource, together with the type of the first
	// document, decides which processor NewDocProcessor returns.
	UserID         int64
	SpaceID        int64
	DocumentSource entity.DocumentSource
	Documents      []*entity.Document

	// Infrastructure dependencies.
	KnowledgeRepo repository.KnowledgeRepo
	DocumentRepo  repository.KnowledgeDocumentRepo
	SliceRepo     repository.KnowledgeDocumentSliceRepo
	Idgen         idgen.IDGenerator
	Storage       storage.Storage
	Rdb           rdb.RDB
	Producer      eventbus.Producer
	ParseManager  parser.Manager
}
// NewDocProcessor builds the processor matching config.DocumentSource and the
// type of the first document:
//
//   - DocumentSourceCustom: customTableProcessor for a table, otherwise
//     customDocProcessor.
//   - DocumentSourceLocal: localTableProcessor for a table, otherwise the
//     base processor.
//   - any other source: the base processor.
//
// An empty config.Documents (or a nil first entry) is treated as "not a
// table" rather than causing an index-out-of-range panic. config itself must
// be non-nil.
func NewDocProcessor(ctx context.Context, config *DocProcessorConfig) (p processor.DocProcessor) {
	base := &baseDocProcessor{
		ctx:            ctx,
		UserID:         config.UserID,
		SpaceID:        config.SpaceID,
		Documents:      config.Documents,
		documentSource: &config.DocumentSource,
		knowledgeRepo:  config.KnowledgeRepo,
		documentRepo:   config.DocumentRepo,
		sliceRepo:      config.SliceRepo,
		storage:        config.Storage,
		idgen:          config.Idgen,
		rdb:            config.Rdb,
		producer:       config.Producer,
		parseManager:   config.ParseManager,
	}

	// Guard the first-document lookup so that a request without documents
	// falls through to the non-table processors.
	isTable := len(config.Documents) > 0 &&
		config.Documents[0] != nil &&
		config.Documents[0].Type == knowledge.DocumentTypeTable

	switch config.DocumentSource {
	case entity.DocumentSourceCustom:
		if isTable {
			return &customTableProcessor{baseDocProcessor: *base}
		}
		return &customDocProcessor{baseDocProcessor: *base}
	case entity.DocumentSourceLocal:
		if isTable {
			return &localTableProcessor{baseDocProcessor: *base}
		}
		return base
	default:
		return base
	}
}

View File

@@ -0,0 +1,77 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package impl
import (
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
"github.com/coze-dev/coze-studio/backend/pkg/errorx"
"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr"
"github.com/coze-dev/coze-studio/backend/pkg/logs"
"github.com/coze-dev/coze-studio/backend/types/errno"
)
// localTableProcessor is the processor NewDocProcessor returns for
// table-type documents coming from the local document source. It embeds
// baseDocProcessor and overrides the steps that differ for table appends.
type localTableProcessor struct {
	baseDocProcessor
}
// BeforeCreate resolves the existing table document for an append request:
// it looks up the documents of the target knowledge base and copies the first
// result's ID and TableInfo onto l.Documents[0]. Requests that are not table
// appends are passed to baseDocProcessor.BeforeCreate.
func (l *localTableProcessor) BeforeCreate() error {
	if !isTableAppend(l.Documents) {
		return l.baseDocProcessor.BeforeCreate()
	}

	target := l.Documents[0]
	existing, _, err := l.documentRepo.FindDocumentByCondition(l.ctx, &entity.WhereDocumentOpt{
		KnowledgeIDs: []int64{target.KnowledgeID},
		SelectAll:    true,
	})
	if err != nil {
		logs.CtxErrorf(l.ctx, "find document failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if len(existing) == 0 {
		logs.CtxErrorf(l.ctx, "table doc not found")
		return errorx.New(errno.ErrKnowledgeDocumentNotExistCode, errorx.KV("msg", "doc not found"))
	}

	found := existing[0]
	target.ID = found.ID
	if found.TableInfo == nil {
		logs.CtxErrorf(l.ctx, "table info not found")
		return errorx.New(errno.ErrKnowledgeTableInfoNotExistCode, errorx.KVf("msg", "table info not found, doc_id: %d", found.ID))
	}
	target.TableInfo = ptr.From(found.TableInfo)
	return nil
}
// BuildDBModel skips model building for table appends and otherwise defers
// to baseDocProcessor.BuildDBModel.
func (l *localTableProcessor) BuildDBModel() error {
	if !isTableAppend(l.Documents) {
		return l.baseDocProcessor.BuildDBModel()
	}
	// Append scenario: no new models are built.
	return nil
}
// InsertDBModel goes through baseDocProcessor.InsertDBModel for a fresh
// table; for an append it only switches the existing document's status to
// DocumentStatusUploading.
func (l *localTableProcessor) InsertDBModel() error {
	if !isTableAppend(l.Documents) {
		return l.baseDocProcessor.InsertDBModel()
	}

	// Append scenario: mark the document as being processed.
	docID := l.Documents[0].ID
	if err := l.documentRepo.SetStatus(l.ctx, docID, int32(entity.DocumentStatusUploading), ""); err != nil {
		logs.CtxErrorf(l.ctx, "document set status err:%v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	return nil
}

View File

@@ -0,0 +1,45 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package impl
import (
"fmt"
"time"
"github.com/coze-dev/coze-studio/backend/api/model/crossdomain/knowledge"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
)
// getFormatType maps a document type to the file extension used when its raw
// content is stored: JSON for table documents, TXT for everything else.
func getFormatType(tp knowledge.DocumentType) parser.FileExtension {
	if tp == knowledge.DocumentTypeTable {
		return parser.FileExtensionJSON
	}
	return parser.FileExtensionTXT
}
// getTosUri builds an object-storage key of the form
// "FileBizType.Knowledge/<userID>_<unix-nanoseconds>.<fileType>", using the
// current time for the nanosecond component.
func getTosUri(userID int64, fileType string) string {
	nowNano := time.Now().UnixNano()
	return fmt.Sprintf("FileBizType.Knowledge/%d_%d.%s", userID, nowNano, fileType)
}
// isTableAppend reports whether docs is non-empty and its first entry is a
// table-type document with IsAppend set. Only the first entry is inspected.
func isTableAppend(docs []*entity.Document) bool {
	if len(docs) == 0 {
		return false
	}
	first := docs[0]
	return first.Type == knowledge.DocumentTypeTable && first.IsAppend
}