coze-studio/backend/domain/knowledge/service/knowledge.go

1513 lines
55 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package service
import (
"bytes"
"context"
"crypto/sha256"
"encoding/base64"
"errors"
"fmt"
"io"
"math/rand"
"net/http"
"strconv"
"strings"
"time"
"unicode/utf8"
"github.com/bytedance/sonic"
redisV9 "github.com/redis/go-redis/v9"
"gorm.io/gorm"
"github.com/coze-dev/coze-studio/backend/api/model/crossdomain/knowledge"
knowledgeModel "github.com/coze-dev/coze-studio/backend/api/model/crossdomain/knowledge"
"github.com/coze-dev/coze-studio/backend/api/model/ocean/cloud/developer_api"
"github.com/coze-dev/coze-studio/backend/application/base/ctxutil"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/repository"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/consts"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/convert"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/dal/model"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/events"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/processor/impl"
"github.com/coze-dev/coze-studio/backend/infra/contract/cache"
"github.com/coze-dev/coze-studio/backend/infra/contract/chatmodel"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/nl2sql"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/ocr"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/rerank"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/searchstore"
"github.com/coze-dev/coze-studio/backend/infra/contract/eventbus"
"github.com/coze-dev/coze-studio/backend/infra/contract/idgen"
"github.com/coze-dev/coze-studio/backend/infra/contract/messages2query"
"github.com/coze-dev/coze-studio/backend/infra/contract/rdb"
rdbEntity "github.com/coze-dev/coze-studio/backend/infra/contract/rdb/entity"
"github.com/coze-dev/coze-studio/backend/infra/contract/storage"
"github.com/coze-dev/coze-studio/backend/infra/impl/document/parser/builtin"
"github.com/coze-dev/coze-studio/backend/infra/impl/document/progressbar"
"github.com/coze-dev/coze-studio/backend/infra/impl/document/rerank/rrf"
"github.com/coze-dev/coze-studio/backend/pkg/errorx"
"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr"
"github.com/coze-dev/coze-studio/backend/pkg/lang/slices"
"github.com/coze-dev/coze-studio/backend/pkg/logs"
"github.com/coze-dev/coze-studio/backend/types/errno"
)
// NewKnowledgeSVC builds a knowledgeSVC from config and returns the same
// instance twice: once as the Knowledge service and once as the event bus
// consumer handler.
//
// Optional dependencies are defaulted here: a nil Reranker becomes
// rrf.NewRRFReranker(0) and a nil ParseManager becomes the builtin manager
// created from config.Storage and config.OCR.
func NewKnowledgeSVC(config *KnowledgeSVCConfig) (Knowledge, eventbus.ConsumerHandler) {
	// Resolve the optional collaborators up front so the struct literal below
	// only ever sees usable values.
	reranker := config.Reranker
	if reranker == nil {
		reranker = rrf.NewRRFReranker(0)
	}

	parseManager := config.ParseManager
	if parseManager == nil {
		parseManager = builtin.NewManager(config.Storage, config.OCR, nil)
	}

	service := &knowledgeSVC{
		knowledgeRepo: repository.NewKnowledgeDAO(config.DB),
		documentRepo:  repository.NewKnowledgeDocumentDAO(config.DB),
		sliceRepo:     repository.NewKnowledgeDocumentSliceDAO(config.DB),
		reviewRepo:    repository.NewKnowledgeDocumentReviewDAO(config.DB),

		idgen:               config.IDGen,
		rdb:                 config.RDB,
		producer:            config.Producer,
		searchStoreManagers: config.SearchStoreManagers,
		storage:             config.Storage,
		rewriter:            config.Rewriter,
		nl2Sql:              config.NL2Sql,
		cacheCli:            config.CacheCli,
		modelFactory:        config.ModelFactory,

		parseManager: parseManager,
		reranker:     reranker,

		enableCompactTable:        ptr.FromOrDefault(config.EnableCompactTable, true),
		isAutoAnnotationSupported: config.IsAutoAnnotationSupported,
	}

	return service, service
}
// KnowledgeSVCConfig carries the dependencies consumed by NewKnowledgeSVC.
// Each field is tagged as required or optional; optional fields describe the
// fallback that applies when they are left unset.
type KnowledgeSVCConfig struct {
DB *gorm.DB // required
IDGen idgen.IDGenerator // required
RDB rdb.RDB // required: table storage
Producer eventbus.Producer // required: document indexing is handled asynchronously through the MQ
SearchStoreManagers []searchstore.Manager // required: vector / full-text search stores
ParseManager parser.Manager // optional: document splitting and processing, defaults to the builtin parser
Storage storage.Storage // required: object storage (oss)
ModelFactory chatmodel.Factory // required: model factory
Rewriter messages2query.MessagesToQuery // optional: queries are not rewritten when unset
Reranker rerank.Reranker // optional: defaults to rrf when unset
NL2Sql nl2sql.NL2SQL // optional: not supported when unset
EnableCompactTable *bool // optional: table data compaction, defaults to true
OCR ocr.OCR // optional: ocr, the ocr feature is unavailable when not provided
CacheCli cache.Cmdable // optional: cache implementation
IsAutoAnnotationSupported bool // whether automatic image annotation is supported
}
// knowledgeSVC is the service type returned by NewKnowledgeSVC, both as the
// Knowledge service and as the event bus consumer handler. It holds the
// repositories and infrastructure clients copied from KnowledgeSVCConfig.
type knowledgeSVC struct {
knowledgeRepo repository.KnowledgeRepo
documentRepo repository.KnowledgeDocumentRepo
sliceRepo repository.KnowledgeDocumentSliceRepo
reviewRepo repository.KnowledgeDocumentReviewRepo
modelFactory chatmodel.Factory
idgen idgen.IDGenerator
rdb rdb.RDB
producer eventbus.Producer
searchStoreManagers []searchstore.Manager
parseManager parser.Manager
rewriter messages2query.MessagesToQuery
reranker rerank.Reranker
storage storage.Storage
nl2Sql nl2sql.NL2SQL
cacheCli cache.Cmdable
enableCompactTable bool // table data compaction
isAutoAnnotationSupported bool // whether automatic image annotation is supported
}
// CreateKnowledge validates the request, allocates an ID and stores a new
// knowledge base row in the enabled state.
//
// It returns ErrKnowledgeInvalidParamCode when the name, creator ID or space ID
// is missing, ErrKnowledgeIDGenCode when ID generation fails, and
// ErrKnowledgeDBCode when the insert fails. On success the response carries the
// new ID and the creation timestamp in milliseconds.
func (k *knowledgeSVC) CreateKnowledge(ctx context.Context, request *CreateKnowledgeRequest) (response *CreateKnowledgeResponse, err error) {
	createdAtMs := time.Now().UnixMilli()

	switch {
	case len(request.Name) == 0:
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "knowledge name is empty"))
	case request.CreatorID == 0:
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "knowledge creator id is empty"))
	case request.SpaceID == 0:
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "knowledge space id is empty"))
	}

	knowledgeID, err := k.idgen.GenID(ctx)
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeIDGenCode)
	}

	record := &model.Knowledge{
		ID:        knowledgeID,
		Name:      request.Name,
		CreatorID: request.CreatorID,
		AppID:     request.AppID,
		SpaceID:   request.SpaceID,
		CreatedAt: createdAtMs,
		UpdatedAt: createdAtMs,
		// The vector store is currently initialised when documents are added, so
		// the knowledge base itself has no init phase and starts out enabled.
		Status:      int32(knowledgeModel.KnowledgeStatusEnable),
		Description: request.Description,
		IconURI:     request.IconUri,
		FormatType:  int32(request.FormatType),
	}
	if err = k.knowledgeRepo.Create(ctx, record); err != nil {
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	response = &CreateKnowledgeResponse{
		KnowledgeID: knowledgeID,
		CreatedAtMs: createdAtMs,
	}
	return response, nil
}
// UpdateKnowledge applies the non-nil fields of request (status, name, icon
// URI, description) to an existing knowledge base and refreshes its update
// timestamp.
//
// It returns ErrKnowledgeInvalidParamCode when the ID is zero or an explicitly
// supplied name is empty, ErrKnowledgeNotExistCode when no row is found, and
// ErrKnowledgeDBCode when loading or saving fails.
func (k *knowledgeSVC) UpdateKnowledge(ctx context.Context, request *UpdateKnowledgeRequest) error {
	if request.KnowledgeID == 0 {
		return errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "knowledge id is empty"))
	}
	if request.Name != nil && len(*request.Name) == 0 {
		return errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "knowledge name is empty"))
	}

	knModel, err := k.knowledgeRepo.GetByID(ctx, request.KnowledgeID)
	if err != nil {
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if knModel == nil {
		return errorx.New(errno.ErrKnowledgeNotExistCode, errorx.KV("msg", "knowledge not found"))
	}

	// Only fields the caller actually supplied are overwritten.
	if request.Status != nil {
		knModel.Status = int32(*request.Status)
	}
	if request.Name != nil {
		knModel.Name = *request.Name
	}
	if request.IconUri != nil {
		knModel.IconURI = *request.IconUri
	}
	if request.Description != nil {
		knModel.Description = *request.Description
	}
	knModel.UpdatedAt = time.Now().UnixMilli()

	if err := k.knowledgeRepo.Update(ctx, knModel); err != nil {
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	// The row is persisted at this point. The previous implementation went on
	// to convert the model into an entity that was then discarded, so a failure
	// in that unused conversion could make a committed update report an error.
	return nil
}
// DeleteKnowledge removes a knowledge base together with its documents.
//
// Steps, in order:
//  1. load the knowledge base and all of its documents;
//  2. for table-format knowledge bases, drop each document's physical table
//     (best effort: failures are logged and do not abort the deletion);
//  3. drop the knowledge base's collection in every search store manager;
//  4. delete the knowledge base row, then the document rows.
//
// It returns ErrKnowledgeNotExistCode when the knowledge base is missing,
// ErrKnowledgeDBCode for lookup or document-deletion failures,
// ErrKnowledgeSearchStoreCode when a collection cannot be dropped, and the
// repository's own error when deleting the knowledge base row fails.
func (k *knowledgeSVC) DeleteKnowledge(ctx context.Context, request *DeleteKnowledgeRequest) error {
	// Fetch the knowledge base first; its format decides whether tables must be dropped.
	knModel, err := k.knowledgeRepo.GetByID(ctx, request.KnowledgeID)
	if err != nil {
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if knModel == nil || knModel.ID == 0 {
		return errorx.New(errno.ErrKnowledgeNotExistCode, errorx.KV("msg", "knowledge not found"))
	}

	docs, _, err := k.documentRepo.FindDocumentByCondition(ctx, &entity.WhereDocumentOpt{
		KnowledgeIDs: []int64{request.KnowledgeID},
		SelectAll:    true,
	})
	if err != nil {
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	isTableKnowledge := knModel.FormatType == int32(knowledgeModel.DocumentTypeTable)
	// Collect IDs in the same pass and skip nil entries, so the later
	// DeleteDocuments call cannot dereference a nil document.
	docIDs := make([]int64, 0, len(docs))
	for _, doc := range docs {
		if doc == nil {
			continue
		}
		docIDs = append(docIDs, doc.ID)

		if !isTableKnowledge || doc.TableInfo == nil {
			continue
		}
		resp, err := k.rdb.DropTable(ctx, &rdb.DropTableRequest{
			TableName: doc.TableInfo.PhysicalTableName,
			IfExists:  true,
		})
		if err != nil {
			logs.CtxWarnf(ctx, "[DeleteKnowledge] drop table failed, err %v", err)
			// The response is not meaningful when the call failed; reading
			// resp.Success here is what used to risk a nil dereference.
			continue
		}
		if !resp.Success {
			logs.CtxWarnf(ctx, "[DeleteKnowledge] drop table failed")
		}
	}

	collectionName := getCollectionName(request.KnowledgeID)
	for _, mgr := range k.searchStoreManagers {
		if err = mgr.Drop(ctx, &searchstore.DropRequest{CollectionName: collectionName}); err != nil {
			return errorx.New(errno.ErrKnowledgeSearchStoreCode, errorx.KV("msg", err.Error()))
		}
	}

	if err = k.knowledgeRepo.Delete(ctx, request.KnowledgeID); err != nil {
		return err
	}

	if err = k.documentRepo.DeleteDocuments(ctx, docIDs); err != nil {
		logs.CtxErrorf(ctx, "[DeleteKnowledge] delete documents failed, err %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	return nil
}
// ListKnowledge queries knowledge bases by the filters in request and converts
// each stored row into its API model.
//
// At least one of IDs, AppID or SpaceID must be supplied, otherwise
// ErrKnowledgeInvalidParamCode is returned. Repository failures are reported as
// ErrKnowledgeDBCode; conversion errors are returned unchanged. The result list
// has one slot per returned row, and a nil row leaves its slot nil.
func (k *knowledgeSVC) ListKnowledge(ctx context.Context, request *ListKnowledgeRequest) (response *ListKnowledgeResponse, err error) {
	hasScope := len(request.IDs) != 0 || request.AppID != nil || request.SpaceID != nil
	if !hasScope {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "knowledge ids, project id, space id and query can not be all empty"))
	}

	where := &entity.WhereKnowledgeOption{
		KnowledgeIDs: request.IDs,
		AppID:        request.AppID,
		SpaceID:      request.SpaceID,
		Name:         request.Name,
		Status:       request.Status,
		UserID:       request.UserID,
		Query:        request.Query,
		Page:         request.Page,
		PageSize:     request.PageSize,
		Order:        convertOrder(request.Order),
		OrderType:    convertOrderType(request.OrderType),
	}
	if request.FormatType != nil {
		where.FormatType = ptr.Of(int64(*request.FormatType))
	}

	rows, total, err := k.knowledgeRepo.FindKnowledgeByCondition(ctx, where)
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	converted := make([]*knowledgeModel.Knowledge, len(rows))
	for idx, row := range rows {
		if row == nil {
			continue
		}
		if converted[idx], err = k.fromModelKnowledge(ctx, row); err != nil {
			return nil, err
		}
	}

	response = &ListKnowledgeResponse{
		KnowledgeList: converted,
		Total:         total,
	}
	return response, nil
}
// checkRequest validates a CreateDocumentRequest and normalises it in place.
//
// It rejects an empty document list, and rejects image documents that ask for
// model-generated captions when automatic annotation is not supported. Any
// document whose chunking strategy has the default chunk type gets that
// strategy replaced by getDefaultChunkStrategy().
//
// NOTE(review): ParsingStrategy is dereferenced for image documents without a
// nil check; confirm callers always populate it.
func (k *knowledgeSVC) checkRequest(request *CreateDocumentRequest) error {
	if len(request.Documents) == 0 {
		return errors.New("document is empty")
	}

	for idx := range request.Documents {
		wantsModelCaption := request.Documents[idx].Type == knowledgeModel.DocumentTypeImage &&
			ptr.From(request.Documents[idx].ParsingStrategy.CaptionType) == parser.ImageAnnotationTypeModel
		if wantsModelCaption && !k.isAutoAnnotationSupported {
			return errors.New("auto caption type is not supported")
		}

		strategy := request.Documents[idx].ChunkingStrategy
		if strategy != nil && strategy.ChunkType == parser.ChunkTypeDefault {
			request.Documents[idx].ChunkingStrategy = getDefaultChunkStrategy()
		}
	}
	return nil
}
// CreateDocument validates the request, resolves document URLs to URIs and then
// runs the document processor pipeline: BeforeCreate, BuildDBModel,
// InsertDBModel and Indexing, in that order.
//
// Validation failures are reported as ErrKnowledgeInvalidParamCode and URL
// resolution failures as ErrKnowledgeDownloadFailedCode; errors from the
// pipeline steps are returned unchanged. The creator, space and source of the
// first document are used for the whole batch.
func (k *knowledgeSVC) CreateDocument(ctx context.Context, request *CreateDocumentRequest) (response *CreateDocumentResponse, err error) {
	if err = k.checkRequest(request); err != nil {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", err.Error()))
	}
	if err = k.documentsURL2URI(ctx, request.Documents); err != nil {
		return nil, errorx.New(errno.ErrKnowledgeDownloadFailedCode, errorx.KV("msg", err.Error()))
	}

	// checkRequest guarantees at least one document, so index 0 is safe.
	first := request.Documents[0]
	processor := impl.NewDocProcessor(ctx, &impl.DocProcessorConfig{
		UserID:         first.CreatorID,
		SpaceID:        first.SpaceID,
		DocumentSource: first.Source,
		Documents:      request.Documents,
		KnowledgeRepo:  k.knowledgeRepo,
		DocumentRepo:   k.documentRepo,
		SliceRepo:      k.sliceRepo,
		Idgen:          k.idgen,
		Producer:       k.producer,
		ParseManager:   k.parseManager,
		Storage:        k.storage,
		Rdb:            k.rdb,
	})

	// 1. Preparatory work, such as uploading to object storage.
	if err = processor.BeforeCreate(); err != nil {
		return nil, err
	}
	// 2. Build the database models.
	if err = processor.BuildDBModel(); err != nil {
		return nil, err
	}
	// 3. Insert them into the database.
	if err = processor.InsertDBModel(); err != nil {
		return nil, err
	}
	// 4. Kick off the indexing task.
	if err = processor.Indexing(); err != nil {
		return nil, err
	}

	// 5. Hand back the processed documents.
	response = &CreateDocumentResponse{Documents: processor.GetResp()}
	return response, nil
}
// UpdateDocument renames a document and, for table documents that carry table
// metadata, applies the requested column changes to the backing table and
// keeps the virtual table name in sync with the document name.
//
// It returns ErrKnowledgeInvalidParamCode for a nil request,
// ErrKnowledgeDocumentNotExistCode when the document cannot be found, and
// ErrKnowledgeDBCode when loading or saving fails. Errors from alterTableSchema
// are returned unchanged.
//
// NOTE(review): request.TableInfo is dereferenced for table documents without a
// nil check; confirm callers always populate it for table updates.
func (k *knowledgeSVC) UpdateDocument(ctx context.Context, request *UpdateDocumentRequest) error {
	if request == nil {
		return errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}

	doc, err := k.documentRepo.GetByID(ctx, request.DocumentID)
	if err != nil {
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	// Same not-found guard the other document methods apply; without it the
	// field accesses below dereference a nil document.
	if doc == nil || doc.ID == 0 {
		return errorx.New(errno.ErrKnowledgeDocumentNotExistCode, errorx.KV("msg", "document not found"))
	}

	if request.DocumentName != nil {
		doc.Name = *request.DocumentName
	}

	// For table documents the update may also change the table's metadata.
	if doc.DocumentType == int32(knowledgeModel.DocumentTypeTable) && doc.TableInfo != nil {
		finalColumns, err := k.alterTableSchema(ctx, doc.TableInfo.Columns, request.TableInfo.Columns, doc.TableInfo.PhysicalTableName)
		if err != nil {
			return err
		}
		doc.TableInfo.VirtualTableName = doc.Name
		// Only replace the stored columns when the caller actually sent some.
		if len(request.TableInfo.Columns) != 0 {
			doc.TableInfo.Columns = finalColumns
		}
	}

	doc.UpdatedAt = time.Now().UnixMilli()
	if err = k.documentRepo.Update(ctx, doc); err != nil {
		logs.CtxErrorf(ctx, "[UpdateDocument] update document failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	return nil
}
// DeleteDocument removes a single document and emits the event that cleans up
// its indexed slices.
//
// A document that does not exist is treated as already deleted: a warning is
// logged and nil is returned. For table documents with table metadata the
// physical table is dropped first. It returns ErrKnowledgeInvalidParamCode for
// a nil request, ErrKnowledgeDBCode for repository failures,
// ErrKnowledgeCrossDomainCode when the table cannot be dropped, and the
// unchanged error from emitDeleteKnowledgeDataEvent.
func (k *knowledgeSVC) DeleteDocument(ctx context.Context, request *DeleteDocumentRequest) error {
	if request == nil {
		return errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}

	doc, err := k.documentRepo.GetByID(ctx, request.DocumentID)
	if err != nil {
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if doc == nil || doc.ID == 0 {
		logs.CtxWarnf(ctx, "[DeleteDocument] document not found, doc_id: %d", request.DocumentID)
		return nil
	}

	// Read the slice IDs before anything is destroyed. Whether DeleteDocuments
	// also removes slice rows is not visible from here; reading first yields
	// the right IDs either way, and a failed lookup no longer happens after the
	// table and document are already gone.
	sliceIDs, err := k.sliceRepo.GetDocumentSliceIDs(ctx, []int64{request.DocumentID})
	if err != nil {
		logs.CtxErrorf(ctx, "[DeleteDocument] get document slice ids failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	if doc.DocumentType == int32(knowledgeModel.DocumentTypeTable) && doc.TableInfo != nil {
		resp, err := k.rdb.DropTable(ctx, &rdb.DropTableRequest{
			TableName: doc.TableInfo.PhysicalTableName,
			IfExists:  true,
		})
		if err != nil {
			logs.CtxWarnf(ctx, "[DeleteDocument] drop table failed, err: %v", err)
			return errorx.New(errno.ErrKnowledgeCrossDomainCode, errorx.KV("msg", err.Error()))
		}
		if !resp.Success {
			logs.CtxWarnf(ctx, "[DeleteDocument] drop table failed")
			return errorx.New(errno.ErrKnowledgeCrossDomainCode, errorx.KV("msg", "drop table failed"))
		}
	}

	if err = k.documentRepo.DeleteDocuments(ctx, []int64{request.DocumentID}); err != nil {
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	return k.emitDeleteKnowledgeDataEvent(ctx, doc.KnowledgeID, sliceIDs, strconv.FormatInt(request.DocumentID, 10))
}
// ListDocument returns the non-deleted documents matching request, converted
// to entities, along with the total count and pagination hints.
//
// HasMore is set when the returned page plus the requested offset is still
// short of the total; in that case NextCursor is the ID of the last returned
// document, formatted as a decimal string. A nil request yields
// ErrKnowledgeInvalidParamCode and repository failures yield
// ErrKnowledgeDBCode; conversion errors are returned unchanged.
func (k *knowledgeSVC) ListDocument(ctx context.Context, request *ListDocumentRequest) (response *ListDocumentResponse, err error) {
	if request == nil {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}

	filter := &entity.WhereDocumentOpt{
		StatusNotIn: []int32{int32(entity.DocumentStatusDeleted)},
		// Nil pointers simply leave the corresponding filter unset.
		Offset: request.Offset,
		Cursor: request.Cursor,
	}
	if request.Limit != nil {
		filter.Limit = ptr.From(request.Limit)
	}
	if len(request.DocumentIDs) > 0 {
		filter.IDs = request.DocumentIDs
	}
	if request.KnowledgeID != 0 {
		filter.KnowledgeIDs = []int64{request.KnowledgeID}
	}
	if request.SelectAll {
		filter.SelectAll = true
	}

	rows, total, err := k.documentRepo.FindDocumentByCondition(ctx, filter)
	if err != nil {
		logs.CtxErrorf(ctx, "list document failed, err: %v", err)
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	result := &ListDocumentResponse{Total: total}

	fetched := len(rows)
	if fetched+ptr.From(filter.Offset) < int(total) {
		result.HasMore = true
		if fetched > 0 {
			cursor := strconv.FormatInt(rows[fetched-1].ID, 10)
			result.NextCursor = &cursor
		}
	}

	result.Documents = make([]*entity.Document, 0, fetched)
	for _, row := range rows {
		converted, err := k.fromModelDocument(ctx, row)
		if err != nil {
			return nil, err
		}
		result.Documents = append(result.Documents, converted)
	}
	return result, nil
}
// MGetDocumentProgress reports processing progress for the requested documents.
//
// For each document:
//   - image documents with a URI get a resolved object URL;
//   - documents in the enabled or failed status are reported as done;
//   - otherwise a non-empty FailReason marks the entry failed with that reason;
//   - otherwise progress is read from the cache.
//
// It returns ErrKnowledgeInvalidParamCode for a nil request,
// ErrKnowledgeDBCode when the documents cannot be loaded,
// ErrKnowledgeGetObjectURLFailCode when a URL cannot be resolved, and
// ErrKnowledgeGetDocProgressFailCode when the cache lookup fails.
func (k *knowledgeSVC) MGetDocumentProgress(ctx context.Context, request *MGetDocumentProgressRequest) (response *MGetDocumentProgressResponse, err error) {
	if request == nil {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}

	documents, err := k.documentRepo.MGetByID(ctx, request.DocumentIDs)
	if err != nil {
		logs.CtxErrorf(ctx, "mget document failed, err: %v", err)
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	progressList := make([]*DocumentProgress, 0, len(documents))
	for _, doc := range documents {
		status := entity.DocumentStatus(doc.Status)
		entry := &DocumentProgress{
			ID:            doc.ID,
			Name:          doc.Name,
			Size:          doc.Size,
			FileExtension: doc.FileExtension,
			Status:        status,
			StatusMsg:     status.String(),
		}

		isImage := doc.DocumentType == int32(knowledge.DocumentTypeImage)
		if isImage && len(doc.URI) != 0 {
			if entry.URL, err = k.storage.GetObjectUrl(ctx, doc.URI); err != nil {
				logs.CtxErrorf(ctx, "get object url failed, err: %v", err)
				return nil, errorx.New(errno.ErrKnowledgeGetObjectURLFailCode, errorx.KV("msg", err.Error()))
			}
		}

		switch {
		case doc.Status == int32(entity.DocumentStatusEnable) || doc.Status == int32(entity.DocumentStatusFailed):
			// Terminal statuses are always reported as finished.
			entry.Progress = progressbar.ProcessDone
		case doc.FailReason != "":
			// A recorded failure reason wins over whatever the cache holds.
			entry.StatusMsg = doc.FailReason
			entry.Status = entity.DocumentStatusFailed
		default:
			if err = k.getProgressFromCache(ctx, entry); err != nil {
				logs.CtxErrorf(ctx, "get progress from cache failed, err: %v", err)
				return nil, errorx.New(errno.ErrKnowledgeGetDocProgressFailCode, errorx.KV("msg", err.Error()))
			}
		}

		progressList = append(progressList, entry)
	}

	response = &MGetDocumentProgressResponse{ProgressList: progressList}
	return response, nil
}
// getProgressFromCache fills documentProgress with the percentage and remaining
// seconds reported by the document's progress bar. When the progress bar
// carries an error message, the entry is additionally marked failed with that
// message. The returned error is always nil.
func (k *knowledgeSVC) getProgressFromCache(ctx context.Context, documentProgress *DocumentProgress) (err error) {
	bar := progressbar.NewProgressBar(ctx, documentProgress.ID, 0, k.cacheCli, false)
	percent, remainSec, errMsg := bar.GetProgress(ctx)

	documentProgress.Progress = int(percent)
	documentProgress.RemainingSec = int64(remainSec)

	if len(errMsg) == 0 {
		return nil
	}
	documentProgress.Status = entity.DocumentStatusFailed
	documentProgress.StatusMsg = errMsg
	return nil
}
// ResegmentDocument re-runs indexing for an existing document with the chunking
// and parsing strategies supplied in request.
//
// A chunking strategy with the default chunk type is replaced by
// getDefaultChunkStrategy(). The new strategies are stored on the document,
// its status is set to chunking, and an index-document event is sent keyed by
// the knowledge ID. The returned document reflects the chunking status.
//
// Errors: ErrKnowledgeInvalidParamCode (nil request), ErrKnowledgeDBCode
// (load/save), ErrKnowledgeDocumentNotExistCode (missing document),
// ErrKnowledgeParseJSONCode (event marshalling), ErrKnowledgeMQSendFailCode
// (event send); conversion errors are returned unchanged.
//
// NOTE(review): doc.ParseRule is written through without a nil check; confirm
// stored documents always carry a parse rule.
func (k *knowledgeSVC) ResegmentDocument(ctx context.Context, request *ResegmentDocumentRequest) (response *ResegmentDocumentResponse, err error) {
	if request == nil {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}
	if strategy := request.ChunkingStrategy; strategy != nil && strategy.ChunkType == parser.ChunkTypeDefault {
		request.ChunkingStrategy = getDefaultChunkStrategy()
	}

	docModel, err := k.documentRepo.GetByID(ctx, request.DocumentID)
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if docModel == nil || docModel.ID == 0 {
		return nil, errorx.New(errno.ErrKnowledgeDocumentNotExistCode, errorx.KV("msg", "document not found"))
	}

	docEntity, err := k.fromModelDocument(ctx, docModel)
	if err != nil {
		return nil, err
	}
	docEntity.ChunkingStrategy = request.ChunkingStrategy
	docEntity.ParsingStrategy = request.ParsingStrategy

	// Serialise the event before touching the database so a marshalling
	// failure leaves the stored document untouched.
	payload, err := sonic.Marshal(events.NewIndexDocumentEvent(docEntity.KnowledgeID, docEntity))
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeParseJSONCode, errorx.KV("msg", err.Error()))
	}

	docModel.ParseRule.ChunkingStrategy = request.ChunkingStrategy
	docModel.ParseRule.ParsingStrategy = request.ParsingStrategy
	docModel.Status = int32(entity.DocumentStatusChunking)
	if err = k.documentRepo.Update(ctx, docModel); err != nil {
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	shardingKey := strconv.FormatInt(docEntity.KnowledgeID, 10)
	if err = k.producer.Send(ctx, payload, eventbus.WithShardingKey(shardingKey)); err != nil {
		return nil, errorx.New(errno.ErrKnowledgeMQSendFailCode, errorx.KV("msg", err.Error()))
	}

	docEntity.Status = entity.DocumentStatusChunking
	response = &ResegmentDocumentResponse{Document: docEntity}
	return response, nil
}
// CreateSlice inserts a new slice into a document at request.Position and
// schedules it for indexing.
//
// Flow: load the document, work out the new slice's Sequence from the slices
// returned by GetSliceBySequence, write the row to the document's table (table
// documents only), create the slice row, send an index-slice event keyed by the
// document ID, and finally refresh the document's slice info.
//
// Errors: ErrKnowledgeInvalidParamCode (nil request), ErrKnowledgeDBCode
// (repository failures), ErrKnowledgeDocumentNotExistCode (missing document),
// ErrKnowledgeIDGenCode (ID generation), ErrKnowledgeSliceInsertPositionIllegalCode
// (non-zero position with no neighbouring slices), ErrKnowledgeParseJSONCode
// (event marshalling), ErrKnowledgeMQSendFailCode (event send). Errors from
// fromModelDocument and upsertDataToTable are returned unchanged.
func (k *knowledgeSVC) CreateSlice(ctx context.Context, request *CreateSliceRequest) (response *CreateSliceResponse, err error) {
if request == nil {
return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
}
docInfo, err := k.documentRepo.GetByID(ctx, request.DocumentID)
if err != nil {
logs.CtxErrorf(ctx, "find document failed, err: %v", err)
return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
}
if docInfo == nil || docInfo.ID == 0 {
return nil, errorx.New(errno.ErrKnowledgeDocumentNotExistCode, errorx.KV("msg", "document not found"))
}
// Table documents ignore the requested position: it is overwritten with the
// current slice total plus one.
if docInfo.DocumentType == int32(knowledgeModel.DocumentTypeTable) {
_, total, err := k.sliceRepo.FindSliceByCondition(ctx, &entity.WhereSliceOpt{
DocumentID: docInfo.ID,
})
if err != nil {
logs.CtxErrorf(ctx, "FindSliceByCondition err:%v", err)
return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
}
request.Position = total + 1
}
// NOTE(review): the branches below handle 0, 1 or 2 returned slices, which are
// presumably the neighbours around the insert position — confirm against
// GetSliceBySequence. This local also shadows the imported slices package.
slices, err := k.sliceRepo.GetSliceBySequence(ctx, request.DocumentID, request.Position)
if err != nil {
logs.CtxErrorf(ctx, "get slice by sequence failed, err: %v", err)
return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
}
now := time.Now().UnixMilli()
id, err := k.idgen.GenID(ctx)
if err != nil {
logs.CtxErrorf(ctx, "gen id failed, err: %v", err)
return nil, errorx.New(errno.ErrKnowledgeIDGenCode)
}
sliceInfo := model.KnowledgeDocumentSlice{
ID: id,
KnowledgeID: docInfo.KnowledgeID,
DocumentID: docInfo.ID,
CreatedAt: now,
UpdatedAt: now,
CreatorID: request.CreatorID,
SpaceID: docInfo.SpaceID,
Status: int32(knowledgeModel.SliceStatusInit),
}
// No slices returned: only position 0 is accepted, and the new slice gets
// sequence 1.
if len(slices) == 0 {
if request.Position == 0 {
request.Position = 1
sliceInfo.Sequence = 1
} else {
return nil, errorx.New(errno.ErrKnowledgeSliceInsertPositionIllegalCode)
}
}
// One slice returned: position 0 or 1 goes one below its sequence, any other
// position goes one above it.
if len(slices) == 1 {
if request.Position == 1 || request.Position == 0 {
// insert at the very front
sliceInfo.Sequence = slices[0].Sequence - 1
} else {
sliceInfo.Sequence = slices[0].Sequence + 1
}
}
// Two slices returned: position 0 or 1 goes one below the first; otherwise
// the new sequence falls between the two, using a whole number when the gap
// is wider than 1 and the midpoint when it is not.
if len(slices) == 2 {
if request.Position == 0 || request.Position == 1 {
sliceInfo.Sequence = slices[0].Sequence - 1
} else {
if slices[0].Sequence+1 < slices[1].Sequence {
sliceInfo.Sequence = float64(int(slices[0].Sequence) + 1)
} else {
sliceInfo.Sequence = (slices[0].Sequence + slices[1].Sequence) / 2
}
}
}
sliceEntity := entity.Slice{
Info: knowledgeModel.Info{
ID: id,
CreatorID: request.CreatorID,
},
DocumentID: request.DocumentID,
RawContent: request.RawContent,
}
docEntity, err := k.fromModelDocument(ctx, docInfo)
if err != nil {
logs.CtxErrorf(ctx, "fromModelDocument failed, err: %v", err)
return nil, err
}
indexSliceEvent := events.NewIndexSliceEvent(&sliceEntity, docEntity)
// Only text and table documents store the slice content on the row.
if docInfo.DocumentType == int32(knowledgeModel.DocumentTypeText) ||
docInfo.DocumentType == int32(knowledgeModel.DocumentTypeTable) {
sliceInfo.Content = sliceEntity.GetSliceContent()
}
// Table documents also write the row to the document's table, before the
// slice row is created.
if docInfo.DocumentType == int32(knowledgeModel.DocumentTypeTable) {
sliceEntity.ID = sliceInfo.ID
err = k.upsertDataToTable(ctx, docInfo.TableInfo, []*entity.Slice{&sliceEntity})
if err != nil {
logs.CtxErrorf(ctx, "insert data to table failed, err: %v", err)
return nil, err
}
}
err = k.sliceRepo.Create(ctx, &sliceInfo)
if err != nil {
logs.CtxErrorf(ctx, "create slice failed, err: %v", err)
return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
}
body, err := sonic.Marshal(&indexSliceEvent)
if err != nil {
logs.CtxErrorf(ctx, "marshal event failed, err: %v", err)
return nil, errorx.New(errno.ErrKnowledgeParseJSONCode, errorx.KV("msg", err.Error()))
}
// The event is keyed by document ID.
if err = k.producer.Send(ctx, body, eventbus.WithShardingKey(strconv.FormatInt(sliceInfo.DocumentID, 10))); err != nil {
logs.CtxErrorf(ctx, "send message failed, err: %v", err)
return nil, errorx.New(errno.ErrKnowledgeMQSendFailCode, errorx.KV("msg", err.Error()))
}
if err = k.documentRepo.UpdateDocumentSliceInfo(ctx, docInfo.ID); err != nil {
logs.CtxErrorf(ctx, "update document slice info failed, err: %v", err)
return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
}
return &CreateSliceResponse{
SliceID: id,
}, nil
}
// UpdateSlice replaces the content of an existing slice and schedules it for
// re-indexing.
//
// The stored content is derived from request.RawContent: text and table
// documents use the slice entity's GetSliceContent(), image documents use the
// text of the first raw-content item. The slice is reset to the init status,
// table documents additionally have the row upserted into the document's table,
// an index-slice event is sent keyed by the document ID, and the document's
// slice info is refreshed.
//
// Errors: ErrKnowledgeInvalidParamCode (nil request, or an image update with no
// raw content), ErrKnowledgeDBCode (repository failures),
// ErrKnowledgeSliceNotExistCode (slice lookup did not return exactly one row),
// ErrKnowledgeDocumentNotExistCode (missing document),
// ErrKnowledgeParseJSONCode (event marshalling), ErrKnowledgeMQSendFailCode
// (event send). Errors from fromModelDocument and upsertDataToTable are
// returned unchanged.
func (k *knowledgeSVC) UpdateSlice(ctx context.Context, request *UpdateSliceRequest) error {
	if request == nil {
		return errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}

	sliceInfo, err := k.sliceRepo.MGetSlices(ctx, []int64{request.SliceID})
	if err != nil {
		logs.CtxErrorf(ctx, "mget slice failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if len(sliceInfo) != 1 {
		return errorx.New(errno.ErrKnowledgeSliceNotExistCode)
	}
	target := sliceInfo[0]

	docInfo, err := k.documentRepo.GetByID(ctx, request.DocumentID)
	if err != nil {
		logs.CtxErrorf(ctx, "find document failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if docInfo == nil || docInfo.ID == 0 {
		return errorx.New(errno.ErrKnowledgeDocumentNotExistCode)
	}

	// Work out the content that will be stored on the slice row.
	if docInfo.DocumentType == int32(knowledgeModel.DocumentTypeText) ||
		docInfo.DocumentType == int32(knowledgeModel.DocumentTypeTable) {
		sliceEntity := entity.Slice{RawContent: request.RawContent}
		target.Content = sliceEntity.GetSliceContent()
	}
	if docInfo.DocumentType == int32(knowledgeModel.DocumentTypeImage) {
		// Image slices take their text from the first raw-content item; reject
		// an empty list instead of panicking on the index below.
		if len(request.RawContent) == 0 {
			return errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "raw content is empty"))
		}
		target.Content = ptr.From(request.RawContent[0].Text)
	}

	docEntity, err := k.fromModelDocument(ctx, docInfo)
	if err != nil {
		logs.CtxErrorf(ctx, "fromModelDocument failed, err: %v", err)
		return err
	}

	target.UpdatedAt = time.Now().UnixMilli()
	// Resetting the status marks the slice as not yet indexed.
	target.Status = int32(knowledgeModel.SliceStatusInit)

	indexSliceEvent := events.NewIndexSliceEvent(&entity.Slice{
		Info: knowledgeModel.Info{
			ID: target.ID,
		},
		KnowledgeID: target.KnowledgeID,
		DocumentID:  target.DocumentID,
		RawContent:  request.RawContent,
	}, docEntity)

	if docInfo.DocumentType == int32(knowledgeModel.DocumentTypeTable) {
		indexSliceEvent.Slice.ID = target.ID
		if err = k.upsertDataToTable(ctx, docInfo.TableInfo, []*entity.Slice{indexSliceEvent.Slice}); err != nil {
			logs.CtxErrorf(ctx, "upsert data to table failed, err: %v", err)
			return err
		}
	}

	if err = k.sliceRepo.Update(ctx, target); err != nil {
		logs.CtxErrorf(ctx, "update slice failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	body, err := sonic.Marshal(&indexSliceEvent)
	if err != nil {
		logs.CtxErrorf(ctx, "marshal event failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeParseJSONCode, errorx.KV("msg", err.Error()))
	}
	if err = k.producer.Send(ctx, body, eventbus.WithShardingKey(strconv.FormatInt(target.DocumentID, 10))); err != nil {
		logs.CtxErrorf(ctx, "send message failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeMQSendFailCode, errorx.KV("msg", err.Error()))
	}

	if err = k.documentRepo.UpdateDocumentSliceInfo(ctx, docInfo.ID); err != nil {
		logs.CtxErrorf(ctx, "update document slice info failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	return nil
}
// DeleteSlice removes a slice, its row in the document's table (table
// documents only) and emits the event that cleans up its indexed data.
//
// A slice lookup that does not return exactly one row is treated as already
// deleted: a warning is logged and nil is returned.
//
// Errors: ErrKnowledgeInvalidParamCode (nil request), ErrKnowledgeDBCode
// (repository failures), ErrKnowledgeDocumentNotExistCode (owning document
// missing), ErrKnowledgeCrossDomainCode (table row deletion failed); the error
// from emitDeleteKnowledgeDataEvent is returned unchanged.
func (k *knowledgeSVC) DeleteSlice(ctx context.Context, request *DeleteSliceRequest) error {
	if request == nil {
		return errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}

	sliceInfo, err := k.sliceRepo.MGetSlices(ctx, []int64{request.SliceID})
	if err != nil {
		logs.CtxErrorf(ctx, "mget slice failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if len(sliceInfo) != 1 {
		logs.CtxWarnf(ctx, "slice not found, slice_id: %d", request.SliceID)
		return nil
	}
	target := sliceInfo[0]

	docInfo, err := k.documentRepo.GetByID(ctx, target.DocumentID)
	if err != nil {
		logs.CtxErrorf(ctx, "find document failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if docInfo == nil || docInfo.ID == 0 {
		return errorx.New(errno.ErrKnowledgeDocumentNotExistCode)
	}

	// Table documents keep the row in a physical table as well. TableInfo is
	// checked for nil the same way DeleteDocument does, so a table document
	// without metadata skips this step instead of panicking.
	if docInfo.DocumentType == int32(knowledgeModel.DocumentTypeTable) && docInfo.TableInfo != nil {
		_, err := k.rdb.DeleteData(ctx, &rdb.DeleteDataRequest{
			TableName: docInfo.TableInfo.PhysicalTableName,
			Where: &rdb.ComplexCondition{
				Conditions: []*rdb.Condition{
					{
						Field:    consts.RDBFieldID,
						Operator: rdbEntity.OperatorEqual,
						Value:    request.SliceID,
					},
				},
			},
		})
		if err != nil {
			logs.CtxErrorf(ctx, "delete data failed, err: %v", err)
			return errorx.New(errno.ErrKnowledgeCrossDomainCode, errorx.KV("msg", err.Error()))
		}
	}

	// Remove the slice row itself.
	if err = k.sliceRepo.Delete(ctx, &model.KnowledgeDocumentSlice{ID: request.SliceID}); err != nil {
		logs.CtxErrorf(ctx, "delete slice failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	if err = k.emitDeleteKnowledgeDataEvent(ctx, target.KnowledgeID, []int64{request.SliceID}, strconv.FormatInt(target.DocumentID, 10)); err != nil {
		return err
	}

	if err = k.documentRepo.UpdateDocumentSliceInfo(ctx, docInfo.ID); err != nil {
		logs.CtxErrorf(ctx, "update document slice info failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	return nil
}
// ListSlice returns one page of a document's slices.
//
// A document in the deleted status yields an empty response. For table
// documents the raw row data is fetched via selectTableData and attached to the
// matching slices. Each returned slice's Sequence is rewritten to
// request.Sequence + 1 + its index in the page. HasMore is true when the total
// exceeds request.Sequence + request.Limit.
//
// Errors: ErrKnowledgeInvalidParamCode (nil request or nil DocumentID),
// ErrKnowledgeDBCode (repository failures), ErrKnowledgeDocumentNotExistCode
// (document not found); the error from selectTableData is returned unchanged.
func (k *knowledgeSVC) ListSlice(ctx context.Context, request *ListSliceRequest) (response *ListSliceResponse, err error) {
	if request == nil {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}
	if request.DocumentID == nil {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "document_id is empty"))
	}

	doc, err := k.documentRepo.GetByID(ctx, ptr.From(request.DocumentID))
	if err != nil {
		logs.CtxErrorf(ctx, "get document failed, err: %v", err)
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	// Same not-found guard the other slice and document methods apply; without
	// it the status check below dereferences a nil document.
	if doc == nil || doc.ID == 0 {
		return nil, errorx.New(errno.ErrKnowledgeDocumentNotExistCode, errorx.KV("msg", "document not found"))
	}

	resp := ListSliceResponse{}
	if doc.Status == int32(entity.DocumentStatusDeleted) {
		return &resp, nil
	}

	sliceModels, total, err := k.sliceRepo.FindSliceByCondition(ctx, &entity.WhereSliceOpt{
		KnowledgeID: ptr.From(request.KnowledgeID),
		DocumentID:  ptr.From(request.DocumentID),
		Keyword:     request.Keyword,
		Sequence:    request.Sequence,
		PageSize:    request.Limit,
		Offset:      request.Offset,
	})
	if err != nil {
		logs.CtxErrorf(ctx, "list slice failed, err: %v", err)
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	resp.HasMore = total > (request.Sequence + request.Limit)
	resp.Total = int(total)

	// For table documents, fetch the original row data from the table. For
	// other types the map stays nil, and reading from a nil map is safe.
	var sliceMap map[int64]*entity.Slice
	if doc.DocumentType == int32(knowledgeModel.DocumentTypeTable) {
		sliceMap, err = k.selectTableData(ctx, doc.TableInfo, sliceModels)
		if err != nil {
			logs.CtxErrorf(ctx, "select table data failed, err: %v", err)
			return nil, err
		}
	}

	resp.Slices = make([]*entity.Slice, 0, len(sliceModels))
	for i := range sliceModels {
		item := k.fromModelSlice(ctx, sliceModels[i])
		if row := sliceMap[sliceModels[i].ID]; row != nil {
			item.RawContent = row.RawContent
		}
		item.Sequence = request.Sequence + 1 + int64(i)
		resp.Slices = append(resp.Slices, item)
	}
	return &resp, nil
}
// GetSlice loads a single slice by request.SliceID.
//
// It returns an invalid-param error for a nil request, a DB error when the
// repository lookup fails, and a slice-not-exist error when the lookup
// returns no rows.
func (k *knowledgeSVC) GetSlice(ctx context.Context, request *GetSliceRequest) (response *GetSliceResponse, err error) {
	// Reject a nil request up front instead of panicking on request.SliceID.
	if request == nil {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}
	found, err := k.sliceRepo.MGetSlices(ctx, []int64{request.SliceID})
	if err != nil {
		logs.CtxErrorf(ctx, "mget slice failed, err: %v", err)
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if len(found) == 0 {
		return nil, errorx.New(errno.ErrKnowledgeSliceNotExistCode)
	}
	return &GetSliceResponse{
		Slice: k.fromModelSlice(ctx, found[0]),
	}, nil
}
// getDefaultChunkStrategy builds a fresh chunking strategy populated with the
// package defaults from consts and the default chunk type.
func getDefaultChunkStrategy() *entity.ChunkingStrategy {
	strategy := new(entity.ChunkingStrategy)
	strategy.ChunkType = parser.ChunkTypeDefault
	strategy.ChunkSize = consts.DefaultChunkSize
	strategy.Separator = consts.DefaultSeparator
	strategy.Overlap = consts.DefaultOverlap
	strategy.TrimSpace = consts.DefaultTrimSpace
	strategy.TrimURLAndEmail = consts.DefaultTrimURLAndEmail
	return strategy
}
// CreateDocumentReview creates one review record per entry of request.Reviews
// and publishes a document-review event for each of them.
//
// Steps, in order:
//  1. Validate the request, resolve the caller UID from ctx and load the
//     knowledge base.
//  2. For entries that reference an existing document (DocumentID > 0 and
//     found in the repository), take name, URI and file type from the stored
//     document instead of the request. The file type is the text after the
//     last "." of the last "/"-separated URI segment.
//  3. Resolve an object URL for every review, generate the review IDs and
//     persist the review rows in one batch.
//  4. Send one review event per review through the producer.
//
// When request.ChunkStrategy carries the default chunk type it is replaced by
// getDefaultChunkStrategy(), so the request is mutated.
func (k *knowledgeSVC) CreateDocumentReview(ctx context.Context, request *CreateDocumentReviewRequest) (response *CreateDocumentReviewResponse, err error) {
	if request == nil {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}
	if request.ChunkStrategy != nil && request.ChunkStrategy.ChunkType == parser.ChunkTypeDefault {
		request.ChunkStrategy = getDefaultChunkStrategy()
	}
	uid := ctxutil.GetUIDFromCtx(ctx)
	if uid == nil {
		return nil, errorx.New(errno.ErrKnowledgePermissionCode, errorx.KV("msg", "session required"))
	}
	kn, err := k.knowledgeRepo.GetByID(ctx, request.KnowledgeID)
	if err != nil {
		logs.CtxErrorf(ctx, "get knowledge failed, err: %v", err)
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if kn == nil {
		return nil, errorx.New(errno.ErrKnowledgeNotExistCode)
	}

	// Collect the IDs of already-stored documents so they can be loaded in one call.
	documentIDs := make([]int64, 0, len(request.Reviews))
	documentMap := make(map[int64]*model.KnowledgeDocument)
	for _, input := range request.Reviews {
		if input.DocumentID != nil && *input.DocumentID > 0 {
			documentIDs = append(documentIDs, *input.DocumentID)
		}
	}
	if len(documentIDs) > 0 {
		documents, err := k.documentRepo.MGetByID(ctx, documentIDs)
		if err != nil {
			return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
		}
		for _, document := range documents {
			documentMap[document.ID] = document
		}
	}

	reviews := make([]*entity.Review, 0, len(request.Reviews))
	for _, input := range request.Reviews {
		review := &entity.Review{
			DocumentName: input.DocumentName,
			DocumentType: input.DocumentType,
			Uri:          input.TosUri,
		}
		if input.DocumentID != nil && *input.DocumentID > 0 {
			// A stored document overrides the name/type/URI supplied in the request.
			if document, ok := documentMap[*input.DocumentID]; ok {
				review.DocumentName = document.Name
				names := strings.Split(document.URI, "/")
				objectName := strings.Split(names[len(names)-1], ".")
				review.DocumentType = objectName[len(objectName)-1]
				review.Uri = document.URI
			}
		}
		review.Url, err = k.storage.GetObjectUrl(ctx, review.Uri)
		if err != nil {
			logs.CtxErrorf(ctx, "get object url failed, err: %v", err)
			return nil, errorx.New(errno.ErrKnowledgeGetObjectURLFailCode, errorx.KV("msg", err.Error()))
		}
		reviews = append(reviews, review)
	}

	// STEP 1. Generate the review IDs.
	reviewIDs, err := k.genMultiIDs(ctx, len(reviews))
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeIDGenCode)
	}
	// Indexing reviewIDs below would panic if the generator returned fewer IDs
	// than requested, so fail explicitly instead.
	if len(reviewIDs) != len(reviews) {
		return nil, errorx.New(errno.ErrKnowledgeIDGenCode, errorx.KV("msg", "generated id count mismatch"))
	}
	for i := range reviews {
		reviews[i].ReviewID = ptr.Of(reviewIDs[i])
	}

	modelReviews := make([]*model.KnowledgeDocumentReview, 0, len(reviews))
	for _, review := range reviews {
		modelReviews = append(modelReviews, &model.KnowledgeDocumentReview{
			ID:          *review.ReviewID,
			KnowledgeID: request.KnowledgeID,
			SpaceID:     kn.SpaceID,
			Name:        review.DocumentName,
			Type:        review.DocumentType,
			URI:         review.Uri,
			CreatorID:   *uid,
		})
	}
	err = k.reviewRepo.CreateInBatches(ctx, modelReviews)
	if err != nil {
		logs.CtxErrorf(ctx, "create review failed, err: %v", err)
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	// Publish one review event per persisted review.
	for i := range reviews {
		review := reviews[i]
		doc := &entity.Document{
			KnowledgeID:      request.KnowledgeID,
			ParsingStrategy:  request.ParsingStrategy,
			ChunkingStrategy: request.ChunkStrategy,
			Type:             knowledgeModel.DocumentTypeText,
			URI:              review.Uri,
			FileExtension:    parser.FileExtension(review.DocumentType),
			Info: knowledgeModel.Info{
				Name:      review.DocumentName,
				CreatorID: *uid,
			},
			Source: entity.DocumentSourceLocal,
		}
		reviewEvent := events.NewDocumentReviewEvent(doc, review)
		body, err := sonic.Marshal(&reviewEvent)
		if err != nil {
			logs.CtxErrorf(ctx, "marshal event failed, err: %v", err)
			return nil, errorx.New(errno.ErrKnowledgeParseJSONCode, errorx.KV("msg", err.Error()))
		}
		err = k.producer.Send(ctx, body)
		if err != nil {
			logs.CtxErrorf(ctx, "send message failed, err: %v", err)
			return nil, errorx.New(errno.ErrKnowledgeMQSendFailCode, errorx.KV("msg", err.Error()))
		}
	}
	return &CreateDocumentReviewResponse{
		Reviews: reviews,
	}, nil
}
// MGetDocumentReview loads the reviews identified by request.ReviewIDs and
// converts them to entities with resolved object URLs.
//
// Every loaded review must belong to request.KnowledgeID; otherwise an
// invalid-param error is returned before any URL is resolved. The source
// file URL (from URI) is exposed as both Url and PreviewTosUrl, and the chunk
// result URL (from ChunkRespURI) as DocTreeTosUrl; an empty URI yields an
// empty URL.
func (k *knowledgeSVC) MGetDocumentReview(ctx context.Context, request *MGetDocumentReviewRequest) (response *MGetDocumentReviewResponse, err error) {
	// Reject a nil request up front instead of panicking on request.ReviewIDs.
	if request == nil {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}
	reviews, err := k.reviewRepo.MGetByIDs(ctx, request.ReviewIDs)
	if err != nil {
		logs.CtxErrorf(ctx, "mget review failed, err: %v", err)
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	for _, review := range reviews {
		if review.KnowledgeID != request.KnowledgeID {
			return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "knowledge_id and doc not match"))
		}
	}
	reviewEntity := make([]*entity.Review, 0, len(reviews))
	for _, review := range reviews {
		status := entity.ReviewStatus(review.Status)
		var reviewTosURL, reviewChunkRespTosURL string
		if review.URI != "" {
			reviewTosURL, err = k.getObjectURL(ctx, review.URI)
			if err != nil {
				logs.CtxErrorf(ctx, "get object url failed, err: %v", err)
				return nil, errorx.New(errno.ErrKnowledgeGetObjectURLFailCode, errorx.KV("msg", err.Error()))
			}
		}
		if review.ChunkRespURI != "" {
			reviewChunkRespTosURL, err = k.getObjectURL(ctx, review.ChunkRespURI)
			if err != nil {
				logs.CtxErrorf(ctx, "get object url failed, err: %v", err)
				return nil, errorx.New(errno.ErrKnowledgeGetObjectURLFailCode, errorx.KV("msg", err.Error()))
			}
		}
		reviewEntity = append(reviewEntity, &entity.Review{
			// Copy the ID so the entity does not alias the repository model
			// (or the loop variable).
			ReviewID:      ptr.Of(review.ID),
			DocumentName:  review.Name,
			DocumentType:  review.Type,
			Url:           reviewTosURL,
			Status:        &status,
			DocTreeTosUrl: ptr.Of(reviewChunkRespTosURL),
			PreviewTosUrl: ptr.Of(reviewTosURL),
		})
	}
	return &MGetDocumentReviewResponse{
		Reviews: reviewEntity,
	}, nil
}
// SaveDocumentReview stores request.DocTreeJson as a new object and points
// the review's chunk_resp_uri column at it.
//
// The write only happens when the review is in the enabled status and already
// has a non-empty ChunkRespURI; in every other state the call is a no-op that
// returns nil.
func (k *knowledgeSVC) SaveDocumentReview(ctx context.Context, request *SaveDocumentReviewRequest) error {
	if request == nil {
		return errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}

	review, err := k.reviewRepo.GetByID(ctx, request.ReviewID)
	if err != nil {
		logs.CtxErrorf(ctx, "get review failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	// Nothing to overwrite unless the review is enabled and has a chunk result.
	if review.Status != int32(entity.ReviewStatus_Enable) || len(review.ChunkRespURI) == 0 {
		return nil
	}

	// The object key combines creator, current time in milliseconds and review ID.
	objectKey := fmt.Sprintf("DocReview/%d_%d_%d.txt", review.CreatorID, time.Now().UnixMilli(), review.ID)
	if err = k.storage.PutObject(ctx, objectKey, []byte(request.DocTreeJson)); err != nil {
		logs.CtxErrorf(ctx, "put object failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgePutObjectFailCode, errorx.KV("msg", err.Error()))
	}

	updates := map[string]interface{}{
		"chunk_resp_uri": objectKey,
	}
	if err = k.reviewRepo.UpdateReview(ctx, review.ID, updates); err != nil {
		logs.CtxErrorf(ctx, "update review chunk uri failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	return nil
}
// documentsURL2URI downloads every document that has a URL but no URI,
// uploads the bytes to object storage under a freshly generated key and
// writes that key back to doc.URI.
//
// Documents that already have a URI, have no URL, or are nil are skipped.
// Processing stops at the first download or upload failure and that error is
// returned wrapped; documents handled before the failure keep their new URI.
func (k *knowledgeSVC) documentsURL2URI(ctx context.Context, docs []*entity.Document) error {
	// download fetches the whole body of url. The request is bound to ctx so
	// it is aborted when the caller cancels or its deadline expires.
	download := func(url string) ([]byte, error) {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
		if err != nil {
			return nil, fmt.Errorf("build request failed, %w", err)
		}
		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			return nil, fmt.Errorf("http get failed, %w", err)
		}
		defer resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			return nil, fmt.Errorf("download file failed, status code=%d", resp.StatusCode)
		}
		data, err := io.ReadAll(resp.Body)
		if err != nil {
			return nil, fmt.Errorf("read all failed, %w", err)
		}
		return data, nil
	}

	// same as UploadFile
	const baseWord = "1Aa2Bb3Cc4Dd5Ee6Ff7Gg8Hh9Ii0JjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz"
	// createURI derives a storage key of the form
	// "<biz type>/<uid>_<unix nanos>_<secret>.<fileType>", where secret maps the
	// first 10 characters of a base64-encoded SHA-256 digest onto baseWord.
	createURI := func(uid int64, fileType string) string {
		const secretLen = 10
		input := fmt.Sprintf("upload_%d_Ma*9)fhi_%d_gou_%s_rand_%d", uid, time.Now().Unix(), fileType, rand.Intn(100000))
		hash := sha256.Sum256([]byte(input))
		hashString := base64.StdEncoding.EncodeToString(hash[:])
		if len(hashString) > secretLen {
			hashString = hashString[:secretLen]
		}
		// Base64 output is ASCII, so each rune maps to exactly one byte of secret.
		secret := make([]byte, 0, len(hashString))
		for _, char := range hashString {
			secret = append(secret, baseWord[int(char)%len(baseWord)])
		}
		suffix := fmt.Sprintf("%d_%d_%s.%s", uid, time.Now().UnixNano(), secret, fileType)
		return fmt.Sprintf("%s/%s", developer_api.FileBizType_BIZ_BOT_DATASET, suffix)
	}

	for _, doc := range docs {
		if doc == nil || doc.URI != "" || doc.URL == "" {
			continue
		}
		b, err := download(doc.URL)
		if err != nil {
			return fmt.Errorf("[documentsURL2URI] download document failed, %w", err)
		}
		uri := createURI(doc.CreatorID, string(doc.FileExtension))
		if err = k.storage.PutObject(ctx, uri, b); err != nil {
			return fmt.Errorf("[documentsURL2URI] upload document failed, %w", err)
		}
		doc.URI = uri
	}
	return nil
}
// emitDeleteKnowledgeDataEvent serializes a delete-knowledge-data event for
// the given knowledge base and slice IDs and sends it through the producer,
// using shardingKey as the message sharding key.
//
// Marshal and send failures are logged and returned as JSON-parse and
// MQ-send errors respectively.
func (k *knowledgeSVC) emitDeleteKnowledgeDataEvent(ctx context.Context, knowledgeID int64, sliceIDs []int64, shardingKey string) error {
	event := events.NewDeleteKnowledgeDataEvent(knowledgeID, sliceIDs)

	payload, err := sonic.Marshal(&event)
	if err != nil {
		logs.CtxErrorf(ctx, "marshal event failed, err: %v", err)
		return errorx.New(errno.ErrKnowledgeParseJSONCode, errorx.KV("msg", err.Error()))
	}

	err = k.producer.Send(ctx, payload, eventbus.WithShardingKey(shardingKey))
	if err == nil {
		return nil
	}
	logs.CtxErrorf(ctx, "send message failed, err: %v", err)
	return errorx.New(errno.ErrKnowledgeMQSendFailCode, errorx.KV("msg", err.Error()))
}
// fromModelKnowledge converts a stored knowledge row into its API model.
//
// A nil input yields (nil, nil). The slice hit count is read from the slice
// repository, and when the row has an icon URI the matching object URL is
// resolved from storage and set as IconURL.
func (k *knowledgeSVC) fromModelKnowledge(ctx context.Context, knowledge *model.Knowledge) (*knowledgeModel.Knowledge, error) {
	if knowledge == nil {
		return nil, nil
	}

	hitCount, err := k.sliceRepo.GetSliceHitByKnowledgeID(ctx, knowledge.ID)
	if err != nil {
		logs.CtxErrorf(ctx, "get slice hit count failed, err: %v", err)
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	info := knowledgeModel.Info{
		ID:          knowledge.ID,
		Name:        knowledge.Name,
		Description: knowledge.Description,
		IconURI:     knowledge.IconURI,
		CreatorID:   knowledge.CreatorID,
		SpaceID:     knowledge.SpaceID,
		CreatedAtMs: knowledge.CreatedAt,
		UpdatedAtMs: knowledge.UpdatedAt,
		AppID:       knowledge.AppID,
	}
	result := &knowledgeModel.Knowledge{
		Info:     info,
		SliceHit: hitCount,
		Type:     knowledgeModel.DocumentType(knowledge.FormatType),
		Status:   knowledgeModel.KnowledgeStatus(knowledge.Status),
	}

	// Without an icon URI there is no URL to resolve.
	if knowledge.IconURI == "" {
		return result, nil
	}
	iconURL, err := k.storage.GetObjectUrl(ctx, knowledge.IconURI)
	if err != nil {
		logs.CtxErrorf(ctx, "get object url failed, err: %v", err)
		return nil, errorx.New(errno.ErrKnowledgeGetObjectURLFailCode, errorx.KV("msg", err.Error()))
	}
	result.IconURL = iconURL
	return result, nil
}
// fromModelDocument converts a stored document row into a document entity.
//
// A nil input yields (nil, nil). Table info is copied with nil columns and
// the consts.RDBFieldID column filtered out. A document that is still in the
// init, uploading or chunking status but already carries a fail reason is
// reported as failed; the fail reason is exposed as StatusMsg for failed
// documents. When the row has a URI, the object URL is resolved from storage.
func (k *knowledgeSVC) fromModelDocument(ctx context.Context, document *model.KnowledgeDocument) (*entity.Document, error) {
	if document == nil {
		return nil, nil
	}

	result := &entity.Document{
		Info: knowledgeModel.Info{
			ID:          document.ID,
			Name:        document.Name,
			CreatorID:   document.CreatorID,
			SpaceID:     document.SpaceID,
			CreatedAtMs: document.CreatedAt,
			UpdatedAtMs: document.UpdatedAt,
		},
		Type:             knowledgeModel.DocumentType(document.DocumentType),
		KnowledgeID:      document.KnowledgeID,
		URI:              document.URI,
		Size:             document.Size,
		SliceCount:       document.SliceCount,
		CharCount:        document.CharCount,
		FileExtension:    parser.FileExtension(document.FileExtension),
		Source:           entity.DocumentSource(document.SourceType),
		Status:           entity.DocumentStatus(document.Status),
		ParsingStrategy:  document.ParseRule.ParsingStrategy,
		ChunkingStrategy: document.ParseRule.ChunkingStrategy,
	}

	if tableInfo := document.TableInfo; tableInfo != nil {
		result.TableInfo = *tableInfo
		// Rebuild the column list without nil entries and the internal ID column.
		visibleColumns := make([]*entity.TableColumn, 0)
		for _, column := range tableInfo.Columns {
			if column == nil || column.Name == consts.RDBFieldID {
				continue
			}
			visibleColumns = append(visibleColumns, column)
		}
		result.TableInfo.Columns = visibleColumns
	}

	status := document.Status
	stillProcessing := status == int32(entity.DocumentStatusChunking) ||
		status == int32(entity.DocumentStatusInit) ||
		status == int32(entity.DocumentStatusUploading)
	if stillProcessing {
		// A recorded fail reason on an unfinished document means it failed.
		if document.FailReason != "" {
			result.Status = entity.DocumentStatusFailed
			result.StatusMsg = document.FailReason
		}
	} else if status == int32(entity.DocumentStatusFailed) {
		result.StatusMsg = document.FailReason
	}

	if len(document.URI) == 0 {
		return result, nil
	}
	objectURL, err := k.storage.GetObjectUrl(ctx, document.URI)
	if err != nil {
		logs.CtxErrorf(ctx, "get object url failed, err: %v", err)
		return nil, errorx.New(errno.ErrKnowledgeGetObjectURLFailCode, errorx.KV("msg", err.Error()))
	}
	result.URL = objectURL
	return result, nil
}
// fromModelSlice converts a stored slice row into a slice entity.
//
// A nil input yields nil. ByteCount is the byte length of the content and
// CharCount its rune count. Non-empty content is passed through
// formatSliceContent and exposed as a single text element in RawContent;
// empty content leaves RawContent unset.
func (k *knowledgeSVC) fromModelSlice(ctx context.Context, slice *model.KnowledgeDocumentSlice) *entity.Slice {
	if slice == nil {
		return nil
	}

	content := slice.Content
	result := &entity.Slice{
		Info: knowledgeModel.Info{
			ID:          slice.ID,
			CreatorID:   slice.CreatorID,
			SpaceID:     slice.SpaceID,
			CreatedAtMs: slice.CreatedAt,
			UpdatedAtMs: slice.UpdatedAt,
		},
		DocumentID:  slice.DocumentID,
		KnowledgeID: slice.KnowledgeID,
		ByteCount:   int64(len(content)),
		CharCount:   int64(utf8.RuneCountInString(content)),
		Hit:         slice.Hit,
		SliceStatus: knowledgeModel.SliceStatus(slice.Status),
	}
	if content == "" {
		return result
	}

	formatted := k.formatSliceContent(ctx, content)
	result.RawContent = []*knowledgeModel.SliceContent{
		{
			Type: knowledgeModel.SliceContentTypeText,
			Text: ptr.Of(formatted),
		},
	}
	return result
}
// convertOrderType maps the API order direction onto the entity order
// direction. A nil input yields nil; every value other than ascending,
// including unknown ones, maps to descending.
func convertOrderType(orderType *knowledgeModel.OrderType) *entity.OrderType {
	if orderType == nil {
		return nil
	}
	if *orderType == knowledgeModel.OrderTypeAsc {
		return ptr.Of(entity.OrderTypeAsc)
	}
	return ptr.Of(entity.OrderTypeDesc)
}
// convertOrder maps the API sort field onto the entity sort field. A nil
// input yields nil; every value other than updated-at, including unknown
// ones, maps to created-at.
func convertOrder(order *knowledgeModel.Order) *entity.Order {
	if order == nil {
		return nil
	}
	if *order == knowledgeModel.OrderUpdatedAt {
		return ptr.Of(entity.OrderUpdatedAt)
	}
	return ptr.Of(entity.OrderCreatedAt)
}
// GetKnowledgeByID loads one knowledge base by ID and converts it to the API
// model.
//
// A nil request or a zero KnowledgeID is rejected as an invalid parameter; a
// missing row (nil or zero ID) is reported as knowledge-not-exist.
func (k *knowledgeSVC) GetKnowledgeByID(ctx context.Context, request *GetKnowledgeByIDRequest) (response *GetKnowledgeByIDResponse, err error) {
	if request == nil || request.KnowledgeID == 0 {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}

	stored, err := k.knowledgeRepo.GetByID(ctx, request.KnowledgeID)
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if stored == nil || stored.ID == 0 {
		return nil, errorx.New(errno.ErrKnowledgeNotExistCode, errorx.KV("msg", "knowledge not found"))
	}

	converted, err := k.fromModelKnowledge(ctx, stored)
	if err != nil {
		return nil, err
	}
	return &GetKnowledgeByIDResponse{Knowledge: converted}, nil
}
// ListPhotoSlice returns one page of slices for the given knowledge base and
// document IDs, optionally filtered by request.HasCaption (passed to the
// repository as the NotEmpty condition).
//
// A nil Offset or Limit is passed to the repository as zero.
func (k *knowledgeSVC) ListPhotoSlice(ctx context.Context, request *ListPhotoSliceRequest) (response *ListPhotoSliceResponse, err error) {
	if request == nil {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}

	condition := &entity.WhereSliceOpt{
		KnowledgeID: request.KnowledgeID,
		DocumentIDs: request.DocumentIDs,
		Offset:      int64(ptr.From(request.Offset)),
		PageSize:    int64(ptr.From(request.Limit)),
		NotEmpty:    request.HasCaption,
	}
	sliceArr, total, err := k.sliceRepo.FindSliceByCondition(ctx, condition)
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	toEntity := func(item *model.KnowledgeDocumentSlice) *entity.Slice {
		return k.fromModelSlice(ctx, item)
	}
	return &ListPhotoSliceResponse{
		Total:  int(total),
		Slices: slices.Transform(sliceArr, toEntity),
	}, nil
}
// ExtractPhotoCaption generates a caption for the image stored behind a
// document.
//
// It loads the document, switches its parsing strategy's caption type to
// parser.ImageAnnotationTypeModel, obtains a parser for that configuration,
// reads the object bytes from storage and parses them. The caption is the
// content of the first parsed document. The call fails when auto annotation
// is not supported, the document does not exist, or parsing yields nothing.
func (k *knowledgeSVC) ExtractPhotoCaption(ctx context.Context, request *ExtractPhotoCaptionRequest) (response *ExtractPhotoCaptionResponse, err error) {
	if request == nil {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}
	if !k.isAutoAnnotationSupported {
		return nil, errorx.New(errno.ErrKnowledgeAutoAnnotationNotSupportedCode, errorx.KV("msg", "auto annotation is not supported"))
	}

	storedDoc, err := k.documentRepo.GetByID(ctx, request.DocumentID)
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}
	if storedDoc == nil || storedDoc.ID == 0 {
		return nil, errorx.New(errno.ErrKnowledgeDocumentNotExistCode, errorx.KV("msg", "document not found"))
	}

	docEntity, err := k.fromModelDocument(ctx, storedDoc)
	if err != nil {
		return nil, err
	}
	// Force model-based annotation for this parse, whatever the stored strategy says.
	docEntity.ParsingStrategy.CaptionType = ptr.Of(parser.ImageAnnotationTypeModel)

	// Named captionParser so the local does not hide the parser package.
	captionParser, err := k.parseManager.GetParser(convert.DocumentToParseConfig(docEntity))
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeGetParserFailCode, errorx.KV("msg", err.Error()))
	}

	imageBytes, err := k.storage.GetObject(ctx, docEntity.URI)
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeGetObjectFailCode, errorx.KV("msg", err.Error()))
	}

	parsedDocs, err := captionParser.Parse(ctx, bytes.NewReader(imageBytes))
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeParserParseFailCode, errorx.KV("msg", err.Error()))
	}
	if len(parsedDocs) == 0 {
		return nil, errorx.New(errno.ErrKnowledgeParserParseFailCode, errorx.KV("msg", "parse fail, schema doc is empty"))
	}

	return &ExtractPhotoCaptionResponse{Caption: parsedDocs[0].Content}, nil
}
// MGetKnowledgeByID loads several knowledge bases by ID and converts each
// non-nil row to the API model, preserving the repository's order.
//
// A nil request or an empty ID list is rejected as an invalid parameter. The
// first conversion failure aborts the call and is returned as is.
func (k *knowledgeSVC) MGetKnowledgeByID(ctx context.Context, request *MGetKnowledgeByIDRequest) (response *MGetKnowledgeByIDResponse, err error) {
	if request == nil || len(request.KnowledgeIDs) == 0 {
		return nil, errorx.New(errno.ErrKnowledgeInvalidParamCode, errorx.KV("msg", "request is empty"))
	}

	rows, err := k.knowledgeRepo.MGetByID(ctx, request.KnowledgeIDs)
	if err != nil {
		return nil, errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
	}

	converted := make([]*knowledgeModel.Knowledge, 0, len(rows))
	for i := range rows {
		if rows[i] == nil {
			continue
		}
		item, convErr := k.fromModelKnowledge(ctx, rows[i])
		if convErr != nil {
			return nil, convErr
		}
		converted = append(converted, item)
	}
	return &MGetKnowledgeByIDResponse{Knowledge: converted}, nil
}
// Lifetimes used by getObjectURL when resolving and caching object URLs.
const (
// expireTime is the value handed to storage.WithExpire when an object URL is
// requested. NOTE(review): presumably seconds (21600 = 6 hours) — confirm
// against the storage option's contract.
expireTime = 21600
// cacheTime is the cache TTL in seconds for a resolved URL; getObjectURL
// multiplies it by time.Second when writing the cache entry (7200 = 2 hours).
cacheTime = 7200
)
// getObjectURL resolves the object URL for uri, consulting the cache first.
//
// On a cache hit the cached value is returned directly. On any cache error a
// new URL is requested from storage with expireTime; it is written back to
// the cache (TTL cacheTime seconds) only when the error was a plain miss
// (redis Nil). A failed cache write is logged and otherwise ignored.
func (k *knowledgeSVC) getObjectURL(ctx context.Context, uri string) (string, error) {
	cached := k.cacheCli.Get(ctx, uri)
	cacheErr := cached.Err()
	if cacheErr == nil {
		return cached.Val(), nil
	}

	freshURL, err := k.storage.GetObjectUrl(ctx, uri, storage.WithExpire(expireTime))
	if err != nil {
		return "", errorx.New(errno.ErrKnowledgeGetObjectURLFailCode, errorx.KV("msg", fmt.Sprintf("get object url failed, %v", err)))
	}

	// Only a genuine miss is cached; other cache errors skip the write-back.
	if errors.Is(cacheErr, redisV9.Nil) {
		if setErr := k.cacheCli.Set(ctx, uri, freshURL, cacheTime*time.Second).Err(); setErr != nil {
			logs.CtxErrorf(ctx, "[getObjectURL] set cache failed, %v", setErr)
		}
	}
	return freshURL, nil
}
// genMultiIDs requests counts IDs from the ID generator in batches of at most
// 100 per call and returns them concatenated in generation order.
//
// A non-positive counts yields an empty, non-nil slice without calling the
// generator. The first generator failure aborts the call.
func (k *knowledgeSVC) genMultiIDs(ctx context.Context, counts int) ([]int64, error) {
	const maxBatch = 100

	collected := make([]int64, 0)
	for remaining := counts; remaining > 0; remaining -= maxBatch {
		batch := min(remaining, maxBatch)
		ids, err := k.idgen.GenMultiIDs(ctx, batch)
		if err != nil {
			return nil, errorx.New(errno.ErrKnowledgeIDGenCode, errorx.KV("msg", fmt.Sprintf("GenMultiIDs failed, err: %v", err)))
		}
		collected = append(collected, ids...)
	}
	return collected, nil
}