optimize(knowledge): Optimize the index logic of the knowledge base (#841)

liuyunchao-1998 2025-08-21 17:44:26 +08:00 committed by GitHub
parent 3d53aaa785
commit 09d00c26cb
3 changed files with 454 additions and 165 deletions

View File

@@ -25,6 +25,7 @@ import (
 	"golang.org/x/sync/errgroup"
 	"gorm.io/gorm"
+	"gorm.io/gorm/clause"
 
 	"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
 	"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/dal/model"
@@ -52,7 +53,7 @@ func (dao *KnowledgeDocumentSliceDAO) Update(ctx context.Context, slice *model.K
 }
 
 func (dao *KnowledgeDocumentSliceDAO) BatchCreate(ctx context.Context, slices []*model.KnowledgeDocumentSlice) error {
-	return dao.Query.KnowledgeDocumentSlice.WithContext(ctx).CreateInBatches(slices, 100)
+	return dao.Query.KnowledgeDocumentSlice.WithContext(ctx).Clauses(clause.OnConflict{UpdateAll: true}).CreateInBatches(slices, 100)
 }
 
 func (dao *KnowledgeDocumentSliceDAO) BatchSetStatus(ctx context.Context, ids []int64, status int32, reason string) error {
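The one functional change in this file makes batch slice creation idempotent: with GORM's clause.OnConflict{UpdateAll: true}, re-running BatchCreate over rows whose primary keys already exist updates them in place instead of failing with a duplicate-key error, which is what lets the indexing pipeline below safely replay a batch after a crash. A minimal standalone sketch of the same pattern, using a hypothetical Item model and the SQLite driver purely for illustration:

package main

import (
	"gorm.io/driver/sqlite"
	"gorm.io/gorm"
	"gorm.io/gorm/clause"
)

// Item is a hypothetical stand-in for model.KnowledgeDocumentSlice.
type Item struct {
	ID      int64 `gorm:"primaryKey"`
	Content string
}

func main() {
	db, err := gorm.Open(sqlite.Open("demo.db"), &gorm.Config{})
	if err != nil {
		panic(err)
	}
	if err := db.AutoMigrate(&Item{}); err != nil {
		panic(err)
	}
	items := []Item{{ID: 1, Content: "a"}, {ID: 2, Content: "b"}}
	// First call inserts both rows.
	if err := db.Clauses(clause.OnConflict{UpdateAll: true}).CreateInBatches(items, 100).Error; err != nil {
		panic(err)
	}
	// A retry with the same primary keys updates in place instead of
	// failing with a duplicate-key error.
	items[0].Content = "a (retried)"
	if err := db.Clauses(clause.OnConflict{UpdateAll: true}).CreateInBatches(items, 100).Error; err != nil {
		panic(err)
	}
}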

View File

@@ -35,12 +35,15 @@ import (
 	"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/dal/model"
 	"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/events"
 	"github.com/coze-dev/coze-studio/backend/infra/contract/document"
+	progressbarContract "github.com/coze-dev/coze-studio/backend/infra/contract/document/progressbar"
 	"github.com/coze-dev/coze-studio/backend/infra/contract/document/searchstore"
 	"github.com/coze-dev/coze-studio/backend/infra/contract/eventbus"
 	"github.com/coze-dev/coze-studio/backend/infra/contract/rdb"
+	rdbEntity "github.com/coze-dev/coze-studio/backend/infra/contract/rdb/entity"
 	"github.com/coze-dev/coze-studio/backend/infra/contract/storage"
 	"github.com/coze-dev/coze-studio/backend/infra/impl/document/progressbar"
 	"github.com/coze-dev/coze-studio/backend/pkg/errorx"
+	"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr"
 	"github.com/coze-dev/coze-studio/backend/pkg/lang/slices"
 	"github.com/coze-dev/coze-studio/backend/pkg/logs"
 	"github.com/coze-dev/coze-studio/backend/types/errno"
@@ -139,37 +142,98 @@ func (k *knowledgeSVC) indexDocuments(ctx context.Context, event *entity.Event)
 	return nil
 }
 
+type indexDocCacheRecord struct {
+	ProcessingIDs       []int64
+	LastProcessedNumber int64
+	ParseUri            string
+}
+
+const (
+	indexDocCacheKey = "index_doc_cache:%d:%d"
+)
+
+// indexDocumentNew handles the indexing of a new document into the knowledge system
 func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (err error) {
 	doc := event.Document
 	if doc == nil {
-		return errorx.New(errno.ErrKnowledgeNonRetryableCode, errorx.KV("reason", "[indexDocument] document not provided"))
-	}
-	// 1. The index operations on the same document in the retry queue and the ordinary queue are concurrent, and the same document data is written twice (generated when the backend bugfix is online)
-	// 2. rebalance repeated consumption of the same message
-	// check knowledge and document status
-	if valid, err := k.isWritableKnowledgeAndDocument(ctx, doc.KnowledgeID, doc.ID); err != nil {
-		return err
-	} else if !valid {
-		return errorx.New(errno.ErrKnowledgeNonRetryableCode,
-			errorx.KVf("reason", "[indexDocument] not writable, knowledge_id=%d, document_id=%d", event.KnowledgeID, doc.ID))
-	}
-	defer func() {
-		if e := recover(); e != nil {
-			err = errorx.New(errno.ErrKnowledgeSystemCode, errorx.KV("msg", fmt.Sprintf("panic: %v", e)))
-			logs.CtxErrorf(ctx, "[indexDocument] panic, err: %v", err)
-			if setStatusErr := k.documentRepo.SetStatus(ctx, event.Document.ID, int32(entity.DocumentStatusFailed), err.Error()); setStatusErr != nil {
-				logs.CtxErrorf(ctx, "[indexDocument] set document status failed, err: %v", setStatusErr)
-			}
-			return
-		}
-		if err != nil {
-			var errMsg string
-			var statusError errorx.StatusError
-			var status int32
-			if errors.As(err, &statusError) {
+		return errorx.New(errno.ErrKnowledgeNonRetryableCode,
+			errorx.KV("reason", "[indexDocument] document not provided"))
+	}
+	// Validate document and knowledge status
+	var valid bool
+	if valid, err = k.validateDocumentStatus(ctx, doc); err != nil || !valid {
+		return
+	}
+	// Setup error handling and recovery
+	defer k.handleIndexingErrors(ctx, event, &err)
+	// Start indexing process
+	if err = k.beginIndexingProcess(ctx, doc); err != nil {
+		return
+	}
+	// Process document parsing and chunking
+	var parseResult []*schema.Document
+	var cacheRecord *indexDocCacheRecord
+	parseResult, cacheRecord, err = k.processDocumentParsing(ctx, doc)
+	if err != nil {
+		return
+	}
+	if cacheRecord.LastProcessedNumber == 0 {
+		if err = k.cleanupPreviousProcessing(ctx, doc); err != nil {
+			return
+		}
+	}
+	// Handle table-type documents specially
+	if doc.Type == knowledge.DocumentTypeTable {
+		if err = k.handleTableDocument(ctx, doc, parseResult); err != nil {
+			return
+		}
+	}
+	// Process document chunks in batches
+	if err = k.processDocumentChunks(ctx, doc, parseResult, cacheRecord); err != nil {
+		return
+	}
+	// Finalize document indexing
+	err = k.finalizeDocumentIndexing(ctx, event.Document.KnowledgeID, event.Document.ID)
+	return
+}
+
+// validateDocumentStatus checks if the document can be indexed
+func (k *knowledgeSVC) validateDocumentStatus(ctx context.Context, doc *entity.Document) (bool, error) {
+	valid, err := k.isWritableKnowledgeAndDocument(ctx, doc.KnowledgeID, doc.ID)
+	if err != nil {
+		return false, err
+	}
+	if !valid {
+		return false, errorx.New(errno.ErrKnowledgeNonRetryableCode,
+			errorx.KVf("reason", "[indexDocument] not writable, knowledge_id=%d, document_id=%d",
+				doc.KnowledgeID, doc.ID))
+	}
+	return true, nil
+}
+
+// handleIndexingErrors manages errors and recovery during indexing
+func (k *knowledgeSVC) handleIndexingErrors(ctx context.Context, event *entity.Event, err *error) {
+	if e := recover(); e != nil {
+		err = ptr.Of(errorx.New(errno.ErrKnowledgeSystemCode,
+			errorx.KV("msg", fmt.Sprintf("panic: %v", e))))
+		logs.CtxErrorf(ctx, "[indexDocument] panic, err: %v", err)
+		k.setDocumentStatus(ctx, event.Document.ID,
+			int32(entity.DocumentStatusFailed), ptr.From(err).Error())
+		return
+	}
+	if ptr.From(err) != nil {
+		var status int32
+		var errMsg string
+		var statusError errorx.StatusError
+		if errors.As(ptr.From(err), &statusError) {
 			errMsg = errorx.ErrorWithoutStack(statusError)
 			if statusError.Code() == errno.ErrKnowledgeNonRetryableCode {
 				status = int32(entity.DocumentStatusFailed)
@@ -177,20 +241,270 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
 				status = int32(entity.DocumentStatusChunking)
 			}
 		} else {
-			errMsg = err.Error()
+			errMsg = ptr.From(err).Error()
 			status = int32(entity.DocumentStatusChunking)
 		}
-		if setStatusErr := k.documentRepo.SetStatus(ctx, event.Document.ID, status, errMsg); setStatusErr != nil {
-			logs.CtxErrorf(ctx, "[indexDocument] set document status failed, err: %v", setStatusErr)
-		}
-	}
-}()
-// clear
+		k.setDocumentStatus(ctx, event.Document.ID, status, errMsg)
+	}
+}
+
+// beginIndexingProcess starts the indexing process
+func (k *knowledgeSVC) beginIndexingProcess(ctx context.Context, doc *entity.Document) error {
+	err := k.documentRepo.SetStatus(ctx, doc.ID, int32(entity.DocumentStatusChunking), "")
+	if err != nil {
+		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("set document status failed, err: %v", err)))
+	}
+	return nil
+}
+
+// processDocumentParsing handles document parsing and caching
+func (k *knowledgeSVC) processDocumentParsing(ctx context.Context, doc *entity.Document) (
+	[]*schema.Document, *indexDocCacheRecord, error) {
+	cacheKey := fmt.Sprintf(indexDocCacheKey, doc.KnowledgeID, doc.ID)
+	cacheRecord := &indexDocCacheRecord{}
+	// Try to get cached parse results
+	val, err := k.cacheCli.Get(ctx, cacheKey).Result()
+	if err == nil {
+		if err = sonic.UnmarshalString(val, &cacheRecord); err != nil {
+			return nil, nil, errorx.New(errno.ErrKnowledgeParseJSONCode,
+				errorx.KV("msg", fmt.Sprintf("parse cache record failed, err: %v", err)))
+		}
+	}
+	// Parse document if not cached
+	if err != nil || len(cacheRecord.ParseUri) == 0 {
+		return k.parseAndCacheDocument(ctx, doc, cacheRecord, cacheKey)
+	}
+	// Load parse results from cache
+	return k.loadParsedDocument(ctx, cacheRecord)
+}
+
+// parseAndCacheDocument parses the document and caches the results
+func (k *knowledgeSVC) parseAndCacheDocument(ctx context.Context, doc *entity.Document,
+	cacheRecord *indexDocCacheRecord, cacheKey string) ([]*schema.Document, *indexDocCacheRecord, error) {
+	// Get document content from storage
+	bodyBytes, err := k.storage.GetObject(ctx, doc.URI)
+	if err != nil {
+		return nil, nil, errorx.New(errno.ErrKnowledgeGetObjectFailCode,
+			errorx.KV("msg", fmt.Sprintf("get object failed, err: %v", err)))
+	}
+	// Get appropriate parser for document type
+	docParser, err := k.parseManager.GetParser(convert.DocumentToParseConfig(doc))
+	if err != nil {
+		return nil, nil, errorx.New(errno.ErrKnowledgeGetParserFailCode,
+			errorx.KV("msg", fmt.Sprintf("get parser failed, err: %v", err)))
+	}
+	// Parse document content
+	parseResult, err := docParser.Parse(ctx, bytes.NewReader(bodyBytes), parser.WithExtraMeta(map[string]any{
+		document.MetaDataKeyCreatorID: doc.CreatorID,
+		document.MetaDataKeyExternalStorage: map[string]any{
+			"document_id": doc.ID,
+		},
+	}))
+	if err != nil {
+		return nil, nil, errorx.New(errno.ErrKnowledgeParserParseFailCode,
+			errorx.KV("msg", fmt.Sprintf("parse document failed, err: %v", err)))
+	}
+	// Cache parse results
+	if err := k.cacheParseResults(ctx, doc, parseResult, cacheRecord, cacheKey); err != nil {
+		return nil, nil, err
+	}
+	return parseResult, cacheRecord, nil
+}
+
+// cacheParseResults stores parse results in persistent storage and cache
+func (k *knowledgeSVC) cacheParseResults(ctx context.Context, doc *entity.Document,
+	parseResult []*schema.Document, cacheRecord *indexDocCacheRecord, cacheKey string) error {
+	parseResultData, err := sonic.Marshal(parseResult)
+	if err != nil {
+		return errorx.New(errno.ErrKnowledgeParseJSONCode,
+			errorx.KV("msg", fmt.Sprintf("marshal parse result failed, err: %v", err)))
+	}
+	fileName := fmt.Sprintf("FileBizType.Knowledge/%d_%d.txt", doc.CreatorID, doc.ID)
+	if err = k.storage.PutObject(ctx, fileName, parseResultData); err != nil {
+		return errorx.New(errno.ErrKnowledgePutObjectFailCode,
+			errorx.KV("msg", fmt.Sprintf("put object failed, err: %v", err)))
+	}
+	cacheRecord.ParseUri = fileName
+	return k.recordIndexDocumentStatus(ctx, cacheRecord, cacheKey)
+}
+
+// loadParsedDocument loads previously parsed document from cache
+func (k *knowledgeSVC) loadParsedDocument(ctx context.Context,
+	cacheRecord *indexDocCacheRecord) ([]*schema.Document, *indexDocCacheRecord, error) {
+	data, err := k.storage.GetObject(ctx, cacheRecord.ParseUri)
+	if err != nil {
+		return nil, nil, errorx.New(errno.ErrKnowledgeGetObjectFailCode,
+			errorx.KV("msg", fmt.Sprintf("get object failed, err: %v", err)))
+	}
+	var parseResult []*schema.Document
+	if err = sonic.Unmarshal(data, &parseResult); err != nil {
+		return nil, nil, errorx.New(errno.ErrKnowledgeParseJSONCode,
+			errorx.KV("msg", fmt.Sprintf("marshal parse result failed, err: %v", err)))
+	}
+	return parseResult, cacheRecord, nil
+}
+
+// handleTableDocument handles special processing for table-type documents
+func (k *knowledgeSVC) handleTableDocument(ctx context.Context,
+	doc *entity.Document, parseResult []*schema.Document) error {
+	noData, err := document.GetDocumentsColumnsOnly(parseResult)
+	if err != nil {
+		return errorx.New(errno.ErrKnowledgeNonRetryableCode,
+			errorx.KVf("reason", "[indexDocument] get table data status failed, err: %v", err))
+	}
+	if noData {
+		parseResult = nil // clear parse result
+	}
+	return nil
+}
+
+// processDocumentChunks processes document chunks in batches
+func (k *knowledgeSVC) processDocumentChunks(ctx context.Context,
+	doc *entity.Document, parseResult []*schema.Document, cacheRecord *indexDocCacheRecord) error {
+	batchSize := 100
+	progressbar := progressbar.NewProgressBar(ctx, doc.ID,
+		int64(len(parseResult)*len(k.searchStoreManagers)), k.cacheCli, true)
+	if err := progressbar.AddN(int(cacheRecord.LastProcessedNumber) * len(k.searchStoreManagers)); err != nil {
+		return errorx.New(errno.ErrKnowledgeSystemCode,
+			errorx.KV("msg", fmt.Sprintf("add progress bar failed, err: %v", err)))
+	}
+	// Process chunks in batches
+	for i := int(cacheRecord.LastProcessedNumber); i < len(parseResult); i += batchSize {
+		chunks := parseResult[i:min(i+batchSize, len(parseResult))]
+		if err := k.batchProcessSlice(ctx, doc, i, chunks, cacheRecord, progressbar); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// finalizeDocumentIndexing completes the document indexing process
+func (k *knowledgeSVC) finalizeDocumentIndexing(ctx context.Context, knowledgeID, documentID int64) error {
+	if err := k.documentRepo.SetStatus(ctx, documentID, int32(entity.DocumentStatusEnable), ""); err != nil {
+		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("set document status failed, err: %v", err)))
+	}
+	if err := k.documentRepo.UpdateDocumentSliceInfo(ctx, documentID); err != nil {
+		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("update document slice info failed, err: %v", err)))
+	}
+	if err := k.cacheCli.Del(ctx, fmt.Sprintf(indexDocCacheKey, knowledgeID, documentID)).Err(); err != nil {
+		return errorx.New(errno.ErrKnowledgeSystemCode, errorx.KV("msg", fmt.Sprintf("del cache failed, err: %v", err)))
+	}
+	return nil
+}
+
+// batchProcessSlice processes a batch of document slices
+func (k *knowledgeSVC) batchProcessSlice(ctx context.Context, doc *entity.Document,
+	startIdx int, parseResult []*schema.Document, cacheRecord *indexDocCacheRecord,
+	progressBar progressbarContract.ProgressBar) error {
 	collectionName := getCollectionName(doc.KnowledgeID)
-if !doc.IsAppend {
-	if doc.Type != knowledge.DocumentTypeImage {
+	length := len(parseResult)
+	var ids []int64
+	var err error
+	// Generate IDs for this batch
+	if len(cacheRecord.ProcessingIDs) == 0 {
+		ids, err = k.genMultiIDs(ctx, length)
+		if err != nil {
+			return err
+		}
+	} else {
+		ids = cacheRecord.ProcessingIDs
+	}
+	for idx := range parseResult {
+		parseResult[idx].ID = strconv.FormatInt(ids[idx], 10)
+	}
+	// Update cache record with processing IDs
+	cacheRecord.ProcessingIDs = ids
+	if err := k.recordIndexDocumentStatus(ctx, cacheRecord,
+		fmt.Sprintf(indexDocCacheKey, doc.KnowledgeID, doc.ID)); err != nil {
+		return err
+	}
+	// Convert documents to slices
+	sliceEntities, err := k.convertToSlices(doc, parseResult)
+	if err != nil {
+		return err
+	}
+	// Handle table-type documents
+	if doc.Type == knowledge.DocumentTypeTable {
+		if err := k.upsertDataToTable(ctx, &doc.TableInfo, sliceEntities); err != nil {
+			logs.CtxErrorf(ctx, "[indexDocument] insert data to table failed, err: %v", err)
+			return err
+		}
+	}
+	// Store slices in database
+	if err := k.storeSlicesInDB(ctx, doc, parseResult, startIdx, ids); err != nil {
+		return err
+	}
+	// Index slices in search stores
+	if err := k.indexSlicesInSearchStores(ctx, doc, collectionName, sliceEntities,
+		cacheRecord, progressBar); err != nil {
+		return err
+	}
+	// Update cache record after successful processing
+	cacheRecord.LastProcessedNumber = int64(startIdx) + int64(length)
+	cacheRecord.ProcessingIDs = nil
+	// Mark slices as done
+	err = k.sliceRepo.BatchSetStatus(ctx, ids, int32(model.SliceStatusDone), "")
+	if err != nil {
+		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("batch set slice status failed, err: %v", err)))
+	}
+	if err := k.recordIndexDocumentStatus(ctx, cacheRecord,
+		fmt.Sprintf(indexDocCacheKey, doc.KnowledgeID, doc.ID)); err != nil {
+		return err
+	}
+	return nil
+}
+
+// convertToSlices converts parsed documents to slice entities
+func (k *knowledgeSVC) convertToSlices(doc *entity.Document, parseResult []*schema.Document) ([]*entity.Slice, error) {
+	convertFn := d2sMapping[doc.Type]
+	if convertFn == nil {
+		return nil, errorx.New(errno.ErrKnowledgeSystemCode,
+			errorx.KV("msg", "convertFn is empty"))
+	}
+	return slices.TransformWithErrorCheck(parseResult, func(a *schema.Document) (*entity.Slice, error) {
+		return convertFn(a, doc.KnowledgeID, doc.ID, doc.CreatorID)
+	})
+}
+
+// cleanupPreviousProcessing cleans up partially processed data from previous attempts
+func (k *knowledgeSVC) cleanupPreviousProcessing(ctx context.Context, doc *entity.Document) error {
+	collectionName := getCollectionName(doc.KnowledgeID)
+	if doc.IsAppend || doc.Type == knowledge.DocumentTypeImage {
+		return nil
+	}
 	ids, err := k.sliceRepo.GetDocumentSliceIDs(ctx, []int64{doc.ID})
 	if err != nil {
 		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("get document slice ids failed, err: %v", err)))
@@ -199,12 +513,13 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
 	if err = k.sliceRepo.DeleteByDocument(ctx, doc.ID); err != nil {
 		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("delete document slice failed, err: %v", err)))
 	}
 	for _, manager := range k.searchStoreManagers {
 		s, err := manager.GetSearchStore(ctx, collectionName)
 		if err != nil {
 			return errorx.New(errno.ErrKnowledgeSearchStoreCode, errorx.KV("msg", fmt.Sprintf("get search store failed, err: %v", err)))
 		}
-		if err := s.Delete(ctx, slices.Transform(event.SliceIDs, func(id int64) string {
+		if err := s.Delete(ctx, slices.Transform(ids, func(id int64) string {
 			return strconv.FormatInt(id, 10)
 		})); err != nil {
 			logs.Errorf("[indexDocument] delete knowledge failed, err: %v", err)
@@ -212,94 +527,42 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
 		}
 	}
 	}
-	}
+	}
-	// set chunk status
-	if err = k.documentRepo.SetStatus(ctx, doc.ID, int32(entity.DocumentStatusChunking), ""); err != nil {
-		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("set document status failed, err: %v", err)))
-	}
-	// parse & chunk
-	bodyBytes, err := k.storage.GetObject(ctx, doc.URI)
-	if err != nil {
-		return errorx.New(errno.ErrKnowledgeGetObjectFailCode, errorx.KV("msg", fmt.Sprintf("get object failed, err: %v", err)))
-	}
-	docParser, err := k.parseManager.GetParser(convert.DocumentToParseConfig(doc))
-	if err != nil {
-		return errorx.New(errno.ErrKnowledgeGetParserFailCode, errorx.KV("msg", fmt.Sprintf("get parser failed, err: %v", err)))
-	}
-	parseResult, err := docParser.Parse(ctx, bytes.NewReader(bodyBytes), parser.WithExtraMeta(map[string]any{
-		document.MetaDataKeyCreatorID: doc.CreatorID,
-		document.MetaDataKeyExternalStorage: map[string]any{
-			"document_id": doc.ID,
-		},
-	}))
-	if err != nil {
-		return errorx.New(errno.ErrKnowledgeParserParseFailCode, errorx.KV("msg", fmt.Sprintf("parse document failed, err: %v", err)))
-	}
 	if doc.Type == knowledge.DocumentTypeTable {
-		noData, err := document.GetDocumentsColumnsOnly(parseResult)
-		if err != nil { // unexpected
-			return errorx.New(errno.ErrKnowledgeNonRetryableCode,
-				errorx.KVf("reason", "[indexDocument] get table data status failed, err: %v", err))
-		}
-		if noData {
-			parseResult = nil // clear parse result
-		}
-	}
-	// set id
-	allIDs := make([]int64, 0, len(parseResult))
-	for l := 0; l < len(parseResult); l += 100 {
-		r := min(l+100, len(parseResult))
-		batchSize := r - l
-		ids, err := k.idgen.GenMultiIDs(ctx, batchSize)
-		if err != nil {
-			return errorx.New(errno.ErrKnowledgeIDGenCode, errorx.KV("msg", fmt.Sprintf("GenMultiIDs failed, err: %v", err)))
-		}
-		allIDs = append(allIDs, ids...)
-		for i := 0; i < batchSize; i++ {
-			id := ids[i]
-			index := l + i
-			parseResult[index].ID = strconv.FormatInt(id, 10)
-		}
-	}
-	convertFn := d2sMapping[doc.Type]
-	if convertFn == nil {
-		return errorx.New(errno.ErrKnowledgeSystemCode, errorx.KV("msg", "convertFn is empty"))
-	}
-	sliceEntities, err := slices.TransformWithErrorCheck(parseResult, func(a *schema.Document) (*entity.Slice, error) {
-		return convertFn(a, doc.KnowledgeID, doc.ID, doc.CreatorID)
+		_, err := k.rdb.DeleteData(ctx, &rdb.DeleteDataRequest{
+			TableName: doc.TableInfo.PhysicalTableName,
+			Where: &rdb.ComplexCondition{
+				Conditions: []*rdb.Condition{
+					{
+						Field:    consts.RDBFieldID,
+						Operator: rdbEntity.OperatorIn,
+						Value:    ids,
+					},
+				},
+			},
 		})
 		if err != nil {
-		return errorx.New(errno.ErrKnowledgeSystemCode, errorx.KV("msg", fmt.Sprintf("convert document failed, err: %v", err)))
+			logs.CtxErrorf(ctx, "delete data failed, err: %v", err)
+			return errorx.New(errno.ErrKnowledgeCrossDomainCode, errorx.KV("msg", err.Error()))
 		}
-	// save slices
-	if doc.Type == knowledge.DocumentTypeTable {
-		// Table type to insert data into a database
-		err = k.upsertDataToTable(ctx, &doc.TableInfo, sliceEntities)
-		if err != nil {
-			logs.CtxErrorf(ctx, "[indexDocument] insert data to table failed, err: %v", err)
-			return err
-		}
-	}
+	}
+	return nil
+}
+
+// storeSlicesInDB stores slice data in the database
+func (k *knowledgeSVC) storeSlicesInDB(ctx context.Context, doc *entity.Document,
+	parseResult []*schema.Document, startIdx int, ids []int64) error {
 	var seqOffset float64
+	var err error
 	if doc.IsAppend {
 		seqOffset, err = k.sliceRepo.GetLastSequence(ctx, doc.ID)
 		if err != nil {
-			return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("get last sequence failed, err: %v", err)))
+			return errorx.New(errno.ErrKnowledgeDBCode,
+				errorx.KV("msg", fmt.Sprintf("get last sequence failed, err: %v", err)))
 		}
 		seqOffset += 1
 	}
 	if doc.Type == knowledge.DocumentTypeImage {
 		if len(parseResult) != 0 {
 			slices, _, err := k.sliceRepo.FindSliceByCondition(ctx, &entity.WhereSliceOpt{DocumentID: doc.ID})
@@ -332,16 +595,17 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
 				return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("update slice failed, err: %v", err)))
 			}
 		}
-	} else {
+		return nil
+	}
 	sliceModels := make([]*model.KnowledgeDocumentSlice, 0, len(parseResult))
 	for i, src := range parseResult {
 		now := time.Now().UnixMilli()
 		sliceModel := &model.KnowledgeDocumentSlice{
-			ID:          allIDs[i],
+			ID:          ids[i],
 			KnowledgeID: doc.KnowledgeID,
 			DocumentID:  doc.ID,
 			Content:     parseResult[i].Content,
-			Sequence:    seqOffset + float64(i),
+			Sequence:    seqOffset + float64(i+startIdx),
 			CreatedAt:   now,
 			UpdatedAt:   now,
 			CreatorID:   doc.CreatorID,
@@ -349,84 +613,108 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
 			Status:     int32(model.SliceStatusProcessing),
 			FailReason: "",
 		}
 		if doc.Type == knowledge.DocumentTypeTable {
+			convertFn := d2sMapping[doc.Type]
 			sliceEntity, err := convertFn(src, doc.KnowledgeID, doc.ID, doc.CreatorID)
 			if err != nil {
 				logs.CtxErrorf(ctx, "[indexDocument] convert document failed, err: %v", err)
-				return errorx.New(errno.ErrKnowledgeSystemCode, errorx.KV("msg", fmt.Sprintf("convert document failed, err: %v", err)))
+				return errorx.New(errno.ErrKnowledgeSystemCode,
+					errorx.KV("msg", fmt.Sprintf("convert document failed, err: %v", err)))
 			}
 			sliceModel.Content = sliceEntity.GetSliceContent()
 		}
 		sliceModels = append(sliceModels, sliceModel)
 	}
-	if err = k.sliceRepo.BatchCreate(ctx, sliceModels); err != nil {
-		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("batch create slice failed, err: %v", err)))
-	}
-	}
-	defer func() {
-		if err != nil { // set slice status
-			if setStatusErr := k.sliceRepo.BatchSetStatus(ctx, allIDs, int32(model.SliceStatusFailed), err.Error()); setStatusErr != nil {
-				logs.CtxErrorf(ctx, "[indexDocument] set slice status failed, err: %v", setStatusErr)
-			}
-		}
-	}()
-	// to vectorstore
+	err = k.sliceRepo.BatchCreate(ctx, sliceModels)
+	if err != nil {
+		return errorx.New(errno.ErrKnowledgeDBCode,
+			errorx.KV("msg", fmt.Sprintf("batch create slice failed, err: %v", err)))
+	}
+	return nil
+}
+
+// indexSlicesInSearchStores indexes slices in appropriate search stores
+func (k *knowledgeSVC) indexSlicesInSearchStores(ctx context.Context, doc *entity.Document,
+	collectionName string, sliceEntities []*entity.Slice, cacheRecord *indexDocCacheRecord,
+	progressBar progressbarContract.ProgressBar) error {
 	fields, err := k.mapSearchFields(doc)
 	if err != nil {
 		return err
 	}
 	indexingFields := getIndexingFields(fields)
-	// reformat docs, mainly for enableCompactTable
+	// Convert slices to search documents
 	ssDocs, err := slices.TransformWithErrorCheck(sliceEntities, func(a *entity.Slice) (*schema.Document, error) {
 		return k.slice2Document(ctx, doc, a)
 	})
 	if err != nil {
-		return errorx.New(errno.ErrKnowledgeSystemCode, errorx.KV("msg", fmt.Sprintf("reformat document failed, err: %v", err)))
+		return errorx.New(errno.ErrKnowledgeSystemCode,
+			errorx.KV("msg", fmt.Sprintf("reformat document failed, err: %v", err)))
 	}
-	progressbar := progressbar.NewProgressBar(ctx, doc.ID, int64(len(ssDocs)*len(k.searchStoreManagers)), k.cacheCli, true)
+	// Skip if it's an image document with empty content
+	if doc.Type == knowledge.DocumentTypeImage && len(ssDocs) == 1 && len(ssDocs[0].Content) == 0 {
+		return nil
+	}
+	// Index in each search store manager
 	for _, manager := range k.searchStoreManagers {
 		now := time.Now()
-		if err = manager.Create(ctx, &searchstore.CreateRequest{
+		if err := manager.Create(ctx, &searchstore.CreateRequest{
 			CollectionName: collectionName,
 			Fields:         fields,
 			CollectionMeta: nil,
 		}); err != nil {
-			return errorx.New(errno.ErrKnowledgeSearchStoreCode, errorx.KV("msg", fmt.Sprintf("create search store failed, err: %v", err)))
-		}
-		// Picture knowledge base kn: doc: slice = 1: n: n, maybe the content is empty, no need to write
-		if doc.Type == knowledge.DocumentTypeImage && len(ssDocs) == 1 && len(ssDocs[0].Content) == 0 {
-			continue
+			return errorx.New(errno.ErrKnowledgeSearchStoreCode,
+				errorx.KV("msg", fmt.Sprintf("create search store failed, err: %v", err)))
 		}
 		ss, err := manager.GetSearchStore(ctx, collectionName)
 		if err != nil {
-			return errorx.New(errno.ErrKnowledgeSearchStoreCode, errorx.KV("msg", fmt.Sprintf("get search store failed, err: %v", err)))
+			return errorx.New(errno.ErrKnowledgeSearchStoreCode,
+				errorx.KV("msg", fmt.Sprintf("get search store failed, err: %v", err)))
 		}
 		if _, err = ss.Store(ctx, ssDocs,
 			searchstore.WithIndexerPartitionKey(fieldNameDocumentID),
 			searchstore.WithPartition(strconv.FormatInt(doc.ID, 10)),
 			searchstore.WithIndexingFields(indexingFields),
-			searchstore.WithProgressBar(progressbar),
+			searchstore.WithProgressBar(progressBar),
 		); err != nil {
-			return errorx.New(errno.ErrKnowledgeSearchStoreCode, errorx.KV("msg", fmt.Sprintf("store search store failed, err: %v", err)))
+			return errorx.New(errno.ErrKnowledgeSearchStoreCode,
+				errorx.KV("msg", fmt.Sprintf("store search store failed, err: %v", err)))
 		}
 		logs.CtxDebugf(ctx, "[indexDocument] ss type=%v, len(docs)=%d, finished after %d ms",
 			manager.GetType(), len(ssDocs), time.Now().Sub(now).Milliseconds())
+		if err := k.recordIndexDocumentStatus(ctx, cacheRecord,
+			fmt.Sprintf(indexDocCacheKey, doc.KnowledgeID, doc.ID)); err != nil {
+			return err
+		}
 	}
-	// set slice status
-	if err = k.sliceRepo.BatchSetStatus(ctx, allIDs, int32(model.SliceStatusDone), ""); err != nil {
-		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("batch set slice status failed, err: %v", err)))
-	}
-	// set document status
-	if err = k.documentRepo.SetStatus(ctx, doc.ID, int32(entity.DocumentStatusEnable), ""); err != nil {
-		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("set document status failed, err: %v", err)))
-	}
-	if err = k.documentRepo.UpdateDocumentSliceInfo(ctx, event.Document.ID); err != nil {
-		return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("update document slice info failed, err: %v", err)))
+	return nil
+}
+
+// setDocumentStatus updates document status with error handling
+func (k *knowledgeSVC) setDocumentStatus(ctx context.Context, docID int64, status int32, errMsg string) {
+	if setStatusErr := k.documentRepo.SetStatus(ctx, docID, status, errMsg); setStatusErr != nil {
+		logs.CtxErrorf(ctx, "[indexDocument] set document status failed, err: %v", setStatusErr)
+	}
+}
+
+func (k *knowledgeSVC) recordIndexDocumentStatus(ctx context.Context, r *indexDocCacheRecord, cacheKey string) error {
+	data, err := sonic.Marshal(r)
+	if err != nil {
+		return errorx.New(errno.ErrKnowledgeParseJSONCode, errorx.KV("msg", fmt.Sprintf("marshal parse result failed, err: %v", err)))
+	}
+	err = k.cacheCli.Set(ctx, cacheKey, data, time.Hour*2).Err()
+	if err != nil {
+		return errorx.New(errno.ErrKnowledgeCacheClientSetFailCode, errorx.KV("msg", fmt.Sprintf("set cache failed, err: %v", err)))
 	}
 	return nil
 }
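Taken together, the new helpers turn indexing into a checkpointed, resumable pipeline: processDocumentParsing persists the parse result to object storage and records its URI in Redis under index_doc_cache:<knowledge_id>:<document_id> with a two-hour TTL; processDocumentChunks then replays 100-slice batches starting from LastProcessedNumber; and finalizeDocumentIndexing deletes the key once the document is fully indexed. The sketch below reproduces just this checkpoint-and-resume loop under stated assumptions: a go-redis v9 client, a hypothetical processBatch callback, and string items standing in for parsed chunks. The real code additionally persists ProcessingIDs so a replayed batch reuses the same slice IDs, which, combined with the upsert in BatchCreate above, keeps retries idempotent.

package main

import (
	"context"
	"encoding/json"
	"fmt"
	"time"

	"github.com/redis/go-redis/v9"
)

// checkpoint mirrors indexDocCacheRecord: just enough state to resume an
// interrupted indexing run without redoing finished batches.
type checkpoint struct {
	LastProcessedNumber int64  `json:"last_processed_number"`
	ParseUri            string `json:"parse_uri"`
}

func cacheKey(knowledgeID, documentID int64) string {
	return fmt.Sprintf("index_doc_cache:%d:%d", knowledgeID, documentID)
}

// indexWithResume processes items in fixed-size batches, persisting a
// checkpoint after each batch. On restart it continues from the last
// recorded position instead of starting from zero.
func indexWithResume(ctx context.Context, rdb *redis.Client, knowledgeID, documentID int64,
	items []string, processBatch func([]string) error) error {
	key := cacheKey(knowledgeID, documentID)
	cp := checkpoint{}
	if raw, err := rdb.Get(ctx, key).Result(); err == nil {
		_ = json.Unmarshal([]byte(raw), &cp) // corrupt checkpoint: start over
	}
	const batchSize = 100
	for i := int(cp.LastProcessedNumber); i < len(items); i += batchSize {
		end := min(i+batchSize, len(items)) // built-in min, Go 1.21+
		if err := processBatch(items[i:end]); err != nil {
			// The checkpoint still points at this batch; a retry resumes here.
			return err
		}
		cp.LastProcessedNumber = int64(end)
		data, _ := json.Marshal(cp)
		if err := rdb.Set(ctx, key, data, 2*time.Hour).Err(); err != nil {
			return err
		}
	}
	// All batches done: drop the checkpoint, as finalizeDocumentIndexing does.
	return rdb.Del(ctx, key).Err()
}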

View File

@@ -189,7 +189,7 @@ func (k *knowledgeSVC) prepareRAGDocuments(ctx context.Context, documentIDs []in
 }
 
 func (k *knowledgeSVC) queryRewriteNode(ctx context.Context, req *RetrieveContext) (newRetrieveContext *RetrieveContext, err error) {
-	if len(req.ChatHistory) == 0 {
+	if len(req.ChatHistory) == 1 {
 		// No context, no rewriting.
 		return req, nil
 	}
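The retrieval-side tweak changes the query-rewrite guard from an empty history to a single-entry history. A plausible reading (the surrounding code is not shown in this diff) is that ChatHistory now includes the current query itself, so one entry means there are no prior turns to rewrite against. A hedged sketch of that guard under this assumption:

// Assumption: chatHistory already contains the current user query, so
// only entries beyond the first represent prior conversation turns.
func hasRewriteContext(chatHistory []string) bool {
	return len(chatHistory) > 1
}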