optimize(knowledge): Optimize the index logic of the knowledge base (#841)

liuyunchao-1998 2025-08-21 17:44:26 +08:00 committed by GitHub
parent 3d53aaa785
commit 09d00c26cb
3 changed files with 454 additions and 165 deletions

View File

@ -25,6 +25,7 @@ import (
"golang.org/x/sync/errgroup"
"gorm.io/gorm"
"gorm.io/gorm/clause"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/dal/model"
@ -52,7 +53,7 @@ func (dao *KnowledgeDocumentSliceDAO) Update(ctx context.Context, slice *model.K
}
func (dao *KnowledgeDocumentSliceDAO) BatchCreate(ctx context.Context, slices []*model.KnowledgeDocumentSlice) error {
-return dao.Query.KnowledgeDocumentSlice.WithContext(ctx).CreateInBatches(slices, 100)
+return dao.Query.KnowledgeDocumentSlice.WithContext(ctx).Clauses(clause.OnConflict{UpdateAll: true}).CreateInBatches(slices, 100)
}
func (dao *KnowledgeDocumentSliceDAO) BatchSetStatus(ctx context.Context, ids []int64, status int32, reason string) error {
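The one-line BatchCreate change above is what makes replaying a batch safe: with clause.OnConflict{UpdateAll: true}, GORM emits an upsert, so slices re-created under the same primary keys overwrite the earlier rows instead of failing with a duplicate-key error. A minimal, self-contained sketch of that behavior (hypothetical Slice model and in-memory SQLite; the real DAO writes model.KnowledgeDocumentSlice in batches of 100):

package main

import (
	"fmt"

	"gorm.io/driver/sqlite"
	"gorm.io/gorm"
	"gorm.io/gorm/clause"
)

// Slice is a stand-in for model.KnowledgeDocumentSlice.
type Slice struct {
	ID      int64 `gorm:"primaryKey"`
	Content string
}

func main() {
	db, err := gorm.Open(sqlite.Open("file::memory:"), &gorm.Config{})
	if err != nil {
		panic(err)
	}
	if err := db.AutoMigrate(&Slice{}); err != nil {
		panic(err)
	}
	batch := []*Slice{{ID: 1, Content: "chunk-1"}, {ID: 2, Content: "chunk-2"}}
	// First write inserts the rows.
	db.Clauses(clause.OnConflict{UpdateAll: true}).CreateInBatches(batch, 100)
	// A retried write with the same IDs upserts instead of erroring out.
	batch[0].Content = "chunk-1-rewritten"
	db.Clauses(clause.OnConflict{UpdateAll: true}).CreateInBatches(batch, 100)

	var got Slice
	db.First(&got, 1)
	fmt.Println(got.Content) // chunk-1-rewritten
}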

View File

@ -35,12 +35,15 @@ import (
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/dal/model"
"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/events"
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
progressbarContract "github.com/coze-dev/coze-studio/backend/infra/contract/document/progressbar"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/searchstore"
"github.com/coze-dev/coze-studio/backend/infra/contract/eventbus"
"github.com/coze-dev/coze-studio/backend/infra/contract/rdb"
rdbEntity "github.com/coze-dev/coze-studio/backend/infra/contract/rdb/entity"
"github.com/coze-dev/coze-studio/backend/infra/contract/storage"
"github.com/coze-dev/coze-studio/backend/infra/impl/document/progressbar"
"github.com/coze-dev/coze-studio/backend/pkg/errorx"
"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr"
"github.com/coze-dev/coze-studio/backend/pkg/lang/slices"
"github.com/coze-dev/coze-studio/backend/pkg/logs"
"github.com/coze-dev/coze-studio/backend/types/errno"
@ -139,37 +142,98 @@ func (k *knowledgeSVC) indexDocuments(ctx context.Context, event *entity.Event)
return nil
}
type indexDocCacheRecord struct {
ProcessingIDs []int64
LastProcessedNumber int64
ParseUri string
}
const (
indexDocCacheKey = "index_doc_cache:%d:%d"
)
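indexDocCacheRecord is the checkpoint that makes indexing resumable: ParseUri points at the persisted parse output, LastProcessedNumber is the index of the first unprocessed chunk, and ProcessingIDs holds primary keys reserved for the in-flight batch. A rough standalone sketch of the round trip (go-redis and sonic assumed, matching the cacheCli calls below; the key format and the two-hour TTL are taken from this file):

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/bytedance/sonic"
	"github.com/redis/go-redis/v9"
)

type indexDocCacheRecord struct {
	ProcessingIDs       []int64
	LastProcessedNumber int64
	ParseUri            string
}

func main() {
	ctx := context.Background()
	cli := redis.NewClient(&redis.Options{Addr: "localhost:6379"})
	key := fmt.Sprintf("index_doc_cache:%d:%d", 7001, 42) // knowledge_id, document_id

	// Checkpoint after a batch: 200 chunks are durable, parse output persisted.
	rec := &indexDocCacheRecord{
		LastProcessedNumber: 200,
		ParseUri:            "FileBizType.Knowledge/1001_42.txt",
	}
	data, err := sonic.Marshal(rec)
	if err != nil {
		panic(err)
	}
	if err := cli.Set(ctx, key, data, 2*time.Hour).Err(); err != nil {
		panic(err)
	}

	// A retried event reads the record back and resumes at chunk 200 instead
	// of re-parsing and re-indexing the whole document.
	val, err := cli.Get(ctx, key).Result()
	if err == nil {
		var resumed indexDocCacheRecord
		if err := sonic.UnmarshalString(val, &resumed); err == nil {
			fmt.Println("resume from chunk", resumed.LastProcessedNumber)
		}
	}
}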
// indexDocument handles the indexing of a document into the knowledge system
func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (err error) {
doc := event.Document
if doc == nil {
-return errorx.New(errno.ErrKnowledgeNonRetryableCode, errorx.KV("reason", "[indexDocument] document not provided"))
+return errorx.New(errno.ErrKnowledgeNonRetryableCode,
+errorx.KV("reason", "[indexDocument] document not provided"))
}
// Indexing must be idempotent: the same document can be written twice when
// 1. index operations on the same document run concurrently in the retry queue and the ordinary queue (observed when a backend bugfix went live), or
// 2. a rebalance causes the same message to be consumed repeatedly.
-// check knowledge and document status
-if valid, err := k.isWritableKnowledgeAndDocument(ctx, doc.KnowledgeID, doc.ID); err != nil {
-return err
-} else if !valid {
-return errorx.New(errno.ErrKnowledgeNonRetryableCode,
-errorx.KVf("reason", "[indexDocument] not writable, knowledge_id=%d, document_id=%d", event.KnowledgeID, doc.ID))
-}
-defer func() {
-if e := recover(); e != nil {
-err = errorx.New(errno.ErrKnowledgeSystemCode, errorx.KV("msg", fmt.Sprintf("panic: %v", e)))
-logs.CtxErrorf(ctx, "[indexDocument] panic, err: %v", err)
-if setStatusErr := k.documentRepo.SetStatus(ctx, event.Document.ID, int32(entity.DocumentStatusFailed), err.Error()); setStatusErr != nil {
-logs.CtxErrorf(ctx, "[indexDocument] set document status failed, err: %v", setStatusErr)
-}
// Validate document and knowledge status
var valid bool
if valid, err = k.validateDocumentStatus(ctx, doc); err != nil || !valid {
return
}
// Setup error handling and recovery
defer k.handleIndexingErrors(ctx, event, &err)
// Start indexing process
if err = k.beginIndexingProcess(ctx, doc); err != nil {
return
}
// Process document parsing and chunking
var parseResult []*schema.Document
var cacheRecord *indexDocCacheRecord
parseResult, cacheRecord, err = k.processDocumentParsing(ctx, doc)
if err != nil {
-var errMsg string
-var statusError errorx.StatusError
return
}
if cacheRecord.LastProcessedNumber == 0 {
if err = k.cleanupPreviousProcessing(ctx, doc); err != nil {
return
}
}
// Handle table-type documents specially
if doc.Type == knowledge.DocumentTypeTable {
if parseResult, err = k.handleTableDocument(ctx, doc, parseResult); err != nil {
return
}
}
// Process document chunks in batches
if err = k.processDocumentChunks(ctx, doc, parseResult, cacheRecord); err != nil {
return
}
// Finalize document indexing
err = k.finalizeDocumentIndexing(ctx, event.Document.KnowledgeID, event.Document.ID)
return
}
// validateDocumentStatus checks if the document can be indexed
func (k *knowledgeSVC) validateDocumentStatus(ctx context.Context, doc *entity.Document) (bool, error) {
valid, err := k.isWritableKnowledgeAndDocument(ctx, doc.KnowledgeID, doc.ID)
if err != nil {
return false, err
}
if !valid {
return false, errorx.New(errno.ErrKnowledgeNonRetryableCode,
errorx.KVf("reason", "[indexDocument] not writable, knowledge_id=%d, document_id=%d",
doc.KnowledgeID, doc.ID))
}
return true, nil
}
// handleIndexingErrors manages errors and recovery during indexing
func (k *knowledgeSVC) handleIndexingErrors(ctx context.Context, event *entity.Event, err *error) {
if e := recover(); e != nil {
// Assign through the pointer so the caller's named return sees the panic error
*err = errorx.New(errno.ErrKnowledgeSystemCode,
errorx.KV("msg", fmt.Sprintf("panic: %v", e)))
logs.CtxErrorf(ctx, "[indexDocument] panic, err: %v", ptr.From(err))
k.setDocumentStatus(ctx, event.Document.ID,
int32(entity.DocumentStatusFailed), ptr.From(err).Error())
return
}
if ptr.From(err) != nil {
var status int32
var errMsg string
var statusError errorx.StatusError
-if errors.As(err, &statusError) {
+if errors.As(ptr.From(err), &statusError) {
errMsg = errorx.ErrorWithoutStack(statusError)
if statusError.Code() == errno.ErrKnowledgeNonRetryableCode {
status = int32(entity.DocumentStatusFailed)
@ -177,20 +241,270 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
status = int32(entity.DocumentStatusChunking)
}
} else {
-errMsg = err.Error()
+errMsg = ptr.From(err).Error()
status = int32(entity.DocumentStatusChunking)
}
-if setStatusErr := k.documentRepo.SetStatus(ctx, event.Document.ID, status, errMsg); setStatusErr != nil {
-logs.CtxErrorf(ctx, "[indexDocument] set document status failed, err: %v", setStatusErr)
-}
-}
-}()
-// clear
+k.setDocumentStatus(ctx, event.Document.ID, status, errMsg)
}
}
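The status chosen in handleIndexingErrors drives the retry behavior: only ErrKnowledgeNonRetryableCode marks the document Failed; any other error leaves it in Chunking so the event can be consumed again and resume from the checkpoint. A toy condensation of that decision (stand-in codes and status names, not the real errno values):

package main

import "fmt"

const errNonRetryable = 1 // stand-in for errno.ErrKnowledgeNonRetryableCode

func statusFor(code int) string {
	if code == errNonRetryable {
		return "Failed" // terminal: do not retry
	}
	return "Chunking" // transient: leave the document retryable
}

func main() {
	fmt.Println(statusFor(errNonRetryable)) // Failed
	fmt.Println(statusFor(42))              // Chunking
}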
// beginIndexingProcess starts the indexing process
func (k *knowledgeSVC) beginIndexingProcess(ctx context.Context, doc *entity.Document) error {
err := k.documentRepo.SetStatus(ctx, doc.ID, int32(entity.DocumentStatusChunking), "")
if err != nil {
return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("set document status failed, err: %v", err)))
}
return nil
}
// processDocumentParsing handles document parsing and caching
func (k *knowledgeSVC) processDocumentParsing(ctx context.Context, doc *entity.Document) (
[]*schema.Document, *indexDocCacheRecord, error) {
cacheKey := fmt.Sprintf(indexDocCacheKey, doc.KnowledgeID, doc.ID)
cacheRecord := &indexDocCacheRecord{}
// Try to get cached parse results
val, err := k.cacheCli.Get(ctx, cacheKey).Result()
if err == nil {
if err = sonic.UnmarshalString(val, &cacheRecord); err != nil {
return nil, nil, errorx.New(errno.ErrKnowledgeParseJSONCode,
errorx.KV("msg", fmt.Sprintf("parse cache record failed, err: %v", err)))
}
}
// Parse document if not cached
if err != nil || len(cacheRecord.ParseUri) == 0 {
return k.parseAndCacheDocument(ctx, doc, cacheRecord, cacheKey)
}
// Load parse results from cache
return k.loadParsedDocument(ctx, cacheRecord)
}
// parseAndCacheDocument parses the document and caches the results
func (k *knowledgeSVC) parseAndCacheDocument(ctx context.Context, doc *entity.Document,
cacheRecord *indexDocCacheRecord, cacheKey string) ([]*schema.Document, *indexDocCacheRecord, error) {
// Get document content from storage
bodyBytes, err := k.storage.GetObject(ctx, doc.URI)
if err != nil {
return nil, nil, errorx.New(errno.ErrKnowledgeGetObjectFailCode,
errorx.KV("msg", fmt.Sprintf("get object failed, err: %v", err)))
}
// Get appropriate parser for document type
docParser, err := k.parseManager.GetParser(convert.DocumentToParseConfig(doc))
if err != nil {
return nil, nil, errorx.New(errno.ErrKnowledgeGetParserFailCode,
errorx.KV("msg", fmt.Sprintf("get parser failed, err: %v", err)))
}
// Parse document content
parseResult, err := docParser.Parse(ctx, bytes.NewReader(bodyBytes), parser.WithExtraMeta(map[string]any{
document.MetaDataKeyCreatorID: doc.CreatorID,
document.MetaDataKeyExternalStorage: map[string]any{
"document_id": doc.ID,
},
}))
if err != nil {
return nil, nil, errorx.New(errno.ErrKnowledgeParserParseFailCode,
errorx.KV("msg", fmt.Sprintf("parse document failed, err: %v", err)))
}
// Cache parse results
if err := k.cacheParseResults(ctx, doc, parseResult, cacheRecord, cacheKey); err != nil {
return nil, nil, err
}
return parseResult, cacheRecord, nil
}
// cacheParseResults stores parse results in persistent storage and cache
func (k *knowledgeSVC) cacheParseResults(ctx context.Context, doc *entity.Document,
parseResult []*schema.Document, cacheRecord *indexDocCacheRecord, cacheKey string) error {
parseResultData, err := sonic.Marshal(parseResult)
if err != nil {
return errorx.New(errno.ErrKnowledgeParseJSONCode,
errorx.KV("msg", fmt.Sprintf("marshal parse result failed, err: %v", err)))
}
fileName := fmt.Sprintf("FileBizType.Knowledge/%d_%d.txt", doc.CreatorID, doc.ID)
if err = k.storage.PutObject(ctx, fileName, parseResultData); err != nil {
return errorx.New(errno.ErrKnowledgePutObjectFailCode,
errorx.KV("msg", fmt.Sprintf("put object failed, err: %v", err)))
}
cacheRecord.ParseUri = fileName
return k.recordIndexDocumentStatus(ctx, cacheRecord, cacheKey)
}
// loadParsedDocument loads previously parsed document from cache
func (k *knowledgeSVC) loadParsedDocument(ctx context.Context,
cacheRecord *indexDocCacheRecord) ([]*schema.Document, *indexDocCacheRecord, error) {
data, err := k.storage.GetObject(ctx, cacheRecord.ParseUri)
if err != nil {
return nil, nil, errorx.New(errno.ErrKnowledgeGetObjectFailCode,
errorx.KV("msg", fmt.Sprintf("get object failed, err: %v", err)))
}
var parseResult []*schema.Document
if err = sonic.Unmarshal(data, &parseResult); err != nil {
return nil, nil, errorx.New(errno.ErrKnowledgeParseJSONCode,
errorx.KV("msg", fmt.Sprintf("unmarshal parse result failed, err: %v", err)))
}
return parseResult, cacheRecord, nil
}
// handleTableDocument handles special processing for table-type documents and
// returns the parse result, cleared to nil when the table has columns only,
// so the caller actually observes the cleared value (assigning to the slice
// parameter alone would not propagate)
func (k *knowledgeSVC) handleTableDocument(ctx context.Context,
doc *entity.Document, parseResult []*schema.Document) ([]*schema.Document, error) {
noData, err := document.GetDocumentsColumnsOnly(parseResult)
if err != nil {
return nil, errorx.New(errno.ErrKnowledgeNonRetryableCode,
errorx.KVf("reason", "[indexDocument] get table data status failed, err: %v", err))
}
if noData {
return nil, nil // clear parse result: the table has column definitions but no rows
}
return parseResult, nil
}
// processDocumentChunks processes document chunks in batches
func (k *knowledgeSVC) processDocumentChunks(ctx context.Context,
doc *entity.Document, parseResult []*schema.Document, cacheRecord *indexDocCacheRecord) error {
batchSize := 100
progressBar := progressbar.NewProgressBar(ctx, doc.ID,
int64(len(parseResult)*len(k.searchStoreManagers)), k.cacheCli, true)
if err := progressBar.AddN(int(cacheRecord.LastProcessedNumber) * len(k.searchStoreManagers)); err != nil {
return errorx.New(errno.ErrKnowledgeSystemCode,
errorx.KV("msg", fmt.Sprintf("add progress bar failed, err: %v", err)))
}
// Process chunks in batches
for i := int(cacheRecord.LastProcessedNumber); i < len(parseResult); i += batchSize {
chunks := parseResult[i:min(i+batchSize, len(parseResult))]
if err := k.batchProcessSlice(ctx, doc, i, chunks, cacheRecord, progressBar); err != nil {
return err
}
}
return nil
}
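Because the loop starts at LastProcessedNumber and AddN pre-fills the progress bar, a resumed run skips everything already flushed. A tiny worked example of the arithmetic (assumed sizes; min is the Go 1.21 builtin, as in the loop above):

package main

import "fmt"

func main() {
	const batchSize = 100
	chunks := 250        // len(parseResult)
	lastProcessed := 200 // cacheRecord.LastProcessedNumber from the cache

	for i := lastProcessed; i < chunks; i += batchSize {
		end := min(i+batchSize, chunks)
		fmt.Printf("process chunks [%d, %d)\n", i, end) // only [200, 250) runs
	}
}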
// finalizeDocumentIndexing completes the document indexing process
func (k *knowledgeSVC) finalizeDocumentIndexing(ctx context.Context, knowledgeID, documentID int64) error {
if err := k.documentRepo.SetStatus(ctx, documentID, int32(entity.DocumentStatusEnable), ""); err != nil {
return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("set document status failed, err: %v", err)))
}
if err := k.documentRepo.UpdateDocumentSliceInfo(ctx, documentID); err != nil {
return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("update document slice info failed, err: %v", err)))
}
if err := k.cacheCli.Del(ctx, fmt.Sprintf(indexDocCacheKey, knowledgeID, documentID)).Err(); err != nil {
return errorx.New(errno.ErrKnowledgeSystemCode, errorx.KV("msg", fmt.Sprintf("del cache failed, err: %v", err)))
}
return nil
}
// batchProcessSlice processes a batch of document slices
func (k *knowledgeSVC) batchProcessSlice(ctx context.Context, doc *entity.Document,
startIdx int, parseResult []*schema.Document, cacheRecord *indexDocCacheRecord,
progressBar progressbarContract.ProgressBar) error {
collectionName := getCollectionName(doc.KnowledgeID)
length := len(parseResult)
var ids []int64
var err error
// Generate IDs for this batch
if len(cacheRecord.ProcessingIDs) == 0 {
ids, err = k.genMultiIDs(ctx, length)
if err != nil {
return err
}
} else {
ids = cacheRecord.ProcessingIDs
}
for idx := range parseResult {
parseResult[idx].ID = strconv.FormatInt(ids[idx], 10)
}
// Update cache record with processing IDs
cacheRecord.ProcessingIDs = ids
if err := k.recordIndexDocumentStatus(ctx, cacheRecord,
fmt.Sprintf(indexDocCacheKey, doc.KnowledgeID, doc.ID)); err != nil {
return err
}
-if !doc.IsAppend {
-if doc.Type != knowledge.DocumentTypeImage {
// Convert documents to slices
sliceEntities, err := k.convertToSlices(doc, parseResult)
if err != nil {
return err
}
// Handle table-type documents
if doc.Type == knowledge.DocumentTypeTable {
if err := k.upsertDataToTable(ctx, &doc.TableInfo, sliceEntities); err != nil {
logs.CtxErrorf(ctx, "[indexDocument] insert data to table failed, err: %v", err)
return err
}
}
// Store slices in database
if err := k.storeSlicesInDB(ctx, doc, parseResult, startIdx, ids); err != nil {
return err
}
// Index slices in search stores
if err := k.indexSlicesInSearchStores(ctx, doc, collectionName, sliceEntities,
cacheRecord, progressBar); err != nil {
return err
}
// Update cache record after successful processing
cacheRecord.LastProcessedNumber = int64(startIdx) + int64(length)
cacheRecord.ProcessingIDs = nil
// Mark slices as done
err = k.sliceRepo.BatchSetStatus(ctx, ids, int32(model.SliceStatusDone), "")
if err != nil {
return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("batch set slice status failed, err: %v", err)))
}
if err := k.recordIndexDocumentStatus(ctx, cacheRecord,
fmt.Sprintf(indexDocCacheKey, doc.KnowledgeID, doc.ID)); err != nil {
return err
}
return nil
}
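The order of operations in batchProcessSlice is the crux of crash safety: slice IDs are checkpointed (ProcessingIDs) before any write, so a retry reuses the same primary keys and the OnConflict upsert in BatchCreate absorbs the duplicate write; only after the batch is durable are ProcessingIDs cleared and LastProcessedNumber advanced. A stripped-down sketch of that protocol (all helpers hypothetical):

package main

import "fmt"

type record struct {
	ProcessingIDs       []int64
	LastProcessedNumber int64
}

var nextID int64 = 1000

func genIDs(n int) []int64 {
	ids := make([]int64, n)
	for i := range ids {
		ids[i] = nextID
		nextID++
	}
	return ids
}

func checkpoint(r *record) { fmt.Printf("checkpoint: %+v\n", *r) }

// processBatch mirrors the order of operations in batchProcessSlice.
func processBatch(rec *record, batchLen int) {
	ids := rec.ProcessingIDs
	if len(ids) == 0 {
		ids = genIDs(batchLen)
		rec.ProcessingIDs = ids
		checkpoint(rec) // 1. persist the reserved IDs BEFORE any write
	}
	// 2. write slices and vectors under ids; a crash here is retried with the
	//    same IDs, and the OnConflict upsert absorbs the duplicate write.
	rec.LastProcessedNumber += int64(batchLen)
	rec.ProcessingIDs = nil
	checkpoint(rec) // 3. batch durable: advance the cursor, release the IDs
}

func main() {
	rec := &record{}
	processBatch(rec, 100)
	processBatch(rec, 100)
}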
// convertToSlices converts parsed documents to slice entities
func (k *knowledgeSVC) convertToSlices(doc *entity.Document, parseResult []*schema.Document) ([]*entity.Slice, error) {
convertFn := d2sMapping[doc.Type]
if convertFn == nil {
return nil, errorx.New(errno.ErrKnowledgeSystemCode,
errorx.KV("msg", "convertFn is empty"))
}
return slices.TransformWithErrorCheck(parseResult, func(a *schema.Document) (*entity.Slice, error) {
return convertFn(a, doc.KnowledgeID, doc.ID, doc.CreatorID)
})
}
// cleanupPreviousProcessing cleans up partially processed data from previous attempts
func (k *knowledgeSVC) cleanupPreviousProcessing(ctx context.Context, doc *entity.Document) error {
collectionName := getCollectionName(doc.KnowledgeID)
if doc.IsAppend || doc.Type == knowledge.DocumentTypeImage {
return nil
}
ids, err := k.sliceRepo.GetDocumentSliceIDs(ctx, []int64{doc.ID})
if err != nil {
return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("get document slice ids failed, err: %v", err)))
@ -199,12 +513,13 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
if err = k.sliceRepo.DeleteByDocument(ctx, doc.ID); err != nil {
return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("delete document slice failed, err: %v", err)))
}
for _, manager := range k.searchStoreManagers {
s, err := manager.GetSearchStore(ctx, collectionName)
if err != nil {
return errorx.New(errno.ErrKnowledgeSearchStoreCode, errorx.KV("msg", fmt.Sprintf("get search store failed, err: %v", err)))
}
-if err := s.Delete(ctx, slices.Transform(event.SliceIDs, func(id int64) string {
+if err := s.Delete(ctx, slices.Transform(ids, func(id int64) string {
return strconv.FormatInt(id, 10)
})); err != nil {
logs.Errorf("[indexDocument] delete knowledge failed, err: %v", err)
@ -212,94 +527,42 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
}
}
}
}
}
-// set chunk status
-if err = k.documentRepo.SetStatus(ctx, doc.ID, int32(entity.DocumentStatusChunking), ""); err != nil {
-return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("set document status failed, err: %v", err)))
-}
-// parse & chunk
-bodyBytes, err := k.storage.GetObject(ctx, doc.URI)
-if err != nil {
-return errorx.New(errno.ErrKnowledgeGetObjectFailCode, errorx.KV("msg", fmt.Sprintf("get object failed, err: %v", err)))
-}
-docParser, err := k.parseManager.GetParser(convert.DocumentToParseConfig(doc))
-if err != nil {
-return errorx.New(errno.ErrKnowledgeGetParserFailCode, errorx.KV("msg", fmt.Sprintf("get parser failed, err: %v", err)))
-}
-parseResult, err := docParser.Parse(ctx, bytes.NewReader(bodyBytes), parser.WithExtraMeta(map[string]any{
-document.MetaDataKeyCreatorID: doc.CreatorID,
-document.MetaDataKeyExternalStorage: map[string]any{
-"document_id": doc.ID,
-},
-}))
-if err != nil {
-return errorx.New(errno.ErrKnowledgeParserParseFailCode, errorx.KV("msg", fmt.Sprintf("parse document failed, err: %v", err)))
-}
-if doc.Type == knowledge.DocumentTypeTable {
-noData, err := document.GetDocumentsColumnsOnly(parseResult)
-if err != nil { // unexpected
-return errorx.New(errno.ErrKnowledgeNonRetryableCode,
-errorx.KVf("reason", "[indexDocument] get table data status failed, err: %v", err))
-}
-if noData {
-parseResult = nil // clear parse result
-}
-}
-// set id
-allIDs := make([]int64, 0, len(parseResult))
-for l := 0; l < len(parseResult); l += 100 {
-r := min(l+100, len(parseResult))
-batchSize := r - l
-ids, err := k.idgen.GenMultiIDs(ctx, batchSize)
-if err != nil {
-return errorx.New(errno.ErrKnowledgeIDGenCode, errorx.KV("msg", fmt.Sprintf("GenMultiIDs failed, err: %v", err)))
-}
-allIDs = append(allIDs, ids...)
-for i := 0; i < batchSize; i++ {
-id := ids[i]
-index := l + i
-parseResult[index].ID = strconv.FormatInt(id, 10)
-}
-}
-convertFn := d2sMapping[doc.Type]
-if convertFn == nil {
-return errorx.New(errno.ErrKnowledgeSystemCode, errorx.KV("msg", "convertFn is empty"))
-}
-sliceEntities, err := slices.TransformWithErrorCheck(parseResult, func(a *schema.Document) (*entity.Slice, error) {
-return convertFn(a, doc.KnowledgeID, doc.ID, doc.CreatorID)
-})
-if err != nil {
-return errorx.New(errno.ErrKnowledgeSystemCode, errorx.KV("msg", fmt.Sprintf("convert document failed, err: %v", err)))
-}
+_, err := k.rdb.DeleteData(ctx, &rdb.DeleteDataRequest{
+TableName: doc.TableInfo.PhysicalTableName,
+Where: &rdb.ComplexCondition{
+Conditions: []*rdb.Condition{
+{
+Field: consts.RDBFieldID,
+Operator: rdbEntity.OperatorIn,
+Value: ids,
+},
+},
+},
+})
+if err != nil {
+logs.CtxErrorf(ctx, "delete data failed, err: %v", err)
+return errorx.New(errno.ErrKnowledgeCrossDomainCode, errorx.KV("msg", err.Error()))
+}
}
}
return nil
}
-// save slices
-if doc.Type == knowledge.DocumentTypeTable {
-// Table type to insert data into a database
-err = k.upsertDataToTable(ctx, &doc.TableInfo, sliceEntities)
-if err != nil {
-logs.CtxErrorf(ctx, "[indexDocument] insert data to table failed, err: %v", err)
-return err
-}
-}
// storeSlicesInDB stores slice data in the database
func (k *knowledgeSVC) storeSlicesInDB(ctx context.Context, doc *entity.Document,
parseResult []*schema.Document, startIdx int, ids []int64) error {
var seqOffset float64
var err error
if doc.IsAppend {
seqOffset, err = k.sliceRepo.GetLastSequence(ctx, doc.ID)
if err != nil {
return errorx.New(errno.ErrKnowledgeDBCode,
errorx.KV("msg", fmt.Sprintf("get last sequence failed, err: %v", err)))
}
seqOffset += 1
}
if doc.Type == knowledge.DocumentTypeImage {
if len(parseResult) != 0 {
slices, _, err := k.sliceRepo.FindSliceByCondition(ctx, &entity.WhereSliceOpt{DocumentID: doc.ID})
@ -332,16 +595,17 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("update slice failed, err: %v", err)))
}
}
-} else {
+return nil
}
sliceModels := make([]*model.KnowledgeDocumentSlice, 0, len(parseResult))
for i, src := range parseResult {
now := time.Now().UnixMilli()
sliceModel := &model.KnowledgeDocumentSlice{
-ID: allIDs[i],
+ID: ids[i],
KnowledgeID: doc.KnowledgeID,
DocumentID: doc.ID,
Content: parseResult[i].Content,
-Sequence: seqOffset + float64(i),
+Sequence: seqOffset + float64(i+startIdx),
CreatedAt: now,
UpdatedAt: now,
CreatorID: doc.CreatorID,
@ -349,84 +613,108 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
Status: int32(model.SliceStatusProcessing),
FailReason: "",
}
if doc.Type == knowledge.DocumentTypeTable {
convertFn := d2sMapping[doc.Type]
sliceEntity, err := convertFn(src, doc.KnowledgeID, doc.ID, doc.CreatorID)
if err != nil {
logs.CtxErrorf(ctx, "[indexDocument] convert document failed, err: %v", err)
return errorx.New(errno.ErrKnowledgeSystemCode,
errorx.KV("msg", fmt.Sprintf("convert document failed, err: %v", err)))
}
sliceModel.Content = sliceEntity.GetSliceContent()
}
sliceModels = append(sliceModels, sliceModel)
}
-if err = k.sliceRepo.BatchCreate(ctx, sliceModels); err != nil {
-return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("batch create slice failed, err: %v", err)))
-}
-}
-defer func() {
-if err != nil { // set slice status
-if setStatusErr := k.sliceRepo.BatchSetStatus(ctx, allIDs, int32(model.SliceStatusFailed), err.Error()); setStatusErr != nil {
-logs.CtxErrorf(ctx, "[indexDocument] set slice status failed, err: %v", setStatusErr)
-}
-}
-}()
+err = k.sliceRepo.BatchCreate(ctx, sliceModels)
+if err != nil {
+return errorx.New(errno.ErrKnowledgeDBCode,
+errorx.KV("msg", fmt.Sprintf("batch create slice failed, err: %v", err)))
+}
return nil
}
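The Sequence change above is easy to miss: i is the offset within the current batch, so without startIdx a resumed batch would restart numbering at seqOffset. A worked example with assumed values:

package main

import "fmt"

func main() {
	seqOffset := 51.0 // append mode: GetLastSequence returned 50, plus 1
	startIdx := 200   // this batch resumed at chunk 200
	for i := 0; i < 3; i++ {
		fmt.Println(seqOffset + float64(i+startIdx)) // 251, 252, 253 — not 51, 52, 53
	}
}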
// indexSlicesInSearchStores indexes slices in appropriate search stores
func (k *knowledgeSVC) indexSlicesInSearchStores(ctx context.Context, doc *entity.Document,
collectionName string, sliceEntities []*entity.Slice, cacheRecord *indexDocCacheRecord,
progressBar progressbarContract.ProgressBar) error {
// to vectorstore
fields, err := k.mapSearchFields(doc)
if err != nil {
return err
}
indexingFields := getIndexingFields(fields)
// reformat docs, mainly for enableCompactTable
// Convert slices to search documents
ssDocs, err := slices.TransformWithErrorCheck(sliceEntities, func(a *entity.Slice) (*schema.Document, error) {
return k.slice2Document(ctx, doc, a)
})
if err != nil {
return errorx.New(errno.ErrKnowledgeSystemCode,
errorx.KV("msg", fmt.Sprintf("reformat document failed, err: %v", err)))
}
-progressbar := progressbar.NewProgressBar(ctx, doc.ID, int64(len(ssDocs)*len(k.searchStoreManagers)), k.cacheCli, true)
// Skip if it's an image document with empty content
if doc.Type == knowledge.DocumentTypeImage && len(ssDocs) == 1 && len(ssDocs[0].Content) == 0 {
return nil
}
// Index in each search store manager
for _, manager := range k.searchStoreManagers {
now := time.Now()
-if err = manager.Create(ctx, &searchstore.CreateRequest{
+if err := manager.Create(ctx, &searchstore.CreateRequest{
CollectionName: collectionName,
Fields: fields,
CollectionMeta: nil,
}); err != nil {
-return errorx.New(errno.ErrKnowledgeSearchStoreCode, errorx.KV("msg", fmt.Sprintf("create search store failed, err: %v", err)))
+return errorx.New(errno.ErrKnowledgeSearchStoreCode,
+errorx.KV("msg", fmt.Sprintf("create search store failed, err: %v", err)))
}
-// Image knowledge bases have kn:doc:slice = 1:n:n, and the content may be empty, in which case there is nothing to write
-if doc.Type == knowledge.DocumentTypeImage && len(ssDocs) == 1 && len(ssDocs[0].Content) == 0 {
-continue
-}
ss, err := manager.GetSearchStore(ctx, collectionName)
if err != nil {
return errorx.New(errno.ErrKnowledgeSearchStoreCode,
errorx.KV("msg", fmt.Sprintf("get search store failed, err: %v", err)))
}
if _, err = ss.Store(ctx, ssDocs,
searchstore.WithIndexerPartitionKey(fieldNameDocumentID),
searchstore.WithPartition(strconv.FormatInt(doc.ID, 10)),
searchstore.WithIndexingFields(indexingFields),
-searchstore.WithProgressBar(progressbar),
+searchstore.WithProgressBar(progressBar),
); err != nil {
return errorx.New(errno.ErrKnowledgeSearchStoreCode,
errorx.KV("msg", fmt.Sprintf("store search store failed, err: %v", err)))
}
logs.CtxDebugf(ctx, "[indexDocument] ss type=%v, len(docs)=%d, finished after %d ms",
manager.GetType(), len(ssDocs), time.Now().Sub(now).Milliseconds())
if err := k.recordIndexDocumentStatus(ctx, cacheRecord,
fmt.Sprintf(indexDocCacheKey, doc.KnowledgeID, doc.ID)); err != nil {
return err
}
}
-// set slice status
-if err = k.sliceRepo.BatchSetStatus(ctx, allIDs, int32(model.SliceStatusDone), ""); err != nil {
-return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("batch set slice status failed, err: %v", err)))
-}
-// set document status
return nil
}
-if err = k.documentRepo.SetStatus(ctx, doc.ID, int32(entity.DocumentStatusEnable), ""); err != nil {
-return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("set document status failed, err: %v", err)))
-}
-if err = k.documentRepo.UpdateDocumentSliceInfo(ctx, event.Document.ID); err != nil {
-return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", fmt.Sprintf("update document slice info failed, err: %v", err)))
-}
// setDocumentStatus updates document status with error handling
func (k *knowledgeSVC) setDocumentStatus(ctx context.Context, docID int64, status int32, errMsg string) {
if setStatusErr := k.documentRepo.SetStatus(ctx, docID, status, errMsg); setStatusErr != nil {
logs.CtxErrorf(ctx, "[indexDocument] set document status failed, err: %v", setStatusErr)
}
}
func (k *knowledgeSVC) recordIndexDocumentStatus(ctx context.Context, r *indexDocCacheRecord, cacheKey string) error {
data, err := sonic.Marshal(r)
if err != nil {
return errorx.New(errno.ErrKnowledgeParseJSONCode, errorx.KV("msg", fmt.Sprintf("marshal cache record failed, err: %v", err)))
}
err = k.cacheCli.Set(ctx, cacheKey, data, time.Hour*2).Err()
if err != nil {
return errorx.New(errno.ErrKnowledgeCacheClientSetFailCode, errorx.KV("msg", fmt.Sprintf("set cache failed, err: %v", err)))
}
return nil
}

View File

@ -189,7 +189,7 @@ func (k *knowledgeSVC) prepareRAGDocuments(ctx context.Context, documentIDs []in
}
func (k *knowledgeSVC) queryRewriteNode(ctx context.Context, req *RetrieveContext) (newRetrieveContext *RetrieveContext, err error) {
-if len(req.ChatHistory) == 0 {
+if len(req.ChatHistory) == 1 {
// No context, no rewriting.
return req, nil
}
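For the retrieval change, the guard moving from == 0 to == 1 suggests ChatHistory includes the current query itself, so a length of one means there are no earlier turns to rewrite against. A toy illustration under that assumption (hypothetical helper, not the real RetrieveContext API):

package main

import "fmt"

// needsRewrite assumes chatHistory carries the current query as its last element.
func needsRewrite(chatHistory []string) bool {
	return len(chatHistory) > 1 // rewrite only when prior turns provide context
}

func main() {
	fmt.Println(needsRewrite([]string{"current query"}))                 // false: no context
	fmt.Println(needsRewrite([]string{"earlier turn", "current query"})) // true
}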