chore: replace all cn comments to en version by volc api (#313)

This commit is contained in:
tecvan
2025-07-31 15:18:11 +08:00
committed by GitHub
parent 91d6cdb430
commit 5abc63fba6
254 changed files with 5899 additions and 5844 deletions

View File

@@ -19,14 +19,14 @@ package entity
type DocumentStatus int64
const (
DocumentStatusInit DocumentStatus = -1 // 初始化
DocumentStatusUploading DocumentStatus = 0 // 上传中
DocumentStatusEnable DocumentStatus = 1 // 生效
DocumentStatusDisable DocumentStatus = 2 // 失效
DocumentStatusDeleted DocumentStatus = 3 // 已删除
DocumentStatusChunking DocumentStatus = 4 // 切片中
// DocumentStatusRefreshing DocumentStatus = 5 // 刷新中
DocumentStatusFailed DocumentStatus = 9 // 失败
DocumentStatusInit DocumentStatus = -1 // initialization
DocumentStatusUploading DocumentStatus = 0 // Uploading
DocumentStatusEnable DocumentStatus = 1 // enabled (in effect)
DocumentStatusDisable DocumentStatus = 2 // disabled (no longer in effect)
DocumentStatusDeleted DocumentStatus = 3 // deleted
DocumentStatusChunking DocumentStatus = 4 // Slicing
// DocumentStatusRefreshing DocumentStatus = 5 // Refreshing
DocumentStatusFailed DocumentStatus = 9 // fail
)
func (s DocumentStatus) String() string {
@@ -44,7 +44,7 @@ func (s DocumentStatus) String() string {
case DocumentStatusChunking:
return "切片中"
// case DocumentStatusRefreshing:
// return "刷新中"
// return "刷新中" // "Refreshing"
case DocumentStatusFailed:
return "失败"
default:
@@ -55,6 +55,6 @@ func (s DocumentStatus) String() string {
type DocumentSource int64
const (
DocumentSourceLocal DocumentSource = 0 // 本地文件上传
DocumentSourceCustom DocumentSource = 2 // 自定义文本
DocumentSourceLocal DocumentSource = 0 // local file upload
DocumentSourceCustom DocumentSource = 2 // custom text
)

View File

@@ -27,25 +27,25 @@ type Document struct {
KnowledgeID int64
Type knowledge.DocumentType
RawContent string // 用户自定义的原始内容
URI string // 文档 uri
URL string // 文档 url
Size int64 // 文档 bytes
SliceCount int64 // slice 数量
CharCount int64 // 文档字符数
FileExtension parser.FileExtension // 文档后缀, csv/pdf...
Status DocumentStatus // 文档状态
StatusMsg string // 文档状态详细信息
Hits int64 // 命中次数
Source DocumentSource // 文档来源
ParsingStrategy *ParsingStrategy // 解析策略
ChunkingStrategy *ChunkingStrategy // 分段策略
RawContent string // User-defined original content
URI string // Document URI
URL string // Document URL
Size int64 // Document bytes
SliceCount int64 // Number of slices
CharCount int64 // Number of document characters
FileExtension parser.FileExtension // Document suffix, csv/pdf...
Status DocumentStatus // Document Status
StatusMsg string // Document Status Details
Hits int64 // hit count
Source DocumentSource // document source
ParsingStrategy *ParsingStrategy // parsing strategy
ChunkingStrategy *ChunkingStrategy // segmentation strategy
TableInfo TableInfo
IsAppend bool // 是否在表格中追加
IsAppend bool // Whether to append to the table
// LevelURI string // 层级分段预览 uri
// PreviewURI string // 预览 uri
// LevelURI string // Hierarchical segmentation preview URI
// PreviewURI string // Preview URI
}
type TableInfo struct {
@@ -56,18 +56,18 @@ type TableInfo struct {
}
type TableSheet struct {
SheetId int64 // sheet id
HeaderLineIdx int64 // 表头行
StartLineIdx int64 // 数据起始行
SheetName string // sheet的名称
TotalRows int64 // 总行数
HeaderLineIdx int64 // header row
StartLineIdx int64 // Data start row
SheetName string // Name of sheet
TotalRows int64 // total number of rows
}
type TableColumn struct {
ID int64
Name string
Type document.TableColumnType
Description string
Indexing bool // 是否索引
Sequence int64 // 表格中的原始序号
Indexing bool // whether to index
Sequence int64 // The original serial number in the table
}
type WhereDocumentOpt struct {

View File

@@ -29,20 +29,20 @@ type Event struct {
type EventType string
// 文档 event
// 切分 + 写入向量库操作事务性由实现自行保证
// Document events.
// The implementation itself is responsible for making the split + write-to-vector-store operation transactional.
const (
EventTypeIndexDocuments EventType = "index_documents"
// EventTypeIndexDocument 文档信息已写入 orm逻辑中需要解析+切分+搜索数据入库
// EventTypeIndexDocument: document information has already been written via the ORM; the handler must parse, split, and store the search data
// Event requires: Event.Document
EventTypeIndexDocument EventType = "index_document"
// EventTypeIndexSlice 切片信息已写入 orm逻辑中仅写入搜索数据
// EventTypeIndexSlice: slice information has already been written via the ORM; the handler only writes the search data
// Event requires: Event.Slice
EventTypeIndexSlice EventType = "index_slice"
// EventTypeDeleteKnowledgeData 删除 knowledge
// EventTypeDeleteKnowledgeData: delete knowledge data
// Event requires: Event.KnowledgeID, Event.SliceIDs
EventTypeDeleteKnowledgeData EventType = "delete_knowledge_data"

View File

@@ -26,10 +26,10 @@ type WhereKnowledgeOption struct {
KnowledgeIDs []int64
AppID *int64
SpaceID *int64
Name *string // 完全匹配
Name *string // Exact match
Status []int32
UserID *int64
Query *string // 模糊匹配
Query *string // fuzzy match
Page *int
PageSize *int
Order *Order

View File

@@ -25,18 +25,18 @@ type RetrievalStrategy = knowledge.RetrievalStrategy
// ParsingStrategy for document parse before indexing
type ParsingStrategy struct {
ParsingType ParsingType `json:"parsing_type"` // 解析类型
ParsingType ParsingType `json:"parsing_type"` // parse type
// Doc
ExtractImage bool `json:"extract_image"` // 提取图片元素
ExtractTable bool `json:"extract_table"` // 提取表格元素
ImageOCR bool `json:"image_ocr"` // 图片 ocr
FilterPages []int `json:"filter_pages"` // 过滤页数
ExtractImage bool `json:"extract_image"` // Extract image elements
ExtractTable bool `json:"extract_table"` // Extract table elements
ImageOCR bool `json:"image_ocr"` // Image ocr
FilterPages []int `json:"filter_pages"` // filter pages
// Sheet
SheetID int64 `json:"sheet_id"` // xlsx sheet id
HeaderLine int `json:"header_line"` // 表头行
DataStartLine int `json:"data_start_line"` // 数据起始行
RowsCount int `json:"rows_count"` // 读取数据行数
HeaderLine int `json:"header_line"` // header row
DataStartLine int `json:"data_start_line"` // Data start row
RowsCount int `json:"rows_count"` // number of rows read
// Image
CaptionType *parser.ImageAnnotationType `json:"caption_type"`
@@ -52,13 +52,13 @@ const (
type ChunkingStrategy struct {
ChunkType parser.ChunkType `json:"chunk_type"`
// custom chunk config
ChunkSize int64 `json:"chunk_size"` // 分段最大长度
Separator string `json:"separator"` // 分段标识符
Overlap int64 `json:"overlap"` // 分段重叠
ChunkSize int64 `json:"chunk_size"` // maximum segmentation length
Separator string `json:"separator"` // segmentation identifier
Overlap int64 `json:"overlap"` // segmented overlap
TrimSpace bool `json:"trim_space"`
TrimURLAndEmail bool `json:"trim_url_and_email"`
// 按层级分段
MaxDepth int64 `json:"max_depth"` // 按层级分段时的最大层级
SaveTitle bool `json:"save_title"` // 保留层级标题
// segmentation by hierarchy
MaxDepth int64 `json:"max_depth"` // Maximum level when segmented by level
SaveTitle bool `json:"save_title"` // Preserve Hierarchical Titles
}

View File

@@ -132,12 +132,12 @@ func (dao *KnowledgeDocumentDAO) DeleteDocuments(ctx context.Context, ids []int6
tx.Commit()
}
}()
// 删除document
// Delete document
err = tx.WithContext(ctx).Model(&model.KnowledgeDocument{}).Where("id in ?", ids).Delete(&model.KnowledgeDocument{}).Error
if err != nil {
return err
}
// 删除document_slice
// Delete document_slice
err = tx.WithContext(ctx).Model(&model.KnowledgeDocumentSlice{}).Where("document_id in?", ids).Delete(&model.KnowledgeDocumentSlice{}).Error
if err != nil {
return err

View File

@@ -120,7 +120,7 @@ func (dao *KnowledgeDocumentSliceDAO) listBatch(ctx context.Context, knowledgeID
pos []*model.KnowledgeDocumentSlice, hasMore bool, err error) {
if batchSize <= 0 {
batchSize = 100 // 默认批量大小
batchSize = 100 // Default batch size
}
do, err := dao.listDo(ctx, knowledgeID, documentID)
@@ -161,7 +161,7 @@ func (dao *KnowledgeDocumentSliceDAO) GetDocumentSliceIDs(ctx context.Context, d
if len(docIDs) == 0 {
return nil, errors.New("empty document ids")
}
// doc可能会有很多slice所以批量处理
// A doc may have many slices, so process them in batches
sliceIDs = make([]int64, 0)
var mu sync.Mutex
errGroup, ctx := errgroup.WithContext(ctx)

View File

@@ -10,20 +10,20 @@ import (
const TableNameKnowledge = "knowledge"
// Knowledge 知识库表
// Knowledge knowledge table
type Knowledge struct {
ID int64 `gorm:"column:id;primaryKey;comment:主键ID" json:"id"` // 主键ID
Name string `gorm:"column:name;not null;comment:名称" json:"name"` // 名称
AppID int64 `gorm:"column:app_id;not null;comment:项目ID标识该资源是否是项目独有" json:"app_id"` // 项目ID标识该资源是否是项目独有
CreatorID int64 `gorm:"column:creator_id;not null;comment:ID" json:"creator_id"` // ID
SpaceID int64 `gorm:"column:space_id;not null;comment:空间ID" json:"space_id"` // 空间ID
CreatedAt int64 `gorm:"column:created_at;not null;comment:Create Time in Milliseconds" json:"created_at"` // Create Time in Milliseconds
UpdatedAt int64 `gorm:"column:updated_at;not null;comment:Update Time in Milliseconds" json:"updated_at"` // Update Time in Milliseconds
DeletedAt gorm.DeletedAt `gorm:"column:deleted_at;comment:Delete Time in Milliseconds" json:"deleted_at"` // Delete Time in Milliseconds
Status int32 `gorm:"column:status;not null;default:1;comment:0 初始化, 1 生效 2 失效" json:"status"` // 0 初始化, 1 生效 2 失效
Description string `gorm:"column:description;comment:描述" json:"description"` // 描述
IconURI string `gorm:"column:icon_uri;comment:头像uri" json:"icon_uri"` // 头像uri
FormatType int32 `gorm:"column:format_type;not null;comment:0:文本 1:表格 2:图片" json:"format_type"` // 0:文本 1:表格 2:图片
ID int64 `gorm:"column:id;primaryKey;comment:id" json:"id"` // id
Name string `gorm:"column:name;not null;comment:knowledge's name" json:"name"` // knowledge's name
AppID int64 `gorm:"column:app_id;not null;comment:app id" json:"app_id"` // app id
CreatorID int64 `gorm:"column:creator_id;not null;comment:creator id" json:"creator_id"` // creator id
SpaceID int64 `gorm:"column:space_id;not null;comment:space id" json:"space_id"` // space id
CreatedAt int64 `gorm:"column:created_at;not null;comment:Create Time in Milliseconds" json:"created_at"` // Create Time in Milliseconds
UpdatedAt int64 `gorm:"column:updated_at;not null;comment:Update Time in Milliseconds" json:"updated_at"` // Update Time in Milliseconds
DeletedAt gorm.DeletedAt `gorm:"column:deleted_at;comment:Delete Time" json:"deleted_at"` // Delete Time
Status int32 `gorm:"column:status;not null;default:1;comment:0 initialization, 1 effective, 2 invalid" json:"status"` // 0 initialization, 1 effective, 2 invalid
Description string `gorm:"column:description;comment:description" json:"description"` // description
IconURI string `gorm:"column:icon_uri;comment:icon uri" json:"icon_uri"` // icon uri
FormatType int32 `gorm:"column:format_type;not null;comment:0: Text 1: Table 2: Images" json:"format_type"` // 0: Text 1: Table 2: Images
}
// TableName Knowledge's table name

View File

@@ -11,27 +11,27 @@ import (
const TableNameKnowledgeDocument = "knowledge_document"
// KnowledgeDocument 知识库文档表
// KnowledgeDocument knowledge document info
type KnowledgeDocument struct {
ID int64 `gorm:"column:id;primaryKey;comment:主键ID" json:"id"` // 主键ID
KnowledgeID int64 `gorm:"column:knowledge_id;not null;comment:所属knowledge的ID" json:"knowledge_id"` // 所属knowledge的ID
Name string `gorm:"column:name;not null;comment:文档名称" json:"name"` // 文档名称
FileExtension string `gorm:"column:file_extension;not null;default:0;comment:文档类型, txt/pdf/csv/..." json:"file_extension"` // 文档类型, txt/pdf/csv/...
DocumentType int32 `gorm:"column:document_type;not null;comment:文档类型: 0:文本 1:表格 2:图片" json:"document_type"` // 文档类型: 0:文本 1:表格 2:图片
URI string `gorm:"column:uri;comment:资源uri" json:"uri"` // 资源uri
Size int64 `gorm:"column:size;not null;comment:文档大小" json:"size"` // 文档大小
SliceCount int64 `gorm:"column:slice_count;not null;comment:分片数量" json:"slice_count"` // 分片数量
CharCount int64 `gorm:"column:char_count;not null;comment:字符数" json:"char_count"` // 字符数
CreatorID int64 `gorm:"column:creator_id;not null;comment:创建者ID" json:"creator_id"` // 创建者ID
SpaceID int64 `gorm:"column:space_id;not null;comment:空间id" json:"space_id"` // 空间id
CreatedAt int64 `gorm:"column:created_at;not null;comment:Create Time in Milliseconds" json:"created_at"` // Create Time in Milliseconds
UpdatedAt int64 `gorm:"column:updated_at;not null;comment:Update Time in Milliseconds" json:"updated_at"` // Update Time in Milliseconds
DeletedAt gorm.DeletedAt `gorm:"column:deleted_at;comment:Delete Time in Milliseconds" json:"deleted_at"` // Delete Time in Milliseconds
SourceType int32 `gorm:"column:source_type;not null;comment:0:本地文件上传, 2:自定义文本" json:"source_type"` // 0:本地文件上传, 2:自定义文本
Status int32 `gorm:"column:status;not null;comment:状态" json:"status"` // 状态
FailReason string `gorm:"column:fail_reason;comment:失败原因" json:"fail_reason"` // 失败原因
ParseRule *DocumentParseRule `gorm:"column:parse_rule;comment:解析+切片规则;serializer:json" json:"parse_rule"` // 解析+切片规则
TableInfo *entity.TableInfo `gorm:"column:table_info;comment:表格信息;serializer:json" json:"table_info"` // 表格信息
ID int64 `gorm:"column:id;primaryKey;comment:id" json:"id"` // id
KnowledgeID int64 `gorm:"column:knowledge_id;not null;comment:knowledge id" json:"knowledge_id"` // knowledge id
Name string `gorm:"column:name;not null;comment:document name" json:"name"` // document name
FileExtension string `gorm:"column:file_extension;not null;default:0;comment:Document type, txt/pdf/csv etc.." json:"file_extension"` // Document type, txt/pdf/csv etc..
DocumentType int32 `gorm:"column:document_type;not null;comment:Document type: 0: Text 1: Table 2: Image" json:"document_type"` // Document type: 0: Text 1: Table 2: Image
URI string `gorm:"column:uri;comment:uri" json:"uri"` // uri
Size int64 `gorm:"column:size;not null;comment:document size" json:"size"` // document size
SliceCount int64 `gorm:"column:slice_count;not null;comment:slice count" json:"slice_count"` // slice count
CharCount int64 `gorm:"column:char_count;not null;comment:number of characters" json:"char_count"` // number of characters
CreatorID int64 `gorm:"column:creator_id;not null;comment:creator id" json:"creator_id"` // creator id
SpaceID int64 `gorm:"column:space_id;not null;comment:space id" json:"space_id"` // space id
CreatedAt int64 `gorm:"column:created_at;not null;comment:Create Time in Milliseconds" json:"created_at"` // Create Time in Milliseconds
UpdatedAt int64 `gorm:"column:updated_at;not null;comment:Update Time in Milliseconds" json:"updated_at"` // Update Time in Milliseconds
DeletedAt gorm.DeletedAt `gorm:"column:deleted_at;comment:Delete Time" json:"deleted_at"` // Delete Time
SourceType int32 `gorm:"column:source_type;comment:0: Local file upload, 2: Custom text, 103: Feishu 104: Lark" json:"source_type"` // 0: Local file upload, 2: Custom text, 103: Feishu 104: Lark
Status int32 `gorm:"column:status;not null;comment:status" json:"status"` // status
FailReason string `gorm:"column:fail_reason;comment:fail reason" json:"fail_reason"` // fail reason
ParseRule *DocumentParseRule `gorm:"column:parse_rule;comment:parse rule;serializer:json" json:"parse_rule"` // parse rule
TableInfo *entity.TableInfo `gorm:"column:table_info;comment:table info;serializer:json" json:"table_info"` // table info
}
// TableName KnowledgeDocument's table name

View File

@@ -10,21 +10,21 @@ import (
const TableNameKnowledgeDocumentReview = "knowledge_document_review"
// KnowledgeDocumentReview 文档审阅表
// KnowledgeDocumentReview Document slice preview info
type KnowledgeDocumentReview struct {
ID int64 `gorm:"column:id;primaryKey;comment:主键ID" json:"id"` // 主键ID
KnowledgeID int64 `gorm:"column:knowledge_id;not null;comment:knowledge id" json:"knowledge_id"` // knowledge id
SpaceID int64 `gorm:"column:space_id;not null;comment:空间id" json:"space_id"` // 空间id
Name string `gorm:"column:name;not null;comment:文档名称" json:"name"` // 文档名称
Type string `gorm:"column:type;not null;default:0;comment:文档类型" json:"type"` // 文档类型
URI string `gorm:"column:uri;comment:资源标识" json:"uri"` // 资源标识
FormatType int32 `gorm:"column:format_type;not null;comment:0 文本, 1 表格, 2 图片" json:"format_type"` // 0 文本, 1 表格, 2 图片
Status int32 `gorm:"column:status;not null;comment:0 处理中1 已完成2 失败3 失效" json:"status"` // 0 处理中1 已完成2 失败3 失效
ChunkRespURI string `gorm:"column:chunk_resp_uri;comment:预切片tos资源标识" json:"chunk_resp_uri"` // 预切片tos资源标识
DeletedAt gorm.DeletedAt `gorm:"column:deleted_at;comment:Delete Time in Milliseconds" json:"deleted_at"` // Delete Time in Milliseconds
CreatedAt int64 `gorm:"column:created_at;not null;comment:Create Time in Milliseconds" json:"created_at"` // Create Time in Milliseconds
UpdatedAt int64 `gorm:"column:updated_at;not null;comment:Update Time in Milliseconds" json:"updated_at"` // Update Time in Milliseconds
CreatorID int64 `gorm:"column:creator_id;not null;comment:创建者ID" json:"creator_id"` // 创建者ID
ID int64 `gorm:"column:id;primaryKey;comment:id" json:"id"` // id
KnowledgeID int64 `gorm:"column:knowledge_id;not null;comment:knowledge id" json:"knowledge_id"` // knowledge id
SpaceID int64 `gorm:"column:space_id;not null;comment:space id" json:"space_id"` // space id
Name string `gorm:"column:name;not null;comment:name" json:"name"` // name
Type string `gorm:"column:type;not null;default:0;comment:document type" json:"type"` // document type
URI string `gorm:"column:uri;comment:uri" json:"uri"` // uri
FormatType int32 `gorm:"column:format_type;not null;comment:0 text, 1 table, 2 images" json:"format_type"` // 0 text, 1 table, 2 images
Status int32 `gorm:"column:status;not null;comment:0 Processing 1 Completed 2 Failed 3 Expired" json:"status"` // 0 Processing 1 Completed 2 Failed 3 Expired
ChunkRespURI string `gorm:"column:chunk_resp_uri;comment:pre-sliced uri" json:"chunk_resp_uri"` // pre-sliced uri
DeletedAt gorm.DeletedAt `gorm:"column:deleted_at;comment:Delete Time" json:"deleted_at"` // Delete Time
CreatedAt int64 `gorm:"column:created_at;not null;comment:Create Time in Milliseconds" json:"created_at"` // Create Time in Milliseconds
UpdatedAt int64 `gorm:"column:updated_at;not null;comment:Update Time in Milliseconds" json:"updated_at"` // Update Time in Milliseconds
CreatorID int64 `gorm:"column:creator_id;not null;comment:creator id" json:"creator_id"` // creator id
}
// TableName KnowledgeDocumentReview's table name

View File

@@ -10,21 +10,21 @@ import (
const TableNameKnowledgeDocumentSlice = "knowledge_document_slice"
// KnowledgeDocumentSlice 知识库文件切片表
// KnowledgeDocumentSlice knowledge document slice
type KnowledgeDocumentSlice struct {
ID int64 `gorm:"column:id;primaryKey;comment:主键ID" json:"id"` // 主键ID
KnowledgeID int64 `gorm:"column:knowledge_id;not null;comment:knowledge id" json:"knowledge_id"` // knowledge id
DocumentID int64 `gorm:"column:document_id;not null;comment:document id" json:"document_id"` // document id
Content string `gorm:"column:content;comment:切片内容" json:"content"` // 切片内容
Sequence float64 `gorm:"column:sequence;not null;type:decimal(20,5);comment:切片顺序号, 从1开始" json:"sequence"` // 切片顺序号, 从1开始
CreatedAt int64 `gorm:"column:created_at;not null;comment:Create Time in Milliseconds" json:"created_at"` // Create Time in Milliseconds
UpdatedAt int64 `gorm:"column:updated_at;not null;comment:Update Time in Milliseconds" json:"updated_at"` // Update Time in Milliseconds
DeletedAt gorm.DeletedAt `gorm:"column:deleted_at;comment:Delete Time in Milliseconds" json:"deleted_at"` // Delete Time in Milliseconds
CreatorID int64 `gorm:"column:creator_id;not null;comment:创建者ID" json:"creator_id"` // 创建者ID
SpaceID int64 `gorm:"column:space_id;not null;comment:空间ID" json:"space_id"` // 空间ID
Status int32 `gorm:"column:status;not null;comment:状态" json:"status"` // 状态
FailReason string `gorm:"column:fail_reason;comment:失败原因" json:"fail_reason"` // 失败原因
Hit int64 `gorm:"column:hit;not null;comment:命中次数" json:"hit"` // 命中次数
ID int64 `gorm:"column:id;primaryKey;comment:id" json:"id"` // id
KnowledgeID int64 `gorm:"column:knowledge_id;not null;comment:knowledge id" json:"knowledge_id"` // knowledge id
DocumentID int64 `gorm:"column:document_id;not null;comment:document_id" json:"document_id"` // document_id
Content string `gorm:"column:content;comment:content" json:"content"` // content
Sequence float64 `gorm:"column:sequence;not null;comment:slice sequence number, starting from 1" json:"sequence"` // slice sequence number, starting from 1
CreatedAt int64 `gorm:"column:created_at;not null;comment:Create Time in Milliseconds" json:"created_at"` // Create Time in Milliseconds
UpdatedAt int64 `gorm:"column:updated_at;not null;comment:Update Time in Milliseconds" json:"updated_at"` // Update Time in Milliseconds
DeletedAt gorm.DeletedAt `gorm:"column:deleted_at;comment:Delete Time" json:"deleted_at"` // Delete Time
CreatorID int64 `gorm:"column:creator_id;not null;comment:creator id" json:"creator_id"` // creator id
SpaceID int64 `gorm:"column:space_id;not null;comment:space id" json:"space_id"` // space id
Status int32 `gorm:"column:status;not null;comment:status" json:"status"` // status
FailReason string `gorm:"column:fail_reason;comment:fail reason" json:"fail_reason"` // fail reason
Hit int64 `gorm:"column:hit;not null;comment:hit counts" json:"hit"` // hit counts
}
// TableName KnowledgeDocumentSlice's table name

View File

@@ -45,23 +45,23 @@ func newKnowledge(db *gorm.DB, opts ...gen.DOOption) knowledge {
return _knowledge
}
// knowledge 知识库表
// knowledge knowledge table
type knowledge struct {
knowledgeDo
ALL field.Asterisk
ID field.Int64 // 主键ID
Name field.String // 名称
AppID field.Int64 // 项目ID标识该资源是否是项目独有
CreatorID field.Int64 // ID
SpaceID field.Int64 // 空间ID
ID field.Int64 // id
Name field.String // knowledge's name
AppID field.Int64 // app id
CreatorID field.Int64 // creator id
SpaceID field.Int64 // space id
CreatedAt field.Int64 // Create Time in Milliseconds
UpdatedAt field.Int64 // Update Time in Milliseconds
DeletedAt field.Field // Delete Time in Milliseconds
Status field.Int32 // 0 初始化, 1 生效 2 失效
Description field.String // 描述
IconURI field.String // 头像uri
FormatType field.Int32 // 0:文本 1:表格 2:图片
DeletedAt field.Field // Delete Time
Status field.Int32 // 0 initialization, 1 effective, 2 invalid
Description field.String // description
IconURI field.String // icon uri
FormatType field.Int32 // 0: Text 1: Table 2: Images
fieldMap map[string]field.Expr
}

View File

@@ -52,30 +52,30 @@ func newKnowledgeDocument(db *gorm.DB, opts ...gen.DOOption) knowledgeDocument {
return _knowledgeDocument
}
// knowledgeDocument 知识库文档表
// knowledgeDocument knowledge document info
type knowledgeDocument struct {
knowledgeDocumentDo
ALL field.Asterisk
ID field.Int64 // 主键ID
KnowledgeID field.Int64 // 所属knowledge的ID
Name field.String // 文档名称
FileExtension field.String // 文档类型, txt/pdf/csv/...
DocumentType field.Int32 // 文档类型: 0:文本 1:表格 2:图片
URI field.String // 资源uri
Size field.Int64 // 文档大小
SliceCount field.Int64 // 分片数量
CharCount field.Int64 // 字符数
CreatorID field.Int64 // 创建者ID
SpaceID field.Int64 // 空间id
ID field.Int64 // id
KnowledgeID field.Int64 // knowledge id
Name field.String // document name
FileExtension field.String // Document type, txt/pdf/csv etc..
DocumentType field.Int32 // Document type: 0: Text 1: Table 2: Image
URI field.String // uri
Size field.Int64 // document size
SliceCount field.Int64 // slice count
CharCount field.Int64 // number of characters
CreatorID field.Int64 // creator id
SpaceID field.Int64 // space id
CreatedAt field.Int64 // Create Time in Milliseconds
UpdatedAt field.Int64 // Update Time in Milliseconds
DeletedAt field.Field // Delete Time in Milliseconds
SourceType field.Int32 // 0:本地文件上传, 2:自定义文本
Status field.Int32 // 状态
FailReason field.String // 失败原因
ParseRule field.Field // 解析+切片规则
TableInfo field.Field // 表格信息
DeletedAt field.Field // Delete Time
SourceType field.Int32 // 0: Local file upload, 2: Custom text, 103: Feishu 104: Lark
Status field.Int32 // status
FailReason field.String // fail reason
ParseRule field.Field // parse rule
TableInfo field.Field // table info
fieldMap map[string]field.Expr
}

View File

@@ -46,24 +46,24 @@ func newKnowledgeDocumentReview(db *gorm.DB, opts ...gen.DOOption) knowledgeDocu
return _knowledgeDocumentReview
}
// knowledgeDocumentReview 文档审阅表
// knowledgeDocumentReview Document slice preview info
type knowledgeDocumentReview struct {
knowledgeDocumentReviewDo
ALL field.Asterisk
ID field.Int64 // 主键ID
ID field.Int64 // id
KnowledgeID field.Int64 // knowledge id
SpaceID field.Int64 // 空间id
Name field.String // 文档名称
Type field.String // 文档类型
URI field.String // 资源标识
FormatType field.Int32 // 0 文本, 1 表格, 2 图片
Status field.Int32 // 0 处理中1 已完成2 失败3 失效
ChunkRespURI field.String // 预切片tos资源标识
DeletedAt field.Field // Delete Time in Milliseconds
SpaceID field.Int64 // space id
Name field.String // name
Type field.String // document type
URI field.String // uri
FormatType field.Int32 // 0 text, 1 table, 2 images
Status field.Int32 // 0 Processing 1 Completed 2 Failed 3 Expired
ChunkRespURI field.String // pre-sliced uri
DeletedAt field.Field // Delete Time
CreatedAt field.Int64 // Create Time in Milliseconds
UpdatedAt field.Int64 // Update Time in Milliseconds
CreatorID field.Int64 // 创建者ID
CreatorID field.Int64 // creator id
fieldMap map[string]field.Expr
}

View File

@@ -46,24 +46,24 @@ func newKnowledgeDocumentSlice(db *gorm.DB, opts ...gen.DOOption) knowledgeDocum
return _knowledgeDocumentSlice
}
// knowledgeDocumentSlice 知识库文件切片表
// knowledgeDocumentSlice knowledge document slice
type knowledgeDocumentSlice struct {
knowledgeDocumentSliceDo
ALL field.Asterisk
ID field.Int64 // 主键ID
ID field.Int64 // id
KnowledgeID field.Int64 // knowledge id
DocumentID field.Int64 // document id
Content field.String // 切片内容
Sequence field.Float64 // 切片顺序号, 从1开始
DocumentID field.Int64 // document_id
Content field.String // content
Sequence field.Float64 // slice sequence number, starting from 1
CreatedAt field.Int64 // Create Time in Milliseconds
UpdatedAt field.Int64 // Update Time in Milliseconds
DeletedAt field.Field // Delete Time in Milliseconds
CreatorID field.Int64 // 创建者ID
SpaceID field.Int64 // 空间ID
Status field.Int32 // 状态
FailReason field.String // 失败原因
Hit field.Int64 // 命中次数
DeletedAt field.Field // Delete Time
CreatorID field.Int64 // creator id
SpaceID field.Int64 // space id
Status field.Int32 // status
FailReason field.String // fail reason
Hit field.Int64 // hit counts
fieldMap map[string]field.Expr
}

View File

@@ -48,7 +48,7 @@ type baseDocProcessor struct {
Documents []*entity.Document
documentSource *entity.DocumentSource
// 落DB 的 model
// Models to be persisted to the DB
TableName string
docModels []*model.KnowledgeDocument
@@ -63,7 +63,7 @@ type baseDocProcessor struct {
}
func (p *baseDocProcessor) BeforeCreate() error {
// 从数据源拉取数据
// Pull data from a data source
return nil
}
@@ -154,7 +154,7 @@ func (p *baseDocProcessor) InsertDBModel() (err error) {
func (p *baseDocProcessor) createTable() error {
if len(p.Documents) == 1 && p.Documents[0].Type == knowledge.DocumentTypeTable {
// 表格型知识库,创建表
// Table-type knowledge base: create the table
rdbColumns := []*rdbEntity.Column{}
tableColumns := p.Documents[0].TableInfo.Columns
columnIDs, err := p.idgen.GenMultiIDs(p.ctx, len(tableColumns)+1)
@@ -178,13 +178,13 @@ func (p *baseDocProcessor) createTable() error {
Indexing: false,
Sequence: -1,
})
// 为每个表格增加个主键ID
// Add a primary key ID to each table
rdbColumns = append(rdbColumns, &rdbEntity.Column{
Name: consts.RDBFieldID,
DataType: rdbEntity.TypeBigInt,
NotNull: true,
})
// 创建一个数据表
// Create a data table
resp, err := p.rdb.CreateTable(p.ctx, &rdb.CreateTableRequest{
Table: &rdbEntity.Table{
Columns: rdbColumns,

View File

@@ -18,7 +18,7 @@ package impl
import "github.com/coze-dev/coze-studio/backend/pkg/logs"
// 用户输入自定义内容后创建文档
// Create a document after the user enters custom content
type customDocProcessor struct {
baseDocProcessor
}

View File

@@ -24,7 +24,7 @@ import (
"github.com/coze-dev/coze-studio/backend/types/errno"
)
// 用户自定义表格创建文档
// Creates a document from a user-defined table
type customTableProcessor struct {
baseDocProcessor
}
@@ -46,7 +46,7 @@ func (c *customTableProcessor) BeforeCreate() error {
return errorx.New(errno.ErrKnowledgeTableInfoNotExistCode, errorx.KVf("msg", "table info not found, doc_id: %d", tableDoc[0].ID))
}
c.Documents[0].TableInfo = *tableDoc[0].TableInfo
// 追加场景
// Append scenario
if c.Documents[0].RawContent != "" {
c.Documents[0].FileExtension = getFormatType(c.Documents[0].Type)
uri := getTosUri(c.UserID, string(c.Documents[0].FileExtension))
@@ -65,14 +65,14 @@ func (c *customTableProcessor) BuildDBModel() error {
if len(c.Documents) > 0 &&
c.Documents[0].Type == knowledge.DocumentTypeTable {
if c.Documents[0].IsAppend {
// 追加场景,不需要创建表了
// 一是用户自定义一些数据、二是再上传一个表格,把表格里的数据追加到表格中
// Append scenario: no need to create a table.
// Data is appended either from user-defined rows, or by uploading another table whose rows are appended to the existing table
} else {
err := c.baseDocProcessor.BuildDBModel()
if err != nil {
return err
}
// 因为这种创建方式不带数据,所以直接设置状态为可用
// Since this method of creation does not carry any data, the state is set to available directly
for i := range c.docModels {
c.docModels[i].DocumentType = 1
c.docModels[i].Status = int32(entity.DocumentStatusInit)
@@ -84,7 +84,7 @@ func (c *customTableProcessor) BuildDBModel() error {
func (c *customTableProcessor) InsertDBModel() error {
if isTableAppend(c.Documents) {
// 追加场景,设置文档为处理中状态
// Append scenario: set the document to the processing state
err := c.documentRepo.SetStatus(c.ctx, c.Documents[0].ID, int32(entity.DocumentStatusUploading), "")
if err != nil {
logs.CtxErrorf(c.ctx, "document set status err:%v", err)

View File

@@ -65,7 +65,7 @@ func (l *localTableProcessor) BuildDBModel() error {
func (l *localTableProcessor) InsertDBModel() error {
if isTableAppend(l.Documents) {
// 追加场景,设置文档为处理中状态
// Append scenario: set the document to the processing state
err := l.documentRepo.SetStatus(l.ctx, l.Documents[0].ID, int32(entity.DocumentStatusUploading), "")
if err != nil {
logs.CtxErrorf(l.ctx, "document set status err:%v", err)

View File

@@ -19,10 +19,10 @@ package processor
import "github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
type DocProcessor interface {
BeforeCreate() error // 获取数据源
BuildDBModel() error // 构建Doc记录
InsertDBModel() error // 向数据库中插入一条Doc记录
Indexing() error // 发起索引任务
GetResp() []*entity.Document // 返回处理后的文档信息
BeforeCreate() error // Get data source
BuildDBModel() error // Build Doc Record
InsertDBModel() error // Insert a Doc record into the database
Indexing() error // Initiate indexing task
GetResp() []*entity.Document // Return the processed document information
//GetColumnName()
}

View File

@@ -247,7 +247,7 @@ var d2sMapping = map[knowledge.DocumentType]document2SliceFn{
return slice, nil
},
knowledge.DocumentTypeTable: func(doc *schema.Document, knowledgeID, documentID, creatorID int64) (*entity.Slice, error) {
// NOTICE: table 类型的原始数据需要去 rdb 里查
// NOTICE: the raw data of table-type documents must be looked up in rdb
slice := &entity.Slice{
Info: knowledge.Info{},
KnowledgeID: knowledgeID,

View File

@@ -200,7 +200,7 @@ func (k *knowledgeSVC) copyKnowledge(ctx context.Context, copyCtx *knowledgeCopy
}
func (k *knowledgeSVC) copyKnowledgeDocuments(ctx context.Context, copyCtx *knowledgeCopyCtx) (err error) {
// 查询document信息仅处理完成的文档
// Query document information (only processed documents)
documents, _, err := k.documentRepo.FindDocumentByCondition(ctx, &entity.WhereDocumentOpt{
KnowledgeIDs: []int64{copyCtx.OriginData.ID},
StatusIn: []int32{int32(entity.DocumentStatusEnable), int32(entity.DocumentStatusInit)},
@@ -248,7 +248,7 @@ func (k *knowledgeSVC) copyKnowledgeDocuments(ctx context.Context, copyCtx *know
return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
}
}
// 表格类复制
// table copy
eg := errgroup.Group{}
eg.SetLimit(10)
mu := sync.Mutex{}
@@ -286,7 +286,7 @@ func (k *knowledgeSVC) copyKnowledgeDocuments(ctx context.Context, copyCtx *know
}
func (k *knowledgeSVC) copyDocument(ctx context.Context, copyCtx *knowledgeCopyCtx, doc *model.KnowledgeDocument, newDocID int64) (err error) {
// 表格类文档复制
// tabular document replication
newDoc := model.KnowledgeDocument{
ID: newDocID,
KnowledgeID: copyCtx.CopyTask.TargetDataID,
@@ -307,7 +307,7 @@ func (k *knowledgeSVC) copyDocument(ctx context.Context, copyCtx *knowledgeCopyC
ParseRule: doc.ParseRule,
}
columnMap := map[int64]int64{}
// 如果是表格型知识库->创建新的表格
// If it is a tabular knowledge base, create a new table
if doc.DocumentType == int32(knowledgeModel.DocumentTypeTable) {
if doc.TableInfo != nil {
newTableInfo := entity.TableInfo{}
@@ -472,7 +472,7 @@ func (k *knowledgeSVC) copyDocument(ctx context.Context, copyCtx *knowledgeCopyC
return nil
}
func (k *knowledgeSVC) createTable(ctx context.Context, doc *model.KnowledgeDocument) error {
// 表格型知识库,创建表
// Tabular knowledge base, creating tables
rdbColumns := []*rdbEntity.Column{}
tableColumns := doc.TableInfo.Columns
columnIDs, err := k.genMultiIDs(ctx, len(tableColumns)+1)
@@ -495,13 +495,13 @@ func (k *knowledgeSVC) createTable(ctx context.Context, doc *model.KnowledgeDocu
Indexing: false,
Sequence: -1,
})
// 为每个表格增加个主键ID
// Add a primary key ID to each table
rdbColumns = append(rdbColumns, &rdbEntity.Column{
Name: consts.RDBFieldID,
DataType: rdbEntity.TypeBigInt,
NotNull: true,
})
// 创建一个数据表
// Create a data table
resp, err := k.rdb.CreateTable(ctx, &rdb.CreateTableRequest{
Table: &rdbEntity.Table{
Columns: rdbColumns,

View File

@@ -97,7 +97,7 @@ func (k *knowledgeSVC) HandleMessage(ctx context.Context, msg *eventbus.Message)
}
func (k *knowledgeSVC) deleteKnowledgeDataEventHandler(ctx context.Context, event *entity.Event) error {
// 删除知识库在各个存储里的数据
// Delete the data in each store of the knowledge base
for _, manager := range k.searchStoreManagers {
s, err := manager.GetSearchStore(ctx, getCollectionName(event.KnowledgeID))
if err != nil {
@@ -145,8 +145,8 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
return errorx.New(errno.ErrKnowledgeNonRetryableCode, errorx.KV("reason", "[indexDocument] document not provided"))
}
// 1. retry 队列和普通队列中对同一文档的 index 操作并发,同一个文档数据写入两份(在后端 bugfix 上线时产生)
// 2. rebalance 重复消费同一条消息
// 1. Index operations on the same document can run concurrently in the retry queue and the ordinary queue, writing the same document data twice (this arose while the backend bugfix was being rolled out)
// 2. A rebalance causes the same message to be consumed repeatedly
// check knowledge and document status
if valid, err := k.isWritableKnowledgeAndDocument(ctx, doc.KnowledgeID, doc.ID); err != nil {
@@ -281,7 +281,7 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
// save slices
if doc.Type == knowledge.DocumentTypeTable {
// 表格类型,将数据插入到数据库中
// Table type: insert the data into the database
err = k.upsertDataToTable(ctx, &doc.TableInfo, sliceEntities)
if err != nil {
logs.CtxErrorf(ctx, "[indexDocument] insert data to table failed, err: %v", err)
@@ -360,7 +360,7 @@ func (k *knowledgeSVC) indexDocument(ctx context.Context, event *entity.Event) (
}); err != nil {
return errorx.New(errno.ErrKnowledgeSearchStoreCode, errorx.KV("msg", fmt.Sprintf("create search store failed, err: %v", err)))
}
// 图片型知识库kn:doc:slice = 1:n:n可能content为空不需要写入
// Image knowledge base: kn:doc:slice = 1:n:n; the content may be empty, in which case nothing needs to be written
if doc.Type == knowledge.DocumentTypeImage && len(ssDocs) == 1 && len(ssDocs[0].Content) == 0 {
continue
}

View File

@@ -203,16 +203,16 @@ type RetrieveRequest = knowledge.RetrieveRequest
type RetrieveContext struct {
Ctx context.Context
OriginQuery string // 原始 query
RewrittenQuery *string // 改写后的 query, 如果没有改写,就是 nil, 会在执行过程中添加上去
ChatHistory []*schema.Message // 如果没有对话历史或者不需要历史,则为 nil
KnowledgeIDs sets.Set[int64] // 本次检索涉及的知识库id
KnowledgeInfoMap map[int64]*KnowledgeInfo // 知识库id到文档id的映射
// 召回策略
OriginQuery string // Original query
RewrittenQuery *string // The rewritten query, if not rewritten, is nil, which will be added during execution
ChatHistory []*schema.Message // Nil if there is no dialogue history or no history is required
KnowledgeIDs sets.Set[int64] // The knowledge base ID involved in this search
KnowledgeInfoMap map[int64]*KnowledgeInfo // Mapping of Knowledge Base IDs to Document IDs
// recall strategy
Strategy *entity.RetrievalStrategy
// 检索涉及的 document 信息
// Document information involved in the retrieval
Documents []*model.KnowledgeDocument
// 用于 nl2sql message to query 的 chat model
// A chat model for nl2sql and message to query
ChatModel chatmodel.BaseChatModel
}
@@ -254,9 +254,9 @@ type TableSchemaResponse struct {
type TableDataType int32
const (
AllData TableDataType = 0 // schema sheets preview data
OnlySchema TableDataType = 1 // 只需要 schema 结构 & Sheets
OnlyPreview TableDataType = 2 // 只需要 preview data
AllData TableDataType = 0 // Schema sheets and preview data
OnlySchema TableDataType = 1 // Only need schema structure & Sheets
OnlyPreview TableDataType = 2 // Only need preview data
)
type GetDocumentTableInfoRequest struct {

View File

@@ -103,19 +103,19 @@ func NewKnowledgeSVC(config *KnowledgeSVCConfig) (Knowledge, eventbus.ConsumerHa
type KnowledgeSVCConfig struct {
DB *gorm.DB // required
IDGen idgen.IDGenerator // required
RDB rdb.RDB // required: 表格存储
Producer eventbus.Producer // required: 文档 indexing 过程走 mq 异步处理
SearchStoreManagers []searchstore.Manager // required: 向量 / 全文
ParseManager parser.Manager // optional: 文档切分与处理能力, default builtin parser
RDB rdb.RDB // Required: table storage
Producer eventbus.Producer // Required: Document indexing process goes through mq asynchronous processing
SearchStoreManagers []searchstore.Manager // Required: Vector/Full Text
ParseManager parser.Manager // Optional: document segmentation and processing capability, default builtin parser
Storage storage.Storage // required: oss
ModelFactory chatmodel.Factory // required: 模型 factory
Rewriter messages2query.MessagesToQuery // optional: 未配置时不改写
Reranker rerank.Reranker // optional: 未配置时默认 rrf
NL2Sql nl2sql.NL2SQL // optional: 未配置时默认不支持
EnableCompactTable *bool // optional: 表格数据压缩,默认 true
OCR ocr.OCR // optional: ocr, 未提供时 ocr 功能不可用
CacheCli cache.Cmdable // optional: 缓存实现
IsAutoAnnotationSupported bool // 是否支持了图片自动标注
ModelFactory chatmodel.Factory // Required: Model factory
Rewriter messages2query.MessagesToQuery // Optional: the query is not rewritten when not configured
Reranker rerank.Reranker // Optional: default rrf when not configured
NL2Sql nl2sql.NL2SQL // Optional: Not supported by default when not configured
EnableCompactTable *bool // Optional: Table data compression, default true
OCR ocr.OCR // Optional: ocr, ocr function is not available when not provided
CacheCli cache.Cmdable // Optional: cache implementation
IsAutoAnnotationSupported bool // Whether automatic image annotation is supported
}
type knowledgeSVC struct {
@@ -135,8 +135,8 @@ type knowledgeSVC struct {
storage storage.Storage
nl2Sql nl2sql.NL2SQL
cacheCli cache.Cmdable
enableCompactTable bool // 表格数据压缩
isAutoAnnotationSupported bool // 是否支持了图片自动标注
enableCompactTable bool // Table data compression
isAutoAnnotationSupported bool // Whether automatic image annotation is supported
}
func (k *knowledgeSVC) CreateKnowledge(ctx context.Context, request *CreateKnowledgeRequest) (response *CreateKnowledgeResponse, err error) {
@@ -163,7 +163,7 @@ func (k *knowledgeSVC) CreateKnowledge(ctx context.Context, request *CreateKnowl
SpaceID: request.SpaceID,
CreatedAt: now,
UpdatedAt: now,
Status: int32(knowledgeModel.KnowledgeStatusEnable), // 目前向量库的初始化由文档触发,知识库无 init 过程
Status: int32(knowledgeModel.KnowledgeStatusEnable), // At present, the initialization of the vector library is triggered by the document, and the knowledge base has no init process
Description: request.Description,
IconURI: request.IconUri,
FormatType: int32(request.FormatType),
@@ -217,7 +217,7 @@ func (k *knowledgeSVC) UpdateKnowledge(ctx context.Context, request *UpdateKnowl
}
func (k *knowledgeSVC) DeleteKnowledge(ctx context.Context, request *DeleteKnowledgeRequest) error {
// 先获取一下knowledge的信息
// Fetch the knowledge info first
knModel, err := k.knowledgeRepo.GetByID(ctx, request.KnowledgeID)
if err != nil {
return errorx.New(errno.ErrKnowledgeDBCode, errorx.KV("msg", err.Error()))
@@ -357,27 +357,27 @@ func (k *knowledgeSVC) CreateDocument(ctx context.Context, request *CreateDocume
Storage: k.storage,
Rdb: k.rdb,
})
// 1. 前置的动作,上传 tos 等
// 1. Preliminary actions, such as uploading to tos
err = docProcessor.BeforeCreate()
if err != nil {
return nil, err
}
// 2. 构建 落库
// 2. Build the DB model to be persisted
err = docProcessor.BuildDBModel()
if err != nil {
return nil, err
}
// 3. 插入数据库
// 3. Insert into the database
err = docProcessor.InsertDBModel()
if err != nil {
return nil, err
}
// 4. 发起索引任务
// 4. Initiate the indexing task
err = docProcessor.Indexing()
if err != nil {
return nil, err
}
// 5. 返回处理后的文档信息
// 5. Return the processed document information
docs := docProcessor.GetResp()
return &CreateDocumentResponse{
Documents: docs,
@@ -397,7 +397,7 @@ func (k *knowledgeSVC) UpdateDocument(ctx context.Context, request *UpdateDocume
}
if doc.DocumentType == int32(knowledgeModel.DocumentTypeTable) {
// 如果是表格类型可能是要改table的meta
// If it is a table type, it may be necessary to change the meta of the table.
if doc.TableInfo != nil {
finalColumns, err := k.alterTableSchema(ctx, doc.TableInfo.Columns, request.TableInfo.Columns, doc.TableInfo.PhysicalTableName)
if err != nil {
@@ -673,7 +673,7 @@ func (k *knowledgeSVC) CreateSlice(ctx context.Context, request *CreateSliceRequ
}
if len(slices) == 1 {
if request.Position == 1 || request.Position == 0 {
// 插入到最前面
// Insert to the front
sliceInfo.Sequence = slices[0].Sequence - 1
} else {
sliceInfo.Sequence = slices[0].Sequence + 1
@@ -759,7 +759,7 @@ func (k *knowledgeSVC) UpdateSlice(ctx context.Context, request *UpdateSliceRequ
if docInfo == nil || docInfo.ID == 0 {
return errorx.New(errno.ErrKnowledgeDocumentNotExistCode)
}
// 更新数据库中的存储
// Update storage in the database
if docInfo.DocumentType == int32(knowledgeModel.DocumentTypeText) ||
docInfo.DocumentType == int32(knowledgeModel.DocumentTypeTable) {
sliceEntity := entity.Slice{RawContent: request.RawContent}
@@ -851,7 +851,7 @@ func (k *knowledgeSVC) DeleteSlice(ctx context.Context, request *DeleteSliceRequ
return errorx.New(errno.ErrKnowledgeCrossDomainCode, errorx.KV("msg", err.Error()))
}
}
// 删除数据库中的存储
// Delete storage in the database
err = k.sliceRepo.Delete(ctx, &model.KnowledgeDocumentSlice{ID: request.SliceID})
if err != nil {
logs.CtxErrorf(ctx, "delete slice failed, err: %v", err)
@@ -905,9 +905,9 @@ func (k *knowledgeSVC) ListSlice(ctx context.Context, request *ListSliceRequest)
}
resp.Total = int(total)
var sliceMap map[int64]*entity.Slice
// 如果是表格类型那么去table中取一下原始数据
// If it is a table type, fetch the raw data from the table
if doc.DocumentType == int32(knowledgeModel.DocumentTypeTable) {
// 从数据库中查询原始数据
// Query the raw data from the database
sliceMap, err = k.selectTableData(ctx, doc.TableInfo, slices)
if err != nil {
logs.CtxErrorf(ctx, "select table data failed, err: %v", err)
@@ -1010,7 +1010,7 @@ func (k *knowledgeSVC) CreateDocumentReview(ctx context.Context, request *Create
}
reviews = append(reviews, review)
}
// STEP 1. 生成ID
// STEP 1. Generate ID
reviewIDs, err := k.genMultiIDs(ctx, len(request.Reviews))
if err != nil {
return nil, errorx.New(errno.ErrKnowledgeIDGenCode)

View File

@@ -190,7 +190,7 @@ func TestKnowledgeSVC_CreateDocument(t *testing.T) {
// IconURI: "icon.png",
// },
// KnowledgeID: 666,
// RawContent: "测试测试测试测试",
// RawContent: "Test Test Test",
// Source: entity.DocumentSourceCustom,
// FileExtension: "txt",
// }
@@ -274,25 +274,25 @@ func TestKnowledgeSVC_CreateDocument(t *testing.T) {
// VirtualTableName: "test",
// Columns: []*entity.TableColumn{
// {
// Name: "第一列",
// Name: "First Column",
// Type: entity.TableColumnTypeBoolean,
// Indexing: true,
// Sequence: 0,
// },
// {
// Name: "第二列",
// Name: "Second column",
// Type: entity.TableColumnTypeTime,
// Indexing: false,
// Sequence: 1,
// },
// {
// Name: "第三列",
// Name: "Third Column",
// Type: entity.TableColumnTypeString,
// Indexing: false,
// Sequence: 2,
// },
// {
// Name: "第四列",
// Name: "Fourth column",
// Type: entity.TableColumnTypeNumber,
// Indexing: true,
// Sequence: 3,
@@ -384,25 +384,25 @@ func TestKnowledgeSVC_DeleteDocument(t *testing.T) {
// VirtualTableName: "test",
// Columns: []*entity.TableColumn{
// {
// Name: "第一列",
// Name: "First Column",
// Type: entity.TableColumnTypeBoolean,
// Indexing: true,
// Sequence: 0,
// },
// {
// Name: "第二列",
// Name: "Second column",
// Type: entity.TableColumnTypeTime,
// Indexing: false,
// Sequence: 1,
// },
// {
// Name: "第三列",
// Name: "Third Column",
// Type: entity.TableColumnTypeString,
// Indexing: false,
// Sequence: 2,
// },
// {
// Name: "第四列",
// Name: "Fourth column",
// Type: entity.TableColumnTypeNumber,
// Indexing: true,
// Sequence: 3,
@@ -415,14 +415,14 @@ func TestKnowledgeSVC_DeleteDocument(t *testing.T) {
// assert.Equal(t, 1, len(doc))
// time.Sleep(time.Second * 5)
// doc[0].Name = "new_name"
// doc[0].TableInfo.Columns[0].Name = "第一列_changeName"
// doc[0].TableInfo.Columns[1].Name = "第二列_changeSeq"
// doc[0].TableInfo.Columns[0].Name = "First column_changeName"
// doc[0].TableInfo.Columns[1].Name = "Second column_changeSeq"
// doc[0].TableInfo.Columns[1].Sequence = 2
// doc[0].TableInfo.Columns[2].Name = "第三列_changeType"
// doc[0].TableInfo.Columns[2].Name = "Third column_changeType"
// doc[0].TableInfo.Columns[2].Type = entity.TableColumnTypeInteger
// doc[0].TableInfo.Columns[2].Sequence = 1
// // 删除原来的第四列并新建第四列
// doc[0].TableInfo.Columns[3].Name = "第五列_create"
// Delete the original fourth column and create a new fourth column
// doc[0].TableInfo.Columns[3].Name = "Fifth column_create"
// doc[0].TableInfo.Columns[3].Type = entity.TableColumnTypeNumber
// doc[0].TableInfo.Columns[3].Sequence = 3
// doc[0].TableInfo.Columns[3].ID = 0
@@ -479,25 +479,25 @@ func TestKnowledgeSVC_ListDocument(t *testing.T) {
// VirtualTableName: "test",
// Columns: []*entity.TableColumn{
// {
// Name: "第一列",
// Name: "First Column",
// Type: entity.TableColumnTypeBoolean,
// Indexing: true,
// Sequence: 0,
// },
// {
// Name: "第二列",
// Name: "Second column",
// Type: entity.TableColumnTypeTime,
// Indexing: false,
// Sequence: 1,
// },
// {
// Name: "第三列",
// Name: "Third Column",
// Type: entity.TableColumnTypeString,
// Indexing: false,
// Sequence: 2,
// },
// {
// Name: "第四列",
// Name: "Fourth column",
// Type: entity.TableColumnTypeNumber,
// Indexing: true,
// Sequence: 3,
@@ -781,25 +781,25 @@ func TestKnowledgeSVC_ListDocument(t *testing.T) {
// VirtualTableName: "test",
// Columns: []*entity.TableColumn{
// {
// Name: "第一列",
// Name: "First Column",
// Type: entity.TableColumnTypeBoolean,
// Indexing: true,
// Sequence: 0,
// },
// {
// Name: "第二列",
// Name: "Second column",
// Type: entity.TableColumnTypeTime,
// Indexing: false,
// Sequence: 1,
// },
// {
// Name: "第三列",
// Name: "Third Column",
// Type: entity.TableColumnTypeString,
// Indexing: false,
// Sequence: 2,
// },
// {
// Name: "第四列",
// Name: "Fourth column",
// Type: entity.TableColumnTypeNumber,
// Indexing: true,
// Sequence: 3,
@@ -979,7 +979,7 @@ func TestKnowledgeSVC_Retrieve(t *testing.T) {
//svc := MockKnowledgeSVC(t)
//mockey.PatchConvey("test retrieve", t, func() {
// res, err := svc.Retrieve(ctx, &knowledge.RetrieveRequest{
// Query: "查找第三列为gogogo的数据",
// Query: "Find the data of the third column gogogo",
// KnowledgeIDs: []int64{1745810102455734000, 1745810094197395000},
// Strategy: &entity.RetrievalStrategy{
// TopK: ptr.Of(int64(2)),

View File

@@ -116,7 +116,7 @@ func (k *knowledgeSVC) alterTableSchema(ctx context.Context, beforeColumns []*en
continue
}
if targetColumns[i].ID == 0 {
// 要新增的列
// Columns to be added
columnID, err := k.idgen.GenID(ctx)
if err != nil {
logs.CtxErrorf(ctx, "gen id failed, err: %v", err)
@@ -132,7 +132,7 @@ func (k *knowledgeSVC) alterTableSchema(ctx context.Context, beforeColumns []*en
})
} else {
if checkColumnExist(targetColumns[i].ID, beforeColumns) {
// 要修改的列
// Column to modify
alterRequest.Operations = append(alterRequest.Operations, &rdb.AlterTableOperation{
Action: rdbEntity.ModifyColumn,
Column: &rdbEntity.Column{
@@ -153,7 +153,7 @@ func (k *knowledgeSVC) alterTableSchema(ctx context.Context, beforeColumns []*en
continue
}
if !checkColumnExist(beforeColumns[i].ID, targetColumns) {
// 要删除的列
// Column to delete
alterRequest.Operations = append(alterRequest.Operations, &rdb.AlterTableOperation{
Action: rdbEntity.DropColumn,
Column: &rdbEntity.Column{

View File

@@ -69,17 +69,17 @@ func (k *knowledgeSVC) Retrieve(ctx context.Context, request *RetrieveRequest) (
}
chain := compose.NewChain[*RetrieveContext, []*knowledgeModel.RetrieveSlice]()
rewriteNode := compose.InvokableLambda(k.queryRewriteNode)
// 向量化召回
// vectorized recall
vectorRetrieveNode := compose.InvokableLambda(k.vectorRetrieveNode)
// ES召回
// ES recall
EsRetrieveNode := compose.InvokableLambda(k.esRetrieveNode)
// Nl2Sql召回
// Nl2Sql recall
Nl2SqlRetrieveNode := compose.InvokableLambda(k.nl2SqlRetrieveNode)
// pass user query Node
passRequestContextNode := compose.InvokableLambda(k.passRequestContext)
// reRank Node
reRankNode := compose.InvokableLambda(k.reRankNode)
// pack Result接口
// Pack Result Interface
packResult := compose.InvokableLambda(k.packResults)
parallelNode := compose.NewParallel().
AddLambda("vectorRetrieveNode", vectorRetrieveNode).
@@ -190,11 +190,11 @@ func (k *knowledgeSVC) prepareRAGDocuments(ctx context.Context, documentIDs []in
func (k *knowledgeSVC) queryRewriteNode(ctx context.Context, req *RetrieveContext) (newRetrieveContext *RetrieveContext, err error) {
if len(req.ChatHistory) == 0 {
// 没有上下文不需要改写
// No context, no rewriting.
return req, nil
}
if !req.Strategy.EnableQueryRewrite || k.rewriter == nil {
// 未开启rewrite功能不需要上下文改写
// Rewrite function is not enabled, no context rewrite is required
return req, nil
}
var opts []messages2query.Option
@@ -206,7 +206,7 @@ func (k *knowledgeSVC) queryRewriteNode(ctx context.Context, req *RetrieveContex
logs.CtxErrorf(ctx, "rewrite query failed: %v", err)
return req, nil
}
// 改写完成
// Rewrite completed
req.RewrittenQuery = &rewrittenQuery
return req, nil
}
@@ -373,7 +373,7 @@ func (k *knowledgeSVC) nl2SqlExec(ctx context.Context, doc *model.KnowledgeDocum
return nil, err
}
sql = addSliceIdColumn(sql)
// 执行sql
// Execute sql
replaceMap := map[string]sqlparsercontract.TableColumn{}
replaceMap[doc.Name] = sqlparsercontract.TableColumn{
NewTableName: ptr.Of(doc.TableInfo.PhysicalTableName),
@@ -395,7 +395,7 @@ func (k *knowledgeSVC) nl2SqlExec(ctx context.Context, doc *model.KnowledgeDocum
logs.CtxErrorf(ctx, "parse sql failed: %v", err)
return nil, err
}
// 执行sql
// Execute sql
resp, err := k.rdb.ExecuteSQL(ctx, &rdb.ExecuteSQLRequest{
SQL: parsedSQL,
})
@@ -428,7 +428,7 @@ func addSliceIdColumn(originalSql string) string {
if selectIndex == -1 {
return originalSql
}
result := originalSql[:selectIndex+len("select ")] // 保留 select 部分
result := originalSql[:selectIndex+len("select ")] // Keep the "select " part
remainder := originalSql[selectIndex+len("select "):]
lowerRemainder := strings.ToLower(remainder)
@@ -474,25 +474,25 @@ func (k *knowledgeSVC) passRequestContext(ctx context.Context, req *RetrieveCont
}
func (k *knowledgeSVC) reRankNode(ctx context.Context, resultMap map[string]any) (retrieveResult []*schema.Document, err error) {
// 首先获取下retrieve上下文
// First get the retrieve context
retrieveCtx, ok := resultMap["passRequestContext"].(*RetrieveContext)
if !ok {
logs.CtxErrorf(ctx, "retrieve context is not found")
return nil, errorx.New(errno.ErrKnowledgeSystemCode, errorx.KV("msg", "retrieve context is not found"))
}
// 获取下向量化召回的接口
// Get the vector retrieval result
vectorRetrieveResult, ok := resultMap["vectorRetrieveNode"].([]*schema.Document)
if !ok {
logs.CtxErrorf(ctx, "vector retrieve result is not found")
vectorRetrieveResult = []*schema.Document{}
}
// 获取下es召回的接口
// Get the ES retrieval result
esRetrieveResult, ok := resultMap["esRetrieveNode"].([]*schema.Document)
if !ok {
logs.CtxErrorf(ctx, "es retrieve result is not found")
esRetrieveResult = []*schema.Document{}
}
// 获取下nl2sql召回的接口
// Get the NL2SQL retrieval result
nl2SqlRetrieveResult, ok := resultMap["nl2SqlRetrieveNode"].([]*schema.Document)
if !ok {
logs.CtxErrorf(ctx, "nl2sql retrieve result is not found")
@@ -508,10 +508,10 @@ func (k *knowledgeSVC) reRankNode(ctx context.Context, resultMap map[string]any)
return data
}
// 根据召回策略从不同渠道获取召回结果
// Obtain recall results from different channels according to the recall strategy
var retrieveResultArr [][]*rerank.Data
if retrieveCtx.Strategy.EnableNL2SQL {
// nl2sql结果
// Nl2sql results
retrieveResultArr = append(retrieveResultArr, docs2RerankData(nl2SqlRetrieveResult))
}
switch retrieveCtx.Strategy.SearchType {
@@ -739,18 +739,18 @@ func (i *ImageContent) SetKV(k string, v string) {
func (k *knowledgeSVC) ParseFrontEndImageContent(ctx context.Context, s string) []*ImageContent {
res := make([]*ImageContent, 0)
imgRe := regexp.MustCompile(`<img\s+[^>]*>`)
// 查找所有匹配项
// Find all matches
matches := imgRe.FindAllSubmatchIndex([]byte(s), -1)
// 遍历匹配项并输出src和data-tos-key字段
// 遍历每个匹配项的索引
// Traverse matches and output the src and data-tos-key fields
// Iterate the index of each match
for _, match := range matches {
// 输出每个匹配项整个正则在文本中的开始和结束位置
// Start and end positions of the whole regex match in the text
matchStart := match[0]
matchEnd := match[1]
all := s[match[0]:match[1]]
re := regexp.MustCompile(`<img\s+([^>]+)>`)
// 初始化map存储kv信息把多余信息去掉
// Initialize map to store kv information and remove redundant information
m := make(map[string]string)
l := make([]string, 0)
match := re.FindStringSubmatch(all)
@@ -758,13 +758,13 @@ func (k *knowledgeSVC) ParseFrontEndImageContent(ctx context.Context, s string)
continue
}
attributes := match[1]
// 定义正则表达式模式,用于提取属性键值对
// Defines a regular expression pattern for extracting attribute key-value pairs
attrRe := regexp.MustCompile(`(\S+)=(?:"([^"]*)"|'([^']*)')`)
// 查找所有属性键值对
// Find all attribute key-value pairs
attrMatches := attrRe.FindAllStringSubmatch(attributes, -1)
// 提取并存储kv信息
// Extract and store kv information
for _, attrMatch := range attrMatches {
key := attrMatch[1]
value := attrMatch[2]

View File

@@ -271,11 +271,11 @@ func (k *knowledgeSVC) ValidateTableSchema(ctx context.Context, request *Validat
dst := doc.TableInfo
result := make(map[string]string)
// validate 通过条件:
// 1. 表头名称对齐(不要求顺序一致)
// 2. indexing 列必须有值, 其余列可以为空
// 3. 值类型可转换
// 4. 已有表表头字段全包含
// Validation passes when all of the following hold:
// 1. Header name alignment (consistent order is not required)
// 2. The indexing column must have a value, and the remaining columns can be empty
// 3. Value types are convertible
// 4. All existing table header fields are included
dstMapping := make(map[string]*entity.TableColumn)
for _, col := range dst.Columns {
dstCol := col
@@ -395,7 +395,7 @@ func (k *knowledgeSVC) GetDocumentTableInfo(ctx context.Context, request *GetDoc
}, nil
}
// GetDocumentTableInfoByID 先不作为接口,有需要再改
// GetDocumentTableInfoByID is not exposed as an interface method for now; change that later if needed
func (k *knowledgeSVC) GetDocumentTableInfoByID(ctx context.Context, documentID int64, needData bool) (*TableSchemaResponse, error) {
docs, err := k.documentRepo.MGetByID(ctx, []int64{documentID})
if err != nil {