coze-studio/backend/domain/knowledge/entity/strategy.go

65 lines
2.3 KiB
Go

/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package entity
import (
"github.com/coze-dev/coze-studio/backend/api/model/crossdomain/knowledge"
"github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
)
// RetrievalStrategy is an alias (not a new type) for knowledge.RetrievalStrategy
// from the crossdomain knowledge model package, so both names denote the same
// type and values can be used interchangeably without conversion.
type RetrievalStrategy = knowledge.RetrievalStrategy
// ParsingStrategy describes how a document is parsed before indexing.
//
// Fields are grouped by the kind of source they apply to (document, sheet,
// image). Each field is serialized to JSON under the snake_case key in its tag.
type ParsingStrategy struct {
	ParsingType ParsingType `json:"parsing_type"` // parsing mode; one of the ParsingType_* constants
	// Document options
	ExtractImage bool  `json:"extract_image"` // extract image elements
	ExtractTable bool  `json:"extract_table"` // extract table elements
	ImageOCR     bool  `json:"image_ocr"`     // run OCR on images
	FilterPages  []int `json:"filter_pages"`  // pages to filter; NOTE(review): confirm whether listed pages are skipped or kept, and the index base
	// Sheet options
	SheetID       int64 `json:"sheet_id"`        // xlsx sheet id
	HeaderLine    int   `json:"header_line"`     // header row; NOTE(review): index base not visible here — confirm
	DataStartLine int   `json:"data_start_line"` // row where data starts
	RowsCount     int   `json:"rows_count"`      // number of rows read
	// Image options
	// CaptionType is a pointer, so it can be nil; presumably nil means no
	// annotation type was specified — verify against callers.
	CaptionType *parser.ImageAnnotationType `json:"caption_type"`
}
// ParsingType identifies the parsing mode stored in ParsingStrategy.ParsingType.
// Its underlying type is int64.
type ParsingType int64

// Known ParsingType values. The numeric values are written out explicitly
// because they are serialized through the "parsing_type" JSON field of
// ParsingStrategy.
const (
	// ParsingType_FastParsing is the fast parsing mode. Its value is 0, which
	// is also the zero value of ParsingType.
	ParsingType_FastParsing = ParsingType(0)
	// ParsingType_AccurateParsing is the accurate parsing mode. Its value is 1.
	ParsingType_AccurateParsing = ParsingType(1)
)
// ChunkingStrategy describes how a document is split into chunks before
// indexing.
//
// ChunkType selects the chunking mode; the remaining fields are grouped into
// options for custom chunking and options for hierarchy-based segmentation.
// Each field is serialized to JSON under the snake_case key in its tag.
type ChunkingStrategy struct {
	ChunkType parser.ChunkType `json:"chunk_type"` // chunking mode, as defined by the parser contract package
	// Custom chunk configuration
	ChunkSize       int64  `json:"chunk_size"`         // maximum segment length; NOTE(review): unit (characters vs. tokens) not visible here — confirm
	Separator       string `json:"separator"`          // segmentation identifier (the separator chunks are split on)
	Overlap         int64  `json:"overlap"`            // overlap between segments; NOTE(review): absolute size vs. percentage not visible here — confirm
	TrimSpace       bool   `json:"trim_space"`         // presumably strips whitespace from chunks — verify against the chunker
	TrimURLAndEmail bool   `json:"trim_url_and_email"` // presumably removes URLs and email addresses from chunks — verify against the chunker
	// Segmentation by hierarchy
	MaxDepth  int64 `json:"max_depth"`  // maximum level when segmenting by level
	SaveTitle bool  `json:"save_title"` // preserve hierarchical titles
}