130 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Go
		
	
	
	
			
		
		
	
	
			130 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Go
		
	
	
	
| /*
 | |
|  * Copyright 2025 coze-dev Authors
 | |
|  *
 | |
|  * Licensed under the Apache License, Version 2.0 (the "License");
 | |
|  * you may not use this file except in compliance with the License.
 | |
|  * You may obtain a copy of the License at
 | |
|  *
 | |
|  *     http://www.apache.org/licenses/LICENSE-2.0
 | |
|  *
 | |
|  * Unless required by applicable law or agreed to in writing, software
 | |
|  * distributed under the License is distributed on an "AS IS" BASIS,
 | |
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
|  * See the License for the specific language governing permissions and
 | |
|  * limitations under the License.
 | |
|  */
 | |
| 
 | |
| package parser
 | |
| 
 | |
| import (
 | |
| 	"github.com/coze-dev/coze-studio/backend/infra/contract/document"
 | |
| 	"github.com/coze-dev/coze-studio/backend/pkg/lang/sets"
 | |
| )
 | |
| 
 | |
| type Manager interface {
 | |
| 	GetParser(config *Config) (Parser, error)
 | |
| 	IsAutoAnnotationSupported() bool
 | |
| }
 | |
| 
 | |
| type Config struct {
 | |
| 	FileExtension    FileExtension
 | |
| 	ParsingStrategy  *ParsingStrategy
 | |
| 	ChunkingStrategy *ChunkingStrategy
 | |
| }
 | |
| 
 | |
| // ParsingStrategy for document parse before indexing
 | |
| type ParsingStrategy struct {
 | |
| 	// Doc
 | |
| 	ExtractImage bool  `json:"extract_image"` // Extract image elements
 | |
| 	ExtractTable bool  `json:"extract_table"` // Extract table elements
 | |
| 	ImageOCR     bool  `json:"image_ocr"`     // Image ocr
 | |
| 	FilterPages  []int `json:"filter_pages"`  // Page filter, first page = 1
 | |
| 
 | |
| 	// Sheet
 | |
| 	SheetID             *int               `json:"sheet_id"`        // xlsx sheet id
 | |
| 	HeaderLine          int                `json:"header_line"`     // header row
 | |
| 	DataStartLine       int                `json:"data_start_line"` // Data start row
 | |
| 	RowsCount           int                `json:"rows_count"`      // number of rows read
 | |
| 	IsAppend            bool               `json:"-"`               // row insertion
 | |
| 	Columns             []*document.Column `json:"-"`               // Sheet Alignment Header
 | |
| 	IgnoreColumnTypeErr bool               `json:"-"`               // Ignore the problem that the column type and value are not aligned when true, when the value is empty
 | |
| 
 | |
| 	// Image
 | |
| 	ImageAnnotationType ImageAnnotationType `json:"image_annotation_type"` // Image content labeling type
 | |
| }
 | |
| 
 | |
| type ChunkingStrategy struct {
 | |
| 	ChunkType ChunkType `json:"chunk_type"`
 | |
| 
 | |
| 	// custom config
 | |
| 	ChunkSize       int64  `json:"chunk_size"` // maximum segmentation length
 | |
| 	Separator       string `json:"separator"`  // segmentation identifier
 | |
| 	Overlap         int64  `json:"overlap"`    // segmented overlap ratio
 | |
| 	TrimSpace       bool   `json:"trim_space"`
 | |
| 	TrimURLAndEmail bool   `json:"trim_url_and_email"`
 | |
| 
 | |
| 	// leveled config
 | |
| 	MaxDepth  int64 `json:"max_depth"`  // Maximum level when segmented by level
 | |
| 	SaveTitle bool  `json:"save_title"` // Preserve Hierarchical Titles
 | |
| }
 | |
| 
 | |
// ChunkType enumerates the supported chunking modes.
type ChunkType int64

// The values are numbered consecutively from zero. Append new modes at the
// end so that the existing numbers do not shift.
const (
	ChunkTypeDefault ChunkType = iota // 0: automatic sharding
	ChunkTypeCustom                   // 1: custom rule sharding
	ChunkTypeLeveled                  // 2: hierarchical sharding
)
 | |
| 
 | |
// ImageAnnotationType enumerates how image content gets its annotation.
type ImageAnnotationType int64

// The values are numbered consecutively from zero. Append new kinds at the
// end so that the existing numbers do not shift.
const (
	ImageAnnotationTypeModel  ImageAnnotationType = iota // 0: automatic model annotation
	ImageAnnotationTypeManual                            // 1: manual annotation
)
 | |
| 
 | |
// FileExtension is a file type identifier, written in lower case and without
// a leading dot (for example "pdf").
type FileExtension string

// Document extensions.
const (
	FileExtensionPDF      FileExtension = "pdf"
	FileExtensionTXT      FileExtension = "txt"
	FileExtensionDoc      FileExtension = "doc"
	FileExtensionDocx     FileExtension = "docx"
	FileExtensionMarkdown FileExtension = "md"
)

// Sheet extensions.
const (
	FileExtensionCSV  FileExtension = "csv"
	FileExtensionXLSX FileExtension = "xlsx"
	FileExtensionJSON FileExtension = "json"
	// FileExtensionJsonMaps is JSON holding a []map[string]string.
	FileExtensionJsonMaps FileExtension = "json_maps"
)

// Image extensions.
const (
	FileExtensionJPG  FileExtension = "jpg"
	FileExtensionJPEG FileExtension = "jpeg"
	FileExtensionPNG  FileExtension = "png"
)
 | |
| 
 | |
| func ValidateFileExtension(fileSuffix string) (ext FileExtension, support bool) {
 | |
| 	fileExtension := FileExtension(fileSuffix)
 | |
| 	_, ok := fileExtensionSet[fileExtension]
 | |
| 	if !ok {
 | |
| 		return "", false
 | |
| 	}
 | |
| 	return fileExtension, true
 | |
| }
 | |
| 
 | |
| var fileExtensionSet = sets.Set[FileExtension]{
 | |
| 	FileExtensionPDF:      {},
 | |
| 	FileExtensionTXT:      {},
 | |
| 	FileExtensionDoc:      {},
 | |
| 	FileExtensionDocx:     {},
 | |
| 	FileExtensionMarkdown: {},
 | |
| 	FileExtensionCSV:      {},
 | |
| 	FileExtensionJSON:     {},
 | |
| 	FileExtensionJsonMaps: {},
 | |
| 	FileExtensionJPG:      {},
 | |
| 	FileExtensionJPEG:     {},
 | |
| 	FileExtensionPNG:      {},
 | |
| }
 |