112 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Go
		
	
	
	
			
		
		
	
	
			112 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Go
		
	
	
	
/*
 * Copyright 2025 coze-dev Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 | 
						|
 | 
						|
package convert
 | 
						|
 | 
						|
import (
 | 
						|
	"time"
 | 
						|
 | 
						|
	"github.com/coze-dev/coze-studio/backend/domain/knowledge/entity"
 | 
						|
	"github.com/coze-dev/coze-studio/backend/domain/knowledge/internal/consts"
 | 
						|
	"github.com/coze-dev/coze-studio/backend/infra/contract/document"
 | 
						|
	"github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
 | 
						|
	"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr"
 | 
						|
)
 | 
						|
 | 
						|
func DocumentToParseConfig(doc *entity.Document) *parser.Config {
 | 
						|
	return ToParseConfig(doc.FileExtension, doc.ParsingStrategy, doc.ChunkingStrategy, doc.IsAppend, doc.TableInfo.Columns)
 | 
						|
}
 | 
						|
 | 
						|
func ToParseConfig(fileExtension parser.FileExtension, ps *entity.ParsingStrategy, cs *entity.ChunkingStrategy, isAppend bool, columns []*entity.TableColumn) *parser.Config {
 | 
						|
	if ps == nil {
 | 
						|
		ps = &entity.ParsingStrategy{HeaderLine: 0, DataStartLine: 1}
 | 
						|
	}
 | 
						|
 | 
						|
	p := &parser.ParsingStrategy{
 | 
						|
		ExtractImage:        ps.ExtractImage,
 | 
						|
		ExtractTable:        ps.ExtractTable,
 | 
						|
		ImageOCR:            ps.ImageOCR,
 | 
						|
		FilterPages:         ps.FilterPages,
 | 
						|
		SheetID:             ptr.Of(int(ps.SheetID)),
 | 
						|
		HeaderLine:          ps.HeaderLine,
 | 
						|
		DataStartLine:       ps.DataStartLine,
 | 
						|
		RowsCount:           ps.RowsCount,
 | 
						|
		IsAppend:            isAppend,
 | 
						|
		Columns:             convColumns(columns),
 | 
						|
		IgnoreColumnTypeErr: true, // default true
 | 
						|
		ImageAnnotationType: ptr.From(ptr.From(ps).CaptionType),
 | 
						|
	}
 | 
						|
 | 
						|
	var c *parser.ChunkingStrategy
 | 
						|
	if cs != nil {
 | 
						|
		c = &parser.ChunkingStrategy{
 | 
						|
			ChunkType:       cs.ChunkType,
 | 
						|
			ChunkSize:       cs.ChunkSize,
 | 
						|
			Separator:       cs.Separator,
 | 
						|
			Overlap:         cs.Overlap,
 | 
						|
			TrimSpace:       cs.TrimSpace,
 | 
						|
			TrimURLAndEmail: cs.TrimURLAndEmail,
 | 
						|
			MaxDepth:        cs.MaxDepth,
 | 
						|
			SaveTitle:       cs.SaveTitle,
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return &parser.Config{
 | 
						|
		FileExtension:    fileExtension,
 | 
						|
		ParsingStrategy:  p,
 | 
						|
		ChunkingStrategy: c,
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
func convColumns(src []*entity.TableColumn) []*document.Column {
 | 
						|
	resp := make([]*document.Column, 0, len(src))
 | 
						|
	for _, c := range src {
 | 
						|
		if c.Name == consts.RDBFieldID {
 | 
						|
			continue
 | 
						|
		}
 | 
						|
		dc := &document.Column{
 | 
						|
			ID:          c.ID,
 | 
						|
			Name:        c.Name,
 | 
						|
			Type:        c.Type,
 | 
						|
			Description: c.Description,
 | 
						|
			Nullable:    !c.Indexing,
 | 
						|
			IsPrimary:   false,
 | 
						|
			Sequence:    int(c.Sequence),
 | 
						|
		}
 | 
						|
		resp = append(resp, dc)
 | 
						|
	}
 | 
						|
	return resp
 | 
						|
}
 | 
						|
 | 
						|
func Type2DefaultVal(typ document.TableColumnType) any {
 | 
						|
	switch typ {
 | 
						|
	case document.TableColumnTypeString:
 | 
						|
		return ""
 | 
						|
	case document.TableColumnTypeInteger:
 | 
						|
		return 0
 | 
						|
	case document.TableColumnTypeTime:
 | 
						|
		return time.Time{}
 | 
						|
	case document.TableColumnTypeNumber:
 | 
						|
		return 0.0
 | 
						|
	case document.TableColumnTypeBoolean:
 | 
						|
		return false
 | 
						|
	case document.TableColumnTypeImage:
 | 
						|
		return []byte{}
 | 
						|
	default:
 | 
						|
		return ""
 | 
						|
	}
 | 
						|
}
 |