171 lines
		
	
	
		
			4.1 KiB
		
	
	
	
		
			Go
		
	
	
	
			
		
		
	
	
			171 lines
		
	
	
		
			4.1 KiB
		
	
	
	
		
			Go
		
	
	
	
/*
 | 
						|
 * Copyright 2025 coze-dev Authors
 | 
						|
 *
 | 
						|
 * Licensed under the Apache License, Version 2.0 (the "License");
 | 
						|
 * you may not use this file except in compliance with the License.
 | 
						|
 * You may obtain a copy of the License at
 | 
						|
 *
 | 
						|
 *     http://www.apache.org/licenses/LICENSE-2.0
 | 
						|
 *
 | 
						|
 * Unless required by applicable law or agreed to in writing, software
 | 
						|
 * distributed under the License is distributed on an "AS IS" BASIS,
 | 
						|
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
						|
 * See the License for the specific language governing permissions and
 | 
						|
 * limitations under the License.
 | 
						|
 */
 | 
						|
 | 
						|
package builtin
 | 
						|
 | 
						|
import (
 | 
						|
	"encoding/json"
 | 
						|
 | 
						|
	"github.com/cloudwego/eino/components/document/parser"
 | 
						|
	"github.com/cloudwego/eino/schema"
 | 
						|
 | 
						|
	"github.com/coze-dev/coze-studio/backend/infra/contract/document"
 | 
						|
	contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
 | 
						|
)
 | 
						|
 | 
						|
type rowIterator interface {
 | 
						|
	NextRow() (row []string, end bool, err error)
 | 
						|
}
 | 
						|
 | 
						|
func parseByRowIterator(iter rowIterator, config *contract.Config, opts ...parser.Option) (
 | 
						|
	docs []*schema.Document, err error) {
 | 
						|
 | 
						|
	ps := config.ParsingStrategy
 | 
						|
	options := parser.GetCommonOptions(&parser.Options{}, opts...)
 | 
						|
	i := 0
 | 
						|
	columnsProvides := ps.IsAppend || len(ps.Columns) > 0
 | 
						|
	rev := make(map[int]*document.Column)
 | 
						|
 | 
						|
	var (
 | 
						|
		expColumns []*document.Column
 | 
						|
		expData    [][]*document.ColumnData
 | 
						|
	)
 | 
						|
 | 
						|
	for {
 | 
						|
		row, end, err := iter.NextRow()
 | 
						|
		if err != nil {
 | 
						|
			return nil, err
 | 
						|
		}
 | 
						|
		if end {
 | 
						|
			break
 | 
						|
		}
 | 
						|
		if i == ps.HeaderLine {
 | 
						|
			if columnsProvides {
 | 
						|
				expColumns = ps.Columns
 | 
						|
			} else {
 | 
						|
				for j, col := range row {
 | 
						|
					expColumns = append(expColumns, &document.Column{
 | 
						|
						Name:     col,
 | 
						|
						Type:     document.TableColumnTypeUnknown,
 | 
						|
						Sequence: j,
 | 
						|
					})
 | 
						|
				}
 | 
						|
			}
 | 
						|
 | 
						|
			for j := range expColumns {
 | 
						|
				tc := expColumns[j]
 | 
						|
				rev[tc.Sequence] = tc
 | 
						|
			}
 | 
						|
		}
 | 
						|
 | 
						|
		if i >= ps.DataStartLine {
 | 
						|
			var rowData []*document.ColumnData
 | 
						|
			for j := range row {
 | 
						|
				colSchema, found := rev[j]
 | 
						|
				if !found { // 列裁剪
 | 
						|
					continue
 | 
						|
				}
 | 
						|
 | 
						|
				val := row[j]
 | 
						|
 | 
						|
				if columnsProvides {
 | 
						|
					var data *document.ColumnData
 | 
						|
					if config.ParsingStrategy.IgnoreColumnTypeErr {
 | 
						|
						data = assertValAsForce(colSchema.Type, val, colSchema.Nullable)
 | 
						|
					} else {
 | 
						|
						data, err = assertValAs(colSchema.Type, val)
 | 
						|
						if err != nil {
 | 
						|
							return nil, err
 | 
						|
						}
 | 
						|
					}
 | 
						|
					data.ColumnID = colSchema.ID
 | 
						|
					data.ColumnName = colSchema.Name
 | 
						|
					rowData = append(rowData, data)
 | 
						|
				} else {
 | 
						|
					exp := assertVal(val)
 | 
						|
					colSchema.Type = transformColumnType(colSchema.Type, exp.Type)
 | 
						|
					rowData = append(rowData, &document.ColumnData{
 | 
						|
						ColumnID:   colSchema.ID,
 | 
						|
						ColumnName: colSchema.Name,
 | 
						|
						Type:       document.TableColumnTypeUnknown,
 | 
						|
						ValString:  &val,
 | 
						|
					})
 | 
						|
				}
 | 
						|
			}
 | 
						|
			if rowData != nil {
 | 
						|
				expData = append(expData, rowData)
 | 
						|
			}
 | 
						|
		}
 | 
						|
 | 
						|
		i++
 | 
						|
		if ps.RowsCount != 0 && len(docs) == ps.RowsCount {
 | 
						|
			break
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	if !columnsProvides {
 | 
						|
		// align data type when columns are provided
 | 
						|
		for _, col := range expColumns {
 | 
						|
			if col.Type == document.TableColumnTypeUnknown {
 | 
						|
				col.Type = document.TableColumnTypeString
 | 
						|
			}
 | 
						|
		}
 | 
						|
		for _, row := range expData {
 | 
						|
			if err = alignTableSliceValue(expColumns, row); err != nil {
 | 
						|
				return nil, err
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	if len(expData) == 0 {
 | 
						|
		// return a special document with columns only if there is no data
 | 
						|
		doc := &schema.Document{
 | 
						|
			MetaData: map[string]any{
 | 
						|
				document.MetaDataKeyColumns:     expColumns,
 | 
						|
				document.MetaDataKeyColumnsOnly: struct{}{},
 | 
						|
			},
 | 
						|
		}
 | 
						|
		for k, v := range options.ExtraMeta {
 | 
						|
			doc.MetaData[k] = v
 | 
						|
		}
 | 
						|
		return []*schema.Document{doc}, nil
 | 
						|
	}
 | 
						|
 | 
						|
	for j := range expData {
 | 
						|
		contentMapping := make(map[string]string)
 | 
						|
		for _, col := range expData[j] {
 | 
						|
			contentMapping[col.ColumnName] = col.GetStringValue()
 | 
						|
		}
 | 
						|
		b, err := json.Marshal(contentMapping)
 | 
						|
		if err != nil {
 | 
						|
			return nil, err
 | 
						|
		}
 | 
						|
		doc := &schema.Document{
 | 
						|
			Content: string(b), // set for tables in text
 | 
						|
			MetaData: map[string]any{
 | 
						|
				document.MetaDataKeyColumns:    expColumns,
 | 
						|
				document.MetaDataKeyColumnData: expData[j],
 | 
						|
			},
 | 
						|
		}
 | 
						|
		for k, v := range options.ExtraMeta {
 | 
						|
			doc.MetaData[k] = v
 | 
						|
		}
 | 
						|
		docs = append(docs, doc)
 | 
						|
	}
 | 
						|
 | 
						|
	return docs, nil
 | 
						|
}
 |