coze-studio/backend/infra/impl/document/parser/builtin/parse_iter.go

171 lines
4.1 KiB
Go

/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package builtin
import (
"encoding/json"
"github.com/cloudwego/eino/components/document/parser"
"github.com/cloudwego/eino/schema"
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
)
type rowIterator interface {
NextRow() (row []string, end bool, err error)
}
func parseByRowIterator(iter rowIterator, config *contract.Config, opts ...parser.Option) (
docs []*schema.Document, err error) {
ps := config.ParsingStrategy
options := parser.GetCommonOptions(&parser.Options{}, opts...)
i := 0
columnsProvides := ps.IsAppend || len(ps.Columns) > 0
rev := make(map[int]*document.Column)
var (
expColumns []*document.Column
expData [][]*document.ColumnData
)
for {
row, end, err := iter.NextRow()
if err != nil {
return nil, err
}
if end {
break
}
if i == ps.HeaderLine {
if columnsProvides {
expColumns = ps.Columns
} else {
for j, col := range row {
expColumns = append(expColumns, &document.Column{
Name: col,
Type: document.TableColumnTypeUnknown,
Sequence: j,
})
}
}
for j := range expColumns {
tc := expColumns[j]
rev[tc.Sequence] = tc
}
}
if i >= ps.DataStartLine {
var rowData []*document.ColumnData
for j := range row {
colSchema, found := rev[j]
if !found { // column clipping
continue
}
val := row[j]
if columnsProvides {
var data *document.ColumnData
if config.ParsingStrategy.IgnoreColumnTypeErr {
data = assertValAsForce(colSchema.Type, val, colSchema.Nullable)
} else {
data, err = assertValAs(colSchema.Type, val)
if err != nil {
return nil, err
}
}
data.ColumnID = colSchema.ID
data.ColumnName = colSchema.Name
rowData = append(rowData, data)
} else {
exp := assertVal(val)
colSchema.Type = transformColumnType(colSchema.Type, exp.Type)
rowData = append(rowData, &document.ColumnData{
ColumnID: colSchema.ID,
ColumnName: colSchema.Name,
Type: document.TableColumnTypeUnknown,
ValString: &val,
})
}
}
if rowData != nil {
expData = append(expData, rowData)
}
}
i++
if ps.RowsCount != 0 && len(docs) == ps.RowsCount {
break
}
}
if !columnsProvides {
// align data type when columns are provided
for _, col := range expColumns {
if col.Type == document.TableColumnTypeUnknown {
col.Type = document.TableColumnTypeString
}
}
for _, row := range expData {
if err = alignTableSliceValue(expColumns, row); err != nil {
return nil, err
}
}
}
if len(expData) == 0 {
// return a special document with columns only if there is no data
doc := &schema.Document{
MetaData: map[string]any{
document.MetaDataKeyColumns: expColumns,
document.MetaDataKeyColumnsOnly: struct{}{},
},
}
for k, v := range options.ExtraMeta {
doc.MetaData[k] = v
}
return []*schema.Document{doc}, nil
}
for j := range expData {
contentMapping := make(map[string]string)
for _, col := range expData[j] {
contentMapping[col.ColumnName] = col.GetStringValue()
}
b, err := json.Marshal(contentMapping)
if err != nil {
return nil, err
}
doc := &schema.Document{
Content: string(b), // set for tables in text
MetaData: map[string]any{
document.MetaDataKeyColumns: expColumns,
document.MetaDataKeyColumnData: expData[j],
},
}
for k, v := range options.ExtraMeta {
doc.MetaData[k] = v
}
docs = append(docs, doc)
}
return docs, nil
}