/*
 * Copyright 2025 coze-dev Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
package builtin
|
|
|
|
import (
|
|
"encoding/json"
|
|
|
|
"github.com/cloudwego/eino/components/document/parser"
|
|
"github.com/cloudwego/eino/schema"
|
|
|
|
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
|
|
contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
|
|
)
|
|
|
|
// rowIterator is the pull-style source of table rows consumed by
// parseByRowIterator.
type rowIterator interface {
	// NextRow returns the cells of the next row.
	//
	// The consumer in this file checks err first and aborts on a non-nil
	// value, then checks end and stops reading when it is true; row is only
	// inspected when err is nil and end is false.
	NextRow() (row []string, end bool, err error)
}
|
|
|
|
func parseByRowIterator(iter rowIterator, config *contract.Config, opts ...parser.Option) (
|
|
docs []*schema.Document, err error) {
|
|
|
|
ps := config.ParsingStrategy
|
|
options := parser.GetCommonOptions(&parser.Options{}, opts...)
|
|
i := 0
|
|
columnsProvides := ps.IsAppend || len(ps.Columns) > 0
|
|
rev := make(map[int]*document.Column)
|
|
|
|
var (
|
|
expColumns []*document.Column
|
|
expData [][]*document.ColumnData
|
|
)
|
|
|
|
for {
|
|
row, end, err := iter.NextRow()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if end {
|
|
break
|
|
}
|
|
if i == ps.HeaderLine {
|
|
if columnsProvides {
|
|
expColumns = ps.Columns
|
|
} else {
|
|
for j, col := range row {
|
|
expColumns = append(expColumns, &document.Column{
|
|
Name: col,
|
|
Type: document.TableColumnTypeUnknown,
|
|
Sequence: j,
|
|
})
|
|
}
|
|
}
|
|
|
|
for j := range expColumns {
|
|
tc := expColumns[j]
|
|
rev[tc.Sequence] = tc
|
|
}
|
|
}
|
|
|
|
if i >= ps.DataStartLine {
|
|
var rowData []*document.ColumnData
|
|
for j := range row {
|
|
colSchema, found := rev[j]
|
|
if !found { // column clipping
|
|
continue
|
|
}
|
|
|
|
val := row[j]
|
|
|
|
if columnsProvides {
|
|
var data *document.ColumnData
|
|
if config.ParsingStrategy.IgnoreColumnTypeErr {
|
|
data = assertValAsForce(colSchema.Type, val, colSchema.Nullable)
|
|
} else {
|
|
data, err = assertValAs(colSchema.Type, val)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
data.ColumnID = colSchema.ID
|
|
data.ColumnName = colSchema.Name
|
|
rowData = append(rowData, data)
|
|
} else {
|
|
exp := assertVal(val)
|
|
colSchema.Type = transformColumnType(colSchema.Type, exp.Type)
|
|
rowData = append(rowData, &document.ColumnData{
|
|
ColumnID: colSchema.ID,
|
|
ColumnName: colSchema.Name,
|
|
Type: document.TableColumnTypeUnknown,
|
|
ValString: &val,
|
|
})
|
|
}
|
|
}
|
|
if rowData != nil {
|
|
expData = append(expData, rowData)
|
|
}
|
|
}
|
|
|
|
i++
|
|
if ps.RowsCount != 0 && len(docs) == ps.RowsCount {
|
|
break
|
|
}
|
|
}
|
|
|
|
if !columnsProvides {
|
|
// align data type when columns are provided
|
|
for _, col := range expColumns {
|
|
if col.Type == document.TableColumnTypeUnknown {
|
|
col.Type = document.TableColumnTypeString
|
|
}
|
|
}
|
|
for _, row := range expData {
|
|
if err = alignTableSliceValue(expColumns, row); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(expData) == 0 {
|
|
// return a special document with columns only if there is no data
|
|
doc := &schema.Document{
|
|
MetaData: map[string]any{
|
|
document.MetaDataKeyColumns: expColumns,
|
|
document.MetaDataKeyColumnsOnly: struct{}{},
|
|
},
|
|
}
|
|
for k, v := range options.ExtraMeta {
|
|
doc.MetaData[k] = v
|
|
}
|
|
return []*schema.Document{doc}, nil
|
|
}
|
|
|
|
for j := range expData {
|
|
contentMapping := make(map[string]string)
|
|
for _, col := range expData[j] {
|
|
contentMapping[col.ColumnName] = col.GetStringValue()
|
|
}
|
|
b, err := json.Marshal(contentMapping)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
doc := &schema.Document{
|
|
Content: string(b), // set for tables in text
|
|
MetaData: map[string]any{
|
|
document.MetaDataKeyColumns: expColumns,
|
|
document.MetaDataKeyColumnData: expData[j],
|
|
},
|
|
}
|
|
for k, v := range options.ExtraMeta {
|
|
doc.MetaData[k] = v
|
|
}
|
|
docs = append(docs, doc)
|
|
}
|
|
|
|
return docs, nil
|
|
}
|