/*
 * Copyright 2025 coze-dev Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
package builtin
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"testing"
|
|
|
|
"github.com/cloudwego/eino/components/document/parser"
|
|
"github.com/cloudwego/eino/schema"
|
|
"github.com/stretchr/testify/assert"
|
|
|
|
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
|
|
contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
|
|
)
|
|
|
|
func TestParseCSV(t *testing.T) {
|
|
ctx := context.Background()
|
|
b, err := os.ReadFile("./test_data/test_csv.csv")
|
|
assert.NoError(t, err)
|
|
|
|
r1 := bytes.NewReader(b)
|
|
c1 := &contract.Config{
|
|
FileExtension: contract.FileExtensionCSV,
|
|
ParsingStrategy: &contract.ParsingStrategy{
|
|
HeaderLine: 0,
|
|
DataStartLine: 1,
|
|
RowsCount: 20,
|
|
},
|
|
ChunkingStrategy: nil,
|
|
}
|
|
p1 := ParseCSV(c1)
|
|
docs, err := p1(ctx, r1, parser.WithExtraMeta(map[string]any{
|
|
"document_id": int64(123),
|
|
"knowledge_id": int64(456),
|
|
}))
|
|
assert.NoError(t, err)
|
|
for i, doc := range docs {
|
|
assertSheet(t, i, doc)
|
|
}
|
|
|
|
// parse
|
|
r2 := bytes.NewReader(b)
|
|
c2 := &contract.Config{
|
|
FileExtension: contract.FileExtensionCSV,
|
|
ParsingStrategy: &contract.ParsingStrategy{
|
|
HeaderLine: 0,
|
|
DataStartLine: 1,
|
|
RowsCount: 10,
|
|
Columns: []*document.Column{
|
|
{
|
|
ID: 0,
|
|
Name: "col_string_indexing",
|
|
Type: document.TableColumnTypeString,
|
|
Nullable: false,
|
|
Sequence: 0,
|
|
},
|
|
{
|
|
ID: 0,
|
|
Name: "col_string",
|
|
Type: document.TableColumnTypeString,
|
|
Nullable: false,
|
|
Sequence: 1,
|
|
},
|
|
{
|
|
ID: 0,
|
|
Name: "col_int",
|
|
Type: document.TableColumnTypeInteger,
|
|
Nullable: false,
|
|
Sequence: 2,
|
|
},
|
|
{
|
|
ID: 0,
|
|
Name: "col_number",
|
|
Type: document.TableColumnTypeNumber,
|
|
Nullable: true,
|
|
Sequence: 3,
|
|
},
|
|
{
|
|
ID: 0,
|
|
Name: "col_bool",
|
|
Type: document.TableColumnTypeBoolean,
|
|
Nullable: true,
|
|
Sequence: 4,
|
|
},
|
|
{
|
|
ID: 0,
|
|
Name: "col_time",
|
|
Type: document.TableColumnTypeTime,
|
|
Nullable: true,
|
|
Sequence: 5,
|
|
},
|
|
},
|
|
},
|
|
ChunkingStrategy: nil,
|
|
}
|
|
p2 := ParseCSV(c2)
|
|
docs, err = p2(ctx, r2, parser.WithExtraMeta(map[string]any{
|
|
"document_id": int64(123),
|
|
"knowledge_id": int64(456),
|
|
}))
|
|
assert.NoError(t, err)
|
|
for i, doc := range docs {
|
|
assertSheet(t, i, doc)
|
|
}
|
|
}
|
|
|
|
func TestParseCSVBadCases(t *testing.T) {
|
|
t.Run("test nil row", func(t *testing.T) {
|
|
ctx := context.Background()
|
|
f, err := os.Open("test_data/test_csv_badcase_1.csv")
|
|
assert.NoError(t, err)
|
|
b, err := io.ReadAll(f)
|
|
assert.NoError(t, err)
|
|
|
|
pfn := ParseCSV(&contract.Config{
|
|
FileExtension: "csv",
|
|
ParsingStrategy: &contract.ParsingStrategy{
|
|
ExtractImage: true,
|
|
ExtractTable: true,
|
|
ImageOCR: false,
|
|
SheetID: nil,
|
|
HeaderLine: 0,
|
|
DataStartLine: 1,
|
|
RowsCount: 0,
|
|
IsAppend: false,
|
|
Columns: nil,
|
|
IgnoreColumnTypeErr: true,
|
|
ImageAnnotationType: 0,
|
|
},
|
|
})
|
|
|
|
resp, err := pfn(ctx, bytes.NewReader(b))
|
|
assert.NoError(t, err)
|
|
assert.True(t, len(resp) > 0)
|
|
cols, err := document.GetDocumentColumns(resp[0])
|
|
assert.NoError(t, err)
|
|
cols[5].Nullable = false
|
|
npfn := ParseCSV(&contract.Config{
|
|
FileExtension: "csv",
|
|
ParsingStrategy: &contract.ParsingStrategy{
|
|
ExtractImage: true,
|
|
ExtractTable: true,
|
|
ImageOCR: false,
|
|
SheetID: nil,
|
|
HeaderLine: 0,
|
|
DataStartLine: 1,
|
|
RowsCount: 0,
|
|
IsAppend: false,
|
|
Columns: cols,
|
|
IgnoreColumnTypeErr: true,
|
|
ImageAnnotationType: 0,
|
|
},
|
|
})
|
|
resp, err = npfn(ctx, bytes.NewReader(b))
|
|
assert.NoError(t, err)
|
|
assert.True(t, len(resp) > 0)
|
|
for _, item := range resp {
|
|
data, err := document.GetDocumentColumnData(item)
|
|
assert.NoError(t, err)
|
|
assert.NotNil(t, data[5].GetValue())
|
|
}
|
|
})
|
|
}
|
|
|
|
func assertSheet(t *testing.T, i int, doc *schema.Document) {
|
|
fmt.Printf("sheet[%d]:\n", i)
|
|
assert.NotNil(t, doc.MetaData)
|
|
assert.NotNil(t, doc.MetaData[document.MetaDataKeyColumns])
|
|
cols, ok := doc.MetaData[document.MetaDataKeyColumns].([]*document.Column)
|
|
assert.True(t, ok)
|
|
assert.NotNil(t, doc.MetaData[document.MetaDataKeyColumnData])
|
|
row, ok := doc.MetaData[document.MetaDataKeyColumnData].([]*document.ColumnData)
|
|
assert.True(t, ok)
|
|
assert.Equal(t, int64(123), doc.MetaData["document_id"].(int64))
|
|
assert.Equal(t, int64(456), doc.MetaData["knowledge_id"].(int64))
|
|
for j := range row {
|
|
col := cols[j]
|
|
val := row[j]
|
|
fmt.Printf("row[%d]: %v=%v\n", j, col.Name, val.GetStringValue())
|
|
}
|
|
}
|