// Source: coze-studio/backend/infra/impl/document/parser/builtin/parse_xlsx_test.go

/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package builtin
import (
"bytes"
"context"
"os"
"testing"
"github.com/cloudwego/eino/components/document/parser"
"github.com/stretchr/testify/assert"
"github.com/coze-dev/coze-studio/backend/infra/contract/document"
contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
)
// TestParseXLSX parses the ./test_data/test_xlsx.xlsx fixture using a column
// schema that declares string, integer, number, boolean and time columns, and
// then runs assertSheet on every document the parser returns.
func TestParseXLSX(t *testing.T) {
	ctx := context.Background()

	b, err := os.ReadFile("./test_data/test_xlsx.xlsx")
	// assert.NoError only records the failure; stop here so an unreadable
	// fixture is not followed by a confusing parse error on an empty reader.
	if !assert.NoError(t, err) {
		return
	}
	reader := bytes.NewReader(b)

	config := &contract.Config{
		FileExtension: contract.FileExtensionXLSX,
		ParsingStrategy: &contract.ParsingStrategy{
			HeaderLine:    0,
			DataStartLine: 1,
			RowsCount:     10,
			Columns: []*document.Column{
				{
					ID:       0,
					Name:     "col_string_indexing",
					Type:     document.TableColumnTypeString,
					Nullable: false,
					Sequence: 0,
				},
				{
					ID:       0,
					Name:     "col_string",
					Type:     document.TableColumnTypeString,
					Nullable: true,
					Sequence: 1,
				},
				{
					ID:       0,
					Name:     "col_int",
					Type:     document.TableColumnTypeInteger,
					Nullable: false,
					Sequence: 2,
				},
				{
					ID:       0,
					Name:     "col_number",
					Type:     document.TableColumnTypeNumber,
					Nullable: true,
					Sequence: 3,
				},
				{
					ID:       0,
					Name:     "col_bool",
					Type:     document.TableColumnTypeBoolean,
					Nullable: true,
					Sequence: 4,
				},
				{
					ID:       0,
					Name:     "col_time",
					Type:     document.TableColumnTypeTime,
					Nullable: true,
					Sequence: 5,
				},
			},
		},
		ChunkingStrategy: nil,
	}

	pfn := ParseXLSX(config)
	docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{
		"document_id":  int64(123),
		"knowledge_id": int64(456),
	}))
	// docs carries no meaning once err is non-nil, so do not inspect it.
	if !assert.NoError(t, err) {
		return
	}

	for i, doc := range docs {
		assertSheet(t, i, doc)
	}
}
// TestParseXLSXConvertColumnType parses the ./test_data/test_xlsx.xlsx fixture
// with IgnoreColumnTypeErr enabled and a schema whose declared types differ
// from the column names (col_string as integer, col_int and col_number as
// string), and then runs assertSheet on every document the parser returns.
func TestParseXLSXConvertColumnType(t *testing.T) {
	ctx := context.Background()

	b, err := os.ReadFile("./test_data/test_xlsx.xlsx")
	// assert.NoError only records the failure; stop here so an unreadable
	// fixture is not followed by a confusing parse error on an empty reader.
	if !assert.NoError(t, err) {
		return
	}
	reader := bytes.NewReader(b)

	config := &contract.Config{
		FileExtension: contract.FileExtensionXLSX,
		ParsingStrategy: &contract.ParsingStrategy{
			HeaderLine:          0,
			DataStartLine:       1,
			RowsCount:           10,
			IgnoreColumnTypeErr: true,
			Columns: []*document.Column{
				{
					ID:       0,
					Name:     "col_string_indexing",
					Type:     document.TableColumnTypeString,
					Nullable: false,
					Sequence: 0,
				},
				{
					ID:       0,
					Name:     "col_string",
					Type:     document.TableColumnTypeInteger, // string -> int: null
					Nullable: true,
					Sequence: 1,
				},
				{
					ID:       0,
					Name:     "col_int",
					Type:     document.TableColumnTypeString, // int -> string: strconv
					Nullable: false,
					Sequence: 2,
				},
				{
					ID:       0,
					Name:     "col_number",
					Type:     document.TableColumnTypeString, // float -> string: strconv
					Nullable: true,
					Sequence: 3,
				},
				// NOTE(review): the two columns below were already disabled;
				// the reason is not recorded here — confirm before re-enabling.
				//{
				//	ID:       0,
				//	Name:     "col_bool",
				//	Type:     document.TableColumnTypeBoolean, // trim
				//	Nullable: true,
				//	Sequence: 4,
				//},
				//{
				//	ID:       0,
				//	Name:     "col_time",
				//	Type:     document.TableColumnTypeTime, // trim
				//	Nullable: true,
				//	Sequence: 5,
				//},
			},
		},
		ChunkingStrategy: nil,
	}

	pfn := ParseXLSX(config)
	docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{
		"document_id":  int64(123),
		"knowledge_id": int64(456),
	}))
	// docs carries no meaning once err is non-nil, so do not inspect it.
	if !assert.NoError(t, err) {
		return
	}

	for i, doc := range docs {
		assertSheet(t, i, doc)
	}
}