172 lines
		
	
	
		
			4.1 KiB
		
	
	
	
		
			Go
		
	
	
	
			
		
		
	
	
			172 lines
		
	
	
		
			4.1 KiB
		
	
	
	
		
			Go
		
	
	
	
| /*
 | |
|  * Copyright 2025 coze-dev Authors
 | |
|  *
 | |
|  * Licensed under the Apache License, Version 2.0 (the "License");
 | |
|  * you may not use this file except in compliance with the License.
 | |
|  * You may obtain a copy of the License at
 | |
|  *
 | |
|  *     http://www.apache.org/licenses/LICENSE-2.0
 | |
|  *
 | |
|  * Unless required by applicable law or agreed to in writing, software
 | |
|  * distributed under the License is distributed on an "AS IS" BASIS,
 | |
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
|  * See the License for the specific language governing permissions and
 | |
|  * limitations under the License.
 | |
|  */
 | |
| 
 | |
| package builtin
 | |
| 
 | |
| import (
 | |
| 	"bytes"
 | |
| 	"context"
 | |
| 	"os"
 | |
| 	"testing"
 | |
| 
 | |
| 	"github.com/cloudwego/eino/components/document/parser"
 | |
| 	"github.com/stretchr/testify/assert"
 | |
| 
 | |
| 	"github.com/coze-dev/coze-studio/backend/infra/contract/document"
 | |
| 	contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
 | |
| )
 | |
| 
 | |
| func TestParseXLSX(t *testing.T) {
 | |
| 	ctx := context.Background()
 | |
| 	b, err := os.ReadFile("./test_data/test_xlsx.xlsx")
 | |
| 	assert.NoError(t, err)
 | |
| 	reader := bytes.NewReader(b)
 | |
| 	config := &contract.Config{
 | |
| 		FileExtension: contract.FileExtensionXLSX,
 | |
| 		ParsingStrategy: &contract.ParsingStrategy{
 | |
| 			HeaderLine:    0,
 | |
| 			DataStartLine: 1,
 | |
| 			RowsCount:     10,
 | |
| 			Columns: []*document.Column{
 | |
| 				{
 | |
| 					ID:       0,
 | |
| 					Name:     "col_string_indexing",
 | |
| 					Type:     document.TableColumnTypeString,
 | |
| 					Nullable: false,
 | |
| 					Sequence: 0,
 | |
| 				},
 | |
| 				{
 | |
| 					ID:       0,
 | |
| 					Name:     "col_string",
 | |
| 					Type:     document.TableColumnTypeString,
 | |
| 					Nullable: true,
 | |
| 					Sequence: 1,
 | |
| 				},
 | |
| 				{
 | |
| 					ID:       0,
 | |
| 					Name:     "col_int",
 | |
| 					Type:     document.TableColumnTypeInteger,
 | |
| 					Nullable: false,
 | |
| 					Sequence: 2,
 | |
| 				},
 | |
| 				{
 | |
| 					ID:       0,
 | |
| 					Name:     "col_number",
 | |
| 					Type:     document.TableColumnTypeNumber,
 | |
| 					Nullable: true,
 | |
| 					Sequence: 3,
 | |
| 				},
 | |
| 				{
 | |
| 					ID:       0,
 | |
| 					Name:     "col_bool",
 | |
| 					Type:     document.TableColumnTypeBoolean,
 | |
| 					Nullable: true,
 | |
| 					Sequence: 4,
 | |
| 				},
 | |
| 				{
 | |
| 					ID:       0,
 | |
| 					Name:     "col_time",
 | |
| 					Type:     document.TableColumnTypeTime,
 | |
| 					Nullable: true,
 | |
| 					Sequence: 5,
 | |
| 				},
 | |
| 			},
 | |
| 		},
 | |
| 		ChunkingStrategy: nil,
 | |
| 	}
 | |
| 
 | |
| 	pfn := parseXLSX(config)
 | |
| 	docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{
 | |
| 		"document_id":  int64(123),
 | |
| 		"knowledge_id": int64(456),
 | |
| 	}))
 | |
| 	assert.NoError(t, err)
 | |
| 	for i, doc := range docs {
 | |
| 		assertSheet(t, i, doc)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func TestParseXLSXConvertColumnType(t *testing.T) {
 | |
| 	ctx := context.Background()
 | |
| 	b, err := os.ReadFile("./test_data/test_xlsx.xlsx")
 | |
| 	assert.NoError(t, err)
 | |
| 	reader := bytes.NewReader(b)
 | |
| 	config := &contract.Config{
 | |
| 		FileExtension: contract.FileExtensionXLSX,
 | |
| 		ParsingStrategy: &contract.ParsingStrategy{
 | |
| 			HeaderLine:          0,
 | |
| 			DataStartLine:       1,
 | |
| 			RowsCount:           10,
 | |
| 			IgnoreColumnTypeErr: true,
 | |
| 			Columns: []*document.Column{
 | |
| 				{
 | |
| 					ID:       0,
 | |
| 					Name:     "col_string_indexing",
 | |
| 					Type:     document.TableColumnTypeString,
 | |
| 					Nullable: false,
 | |
| 					Sequence: 0,
 | |
| 				},
 | |
| 				{
 | |
| 					ID:       0,
 | |
| 					Name:     "col_string",
 | |
| 					Type:     document.TableColumnTypeInteger, // string -> int: null
 | |
| 					Nullable: true,
 | |
| 					Sequence: 1,
 | |
| 				},
 | |
| 				{
 | |
| 					ID:       0,
 | |
| 					Name:     "col_int",
 | |
| 					Type:     document.TableColumnTypeString, // int -> string: strconv
 | |
| 					Nullable: false,
 | |
| 					Sequence: 2,
 | |
| 				},
 | |
| 				{
 | |
| 					ID:       0,
 | |
| 					Name:     "col_number",
 | |
| 					Type:     document.TableColumnTypeString, // float -> string: strconv
 | |
| 					Nullable: true,
 | |
| 					Sequence: 3,
 | |
| 				},
 | |
| 				//{
 | |
| 				//	ID:       0,
 | |
| 				//	Name:     "col_bool",
 | |
| 				//	Type:     document.TableColumnTypeBoolean, // trim
 | |
| 				//	Nullable: true,
 | |
| 				//	Sequence: 4,
 | |
| 				//},
 | |
| 				//{
 | |
| 				//	ID:       0,
 | |
| 				//	Name:     "col_time",
 | |
| 				//	Type:     document.TableColumnTypeTime, // trim
 | |
| 				//	Nullable: true,
 | |
| 				//	Sequence: 5,
 | |
| 				//},
 | |
| 			},
 | |
| 		},
 | |
| 		ChunkingStrategy: nil,
 | |
| 	}
 | |
| 
 | |
| 	pfn := parseXLSX(config)
 | |
| 	docs, err := pfn(ctx, reader, parser.WithExtraMeta(map[string]any{
 | |
| 		"document_id":  int64(123),
 | |
| 		"knowledge_id": int64(456),
 | |
| 	}))
 | |
| 	assert.NoError(t, err)
 | |
| 	for i, doc := range docs {
 | |
| 		assertSheet(t, i, doc)
 | |
| 	}
 | |
| }
 |