201 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Go
		
	
	
	
			
		
		
	
	
			201 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Go
		
	
	
	
/*
 | 
						|
 * Copyright 2025 coze-dev Authors
 | 
						|
 *
 | 
						|
 * Licensed under the Apache License, Version 2.0 (the "License");
 | 
						|
 * you may not use this file except in compliance with the License.
 | 
						|
 * You may obtain a copy of the License at
 | 
						|
 *
 | 
						|
 *     http://www.apache.org/licenses/LICENSE-2.0
 | 
						|
 *
 | 
						|
 * Unless required by applicable law or agreed to in writing, software
 | 
						|
 * distributed under the License is distributed on an "AS IS" BASIS,
 | 
						|
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
						|
 * See the License for the specific language governing permissions and
 | 
						|
 * limitations under the License.
 | 
						|
 */
 | 
						|
 | 
						|
package builtin
 | 
						|
 | 
						|
import (
 | 
						|
	"bytes"
 | 
						|
	"context"
 | 
						|
	"fmt"
 | 
						|
	"io"
 | 
						|
	"os"
 | 
						|
	"testing"
 | 
						|
 | 
						|
	"github.com/cloudwego/eino/components/document/parser"
 | 
						|
	"github.com/cloudwego/eino/schema"
 | 
						|
	"github.com/stretchr/testify/assert"
 | 
						|
 | 
						|
	"github.com/coze-dev/coze-studio/backend/infra/contract/document"
 | 
						|
	contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
 | 
						|
)
 | 
						|
 | 
						|
func TestParseCSV(t *testing.T) {
 | 
						|
	ctx := context.Background()
 | 
						|
	b, err := os.ReadFile("./test_data/test_csv.csv")
 | 
						|
	assert.NoError(t, err)
 | 
						|
 | 
						|
	r1 := bytes.NewReader(b)
 | 
						|
	c1 := &contract.Config{
 | 
						|
		FileExtension: contract.FileExtensionCSV,
 | 
						|
		ParsingStrategy: &contract.ParsingStrategy{
 | 
						|
			HeaderLine:    0,
 | 
						|
			DataStartLine: 1,
 | 
						|
			RowsCount:     20,
 | 
						|
		},
 | 
						|
		ChunkingStrategy: nil,
 | 
						|
	}
 | 
						|
	p1 := parseCSV(c1)
 | 
						|
	docs, err := p1(ctx, r1, parser.WithExtraMeta(map[string]any{
 | 
						|
		"document_id":  int64(123),
 | 
						|
		"knowledge_id": int64(456),
 | 
						|
	}))
 | 
						|
	assert.NoError(t, err)
 | 
						|
	for i, doc := range docs {
 | 
						|
		assertSheet(t, i, doc)
 | 
						|
	}
 | 
						|
 | 
						|
	// parse
 | 
						|
	r2 := bytes.NewReader(b)
 | 
						|
	c2 := &contract.Config{
 | 
						|
		FileExtension: contract.FileExtensionCSV,
 | 
						|
		ParsingStrategy: &contract.ParsingStrategy{
 | 
						|
			HeaderLine:    0,
 | 
						|
			DataStartLine: 1,
 | 
						|
			RowsCount:     10,
 | 
						|
			Columns: []*document.Column{
 | 
						|
				{
 | 
						|
					ID:       0,
 | 
						|
					Name:     "col_string_indexing",
 | 
						|
					Type:     document.TableColumnTypeString,
 | 
						|
					Nullable: false,
 | 
						|
					Sequence: 0,
 | 
						|
				},
 | 
						|
				{
 | 
						|
					ID:       0,
 | 
						|
					Name:     "col_string",
 | 
						|
					Type:     document.TableColumnTypeString,
 | 
						|
					Nullable: false,
 | 
						|
					Sequence: 1,
 | 
						|
				},
 | 
						|
				{
 | 
						|
					ID:       0,
 | 
						|
					Name:     "col_int",
 | 
						|
					Type:     document.TableColumnTypeInteger,
 | 
						|
					Nullable: false,
 | 
						|
					Sequence: 2,
 | 
						|
				},
 | 
						|
				{
 | 
						|
					ID:       0,
 | 
						|
					Name:     "col_number",
 | 
						|
					Type:     document.TableColumnTypeNumber,
 | 
						|
					Nullable: true,
 | 
						|
					Sequence: 3,
 | 
						|
				},
 | 
						|
				{
 | 
						|
					ID:       0,
 | 
						|
					Name:     "col_bool",
 | 
						|
					Type:     document.TableColumnTypeBoolean,
 | 
						|
					Nullable: true,
 | 
						|
					Sequence: 4,
 | 
						|
				},
 | 
						|
				{
 | 
						|
					ID:       0,
 | 
						|
					Name:     "col_time",
 | 
						|
					Type:     document.TableColumnTypeTime,
 | 
						|
					Nullable: true,
 | 
						|
					Sequence: 5,
 | 
						|
				},
 | 
						|
			},
 | 
						|
		},
 | 
						|
		ChunkingStrategy: nil,
 | 
						|
	}
 | 
						|
	p2 := parseCSV(c2)
 | 
						|
	docs, err = p2(ctx, r2, parser.WithExtraMeta(map[string]any{
 | 
						|
		"document_id":  int64(123),
 | 
						|
		"knowledge_id": int64(456),
 | 
						|
	}))
 | 
						|
	assert.NoError(t, err)
 | 
						|
	for i, doc := range docs {
 | 
						|
		assertSheet(t, i, doc)
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
func TestParseCSVBadCases(t *testing.T) {
 | 
						|
	t.Run("test nil row", func(t *testing.T) {
 | 
						|
		ctx := context.Background()
 | 
						|
		f, err := os.Open("test_data/test_csv_badcase_1.csv")
 | 
						|
		assert.NoError(t, err)
 | 
						|
		b, err := io.ReadAll(f)
 | 
						|
		assert.NoError(t, err)
 | 
						|
 | 
						|
		pfn := parseCSV(&contract.Config{
 | 
						|
			FileExtension: "csv",
 | 
						|
			ParsingStrategy: &contract.ParsingStrategy{
 | 
						|
				ExtractImage:        true,
 | 
						|
				ExtractTable:        true,
 | 
						|
				ImageOCR:            false,
 | 
						|
				SheetID:             nil,
 | 
						|
				HeaderLine:          0,
 | 
						|
				DataStartLine:       1,
 | 
						|
				RowsCount:           0,
 | 
						|
				IsAppend:            false,
 | 
						|
				Columns:             nil,
 | 
						|
				IgnoreColumnTypeErr: true,
 | 
						|
				ImageAnnotationType: 0,
 | 
						|
			},
 | 
						|
		})
 | 
						|
 | 
						|
		resp, err := pfn(ctx, bytes.NewReader(b))
 | 
						|
		assert.NoError(t, err)
 | 
						|
		assert.True(t, len(resp) > 0)
 | 
						|
		cols, err := document.GetDocumentColumns(resp[0])
 | 
						|
		assert.NoError(t, err)
 | 
						|
		cols[5].Nullable = false
 | 
						|
		npfn := parseCSV(&contract.Config{
 | 
						|
			FileExtension: "csv",
 | 
						|
			ParsingStrategy: &contract.ParsingStrategy{
 | 
						|
				ExtractImage:        true,
 | 
						|
				ExtractTable:        true,
 | 
						|
				ImageOCR:            false,
 | 
						|
				SheetID:             nil,
 | 
						|
				HeaderLine:          0,
 | 
						|
				DataStartLine:       1,
 | 
						|
				RowsCount:           0,
 | 
						|
				IsAppend:            false,
 | 
						|
				Columns:             cols,
 | 
						|
				IgnoreColumnTypeErr: true,
 | 
						|
				ImageAnnotationType: 0,
 | 
						|
			},
 | 
						|
		})
 | 
						|
		resp, err = npfn(ctx, bytes.NewReader(b))
 | 
						|
		assert.NoError(t, err)
 | 
						|
		assert.True(t, len(resp) > 0)
 | 
						|
		for _, item := range resp {
 | 
						|
			data, err := document.GetDocumentColumnData(item)
 | 
						|
			assert.NoError(t, err)
 | 
						|
			assert.NotNil(t, data[5].GetValue())
 | 
						|
		}
 | 
						|
	})
 | 
						|
}
 | 
						|
 | 
						|
func assertSheet(t *testing.T, i int, doc *schema.Document) {
 | 
						|
	fmt.Printf("sheet[%d]:\n", i)
 | 
						|
	assert.NotNil(t, doc.MetaData)
 | 
						|
	assert.NotNil(t, doc.MetaData[document.MetaDataKeyColumns])
 | 
						|
	cols, ok := doc.MetaData[document.MetaDataKeyColumns].([]*document.Column)
 | 
						|
	assert.True(t, ok)
 | 
						|
	assert.NotNil(t, doc.MetaData[document.MetaDataKeyColumnData])
 | 
						|
	row, ok := doc.MetaData[document.MetaDataKeyColumnData].([]*document.ColumnData)
 | 
						|
	assert.True(t, ok)
 | 
						|
	assert.Equal(t, int64(123), doc.MetaData["document_id"].(int64))
 | 
						|
	assert.Equal(t, int64(456), doc.MetaData["knowledge_id"].(int64))
 | 
						|
	for j := range row {
 | 
						|
		col := cols[j]
 | 
						|
		val := row[j]
 | 
						|
		fmt.Printf("row[%d]: %v=%v\n", j, col.Name, val.GetStringValue())
 | 
						|
	}
 | 
						|
}
 |