117 lines
		
	
	
		
			2.9 KiB
		
	
	
	
		
			Go
		
	
	
	
			
		
		
	
	
			117 lines
		
	
	
		
			2.9 KiB
		
	
	
	
		
			Go
		
	
	
	
/*
 * Copyright 2025 coze-dev Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 | 
						|
 | 
						|
package builtin
 | 
						|
 | 
						|
import (
 | 
						|
	"context"
 | 
						|
	"fmt"
 | 
						|
	"regexp"
 | 
						|
	"strings"
 | 
						|
 | 
						|
	"github.com/cloudwego/eino/components/document/parser"
 | 
						|
	"github.com/cloudwego/eino/schema"
 | 
						|
 | 
						|
	contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
 | 
						|
)
 | 
						|
 | 
						|
// Patterns used to clean a piece of text before it is chunked. They are
// compiled once at package initialization.
var (
	// spaceRegex matches any run of one or more whitespace characters.
	spaceRegex = regexp.MustCompile(`\s+`)
	// urlRegex matches http:// and https:// URLs as well as tokens starting
	// with "www.", each extending up to the next whitespace character.
	urlRegex = regexp.MustCompile(`https?://\S+|www\.\S+`)
	// emailRegex matches tokens shaped like local-part@domain.tld, where the
	// final label consists of at least two ASCII letters.
	emailRegex = regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`)
)
 | 
						|
 | 
						|
func ChunkCustom(_ context.Context, text string, config *contract.Config, opts ...parser.Option) (docs []*schema.Document, err error) {
 | 
						|
	cs := config.ChunkingStrategy
 | 
						|
	if cs.Overlap >= cs.ChunkSize {
 | 
						|
		return nil, fmt.Errorf("[ChunkCustom] invalid param, overlap >= chunk_size")
 | 
						|
	}
 | 
						|
 | 
						|
	var (
 | 
						|
		parts         = strings.Split(text, cs.Separator)
 | 
						|
		buffer        []rune
 | 
						|
		currentLength int64
 | 
						|
		options       = parser.GetCommonOptions(&parser.Options{ExtraMeta: map[string]any{}}, opts...)
 | 
						|
	)
 | 
						|
 | 
						|
	trim := func(text string) string {
 | 
						|
		if cs.TrimURLAndEmail {
 | 
						|
			text = urlRegex.ReplaceAllString(text, "")
 | 
						|
			text = emailRegex.ReplaceAllString(text, "")
 | 
						|
		}
 | 
						|
 | 
						|
		if cs.TrimSpace {
 | 
						|
			text = strings.TrimSpace(text)
 | 
						|
			text = spaceRegex.ReplaceAllString(text, " ")
 | 
						|
		}
 | 
						|
 | 
						|
		return text
 | 
						|
	}
 | 
						|
 | 
						|
	add := func() {
 | 
						|
		if len(buffer) == 0 {
 | 
						|
			return
 | 
						|
		}
 | 
						|
		doc := &schema.Document{
 | 
						|
			Content:  string(buffer),
 | 
						|
			MetaData: map[string]any{},
 | 
						|
		}
 | 
						|
		for k, v := range options.ExtraMeta {
 | 
						|
			doc.MetaData[k] = v
 | 
						|
		}
 | 
						|
		docs = append(docs, doc)
 | 
						|
		buffer = []rune{}
 | 
						|
	}
 | 
						|
 | 
						|
	processPart := func(part string) {
 | 
						|
		runes := []rune(part)
 | 
						|
		for partLength := int64(len(runes)); partLength > 0; partLength = int64(len(runes)) {
 | 
						|
			pos := min(partLength, cs.ChunkSize-currentLength)
 | 
						|
			buffer = append(buffer, runes[:pos]...)
 | 
						|
			currentLength = int64(len(buffer))
 | 
						|
 | 
						|
			if currentLength >= cs.ChunkSize {
 | 
						|
				add()
 | 
						|
				if cs.Overlap > 0 {
 | 
						|
					buffer = getOverlap([]rune(docs[len(docs)-1].Content), cs.Overlap, cs.ChunkSize)
 | 
						|
					currentLength = int64(len(buffer))
 | 
						|
				} else {
 | 
						|
					currentLength = 0
 | 
						|
				}
 | 
						|
			}
 | 
						|
			runes = runes[pos:]
 | 
						|
		}
 | 
						|
 | 
						|
		add()
 | 
						|
	}
 | 
						|
 | 
						|
	for _, part := range parts {
 | 
						|
		processPart(trim(part))
 | 
						|
	}
 | 
						|
 | 
						|
	add()
 | 
						|
 | 
						|
	return docs, nil
 | 
						|
}
 | 
						|
 | 
						|
// getOverlap returns the trailing runes of a chunk that should be repeated at
// the start of the next chunk.
//
// overlapRatio is interpreted as a percentage of chunkSize: the overlap length
// is chunkSize*overlapRatio/100, truncated toward zero. When runes is no
// longer than that length the whole slice is returned. When the computed
// length is zero or negative an empty tail is returned; without this guard a
// negative length would place the slice start past len(runes) and panic.
//
// The result is a sub-slice of runes and shares its backing array; it is not
// a copy.
func getOverlap(runes []rune, overlapRatio int64, chunkSize int64) []rune {
	overlap := int64(float64(chunkSize) * float64(overlapRatio) / 100)
	if overlap <= 0 {
		return runes[len(runes):]
	}
	if int64(len(runes)) <= overlap {
		return runes
	}
	return runes[len(runes)-int(overlap):]
}
 |