coze-studio/backend/infra/impl/document/parser/builtin/chunk_custom.go

117 lines
2.9 KiB
Go

/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package builtin
import (
"context"
"fmt"
"regexp"
"strings"
"github.com/cloudwego/eino/components/document/parser"
"github.com/cloudwego/eino/schema"
contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
)
// Pre-compiled patterns used by ChunkCustom when cleaning up text before chunking.
var (
// spaceRegex matches a run of one or more whitespace characters (`\s`).
spaceRegex = regexp.MustCompile(`\s+`)
// urlRegex matches "http://" / "https://" URLs and bare "www." hosts; each
// match extends up to the next whitespace character.
urlRegex = regexp.MustCompile(`https?://\S+|www\.\S+`)
// emailRegex matches e-mail-like tokens: a local part, "@", a domain, and a
// final label of at least two ASCII letters.
emailRegex = regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`)
)
// ChunkCustom splits text into documents according to config.ChunkingStrategy.
//
// The text is split on the strategy's Separator. Each part is optionally
// cleaned (URLs and e-mail addresses removed when TrimURLAndEmail is set;
// surrounding whitespace trimmed and inner whitespace runs collapsed to one
// space when TrimSpace is set) and then cut into chunks of at most ChunkSize
// runes. A chunk never spans two parts. When a part needs more than one chunk
// and Overlap > 0, every following chunk of that part starts with the tail of
// the previous chunk; the tail is computed by getOverlap, which treats Overlap
// as a percentage of ChunkSize.
//
// Every document gets its own MetaData map holding a copy of the ExtraMeta
// entries from the parser options.
//
// An error is returned when config is nil, ChunkSize is not positive, or the
// overlap is so large that chunking could not make progress.
//
// Note: strings.Split with an empty Separator yields one part per UTF-8
// sequence, so an empty Separator turns every character into its own chunk.
// NOTE(review): config.ChunkingStrategy is read without a nil check; if it is
// a pointer field, callers must populate it.
func ChunkCustom(_ context.Context, text string, config *contract.Config, opts ...parser.Option) (docs []*schema.Document, err error) {
	if config == nil {
		return nil, fmt.Errorf("[ChunkCustom] invalid param, config is nil")
	}
	cs := config.ChunkingStrategy
	if cs.Overlap >= cs.ChunkSize {
		return nil, fmt.Errorf("[ChunkCustom] invalid param, overlap >= chunk_size")
	}
	// With a negative overlap the check above lets a non-positive chunk size
	// through, which would spin forever (size 0) or slice with a negative
	// bound (size < 0).
	if cs.ChunkSize <= 0 {
		return nil, fmt.Errorf("[ChunkCustom] invalid param, chunk_size <= 0")
	}
	// getOverlap interprets Overlap as a percentage of ChunkSize. At 100 or
	// more the next chunk would be seeded with a complete chunk, leaving no
	// room for new input, so the loop below would never terminate.
	if cs.Overlap >= 100 {
		return nil, fmt.Errorf("[ChunkCustom] invalid param, overlap ratio >= 100")
	}

	options := parser.GetCommonOptions(&parser.Options{ExtraMeta: map[string]any{}}, opts...)

	// trim applies the optional clean-up steps selected by the strategy.
	trim := func(s string) string {
		if cs.TrimURLAndEmail {
			s = urlRegex.ReplaceAllString(s, "")
			s = emailRegex.ReplaceAllString(s, "")
		}
		if cs.TrimSpace {
			s = strings.TrimSpace(s)
			s = spaceRegex.ReplaceAllString(s, " ")
		}
		return s
	}

	var buffer []rune

	// flush emits the buffered runes as one document and empties the buffer.
	// An empty buffer produces no document.
	flush := func() {
		if len(buffer) == 0 {
			return
		}
		meta := make(map[string]any, len(options.ExtraMeta))
		for k, v := range options.ExtraMeta {
			meta[k] = v
		}
		docs = append(docs, &schema.Document{
			Content:  string(buffer),
			MetaData: meta,
		})
		// string(buffer) copied the runes, so the backing array can be reused.
		buffer = buffer[:0]
	}

	for _, part := range strings.Split(text, cs.Separator) {
		runes := []rune(trim(part))
		for len(runes) > 0 {
			// The buffer is always shorter than ChunkSize here (empty or an
			// overlap tail), so at least one rune is consumed per iteration.
			take := min(int64(len(runes)), cs.ChunkSize-int64(len(buffer)))
			buffer = append(buffer, runes[:take]...)
			runes = runes[take:]
			if int64(len(buffer)) < cs.ChunkSize {
				continue // part exhausted; the trailing flush emits the remainder
			}
			flush()
			// Seed the next chunk with the tail of the one just emitted, but
			// only while more of this part remains. Otherwise the trailing
			// flush would emit a chunk consisting solely of repeated text.
			if cs.Overlap > 0 && len(runes) > 0 {
				buffer = getOverlap([]rune(docs[len(docs)-1].Content), cs.Overlap, cs.ChunkSize)
			}
		}
		// Close the current chunk at the part boundary.
		flush()
	}
	return docs, nil
}
// getOverlap returns the trailing runes of a chunk that should be repeated at
// the start of the next chunk.
//
// overlapRatio is a percentage of chunkSize: the overlap length is
// chunkSize*overlapRatio/100, truncated toward zero. When runes is no longer
// than that length, the whole slice is returned. A non-positive computed
// length yields no overlap (a nil slice) instead of slicing past the end of
// runes.
//
// A non-nil result shares its backing array with runes.
func getOverlap(runes []rune, overlapRatio int64, chunkSize int64) []rune {
	overlap := int64(float64(chunkSize) * float64(overlapRatio) / 100)
	if overlap <= 0 {
		// A negative length would make the slice expression below start
		// beyond len(runes) and panic.
		return nil
	}
	total := int64(len(runes))
	if total <= overlap {
		return runes
	}
	return runes[total-overlap:]
}