/*
 * Copyright 2025 coze-dev Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
package builtin
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"github.com/cloudwego/eino/components/document/parser"
|
|
"github.com/cloudwego/eino/schema"
|
|
|
|
contract "github.com/coze-dev/coze-studio/backend/infra/contract/document/parser"
|
|
)
|
|
|
|
// spaceRegex matches a run of one or more whitespace characters.
var spaceRegex = regexp.MustCompile(`\s+`)

// urlRegex matches an http:// or https:// URL, or a token beginning with
// "www.", extending up to the next whitespace character.
var urlRegex = regexp.MustCompile(`https?://\S+|www\.\S+`)

// emailRegex matches a simple e-mail address: a local part, "@", a domain,
// and a final alphabetic label of at least two letters.
var emailRegex = regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`)
|
|
|
|
func ChunkCustom(_ context.Context, text string, config *contract.Config, opts ...parser.Option) (docs []*schema.Document, err error) {
|
|
cs := config.ChunkingStrategy
|
|
if cs.Overlap >= cs.ChunkSize {
|
|
return nil, fmt.Errorf("[ChunkCustom] invalid param, overlap >= chunk_size")
|
|
}
|
|
|
|
var (
|
|
parts = strings.Split(text, cs.Separator)
|
|
buffer []rune
|
|
currentLength int64
|
|
options = parser.GetCommonOptions(&parser.Options{ExtraMeta: map[string]any{}}, opts...)
|
|
)
|
|
|
|
trim := func(text string) string {
|
|
if cs.TrimURLAndEmail {
|
|
text = urlRegex.ReplaceAllString(text, "")
|
|
text = emailRegex.ReplaceAllString(text, "")
|
|
}
|
|
|
|
if cs.TrimSpace {
|
|
text = strings.TrimSpace(text)
|
|
text = spaceRegex.ReplaceAllString(text, " ")
|
|
}
|
|
|
|
return text
|
|
}
|
|
|
|
add := func() {
|
|
if len(buffer) == 0 {
|
|
return
|
|
}
|
|
doc := &schema.Document{
|
|
Content: string(buffer),
|
|
MetaData: map[string]any{},
|
|
}
|
|
for k, v := range options.ExtraMeta {
|
|
doc.MetaData[k] = v
|
|
}
|
|
docs = append(docs, doc)
|
|
buffer = []rune{}
|
|
}
|
|
|
|
processPart := func(part string) {
|
|
runes := []rune(part)
|
|
for partLength := int64(len(runes)); partLength > 0; partLength = int64(len(runes)) {
|
|
pos := min(partLength, cs.ChunkSize-currentLength)
|
|
buffer = append(buffer, runes[:pos]...)
|
|
currentLength = int64(len(buffer))
|
|
|
|
if currentLength >= cs.ChunkSize {
|
|
add()
|
|
if cs.Overlap > 0 {
|
|
buffer = getOverlap([]rune(docs[len(docs)-1].Content), cs.Overlap, cs.ChunkSize)
|
|
currentLength = int64(len(buffer))
|
|
} else {
|
|
currentLength = 0
|
|
}
|
|
}
|
|
runes = runes[pos:]
|
|
}
|
|
|
|
add()
|
|
}
|
|
|
|
for _, part := range parts {
|
|
processPart(trim(part))
|
|
}
|
|
|
|
add()
|
|
|
|
return docs, nil
|
|
}
|
|
|
|
// getOverlap returns the trailing runes of a finished chunk that should be
// repeated at the start of the next chunk.
//
// overlapRatio is a percentage of chunkSize: the carry-over length is
// chunkSize*overlapRatio/100, truncated toward zero. If runes is no longer
// than that length, runes is returned unchanged. A non-positive carry-over
// length yields an empty slice. The result is a sub-slice of runes, not a
// copy.
func getOverlap(runes []rune, overlapRatio int64, chunkSize int64) []rune {
	overlap := int64(float64(chunkSize) * float64(overlapRatio) / 100)
	if overlap <= 0 {
		// A negative length would slice past the end of runes and panic.
		return runes[len(runes):]
	}
	if int64(len(runes)) <= overlap {
		return runes
	}
	return runes[len(runes)-int(overlap):]
}
|