feat: manually mirror opencoze's code from bytedance

Change-Id: I09a73aadda978ad9511264a756b2ce51f5761adf
This commit is contained in:
fanlv
2025-07-20 17:36:12 +08:00
commit 890153324f
14811 changed files with 1923430 additions and 0 deletions

View File

@@ -0,0 +1,199 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { get } from 'lodash-es';
import { useKnowledgeParams } from '@coze-data/knowledge-stores';
import { DataNamespace, dataReporter } from '@coze-data/reporter';
import {
type UnitItem,
OptType,
UploadStatus,
type CreateUnitStatus,
UnitType,
} from '@coze-data/knowledge-resource-processor-core';
import { REPORT_EVENTS } from '@coze-arch/report-events';
import { I18n } from '@coze-arch/i18n';
import { CustomError } from '@coze-arch/bot-error';
import {
type DocumentInfo,
DocumentStatus,
type DocumentProgress,
} from '@coze-arch/bot-api/knowledge';
import { type UploadFileData } from '@coze-arch/bot-api/developer_api';
import { Toast } from '@coze-arch/coze-design';
import { SUCCESSFUL_UPLOAD_PROGRESS } from '../constants';
/**
 * Marks the unit at position `index` as successfully uploaded.
 *
 * When `data` is undefined the input `unitList` is returned as-is (same
 * reference). Otherwise a new array is returned in which only the unit at
 * `index` is replaced by a copy carrying the uploaded URI (or '' when the
 * response has none), a SUCCESS status, 100 percent and the file instance.
 */
export const transformUnitList = ({
  unitList,
  data,
  fileInstance,
  index,
}: {
  unitList: UnitItem[];
  data: UploadFileData | undefined;
  fileInstance: File;
  index: number;
}): UnitItem[] => {
  if (!data) {
    return unitList;
  }
  const nextList = unitList.map((unit, position) =>
    position !== index
      ? unit
      : {
          ...unit,
          uri: data.upload_uri || '',
          status: UploadStatus.SUCCESS,
          percent: 100,
          fileInstance,
        },
  );
  // TODO: the `as` assertion below is still unresolved.
  return nextList as UnitItem[];
};
/**
 * Sends an error event to the KNOWLEDGE namespace when at least one progress
 * entry has a Failed status; does nothing otherwise.
 *
 * @param data progress entries to inspect
 */
export function reportFailGetProgress(data: DocumentProgress[]) {
  const failIds = data.filter(({ status }) => status === DocumentStatus.Failed);
  if (failIds.length === 0) {
    return;
  }
  const eventName = REPORT_EVENTS.KnowledgeGetTaskProgress;
  const message = `${eventName}: get progress fail. ${JSON.stringify(failIds)}`;
  dataReporter.errorEvent(DataNamespace.KNOWLEDGE, {
    eventName,
    error: new CustomError(eventName, message),
    meta: { failIds },
  });
}
/**
 * Tells whether polling can stop: the list is non-empty and every entry has
 * either reached SUCCESSFUL_UPLOAD_PROGRESS or has a Failed status.
 */
export function isStopPolling(data: DocumentProgress[]) {
  if (data.length === 0) {
    return false;
  }
  // An entry is still pending when it is neither complete nor failed.
  const hasPending = data.some(
    item =>
      item.progress !== SUCCESSFUL_UPLOAD_PROGRESS &&
      item.status !== DocumentStatus.Failed,
  );
  return !hasPending;
}
/**
 * Cancels the timer stored in `pollingId` and resets the ref to undefined.
 * A falsy `current` value is left untouched.
 */
export const clearPolling = (
  pollingId: React.MutableRefObject<number | undefined>,
) => {
  const timerId = pollingId.current;
  if (!timerId) {
    return;
  }
  clearTimeout(timerId);
  pollingId.current = undefined;
};
/**
 * Reads the `opt` entry of the knowledge params, using OptType.ADD as the
 * default passed to lodash `get`.
 */
export function useOptFromQuery(): OptType {
  const params = useKnowledgeParams();
  return get(params, 'opt', OptType.ADD) as OptType;
}
/**
 * Reads the `docID` entry of the knowledge params.
 *
 * Returns undefined (not an empty string) when the entry cannot be resolved.
 * Open question carried over from the original note: is the `docID` entry
 * point still in use?
 */
export function useDocIdFromQuery(): string | undefined {
  const params = useKnowledgeParams();
  const docId = get(params, 'docID', undefined);
  return docId;
}
/**
 * Returns the lower-cased extension of a file name (the text after the last
 * '.'), without the dot.
 *
 * Returns '' when the name contains no '.' at all or ends with one, so that a
 * name such as "README" is not reported as having the extension "readme".
 *
 * @param name file name to inspect
 */
export const getFileExtension = (name: string) => {
  const dotIndex = name.lastIndexOf('.');
  if (dotIndex === -1) {
    // No dot: there is no extension (previously the whole name was returned).
    return '';
  }
  return name.slice(dotIndex + 1).toLowerCase();
};
/**
 * Reads `file` as a data URL and resolves with everything after the first
 * comma (the prefix up to and including that comma is stripped).
 *
 * Rejects with a CustomError when the result is empty or not a string, when
 * the read fails (an error toast is shown as well), or when it is aborted.
 */
export const getBase64 = (file: Blob): Promise<string> =>
  new Promise((resolve, reject) => {
    const reader = new FileReader();
    const fail = (reason: string) => {
      reject(new CustomError('getBase64', reason));
    };
    reader.onload = event => {
      const content = event.target?.result;
      if (typeof content === 'string' && content) {
        resolve(content.replace(/^.*?,/, ''));
        return;
      }
      fail('file read invalid');
    };
    reader.onerror = () => {
      Toast.error(I18n.t('read_file_failed_please_retry'));
      fail('file read fail');
    };
    reader.onabort = () => {
      fail('file read abort');
    };
    reader.readAsDataURL(file);
  });
/**
 * Reads `file` as an ArrayBuffer and resolves with its bytes as a Uint8Array.
 *
 * Rejects with a CustomError when the read produces no usable buffer, when it
 * fails, or when it is aborted. Without the error/abort handlers the returned
 * promise would never settle in those cases.
 *
 * @param file blob to read
 */
export const getUint8Array = (file: Blob): Promise<Uint8Array> =>
  new Promise((resolve, reject) => {
    const fileReader = new FileReader();
    fileReader.onload = event => {
      const result = event.target?.result;
      // readAsArrayBuffer yields an ArrayBuffer; a missing or string result is invalid.
      if (!result || typeof result === 'string') {
        reject(new CustomError('getUint8Array', 'file read invalid'));
        return;
      }
      resolve(new Uint8Array(result));
    };
    fileReader.onerror = () => {
      reject(new CustomError('getUint8Array', 'file read fail'));
    };
    fileReader.onabort = () => {
      reject(new CustomError('getUint8Array', 'file read abort'));
    };
    fileReader.readAsArrayBuffer(file);
  });
/**
 * Sends an error event named `reportEventName` to the KNOWLEDGE namespace when
 * at least one document has a Failed status; does nothing otherwise.
 *
 * @param docInfos documents to inspect
 * @param reportEventName event name used for both the event and the error
 */
export function reportProcessDocumentFail(
  docInfos: DocumentInfo[],
  reportEventName: string,
) {
  const failDocumentIds = docInfos.filter(
    ({ status }) => status === DocumentStatus.Failed,
  );
  if (failDocumentIds.length === 0) {
    return;
  }
  const serialized = JSON.stringify(failDocumentIds);
  dataReporter.errorEvent(DataNamespace.KNOWLEDGE, {
    eventName: reportEventName,
    error: new CustomError(
      reportEventName,
      `${reportEventName}: fail document_ids are ${serialized}`,
    ),
    meta: { failDocumentIds },
  });
}
/**
 * Returns the translation of 'knowledge_add_unit_process_notice'.
 *
 * `taskStatus` is currently ignored. A disabled variant returned '' when
 * taskStatus === CreateUnitStatus.TASK_FINISH and the notice otherwise.
 */
export const getProcessingDescMsg = (taskStatus: CreateUnitStatus) => {
  const notice = I18n.t('knowledge_add_unit_process_notice');
  return notice;
};
/**
 * Tells whether a RESEGMENT operation targets one of the
 * TABLE_GOOGLE_DRIVE / TABLE_FEISHU unit types.
 *
 * Returns false for any other operation, and the falsy `type` itself
 * (e.g. undefined) when no type is given.
 */
export const isThirdResegment = (opt: OptType, type: UnitType | undefined) => {
  if (opt !== OptType.RESEGMENT) {
    return false;
  }
  return (
    type &&
    [UnitType.TABLE_GOOGLE_DRIVE, UnitType.TABLE_FEISHU].includes(type)
  );
};
/** Tells whether `opt` is the INCREMENTAL operation. */
export const isIncremental = (opt: OptType) => {
  return OptType.INCREMENTAL === opt;
};

View File

@@ -0,0 +1,68 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import type Cropper from 'cropperjs';
import { type CropperSizePercent } from '@/features/knowledge-type/text/interface';
/** Rounds `value` to two decimals via toFixed and parses it back to a number. */
const fixPrecision = (value: number) => Number.parseFloat(value.toFixed(2));

/**
 * Converts a pixel crop rectangle into margins expressed as fractions of the
 * page size, each rounded to two decimals.
 *
 * top/left are the rectangle's offsets; bottom/right are the remaining space
 * between the rectangle's far edge and the page edge.
 */
export const convertCropDataToPercentSize = ({
  data,
  pdfSize: { naturalHeight, naturalWidth },
}: {
  data: Cropper.Data;
  pdfSize: {
    naturalHeight: number;
    naturalWidth: number;
  };
}): CropperSizePercent => {
  const { x, y, width, height } = data;
  const marginBottom = naturalHeight - (y + height);
  const marginRight = naturalWidth - (x + width);
  return {
    topPercent: fixPrecision(y / naturalHeight),
    bottomPercent: fixPrecision(marginBottom / naturalHeight),
    leftPercent: fixPrecision(x / naturalWidth),
    rightPercent: fixPrecision(marginRight / naturalWidth),
  };
};
/**
 * Converts fractional margins back into a pixel crop rectangle for a page of
 * the given natural size. Scale is fixed at 1 and rotation at 0.
 */
export const convertPercentSizeToCropData = ({
  cropSizePercent,
  pdfSize,
}: {
  cropSizePercent: CropperSizePercent;
  pdfSize: {
    naturalHeight: number;
    naturalWidth: number;
  };
}): Cropper.Data => {
  const { naturalHeight, naturalWidth } = pdfSize;
  const left = cropSizePercent.leftPercent * naturalWidth;
  const top = cropSizePercent.topPercent * naturalHeight;
  return {
    scaleX: 1,
    scaleY: 1,
    rotate: 0,
    x: left,
    y: top,
    // Remaining extent after subtracting the near offset and the far margin.
    width: naturalWidth - left - naturalWidth * cropSizePercent.rightPercent,
    height: naturalHeight - top - naturalHeight * cropSizePercent.bottomPercent,
  };
};

View File

@@ -0,0 +1,42 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { type ResegmentRequest } from '@coze-arch/idl/knowledge';
import { type PDFDocumentFilterValue } from '@/features/knowledge-type/text/interface';
import { mapPDFFilterConfig } from './map-pdf-filter-config';
/**
 * Builds the `filter_strategy` part of a resegment request from a PDF filter
 * value. Returns an empty object when no filter value is given.
 *
 * Only page filtering is sent. Crop-box filtering (`filter_box_position` as
 * [top, right, bottom, left] from `filterValue.cropperSizePercent`) is
 * currently disabled.
 */
export const convertFilterStrategyToParams = (
  filterValue: PDFDocumentFilterValue | undefined,
): ResegmentRequest => {
  if (!filterValue) {
    return {};
  }
  const filterPage = mapPDFFilterConfig(filterValue.filterPagesConfig);
  return {
    filter_strategy: {
      filter_page: filterPage,
    },
  };
};

View File

@@ -0,0 +1,27 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { type Dataset, StorageLocation } from '@coze-arch/idl/knowledge';
/**
 * Tells whether the storage strategy can be configured for `dataset`.
 *
 * All of the following must hold:
 * - IS_CN_REGION is set (cloud search is only launched in the CN region);
 * - the dataset has no documents yet (cloud search can only be configured on
 *   the knowledge base's first upload);
 * - the dataset still uses the default storage location.
 */
export function getStorageStrategyEnabled(dataset?: Dataset) {
  const isFirstUpload = dataset?.doc_count === 0;
  const usesDefaultStorage =
    dataset?.storage_location === StorageLocation.Default;
  return IS_CN_REGION && isFirstUpload && usesDefaultStorage;
}

View File

@@ -0,0 +1,43 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Re-exports of the table utilities. `IValidateRes` is an interface, so it is
// re-exported with the inline `type` modifier; single-file transpilers
// (isolatedModules) cannot tell a type-only re-export from a value otherwise.
export {
  getFrequencyMap,
  type IValidateRes,
  validateField,
  tableSettingsToString,
} from './table';
export {
transformUnitList,
reportFailGetProgress,
isStopPolling,
clearPolling,
useOptFromQuery,
useDocIdFromQuery,
getFileExtension,
getBase64,
getUint8Array,
reportProcessDocumentFail,
getProcessingDescMsg,
isThirdResegment,
isIncremental,
} from './common';
export { getSegmentCleanerParams } from './text';
export { getStorageStrategyEnabled } from './get-storage-strategy-enabled';
export { validateCommonDocResegmentStep } from './validate-common-doc-next-step';

View File

@@ -0,0 +1,25 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import {
type DocumentInfo,
DocumentSource,
FormatType,
} from '@coze-arch/idl/knowledge';
/**
 * Tells whether `document` has the Text format type and the Document source
 * type.
 */
export const isLocalTextDocument = (document: DocumentInfo) => {
  const isTextFormat = document.format_type === FormatType.Text;
  return isTextFormat && document.source_type === DocumentSource.Document;
};

View File

@@ -0,0 +1,27 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { type FilterPageConfig } from '@/features/knowledge-type/text/interface';
/**
 * Collects, in input order, the numeric `pageIndex` of every config whose
 * `isFilter` flag is set.
 */
export const mapPDFFilterConfig = (list: FilterPageConfig[]) =>
  list.reduce<number[]>((pages, config) => {
    if (config.isFilter && typeof config.pageIndex === 'number') {
      pages.push(config.pageIndex);
    }
    return pages;
  }, []);

View File

@@ -0,0 +1,62 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { I18n } from '@coze-arch/i18n';
import {
type FilterPageConfig,
type PDFDocumentFilterValue,
} from '@/features/knowledge-type/text/interface';
/**
 * Returns the page indexes of all configs flagged with `isFilter`, sorted in
 * ascending numeric order. The input array is not modified.
 */
export const getSortedFilterPages = (filterPagesConfig: FilterPageConfig[]) => {
  const pages = filterPagesConfig.reduce<number[]>((selected, config) => {
    if (config.isFilter) {
      selected.push(config.pageIndex);
    }
    return selected;
  }, []);
  return pages.sort((a, b) => a - b);
};
/** Joins page numbers with ' / ', e.g. [2, 4, 6] becomes "2 / 4 / 6". */
export const getFilterPagesString = (pages: number[]) => {
  const separator = ' / ';
  return pages.join(separator);
};
/**
 * Renders one line per filtered PDF, joined with '\n'. Each line has the form
 * `<pdf name>: <translated 'data_filter_values' text>`, where the translation
 * receives the sorted page list formatted like "2 / 4 / 6".
 *
 * A filter value is skipped when no PDF in `pdfList` has the same `uri`, or
 * when it has no pages flagged for filtering.
 */
export const renderDocumentFilterValue = ({
  filterValue,
  pdfList,
}: {
  filterValue: PDFDocumentFilterValue[];
  pdfList: { name: string; uri: string }[];
}) => {
  const lines: string[] = [];
  filterValue.forEach(value => {
    const matchedPdf = pdfList.find(item => item.uri === value.uri);
    if (!matchedPdf) {
      return;
    }
    const pages = getSortedFilterPages(value.filterPagesConfig);
    if (pages.length === 0) {
      return;
    }
    const description = I18n.t('data_filter_values', {
      filterPages: getFilterPagesString(pages),
    });
    lines.push(`${matchedPdf.name}: ${description}`);
  });
  return lines.join('\n');
};

View File

@@ -0,0 +1,119 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** 此文件放的是 table 通用 utils */
import { get } from 'lodash-es';
import { I18n } from '@coze-arch/i18n';
import { type TableSettings } from '../types';
import { FrequencyDay, TableSettingFormFields } from '../constants';
/**
 * Returns the translated label for an update interval: a "no update" label for
 * FrequencyDay.ZERO and a per-day label (with `num` set to 1, 3, 7 or 30) for
 * the other listed intervals.
 */
export const getFrequencyMap = (updateInterval: FrequencyDay): string => {
  const everyNDays = (num: number) =>
    I18n.t('datasets_frequencyModal_frequency_day', { num });
  const labels = {
    [FrequencyDay.ZERO]: I18n.t('datasets_frequencyModal_frequency_noUpdate'),
    [FrequencyDay.ONE]: everyNDays(1),
    [FrequencyDay.THREE]: everyNDays(3),
    [FrequencyDay.SEVEN]: everyNDays(7),
    [FrequencyDay.THIRTY]: everyNDays(30),
  };
  return labels[updateInterval];
};
/** Outcome of a field validation, as returned by validateField. */
export interface IValidateRes {
  /** Whether the validated value passed all checks. */
  valid: boolean;
  /** Message for the failed check; validateField returns '' when valid. */
  errorMsg: string;
}
/**
 * Validates a tableStructure column or table name.
 *
 * - An empty name is invalid and reports `emptyMsg`.
 * - The reserved name '_knowledge_slice_id' (exact match) is invalid.
 * - A name containing a single quote, double quote, backslash or backtick is
 *   invalid.
 *
 * @param fieldName name to validate
 * @param emptyMsg message returned when the name is empty
 */
export const validateField = (
  fieldName: string,
  emptyMsg = '',
): IValidateRes => {
  if (!fieldName) {
    return { valid: false, errorMsg: emptyMsg };
  }
  // The reserved name contains no special character, so the two checks below
  // can never both apply to the same input.
  if (fieldName === '_knowledge_slice_id') {
    return {
      valid: false,
      errorMsg: I18n.t('knowledge_tableStructure_errSystemField'),
    };
  }
  const notationReg = /["'`\\]+/g;
  if (notationReg.test(fieldName)) {
    return {
      valid: false,
      errorMsg: I18n.t('knowledge_tableStructure_field_errLegally'),
    };
  }
  return { valid: true, errorMsg: '' };
};
/**
 * Extracts the `src` attribute value of every `<img ...>` tag in `str`, in
 * document order. Returns [] for an empty string or when no tag matches.
 *
 * The `src` attribute must be preceded by whitespace, so attributes whose
 * names merely end in "src" (e.g. `data-src`) are not mistaken for it.
 *
 * @param str HTML-like text to scan
 */
export const getSrcFromImg = (str: string): string[] => {
  if (!str) {
    return [];
  }
  // Lazy `[^>]*?` picks the first whitespace-delimited `src` attribute of each tag.
  const imgSrcRegx = /<img\b[^>]*?\ssrc\s*=\s*['"]([^'"]+)['"][^>]*>/g;
  return Array.from(str.matchAll(imgSrcRegx), match => match[1]).filter(
    (src): src is string => Boolean(src),
  );
};
/** Type guard: tells whether `key` is one of the TableSettingFormFields values. */
export const isKeyInTableSettings = (
  key: string,
): key is TableSettingFormFields => {
  const candidate = key as TableSettingFormFields;
  return Object.values(TableSettingFormFields).some(
    field => field === candidate,
  );
};
/**
 * Returns a copy of the table settings with every recognised field converted
 * to a string. Fields start out as '' and are overwritten for each key of
 * `tableSettings` that is a TableSettingFormFields value.
 */
export const tableSettingsToString = (tableSettings: TableSettings) => {
  const res: { [key in keyof TableSettings]: string } = {
    sheet_id: '',
    header_line_idx: '',
    start_line_idx: '',
  };
  for (const key of Object.keys(tableSettings)) {
    if (isKeyInTableSettings(key)) {
      res[key] = String(get(tableSettings, key));
    }
  }
  return res;
};

View File

@@ -0,0 +1,101 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import {
ChunkType,
type ChunkStrategy,
type DocumentInfo,
} from '@coze-arch/bot-api/knowledge';
import { defaultCustomSegmentRule } from '@/constants/text';
import {
SegmentMode,
PreProcessRule,
SeperatorType,
type Seperator,
} from '../types';
/**
 * Maps a chunk strategy's `chunk_type` to a segment mode: CustomChunk gives
 * CUSTOM, LevelChunk gives LEVEL, anything else gives AUTO.
 */
export const getSegmentMode = (rule: ChunkStrategy) => {
  switch (rule.chunk_type) {
    case ChunkType.CustomChunk:
      return SegmentMode.CUSTOM;
    case ChunkType.LevelChunk:
      return SegmentMode.LEVEL;
    default:
      return SegmentMode.AUTO;
  }
};
/**
 * Derives the segment mode and segment rule from a document's chunk strategy.
 *
 * Returns undefined when the document has no chunk strategy, or when building
 * the params throws. Separator, max tokens and overlap fall back to the values
 * of `defaultCustomSegmentRule` when the strategy's value is falsy.
 */
export const getSegmentCleanerParams = (docInfo: DocumentInfo) => {
  const rule = docInfo?.chunk_strategy;
  if (!rule) {
    return undefined;
  }
  try {
    const preProcessRules: PreProcessRule[] = [];
    if (rule.remove_extra_spaces) {
      preProcessRules.push(PreProcessRule.REMOVE_SPACES);
    }
    if (rule.remove_urls_emails) {
      preProcessRules.push(PreProcessRule.REMOVE_EMAILS);
    }
    const separator = rule.separator
      ? getSeparator(rule.separator as SeperatorType)
      : defaultCustomSegmentRule.separator;
    const maxTokens = rule.max_tokens
      ? Number(rule.max_tokens)
      : defaultCustomSegmentRule.maxTokens;
    const overlap = rule.overlap
      ? Number(rule.overlap)
      : defaultCustomSegmentRule.overlap;
    return {
      docInfo,
      segmentMode: getSegmentMode(rule),
      segmentRule: { separator, maxTokens, preProcessRules, overlap },
    };
  } catch (e) {
    return undefined;
  }
};
/**
 * Lists every value of the given separator-type object except the one equal to
 * its `CUSTOM` member, in the object's own enumeration order.
 */
function getSeperatorTypeExceptCustom(
  seperatorType: typeof SeperatorType,
): string[] {
  return Object.values(seperatorType).filter(
    value => value !== seperatorType.CUSTOM,
  );
}
/**
 * Wraps a raw separator value. A value equal to one of the non-CUSTOM
 * separator types becomes `{ type }`; any other value becomes a CUSTOM
 * separator carrying it as `customValue`.
 */
export const getSeparator = (separator: SeperatorType): Seperator => {
  const presetTypes = getSeperatorTypeExceptCustom(SeperatorType);
  return presetTypes.includes(separator)
    ? { type: separator }
    : { type: SeperatorType.CUSTOM, customValue: separator };
};

View File

@@ -0,0 +1,42 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { FooterBtnStatus } from '@coze-data/knowledge-resource-processor-core';
import { type CustomSegmentRule, SegmentMode, SeperatorType } from '../types';
/**
 * Decides whether the footer button is enabled for the resegment step.
 *
 * Non-CUSTOM modes are always enabled. In CUSTOM mode the button is disabled
 * when max tokens is missing or zero, when a CUSTOM separator has no value, or
 * when overlap is not a real number.
 */
export const validateCommonDocResegmentStep = (
  segmentMode: SegmentMode,
  segmentRule: CustomSegmentRule,
): FooterBtnStatus => {
  if (segmentMode !== SegmentMode.CUSTOM) {
    return FooterBtnStatus.ENABLE;
  }
  // Checked first so a missing rule is rejected before its fields are read.
  if (!segmentRule?.maxTokens) {
    return FooterBtnStatus.DISABLE;
  }
  const separator = segmentRule.separator;
  if (separator?.type === SeperatorType.CUSTOM && !separator?.customValue) {
    return FooterBtnStatus.DISABLE;
  }
  const { overlap } = segmentRule;
  if (typeof overlap !== 'number' || Number.isNaN(overlap)) {
    return FooterBtnStatus.DISABLE;
  }
  return FooterBtnStatus.ENABLE;
};