feat: manually mirror opencoze's code from bytedance

Change-Id: I09a73aadda978ad9511264a756b2ce51f5761adf
This commit is contained in:
fanlv
2025-07-20 17:36:12 +08:00
commit 890153324f
14811 changed files with 1923430 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Progress value that marks an upload as successful.
 * NOTE(review): presumably a percentage on a 0-100 scale — confirm against the uploader.
 */
export const SUCCESSFUL_UPLOAD_PROGRESS = 100;
/**
 * Polling interval.
 * NOTE(review): presumably milliseconds (i.e. 3 seconds) — confirm at the call site.
 */
export const POLLING_TIME = 3000;
/**
 * Maximum length of a unit name.
 * NOTE(review): presumably counted in characters — confirm with the validator that uses it.
 */
export const MAX_UNIT_NAME_LEN = 100;
/** NOTE(review): looks like a CSS class name for the data-refactor UI — confirm usage. */
export const BOT_DATA_REFACTOR_CLASS_NAME = 'data-refactor';
/**
 * File extensions (each with a leading dot) for spreadsheet-like files.
 * NOTE(review): judging by the name, this is the accept list for local table uploads — confirm.
 */
export const TABLE_ACCEPT_LOCAL_FILE = ['.xls', '.xlsx', '.csv'];
/** Text-upload settings that vary from one channel to another. */
interface TextUploadChannelConfig {
  /** Accepted file extensions, each written with a leading dot. */
  acceptFileTypes: string[];
  /** The accepted formats as one display string, joined with the ideographic comma. */
  fileFormatString: string;
  /** NOTE(review): presumably the maximum number of units that may be added — confirm. */
  addUnitMaxLimit: number;
}

/** Channels that have their own text-upload configuration. */
export type Channel = 'DOUYIN' | 'DEFAULT';

/** Per-channel text-upload settings; `DEFAULT` doubles as the fallback entry. */
const textUploadChannelConfigMap: Record<Channel, TextUploadChannelConfig> = {
  // Same as DEFAULT minus Markdown, and with a lower unit limit.
  DOUYIN: {
    acceptFileTypes: ['.pdf', '.txt', '.doc', '.docx'],
    fileFormatString: 'PDF、TXT、DOC、DOCX',
    addUnitMaxLimit: 100,
  },
  DEFAULT: {
    acceptFileTypes: ['.pdf', '.txt', '.doc', '.docx', '.md'],
    fileFormatString: 'PDF、TXT、DOC、DOCX、MD',
    addUnitMaxLimit: 300,
  },
};

/**
 * Looks up the text-upload configuration for a channel.
 *
 * @param channel - Channel to look up; may be omitted.
 * @returns The entry stored for `channel`, or the `DEFAULT` entry when
 * `channel` is falsy or the lookup yields a falsy value. The returned object
 * is the shared map entry, not a copy.
 */
export const getTextUploadChannelConfig = (
  channel?: Channel,
): TextUploadChannelConfig => {
  const matched = channel ? textUploadChannelConfigMap[channel] : undefined;
  return matched || textUploadChannelConfigMap.DEFAULT;
};

View File

@@ -0,0 +1,34 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Update frequency.
 * NOTE(review): the enum name suggests each value is a number of days between
 * updates; what ZERO stands for is not visible here — confirm with the consumer.
 */
export enum FrequencyDay {
ZERO = 0,
ONE = 1,
THREE = 3,
SEVEN = 7,
THIRTY = 30,
}
/**
 * Field names of the table-settings form, as string keys.
 * NOTE(review): an enum with the same name and the same member values is also
 * declared next to the table constants, and the index re-exports that one from
 * './table'. TypeScript treats the two enums as distinct types — consider
 * keeping a single declaration.
 */
export enum TableSettingFormFields {
SHEET = 'sheet_id',
KEY_START_ROW = 'header_line_idx',
DATA_START_ROW = 'start_line_idx',
}
/** Maximum size of a file uploaded to the knowledge base: 100 MB. */
export const UNIT_MAX_MB = 100;
/** NOTE(review): presumably the maximum page count accepted for a PDF — confirm at the call site. */
export const PDF_MAX_PAGES = 500;

View File

@@ -0,0 +1,32 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Single entry point that re-exports constants from the sibling modules
 * './common', './table', './text' and './components'.
 * Only the names listed below are re-exported; `TableSettingFormFields` is
 * taken from './table'.
 */
export {
SUCCESSFUL_UPLOAD_PROGRESS,
POLLING_TIME,
MAX_UNIT_NAME_LEN,
BOT_DATA_REFACTOR_CLASS_NAME,
} from './common';
export {
TableStatus,
MAX_TABLE_META_COLUMN_LEN,
MAX_TABLE_META_STR_LEN,
DEFAULT_TABLE_SETTINGS_FROM_ONE,
DEFAULT_TABLE_SETTINGS_FROM_ZERO,
TableSettingFormFields,
} from './table';
export { defaultCustomSegmentRule, getSeperatorOptionList } from './text';
export { FrequencyDay, UNIT_MAX_MB, PDF_MAX_PAGES } from './components';

View File

@@ -0,0 +1,52 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Table common constants.
 *
 * Status of a table, as one of three string values.
 * NOTE(review): presumably the load state of the table view — confirm with the consumer.
 */
export enum TableStatus {
ERROR = 'error',
LOADING = 'loading',
NORMAL = 'normal',
}
/** NOTE(review): presumably the maximum number of columns in a table's metadata — confirm. */
export const MAX_TABLE_META_COLUMN_LEN = 50;
/** NOTE(review): presumably the maximum length of a string in a table's metadata (e.g. a column name) — confirm. */
export const MAX_TABLE_META_STR_LEN = 30;
/**
 * Steps of the table-local resegment unit flow.
 * Members are numbered consecutively from 0 in declaration order.
 */
export enum TableLocalResegmentStep {
  CONFIGURATION = 0,
  PREVIEW = 1,
  PROCESSING = 2,
}
/** Field names of the table-settings form, as string keys. */
export enum TableSettingFormFields {
  SHEET = 'sheet_id',
  KEY_START_ROW = 'header_line_idx',
  DATA_START_ROW = 'start_line_idx',
}

/** Default table settings in which every field, including the data start row, is 0. */
export const DEFAULT_TABLE_SETTINGS_FROM_ZERO = {
  [TableSettingFormFields.SHEET]: 0,
  [TableSettingFormFields.KEY_START_ROW]: 0,
  [TableSettingFormFields.DATA_START_ROW]: 0,
};

/**
 * Default table settings that differ from the zero-based defaults only in the
 * data start row, which is 1. This is a separate object, not an alias.
 */
export const DEFAULT_TABLE_SETTINGS_FROM_ONE = {
  ...DEFAULT_TABLE_SETTINGS_FROM_ZERO,
  [TableSettingFormFields.DATA_START_ROW]: 1,
};

View File

@@ -0,0 +1,55 @@
/*
* Copyright 2025 coze-dev Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { I18n } from '@coze-arch/i18n';
import { type CustomSegmentRule, SeperatorType } from '../types';
/**
 * Builds the mapping from each separator type to its translated label.
 *
 * The I18n lookups run on every call rather than once at import time.
 * NOTE(review): presumably so the labels follow the locale active at call
 * time — confirm.
 *
 * @returns An object keyed by `SeperatorType` whose values are the label texts.
 */
const getSeperatorSelect = () => {
  const labelBySeparator = {
    [SeperatorType.LINE_BREAK]: I18n.t('datasets_Custom_segmentID_linebreak'),
    [SeperatorType.LINE_BREAK2]: I18n.t('datasets_Custom_segmentID_2linebreak'),
    [SeperatorType.CN_PERIOD]: I18n.t('datasets_Custom_segmentID_cnperiod'),
    [SeperatorType.CN_EXCLAMATION]: I18n.t(
      'datasets_Custom_segmentID_cn_exclamation',
    ),
    [SeperatorType.EN_PERIOD]: I18n.t('datasets_Custom_segmentID_enperiod'),
    [SeperatorType.EN_EXCLAMATION]: I18n.t(
      'datasets_Custom_segmentID_en_exclamation',
    ),
    [SeperatorType.CN_QUESTION]: I18n.t('datasets_Custom_segmentID_cn_question'),
    [SeperatorType.EN_QUESTION]: I18n.t('datasets_Custom_segmentID_en_question'),
    [SeperatorType.CUSTOM]: I18n.t('datasets_Custom_segmentID_custom'),
  };
  return labelBySeparator;
};
/**
 * Turns the separator-label mapping into a list of `{ value, label }` options.
 *
 * `value` is always a string because `Object.entries` yields string keys.
 * NOTE(review): if `SeperatorType` is a numeric enum, `value` is the
 * stringified number — confirm that consumers expect a string.
 *
 * @returns One option per separator type, in `Object.entries` order.
 */
export const getSeperatorOptionList = () => {
  const labelBySeparator = getSeperatorSelect();
  return Object.entries(labelBySeparator).map(([value, label]) => ({
    value,
    label,
  }));
};
/** Default `maxTokens` of the custom segment rule. */
const DEFAULT_MAX_TOKENS = 800;
/**
 * Default `overlap` of the custom segment rule.
 * NOTE(review): unit (percent vs. tokens) is not visible here — confirm.
 */
const DEFAULT_OVERLAP = 10;

/**
 * Initial values for a custom segment rule: line-break separator, '###' as
 * the stored custom separator value, and no pre-process rules.
 */
export const defaultCustomSegmentRule: CustomSegmentRule = {
  separator: { type: SeperatorType.LINE_BREAK, customValue: '###' },
  maxTokens: DEFAULT_MAX_TOKENS,
  preProcessRules: [],
  overlap: DEFAULT_OVERLAP,
};