coze-studio/common/autoinstallers/rush-commands/src/convert-comments/utils/chinese.ts

121 lines
3.4 KiB
TypeScript

/**
 * Matches a single CJK ideograph anywhere in the input.
 *
 * Ranges covered (all in the Basic Multilingual Plane):
 *   U+4E00-U+9FFF  CJK Unified Ideographs
 *   U+3400-U+4DBF  CJK Unified Ideographs Extension A
 *   U+F900-U+FAFF  CJK Compatibility Ideographs
 *
 * The pattern has no `g` flag, so `test()` carries no `lastIndex` state
 * from one call to the next.
 */
const CHINESE_REGEX = /[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]/;
/**
 * Matches maximal runs of "Chinese text": the three ideograph ranges used by
 * CHINESE_REGEX plus two further ranges:
 *   U+3000-U+303F  CJK Symbols and Punctuation
 *   U+FF00-U+FFEF  Halfwidth and Fullwidth Forms
 *
 * Global, so `String.prototype.match` returns every run instead of just the
 * first one. Because punctuation is included, a run of punctuation alone
 * also matches.
 */
const CHINESE_EXTRACT_REGEX = /[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff\u3000-\u303f\uff00-\uffef]+/g;
/**
 * Report whether `text` holds at least one character matched by
 * CHINESE_REGEX.
 *
 * @param text - String to inspect.
 * @returns `true` as soon as one match exists, otherwise `false`.
 */
export const containsChinese = (text: string): boolean => CHINESE_REGEX.test(text);
/**
 * Collect every contiguous run in `text` matched by CHINESE_EXTRACT_REGEX,
 * in order of appearance.
 *
 * @param text - String to scan.
 * @returns The matched runs, or an empty array when nothing matches.
 */
export const extractChineseParts = (text: string): string[] => {
  const runs = text.match(CHINESE_EXTRACT_REGEX);
  return runs === null ? [] : runs;
};
/**
 * Sum the lengths of all runs in `text` matched by CHINESE_EXTRACT_REGEX.
 *
 * The count is in UTF-16 code units (`String.prototype.length`) and covers
 * everything that pattern accepts, not ideographs alone.
 *
 * @param text - String to measure.
 * @returns Total matched length, or 0 when nothing matches.
 */
export const countChineseCharacters = (text: string): number => {
  let total = 0;
  for (const run of text.match(CHINESE_EXTRACT_REGEX) ?? []) {
    total += run.length;
  }
  return total;
};
/**
 * Decide whether Chinese text makes up at least `threshold` of `text`.
 *
 * The ratio is `countChineseCharacters(text) / text.length`, so whitespace
 * and every other character count toward the denominator.
 *
 * @param text - String to classify.
 * @param threshold - Minimum ratio (inclusive) required; defaults to 0.5.
 * @returns `false` for an empty string, otherwise whether the ratio meets
 *   the threshold.
 */
export const isPrimarilyChinese = (text: string, threshold: number = 0.5): boolean => {
  if (text.length === 0) {
    // Avoid dividing by zero; an empty string is never "primarily Chinese".
    return false;
  }
  const ratio = countChineseCharacters(text) / text.length;
  return ratio >= threshold;
};
/**
 * Strip comment delimiters from a raw comment and return the bare text.
 *
 * Surrounding whitespace is trimmed BEFORE the delimiters are removed: every
 * pattern below is anchored to the start (`^`) or end (`$`) of the string, so
 * a stray leading space or trailing newline would otherwise leave the
 * delimiter in place.
 *
 * @param text - Raw comment text, delimiters included.
 * @param commentType - `'single-line'` or `'multi-line'`; selects which family
 *   of delimiters is removed.
 * @param language - Optional language key choosing the delimiter syntax.
 *   Unrecognised or missing values fall back to C-style delimiters.
 * @returns The comment body without delimiters, trimmed at both ends.
 */
export const cleanCommentText = (
  text: string,
  commentType: 'single-line' | 'multi-line',
  language?: string
): string => {
  let cleaned = text.trim();

  if (commentType === 'single-line') {
    switch (language) {
      case 'yaml':
      case 'toml':
      case 'shell':
      case 'python':
      case 'ruby':
        cleaned = cleaned.replace(/^#\s*/, '');
        break;
      case 'ini':
        // INI accepts both `;` and `#` as comment starters.
        cleaned = cleaned.replace(/^[;#]\s*/, '');
        break;
      case 'php':
        // PHP accepts both `//` and `#` for single-line comments.
        cleaned = cleaned.replace(/^(?:\/\/|#)\s*/, '');
        break;
      default:
        // JavaScript/TypeScript/Go/Java/C/C++/C# style
        cleaned = cleaned.replace(/^\/\/\s*/, '');
    }
  } else if (commentType === 'multi-line') {
    switch (language) {
      case 'html':
      case 'xml':
      case 'markdown':
        cleaned = cleaned.replace(/^<!--\s*/, '').replace(/\s*-->$/, '');
        break;
      case 'python':
        // Triple-quoted strings may use either quote character.
        cleaned = cleaned.replace(/^(?:"""|''')\s*/, '').replace(/\s*(?:"""|''')$/, '');
        break;
      case 'ruby':
        cleaned = cleaned.replace(/^=begin\s*/, '').replace(/\s*=end$/, '');
        break;
      default:
        // JavaScript/TypeScript/Go/Java/C/C++/C#/CSS style
        cleaned = cleaned.replace(/^\/\*\s*/, '').replace(/\s*\*\/$/, '');
        // Drop the decorative `*` that starts each line of a block comment.
        // `\s` also matches newlines, so a line holding only `*` disappears
        // together with its line break.
        cleaned = cleaned.replace(/^\s*\*\s?/gm, '');
    }
  }

  // Stripping delimiters can expose new leading/trailing whitespace.
  return cleaned.trim();
};
/**
 * Sanity-check a translation result against the text it was produced from.
 *
 * A translation is accepted only when all of the following hold, checked in
 * this order:
 *   1. it is non-empty and not whitespace-only;
 *   2. it contains no character matched by `containsChinese` (leftover
 *      Chinese suggests the translation did not complete);
 *   3. it is at most three times as long as `original`.
 *
 * @param original - Source text; only its length is consulted.
 * @param translated - Candidate translation to validate.
 * @returns `true` when every check passes, otherwise `false`.
 */
export const isValidTranslation = (original: string, translated: string): boolean =>
  Boolean(translated) &&
  translated.trim().length > 0 &&
  !containsChinese(translated) &&
  translated.length <= original.length * 3;