更新文档系统归并优化方案

This commit is contained in:
ben
2025-10-29 14:36:13 +00:00
parent 2f96497530
commit 0def756314
332 changed files with 30606 additions and 28342 deletions

View File

@@ -0,0 +1,229 @@
# 胡汉三千年项目 - 数据导入配置
# 符号数据导入器配置文件
# Database configuration
database:
  # Path to the database file
  path: "symbols.db"
  # Database connection timeout (seconds)
  timeout: 30
  # Whether to enable WAL mode to improve concurrent performance
  wal_mode: true
# Data validation configuration
validation:
  # Required-field check
  required_fields: ["symbol_id", "symbol_form", "symbol_name"]
  # Regular expression for the symbol ID format
  symbol_id_pattern: "^[A-Za-z0-9_]+$"
  # Allowed yin-yang attribute values
  valid_yin_yang: ["yin", "yang", "neutral"]
  # Allowed engraving types
  valid_engraving: ["yin_engraving", "yang_engraving", "mixed"]
  # Confidence range
  confidence_range:
    min: 1
    max: 100
# Import configuration
import:
  # Batch size used for bulk imports
  batch_size: 100
  # Whether to show progress during import
  show_progress: true
  # How to handle an import failure
  on_error: "continue"  # options: stop, continue, skip
  # How to handle duplicate records
  duplicate_handling: "update"  # options: skip, update, error
# File format configuration
file_formats:
  csv:
    # Default delimiter
    delimiter: ","
    # Supported encodings
    encodings: ["utf-8", "gbk", "gb2312", "latin1"]
    # Whether to skip empty lines
    skip_empty_lines: true
    # Whether to skip the header row
    skip_header: false
  json:
    # Name of the JSON root node (if the data sits under a specific node)
    root_node: "symbols"
    # Whether to validate the JSON format
    validate_json: true
# Logging configuration
logging:
  # Log level
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR
  # Path to the log file
  file_path: "import_log.log"
  # Whether to also show log output on the console
  console_output: true
  # Log record format
  format: "%(asctime)s - %(levelname)s - %(message)s"
# Sample data configuration
sample_data:
  # Whether to generate sample data automatically
  auto_generate: true
  # Number of sample symbols
  symbol_count: 10
  # Number of sample links
  link_count: 5
  # Number of sample transmission paths
  path_count: 3
# Advanced configuration
advanced:
  # Whether to enable data caching
  enable_cache: true
  # Cache size (MB)
  cache_size: 100
  # Whether to enable parallel processing
  parallel_processing: false
  # Number of parallel worker threads
  max_workers: 4
  # Database connection pool size
  connection_pool_size: 10
# Data source configuration
data_sources:
  # Supported source types
  supported_types: ["csv", "json", "excel", "database", "api"]
  # CSV source configuration
  csv:
    # Default file extensions
    extensions: [".csv", ".txt"]
    # Maximum file size (MB)
    max_file_size: 100
    # Whether to auto-detect the delimiter
    auto_detect_delimiter: true
  # JSON source configuration
  json:
    extensions: [".json", ".jsonl"]
    max_file_size: 50
    # Whether the JSON Lines format is supported
    support_jsonl: true
  # Excel source configuration
  excel:
    extensions: [".xlsx", ".xls"]
    max_file_size: 50
    # Default worksheet name
    default_sheet: "Sheet1"
# Field mapping configuration
field_mappings:
  # Field mappings for the symbols table
  symbols:
    # standard field name -> possible field names in the data source
    symbol_id: ["id", "symbol_id", "symbolID", "标识符"]
    symbol_form: ["form", "symbol_form", "symbolForm", "符号形态"]
    symbol_name: ["name", "symbol_name", "symbolName", "符号名称"]
    yin_yang_attribute: ["yin_yang", "attribute", "阴阳属性", "阴阳"]
    engraving_type: ["engraving", "type", "刻法类型", "刻法"]
    origin_civilization: ["civilization", "origin", "起源文明", "文明"]
    origin_period: ["period", "era", "起源时期", "时期"]
    geographical_context: ["geography", "context", "地理背景", "地理"]
    functional_context: ["function", "purpose", "功能背景", "功能"]
    phonetic_context: ["phonetic", "sound", "语音背景", "语音"]
    semantic_context: ["semantic", "meaning", "语义背景", "语义"]
    description: ["desc", "description", "描述", "说明"]
  # Field mappings for the cross-civilization links table
  cross_civilization_links:
    source_symbol_id: ["source_id", "from", "源符号", "起点"]
    target_symbol_id: ["target_id", "to", "目标符号", "终点"]
    link_type: ["type", "relation_type", "关联类型", "类型"]
    confidence_level: ["confidence", "level", "置信度", "可信度"]
    evidence_description: ["evidence", "proof", "证据描述", "证据"]
    source_reference: ["reference", "source", "来源参考", "参考"]
  # Field mappings for the symbol transmission paths table
  # NOTE(review): the alias "path" is listed under both path_name and
  # transmission_route below; which field a source column named "path"
  # resolves to depends on the importer's lookup order - confirm.
  symbol_transmission_paths:
    path_name: ["name", "path", "路径名称", "名称"]
    start_civilization: ["start", "origin", "起点文明", "起点"]
    end_civilization: ["end", "destination", "终点文明", "终点"]
    transmission_period: ["period", "time", "传播时期", "时期"]
    transmission_route: ["route", "path", "传播路线", "路线"]
    supporting_evidence: ["evidence", "support", "支持证据", "证据"]
# Error handling configuration
error_handling:
  # Handling of validation errors
  validation_errors:
    # Whether to log validation errors
    log_errors: true
    # Whether to generate an error report
    generate_report: true
    # Error report format
    report_format: "html"  # html, json, csv
  # Handling of import errors
  import_errors:
    # Whether to continue importing the remaining records
    continue_on_error: true
    # Whether to log error details
    log_details: true
    # Path where error records are saved
    error_log_path: "import_errors.log"
  # Handling of database errors
  database_errors:
    # Whether to retry automatically
    auto_retry: true
    # Maximum number of retries
    max_retries: 3
    # Interval between retries (seconds)
    retry_interval: 5
# Performance tuning configuration
performance:
  # Memory usage limit (MB)
  memory_limit: 512
  # Whether to enable memory optimization
  memory_optimization: true
  # Whether to enable lazy loading
  lazy_loading: true
  # Query timeout (seconds)
  query_timeout: 60
# Security configuration
security:
  # Whether to validate the file type
  validate_file_type: true
  # Allowed file types
  allowed_file_types: ["csv", "json", "xlsx", "xls"]
  # Whether to check the file size
  check_file_size: true
  # Maximum allowed file size (MB)
  max_file_size: 100
  # Whether to scan for malicious content
  scan_malicious_content: false
# Backup configuration
backup:
  # Whether to back up automatically
  auto_backup: true
  # Path where backup files are stored
  backup_path: "backups"
  # Number of days backups are retained
  retention_days: 30
  # Backup frequency
  backup_frequency: "daily"  # daily, weekly, monthly
# Monitoring configuration
monitoring:
  # Whether to enable performance monitoring
  enable_monitoring: true
  # Metrics to monitor
  metrics: ["import_speed", "memory_usage", "error_rate", "success_rate"]
  # Path where monitoring data is saved
  metrics_path: "metrics"
  # Number of days monitoring data is retained
  metrics_retention: 7