---
# 胡汉三千年项目 - 数据导入配置
# 符号数据导入器配置文件

# Database configuration
database:
  # Path to the database file
  path: "symbols.db"
  # Database connection timeout (seconds)
  timeout: 30
  # Whether to enable WAL mode (improves concurrent performance)
  wal_mode: true

# Data validation configuration
validation:
  # Fields that must be present on every record
  required_fields: ["symbol_id", "symbol_form", "symbol_name"]
  # Regular expression that a symbol ID must match
  symbol_id_pattern: "^[A-Za-z0-9_]+$"
  # Allowed yin/yang attribute values
  valid_yin_yang: ["yin", "yang", "neutral"]
  # Allowed engraving types
  valid_engraving: ["yin_engraving", "yang_engraving", "mixed"]
  # Confidence range (lower and upper bound)
  confidence_range:
    min: 1
    max: 100

# Import configuration
import:
  # Batch size used for bulk imports
  batch_size: 100
  # Whether to display progress while importing
  show_progress: true
  # How to handle an import failure
  on_error: "continue"  # allowed values: stop, continue, skip
  # How to handle duplicate records
  duplicate_handling: "update"  # allowed values: skip, update, error

# File format configuration
file_formats:
  csv:
    # Default delimiter
    delimiter: ","
    # Supported text encodings
    encodings: ["utf-8", "gbk", "gb2312", "latin1"]
    # Whether to skip empty lines
    skip_empty_lines: true
    # Whether to skip the header row
    skip_header: false

  json:
    # Name of the JSON root node (if the data lives under a specific node)
    root_node: "symbols"
    # Whether to validate the JSON format
    validate_json: true

# Logging configuration
logging:
  # Log level
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR
  # Path to the log file
  file_path: "import_log.log"
  # Whether to also show log output on the console
  console_output: true
  # Log record format
  format: "%(asctime)s - %(levelname)s - %(message)s"

# Sample data configuration
sample_data:
  # Whether to generate sample data automatically
  auto_generate: true
  # Number of sample symbols
  symbol_count: 10
  # Number of sample links
  link_count: 5
  # Number of sample transmission paths
  path_count: 3

# Advanced configuration
advanced:
  # Whether to enable data caching
  enable_cache: true
  # Cache size (MB)
  cache_size: 100
  # Whether to enable parallel processing
  parallel_processing: false
  # Number of parallel worker threads
  max_workers: 4
  # Database connection pool size
  connection_pool_size: 10

# Data source configuration
data_sources:
  # Supported source types
  supported_types: ["csv", "json", "excel", "database", "api"]

  # CSV source configuration
  csv:
    # Default file extensions
    extensions: [".csv", ".txt"]
    # Maximum file size (MB)
    max_file_size: 100
    # Whether to auto-detect the delimiter
    auto_detect_delimiter: true

  # JSON source configuration
  json:
    extensions: [".json", ".jsonl"]
    max_file_size: 50
    # Whether the JSON Lines format is supported
    support_jsonl: true

  # Excel source configuration
  excel:
    extensions: [".xlsx", ".xls"]
    max_file_size: 50
    # Default worksheet name
    default_sheet: "Sheet1"

# Field mapping configuration
field_mappings:
  # Field mappings for the symbols table
  symbols:
    # canonical field name -> candidate field names in the data source
    symbol_id: ["id", "symbol_id", "symbolID", "标识符"]
    symbol_form: ["form", "symbol_form", "symbolForm", "符号形态"]
    symbol_name: ["name", "symbol_name", "symbolName", "符号名称"]
    yin_yang_attribute: ["yin_yang", "attribute", "阴阳属性", "阴阳"]
    engraving_type: ["engraving", "type", "刻法类型", "刻法"]
    origin_civilization: ["civilization", "origin", "起源文明", "文明"]
    origin_period: ["period", "era", "起源时期", "时期"]
    geographical_context: ["geography", "context", "地理背景", "地理"]
    functional_context: ["function", "purpose", "功能背景", "功能"]
    phonetic_context: ["phonetic", "sound", "语音背景", "语音"]
    semantic_context: ["semantic", "meaning", "语义背景", "语义"]
    description: ["desc", "description", "描述", "说明"]

  # Field mappings for the cross-civilization links table
  cross_civilization_links:
    source_symbol_id: ["source_id", "from", "源符号", "起点"]
    target_symbol_id: ["target_id", "to", "目标符号", "终点"]
    link_type: ["type", "relation_type", "关联类型", "类型"]
    confidence_level: ["confidence", "level", "置信度", "可信度"]
    evidence_description: ["evidence", "proof", "证据描述", "证据"]
    source_reference: ["reference", "source", "来源参考", "参考"]

  # Field mappings for the symbol transmission paths table
  # NOTE(review): the alias "path" is listed under both path_name and
  # transmission_route; confirm how the importer resolves the ambiguity.
  symbol_transmission_paths:
    path_name: ["name", "path", "路径名称", "名称"]
    start_civilization: ["start", "origin", "起点文明", "起点"]
    end_civilization: ["end", "destination", "终点文明", "终点"]
    transmission_period: ["period", "time", "传播时期", "时期"]
    transmission_route: ["route", "path", "传播路线", "路线"]
    supporting_evidence: ["evidence", "support", "支持证据", "证据"]

# Error handling configuration
error_handling:
  # Handling of validation errors
  validation_errors:
    # Whether to log validation errors
    log_errors: true
    # Whether to generate an error report
    generate_report: true
    # Error report format
    report_format: "html"  # html, json, csv

  # Handling of import errors
  import_errors:
    # Whether to keep importing the remaining records
    continue_on_error: true
    # Whether to log error details
    log_details: true
    # Path where error records are saved
    error_log_path: "import_errors.log"

  # Handling of database errors
  database_errors:
    # Whether to retry automatically
    auto_retry: true
    # Maximum number of retries
    max_retries: 3
    # Interval between retries (seconds)
    retry_interval: 5

# Performance tuning configuration
performance:
  # Memory usage limit (MB)
  memory_limit: 512
  # Whether to enable memory optimization
  memory_optimization: true
  # Whether to enable lazy loading
  lazy_loading: true
  # Query timeout (seconds)
  query_timeout: 60

# Security configuration
security:
  # Whether to validate the file type
  validate_file_type: true
  # Allowed file types
  # NOTE(review): data_sources lists ".txt" and ".jsonl" as extensions, but
  # neither "txt" nor "jsonl" appears here; confirm whether that is intended.
  allowed_file_types: ["csv", "json", "xlsx", "xls"]
  # Whether to check the file size
  check_file_size: true
  # Maximum allowed file size (MB)
  max_file_size: 100
  # Whether to scan for malicious content
  scan_malicious_content: false

# Backup configuration
backup:
  # Whether to back up automatically
  auto_backup: true
  # Path where backup files are stored
  backup_path: "backups"
  # Number of days backups are retained
  retention_days: 30
  # Backup frequency
  backup_frequency: "daily"  # daily, weekly, monthly

# Monitoring configuration
monitoring:
  # Whether to enable performance monitoring
  enable_monitoring: true
  # Metrics to monitor
  metrics: ["import_speed", "memory_usage", "error_rate", "success_rate"]
  # Path where monitoring data is saved
  metrics_path: "metrics"
  # Number of days monitoring data is retained
  metrics_retention: 7