# Source: huhan3000/胡汉三千年项目/数据导入工具/导入配置.yaml
# (229 lines, 6.4 KiB, YAML)
#
# Note from the hosting page: this file contains Unicode characters that might
# be confused with other characters. If that is intentional, the warning can
# be safely ignored.

# 胡汉三千年项目 - 数据导入配置
# 符号数据导入器配置文件
# Database configuration
database:
  # Path to the database file
  path: "symbols.db"
  # Database connection timeout (seconds)
  timeout: 30
  # Whether to enable WAL mode to improve concurrent performance
  wal_mode: true
# Data validation configuration
validation:
  # Fields that must be present on every record
  required_fields: ["symbol_id", "symbol_form", "symbol_name"]
  # Regular expression describing the symbol ID format
  symbol_id_pattern: "^[A-Za-z0-9_]+$"
  # Allowed yin-yang attribute values
  valid_yin_yang: ["yin", "yang", "neutral"]
  # Allowed engraving types
  valid_engraving: ["yin_engraving", "yang_engraving", "mixed"]
  # Confidence range
  # NOTE(review): presumably both bounds are inclusive - confirm in the importer
  confidence_range:
    min: 1
    max: 100
# Import configuration
import:
  # Batch size used for bulk imports
  batch_size: 100
  # Whether to show progress while importing
  show_progress: true
  # How to handle an import failure
  on_error: "continue"  # allowed values: stop, continue, skip
  # How to handle duplicate records
  duplicate_handling: "update"  # allowed values: skip, update, error
# File format configuration
file_formats:
  csv:
    # Default delimiter
    delimiter: ","
    # Supported encodings
    encodings: ["utf-8", "gbk", "gb2312", "latin1"]
    # Whether to skip empty lines
    skip_empty_lines: true
    # Whether to skip the header row
    skip_header: false

  json:
    # Name of the JSON root node (when the data lives under a specific node)
    root_node: "symbols"
    # Whether to validate the JSON format
    validate_json: true
# Logging configuration
logging:
  # Log level
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR
  # Path to the log file
  file_path: "import_log.log"
  # Whether to also print log output to the console
  console_output: true
  # Log line format
  # NOTE(review): placeholders look like Python logging %-style attributes -
  # confirm against the consumer
  format: "%(asctime)s - %(levelname)s - %(message)s"
# Sample data configuration
sample_data:
  # Whether to generate sample data automatically
  auto_generate: true
  # Number of sample symbols
  symbol_count: 10
  # Number of sample links
  link_count: 5
  # Number of sample transmission paths
  path_count: 3
# Advanced configuration
advanced:
  # Whether to enable the data cache
  enable_cache: true
  # Cache size (MB)
  cache_size: 100
  # Whether to enable parallel processing
  parallel_processing: false
  # Number of parallel worker threads
  max_workers: 4
  # Database connection pool size
  connection_pool_size: 10
# Data source configuration
data_sources:
  # Supported source types
  supported_types: ["csv", "json", "excel", "database", "api"]

  # CSV source configuration
  csv:
    # Default file extensions
    extensions: [".csv", ".txt"]
    # Maximum file size (MB)
    max_file_size: 100
    # Whether to auto-detect the delimiter
    auto_detect_delimiter: true

  # JSON source configuration
  json:
    # File extensions
    extensions: [".json", ".jsonl"]
    # Maximum file size (presumably MB, matching the CSV source - confirm)
    max_file_size: 50
    # Whether the JSON Lines format is supported
    support_jsonl: true

  # Excel source configuration
  excel:
    # File extensions
    extensions: [".xlsx", ".xls"]
    # Maximum file size (presumably MB, matching the CSV source - confirm)
    max_file_size: 50
    # Default worksheet name
    default_sheet: "Sheet1"
# Field mapping configuration
# Each entry maps a standard field name to the column names that may appear
# in a data source.
field_mappings:
  # Field mappings for the symbols table
  symbols:
    # standard field name -> possible data-source field names
    symbol_id: ["id", "symbol_id", "symbolID", "标识符"]
    symbol_form: ["form", "symbol_form", "symbolForm", "符号形态"]
    symbol_name: ["name", "symbol_name", "symbolName", "符号名称"]
    yin_yang_attribute: ["yin_yang", "attribute", "阴阳属性", "阴阳"]
    engraving_type: ["engraving", "type", "刻法类型", "刻法"]
    origin_civilization: ["civilization", "origin", "起源文明", "文明"]
    origin_period: ["period", "era", "起源时期", "时期"]
    geographical_context: ["geography", "context", "地理背景", "地理"]
    functional_context: ["function", "purpose", "功能背景", "功能"]
    phonetic_context: ["phonetic", "sound", "语音背景", "语音"]
    semantic_context: ["semantic", "meaning", "语义背景", "语义"]
    description: ["desc", "description", "描述", "说明"]

  # Field mappings for the cross-civilization links table
  cross_civilization_links:
    source_symbol_id: ["source_id", "from", "源符号", "起点"]
    target_symbol_id: ["target_id", "to", "目标符号", "终点"]
    link_type: ["type", "relation_type", "关联类型", "类型"]
    confidence_level: ["confidence", "level", "置信度", "可信度"]
    evidence_description: ["evidence", "proof", "证据描述", "证据"]
    source_reference: ["reference", "source", "来源参考", "参考"]

  # Field mappings for the symbol transmission paths table
  # NOTE(review): the alias "path" is listed under both path_name and
  # transmission_route, so a source column named "path" matches two standard
  # fields - confirm how the importer resolves this.
  symbol_transmission_paths:
    path_name: ["name", "path", "路径名称", "名称"]
    start_civilization: ["start", "origin", "起点文明", "起点"]
    end_civilization: ["end", "destination", "终点文明", "终点"]
    transmission_period: ["period", "time", "传播时期", "时期"]
    transmission_route: ["route", "path", "传播路线", "路线"]
    supporting_evidence: ["evidence", "support", "支持证据", "证据"]
# Error handling configuration
error_handling:
  # Handling of validation errors
  validation_errors:
    # Whether to log validation errors
    log_errors: true
    # Whether to generate an error report
    generate_report: true
    # Error report format
    report_format: "html"  # html, json, csv

  # Handling of import errors
  import_errors:
    # Whether to keep importing the remaining records
    continue_on_error: true
    # Whether to log error details
    log_details: true
    # Path where the error log is saved
    error_log_path: "import_errors.log"

  # Handling of database errors
  database_errors:
    # Whether to retry automatically
    auto_retry: true
    # Maximum number of retries
    max_retries: 3
    # Interval between retries (seconds)
    retry_interval: 5
# Performance tuning configuration
performance:
  # Memory usage limit (MB)
  memory_limit: 512
  # Whether to enable memory optimization
  memory_optimization: true
  # Whether to enable lazy loading
  lazy_loading: true
  # Query timeout (seconds)
  query_timeout: 60
# Security configuration
security:
  # Whether to validate the file type
  validate_file_type: true
  # Allowed file types
  allowed_file_types: ["csv", "json", "xlsx", "xls"]
  # Whether to check the file size
  check_file_size: true
  # Maximum allowed file size (MB)
  max_file_size: 100
  # Whether to scan for malicious content
  scan_malicious_content: false
# Backup configuration
backup:
  # Whether to back up automatically
  auto_backup: true
  # Path where backups are stored
  backup_path: "backups"
  # Number of days backups are retained
  retention_days: 30
  # Backup frequency
  backup_frequency: "daily"  # daily, weekly, monthly
# Monitoring configuration
monitoring:
  # Whether to enable performance monitoring
  enable_monitoring: true
  # Metrics to collect
  metrics: ["import_speed", "memory_usage", "error_rate", "success_rate"]
  # Path where monitoring data is saved
  metrics_path: "metrics"
  # Number of days monitoring data is retained
  metrics_retention: 7