---
# Hu-Han Three Thousand Years project - data import configuration
# Configuration file for the symbol data importer.
#
# NOTE(review): this file had been collapsed onto a few physical lines, which
# turned most of it into a single comment. The nesting below is reconstructed
# from the section comments and key order; confirm it against the schema the
# importer expects.

# Database configuration
database:
  # Path to the database file
  path: "symbols.db"
  # Database connection timeout (seconds)
  timeout: 30
  # Whether to enable WAL mode (improves concurrent performance)
  wal_mode: true

# Data validation configuration
validation:
  # Required-field check
  required_fields: ["symbol_id", "symbol_form", "symbol_name"]
  # Regular expression for the symbol ID format
  symbol_id_pattern: "^[A-Za-z0-9_]+$"
  # Allowed yin/yang attribute values
  valid_yin_yang: ["yin", "yang", "neutral"]
  # Allowed engraving types
  valid_engraving: ["yin_engraving", "yang_engraving", "mixed"]
  # Confidence range
  confidence_range:
    min: 1
    max: 100

# Import configuration
import:
  # Batch size used for bulk imports
  batch_size: 100
  # Whether to show progress while importing
  show_progress: true
  # How to handle an import failure
  on_error: "continue"  # allowed values: stop, continue, skip
  # How to handle duplicate records
  duplicate_handling: "update"  # allowed values: skip, update, error

# File format configuration
file_formats:
  csv:
    # Default delimiter
    delimiter: ","
    # Supported encodings
    encodings: ["utf-8", "gbk", "gb2312", "latin1"]
    # Whether to skip empty lines
    skip_empty_lines: true
    # Whether to skip the header row
    skip_header: false
  json:
    # Name of the JSON root node (if the data lives under a specific node)
    root_node: "symbols"
    # Whether to validate the JSON format
    validate_json: true

# Logging configuration
logging:
  # Log level
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR
  # Log file path
  file_path: "import_log.log"
  # Whether to also print log output to the console
  console_output: true
  # Log format
  format: "%(asctime)s - %(levelname)s - %(message)s"

# Sample data configuration
sample_data:
  # Whether to generate sample data automatically
  auto_generate: true
  # Number of sample symbols
  symbol_count: 10
  # Number of sample links
  link_count: 5
  # Number of sample transmission paths
  path_count: 3

# Advanced configuration
advanced:
  # Whether to enable the data cache
  enable_cache: true
  # Cache size (MB)
  cache_size: 100
  # Whether to enable parallel processing
  parallel_processing: false
  # Number of parallel worker threads
  max_workers: 4
  # Database connection pool size
  connection_pool_size: 10

# Data source configuration
data_sources:
  # Supported source types
  supported_types: ["csv", "json", "excel", "database", "api"]
  # CSV source configuration
  csv:
    # Default file extensions
    extensions: [".csv", ".txt"]
    # Maximum file size (MB)
    max_file_size: 100
    # Whether to auto-detect the delimiter
    auto_detect_delimiter: true
  # JSON source configuration
  json:
    extensions: [".json", ".jsonl"]
    max_file_size: 50
    # Whether the JSON Lines format is supported
    support_jsonl: true
  # Excel source configuration
  excel:
    extensions: [".xlsx", ".xls"]
    max_file_size: 50
    # Default worksheet name
    default_sheet: "Sheet1"

# Field mapping configuration
field_mappings:
  # Field mappings for the symbols table
  symbols:
    # standard field name -> possible field names in the data source
    symbol_id: ["id", "symbol_id", "symbolID", "标识符"]
    symbol_form: ["form", "symbol_form", "symbolForm", "符号形态"]
    symbol_name: ["name", "symbol_name", "symbolName", "符号名称"]
    yin_yang_attribute: ["yin_yang", "attribute", "阴阳属性", "阴阳"]
    engraving_type: ["engraving", "type", "刻法类型", "刻法"]
    origin_civilization: ["civilization", "origin", "起源文明", "文明"]
    origin_period: ["period", "era", "起源时期", "时期"]
    geographical_context: ["geography", "context", "地理背景", "地理"]
    functional_context: ["function", "purpose", "功能背景", "功能"]
    phonetic_context: ["phonetic", "sound", "语音背景", "语音"]
    semantic_context: ["semantic", "meaning", "语义背景", "语义"]
    description: ["desc", "description", "描述", "说明"]
  # Field mappings for the cross-civilization links table
  cross_civilization_links:
    source_symbol_id: ["source_id", "from", "源符号", "起点"]
    target_symbol_id: ["target_id", "to", "目标符号", "终点"]
    link_type: ["type", "relation_type", "关联类型", "类型"]
    confidence_level: ["confidence", "level", "置信度", "可信度"]
    evidence_description: ["evidence", "proof", "证据描述", "证据"]
    source_reference: ["reference", "source", "来源参考", "参考"]
  # Field mappings for the transmission paths table
  symbol_transmission_paths:
    # NOTE(review): the alias "path" is listed under both path_name and
    # transmission_route; how the importer resolves an alias claimed by two
    # fields is not visible here - confirm.
    path_name: ["name", "path", "路径名称", "名称"]
    start_civilization: ["start", "origin", "起点文明", "起点"]
    end_civilization: ["end", "destination", "终点文明", "终点"]
    transmission_period: ["period", "time", "传播时期", "时期"]
    transmission_route: ["route", "path", "传播路线", "路线"]
    supporting_evidence: ["evidence", "support", "支持证据", "证据"]

# Error handling configuration
error_handling:
  # Validation error handling
  validation_errors:
    # Whether to log validation errors
    log_errors: true
    # Whether to generate an error report
    generate_report: true
    # Error report format
    report_format: "html"  # html, json, csv
  # Import error handling
  import_errors:
    # Whether to keep importing the remaining records
    continue_on_error: true
    # Whether to log error details
    log_details: true
    # Path where error records are saved
    error_log_path: "import_errors.log"
  # Database error handling
  database_errors:
    # Whether to retry automatically
    auto_retry: true
    # Maximum number of retries
    max_retries: 3
    # Retry interval (seconds)
    retry_interval: 5

# Performance tuning configuration
performance:
  # Memory usage limit (MB)
  memory_limit: 512
  # Whether to enable memory optimization
  memory_optimization: true
  # Whether to enable lazy loading
  lazy_loading: true
  # Query timeout (seconds)
  query_timeout: 60

# Security configuration
security:
  # Whether to validate the file type
  validate_file_type: true
  # Allowed file types
  # NOTE(review): data_sources also lists the ".txt" and ".jsonl" extensions,
  # which have no matching entry here; whether these two settings interact is
  # not visible in this file - confirm.
  allowed_file_types: ["csv", "json", "xlsx", "xls"]
  # Whether to check the file size
  check_file_size: true
  # Maximum allowed file size (MB)
  max_file_size: 100
  # Whether to scan for malicious content
  scan_malicious_content: false

# Backup configuration
backup:
  # Whether to back up automatically
  auto_backup: true
  # Backup file path
  backup_path: "backups"
  # Number of days backups are retained
  retention_days: 30
  # Backup frequency
  backup_frequency: "daily"  # daily, weekly, monthly

# Monitoring configuration
monitoring:
  # Whether to enable performance monitoring
  enable_monitoring: true
  # Monitored metrics
  metrics: ["import_speed", "memory_usage", "error_rate", "success_rate"]
  # Path where monitoring data is saved
  metrics_path: "metrics"
  # Number of days monitoring data is retained
  metrics_retention: 7