#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Symbol data importer.

Data import tool for the "Hu-Han Three Thousand Years" project.
Provides batch import, validation and conversion of symbol data into a
SQLite database.
"""

import sqlite3
import pandas as pd
import json
import csv
import os
from typing import Dict, List, Tuple, Any, Optional
from datetime import datetime
import re


class SymbolDataImporter:
    """Import symbols, cross-civilization links and transmission paths into SQLite.

    The importer owns one ``sqlite3`` connection (``self.conn``) opened in
    ``__init__``.  Call :meth:`close` when done, or use the instance as a
    context manager.
    """

    # Columns of the ``symbols`` table.  Caller-supplied dict keys are
    # interpolated into SQL as column names, so they must be checked against
    # this whitelist first.
    _SYMBOL_COLUMNS = frozenset((
        'symbol_id', 'symbol_form', 'symbol_name', 'yin_yang_attribute',
        'engraving_type', 'origin_civilization', 'origin_period',
        'geographical_context', 'functional_context', 'phonetic_context',
        'semantic_context', 'discovery_date', 'discovery_location',
        'material_type', 'preservation_status', 'description',
        'created_at', 'updated_at',
    ))

    _REQUIRED_SYMBOL_FIELDS = ('symbol_id', 'symbol_form', 'symbol_name')
    _VALID_YIN_YANG = ('yin', 'yang', 'neutral')
    _VALID_ENGRAVING = ('yin_engraving', 'yang_engraving', 'mixed')
    _VALID_LINK_TYPES = ('phonetic', 'semantic', 'morphological',
                         'functional', 'geographical')

    # Encodings tried, in order, when reading CSV files.  ``utf-8-sig`` is
    # plain UTF-8 that also swallows a leading BOM (otherwise the BOM ends up
    # glued to the first header name).
    _CSV_ENCODINGS = ('utf-8-sig', 'gb18030')

    # DDL executed by create_tables(), in dependency order.
    _SCHEMA_STATEMENTS = (
        # Main symbol table
        """
        CREATE TABLE IF NOT EXISTS symbols (
            symbol_id TEXT PRIMARY KEY,
            symbol_form TEXT NOT NULL,
            symbol_name TEXT NOT NULL,
            yin_yang_attribute TEXT CHECK(yin_yang_attribute IN ('yin', 'yang', 'neutral')),
            engraving_type TEXT CHECK(engraving_type IN ('yin_engraving', 'yang_engraving', 'mixed')),
            origin_civilization TEXT,
            origin_period TEXT,
            geographical_context TEXT,
            functional_context TEXT,
            phonetic_context TEXT,
            semantic_context TEXT,
            discovery_date TEXT,
            discovery_location TEXT,
            material_type TEXT,
            preservation_status TEXT,
            description TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        # Symbol attribute table
        """
        CREATE TABLE IF NOT EXISTS symbol_attributes (
            attribute_id INTEGER PRIMARY KEY AUTOINCREMENT,
            symbol_id TEXT,
            attribute_name TEXT NOT NULL,
            attribute_value TEXT,
            attribute_type TEXT CHECK(attribute_type IN ('text', 'numeric', 'boolean', 'date')),
            confidence_level INTEGER CHECK(confidence_level BETWEEN 1 AND 100),
            source_reference TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (symbol_id) REFERENCES symbols(symbol_id)
        )
        """,
        # Cross-civilization link table
        """
        CREATE TABLE IF NOT EXISTS cross_civilization_links (
            link_id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_symbol_id TEXT NOT NULL,
            target_symbol_id TEXT NOT NULL,
            link_type TEXT CHECK(link_type IN ('phonetic', 'semantic', 'morphological', 'functional', 'geographical')),
            confidence_level INTEGER CHECK(confidence_level BETWEEN 1 AND 100),
            evidence_description TEXT,
            source_reference TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (source_symbol_id) REFERENCES symbols(symbol_id),
            FOREIGN KEY (target_symbol_id) REFERENCES symbols(symbol_id),
            UNIQUE(source_symbol_id, target_symbol_id, link_type)
        )
        """,
        # Symbol transmission path table
        """
        CREATE TABLE IF NOT EXISTS symbol_transmission_paths (
            path_id INTEGER PRIMARY KEY AUTOINCREMENT,
            path_name TEXT NOT NULL,
            start_civilization TEXT,
            end_civilization TEXT,
            transmission_period TEXT,
            transmission_route TEXT,
            supporting_evidence TEXT,
            confidence_level INTEGER CHECK(confidence_level BETWEEN 1 AND 100),
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        # Path node table
        """
        CREATE TABLE IF NOT EXISTS path_nodes (
            node_id INTEGER PRIMARY KEY AUTOINCREMENT,
            path_id INTEGER,
            symbol_id TEXT,
            node_sequence INTEGER,
            node_role TEXT,
            transmission_context TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (path_id) REFERENCES symbol_transmission_paths(path_id),
            FOREIGN KEY (symbol_id) REFERENCES symbols(symbol_id)
        )
        """,
    )

    def __init__(self, db_path: str = "symbols.db"):
        """Open (or create) the SQLite database at ``db_path``."""
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)

    # ------------------------------------------------------------------
    # Resource management
    # ------------------------------------------------------------------

    def close(self) -> None:
        """Close the underlying database connection."""
        self.conn.close()

    def __enter__(self) -> "SymbolDataImporter":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    # ------------------------------------------------------------------
    # Small internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _new_results() -> Dict[str, Any]:
        """Return a fresh, zeroed result summary dict."""
        return {'total': 0, 'success': 0, 'failed': 0, 'errors': []}

    @staticmethod
    def _coerce_confidence(value: Any, default: int = 50) -> int:
        """Convert ``value`` to an int confidence in 1..100.

        ``None`` and ``''`` yield ``default``.  Raises ``ValueError`` (with
        the user-facing message) for non-integers and out-of-range values.
        """
        if value is None or value == '':
            return default
        # int() would silently truncate 85.5 to 85; treat that as invalid.
        if isinstance(value, float) and not value.is_integer():
            raise ValueError("置信度必须是整数")
        try:
            confidence = int(value)
        except (TypeError, ValueError):
            raise ValueError("置信度必须是整数") from None
        if not 1 <= confidence <= 100:
            raise ValueError("置信度必须在1-100之间")
        return confidence

    @classmethod
    def _read_csv_rows(cls, csv_file: str, delimiter: str) -> List[Dict[Any, Any]]:
        """Read every row of ``csv_file``, trying each known encoding in turn."""
        decode_error = None  # type: Optional[UnicodeDecodeError]
        for encoding in cls._CSV_ENCODINGS:
            try:
                # newline='' is what the csv module requires so that quoted
                # fields containing line breaks are parsed correctly.
                with open(csv_file, 'r', encoding=encoding, newline='') as file:
                    return list(csv.DictReader(file, delimiter=delimiter))
            except UnicodeDecodeError as e:
                decode_error = e
        raise decode_error

    @staticmethod
    def _clean_csv_row(row: Dict[Any, Any]) -> Dict[str, str]:
        """Strip keys/values of a CSV row and drop empty cells."""
        cleaned = {}
        for key, value in row.items():
            # DictReader stores surplus cells as a list under the None key and
            # fills missing cells with None; neither is a usable field.
            if not isinstance(key, str) or not isinstance(value, str):
                continue
            value = value.strip()
            if value:
                cleaned[key.strip()] = value
        return cleaned

    def _import_symbol_records(self, records: List[Any], results: Dict[str, Any],
                               index_key: str,
                               progress_unit: Optional[str] = None) -> None:
        """Import each record and tally the outcome into ``results``.

        ``index_key`` is the key under which the 1-based position is stored in
        error entries.  When ``progress_unit`` is given, a progress line is
        printed every 10 records.
        """
        total = len(records)
        results['total'] = total
        for i, record in enumerate(records, 1):
            success, message = self.import_symbol_from_dict(record)
            if success:
                results['success'] += 1
            else:
                results['failed'] += 1
                symbol_id = record.get('symbol_id', '未知') if isinstance(record, dict) else '未知'
                results['errors'].append({
                    index_key: i,
                    'symbol_id': symbol_id,
                    'error': message
                })
            if progress_unit and i % 10 == 0:
                print(f"已处理 {i}/{total} {progress_unit}")

    # ------------------------------------------------------------------
    # Schema
    # ------------------------------------------------------------------

    def create_tables(self) -> bool:
        """Create all tables if they do not exist yet.

        Returns:
            True on success; False (after rolling back) if any statement failed.
        """
        cursor = self.conn.cursor()
        try:
            for statement in self._SCHEMA_STATEMENTS:
                cursor.execute(statement)
            self.conn.commit()
            print("数据库表结构创建成功")
            return True
        except Exception as e:
            print(f"创建表结构失败:{e}")
            self.conn.rollback()
            return False

    # ------------------------------------------------------------------
    # Symbols
    # ------------------------------------------------------------------

    def validate_symbol_data(self, symbol_data: Dict[str, Any]) -> Tuple[bool, List[str]]:
        """Validate one symbol record.

        Returns:
            ``(is_valid, errors)`` where ``errors`` lists every problem found.
        """
        if not isinstance(symbol_data, dict):
            return False, ["符号数据必须是字典"]

        errors = []

        # Required fields must be present and non-empty.
        for field in SymbolDataImporter._REQUIRED_SYMBOL_FIELDS:
            if not symbol_data.get(field):
                errors.append(f"缺少必填字段:{field}")

        # Symbol ID format.  A None id is already reported as missing above.
        symbol_id = symbol_data.get('symbol_id')
        if symbol_id is not None:
            if not isinstance(symbol_id, str):
                errors.append("符号ID必须是字符串")
            # fullmatch: '^...$' with re.match would accept a trailing newline.
            elif not re.fullmatch(r'[A-Za-z0-9_]+', symbol_id):
                errors.append("符号ID只能包含字母、数字和下划线")

        # Yin/yang attribute
        yin_yang = symbol_data.get('yin_yang_attribute')
        if yin_yang and yin_yang not in SymbolDataImporter._VALID_YIN_YANG:
            errors.append(f"阴阳属性必须是:{', '.join(SymbolDataImporter._VALID_YIN_YANG)}")

        # Engraving type
        engraving = symbol_data.get('engraving_type')
        if engraving and engraving not in SymbolDataImporter._VALID_ENGRAVING:
            errors.append(f"刻法类型必须是:{', '.join(SymbolDataImporter._VALID_ENGRAVING)}")

        # Confidence level (only checked when a truthy value is supplied)
        confidence = symbol_data.get('confidence_level')
        if confidence:
            try:
                SymbolDataImporter._coerce_confidence(confidence)
            except ValueError as e:
                errors.append(str(e))

        return len(errors) == 0, errors

    def import_symbol_from_dict(self, symbol_data: Dict[str, Any]) -> Tuple[bool, str]:
        """Insert one symbol, or update it if its ``symbol_id`` already exists.

        Returns:
            ``(success, message)``.  Never raises for validation or database
            errors; those are reported through the message.
        """
        is_valid, errors = self.validate_symbol_data(symbol_data)
        if not is_valid:
            return False, f"数据验证失败:{'; '.join(errors)}"

        # Keys become column names inside the SQL text, so anything that is
        # not a real column must be rejected before a query is built.
        unknown = [str(field) for field in symbol_data
                   if field not in self._SYMBOL_COLUMNS]
        if unknown:
            return False, f"导入失败:不支持的字段 {', '.join(unknown)}"

        symbol_id = symbol_data['symbol_id']
        cursor = self.conn.cursor()
        try:
            cursor.execute("SELECT symbol_id FROM symbols WHERE symbol_id = ?", (symbol_id,))
            if cursor.fetchone():
                # Update the existing symbol.
                columns = [field for field in symbol_data if field != 'symbol_id']
                assignments = ', '.join(f"{field} = ?" for field in columns)
                values = [symbol_data[field] for field in columns]
                values.append(symbol_id)  # WHERE parameter
                cursor.execute(
                    f"""
                    UPDATE symbols
                    SET {assignments}, updated_at = CURRENT_TIMESTAMP
                    WHERE symbol_id = ?
                    """,
                    values,
                )
                action = "更新"
            else:
                # Insert a new symbol.
                columns = list(symbol_data)
                placeholders = ', '.join('?' for _ in columns)
                cursor.execute(
                    f"""
                    INSERT INTO symbols ({', '.join(columns)})
                    VALUES ({placeholders})
                    """,
                    [symbol_data[field] for field in columns],
                )
                action = "插入"

            self.conn.commit()
            return True, f"符号 {symbol_id} {action}成功"
        except Exception as e:
            self.conn.rollback()
            return False, f"导入失败:{e}"

    def import_symbols_from_csv(self, csv_file: str, delimiter: str = ',') -> Dict[str, Any]:
        """Bulk-import symbols from a CSV file with a header row.

        Returns:
            Summary dict with ``total``, ``success``, ``failed`` and ``errors``
            (per-row entries keyed by ``row``; a file-level failure is
            recorded as one extra entry).
        """
        results = self._new_results()
        try:
            rows = self._read_csv_rows(csv_file, delimiter)
            records = [self._clean_csv_row(row) for row in rows]
            self._import_symbol_records(records, results, 'row', '行')
            print(f"CSV导入完成:成功 {results['success']},失败 {results['failed']}")
        except Exception as e:
            # Boundary handler: callers expect a summary, not an exception.
            results['errors'].append({
                'row': '文件级别',
                'symbol_id': 'N/A',
                'error': f"文件读取失败:{e}"
            })
            # Everything not known to have succeeded counts as failed.
            results['failed'] = results['total'] - results['success']
        return results

    def import_symbols_from_json(self, json_file: str) -> Dict[str, Any]:
        """Import symbols from a JSON file.

        Accepted layouts: a list of symbol objects, an object with a
        ``symbols`` key, or a single symbol object.

        Returns:
            Summary dict with ``total``, ``success``, ``failed`` and ``errors``
            (per-entry errors keyed by ``index``).
        """
        results = self._new_results()
        try:
            # utf-8-sig also accepts files saved with a BOM.
            with open(json_file, 'r', encoding='utf-8-sig') as file:
                data = json.load(file)

            if isinstance(data, list):
                symbols_list = data
            elif isinstance(data, dict) and 'symbols' in data:
                symbols_list = data['symbols']
            else:
                symbols_list = [data]
            if not isinstance(symbols_list, list):
                symbols_list = [symbols_list]

            self._import_symbol_records(symbols_list, results, 'index', '个符号')
            print(f"JSON导入完成:成功 {results['success']},失败 {results['failed']}")
        except Exception as e:
            # Boundary handler: callers expect a summary, not an exception.
            results['errors'].append({
                'index': '文件级别',
                'symbol_id': 'N/A',
                'error': f"文件读取失败:{e}"
            })
            results['failed'] = results['total'] - results['success']
        return results

    def import_symbols_from_json_data(self, symbols_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Import symbols from an already-parsed list (internal helper)."""
        results = self._new_results()
        self._import_symbol_records(symbols_list, results, 'index')
        return results

    # ------------------------------------------------------------------
    # Links and paths
    # ------------------------------------------------------------------

    def import_cross_civilization_links(self, links_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Insert or replace cross-civilization links.

        Each link needs ``source_symbol_id``, ``target_symbol_id`` and
        ``link_type``; both symbols must already exist.  A missing confidence
        defaults to 50.

        Returns:
            Summary dict with ``total``, ``success``, ``failed`` and ``errors``.
        """
        results = self._new_results()
        results['total'] = len(links_data)
        cursor = self.conn.cursor()

        for i, link_data in enumerate(links_data, 1):
            try:
                if not isinstance(link_data, dict):
                    raise ValueError("关联数据必须是字典")

                # Required fields
                for field in ('source_symbol_id', 'target_symbol_id', 'link_type'):
                    if not link_data.get(field):
                        raise ValueError(f"缺少必填字段:{field}")

                # Both endpoints must exist.
                for key, label in (('source_symbol_id', '源符号不存在'),
                                   ('target_symbol_id', '目标符号不存在')):
                    cursor.execute("SELECT symbol_id FROM symbols WHERE symbol_id = ?",
                                   (link_data[key],))
                    if not cursor.fetchone():
                        raise ValueError(f"{label}:{link_data[key]}")

                # Link type
                if link_data['link_type'] not in self._VALID_LINK_TYPES:
                    raise ValueError(f"关联类型必须是:{', '.join(self._VALID_LINK_TYPES)}")

                # Confidence (accepts ints and numeric strings, e.g. from CSV)
                confidence = self._coerce_confidence(link_data.get('confidence_level'))

                cursor.execute(
                    """
                    INSERT OR REPLACE INTO cross_civilization_links
                    (source_symbol_id, target_symbol_id, link_type, confidence_level,
                     evidence_description, source_reference)
                    VALUES (?, ?, ?, ?, ?, ?)
                    """,
                    (
                        link_data['source_symbol_id'],
                        link_data['target_symbol_id'],
                        link_data['link_type'],
                        confidence,
                        link_data.get('evidence_description', ''),
                        link_data.get('source_reference', ''),
                    ),
                )
                results['success'] += 1
            except Exception as e:
                results['failed'] += 1
                endpoints = link_data if isinstance(link_data, dict) else {}
                results['errors'].append({
                    'index': i,
                    'link': f"{endpoints.get('source_symbol_id', '?')} -> {endpoints.get('target_symbol_id', '?')}",
                    'error': str(e)
                })

            # Progress
            if i % 10 == 0:
                print(f"已处理 {i}/{len(links_data)} 个关联")

        self.conn.commit()
        print(f"关联导入完成:成功 {results['success']},失败 {results['failed']}")
        return results

    def import_transmission_paths(self, paths_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Insert transmission paths together with their nodes.

        Each path is written atomically: if any of its nodes cannot be
        inserted, the path row and its earlier nodes are rolled back and the
        path is reported as failed.  Nodes without a ``symbol_id`` are skipped.

        Returns:
            Summary dict with ``total``, ``success``, ``failed`` and ``errors``.
        """
        results = self._new_results()
        results['total'] = len(paths_data)
        cursor = self.conn.cursor()

        for i, path_data in enumerate(paths_data, 1):
            # A savepoint per path keeps a half-written path out of the DB.
            cursor.execute("SAVEPOINT import_path")
            try:
                if not isinstance(path_data, dict) or not path_data.get('path_name'):
                    raise ValueError("缺少必填字段:path_name")

                cursor.execute(
                    """
                    INSERT INTO symbol_transmission_paths
                    (path_name, start_civilization, end_civilization, transmission_period,
                     transmission_route, supporting_evidence, confidence_level)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    """,
                    (
                        path_data['path_name'],
                        path_data.get('start_civilization', ''),
                        path_data.get('end_civilization', ''),
                        path_data.get('transmission_period', ''),
                        path_data.get('transmission_route', ''),
                        path_data.get('supporting_evidence', ''),
                        path_data.get('confidence_level', 50),
                    ),
                )
                path_id = cursor.lastrowid

                nodes = path_data.get('nodes')
                if isinstance(nodes, list):
                    # j counts skipped nodes too, so default sequence numbers
                    # reflect the position in the input list.
                    for j, node_data in enumerate(nodes, 1):
                        if not isinstance(node_data, dict) or 'symbol_id' not in node_data:
                            continue
                        cursor.execute(
                            """
                            INSERT INTO path_nodes
                            (path_id, symbol_id, node_sequence, node_role, transmission_context)
                            VALUES (?, ?, ?, ?, ?)
                            """,
                            (
                                path_id,
                                node_data['symbol_id'],
                                node_data.get('node_sequence', j),
                                node_data.get('node_role', ''),
                                node_data.get('transmission_context', ''),
                            ),
                        )

                cursor.execute("RELEASE SAVEPOINT import_path")
                results['success'] += 1
            except Exception as e:
                cursor.execute("ROLLBACK TO SAVEPOINT import_path")
                cursor.execute("RELEASE SAVEPOINT import_path")
                results['failed'] += 1
                path_name = path_data.get('path_name', '未知') if isinstance(path_data, dict) else '未知'
                results['errors'].append({
                    'index': i,
                    'path_name': path_name,
                    'error': str(e)
                })

            # Progress
            if i % 5 == 0:
                print(f"已处理 {i}/{len(paths_data)} 个传播路径")

        self.conn.commit()
        print(f"传播路径导入完成:成功 {results['success']},失败 {results['failed']}")
        return results

    # ------------------------------------------------------------------
    # Sample data
    # ------------------------------------------------------------------

    def generate_sample_data(self) -> Dict[str, Any]:
        """Return built-in sample data under the keys ``symbols``, ``links``, ``paths``."""
        sample_symbols = [
            {
                'symbol_id': 'P_yin_001',
                'symbol_form': 'P',
                'symbol_name': '太平洋符号',
                'yin_yang_attribute': 'yin',
                'engraving_type': 'yin_engraving',
                'origin_civilization': 'Chinese',
                'origin_period': '上古',
                'geographical_context': '太平洋盆地',
                'functional_context': '阴刻地理标识',
                'phonetic_context': 'pǐ (否卦读音)',
                'semantic_context': '凹陷、阴刻、盆地',
                'description': '代表太平洋盆地的阴刻符号'
            },
            {
                'symbol_id': 'T_yang_001',
                'symbol_form': 'T',
                'symbol_name': '泰山符号',
                'yin_yang_attribute': 'yang',
                'engraving_type': 'yang_engraving',
                'origin_civilization': 'Chinese',
                'origin_period': '上古',
                'geographical_context': '泰山山脉',
                'functional_context': '阳刻地理标识',
                'phonetic_context': 'tài (泰卦读音)',
                'semantic_context': '凸起、阳刻、山脉',
                'description': '代表泰山山脉的阳刻符号'
            },
            {
                'symbol_id': 'M_neutral_001',
                'symbol_form': 'M',
                'symbol_name': '月亮符号',
                'yin_yang_attribute': 'neutral',
                'engraving_type': 'mixed',
                'origin_civilization': 'Multiple',
                'origin_period': '远古',
                'geographical_context': '全球分布',
                'functional_context': '月亮崇拜标识',
                'phonetic_context': 'moon, month',
                'semantic_context': '月亮、月份、循环',
                'description': '代表月亮崇拜的中性符号'
            }
        ]

        sample_links = [
            {
                'source_symbol_id': 'P_yin_001',
                'target_symbol_id': 'T_yang_001',
                'link_type': 'functional',
                'confidence_level': 85,
                'evidence_description': '阴阳对立功能互补',
                'source_reference': '周易卦象分析'
            },
            {
                'source_symbol_id': 'P_yin_001',
                'target_symbol_id': 'M_neutral_001',
                'link_type': 'semantic',
                'confidence_level': 70,
                'evidence_description': '阴刻与月亮关联',
                'source_reference': '跨文明符号研究'
            }
        ]

        sample_paths = [
            {
                'path_name': '阴阳符号传播路径',
                'start_civilization': 'Chinese',
                'end_civilization': 'Global',
                'transmission_period': '上古-现代',
                'transmission_route': '丝绸之路-海上贸易',
                'supporting_evidence': '考古发现与文献记载',
                'confidence_level': 80,
                'nodes': [
                    {'symbol_id': 'P_yin_001', 'node_sequence': 1, 'node_role': '起源', 'transmission_context': '中国周易体系'},
                    {'symbol_id': 'T_yang_001', 'node_sequence': 2, 'node_role': '发展', 'transmission_context': '阴阳哲学传播'},
                    {'symbol_id': 'M_neutral_001', 'node_sequence': 3, 'node_role': '融合', 'transmission_context': '跨文明符号融合'}
                ]
            }
        ]

        return {
            'symbols': sample_symbols,
            'links': sample_links,
            'paths': sample_paths
        }

    def import_sample_data(self) -> Dict[str, Any]:
        """Import the built-in sample symbols, links and paths.

        Returns:
            Dict with one result summary each under ``symbols``, ``links``
            and ``paths``.
        """
        sample_data = self.generate_sample_data()
        results = {}

        # Symbols first: links require their endpoint symbols to exist.
        print("正在导入示例符号数据...")
        results['symbols'] = self.import_symbols_from_json_data(sample_data['symbols'])

        print("正在导入示例关联数据...")
        results['links'] = self.import_cross_civilization_links(sample_data['links'])

        print("正在导入示例传播路径...")
        results['paths'] = self.import_transmission_paths(sample_data['paths'])

        return results

    # ------------------------------------------------------------------
    # Schema export
    # ------------------------------------------------------------------

    def export_database_schema(self, output_file: str = "database_schema.md") -> str:
        """Write a Markdown description of every table to ``output_file``.

        Returns:
            A message naming the file that was written.
        """
        cursor = self.conn.cursor()

        schema_doc = [
            "# 胡汉三千年项目 - 数据库结构文档",
            "",
            f"生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "",
        ]

        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        table_names = [row[0] for row in cursor.fetchall()]

        for table_name in table_names:
            schema_doc.append(f"## {table_name} 表")
            schema_doc.append("")

            # PRAGMA arguments cannot be bound as parameters; quote the name
            # as an identifier instead.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(f"PRAGMA table_info({quoted_name})")
            columns = cursor.fetchall()

            schema_doc.append("| 字段名 | 类型 | 是否主键 | 是否为空 | 默认值 | 说明 |")
            schema_doc.append("|--------|------|----------|----------|--------|------|")

            # table_info rows: (cid, name, type, notnull, dflt_value, pk)
            for col in columns:
                col_name = col[1]
                col_type = col[2]
                # pk is the 1-based position inside the primary key, so every
                # member of a composite key has pk > 0.
                is_primary = "是" if col[5] > 0 else "否"
                # notnull == 1 means the column may NOT be null.
                is_nullable = "否" if col[3] else "是"
                default_value = col[4] if col[4] is not None else "无"
                schema_doc.append(f"| {col_name} | {col_type} | {is_primary} | {is_nullable} | {default_value} | |")

            schema_doc.append("")

        schema_content = '\n'.join(schema_doc)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(schema_content)

        return f"数据库结构文档已导出至:{output_file}"


# Usage example
def main():
    """Demonstrate the importer: create tables, load sample data, export schema."""
    importer = SymbolDataImporter()
    try:
        # Example 1: create the database tables
        print("=== 创建数据库表结构 ===")
        success = importer.create_tables()
        if success:
            print("表结构创建成功")

        # Example 2: import the sample data
        print("\n=== 导入示例数据 ===")
        results = importer.import_sample_data()
        print(f"符号导入:{results['symbols']['success']} 成功,{results['symbols']['failed']} 失败")
        print(f"关联导入:{results['links']['success']} 成功,{results['links']['failed']} 失败")
        print(f"路径导入:{results['paths']['success']} 成功,{results['paths']['failed']} 失败")

        # Example 3: export the schema documentation
        print("\n=== 导出数据库结构文档 ===")
        result = importer.export_database_schema("database_schema.md")
        print(result)

        # Example 4: import from CSV (needs a sample CSV file)
        # print("\n=== 从CSV导入数据 ===")
        # if os.path.exists("sample_symbols.csv"):
        #     results = importer.import_symbols_from_csv("sample_symbols.csv")
        #     print(f"CSV导入结果:{results['success']} 成功,{results['failed']} 失败")
        # else:
        #     print("示例CSV文件不存在,跳过测试")

        # Example 5: import from JSON (needs a sample JSON file)
        # print("\n=== 从JSON导入数据 ===")
        # if os.path.exists("sample_symbols.json"):
        #     results = importer.import_symbols_from_json("sample_symbols.json")
        #     print(f"JSON导入结果:{results['success']} 成功,{results['failed']} 失败")
        # else:
        #     print("示例JSON文件不存在,跳过测试")
    finally:
        importer.close()


if __name__ == "__main__":
    main()