更新文档系统归并优化方案

This commit is contained in:
ben
2025-10-29 14:36:13 +00:00
parent 2f96497530
commit 0def756314
332 changed files with 30606 additions and 28342 deletions

View File

@@ -0,0 +1,705 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
符号数据导入器
胡汉三千年项目数据导入工具
功能:提供符号数据的批量导入、验证和转换功能
"""
import sqlite3
import pandas as pd
import json
import csv
import os
from typing import Dict, List, Tuple, Any, Optional
from datetime import datetime
import re
class SymbolDataImporter:
    """Import, validate and document symbol records in a SQLite database.

    The importer owns a single ``sqlite3`` connection (``self.conn``) opened
    on ``db_path``.  Import methods do not raise on bad input: single-record
    imports return ``(ok, message)`` and batch imports return a summary dict
    with the keys ``total``, ``success``, ``failed`` and ``errors``.

    The instance can be used as a context manager; the connection is closed
    on exit.

    NOTE: SQLite only enforces the ``FOREIGN KEY`` clauses declared by
    ``create_tables`` when ``PRAGMA foreign_keys = ON`` is issued on the
    connection.  This class never issues it, so referential checks are
    limited to the explicit look-ups done in
    ``import_cross_civilization_links``.
    """

    # Columns of the ``symbols`` table.  Keys of the dictionaries passed to
    # import_symbol_from_dict() end up as column names inside SQL text, so
    # they are checked against this whitelist before any SQL is built.
    _SYMBOL_COLUMNS = (
        'symbol_id', 'symbol_form', 'symbol_name', 'yin_yang_attribute',
        'engraving_type', 'origin_civilization', 'origin_period',
        'geographical_context', 'functional_context', 'phonetic_context',
        'semantic_context', 'discovery_date', 'discovery_location',
        'material_type', 'preservation_status', 'description',
        'created_at', 'updated_at',
    )
    _REQUIRED_SYMBOL_FIELDS = ('symbol_id', 'symbol_form', 'symbol_name')
    _YIN_YANG_VALUES = ('yin', 'yang', 'neutral')
    _ENGRAVING_TYPES = ('yin_engraving', 'yang_engraving', 'mixed')
    _LINK_TYPES = ('phonetic', 'semantic', 'morphological', 'functional', 'geographical')

    # Encodings tried in order when reading CSV files.  ``utf-8-sig`` reads
    # plain UTF-8 and also strips a byte-order mark; ``gb18030`` is the
    # fallback for files saved in a Chinese legacy code page.
    _CSV_ENCODINGS = ('utf-8-sig', 'gb18030')

    _SCHEMA_STATEMENTS = (
        # Main symbol table.
        """
        CREATE TABLE IF NOT EXISTS symbols (
            symbol_id TEXT PRIMARY KEY,
            symbol_form TEXT NOT NULL,
            symbol_name TEXT NOT NULL,
            yin_yang_attribute TEXT CHECK(yin_yang_attribute IN ('yin', 'yang', 'neutral')),
            engraving_type TEXT CHECK(engraving_type IN ('yin_engraving', 'yang_engraving', 'mixed')),
            origin_civilization TEXT,
            origin_period TEXT,
            geographical_context TEXT,
            functional_context TEXT,
            phonetic_context TEXT,
            semantic_context TEXT,
            discovery_date TEXT,
            discovery_location TEXT,
            material_type TEXT,
            preservation_status TEXT,
            description TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        # Free-form attributes attached to a symbol.
        """
        CREATE TABLE IF NOT EXISTS symbol_attributes (
            attribute_id INTEGER PRIMARY KEY AUTOINCREMENT,
            symbol_id TEXT,
            attribute_name TEXT NOT NULL,
            attribute_value TEXT,
            attribute_type TEXT CHECK(attribute_type IN ('text', 'numeric', 'boolean', 'date')),
            confidence_level INTEGER CHECK(confidence_level BETWEEN 1 AND 100),
            source_reference TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (symbol_id) REFERENCES symbols(symbol_id)
        )
        """,
        # Links between symbols of different civilizations.
        """
        CREATE TABLE IF NOT EXISTS cross_civilization_links (
            link_id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_symbol_id TEXT NOT NULL,
            target_symbol_id TEXT NOT NULL,
            link_type TEXT CHECK(link_type IN ('phonetic', 'semantic', 'morphological', 'functional', 'geographical')),
            confidence_level INTEGER CHECK(confidence_level BETWEEN 1 AND 100),
            evidence_description TEXT,
            source_reference TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (source_symbol_id) REFERENCES symbols(symbol_id),
            FOREIGN KEY (target_symbol_id) REFERENCES symbols(symbol_id),
            UNIQUE(source_symbol_id, target_symbol_id, link_type)
        )
        """,
        # Transmission paths.
        """
        CREATE TABLE IF NOT EXISTS symbol_transmission_paths (
            path_id INTEGER PRIMARY KEY AUTOINCREMENT,
            path_name TEXT NOT NULL,
            start_civilization TEXT,
            end_civilization TEXT,
            transmission_period TEXT,
            transmission_route TEXT,
            supporting_evidence TEXT,
            confidence_level INTEGER CHECK(confidence_level BETWEEN 1 AND 100),
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        # Ordered nodes of a transmission path.
        """
        CREATE TABLE IF NOT EXISTS path_nodes (
            node_id INTEGER PRIMARY KEY AUTOINCREMENT,
            path_id INTEGER,
            symbol_id TEXT,
            node_sequence INTEGER,
            node_role TEXT,
            transmission_context TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (path_id) REFERENCES symbol_transmission_paths(path_id),
            FOREIGN KEY (symbol_id) REFERENCES symbols(symbol_id)
        )
        """,
    )

    def __init__(self, db_path: str = "symbols.db"):
        """Open (or create) the SQLite database file at ``db_path``."""
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)

    def close(self) -> None:
        """Close the underlying database connection."""
        self.conn.close()

    def __enter__(self) -> "SymbolDataImporter":
        """Return the importer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the connection when leaving a ``with`` block."""
        self.close()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _new_results() -> Dict[str, Any]:
        """Return an empty batch-import summary."""
        return {'total': 0, 'success': 0, 'failed': 0, 'errors': []}

    @staticmethod
    def _coerce_confidence(value: Any) -> int:
        """Convert ``value`` to an int in 1..100.

        Raises:
            ValueError: with a user-facing message when ``value`` is not an
                integer or lies outside the range.
        """
        try:
            confidence = int(value)
        except (TypeError, ValueError):
            raise ValueError("置信度必须是整数") from None
        if not 1 <= confidence <= 100:
            raise ValueError("置信度必须在1-100之间")
        return confidence

    @staticmethod
    def _symbol_exists(cursor: sqlite3.Cursor, symbol_id: Any) -> bool:
        """Return True when a row with ``symbol_id`` exists in ``symbols``."""
        cursor.execute("SELECT symbol_id FROM symbols WHERE symbol_id = ?", (symbol_id,))
        return cursor.fetchone() is not None

    @staticmethod
    def _clean_csv_row(row: Dict[Any, Any]) -> Dict[str, str]:
        """Strip keys and values of one CSV row and drop empty cells.

        ``csv.DictReader`` stores surplus cells of a too-long row as a list
        under the key ``None`` and fills missing cells with ``None``; both
        are skipped instead of crashing on ``.strip()``.
        """
        return {
            key.strip(): value.strip()
            for key, value in row.items()
            if isinstance(key, str) and isinstance(value, str) and value.strip()
        }

    def _read_csv_rows(self, csv_file: str, delimiter: str) -> List[Dict[Any, Any]]:
        """Read all rows of ``csv_file``, trying each supported encoding.

        Raises:
            UnicodeDecodeError: when no encoding in ``_CSV_ENCODINGS`` can
                decode the file.
            OSError: when the file cannot be opened.
        """
        last_error: Optional[UnicodeDecodeError] = None
        for encoding in self._CSV_ENCODINGS:
            try:
                # newline='' lets the csv module handle embedded newlines.
                with open(csv_file, 'r', encoding=encoding, newline='') as handle:
                    return list(csv.DictReader(handle, delimiter=delimiter))
            except UnicodeDecodeError as exc:
                last_error = exc
        raise last_error

    def _import_symbol_batch(self, symbols: List[Any], position_key: str = 'index',
                             progress_suffix: Optional[str] = None) -> Dict[str, Any]:
        """Import every item of ``symbols`` and collect a summary.

        Args:
            symbols: Symbol dictionaries to import.
            position_key: Key under which the 1-based position of a failed
                item is stored in its error entry (``'row'`` or ``'index'``).
            progress_suffix: When not ``None``, progress is printed every ten
                items with this text appended.
        """
        results = self._new_results()
        results['total'] = len(symbols)
        for i, symbol_data in enumerate(symbols, 1):
            success, message = self.import_symbol_from_dict(symbol_data)
            if success:
                results['success'] += 1
            else:
                results['failed'] += 1
                if isinstance(symbol_data, dict):
                    symbol_id = symbol_data.get('symbol_id', '未知')
                else:
                    symbol_id = '未知'
                results['errors'].append({
                    position_key: i,
                    'symbol_id': symbol_id,
                    'error': message
                })
            if progress_suffix is not None and i % 10 == 0:
                print(f"已处理 {i}/{results['total']}{progress_suffix}")
        return results

    # ------------------------------------------------------------------
    # Schema
    # ------------------------------------------------------------------
    def create_tables(self) -> bool:
        """Create all tables if they do not exist yet.

        Returns:
            True on success; False (after rolling back) when SQLite reports
            an error.
        """
        cursor = self.conn.cursor()
        try:
            for statement in self._SCHEMA_STATEMENTS:
                cursor.execute(statement)
            self.conn.commit()
        except sqlite3.Error as e:
            print(f"创建表结构失败:{e}")
            self.conn.rollback()
            return False
        print("数据库表结构创建成功")
        return True

    # ------------------------------------------------------------------
    # Symbols
    # ------------------------------------------------------------------
    def validate_symbol_data(self, symbol_data: Dict[str, Any]) -> Tuple[bool, List[str]]:
        """Check one symbol dictionary without touching the database.

        Returns:
            ``(is_valid, errors)`` where ``errors`` lists every problem found.
        """
        if not isinstance(symbol_data, dict):
            return False, ["符号数据必须是字典"]

        errors = []
        for field in self._REQUIRED_SYMBOL_FIELDS:
            if not symbol_data.get(field):
                errors.append(f"缺少必填字段:{field}")

        if 'symbol_id' in symbol_data:
            symbol_id = symbol_data['symbol_id']
            # fullmatch (not match + '$') so a trailing newline is rejected;
            # the isinstance guard keeps non-string ids from raising TypeError.
            if not isinstance(symbol_id, str) or not re.fullmatch(r'[A-Za-z0-9_]+', symbol_id):
                errors.append("符号ID只能包含字母、数字和下划线")

        yin_yang = symbol_data.get('yin_yang_attribute')
        if yin_yang and yin_yang not in self._YIN_YANG_VALUES:
            errors.append(f"阴阳属性必须是:{', '.join(self._YIN_YANG_VALUES)}")

        engraving = symbol_data.get('engraving_type')
        if engraving and engraving not in self._ENGRAVING_TYPES:
            errors.append(f"刻法类型必须是:{', '.join(self._ENGRAVING_TYPES)}")

        confidence = symbol_data.get('confidence_level')
        # Only None and '' count as "not supplied"; 0 must fail the range check.
        if confidence is not None and confidence != '':
            try:
                self._coerce_confidence(confidence)
            except ValueError as exc:
                errors.append(str(exc))

        return len(errors) == 0, errors

    def import_symbol_from_dict(self, symbol_data: Dict[str, Any]) -> Tuple[bool, str]:
        """Insert one symbol, or update it when its ``symbol_id`` exists.

        Args:
            symbol_data: Mapping of ``symbols`` column names to values.

        Returns:
            ``(ok, message)``.  Validation failures, unknown column names and
            database errors are reported through the message, not raised.
        """
        is_valid, errors = self.validate_symbol_data(symbol_data)
        if not is_valid:
            return False, f"数据验证失败:{'; '.join(errors)}"

        # Keys become column names in the SQL text below, so reject anything
        # that is not a known column instead of interpolating it.
        unknown = [str(field) for field in symbol_data if field not in self._SYMBOL_COLUMNS]
        if unknown:
            return False, f"导入失败:未知字段 {', '.join(unknown)}"

        symbol_id = symbol_data['symbol_id']
        cursor = self.conn.cursor()
        try:
            if self._symbol_exists(cursor, symbol_id):
                fields = [field for field in symbol_data if field != 'symbol_id']
                assignments = [f"{field} = ?" for field in fields]
                # Stamp the modification time unless the caller supplied one.
                if 'updated_at' not in fields:
                    assignments.append("updated_at = CURRENT_TIMESTAMP")
                update_query = f"""
                    UPDATE symbols
                    SET {', '.join(assignments)}
                    WHERE symbol_id = ?
                """
                values = [symbol_data[field] for field in fields]
                values.append(symbol_id)
                cursor.execute(update_query, values)
                action = "更新"
            else:
                fields = list(symbol_data)
                insert_query = f"""
                    INSERT INTO symbols ({', '.join(fields)})
                    VALUES ({', '.join(['?'] * len(fields))})
                """
                cursor.execute(insert_query, [symbol_data[field] for field in fields])
                action = "插入"
            self.conn.commit()
            return True, f"符号 {symbol_id} {action}成功"
        except Exception as e:
            # Reported through the return value by contract; besides
            # sqlite3.Error this also covers e.g. OverflowError on huge ints.
            self.conn.rollback()
            return False, f"导入失败:{e}"

    def import_symbols_from_csv(self, csv_file: str, delimiter: str = ',') -> Dict[str, Any]:
        """Import symbols from a CSV file whose header row names the columns.

        Cells are stripped and empty cells are omitted.  The file is decoded
        with the encodings in ``_CSV_ENCODINGS``, tried in order.

        Returns:
            Summary dict; failed rows are listed under ``errors`` with their
            1-based ``row`` number.  A file that cannot be read produces a
            single file-level error entry.
        """
        try:
            rows = self._read_csv_rows(csv_file, delimiter)
        except Exception as e:
            # This method reports failures in the summary instead of raising.
            results = self._new_results()
            results['errors'].append({
                'row': '文件级别',
                'symbol_id': 'N/A',
                'error': f"文件读取失败:{e}"
            })
            return results

        cleaned_rows = [self._clean_csv_row(row) for row in rows]
        results = self._import_symbol_batch(cleaned_rows, position_key='row', progress_suffix='')
        print(f"CSV导入完成成功 {results['success']},失败 {results['failed']}")
        return results

    def import_symbols_from_json(self, json_file: str) -> Dict[str, Any]:
        """Import symbols from a JSON file.

        Accepted layouts: a list of symbol objects, an object with a
        ``symbols`` entry, or a single symbol object.

        Returns:
            Summary dict; failed items are listed under ``errors`` with their
            1-based ``index``.  A file that cannot be read or parsed produces
            a single file-level error entry.
        """
        try:
            # utf-8-sig also accepts files that start with a byte-order mark.
            with open(json_file, 'r', encoding='utf-8-sig') as file:
                data = json.load(file)
        except Exception as e:
            # This method reports failures in the summary instead of raising.
            results = self._new_results()
            results['errors'].append({
                'index': '文件级别',
                'symbol_id': 'N/A',
                'error': f"文件读取失败:{e}"
            })
            return results

        if isinstance(data, dict) and 'symbols' in data:
            data = data['symbols']
        symbols_list = data if isinstance(data, list) else [data]

        results = self._import_symbol_batch(symbols_list, position_key='index',
                                            progress_suffix=' 个符号')
        print(f"JSON导入完成成功 {results['success']},失败 {results['failed']}")
        return results

    def import_symbols_from_json_data(self, symbols_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Import already-parsed symbol dictionaries (no progress output)."""
        return self._import_symbol_batch(symbols_list, position_key='index')

    # ------------------------------------------------------------------
    # Cross-civilization links
    # ------------------------------------------------------------------
    def _insert_link(self, cursor: sqlite3.Cursor, link_data: Any) -> None:
        """Validate one link and insert it, replacing an existing duplicate.

        Raises:
            ValueError: when the link is malformed or references a symbol
                that does not exist.
        """
        if not isinstance(link_data, dict):
            raise ValueError("关联数据必须是字典")
        for field in ('source_symbol_id', 'target_symbol_id', 'link_type'):
            if not link_data.get(field):
                raise ValueError(f"缺少必填字段:{field}")

        if not self._symbol_exists(cursor, link_data['source_symbol_id']):
            raise ValueError(f"源符号不存在:{link_data['source_symbol_id']}")
        if not self._symbol_exists(cursor, link_data['target_symbol_id']):
            raise ValueError(f"目标符号不存在:{link_data['target_symbol_id']}")

        if link_data['link_type'] not in self._LINK_TYPES:
            raise ValueError(f"关联类型必须是:{', '.join(self._LINK_TYPES)}")

        # A missing/blank confidence defaults to 50; numeric strings (as read
        # from CSV or JSON) are accepted.
        raw_confidence = link_data.get('confidence_level')
        if raw_confidence is None or raw_confidence == '':
            confidence = 50
        else:
            confidence = self._coerce_confidence(raw_confidence)

        cursor.execute(
            """
            INSERT OR REPLACE INTO cross_civilization_links
            (source_symbol_id, target_symbol_id, link_type, confidence_level,
             evidence_description, source_reference)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            (
                link_data['source_symbol_id'],
                link_data['target_symbol_id'],
                link_data['link_type'],
                confidence,
                link_data.get('evidence_description', ''),
                link_data.get('source_reference', ''),
            ),
        )

    def import_cross_civilization_links(self, links_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Import links between existing symbols.

        Each link needs ``source_symbol_id``, ``target_symbol_id`` and
        ``link_type``; both symbols must already be stored.  All successful
        links are committed together at the end.

        Returns:
            Summary dict; failed items are listed under ``errors``.
        """
        results = self._new_results()
        results['total'] = len(links_data)
        cursor = self.conn.cursor()

        for i, link_data in enumerate(links_data, 1):
            try:
                self._insert_link(cursor, link_data)
                results['success'] += 1
            except Exception as e:
                results['failed'] += 1
                if isinstance(link_data, dict):
                    label = (f"{link_data.get('source_symbol_id', '?')} -> "
                             f"{link_data.get('target_symbol_id', '?')}")
                else:
                    label = "? -> ?"
                results['errors'].append({
                    'index': i,
                    'link': label,
                    'error': str(e)
                })
            if i % 10 == 0:
                print(f"已处理 {i}/{len(links_data)} 个关联")

        self.conn.commit()
        print(f"关联导入完成:成功 {results['success']},失败 {results['failed']}")
        return results

    # ------------------------------------------------------------------
    # Transmission paths
    # ------------------------------------------------------------------
    def _insert_path(self, cursor: sqlite3.Cursor, path_data: Any) -> None:
        """Insert one path together with its nodes, all or nothing.

        The inserts run inside a SAVEPOINT so that a failure on a later node
        also removes the path row and the nodes written before it.

        Raises:
            ValueError: when ``path_name`` is missing or a node is not a dict.
            sqlite3.Error: when SQLite rejects a row.
        """
        if not isinstance(path_data, dict) or not path_data.get('path_name'):
            raise ValueError("缺少必填字段path_name")

        cursor.execute("SAVEPOINT import_path")
        try:
            cursor.execute(
                """
                INSERT INTO symbol_transmission_paths
                (path_name, start_civilization, end_civilization, transmission_period,
                 transmission_route, supporting_evidence, confidence_level)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    path_data['path_name'],
                    path_data.get('start_civilization', ''),
                    path_data.get('end_civilization', ''),
                    path_data.get('transmission_period', ''),
                    path_data.get('transmission_route', ''),
                    path_data.get('supporting_evidence', ''),
                    path_data.get('confidence_level', 50),
                ),
            )
            path_id = cursor.lastrowid

            nodes = path_data.get('nodes')
            if isinstance(nodes, list):
                for j, node_data in enumerate(nodes, 1):
                    if not isinstance(node_data, dict):
                        raise ValueError("路径节点必须是字典")
                    # Nodes without a symbol are skipped, but still count
                    # towards the default sequence number.
                    if 'symbol_id' not in node_data:
                        continue
                    cursor.execute(
                        """
                        INSERT INTO path_nodes
                        (path_id, symbol_id, node_sequence, node_role, transmission_context)
                        VALUES (?, ?, ?, ?, ?)
                        """,
                        (
                            path_id,
                            node_data['symbol_id'],
                            node_data.get('node_sequence', j),
                            node_data.get('node_role', ''),
                            node_data.get('transmission_context', ''),
                        ),
                    )
        except Exception:
            cursor.execute("ROLLBACK TO SAVEPOINT import_path")
            raise
        finally:
            cursor.execute("RELEASE SAVEPOINT import_path")

    def import_transmission_paths(self, paths_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Import transmission paths and their optional ``nodes`` lists.

        Each path needs a ``path_name``.  A path whose insertion fails is
        rolled back as a whole and reported under ``errors``.

        Returns:
            Summary dict with one error entry per failed path.
        """
        results = self._new_results()
        results['total'] = len(paths_data)
        cursor = self.conn.cursor()

        for i, path_data in enumerate(paths_data, 1):
            try:
                self._insert_path(cursor, path_data)
                results['success'] += 1
            except Exception as e:
                results['failed'] += 1
                if isinstance(path_data, dict):
                    path_name = path_data.get('path_name', '未知')
                else:
                    path_name = '未知'
                results['errors'].append({
                    'index': i,
                    'path_name': path_name,
                    'error': str(e)
                })
            if i % 5 == 0:
                print(f"已处理 {i}/{len(paths_data)} 个传播路径")

        self.conn.commit()
        print(f"传播路径导入完成:成功 {results['success']},失败 {results['failed']}")
        return results

    # ------------------------------------------------------------------
    # Sample data
    # ------------------------------------------------------------------
    def generate_sample_data(self) -> Dict[str, Any]:
        """Return built-in sample data under ``symbols``, ``links`` and ``paths``."""
        sample_symbols = [
            {
                'symbol_id': 'P_yin_001',
                'symbol_form': 'P',
                'symbol_name': '太平洋符号',
                'yin_yang_attribute': 'yin',
                'engraving_type': 'yin_engraving',
                'origin_civilization': 'Chinese',
                'origin_period': '上古',
                'geographical_context': '太平洋盆地',
                'functional_context': '阴刻地理标识',
                'phonetic_context': 'pǐ (否卦读音)',
                'semantic_context': '凹陷、阴刻、盆地',
                'description': '代表太平洋盆地的阴刻符号'
            },
            {
                'symbol_id': 'T_yang_001',
                'symbol_form': 'T',
                'symbol_name': '泰山符号',
                'yin_yang_attribute': 'yang',
                'engraving_type': 'yang_engraving',
                'origin_civilization': 'Chinese',
                'origin_period': '上古',
                'geographical_context': '泰山山脉',
                'functional_context': '阳刻地理标识',
                'phonetic_context': 'tài (泰卦读音)',
                'semantic_context': '凸起、阳刻、山脉',
                'description': '代表泰山山脉的阳刻符号'
            },
            {
                'symbol_id': 'M_neutral_001',
                'symbol_form': 'M',
                'symbol_name': '月亮符号',
                'yin_yang_attribute': 'neutral',
                'engraving_type': 'mixed',
                'origin_civilization': 'Multiple',
                'origin_period': '远古',
                'geographical_context': '全球分布',
                'functional_context': '月亮崇拜标识',
                'phonetic_context': 'moon, month',
                'semantic_context': '月亮、月份、循环',
                'description': '代表月亮崇拜的中性符号'
            }
        ]
        sample_links = [
            {
                'source_symbol_id': 'P_yin_001',
                'target_symbol_id': 'T_yang_001',
                'link_type': 'functional',
                'confidence_level': 85,
                'evidence_description': '阴阳对立功能互补',
                'source_reference': '周易卦象分析'
            },
            {
                'source_symbol_id': 'P_yin_001',
                'target_symbol_id': 'M_neutral_001',
                'link_type': 'semantic',
                'confidence_level': 70,
                'evidence_description': '阴刻与月亮关联',
                'source_reference': '跨文明符号研究'
            }
        ]
        sample_paths = [
            {
                'path_name': '阴阳符号传播路径',
                'start_civilization': 'Chinese',
                'end_civilization': 'Global',
                'transmission_period': '上古-现代',
                'transmission_route': '丝绸之路-海上贸易',
                'supporting_evidence': '考古发现与文献记载',
                'confidence_level': 80,
                'nodes': [
                    {'symbol_id': 'P_yin_001', 'node_sequence': 1, 'node_role': '起源', 'transmission_context': '中国周易体系'},
                    {'symbol_id': 'T_yang_001', 'node_sequence': 2, 'node_role': '发展', 'transmission_context': '阴阳哲学传播'},
                    {'symbol_id': 'M_neutral_001', 'node_sequence': 3, 'node_role': '融合', 'transmission_context': '跨文明符号融合'}
                ]
            }
        ]
        return {
            'symbols': sample_symbols,
            'links': sample_links,
            'paths': sample_paths
        }

    def import_sample_data(self) -> Dict[str, Any]:
        """Import the built-in sample symbols, links and paths, in that order.

        Returns:
            Dict with one batch summary each under ``symbols``, ``links`` and
            ``paths``.
        """
        sample_data = self.generate_sample_data()
        results = {}

        print("正在导入示例符号数据...")
        results['symbols'] = self.import_symbols_from_json_data(sample_data['symbols'])

        print("正在导入示例关联数据...")
        results['links'] = self.import_cross_civilization_links(sample_data['links'])

        print("正在导入示例传播路径...")
        results['paths'] = self.import_transmission_paths(sample_data['paths'])

        return results

    # ------------------------------------------------------------------
    # Documentation export
    # ------------------------------------------------------------------
    def export_database_schema(self, output_file: str = "database_schema.md") -> str:
        """Write a Markdown description of every table to ``output_file``.

        Each table gets one row per column with its name, type, primary-key
        flag, nullable flag and default value.

        Returns:
            A message naming the file that was written.
        """
        cursor = self.conn.cursor()
        schema_doc = ["# 胡汉三千年项目 - 数据库结构文档", "", f"生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ""]

        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        table_names = [row[0] for row in cursor.fetchall()]

        for table_name in table_names:
            schema_doc.append(f"## {table_name}")
            schema_doc.append("")

            # PRAGMA arguments cannot be bound as parameters, so the name is
            # quoted as an identifier instead.
            quoted_name = table_name.replace('"', '""')
            cursor.execute(f'PRAGMA table_info("{quoted_name}")')

            schema_doc.append("| 字段名 | 类型 | 是否主键 | 是否为空 | 默认值 | 说明 |")
            schema_doc.append("|--------|------|----------|----------|--------|------|")
            # table_info rows: (cid, name, type, notnull, dflt_value, pk)
            for col in cursor.fetchall():
                col_name = col[1]
                col_type = col[2]
                # pk is the 1-based position inside the primary key, so any
                # non-zero value marks a key column (covers composite keys).
                is_primary = "是" if col[5] else "否"
                is_nullable = "是" if col[3] == 0 else "否"
                default_value = col[4] if col[4] else ""
                schema_doc.append(f"| {col_name} | {col_type} | {is_primary} | {is_nullable} | {default_value} | |")
            schema_doc.append("")

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(schema_doc))
        return f"数据库结构文档已导出至:{output_file}"
# Usage example
def main():
    """Run a demo: create the schema, import the sample data, export docs.

    Uses the default database file of ``SymbolDataImporter`` and always
    closes the connection before returning.
    """
    importer = SymbolDataImporter()
    try:
        # Example 1: create the database tables.
        print("=== 创建数据库表结构 ===")
        if not importer.create_tables():
            # Without the tables every following import would fail.
            return
        print("表结构创建成功")

        # Example 2: import the built-in sample data.
        print("\n=== 导入示例数据 ===")
        results = importer.import_sample_data()
        print(f"符号导入:{results['symbols']['success']} 成功,{results['symbols']['failed']} 失败")
        print(f"关联导入:{results['links']['success']} 成功,{results['links']['failed']} 失败")
        print(f"路径导入:{results['paths']['success']} 成功,{results['paths']['failed']} 失败")

        # Example 3: export the schema documentation.
        print("\n=== 导出数据库结构文档 ===")
        result = importer.export_database_schema("database_schema.md")
        print(result)

        # Example 4: import from CSV (needs a sample CSV file).
        # print("\n=== 从CSV导入数据 ===")
        # if os.path.exists("sample_symbols.csv"):
        #     results = importer.import_symbols_from_csv("sample_symbols.csv")
        #     print(f"CSV导入结果{results['success']} 成功,{results['failed']} 失败")
        # else:
        #     print("示例CSV文件不存在跳过测试")

        # Example 5: import from JSON (needs a sample JSON file).
        # print("\n=== 从JSON导入数据 ===")
        # if os.path.exists("sample_symbols.json"):
        #     results = importer.import_symbols_from_json("sample_symbols.json")
        #     print(f"JSON导入结果{results['success']} 成功,{results['failed']} 失败")
        # else:
        #     print("示例JSON文件不存在跳过测试")
    finally:
        # Release the SQLite connection even when a step above raises.
        importer.conn.close()
# Run the demo only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()