#!/usr/bin/env python3
"""
Silk Road S-Phoneme Civilization Gene Verification Toolkit

A collection of tools for verifying and analyzing the Silk Road
s-phoneme "cultural hash" theory.
"""
import json
import re
import unicodedata
from collections import defaultdict, Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Tuple, Optional

import Levenshtein
@dataclass
class SPhonemeWord:
    """A single entry of the s-phoneme lexicon.

    Instances are built positionally from the 7-tuples listed in
    ``SPhonemeAnalyzer._initialize_lexicon``; ``confidence_score`` is the
    only field with a default.
    """

    # The word as written in its source script (e.g. "silk", "Samarkand").
    word: str
    # Lower-case language label (e.g. "chinese", "sanskrit", "english").
    language: str
    # Which s-type spelling the entry is filed under (e.g. "s", "sh").
    s_variant: str
    # Short English gloss of the word.
    meaning: str
    # Coarse period label (the built-in lexicon uses "ancient"/"medieval").
    era: str
    # Geographic region tag (e.g. "central_asia", "europe").
    region: str
    # Free-form category tag (e.g. "trade_hub", "buddhist_text").
    cultural_context: str
    # Confidence in the entry; defaults to 1.0 when not supplied.
    confidence_score: float = 1.0
class SPhonemeAnalyzer:
    """Core analyzer for s-type sounds in a built-in Silk Road lexicon.

    The analyzer works purely on spelling: a "phoneme" here is an
    occurrence of one of the substrings listed in ``s_variants`` inside
    the written form of a word.  Words written in scripts that contain
    none of those substrings (for example Chinese characters) therefore
    yield no s-phonemes.
    """

    def __init__(self):
        """Create the variant table and load the built-in lexicon."""
        # Spelling of each s-type sound mapped to a descriptive label.
        self.s_variants = {
            's': 'standard',
            'ś': 'palatalized',
            'š': 'retroflex',
            'sh': 'english_sh',
            'x': 'chinese_x',
            'z': 'voiced',
            'c': 'latin_c',
            'ss': 'geminate',
            'sc': 'latin_sc'
        }

        # Core Silk Road s-phoneme vocabulary.
        self.silk_road_lexicon = self._initialize_lexicon()

    @staticmethod
    def _normalize(word: str) -> str:
        """Return *word* NFC-normalized and lower-cased.

        NFC composes sequences such as ``s`` + combining acute into the
        single code point used as a key in ``s_variants``, so decomposed
        input is not mistaken for a plain ``s``.
        """
        return unicodedata.normalize("NFC", word).lower()

    def _initialize_lexicon(self) -> List["SPhonemeWord"]:
        """Build the built-in lexicon as a list of ``SPhonemeWord``.

        Each tuple is (word, language, s_variant, meaning, era, region,
        cultural_context); ``confidence_score`` keeps its default.
        """
        lexicon_data = [
            # Central Plains (starting point)
            ("丝", "chinese", "s", "silk", "ancient", "central_china", "core_commodity"),
            ("商", "chinese", "sh", "merchant", "ancient", "central_china", "trade_identity"),
            ("缫", "chinese", "s", "reel_silk", "ancient", "central_china", "silk_technology"),
            ("莎车", "chinese", "s", "Shache_kingdom", "ancient", "xinjiang", "silk_road_city"),

            # Western Regions (transit)
            ("粟特", "chinese", "s", "Sogdian", "medieval", "central_asia", "merchant_ethnicity"),
            ("sart", "sogdian", "s", "merchant", "medieval", "central_asia", "trade_profession"),
            ("sūtra", "sanskrit", "s", "scripture", "ancient", "india", "buddhist_text"),

            # Central Asian hubs
            ("Samarkand", "persian", "s", "city_name", "ancient", "uzbekistan", "trade_hub"),
            ("sesame", "english", "s", "sesame", "ancient", "mesopotamia", "trade_crop"),
            ("śaśama", "sanskrit", "ś", "sesame", "ancient", "india", "sanskrit_crop"),

            # West Asian nodes
            ("沙门", "chinese", "sh", "buddhist_monk", "ancient", "china", "buddhist_title"),
            ("śramaṇa", "sanskrit", "ś", "ascetic", "ancient", "india", "religious_practitioner"),
            ("Syria", "english", "s", "Syria", "ancient", "levant", "geographical_region"),
            ("Sūriyā", "arabic", "s", "Syria", "medieval", "levant", "arabic_geography"),

            # European terminus
            ("silk", "english", "s", "silk", "medieval", "europe", "luxury_good"),
            ("satin", "english", "s", "satin", "medieval", "europe", "fine_fabric"),
            ("saltpeter", "english", "s", "saltpeter", "medieval", "europe", "chemical_compound"),

            # Key peoples and groups
            ("Śākya", "sanskrit", "ś", "Shakya_clan", "ancient", "india", "buddhist_clan"),
            ("Saka", "persian", "s", "Scythian", "ancient", "central_asia", "nomadic_group"),
            ("Seljuk", "turkish", "s", "Seljuk_dynasty", "medieval", "turkey", "turkic_dynasty"),
            ("sabra", "sumerian", "s", "trade_official", "ancient", "mesopotamia", "administrative_title")
        ]

        return [SPhonemeWord(*data) for data in lexicon_data]

    def extract_s_phonemes(self, word: str) -> List[Dict[str, Any]]:
        """Find every occurrence of every s-variant spelling in *word*.

        Args:
            word: The word to scan; it is NFC-normalized and lower-cased
                before matching.

        Returns:
            A list of dicts with keys ``'variant'`` (the matched
            spelling), ``'position'`` (start index in the normalized
            word) and ``'type'`` (the label from ``s_variants``).
            Results are grouped by variant in ``s_variants`` order, then
            by position.

        Note:
            Each variant is searched independently, so overlapping
            spellings are all reported: ``"sh"`` yields both an ``'s'``
            hit and an ``'sh'`` hit at the same position.
        """
        text = self._normalize(word)
        found = []

        for variant, label in self.s_variants.items():
            # Escape so a variant is always matched literally, even if a
            # spelling containing regex metacharacters is added later.
            for match in re.finditer(re.escape(variant), text):
                found.append({
                    'variant': variant,
                    'position': match.start(),
                    'type': label
                })

        return found

    def calculate_phoneme_similarity(self, word1: str, word2: str) -> float:
        """Score how similar the s-type letters of two words are.

        Each word is reduced to the sequence of its characters that are
        single-character keys of ``s_variants``; the score is
        ``1 - levenshtein(seq1, seq2) / max(len(seq1), len(seq2))``.

        Args:
            word1: First word.
            word2: Second word.

        Returns:
            A value in ``[0.0, 1.0]``; ``0.0`` when either word contains
            none of the single-character variants.

        Note:
            The filter is per character, so multi-character variants
            take part only through their component letters (``"sh"``
            contributes ``"s"``; ``"sc"`` contributes ``"sc"``).
        """
        single_chars = {v for v in self.s_variants if len(v) == 1}
        s1 = ''.join(c for c in self._normalize(word1) if c in single_chars)
        s2 = ''.join(c for c in self._normalize(word2) if c in single_chars)

        if not s1 or not s2:
            return 0.0

        # Both sequences are non-empty here, so the divisor is >= 1.
        distance = Levenshtein.distance(s1, s2)
        return 1 - distance / max(len(s1), len(s2))

    def analyze_s_phoneme_stability(self, word_list: List[str]) -> Dict[str, Any]:
        """Summarize how often s-variants occur across *word_list*.

        Args:
            word_list: Words to analyze; may be empty.

        Returns:
            A dict with ``'total_words'``, ``'s_phoneme_words'`` (words
            with at least one hit), ``'stability_rate'`` (their ratio, 0
            for an empty list), ``'phoneme_distribution'`` (hit count per
            variant) and ``'most_common_phoneme'`` (a ``(variant, count)``
            tuple, or ``None`` when there were no hits).
        """
        total_words = len(word_list)
        s_phoneme_words = 0
        s_phoneme_distribution = Counter()

        for word in word_list:
            s_phonemes = self.extract_s_phonemes(word)
            if s_phonemes:
                s_phoneme_words += 1
                s_phoneme_distribution.update(p['variant'] for p in s_phonemes)

        stability_rate = s_phoneme_words / total_words if total_words > 0 else 0
        top = s_phoneme_distribution.most_common(1)

        return {
            'total_words': total_words,
            's_phoneme_words': s_phoneme_words,
            'stability_rate': stability_rate,
            'phoneme_distribution': dict(s_phoneme_distribution),
            'most_common_phoneme': top[0] if top else None
        }

    def find_cultural_transmission_paths(self, source_civ: str, target_civ: str) -> List[Dict[str, Any]]:
        """Pair up lexicon words of two languages by s-letter similarity.

        Args:
            source_civ: Language label of the source entries.
            target_civ: Language label of the target entries.

        Returns:
            One dict per (source, target) word pair whose similarity is
            strictly greater than 0.3, sorted by similarity descending.
            Each dict holds the two words, their meanings, the
            similarity and the era gap between them.
        """
        source_words = [w for w in self.silk_road_lexicon if w.language == source_civ]
        target_words = [w for w in self.silk_road_lexicon if w.language == target_civ]

        paths = []
        for s_word in source_words:
            for t_word in target_words:
                similarity = self.calculate_phoneme_similarity(s_word.word, t_word.word)
                if similarity > 0.3:  # similarity threshold
                    paths.append({
                        'source_word': s_word.word,
                        'target_word': t_word.word,
                        'similarity': similarity,
                        'source_meaning': s_word.meaning,
                        'target_meaning': t_word.meaning,
                        'time_gap': self._estimate_time_gap(s_word.era, t_word.era)
                    })

        paths.sort(key=lambda x: x['similarity'], reverse=True)
        return paths

    def _estimate_time_gap(self, era1: str, era2: str) -> str:
        """Describe the distance between two era labels (simplified).

        Known labels are ranked ancient < classical < medieval < modern;
        any other label is ranked 0, i.e. one step before "ancient".

        Returns:
            ``"contemporary"`` for equal ranks, ``"1_era_gap"`` for a
            difference of one, otherwise ``"<n>_eras_gap"``.
        """
        era_order = {
            'ancient': 1,
            'classical': 2,
            'medieval': 3,
            'modern': 4
        }

        gap = abs(era_order.get(era1, 0) - era_order.get(era2, 0))

        if gap == 0:
            return "contemporary"
        if gap == 1:
            return "1_era_gap"
        return f"{gap}_eras_gap"

    def generate_cultural_gene_report(self) -> Dict[str, Any]:
        """Run every analysis over the built-in lexicon.

        Returns:
            A dict with ``'overall_stability'`` (stability summary of
            all lexicon words), ``'transmission_paths'`` (per language
            pair with at least one path: count, strongest connection and
            average similarity), ``'civilization_analysis'`` (stability
            summary per language), ``'total_lexicon_size'`` and
            ``'key_findings'``.
        """
        # Overall s-phoneme stability.
        all_words = [w.word for w in self.silk_road_lexicon]
        stability_analysis = self.analyze_s_phoneme_stability(all_words)

        # Transmission paths for every unordered pair of key languages.
        transmission_paths = {}
        key_civilizations = ['sumerian', 'persian', 'sanskrit', 'chinese', 'turkish', 'english']

        for i, civ1 in enumerate(key_civilizations):
            for civ2 in key_civilizations[i+1:]:
                paths = self.find_cultural_transmission_paths(civ1, civ2)
                if not paths:
                    continue
                transmission_paths[f"{civ1}_to_{civ2}"] = {
                    'path_count': len(paths),
                    'strongest_connection': max(paths, key=lambda x: x['similarity']),
                    'avg_similarity': sum(p['similarity'] for p in paths) / len(paths)
                }

        # Per-language s-phoneme statistics.
        civ_s_analysis = {}
        for civ in key_civilizations:
            civ_words = [w.word for w in self.silk_road_lexicon if w.language == civ]
            if civ_words:
                civ_s_analysis[civ] = self.analyze_s_phoneme_stability(civ_words)

        return {
            'overall_stability': stability_analysis,
            'transmission_paths': transmission_paths,
            'civilization_analysis': civ_s_analysis,
            'total_lexicon_size': len(self.silk_road_lexicon),
            'key_findings': self._generate_key_findings(stability_analysis, transmission_paths)
        }

    def _generate_key_findings(self, stability: Dict, paths: Dict) -> List[str]:
        """Turn the analysis results into short underscore-joined findings.

        Args:
            stability: Result of ``analyze_s_phoneme_stability``.
            paths: The ``'transmission_paths'`` mapping of the report.

        Returns:
            Up to three strings: one when the stability rate exceeds
            0.8, one when any pair's average similarity exceeds 0.5, and
            one naming the most common variant when there is one.
        """
        findings = []

        # Stability finding.
        if stability['stability_rate'] > 0.8:
            findings.append(f"High_s_phoneme_stability:_{stability['stability_rate']:.1%}")

        # Transmission finding.
        strong_paths = [p for p in paths.values() if p['avg_similarity'] > 0.5]
        if strong_paths:
            findings.append(f"Strong_cultural_transmission:_{len(strong_paths)}_paths")

        # Dominant variant finding.
        if stability['most_common_phoneme']:
            variant, count = stability['most_common_phoneme']
            findings.append(f"Dominant_s_variant:_{variant}_({count}_occurrences)")

        return findings
def main():
    """Demo entry point: run the full analysis, print a summary, save JSON.

    Builds an ``SPhonemeAnalyzer``, prints headline numbers and the first
    three transmission-path entries, writes the whole report to
    ``s_phoneme_civilization_report.json`` in the current directory, and
    finally prints the similarity of a few hand-picked word pairs.
    """
    print("🧬 s音文明基因验证工具包")
    print("=" * 50)

    # Build the analyzer and run the full analysis.
    analyzer = SPhonemeAnalyzer()
    report = analyzer.generate_cultural_gene_report()

    # Headline numbers.
    print(f"📊 分析完成!词汇库规模: {report['total_lexicon_size']}")
    print(f"🎯 s音整体稳定性: {report['overall_stability']['stability_rate']:.2%}")
    print(f"🔗 发现传播路径: {len(report['transmission_paths'])}")

    print("\n🔍 关键发现:")
    for finding in report['key_findings']:
        # Findings are underscore-joined; show them as plain phrases.
        print(f" • {finding.replace('_', ' ')}")

    # Detailed view of the first three language pairs.
    print("\n📈 文明间s音传播分析:")
    for path_key, data in list(report['transmission_paths'].items())[:3]:
        # Keys look like "<source>_to_<target>"; swap only the "_to_"
        # separator for the arrow (replacing every "_" would print
        # "source → to → target").
        print(f"\n{path_key.replace('_to_', ' → ')}:")
        print(f" 路径数量: {data['path_count']}")
        print(f" 平均相似度: {data['avg_similarity']:.2f}")

        if data['strongest_connection']:
            conn = data['strongest_connection']
            print(f" 最强连接: {conn['source_word']} → {conn['target_word']} (相似度: {conn['similarity']:.2f})")

    # Persist the full report; default=str stringifies anything the JSON
    # encoder cannot serialize natively.
    output_file = "s_phoneme_civilization_report.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2, default=str)

    print(f"\n💾 详细报告已保存至: {output_file}")

    # Spot checks on specific word pairs.
    print("\n🔬 特定词汇分析示例:")
    test_pairs = [
        ("Śākya", "Saka"),
        ("丝", "silk"),
        ("商", "sart"),
        ("沙门", "śramaṇa")
    ]

    for word1, word2 in test_pairs:
        similarity = analyzer.calculate_phoneme_similarity(word1, word2)
        print(f" {word1} ↔ {word2}: 相似度 = {similarity:.3f}")
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()