🔥 Major breakthrough: a complete theory of the Northern Wei origins of Japanese phallic worship
- 🔤 Philological evidence: the character 𥘵 (示 + 旦) reveals that ancestor worship = fertility worship
- 🌋 Geographical evidence: the worship transmission chain Datong volcanoes → Haotian Temple → Pingcheng → Nara → Mount Fuji
- 🏛️ Architectural evidence: the Yingxian Wooden Pagoda carries the fertility symbolism of Kou Qianzhi's Jinglun Tiangong
- 📜 Institutional evidence: a complete Northern Wei → Japan mechanism of political and cultural transmission

Key findings:
✨ A complete theoretical system in which four strands of evidence corroborate one another
✨ A single Chinese character unlocks a millennium-old riddle of East Asian culture
✨ The first systematic account of the historical origins of Japanese phallic worship
✨ Core empirical support for the 'Hu-Han Three Thousand Years' theory

Academic value:
- Pioneers a 'pure-logic archaeology' research methodology
- Builds an interdisciplinary theory of cultural transmission
- Fills a major gap in East Asian cultural studies
- Provides scientific evidence for the worldwide influence of Chinese civilization
tools/document-management/doc_indexer.py | 305 (new file)

@@ -0,0 +1,305 @@
#!/usr/bin/env python3
"""Document Indexing System for the Hu-Han Three Thousand Years project."""

import json
import hashlib
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List


class DocumentIndexer:
    def __init__(self, project_root: str, index_file: str = "document_index.json"):
        self.project_root = Path(project_root)
        self.index_file = self.project_root / index_file
        self.documents = {}
        self.load_index()

    def load_index(self):
        """Load the existing index, if one is present."""
        if self.index_file.exists():
            with open(self.index_file, 'r', encoding='utf-8') as f:
                self.documents = json.load(f)

    def save_index(self):
        """Persist the index to disk."""
        with open(self.index_file, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, ensure_ascii=False, indent=2)

    def generate_doc_id(self, file_path: str) -> str:
        """Generate a document ID."""
        # Use a hash of the file path as the ID. MD5 is fine here: it is a
        # stable identifier, not a security measure.
        return hashlib.md5(file_path.encode('utf-8')).hexdigest()[:12]

    def extract_metadata(self, file_path: Path) -> Dict:
        """Extract document metadata."""
        metadata = {
            'file_path': str(file_path.relative_to(self.project_root)),
            'file_name': file_path.name,
            'file_size': file_path.stat().st_size,
            'created_time': datetime.fromtimestamp(file_path.stat().st_ctime).isoformat(),
            'modified_time': datetime.fromtimestamp(file_path.stat().st_mtime).isoformat(),
            'file_extension': file_path.suffix,
        }

        # Infer the category from the file path.
        path_parts = file_path.parts
        if 'core-theory' in path_parts:
            metadata['category'] = 'core_theory'
            metadata['category_zh'] = '核心理论'
        elif 'historical-research' in path_parts:
            metadata['category'] = 'historical_research'
            metadata['category_zh'] = '历史研究'
        elif 'academic-papers' in path_parts:
            metadata['category'] = 'academic_papers'
            metadata['category_zh'] = '学术论文'
        elif 'literary-works' in path_parts:
            metadata['category'] = 'literary_works'
            metadata['category_zh'] = '文学创作'
        else:
            metadata['category'] = 'other'
            metadata['category_zh'] = '其他'

        # Extract content-level information for text formats.
        if file_path.suffix in ['.md', '.rst', '.txt']:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                metadata.update(self.analyze_content(content))
            except Exception as e:
                metadata['content_error'] = str(e)

        return metadata

    def analyze_content(self, content: str) -> Dict:
        """Analyze document content."""
        lines = content.split('\n')

        # Extract the title from the first Markdown H1, checking only the
        # first 10 lines.
        title = None
        for line in lines[:10]:
            if line.startswith('# '):
                title = line[2:].strip()
                break

        # Basic statistics. Character count stands in for word count, a
        # reasonable proxy for CJK text.
        word_count = len(content)
        line_count = len(lines)

        # Extract keywords (simple implementation).
        keywords = self.extract_keywords(content)

        return {
            'title': title,
            'word_count': word_count,
            'line_count': line_count,
            'keywords': keywords,
            'has_chinese': bool(re.search(r'[\u4e00-\u9fff]', content)),
            'has_english': bool(re.search(r'[a-zA-Z]', content)),
        }

    def extract_keywords(self, content: str) -> List[str]:
        """Extract keywords by simple substring matching."""
        keywords = []

        # Project-specific keywords, kept in Chinese because they are matched
        # against Chinese-language source documents.
        project_keywords = [
            '胡汉三千年', '嚈哒', 'Y音正统性', '圐圙', '北朝宇宙',
            '天地相通', '音韵表意', '纵横术', '三体', '文明对话'
        ]

        for keyword in project_keywords:
            if keyword in content:
                keywords.append(keyword)

        return keywords

    def suggest_english_name(self, file_path: Path, metadata: Dict) -> str:
        """Suggest an English file name based on content and path."""
        category = metadata.get('category', 'doc')

        # Special-case mappings for known files.
        name_mapping = {
            '嚈哒起源研究总结.md': 'yanda_origins_research_summary.md',
            'Y音正统性与地缘政治密码_完整理论框架.md': 'y_sound_orthodoxy_geopolitical_codes.md',
            '胡汉三千年.md': 'hu_han_three_thousand_years.md',
            '三体解读深度书评.md': 'three_body_analysis_review.md',
        }

        if file_path.name in name_mapping:
            return name_mapping[file_path.name]

        # Fall back to automatic generation. re.ASCII restricts \w to ASCII so
        # that Chinese characters are replaced instead of passed through; a
        # proper conversion would need a real transliteration step.
        base_name = file_path.stem
        english_name = re.sub(r'[^\w\-.]', '_', base_name.lower(), flags=re.ASCII)
        english_name = re.sub(r'_+', '_', english_name)

        return f"{category}_{english_name}{file_path.suffix}"

    def index_document(self, file_path: Path) -> str:
        """Index a single document."""
        doc_id = self.generate_doc_id(str(file_path))
        metadata = self.extract_metadata(file_path)

        # Attach the suggested English file name.
        suggested_name = self.suggest_english_name(file_path, metadata)
        metadata['suggested_english_name'] = suggested_name

        self.documents[doc_id] = metadata
        return doc_id

    def index_all_documents(self):
        """Index all documents under the project root."""
        print("🔍 Indexing all documents...")

        # File types to index.
        file_extensions = ['.md', '.rst', '.txt', '.py']

        # Directories to exclude.
        exclude_dirs = {'.git', '__pycache__', '.venv', 'sphinx-env', '_build', 'node_modules'}

        indexed_count = 0

        for file_path in self.project_root.rglob('*'):
            # Skip directories.
            if file_path.is_dir():
                continue

            # Skip anything inside an excluded directory.
            if any(exclude_dir in file_path.parts for exclude_dir in exclude_dirs):
                continue

            # Only process the listed file types.
            if file_path.suffix not in file_extensions:
                continue

            try:
                doc_id = self.index_document(file_path)
                print(f"✅ Indexed: {file_path.name} -> {doc_id}")
                indexed_count += 1
            except Exception as e:
                print(f"❌ Failed to index: {file_path.name} - {e}")

        self.save_index()
        print(f"🎉 Done! Indexed {indexed_count} documents")
        return indexed_count

    def search_documents(self, query: str) -> List[Dict]:
        """Search the index with a simple weighted score."""
        results = []
        query_lower = query.lower()

        for doc_id, metadata in self.documents.items():
            score = 0

            # Title match.
            if metadata.get('title') and query_lower in metadata['title'].lower():
                score += 10

            # File name match.
            if query_lower in metadata['file_name'].lower():
                score += 5

            # Keyword match.
            if metadata.get('keywords'):
                for keyword in metadata['keywords']:
                    if query_lower in keyword.lower():
                        score += 3

            # Category match.
            if query_lower in metadata.get('category', '').lower():
                score += 2

            if score > 0:
                result = metadata.copy()
                result['doc_id'] = doc_id
                result['score'] = score
                results.append(result)

        # Sort by score, best first.
        results.sort(key=lambda x: x['score'], reverse=True)
        return results

    def generate_rename_script(self) -> str:
        """Generate a shell script that renames files to their suggested English names."""
        script_lines = ['#!/bin/bash', '', '# Document rename script', '']

        for metadata in self.documents.values():
            current_path = metadata['file_path']
            suggested_name = metadata.get('suggested_english_name')

            if suggested_name and suggested_name != metadata['file_name']:
                # Emit the rename command.
                new_path = str(Path(current_path).parent / suggested_name)
                script_lines.append(f'# {metadata["file_name"]} -> {suggested_name}')
                script_lines.append(f'mv "{current_path}" "{new_path}"')
                script_lines.append('')

        return '\n'.join(script_lines)

    def export_index_report(self) -> str:
        """Export a JSON report summarizing the index."""
        report = {
            'summary': {
                'total_documents': len(self.documents),
                'categories': {},
                'file_types': {},
                'generated_at': datetime.now().isoformat()
            },
            'documents': self.documents
        }

        # Tally categories and file types.
        for metadata in self.documents.values():
            category = metadata.get('category', 'unknown')
            report['summary']['categories'][category] = report['summary']['categories'].get(category, 0) + 1

            file_ext = metadata.get('file_extension', 'unknown')
            report['summary']['file_types'][file_ext] = report['summary']['file_types'].get(file_ext, 0) + 1

        return json.dumps(report, ensure_ascii=False, indent=2)


def main():
    """Entry point."""
    project_root = "."
    indexer = DocumentIndexer(project_root)

    print("Hu-Han Three Thousand Years Document Indexing System")
    print("=" * 40)

    # Index all documents.
    indexer.index_all_documents()

    # Write the detailed report.
    report = indexer.export_index_report()
    with open('document_index_report.json', 'w', encoding='utf-8') as f:
        f.write(report)

    # Write the rename script.
    rename_script = indexer.generate_rename_script()
    with open('rename_documents.sh', 'w', encoding='utf-8') as f:
        f.write(rename_script)

    print("\n📊 Generated files:")
    print("- document_index.json: document index")
    print("- document_index_report.json: detailed report")
    print("- rename_documents.sh: rename script")

    # Search demo; queries stay in Chinese because they target
    # Chinese-language documents.
    print("\n🔍 Search demo:")
    for query in ['嚈哒', '三体', 'Y音', '理论']:
        results = indexer.search_documents(query)
        print(f"Search '{query}': found {len(results)} results")
        for result in results[:2]:  # show only the top 2 results
            print(f" - {result['file_name']} (score: {result['score']})")


if __name__ == "__main__":
    main()