#!/usr/bin/env python3
"""
Document Indexing System for the Hu-Han Three Thousand Years (胡汉三千年) Project

Scans the project tree, builds a JSON metadata index of its documents, and
generates a report plus a rename script suggesting English file names.
"""

import json
import hashlib
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List


class DocumentIndexer:
    def __init__(self, project_root: str, index_file: str = "document_index.json"):
        self.project_root = Path(project_root)
        self.index_file = self.project_root / index_file
        self.documents = {}
        self.load_index()

    def load_index(self):
        """Load an existing index from disk, if one is present."""
        if self.index_file.exists():
            with open(self.index_file, 'r', encoding='utf-8') as f:
                self.documents = json.load(f)

    def save_index(self):
        """Write the index to the index file as pretty-printed JSON."""
        with open(self.index_file, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, ensure_ascii=False, indent=2)

    def generate_doc_id(self, file_path: str) -> str:
        """Generate a document ID from the hash of the file path."""
        # MD5 is used only as a cheap, stable fingerprint here, not for security
        return hashlib.md5(file_path.encode('utf-8')).hexdigest()[:12]

    def extract_metadata(self, file_path: Path) -> Dict:
        """Extract metadata for a single document."""
        stat = file_path.stat()
        metadata = {
            'file_path': str(file_path.relative_to(self.project_root)),
            'file_name': file_path.name,
            'file_size': stat.st_size,
            # Note: on Linux, st_ctime is the inode change time, not creation time
            'created_time': datetime.fromtimestamp(stat.st_ctime).isoformat(),
            'modified_time': datetime.fromtimestamp(stat.st_mtime).isoformat(),
            'file_extension': file_path.suffix,
        }

        # Infer the category from the file path
        path_parts = file_path.parts
        if 'core-theory' in path_parts:
            metadata['category'] = 'core_theory'
            metadata['category_zh'] = '核心理论'
        elif 'historical-research' in path_parts:
            metadata['category'] = 'historical_research'
            metadata['category_zh'] = '历史研究'
        elif 'academic-papers' in path_parts:
            metadata['category'] = 'academic_papers'
            metadata['category_zh'] = '学术论文'
        elif 'literary-works' in path_parts:
            metadata['category'] = 'literary_works'
            metadata['category_zh'] = '文学创作'
        else:
            metadata['category'] = 'other'
            metadata['category_zh'] = '其他'

        # Extract content-derived information for plain-text formats
        if file_path.suffix in ['.md', '.rst', '.txt']:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                metadata.update(self.analyze_content(content))
            except Exception as e:
                metadata['content_error'] = str(e)

        return metadata

    def analyze_content(self, content: str) -> Dict:
        """Analyze document content."""
        lines = content.split('\n')

        # Extract the title from the first Markdown H1 heading
        title = None
        for line in lines[:10]:  # only check the first 10 lines
            if line.startswith('# '):
                title = line[2:].strip()
                break

        # Basic statistics; the character count stands in for a word count,
        # which is a reasonable proxy for Chinese text
        word_count = len(content)
        line_count = len(lines)

        # Extract keywords (simple implementation)
        keywords = self.extract_keywords(content)

        return {
            'title': title,
            'word_count': word_count,
            'line_count': line_count,
            'keywords': keywords,
            'has_chinese': bool(re.search(r'[\u4e00-\u9fff]', content)),
            'has_english': bool(re.search(r'[a-zA-Z]', content)),
        }
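
    # For instance, analyze_content('# 标题\n正文') returns title '标题',
    # word_count 7 (a character count), line_count 2, has_chinese True,
    # has_english False (a small, hypothetical example for illustration).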

    def extract_keywords(self, content: str) -> List[str]:
        """Extract keywords via simple substring matching."""
        keywords = []

        # Project-specific keywords (kept in the original Chinese)
        project_keywords = [
            '胡汉三千年', '嚈哒', 'Y音正统性', '圐圙', '北朝宇宙',
            '天地相通', '音韵表意', '纵横术', '三体', '文明对话'
        ]

        for keyword in project_keywords:
            if keyword in content:
                keywords.append(keyword)

        return keywords

    def suggest_english_name(self, file_path: Path, metadata: Dict) -> str:
        """Suggest an English file name based on content and path."""
        category = metadata.get('category', 'doc')

        # Hand-curated mappings for key documents
        name_mapping = {
            '嚈哒起源研究总结.md': 'yanda_origins_research_summary.md',
            'Y音正统性与地缘政治密码_完整理论框架.md': 'y_sound_orthodoxy_geopolitical_codes.md',
            '胡汉三千年.md': 'hu_han_three_thousand_years.md',
            '三体解读深度书评.md': 'three_body_analysis_review.md',
        }

        if file_path.name in name_mapping:
            return name_mapping[file_path.name]

        # Automatic fallback: keep only ASCII-safe characters. In Python 3,
        # \w also matches CJK characters, so an explicit ASCII class is needed
        # to actually strip Chinese; true transliteration would require a
        # dedicated library.
        base_name = file_path.stem
        english_name = re.sub(r'[^a-z0-9\-_.]', '_', base_name.lower())
        english_name = re.sub(r'_+', '_', english_name)

        return f"{category}_{english_name}{file_path.suffix}"

    def index_document(self, file_path: Path) -> str:
        """Index a single document."""
        doc_id = self.generate_doc_id(str(file_path))
        metadata = self.extract_metadata(file_path)

        # Suggest an English file name
        suggested_name = self.suggest_english_name(file_path, metadata)
        metadata['suggested_english_name'] = suggested_name

        self.documents[doc_id] = metadata
        return doc_id

    def index_all_documents(self):
        """Index all documents under the project root."""
        print("🔍 Indexing all documents...")

        # File types to index
        file_extensions = ['.md', '.rst', '.txt', '.py']

        # Directories to exclude
        exclude_dirs = {'.git', '__pycache__', '.venv', 'sphinx-env', '_build', 'node_modules'}

        indexed_count = 0

        for file_path in self.project_root.rglob('*'):
            # Skip directories
            if file_path.is_dir():
                continue

            # Skip anything inside an excluded directory
            if any(exclude_dir in file_path.parts for exclude_dir in exclude_dirs):
                continue

            # Only process the configured file types
            if file_path.suffix not in file_extensions:
                continue

            try:
                doc_id = self.index_document(file_path)
                print(f"✅ Indexed: {file_path.name} -> {doc_id}")
                indexed_count += 1
            except Exception as e:
                print(f"❌ Failed to index: {file_path.name} - {e}")

        self.save_index()
        print(f"🎉 Indexing complete! {indexed_count} documents indexed")
        return indexed_count

    def search_documents(self, query: str) -> List[Dict]:
        """Search documents with simple weighted substring scoring."""
        results = []
        query_lower = query.lower()

        for doc_id, metadata in self.documents.items():
            score = 0

            # Title match
            if metadata.get('title') and query_lower in metadata['title'].lower():
                score += 10

            # File name match
            if query_lower in metadata['file_name'].lower():
                score += 5

            # Keyword match
            if metadata.get('keywords'):
                for keyword in metadata['keywords']:
                    if query_lower in keyword.lower():
                        score += 3

            # Category match
            if query_lower in metadata.get('category', '').lower():
                score += 2

            if score > 0:
                result = metadata.copy()
                result['doc_id'] = doc_id
                result['score'] = score
                results.append(result)

        # Sort by descending score
        results.sort(key=lambda x: x['score'], reverse=True)
        return results
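
    # Scoring example (hypothetical index contents): for the query "三体",
    # a document titled "三体解读深度书评" with file name "三体解读深度书评.md"
    # and keyword '三体' would score 10 + 5 + 3 = 18.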

    def generate_rename_script(self) -> str:
        """Generate a shell script that applies the suggested English names."""
        script_lines = ['#!/bin/bash', '', '# Document rename script (run from the project root)', '']

        for doc_id, metadata in self.documents.items():
            current_path = metadata['file_path']
            suggested_name = metadata.get('suggested_english_name')

            if suggested_name and suggested_name != metadata['file_name']:
                # Emit one rename command per document
                new_path = str(Path(current_path).parent / suggested_name)
                script_lines.append(f'# {metadata["file_name"]} -> {suggested_name}')
                script_lines.append(f'mv "{current_path}" "{new_path}"')
                script_lines.append('')

        return '\n'.join(script_lines)
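
    # The emitted script looks roughly like this (illustrative paths only):
    #   # 三体解读深度书评.md -> three_body_analysis_review.md
    #   mv "literary-works/三体解读深度书评.md" "literary-works/three_body_analysis_review.md"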

    def export_index_report(self) -> str:
        """Export the index report as a JSON string."""
        report = {
            'summary': {
                'total_documents': len(self.documents),
                'categories': {},
                'file_types': {},
                'generated_at': datetime.now().isoformat()
            },
            'documents': self.documents
        }

        # Tally categories and file types
        for metadata in self.documents.values():
            category = metadata.get('category', 'unknown')
            report['summary']['categories'][category] = report['summary']['categories'].get(category, 0) + 1

            file_ext = metadata.get('file_extension', 'unknown')
            report['summary']['file_types'][file_ext] = report['summary']['file_types'].get(file_ext, 0) + 1

        return json.dumps(report, ensure_ascii=False, indent=2)


def main():
    """Entry point."""
    project_root = "."
    indexer = DocumentIndexer(project_root)

    print("Document Indexing System for the Hu-Han Three Thousand Years Project")
    print("=" * 40)

    # Index all documents
    indexer.index_all_documents()

    # Generate the report
    report = indexer.export_index_report()
    with open('document_index_report.json', 'w', encoding='utf-8') as f:
        f.write(report)

    # Generate the rename script
    rename_script = indexer.generate_rename_script()
    with open('rename_documents.sh', 'w', encoding='utf-8') as f:
        f.write(rename_script)

    print("\n📊 Generated files:")
    print("- document_index.json: document index")
    print("- document_index_report.json: detailed report")
    print("- rename_documents.sh: rename script")

    # Demonstrate the search feature
    print("\n🔍 Search demo:")
    for query in ['嚈哒', '三体', 'Y音', '理论']:
        results = indexer.search_documents(query)
        print(f"Search '{query}': {len(results)} results found")
        for result in results[:2]:  # show only the top 2 results
            print(f"  - {result['file_name']} (score: {result['score']})")


if __name__ == "__main__":
    main()