305 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			305 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
#!/usr/bin/env python3
 | 
						|
"""
 | 
						|
胡汉三千年项目文档入库系统
 | 
						|
Document Indexing System for Hu-Han Three Thousand Years Project
 | 
						|
"""
 | 
						|
 | 
						|
import os
 | 
						|
import json
 | 
						|
import hashlib
 | 
						|
import re
 | 
						|
from datetime import datetime
 | 
						|
from pathlib import Path
 | 
						|
from typing import Dict, List, Optional
 | 
						|
import yaml
 | 
						|
 | 
						|
class DocumentIndexer:
    """Index project documents for the Hu-Han Three Thousand Years project.

    Walks the project tree, extracts per-file metadata and content
    statistics, suggests English file names for Chinese-named documents,
    and persists everything as UTF-8 JSON under ``project_root``.
    """

    # Directory-name marker -> (category, Chinese label). First match wins.
    _CATEGORY_MAP = [
        ('core-theory', ('core_theory', '核心理论')),
        ('historical-research', ('historical_research', '历史研究')),
        ('academic-papers', ('academic_papers', '学术论文')),
        ('literary-works', ('literary_works', '文学创作')),
    ]

    # Hand-curated English names for key documents.
    _NAME_MAPPING = {
        '嚈哒起源研究总结.md': 'yanda_origins_research_summary.md',
        'Y音正统性与地缘政治密码_完整理论框架.md': 'y_sound_orthodoxy_geopolitical_codes.md',
        '胡汉三千年.md': 'hu_han_three_thousand_years.md',
        '三体解读深度书评.md': 'three_body_analysis_review.md',
    }

    # Project-specific terms matched verbatim inside document bodies.
    _PROJECT_KEYWORDS = [
        '胡汉三千年', '嚈哒', 'Y音正统性', '圐圙', '北朝宇宙',
        '天地相通', '音韵表意', '纵横术', '三体', '文明对话'
    ]

    def __init__(self, project_root: str, index_file: str = "document_index.json"):
        """Create an indexer rooted at *project_root*.

        Args:
            project_root: Directory under which documents live.
            index_file: Name of the JSON index file, relative to the root.
        """
        self.project_root = Path(project_root)
        self.index_file = self.project_root / index_file
        self.documents: Dict[str, Dict] = {}
        self.load_index()

    def load_index(self):
        """Load an existing index from disk, if one is present."""
        if self.index_file.exists():
            with open(self.index_file, 'r', encoding='utf-8') as f:
                self.documents = json.load(f)

    def save_index(self):
        """Persist the in-memory index as pretty-printed UTF-8 JSON."""
        with open(self.index_file, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, ensure_ascii=False, indent=2)

    def generate_doc_id(self, file_path: str) -> str:
        """Return a stable 12-hex-char document ID derived from the path.

        MD5 is used purely as a cheap fingerprint here, not for security.
        """
        return hashlib.md5(file_path.encode('utf-8')).hexdigest()[:12]

    def extract_metadata(self, file_path: Path) -> Dict:
        """Collect filesystem and content metadata for *file_path*.

        Raises ValueError if *file_path* is not under ``project_root``
        (from ``relative_to``); content read errors are recorded in the
        metadata instead of propagating.
        """
        # One stat() call instead of three separate ones.
        stat = file_path.stat()
        metadata = {
            'file_path': str(file_path.relative_to(self.project_root)),
            'file_name': file_path.name,
            'file_size': stat.st_size,
            # NOTE(review): st_ctime is inode-change time on Unix, not
            # creation time — acceptable here as an approximation.
            'created_time': datetime.fromtimestamp(stat.st_ctime).isoformat(),
            'modified_time': datetime.fromtimestamp(stat.st_mtime).isoformat(),
            'file_extension': file_path.suffix,
        }

        # Infer the category from well-known directory names in the path.
        path_parts = file_path.parts
        for marker, (category, category_zh) in self._CATEGORY_MAP:
            if marker in path_parts:
                metadata['category'] = category
                metadata['category_zh'] = category_zh
                break
        else:
            metadata['category'] = 'other'
            metadata['category_zh'] = '其他'

        # Pull content-derived fields for plain-text document types.
        if file_path.suffix in ('.md', '.rst', '.txt'):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    metadata.update(self.analyze_content(f.read()))
            except Exception as e:
                # Best effort: record the failure instead of aborting.
                metadata['content_error'] = str(e)

        return metadata

    def analyze_content(self, content: str) -> Dict:
        """Derive title, size statistics, keywords and language flags."""
        lines = content.split('\n')

        # Title: first ATX level-1 heading within the first 10 lines.
        title = None
        for line in lines[:10]:
            if line.startswith('# '):
                title = line[2:].strip()
                break

        return {
            'title': title,
            # Character count, not whitespace-delimited words — suits
            # Chinese text where words are not space-separated.
            'word_count': len(content),
            'line_count': len(lines),
            'keywords': self.extract_keywords(content),
            'has_chinese': bool(re.search(r'[\u4e00-\u9fff]', content)),
            'has_english': bool(re.search(r'[a-zA-Z]', content)),
        }

    def extract_keywords(self, content: str) -> List[str]:
        """Return the project keywords that occur verbatim in *content*."""
        return [kw for kw in self._PROJECT_KEYWORDS if kw in content]

    def suggest_english_name(self, file_path: Path, metadata: Dict) -> str:
        """Suggest an English file name for *file_path*.

        Uses the hand-curated mapping when available, otherwise builds
        ``<category>_<slug><suffix>`` from the file stem.
        """
        if file_path.name in self._NAME_MAPPING:
            return self._NAME_MAPPING[file_path.name]

        category = metadata.get('category', 'doc')
        # NOTE(review): \w matches CJK characters in Python 3, so Chinese
        # stems pass through unchanged here — confirm whether ASCII-only
        # slugs are actually required.
        english_name = re.sub(r'[^\w\-_.]', '_', file_path.stem.lower())
        english_name = re.sub(r'_+', '_', english_name)
        return f"{category}_{english_name}{file_path.suffix}"

    def index_document(self, file_path: Path) -> str:
        """Index a single document and return its document ID."""
        doc_id = self.generate_doc_id(str(file_path))
        metadata = self.extract_metadata(file_path)
        metadata['suggested_english_name'] = self.suggest_english_name(file_path, metadata)
        self.documents[doc_id] = metadata
        return doc_id

    def index_all_documents(self):
        """Index every eligible file under the project root.

        Returns the number of files successfully indexed; the index is
        saved to disk before returning.
        """
        print("🔍 开始索引所有文档...")

        # File types to index and directories to skip entirely.
        file_extensions = {'.md', '.rst', '.txt', '.py'}
        exclude_dirs = {'.git', '__pycache__', '.venv', 'sphinx-env', '_build', 'node_modules'}

        indexed_count = 0
        for file_path in self.project_root.rglob('*'):
            if file_path.is_dir():
                continue
            # Skip anything inside an excluded directory.
            if any(part in exclude_dirs for part in file_path.parts):
                continue
            if file_path.suffix not in file_extensions:
                continue

            try:
                doc_id = self.index_document(file_path)
                print(f"✅ 已索引: {file_path.name} -> {doc_id}")
                indexed_count += 1
            except Exception as e:
                # Keep going: a single bad file must not stop the run.
                print(f"❌ 索引失败: {file_path.name} - {e}")

        self.save_index()
        print(f"🎉 索引完成!共索引 {indexed_count} 个文档")
        return indexed_count

    def search_documents(self, query: str) -> List[Dict]:
        """Score every indexed document against *query* (case-insensitive).

        Scoring: title match +10, file-name match +5, each matching
        keyword +3, category match +2. Returns hits sorted by score,
        highest first; each hit carries ``doc_id`` and ``score``.
        """
        results = []
        query_lower = query.lower()

        for doc_id, metadata in self.documents.items():
            score = 0
            if metadata.get('title') and query_lower in metadata['title'].lower():
                score += 10
            if query_lower in metadata['file_name'].lower():
                score += 5
            for keyword in metadata.get('keywords') or []:
                if query_lower in keyword.lower():
                    score += 3
            if query_lower in metadata.get('category', '').lower():
                score += 2

            if score > 0:
                result = metadata.copy()
                result['doc_id'] = doc_id
                result['score'] = score
                results.append(result)

        results.sort(key=lambda x: x['score'], reverse=True)
        return results

    def generate_rename_script(self) -> str:
        """Build a bash script that renames files to their English names."""
        script_lines = ['#!/bin/bash', '', '# 文档重命名脚本', '']

        for doc_id, metadata in self.documents.items():
            current_path = metadata['file_path']
            suggested_name = metadata.get('suggested_english_name')

            # Only emit a mv when the suggestion actually differs.
            if suggested_name and suggested_name != metadata['file_name']:
                new_path = str(Path(current_path).parent / suggested_name)
                script_lines.append(f'# {metadata["file_name"]} -> {suggested_name}')
                script_lines.append(f'mv "{current_path}" "{new_path}"')
                script_lines.append('')

        return '\n'.join(script_lines)

    def export_index_report(self) -> str:
        """Return a JSON report: per-category/per-type counts plus the index."""
        report = {
            'summary': {
                'total_documents': len(self.documents),
                'categories': {},
                'file_types': {},
                'generated_at': datetime.now().isoformat()
            },
            'documents': self.documents
        }

        # Tally documents by category and by file extension.
        categories = report['summary']['categories']
        file_types = report['summary']['file_types']
        for metadata in self.documents.values():
            category = metadata.get('category', 'unknown')
            categories[category] = categories.get(category, 0) + 1
            file_ext = metadata.get('file_extension', 'unknown')
            file_types[file_ext] = file_types.get(file_ext, 0) + 1

        return json.dumps(report, ensure_ascii=False, indent=2)
 | 
						|
def main():
    """Entry point: index the project, emit reports, and demo search."""
    indexer = DocumentIndexer(".")

    print("胡汉三千年项目文档入库系统")
    print("=" * 40)

    # Build (or rebuild) the full document index on disk.
    indexer.index_all_documents()

    # Persist the detailed JSON report and the rename helper script.
    Path('document_index_report.json').write_text(
        indexer.export_index_report(), encoding='utf-8')
    Path('rename_documents.sh').write_text(
        indexer.generate_rename_script(), encoding='utf-8')

    for line in (
        "\n📊 生成的文件:",
        "- document_index.json: 文档索引",
        "- document_index_report.json: 详细报告",
        "- rename_documents.sh: 重命名脚本",
    ):
        print(line)

    # Quick demonstration of the search API on a few project terms.
    print("\n🔍 搜索演示:")
    for query in ('嚈哒', '三体', 'Y音', '理论'):
        hits = indexer.search_documents(query)
        print(f"搜索 '{query}': 找到 {len(hits)} 个结果")
        for hit in hits[:2]:  # show at most the top two hits
            print(f"  - {hit['file_name']} (分数: {hit['score']})")


if __name__ == "__main__":
    main()