Major finding: verification of the universality of sky worship and jade worship across global civilizations completed

- Verified that civilizations worldwide exhibit sky-worship and jade-worship patterns
- Coverage spans Asia, Europe, Africa, the Americas, Oceania, and the Middle East
- Confirmed the global distribution of the K-sound civilization transmission network
- Refined the theoretical framework of the Kunlun-38 vocabulary system
- Updated the archaeological evidence on jade mining in Tanzania and mercury in Egyptian pharaohs' tombs
- Major breakthrough for the theory of a common origin of global civilizations
ben
2025-10-30 13:48:03 +00:00
parent 6b9c762367
commit 2a19a79695
119 changed files with 6319 additions and 875 deletions


@@ -0,0 +1,288 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
胡汉三千年项目文档索引工具
功能:
1. 自动扫描文档目录
2. 生成文档索引
3. 更新统一索引文件
4. 检测文档变更
作者:胡汉三千年项目团队
版本1.0.0
"""
import os
import json
import hashlib
import datetime
from pathlib import Path


class DocumentIndexer:
    def __init__(self, base_path="/home/ben/code/huhan3000/unified-docs"):
        self.base_path = Path(base_path)
        self.index_file = self.base_path / "unified-index.json"
        self.categories = [
            "01-core-theory",
            "02-thematic-research",
            "03-historical-analysis",
            "04-methodology",
            "05-applications",
            "06-resources"
        ]

    def scan_documents(self):
        """Scan all document directories and collect document information."""
        documents = {}
        for category in self.categories:
            category_path = self.base_path / category
            if not category_path.exists():
                continue
            documents[category] = []
            # Scan Markdown files
            for md_file in category_path.rglob("*.md"):
                if md_file.name == "README.md":
                    continue
                doc_info = self._get_document_info(md_file, category)
                documents[category].append(doc_info)
        return documents

    def _get_document_info(self, file_path, category):
        """Collect detailed information about a single document."""
        stat = file_path.stat()
        # Compute the content hash of the file
        file_hash = self._calculate_file_hash(file_path)
        # Read the file content to extract basic information
        title = file_path.stem
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Try to extract a title from the content (first "# " heading)
        lines = content.split('\n')
        for line in lines:
            if line.startswith('# '):
                title = line[2:].strip()
                break
        return {
            "title": title,
            "filename": file_path.name,
            "path": str(file_path.relative_to(self.base_path)),
            "category": category,
            "size": stat.st_size,
            "modified": datetime.datetime.fromtimestamp(stat.st_mtime).isoformat(),
            "hash": file_hash,
            "word_count": len(content.split())
        }

    def _calculate_file_hash(self, file_path):
        """Compute an MD5 hash of the file content (used for change detection)."""
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def generate_index(self, documents):
        """Build the index data structure."""
        index_data = {
            "metadata": {
                "generated_at": datetime.datetime.now().isoformat(),
                "total_documents": sum(len(docs) for docs in documents.values()),
                "tool_version": "1.0.0"
            },
            "categories": {},
            "documents": documents,
            "statistics": self._calculate_statistics(documents)
        }
        # Per-category statistics
        for category, docs in documents.items():
            index_data["categories"][category] = {
                "count": len(docs),
                "total_size": sum(doc["size"] for doc in docs),
                "last_modified": max(doc["modified"] for doc in docs) if docs else None
            }
        return index_data

    def _calculate_statistics(self, documents):
        """Compute aggregate statistics over all documents."""
        all_docs = []
        for docs in documents.values():
            all_docs.extend(docs)
        if not all_docs:
            return {
                "total_documents": 0,
                "total_size_bytes": 0,
                "total_size_mb": 0,
                "total_words": 0,
                "average_words_per_doc": 0,
                "last_modified": None
            }
        total_size = sum(doc["size"] for doc in all_docs)
        total_words = sum(doc["word_count"] for doc in all_docs)
        return {
            "total_documents": len(all_docs),
            "total_size_bytes": total_size,
            "total_size_mb": round(total_size / (1024 * 1024), 2),
            "total_words": total_words,
            "average_words_per_doc": round(total_words / len(all_docs), 2),
            "last_modified": max(doc["modified"] for doc in all_docs)
        }

    def save_index(self, index_data):
        """Write the index to the JSON index file."""
        with open(self.index_file, 'w', encoding='utf-8') as f:
            json.dump(index_data, f, ensure_ascii=False, indent=2)

    def update_markdown_index(self, index_data):
        """Update the Markdown version of the index file."""
        md_index_file = self.base_path / "unified-index.md"
        # Read the existing Markdown index, if there is one
        if md_index_file.exists():
            with open(md_index_file, 'r', encoding='utf-8') as f:
                content = f.read()
        else:
            content = ""
        # Generate the new index content
        new_content = self._generate_markdown_index(index_data)
        # Update the document-migration-status section
        updated_content = self._update_migration_status(content, new_content, index_data)
        with open(md_index_file, 'w', encoding='utf-8') as f:
            f.write(updated_content)

    def _generate_markdown_index(self, index_data):
        """Generate the Markdown-formatted index content."""
        lines = []
        # Summary statistics
        stats = index_data["statistics"]
        lines.append("## 文档统计信息\n")
        lines.append(f"- **总文档数**: {stats['total_documents']}")
        lines.append(f"- **总大小**: {stats['total_size_mb']} MB")
        lines.append(f"- **总字数**: {stats['total_words']:,}")
        lines.append(f"- **平均每文档字数**: {stats['average_words_per_doc']}")
        lines.append(f"- **最后更新时间**: {stats['last_modified']}\n")
        # List documents by category
        for category, docs in index_data["documents"].items():
            if docs:
                lines.append(f"\n### {category.replace('-', ' ').title()}\n")
                for doc in sorted(docs, key=lambda x: x["title"]):
                    lines.append(f"- **{doc['title']}** - `{doc['filename']}` ")
                    lines.append(f"  - 大小: {round(doc['size']/1024, 1)} KB")
                    lines.append(f"  - 字数: {doc['word_count']}")
                    lines.append(f"  - 修改: {doc['modified'][:10]}")
        return '\n'.join(lines)

    def _update_migration_status(self, old_content, new_index_content, index_data):
        """Update the document-migration-status section of the Markdown index."""
        # Locate the migration-status section
        migration_start = old_content.find("## 文档迁移状态")
        if migration_start == -1:
            # Section not found: insert it at a suitable position
            insert_pos = old_content.find("## 更新日志")
            if insert_pos == -1:
                insert_pos = len(old_content)
            migration_content = self._generate_migration_status(index_data)
            updated_content = old_content[:insert_pos] + "\n" + migration_content + "\n" + old_content[insert_pos:]
        else:
            # Replace the existing migration-status section. Search for the next
            # top-level heading ("\n## ") so that the "###" subsections generated
            # by _generate_migration_status do not end the section early.
            migration_end = old_content.find("\n## ", migration_start + 1)
            if migration_end == -1:
                migration_end = len(old_content)
            migration_content = self._generate_migration_status(index_data)
            updated_content = old_content[:migration_start] + migration_content + old_content[migration_end:]
        # Update the statistics/index section in the same way
        index_start = updated_content.find("## 文档统计信息")
        if index_start != -1:
            index_end = updated_content.find("\n## ", index_start + 1)
            if index_end == -1:
                index_end = len(updated_content)
            updated_content = updated_content[:index_start] + new_index_content + updated_content[index_end:]
        return updated_content

    def _generate_migration_status(self, index_data):
        """Generate the document-migration-status content."""
        lines = []
        lines.append("## 文档迁移状态\n")
        stats = index_data["statistics"]
        total_migrated = stats["total_documents"]
        # Estimated document counts for core-docs and thematic-research
        estimated_core_docs = 399  # based on earlier counts
        estimated_thematic = 142  # based on earlier counts
        total_estimated = estimated_core_docs + estimated_thematic
        migration_percentage = (total_migrated / total_estimated * 100) if total_estimated > 0 else 0
        lines.append(f"### 迁移进度: {migration_percentage:.1f}%\n")
        lines.append(f"- **已迁移文档**: {total_migrated}")
        lines.append(f"- **预计总文档**: {total_estimated}")
        lines.append(f"- **剩余文档**: {total_estimated - total_migrated}\n")
        lines.append("### 按类别迁移情况\n")
        for category, info in index_data["categories"].items():
            lines.append(f"- **{category.replace('-', ' ').title()}**: {info['count']} 个文档")
        return '\n'.join(lines)

    def run(self):
        """Run the indexing tool end to end."""
        print("=== 胡汉三千年项目文档索引工具 ===")
        print(f"扫描目录: {self.base_path}")
        # Scan documents
        print("正在扫描文档...")
        documents = self.scan_documents()
        # Generate the index
        print("正在生成索引...")
        index_data = self.generate_index(documents)
        # Save the JSON index
        print("正在保存索引文件...")
        self.save_index(index_data)
        # Update the Markdown index
        print("正在更新Markdown索引...")
        self.update_markdown_index(index_data)
        # Print summary statistics
        stats = index_data["statistics"]
        print("\n=== 索引完成 ===")
        print(f"处理文档数: {stats['total_documents']}")
        print(f"总大小: {stats['total_size_mb']} MB")
        print(f"索引文件: {self.index_file}")
        print(f"生成时间: {index_data['metadata']['generated_at']}")


def main():
    """Entry point."""
    indexer = DocumentIndexer()
    indexer.run()


if __name__ == "__main__":
    main()
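
A minimal usage sketch, assuming the script above is saved as document_indexer.py; both that filename and the alternative base_path below are illustrative, not taken from the commit:

    # Index the default tree baked into DocumentIndexer:
    #   python3 document_indexer.py
    # Or drive it from another script with a custom documentation root:
    from document_indexer import DocumentIndexer

    indexer = DocumentIndexer(base_path="/path/to/unified-docs")  # illustrative path
    indexer.run()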