huhan3000/unified-docs/tools/doc-indexer.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
胡汉三千年项目文档索引工具
功能:
1. 自动扫描文档目录
2. 生成文档索引
3. 更新统一索引文件
4. 检测文档变更
作者:胡汉三千年项目团队
版本1.0.0
"""
import os
import json
import hashlib
import datetime
from pathlib import Path


class DocumentIndexer:
    def __init__(self, base_path="/home/ben/code/huhan3000/unified-docs"):
        self.base_path = Path(base_path)
        self.index_file = self.base_path / "unified-index.json"
        self.categories = [
            "01-core-theory",
            "02-thematic-research",
            "03-historical-analysis",
            "04-methodology",
            "05-applications",
            "06-resources"
        ]

    def scan_documents(self):
        """Scan all document directories and collect document information."""
        documents = {}
        for category in self.categories:
            category_path = self.base_path / category
            if not category_path.exists():
                continue
            documents[category] = []
            # Scan Markdown files; README.md files are skipped as index pages
            for md_file in category_path.rglob("*.md"):
                if md_file.name == "README.md":
                    continue
                doc_info = self._get_document_info(md_file, category)
                documents[category].append(doc_info)
        return documents

    def _get_document_info(self, file_path, category):
        """Collect detailed information for a single document."""
        stat = file_path.stat()
        # Hash the file contents (used later for change detection)
        file_hash = self._calculate_file_hash(file_path)
        # Read the file contents to extract basic information
        title = file_path.stem
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Try to extract the title from the first level-1 heading
        lines = content.split('\n')
        for line in lines:
            if line.startswith('# '):
                title = line[2:].strip()
                break
        return {
            "title": title,
            "filename": file_path.name,
            "path": str(file_path.relative_to(self.base_path)),
            "category": category,
            "size": stat.st_size,
            "modified": datetime.datetime.fromtimestamp(stat.st_mtime).isoformat(),
            "hash": file_hash,
            "word_count": len(content.split())
        }
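
    # Illustrative example of a returned record (values are hypothetical):
    # {
    #     "title": "Some Document",
    #     "filename": "some-document.md",
    #     "path": "01-core-theory/some-document.md",
    #     "category": "01-core-theory",
    #     "size": 2048,
    #     "modified": "2024-01-01T00:00:00",
    #     "hash": "<md5 hex digest>",
    #     "word_count": 300
    # }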

    def _calculate_file_hash(self, file_path):
        """Compute an MD5 hash of the file contents (for change detection only)."""
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
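
    # On Python 3.11+, hashlib.file_digest computes the same digest more
    # concisely; a minimal alternative sketch (kept as a comment so the
    # script still runs on older interpreters):
    #
    #   with open(file_path, "rb") as f:
    #       return hashlib.file_digest(f, "md5").hexdigest()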

    def generate_index(self, documents):
        """Build the index data structure."""
        index_data = {
            "metadata": {
                "generated_at": datetime.datetime.now().isoformat(),
                "total_documents": sum(len(docs) for docs in documents.values()),
                "tool_version": "1.0.0"
            },
            "categories": {},
            "documents": documents,
            "statistics": self._calculate_statistics(documents)
        }
        # Per-category totals
        for category, docs in documents.items():
            index_data["categories"][category] = {
                "count": len(docs),
                "total_size": sum(doc["size"] for doc in docs),
                "last_modified": max(doc["modified"] for doc in docs) if docs else None
            }
        return index_data

    def _calculate_statistics(self, documents):
        """Compute aggregate document statistics."""
        all_docs = []
        for docs in documents.values():
            all_docs.extend(docs)
        if not all_docs:
            return {
                "total_documents": 0,
                "total_size_bytes": 0,
                "total_size_mb": 0,
                "total_words": 0,
                "average_words_per_doc": 0,
                "last_modified": None
            }
        total_size = sum(doc["size"] for doc in all_docs)
        total_words = sum(doc["word_count"] for doc in all_docs)
        return {
            "total_documents": len(all_docs),
            "total_size_bytes": total_size,
            "total_size_mb": round(total_size / (1024 * 1024), 2),
            "total_words": total_words,
            "average_words_per_doc": round(total_words / len(all_docs), 2),
            "last_modified": max(doc["modified"] for doc in all_docs)
        }

    def save_index(self, index_data):
        """Write the index to the JSON index file."""
        with open(self.index_file, 'w', encoding='utf-8') as f:
            json.dump(index_data, f, ensure_ascii=False, indent=2)

    def update_markdown_index(self, index_data):
        """Update the Markdown-format index file."""
        md_index_file = self.base_path / "unified-index.md"
        # Read the existing Markdown index, if any
        if md_index_file.exists():
            with open(md_index_file, 'r', encoding='utf-8') as f:
                content = f.read()
        else:
            content = ""
        # Generate the new index content
        new_content = self._generate_markdown_index(index_data)
        # Update the document migration status section
        updated_content = self._update_migration_status(content, new_content, index_data)
        with open(md_index_file, 'w', encoding='utf-8') as f:
            f.write(updated_content)

    def _generate_markdown_index(self, index_data):
        """Generate the Markdown-format index content."""
        lines = []
        # Summary statistics
        stats = index_data["statistics"]
        lines.append("## Document Statistics\n")
        lines.append(f"- **Total documents**: {stats['total_documents']}")
        lines.append(f"- **Total size**: {stats['total_size_mb']} MB")
        lines.append(f"- **Total words**: {stats['total_words']:,}")
        lines.append(f"- **Average words per document**: {stats['average_words_per_doc']}")
        lines.append(f"- **Last updated**: {stats['last_modified']}\n")
        # List documents by category
        for category, docs in index_data["documents"].items():
            if docs:
                lines.append(f"\n### {category.replace('-', ' ').title()}\n")
                for doc in sorted(docs, key=lambda x: x["title"]):
                    lines.append(f"- **{doc['title']}** - `{doc['filename']}` ")
                    lines.append(f"  - Size: {round(doc['size']/1024, 1)} KB")
                    lines.append(f"  - Words: {doc['word_count']}")
                    lines.append(f"  - Modified: {doc['modified'][:10]}")
        return '\n'.join(lines)
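
    # Illustrative fragment of the generated Markdown (values hypothetical):
    #
    #   ## Document Statistics
    #
    #   - **Total documents**: 42
    #   ...
    #
    #   ### 01 Core Theory
    #
    #   - **Some Document** - `some-document.md`
    #     - Size: 2.0 KB
    #     - Words: 300
    #     - Modified: 2024-01-01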

    def _update_migration_status(self, old_content, new_index_content, index_data):
        """Update the document migration status section."""
        migration_content = self._generate_migration_status(index_data)
        # Locate the migration status section
        migration_start = old_content.find("## Document Migration Status")
        if migration_start == -1:
            # Not found: insert it before the changelog, or at the end
            insert_pos = old_content.find("## Changelog")
            if insert_pos == -1:
                insert_pos = len(old_content)
            updated_content = old_content[:insert_pos] + "\n" + migration_content + "\n" + old_content[insert_pos:]
        else:
            # Replace the existing migration status section; search for
            # "\n## " so "###" subheadings inside the section are skipped
            migration_end = old_content.find("\n## ", migration_start + 1)
            if migration_end == -1:
                migration_end = len(old_content)
            updated_content = old_content[:migration_start] + migration_content + old_content[migration_end:]
        # Insert or replace the index content section
        index_start = updated_content.find("## Document Statistics")
        if index_start != -1:
            index_end = updated_content.find("\n## ", index_start + 1)
            if index_end == -1:
                index_end = len(updated_content)
            updated_content = updated_content[:index_start] + new_index_content + updated_content[index_end:]
        else:
            # No existing statistics section yet (e.g. on the first run)
            updated_content += "\n" + new_index_content + "\n"
        return updated_content

    def _generate_migration_status(self, index_data):
        """Generate the document migration status content."""
        lines = []
        lines.append("## Document Migration Status\n")
        stats = index_data["statistics"]
        total_migrated = stats["total_documents"]
        # Estimated document counts in core-docs and thematic-research
        estimated_core_docs = 399  # based on an earlier count
        estimated_thematic = 142  # based on an earlier count
        total_estimated = estimated_core_docs + estimated_thematic
        migration_percentage = (total_migrated / total_estimated * 100) if total_estimated > 0 else 0
        lines.append(f"### Migration progress: {migration_percentage:.1f}%\n")
        lines.append(f"- **Migrated documents**: {total_migrated}")
        lines.append(f"- **Estimated total documents**: {total_estimated}")
        lines.append(f"- **Remaining documents**: {total_estimated - total_migrated}\n")
        lines.append("### Migration by Category\n")
        for category, info in index_data["categories"].items():
            lines.append(f"- **{category.replace('-', ' ').title()}**: {info['count']} documents")
        return '\n'.join(lines)

    def run(self):
        """Run the indexing tool."""
        print("=== huhan3000 Document Indexer ===")
        print(f"Scanning directory: {self.base_path}")
        # Scan the documents
        print("Scanning documents...")
        documents = self.scan_documents()
        # Generate the index
        print("Generating index...")
        index_data = self.generate_index(documents)
        # Save the JSON index
        print("Saving index file...")
        self.save_index(index_data)
        # Update the Markdown index
        print("Updating Markdown index...")
        self.update_markdown_index(index_data)
        # Print summary statistics
        stats = index_data["statistics"]
        print("\n=== Indexing complete ===")
        print(f"Documents processed: {stats['total_documents']}")
        print(f"Total size: {stats['total_size_mb']} MB")
        print(f"Index file: {self.index_file}")
        print(f"Generated at: {index_data['metadata']['generated_at']}")


def main():
    """Entry point."""
    indexer = DocumentIndexer()
    indexer.run()


if __name__ == "__main__":
    main()
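
# Example: consuming the generated index from another script (a minimal
# sketch; the filename matches DocumentIndexer.index_file above):
#
#   import json
#   with open("unified-index.json", encoding="utf-8") as f:
#       index = json.load(f)
#   print(index["statistics"]["total_documents"])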