# huhan3000/unified-docs/tools/doc-migrator.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
胡汉三千年项目文档迁移工具

功能：
1. 从core-docs和thematic-research迁移文档到统一文档系统
2. 自动分类和组织文档
3. 保持文档结构和元数据
4. 生成迁移报告

作者：胡汉三千年项目团队
版本：1.0.0
"""

import os
import json
import shutil
import hashlib
from pathlib import Path
import datetime

class DocumentMigrator:
    """Copy documents from ``core-docs`` and ``thematic-research`` into ``unified-docs``.

    Each scanned file is assigned a target category (first by keywords in its
    path inside the source tree, then by keywords in its text content), copied
    into ``unified-docs/<category>/`` and accompanied by a
    ``<name>.metadata.json`` sidecar recording where it came from.  Counters,
    errors and per-file details are accumulated in ``self.migration_report``.
    """

    # Default project root, used when no ``base_path`` is given.
    DEFAULT_BASE_PATH = Path("/home/ben/code/huhan3000")
    # Only files with these suffixes are scanned and migrated.
    MIGRATABLE_SUFFIXES = (".md", ".txt", ".py", ".json")
    # Files with these suffixes are also read as text for content classification.
    TEXT_SUFFIXES = (".md", ".txt")

    def __init__(self, base_path=None):
        """Set up paths, classification rules and an empty migration report.

        Args:
            base_path: Project root containing ``core-docs``,
                ``thematic-research`` and ``unified-docs``.  Defaults to
                ``DEFAULT_BASE_PATH`` so existing callers are unaffected.
        """
        self.base_path = Path(base_path) if base_path is not None else self.DEFAULT_BASE_PATH
        self.unified_docs_path = self.base_path / "unified-docs"
        self.core_docs_path = self.base_path / "core-docs"
        self.thematic_research_path = self.base_path / "thematic-research"

        # Path keyword -> target category, per source tree.
        self.category_mapping = {
            # Rules for core-docs
            "core-docs": {
                "音韵考古学": "01-core-theory/01-phonological-archaeology",
                "文明传播模型": "01-core-theory/02-civilization-diffusion",
                "方法论体系": "01-core-theory/03-methodology",
                "学术成果": "01-core-theory/04-academic-achievements",
                "理论框架": "01-core-theory/05-theoretical-framework",
                "实证研究": "02-thematic-research/01-empirical-studies",
                "历史分析": "03-historical-analysis/01-historical-events",
                "文化比较": "04-cultural-comparison/01-cross-cultural",
                "技术实现": "05-technical-implementation/01-tools",
                "项目文档": "06-project-docs/01-management"
            },
            # Rules for thematic-research
            "thematic-research": {
                "civilization-studies": "02-thematic-research/02-civilization-studies",
                "phonological-studies": "02-thematic-research/03-phonological-studies",
                "commercial-studies": "02-thematic-research/04-commercial-studies",
                "historical-studies": "03-historical-analysis/02-historical-studies",
                "cultural-studies": "04-cultural-comparison/02-cultural-studies",
                "theory-studies": "01-core-theory/06-theory-studies",
                "methodology-studies": "01-core-theory/03-methodology",
                "empirical-studies": "02-thematic-research/01-empirical-studies",
                "comparative-studies": "04-cultural-comparison/03-comparative-studies"
            }
        }

        # File suffix -> file type label stored in the report.
        self.file_extensions = {
            ".md": "markdown",
            ".txt": "text",
            ".py": "python",
            ".json": "json",
            ".yaml": "yaml",
            ".yml": "yaml"
        }

        self.migration_report = {
            "metadata": {
                "migration_date": datetime.datetime.now().isoformat(),
                "tool_version": "1.0.0"
            },
            "statistics": {
                "total_files_scanned": 0,
                "total_files_migrated": 0,
                "total_files_skipped": 0,
                "total_errors": 0
            },
            "migration_details": {
                "core-docs": {"scanned": 0, "migrated": 0, "skipped": 0},
                "thematic-research": {"scanned": 0, "migrated": 0, "skipped": 0}
            },
            "errors": [],
            "migrated_files": []
        }

        # Target path -> source path chosen during this run.  Lets two
        # same-named sources be told apart even in dry-run mode, where
        # nothing is written to disk.
        self._claimed_targets = {}

    def _calculate_file_hash(self, file_path):
        """Return the hex MD5 digest of the file's bytes, read in 4 KiB chunks."""
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def _get_file_category(self, source_type, file_path, content=None):
        """Pick the target category for a file.

        Args:
            source_type: ``"core-docs"`` or ``"thematic-research"``; selects
                the keyword table in ``self.category_mapping``.
            file_path: Path whose text is searched for category keywords
                (case-insensitively).
            content: Optional file text used as a fallback when the path
                matches no keyword.

        Returns:
            The category sub-path relative to ``unified-docs``.  Falls back
            to a ``99-uncategorized`` folder when nothing matches.
        """
        path_text = str(file_path).lower()

        # Path-based rules take precedence.
        for keyword, target_category in self.category_mapping[source_type].items():
            if keyword.lower() in path_text:
                return target_category

        # Otherwise fall back to keywords found in the content, if given.
        if content:
            content_lower = content.lower()

            keyword_categories = {
                "音韵": "01-core-theory/01-phonological-archaeology",
                "文明": "01-core-theory/02-civilization-diffusion",
                "方法": "01-core-theory/03-methodology",
                "理论": "01-core-theory/05-theoretical-framework",
                "实证": "02-thematic-research/01-empirical-studies",
                "历史": "03-historical-analysis/01-historical-events",
                "文化": "04-cultural-comparison/01-cross-cultural",
                "技术": "05-technical-implementation/01-tools",
                "项目": "06-project-docs/01-management"
            }

            # First keyword (in table order) found in the content wins.
            for keyword, category in keyword_categories.items():
                if keyword in content_lower:
                    return category

        # Default category per source tree.
        if source_type == "core-docs":
            return "01-core-theory/99-uncategorized"
        return "02-thematic-research/99-uncategorized"

    def _ensure_directory(self, dir_path):
        """Create ``dir_path`` (and missing parents) if it does not exist."""
        dir_path.mkdir(parents=True, exist_ok=True)

    def _metadata_path(self, target_path):
        """Return the sidecar path ``<target name>.metadata.json`` for a target file."""
        return target_path.with_suffix(target_path.suffix + ".metadata.json")

    def _target_is_available(self, candidate, source_path):
        """Tell whether ``source_path`` may be written to ``candidate``.

        A candidate is available when it is unclaimed and absent, when it was
        produced from this same source (claimed in this run or recorded in its
        sidecar), or when it has no usable sidecar but identical content.
        """
        owner = self._claimed_targets.get(str(candidate))
        if owner is not None:
            return owner == str(source_path)

        if not candidate.exists():
            return True

        metadata_path = self._metadata_path(candidate)
        if metadata_path.is_file():
            try:
                with open(metadata_path, 'r', encoding='utf-8') as f:
                    recorded = json.load(f).get("original_source")
            except (OSError, ValueError, AttributeError):
                # Unreadable or malformed sidecar: fall through to the
                # content comparison below.
                recorded = None
            if recorded is not None:
                return recorded == str(source_path)

        # No ownership record: only reuse the name if overwriting is harmless.
        return (
            candidate.is_file()
            and self._calculate_file_hash(candidate) == self._calculate_file_hash(source_path)
        )

    def _resolve_target_path(self, source_path, target_dir):
        """Choose a target path in ``target_dir`` that will not clobber another document.

        The plain file name is preferred; on a collision with a different
        source, ``-1``, ``-2``, ... is appended to the stem until a free (or
        already-owned) name is found.  The choice is recorded so later files
        in the same run see it.
        """
        candidate = target_dir / source_path.name
        attempt = 0
        while not self._target_is_available(candidate, source_path):
            attempt += 1
            candidate = target_dir / f"{source_path.stem}-{attempt}{source_path.suffix}"
        self._claimed_targets[str(candidate)] = str(source_path)
        return candidate

    def _copy_file_with_metadata(self, source_path, target_path):
        """Copy a file with ``shutil.copy2`` and describe it.

        Returns:
            A dict with ``source_path``, ``target_path``, ``size``,
            ``modified_time``, ``hash`` and ``file_type``.

        Raises:
            Exception: If copying or inspecting the file fails; the original
                error is attached as ``__cause__``.
        """
        try:
            shutil.copy2(source_path, target_path)

            stat = source_path.stat()
            return {
                "source_path": str(source_path),
                "target_path": str(target_path),
                "size": stat.st_size,
                "modified_time": datetime.datetime.fromtimestamp(stat.st_mtime).isoformat(),
                "hash": self._calculate_file_hash(source_path),
                "file_type": self.file_extensions.get(source_path.suffix, "unknown")
            }
        except Exception as e:
            raise Exception(f"文件复制失败: {e}") from e

    def _create_migration_metadata(self, source_path, target_path, category):
        """Write the JSON sidecar recording a migrated file's origin and category."""
        metadata = {
            "original_source": str(source_path),
            "migration_date": datetime.datetime.now().isoformat(),
            "category": category,
            "tool_version": "1.0.0"
        }

        with open(self._metadata_path(target_path), 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

    def _migrate_source(self, source_type, source_root, dry_run):
        """Scan one source tree and migrate every eligible file.

        Shared implementation behind ``migrate_core_docs`` and
        ``migrate_thematic_research``.  A failure on one file is recorded in
        the report and does not stop the scan.

        Returns:
            List of file-info dicts for the files actually copied (empty in
            dry-run mode).
        """
        stats = self.migration_report["statistics"]
        details = self.migration_report["migration_details"][source_type]
        migrated_files = []

        # Sorted so that collision suffixes are assigned deterministically.
        for file_path in sorted(source_root.rglob("*")):
            if not (file_path.is_file() and file_path.suffix in self.MIGRATABLE_SUFFIXES):
                continue

            stats["total_files_scanned"] += 1
            details["scanned"] += 1

            try:
                # Content is only used for classification, so undecodable
                # bytes are replaced rather than failing the whole file.
                content = None
                if file_path.suffix in self.TEXT_SUFFIXES:
                    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                        content = f.read()

                # Classify on the path inside the source tree so that the
                # location of the checkout cannot influence the category.
                relative_path = file_path.relative_to(source_root)
                category = self._get_file_category(source_type, relative_path, content)

                target_dir = self.unified_docs_path / category
                target_path = self._resolve_target_path(file_path, target_dir)

                if dry_run:
                    # A dry run must not touch the file system.
                    print(f"[模拟] 将迁移: {file_path.name} -> {category}")
                    continue

                self._ensure_directory(target_dir)
                file_info = self._copy_file_with_metadata(file_path, target_path)
                self._create_migration_metadata(file_path, target_path, category)

                file_info["category"] = category
                migrated_files.append(file_info)
                self.migration_report["migrated_files"].append(file_info)

                stats["total_files_migrated"] += 1
                details["migrated"] += 1

                print(f"✓ 已迁移: {file_path.name} -> {category}")

            except Exception as e:
                error_msg = f"迁移失败 {file_path}: {e}"
                self.migration_report["errors"].append(error_msg)
                stats["total_errors"] += 1
                # Keep the global skipped total in step with the per-source one.
                stats["total_files_skipped"] += 1
                details["skipped"] += 1
                print(f"✗ {error_msg}")

        return migrated_files

    def migrate_core_docs(self, dry_run=False):
        """Migrate documents from ``core-docs``.

        Args:
            dry_run: When true, only print what would be migrated.

        Returns:
            List of file-info dicts for the migrated files.
        """
        print("开始迁移 core-docs 文档...")
        return self._migrate_source("core-docs", self.core_docs_path, dry_run)

    def migrate_thematic_research(self, dry_run=False):
        """Migrate documents from ``thematic-research``.

        Args:
            dry_run: When true, only print what would be migrated.

        Returns:
            List of file-info dicts for the migrated files.
        """
        print("开始迁移 thematic-research 文档...")
        return self._migrate_source("thematic-research", self.thematic_research_path, dry_run)

    def save_migration_report(self):
        """Write the report to ``unified-docs/migration-report.json`` and return its path."""
        # The directory may not exist yet when nothing was migrated.
        self._ensure_directory(self.unified_docs_path)
        report_path = self.unified_docs_path / "migration-report.json"

        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(self.migration_report, f, ensure_ascii=False, indent=2)

        print(f"迁移报告已保存到: {report_path}")
        return report_path

    def print_summary(self):
        """Print overall counters, per-source counters and any recorded errors."""
        stats = self.migration_report["statistics"]
        details = self.migration_report["migration_details"]

        print("\n=== 迁移摘要 ===")
        print(f"总扫描文件数: {stats['total_files_scanned']}")
        print(f"总迁移文件数: {stats['total_files_migrated']}")
        print(f"总跳过文件数: {stats['total_files_skipped']}")
        print(f"总错误数: {stats['total_errors']}")

        print("\n=== 详细统计 ===")
        for source_type, detail in details.items():
            print(f"{source_type}:")
            print(f"  扫描: {detail['scanned']}")
            print(f"  迁移: {detail['migrated']}")
            print(f"  跳过: {detail['skipped']}")

        if self.migration_report["errors"]:
            print("\n=== 错误列表 ===")
            for error in self.migration_report["errors"]:
                print(f"  - {error}")

def main():
    """Command-line entry point.

    Usage: ``doc-migrator.py <command> [--dry-run]`` where ``<command>`` is
    ``migrate``, ``migrate-core``, ``migrate-thematic`` or ``summary``.
    Migration commands run the selected migrations, save the report (unless
    ``--dry-run`` is given) and print a summary.  ``summary`` prints the
    report saved by the last real migration, if one exists.
    """
    import sys

    migrator = DocumentMigrator()

    if len(sys.argv) < 2:
        print("用法:")
        print("  python doc-migrator.py migrate [--dry-run]")
        print("  python doc-migrator.py migrate-core [--dry-run]")
        print("  python doc-migrator.py migrate-thematic [--dry-run]")
        print("  python doc-migrator.py summary")
        return

    command = sys.argv[1]
    dry_run = "--dry-run" in sys.argv

    # command -> (banner to print, migration steps to run in order)
    migrations = {
        "migrate": (
            "开始完整迁移过程...",
            (migrator.migrate_core_docs, migrator.migrate_thematic_research),
        ),
        "migrate-core": (
            "开始迁移 core-docs...",
            (migrator.migrate_core_docs,),
        ),
        "migrate-thematic": (
            "开始迁移 thematic-research...",
            (migrator.migrate_thematic_research,),
        ),
    }

    if command in migrations:
        banner, steps = migrations[command]
        print(banner)

        for step in steps:
            step(dry_run)

        # A dry run must leave no trace on disk, so the report is not saved.
        if not dry_run:
            migrator.save_migration_report()

        migrator.print_summary()

    elif command == "summary":
        # A fresh migrator only holds an all-zero report; show the one
        # written by the last real migration when it is available.
        report_path = migrator.unified_docs_path / "migration-report.json"
        if report_path.is_file():
            with open(report_path, 'r', encoding='utf-8') as f:
                migrator.migration_report = json.load(f)

        migrator.print_summary()

    else:
        print(f"未知命令: {command}")

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()