#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Document migration tool for the huhan3000 (胡汉三千年) project.

Features:
1. Migrate documents from ``core-docs`` and ``thematic-research`` into the
   unified documentation tree.
2. Classify and organize documents automatically.
3. Preserve file metadata and record where each document came from.
4. Generate a migration report.

Author: huhan3000 project team
Version: 1.0.0
"""

import os
import json
import shutil
import hashlib
import sys
from pathlib import Path
from typing import Optional
import datetime


class MigrationError(Exception):
    """Raised when a single file cannot be copied into the unified tree."""


class DocumentMigrator:
    """Copy documents from the legacy trees into ``unified-docs``.

    Every migrated file gets a ``<name>.metadata.json`` sidecar recording its
    original source path. The sidecar is also what makes re-runs idempotent:
    a target is only overwritten when its sidecar names the same source file.
    """

    TOOL_VERSION = "1.0.0"
    DEFAULT_BASE_PATH = Path("/home/ben/code/huhan3000")

    # Only files with these suffixes are scanned at all.
    SCANNED_SUFFIXES = frozenset({".md", ".txt", ".py", ".json"})
    # Files with these suffixes are additionally read for content keywords.
    TEXT_SUFFIXES = frozenset({".md", ".txt"})

    # Fallback classification by keywords found in the document body.
    # Order matters: the first keyword present in the content wins.
    CONTENT_KEYWORD_CATEGORIES = {
        "音韵": "01-core-theory/01-phonological-archaeology",
        "文明": "01-core-theory/02-civilization-diffusion",
        "方法": "01-core-theory/03-methodology",
        "理论": "01-core-theory/05-theoretical-framework",
        "实证": "02-thematic-research/01-empirical-studies",
        "历史": "03-historical-analysis/01-historical-events",
        "文化": "04-cultural-comparison/01-cross-cultural",
        "技术": "05-technical-implementation/01-tools",
        "项目": "06-project-docs/01-management",
    }

    def __init__(self, base_path=None):
        """Set up paths, classification rules and an empty report.

        Args:
            base_path: Project root containing ``core-docs``,
                ``thematic-research`` and ``unified-docs``. Defaults to
                ``DEFAULT_BASE_PATH`` when omitted.
        """
        self.base_path = (
            Path(base_path) if base_path is not None else self.DEFAULT_BASE_PATH
        )
        self.unified_docs_path = self.base_path / "unified-docs"
        self.core_docs_path = self.base_path / "core-docs"
        self.thematic_research_path = self.base_path / "thematic-research"

        # Path keyword -> target category, per source tree.
        self.category_mapping = {
            # Rules for core-docs
            "core-docs": {
                "音韵考古学": "01-core-theory/01-phonological-archaeology",
                "文明传播模型": "01-core-theory/02-civilization-diffusion",
                "方法论体系": "01-core-theory/03-methodology",
                "学术成果": "01-core-theory/04-academic-achievements",
                "理论框架": "01-core-theory/05-theoretical-framework",
                "实证研究": "02-thematic-research/01-empirical-studies",
                "历史分析": "03-historical-analysis/01-historical-events",
                "文化比较": "04-cultural-comparison/01-cross-cultural",
                "技术实现": "05-technical-implementation/01-tools",
                "项目文档": "06-project-docs/01-management",
            },
            # Rules for thematic-research
            "thematic-research": {
                "civilization-studies": "02-thematic-research/02-civilization-studies",
                "phonological-studies": "02-thematic-research/03-phonological-studies",
                "commercial-studies": "02-thematic-research/04-commercial-studies",
                "historical-studies": "03-historical-analysis/02-historical-studies",
                "cultural-studies": "04-cultural-comparison/02-cultural-studies",
                "theory-studies": "01-core-theory/06-theory-studies",
                "methodology-studies": "01-core-theory/03-methodology",
                "empirical-studies": "02-thematic-research/01-empirical-studies",
                "comparative-studies": "04-cultural-comparison/03-comparative-studies",
            },
        }

        # File suffix -> file type label stored in the report.
        self.file_extensions = {
            ".md": "markdown",
            ".txt": "text",
            ".py": "python",
            ".json": "json",
            ".yaml": "yaml",
            ".yml": "yaml",
        }

        self.migration_report = {
            "metadata": {
                "migration_date": datetime.datetime.now().isoformat(),
                "tool_version": self.TOOL_VERSION,
            },
            "statistics": {
                "total_files_scanned": 0,
                "total_files_migrated": 0,
                "total_files_skipped": 0,
                "total_errors": 0,
            },
            "migration_details": {
                "core-docs": {"scanned": 0, "migrated": 0, "skipped": 0},
                "thematic-research": {"scanned": 0, "migrated": 0, "skipped": 0},
            },
            "errors": [],
            "migrated_files": [],
        }

    @property
    def report_path(self) -> Path:
        """Location of the JSON migration report inside ``unified-docs``."""
        return self.unified_docs_path / "migration-report.json"

    def _calculate_file_hash(self, file_path) -> str:
        """Return the MD5 hex digest of the file's bytes, read in 4 KiB chunks."""
        digest = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def _get_file_category(self, source_type, file_path, content=None) -> str:
        """Pick the target category for a file.

        The path is checked first against the keywords configured for
        ``source_type`` (case-insensitive substring match). If none match and
        ``content`` is given, the content keywords are tried in order. When
        nothing matches, a per-source ``99-uncategorized`` category is used.

        Args:
            source_type: ``"core-docs"`` or ``"thematic-research"``.
            file_path: Path of the source file.
            content: Optional document text used as a fallback hint.

        Raises:
            KeyError: If ``source_type`` has no entry in ``category_mapping``.
        """
        path_lower = str(file_path).lower()
        for keyword, target_category in self.category_mapping[source_type].items():
            if keyword.lower() in path_lower:
                return target_category

        if content:
            content_lower = content.lower()
            for keyword, category in self.CONTENT_KEYWORD_CATEGORIES.items():
                if keyword in content_lower:
                    return category

        if source_type == "core-docs":
            return "01-core-theory/99-uncategorized"
        return "02-thematic-research/99-uncategorized"

    def _ensure_directory(self, dir_path) -> None:
        """Create ``dir_path`` (and parents) if it does not exist yet."""
        dir_path.mkdir(parents=True, exist_ok=True)

    def _metadata_path(self, target_path) -> Path:
        """Return the sidecar path ``<target>.metadata.json`` for a target."""
        return target_path.with_suffix(target_path.suffix + ".metadata.json")

    def _is_same_origin(self, source_path, target_path) -> bool:
        """Tell whether ``target_path`` was previously migrated from ``source_path``.

        A missing or unreadable sidecar counts as "not the same origin", so
        files this tool did not create are never overwritten.
        """
        try:
            with open(self._metadata_path(target_path), "r", encoding="utf-8") as f:
                recorded = json.load(f)
        except (OSError, ValueError):
            return False
        return (
            isinstance(recorded, dict)
            and recorded.get("original_source") == str(source_path)
        )

    def _resolve_target_path(self, source_path, target_dir) -> Path:
        """Choose a target file name that does not clobber another document.

        Files are flattened into the category directory by name, so two
        sources with the same name can land in the same place. An existing
        target is reused only when it came from this very source; otherwise
        ``-1``, ``-2``, ... is appended to the stem until a free (or
        same-origin) name is found.
        """
        candidate = target_dir / source_path.name
        counter = 1
        while candidate.exists() and not self._is_same_origin(source_path, candidate):
            candidate = target_dir / f"{source_path.stem}-{counter}{source_path.suffix}"
            counter += 1
        return candidate

    def _read_text_for_classification(self, file_path) -> Optional[str]:
        """Return the text of ``.md``/``.txt`` files, ``None`` for other types.

        The text is only a classification hint, so undecodable bytes are
        replaced instead of failing the whole migration of that file.
        """
        if file_path.suffix not in self.TEXT_SUFFIXES:
            return None
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()

    def _copy_file_with_metadata(self, source_path, target_path) -> dict:
        """Copy a file (with timestamps via ``shutil.copy2``) and describe it.

        Returns:
            A dict with source/target paths, size, modification time, MD5 hash
            and file type label.

        Raises:
            MigrationError: If copying or inspecting the file fails. It is a
                subclass of ``Exception`` and chains the underlying error.
        """
        try:
            shutil.copy2(source_path, target_path)
            stat = source_path.stat()
            return {
                "source_path": str(source_path),
                "target_path": str(target_path),
                "size": stat.st_size,
                "modified_time": datetime.datetime.fromtimestamp(
                    stat.st_mtime
                ).isoformat(),
                "hash": self._calculate_file_hash(source_path),
                "file_type": self.file_extensions.get(source_path.suffix, "unknown"),
            }
        except OSError as e:
            raise MigrationError(f"文件复制失败: {e}") from e

    def _create_migration_metadata(self, source_path, target_path, category) -> None:
        """Write the JSON sidecar recording the origin of a migrated file."""
        metadata = {
            "original_source": str(source_path),
            "migration_date": datetime.datetime.now().isoformat(),
            "category": category,
            "tool_version": self.TOOL_VERSION,
        }
        with open(self._metadata_path(target_path), "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

    def _migrate_source(self, source_type, source_root, dry_run) -> list:
        """Migrate every supported file below ``source_root``.

        Shared implementation behind ``migrate_core_docs`` and
        ``migrate_thematic_research``. Updates the report counters and
        returns the list of file-info dicts for files actually copied.
        In dry-run mode nothing is created on disk.
        """
        stats = self.migration_report["statistics"]
        details = self.migration_report["migration_details"][source_type]
        migrated_files = []

        # Sorted so that collision suffixes are assigned deterministically.
        for file_path in sorted(source_root.rglob("*")):
            if not (file_path.is_file() and file_path.suffix in self.SCANNED_SUFFIXES):
                continue

            stats["total_files_scanned"] += 1
            details["scanned"] += 1

            try:
                content = self._read_text_for_classification(file_path)
                category = self._get_file_category(source_type, file_path, content)

                if dry_run:
                    print(f"[模拟] 将迁移: {file_path.name} -> {category}")
                    continue

                target_dir = self.unified_docs_path / category
                self._ensure_directory(target_dir)
                target_path = self._resolve_target_path(file_path, target_dir)

                file_info = self._copy_file_with_metadata(file_path, target_path)
                self._create_migration_metadata(file_path, target_path, category)
                file_info["category"] = category

                migrated_files.append(file_info)
                self.migration_report["migrated_files"].append(file_info)
                stats["total_files_migrated"] += 1
                details["migrated"] += 1
                print(f"✓ 已迁移: {file_path.name} -> {category}")
            except Exception as e:
                # Per-file boundary: record the failure and keep going.
                error_msg = f"迁移失败 {file_path}: {e}"
                self.migration_report["errors"].append(error_msg)
                stats["total_errors"] += 1
                stats["total_files_skipped"] += 1
                details["skipped"] += 1
                print(f"✗ {error_msg}")

        return migrated_files

    def migrate_core_docs(self, dry_run=False):
        """Migrate the ``core-docs`` tree; return info dicts of copied files."""
        print("开始迁移 core-docs 文档...")
        return self._migrate_source("core-docs", self.core_docs_path, dry_run)

    def migrate_thematic_research(self, dry_run=False):
        """Migrate the ``thematic-research`` tree; return info dicts of copied files."""
        print("开始迁移 thematic-research 文档...")
        return self._migrate_source(
            "thematic-research", self.thematic_research_path, dry_run
        )

    def save_migration_report(self):
        """Write the report as JSON into ``unified-docs`` and return its path."""
        report_path = self.report_path
        # The directory may not exist yet when no file was migrated.
        self._ensure_directory(report_path.parent)
        with open(report_path, "w", encoding="utf-8") as f:
            json.dump(self.migration_report, f, ensure_ascii=False, indent=2)
        print(f"迁移报告已保存到: {report_path}")
        return report_path

    def load_migration_report(self) -> bool:
        """Replace the in-memory report with the one saved on disk.

        Returns:
            ``True`` if a report with the sections ``print_summary`` needs was
            loaded, ``False`` if the file is missing, unreadable or malformed
            (the in-memory report is left untouched in that case).
        """
        try:
            with open(self.report_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            return False

        required = ("statistics", "migration_details", "errors")
        if not isinstance(loaded, dict) or any(key not in loaded for key in required):
            return False

        self.migration_report = loaded
        return True

    def print_summary(self):
        """Print total and per-source counters, followed by recorded errors."""
        stats = self.migration_report["statistics"]
        details = self.migration_report["migration_details"]

        print("\n=== 迁移摘要 ===")
        print(f"总扫描文件数: {stats['total_files_scanned']}")
        print(f"总迁移文件数: {stats['total_files_migrated']}")
        print(f"总跳过文件数: {stats['total_files_skipped']}")
        print(f"总错误数: {stats['total_errors']}")

        print("\n=== 详细统计 ===")
        for source_type, detail in details.items():
            print(f"{source_type}:")
            print(f"  扫描: {detail['scanned']}")
            print(f"  迁移: {detail['migrated']}")
            print(f"  跳过: {detail['skipped']}")

        if self.migration_report["errors"]:
            print("\n=== 错误列表 ===")
            for error in self.migration_report["errors"]:
                print(f"  - {error}")


def main():
    """Command-line entry point: dispatch on the first argument."""
    migrator = DocumentMigrator()

    if len(sys.argv) < 2:
        print("用法:")
        print("  python doc-migrator.py migrate [--dry-run]")
        print("  python doc-migrator.py migrate-core [--dry-run]")
        print("  python doc-migrator.py migrate-thematic [--dry-run]")
        print("  python doc-migrator.py summary")
        return

    command = sys.argv[1]
    dry_run = "--dry-run" in sys.argv

    # command -> (banner, migration steps run in order)
    migrations = {
        "migrate": (
            "开始完整迁移过程...",
            (migrator.migrate_core_docs, migrator.migrate_thematic_research),
        ),
        "migrate-core": ("开始迁移 core-docs...", (migrator.migrate_core_docs,)),
        "migrate-thematic": (
            "开始迁移 thematic-research...",
            (migrator.migrate_thematic_research,),
        ),
    }

    if command in migrations:
        banner, steps = migrations[command]
        print(banner)
        for step in steps:
            step(dry_run)
        if not dry_run:
            migrator.save_migration_report()
        migrator.print_summary()
    elif command == "summary":
        # Summarize the last saved run rather than this empty session.
        if not migrator.load_migration_report():
            print(f"未找到可用的迁移报告: {migrator.report_path}")
        migrator.print_summary()
    else:
        print(f"未知命令: {command}")


if __name__ == "__main__":
    main()