huhan3000/unified-docs/tools/doc-migrator.py

373 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
胡汉三千年项目文档迁移工具
功能:
1. 从core-docs和thematic-research迁移文档到统一文档系统
2. 自动分类和组织文档
3. 保持文档结构和元数据
4. 生成迁移报告
作者:胡汉三千年项目团队
版本:1.0.0
"""
import os
import json
import shutil
import hashlib
from pathlib import Path
import datetime
class DocumentMigrator:
    """Copy documents from ``core-docs`` and ``thematic-research`` into ``unified-docs``.

    Each scanned file is assigned a category (a sub-directory of
    ``unified-docs``) from keywords found in its path relative to the source
    root, or, for ``.md``/``.txt`` files, in its text content.  The file is
    copied there, a ``<name>.metadata.json`` sidecar is written next to it, and
    counters plus per-file details are accumulated in ``self.migration_report``.
    """

    # Version string recorded in the report and in every sidecar file.
    TOOL_VERSION = "1.0.0"

    # Root used when no ``base_path`` is passed to the constructor.
    DEFAULT_BASE_PATH = Path("/home/ben/code/huhan3000")

    # Only files with these suffixes are scanned and migrated.
    # NOTE(review): ``file_extensions`` below also lists .yaml/.yml, but those
    # suffixes are not scanned; confirm whether that is intended.
    MIGRATABLE_SUFFIXES = (".md", ".txt", ".py", ".json")

    # Files with these suffixes are read as text so their content can be used
    # for classification.
    CLASSIFIABLE_SUFFIXES = (".md", ".txt")

    # Content keywords checked, in this order, when the path matches no rule.
    CONTENT_KEYWORD_CATEGORIES = {
        "音韵": "01-core-theory/01-phonological-archaeology",
        "文明": "01-core-theory/02-civilization-diffusion",
        "方法": "01-core-theory/03-methodology",
        "理论": "01-core-theory/05-theoretical-framework",
        "实证": "02-thematic-research/01-empirical-studies",
        "历史": "03-historical-analysis/01-historical-events",
        "文化": "04-cultural-comparison/01-cross-cultural",
        "技术": "05-technical-implementation/01-tools",
        "项目": "06-project-docs/01-management"
    }

    def __init__(self, base_path=None):
        """Set up source/target paths, classification rules and an empty report.

        Args:
            base_path: Project root containing ``core-docs``,
                ``thematic-research`` and ``unified-docs``.  Defaults to
                ``DEFAULT_BASE_PATH``.
        """
        self.base_path = (
            Path(base_path) if base_path is not None else self.DEFAULT_BASE_PATH
        )
        self.unified_docs_path = self.base_path / "unified-docs"
        self.core_docs_path = self.base_path / "core-docs"
        self.thematic_research_path = self.base_path / "thematic-research"
        # Path keyword -> target category, per source tree.
        self.category_mapping = {
            # Rules for core-docs
            "core-docs": {
                "音韵考古学": "01-core-theory/01-phonological-archaeology",
                "文明传播模型": "01-core-theory/02-civilization-diffusion",
                "方法论体系": "01-core-theory/03-methodology",
                "学术成果": "01-core-theory/04-academic-achievements",
                "理论框架": "01-core-theory/05-theoretical-framework",
                "实证研究": "02-thematic-research/01-empirical-studies",
                "历史分析": "03-historical-analysis/01-historical-events",
                "文化比较": "04-cultural-comparison/01-cross-cultural",
                "技术实现": "05-technical-implementation/01-tools",
                "项目文档": "06-project-docs/01-management"
            },
            # Rules for thematic-research
            "thematic-research": {
                "civilization-studies": "02-thematic-research/02-civilization-studies",
                "phonological-studies": "02-thematic-research/03-phonological-studies",
                "commercial-studies": "02-thematic-research/04-commercial-studies",
                "historical-studies": "03-historical-analysis/02-historical-studies",
                "cultural-studies": "04-cultural-comparison/02-cultural-studies",
                "theory-studies": "01-core-theory/06-theory-studies",
                "methodology-studies": "01-core-theory/03-methodology",
                "empirical-studies": "02-thematic-research/01-empirical-studies",
                "comparative-studies": "04-cultural-comparison/03-comparative-studies"
            }
        }
        # File suffix -> file type label stored in the per-file report entry.
        self.file_extensions = {
            ".md": "markdown",
            ".txt": "text",
            ".py": "python",
            ".json": "json",
            ".yaml": "yaml",
            ".yml": "yaml"
        }
        self.migration_report = {
            "metadata": {
                "migration_date": datetime.datetime.now().isoformat(),
                "tool_version": self.TOOL_VERSION
            },
            "statistics": {
                "total_files_scanned": 0,
                "total_files_migrated": 0,
                "total_files_skipped": 0,
                "total_errors": 0
            },
            "migration_details": {
                "core-docs": {"scanned": 0, "migrated": 0, "skipped": 0},
                "thematic-research": {"scanned": 0, "migrated": 0, "skipped": 0}
            },
            "errors": [],
            "migrated_files": []
        }
        # Targets written by this instance; used to detect two different
        # sources mapping to the same target file within one run.
        self._claimed_targets = set()

    def _calculate_file_hash(self, file_path):
        """Return the hex MD5 digest of the file's bytes, read in 4 KiB chunks."""
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def _get_file_category(self, source_type, file_path, content=None):
        """Pick the target category for a file.

        Args:
            source_type: Key into ``self.category_mapping``
                (``"core-docs"`` or ``"thematic-research"``).
            file_path: Path whose string form is searched for path keywords.
            content: Optional text searched for content keywords when no
                path keyword matches.

        Returns:
            The category as a relative directory string.  Falls back to a
            ``99-uncategorized`` directory when nothing matches.

        Raises:
            KeyError: If ``source_type`` has no entry in the mapping.
        """
        path_lower = str(file_path).lower()
        # Path-based rules take precedence; first matching rule wins.
        for keyword, target_category in self.category_mapping[source_type].items():
            if keyword.lower() in path_lower:
                return target_category
        # Fall back to content keywords when text was supplied.
        if content:
            content_lower = content.lower()
            for keyword, category in self.CONTENT_KEYWORD_CATEGORIES.items():
                if keyword in content_lower:
                    return category
        # Nothing matched: use the source-specific default bucket.
        if source_type == "core-docs":
            return "01-core-theory/99-uncategorized"
        return "02-thematic-research/99-uncategorized"

    def _ensure_directory(self, dir_path):
        """Create ``dir_path`` and any missing parents; no error if it exists."""
        dir_path.mkdir(parents=True, exist_ok=True)

    def _copy_file_with_metadata(self, source_path, target_path):
        """Copy a file with ``shutil.copy2`` and describe the copy.

        Returns:
            A dict with source/target paths, size, modification time, MD5
            hash of the source and a file type label.

        Raises:
            Exception: Wrapping any failure, with the original error chained
                as ``__cause__``.
        """
        try:
            shutil.copy2(source_path, target_path)
            stat = source_path.stat()
            return {
                "source_path": str(source_path),
                "target_path": str(target_path),
                "size": stat.st_size,
                "modified_time": datetime.datetime.fromtimestamp(stat.st_mtime).isoformat(),
                "hash": self._calculate_file_hash(source_path),
                "file_type": self.file_extensions.get(source_path.suffix, "unknown")
            }
        except Exception as e:
            raise Exception(f"文件复制失败: {e}") from e

    def _create_migration_metadata(self, source_path, target_path, category):
        """Write the ``<target name>.metadata.json`` sidecar next to the target."""
        metadata_path = target_path.with_suffix(target_path.suffix + ".metadata.json")
        metadata = {
            "original_source": str(source_path),
            "migration_date": datetime.datetime.now().isoformat(),
            "category": category,
            "tool_version": self.TOOL_VERSION
        }
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

    def _resolve_target_path(self, source_type, target_dir, relative_path):
        """Return where ``relative_path`` should be written inside ``target_dir``.

        The file keeps its own name unless this instance already wrote another
        source to that exact target; then a short digest of the source-relative
        path is appended to the stem so neither copy is overwritten.
        """
        candidate = target_dir / relative_path.name
        if candidate not in self._claimed_targets:
            return candidate
        # Include the source tree so identical relative paths in the two
        # trees still get distinct names.
        origin = f"{source_type}/{relative_path.as_posix()}"
        digest = hashlib.md5(origin.encode("utf-8")).hexdigest()[:8]
        return target_dir / f"{relative_path.stem}-{digest}{relative_path.suffix}"

    def _migrate_source(self, source_type, source_root, dry_run):
        """Scan ``source_root`` and migrate every eligible file.

        Shared implementation behind ``migrate_core_docs`` and
        ``migrate_thematic_research``.  A failure on one file is recorded in
        the report and does not stop the scan.  With ``dry_run`` true nothing
        is written to disk; only the planned category is printed.

        Returns:
            List of per-file info dicts for the files actually copied.
        """
        migrated_files = []
        statistics = self.migration_report["statistics"]
        details = self.migration_report["migration_details"][source_type]

        if not source_root.is_dir():
            print(f"源目录不存在, 已跳过: {source_root}")
            return migrated_files

        # Sorted so processing order (and collision naming) is deterministic.
        for file_path in sorted(source_root.rglob("*")):
            if not (file_path.is_file() and file_path.suffix in self.MIGRATABLE_SUFFIXES):
                continue
            statistics["total_files_scanned"] += 1
            details["scanned"] += 1
            try:
                # Text is read only to help classification, so undecodable
                # bytes are replaced instead of aborting this file.
                content = None
                if file_path.suffix in self.CLASSIFIABLE_SUFFIXES:
                    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                        content = f.read()
                # Classify on the path relative to the source root so that
                # directory names above the root cannot match a keyword.
                relative_path = file_path.relative_to(source_root)
                category = self._get_file_category(source_type, relative_path, content)
                if dry_run:
                    print(f"[模拟] 将迁移: {file_path.name} -> {category}")
                    continue
                target_dir = self.unified_docs_path / category
                self._ensure_directory(target_dir)
                target_path = self._resolve_target_path(source_type, target_dir, relative_path)
                file_info = self._copy_file_with_metadata(file_path, target_path)
                self._claimed_targets.add(target_path)
                self._create_migration_metadata(file_path, target_path, category)
                file_info["category"] = category
                migrated_files.append(file_info)
                self.migration_report["migrated_files"].append(file_info)
                statistics["total_files_migrated"] += 1
                details["migrated"] += 1
                print(f"✓ 已迁移: {file_path.name} -> {category}")
            except Exception as e:
                error_msg = f"迁移失败 {file_path}: {e}"
                self.migration_report["errors"].append(error_msg)
                statistics["total_errors"] += 1
                # Keep the global skipped total in step with the per-source one.
                statistics["total_files_skipped"] += 1
                details["skipped"] += 1
                print(error_msg)
        return migrated_files

    def migrate_core_docs(self, dry_run=False):
        """Migrate files under ``core-docs``; return info dicts for copied files."""
        print("开始迁移 core-docs 文档...")
        return self._migrate_source("core-docs", self.core_docs_path, dry_run)

    def migrate_thematic_research(self, dry_run=False):
        """Migrate files under ``thematic-research``; return info dicts for copied files."""
        print("开始迁移 thematic-research 文档...")
        return self._migrate_source(
            "thematic-research", self.thematic_research_path, dry_run
        )

    def save_migration_report(self):
        """Write the report to ``unified-docs/migration-report.json`` and return its path."""
        # The target directory may not exist yet when nothing was migrated.
        self._ensure_directory(self.unified_docs_path)
        report_path = self.unified_docs_path / "migration-report.json"
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(self.migration_report, f, ensure_ascii=False, indent=2)
        print(f"迁移报告已保存到: {report_path}")
        return report_path

    def print_summary(self):
        """Print overall counters, per-source counters and any recorded errors."""
        stats = self.migration_report["statistics"]
        details = self.migration_report["migration_details"]
        print("\n=== 迁移摘要 ===")
        print(f"总扫描文件数: {stats['total_files_scanned']}")
        print(f"总迁移文件数: {stats['total_files_migrated']}")
        print(f"总跳过文件数: {stats['total_files_skipped']}")
        print(f"总错误数: {stats['total_errors']}")
        print("\n=== 详细统计 ===")
        for source_type, detail in details.items():
            print(f"{source_type}:")
            print(f" 扫描: {detail['scanned']}")
            print(f" 迁移: {detail['migrated']}")
            print(f" 跳过: {detail['skipped']}")
        if self.migration_report["errors"]:
            print("\n=== 错误列表 ===")
            for error in self.migration_report["errors"]:
                print(f" - {error}")
def main(argv=None):
    """Command-line entry point.

    Commands: ``migrate`` (both source trees), ``migrate-core``,
    ``migrate-thematic`` and ``summary``.  ``--dry-run`` anywhere on the
    command line makes the migrate commands print what they would do and
    skips saving the report.  ``summary`` prints the report previously saved
    in ``unified-docs/migration-report.json``.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        0 on success; 1 for missing/unknown commands, an unreadable report,
        or when the migration recorded errors.
    """
    import sys

    args = sys.argv[1:] if argv is None else list(argv)
    migrator = DocumentMigrator()
    if not args:
        print("用法:")
        print(" python doc-migrator.py migrate [--dry-run]")
        print(" python doc-migrator.py migrate-core [--dry-run]")
        print(" python doc-migrator.py migrate-thematic [--dry-run]")
        print(" python doc-migrator.py summary")
        return 1

    command = args[0]
    dry_run = "--dry-run" in args

    # command -> (banner, migration steps run in order)
    migrations = {
        "migrate": (
            "开始完整迁移过程...",
            (migrator.migrate_core_docs, migrator.migrate_thematic_research),
        ),
        "migrate-core": ("开始迁移 core-docs...", (migrator.migrate_core_docs,)),
        "migrate-thematic": (
            "开始迁移 thematic-research...",
            (migrator.migrate_thematic_research,),
        ),
    }

    if command in migrations:
        banner, steps = migrations[command]
        print(banner)
        for step in steps:
            step(dry_run)
        # A dry run must not leave a report on disk.
        if not dry_run:
            migrator.save_migration_report()
        migrator.print_summary()
        return 1 if migrator.migration_report["statistics"]["total_errors"] else 0

    if command == "summary":
        # A fresh migrator only holds zeroed counters, so show the report
        # written by the last real migration instead.
        report_path = migrator.unified_docs_path / "migration-report.json"
        try:
            with open(report_path, "r", encoding="utf-8") as f:
                migrator.migration_report = json.load(f)
        except FileNotFoundError:
            print(f"未找到迁移报告: {report_path}")
            return 1
        except (OSError, json.JSONDecodeError) as e:
            print(f"无法读取迁移报告 {report_path}: {e}")
            return 1
        migrator.print_summary()
        return 0

    print(f"未知命令: {command}")
    return 1
# Run the CLI only when executed as a script; main()'s return value (None or
# an int) becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())