#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Huhan3000 (胡汉三千年) project document migration tool.

Features:
1. Migrate documents from core-docs and thematic-research into the unified documentation system
2. Automatically classify and organize documents
3. Preserve document structure and metadata
4. Generate a migration report

Author: Huhan3000 project team
Version: 1.0.0
"""
|
||
import os
|
||
import json
|
||
import shutil
|
||
import hashlib
|
||
from pathlib import Path
|
||
import datetime
|
||
|
||
class DocumentMigrator:
    """Copy documents from ``core-docs`` and ``thematic-research`` into ``unified-docs``.

    Each scanned file is assigned a category (a sub-directory of
    ``unified-docs``) by keyword matching on its path and, for text files,
    on its content.  The file is then copied there together with a
    ``<name>.metadata.json`` sidecar, and the outcome is accumulated in
    ``self.migration_report``.

    Args:
        base_path: Project root containing the source trees.  Defaults to
            ``DEFAULT_BASE_PATH`` when omitted, so existing zero-argument
            callers keep working.
    """

    # Project root used when the constructor receives no base path.
    DEFAULT_BASE_PATH = "/home/ben/code/huhan3000"
    TOOL_VERSION = "1.0.0"
    # Only files with one of these suffixes are scanned (case-sensitive).
    MIGRATABLE_SUFFIXES = (".md", ".txt", ".py", ".json")
    # Files whose text is read so content keywords can drive classification.
    TEXT_SUFFIXES = (".md", ".txt")
    # Content keyword -> category.  Insertion order matters: first hit wins.
    CONTENT_KEYWORD_CATEGORIES = {
        "音韵": "01-core-theory/01-phonological-archaeology",
        "文明": "01-core-theory/02-civilization-diffusion",
        "方法": "01-core-theory/03-methodology",
        "理论": "01-core-theory/05-theoretical-framework",
        "实证": "02-thematic-research/01-empirical-studies",
        "历史": "03-historical-analysis/01-historical-events",
        "文化": "04-cultural-comparison/01-cross-cultural",
        "技术": "05-technical-implementation/01-tools",
        "项目": "06-project-docs/01-management",
    }

    def __init__(self, base_path=None):
        if base_path is None:
            base_path = self.DEFAULT_BASE_PATH
        self.base_path = Path(base_path)
        self.unified_docs_path = self.base_path / "unified-docs"
        self.core_docs_path = self.base_path / "core-docs"
        self.thematic_research_path = self.base_path / "thematic-research"

        # Path keyword -> target category, per source tree.
        self.category_mapping = {
            # Rules for core-docs
            "core-docs": {
                "音韵考古学": "01-core-theory/01-phonological-archaeology",
                "文明传播模型": "01-core-theory/02-civilization-diffusion",
                "方法论体系": "01-core-theory/03-methodology",
                "学术成果": "01-core-theory/04-academic-achievements",
                "理论框架": "01-core-theory/05-theoretical-framework",
                "实证研究": "02-thematic-research/01-empirical-studies",
                "历史分析": "03-historical-analysis/01-historical-events",
                "文化比较": "04-cultural-comparison/01-cross-cultural",
                "技术实现": "05-technical-implementation/01-tools",
                "项目文档": "06-project-docs/01-management",
            },
            # Rules for thematic-research
            "thematic-research": {
                "civilization-studies": "02-thematic-research/02-civilization-studies",
                "phonological-studies": "02-thematic-research/03-phonological-studies",
                "commercial-studies": "02-thematic-research/04-commercial-studies",
                "historical-studies": "03-historical-analysis/02-historical-studies",
                "cultural-studies": "04-cultural-comparison/02-cultural-studies",
                "theory-studies": "01-core-theory/06-theory-studies",
                "methodology-studies": "01-core-theory/03-methodology",
                "empirical-studies": "02-thematic-research/01-empirical-studies",
                "comparative-studies": "04-cultural-comparison/03-comparative-studies",
            },
        }

        # File suffix -> human-readable file type stored in the report.
        self.file_extensions = {
            ".md": "markdown",
            ".txt": "text",
            ".py": "python",
            ".json": "json",
            ".yaml": "yaml",
            ".yml": "yaml",
        }

        self.migration_report = {
            "metadata": {
                "migration_date": datetime.datetime.now().isoformat(),
                "tool_version": self.TOOL_VERSION,
            },
            "statistics": {
                "total_files_scanned": 0,
                "total_files_migrated": 0,
                "total_files_skipped": 0,
                "total_errors": 0,
            },
            "migration_details": {
                "core-docs": {"scanned": 0, "migrated": 0, "skipped": 0},
                "thematic-research": {"scanned": 0, "migrated": 0, "skipped": 0},
            },
            "errors": [],
            "migrated_files": [],
        }

        # Target paths already handed out by this instance.  Both source
        # trees can map into the same category, so this is shared between
        # them to keep same-named files from overwriting each other.
        self._claimed_targets = set()

    def _calculate_file_hash(self, file_path):
        """Return the hex MD5 digest of the file's bytes, read in 4 KiB chunks."""
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def _get_file_category(self, source_type, file_path, content=None):
        """Pick the target category for a file.

        Tries, in order: a case-insensitive substring match of the
        ``source_type`` path keywords against the full path; a substring
        match of ``CONTENT_KEYWORD_CATEGORIES`` against ``content`` (when
        given and non-empty); and finally the per-source
        ``99-uncategorized`` fallback.

        Raises:
            KeyError: If ``source_type`` has no entry in ``category_mapping``.
        """
        path_lower = str(file_path).lower()
        for keyword, target_category in self.category_mapping[source_type].items():
            if keyword.lower() in path_lower:
                return target_category

        if content:
            content_lower = content.lower()
            for keyword, category in self.CONTENT_KEYWORD_CATEGORIES.items():
                if keyword in content_lower:
                    return category

        if source_type == "core-docs":
            return "01-core-theory/99-uncategorized"
        return "02-thematic-research/99-uncategorized"

    def _ensure_directory(self, dir_path):
        """Create ``dir_path`` (and missing parents) if it does not exist."""
        dir_path.mkdir(parents=True, exist_ok=True)

    def _copy_file_with_metadata(self, source_path, target_path):
        """Copy a file with ``shutil.copy2`` and return a description of it.

        Returns:
            dict with ``source_path``, ``target_path``, ``size``,
            ``modified_time`` (ISO string), ``hash`` (MD5 of the source) and
            ``file_type`` (looked up by suffix, ``"unknown"`` if unmapped).

        Raises:
            Exception: Wrapping any failure, chained to the original error.
        """
        try:
            shutil.copy2(source_path, target_path)

            stat = source_path.stat()
            return {
                "source_path": str(source_path),
                "target_path": str(target_path),
                "size": stat.st_size,
                "modified_time": datetime.datetime.fromtimestamp(stat.st_mtime).isoformat(),
                "hash": self._calculate_file_hash(source_path),
                "file_type": self.file_extensions.get(source_path.suffix, "unknown"),
            }
        except Exception as e:
            # Same exception type and message as before; ``from e`` keeps
            # the underlying traceback available for debugging.
            raise Exception(f"文件复制失败: {e}") from e

    def _create_migration_metadata(self, source_path, target_path, category):
        """Write the ``<target name>.metadata.json`` sidecar next to the target."""
        metadata_path = target_path.with_suffix(target_path.suffix + ".metadata.json")

        metadata = {
            "original_source": str(source_path),
            "migration_date": datetime.datetime.now().isoformat(),
            "category": category,
            "tool_version": self.TOOL_VERSION,
        }

        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

    def _resolve_target_path(self, target_dir, relative_path):
        """Return a target path in ``target_dir`` not yet claimed by this instance.

        The plain file name is used when free.  On a clash the source's
        relative path is flattened into the name (``sub/a.md`` ->
        ``sub__a.md``); a numeric suffix is the last resort.
        """
        candidate = target_dir / relative_path.name
        if candidate in self._claimed_targets:
            candidate = target_dir / "__".join(relative_path.parts)
            counter = 1
            while candidate in self._claimed_targets:
                candidate = target_dir / f"{relative_path.stem}__{counter}{relative_path.suffix}"
                counter += 1
        self._claimed_targets.add(candidate)
        return candidate

    def _migrate_source(self, source_type, source_root, dry_run):
        """Scan ``source_root`` and migrate every eligible file.

        Shared implementation behind ``migrate_core_docs`` and
        ``migrate_thematic_research``.  Returns the list of file-info dicts
        for files actually copied (always empty in a dry run).
        """
        stats = self.migration_report["statistics"]
        detail = self.migration_report["migration_details"][source_type]
        migrated_files = []

        # Sorted so clash resolution is reproducible across runs and
        # filesystems (rglob order is otherwise unspecified).
        for file_path in sorted(source_root.rglob("*")):
            if not (file_path.is_file() and file_path.suffix in self.MIGRATABLE_SUFFIXES):
                continue

            stats["total_files_scanned"] += 1
            detail["scanned"] += 1

            try:
                # Text is only used to choose a category, so undecodable
                # bytes are replaced instead of failing the whole file.
                content = None
                if file_path.suffix in self.TEXT_SUFFIXES:
                    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                        content = f.read()

                category = self._get_file_category(source_type, file_path, content)

                relative_path = file_path.relative_to(source_root)
                target_dir = self.unified_docs_path / category
                target_path = self._resolve_target_path(target_dir, relative_path)

                if dry_run:
                    # A dry run must not touch the filesystem at all.
                    print(f"[模拟] 将迁移: {file_path.name} -> {category}")
                    continue

                self._ensure_directory(target_dir)
                file_info = self._copy_file_with_metadata(file_path, target_path)
                self._create_migration_metadata(file_path, target_path, category)

                file_info["category"] = category
                migrated_files.append(file_info)
                self.migration_report["migrated_files"].append(file_info)

                stats["total_files_migrated"] += 1
                detail["migrated"] += 1

                print(f"✓ 已迁移: {file_path.name} -> {category}")

            except Exception as e:
                # Per-file boundary: record the failure and keep going.
                error_msg = f"迁移失败 {file_path}: {e}"
                self.migration_report["errors"].append(error_msg)
                stats["total_errors"] += 1
                stats["total_files_skipped"] += 1
                detail["skipped"] += 1
                print(f"✗ {error_msg}")

        return migrated_files

    def migrate_core_docs(self, dry_run=False):
        """Migrate the ``core-docs`` tree; with ``dry_run`` only print the plan.

        Returns:
            List of file-info dicts for the files that were copied.
        """
        print("开始迁移 core-docs 文档...")
        return self._migrate_source("core-docs", self.core_docs_path, dry_run)

    def migrate_thematic_research(self, dry_run=False):
        """Migrate the ``thematic-research`` tree; with ``dry_run`` only print the plan.

        Returns:
            List of file-info dicts for the files that were copied.
        """
        print("开始迁移 thematic-research 文档...")
        return self._migrate_source("thematic-research", self.thematic_research_path, dry_run)

    def save_migration_report(self):
        """Write the report to ``unified-docs/migration-report.json`` and return its path."""
        # The directory may not exist yet when nothing was migrated.
        self._ensure_directory(self.unified_docs_path)
        report_path = self.unified_docs_path / "migration-report.json"

        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(self.migration_report, f, ensure_ascii=False, indent=2)

        print(f"迁移报告已保存到: {report_path}")
        return report_path

    def print_summary(self):
        """Print overall totals, per-source counts and any recorded errors."""
        stats = self.migration_report["statistics"]
        details = self.migration_report["migration_details"]

        print("\n=== 迁移摘要 ===")
        print(f"总扫描文件数: {stats['total_files_scanned']}")
        print(f"总迁移文件数: {stats['total_files_migrated']}")
        print(f"总跳过文件数: {stats['total_files_skipped']}")
        print(f"总错误数: {stats['total_errors']}")

        print("\n=== 详细统计 ===")
        for source_type, detail in details.items():
            print(f"{source_type}:")
            print(f" 扫描: {detail['scanned']}")
            print(f" 迁移: {detail['migrated']}")
            print(f" 跳过: {detail['skipped']}")

        if self.migration_report["errors"]:
            print("\n=== 错误列表 ===")
            for error in self.migration_report["errors"]:
                print(f" - {error}")
|
||
|
||
def main():
    """Command-line entry point.

    Commands (first argument):
        migrate           migrate both source trees
        migrate-core      migrate only core-docs
        migrate-thematic  migrate only thematic-research
        summary           print the summary of the last saved report

    ``--dry-run`` anywhere on the command line makes the migrate commands
    print what would happen without saving a report.  With no command the
    usage text is printed.
    """
    import sys

    migrator = DocumentMigrator()

    if len(sys.argv) < 2:
        print("用法:")
        print(" python doc-migrator.py migrate [--dry-run]")
        print(" python doc-migrator.py migrate-core [--dry-run]")
        print(" python doc-migrator.py migrate-thematic [--dry-run]")
        print(" python doc-migrator.py summary")
        return

    command = sys.argv[1]
    dry_run = "--dry-run" in sys.argv

    # Command -> (banner, migration steps run in order).
    migrate_commands = {
        "migrate": (
            "开始完整迁移过程...",
            (migrator.migrate_core_docs, migrator.migrate_thematic_research),
        ),
        "migrate-core": ("开始迁移 core-docs...", (migrator.migrate_core_docs,)),
        "migrate-thematic": (
            "开始迁移 thematic-research...",
            (migrator.migrate_thematic_research,),
        ),
    }

    if command in migrate_commands:
        banner, steps = migrate_commands[command]
        print(banner)

        for step in steps:
            step(dry_run)

        # A dry run must leave no report behind.
        if not dry_run:
            migrator.save_migration_report()

        migrator.print_summary()

    elif command == "summary":
        # A fresh migrator holds only zeroed counters, so load the report
        # written by the last real run when one exists.
        report_path = migrator.unified_docs_path / "migration-report.json"
        if report_path.is_file():
            with open(report_path, "r", encoding="utf-8") as f:
                saved_report = json.load(f)
            if isinstance(saved_report, dict):
                migrator.migration_report.update(saved_report)

        migrator.print_summary()

    else:
        print(f"未知命令: {command}")
|
||
|
||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()