huhan3000/docs-deduplication-tool.py

#!/usr/bin/env python3
"""
文档去重工具 - 解决core-docs和thematic-research之间的重复文件问题
功能:
1. 分析两个目录中的重复文件
2. 建立文件映射关系
3. 生成去重报告
4. 提供迁移建议
"""
import os
import hashlib
import json
from pathlib import Path
from collections import defaultdict
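
# Processing flow (mirrored by generate_report() below): find_duplicates() groups
# files by MD5 content hash, analyze_content_similarity() compares bare filenames
# across the two trees, and generate_migration_plan() keeps the most recently
# modified copy from each duplicate group while marking the rest for removal.
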
class DocumentDeduplicator:
    def __init__(self, core_docs_path, thematic_research_path):
        self.core_docs_path = Path(core_docs_path)
        self.thematic_research_path = Path(thematic_research_path)
        self.duplicates = defaultdict(list)
        self.file_mapping = {}

    def calculate_file_hash(self, file_path):
        """Compute the MD5 hash of a file (a content fingerprint for duplicate detection, not a security hash)."""
        hash_md5 = hashlib.md5()
        try:
            with open(file_path, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            return hash_md5.hexdigest()
        except Exception as e:
            print(f"Error hashing file: {file_path}, error: {e}")
            return None

    def get_file_info(self, file_path):
        """Collect path, size, modification time, and content hash for a file."""
        try:
            stat = file_path.stat()
            return {
                'path': str(file_path),
                'size': stat.st_size,
                'modified': stat.st_mtime,
                'hash': self.calculate_file_hash(file_path)
            }
        except Exception as e:
            print(f"Error reading file info: {file_path}, error: {e}")
            return None

    def scan_directory(self, directory_path):
        """Scan a directory tree for Markdown and Python files."""
        files = []
        for root, dirs, files_list in os.walk(directory_path):
            for file in files_list:
                if file.endswith('.md') or file.endswith('.py'):
                    file_path = Path(root) / file
                    file_info = self.get_file_info(file_path)
                    if file_info:
                        files.append(file_info)
        return files
    def find_duplicates(self):
        """Find files duplicated across the two directories."""
        print("Scanning core-docs directory...")
        core_files = self.scan_directory(self.core_docs_path)
        print(f"Found {len(core_files)} files")
        print("Scanning thematic-research directory...")
        thematic_files = self.scan_directory(self.thematic_research_path)
        print(f"Found {len(thematic_files)} files")
        # Group files by content hash
        hash_groups = defaultdict(list)
        for file_info in core_files + thematic_files:
            if file_info['hash']:
                hash_groups[file_info['hash']].append(file_info)
        # Any hash shared by more than one file marks a duplicate group
        for hash_val, files in hash_groups.items():
            if len(files) > 1:
                self.duplicates[hash_val] = files
        return len(self.duplicates)

    def analyze_content_similarity(self):
        """Analyze content similarity based on file names and paths."""
        print("Analyzing filename similarity...")
        # Collect file names (without paths) from both trees
        core_filenames = {}
        thematic_filenames = {}
        for root, dirs, files in os.walk(self.core_docs_path):
            for file in files:
                if file.endswith('.md'):
                    core_filenames[file] = os.path.join(root, file)
        for root, dirs, files in os.walk(self.thematic_research_path):
            for file in files:
                if file.endswith('.md'):
                    thematic_filenames[file] = os.path.join(root, file)
        # File names that appear in both trees
        common_filenames = set(core_filenames.keys()) & set(thematic_filenames.keys())
        print(f"Found {len(common_filenames)} files with identical names")
        similarity_report = {
            'common_filenames': list(common_filenames),
            'core_unique': len(core_filenames) - len(common_filenames),
            'thematic_unique': len(thematic_filenames) - len(common_filenames),
            'total_files': len(core_filenames) + len(thematic_filenames)
        }
        return similarity_report
    def generate_migration_plan(self):
        """Generate a migration plan."""
        print("Generating document migration plan...")
        migration_plan = {
            'unified_structure': {
                'core-theory': [],
                'thematic-research': [],
                'historical-analysis': [],
                'methodology': [],
                'applications': [],
                'resources': []
            },
            'files_to_keep': [],
            'files_to_remove': [],
            'estimated_space_saving': 0
        }
        # For each duplicate group, decide which copy to keep
        for hash_val, files in self.duplicates.items():
            if len(files) > 1:
                # Keep the most recently modified copy
                latest_file = max(files, key=lambda x: x['modified'])
                migration_plan['files_to_keep'].append(latest_file['path'])
                # Mark the remaining copies for removal
                for file in files:
                    if file['path'] != latest_file['path']:
                        migration_plan['files_to_remove'].append(file['path'])
                        migration_plan['estimated_space_saving'] += file['size']
        return migration_plan

    def generate_report(self):
        """Generate the full deduplication report."""
        print("Generating deduplication analysis report...")
        # Find duplicate files
        duplicate_count = self.find_duplicates()
        # Analyze filename similarity
        similarity_report = self.analyze_content_similarity()
        # Build the migration plan
        migration_plan = self.generate_migration_plan()
        report = {
            'summary': {
                'total_duplicates_found': duplicate_count,
                'files_with_common_names': similarity_report['common_filenames'],
                'core_unique_files': similarity_report['core_unique'],
                'thematic_unique_files': similarity_report['thematic_unique'],
                'total_files_analyzed': similarity_report['total_files']
            },
            'duplicates_details': dict(self.duplicates),
            'migration_plan': migration_plan,
            'recommendations': [
                "Establish a unified document directory structure",
                "Adopt version control for documents",
                "Develop an automated document indexing tool",
                "Set up a document lifecycle management process"
            ]
        }
        return report
def main():
    """Entry point."""
    core_docs_path = "/home/ben/code/huhan3000/core-docs"
    thematic_research_path = "/home/ben/code/huhan3000/thematic-research"
    print("=== huhan3000 Project Document Deduplication Analysis Tool ===")
    print(f"Core Docs path: {core_docs_path}")
    print(f"Thematic Research path: {thematic_research_path}")
    print()
    deduplicator = DocumentDeduplicator(core_docs_path, thematic_research_path)
    # Generate the report
    report = deduplicator.generate_report()
    # Save the report as JSON
    report_file = "/home/ben/code/huhan3000/docs-deduplication-report.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"Report saved to: {report_file}")
    # Print a summary
    print("\n=== Analysis Summary ===")
    print(f"Duplicate file groups found: {report['summary']['total_duplicates_found']}")
    print(f"Files with identical names: {len(report['summary']['files_with_common_names'])}")
    print(f"Files unique to Core Docs: {report['summary']['core_unique_files']}")
    print(f"Files unique to Thematic Research: {report['summary']['thematic_unique_files']}")
    print(f"Estimated space saving: {report['migration_plan']['estimated_space_saving'] / (1024 * 1024):.2f} MB")
    print("\n=== Recommended Actions ===")
    for i, recommendation in enumerate(report['recommendations'], 1):
        print(f"{i}. {recommendation}")


if __name__ == "__main__":
    main()
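
# Usage notes (a minimal sketch; the absolute paths hard-coded in main() are
# assumptions about the local checkout layout and may need adjusting):
#
#     python docs-deduplication-tool.py
#
# The resulting docs-deduplication-report.json can then be inspected, for example:
#
#     import json
#     with open("/home/ben/code/huhan3000/docs-deduplication-report.json") as f:
#         report = json.load(f)
#     for path in report["migration_plan"]["files_to_remove"]:
#         print(path)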