#!/usr/bin/env python3
"""
Document deduplication tool - resolves duplicate files between core-docs and thematic-research.

Features:
1. Analyze duplicate files across the two directories
2. Build a file mapping
3. Generate a deduplication report
4. Provide migration suggestions
"""

import os
import hashlib
import json
from pathlib import Path
from collections import defaultdict


class DocumentDeduplicator:
    def __init__(self, core_docs_path, thematic_research_path):
        self.core_docs_path = Path(core_docs_path)
        self.thematic_research_path = Path(thematic_research_path)
        # Maps a content hash to the list of file-info dicts sharing that hash
        self.duplicates = defaultdict(list)
        self.file_mapping = {}

    def calculate_file_hash(self, file_path):
        """Compute the MD5 hash of a file, reading it in chunks."""
        hash_md5 = hashlib.md5()
        try:
            with open(file_path, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            return hash_md5.hexdigest()
        except Exception as e:
            print(f"Error computing file hash: {file_path}, error: {e}")
            return None

    def get_file_info(self, file_path):
        """Collect path, size, modification time, and hash for a single file."""
        try:
            stat = file_path.stat()
            return {
                'path': str(file_path),
                'size': stat.st_size,
                'modified': stat.st_mtime,
                'hash': self.calculate_file_hash(file_path)
            }
        except Exception as e:
            print(f"Error reading file info: {file_path}, error: {e}")
            return None

    def scan_directory(self, directory_path):
        """Scan a directory tree for Markdown and Python files."""
        files = []
        for root, dirs, files_list in os.walk(directory_path):
            for file in files_list:
                if file.endswith('.md') or file.endswith('.py'):
                    file_path = Path(root) / file
                    file_info = self.get_file_info(file_path)
                    if file_info:
                        files.append(file_info)
        return files

    def find_duplicates(self):
        """Find files with identical content (same hash) across both directories."""
        print("Scanning core-docs directory...")
        core_files = self.scan_directory(self.core_docs_path)
        print(f"Found {len(core_files)} files")

        print("Scanning thematic-research directory...")
        thematic_files = self.scan_directory(self.thematic_research_path)
        print(f"Found {len(thematic_files)} files")

        # Group files by hash value
        hash_groups = defaultdict(list)
        for file_info in core_files + thematic_files:
            if file_info['hash']:
                hash_groups[file_info['hash']].append(file_info)

        # Any group with more than one file is a set of duplicates
        for hash_val, files in hash_groups.items():
            if len(files) > 1:
                self.duplicates[hash_val] = files

        return len(self.duplicates)

    def analyze_content_similarity(self):
        """Analyze content similarity based on file names and paths."""
        print("Analyzing file name similarity...")

        # Collect Markdown file names (without paths) from both trees
        core_filenames = {}
        thematic_filenames = {}

        for root, dirs, files in os.walk(self.core_docs_path):
            for file in files:
                if file.endswith('.md'):
                    core_filenames[file] = os.path.join(root, file)

        for root, dirs, files in os.walk(self.thematic_research_path):
            for file in files:
                if file.endswith('.md'):
                    thematic_filenames[file] = os.path.join(root, file)

        # File names that appear in both directories
        common_filenames = set(core_filenames.keys()) & set(thematic_filenames.keys())

        print(f"Found {len(common_filenames)} files with identical names")

        similarity_report = {
            'common_filenames': list(common_filenames),
            'core_unique': len(core_filenames) - len(common_filenames),
            'thematic_unique': len(thematic_filenames) - len(common_filenames),
            'total_files': len(core_filenames) + len(thematic_filenames)
        }

        return similarity_report

    def generate_migration_plan(self):
        """Generate a migration plan based on the duplicates found."""
        print("Generating document migration plan...")

        migration_plan = {
            'unified_structure': {
                'core-theory': [],
                'thematic-research': [],
                'historical-analysis': [],
                'methodology': [],
                'applications': [],
                'resources': []
            },
            'files_to_keep': [],
            'files_to_remove': [],
            'estimated_space_saving': 0
        }

        # For each duplicate group, decide which copy to keep
        for hash_val, files in self.duplicates.items():
            if len(files) > 1:
                # Keep the most recently modified copy
                latest_file = max(files, key=lambda x: x['modified'])
                migration_plan['files_to_keep'].append(latest_file['path'])

                # Mark the remaining copies for removal
                for file in files:
                    if file['path'] != latest_file['path']:
                        migration_plan['files_to_remove'].append(file['path'])
                        migration_plan['estimated_space_saving'] += file['size']

        return migration_plan

    def generate_report(self):
        """Generate the full deduplication analysis report."""
        print("Generating deduplication analysis report...")

        # Find duplicate files
        duplicate_count = self.find_duplicates()

        # Analyze content similarity
        similarity_report = self.analyze_content_similarity()

        # Generate the migration plan
        migration_plan = self.generate_migration_plan()

        report = {
            'summary': {
                'total_duplicates_found': duplicate_count,
                'files_with_common_names': similarity_report['common_filenames'],
                'core_unique_files': similarity_report['core_unique'],
                'thematic_unique_files': similarity_report['thematic_unique'],
                'total_files_analyzed': similarity_report['total_files']
            },
            'duplicates_details': dict(self.duplicates),
            'migration_plan': migration_plan,
            'recommendations': [
                "Establish a unified document directory structure",
                "Adopt a document version control system",
                "Develop automated document indexing tools",
                "Set up a document lifecycle management process"
            ]
        }

        return report


def main():
    """Main entry point."""
    core_docs_path = "/home/ben/code/huhan3000/core-docs"
    thematic_research_path = "/home/ben/code/huhan3000/thematic-research"

    print("=== huhan3000 Project Document Deduplication Analysis Tool ===")
    print(f"Core Docs path: {core_docs_path}")
    print(f"Thematic Research path: {thematic_research_path}")
    print()

    deduplicator = DocumentDeduplicator(core_docs_path, thematic_research_path)

    # Generate the report
    report = deduplicator.generate_report()

    # Save the report
    report_file = "/home/ben/code/huhan3000/docs-deduplication-report.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    print(f"Report saved to: {report_file}")

    # Print a summary
    print("\n=== Analysis Summary ===")
    print(f"Duplicate file groups found: {report['summary']['total_duplicates_found']}")
    print(f"Files with identical names: {len(report['summary']['files_with_common_names'])}")
    print(f"Files unique to Core Docs: {report['summary']['core_unique_files']}")
    print(f"Files unique to Thematic Research: {report['summary']['thematic_unique_files']}")
    print(f"Estimated space saving: {report['migration_plan']['estimated_space_saving'] / (1024 * 1024):.2f} MB")

    print("\n=== Recommended Actions ===")
    for i, recommendation in enumerate(report['recommendations'], 1):
        print(f"{i}. {recommendation}")


if __name__ == "__main__":
    main()