huhan3000/docs-deduplication-tool.py

#!/usr/bin/env python3
"""
文档去重工具 - 解决core-docs和thematic-research之间的重复文件问题
功能:
1. 分析两个目录中的重复文件
2. 建立文件映射关系
3. 生成去重报告
4. 提供迁移建议
"""
import os
import hashlib
import json
from pathlib import Path
from collections import defaultdict
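
# Processing flow (mirrored by generate_report() below): find_duplicates() groups
# files by MD5 content hash, analyze_content_similarity() compares bare filenames
# across the two trees, and generate_migration_plan() keeps the most recently
# modified copy from each duplicate group while marking the rest for removal.
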
class DocumentDeduplicator:
    def __init__(self, core_docs_path, thematic_research_path):
        self.core_docs_path = Path(core_docs_path)
        self.thematic_research_path = Path(thematic_research_path)
        self.duplicates = defaultdict(list)
        self.file_mapping = {}

    def calculate_file_hash(self, file_path):
        """Compute the MD5 hash of a file (a content fingerprint for duplicate detection, not a security hash)."""
        hash_md5 = hashlib.md5()
        try:
            with open(file_path, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            return hash_md5.hexdigest()
        except Exception as e:
            print(f"Error hashing file: {file_path}, error: {e}")
            return None

    def get_file_info(self, file_path):
        """Collect path, size, modification time, and content hash for a file."""
        try:
            stat = file_path.stat()
            return {
                'path': str(file_path),
                'size': stat.st_size,
                'modified': stat.st_mtime,
                'hash': self.calculate_file_hash(file_path)
            }
        except Exception as e:
            print(f"Error reading file info: {file_path}, error: {e}")
            return None

    def scan_directory(self, directory_path):
        """Scan a directory tree for Markdown and Python files."""
        files = []
        for root, dirs, files_list in os.walk(directory_path):
            for file in files_list:
                if file.endswith('.md') or file.endswith('.py'):
                    file_path = Path(root) / file
                    file_info = self.get_file_info(file_path)
                    if file_info:
                        files.append(file_info)
        return files
    def find_duplicates(self):
        """Find files duplicated across the two directories."""
        print("Scanning core-docs directory...")
        core_files = self.scan_directory(self.core_docs_path)
        print(f"Found {len(core_files)} files")
        print("Scanning thematic-research directory...")
        thematic_files = self.scan_directory(self.thematic_research_path)
        print(f"Found {len(thematic_files)} files")
        # Group files by content hash
        hash_groups = defaultdict(list)
        for file_info in core_files + thematic_files:
            if file_info['hash']:
                hash_groups[file_info['hash']].append(file_info)
        # Any hash shared by more than one file marks a duplicate group
        for hash_val, files in hash_groups.items():
            if len(files) > 1:
                self.duplicates[hash_val] = files
        return len(self.duplicates)

    def analyze_content_similarity(self):
        """Analyze content similarity based on file names and paths."""
        print("Analyzing filename similarity...")
        # Collect file names (without paths) from both trees
        core_filenames = {}
        thematic_filenames = {}
        for root, dirs, files in os.walk(self.core_docs_path):
            for file in files:
                if file.endswith('.md'):
                    core_filenames[file] = os.path.join(root, file)
        for root, dirs, files in os.walk(self.thematic_research_path):
            for file in files:
                if file.endswith('.md'):
                    thematic_filenames[file] = os.path.join(root, file)
        # File names that appear in both trees
        common_filenames = set(core_filenames.keys()) & set(thematic_filenames.keys())
        print(f"Found {len(common_filenames)} files with identical names")
        similarity_report = {
            'common_filenames': list(common_filenames),
            'core_unique': len(core_filenames) - len(common_filenames),
            'thematic_unique': len(thematic_filenames) - len(common_filenames),
            'total_files': len(core_filenames) + len(thematic_filenames)
        }
        return similarity_report
    def generate_migration_plan(self):
        """Generate a migration plan."""
        print("Generating document migration plan...")
        migration_plan = {
            'unified_structure': {
                'core-theory': [],
                'thematic-research': [],
                'historical-analysis': [],
                'methodology': [],
                'applications': [],
                'resources': []
            },
            'files_to_keep': [],
            'files_to_remove': [],
            'estimated_space_saving': 0
        }
        # For each duplicate group, decide which copy to keep
        for hash_val, files in self.duplicates.items():
            if len(files) > 1:
                # Keep the most recently modified copy
                latest_file = max(files, key=lambda x: x['modified'])
                migration_plan['files_to_keep'].append(latest_file['path'])
                # Mark the remaining copies for removal
                for file in files:
                    if file['path'] != latest_file['path']:
                        migration_plan['files_to_remove'].append(file['path'])
                        migration_plan['estimated_space_saving'] += file['size']
        return migration_plan

    def generate_report(self):
        """Generate the full deduplication report."""
        print("Generating deduplication analysis report...")
        # Find duplicate files
        duplicate_count = self.find_duplicates()
        # Analyze filename similarity
        similarity_report = self.analyze_content_similarity()
        # Build the migration plan
        migration_plan = self.generate_migration_plan()
        report = {
            'summary': {
                'total_duplicates_found': duplicate_count,
                'files_with_common_names': similarity_report['common_filenames'],
                'core_unique_files': similarity_report['core_unique'],
                'thematic_unique_files': similarity_report['thematic_unique'],
                'total_files_analyzed': similarity_report['total_files']
            },
            'duplicates_details': dict(self.duplicates),
            'migration_plan': migration_plan,
            'recommendations': [
                "Establish a unified document directory structure",
                "Adopt version control for documents",
                "Develop an automated document indexing tool",
                "Set up a document lifecycle management process"
            ]
        }
        return report
def main():
    """Entry point."""
    core_docs_path = "/home/ben/code/huhan3000/core-docs"
    thematic_research_path = "/home/ben/code/huhan3000/thematic-research"
    print("=== huhan3000 Project Document Deduplication Analysis Tool ===")
    print(f"Core Docs path: {core_docs_path}")
    print(f"Thematic Research path: {thematic_research_path}")
    print()
    deduplicator = DocumentDeduplicator(core_docs_path, thematic_research_path)
    # Generate the report
    report = deduplicator.generate_report()
    # Save the report as JSON
    report_file = "/home/ben/code/huhan3000/docs-deduplication-report.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"Report saved to: {report_file}")
    # Print a summary
    print("\n=== Analysis Summary ===")
    print(f"Duplicate file groups found: {report['summary']['total_duplicates_found']}")
    print(f"Files with identical names: {len(report['summary']['files_with_common_names'])}")
    print(f"Files unique to Core Docs: {report['summary']['core_unique_files']}")
    print(f"Files unique to Thematic Research: {report['summary']['thematic_unique_files']}")
    print(f"Estimated space saving: {report['migration_plan']['estimated_space_saving'] / (1024 * 1024):.2f} MB")
    print("\n=== Recommended Actions ===")
    for i, recommendation in enumerate(report['recommendations'], 1):
        print(f"{i}. {recommendation}")


if __name__ == "__main__":
    main()
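
# Usage notes (a minimal sketch; the absolute paths hard-coded in main() are
# assumptions about the local checkout layout and may need adjusting):
#
#     python docs-deduplication-tool.py
#
# The resulting docs-deduplication-report.json can then be inspected, for example:
#
#     import json
#     with open("/home/ben/code/huhan3000/docs-deduplication-report.json") as f:
#         report = json.load(f)
#     for path in report["migration_plan"]["files_to_remove"]:
#         print(path)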