#!/usr/bin/env python3
"""
Document deduplication tool: resolves duplicate files between the
core-docs and thematic-research directories.

Features:
1. Analyze duplicate files across the two directories
2. Build a file-to-file mapping
3. Generate a deduplication report
4. Provide migration recommendations
"""

import os
import hashlib
import json
from pathlib import Path
from collections import defaultdict


class DocumentDeduplicator:
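    """Find byte-identical files across two documentation trees.

    Exact duplicates are detected via MD5 content hashes; overlap in bare
    filenames is reported separately as a weaker similarity signal.
    """
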
    def __init__(self, core_docs_path, thematic_research_path):
        self.core_docs_path = Path(core_docs_path)
        self.thematic_research_path = Path(thematic_research_path)
        self.duplicates = defaultdict(list)  # content hash -> list of file-info dicts
        self.file_mapping = {}

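    # MD5 is used here purely as a content fingerprint for duplicate
    # detection, not for any security purpose; chunked reads keep memory
    # usage flat even for large files.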
    def calculate_file_hash(self, file_path):
        """Compute the MD5 hash of a file."""
        hash_md5 = hashlib.md5()
        try:
            with open(file_path, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            return hash_md5.hexdigest()
        except Exception as e:
            print(f"Error computing file hash: {file_path}, error: {e}")
            return None

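    # The 'modified' field captured below is st_mtime (seconds since the
    # epoch); generate_migration_plan() later uses it to keep the newest
    # copy within each duplicate group.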
    def get_file_info(self, file_path):
        """Collect path, size, mtime, and content hash for a file."""
        try:
            stat = file_path.stat()
            return {
                'path': str(file_path),
                'size': stat.st_size,
                'modified': stat.st_mtime,
                'hash': self.calculate_file_hash(file_path)
            }
        except Exception as e:
            print(f"Error reading file info: {file_path}, error: {e}")
            return None

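    # os.walk() does not follow directory symlinks by default, so a linked
    # tree cannot cause an infinite traversal here.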
    def scan_directory(self, directory_path):
        """Scan a directory tree for Markdown (.md) and Python (.py) files."""
        results = []
        for root, _dirs, filenames in os.walk(directory_path):
            for filename in filenames:
                if filename.endswith(('.md', '.py')):
                    file_path = Path(root) / filename
                    file_info = self.get_file_info(file_path)
                    if file_info:
                        results.append(file_info)
        return results

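    # Hash grouping only catches byte-identical copies; a duplicate that was
    # edited after being copied will hash differently and is not reported.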
    def find_duplicates(self):
        """Find duplicate files and return the number of duplicate groups."""
        print("Scanning core-docs directory...")
        core_files = self.scan_directory(self.core_docs_path)
        print(f"Found {len(core_files)} files")

        print("Scanning thematic-research directory...")
        thematic_files = self.scan_directory(self.thematic_research_path)
        print(f"Found {len(thematic_files)} files")

        # Group files by content hash
        hash_groups = defaultdict(list)

        for file_info in core_files + thematic_files:
            if file_info['hash']:
                hash_groups[file_info['hash']].append(file_info)

        # Keep only the groups containing more than one file
        for hash_val, files in hash_groups.items():
            if len(files) > 1:
                self.duplicates[hash_val] = files

        return len(self.duplicates)

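    # A shared bare filename is a weak signal: two README.md files may be
    # unrelated, and identical content may live under different names, so
    # this complements (rather than replaces) the hash comparison.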
    def analyze_content_similarity(self):
        """Analyze name-level similarity between the two trees (bare filenames only)."""
        print("Analyzing filename similarity...")

        # Map each bare filename to one of its full paths
        core_filenames = {}
        thematic_filenames = {}

        for root, _dirs, files in os.walk(self.core_docs_path):
            for file in files:
                if file.endswith('.md'):
                    core_filenames[file] = os.path.join(root, file)

        for root, _dirs, files in os.walk(self.thematic_research_path):
            for file in files:
                if file.endswith('.md'):
                    thematic_filenames[file] = os.path.join(root, file)

        # Filenames that appear in both trees
        common_filenames = set(core_filenames.keys()) & set(thematic_filenames.keys())

        print(f"Found {len(common_filenames)} filenames present in both trees")

        similarity_report = {
            'common_filenames': list(common_filenames),
            'core_unique': len(core_filenames) - len(common_filenames),
            'thematic_unique': len(thematic_filenames) - len(common_filenames),
            'total_files': len(core_filenames) + len(thematic_filenames)
        }

        return similarity_report

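    # Within each duplicate group the most recently modified copy wins; on
    # tied timestamps max() keeps the first file encountered in the scan.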
    def generate_migration_plan(self):
        """Generate a migration plan from the detected duplicate groups."""
        print("Generating document migration plan...")

        migration_plan = {
            'unified_structure': {
                'core-theory': [],
                'thematic-research': [],
                'historical-analysis': [],
                'methodology': [],
                'applications': [],
                'resources': []
            },
            'files_to_keep': [],
            'files_to_remove': [],
            'estimated_space_saving': 0  # bytes
        }

        # For each duplicate group, decide which version to keep
        for files in self.duplicates.values():
            if len(files) > 1:
                # Keep the most recently modified file
                latest_file = max(files, key=lambda x: x['modified'])
                migration_plan['files_to_keep'].append(latest_file['path'])

                # Mark the remaining copies for removal
                for file in files:
                    if file['path'] != latest_file['path']:
                        migration_plan['files_to_remove'].append(file['path'])
                        migration_plan['estimated_space_saving'] += file['size']

        return migration_plan

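    # The call order below matters: find_duplicates() populates
    # self.duplicates, which generate_migration_plan() then reads.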
    def generate_report(self):
        """Generate the full deduplication analysis report."""
        print("Generating deduplication analysis report...")

        # Find duplicate files
        duplicate_count = self.find_duplicates()

        # Analyze content similarity
        similarity_report = self.analyze_content_similarity()

        # Generate the migration plan
        migration_plan = self.generate_migration_plan()

        report = {
            'summary': {
                'total_duplicates_found': duplicate_count,
                'files_with_common_names': similarity_report['common_filenames'],
                'core_unique_files': similarity_report['core_unique'],
                'thematic_unique_files': similarity_report['thematic_unique'],
                'total_files_analyzed': similarity_report['total_files']
            },
            'duplicates_details': dict(self.duplicates),
            'migration_plan': migration_plan,
            'recommendations': [
                "Establish a unified documentation directory structure",
                "Adopt version control for documents",
                "Develop an automated document indexing tool",
                "Establish a document lifecycle management process"
            ]
        }

        return report


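# The paths below are hard-coded for this checkout of the huhan3000
# repository; adjust them when running elsewhere.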
def main():
    """Entry point."""
    core_docs_path = "/home/ben/code/huhan3000/core-docs"
    thematic_research_path = "/home/ben/code/huhan3000/thematic-research"

    print("=== 胡汉三千年 (huhan3000) Document Deduplication Tool ===")
    print(f"Core Docs path: {core_docs_path}")
    print(f"Thematic Research path: {thematic_research_path}")
    print()

    deduplicator = DocumentDeduplicator(core_docs_path, thematic_research_path)

    # Generate the report
    report = deduplicator.generate_report()

    # Save the report
    report_file = "/home/ben/code/huhan3000/docs-deduplication-report.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    print(f"Report saved to: {report_file}")

    # Print a summary
    print("\n=== Analysis Summary ===")
    print(f"Duplicate file groups found: {report['summary']['total_duplicates_found']}")
    print(f"Files with matching filenames: {len(report['summary']['files_with_common_names'])}")
    print(f"Files unique to Core Docs: {report['summary']['core_unique_files']}")
    print(f"Files unique to Thematic Research: {report['summary']['thematic_unique_files']}")
    print(f"Estimated space saving: {report['migration_plan']['estimated_space_saving'] / (1024*1024):.2f} MB")

    print("\n=== Recommended Actions ===")
    for i, recommendation in enumerate(report['recommendations'], 1):
        print(f"{i}. {recommendation}")


if __name__ == "__main__":
    main()