#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Document index tool for the 胡汉三千年 (huhan3000) project.

Features:
1. Automatically scan the documentation directories
2. Generate a document index
3. Update the unified index files
4. Detect document changes

Author: the 胡汉三千年 project team
Version: 1.0.0
"""
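
# Minimal usage sketch. DocumentIndexer and run() are defined below; the
# default base path comes from __init__ and is machine-specific, so pass
# your own if it differs:
#
#   indexer = DocumentIndexer(base_path="/home/ben/code/huhan3000/unified-docs")
#   indexer.run()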
import json
import hashlib
import datetime
from pathlib import Path

class DocumentIndexer:
    def __init__(self, base_path="/home/ben/code/huhan3000/unified-docs"):
        self.base_path = Path(base_path)
        self.index_file = self.base_path / "unified-index.json"
        self.categories = [
            "01-core-theory",
            "02-thematic-research",
            "03-historical-analysis",
            "04-methodology",
            "05-applications",
            "06-resources"
        ]
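
    # The six category directories above mirror the unified-docs layout;
    # any that are missing on disk are skipped by scan_documents() below.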
    def scan_documents(self):
        """Scan every documentation category and collect per-document info."""
        documents = {}

        for category in self.categories:
            category_path = self.base_path / category
            if not category_path.exists():
                continue

            documents[category] = []

            # Scan Markdown files; README.md files are treated as
            # directory front matter and skipped
            for md_file in category_path.rglob("*.md"):
                if md_file.name == "README.md":
                    continue

                doc_info = self._get_document_info(md_file, category)
                documents[category].append(doc_info)

        return documents
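
    # scan_documents() returns a mapping of the form
    # {"01-core-theory": [<doc-info dict>, ...], ...}; categories whose
    # directories are missing are omitted from the mapping entirely.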
    def _get_document_info(self, file_path, category):
        """Collect detailed metadata for a single document."""
        stat = file_path.stat()

        # Content hash, used for change detection
        file_hash = self._calculate_file_hash(file_path)

        # Read the file; try to extract a title from the first level-1
        # Markdown heading, falling back to the file stem
        title = file_path.stem
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            lines = content.split('\n')
            for line in lines:
                if line.startswith('# '):
                    title = line[2:].strip()
                    break

        return {
            "title": title,
            "filename": file_path.name,
            "path": str(file_path.relative_to(self.base_path)),
            "category": category,
            "size": stat.st_size,
            "modified": datetime.datetime.fromtimestamp(stat.st_mtime).isoformat(),
            "hash": file_hash,
            # Whitespace-delimited token count; note this undercounts
            # CJK text, which is not space-separated
            "word_count": len(content.split())
        }
    def _calculate_file_hash(self, file_path):
        """Hash the file contents. MD5 is sufficient here: the hash is
        used only for change detection, not for security."""
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
    def generate_index(self, documents):
        """Build the full index structure from the scanned documents."""
        index_data = {
            "metadata": {
                "generated_at": datetime.datetime.now().isoformat(),
                "total_documents": sum(len(docs) for docs in documents.values()),
                "tool_version": "1.0.0"
            },
            "categories": {},
            "documents": documents,
            "statistics": self._calculate_statistics(documents)
        }

        # Per-category summary
        for category, docs in documents.items():
            index_data["categories"][category] = {
                "count": len(docs),
                "total_size": sum(doc["size"] for doc in docs),
                "last_modified": max(doc["modified"] for doc in docs) if docs else None
            }

        return index_data
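
    # For reference, the generated index has roughly this shape
    # (values illustrative, not taken from real data):
    #
    # {
    #   "metadata":   {"generated_at": "...", "total_documents": 3, "tool_version": "1.0.0"},
    #   "categories": {"01-core-theory": {"count": 3, "total_size": 12345,
    #                                     "last_modified": "2025-01-01T00:00:00"}},
    #   "documents":  {"01-core-theory": [{"title": "...", "filename": "...", ...}]},
    #   "statistics": {"total_documents": 3, "total_size_mb": 0.01, ...}
    # }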
    def _calculate_statistics(self, documents):
        """Compute aggregate statistics across all documents."""
        all_docs = []
        for docs in documents.values():
            all_docs.extend(docs)

        if not all_docs:
            return {
                "total_documents": 0,
                "total_size_bytes": 0,
                "total_size_mb": 0,
                "total_words": 0,
                "average_words_per_doc": 0,
                "last_modified": None
            }

        total_size = sum(doc["size"] for doc in all_docs)
        total_words = sum(doc["word_count"] for doc in all_docs)

        return {
            "total_documents": len(all_docs),
            "total_size_bytes": total_size,
            "total_size_mb": round(total_size / (1024 * 1024), 2),
            "total_words": total_words,
            "average_words_per_doc": round(total_words / len(all_docs), 2),
            "last_modified": max(doc["modified"] for doc in all_docs)
        }
    def save_index(self, index_data):
        """Write the index to the JSON index file."""
        with open(self.index_file, 'w', encoding='utf-8') as f:
            json.dump(index_data, f, ensure_ascii=False, indent=2)
    def update_markdown_index(self, index_data):
        """Update the Markdown-format index file in place."""
        md_index_file = self.base_path / "unified-index.md"

        # Read the existing Markdown index, if any
        if md_index_file.exists():
            with open(md_index_file, 'r', encoding='utf-8') as f:
                content = f.read()
        else:
            content = ""

        # Generate the new index body
        new_content = self._generate_markdown_index(index_data)

        # Refresh the document migration status section
        updated_content = self._update_migration_status(content, new_content, index_data)

        with open(md_index_file, 'w', encoding='utf-8') as f:
            f.write(updated_content)
    def _generate_markdown_index(self, index_data):
        """Render the index as Markdown."""
        lines = []

        # Summary statistics
        stats = index_data["statistics"]
        lines.append("## Document Statistics\n")
        lines.append(f"- **Total documents**: {stats['total_documents']}")
        lines.append(f"- **Total size**: {stats['total_size_mb']} MB")
        lines.append(f"- **Total words**: {stats['total_words']:,}")
        lines.append(f"- **Average words per document**: {stats['average_words_per_doc']}")
        lines.append(f"- **Last modified**: {stats['last_modified']}\n")

        # List documents by category
        for category, docs in index_data["documents"].items():
            if docs:
                lines.append(f"\n### {category.replace('-', ' ').title()}\n")

                for doc in sorted(docs, key=lambda x: x["title"]):
                    lines.append(f"- **{doc['title']}** - `{doc['filename']}` ")
                    lines.append(f"  - Size: {round(doc['size']/1024, 1)} KB")
                    lines.append(f"  - Words: {doc['word_count']}")
                    lines.append(f"  - Modified: {doc['modified'][:10]}")

        return '\n'.join(lines)
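
    # A sketch of the Markdown this renders (illustrative values):
    #
    #   ## Document Statistics
    #
    #   - **Total documents**: 3
    #   - **Total size**: 0.01 MB
    #   ...
    #
    #   ### 01 Core Theory
    #
    #   - **Some Title** - `some-title.md`
    #     - Size: 4.2 KB
    #     - Words: 812
    #     - Modified: 2025-01-01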
    def _update_migration_status(self, old_content, new_index_content, index_data):
        """Replace (or insert) the document migration status section."""
        # Locate the migration-status section
        migration_start = old_content.find("## Document Migration Status")
        if migration_start == -1:
            # Not found: insert before the changelog, or append at the end
            insert_pos = old_content.find("## Changelog")
            if insert_pos == -1:
                insert_pos = len(old_content)

            migration_content = self._generate_migration_status(index_data)
            updated_content = old_content[:insert_pos] + "\n" + migration_content + "\n" + old_content[insert_pos:]
        else:
            # Replace the existing section. Search for "\n## " rather than
            # bare "##" so that "###" subheadings inside the section do not
            # end it early.
            migration_end = old_content.find("\n## ", migration_start + 1)
            if migration_end == -1:
                migration_end = len(old_content)

            migration_content = self._generate_migration_status(index_data)
            updated_content = old_content[:migration_start] + migration_content + old_content[migration_end:]

        # Refresh the statistics section the same way
        index_start = updated_content.find("## Document Statistics")
        if index_start != -1:
            index_end = updated_content.find("\n## ", index_start + 1)
            if index_end == -1:
                index_end = len(updated_content)

            updated_content = updated_content[:index_start] + new_index_content + updated_content[index_end:]

        return updated_content
    def _generate_migration_status(self, index_data):
        """Render the document migration status as Markdown."""
        lines = []
        lines.append("## Document Migration Status\n")

        stats = index_data["statistics"]
        total_migrated = stats["total_documents"]

        # Estimated document counts for core-docs and thematic-research
        estimated_core_docs = 399  # based on an earlier count
        estimated_thematic = 142  # based on an earlier count
        total_estimated = estimated_core_docs + estimated_thematic

        migration_percentage = (total_migrated / total_estimated * 100) if total_estimated > 0 else 0

        lines.append(f"### Migration progress: {migration_percentage:.1f}%\n")
        lines.append(f"- **Documents migrated**: {total_migrated}")
        lines.append(f"- **Estimated total documents**: {total_estimated}")
        lines.append(f"- **Documents remaining**: {total_estimated - total_migrated}\n")

        lines.append("### Migration by Category\n")
        for category, info in index_data["categories"].items():
            lines.append(f"- **{category.replace('-', ' ').title()}**: {info['count']} documents")

        return '\n'.join(lines)
    def run(self):
        """Run the full indexing pipeline."""
        print("=== 胡汉三千年 Document Index Tool ===")
        print(f"Scanning directory: {self.base_path}")

        # Scan documents
        print("Scanning documents...")
        documents = self.scan_documents()

        # Build the index
        print("Generating index...")
        index_data = self.generate_index(documents)

        # Save the JSON index
        print("Saving index file...")
        self.save_index(index_data)

        # Update the Markdown index
        print("Updating Markdown index...")
        self.update_markdown_index(index_data)

        # Report summary statistics
        stats = index_data["statistics"]
        print("\n=== Indexing complete ===")
        print(f"Documents processed: {stats['total_documents']}")
        print(f"Total size: {stats['total_size_mb']} MB")
        print(f"Index file: {self.index_file}")
        print(f"Generated at: {index_data['metadata']['generated_at']}")
def main():
    """Entry point."""
    indexer = DocumentIndexer()
    indexer.run()


if __name__ == "__main__":
    main()