huhan3000/unified-docs/tools/doc-indexer.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
胡汉三千年项目文档索引工具
功能:
1. 自动扫描文档目录
2. 生成文档索引
3. 更新统一索引文件
4. 检测文档变更
作者:胡汉三千年项目团队
版本1.0.0
"""
import os
import json
import hashlib
import datetime
from pathlib import Path


class DocumentIndexer:
    def __init__(self, base_path="/home/ben/code/huhan3000/unified-docs"):
        self.base_path = Path(base_path)
        self.index_file = self.base_path / "unified-index.json"
        self.categories = [
            "01-core-theory",
            "02-thematic-research",
            "03-historical-analysis",
            "04-methodology",
            "05-applications",
            "06-resources"
        ]

    def scan_documents(self):
        """Scan all document directories and collect document information."""
        documents = {}
        for category in self.categories:
            category_path = self.base_path / category
            if not category_path.exists():
                continue
            documents[category] = []
            # Scan Markdown files; README.md files are skipped as index pages
            for md_file in category_path.rglob("*.md"):
                if md_file.name == "README.md":
                    continue
                doc_info = self._get_document_info(md_file, category)
                documents[category].append(doc_info)
        return documents

    def _get_document_info(self, file_path, category):
        """Collect detailed information for a single document."""
        stat = file_path.stat()
        # Hash the file contents (used later for change detection)
        file_hash = self._calculate_file_hash(file_path)
        # Read the file contents to extract basic information
        title = file_path.stem
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Try to extract the title from the first level-1 heading
        lines = content.split('\n')
        for line in lines:
            if line.startswith('# '):
                title = line[2:].strip()
                break
        return {
            "title": title,
            "filename": file_path.name,
            "path": str(file_path.relative_to(self.base_path)),
            "category": category,
            "size": stat.st_size,
            "modified": datetime.datetime.fromtimestamp(stat.st_mtime).isoformat(),
            "hash": file_hash,
            "word_count": len(content.split())
        }
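
    # Illustrative example of a returned record (values are hypothetical):
    # {
    #     "title": "Some Document",
    #     "filename": "some-document.md",
    #     "path": "01-core-theory/some-document.md",
    #     "category": "01-core-theory",
    #     "size": 2048,
    #     "modified": "2024-01-01T00:00:00",
    #     "hash": "<md5 hex digest>",
    #     "word_count": 300
    # }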

    def _calculate_file_hash(self, file_path):
        """Compute an MD5 hash of the file contents (for change detection only)."""
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
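
    # On Python 3.11+, hashlib.file_digest computes the same digest more
    # concisely; a minimal alternative sketch (kept as a comment so the
    # script still runs on older interpreters):
    #
    #   with open(file_path, "rb") as f:
    #       return hashlib.file_digest(f, "md5").hexdigest()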

    def generate_index(self, documents):
        """Build the index data structure."""
        index_data = {
            "metadata": {
                "generated_at": datetime.datetime.now().isoformat(),
                "total_documents": sum(len(docs) for docs in documents.values()),
                "tool_version": "1.0.0"
            },
            "categories": {},
            "documents": documents,
            "statistics": self._calculate_statistics(documents)
        }
        # Per-category totals
        for category, docs in documents.items():
            index_data["categories"][category] = {
                "count": len(docs),
                "total_size": sum(doc["size"] for doc in docs),
                "last_modified": max(doc["modified"] for doc in docs) if docs else None
            }
        return index_data

    def _calculate_statistics(self, documents):
        """Compute aggregate document statistics."""
        all_docs = []
        for docs in documents.values():
            all_docs.extend(docs)
        if not all_docs:
            return {
                "total_documents": 0,
                "total_size_bytes": 0,
                "total_size_mb": 0,
                "total_words": 0,
                "average_words_per_doc": 0,
                "last_modified": None
            }
        total_size = sum(doc["size"] for doc in all_docs)
        total_words = sum(doc["word_count"] for doc in all_docs)
        return {
            "total_documents": len(all_docs),
            "total_size_bytes": total_size,
            "total_size_mb": round(total_size / (1024 * 1024), 2),
            "total_words": total_words,
            "average_words_per_doc": round(total_words / len(all_docs), 2),
            "last_modified": max(doc["modified"] for doc in all_docs)
        }

    def save_index(self, index_data):
        """Write the index to the JSON index file."""
        with open(self.index_file, 'w', encoding='utf-8') as f:
            json.dump(index_data, f, ensure_ascii=False, indent=2)

    def update_markdown_index(self, index_data):
        """Update the Markdown-format index file."""
        md_index_file = self.base_path / "unified-index.md"
        # Read the existing Markdown index, if any
        if md_index_file.exists():
            with open(md_index_file, 'r', encoding='utf-8') as f:
                content = f.read()
        else:
            content = ""
        # Generate the new index content
        new_content = self._generate_markdown_index(index_data)
        # Update the document migration status section
        updated_content = self._update_migration_status(content, new_content, index_data)
        with open(md_index_file, 'w', encoding='utf-8') as f:
            f.write(updated_content)

    def _generate_markdown_index(self, index_data):
        """Generate the Markdown-format index content."""
        lines = []
        # Summary statistics
        stats = index_data["statistics"]
        lines.append("## Document Statistics\n")
        lines.append(f"- **Total documents**: {stats['total_documents']}")
        lines.append(f"- **Total size**: {stats['total_size_mb']} MB")
        lines.append(f"- **Total words**: {stats['total_words']:,}")
        lines.append(f"- **Average words per document**: {stats['average_words_per_doc']}")
        lines.append(f"- **Last updated**: {stats['last_modified']}\n")
        # List documents by category
        for category, docs in index_data["documents"].items():
            if docs:
                lines.append(f"\n### {category.replace('-', ' ').title()}\n")
                for doc in sorted(docs, key=lambda x: x["title"]):
                    lines.append(f"- **{doc['title']}** - `{doc['filename']}` ")
                    lines.append(f"  - Size: {round(doc['size']/1024, 1)} KB")
                    lines.append(f"  - Words: {doc['word_count']}")
                    lines.append(f"  - Modified: {doc['modified'][:10]}")
        return '\n'.join(lines)
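
    # Illustrative fragment of the generated Markdown (values hypothetical):
    #
    #   ## Document Statistics
    #
    #   - **Total documents**: 42
    #   ...
    #
    #   ### 01 Core Theory
    #
    #   - **Some Document** - `some-document.md`
    #     - Size: 2.0 KB
    #     - Words: 300
    #     - Modified: 2024-01-01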

    def _update_migration_status(self, old_content, new_index_content, index_data):
        """Update the document migration status section."""
        migration_content = self._generate_migration_status(index_data)
        # Locate the migration status section
        migration_start = old_content.find("## Document Migration Status")
        if migration_start == -1:
            # Not found: insert it before the changelog, or at the end
            insert_pos = old_content.find("## Changelog")
            if insert_pos == -1:
                insert_pos = len(old_content)
            updated_content = old_content[:insert_pos] + "\n" + migration_content + "\n" + old_content[insert_pos:]
        else:
            # Replace the existing migration status section; search for
            # "\n## " so "###" subheadings inside the section are skipped
            migration_end = old_content.find("\n## ", migration_start + 1)
            if migration_end == -1:
                migration_end = len(old_content)
            updated_content = old_content[:migration_start] + migration_content + old_content[migration_end:]
        # Insert or replace the index content section
        index_start = updated_content.find("## Document Statistics")
        if index_start != -1:
            index_end = updated_content.find("\n## ", index_start + 1)
            if index_end == -1:
                index_end = len(updated_content)
            updated_content = updated_content[:index_start] + new_index_content + updated_content[index_end:]
        else:
            # No existing statistics section yet (e.g. on the first run)
            updated_content += "\n" + new_index_content + "\n"
        return updated_content

    def _generate_migration_status(self, index_data):
        """Generate the document migration status content."""
        lines = []
        lines.append("## Document Migration Status\n")
        stats = index_data["statistics"]
        total_migrated = stats["total_documents"]
        # Estimated document counts in core-docs and thematic-research
        estimated_core_docs = 399  # based on an earlier count
        estimated_thematic = 142  # based on an earlier count
        total_estimated = estimated_core_docs + estimated_thematic
        migration_percentage = (total_migrated / total_estimated * 100) if total_estimated > 0 else 0
        lines.append(f"### Migration progress: {migration_percentage:.1f}%\n")
        lines.append(f"- **Migrated documents**: {total_migrated}")
        lines.append(f"- **Estimated total documents**: {total_estimated}")
        lines.append(f"- **Remaining documents**: {total_estimated - total_migrated}\n")
        lines.append("### Migration by Category\n")
        for category, info in index_data["categories"].items():
            lines.append(f"- **{category.replace('-', ' ').title()}**: {info['count']} documents")
        return '\n'.join(lines)

    def run(self):
        """Run the indexing tool."""
        print("=== huhan3000 Document Indexer ===")
        print(f"Scanning directory: {self.base_path}")
        # Scan the documents
        print("Scanning documents...")
        documents = self.scan_documents()
        # Generate the index
        print("Generating index...")
        index_data = self.generate_index(documents)
        # Save the JSON index
        print("Saving index file...")
        self.save_index(index_data)
        # Update the Markdown index
        print("Updating Markdown index...")
        self.update_markdown_index(index_data)
        # Print summary statistics
        stats = index_data["statistics"]
        print("\n=== Indexing complete ===")
        print(f"Documents processed: {stats['total_documents']}")
        print(f"Total size: {stats['total_size_mb']} MB")
        print(f"Index file: {self.index_file}")
        print(f"Generated at: {index_data['metadata']['generated_at']}")


def main():
    """Entry point."""
    indexer = DocumentIndexer()
    indexer.run()


if __name__ == "__main__":
    main()
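
# Example: consuming the generated index from another script (a minimal
# sketch; the filename matches DocumentIndexer.index_file above):
#
#   import json
#   with open("unified-index.json", encoding="utf-8") as f:
#       index = json.load(f)
#   print(index["statistics"]["total_documents"])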