#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Document indexing tool for the 胡汉三千年 (Huhan3000) project.

Features:
1. Automatically scan the document directories
2. Generate a document index
3. Update the unified index file
4. Detect document changes

Author: Huhan3000 project team
Version: 1.0.0
"""

import json
import hashlib
import datetime
from pathlib import Path


class DocumentIndexer:
    def __init__(self, base_path="/home/ben/code/huhan3000/unified-docs"):
        self.base_path = Path(base_path)
        self.index_file = self.base_path / "unified-index.json"
        self.categories = [
            "01-core-theory",
            "02-thematic-research",
            "03-historical-analysis",
            "04-methodology",
            "05-applications",
            "06-resources"
        ]

    def scan_documents(self):
        """Scan every document directory and collect document information."""
        documents = {}
        for category in self.categories:
            category_path = self.base_path / category
            if not category_path.exists():
                continue

            documents[category] = []

            # Scan Markdown files recursively, skipping per-directory READMEs
            for md_file in category_path.rglob("*.md"):
                if md_file.name == "README.md":
                    continue
                doc_info = self._get_document_info(md_file, category)
                documents[category].append(doc_info)

        return documents

    def _get_document_info(self, file_path, category):
        """Collect detailed information about a single document."""
        stat = file_path.stat()

        # Hash the file contents (used for change detection)
        file_hash = self._calculate_file_hash(file_path)

        # Read the file to extract basic information
        title = file_path.stem
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Prefer the first level-1 heading as the title, if one exists
        for line in content.split('\n'):
            if line.startswith('# '):
                title = line[2:].strip()
                break

        return {
            "title": title,
            "filename": file_path.name,
            "path": str(file_path.relative_to(self.base_path)),
            "category": category,
            "size": stat.st_size,
            "modified": datetime.datetime.fromtimestamp(stat.st_mtime).isoformat(),
            "hash": file_hash,
            # Note: str.split() counts whitespace-separated tokens, so this
            # undercounts Chinese prose, which is rarely space-delimited.
            "word_count": len(content.split())
        }

    def _calculate_file_hash(self, file_path):
        """Compute an MD5 hash of the file contents. MD5 is acceptable here
        because the hash only detects changes; it is not used for security."""
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def generate_index(self, documents):
        """Build the index data structure."""
        index_data = {
            "metadata": {
                "generated_at": datetime.datetime.now().isoformat(),
                "total_documents": sum(len(docs) for docs in documents.values()),
                "tool_version": "1.0.0"
            },
            "categories": {},
            "documents": documents,
            "statistics": self._calculate_statistics(documents)
        }

        # Per-category summary; ISO-8601 timestamps compare correctly as strings
        for category, docs in documents.items():
            index_data["categories"][category] = {
                "count": len(docs),
                "total_size": sum(doc["size"] for doc in docs),
                "last_modified": max(doc["modified"] for doc in docs) if docs else None
            }

        return index_data

    def _calculate_statistics(self, documents):
        """Compute corpus-wide document statistics."""
        all_docs = []
        for docs in documents.values():
            all_docs.extend(docs)

        if not all_docs:
            return {
                "total_documents": 0,
                "total_size_bytes": 0,
                "total_size_mb": 0,
                "total_words": 0,
                "average_words_per_doc": 0,
                "last_modified": None
            }

        total_size = sum(doc["size"] for doc in all_docs)
        total_words = sum(doc["word_count"] for doc in all_docs)

        return {
            "total_documents": len(all_docs),
            "total_size_bytes": total_size,
            "total_size_mb": round(total_size / (1024 * 1024), 2),
            "total_words": total_words,
            "average_words_per_doc": round(total_words / len(all_docs), 2),
            "last_modified": max(doc["modified"] for doc in all_docs)
        }

    def save_index(self, index_data):
        """Write the index to the JSON index file."""
        with open(self.index_file, 'w', encoding='utf-8') as f:
            json.dump(index_data, f, ensure_ascii=False, indent=2)
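    # The module docstring lists "detect document changes" as a feature and
    # every document entry carries a content hash, but the original flow never
    # compares two runs. The method below is a minimal sketch of how such a
    # comparison could work against the previously saved unified-index.json;
    # the name `detect_changes` and its return shape are assumptions made for
    # illustration, not part of the original tool.
    def detect_changes(self, new_documents):
        """Compare a fresh scan against the last saved index (sketch)."""
        if not self.index_file.exists():
            # No previous index: everything is effectively new
            return {"added": sorted(doc["path"]
                                    for docs in new_documents.values()
                                    for doc in docs),
                    "modified": [], "removed": []}

        with open(self.index_file, 'r', encoding='utf-8') as f:
            old_index = json.load(f)

        # Map relative path -> content hash for the previous and current scans
        old_hashes = {doc["path"]: doc["hash"]
                      for docs in old_index.get("documents", {}).values()
                      for doc in docs}
        new_hashes = {doc["path"]: doc["hash"]
                      for docs in new_documents.values()
                      for doc in docs}

        return {
            "added": sorted(set(new_hashes) - set(old_hashes)),
            "modified": sorted(path for path in new_hashes
                               if path in old_hashes
                               and new_hashes[path] != old_hashes[path]),
            "removed": sorted(set(old_hashes) - set(new_hashes)),
        }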
    def update_markdown_index(self, index_data):
        """Update the Markdown version of the index."""
        md_index_file = self.base_path / "unified-index.md"

        # Read the existing Markdown index, if any
        if md_index_file.exists():
            with open(md_index_file, 'r', encoding='utf-8') as f:
                content = f.read()
        else:
            content = ""

        # Generate the new index body
        new_content = self._generate_markdown_index(index_data)

        # Update the migration-status and statistics sections in place
        updated_content = self._update_migration_status(content, new_content, index_data)

        with open(md_index_file, 'w', encoding='utf-8') as f:
            f.write(updated_content)

    def _generate_markdown_index(self, index_data):
        """Render the index as Markdown.

        The generated headings and labels are kept in Chinese so they keep
        matching the section markers already present in unified-index.md.
        """
        lines = []

        # Statistics
        stats = index_data["statistics"]
        lines.append("## 文档统计信息\n")
        lines.append(f"- **总文档数**: {stats['total_documents']}")
        lines.append(f"- **总大小**: {stats['total_size_mb']} MB")
        lines.append(f"- **总字数**: {stats['total_words']:,}")
        lines.append(f"- **平均每文档字数**: {stats['average_words_per_doc']}")
        lines.append(f"- **最后更新时间**: {stats['last_modified']}\n")

        # List documents by category
        for category, docs in index_data["documents"].items():
            if docs:
                lines.append(f"\n### {category.replace('-', ' ').title()}\n")
                for doc in sorted(docs, key=lambda x: x["title"]):
                    lines.append(f"- **{doc['title']}** - `{doc['filename']}`")
                    lines.append(f"  - 大小: {round(doc['size']/1024, 1)} KB")
                    lines.append(f"  - 字数: {doc['word_count']}")
                    lines.append(f"  - 修改: {doc['modified'][:10]}")

        return '\n'.join(lines)

    def _update_migration_status(self, old_content, new_index_content, index_data):
        """Replace (or insert) the migration-status and statistics sections."""
        # Locate the migration-status section
        migration_start = old_content.find("## 文档迁移状态")
        if migration_start == -1:
            # Not found: insert before the changelog section if there is one,
            # otherwise append at the end
            insert_pos = old_content.find("## 更新日志")
            if insert_pos == -1:
                insert_pos = len(old_content)
            migration_content = self._generate_migration_status(index_data)
            updated_content = (old_content[:insert_pos] + "\n" + migration_content
                               + "\n" + old_content[insert_pos:])
        else:
            # Replace the existing migration-status section. Search for
            # "\n## " (not bare "##") so the section's own "###" subheadings
            # do not terminate it early.
            migration_end = old_content.find("\n## ", migration_start + 1)
            if migration_end == -1:
                migration_end = len(old_content)
            migration_content = self._generate_migration_status(index_data)
            updated_content = (old_content[:migration_start] + migration_content
                               + old_content[migration_end:])

        # Update the statistics/index section
        index_start = updated_content.find("## 文档统计信息")
        if index_start != -1:
            index_end = updated_content.find("\n## ", index_start + 1)
            if index_end == -1:
                index_end = len(updated_content)
            updated_content = (updated_content[:index_start] + new_index_content
                               + updated_content[index_end:])
        else:
            # Fresh file: the section does not exist yet, so append it
            updated_content = updated_content + "\n" + new_index_content + "\n"

        return updated_content

    def _generate_migration_status(self, index_data):
        """Render the migration-status section."""
        lines = []
        lines.append("## 文档迁移状态\n")

        stats = index_data["statistics"]
        total_migrated = stats["total_documents"]

        # Estimated document counts in core-docs and thematic-research,
        # based on an earlier manual tally
        estimated_core_docs = 399
        estimated_thematic = 142
        total_estimated = estimated_core_docs + estimated_thematic

        migration_percentage = (total_migrated / total_estimated * 100) if total_estimated > 0 else 0

        lines.append(f"### 迁移进度: {migration_percentage:.1f}%\n")
        lines.append(f"- **已迁移文档**: {total_migrated}")
        lines.append(f"- **预计总文档**: {total_estimated}")
        lines.append(f"- **剩余文档**: {total_estimated - total_migrated}\n")

        lines.append("### 按类别迁移情况\n")
        for category, info in index_data["categories"].items():
            lines.append(f"- **{category.replace('-', ' ').title()}**: {info['count']} 个文档")

        return '\n'.join(lines)
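    # The two splice operations in _update_migration_status repeat the same
    # pattern: find a "## " marker, find the next top-level heading, and swap
    # the slice in between. A generic helper along the lines below could
    # factor that out; the name `_replace_section` is an editorial sketch,
    # not something the original tool defines (it inlines the logic above).
    @staticmethod
    def _replace_section(content, marker, new_section):
        """Replace the section that starts at `marker` and runs to the next
        top-level "## " heading (sketch)."""
        start = content.find(marker)
        if start == -1:
            # Marker absent: append the section at the end instead
            return content + "\n" + new_section + "\n"
        end = content.find("\n## ", start + 1)
        if end == -1:
            end = len(content)
        return content[:start] + new_section + content[end:]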
    def run(self):
        """Run the full indexing pipeline."""
        print("=== Huhan3000 project document indexing tool ===")
        print(f"Scanning directory: {self.base_path}")

        # Scan documents
        print("Scanning documents...")
        documents = self.scan_documents()

        # Generate the index
        print("Generating index...")
        index_data = self.generate_index(documents)

        # Save the JSON index
        print("Saving index file...")
        self.save_index(index_data)

        # Update the Markdown index
        print("Updating Markdown index...")
        self.update_markdown_index(index_data)

        # Report summary statistics
        stats = index_data["statistics"]
        print("\n=== Indexing complete ===")
        print(f"Documents processed: {stats['total_documents']}")
        print(f"Total size: {stats['total_size_mb']} MB")
        print(f"Index file: {self.index_file}")
        print(f"Generated at: {index_data['metadata']['generated_at']}")


def main():
    """Entry point."""
    indexer = DocumentIndexer()
    indexer.run()


if __name__ == "__main__":
    main()
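# Usage sketch: the indexer can also be pointed at a different documentation
# tree by passing base_path explicitly (the path below is illustrative only):
#
#     indexer = DocumentIndexer(base_path="/path/to/unified-docs")
#     documents = indexer.scan_documents()
#     index_data = indexer.generate_index(documents)
#     indexer.save_index(index_data)
#     indexer.update_markdown_index(index_data)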