#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Document search tool for the Huhan3000 (胡汉三千年) project.

Features:
1. Full-text search over document contents
2. Keyword lookup
3. Filtering by category
4. Fuzzy search

Author: Huhan3000 project team
Version: 1.0.0
"""

import os
import json
import re
import sys
from pathlib import Path


class DocumentSearcher:
    """Search the documents listed in ``<base_path>/unified-index.json``.

    The index is read once at construction time. This class uses its
    ``documents`` entry (a mapping of category name to a list of document
    records) and the optional ``categories`` and ``statistics`` entries.
    Document records are plain dicts; the fields read here are ``path``,
    ``title``, ``filename``, ``category``, ``size`` and ``modified``.
    """

    def __init__(self, base_path="/home/ben/code/huhan3000/unified-docs"):
        """Remember the document root and load the index stored beneath it."""
        self.base_path = Path(base_path)
        self.index_file = self.base_path / "unified-index.json"
        self.index_data = self._load_index()

    def _load_index(self):
        """Load the index file.

        Returns:
            The parsed index dict, or ``{"documents": {}}`` (after printing
            a warning) when the file is missing, unreadable, not valid JSON,
            or does not contain a JSON object.
        """
        empty_index = {"documents": {}}
        if not self.index_file.exists():
            print("警告:索引文件不存在,请先运行文档索引工具")
            return empty_index

        try:
            with open(self.index_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"警告:无法读取索引文件 {self.index_file}: {e}")
            return empty_index

        if not isinstance(data, dict):
            # Every accessor below calls ``.get`` on the index.
            print(f"警告:索引文件格式无效 {self.index_file}")
            return empty_index
        return data

    def _iter_documents(self, category=None):
        """Yield document records, optionally restricted to one category.

        A falsy ``category`` (``None`` or ``""``) means "all categories".
        """
        for cat, docs in self.index_data.get("documents", {}).items():
            if category and cat != category:
                continue
            yield from docs

    def search_by_keyword(self, keyword, category=None, case_sensitive=False):
        """Search document contents for a literal keyword.

        Args:
            keyword: Text to look for; it is matched literally, not as a
                regular expression. An empty keyword yields no results.
            category: If given, only documents of this category are searched.
            case_sensitive: Match case exactly when true.

        Returns:
            A list of dicts with keys ``document``, ``matches`` and
            ``match_count``, sorted by ``match_count`` in descending order.
            Documents whose file does not exist are skipped.
        """
        if not keyword:
            # An empty pattern would match at every character position.
            return []

        results = []
        for doc in self._iter_documents(category):
            file_path = self.base_path / doc["path"]
            if not file_path.exists():
                continue

            matches = self._search_in_file(file_path, keyword, case_sensitive)
            if matches:
                results.append({
                    "document": doc,
                    "matches": matches,
                    "match_count": len(matches),
                })

        results.sort(key=lambda r: r["match_count"], reverse=True)
        return results

    def _search_in_file(self, file_path, keyword, case_sensitive):
        """Find every occurrence of ``keyword`` in one UTF-8 text file.

        Returns:
            A list of dicts with keys ``start_line`` and ``end_line``
            (1-based), ``match_text`` and ``context``. The list is empty
            when the keyword is empty or the file cannot be read or decoded
            (a message is printed in the latter case).
        """
        if not keyword:
            return []

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"搜索文件时出错 {file_path}: {e}")
            return []

        flags = 0 if case_sensitive else re.IGNORECASE
        pattern = re.compile(re.escape(keyword), flags)
        lines = content.split('\n')

        matches = []
        # finditer yields matches left to right without overlap, so line
        # numbers can be advanced incrementally instead of recounting the
        # newlines from the start of the file for every match.
        line_no = 1
        scanned = 0
        for match in pattern.finditer(content):
            start_line = line_no + content.count('\n', scanned, match.start())
            end_line = start_line + content.count(
                '\n', match.start(), match.end()
            )
            line_no, scanned = end_line, match.end()

            # Line numbers are 1-based while the slice is 0-based, so the
            # window holds up to two lines before the match and up to three
            # lines after it.
            context_start = max(0, start_line - 3)
            context_end = min(len(lines), end_line + 3)

            matches.append({
                "start_line": start_line,
                "end_line": end_line,
                "match_text": match.group(),
                "context": '\n'.join(lines[context_start:context_end]),
            })

        return matches

    def search_by_title(self, title_pattern, category=None):
        """Search document titles with a case-insensitive pattern.

        Args:
            title_pattern: A regular expression. If it is not a valid
                expression it is matched as literal text instead of raising.
            category: If given, only documents of this category are checked.

        Returns:
            A list of dicts with keys ``document``, ``match_type`` (always
            ``"title"``) and ``match_score``, sorted by ``match_score`` in
            descending order.
        """
        try:
            regex = re.compile(title_pattern, re.IGNORECASE)
        except re.error:
            # Plain text such as "(draft" is not a valid regex.
            regex = re.compile(re.escape(title_pattern), re.IGNORECASE)

        results = [
            {
                "document": doc,
                "match_type": "title",
                "match_score": self._calculate_match_score(
                    title_pattern, doc["title"]
                ),
            }
            for doc in self._iter_documents(category)
            if regex.search(doc["title"])
        ]

        results.sort(key=lambda r: r["match_score"], reverse=True)
        return results

    def _calculate_match_score(self, pattern, text):
        """Score how well ``pattern`` matches ``text``, from 0.0 to 1.0.

        Returns 1.0 when the pattern occurs in the text as a
        case-insensitive substring; otherwise the fraction of the pattern's
        whitespace-separated words that also appear as words in the text.
        """
        pattern_lower = pattern.lower()
        text_lower = text.lower()
        if pattern_lower in text_lower:
            return 1.0

        pattern_words = set(pattern_lower.split())
        shared_words = pattern_words & set(text_lower.split())
        if shared_words:
            return len(shared_words) / len(pattern_words)
        return 0.0

    def list_documents(self, category=None, sort_by="title"):
        """List document records, optionally filtered and sorted.

        Args:
            category: If given, only documents of this category are listed.
            sort_by: ``"title"`` (ascending), ``"modified"`` (descending) or
                ``"size"`` (descending). Any other value leaves the
                documents in index order.
        """
        documents = list(self._iter_documents(category))

        if sort_by == "title":
            documents.sort(key=lambda d: d["title"])
        elif sort_by == "modified":
            documents.sort(key=lambda d: d["modified"], reverse=True)
        elif sort_by == "size":
            documents.sort(key=lambda d: d["size"], reverse=True)

        return documents

    def get_category_stats(self):
        """Return the index's ``categories`` entry, or ``{}`` if absent."""
        return self.index_data.get("categories", {})

    def get_overall_stats(self):
        """Return the index's ``statistics`` entry, or ``{}`` if absent."""
        return self.index_data.get("statistics", {})

    def print_search_results(self, results, max_results=10):
        """Print a summary of search results to standard output.

        Args:
            results: Result list from ``search_by_keyword`` or
                ``search_by_title``.
            max_results: Maximum number of entries to print; the header
                line still reports the total number of results.
        """
        if not results:
            print("未找到匹配的文档")
            return

        print(f"找到 {len(results)} 个匹配结果:")
        print("-" * 80)

        for rank, result in enumerate(results[:max_results], start=1):
            doc = result["document"]
            print(f"{rank}. {doc['title']}")
            print(f" 文件: {doc['filename']}")
            print(f" 类别: {doc['category']}")
            print(f" 大小: {round(doc['size']/1024, 1)} KB")
            print(f" 修改: {doc['modified'][:10]}")

            # Only content-search results carry match details.
            if "matches" in result:
                print(f" 匹配数: {result['match_count']}")
                if result["match_count"] > 0:
                    match = result["matches"][0]
                    print(f" 示例匹配: 第{match['start_line']}行 - {match['match_text'][:50]}...")
            print()

    def _execute_search(self, options):
        """Run the search described by a ``_parse_search_options`` dict."""
        if options["search_type"] == "content":
            return self.search_by_keyword(
                options["keyword"],
                options["category"],
                options["case_sensitive"],
            )
        return self.search_by_title(options["keyword"], options["category"])

    def interactive_search(self):
        """Run the interactive prompt until the user quits.

        ``quit`` exits, ``help`` prints the query syntax, and an empty line
        is ignored. Ctrl-C and end-of-input both end the session; any other
        error is reported and the prompt continues.
        """
        print("=== 胡汉三千年项目文档搜索工具 ===")
        print("输入 'quit' 退出搜索")
        print("输入 'help' 查看帮助")
        print("-" * 50)

        while True:
            try:
                query = input("\n搜索关键词: ").strip()
                command = query.lower()

                if command == 'quit':
                    break
                if command == 'help':
                    self._print_help()
                    continue
                if not query:
                    continue

                options = self._parse_search_options(query)
                results = self._execute_search(options)
                self.print_search_results(results, options["max_results"])

            except (KeyboardInterrupt, EOFError):
                # EOFError must be handled here: once stdin is exhausted
                # input() raises it on every call, so treating it as an
                # ordinary error would loop forever.
                print("\n搜索已取消")
                break
            except Exception as e:
                print(f"搜索出错: {e}")

    def _parse_search_options(self, query):
        """Parse a query string into a search-options dict.

        Supported forms:
            ``keyword``              content search
            ``title:pattern``        title search
            ``cat:name keyword``     content search within one category

        A ``cat:`` query without a keyword after it is treated as a plain
        content search for the whole query text.

        Returns:
            A dict with keys ``search_type`` (``"content"`` or ``"title"``),
            ``keyword``, ``category``, ``case_sensitive`` and
            ``max_results``.
        """
        options = {
            "search_type": "content",
            "keyword": query,
            "category": None,
            "case_sensitive": False,
            "max_results": 10,
        }

        if query.startswith("title:"):
            options["search_type"] = "title"
            options["keyword"] = query[6:].strip()
        elif query.startswith("cat:"):
            # Split on any run of whitespace so that extra spaces after the
            # category do not end up inside the keyword.
            parts = query.split(None, 1)
            if len(parts) == 2:
                options["category"] = parts[0][4:]
                options["keyword"] = parts[1].strip()

        return options

    def _print_help(self):
        """Print the query syntax, the indexed categories and examples."""
        print("\n搜索语法:")
        print(" 普通搜索: 关键词")
        print(" 标题搜索: title:关键词")
        print(" 类别搜索: cat:类别名 关键词")
        print("\n可用类别:")

        for category, info in self.get_category_stats().items():
            print(f" {category}: {info.get('count', 0)} 个文档")

        print("\n示例:")
        print(" 搜索音韵相关内容: 音韵")
        print(" 搜索标题包含'蒙古'的文档: title:蒙古")
        print(" 在核心理论中搜索'方法论': cat:01-core-theory 方法论")


def main():
    """Entry point.

    With command-line arguments, they are joined with spaces into a single
    query that is run once; without arguments the interactive prompt starts.
    """
    searcher = DocumentSearcher()

    if len(sys.argv) > 1:
        # Command-line mode
        query = " ".join(sys.argv[1:])
        options = searcher._parse_search_options(query)
        results = searcher._execute_search(options)
        searcher.print_search_results(results, options["max_results"])
    else:
        # Interactive mode
        searcher.interactive_search()


if __name__ == "__main__":
    main()