huhan3000/unified-docs/tools/search-tool.py

303 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
胡汉三千年项目文档搜索工具
功能:
1. 全文搜索文档内容
2. 按关键词检索
3. 按类别过滤
4. 支持模糊搜索
作者:胡汉三千年项目团队
版本:1.0.0
"""
import os
import json
import re
from pathlib import Path
class DocumentSearcher:
    """Search the documents listed in ``<base_path>/unified-index.json``.

    The index is read as a JSON object.  This class uses three of its keys:

    * ``"documents"`` -- maps a category name to a list of document records.
      The records are read for ``"path"`` (relative to ``base_path``) and
      ``"title"``; ``print_search_results`` additionally reads
      ``"filename"``, ``"category"``, ``"size"`` and ``"modified"``.
    * ``"categories"`` -- returned as-is by ``get_category_stats``.
    * ``"statistics"`` -- returned as-is by ``get_overall_stats``.
    """

    # Number of lines shown on each side of a match in its "context" field.
    CONTEXT_LINES = 3

    def __init__(self, base_path="/home/ben/code/huhan3000/unified-docs"):
        """Remember the documentation root and load its index.

        Args:
            base_path: Directory that contains ``unified-index.json`` and
                the indexed documents.
        """
        # NOTE(review): the default is a machine-specific absolute path.  It
        # is kept so existing callers keep working, but anyone running this
        # elsewhere has to pass ``base_path`` explicitly.
        self.base_path = Path(base_path)
        self.index_file = self.base_path / "unified-index.json"
        self.index_data = self._load_index()

    def _load_index(self):
        """Load the index file, falling back to an empty index.

        Returns:
            The parsed index, or ``{"documents": {}}`` when the file is
            missing, unreadable, not valid JSON, or not a JSON object.  A
            warning is printed in each of those cases.
        """
        empty_index = {"documents": {}}
        if not self.index_file.exists():
            print("警告:索引文件不存在,请先运行文档索引工具")
            return empty_index
        try:
            with open(self.index_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both json.JSONDecodeError and decoding
            # errors; a damaged index should not crash the constructor.
            print(f"警告:无法读取索引文件 {self.index_file}: {e}")
            return empty_index
        if not isinstance(data, dict):
            # Every consumer calls ``.get`` on the index, so anything other
            # than a JSON object is unusable.
            print(f"警告:索引文件格式不正确 {self.index_file}")
            return empty_index
        return data

    def _iter_documents(self, category=None):
        """Yield every document record, optionally limited to one category.

        A falsy ``category`` (``None`` or ``""``) means "all categories".
        """
        for cat, docs in self.index_data.get("documents", {}).items():
            if category and cat != category:
                continue
            yield from docs

    def search_by_keyword(self, keyword, category=None, case_sensitive=False):
        """Search document contents for a literal keyword.

        Args:
            keyword: Text to look for; it is matched literally, not as a
                regular expression.  An empty keyword yields no results.
            category: If given, only documents of this category are searched.
            case_sensitive: Match case exactly when true.

        Returns:
            A list of ``{"document", "matches", "match_count"}`` dicts for
            the documents with at least one match, most matches first.
            Documents whose file does not exist are skipped.
        """
        if not keyword:
            # An empty pattern would match at every character position.
            return []

        results = []
        for doc in self._iter_documents(category):
            file_path = self.base_path / doc["path"]
            if not file_path.exists():
                continue
            matches = self._search_in_file(file_path, keyword, case_sensitive)
            if matches:
                results.append({
                    "document": doc,
                    "matches": matches,
                    "match_count": len(matches),
                })
        results.sort(key=lambda item: item["match_count"], reverse=True)
        return results

    def _search_in_file(self, file_path, keyword, case_sensitive):
        """Find every literal occurrence of ``keyword`` in one file.

        Args:
            file_path: File to read (decoded as UTF-8).
            keyword: Literal text to find.
            case_sensitive: Match case exactly when true.

        Returns:
            A list of dicts with the 1-based ``"start_line"`` and
            ``"end_line"`` of the match, the ``"match_text"`` and a
            ``"context"`` string holding up to ``CONTEXT_LINES`` lines on
            each side of the match.  Returns ``[]`` (after printing a
            message) when the file cannot be read or decoded.
        """
        if not keyword:
            return []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"搜索文件时出错 {file_path}: {e}")
            return []

        flags = 0 if case_sensitive else re.IGNORECASE
        pattern = re.compile(re.escape(keyword), flags)
        lines = content.split('\n')  # split once, not once per match

        matches = []
        # finditer yields matches in increasing position, so the line number
        # is advanced incrementally instead of recounting from offset 0.
        start_line = 1
        cursor = 0
        for match in pattern.finditer(content):
            start_line += content.count('\n', cursor, match.start())
            cursor = match.start()
            end_line = start_line + content.count('\n', match.start(), match.end())

            # Line numbers are 1-based while ``lines`` is 0-based: the match
            # starts at index ``start_line - 1`` and the slice end is
            # exclusive, so this is CONTEXT_LINES lines on both sides.
            context_start = max(0, start_line - 1 - self.CONTEXT_LINES)
            context_end = min(len(lines), end_line + self.CONTEXT_LINES)
            matches.append({
                "start_line": start_line,
                "end_line": end_line,
                "match_text": match.group(),
                "context": '\n'.join(lines[context_start:context_end]),
            })
        return matches

    def search_by_title(self, title_pattern, category=None):
        """Search document titles with a case-insensitive pattern.

        Args:
            title_pattern: Regular expression searched for in each title.
                If it is not a valid regular expression (for example
                ``"C++"``) it is matched as literal text instead.
            category: If given, only documents of this category are checked.

        Returns:
            A list of ``{"document", "match_type", "match_score"}`` dicts,
            highest score first.
        """
        try:
            regex = re.compile(title_pattern, re.IGNORECASE)
        except re.error:
            regex = re.compile(re.escape(title_pattern), re.IGNORECASE)

        results = [
            {
                "document": doc,
                "match_type": "title",
                "match_score": self._calculate_match_score(title_pattern, doc["title"]),
            }
            for doc in self._iter_documents(category)
            if regex.search(doc["title"])
        ]
        results.sort(key=lambda item: item["match_score"], reverse=True)
        return results

    def _calculate_match_score(self, pattern, text):
        """Score how well ``pattern`` matches ``text`` (0.0 to 1.0).

        Returns 1.0 when the lower-cased pattern is a substring of the
        lower-cased text; otherwise the fraction of the pattern's
        whitespace-separated words that also occur as words in the text.
        """
        pattern_lower = pattern.lower()
        text_lower = text.lower()
        if pattern_lower in text_lower:
            return 1.0

        pattern_words = set(pattern_lower.split())
        shared_words = pattern_words & set(text_lower.split())
        if shared_words:
            return len(shared_words) / len(pattern_words)
        return 0.0

    def list_documents(self, category=None, sort_by="title"):
        """Return document records, optionally filtered and sorted.

        Args:
            category: If given, only documents of this category are listed.
            sort_by: ``"title"`` (ascending), ``"modified"`` or ``"size"``
                (both descending).  Any other value leaves index order.
        """
        documents = list(self._iter_documents(category))
        if sort_by == "title":
            documents.sort(key=lambda doc: doc["title"])
        elif sort_by in ("modified", "size"):
            documents.sort(key=lambda doc: doc[sort_by], reverse=True)
        return documents

    def get_category_stats(self):
        """Return the index's ``"categories"`` entry (``{}`` if absent)."""
        return self.index_data.get("categories", {})

    def get_overall_stats(self):
        """Return the index's ``"statistics"`` entry (``{}`` if absent)."""
        return self.index_data.get("statistics", {})

    def print_search_results(self, results, max_results=10):
        """Print a summary of search results to stdout.

        Args:
            results: Result list from ``search_by_keyword`` or
                ``search_by_title``.
            max_results: Maximum number of entries to print; the header
                still reports the total number of results.
        """
        if not results:
            print("未找到匹配的文档")
            return

        print(f"找到 {len(results)} 个匹配结果:")
        print("-" * 80)
        for number, result in enumerate(results[:max_results], start=1):
            doc = result["document"]
            print(f"{number}. {doc['title']}")
            print(f" 文件: {doc['filename']}")
            print(f" 类别: {doc['category']}")
            print(f" 大小: {round(doc['size']/1024, 1)} KB")
            print(f" 修改: {doc['modified'][:10]}")
            # Only content-search results carry a "matches" list.
            if "matches" in result:
                print(f" 匹配数: {result['match_count']}")
                if result["match_count"] > 0:
                    match = result["matches"][0]
                    print(f" 示例匹配: 第{match['start_line']}行 - {match['match_text'][:50]}...")
            print()

    def _run_search(self, options):
        """Run the search described by a ``_parse_search_options`` dict."""
        if options["search_type"] == "content":
            return self.search_by_keyword(
                options["keyword"],
                options["category"],
                options["case_sensitive"],
            )
        return self.search_by_title(options["keyword"], options["category"])

    def interactive_search(self):
        """Run a read-search-print loop on stdin/stdout.

        ``quit`` leaves the loop and ``help`` prints the query syntax.
        Ctrl-C and end of input (Ctrl-D or exhausted piped stdin) also leave
        the loop; any other error is reported and the loop continues.
        """
        print("=== 胡汉三千年项目文档搜索工具 ===")
        print("输入 'quit' 退出搜索")
        print("输入 'help' 查看帮助")
        print("-" * 50)
        while True:
            try:
                query = input("\n搜索关键词: ").strip()
                command = query.lower()
                if command == 'quit':
                    break
                if command == 'help':
                    self._print_help()
                    continue
                if not query:
                    continue
                options = self._parse_search_options(query)
                results = self._run_search(options)
                self.print_search_results(results, options["max_results"])
            except (KeyboardInterrupt, EOFError):
                # EOFError must end the loop: once stdin is exhausted every
                # further input() call raises again immediately, so treating
                # it as an ordinary error would spin forever.
                print("\n搜索已取消")
                break
            except Exception as e:
                # Top-level boundary of the interactive loop: report the
                # failure and keep the session alive.
                print(f"搜索出错: {e}")

    def _parse_search_options(self, query):
        """Turn a raw query string into a search-options dict.

        Recognised forms:

        * ``title:<pattern>`` -- title search for ``<pattern>``.
        * ``cat:<category> <keyword>`` -- content search in one category.
        * anything else -- content search for the whole query.

        Returns:
            A dict with ``"search_type"`` (``"content"`` or ``"title"``),
            ``"keyword"``, ``"category"``, ``"case_sensitive"`` and
            ``"max_results"``.
        """
        options = {
            "search_type": "content",
            "keyword": query,
            "category": None,
            "case_sensitive": False,
            "max_results": 10,
        }
        if query.startswith("title:"):
            options["search_type"] = "title"
            options["keyword"] = query[len("title:"):].strip()
        elif query.startswith("cat:"):
            # Split on any run of whitespace so repeated spaces between the
            # category and the keyword do not leak into the keyword.
            parts = query.split(None, 1)
            if len(parts) == 2:
                options["category"] = parts[0][len("cat:"):]
                options["keyword"] = parts[1].strip()
        return options

    def _print_help(self):
        """Print the query syntax, the known categories and examples."""
        print("\n搜索语法:")
        print(" 普通搜索: 关键词")
        print(" 标题搜索: title:关键词")
        print(" 类别搜索: cat:类别名 关键词")
        print("\n可用类别:")
        for category, info in self.get_category_stats().items():
            print(f" {category}: {info.get('count', 0)} 个文档")
        print("\n示例:")
        print(" 搜索音韵相关内容: 音韵")
        print(" 搜索标题包含'蒙古'的文档: title:蒙古")
        print(" 在核心理论中搜索'方法论': cat:01-core-theory 方法论")
def main():
    """Command-line entry point.

    With arguments, they are joined with spaces into a single query, the
    query is executed once and the results are printed.  Without arguments,
    the interactive search loop is started.
    """
    import sys

    searcher = DocumentSearcher()
    cli_args = sys.argv[1:]

    if not cli_args:
        # No query on the command line: fall back to the interactive prompt.
        searcher.interactive_search()
        return

    # One-shot mode: the whole argument list forms the query string.
    options = searcher._parse_search_options(" ".join(cli_args))
    category = options["category"]
    if options["search_type"] == "content":
        results = searcher.search_by_keyword(
            options["keyword"], category, options["case_sensitive"]
        )
    else:
        results = searcher.search_by_title(options["keyword"], category)
    searcher.print_search_results(results, options["max_results"])


# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()