303 lines
10 KiB
Python
303 lines
10 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
胡汉三千年项目文档搜索工具
|
||
|
||
功能:
|
||
1. 全文搜索文档内容
|
||
2. 按关键词检索
|
||
3. 按类别过滤
|
||
4. 支持模糊搜索
|
||
|
||
作者:胡汉三千年项目团队
|
||
版本:1.0.0
|
||
"""
|
||
|
||
import os
|
||
import json
|
||
import re
|
||
from pathlib import Path
|
||
|
||
class DocumentSearcher:
    """Search the documents listed in a ``unified-index.json`` index file.

    The index is loaded once at construction time from
    ``<base_path>/unified-index.json``.  This class reads it as a mapping with
    a ``"documents"`` entry (category name -> list of document records) and
    optional ``"categories"`` and ``"statistics"`` entries.  Document records
    are accessed through the keys ``path``, ``title``, ``filename``,
    ``category``, ``size`` and ``modified``.
    """

    def __init__(self, base_path="/home/ben/code/huhan3000/unified-docs"):
        """Remember the document root and load the index stored inside it.

        Args:
            base_path: Directory containing the documents and the
                ``unified-index.json`` file.
        """
        self.base_path = Path(base_path)
        self.index_file = self.base_path / "unified-index.json"
        self.index_data = self._load_index()

    def _load_index(self):
        """Load the index file.

        Returns:
            The parsed JSON content, or ``{"documents": {}}`` (after printing
            a warning) when the file is missing, unreadable or not valid JSON.
        """
        try:
            with open(self.index_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print("警告:索引文件不存在,请先运行文档索引工具")
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError
            # subclasses; a damaged index should not abort the whole tool.
            print(f"警告:索引文件无法读取或解析: {e}")
        return {"documents": {}}

    def search_by_keyword(self, keyword, category=None, case_sensitive=False):
        """Search document contents for a literal keyword.

        Args:
            keyword: Text to look for (matched literally, not as a regex).
            category: When truthy, only this category is searched.
            case_sensitive: Whether letter case must match.

        Returns:
            A list of ``{"document", "matches", "match_count"}`` dicts, one
            per document with at least one hit, sorted by ``match_count`` in
            descending order.  Indexed files missing on disk are skipped.
        """
        results = []

        for cat, docs in self.index_data.get("documents", {}).items():
            # Restrict the search to the requested category, if any.
            if category and cat != category:
                continue

            for doc in docs:
                file_path = self.base_path / doc["path"]
                if not file_path.exists():
                    continue

                matches = self._search_in_file(file_path, keyword, case_sensitive)
                if matches:
                    results.append({
                        "document": doc,
                        "matches": matches,
                        "match_count": len(matches),
                    })

        results.sort(key=lambda item: item["match_count"], reverse=True)
        return results

    def _search_in_file(self, file_path, keyword, case_sensitive):
        """Search a single file for a literal keyword.

        Args:
            file_path: Path of the UTF-8 text file to scan.
            keyword: Text to look for; an empty keyword yields no matches.
            case_sensitive: Whether letter case must match.

        Returns:
            A list of dicts with ``start_line`` / ``end_line`` (1-based),
            ``match_text`` and ``context`` (surrounding lines joined with
            ``"\\n"``).  Read or decode failures are reported on stdout and
            produce an empty list.
        """
        matches = []

        # An empty pattern would match at every character position.
        if not keyword:
            return matches

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"搜索文件时出错 {file_path}: {e}")
            return matches

        regex = re.compile(re.escape(keyword), 0 if case_sensitive else re.IGNORECASE)

        # Split once instead of once per match.
        lines = content.split('\n')

        # finditer yields matches left to right, so the line number can be
        # advanced incrementally instead of recounting from the file start.
        line_no = 1
        scanned = 0
        for match in regex.finditer(content):
            line_no += content.count('\n', scanned, match.start())
            scanned = match.start()

            start_line = line_no
            end_line = start_line + content.count('\n', match.start(), match.end())

            # NOTE(review): with 1-based line numbers this window covers two
            # lines before the match and three after it; kept unchanged so
            # existing output stays the same -- confirm whether a symmetric
            # window was intended.
            context_start = max(0, start_line - 3)
            context_end = min(len(lines), end_line + 3)

            matches.append({
                "start_line": start_line,
                "end_line": end_line,
                "match_text": match.group(),
                "context": '\n'.join(lines[context_start:context_end]),
            })

        return matches

    def search_by_title(self, title_pattern, category=None):
        """Search document titles with a case-insensitive regular expression.

        Args:
            title_pattern: Regular expression applied with ``re.search``.  If
                it is not a valid regex it is matched as literal text instead.
            category: When truthy, only this category is searched.

        Returns:
            A list of ``{"document", "match_type", "match_score"}`` dicts
            sorted by ``match_score`` in descending order.
        """
        try:
            regex = re.compile(title_pattern, re.IGNORECASE)
        except re.error:
            # User input such as "(" is not a valid regex; fall back to a
            # literal search rather than aborting.
            regex = re.compile(re.escape(title_pattern), re.IGNORECASE)

        results = []

        for cat, docs in self.index_data.get("documents", {}).items():
            # Restrict the search to the requested category, if any.
            if category and cat != category:
                continue

            for doc in docs:
                if regex.search(doc["title"]):
                    results.append({
                        "document": doc,
                        "match_type": "title",
                        "match_score": self._calculate_match_score(title_pattern, doc["title"]),
                    })

        results.sort(key=lambda item: item["match_score"], reverse=True)
        return results

    def _calculate_match_score(self, pattern, text):
        """Score how well ``pattern`` matches ``text``.

        Returns:
            ``1.0`` when the lower-cased pattern is a substring of the
            lower-cased text; otherwise the fraction of whitespace-separated
            pattern words that also occur as words in the text (``0.0`` when
            none do).
        """
        pattern_lower = pattern.lower()
        text_lower = text.lower()

        if pattern_lower in text_lower:
            return 1.0

        # Word-overlap fallback.
        pattern_words = set(pattern_lower.split())
        shared = pattern_words & set(text_lower.split())
        if shared:
            return len(shared) / len(pattern_words)

        return 0.0

    def list_documents(self, category=None, sort_by="title"):
        """List indexed documents.

        Args:
            category: When truthy, only documents of this category are listed.
            sort_by: ``"title"`` (ascending), ``"modified"`` (descending) or
                ``"size"`` (descending).  Any other value leaves the documents
                in index order.

        Returns:
            A new list of document records.
        """
        documents = []

        for cat, docs in self.index_data.get("documents", {}).items():
            if category and cat != category:
                continue
            documents.extend(docs)

        # sort_by value -> (record key, descending?)
        sort_specs = {
            "title": ("title", False),
            "modified": ("modified", True),
            "size": ("size", True),
        }
        spec = sort_specs.get(sort_by)
        if spec is not None:
            field, descending = spec
            documents.sort(key=lambda d: d[field], reverse=descending)

        return documents

    def get_category_stats(self):
        """Return the ``"categories"`` entry of the index (``{}`` if absent)."""
        return self.index_data.get("categories", {})

    def get_overall_stats(self):
        """Return the ``"statistics"`` entry of the index (``{}`` if absent)."""
        return self.index_data.get("statistics", {})

    def print_search_results(self, results, max_results=10):
        """Print search results to stdout.

        Args:
            results: Result list as returned by ``search_by_keyword`` or
                ``search_by_title``.
            max_results: Maximum number of entries to print; the header line
                still reports the total number of results.
        """
        if not results:
            print("未找到匹配的文档")
            return

        print(f"找到 {len(results)} 个匹配结果:")
        print("-" * 80)

        for i, result in enumerate(results[:max_results]):
            doc = result["document"]
            print(f"{i+1}. {doc['title']}")
            print(f" 文件: {doc['filename']}")
            print(f" 类别: {doc['category']}")
            print(f" 大小: {round(doc['size']/1024, 1)} KB")
            print(f" 修改: {doc['modified'][:10]}")

            # Only content-search results carry match details.
            if "matches" in result:
                print(f" 匹配数: {result['match_count']}")
                if result["match_count"] > 0:
                    match = result["matches"][0]
                    print(f" 示例匹配: 第{match['start_line']}行 - {match['match_text'][:50]}...")

            print()

    def _run_search(self, options):
        """Run the search described by an options dict from ``_parse_search_options``."""
        if options["search_type"] == "content":
            return self.search_by_keyword(
                options["keyword"],
                options["category"],
                options["case_sensitive"],
            )
        return self.search_by_title(options["keyword"], options["category"])

    def interactive_search(self):
        """Run an interactive prompt loop.

        The loop ends on ``quit``, Ctrl-C, or end of input.  ``help`` prints
        the query syntax; empty input is ignored.  Any other error raised by
        a search is printed and the loop continues.
        """
        print("=== 胡汉三千年项目文档搜索工具 ===")
        print("输入 'quit' 退出搜索")
        print("输入 'help' 查看帮助")
        print("-" * 50)

        while True:
            try:
                query = input("\n搜索关键词: ").strip()

                command = query.lower()
                if command == 'quit':
                    break
                if command == 'help':
                    self._print_help()
                    continue
                if not query:
                    continue

                options = self._parse_search_options(query)
                results = self._run_search(options)
                self.print_search_results(results, options["max_results"])

            except (KeyboardInterrupt, EOFError):
                # EOFError must end the loop too: once stdin is closed every
                # further input() call fails again, which would spin forever.
                print("\n搜索已取消")
                break
            except Exception as e:
                print(f"搜索出错: {e}")

    def _parse_search_options(self, query):
        """Parse a query string into a search-options dict.

        Recognised forms:
            ``title:<pattern>``         -- title search
            ``cat:<category> <keyword>`` -- content search in one category
            anything else                -- content search for the whole query

        Returns:
            A dict with the keys ``search_type``, ``keyword``, ``category``,
            ``case_sensitive`` and ``max_results``.
        """
        options = {
            "search_type": "content",  # "content" or "title"
            "keyword": query,
            "category": None,
            "case_sensitive": False,
            "max_results": 10,
        }

        if query.startswith("title:"):
            options["search_type"] = "title"
            options["keyword"] = query[len("title:"):].strip()
        elif query.startswith("cat:"):
            # Split on any run of whitespace so repeated spaces between the
            # category and the keyword do not end up inside the keyword.
            parts = query.split(None, 1)
            if len(parts) == 2:
                options["category"] = parts[0][len("cat:"):]
                options["keyword"] = parts[1]

        return options

    def _print_help(self):
        """Print the query syntax, the categories from the index, and examples."""
        print("\n搜索语法:")
        print(" 普通搜索: 关键词")
        print(" 标题搜索: title:关键词")
        print(" 类别搜索: cat:类别名 关键词")
        print("\n可用类别:")

        for category, info in self.get_category_stats().items():
            print(f" {category}: {info.get('count', 0)} 个文档")

        print("\n示例:")
        print(" 搜索音韵相关内容: 音韵")
        print(" 搜索标题包含'蒙古'的文档: title:蒙古")
        print(" 在核心理论中搜索'方法论': cat:01-core-theory 方法论")
|
||
|
||
def main():
    """Command-line entry point.

    With command-line arguments, they are joined with spaces into a single
    query, the matching search is run once and its results are printed.
    Without arguments, the interactive prompt is started.
    """
    import sys

    searcher = DocumentSearcher()
    cli_args = sys.argv[1:]

    # No arguments: interactive mode.
    if not cli_args:
        searcher.interactive_search()
        return

    # One-shot command-line mode.
    options = searcher._parse_search_options(" ".join(cli_args))
    keyword = options["keyword"]
    category = options["category"]

    if options["search_type"] == "content":
        results = searcher.search_by_keyword(keyword, category, options["case_sensitive"])
    else:
        results = searcher.search_by_title(keyword, category)

    searcher.print_search_results(results, options["max_results"])
|
||
|
||
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()