Files
huhan3000/unified-docs/tools/search-tool.py
ben 2a19a79695 重大发现:全球文明天崇拜和玉崇拜普遍性验证完成
- 验证了地球上所有文明都具备天崇拜和玉崇拜模式
- 覆盖亚洲、欧洲、非洲、美洲、大洋洲、中东等全球范围
- 确认K音文明传播网络的全球分布
- 完善昆仑38词汇系统的理论框架
- 更新坦桑尼亚玉石开采和埃及法老坟墓水银的考古证据
- 全球文明同源论取得重大突破
2025-10-30 13:48:03 +00:00

303 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
胡汉三千年项目文档搜索工具
功能:
1. 全文搜索文档内容
2. 按关键词检索
3. 按类别过滤
4. 支持模糊搜索
作者:胡汉三千年项目团队
版本:1.0.0
"""
import os
import json
import re
from pathlib import Path
class DocumentSearcher:
    """Search the project's unified document index by content or by title.

    The index is the JSON file ``unified-index.json`` under ``base_path``.
    This class reads its ``documents`` entry (a mapping from category name to
    a list of document records) and, for reporting, its ``categories`` and
    ``statistics`` entries. Document records are accessed through the keys
    ``path``, ``title``, ``filename``, ``category``, ``size`` and
    ``modified``.
    """

    def __init__(self, base_path="/home/ben/code/huhan3000/unified-docs"):
        """Remember ``base_path`` and load the index found beneath it."""
        self.base_path = Path(base_path)
        self.index_file = self.base_path / "unified-index.json"
        self.index_data = self._load_index()

    def _load_index(self):
        """Load and return the index, or an empty index if it is unusable.

        A missing, unreadable, malformed or non-object index file produces a
        printed warning and ``{"documents": {}}`` so that the tool stays
        usable instead of crashing at construction time.
        """
        empty_index = {"documents": {}}
        if not self.index_file.exists():
            print("警告:索引文件不存在,请先运行文档索引工具")
            return empty_index
        try:
            with open(self.index_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            print(f"警告:无法读取索引文件 {self.index_file}: {e}")
            return empty_index
        if not isinstance(data, dict):
            # Every accessor below relies on dict.get().
            print(f"警告:索引文件格式无效 {self.index_file}")
            return empty_index
        return data

    def search_by_keyword(self, keyword, category=None, case_sensitive=False):
        """Search the content of indexed files for a literal keyword.

        Args:
            keyword: Text to look for; it is matched literally, not as a
                regular expression. An empty keyword yields no results.
            category: If truthy, only documents in this category are searched.
            case_sensitive: Match case exactly when true.

        Returns:
            A list of ``{"document", "matches", "match_count"}`` dicts for the
            files that contain the keyword, ordered by descending match count.
            Indexed files that do not exist on disk are skipped.
        """
        if not keyword:
            # An empty pattern would match at every position of every file.
            return []

        results = []
        for cat, docs in self.index_data.get("documents", {}).items():
            if category and cat != category:
                continue
            for doc in docs:
                file_path = self.base_path / doc["path"]
                if not file_path.exists():
                    continue
                matches = self._search_in_file(file_path, keyword, case_sensitive)
                if matches:
                    results.append({
                        "document": doc,
                        "matches": matches,
                        "match_count": len(matches),
                    })

        results.sort(key=lambda x: x["match_count"], reverse=True)
        return results

    def _search_in_file(self, file_path, keyword, case_sensitive):
        """Return every literal occurrence of ``keyword`` in one file.

        Each match is a dict with 1-based ``start_line`` / ``end_line``, the
        matched ``match_text`` and a ``context`` snippet that spans from two
        lines before the match to three lines after it. A file that cannot be
        read or decoded as UTF-8 is reported on stdout and yields ``[]``.
        """
        if not keyword:
            return []

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, ValueError) as e:
            # UnicodeDecodeError is a ValueError.
            print(f"搜索文件时出错 {file_path}: {e}")
            return []

        flags = 0 if case_sensitive else re.IGNORECASE
        lines = content.split('\n')  # split once, not once per match

        matches = []
        # Matches arrive in increasing offset order, so the line number can be
        # advanced incrementally instead of recounting from the file start.
        line_no = 1
        scanned_to = 0
        for match in re.finditer(re.escape(keyword), content, flags):
            line_no += content.count('\n', scanned_to, match.start())
            scanned_to = match.start()
            start_line = line_no
            end_line = start_line + match.group().count('\n')

            context_start = max(0, start_line - 3)
            context_end = min(len(lines), end_line + 3)
            matches.append({
                "start_line": start_line,
                "end_line": end_line,
                "match_text": match.group(),
                "context": '\n'.join(lines[context_start:context_end]),
            })
        return matches

    def search_by_title(self, title_pattern, category=None):
        """Search document titles with a case-insensitive regular expression.

        Args:
            title_pattern: Regular expression applied with ``re.search``. If
                it is not a valid pattern it is matched as literal text
                instead of raising ``re.error``.
            category: If truthy, only documents in this category are searched.

        Returns:
            A list of ``{"document", "match_type", "match_score"}`` dicts,
            ordered by descending score.
        """
        try:
            regex = re.compile(title_pattern, re.IGNORECASE)
        except re.error:
            # User input such as "(abc" is not a valid regex; treat it as text.
            regex = re.compile(re.escape(title_pattern), re.IGNORECASE)

        results = []
        for cat, docs in self.index_data.get("documents", {}).items():
            if category and cat != category:
                continue
            for doc in docs:
                if regex.search(doc["title"]):
                    results.append({
                        "document": doc,
                        "match_type": "title",
                        "match_score": self._calculate_match_score(
                            title_pattern, doc["title"]
                        ),
                    })

        results.sort(key=lambda x: x["match_score"], reverse=True)
        return results

    def _calculate_match_score(self, pattern, text):
        """Score how well ``pattern`` matches ``text`` on a 0.0-1.0 scale.

        Returns 1.0 when the pattern is a case-insensitive substring of the
        text; otherwise the fraction of the pattern's whitespace-separated
        words that also appear as words of the text (0.0 if none do).
        """
        pattern_lower = pattern.lower()
        text_lower = text.lower()
        if pattern_lower in text_lower:
            return 1.0

        pattern_words = set(pattern_lower.split())
        shared_words = pattern_words & set(text_lower.split())
        if shared_words:
            return len(shared_words) / len(pattern_words)
        return 0.0

    def list_documents(self, category=None, sort_by="title"):
        """Return document records, optionally filtered and sorted.

        Args:
            category: If truthy, only documents in this category are listed.
            sort_by: ``"title"`` (ascending), ``"modified"`` or ``"size"``
                (both descending). Any other value leaves index order as is.
        """
        documents = []
        for cat, docs in self.index_data.get("documents", {}).items():
            if category and cat != category:
                continue
            documents.extend(docs)

        if sort_by == "title":
            documents.sort(key=lambda x: x["title"])
        elif sort_by == "modified":
            documents.sort(key=lambda x: x["modified"], reverse=True)
        elif sort_by == "size":
            documents.sort(key=lambda x: x["size"], reverse=True)
        return documents

    def get_category_stats(self):
        """Return the index's ``categories`` entry, or ``{}`` if absent."""
        return self.index_data.get("categories", {})

    def get_overall_stats(self):
        """Return the index's ``statistics`` entry, or ``{}`` if absent."""
        return self.index_data.get("statistics", {})

    def print_search_results(self, results, max_results=10):
        """Print a summary of up to ``max_results`` search results.

        The header reports the total number of results even when fewer are
        shown. For content results the first match is printed as a sample.
        """
        if not results:
            print("未找到匹配的文档")
            return

        print(f"找到 {len(results)} 个匹配结果:")
        print("-" * 80)
        for i, result in enumerate(results[:max_results]):
            doc = result["document"]
            print(f"{i+1}. {doc['title']}")
            print(f" 文件: {doc['filename']}")
            print(f" 类别: {doc['category']}")
            print(f" 大小: {round(doc['size']/1024, 1)} KB")
            # Only the first 10 characters of the modification value are shown.
            print(f" 修改: {doc['modified'][:10]}")
            if "matches" in result:
                print(f" 匹配数: {result['match_count']}")
                if result["match_count"] > 0:
                    match = result["matches"][0]
                    print(f" 示例匹配: 第{match['start_line']}行 - {match['match_text'][:50]}...")
            print()

    def _run_search(self, options):
        """Run the content or title search described by parsed ``options``."""
        if options["search_type"] == "content":
            return self.search_by_keyword(
                options["keyword"],
                options["category"],
                options["case_sensitive"],
            )
        return self.search_by_title(options["keyword"], options["category"])

    def interactive_search(self):
        """Run a read-search-print loop until the user quits.

        ``quit`` exits, ``help`` prints the syntax help, and empty input is
        ignored. Ctrl-C or end of input (Ctrl-D / closed stdin) ends the loop;
        any other error is reported and the loop continues.
        """
        print("=== 胡汉三千年项目文档搜索工具 ===")
        print("输入 'quit' 退出搜索")
        print("输入 'help' 查看帮助")
        print("-" * 50)
        while True:
            try:
                query = input("\n搜索关键词: ").strip()
                if query.lower() == 'quit':
                    break
                elif query.lower() == 'help':
                    self._print_help()
                    continue
                elif not query:
                    continue

                options = self._parse_search_options(query)
                results = self._run_search(options)
                self.print_search_results(results, options["max_results"])
            except (KeyboardInterrupt, EOFError):
                # EOFError must end the loop: input() would raise it again on
                # every iteration once stdin is exhausted.
                print("\n搜索已取消")
                break
            except Exception as e:
                print(f"搜索出错: {e}")

    def _parse_search_options(self, query):
        """Turn a raw query string into a search-options dict.

        Supported forms are ``keyword``, ``title:keyword`` and
        ``cat:category keyword``. A ``cat:`` query without a keyword is left
        as a plain content search for the whole query text.

        Returns:
            A dict with the keys ``search_type`` (``"content"`` or
            ``"title"``), ``keyword``, ``category``, ``case_sensitive`` and
            ``max_results``.
        """
        options = {
            "search_type": "content",
            "keyword": query,
            "category": None,
            "case_sensitive": False,
            "max_results": 10
        }
        if query.startswith("title:"):
            options["search_type"] = "title"
            options["keyword"] = query[6:].strip()
        elif query.startswith("cat:"):
            # Split on the first whitespace run only, so extra spaces after
            # the category do not end up at the start of the keyword.
            parts = query.split(None, 1)
            if len(parts) == 2:
                options["category"] = parts[0][4:]
                options["keyword"] = parts[1].strip()
        return options

    def _print_help(self):
        """Print the query syntax, the known categories and some examples."""
        print("\n搜索语法:")
        print(" 普通搜索: 关键词")
        print(" 标题搜索: title:关键词")
        print(" 类别搜索: cat:类别名 关键词")
        print("\n可用类别:")
        stats = self.get_category_stats()
        for category, info in stats.items():
            print(f" {category}: {info.get('count', 0)} 个文档")
        print("\n示例:")
        print(" 搜索音韵相关内容: 音韵")
        print(" 搜索标题包含'蒙古'的文档: title:蒙古")
        print(" 在核心理论中搜索'方法论': cat:01-core-theory 方法论")
def main():
    """Entry point: run a one-shot search from argv, or go interactive.

    With command-line arguments, they are joined by single spaces into one
    query, parsed, executed and printed. Without arguments the interactive
    search loop is started.
    """
    import sys

    searcher = DocumentSearcher()
    cli_words = sys.argv[1:]

    # No arguments: hand control to the interactive prompt.
    if not cli_words:
        searcher.interactive_search()
        return

    # Command-line mode: the arguments together form a single query.
    options = searcher._parse_search_options(" ".join(cli_words))
    keyword = options["keyword"]
    category = options["category"]
    if options["search_type"] == "content":
        results = searcher.search_by_keyword(
            keyword, category, options["case_sensitive"]
        )
    else:
        results = searcher.search_by_title(keyword, category)
    searcher.print_search_results(results, options["max_results"])


# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()