重大发现:全球文明天崇拜和玉崇拜普遍性验证完成
- 验证了地球上所有文明都具备天崇拜和玉崇拜模式
- 覆盖亚洲、欧洲、非洲、美洲、大洋洲、中东等全球范围
- 确认K音文明传播网络的全球分布
- 完善昆仑38词汇系统的理论框架
- 更新坦桑尼亚玉石开采和埃及法老坟墓水银的考古证据
- 全球文明同源论取得重大突破
This commit is contained in:
303
unified-docs/tools/search-tool.py
Normal file
303
unified-docs/tools/search-tool.py
Normal file
@@ -0,0 +1,303 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
胡汉三千年项目文档搜索工具
|
||||
|
||||
功能:
|
||||
1. 全文搜索文档内容
|
||||
2. 按关键词检索
|
||||
3. 按类别过滤
|
||||
4. 支持模糊搜索
|
||||
|
||||
作者:胡汉三千年项目团队
|
||||
版本:1.0.0
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
class DocumentSearcher:
    """Search the documents listed in a pre-built JSON index.

    The index is read from ``unified-index.json`` directly under ``base_path``.
    This class reads three top-level keys from it: ``documents`` (a mapping of
    category name to a list of document records), ``categories`` and
    ``statistics``. Document records are dicts; the methods below read their
    ``path``, ``title``, ``filename``, ``category``, ``size`` and ``modified``
    keys.
    """

    # Number of lines shown before and after a match in its context snippet.
    CONTEXT_LINES = 3

    def __init__(self, base_path="/home/ben/code/huhan3000/unified-docs"):
        """Remember the document root and load the index stored under it.

        Args:
            base_path: Directory holding ``unified-index.json``; document
                ``path`` values from the index are resolved against it.
        """
        self.base_path = Path(base_path)
        self.index_file = self.base_path / "unified-index.json"
        self.index_data = self._load_index()

    def _load_index(self):
        """Load the index file.

        Returns:
            The parsed JSON content, or ``{"documents": {}}`` when the file is
            missing, unreadable or not valid JSON (a message is printed in
            each of those cases).
        """
        if not self.index_file.exists():
            print("警告:索引文件不存在,请先运行文档索引工具")
            return {"documents": {}}

        try:
            with open(self.index_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError. Degrade to an empty index, as for a missing file.
            print(f"加载索引文件时出错 {self.index_file}: {e}")
            return {"documents": {}}

    def _iter_documents(self, category=None):
        """Yield every document record, optionally limited to one category.

        A falsy ``category`` (``None`` or ``""``) means "all categories".
        """
        for cat, docs in self.index_data.get("documents", {}).items():
            if category and cat != category:
                continue
            yield from docs

    def search_by_keyword(self, keyword, category=None, case_sensitive=False):
        """Search document contents for a literal keyword.

        Args:
            keyword: Text to look for; it is matched literally, not as a
                regular expression. An empty keyword yields no results.
            category: When truthy, only documents of this category are read.
            case_sensitive: Match case exactly when true.

        Returns:
            A list of ``{"document", "matches", "match_count"}`` dicts, one
            per document with at least one hit, ordered by descending
            ``match_count``. Documents whose file does not exist are skipped.
        """
        results = []
        if not keyword:
            # An empty pattern would match at every character position.
            return results

        for doc in self._iter_documents(category):
            file_path = self.base_path / doc["path"]
            if not file_path.exists():
                continue

            matches = self._search_in_file(file_path, keyword, case_sensitive)
            if matches:
                results.append({
                    "document": doc,
                    "matches": matches,
                    "match_count": len(matches),
                })

        # Documents with the most hits come first.
        results.sort(key=lambda x: x["match_count"], reverse=True)
        return results

    def _search_in_file(self, file_path, keyword, case_sensitive):
        """Find every literal occurrence of ``keyword`` in one file.

        Args:
            file_path: File to read; it is decoded as UTF-8.
            keyword: Literal text to find.
            case_sensitive: Match case exactly when true.

        Returns:
            A list of dicts with ``start_line`` and ``end_line`` (1-based),
            ``match_text`` and ``context`` (the matched lines plus up to
            ``CONTEXT_LINES`` lines on each side). The list is empty when the
            keyword is empty or the file cannot be read or decoded (a message
            is printed in the latter case).
        """
        matches = []
        if not keyword:
            return matches

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"搜索文件时出错 {file_path}: {e}")
            return matches

        flags = 0 if case_sensitive else re.IGNORECASE
        regex = re.compile(re.escape(keyword), flags)

        # Split once for all matches instead of once per match.
        lines = content.split('\n')

        # Matches arrive in ascending, non-overlapping order, so the line
        # number can be advanced incrementally rather than recounted from
        # the start of the file for every match.
        scanned_to = 0
        current_line = 1
        for match in regex.finditer(content):
            current_line += content.count('\n', scanned_to, match.start())
            scanned_to = match.start()
            start_line = current_line
            end_line = start_line + content.count('\n', match.start(), match.end())

            # Line numbers are 1-based while ``lines`` is 0-based; the slice
            # end is exclusive, so ``end_line`` already points one past the
            # last matched line.
            context_start = max(0, start_line - 1 - self.CONTEXT_LINES)
            context_end = min(len(lines), end_line + self.CONTEXT_LINES)

            matches.append({
                "start_line": start_line,
                "end_line": end_line,
                "match_text": match.group(),
                "context": '\n'.join(lines[context_start:context_end]),
            })

        return matches

    def search_by_title(self, title_pattern, category=None):
        """Search document titles with a case-insensitive pattern.

        Args:
            title_pattern: Regular expression searched for in each title. If
                it is not a valid regular expression it is matched as literal
                text instead of raising.
            category: When truthy, only documents of this category are checked.

        Returns:
            A list of ``{"document", "match_type", "match_score"}`` dicts
            ordered by descending ``match_score``.
        """
        try:
            regex = re.compile(title_pattern, re.IGNORECASE)
        except re.error:
            # User text such as "(draft" is not a valid regex; match it
            # literally rather than crashing.
            regex = re.compile(re.escape(title_pattern), re.IGNORECASE)

        results = [
            {
                "document": doc,
                "match_type": "title",
                "match_score": self._calculate_match_score(title_pattern, doc["title"]),
            }
            for doc in self._iter_documents(category)
            if regex.search(doc["title"])
        ]

        results.sort(key=lambda x: x["match_score"], reverse=True)
        return results

    def _calculate_match_score(self, pattern, text):
        """Score how well ``pattern`` matches ``text``, from 0.0 to 1.0.

        Returns 1.0 when the lower-cased pattern is a substring of the
        lower-cased text; otherwise the fraction of the pattern's
        whitespace-separated words that also occur as words in the text.
        """
        lowered_pattern = pattern.lower()
        lowered_text = text.lower()

        if lowered_pattern in lowered_text:
            return 1.0

        # Fuzzy score: share of pattern words present in the text.
        pattern_words = set(lowered_pattern.split())
        shared = pattern_words & set(lowered_text.split())
        if shared:
            return len(shared) / len(pattern_words)

        return 0.0

    def list_documents(self, category=None, sort_by="title"):
        """List document records, optionally filtered and sorted.

        Args:
            category: When truthy, only documents of this category are listed.
            sort_by: ``"title"`` (ascending), ``"modified"`` or ``"size"``
                (both descending). Any other value leaves index order.

        Returns:
            A new list of document records.
        """
        documents = list(self._iter_documents(category))

        # sort key -> descending?
        sort_orders = {"title": False, "modified": True, "size": True}
        if sort_by in sort_orders:
            documents.sort(key=lambda x: x[sort_by], reverse=sort_orders[sort_by])

        return documents

    def get_category_stats(self):
        """Return the index's ``categories`` entry, or ``{}`` when absent."""
        return self.index_data.get("categories", {})

    def get_overall_stats(self):
        """Return the index's ``statistics`` entry, or ``{}`` when absent."""
        return self.index_data.get("statistics", {})

    def print_search_results(self, results, max_results=10):
        """Print a summary of search results to standard output.

        Args:
            results: Result dicts as returned by ``search_by_keyword`` or
                ``search_by_title``.
            max_results: Maximum number of results printed; the header still
                reports the total number found.
        """
        if not results:
            print("未找到匹配的文档")
            return

        print(f"找到 {len(results)} 个匹配结果:")
        print("-" * 80)

        for number, result in enumerate(results[:max_results], start=1):
            doc = result["document"]
            print(f"{number}. {doc['title']}")
            print(f" 文件: {doc['filename']}")
            print(f" 类别: {doc['category']}")
            print(f" 大小: {round(doc['size']/1024, 1)} KB")
            print(f" 修改: {doc['modified'][:10]}")

            # Only content-search results carry match details.
            if "matches" in result:
                print(f" 匹配数: {result['match_count']}")
                if result["match_count"] > 0:
                    match = result["matches"][0]
                    print(f" 示例匹配: 第{match['start_line']}行 - {match['match_text'][:50]}...")

            print()

    def _run_search(self, options):
        """Run the search described by a parsed ``options`` dict."""
        if options["search_type"] == "content":
            return self.search_by_keyword(
                options["keyword"],
                options["category"],
                options["case_sensitive"],
            )
        return self.search_by_title(options["keyword"], options["category"])

    def interactive_search(self):
        """Run a read-search-print loop on standard input.

        The loop ends on ``quit``, on Ctrl-C, or when standard input is
        exhausted. Any other error raised while searching is printed and the
        loop continues.
        """
        print("=== 胡汉三千年项目文档搜索工具 ===")
        print("输入 'quit' 退出搜索")
        print("输入 'help' 查看帮助")
        print("-" * 50)

        while True:
            try:
                query = input("\n搜索关键词: ").strip()

                if query.lower() == 'quit':
                    break
                if query.lower() == 'help':
                    self._print_help()
                    continue
                if not query:
                    continue

                options = self._parse_search_options(query)
                results = self._run_search(options)
                self.print_search_results(results, options["max_results"])

            except (KeyboardInterrupt, EOFError):
                # EOFError must end the loop too: once stdin is exhausted
                # every further input() call raises it again, which would
                # otherwise spin forever in the generic handler below.
                print("\n搜索已取消")
                break
            except Exception as e:
                # Keep the interactive session alive after a failed search.
                print(f"搜索出错: {e}")

    def _parse_search_options(self, query):
        """Parse a query string into a search-options dict.

        Recognised forms are ``title:<pattern>`` for a title search and
        ``cat:<category> <keyword>`` for a content search limited to one
        category; anything else is a plain content search. A ``cat:`` prefix
        with no keyword after it is treated as a plain keyword.

        Returns:
            A dict with ``search_type`` (``"content"`` or ``"title"``),
            ``keyword``, ``category``, ``case_sensitive`` and ``max_results``.
        """
        options = {
            "search_type": "content",  # "content" or "title"
            "keyword": query,
            "category": None,
            "case_sensitive": False,
            "max_results": 10,
        }

        if query.startswith("title:"):
            options["search_type"] = "title"
            options["keyword"] = query[6:].strip()
        elif query.startswith("cat:"):
            # Split on any run of whitespace so extra spaces between the
            # category and the keyword do not end up inside the keyword.
            parts = query.split(None, 1)
            if len(parts) == 2:
                options["category"] = parts[0][4:]
                options["keyword"] = parts[1].strip()

        return options

    def _print_help(self):
        """Print the query syntax, the categories in the index and examples."""
        print("\n搜索语法:")
        print(" 普通搜索: 关键词")
        print(" 标题搜索: title:关键词")
        print(" 类别搜索: cat:类别名 关键词")
        print("\n可用类别:")

        stats = self.get_category_stats()
        for category, info in stats.items():
            print(f" {category}: {info.get('count', 0)} 个文档")

        print("\n示例:")
        print(" 搜索音韵相关内容: 音韵")
        print(" 搜索标题包含'蒙古'的文档: title:蒙古")
        print(" 在核心理论中搜索'方法论': cat:01-core-theory 方法论")
|
||||
def main():
    """Command-line entry point.

    With arguments, they are joined with spaces into one query, the matching
    search is run once and its results are printed. Without arguments, the
    interactive prompt is started instead.
    """
    import sys

    searcher = DocumentSearcher()
    cli_args = sys.argv[1:]

    if not cli_args:
        # No query on the command line: fall back to the interactive prompt.
        searcher.interactive_search()
        return

    # One-shot mode: the whole argument list forms a single query string.
    options = searcher._parse_search_options(" ".join(cli_args))

    if options["search_type"] != "content":
        results = searcher.search_by_title(
            options["keyword"],
            options["category"],
        )
    else:
        results = searcher.search_by_keyword(
            options["keyword"],
            options["category"],
            options["case_sensitive"],
        )

    searcher.print_search_results(results, options["max_results"])
|
||||
|
||||
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user