#!/usr/bin/env python3
"""
RSS data reading tester.

Reads RSS news data from MongoDB and analyzes indexing requirements.
"""

import asyncio
import json
import logging
import time
from datetime import datetime, timezone, timedelta
from typing import Dict, List, Any, Optional
from src.mcp.swarm_mongodb_client import SwarmMongoDBClient


class RSSDataReader:
    """RSS data reader and analyzer."""

    def __init__(self, mongodb_client: SwarmMongoDBClient, database_name: str = "news_debate_db"):
        self.mongodb_client = mongodb_client
        self.database_name = database_name
        self.collection_name = "news_articles"
        self.logger = logging.getLogger(__name__)

    async def connect_to_database(self) -> bool:
        """Connect to the database."""
        try:
            result = self.mongodb_client.connect(self.database_name)
            if result.get('success'):
                self.logger.info(f"Connected to database: {self.database_name}")
                return True
            else:
                self.logger.error(f"Database connection failed: {result}")
                return False
        except Exception as e:
            self.logger.error(f"Database connection error: {e}")
            return False

    async def get_collection_stats(self) -> Dict[str, Any]:
        """Get collection statistics."""
        try:
            # Total document count
            count_result = self.mongodb_client.count_documents(self.collection_name)
            total_count = count_result.get('count', 0) if count_result.get('success') else 0

            # Fetch the most recent records to inspect the data structure
            latest_docs = self.mongodb_client.find_documents(
                self.collection_name,
                query={},
                sort={'collected_at': -1},
                limit=5
            )

            # Fetch the earliest record
            earliest_docs = self.mongodb_client.find_documents(
                self.collection_name,
                query={},
                sort={'collected_at': 1},
                limit=1
            )

            stats = {
                'total_documents': total_count,
                'latest_documents': latest_docs.get('documents', []) if latest_docs.get('success') else [],
                'earliest_document': earliest_docs.get('documents', []) if earliest_docs.get('success') else [],
                'collection_exists': total_count > 0
            }

            return stats

        except Exception as e:
            self.logger.error(f"Failed to get collection statistics: {e}")
            return {'error': str(e)}

    async def analyze_data_structure(self, sample_size: int = 10) -> Dict[str, Any]:
        """Analyze the data structure."""
        try:
            # Fetch sample data
            sample_result = self.mongodb_client.find_documents(
                self.collection_name,
                query={},
                limit=sample_size
            )

            if not sample_result.get('success'):
                return {'error': 'Unable to fetch sample data'}

            documents = sample_result.get('documents', [])
            if not documents:
                return {'error': 'No documents found'}

            # Analyze field structure
            field_analysis = {}
            for doc in documents:
                for field, value in doc.items():
                    if field not in field_analysis:
                        field_analysis[field] = {
                            'type': type(value).__name__,
                            'sample_values': [],
                            'count': 0
                        }

                    field_analysis[field]['count'] += 1
                    if len(field_analysis[field]['sample_values']) < 3:
                        field_analysis[field]['sample_values'].append(str(value)[:100])  # cap sample length

            # Fields commonly used in queries
            query_fields = {
                'title': 'title search',
                'category': 'category filter',
                'published': 'publish-time range queries',
                'collected_at': 'collection-time sorting',
                'tags': 'tag search',
                'source_title': 'source filter'
            }

            return {
                'sample_count': len(documents),
                'field_analysis': field_analysis,
                'recommended_query_fields': query_fields,
                'sample_document': documents[0] if documents else None
            }

        except Exception as e:
            self.logger.error(f"Data structure analysis failed: {e}")
            return {'error': str(e)}

    async def test_query_performance(self) -> Dict[str, Any]:
        """Test query performance."""
        performance_results = {}

        # Exercise different kinds of queries
        # (Chinese literals below match the stored data and are kept as-is)
        test_queries = [
            {
                'name': 'full collection scan',
                'query': {},
                'sort': None,
                'limit': 10
            },
            {
                'name': 'sort by collection time',
                'query': {},
                'sort': {'collected_at': -1},
                'limit': 10
            },
            {
                'name': 'title text search',
                'query': {'title': {'$regex': '市场', '$options': 'i'}},
                'sort': None,
                'limit': 10
            },
            {
                'name': 'category filter',
                'query': {'category': '财经新闻'},
                'sort': None,
                'limit': 10
            },
            {
                'name': 'time range query',
                'query': {
                    'collected_at': {
                        '$gte': datetime.now(timezone.utc) - timedelta(days=7)
                    }
                },
                'sort': {'collected_at': -1},
                'limit': 10
            }
        ]

        for test in test_queries:
            try:
                start_time = time.time()

                result = self.mongodb_client.find_documents(
                    self.collection_name,
                    query=test['query'],
                    sort=test.get('sort'),
                    limit=test['limit']
                )

                end_time = time.time()
                query_time = (end_time - start_time) * 1000  # convert to milliseconds

                performance_results[test['name']] = {
                    'query_time_ms': round(query_time, 2),
                    'success': result.get('success', False),
                    'document_count': len(result.get('documents', [])),
                    'query': test['query']
                }

            except Exception as e:
                performance_results[test['name']] = {
                    'error': str(e),
                    'query': test['query']
                }

        return performance_results
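
    # Note on the timings above: they measure the full round trip through the
    # MCP client, not server-side execution alone. Where direct driver access
    # is available, MongoDB's explain output shows whether a query can use an
    # index at all; a hedged example, assuming pymongo and a reachable server:
    #
    #     collection.find({'category': '财经新闻'}).explain()
    #
    # Also note that unanchored regex filters such as {'$regex': '市场'} are
    # generally not served efficiently by an ordinary B-tree index, which is
    # part of what motivates the text-index recommendation generated below.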

    async def check_existing_indexes(self) -> Dict[str, Any]:
        """Check existing indexes."""
        try:
            # Note: listing indexes requires a native MongoDB command.
            # SwarmMongoDBClient may not expose an index query method, so try another route.

            # Try to fetch index statistics through an aggregation pipeline
            pipeline = [
                {"$indexStats": {}}
            ]

            # If the client supports aggregation queries
            if hasattr(self.mongodb_client, 'aggregate_documents'):
                result = self.mongodb_client.aggregate_documents(
                    self.collection_name,
                    pipeline=pipeline
                )

                if result.get('success'):
                    return {
                        'indexes': result.get('documents', []),
                        'method': 'aggregation'
                    }

            # If index information cannot be fetched directly, recommend a manual check
            return {
                'message': 'Unable to query index information directly; a manual check is recommended',
                'method': 'manual_check_needed'
            }

        except Exception as e:
            return {
                'error': str(e),
                'message': 'Index query failed'
            }
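
    # For the manual check suggested above: with direct access to the
    # deployment (outside the MCP client), existing indexes can be listed
    # with either of the following (assuming mongosh / pymongo are available):
    #
    #     mongosh> use news_debate_db
    #     mongosh> db.news_articles.getIndexes()
    #
    #     # or from Python via pymongo:
    #     collection.index_information()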

    def generate_index_recommendations(self, performance_results: Dict[str, Any],
                                       data_analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Generate index recommendations."""
        recommendations = {
            'basic_indexes': [],
            'compound_indexes': [],
            'text_indexes': [],
            'vector_indexes': [],
            'reasoning': []
        }

        # Flag slow queries
        slow_queries = [name for name, result in performance_results.items()
                        if isinstance(result, dict) and result.get('query_time_ms', 0) > 100]

        if slow_queries:
            recommendations['reasoning'].append(f"Slow queries detected: {', '.join(slow_queries)}")

        # Recommendations based on the data structure
        field_analysis = data_analysis.get('field_analysis', {})

        # Time field indexes (for sorting and range queries)
        if 'collected_at' in field_analysis:
            recommendations['basic_indexes'].append({
                'field': 'collected_at',
                'type': 'descending',
                'reason': 'Used for time-based sorting and range queries'
            })

        if 'published' in field_analysis:
            recommendations['basic_indexes'].append({
                'field': 'published',
                'type': 'descending',
                'reason': 'Used for publish-time queries'
            })

        # Category field index
        if 'category' in field_analysis:
            recommendations['basic_indexes'].append({
                'field': 'category',
                'type': 'ascending',
                'reason': 'Used for category filtering'
            })

        # Unique identifier index
        if 'article_id' in field_analysis:
            recommendations['basic_indexes'].append({
                'field': 'article_id',
                'type': 'ascending',
                'unique': True,
                'reason': 'Unique identifier; prevents duplicates'
            })

        # Compound index recommendation
        recommendations['compound_indexes'].append({
            'fields': ['category', 'collected_at'],
            'reason': 'Supports filtering by category combined with time-based sorting'
        })

        # Text search index
        text_fields = []
        for field in ['title', 'description', 'summary']:
            if field in field_analysis:
                text_fields.append(field)

        if text_fields:
            recommendations['text_indexes'].append({
                'fields': text_fields,
                'type': 'text',
                'reason': 'Supports full-text search'
            })

        # Vector index recommendation
        recommendations['vector_indexes'].append({
            'consideration': 'If semantic search is needed',
            'fields': ['title', 'description'],
            'method': 'embedding + vector_search',
            'reason': 'Used for similarity-based smart search and recommendations'
        })

        return recommendations
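
    # The compound recommendation above corresponds roughly to this mongosh
    # command (assuming direct shell access to the deployment):
    #
    #     db.news_articles.createIndex({ category: 1, collected_at: -1 })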

    async def test_sample_queries(self) -> Dict[str, Any]:
        """Run a few sample queries."""
        sample_queries = {}

        try:
            # 1. Fetch the 10 most recent news items
            latest_news = self.mongodb_client.find_documents(
                self.collection_name,
                query={},
                sort={'collected_at': -1},
                limit=10
            )
            sample_queries['latest_news'] = {
                'success': latest_news.get('success'),
                'count': len(latest_news.get('documents', [])),
                'sample_titles': [doc.get('title', 'N/A')[:50] + '...'
                                  for doc in latest_news.get('documents', [])[:3]]
            }

            # 2. Query by category (category value as stored in the collection)
            category_news = self.mongodb_client.find_documents(
                self.collection_name,
                query={'category': '财经新闻'},
                limit=5
            )
            sample_queries['category_news'] = {
                'success': category_news.get('success'),
                'count': len(category_news.get('documents', [])),
                'category': '财经新闻'
            }

            # 3. Keyword search (regex keywords match the stored Chinese titles)
            keyword_search = self.mongodb_client.find_documents(
                self.collection_name,
                query={'title': {'$regex': '投资|股票|市场', '$options': 'i'}},
                limit=5
            )
            sample_queries['keyword_search'] = {
                'success': keyword_search.get('success'),
                'count': len(keyword_search.get('documents', [])),
                'keywords': '投资|股票|市场'
            }

        except Exception as e:
            sample_queries['error'] = str(e)

        return sample_queries
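
    # Once the recommended text index exists, the keyword regex above could be
    # replaced with a $text query that can actually use it, e.g. (hedged,
    # assuming the text index covers 'title'):
    #
    #     query={'$text': {'$search': '投资 股票 市场'}}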

    async def run_comprehensive_analysis(self) -> Dict[str, Any]:
        """Run the full data analysis."""
        self.logger.info("Starting RSS data analysis...")

        # Connect to the database
        if not await self.connect_to_database():
            return {'error': 'Unable to connect to the database'}

        analysis_results = {}

        # 1. Collection statistics
        self.logger.info("Collecting collection statistics...")
        analysis_results['collection_stats'] = await self.get_collection_stats()

        # 2. Data structure analysis
        self.logger.info("Analyzing data structure...")
        analysis_results['data_structure'] = await self.analyze_data_structure()

        # 3. Query performance tests
        self.logger.info("Testing query performance...")
        analysis_results['query_performance'] = await self.test_query_performance()

        # 4. Existing index check
        self.logger.info("Checking existing indexes...")
        analysis_results['existing_indexes'] = await self.check_existing_indexes()

        # 5. Index recommendations
        self.logger.info("Generating index recommendations...")
        analysis_results['index_recommendations'] = self.generate_index_recommendations(
            analysis_results['query_performance'],
            analysis_results['data_structure']
        )

        # 6. Sample queries
        self.logger.info("Running sample queries...")
        analysis_results['sample_queries'] = await self.test_sample_queries()

        return analysis_results
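

# --- Illustrative sketch (not part of the original test flow) ---------------
# The dict returned by RSSDataReader.generate_index_recommendations() only
# describes indexes; building them needs index-creation rights that the MCP
# client above may not expose. The helper below is a hedged sketch assuming
# direct pymongo access (pymongo installed, MongoDB reachable at `mongo_uri`);
# adjust or drop it if index creation is handled elsewhere.
def create_recommended_indexes(recommendations: Dict[str, Any],
                               mongo_uri: str = "mongodb://localhost:27017",
                               database_name: str = "news_debate_db",
                               collection_name: str = "news_articles") -> List[str]:
    """Apply basic/compound/text index recommendations via pymongo (sketch)."""
    from pymongo import MongoClient, ASCENDING, DESCENDING, TEXT  # assumed installed

    coll = MongoClient(mongo_uri)[database_name][collection_name]
    created = []

    # Single-field indexes, honoring direction and uniqueness hints
    for idx in recommendations.get('basic_indexes', []):
        direction = DESCENDING if idx.get('type') == 'descending' else ASCENDING
        created.append(coll.create_index([(idx['field'], direction)],
                                          unique=idx.get('unique', False)))

    # Compound indexes: time-like trailing fields descend, others ascend
    for idx in recommendations.get('compound_indexes', []):
        keys = [(field, DESCENDING if field.endswith('_at') else ASCENDING)
                for field in idx['fields']]
        created.append(coll.create_index(keys))

    # One text index covering the recommended text fields
    for idx in recommendations.get('text_indexes', []):
        created.append(coll.create_index([(field, TEXT) for field in idx['fields']]))

    # Vector recommendations (embedding + vector_search) are deployment-specific
    # (e.g. Atlas Search) and are intentionally left out of this sketch.
    return created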


async def main():
    """Entry point."""
    # Initialize the MongoDB client
    mongodb_client = SwarmMongoDBClient(
        mcp_server_url="http://localhost:8080",
        default_database="news_debate_db"
    )

    # Create the data reader
    reader = RSSDataReader(mongodb_client)

    # Run the analysis
    results = await reader.run_comprehensive_analysis()

    # Print the report
    print("\n" + "=" * 60)
    print("RSS Data Analysis Report")
    print("=" * 60)

    # Collection statistics
    stats = results.get('collection_stats', {})
    print("\n📊 Collection statistics:")
    print(f"   Total documents: {stats.get('total_documents', 0)}")
    print(f"   Collection exists: {stats.get('collection_exists', False)}")

    # Data structure
    structure = results.get('data_structure', {})
    if 'field_analysis' in structure:
        print("\n🏗️ Data structure:")
        for field, info in structure['field_analysis'].items():
            print(f"   {field}: {info['type']} (seen {info['count']} times)")

    # Query performance
    performance = results.get('query_performance', {})
    print("\n⚡ Query performance:")
    for query_name, result in performance.items():
        if isinstance(result, dict) and 'query_time_ms' in result:
            print(f"   {query_name}: {result['query_time_ms']}ms ({result['document_count']} results)")

    # Index recommendations
    recommendations = results.get('index_recommendations', {})
    print("\n💡 Index recommendations:")

    basic_indexes = recommendations.get('basic_indexes', [])
    if basic_indexes:
        print("   Basic indexes:")
        for idx in basic_indexes:
            print(f"     - {idx['field']} ({idx.get('type', 'ascending')}): {idx['reason']}")

    compound_indexes = recommendations.get('compound_indexes', [])
    if compound_indexes:
        print("   Compound indexes:")
        for idx in compound_indexes:
            print(f"     - {', '.join(idx['fields'])}: {idx['reason']}")

    text_indexes = recommendations.get('text_indexes', [])
    if text_indexes:
        print("   Text indexes:")
        for idx in text_indexes:
            print(f"     - {', '.join(idx['fields'])}: {idx['reason']}")

    vector_indexes = recommendations.get('vector_indexes', [])
    if vector_indexes:
        print("   Vector index suggestions:")
        for idx in vector_indexes:
            print(f"     - {idx['consideration']}: {idx['reason']}")

    # Sample query results
    samples = results.get('sample_queries', {})
    print("\n🔍 Sample queries:")
    for query_name, result in samples.items():
        if isinstance(result, dict) and 'count' in result:
            print(f"   {query_name}: {result['count']} results")

    print("\n" + "=" * 60)
    print("Analysis complete!")
    print("=" * 60)

    # Save the detailed results to a file
    with open('/home/ben/liurenchaxin/rss_analysis_report.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2, default=str)
    print("\nDetailed report saved to: rss_analysis_report.json")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    asyncio.run(main())