488 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			488 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Python
		
	
	
	
| #!/usr/bin/env python3
 | ||
| """
 | ||
| RSS数据读取测试器
 | ||
| 测试从MongoDB读取RSS新闻数据,并分析索引需求
 | ||
| """
 | ||
| 
 | ||
| import asyncio
 | ||
| import json
 | ||
| import logging
 | ||
| import time
 | ||
| from datetime import datetime, timezone, timedelta
 | ||
| from typing import Dict, List, Any, Optional
 | ||
| from src.mcp.swarm_mongodb_client import SwarmMongoDBClient
 | ||
| 
 | ||
class RSSDataReader:
    """Read RSS news articles from MongoDB and analyse their indexing needs.

    The reader drives a ``SwarmMongoDBClient``-style object whose
    ``connect``, ``count_documents`` and ``find_documents`` methods return
    result dictionaries; this class reads the ``success``, ``count`` and
    ``documents`` keys of those dictionaries.

    NOTE(review): every client call below is invoked synchronously from
    inside ``async`` methods, so the coroutines do not yield while a query
    is running -- confirm whether the client is expected to block.
    """

    def __init__(self, mongodb_client: "SwarmMongoDBClient", database_name: str = "news_debate_db"):
        """Store the client and the names of the database/collection to inspect.

        Args:
            mongodb_client: Client object used for every database call.
            database_name: Name passed to ``mongodb_client.connect``.
        """
        self.mongodb_client = mongodb_client
        self.database_name = database_name
        self.collection_name = "news_articles"
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _documents_of(result: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return the ``documents`` list of a client result, or ``[]`` if missing/``None``."""
        return result.get('documents') or []

    @staticmethod
    def _preview_title(doc: Dict[str, Any], max_length: int = 50) -> str:
        """Return a short, printable title for *doc*.

        A missing or ``None`` title becomes ``'N/A'``; an ellipsis is appended
        only when the title was actually truncated.
        """
        title = doc.get('title')
        if title is None:
            return 'N/A'
        title = str(title)
        if len(title) > max_length:
            return title[:max_length] + '...'
        return title

    async def connect_to_database(self) -> bool:
        """Connect the client to ``self.database_name``.

        Returns:
            ``True`` when the client reports success, ``False`` when it
            reports failure or raises.
        """
        try:
            result = self.mongodb_client.connect(self.database_name)
        except Exception as e:
            self.logger.error("数据库连接异常: %s", e)
            return False

        # A falsy/absent result is treated as a failed connection.
        if result and result.get('success'):
            self.logger.info("成功连接到数据库: %s", self.database_name)
            return True

        self.logger.error("数据库连接失败: %s", result)
        return False

    async def get_collection_stats(self) -> Dict[str, Any]:
        """Collect the document count plus the newest and oldest documents.

        Returns:
            A dict with ``total_documents``, ``latest_documents`` (up to 5,
            sorted by ``collected_at`` descending), ``earliest_document``
            (up to 1, ascending) and ``collection_exists``; or
            ``{'error': <message>}`` if any client call raises.
        """
        try:
            # Total number of documents in the collection.
            count_result = self.mongodb_client.count_documents(self.collection_name)
            total_count = count_result.get('count', 0) if count_result.get('success') else 0

            # Newest documents, used to inspect the current data shape.
            latest_docs = self.mongodb_client.find_documents(
                self.collection_name,
                query={},
                sort={'collected_at': -1},
                limit=5
            )

            # Oldest document in the collection.
            earliest_docs = self.mongodb_client.find_documents(
                self.collection_name,
                query={},
                sort={'collected_at': 1},
                limit=1
            )

            return {
                'total_documents': total_count,
                'latest_documents': self._documents_of(latest_docs) if latest_docs.get('success') else [],
                'earliest_document': self._documents_of(earliest_docs) if earliest_docs.get('success') else [],
                # The collection is considered to exist only when it is non-empty.
                'collection_exists': total_count > 0
            }

        except Exception as e:
            self.logger.error("获取集合统计信息失败: %s", e)
            return {'error': str(e)}

    async def analyze_data_structure(self, sample_size: int = 10) -> Dict[str, Any]:
        """Summarise the fields found in a sample of documents.

        Args:
            sample_size: Maximum number of documents to request.

        Returns:
            A dict with ``sample_count``, ``field_analysis`` (per field: the
            type name of its first seen value, up to three stringified sample
            values cut to 100 characters, and an occurrence count),
            ``recommended_query_fields`` and ``sample_document``; or
            ``{'error': <message>}`` when no sample could be obtained.
        """
        try:
            sample_result = self.mongodb_client.find_documents(
                self.collection_name,
                query={},
                limit=sample_size
            )

            if not sample_result.get('success'):
                return {'error': '无法获取样本数据'}

            documents = self._documents_of(sample_result)
            if not documents:
                return {'error': '没有找到任何文档'}

            field_analysis: Dict[str, Dict[str, Any]] = {}
            for doc in documents:
                for field, value in doc.items():
                    # The recorded type is that of the first value seen for the field.
                    info = field_analysis.setdefault(field, {
                        'type': type(value).__name__,
                        'sample_values': [],
                        'count': 0
                    })
                    info['count'] += 1
                    if len(info['sample_values']) < 3:
                        # Cap the sample length so large bodies do not bloat the report.
                        info['sample_values'].append(str(value)[:100])

            # Fields that are commonly queried, with the purpose of each.
            query_fields = {
                'title': '标题搜索',
                'category': '分类筛选',
                'published': '时间范围查询',
                'collected_at': '收集时间排序',
                'tags': '标签搜索',
                'source_title': '来源筛选'
            }

            return {
                'sample_count': len(documents),
                'field_analysis': field_analysis,
                'recommended_query_fields': query_fields,
                'sample_document': documents[0]
            }

        except Exception as e:
            self.logger.error("数据结构分析失败: %s", e)
            return {'error': str(e)}

    async def test_query_performance(self) -> Dict[str, Any]:
        """Time a fixed set of representative queries.

        Returns:
            A dict keyed by query name. Each value holds ``query_time_ms``,
            ``success``, ``document_count`` and ``query``; or ``error`` and
            ``query`` when that query raised.
        """
        performance_results: Dict[str, Any] = {}

        test_queries = [
            {
                'name': '全表扫描',
                'query': {},
                'sort': None,
                'limit': 10
            },
            {
                'name': '按时间排序',
                'query': {},
                'sort': {'collected_at': -1},
                'limit': 10
            },
            {
                'name': '标题文本搜索',
                'query': {'title': {'$regex': '市场', '$options': 'i'}},
                'sort': None,
                'limit': 10
            },
            {
                'name': '分类筛选',
                'query': {'category': '财经新闻'},
                'sort': None,
                'limit': 10
            },
            {
                'name': '时间范围查询',
                'query': {
                    'collected_at': {
                        '$gte': datetime.now(timezone.utc) - timedelta(days=7)
                    }
                },
                'sort': {'collected_at': -1},
                'limit': 10
            }
        ]

        for test in test_queries:
            try:
                # perf_counter is monotonic; wall-clock time can jump mid-measurement.
                start_time = time.perf_counter()
                result = self.mongodb_client.find_documents(
                    self.collection_name,
                    query=test['query'],
                    sort=test.get('sort'),
                    limit=test['limit']
                )
                query_time = (time.perf_counter() - start_time) * 1000  # milliseconds

                performance_results[test['name']] = {
                    'query_time_ms': round(query_time, 2),
                    'success': result.get('success', False),
                    'document_count': len(self._documents_of(result)),
                    'query': test['query']
                }

            except Exception as e:
                # One failing query must not prevent the remaining ones from running.
                performance_results[test['name']] = {
                    'error': str(e),
                    'query': test['query']
                }

        return performance_results

    async def check_existing_indexes(self) -> Dict[str, Any]:
        """Try to list index statistics through an ``$indexStats`` aggregation.

        Returns:
            ``{'indexes': [...], 'method': 'aggregation'}`` when the client
            exposes ``aggregate_documents`` and the call succeeds; a
            ``manual_check_needed`` message otherwise; or an ``error`` dict
            if the call raises.
        """
        try:
            pipeline = [
                {"$indexStats": {}}
            ]

            # The client may not offer an aggregation method at all.
            if hasattr(self.mongodb_client, 'aggregate_documents'):
                result = self.mongodb_client.aggregate_documents(
                    self.collection_name,
                    pipeline=pipeline
                )

                if result.get('success'):
                    return {
                        'indexes': self._documents_of(result),
                        'method': 'aggregation'
                    }

            # Index information could not be obtained programmatically.
            return {
                'message': '无法直接查询索引信息,建议手动检查',
                'method': 'manual_check_needed'
            }

        except Exception as e:
            return {
                'error': str(e),
                'message': '索引查询失败'
            }

    def generate_index_recommendations(self, performance_results: Dict[str, Any],
                                       data_analysis: Dict[str, Any],
                                       slow_query_threshold_ms: float = 100) -> Dict[str, Any]:
        """Build index suggestions from timing results and the field analysis.

        Args:
            performance_results: Output of ``test_query_performance``.
            data_analysis: Output of ``analyze_data_structure``.
            slow_query_threshold_ms: Queries slower than this many
                milliseconds are reported as slow.

        Returns:
            A dict with ``basic_indexes``, ``compound_indexes``,
            ``text_indexes``, ``vector_indexes`` and ``reasoning`` lists.
        """
        recommendations: Dict[str, List[Any]] = {
            'basic_indexes': [],
            'compound_indexes': [],
            'text_indexes': [],
            'vector_indexes': [],
            'reasoning': []
        }

        slow_queries = [
            name for name, result in performance_results.items()
            if isinstance(result, dict)
            and result.get('query_time_ms', 0) > slow_query_threshold_ms
        ]
        if slow_queries:
            recommendations['reasoning'].append(f"发现慢查询: {', '.join(slow_queries)}")

        field_analysis = data_analysis.get('field_analysis', {})

        # Single-field indexes, suggested only for fields present in the sample.
        basic_candidates = [
            {
                'field': 'collected_at',
                'type': 'descending',
                'reason': '用于时间排序和范围查询'
            },
            {
                'field': 'published',
                'type': 'descending',
                'reason': '用于发布时间查询'
            },
            {
                'field': 'category',
                'type': 'ascending',
                'reason': '用于分类筛选'
            },
            {
                'field': 'article_id',
                'type': 'ascending',
                'unique': True,
                'reason': '唯一标识符,防止重复'
            },
        ]
        recommendations['basic_indexes'].extend(
            candidate for candidate in basic_candidates
            if candidate['field'] in field_analysis
        )

        # The compound index is suggested regardless of the sampled fields.
        recommendations['compound_indexes'].append({
            'fields': ['category', 'collected_at'],
            'reason': '支持按分类筛选并按时间排序'
        })

        text_fields = [
            field for field in ('title', 'description', 'summary')
            if field in field_analysis
        ]
        if text_fields:
            recommendations['text_indexes'].append({
                'fields': text_fields,
                'type': 'text',
                'reason': '支持全文搜索'
            })

        # The vector-search note is always included as a consideration.
        recommendations['vector_indexes'].append({
            'consideration': '如果需要语义搜索',
            'fields': ['title', 'description'],
            'method': 'embedding + vector_search',
            'reason': '用于基于内容相似性的智能搜索和推荐'
        })

        return recommendations

    async def test_sample_queries(self) -> Dict[str, Any]:
        """Run three example queries and summarise what they returned.

        Returns:
            A dict with ``latest_news``, ``category_news`` and
            ``keyword_search`` summaries. If a query raises, the summaries
            gathered so far are kept and an ``error`` entry is added.
        """
        sample_queries: Dict[str, Any] = {}

        try:
            # 1. The ten most recently collected articles.
            latest_news = self.mongodb_client.find_documents(
                self.collection_name,
                query={},
                sort={'collected_at': -1},
                limit=10
            )
            latest_documents = self._documents_of(latest_news)
            sample_queries['latest_news'] = {
                'success': latest_news.get('success'),
                'count': len(latest_documents),
                'sample_titles': [self._preview_title(doc) for doc in latest_documents[:3]]
            }

            # 2. Articles of one category.
            category_news = self.mongodb_client.find_documents(
                self.collection_name,
                query={'category': '财经新闻'},
                limit=5
            )
            sample_queries['category_news'] = {
                'success': category_news.get('success'),
                'count': len(self._documents_of(category_news)),
                'category': '财经新闻'
            }

            # 3. Case-insensitive keyword search on the title.
            keyword_search = self.mongodb_client.find_documents(
                self.collection_name,
                query={'title': {'$regex': '投资|股票|市场', '$options': 'i'}},
                limit=5
            )
            sample_queries['keyword_search'] = {
                'success': keyword_search.get('success'),
                'count': len(self._documents_of(keyword_search)),
                'keywords': '投资|股票|市场'
            }

        except Exception as e:
            sample_queries['error'] = str(e)

        return sample_queries

    async def run_comprehensive_analysis(self) -> Dict[str, Any]:
        """Run every analysis step in order and gather the results.

        Returns:
            ``{'error': ...}`` when the database connection fails; otherwise
            a dict with ``collection_stats``, ``data_structure``,
            ``query_performance``, ``existing_indexes``,
            ``index_recommendations`` and ``sample_queries``.
        """
        self.logger.info("开始RSS数据分析...")

        if not await self.connect_to_database():
            return {'error': '无法连接到数据库'}

        analysis_results: Dict[str, Any] = {}

        # 1. Collection statistics.
        self.logger.info("获取集合统计信息...")
        analysis_results['collection_stats'] = await self.get_collection_stats()

        # 2. Data structure.
        self.logger.info("分析数据结构...")
        analysis_results['data_structure'] = await self.analyze_data_structure()

        # 3. Query performance.
        self.logger.info("测试查询性能...")
        analysis_results['query_performance'] = await self.test_query_performance()

        # 4. Existing indexes.
        self.logger.info("检查现有索引...")
        analysis_results['existing_indexes'] = await self.check_existing_indexes()

        # 5. Index recommendations, derived from steps 2 and 3.
        self.logger.info("生成索引建议...")
        analysis_results['index_recommendations'] = self.generate_index_recommendations(
            analysis_results['query_performance'],
            analysis_results['data_structure']
        )

        # 6. Example queries.
        self.logger.info("测试示例查询...")
        analysis_results['sample_queries'] = await self.test_sample_queries()

        return analysis_results
 | ||
| 
 | ||
async def main(report_path: str = '/home/ben/liurenchaxin/rss_analysis_report.json'):
    """Run the full RSS analysis, print a summary and save the detailed report.

    Args:
        report_path: File the JSON report is written to. The default keeps
            the location this script has always used.

    Raises:
        OSError: If the report file cannot be opened for writing.
    """
    # Build the MongoDB client.
    mongodb_client = SwarmMongoDBClient(
        mcp_server_url="http://localhost:8080",
        default_database="news_debate_db"
    )

    reader = RSSDataReader(mongodb_client)
    results = await reader.run_comprehensive_analysis()

    separator = "=" * 60
    print("\n" + separator)
    print("RSS数据分析报告")
    print(separator)

    # Surface a top-level failure (e.g. no database connection) instead of
    # silently printing an all-zero report.
    if 'error' in results:
        print(f"\n错误: {results['error']}")

    # Collection statistics.
    stats = results.get('collection_stats', {})
    print("\n📊 集合统计:")
    print(f"  总文档数: {stats.get('total_documents', 0)}")
    print(f"  集合存在: {stats.get('collection_exists', False)}")

    # Data structure.
    structure = results.get('data_structure', {})
    if 'field_analysis' in structure:
        print("\n🏗️  数据结构:")
        for field, info in structure['field_analysis'].items():
            print(f"  {field}: {info['type']} (出现{info['count']}次)")

    # Query performance; entries that failed carry no timing and are skipped.
    performance = results.get('query_performance', {})
    print("\n⚡ 查询性能:")
    for query_name, result in performance.items():
        if isinstance(result, dict) and 'query_time_ms' in result:
            print(f"  {query_name}: {result['query_time_ms']}ms ({result['document_count']}条结果)")

    # Index recommendations.
    recommendations = results.get('index_recommendations', {})
    print("\n💡 索引建议:")

    basic_indexes = recommendations.get('basic_indexes', [])
    if basic_indexes:
        print("  基础索引:")
        for idx in basic_indexes:
            print(f"    - {idx['field']} ({idx.get('type', 'ascending')}): {idx['reason']}")

    compound_indexes = recommendations.get('compound_indexes', [])
    if compound_indexes:
        print("  复合索引:")
        for idx in compound_indexes:
            print(f"    - {', '.join(idx['fields'])}: {idx['reason']}")

    text_indexes = recommendations.get('text_indexes', [])
    if text_indexes:
        print("  文本索引:")
        for idx in text_indexes:
            print(f"    - {', '.join(idx['fields'])}: {idx['reason']}")

    vector_indexes = recommendations.get('vector_indexes', [])
    if vector_indexes:
        print("  向量索引建议:")
        for idx in vector_indexes:
            print(f"    - {idx['consideration']}: {idx['reason']}")

    # Example query results.
    samples = results.get('sample_queries', {})
    print("\n🔍 示例查询:")
    for query_name, result in samples.items():
        if isinstance(result, dict) and 'count' in result:
            print(f"  {query_name}: {result['count']}条结果")

    print("\n" + separator)
    print("分析完成!")
    print(separator)

    # default=str stringifies values json cannot encode (e.g. the datetime
    # used in the time-range query).
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2, default=str)
    # Report the path actually written rather than a hard-coded file name.
    print(f"\n详细报告已保存到: {report_path}")
 | ||
| 
 | ||
if __name__ == "__main__":
    # Configure INFO-level logging with timestamps, then run the analysis.
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
    asyncio.run(main())