107 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			107 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
#!/usr/bin/env python3
"""
Clean up duplicate article data in MongoDB (清理MongoDB中的重复文章数据).
"""
 | |
| 
 | |
| import os
 | |
| import sys
 | |
| from pymongo import MongoClient
 | |
| from collections import defaultdict
 | |
| import hashlib
 | |
| 
 | |
def generate_stable_id(title, pub_date, content):
    """Generate a stable, deterministic 16-character ID for an article.

    The ID is the first 16 hex digits of the MD5 digest of
    ``"<normalized title>|<pub_date>|<first 100 chars of content>"``.
    MD5 serves only as a fingerprint here, not as a security measure.

    Args:
        title: Article title. Surrounding whitespace and letter case are
            ignored; ``None`` (or any falsy value) is treated as empty.
        pub_date: Publication date. Any falsy value is treated as empty;
            other values are interpolated via f-string formatting.
        content: Article body. Only its first 100 characters contribute;
            any falsy value is treated as empty.

    Returns:
        A 16-character lowercase hexadecimal string.
    """
    # A stored document may carry an explicit null here; treat it like an
    # empty title instead of failing with AttributeError on ``None.strip()``.
    normalized_title = (title or '').strip().lower()
    content_prefix = content[:100] if content else ''
    date_str = pub_date or ''

    combined = f"{normalized_title}|{date_str}|{content_prefix}"
    return hashlib.md5(combined.encode()).hexdigest()[:16]
 | |
| 
 | |
def _created_sort_key(article):
    """Return a sort key ordering articles by ``created_at``, oldest first."""
    # str() keeps comparisons well-defined when the value is missing, None,
    # or of mixed types (e.g. datetime vs. string). A missing value becomes
    # '' and therefore sorts first.
    return str(article.get('created_at') or '')


def _stable_id_fields(article, timestamp):
    """Build the ``$set`` payload assigning stable identifiers to *article*."""
    # ``or ''`` guards against an explicit null content, which would
    # otherwise be passed in the title position for the content hash.
    content = article.get('content') or ''
    return {
        'article_id': generate_stable_id(
            article['title'],
            article.get('published_time', ''),
            content,
        ),
        'content_hash': generate_stable_id(content, '', ''),
        'last_updated': timestamp,
    }


def cleanup_duplicates():
    """Remove duplicate articles from MongoDB and assign stable article IDs.

    Connects to the server named by the ``MONGODB_URI`` environment
    variable and works on the ``articles`` collection of the ``taigong``
    database:

    1. Loads every article and groups them by exact ``title``.
    2. For each title with more than one article, keeps the one with the
       smallest ``created_at`` (a missing value sorts first), rewrites its
       ``article_id`` / ``content_hash`` / ``last_updated`` fields, and
       deletes the others.
    3. For each title with a single article, rewrites the same fields only
       when ``article_id`` is missing/empty or longer than 20 characters.

    Documents whose ``title`` is missing or not a string are left untouched.
    Progress and a final summary are printed to stdout.

    Raises:
        ValueError: If ``MONGODB_URI`` is unset or empty.
    """
    from datetime import datetime, timezone

    mongo_uri = os.getenv('MONGODB_URI')
    if not mongo_uri:
        raise ValueError("MONGODB_URI environment variable is required")

    client = MongoClient(mongo_uri)
    try:
        collection = client['taigong']['articles']

        print("开始清理重复数据...")

        # One timestamp for the whole run, in the same string format the
        # collection already uses for ``last_updated``.
        run_timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

        # 1. Load every article.
        articles = list(collection.find({}))
        print(f"总共找到 {len(articles)} 篇文章")

        # 2. Group by title to find duplicates. Untitled documents are
        #    skipped: grouping them together would delete all but one.
        title_groups = defaultdict(list)
        skipped = 0
        for article in articles:
            title = article.get('title')
            if not isinstance(title, str):
                skipped += 1
                continue
            title_groups[title].append(article)
        if skipped:
            print(f"  跳过缺少有效标题的文章: {skipped} 篇")

        duplicates_removed = 0
        articles_updated = 0

        # 3. Resolve duplicated titles: keep the earliest, delete the rest.
        for title, group in title_groups.items():
            if len(group) < 2:
                continue
            print(f"发现重复标题: {title} ({len(group)} 篇)")

            keep_article, *duplicates = sorted(group, key=_created_sort_key)

            collection.update_one(
                {'_id': keep_article['_id']},
                {'$set': _stable_id_fields(keep_article, run_timestamp)},
            )
            articles_updated += 1

            for duplicate in duplicates:
                collection.delete_one({'_id': duplicate['_id']})
                duplicates_removed += 1
                print(f"  删除重复项: {duplicate.get('article_id', 'unknown')}")

        # 4. Give unique articles a stable ID when theirs is missing or is
        #    longer than 20 characters.
        for group in title_groups.values():
            if len(group) != 1:
                continue
            article = group[0]
            article_id = article.get('article_id')
            if article_id and len(str(article_id)) <= 20:
                continue
            collection.update_one(
                {'_id': article['_id']},
                {'$set': _stable_id_fields(article, run_timestamp)},
            )
            articles_updated += 1

        print("清理完成:")
        print(f"  删除重复文章: {duplicates_removed} 篇")
        print(f"  更新文章ID: {articles_updated} 篇")
        print(f"  最终文章数: {collection.count_documents({})} 篇")
    finally:
        # Always release the connection pool, even if an operation fails.
        client.close()
 | |
| 
 | |
# Run the cleanup only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    cleanup_duplicates()