#!/usr/bin/env python3
"""Clean up duplicate article documents in MongoDB.

Groups articles by title, keeps the earliest copy within each duplicate
group, and stamps every article with a stable 16-character ID.
"""

import os
import hashlib
from collections import defaultdict
from datetime import datetime, timezone

from pymongo import MongoClient


def generate_stable_id(title, pub_date, content):
    """Generate a stable article ID from the title, publish date, and content."""
    normalized_title = title.strip().lower()
    # The first 100 characters are enough to disambiguate same-title articles
    content_prefix = content[:100] if content else ''
    date_str = pub_date or ''

    combined = f"{normalized_title}|{date_str}|{content_prefix}"
    return hashlib.md5(combined.encode()).hexdigest()[:16]
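
# Example (hypothetical values): repeated scrapes of the same story map to
# the same ID as long as the title, publish date, and first 100 characters
# of content are unchanged:
#   generate_stable_id("Breaking News", "2025-02-07", "Full text ...")


def generate_content_hash(content):
    """Hash the full article content, used to detect content edits.

    Kept separate from generate_stable_id, which title-normalizes its
    first argument (strip + lowercase) before hashing.
    """
    return hashlib.md5((content or '').encode()).hexdigest()[:16]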


def cleanup_duplicates():
    """Remove duplicate articles and give every article a stable ID."""
    # Connect to MongoDB
    mongo_uri = os.getenv('MONGODB_URI')
    if not mongo_uri:
        raise ValueError("MONGODB_URI environment variable is required")
    client = MongoClient(mongo_uri)
    db = client['taigong']
    collection = db['articles']

    print("Starting duplicate cleanup...")

    # 1. Load every article
    articles = list(collection.find({}))
    print(f"Found {len(articles)} articles in total")

    # 2. Group articles by title to find duplicates
    title_groups = defaultdict(list)
    for article in articles:
        title_groups[article['title']].append(article)

    # 3. Resolve each duplicate group
    duplicates_removed = 0
    articles_updated = 0

    for title, group in title_groups.items():
        if len(group) > 1:
            print(f"Duplicate title found: {title} ({len(group)} copies)")

            # Keep the earliest copy and delete the rest; str() guards
            # against documents whose created_at is a datetime rather
            # than an ISO string.
            group.sort(key=lambda x: str(x.get('created_at', '')))
            keep_article = group[0]

            # Re-key the kept article with a stable ID
            stable_id = generate_stable_id(
                keep_article['title'],
                keep_article.get('published_time', ''),
                keep_article.get('content', '')
            )

            collection.update_one(
                {'_id': keep_article['_id']},
                {
                    '$set': {
                        'article_id': stable_id,
                        'content_hash': generate_content_hash(keep_article.get('content', '')),
                        'last_updated': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
                    }
                }
            )
            articles_updated += 1

            # Delete the remaining duplicates
            for duplicate in group[1:]:
                collection.delete_one({'_id': duplicate['_id']})
                duplicates_removed += 1
                print(f"  Removed duplicate: {duplicate.get('article_id', 'unknown')}")

    # 4. Assign stable IDs to the articles that had no duplicates
    single_articles = [group[0] for group in title_groups.values() if len(group) == 1]
    for article in single_articles:
        # Only rewrite IDs that are missing or not in the stable 16-char format
        if not article.get('article_id') or len(article.get('article_id', '')) > 20:
            stable_id = generate_stable_id(
                article['title'],
                article.get('published_time', ''),
                article.get('content', '')
            )

            collection.update_one(
                {'_id': article['_id']},
                {
                    '$set': {
                        'article_id': stable_id,
                        'content_hash': generate_content_hash(article.get('content', '')),
                        'last_updated': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
                    }
                }
            )
            articles_updated += 1

    print("Cleanup complete:")
    print(f"  Duplicates removed: {duplicates_removed}")
    print(f"  Article IDs updated: {articles_updated}")
    print(f"  Final article count: {collection.count_documents({})}")
    client.close()


if __name__ == "__main__":
    cleanup_duplicates()
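
# Typical invocation, assuming the script is saved as cleanup_duplicates.py
# (hypothetical filename) and MONGODB_URI points at the target cluster:
#   MONGODB_URI="mongodb://localhost:27017" python cleanup_duplicates.py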