feat: 重构项目结构并添加新功能

- 新增Cloudflare AutoRAG/Vectorize集成文档
- 实现Vertex AI记忆银行功能
- 重构项目目录结构,清理无用文件
- 更新README以反映最新架构
- 添加Google ADK集成测试脚本
- 完善需求文档和设计规范
This commit is contained in:
ben
2025-08-16 10:37:11 +00:00
parent 26338d48cf
commit c4e8cfefc7
106 changed files with 12243 additions and 1839 deletions

View File

@@ -1,63 +0,0 @@
#!/usr/bin/env python3
"""
为现有文章添加流水号
"""
import os
from pymongo import MongoClient
def add_sequence_ids():
    """Backfill a monotonically increasing sequence_id onto existing articles.

    Articles lacking a ``sequence_id`` are sorted by ``created_at`` and
    numbered from 1 upward; each also receives a formatted ``article_id``
    (``NEWS_00000001``) and migration-batch metadata.

    Raises:
        ValueError: if the MONGODB_URI environment variable is missing.
    """
    from datetime import datetime, timezone

    mongo_uri = os.getenv('MONGODB_URI')
    if not mongo_uri:
        raise ValueError("MONGODB_URI environment variable is required")
    client = MongoClient(mongo_uri)
    try:
        collection = client['taigong']['articles']
        print("开始为现有文章添加流水号...")
        # Oldest first so sequence numbers follow creation order.
        articles_without_seq = list(collection.find(
            {"sequence_id": {"$exists": False}},
            {"_id": 1, "title": 1, "created_at": 1}
        ).sort("created_at", 1))
        print(f"找到 {len(articles_without_seq)} 篇文章需要添加流水号")
        if len(articles_without_seq) == 0:
            print("所有文章都已有流水号")
            return
        # Record the real migration time instead of a hard-coded date.
        now_iso = datetime.now(timezone.utc).isoformat()
        for i, article in enumerate(articles_without_seq, 1):
            sequence_id = i
            article_id = f"NEWS_{sequence_id:08d}"  # NEWS_00000001 format
            collection.update_one(
                {"_id": article["_id"]},
                {
                    "$set": {
                        "sequence_id": sequence_id,
                        "article_id": article_id,
                        "batch_id": "migration_batch",
                        "last_updated": now_iso
                    }
                }
            )
            # .get() guards against documents that have no title field.
            print(f" {sequence_id:3d}: {article.get('title', '')[:50]}...")
        print(f"流水号添加完成,共处理 {len(articles_without_seq)} 篇文章")
        # Sanity-check the migration result.
        total_with_seq = collection.count_documents({"sequence_id": {"$exists": True}})
        max_seq = collection.find_one({}, sort=[("sequence_id", -1)])
        print(f"验证结果:")
        print(f" 有流水号的文章: {total_with_seq}")
        print(f" 最大流水号: {max_seq['sequence_id'] if max_seq else 0}")
    finally:
        client.close()  # release the connection pool even on failure


if __name__ == "__main__":
    add_sequence_ids()

View File

@@ -1,107 +0,0 @@
#!/usr/bin/env python3
"""
清理MongoDB中的重复文章数据
"""
import os
import sys
from pymongo import MongoClient
from collections import defaultdict
import hashlib
def generate_stable_id(title, pub_date, content):
    """Derive a deterministic 16-hex-char article ID from its fields.

    The ID is the first 16 characters of the MD5 of
    ``"<normalized title>|<date>|<first 100 content chars>"``.
    """
    key = "|".join([
        title.strip().lower(),
        pub_date or '',
        content[:100] if content else '',
    ])
    return hashlib.md5(key.encode()).hexdigest()[:16]
def cleanup_duplicates():
    """Remove duplicate articles (same title) and normalize article IDs.

    For each group of articles sharing a title, the oldest (by
    ``created_at``) is kept and re-keyed with a stable content-derived ID;
    the rest are deleted.  Singleton articles with a missing or over-long
    (legacy) ``article_id`` are re-keyed the same way.

    Raises:
        ValueError: if the MONGODB_URI environment variable is missing.
    """
    from datetime import datetime, timezone

    mongo_uri = os.getenv('MONGODB_URI')
    if not mongo_uri:
        raise ValueError("MONGODB_URI environment variable is required")
    client = MongoClient(mongo_uri)
    try:
        collection = client['taigong']['articles']
        print("开始清理重复数据...")
        articles = list(collection.find({}))
        print(f"总共找到 {len(articles)} 篇文章")
        # Group by exact title to detect duplicates.
        title_groups = defaultdict(list)
        for article in articles:
            title_groups[article['title']].append(article)
        duplicates_removed = 0
        articles_updated = 0
        # Use the actual run time for last_updated, not a hard-coded date.
        now_iso = datetime.now(timezone.utc).isoformat()
        for title, group in title_groups.items():
            if len(group) > 1:
                print(f"发现重复标题: {title} ({len(group)} 篇)")
                # Keep the earliest; documents without created_at sort first.
                group.sort(key=lambda x: x.get('created_at', ''))
                keep_article = group[0]
                stable_id = generate_stable_id(
                    keep_article['title'],
                    keep_article.get('published_time', ''),
                    keep_article.get('content', '')
                )
                collection.update_one(
                    {'_id': keep_article['_id']},
                    {
                        '$set': {
                            'article_id': stable_id,
                            # NOTE(review): hashing content through the title
                            # slot lowercases it; kept as-is for ID stability.
                            'content_hash': generate_stable_id(keep_article.get('content', ''), '', ''),
                            'last_updated': now_iso
                        }
                    }
                )
                articles_updated += 1
                # Delete every later duplicate.
                for duplicate in group[1:]:
                    collection.delete_one({'_id': duplicate['_id']})
                    duplicates_removed += 1
                    print(f" 删除重复项: {duplicate.get('article_id', 'unknown')}")
        # Re-key singletons whose article_id is missing or over-long (legacy).
        single_articles = [group[0] for group in title_groups.values() if len(group) == 1]
        for article in single_articles:
            if not article.get('article_id') or len(article.get('article_id', '')) > 20:
                stable_id = generate_stable_id(
                    article['title'],
                    article.get('published_time', ''),
                    article.get('content', '')
                )
                collection.update_one(
                    {'_id': article['_id']},
                    {
                        '$set': {
                            'article_id': stable_id,
                            'content_hash': generate_stable_id(article.get('content', ''), '', ''),
                            'last_updated': now_iso
                        }
                    }
                )
                articles_updated += 1
        print(f"清理完成:")
        print(f" 删除重复文章: {duplicates_removed}")
        print(f" 更新文章ID: {articles_updated}")
        print(f" 最终文章数: {collection.count_documents({})}")
    finally:
        client.close()  # release the connection even if a query fails


if __name__ == "__main__":
    cleanup_duplicates()

View File

@@ -1,35 +0,0 @@
// MongoDB Atlas Vector Search index creation script (run with mongosh).
// Creates the vector index used by the swarm debate system.

// Select the target database.
use('taigong');

// Vector index for semantic search and content clustering; backs the
// swarm debate system's semantic-similarity matching.
db.articles.createSearchIndex(
  "vector_search_index",
  {
    "fields": [
      {
        "type": "vector",
        "path": "embedding",
        "numDimensions": 1536, // OpenAI text-embedding-ada-002 dimension
        "similarity": "cosine"
      },
      {
        "type": "filter",
        "path": "published_time"
      },
      {
        "type": "filter",
        "path": "title"
      }
    ]
  }
);

print("向量索引创建完成!");
print("索引名称: vector_search_index");
print("向量维度: 1536 (OpenAI embedding)");
print("相似性算法: cosine");
print("支持过滤字段: published_time, title");

View File

@@ -1,75 +0,0 @@
#!/usr/bin/env python3
"""
为MongoDB中的文章生成向量embeddings
用于swarm辩论系统的语义搜索和内容聚类
"""
import os
import openai
from pymongo import MongoClient
from typing import List, Dict
import time
def get_mongodb_client():
    """Build a MongoClient from the MONGODB_URI environment variable.

    Raises:
        ValueError: if MONGODB_URI is unset or empty.
    """
    uri = os.getenv('MONGODB_URI')
    if uri:
        return MongoClient(uri)
    raise ValueError("MONGODB_URI not found in environment variables")
def generate_embedding(text: str) -> List[float]:
    """Embed *text* with OpenAI's legacy Embedding endpoint.

    Returns the embedding vector, or None when the API call fails
    (failures are reported but never raised).
    """
    try:
        reply = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=text
        )
    except Exception as exc:
        # Best-effort: log and let the caller skip this item.
        print(f"生成embedding失败: {exc}")
        return None
    return reply['data'][0]['embedding']
def update_articles_with_embeddings():
    """Add an ``embedding`` field to every article that lacks one.

    Only the article *title* is embedded (not the body); failed
    embeddings are skipped rather than retried.
    """
    client = get_mongodb_client()
    db = client.taigong
    collection = db.articles
    # Only articles that do not have an embedding yet.
    articles = collection.find({"embedding": {"$exists": False}})
    count = 0
    for article in articles:
        title = article.get('title', '')
        if not title:
            continue
        print(f"处理文章: {title[:50]}...")
        # Embed the title only.
        embedding = generate_embedding(title)
        if embedding:
            collection.update_one(
                {"_id": article["_id"]},
                {"$set": {"embedding": embedding}}
            )
            count += 1
            print(f"✓ 已更新 {count} 篇文章")
            # Throttle to stay under the OpenAI rate limit.
            time.sleep(0.1)
        else:
            print(f"× 跳过文章: {title[:50]}")
    print(f"\n完成!共处理 {count} 篇文章")
    client.close()
if __name__ == "__main__":
    # The OpenAI API key is expected in the environment (e.g. injected by Doppler).
    openai.api_key = os.getenv('OPENAI_API_KEY')
    if not openai.api_key:
        print("警告: OPENAI_API_KEY 未设置请先在Doppler中配置")
        exit(1)
    update_articles_with_embeddings()

View File

@@ -1,73 +0,0 @@
#!/usr/bin/env python3
"""
安装OpenAI Swarm的脚本
"""
import subprocess
import sys
def install_swarm():
    """Install OpenAI Swarm from GitHub via pip.

    Returns:
        True when both the pip install and a verification import succeed.
    """
    print("🚀 正在安装OpenAI Swarm...")
    try:
        proc = subprocess.run(
            [sys.executable, "-m", "pip", "install",
             "git+https://github.com/openai/swarm.git"],
            check=True, capture_output=True, text=True)
        print("✅ OpenAI Swarm安装成功")
        print(proc.stdout)
        # Confirm the freshly installed package can actually be imported.
        try:
            import swarm
            print("✅ Swarm导入测试成功")
            print(f"📦 Swarm版本: {getattr(swarm, '__version__', '未知')}")
        except ImportError as err:
            print(f"❌ Swarm导入失败: {err}")
            return False
        return True
    except subprocess.CalledProcessError as err:
        print(f"❌ 安装失败: {err}")
        print(f"错误输出: {err.stderr}")
        return False
    except Exception as err:
        print(f"❌ 未知错误: {err}")
        return False
def main():
    """Entry point: check for an existing Swarm install, then install if needed."""
    print("🏛️ 稷下学宫Swarm环境安装")
    print("=" * 40)
    # If Swarm is already importable, offer an optional reinstall.
    try:
        import swarm
        print("✅ OpenAI Swarm已安装")
        print(f"📦 版本: {getattr(swarm, '__version__', '未知')}")
        choice = input("是否重新安装?(y/N): ").strip().lower()
        if choice not in ['y', 'yes']:
            print("🎉 安装检查完成")
            return
    except ImportError:
        print("📦 OpenAI Swarm未安装开始安装...")
    # Perform the (re)installation.
    success = install_swarm()
    if success:
        print("\n🎉 安装完成现在可以使用Swarm八仙论道了")
        print("💡 使用方法:")
        print(" python src/jixia/debates/swarm_debate.py")
        print(" 或在Streamlit应用中选择'Swarm模式'")
    else:
        print("\n❌ 安装失败,请手动安装:")
        print(" pip install git+https://github.com/openai/swarm.git")


if __name__ == "__main__":
    main()

View File

@@ -1,148 +0,0 @@
const items = $input.all();
const results = [];
// 改进的哈希函数 - 基于内容生成稳定的ID
function generateStableId(title, pubDate, content) {
const normalizedTitle = title.trim().toLowerCase();
const contentHash = content ? content.substring(0, 100) : '';
const dateStr = pubDate || '';
const combined = normalizedTitle + '|' + dateStr + '|' + contentHash;
let hash = 0;
for (let i = 0; i < combined.length; i++) {
const char = combined.charCodeAt(i);
hash = ((hash << 5) - hash) + char;
hash = hash & hash;
}
return Math.abs(hash).toString(16);
}
console.log(`开始处理 ${items.length} 条RSS数据`);
// 用于本次执行内去重
const processedInThisRun = new Set();
// 处理每个RSS项目
for (const item of items) {
const data = item.json;
// 跳过无效数据
if (!data.title) {
console.log('跳过无标题数据');
continue;
}
// 生成稳定的文章ID
const stableId = generateStableId(
data.title,
data.isoDate || data.pubDate,
data['content:encodedSnippet'] || data.contentSnippet || ''
);
// 生成内容哈希
const contentHash = generateStableId(
data['content:encodedSnippet'] || data.contentSnippet || '',
'',
''
);
// 准备文章数据
const articleData = {
article_id: stableId,
title: data.title,
content: data['content:encodedSnippet'] || data.contentSnippet || '',
content_hash: contentHash,
published_time: data.isoDate || data.pubDate || new Date().toISOString(),
source_url: data.link || '',
processed: false,
created_at: new Date().toISOString(),
last_updated: new Date().toISOString()
};
try {
// 使用upsert操作避免重复插入
const result = await mongoClient.db('taigong').collection('articles').updateOne(
{
$or: [
{ article_id: stableId },
{ title: data.title }
]
},
{
$setOnInsert: {
article_id: stableId,
title: data.title,
content: articleData.content,
content_hash: contentHash,
published_time: articleData.published_time,
source_url: articleData.source_url,
processed: false,
created_at: articleData.created_at
},
$set: {
last_updated: new Date().toISOString()
}
},
{ upsert: true }
);
if (result.upsertedCount > 0) {
console.log('✅ 新增文章:', data.title);
results.push({
json: {
action: 'inserted',
article_id: stableId,
title: data.title,
status: 'success'
}
});
} else if (result.modifiedCount > 0) {
console.log('🔄 更新文章:', data.title);
results.push({
json: {
action: 'updated',
article_id: stableId,
title: data.title,
status: 'success'
}
});
} else {
console.log('⏭️ 文章已存在,跳过:', data.title);
}
} catch (error) {
console.error('❌ 处理文章失败:', data.title, error.message);
results.push({
json: {
action: 'error',
title: data.title,
error: error.message,
status: 'failed'
}
});
}
}
console.log(`处理完成: 原始${items.length}条, 成功处理${results.length}`);
// 统计结果
const stats = results.reduce((acc, item) => {
acc[item.json.action] = (acc[item.json.action] || 0) + 1;
return acc;
}, {});
console.log('处理统计:', stats);
// 如果没有任何结果,返回一个空的成功状态
if (results.length === 0) {
return [{
json: {
message: '没有新数据需要处理',
total_processed: items.length,
status: 'completed'
}
}];
}
return results;

View File

@@ -1,85 +0,0 @@
// n8n code node: dedup incoming RSS items against the database and the batch.
const items = $input.all();
const processedItems = [];

// Content-based hash: stable across runs for the same article.
function generateStableId(title, pubDate, content) {
  const normalizedTitle = title.trim().toLowerCase();
  const contentHash = content ? content.substring(0, 100) : '';
  const dateStr = pubDate || '';
  const combined = normalizedTitle + '|' + dateStr + '|' + contentHash;
  let hash = 0;
  for (let i = 0; i < combined.length; i++) {
    const char = combined.charCodeAt(i);
    hash = ((hash << 5) - hash) + char;
    hash = hash & hash; // clamp to 32-bit
  }
  return Math.abs(hash).toString(16);
}

// 1. Load existing article IDs and titles from the database for dedup.
const existingArticles = new Set();
try {
  const existing = await mongoClient.db('taigong').collection('articles')
    .find({}, { projection: { article_id: 1, title: 1, content_hash: 1 } })
    .toArray();
  existing.forEach(article => {
    existingArticles.add(article.article_id);
    // Titles serve as a secondary dedup key.
    existingArticles.add(article.title);
  });
  console.log(`数据库中已有 ${existing.length} 篇文章`);
} catch (error) {
  console.log('查询现有文章失败:', error);
}

// 2. Process the incoming items.
for (const item of items) {
  const data = item.json;
  // Skip invalid entries.
  if (!data.title) continue;
  // Stable content-derived article ID.
  const stableId = generateStableId(
    data.title,
    data.isoDate || data.pubDate,
    data['content:encodedSnippet'] || data.contentSnippet || ''
  );
  // Dedup by both ID and title.
  if (existingArticles.has(stableId) || existingArticles.has(data.title)) {
    console.log('跳过重复文章:', data.title);
    continue;
  }
  // Content hash used for later change detection.
  const contentHash = generateStableId(
    data['content:encodedSnippet'] || data.contentSnippet || '',
    '',
    ''
  );
  const processedItem = {
    article_id: stableId, // stable ID
    title: data.title,
    content: data['content:encodedSnippet'] || data.contentSnippet || '',
    content_hash: contentHash, // content hash
    published_time: data.isoDate || data.pubDate || new Date().toISOString(),
    source_url: data.link || '', // source link
    processed: false,
    created_at: new Date().toISOString(),
    last_updated: new Date().toISOString() // update timestamp
  };
  processedItems.push({ json: processedItem });
  // Remember within this run to avoid in-batch duplicates.
  existingArticles.add(stableId);
  existingArticles.add(data.title);
}

console.log(`处理完成: 原始${items.length}条, 去重后${processedItems.length}`);
return processedItems;

View File

@@ -1,85 +0,0 @@
// n8n code node: bulk-insert pre-deduplicated articles, with per-document fallback.
const items = $input.all();
const results = [];

// Nothing to insert (empty input or an explicit no-new-data marker).
if (items.length === 0 || (items.length === 1 && items[0].json.status === 'no_new_data')) {
  console.log('没有新数据需要插入');
  return items;
}

console.log(`准备插入 ${items.length} 条新文章`);

// Flatten to plain documents for the bulk insert.
const documentsToInsert = items.map(item => item.json);

try {
  // Uniqueness was guaranteed upstream, so insert directly.
  const result = await mongoClient.db('taigong').collection('articles').insertMany(
    documentsToInsert,
    { ordered: false } // keep inserting the rest even if one document fails
  );
  console.log(`✅ 成功插入 ${result.insertedCount} 条文章`);
  // Report each inserted document.
  for (let i = 0; i < documentsToInsert.length; i++) {
    const doc = documentsToInsert[i];
    const insertedId = result.insertedIds[i];
    results.push({
      json: {
        action: 'inserted',
        sequence_id: doc.sequence_id,
        article_id: doc.article_id,
        title: doc.title,
        mongodb_id: insertedId,
        status: 'success'
      }
    });
  }
} catch (error) {
  console.error('❌ 批量插入失败:', error.message);
  // Fall back to one-by-one inserts so a single bad document
  // does not sink the whole batch.
  console.log('尝试逐条插入...');
  for (const doc of documentsToInsert) {
    try {
      const result = await mongoClient.db('taigong').collection('articles').insertOne(doc);
      console.log(`✅ 单条插入成功: ${doc.article_id}`);
      results.push({
        json: {
          action: 'inserted',
          sequence_id: doc.sequence_id,
          article_id: doc.article_id,
          title: doc.title,
          mongodb_id: result.insertedId,
          status: 'success'
        }
      });
    } catch (singleError) {
      console.error(`❌ 单条插入失败 ${doc.article_id}:`, singleError.message);
      results.push({
        json: {
          action: 'error',
          sequence_id: doc.sequence_id,
          article_id: doc.article_id,
          title: doc.title,
          error: singleError.message,
          status: 'failed'
        }
      });
    }
  }
}

// Summary counters.
const successCount = results.filter(r => r.json.status === 'success').length;
const failCount = results.filter(r => r.json.status === 'failed').length;
console.log(`插入完成: 成功 ${successCount} 条, 失败 ${failCount}`);
return results;

View File

@@ -1,39 +0,0 @@
// n8n code node: in-batch dedup plus timestamp-based numeric IDs.
const items = $input.all();
console.log(`原始数据: ${items.length}`);

// In-batch dedup by title.
const seenTitles = new Set();
const uniqueItems = [];

// Starting ID from the current unix time (seconds).
// NOTE(review): two runs within the same second would produce colliding
// IDs — a DB-backed sequence would be safer; confirm before relying on uniqueness.
let nextId = Math.floor(Date.now() / 1000);

for (const item of items) {
  const data = item.json;
  // Skip invalid entries.
  if (!data.title) continue;
  // In-batch dedup.
  if (seenTitles.has(data.title)) {
    console.log('⏭️ 本批次重复,跳过:', data.title);
    continue;
  }
  const newsItem = {
    id: nextId,
    title: data.title,
    published_time: data.isoDate || data.pubDate || new Date().toISOString(),
    source_url: data.link || ''
  };
  uniqueItems.push({ json: newsItem });
  seenTitles.add(data.title);
  console.log(`✅ ID ${nextId}: ${data.title}`);
  nextId++;
}

console.log(`本批次去重后: ${uniqueItems.length}`);
return uniqueItems;

View File

@@ -1,54 +0,0 @@
// n8n MongoDB insert node: upsert pre-built article documents.
const items = $input.all();
const results = [];

for (const item of items) {
  const data = item.json;
  try {
    // Upsert keyed on article_id OR title so reruns never duplicate rows.
    const result = await mongoClient.db('taigong').collection('articles').updateOne(
      {
        $or: [
          { article_id: data.article_id },
          { title: data.title }
        ]
      },
      {
        $setOnInsert: {
          article_id: data.article_id,
          title: data.title,
          content: data.content,
          content_hash: data.content_hash,
          published_time: data.published_time,
          source_url: data.source_url,
          processed: data.processed,
          created_at: data.created_at
        },
        $set: {
          last_updated: new Date().toISOString()
        }
      },
      { upsert: true }
    );
    if (result.upsertedCount > 0) {
      console.log('新增文章:', data.title);
      results.push({
        json: {
          action: 'inserted',
          article_id: data.article_id,
          title: data.title
        }
      });
    } else {
      console.log('文章已存在,跳过:', data.title);
    }
  } catch (error) {
    console.error('插入文章失败:', data.title, error);
  }
}

console.log(`成功处理 ${results.length} 篇新文章`);
return results;

View File

@@ -1,119 +0,0 @@
// n8n code node: assign monotonically increasing sequence IDs to new articles.
const items = $input.all();
const processedItems = [];

// Fetch the current maximum sequence number from the collection.
async function getCurrentMaxId() {
  try {
    const result = await mongoClient.db('taigong').collection('articles')
      .findOne({}, {
        sort: { sequence_id: -1 },
        projection: { sequence_id: 1 }
      });
    return result ? result.sequence_id : 0;
  } catch (error) {
    console.log('获取最大流水号失败从1开始:', error.message);
    return 0;
  }
}

// Fetch the set of titles already stored (dedup check).
async function getExistingTitles() {
  try {
    const existing = await mongoClient.db('taigong').collection('articles')
      .find({}, { projection: { title: 1 } })
      .toArray();
    return new Set(existing.map(doc => doc.title));
  } catch (error) {
    console.log('获取已存在标题失败:', error.message);
    return new Set();
  }
}

// 32-bit hash of the first 200 chars, for content-change detection.
function generateContentHash(content) {
  if (!content) return '';
  let hash = 0;
  const str = content.substring(0, 200); // first 200 characters only
  for (let i = 0; i < str.length; i++) {
    const char = str.charCodeAt(i);
    hash = ((hash << 5) - hash) + char;
    hash = hash & hash; // clamp to 32-bit
  }
  return Math.abs(hash).toString(16);
}

console.log(`开始处理 ${items.length} 条RSS数据`);

// 1. Current maximum sequence number.
const currentMaxId = await getCurrentMaxId();
console.log(`当前数据库最大流水号: ${currentMaxId}`);

// 2. Titles already in the database.
const existingTitles = await getExistingTitles();
console.log(`数据库中已有 ${existingTitles.size} 篇文章`);

// 3. Assign sequence numbers to the new items.
let nextSequenceId = currentMaxId + 1;
const seenTitlesInBatch = new Set(); // in-batch dedup

for (const item of items) {
  const data = item.json;
  // Skip invalid entries.
  if (!data.title) {
    console.log('跳过无标题数据');
    continue;
  }
  // Dedup against both the database and the current batch.
  if (existingTitles.has(data.title) || seenTitlesInBatch.has(data.title)) {
    console.log('⏭️ 跳过重复文章:', data.title);
    continue;
  }
  // Hand out the next sequence number.
  const sequenceId = nextSequenceId++;
  const articleData = {
    sequence_id: sequenceId, // primary key: sequence number
    article_id: `NEWS_${sequenceId.toString().padStart(8, '0')}`, // e.g. NEWS_00000001
    title: data.title,
    content: data['content:encodedSnippet'] || data.contentSnippet || '',
    content_hash: generateContentHash(data['content:encodedSnippet'] || data.contentSnippet || ''),
    published_time: data.isoDate || data.pubDate || new Date().toISOString(),
    source_url: data.link || '',
    rss_source: data.meta?.title || 'unknown', // RSS feed name
    processed: false,
    created_at: new Date().toISOString(),
    batch_id: Date.now().toString() // batch marker for tracing
  };
  processedItems.push({ json: articleData });
  seenTitlesInBatch.add(data.title);
  console.log(`✅ 分配流水号 ${sequenceId}: ${data.title}`);
}

console.log(`流水号分配完成:`);
console.log(` 原始数据: ${items.length}`);
console.log(` 跳过重复: ${items.length - processedItems.length}`);
console.log(` 新增数据: ${processedItems.length}`);
console.log(` 流水号范围: ${currentMaxId + 1} - ${nextSequenceId - 1}`);

// Return an explicit marker when nothing new arrived.
if (processedItems.length === 0) {
  return [{
    json: {
      message: '没有新数据需要处理',
      current_max_id: currentMaxId,
      total_articles_in_db: existingTitles.size,
      status: 'no_new_data'
    }
  }];
}
return processedItems;

View File

@@ -1,52 +0,0 @@
const items = $input.all();
// 简单哈希函数
function simpleHash(str) {
let hash = 0;
for (let i = 0; i < str.length; i++) {
const char = str.charCodeAt(i);
hash = ((hash << 5) - hash) + char;
hash = hash & hash;
}
return Math.abs(hash).toString(16);
}
console.log(`原始数据: ${items.length}`);
// 用标题去重
const seenTitles = new Set();
const uniqueItems = [];
for (const item of items) {
const data = item.json;
// 跳过无效数据
if (!data.title) continue;
// 本批次内去重
if (seenTitles.has(data.title)) {
console.log('跳过重复:', data.title);
continue;
}
// 生成稳定ID
const stableId = simpleHash(data.title + (data.isoDate || data.pubDate || ''));
const processedItem = {
article_id: stableId,
title: data.title,
content: data['content:encodedSnippet'] || data.contentSnippet || '',
published_time: data.isoDate || data.pubDate || new Date().toISOString(),
source_url: data.link || '',
processed: false,
created_at: new Date().toISOString()
};
uniqueItems.push({ json: processedItem });
seenTitles.add(data.title);
console.log(`✅ 处理: ${data.title}`);
}
console.log(`去重后: ${uniqueItems.length}`);
return uniqueItems;

View File

@@ -1,163 +0,0 @@
// n8n code node: insert RSS items, probing the runtime for a MongoDB handle.
const items = $input.all();
const results = [];

// Probe the n8n runtime for whichever MongoDB handle is exposed.
function getMongoConnection() {
  // Try the connection variable names different node versions expose.
  if (typeof mongoClient !== 'undefined') return mongoClient;
  if (typeof mongo !== 'undefined') return mongo;
  if (typeof db !== 'undefined') return db;
  if (typeof $mongo !== 'undefined') return $mongo;
  if (typeof client !== 'undefined') return client;
  throw new Error('找不到MongoDB连接对象请检查n8n MongoDB节点配置');
}

// Content-based hash: stable across runs for the same article.
function generateStableId(title, pubDate, content) {
  const normalizedTitle = title.trim().toLowerCase();
  const contentHash = content ? content.substring(0, 100) : '';
  const dateStr = pubDate || '';
  const combined = normalizedTitle + '|' + dateStr + '|' + contentHash;
  let hash = 0;
  for (let i = 0; i < combined.length; i++) {
    const char = combined.charCodeAt(i);
    hash = ((hash << 5) - hash) + char;
    hash = hash & hash; // clamp to 32-bit
  }
  return Math.abs(hash).toString(16);
}

console.log(`开始处理 ${items.length} 条RSS数据`);

// Resolve the MongoDB handle up front; return a marker item on failure.
let mongoConnection;
try {
  mongoConnection = getMongoConnection();
  console.log('✅ MongoDB连接获取成功');
} catch (error) {
  console.error('❌ MongoDB连接失败:', error.message);
  return [{
    json: {
      error: 'MongoDB连接失败',
      message: error.message,
      status: 'connection_failed'
    }
  }];
}

// In-run dedup by title.
const processedInThisRun = new Set();

for (const item of items) {
  const data = item.json;
  // Skip invalid entries.
  if (!data.title) {
    console.log('跳过无标题数据');
    continue;
  }
  // In-run dedup check.
  if (processedInThisRun.has(data.title)) {
    console.log('⏭️ 本次执行内重复,跳过:', data.title);
    continue;
  }
  // Stable content-derived article ID.
  const stableId = generateStableId(
    data.title,
    data.isoDate || data.pubDate,
    data['content:encodedSnippet'] || data.contentSnippet || ''
  );
  // Content hash for later change detection.
  const contentHash = generateStableId(
    data['content:encodedSnippet'] || data.contentSnippet || '',
    '',
    ''
  );
  const articleData = {
    article_id: stableId,
    title: data.title,
    content: data['content:encodedSnippet'] || data.contentSnippet || '',
    content_hash: contentHash,
    published_time: data.isoDate || data.pubDate || new Date().toISOString(),
    source_url: data.link || '',
    rss_source: data.meta?.title || 'unknown',
    processed: false,
    created_at: new Date().toISOString(),
    last_updated: new Date().toISOString()
  };
  try {
    // Skip anything already stored (matched by ID or title).
    const existing = await mongoConnection.db('taigong').collection('articles').findOne({
      $or: [
        { article_id: stableId },
        { title: data.title }
      ]
    });
    if (existing) {
      console.log('⏭️ 数据库中已存在,跳过:', data.title);
      continue;
    }
    // Insert the new article.
    const result = await mongoConnection.db('taigong').collection('articles').insertOne(articleData);
    console.log('✅ 新增文章:', data.title);
    results.push({
      json: {
        action: 'inserted',
        article_id: stableId,
        title: data.title,
        mongodb_id: result.insertedId,
        status: 'success'
      }
    });
    // Remember for in-run dedup.
    processedInThisRun.add(data.title);
  } catch (error) {
    console.error('❌ 处理文章失败:', data.title, error.message);
    results.push({
      json: {
        action: 'error',
        title: data.title,
        error: error.message,
        status: 'failed'
      }
    });
  }
}

console.log(`处理完成: 原始${items.length}条, 成功处理${results.length}`);

// Tally outcomes per action type.
const stats = results.reduce((acc, item) => {
  acc[item.json.action] = (acc[item.json.action] || 0) + 1;
  return acc;
}, {});
console.log('处理统计:', stats);

// Return an explicit marker when nothing was produced.
if (results.length === 0) {
  return [{
    json: {
      message: '没有新数据需要处理',
      total_processed: items.length,
      status: 'completed'
    }
  }];
}
return results;

View File

@@ -1,163 +0,0 @@
#!/usr/bin/env python3
"""
测试OpenRouter API连接
重构版本:使用统一配置管理
"""
import requests
from typing import Dict, Any
def test_openrouter_api() -> bool:
    """Test connectivity to the OpenRouter chat-completions API.

    Returns:
        True when a minimal request with the configured key succeeds.
    """
    # Prefer the central config module; fall back to the raw env variable.
    try:
        from config.doppler_config import get_openrouter_key
        api_key = get_openrouter_key()
    except ImportError:
        # Config module unavailable — read the environment directly.
        import os
        api_key = os.getenv('OPENROUTER_API_KEY_1')
    except Exception as e:
        print(f"❌ 无法获取API密钥: {e}")
        return False
    if not api_key:
        print("❌ 未找到OpenRouter API密钥")
        print("请确保已配置 OPENROUTER_API_KEY_1 环境变量")
        return False
    print(f"🔑 使用API密钥: {api_key[:20]}...")
    # Minimal chat completion used purely as a connectivity probe.
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "HTTP-Referer": "https://github.com/ben/liurenchaxin",
        "X-Title": "Jixia Academy Debate System",
        "Content-Type": "application/json"
    }
    data = {
        "model": "openai/gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": "你好,请简单回复一下测试连接"}
        ],
        "max_tokens": 50
    }
    try:
        print("📡 正在测试API连接...")
        response = requests.post(url, headers=headers, json=data, timeout=30)
        print(f"📡 响应状态码: {response.status_code}")
        if response.status_code == 200:
            result = response.json()
            print("✅ OpenRouter API连接成功!")
            if 'choices' in result and len(result['choices']) > 0:
                content = result['choices'][0]['message']['content']
                print(f"📝 AI回复: {content}")
            else:
                print("📝 API响应格式异常但连接成功")
            return True
        else:
            print(f"❌ API调用失败: HTTP {response.status_code}")
            print(f"错误详情: {response.text}")
            return False
    except requests.exceptions.Timeout:
        print("❌ 请求超时,请检查网络连接")
        return False
    except requests.exceptions.RequestException as e:
        print(f"❌ 网络请求异常: {e}")
        return False
    except Exception as e:
        print(f"❌ 未知异常: {e}")
        return False
def test_rapidapi_connection() -> bool:
    """Test connectivity to RapidAPI via a sample Yahoo Finance quote call.

    Returns:
        True when the probe request returns HTTP 200.
    """
    # Prefer the central config module; fall back to the raw env variable.
    try:
        from config.doppler_config import get_rapidapi_key
        api_key = get_rapidapi_key()
    except ImportError:
        import os
        api_key = os.getenv('RAPIDAPI_KEY')
    except Exception as e:
        print(f"❌ 无法获取RapidAPI密钥: {e}")
        return False
    if not api_key:
        print("❌ 未找到RapidAPI密钥")
        return False
    print(f"🔑 测试RapidAPI连接...")
    # A lightweight quote endpoint used purely as a connectivity probe.
    url = "https://yahoo-finance15.p.rapidapi.com/api/yahoo/qu/quote/AAPL"
    headers = {
        'X-RapidAPI-Key': api_key,
        'X-RapidAPI-Host': 'yahoo-finance15.p.rapidapi.com'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            print("✅ RapidAPI连接成功!")
            return True
        else:
            print(f"❌ RapidAPI连接失败: HTTP {response.status_code}")
            return False
    except Exception as e:
        print(f"❌ RapidAPI连接异常: {e}")
        return False
def main():
    """Run all API connection tests and print a summary.

    Returns:
        bool: True when config validation, OpenRouter and RapidAPI all passed.
    """
    print("🧪 API连接测试套件")
    print("=" * 50)
    # Optional config validation (module may be absent in minimal setups).
    try:
        from config.doppler_config import validate_config
        print("\n🔧 验证配置...")
        config_valid = validate_config()
    except ImportError:
        print("⚠️ 配置模块不可用,跳过配置验证")
        config_valid = True
    print("\n🤖 测试OpenRouter API...")
    openrouter_success = test_openrouter_api()
    print("\n📊 测试RapidAPI...")
    # BUG FIX: previously called test_rapidapi_api(), an undefined name
    # that raised NameError at runtime.
    rapidapi_success = test_rapidapi_connection()
    # Summarize the results.
    print("\n" + "=" * 50)
    print("📋 测试结果总结:")
    print(f" 配置验证: {'✅ 通过' if config_valid else '❌ 失败'}")
    print(f" OpenRouter API: {'✅ 通过' if openrouter_success else '❌ 失败'}")
    print(f" RapidAPI: {'✅ 通过' if rapidapi_success else '❌ 失败'}")
    all_passed = config_valid and openrouter_success and rapidapi_success
    if all_passed:
        print("\n🎉 所有API连接测试通过系统已准备就绪。")
    else:
        print("\n⚠️ 部分测试失败,请检查配置和网络连接。")
    return all_passed


if __name__ == "__main__":
    success = main()
    exit(0 if success else 1)

View File

@@ -1,297 +0,0 @@
#!/usr/bin/env python3
"""
RapidAPI库存测试脚本
自动测试所有订阅的API服务生成可用性报告
"""
import requests
import json
import time
from datetime import datetime
from typing import Dict, List, Any
import os
class RapidAPITester:
    """Probes every subscribed RapidAPI service and builds an availability report.

    The API key is read from the RAPIDAPI_KEY environment variable; each
    configured host is hit with a cheap GET endpoint and the results are
    aggregated into a markdown report.
    """

    def __init__(self):
        """Initialize the tester.

        Raises:
            ValueError: if the RAPIDAPI_KEY environment variable is unset.
        """
        self.api_key = os.getenv('RAPIDAPI_KEY')
        if not self.api_key:
            raise ValueError("RAPIDAPI_KEY环境变量未设置")
        # Host per service — mirrors the perpetual-engine configuration.
        self.api_configs = {
            'alpha_vantage': 'alpha-vantage.p.rapidapi.com',
            'yahoo_finance_1': 'yahoo-finance15.p.rapidapi.com',
            'yh_finance_complete': 'yh-finance.p.rapidapi.com',
            'yahoo_finance_api_data': 'yahoo-finance-api1.p.rapidapi.com',
            'yahoo_finance_realtime': 'yahoo-finance-low-latency.p.rapidapi.com',
            'yh_finance': 'yh-finance-complete.p.rapidapi.com',
            'yahoo_finance_basic': 'yahoo-finance127.p.rapidapi.com',
            'seeking_alpha': 'seeking-alpha.p.rapidapi.com',
            'webull': 'webull.p.rapidapi.com',
            'morning_star': 'morningstar1.p.rapidapi.com',
            'tradingview': 'tradingview-ta.p.rapidapi.com',
            'investing_com': 'investing-cryptocurrency-markets.p.rapidapi.com',
            'finance_api': 'real-time-finance-data.p.rapidapi.com',
            'ms_finance': 'ms-finance.p.rapidapi.com',
            'sec_filings': 'sec-filings.p.rapidapi.com',
            'exchangerate_api': 'exchangerate-api.p.rapidapi.com',
            'crypto_news': 'cryptocurrency-news2.p.rapidapi.com'
        }
        # Cheap GET endpoint per service used as the availability probe.
        self.test_endpoints = {
            'alpha_vantage': '/query?function=GLOBAL_QUOTE&symbol=AAPL',
            'yahoo_finance_1': '/api/yahoo/qu/quote/AAPL',
            'yh_finance_complete': '/stock/v2/get-summary?symbol=AAPL',
            'yahoo_finance_api_data': '/v8/finance/chart/AAPL',
            'yahoo_finance_realtime': '/stock/v2/get-summary?symbol=AAPL',
            'yh_finance': '/stock/v2/get-summary?symbol=AAPL',
            'yahoo_finance_basic': '/api/yahoo/qu/quote/AAPL',
            'seeking_alpha': '/symbols/get-profile?symbols=AAPL',
            'webull': '/stock/search?keyword=AAPL',
            'morning_star': '/market/v2/get-movers?performanceId=0P0000OQN8',
            'tradingview': '/get-analysis?symbol=AAPL&screener=america&exchange=NASDAQ',
            'investing_com': '/coins/get-overview',
            'finance_api': '/stock-price?symbol=AAPL',
            'ms_finance': '/stock/v2/get-summary?symbol=AAPL',
            'sec_filings': '/search?query=AAPL',
            'exchangerate_api': '/latest?base=USD',
            'crypto_news': '/v1/cryptonews'
        }
        # api name -> result dict, filled by test_all_apis().
        self.results = {}

    def test_api(self, api_name: str) -> Dict[str, Any]:
        """Probe a single API.

        Args:
            api_name: key into ``api_configs``.

        Returns:
            Dict with ``success``, ``status_code``, ``response_time`` and,
            on failure, ``error``.
        """
        if api_name not in self.api_configs:
            return {
                'success': False,
                'error': 'API not configured',
                'status_code': None,
                'response_time': 0
            }
        host = self.api_configs[api_name]
        endpoint = self.test_endpoints.get(api_name, '/')
        headers = {
            'X-RapidAPI-Key': self.api_key,
            'X-RapidAPI-Host': host,
            'Content-Type': 'application/json'
        }
        url = f"https://{host}{endpoint}"
        print(f"🧪 测试 {api_name} ({host})")
        print(f" URL: {url}")
        start_time = time.time()
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response_time = time.time() - start_time
            result = {
                'success': response.status_code == 200,
                'status_code': response.status_code,
                'response_time': round(response_time, 2),
                'response_size': len(response.text),
                'error': None if response.status_code == 200 else response.text[:200]
            }
            if response.status_code == 200:
                print(f" ✅ 成功 - {response_time:.2f}s - {len(response.text)} bytes")
                # Try to parse the payload so the report can list data keys.
                try:
                    data = response.json()
                    result['has_data'] = bool(data)
                    result['data_keys'] = list(data.keys()) if isinstance(data, dict) else []
                except ValueError:
                    # Was a bare except: narrowed to JSON decode failure
                    # (requests' JSONDecodeError subclasses ValueError).
                    result['has_data'] = False
                    result['data_keys'] = []
            else:
                print(f" ❌ 失败 - HTTP {response.status_code}")
                print(f" 错误: {response.text[:100]}...")
            return result
        except requests.exceptions.Timeout:
            print(f" ⏰ 超时")
            return {
                'success': False,
                'error': 'Request timeout',
                'status_code': None,
                'response_time': 10.0
            }
        except requests.exceptions.RequestException as e:
            print(f" ❌ 请求异常: {str(e)}")
            return {
                'success': False,
                'error': f'Request error: {str(e)}',
                'status_code': None,
                'response_time': time.time() - start_time
            }
        except Exception as e:
            print(f" ❌ 未知异常: {str(e)}")
            return {
                'success': False,
                'error': f'Unexpected error: {str(e)}',
                'status_code': None,
                'response_time': time.time() - start_time
            }

    def test_all_apis(self) -> Dict[str, Any]:
        """Probe every configured API (1s apart to avoid rate limits)."""
        print("🚀 开始测试所有RapidAPI服务")
        print("=" * 60)
        for api_name in self.api_configs.keys():
            result = self.test_api(api_name)
            self.results[api_name] = result
            time.sleep(1)  # avoid hammering the gateway
            print()
        return self.results

    def generate_report(self) -> str:
        """Render the collected results as a markdown report string."""
        if not self.results:
            return "没有测试结果"
        # Overall stats.
        total_apis = len(self.results)
        successful_apis = len([r for r in self.results.values() if r['success']])
        failed_apis = total_apis - successful_apis
        # Split results by outcome.
        success_list = []
        failed_list = []
        for api_name, result in self.results.items():
            if result['success']:
                success_list.append({
                    'name': api_name,
                    'host': self.api_configs[api_name],
                    'response_time': result['response_time'],
                    'data_keys': result.get('data_keys', [])
                })
            else:
                failed_list.append({
                    'name': api_name,
                    'host': self.api_configs[api_name],
                    'error': result['error'],
                    'status_code': result['status_code']
                })
        # Assemble the markdown.
        report = f"""# RapidAPI 测试报告
## 📊 测试概览
- **测试时间**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
- **总API数**: {total_apis}
- **成功数**: {successful_apis} ({successful_apis/total_apis*100:.1f}%)
- **失败数**: {failed_apis} ({failed_apis/total_apis*100:.1f}%)
## ✅ 可用的API ({len(success_list)}个)
"""
        # Fastest first.
        for api in sorted(success_list, key=lambda x: x['response_time']):
            report += f"### {api['name']}\n"
            report += f"- **主机**: `{api['host']}`\n"
            report += f"- **响应时间**: {api['response_time']}s\n"
            if api['data_keys']:
                report += f"- **数据字段**: {', '.join(api['data_keys'][:5])}\n"
            report += "\n"
        report += f"## ❌ 失败的API ({len(failed_list)}个)\n\n"
        for api in failed_list:
            report += f"### {api['name']}\n"
            report += f"- **主机**: `{api['host']}`\n"
            report += f"- **状态码**: {api['status_code']}\n"
            report += f"- **错误**: {api['error'][:100] if api['error'] else 'Unknown'}...\n"
            report += "\n"
        # Recommendations.
        report += """## 🔧 优化建议
### 立即可用的API
"""
        fast_apis = [api for api in success_list if api['response_time'] < 2.0]
        if fast_apis:
            report += "以下API响应快速建议优先使用\n"
            for api in fast_apis:
                report += f"- **{api['name']}**: {api['response_time']}s\n"
        report += """
### 需要修复的API
"""
        if failed_list:
            report += "以下API需要检查端点配置或权限\n"
            for api in failed_list[:5]:  # show at most five
                report += f"- **{api['name']}**: {api['error'][:50] if api['error'] else 'Unknown error'}...\n"
        return report

    def save_report(self, filename: str = None):
        """Write the markdown report to *filename* and return the path used.

        A timestamped path under docs/rapidapi/ is generated when no
        filename is given.
        """
        if not filename:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"docs/rapidapi/test_report_{timestamp}.md"
        # Make sure the target directory exists before writing.
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        report = self.generate_report()
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(report)
        # BUG FIX: the message previously printed a placeholder, not the path.
        print(f"📄 报告已保存到: {filename}")
        return filename
def main():
    """CLI entry point: run the full RapidAPI sweep and save the report."""
    print("🧪 RapidAPI库存测试工具")
    print("=" * 40)
    try:
        tester = RapidAPITester()
        # Probe every configured API (results are kept on the tester).
        tester.test_all_apis()
        # Render and display the report.
        print("\n" + "=" * 60)
        print("📊 测试完成,生成报告...")
        report = tester.generate_report()
        print(report)
        # Persist the report and surface its location.
        filename = tester.save_report()
        print(f"\n💡 建议更新 docs/rapidapi/api_inventory.md")
        # BUG FIX: previously printed a "(unknown)" placeholder instead of the path.
        print(f"📁 详细报告: {filename}")
    except Exception as e:
        print(f"❌ 测试失败: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()