feat: 重构项目结构并增强安全性
- 将文档和脚本移动到更合适的目录结构
- 删除敏感信息并替换为Doppler环境变量引用
- 新增GitGuardian配置以加强代码安全扫描
- 实现MongoDB向量搜索索引创建脚本
- 添加文章embedding生成脚本
- 新增Alpha Vantage测试脚本
- 重构八仙辩论系统架构
- 实现swarm辩论触发机制
- 新增MongoDB与Swarm集成示例
- 完善RapidAPI负载均衡策略文档

所有修改均遵循安全最佳实践,敏感信息不再硬编码在代码中。
This commit is contained in:
@@ -9,7 +9,9 @@ from pymongo import MongoClient
|
||||
def add_sequence_ids():
|
||||
"""为现有文章添加流水号"""
|
||||
# 连接MongoDB
|
||||
mongo_uri = os.getenv('MONGODB_URI', 'mongodb+srv://ben:313131@cauldron.tx3qnoq.mongodb.net/')
|
||||
mongo_uri = os.getenv('MONGODB_URI')
|
||||
if not mongo_uri:
|
||||
raise ValueError("MONGODB_URI environment variable is required")
|
||||
client = MongoClient(mongo_uri)
|
||||
db = client['taigong']
|
||||
collection = db['articles']
|
||||
|
||||
@@ -21,7 +21,9 @@ def generate_stable_id(title, pub_date, content):
|
||||
def cleanup_duplicates():
|
||||
"""清理重复数据"""
|
||||
# 连接MongoDB
|
||||
mongo_uri = os.getenv('MONGODB_URI', 'mongodb+srv://ben:313131@cauldron.tx3qnoq.mongodb.net/')
|
||||
mongo_uri = os.getenv('MONGODB_URI')
|
||||
if not mongo_uri:
|
||||
raise ValueError("MONGODB_URI environment variable is required")
|
||||
client = MongoClient(mongo_uri)
|
||||
db = client['taigong']
|
||||
collection = db['articles']
|
||||
|
||||
35
scripts/create_vector_index.js
Normal file
35
scripts/create_vector_index.js
Normal file
@@ -0,0 +1,35 @@
|
||||
// MongoDB Atlas Vector Search Index Creation Script
// Creates the vector index used by the swarm debate system.
//
// Run with mongosh against an Atlas cluster.

// Select the target database.
use('taigong');

// Create a vector index for semantic search and content clustering.
// The definition below uses the "fields" / "type: vector" schema, which is
// only valid for an index of type "vectorSearch". Without the explicit type
// argument, createSearchIndex() builds a regular "search" index and this
// definition does not match what that index type expects.
db.articles.createSearchIndex(
  "vector_search_index",
  "vectorSearch",
  {
    "fields": [
      {
        "type": "vector",
        "path": "embedding",
        "numDimensions": 1536, // dimension of OpenAI text-embedding-ada-002
        "similarity": "cosine"
      },
      {
        "type": "filter",
        "path": "published_time"
      },
      {
        "type": "filter",
        "path": "title"
      }
    ]
  }
);

// NOTE(review): Atlas builds search indexes asynchronously, so these messages
// confirm that the request was submitted, not that the index is queryable —
// confirm readiness with db.articles.getSearchIndexes() before relying on it.
print("向量索引创建完成!");
print("索引名称: vector_search_index");
print("向量维度: 1536 (OpenAI embedding)");
print("相似性算法: cosine");
print("支持过滤字段: published_time, title");
|
||||
75
scripts/generate_embeddings.py
Normal file
75
scripts/generate_embeddings.py
Normal file
@@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
为MongoDB中的文章生成向量embeddings
|
||||
用于swarm辩论系统的语义搜索和内容聚类
|
||||
"""
|
||||
|
||||
import os
import time
from typing import Dict, List, Optional

import openai
from pymongo import MongoClient
|
||||
|
||||
def get_mongodb_client():
    """Build a MongoClient from the ``MONGODB_URI`` environment variable.

    The original docstring says the value is provided through Doppler; this
    function itself only reads the process environment.

    Returns:
        A ``MongoClient`` constructed from the configured URI.

    Raises:
        ValueError: If ``MONGODB_URI`` is unset or empty.
    """
    uri = os.environ.get('MONGODB_URI')
    if uri:
        return MongoClient(uri)
    raise ValueError("MONGODB_URI not found in environment variables")
|
||||
|
||||
def generate_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding vector for ``text`` using the OpenAI API.

    Args:
        text: The text to embed.

    Returns:
        The embedding as a list of floats, or ``None`` when ``text`` is empty
        or the API call fails. Failures are printed rather than raised so a
        batch run can skip the offending item and continue.
    """
    if not text:
        # Nothing to embed: skip the network round-trip and report the
        # failure through the same None result callers already handle.
        print("生成embedding失败: 输入文本为空")
        return None

    try:
        # NOTE(review): openai.Embedding.create is the pre-1.0 SDK interface —
        # confirm the pinned openai version before upgrading the package.
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=text,
        )
        return response['data'][0]['embedding']
    except Exception as e:
        # Deliberately broad: any failure (network, auth, unexpected response
        # shape) is reported and turned into None instead of aborting the run.
        print(f"生成embedding失败: {e}")
        return None
|
||||
|
||||
def update_articles_with_embeddings():
    """Add an ``embedding`` field to every article that does not have one.

    Reads the ``articles`` collection of the ``taigong`` database, embeds
    each article's title with ``generate_embedding`` and stores the vector
    back on the document. Articles without a title, or whose embedding could
    not be generated, are skipped. Progress is printed to stdout.

    The MongoDB client is always closed, even if an error interrupts the run.
    """
    client = get_mongodb_client()
    try:
        collection = client.taigong.articles

        # Only articles that still lack an embedding. The projection limits
        # the fetched data to the fields used below (_id is always returned).
        articles = collection.find(
            {"embedding": {"$exists": False}},
            {"title": 1},
        )

        count = 0
        for article in articles:
            title = article.get('title', '')
            if not title:
                continue

            print(f"处理文章: {title[:50]}...")

            # Generate the embedding from the title.
            embedding = generate_embedding(title)
            if embedding:
                # Persist the vector on the document.
                collection.update_one(
                    {"_id": article["_id"]},
                    {"$set": {"embedding": embedding}}
                )
                count += 1
                print(f"✓ 已更新 {count} 篇文章")

                # Short pause between successful calls to avoid the API rate limit.
                time.sleep(0.1)
            else:
                print(f"× 跳过文章: {title[:50]}")

        print(f"\n完成!共处理 {count} 篇文章")
    finally:
        # Release the connection pool on both the success and error paths.
        client.close()
|
||||
|
||||
if __name__ == "__main__":
    # Configure the OpenAI API key from the environment (the original comment
    # says it should be supplied via Doppler).
    openai.api_key = os.getenv('OPENAI_API_KEY')
    if not openai.api_key:
        print("警告: OPENAI_API_KEY 未设置,请先在Doppler中配置")
        # Raise SystemExit directly: the bare exit() helper is injected by the
        # site module for interactive use and is not guaranteed to exist.
        raise SystemExit(1)

    update_articles_with_embeddings()
|
||||
Reference in New Issue
Block a user