feat: 重构项目结构并增强安全性

- 将文档和脚本移动到更合适的目录结构
- 删除敏感信息并替换为Doppler环境变量引用
- 新增GitGuardian配置以加强代码安全扫描
- 实现MongoDB向量搜索索引创建脚本
- 添加文章embedding生成脚本
- 新增Alpha Vantage测试脚本
- 重构八仙辩论系统架构
- 实现swarm辩论触发机制
- 新增MongoDB与Swarm集成示例
- 完善RapidAPI负载均衡策略文档

所有修改均遵循安全最佳实践，敏感信息不再硬编码在代码中。
This commit is contained in:
ben
2025-08-02 16:58:12 +00:00
parent 6b464b6e07
commit 4d58c6f938
24 changed files with 4162 additions and 504 deletions

View File

@@ -9,7 +9,9 @@ from pymongo import MongoClient
def add_sequence_ids():
"""为现有文章添加流水号"""
# 连接MongoDB
mongo_uri = os.getenv('MONGODB_URI', 'mongodb+srv://ben:313131@cauldron.tx3qnoq.mongodb.net/')
mongo_uri = os.getenv('MONGODB_URI')
if not mongo_uri:
raise ValueError("MONGODB_URI environment variable is required")
client = MongoClient(mongo_uri)
db = client['taigong']
collection = db['articles']

View File

@@ -21,7 +21,9 @@ def generate_stable_id(title, pub_date, content):
def cleanup_duplicates():
"""清理重复数据"""
# 连接MongoDB
mongo_uri = os.getenv('MONGODB_URI', 'mongodb+srv://ben:313131@cauldron.tx3qnoq.mongodb.net/')
mongo_uri = os.getenv('MONGODB_URI')
if not mongo_uri:
raise ValueError("MONGODB_URI environment variable is required")
client = MongoClient(mongo_uri)
db = client['taigong']
collection = db['articles']

View File

@@ -0,0 +1,35 @@
// MongoDB Atlas Vector Search index creation script (run with mongosh).
// Creates the vector index used by the swarm debate system.

// Select the target database.
use('taigong');

// Create the vector index used for semantic search and content clustering.
// The index type is passed explicitly as "vectorSearch": when the type
// argument is omitted, createSearchIndex builds a regular Atlas Search index,
// whose definition does not accept the vector "fields" layout used below.
db.articles.createSearchIndex(
  "vector_search_index",
  "vectorSearch",
  {
    "fields": [
      {
        "type": "vector",
        "path": "embedding",
        "numDimensions": 1536, // dimension of OpenAI text-embedding-ada-002 vectors
        "similarity": "cosine"
      },
      {
        // Filter fields can be used to pre-filter vector search queries.
        "type": "filter",
        "path": "published_time"
      },
      {
        "type": "filter",
        "path": "title"
      }
    ]
  }
);

// NOTE(review): the messages below are printed as soon as the create request
// returns; the index may presumably still be building on Atlas at that point.
// Confirm readiness with db.articles.getSearchIndexes() before querying.
print("向量索引创建完成!");
print("索引名称: vector_search_index");
print("向量维度: 1536 (OpenAI embedding)");
print("相似性算法: cosine");
print("支持过滤字段: published_time, title");

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""
为MongoDB中的文章生成向量embeddings
用于swarm辩论系统的语义搜索和内容聚类
"""
import os
import time
from typing import Dict, List, Optional

import openai
from pymongo import MongoClient
def get_mongodb_client():
    """Build a MongoDB client from the ``MONGODB_URI`` environment variable.

    The URI is read from the process environment (presumably injected by
    Doppler, per the project's notes).

    Returns:
        A ``MongoClient`` constructed with the configured URI.

    Raises:
        ValueError: If ``MONGODB_URI`` is unset or empty.
    """
    uri = os.environ.get('MONGODB_URI')
    if uri:
        return MongoClient(uri)
    raise ValueError("MONGODB_URI not found in environment variables")
def generate_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding vector for *text* with the OpenAI API.

    Args:
        text: The text to embed.

    Returns:
        The embedding taken from the first item of the API response, or
        ``None`` when *text* is empty or the request fails.
    """
    # Nothing to embed: skip the API round trip instead of sending an
    # empty input and waiting for the request to fail.
    if not text:
        return None
    try:
        # NOTE(review): `openai.Embedding` is the pre-1.0 SDK interface;
        # confirm the pinned `openai` version before upgrading the package.
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=text
        )
        return response['data'][0]['embedding']
    except Exception as e:
        # Deliberate best-effort: a failure for one text is reported and
        # signalled with None so the caller can skip it and carry on.
        print(f"生成embedding失败: {e}")
        return None
def update_articles_with_embeddings():
    """Add an ``embedding`` field to every article that does not have one.

    Reads documents from the ``articles`` collection of the ``taigong``
    database, embeds each article's title, and stores the result in the
    document's ``embedding`` field. Articles without a title, or whose
    embedding could not be generated, are skipped. Progress is printed to
    stdout.

    Raises:
        ValueError: Propagated from ``get_mongodb_client`` when the MongoDB
            URI is not configured.
    """
    client = get_mongodb_client()
    # try/finally guarantees the connection is released even when a database
    # call raises part-way through the loop.
    try:
        collection = client.taigong.articles

        # Only fetch articles that have no embedding yet.
        articles = collection.find({"embedding": {"$exists": False}})

        count = 0
        for article in articles:
            title = article.get('title', '')
            if not title:
                continue

            print(f"处理文章: {title[:50]}...")

            embedding = generate_embedding(title)
            if not embedding:
                print(f"× 跳过文章: {title[:50]}")
                continue

            # Persist the vector on the document it was computed for.
            collection.update_one(
                {"_id": article["_id"]},
                {"$set": {"embedding": embedding}}
            )
            count += 1
            print(f"✓ 已更新 {count} 篇文章")

            # Short pause between successful requests to stay under the
            # API rate limit.
            time.sleep(0.1)

        print(f"\n完成!共处理 {count} 篇文章")
    finally:
        client.close()
if __name__ == "__main__":
    # Read the OpenAI API key from the environment (expected to be provided
    # by Doppler).
    openai.api_key = os.getenv('OPENAI_API_KEY')
    if not openai.api_key:
        print("警告: OPENAI_API_KEY 未设置请先在Doppler中配置")
        # Raise SystemExit directly: the bare `exit` helper is injected by the
        # `site` module and is not available under `python -S` or in some
        # embedded/frozen interpreters.
        raise SystemExit(1)

    update_articles_with_embeddings()