feat: 更新OCI Provider版本至7.20并集成Vault配置

refactor: 重构Terraform配置以使用Consul和Vault存储敏感信息

docs: 添加Vault实施文档和配置指南

chore: 清理不再使用的配置文件和脚本

feat: 添加Nomad集群领导者发现脚本和文档

feat: 实现MCP配置共享方案和同步脚本

style: 更新README中的网络访问注意事项

test: 添加Consul Provider集成测试脚本
This commit is contained in:
2025-09-29 01:42:57 +00:00
parent ad531936dd
commit f72b17a34f
38 changed files with 3741 additions and 888 deletions

193
scripts/nomad-leader-discovery.sh Executable file
View File

@@ -0,0 +1,193 @@
#!/bin/bash
# Nomad cluster leader discovery and access script.
# Discovers the current Nomad cluster leader automatically and runs the
# requested nomad command against it.
# Default server list (adjust to match the actual deployment).
SERVERS=(
"100.116.158.95" # bj-semaphore.global
"100.81.26.3" # ash1d.global
"100.103.147.94" # ash2e.global
"100.90.159.68" # ch2.global
"100.86.141.112" # ch3.global
"100.98.209.50" # bj-onecloud1.global
"100.120.225.29" # de.global
)
# Timeout in seconds for each leader query (passed to curl as --max-time);
# can be overridden on the command line with -t/--timeout.
TIMEOUT=5
# ANSI colour escape sequences; they are stored as literal backslash text and
# expanded by `echo -e` when messages are printed.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Print the usage text (options and examples) to stdout.
# Globals: TIMEOUT (read) - shown as the default timeout value.
show_help() {
  # Unquoted heredoc so that $0 and $TIMEOUT are expanded in the text.
  cat <<EOF
Nomad 集群领导者发现与访问脚本

用法: $0 [选项] [nomad命令]

选项:
 -h, --help 显示此帮助信息
 -s, --server IP 指定初始服务器IP
 -t, --timeout SECS 设置超时时间(默认: $TIMEOUT 秒)
 -l, --list-servers 列出所有配置的服务器
 -c, --check-leader 仅检查领导者,不执行命令

示例:
 $0 node status # 使用自动发现的领导者查看节点状态
 $0 -s 100.116.158.95 job status # 指定初始服务器查看作业状态
 $0 -c # 仅检查当前领导者

EOF
}
# Print every configured server, one per line, under a coloured heading.
# Globals: SERVERS (read), YELLOW/NC (read)
# Outputs: the list on stdout
list_servers() {
  # Keep the loop variable local so calling this never clobbers a global
  # `server` used elsewhere in the script.
  local server
  echo -e "${YELLOW}配置的服务器列表:${NC}"
  for server in "${SERVERS[@]}"; do
    printf ' - %s\n' "$server"
  done
}
# Ask a single server for the cluster leader.
# Arguments: $1 - server host/IP to query on HTTP port 4646
# Outputs:   leader as host:port on stdout, with the RPC port 4647 rewritten
#            to the HTTP port 4646
# Returns:   0 when a leader was reported, 1 otherwise
_query_leader() {
  local server=$1
  local raw
  # -f makes curl fail on HTTP errors, so an error body such as
  # "No cluster leader" is never mistaken for a leader address.
  raw=$(curl -sf --max-time "$TIMEOUT" \
    "http://${server}:4646/v1/status/leader" 2>/dev/null) || return 1
  raw=${raw//\"/} # the API returns a JSON string; drop the quotes
  [[ -n "$raw" ]] || return 1
  printf '%s\n' "${raw/%:4647/:4646}"
}

# Discover the current Nomad leader.
# Tries the optional initial server first, then walks the SERVERS list.
# Globals:   SERVERS, TIMEOUT, RED, GREEN, YELLOW, NC (read)
# Arguments: $1 - optional server to try first (may be empty or omitted)
# Outputs:   leader host:port on stdout; progress messages on stderr
# Returns:   0 on success, 1 when no server reports a leader
discover_leader() {
  local initial_server=${1:-}
  local server leader

  if [[ -n "$initial_server" ]]; then
    echo -e "${YELLOW}尝试从服务器 $initial_server 发现领导者...${NC}" >&2
    if leader=$(_query_leader "$initial_server"); then
      echo -e "${GREEN}发现领导者: $leader${NC}" >&2
      printf '%s\n' "$leader"
      return 0
    fi
    echo -e "${RED}无法从 $initial_server 获取领导者信息${NC}" >&2
  fi

  echo -e "${YELLOW}遍历所有服务器寻找领导者...${NC}" >&2
  for server in "${SERVERS[@]}"; do
    echo -n " 检查 $server ... " >&2
    if leader=$(_query_leader "$server"); then
      echo -e "${GREEN}成功${NC}" >&2
      echo -e "${GREEN}发现领导者: $leader${NC}" >&2
      printf '%s\n' "$leader"
      return 0
    fi
    echo -e "${RED}失败${NC}" >&2
  done

  echo -e "${RED}无法发现领导者,请检查集群状态${NC}" >&2
  return 1
}
# Command-line parsing.
# Results are stored in these globals, which the rest of the script reads.
INITIAL_SERVER=""
CHECK_LEADER_ONLY=false
NOMAD_COMMAND=()

# Parse the wrapper options; everything unrecognised is collected as the
# nomad command to run.
# Globals:   INITIAL_SERVER, TIMEOUT, CHECK_LEADER_ONLY, NOMAD_COMMAND (written)
# Arguments: the script's command-line arguments
# Notes:     - Options that take a value exit with status 1 when the value is
#              missing. Previously `shift 2` failed without shifting and the
#              loop spun forever.
#            - `--` ends option parsing; the remaining words are passed to
#              nomad verbatim, so nomad flags such as -t are not eaten here.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        show_help
        exit 0
        ;;
      -s|--server)
        if [[ $# -lt 2 ]]; then
          echo -e "${RED}选项 $1 需要一个参数${NC}" >&2
          exit 1
        fi
        INITIAL_SERVER="$2"
        shift 2
        ;;
      -t|--timeout)
        if [[ $# -lt 2 ]]; then
          echo -e "${RED}选项 $1 需要一个参数${NC}" >&2
          exit 1
        fi
        # curl --max-time accepts integer or decimal seconds; reject anything
        # else up front instead of failing silently on every server.
        if ! [[ "$2" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
          echo -e "${RED}无效的超时时间: $2${NC}" >&2
          exit 1
        fi
        TIMEOUT="$2"
        shift 2
        ;;
      -l|--list-servers)
        list_servers
        exit 0
        ;;
      -c|--check-leader)
        CHECK_LEADER_ONLY=true
        shift
        ;;
      --)
        shift
        NOMAD_COMMAND+=("$@")
        break
        ;;
      *)
        NOMAD_COMMAND+=("$1")
        shift
        ;;
    esac
  done
}

parse_args "$@"
# Main flow: print the banner, then locate the current leader.
printf '%b\n' "${YELLOW}Nomad 集群领导者发现与访问脚本${NC}" >&2
printf '%s\n' "==================================" >&2
# discover_leader prints the leader address on stdout; give up if it fails.
if ! LEADER=$(discover_leader "$INITIAL_SERVER"); then
  exit 1
fi
# Split the leader address on ':' into its first and second fields.
LEADER_IP=$(cut -d':' -f1 <<<"$LEADER")
LEADER_PORT=$(cut -d':' -f2 <<<"$LEADER")
# Check-only mode: report the leader and stop without running a command.
if [[ "$CHECK_LEADER_ONLY" == true ]]; then
  printf '%b\n' "${GREEN}当前领导者: $LEADER${NC}" >&2
  exit 0
fi
# No command was given on the command line: offer an interactive menu.
# The menu is written to stderr; the selection is read from stdin.
if [[ ${#NOMAD_COMMAND[@]} -eq 0 ]]; then
  echo -e "${YELLOW}未指定命令,请选择要执行的操作:${NC}" >&2
  echo "1) 查看节点状态" >&2
  echo "2) 查看作业状态" >&2
  echo "3) 查看服务器成员" >&2
  echo "4) 查看集群状态" >&2
  echo "5) 自定义命令" >&2
  echo "0) 退出" >&2
  # -r keeps backslashes literal. On EOF `choice` stays empty and falls into
  # the invalid-option branch below.
  read -r -p "请输入选项 (0-5): " choice
  case "$choice" in
    1) NOMAD_COMMAND=("node" "status") ;;
    2) NOMAD_COMMAND=("job" "status") ;;
    3) NOMAD_COMMAND=("server" "members") ;;
    4) NOMAD_COMMAND=("operator" "raft" "list-peers") ;;
    5)
      read -r -p "请输入完整的 Nomad 命令: " -a NOMAD_COMMAND
      # The prompt asks for the "complete" command, so tolerate a leading
      # "nomad" word instead of running `nomad nomad ...`.
      if [[ ${#NOMAD_COMMAND[@]} -gt 0 && "${NOMAD_COMMAND[0]}" == "nomad" ]]; then
        NOMAD_COMMAND=("${NOMAD_COMMAND[@]:1}")
      fi
      # An empty answer would otherwise run bare `nomad`.
      if [[ ${#NOMAD_COMMAND[@]} -eq 0 ]]; then
        echo -e "${RED}未输入命令${NC}" >&2
        exit 1
      fi
      ;;
    0) exit 0 ;;
    *)
      echo -e "${RED}无效选项${NC}" >&2
      exit 1
      ;;
  esac
fi
# Run the requested nomad command against the discovered leader.
# The address is supplied through the NOMAD_ADDR environment variable instead
# of a trailing -address flag: nomad stops parsing flags at the first
# positional argument, so `nomad job status myjob -address=...` would treat
# the flag as an extra argument and fail.
echo -e "${YELLOW}执行命令: NOMAD_ADDR=http://${LEADER} nomad ${NOMAD_COMMAND[*]}${NC}" >&2
if NOMAD_ADDR="http://${LEADER}" nomad "${NOMAD_COMMAND[@]}"; then
  echo -e "${GREEN}命令执行成功${NC}" >&2
else
  # The leader may have changed between discovery and execution; look it up
  # again and retry once, but only if a different leader is reported.
  echo -e "${RED}命令执行失败,可能需要重新发现领导者${NC}" >&2
  echo -e "${YELLOW}尝试重新发现领导者...${NC}" >&2
  if NEW_LEADER=$(discover_leader) && [[ "$NEW_LEADER" != "$LEADER" ]]; then
    echo -e "${YELLOW}领导者已更改,重新执行命令...${NC}" >&2
    NOMAD_ADDR="http://${NEW_LEADER}" nomad "${NOMAD_COMMAND[@]}"
    # Propagate the retry's exit status explicitly.
    exit $?
  else
    echo -e "${RED}无法恢复,请检查集群状态${NC}" >&2
    exit 1
  fi
fi

View File

@@ -0,0 +1,275 @@
#!/bin/bash
# Traefik deployment test script.
# Exercises the deployment and behaviour of Traefik on the Nomad cluster.
# Abort as soon as a command fails outside a conditional context.
set -e
# Colour definitions: ANSI escapes stored as literal backslash text and
# expanded by `echo -e` in the log helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Logging helpers.
# log_info: print an informational message with a green [INFO] tag on stdout.
# Arguments: $1 - message text (backslash escapes in it are expanded)
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
# log_warn: print a warning message with a yellow [WARN] tag on stdout.
# Arguments: $1 - message text (backslash escapes in it are expanded)
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
# log_error: print an error message with a red [ERROR] tag on stdout.
# Arguments: $1 - message text (backslash escapes in it are expanded)
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Report the state of the Nomad cluster.
# Uses the leader-discovery helper script when it is present, otherwise falls
# back to `nomad server members`.
# Globals: NOMAD_LEADER_SCRIPT (read, optional) - overrides the helper's path;
#          defaults to /root/mgmt/scripts/nomad-leader-discovery.sh
# Returns: 1 when the helper script runs but cannot find a leader, otherwise
#          the status of the last log call
check_nomad_cluster() {
  local script=${NOMAD_LEADER_SCRIPT:-/root/mgmt/scripts/nomad-leader-discovery.sh}
  local leader_info

  log_info "检查Nomad集群状态..."
  if [[ -f "$script" ]]; then
    # Run through bash so the file's permissions need not be changed.
    # In -c mode the helper writes only to stderr, hence 2>&1.
    # Testing the assignment inside `if` keeps `set -e` from aborting the
    # script silently when discovery fails.
    if leader_info=$(bash "$script" -c 2>&1); then
      log_info "Nomad领导者信息: $leader_info"
    else
      log_error "Nomad领导者发现失败: $leader_info"
      return 1
    fi
  else
    log_warn "未找到Nomad领导者发现脚本使用默认方式检查"
    nomad server members 2>/dev/null || log_error "无法连接到Nomad集群"
  fi
}
# Report the state of the Consul cluster: member list and current leader.
# Every failure is logged rather than fatal, so later checks still run.
check_consul_cluster() {
  local leader

  log_info "检查Consul集群状态..."
  consul members 2>/dev/null || log_error "无法连接到Consul集群"

  # `|| leader=""` keeps `set -e` from aborting on a curl failure, so the
  # error branch below is reachable. -f rejects HTTP error bodies.
  leader=$(curl -sf --max-time 5 http://127.0.0.1:8500/v1/status/leader 2>/dev/null) || leader=""
  # The API returns a JSON string; with no leader it is `""`, which is a
  # non-empty shell string until the quotes are removed.
  leader=${leader//\"/}
  if [[ -n "$leader" ]]; then
    log_info "Consul领导者: $leader"
  else
    log_error "无法获取Consul领导者信息"
  fi
}
# Submit the Traefik job to Nomad and show its status afterwards.
# Exits the script with status 1 when the job file is missing.
deploy_traefik() {
  local job_file="/root/mgmt/jobs/traefik.nomad"

  log_info "部署Traefik..."

  # Guard clause: nothing to deploy without the job file.
  [[ -f "$job_file" ]] || {
    log_error "Traefik作业文件不存在: $job_file"
    exit 1
  }

  nomad run "$job_file"

  # Give the deployment a fixed grace period before querying its status.
  log_info "等待Traefik部署完成..."
  sleep 10

  nomad status traefik
}
# Check the Traefik job, each of its allocations, and its Consul registration.
# Returns: 1 when the job is not in the "running" state; allocation and
#          registration problems are logged only.
check_traefik_status() {
  local job_status alloc_ids alloc alloc_status

  log_info "检查Traefik状态..."

  # Flags must precede the job name: nomad stops parsing options at the first
  # positional argument.
  job_status=$(nomad job status -json traefik | jq -r '.Status') || job_status=""
  if [[ "$job_status" == "running" ]]; then
    log_info "Traefik作业状态: $job_status"
  else
    log_error "Traefik作业状态异常: $job_status"
    return 1
  fi

  # Read allocation IDs from JSON instead of trimming the human-readable
  # table, which skipped the first and last allocation and needed GNU head.
  alloc_ids=$(nomad job allocs -json traefik | jq -r '.[].ID') || alloc_ids=""
  while IFS= read -r alloc; do
    [[ -n "$alloc" ]] || continue
    # </dev/null so the command cannot consume the loop's input.
    alloc_status=$(nomad alloc status -json "$alloc" </dev/null | jq -r '.ClientStatus') || alloc_status=""
    if [[ "$alloc_status" == "running" ]]; then
      log_info "分配 $alloc 状态: $alloc_status"
    else
      log_error "分配 $alloc 状态异常: $alloc_status"
    fi
  done <<<"$alloc_ids"

  log_info "检查Consul中的服务注册..."
  # A real if/else: with `a && b || c`, c also runs whenever b fails.
  if consul catalog services | grep traefik; then
    log_info "Traefik服务已注册到Consul"
  else
    log_warn "Traefik服务未注册到Consul"
  fi
}
# Probe a running Traefik instance: ping endpoint, dashboard, and the
# HTTP-to-HTTPS redirect.
# Globals: TRAEFIK_ADDR (written) - address the probes were sent to; it is
#          deliberately left global because create_test_service reads it.
test_traefik_functionality() {
  log_info "测试Traefik功能..."

  # Look the service up through Consul's HTTP catalog API; the consul CLI has
  # no `catalog service <name>` subcommand. The jq filter prefers
  # ServiceAddress, falls back to the node Address, and prints nothing
  # (rather than the string "null") when the service is not registered.
  # `|| TRAEFIK_ADDR=""` keeps `set -e` from aborting when jq fails.
  TRAEFIK_ADDR=$(curl -sf --max-time 5 \
    http://127.0.0.1:8500/v1/catalog/service/traefik 2>/dev/null \
    | jq -r '.[0] | if (.ServiceAddress // "") != "" then .ServiceAddress else (.Address // empty) end' \
      2>/dev/null) || TRAEFIK_ADDR=""
  if [[ -z "$TRAEFIK_ADDR" ]]; then
    log_warn "无法从Consul获取Traefik地址使用本地地址"
    TRAEFIK_ADDR="127.0.0.1"
  fi

  # -f makes HTTP error responses (404, 5xx) count as failures.
  log_info "测试Traefik API端点..."
  if curl -sf --max-time 5 "http://${TRAEFIK_ADDR}:8080/ping" >/dev/null; then
    log_info "Traefik API端点响应正常"
  else
    log_error "Traefik API端点无响应"
  fi

  log_info "测试Traefik仪表板..."
  if curl -sf --max-time 5 "http://${TRAEFIK_ADDR}:8080/dashboard/" >/dev/null; then
    log_info "Traefik仪表板可访问"
  else
    log_error "无法访问Traefik仪表板"
  fi

  # Header names are case-insensitive, so match Location without regard to case.
  log_info "测试HTTP入口点..."
  if curl -s --max-time 5 -I "http://${TRAEFIK_ADDR}:80" | grep -qi "^location: https://"; then
    log_info "HTTP到HTTPS重定向正常工作"
  else
    log_warn "HTTP到HTTPS重定向可能未正常工作"
  fi
}
# Deploy a throw-away nginx job and check that it is reachable via Traefik.
# Globals: TRAEFIK_ADDR (read) - Traefik address; falls back to 127.0.0.1
#          when unset, e.g. when only the `test-service` step is run.
# Side effects: writes /tmp/test-service.nomad (removed by
#          cleanup_test_resources) and submits the job to Nomad.
create_test_service() {
  local traefik_addr=${TRAEFIK_ADDR:-127.0.0.1}

  log_info "创建测试服务..."

  # The delimiter is quoted so the backticks in the Host(`...`) rule are
  # written literally; unquoted, the shell ran them as a command substitution
  # and produced an empty Host() rule.
  # The port maps to 80 because that is where the stock nginx image listens.
  cat > /tmp/test-service.nomad << 'EOF'
job "test-web" {
datacenters = ["dc1"]
type = "service"
group "web" {
count = 1
network {
port "http" {
to = 80
}
}
task "nginx" {
driver = "podman"
config {
image = "nginx:alpine"
ports = ["http"]
}
resources {
cpu = 100
memory = 64
}
service {
name = "test-web"
port = "http"
tags = [
"traefik.enable=true",
"traefik.http.routers.test-web.rule=Host(`test-web.service.consul`)",
"traefik.http.routers.test-web.entrypoints=https"
]
check {
type = "http"
path = "/"
interval = "10s"
timeout = "2s"
}
}
}
}
}
EOF

  nomad run /tmp/test-service.nomad

  # Fixed grace period for the allocation to start and register.
  sleep 15

  log_info "测试服务是否可通过Traefik访问..."
  if curl -s -H "Host: test-web.service.consul" "http://${traefik_addr}:80" | grep -q "Welcome to nginx"; then
    log_info "测试服务可通过Traefik正常访问"
  else
    log_error "无法通过Traefik访问测试服务"
  fi
}
# Remove everything the test run created: the test job, the Traefik job and
# the temporary job file. Each step is best-effort, so a job that is already
# gone does not abort the cleanup.
cleanup_test_resources() {
  local job

  log_info "清理测试资源..."

  # `job stop -purge` stops and purges in one step. nomad has no `job purge`
  # subcommand, so the old separate purge call always failed and the failure
  # was hidden by `2>/dev/null || true`.
  for job in test-web traefik; do
    nomad job stop -purge "$job" 2>/dev/null || true
  done

  rm -f /tmp/test-service.nomad
  log_info "清理完成"
}
# Entry point: run the step named by the first argument (default: all).
# Arguments: $1 - one of check|deploy|status|test|test-service|cleanup|all
# An unknown step prints the usage text and exits with status 1.
main() {
  local action="${1:-all}"

  case "$action" in
    check)
      check_nomad_cluster
      check_consul_cluster
      ;;
    deploy) deploy_traefik ;;
    status) check_traefik_status ;;
    test) test_traefik_functionality ;;
    test-service) create_test_service ;;
    cleanup) cleanup_test_resources ;;
    all)
      check_nomad_cluster
      check_consul_cluster
      deploy_traefik
      check_traefik_status
      test_traefik_functionality
      create_test_service
      log_info "所有测试完成"
      ;;
    *)
      cat <<EOF
用法: $0 {check|deploy|status|test|test-service|cleanup|all}
 check - 检查集群状态
 deploy - 部署Traefik
 status - 检查Traefik状态
 test - 测试Traefik功能
 test-service - 创建并测试示例服务
 cleanup - 清理测试资源
 all - 执行所有步骤(默认)
EOF
      exit 1
      ;;
  esac
}
# 执行主函数
main "$@"