mgmt/scripts/utilities/simple-nomad-fix.sh

104 lines
2.7 KiB
Bash
Executable File

#!/bin/bash
echo "=== 简单的 Nomad 集群修复脚本 ==="

# Tailscale IP addresses of the cluster nodes. Each value may be overridden
# from the environment; the defaults are the addresses this script was
# written for.
SEMAPHORE_IP="${SEMAPHORE_IP:-100.116.158.95}"
MASTER_IP="${MASTER_IP:-100.117.106.136}"
ASH3C_IP="${ASH3C_IP:-100.116.80.94}"

# Key written into the `encrypt` setting of every generated server config.
# NOTE(review): this secret is committed in source. Prefer exporting
# ENCRYPT_KEY before running the script, and rotate the key embedded here.
ENCRYPT_KEY="${ENCRYPT_KEY:-NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=}"
#######################################
# Render the Nomad agent configuration file for one node.
# Globals:   ENCRYPT_KEY, SEMAPHORE_IP, MASTER_IP, ASH3C_IP (read)
# Arguments: $1 - node name; only used to build the output file name
#            $2 - IP address used for bind_addr and the rpc/serf addresses
# Outputs:   writes /tmp/nomad-<node name>.hcl (mode 0600)
# Returns:   0 on success, 1 if an argument is missing or the write fails
#######################################
create_config() {
  local node_name=${1:-}
  local bind_ip=${2:-}

  if [[ -z "${node_name}" || -z "${bind_ip}" ]]; then
    echo "usage: create_config <node_name> <bind_ip>" >&2
    return 1
  fi

  # The path is part of the contract with the caller, so it stays in /tmp.
  local config_path="/tmp/nomad-${node_name}.hcl"

  # The file embeds the encryption key, so keep it private. The subshell
  # confines the umask change; chmod covers the case where the file already
  # existed with wider permissions.
  (
    umask 077
    cat > "${config_path}" << EOF || exit 1
datacenter = "dc1"
region     = "global"
data_dir   = "/opt/nomad/data"
bind_addr  = "${bind_ip}"

server {
  enabled          = true
  bootstrap_expect = 3
  encrypt          = "${ENCRYPT_KEY}"

  server_join {
    retry_join = ["${SEMAPHORE_IP}", "${MASTER_IP}", "${ASH3C_IP}"]
  }
}

client {
  enabled = true
}

ui {
  enabled = true
}

addresses {
  http = "0.0.0.0"
  rpc  = "${bind_ip}"
  serf = "${bind_ip}"
}

ports {
  http = 4646
  rpc  = 4647
  serf = 4648
}

plugin "docker" {
  config {
    allow_privileged = true

    volumes {
      enabled = true
    }
  }
}

log_level = "INFO"
log_file  = "/var/log/nomad/nomad.log"
EOF
    chmod 600 "${config_path}"
  )
}
# ---------------------------------------------------------------------------
# Cluster rebuild: stop Nomad on every node, wipe the data directories,
# deploy freshly generated configs, start Nomad again, then print the cluster
# state. The local machine receives the "semaphore" config; the "master" and
# "ash3c" nodes are reached over SSH.
#
# Failure policy: if Nomad cannot be stopped everywhere the script aborts
# before deleting anything. After that point failures are reported and
# counted, the remaining steps still run (so nodes are started again), and
# the script exits non-zero at the end.
# ---------------------------------------------------------------------------

SSH_KEY="${SSH_KEY:-${HOME}/.ssh/id_ed25519}"
REMOTE_USER="${REMOTE_USER:-ben}"
# NOTE(review): the sudo password default is kept only for backward
# compatibility. Export REMOTE_SUDO_PASS instead and change this credential.
REMOTE_SUDO_PASS="${REMOTE_SUDO_PASS:-3131}"

# Remote nodes as parallel arrays: config name, address, SSH port.
remote_names=("master" "ash3c")
remote_hosts=("${MASTER_IP}" "${ASH3C_IP}")
remote_ports=(60022 22)

failures=0

# Print an error to stderr and abort the script.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Record a non-fatal failure; the script keeps going but will exit non-zero.
note_failure() {
  printf 'WARNING: %s\n' "$*" >&2
  failures=$((failures + 1))
}

# Remove the locally generated config files, which contain the encryption key.
cleanup_local_configs() {
  rm -f /tmp/nomad-semaphore.hcl /tmp/nomad-master.hcl /tmp/nomad-ash3c.hcl
}
trap cleanup_local_configs EXIT

# Run a command under sudo on a remote node.
# Arguments: $1 - host, $2 - SSH port, $3 - command line to run as root
# The password travels over ssh's stdin to `sudo -S` instead of being embedded
# in the remote command line. Only one sudo invocation per call can consume
# it, so compound commands must be wrapped in a single `sh -c '...'`.
remote_sudo() {
  local host=$1 port=$2 cmd=$3
  printf '%s\n' "${REMOTE_SUDO_PASS}" \
    | ssh -p "${port}" -i "${SSH_KEY}" "${REMOTE_USER}@${host}" \
      "sudo -S -p '' ${cmd}"
}

# Run the same root command on every remote node.
# Arguments: $1 - command line to run as root
# Returns:   0 if it succeeded everywhere, 1 otherwise (all nodes are tried)
on_all_remotes() {
  local cmd=$1 i rc=0
  for i in "${!remote_hosts[@]}"; do
    if ! remote_sudo "${remote_hosts[i]}" "${remote_ports[i]}" "${cmd}"; then
      printf 'WARNING: "%s" failed on %s (%s)\n' \
        "${cmd}" "${remote_names[i]}" "${remote_hosts[i]}" >&2
      rc=1
    fi
  done
  return "${rc}"
}

# Copy a generated config to a remote node and install it as the Nomad config.
# Arguments: $1 - host, $2 - SSH port, $3 - node name used by create_config
push_config() {
  local host=$1 port=$2 node=$3
  local src="/tmp/nomad-${node}.hcl"
  scp -P "${port}" -i "${SSH_KEY}" "${src}" "${REMOTE_USER}@${host}:/tmp/" \
    || return 1
  # Single sudo call: install the file, fix ownership, then remove the
  # staging copy from the remote /tmp.
  remote_sudo "${host}" "${port}" \
    "sh -c 'cp ${src} /etc/nomad.d/nomad.hcl && chown nomad:nomad /etc/nomad.d/nomad.hcl && rm -f ${src}'"
}

echo "1. 停止所有 Nomad 服务..."
systemctl stop nomad \
  || die "could not stop the local nomad service; aborting before any data is deleted"
on_all_remotes "systemctl stop nomad" \
  || die "could not stop nomad on every remote node; aborting before any data is deleted"

echo "2. 清理数据目录..."
rm -rf /opt/nomad/data/* || note_failure "could not fully clean the local data directory"
# The glob must be expanded by a root shell: the SSH login user may not be
# able to list the data directory, in which case an unprivileged expansion
# would match nothing and rm -f would silently delete nothing.
on_all_remotes "sh -c 'rm -rf /opt/nomad/data/*'" \
  || note_failure "could not clean the data directory on every remote node"

echo "3. 创建新配置文件..."
create_config "semaphore" "${SEMAPHORE_IP}" || note_failure "could not generate the semaphore config"
create_config "master" "${MASTER_IP}" || note_failure "could not generate the master config"
create_config "ash3c" "${ASH3C_IP}" || note_failure "could not generate the ash3c config"

echo "4. 部署配置文件..."
if ! { cp /tmp/nomad-semaphore.hcl /etc/nomad.d/nomad.hcl \
  && chown nomad:nomad /etc/nomad.d/nomad.hcl; }; then
  note_failure "could not install the local config"
fi
for i in "${!remote_hosts[@]}"; do
  push_config "${remote_hosts[i]}" "${remote_ports[i]}" "${remote_names[i]}" \
    || note_failure "could not deploy the config to ${remote_names[i]} (${remote_hosts[i]})"
done

echo "5. 启动服务..."
systemctl start nomad || note_failure "could not start the local nomad service"
on_all_remotes "systemctl start nomad" \
  || note_failure "could not start nomad on every remote node"

echo "6. 等待集群形成..."
sleep 30

echo "7. 检查集群状态..."
nomad server members || note_failure "'nomad server members' failed"
nomad node status || note_failure "'nomad node status' failed"

if ((failures > 0)); then
  die "${failures} step(s) failed; review the warnings above"
fi
echo "=== 修复完成 ==="