#!/bin/bash
#
# Quick diagnostic for the Nomad cluster.
#
# Prints, in order: the local Nomad service status, the server members and
# node status reported by the `nomad` CLI, ping/TCP reachability of every
# node, the Nomad service state on each remote node (checked over SSH), and a
# summary with suggested next steps.
#
# `set -e` is not used: each check prints its own failure message and the
# script carries on to the next check.

echo "🔍 Nomad 集群快速诊断"
echo "===================="
echo ""

# SSH endpoint of each node as "host:port". The value "local" marks the node
# that is reported as the local one and is never contacted over SSH.
declare -A NODES=(
  ["semaphore"]="local"
  ["master"]="100.117.106.136:60022"
  ["ash3c"]="100.116.80.94:22"
)

# Address of each node used for the ping and Nomad port probes.
# Keys must match the keys of NODES.
declare -A TAILSCALE_IPS=(
  ["semaphore"]="100.116.158.95"
  ["master"]="100.117.106.136"
  ["ash3c"]="100.116.80.94"
)

# Shows the first 10 lines of `systemctl status nomad` and prints a warning
# when systemctl itself exits non-zero.
#
# The status is captured before being trimmed: piping systemctl straight into
# `head` would make the pipeline's exit status that of `head`, so a failing
# service would never trigger the warning.
report_local_nomad_service() {
  local report rc=0
  report=$(systemctl status nomad --no-pager) || rc=$?
  if [[ -n "$report" ]]; then
    printf '%s\n' "$report" | head -n 10
  fi
  if ((rc != 0)); then
    echo "❌ 本地 Nomad 服务异常"
  fi
}

echo "📊 1. 本地 Nomad 服务状态"
echo "------------------------"
report_local_nomad_service
echo ""

# CLI errors are hidden on purpose; a one-line message is printed instead.
echo "📊 2. 集群成员状态"
echo "----------------"
nomad server members 2>/dev/null || echo "❌ 无法获取集群成员状态"
echo ""

echo "📊 3. 节点状态"
echo "------------"
nomad node status 2>/dev/null || echo "❌ 无法获取节点状态"
echo ""

# Succeeds when a TCP connection to host ($1) on port ($2) can be opened
# within 5 seconds, using bash's /dev/tcp redirection.
port_is_open() {
  # Host and port are handed over as positional arguments so that their
  # values are never parsed as shell code by the inner bash.
  timeout 5 bash -c ': <"/dev/tcp/$1/$2"' _ "$1" "$2" 2>/dev/null
}

# Prints one open/closed line for a port.
# Arguments: $1 - label shown to the user, $2 - host, $3 - port
report_port() {
  local label=$1 host=$2 port=$3
  if port_is_open "$host" "$port"; then
    echo " ✅ ${label}(${port}): 开放"
  else
    echo " ❌ ${label}(${port}): 关闭"
  fi
}

# For every node in NODES: the local node is only acknowledged; every other
# node is pinged once and probed on the Nomad RPC (4647) and HTTP (4646)
# ports at its TAILSCALE_IPS address.
check_network_connectivity() {
  local node ip
  for node in "${!NODES[@]}"; do
    ip="${TAILSCALE_IPS[$node]}"
    echo "测试 $node ($ip):"

    # The local node is identified by the "local" marker in NODES.
    if [[ "${NODES[$node]}" == "local" ]]; then
      echo " ✅ 本地节点"
    else
      if ping -c 1 -W 3 "$ip" >/dev/null 2>&1; then
        echo " ✅ Ping: 成功"
      else
        echo " ❌ Ping: 失败"
      fi
      report_port "RPC端口" "$ip" 4647
      report_port "HTTP端口" "$ip" 4646
    fi
    echo ""
  done
}

echo "🌐 4. 网络连通性测试"
echo "------------------"
check_network_connectivity

# Runs a command through sudo on a remote node over SSH.
# Arguments: $1 - host, $2 - SSH port, $3 - command line to run under sudo
# Outputs:   the remote command's stdout/stderr
# Returns:   the exit status reported by ssh
remote_sudo() {
  local host=$1 port=$2 remote_cmd=$3
  # NOTE(review): the plaintext fallback is kept only so existing runs keep
  # working. Set NOMAD_SUDO_PASSWORD instead, then rotate and drop the default.
  local sudo_pass="${NOMAD_SUDO_PASSWORD:-3131}"

  # The password is sent over ssh's stdin to `sudo -S`, so it is not embedded
  # in the remote command line. `-p ''` silences sudo's prompt.
  # NOTE(review): StrictHostKeyChecking=no disables host key verification.
  printf '%s\n' "$sudo_pass" \
    | ssh -p "$port" -i ~/.ssh/id_ed25519 \
      -o ConnectTimeout=10 -o StrictHostKeyChecking=no \
      "ben@${host}" "sudo -S -p '' ${remote_cmd}"
}

# For every non-local node in NODES, reports whether the nomad unit is
# active. For active nodes it also prints the first bind_addr line of
# /etc/nomad.d/nomad.hcl and up to three listening sockets matching ":464".
check_remote_nodes() {
  local node connection ip port status bind_addr listening
  for node in "${!NODES[@]}"; do
    connection="${NODES[$node]}"
    # Entries marked "local" have no SSH endpoint to connect to.
    if [[ "$connection" == "local" ]]; then
      continue
    fi

    ip="${connection%%:*}"
    port="${connection##*:}"
    echo "检查 $node ($ip:$port):"

    # A single SSH round trip yields both the state text and the verdict:
    # `systemctl is-active` exits 0 only when the unit is active.
    if status=$(remote_sudo "$ip" "$port" "systemctl is-active nomad" 2>/dev/null); then
      echo " 服务状态: $status"

      bind_addr=$(remote_sudo "$ip" "$port" \
        "grep 'bind_addr' /etc/nomad.d/nomad.hcl" 2>/dev/null | head -1)
      echo " 配置绑定地址: $bind_addr"

      listening=$(remote_sudo "$ip" "$port" \
        "netstat -tlnp | grep :464" 2>/dev/null | head -3)
      if [[ -n "$listening" ]]; then
        echo " 监听端口:"
        echo "$listening" | sed 's/^/ /'
      else
        echo " ❌ 未发现 Nomad 监听端口"
      fi
    else
      # Show whatever state the node reported (e.g. "inactive") when we got
      # an answer but the unit is not active.
      if [[ -n "$status" ]]; then
        echo " 服务状态: $status"
      fi
      echo " ❌ 无法连接或服务未运行"
    fi
    echo ""
  done
}

echo "🔧 5. 远程节点服务状态"
echo "-------------------"
check_remote_nodes

# Reads `nomad server members` output on stdin and succeeds when at least one
# row has "true" in the Leader column.
#
# The column is located from the header row. A plain case-sensitive search
# for "leader" does not match the "Leader" header or the true/false values,
# so it cannot tell whether a leader exists.
cluster_has_leader() {
  awk '
    NR == 1 {
      for (i = 1; i <= NF; i++) if ($i == "Leader") col = i
      next
    }
    col && $col == "true" { found = 1 }
    END { exit !found }
  '
}

echo "📋 6. 问题总结和建议"
echo "=================="

# An unreachable CLI produces no rows and is reported as "no leader".
if nomad server members 2>/dev/null | cluster_has_leader; then
  echo "✅ 集群有 leader"
else
  echo "❌ 集群没有 leader - 这是主要问题!"
  echo ""
  echo "🔧 建议的修复步骤:"
  echo "1. 先尝试 ash3c IP 修复: ./scripts/utilities/fix-ash3c-ip.sh"
  echo "2. 如果还不行,使用核弹级重置: ./scripts/utilities/nuclear-reset.sh"
  echo "3. 检查 master 节点是否需要重启"
fi

echo ""
echo "🔗 有用的链接:"
echo " Web UI: http://100.116.158.95:4646"
echo " 日志查看: journalctl -u nomad -f"
echo ""
echo "🔍 诊断完成!"