mgmt/scripts/utilities/nomad-diagnosis.sh

124 lines
3.8 KiB
Bash
Executable File

#!/bin/bash
# Quick diagnostic script for the Nomad cluster.

# Banner: title, underline, blank line.
printf '%s\n' "🔍 Nomad 集群快速诊断" "====================" ""

# Node table: node name -> SSH endpoint written as "host:port".
# The entry whose value is "local" is treated as the local node by the
# checks further down and is never contacted over the network.
declare -A NODES
NODES[semaphore]="local"
NODES[master]="100.117.106.136:60022"
NODES[ash3c]="100.116.80.94:22"

# Node name -> Tailscale IP, keyed by the same names as NODES.
declare -A TAILSCALE_IPS
TAILSCALE_IPS[semaphore]="100.116.158.95"
TAILSCALE_IPS[master]="100.117.106.136"
TAILSCALE_IPS[ash3c]="100.116.80.94"
# Runs a command, prints at most the first LIMIT lines of its stdout, and
# returns the command's own exit status.
#
# A plain `cmd | head -N || fallback` tests the status of `head`, which is
# practically always 0, so the fallback can never run. Capturing the output
# first keeps the truncation while preserving the real status.
#
# Arguments: $1 - maximum number of lines to print
#            $2... - command and its arguments
# Outputs:   truncated stdout of the command (its stderr is passed through)
# Returns:   the exit status of the command
_head_with_status() {
  local limit=$1
  shift
  local output rc=0
  output=$("$@") || rc=$?
  if [[ -n "$output" ]]; then
    printf '%s\n' "$output" | head -n "$limit"
  fi
  return "$rc"
}

echo "📊 1. 本地 Nomad 服务状态"
echo "------------------------"
# `systemctl status` exits non-zero when the unit is not running, which is
# what triggers the warning line.
_head_with_status 10 systemctl status nomad --no-pager || echo "❌ 本地 Nomad 服务异常"
echo ""

echo "📊 2. 集群成员状态"
echo "----------------"
nomad server members 2>/dev/null || echo "❌ 无法获取集群成员状态"
echo ""

echo "📊 3. 节点状态"
echo "------------"
nomad node status 2>/dev/null || echo "❌ 无法获取节点状态"
echo ""
echo "🌐 4. 网络连通性测试"
echo "------------------"

# Prints one result line saying whether TCP port $2 on host $1 accepted a
# connection within the 5-second timeout; $3 is the label printed in front
# of the port number.
report_port() {
  local host=$1 port=$2 label=$3
  local verdict=" ❌ ${label}(${port}): 关闭"
  # Opening /dev/tcp/<host>/<port> in a child bash attempts a TCP connect.
  if timeout 5 bash -c "</dev/tcp/$host/$port" 2>/dev/null; then
    verdict=" ✅ ${label}(${port}): 开放"
  fi
  echo "$verdict"
}

for node in "${!NODES[@]}"; do
  ip="${TAILSCALE_IPS[$node]}"
  echo "测试 $node ($ip):"
  case "$node" in
    semaphore)
      # The local node is not probed over the network.
      echo " ✅ 本地节点"
      ;;
    *)
      # Ping test: a single echo request with a 3-second wait.
      if ! ping -c 1 -W 3 "$ip" >/dev/null 2>&1; then
        echo " ❌ Ping: 失败"
      else
        echo " ✅ Ping: 成功"
      fi
      # Port tests: 4647 first, then 4646.
      report_port "$ip" 4647 "RPC端口"
      report_port "$ip" 4646 "HTTP端口"
      ;;
  esac
  echo ""
done
echo "🔧 5. 远程节点服务状态"
echo "-------------------"

# SECURITY NOTE(review): the sudo password used on the remote nodes can be
# supplied through the NOMAD_SUDO_PASSWORD environment variable. The built-in
# default is the legacy hardcoded credential, kept only so existing
# invocations keep working; it should be rotated and removed from this file.
remote_sudo_password="${NOMAD_SUDO_PASSWORD:-3131}"

# Options shared by every SSH call in this section.
# NOTE(review): StrictHostKeyChecking=no disables host-key verification.
ssh_opts=(-i "${HOME}/.ssh/id_ed25519" -o ConnectTimeout=10 -o StrictHostKeyChecking=no)

# Runs a command line under sudo on a remote node.
#
# The password is sent over ssh's stdin to `sudo -S` instead of being embedded
# in the remote command string, so it does not show up in process listings on
# either host.
#
# Arguments: $1 - remote host, $2 - SSH port,
#            $3 - command line placed after `sudo` (may contain a remote pipe)
# Outputs:   remote stdout; stderr (ssh and sudo noise) is discarded
# Returns:   exit status of ssh / the remote command
_remote_sudo() {
  local host=$1 port=$2 remote_cmd=$3
  printf '%s\n' "$remote_sudo_password" |
    ssh -p "$port" "${ssh_opts[@]}" "ben@${host}" "sudo -S -p '' ${remote_cmd}" 2>/dev/null
}

for node in "${!NODES[@]}"; do
  # The local node is covered by the local service check; skip it here.
  if [[ "$node" == "semaphore" ]]; then
    continue
  fi

  # Split "host:port" into its two parts.
  connection="${NODES[$node]}"
  ip="${connection%%:*}"
  port="${connection##*:}"
  echo "检查 $node ($ip:$port):"

  # Probe once and capture the answer. `systemctl is-active` needs no root
  # privileges, so no sudo (and no password) is involved. ssh -n keeps ssh
  # from reading this script's stdin.
  if status=$(ssh -n -p "$port" "${ssh_opts[@]}" "ben@${ip}" "systemctl is-active nomad" 2>/dev/null); then
    echo " 服务状态: $status"

    # First bind_addr line found in the Nomad configuration file.
    bind_addr=$(_remote_sudo "$ip" "$port" "grep 'bind_addr' /etc/nomad.d/nomad.hcl" | head -1)
    echo " 配置绑定地址: $bind_addr"

    # Listening TCP sockets whose address contains ":464" (at most 3 lines).
    listening=$(_remote_sudo "$ip" "$port" "netstat -tlnp | grep :464" | head -3)
    if [[ -n "$listening" ]]; then
      echo " 监听端口:"
      echo "$listening" | sed 's/^/ /'
    else
      echo " ❌ 未发现 Nomad 监听端口"
    fi
  else
    # A non-empty status here means SSH worked but the unit is not active
    # (e.g. "inactive" or "failed"); show it so the two failure modes can be
    # told apart.
    if [[ -n "$status" ]]; then
      echo " 服务状态: $status"
    fi
    echo " ❌ 无法连接或服务未运行"
  fi
  echo ""
done
echo "📋 6. 问题总结和建议"
echo "=================="

# Reads `nomad server members` output on stdin and succeeds only when some
# server row has "true" in the "Leader" column.
#
# The column is located from the header row, so the check does not depend on
# a fixed column position. A case-sensitive grep for "leader" is not usable
# here: the header is spelled "Leader" and the column values are true/false.
#
# Returns: 0 if a leader row is present, 1 otherwise (including empty input)
_cluster_has_leader() {
  awk '
    NR == 1 { for (i = 1; i <= NF; i++) if ($i == "Leader") col = i; next }
    col && $col == "true" { found = 1 }
    END { exit !found }
  '
}

# Check whether the cluster currently has a leader.
if nomad server members 2>/dev/null | _cluster_has_leader; then
  echo "✅ 集群有 leader"
else
  echo "❌ 集群没有 leader - 这是主要问题!"
  echo ""
  echo "🔧 建议的修复步骤:"
  echo "1. 先尝试 ash3c IP 修复: ./scripts/utilities/fix-ash3c-ip.sh"
  echo "2. 如果还不行,使用核弹级重置: ./scripts/utilities/nuclear-reset.sh"
  echo "3. 检查 master 节点是否需要重启"
fi
echo ""
echo "🔗 有用的链接:"
echo " Web UI: http://100.116.158.95:4646"
echo " 日志查看: journalctl -u nomad -f"
echo ""
echo "🔍 诊断完成!"