#!/bin/bash
#
# Nomad cluster quick diagnostic script.
#
# Prints a human-readable report covering:
#   1. local Nomad systemd service status
#   2. cluster server members
#   3. node status
#   4. network reachability of every node (ping + Nomad RPC/HTTP ports)
#   5. Nomad service state, bind_addr and listening ports on remote nodes
#   6. a summary with suggested next steps when no leader is found
#
# Usage: run without arguments on the local (semaphore) node.
#
# Environment:
#   NOMAD_DIAG_SUDO_PASSWORD  sudo password used on the remote nodes
#                             (optional; see the NOTE next to SUDO_PASSWORD).
#
# Strict mode is deliberately limited to `set -u`: this is a best-effort
# diagnostic whose probes are expected to fail, so `-e` / `pipefail` would
# abort the report half-way.
set -u

# Node the script runs on; it is reported as local and never probed remotely.
readonly LOCAL_NODE="semaphore"

# SSH identity used to reach the remote nodes.
readonly SSH_USER="ben"
SSH_KEY=~/.ssh/id_ed25519
readonly SSH_KEY

# NOTE(review): the fallback value is a credential that was hard-coded in the
# original script. It is kept only so existing invocations keep working;
# export NOMAD_DIAG_SUDO_PASSWORD instead and rotate/remove this default.
readonly SUDO_PASSWORD="${NOMAD_DIAG_SUDO_PASSWORD:-3131}"

# Node name -> SSH endpoint ("host:port"), or "local" for the local node.
declare -A NODES=(
  ["semaphore"]="local"
  ["master"]="100.117.106.136:60022"
  ["ash3c"]="100.116.80.94:22"
)

# Node name -> Tailscale IP used for the connectivity tests.
declare -A TAILSCALE_IPS=(
  ["semaphore"]="100.116.158.95"
  ["master"]="100.117.106.136"
  ["ash3c"]="100.116.80.94"
)

#######################################
# Test whether a TCP port accepts connections (5 second timeout).
# Arguments: $1 - host, $2 - port
# Returns:   0 if the connection succeeds, non-zero otherwise
#######################################
port_open() {
  local host=$1 port=$2
  # Opening /dev/tcp/<host>/<port> makes bash attempt a TCP connection.
  timeout 5 bash -c "</dev/tcp/${host}/${port}" 2>/dev/null
}

#######################################
# Run a command through sudo on a remote node.
# The sudo password is sent over ssh's stdin (sudo -S) rather than being
# embedded in the remote command line, so it does not appear in any argv.
# Globals:   SSH_USER, SSH_KEY, SUDO_PASSWORD (read)
# Arguments: $1 - host, $2 - ssh port,
#            $3 - remote shell command; `sudo` is prefixed to it, so in a
#                 pipeline only the first command runs as root
# Outputs:   the remote command's stdout/stderr
# Returns:   ssh's exit status (the remote command's status, or 255 when the
#            connection fails)
#######################################
remote_sudo() {
  local host=$1 port=$2 remote_cmd=$3
  # NOTE(review): StrictHostKeyChecking=no disables host key verification;
  # kept from the original, consider pinning known_hosts instead.
  printf '%s\n' "$SUDO_PASSWORD" \
    | ssh -p "$port" -i "$SSH_KEY" \
        -o ConnectTimeout=10 -o StrictHostKeyChecking=no \
        "${SSH_USER}@${host}" "sudo -S -p '' ${remote_cmd}"
}

#######################################
# Check whether any server in `nomad server members` is marked as leader.
# The output has a "Leader" column holding true/false, so that column is
# located in the header row and inspected; a case-sensitive grep for
# "leader" does not match it.
# Returns: 0 if a leader is reported, non-zero otherwise (including when
#          the nomad command itself fails)
#######################################
cluster_has_leader() {
  nomad server members 2>/dev/null | awk '
    NR == 1 {
      for (i = 1; i <= NF; i++) if (tolower($i) == "leader") col = i
      next
    }
    col && $col == "true" { found = 1 }
    END { exit !found }
  '
}

# Sections 1-3: local service status, cluster members and node status.
report_local_state() {
  local svc_output svc_rc=0

  echo "📊 1. 本地 Nomad 服务状态"
  echo "------------------------"
  # Capture the status first: piping straight into `head` would hide
  # systemctl's exit code and the failure message could never be printed.
  svc_output=$(systemctl status nomad --no-pager) || svc_rc=$?
  if [[ -n "$svc_output" ]]; then
    printf '%s\n' "$svc_output" | head -n 10
  fi
  if (( svc_rc != 0 )); then
    echo "❌ 本地 Nomad 服务异常"
  fi
  echo ""

  echo "📊 2. 集群成员状态"
  echo "----------------"
  nomad server members 2>/dev/null || echo "❌ 无法获取集群成员状态"
  echo ""

  echo "📊 3. 节点状态"
  echo "------------"
  nomad node status 2>/dev/null || echo "❌ 无法获取节点状态"
  echo ""
}

# Section 4: ping and Nomad port reachability for every node.
report_connectivity() {
  local node ip

  echo "🌐 4. 网络连通性测试"
  echo "------------------"
  for node in "${!NODES[@]}"; do
    ip="${TAILSCALE_IPS[$node]}"
    echo "测试 $node ($ip):"

    if [[ "$node" == "$LOCAL_NODE" ]]; then
      echo " ✅ 本地节点"
    else
      # Ping test
      if ping -c 1 -W 3 "$ip" >/dev/null 2>&1; then
        echo " ✅ Ping: 成功"
      else
        echo " ❌ Ping: 失败"
      fi

      # Port tests
      if port_open "$ip" 4647; then
        echo " ✅ RPC端口(4647): 开放"
      else
        echo " ❌ RPC端口(4647): 关闭"
      fi

      if port_open "$ip" 4646; then
        echo " ✅ HTTP端口(4646): 开放"
      else
        echo " ❌ HTTP端口(4646): 关闭"
      fi
    fi
    echo ""
  done
}

# Section 5: service state, configured bind_addr and listening ports on
# every remote node.
report_remote_services() {
  local node connection ip port status bind_addr listening

  echo "🔧 5. 远程节点服务状态"
  echo "-------------------"
  for node in "${!NODES[@]}"; do
    if [[ "$node" == "$LOCAL_NODE" ]]; then
      continue
    fi

    connection="${NODES[$node]}"
    ip="${connection%%:*}"
    port="${connection##*:}"

    echo "检查 $node ($ip:$port):"

    # One SSH round trip yields both the state text and the exit status.
    if status=$(remote_sudo "$ip" "$port" "systemctl is-active nomad" 2>/dev/null); then
      echo " 服务状态: $status"

      # bind_addr from the configuration file
      bind_addr=$(remote_sudo "$ip" "$port" \
        "grep 'bind_addr' /etc/nomad.d/nomad.hcl 2>/dev/null" | head -n 1)
      echo " 配置绑定地址: $bind_addr"

      # Ports actually being listened on (":464" matches 4646/4647/4648)
      listening=$(remote_sudo "$ip" "$port" "netstat -tlnp | grep :464" 2>/dev/null \
        | head -n 3)
      if [[ -n "$listening" ]]; then
        echo " 监听端口:"
        echo "$listening" | sed 's/^/ /'
      else
        echo " ❌ 未发现 Nomad 监听端口"
      fi
    else
      # Reachable but not active: still show the state systemd reported.
      if [[ -n "$status" ]]; then
        echo " 服务状态: $status"
      fi
      echo " ❌ 无法连接或服务未运行"
    fi
    echo ""
  done
}

# Section 6: leader check, suggested remediation and useful links.
report_summary() {
  echo "📋 6. 问题总结和建议"
  echo "=================="

  if cluster_has_leader; then
    echo "✅ 集群有 leader"
  else
    echo "❌ 集群没有 leader - 这是主要问题!"
    echo ""
    echo "🔧 建议的修复步骤:"
    echo "1. 先尝试 ash3c IP 修复: ./scripts/utilities/fix-ash3c-ip.sh"
    echo "2. 如果还不行,使用核弹级重置: ./scripts/utilities/nuclear-reset.sh"
    echo "3. 检查 master 节点是否需要重启"
  fi

  echo ""
  echo "🔗 有用的链接:"
  echo " Web UI: http://100.116.158.95:4646"
  echo " 日志查看: journalctl -u nomad -f"
  echo ""
  echo "🔍 诊断完成!"
}

main() {
  echo "🔍 Nomad 集群快速诊断"
  echo "===================="
  echo ""

  report_local_state
  report_connectivity
  report_remote_services
  report_summary
}

main "$@"