#!/bin/bash
#
# Nomad cluster status check.
#
# Runs five best-effort diagnostics and prints their results to stdout:
#   1. `systemctl is-active nomad` on every host of the `nomad_cluster`
#      Ansible group.
#   2. TCP reachability of ports 4646, 4647 and 4648 on each cluster IP.
#   3. `nomad server members`.
#   4. `nomad node status`.
#   5. The last five journal entries of the local `nomad` unit.
#
# Strict mode (`set -e` / `pipefail`) is intentionally NOT enabled: every
# check is independent, and a failing probe (e.g. grep finding no match)
# must not prevent the remaining checks from running.

# Ansible inventory that defines the `nomad_cluster` host group.
readonly INVENTORY=/root/mgmt/configuration/inventories/production/nomad-cluster.ini

echo "=== Nomad 集群状态检查 ==="

# Check the service state on all nodes; Ansible's stderr is discarded.
echo "1. 检查服务状态..."
ansible nomad_cluster -i "$INVENTORY" -m shell -a "systemctl is-active nomad" 2>/dev/null

# Check network connectivity to each node.
echo -e "\n2. 检查网络连通性..."
for ip in 100.116.158.95 100.117.106.136 100.116.80.94; do
  echo "检查到 $ip 的连接..."
  # 4646/4647/4648 are Nomad's documented default HTTP, RPC and Serf ports.
  for port in 4646 4647 4648; do
    # nc reports on stderr, so merge it into stdout; only lines reporting a
    # successful connection are shown, an unreachable port prints nothing.
    timeout 5 nc -zv "$ip" "$port" 2>&1 | grep -E "(succeeded|open)"
  done
done

# Try to query the cluster members.
echo -e "\n3. 检查 Nomad 集群成员..."
if nomad server members 2>/dev/null; then
  echo "集群成员查询成功"
else
  echo "无法查询集群成员 - 可能没有 leader"
fi

# Try to query the client node status.
echo -e "\n4. 检查节点状态..."
if nomad node status 2>/dev/null; then
  echo "节点状态查询成功"
else
  echo "无法查询节点状态"
fi

# Show the most recent log lines of the nomad unit on the machine running
# this script (the header labels it as the Semaphore node).
echo -e "\n5. 检查最近的日志..."
echo "=== Semaphore 节点日志 ==="
journalctl -u nomad -n 5 --no-pager 2>/dev/null | tail -5

echo -e "\n=== 检查完成 ==="