mgmt/scripts/utilities/check-nomad-cluster.sh

37 lines
1.1 KiB
Bash
Executable File

#!/bin/bash
# Nomad cluster health check. Runs five steps in order: service status via
# Ansible, TCP reachability of the Nomad ports, server members, node status,
# and the most recent nomad journal entries.
#
# Environment:
#   NOMAD_INVENTORY  Optional Ansible inventory to use instead of the default
#                    production inventory.
printf '%s\n' "=== Nomad 集群状态检查 ==="

# Step 1: ask every host in the nomad_cluster group whether the nomad
# systemd unit is active.
printf '%s\n' "1. 检查服务状态..."
nomad_inventory="${NOMAD_INVENTORY:-/root/mgmt/configuration/inventories/production/nomad-cluster.ini}"

# stderr stays suppressed as before, but a non-zero exit status is now
# reported so that "ansible could not run" or "some host failed" is no longer
# indistinguishable from empty output.
ansible nomad_cluster -i "$nomad_inventory" -m shell -a "systemctl is-active nomad" 2>/dev/null
ansible_rc=$?
if (( ansible_rc != 0 )); then
  printf '%s\n' "服务状态检查未完全通过 (ansible 退出码: $ansible_rc)"
fi
printf '\n%s\n' "2. 检查网络连通性..."
# Step 2: probe each cluster node on the three TCP ports this script checks
# (4646/4647/4648 - Nomad's default HTTP, RPC and Serf ports).
# The verdict is taken from nc's exit status instead of grepping its verbose
# output: the success wording differs between nc implementations, and a
# failed probe previously printed nothing at all.
nomad_node_ips=(100.116.158.95 100.117.106.136 100.116.80.94)
nomad_tcp_ports=(4646 4647 4648)

if ! command -v nc >/dev/null 2>&1; then
  printf '%s\n' "未找到 nc 命令 - 跳过网络连通性检查"
else
  for ip in "${nomad_node_ips[@]}"; do
    printf '%s\n' "检查到 $ip 的连接..."
    for port in "${nomad_tcp_ports[@]}"; do
      # timeout caps every probe at 5 seconds so a filtered port cannot hang
      # the whole check.
      if timeout 5 nc -z "$ip" "$port" >/dev/null 2>&1; then
        printf '  %s\n' "端口 $port: 可达"
      else
        printf '  %s\n' "端口 $port: 不可达或超时"
      fi
    done
  done
fi
printf '\n%s\n' "3. 检查 Nomad 集群成员..."
# Step 3: try to list the server members. The command's stdout is shown
# as-is, its stderr is discarded, and the verdict line depends only on the
# exit status.
members_verdict="无法查询集群成员 - 可能没有 leader"
nomad server members 2>/dev/null && members_verdict="集群成员查询成功"
printf '%s\n' "$members_verdict"
printf '\n%s\n' "4. 检查节点状态..."
# Step 4: list client node status. stdout is passed through, stderr is
# discarded, and the exit status selects the verdict line.
nomad node status 2>/dev/null
case $? in
  0) printf '%s\n' "节点状态查询成功" ;;
  *) printf '%s\n' "无法查询节点状态" ;;
esac
printf '\n%s\n' "5. 检查最近的日志..."
printf '%s\n' "=== Semaphore 节点日志 ==="
# Step 5: show the last five lines of the nomad unit's journal on the host
# running this script. journalctl errors are discarded, and tail trims the
# output to at most five lines.
journalctl --unit=nomad --lines=5 --no-pager 2>/dev/null | tail -n 5
printf '\n%s\n' "=== 检查完成 ==="