mgmt/scripts/diagnose-consul-sync.sh

63 lines
2.3 KiB
Bash
Executable File

#!/bin/bash
# Consul cluster sync diagnostic script.
#
# Prints a banner with the current time, then defines CONSUL_NODES: the
# "host:port" HTTP endpoints of the Consul nodes that the checks iterate over.

printf '%s\n' "=== Consul 集群同步诊断 ==="
printf '时间: %s\n' "$(date)"
printf '\n'

# One entry per Consul node, in the order they are checked.
CONSUL_NODES=()
CONSUL_NODES+=("master.tailnet-68f9.ts.net:8500")
CONSUL_NODES+=("warden.tailnet-68f9.ts.net:8500")
CONSUL_NODES+=("ash3c.tailnet-68f9.ts.net:8500")
# Section 1: cluster status.
# For every node in CONSUL_NODES, print what /v1/status/leader returns and the
# number of entries in /v1/status/peers.

#######################################
# Print the `jq length` of the JSON served at a URL, or ERROR.
# Arguments: $1 - URL expected to return JSON (used for /v1/status/peers)
# Outputs:   the length on stdout, or the literal ERROR when the request
#            fails, the body is not valid JSON, or jq produces no output
#######################################
consul_peer_count() {
  local url=$1
  local body count
  # Fetch first, then parse. Piping curl straight into jq hides a failed
  # request: jq exits 0 and prints nothing on empty input, so a trailing
  # `|| echo ERROR` would never fire.
  if body=$(curl -s --connect-timeout 5 --max-time 10 "$url" 2>/dev/null) \
    && count=$(jq length <<<"$body" 2>/dev/null) \
    && [[ -n "$count" ]]; then
    printf '%s\n' "$count"
  else
    echo 'ERROR'
  fi
}

echo "1. 检查集群状态"
echo "=================="
for node in "${CONSUL_NODES[@]}"; do
  echo "节点: $node"
  # Timeouts keep an unreachable node from stalling the whole diagnosis.
  echo " Leader: $(curl -s --connect-timeout 5 --max-time 10 "http://${node}/v1/status/leader" 2>/dev/null || echo 'ERROR')"
  echo " Peers: $(consul_peer_count "http://${node}/v1/status/peers")"
  echo ""
done
# Section 2: service registration.
# For every node in CONSUL_NODES, list the keys of /v1/catalog/services that
# match consul-lb or traefik, and the keys of /v1/agent/services that match
# traefik.

#######################################
# Print the keys of a JSON object served at a URL that match a regex, each
# prefixed with one space; print a fallback line when there are none.
# Arguments: $1 - URL whose JSON keys are listed with jq 'keys[]'
#            $2 - extended regex (grep -E) applied to each key
#            $3 - line printed when the request fails or nothing matches
# Outputs:   the matching keys, or the fallback line, on stdout
#######################################
print_matching_services() {
  local url=$1 pattern=$2 fallback=$3
  local names
  # Capture the filtered list and test it for emptiness. Ending the pipeline
  # with `| sed ... || echo fallback` cannot work: the pipeline's status is
  # sed's, which is 0 even when it receives no input.
  names=$(curl -s --connect-timeout 5 --max-time 10 "$url" 2>/dev/null \
    | jq -r 'keys[]' 2>/dev/null \
    | grep -E -- "$pattern")
  if [[ -n "$names" ]]; then
    sed 's/^/ /' <<<"$names"
  else
    printf '%s\n' "$fallback"
  fi
}

echo "2. 检查服务注册"
echo "================"
for node in "${CONSUL_NODES[@]}"; do
  echo "节点: $node"
  echo " Catalog 服务:"
  print_matching_services "http://${node}/v1/catalog/services" "(consul-lb|traefik)" " ERROR 或无服务"
  echo " Agent 服务:"
  print_matching_services "http://${node}/v1/agent/services" "traefik" " 无本地服务"
  echo ""
done
# Section 3: health status.
# For every node in CONSUL_NODES, print the status of each entry in
# /v1/agent/checks whose key contains "traefik".

#######################################
# Print "key: Status" for every check at a URL whose key contains "traefik".
# Arguments: $1 - URL returning a JSON object of checks (/v1/agent/checks)
# Outputs:   one line per matching check; a "no Traefik checks" line when
#            none match or the body cannot be parsed; a connection-error
#            line when the request itself fails
#######################################
print_traefik_checks() {
  local url=$1
  local checks lines
  if ! checks=$(curl -s --connect-timeout 5 --max-time 10 "$url" 2>/dev/null); then
    echo " ERROR: 无法连接"
    return 0
  fi
  # jq exits 0 when select() matches nothing, so its exit status alone cannot
  # signal "no Traefik checks"; the captured output must be tested as well.
  if lines=$(jq -r 'to_entries[] | select(.key | contains("traefik")) | " \(.key): \(.value.Status)"' <<<"$checks" 2>/dev/null) \
    && [[ -n "$lines" ]]; then
    printf '%s\n' "$lines"
  else
    echo " 无 Traefik 健康检查"
  fi
}

echo "3. 检查健康状态"
echo "================"
for node in "${CONSUL_NODES[@]}"; do
  echo "节点: $node"
  print_traefik_checks "http://${node}/v1/agent/checks"
  echo ""
done
# Section 4: network connectivity.
# Probe the Traefik entrypoint and its dashboard API from the current host and
# print the HTTP status code and total response time of each request.
echo "4. 网络连通性测试"
echo "=================="
echo "测试从当前节点到 Traefik 的连接:"
# Shared probe options. The timeouts bound each request so that an unreachable
# or firewalled host cannot hang the script; the body is discarded and only
# the -w summary line is shown.
probe_opts=(
  -s
  --connect-timeout 5
  --max-time 10
  -o /dev/null
  -w " HTTP %{http_code} - 响应时间: %{time_total}s\n"
)
curl "${probe_opts[@]}" http://100.97.62.111:80/ || echo " ERROR: 无法连接到 Traefik"
curl "${probe_opts[@]}" http://100.97.62.111:8080/api/overview || echo " ERROR: 无法连接到 Traefik Dashboard"
echo ""
# Section 5: suggested follow-up actions (static text only).
# The heredoc delimiter is quoted so the $(...) and $1 inside the nomad example
# are printed literally instead of being expanded by the shell.
cat <<'EOF'
5. 建议操作
===========
如果发现问题:
 1. 重新注册服务: ./scripts/register-traefik-to-all-consul.sh
 2. 检查 Consul 日志: nomad alloc logs $(nomad job allocs consul-cluster-nomad | grep warden | awk '{print $1}') consul
 3. 重启有问题的 Consul 节点
 4. 检查网络连通性和防火墙设置
EOF