mgmt/scripts/utilities/fix-consul-cluster.sh

151 lines
4.6 KiB
Bash
Executable File

#!/bin/bash
# Consul cluster repair script.
# Addresses the "No cluster leader" problem.
#
# This first part only reports the current state and asks the operator which
# repair plan to run; the answer is left in the global variable `choice`.

# Abort on the first command failure that is not handled explicitly.
set -e

echo "=== Consul 集群修复脚本 ==="
echo "当前时间: $(date)"
echo

# Step 1: list the consul services currently known to the swarm. The `||`
# fallback keeps grep's "no match" status from aborting the script.
echo "1. 检查当前 Consul 服务状态..."
docker service ls | grep consul || echo "未找到 consul 服务"
echo

# Step 2: show the last few log lines of each consul service. stderr is
# discarded and a failure is reported with a short message instead.
echo "2. 检查 Consul 日志中的错误..."
echo "Master 节点日志:"
docker service logs consul-cluster_consul-master --tail 5 2>/dev/null || echo "无法获取 master 日志"
echo
echo "Ash3c 节点日志:"
docker service logs consul-cluster_consul-ash3c --tail 5 2>/dev/null || echo "无法获取 ash3c 日志"
echo

# Step 3: present the repair plans and read the operator's selection.
echo "3. 修复选项:"
echo " a) 使用修复后的 overlay 网络配置 (推荐)"
echo " b) 使用 macvlan 网络配置"
echo " c) 仅重启现有服务"
echo
# -r keeps backslashes literal. `|| true` stops `set -e` from killing the
# script silently when stdin hits EOF; `choice` is then empty and is rejected
# by the selection handling that follows.
read -r -p "请选择修复方案 (a/b/c): " choice || true
# ---------------------------------------------------------------------------
# Apply the repair plan selected by the operator (value of $choice):
#   a) redeploy the stack from the fixed overlay-network definition
#   b) redeploy the stack from the macvlan definition, after writing the
#      operator's interface / subnet / gateway into that file
#   c) force-restart the two existing services
# Any other value prints an error and exits with status 1.
# ---------------------------------------------------------------------------

# Stack definitions deployed by plans a) and b).
overlay_stack_file="/root/mgmt/swarm/stacks/consul-cluster-fixed.yml"
macvlan_stack_file="/root/mgmt/swarm/stacks/consul-cluster-macvlan.yml"

#######################################
# Print the consul services listed by `docker service ls`.
# grep's "no match" status is handled here so that `set -e` does not abort
# the script when no consul service is listed.
# Outputs: matching service lines, or a "not found" message, on stdout
#######################################
show_consul_services() {
  docker service ls | grep consul || echo "未找到 consul 服务"
}

#######################################
# Escape a value for use as the replacement half of a sed 's|…|…|' command.
# Backslash, '&' and the '|' delimiter are prefixed with a backslash so sed
# inserts them literally.
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout (no trailing newline)
#######################################
sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

#######################################
# Remove the consul-cluster stack and wait a fixed 10 seconds afterwards.
# A failing `docker stack rm` is reported as "stack does not exist" and is
# not fatal.
#######################################
stop_consul_stack() {
  echo "停止现有 Consul 集群..."
  docker stack rm consul-cluster 2>/dev/null || echo "consul-cluster stack 不存在"
  echo "等待服务完全停止..."
  sleep 10
}

#######################################
# Deploy a stack file as "consul-cluster", wait 15 seconds, then list the
# consul services.
# Arguments: $1 - path of the stack definition
#            $2 - progress message printed before deploying
#######################################
deploy_consul_stack() {
  local stack_file=$1
  local banner=$2
  echo "$banner"
  docker stack deploy -c "$stack_file" consul-cluster
  echo "等待服务启动..."
  sleep 15
  echo "检查新服务状态..."
  show_consul_services
}

case "$choice" in
  a)
    echo "使用修复后的 overlay 网络配置..."
    stop_consul_stack

    # Optionally remove the data volumes. EOF on stdin counts as "no".
    read -r -p "是否清理现有数据卷? (y/n): " clean_volumes || true
    if [[ "$clean_volumes" == "y" ]]; then
      # Best effort: removal errors (e.g. the volume does not exist) are
      # deliberately ignored.
      docker volume rm consul-cluster_consul_master_data 2>/dev/null || true
      docker volume rm consul-cluster_consul_ash3c_data 2>/dev/null || true
      echo "数据卷已清理"
    fi

    deploy_consul_stack "$overlay_stack_file" "部署修复后的 Consul 集群..."
    ;;
  b)
    echo "使用 macvlan 网络配置..."
    echo "注意: 需要根据你的网络环境调整 IP 地址和网络接口"

    # List the interface names so the operator can pick one.
    echo "当前网络接口:"
    ip link show | grep -E "^[0-9]+:" | awk '{print $2}' | sed 's/://'
    echo

    read -r -p "请输入要使用的网络接口 (如 eth0): " interface || true
    read -r -p "请输入子网 (如 192.168.1.0/24): " subnet || true
    read -r -p "请输入网关 (如 192.168.1.1): " gateway || true

    # An empty value would blank out the corresponding setting in the stack
    # file, so refuse to continue before touching anything.
    if [[ -z "$interface" || -z "$subnet" || -z "$gateway" ]]; then
      echo "网络接口、子网和网关均不能为空,退出" >&2
      exit 1
    fi

    # Write the values into the macvlan stack file.
    # - '|' is the sed delimiter because a CIDR subnet always contains '/',
    #   which would otherwise terminate the s/// command early.
    # - The inputs are escaped so '&', '\' and '|' are inserted literally.
    # - Dots in the patterns are escaped so they match only a literal '.'.
    # NOTE(review): the file is edited in place, so the default values
    # (eth0, 192.168.1.0/24, 192.168.1.1) are gone after the first run and a
    # later run with different values will substitute nothing.
    # NOTE(review): the gateway pattern also matches the start of a longer
    # address such as 192.168.1.10 - confirm the stack file contains none.
    interface_esc=$(sed_escape_replacement "$interface")
    subnet_esc=$(sed_escape_replacement "$subnet")
    gateway_esc=$(sed_escape_replacement "$gateway")
    sed -i \
      -e "s|parent: eth0|parent: ${interface_esc}|" \
      -e "s|192\.168\.1\.0/24|${subnet_esc}|" \
      -e "s|192\.168\.1\.1|${gateway_esc}|" \
      "$macvlan_stack_file"

    stop_consul_stack
    deploy_consul_stack "$macvlan_stack_file" "部署 macvlan Consul 集群..."
    ;;
  c)
    echo "重启现有服务..."
    # --force makes swarm restart the tasks although the spec is unchanged.
    for service_name in consul-cluster_consul-master consul-cluster_consul-ash3c; do
      docker service update --force "$service_name"
    done
    echo "等待服务重启..."
    sleep 10
    echo "检查服务状态..."
    show_consul_services
    ;;
  *)
    echo "无效选择,退出"
    exit 1
    ;;
esac
# ---------------------------------------------------------------------------
# Step 4: post-repair verification and closing hints.
# Nothing in this part is allowed to abort the script: every docker call has
# a fallback message so the troubleshooting hints at the end are always shown.
# ---------------------------------------------------------------------------
echo
echo "4. 验证修复结果..."
sleep 5

# List the consul services. The fallback keeps grep's "no match" status from
# terminating the script under `set -e`.
echo "服务状态:"
docker service ls | grep consul || echo "未找到 consul 服务"
echo

echo "等待 30 秒后检查集群状态..."
sleep 30

# Show the last log lines of the master service, giving up after 10 seconds.
echo "尝试检查集群成员状态..."
timeout 10 docker service logs consul-cluster_consul-master --tail 10 2>/dev/null || echo "无法获取日志"
echo

# Closing message: where to find the UIs and what to check if the problem
# persists. The host names are placeholders for the operator to substitute.
echo "=== 修复完成 ==="
echo "请等待几分钟让集群完全启动,然后访问:"
echo "- Master UI: http://your-master-ip:8500"
echo "- Ash3c UI: http://your-ash3c-ip:8501"
echo
echo "如果问题仍然存在,请检查:"
echo "1. 节点间网络连通性"
echo "2. 防火墙设置"
echo "3. Docker Swarm 网络配置"