mgmt/scripts/utilities/NUCLEAR-NOMAD-RESET.yml

375 lines
11 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

---
# ☢️ NUCLEAR NOMAD RESET ☢️
# A repair script even more drastic than the "ultimate" one.
# WARNING: this completely destroys and rebuilds the Nomad cluster.
- name: "☢️ NUCLEAR NOMAD RESET - 核弹级集群重置 ☢️"
  hosts: nomad_cluster
  # One node per batch, so the whole cluster is never torn down at once.
  serial: 1
  become: true
  gather_facts: true
  vars:
    nomad_version: "1.10.5"
    # NOTE(review): the gossip encryption key is committed in plain text;
    # consider supplying it from Ansible Vault or extra-vars instead.
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    # Address of each node, looked up elsewhere by inventory hostname.
    tailscale_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"
  tasks:
# Announce what is about to happen to the current host.
- name: "🚨 警告:即将进行核弹级重置"
  ansible.builtin.debug:
    msg: |
      ☢️☢️☢️ 警告:即将对 {{ inventory_hostname }} 进行核弹级重置 ☢️☢️☢️
      这将完全摧毁所有 Nomad 相关的数据、配置和进程!
      如果你不确定,请立即按 Ctrl+C 取消!

# Ten-second grace period in which the operator can still abort the run.
- name: "⏰ 等待 10 秒,给你最后的机会取消..."
  ansible.builtin.pause:
    seconds: 10
# ========== Phase 1: nuclear-grade cleanup ==========
# Progress banner only; no state is changed by this task.
- name: "💀 第一阶段:核弹级进程清理"
  ansible.builtin.debug:
    msg: "开始核弹级进程清理..."
# Stop and disable the nomad unit. Errors are ignored so the play carries on
# when the unit does not exist on this host.
- name: "🔥 停止 Nomad 服务(如果存在)"
  ansible.builtin.systemd:
    name: nomad
    daemon_reload: true
    enabled: false
    state: stopped
  ignore_errors: true
# SIGKILL every remaining Nomad process, wait, then sweep once more.
#
# Processes are matched by exact process name (-x) instead of against the
# full command line (-f). This script is executed as `sh -c '<script>'`, so
# its own command line contains the word "nomad"; a `pkill -f nomad` therefore
# kills the shell running the script, which aborts the remaining steps and
# makes the task fail every time. A -f match also hits unrelated processes
# that merely mention "nomad" in their arguments.
- name: "💣 强制杀死所有 Nomad 相关进程"
  ansible.builtin.shell: |
    # 杀死所有 nomad 进程
    pkill -9 -x nomad || true
    # 等待进程完全死亡
    sleep 5
    # 再次确认杀死
    pkill -9 -x nomad || true
  ignore_errors: true
# Purge the apt package before deleting its files. Otherwise dpkg still
# records nomad as installed, the later `state: present` install does nothing,
# and the deleted /usr/bin/nomad is never restored.
- name: "📦 卸载已安装的 Nomad 软件包"
  ansible.builtin.apt:
    name: nomad
    state: absent
    purge: true
  ignore_errors: true

# The file module does not expand shell globs, so a literal "/tmp/nomad*"
# entry removes nothing. Resolve the matching paths explicitly instead.
- name: "🔍 查找 /tmp 下的 Nomad 临时文件"
  ansible.builtin.find:
    paths: /tmp
    patterns: "nomad*"
    file_type: any
  register: nomad_tmp_matches
  ignore_errors: true

# Delete the fixed Nomad locations plus whatever the /tmp search returned.
# default([]) keeps the loop valid if the search above failed.
- name: "🧹 清理所有 Nomad 相关文件和目录"
  ansible.builtin.file:
    path: "{{ item }}"
    state: absent
  loop: >-
    {{ nomad_static_paths
       + (nomad_tmp_matches.files | default([]) | map(attribute='path') | list) }}
  vars:
    nomad_static_paths:
      - /opt/nomad
      - /etc/nomad.d
      - /var/log/nomad
      - /etc/systemd/system/nomad.service
      - /usr/local/bin/nomad
      - /usr/bin/nomad
      - /var/lib/nomad
      - /run/nomad
      - /var/run/nomad.pid
  ignore_errors: true
# Have systemd re-read its unit definitions after the cleanup.
- name: "🔧 清理 systemd 缓存"
  ansible.builtin.systemd:
    daemon_reload: true
# ========== Phase 2: reinstall Nomad ==========
# Progress banner only; no state is changed by this task.
- name: "🚀 第二阶段:重新安装 Nomad"
  ansible.builtin.debug:
    msg: "开始重新安装 Nomad..."
# Trust the HashiCorp package signing key.
# NOTE(review): apt_key depends on the deprecated apt-key tool; a keyring
# file referenced via signed-by may be needed on newer distros - confirm.
- name: "🔑 添加 HashiCorp GPG 密钥"
  ansible.builtin.apt_key:
    state: present
    url: https://apt.releases.hashicorp.com/gpg
# Register the HashiCorp apt repository and refresh the package cache.
#
# ansible_architecture reports the kernel machine name (x86_64, aarch64, ...),
# whereas apt's [arch=...] option expects the Debian architecture name
# (amd64, arm64, ...). Passing the raw fact makes apt look for a
# binary-x86_64 index that the repository does not publish, so the nomad
# package can never be found. Unknown machine names fall through unchanged.
- name: "📦 添加 HashiCorp APT 仓库"
  vars:
    hashicorp_apt_arch: >-
      {{ {'x86_64': 'amd64', 'aarch64': 'arm64', 'armv7l': 'armhf'}.get(ansible_architecture, ansible_architecture) }}
  ansible.builtin.apt_repository:
    repo: "deb [arch={{ hashicorp_apt_arch }}] https://apt.releases.hashicorp.com {{ ansible_distribution_release }} main"
    state: present
    update_cache: true
# Install the pinned Nomad build; "-1" is the package revision suffix
# appended to nomad_version.
- name: "🔧 安装 Nomad自动检测架构"
  ansible.builtin.apt:
    update_cache: true
    state: present
    name: "nomad={{ nomad_version }}-1"
# The group has to exist before the user that uses it as its primary group.
- name: "👤 创建 nomad 用户和组"
  ansible.builtin.group:
    state: present
    name: nomad

# System account with no login shell; its home points at /opt/nomad but the
# directory is not created here.
- name: "👤 创建 nomad 用户"
  ansible.builtin.user:
    name: nomad
    group: nomad
    system: true
    home: /opt/nomad
    create_home: false
    shell: /bin/false
# Recreate the directory tree. Each entry may carry owner/group/mode;
# anything omitted falls back to nomad:nomad and 0755.
- name: "📁 创建全新的目录结构"
  ansible.builtin.file:
    state: directory
    path: "{{ item.path }}"
    owner: "{{ item.owner | default('nomad') }}"
    group: "{{ item.group | default('nomad') }}"
    mode: "{{ item.mode | default('0755') }}"
  loop:
    - path: "/etc/nomad.d"
      mode: "0755"
    - path: "/opt/nomad"
      mode: "0755"
    - path: "/opt/nomad/data"
      mode: "0755"
    - path: "/opt/nomad/alloc_mounts"
      mode: "0755"
    - path: "/var/log/nomad"
      mode: "0755"
# ========== Phase 3: network and firewall checks ==========
# Progress banner only; no state is changed by this task.
- name: "🌐 第三阶段:网络配置验证"
  ansible.builtin.debug:
    msg: "验证网络配置..."
# Check that the expected address is configured on some interface. On a miss
# the sentinel IP_NOT_FOUND is printed, and the command still exits 0.
#
# -F makes grep treat the address as a literal string (the dots would
# otherwise match any character) and -w rejects partial matches inside a
# longer number. The task only reads state, so it never reports "changed".
- name: "🔍 检查 Tailscale IP 是否正确绑定"
  ansible.builtin.shell: |
    ip addr show | grep -Fw "{{ tailscale_ips[inventory_hostname] }}" || echo "IP_NOT_FOUND"
  register: ip_check
  changed_when: false
# Print the outcome of the address check; the IP_NOT_FOUND sentinel in the
# registered stdout selects the warning branch.
- name: "⚠️ IP 地址检查结果"
  ansible.builtin.debug:
    msg: |
      节点: {{ inventory_hostname }}
      期望 IP: {{ tailscale_ips[inventory_hostname] }}
      检查结果: {{ ip_check.stdout }}
      {% if 'IP_NOT_FOUND' in ip_check.stdout %}
      ❌ 警告IP 地址未正确绑定!
      {% else %}
      ✅ IP 地址检查通过
      {% endif %}
# Open the Nomad ports with whichever firewall front end is installed
# (ufw first, otherwise firewalld); hosts with neither are left alone.
#
# Serf gossip on 4648 uses UDP as well as TCP, so both protocols are opened.
# With TCP alone, gossip traffic between servers is dropped.
- name: "🔥 确保防火墙端口开放"
  ansible.builtin.shell: |
    # 检查并开放 Nomad 端口
    if command -v ufw >/dev/null 2>&1; then
      ufw allow 4646/tcp # HTTP API
      ufw allow 4647/tcp # RPC
      ufw allow 4648/tcp # Serf
      ufw allow 4648/udp # Serf
    elif command -v firewall-cmd >/dev/null 2>&1; then
      firewall-cmd --permanent --add-port=4646/tcp
      firewall-cmd --permanent --add-port=4647/tcp
      firewall-cmd --permanent --add-port=4648/tcp
      firewall-cmd --permanent --add-port=4648/udp
      firewall-cmd --reload
    fi
  ignore_errors: true
# ========== Phase 4: write the agent configuration ==========
# Progress banner only; no state is changed by this task.
- name: "⚙️ 第四阶段:创建超强配置文件"
  ansible.builtin.debug:
    msg: "创建超强配置文件..."
# Render /etc/nomad.d/nomad.hcl for this node. The agent runs as server and
# client at once, and binds RPC/Serf to the node's entry in tailscale_ips.
#
# Fix: inside the server block, server_join addresses are used for the
# server-to-server gossip join, which happens on the Serf port (4648). The
# previous entries pointed at the RPC port (4647), so servers could never
# discover each other and bootstrap_expect was never satisfied.
#
# The file is 0640 because it embeds the gossip encryption key.
- name: "📝 创建核弹级 Nomad 配置"
  ansible.builtin.copy:
    dest: "/etc/nomad.d/nomad.hcl"
    owner: nomad
    group: nomad
    mode: "0640"
    content: |
      # ☢️ 核弹级 Nomad 配置 - {{ inventory_hostname }}
      datacenter = "dc1"
      region = "global"
      data_dir = "/opt/nomad/data"

      # 使用正确的 Tailscale IP
      bind_addr = "{{ tailscale_ips[inventory_hostname] }}"

      # 日志配置
      log_level = "INFO"
      log_file = "/var/log/nomad/nomad.log"
      log_rotate_duration = "24h"
      log_rotate_max_files = 5

      server {
        enabled = true
        bootstrap_expect = 3
        encrypt = "{{ nomad_encrypt_key }}"

        # 更激进的重试配置
        server_join {
          retry_join = [
            "{{ tailscale_ips.semaphore }}:4648",
            "{{ tailscale_ips.master }}:4648",
            "{{ tailscale_ips.ash3c }}:4648"
          ]
          retry_max = 10
          retry_interval = "15s"
        }

        # 更宽松的心跳配置
        heartbeat_grace = "30s"
        min_heartbeat_ttl = "10s"
        max_heartbeats_per_second = 50.0

        # Raft 配置优化
        raft_protocol = 3
        raft_multiplier = 1
      }

      client {
        enabled = true

        # 网络接口配置
        network_interface = "tailscale0"

        # 更宽松的心跳配置
        max_kill_timeout = "30s"

        # 主机卷配置
        host_volume "docker-sock" {
          path = "/var/run/docker.sock"
          read_only = false
        }
      }

      # 地址和端口配置
      addresses {
        http = "0.0.0.0"
        rpc = "{{ tailscale_ips[inventory_hostname] }}"
        serf = "{{ tailscale_ips[inventory_hostname] }}"
      }

      ports {
        http = 4646
        rpc = 4647
        serf = 4648
      }

      # Docker 插件配置
      plugin "docker" {
        config {
          allow_privileged = true

          volumes {
            enabled = true
          }

          # 更宽松的资源限制
          gc {
            image = true
            image_delay = "10m"
            container = true

            dangling_containers {
              enabled = true
              dry_run = false
              period = "5m"
              creation_grace = "5m"
            }
          }
        }
      }

      # 遥测配置
      telemetry {
        collection_interval = "10s"
        disable_hostname = false
        prometheus_metrics = true
        publish_allocation_metrics = true
        publish_node_metrics = true
      }
# ========== Phase 5: systemd service ==========
# Install a unit file under /etc/systemd/system.
#
# Changes from the previous unit:
# - The agent now runs as root. The rendered agent config enables the client
#   and the Docker driver, and a Nomad client needs root for allocation
#   mounts and task isolation. As the unprivileged nomad user it also cannot
#   reach /var/run/docker.sock.
# - The start-rate-limit settings live in [Unit] under their current name
#   (StartLimitIntervalSec); the [Service] spelling is a legacy alias.
#   An interval of 0 disables rate limiting, so Restart=always never gives up.
- name: "🔧 创建超强 systemd 服务文件"
  ansible.builtin.copy:
    dest: "/etc/systemd/system/nomad.service"
    owner: root
    group: root
    mode: "0644"
    content: |
      [Unit]
      Description=Nomad - Nuclear Edition
      Documentation=https://www.nomadproject.io/
      Wants=network-online.target
      After=network-online.target
      ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
      # 更强的重启策略
      StartLimitIntervalSec=0
      StartLimitBurst=10

      [Service]
      Type=notify
      User=root
      Group=root
      ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
      ExecReload=/bin/kill -HUP $MAINPID
      KillMode=process
      Restart=always
      RestartSec=10
      LimitNOFILE=65536
      # 环境变量
      Environment=NOMAD_DISABLE_UPDATE_CHECK=1

      [Install]
      WantedBy=multi-user.target
# Have systemd re-read its unit files.
- name: "🔄 重新加载 systemd"
  ansible.builtin.systemd:
    daemon_reload: true
# ========== Phase 6: start and verify ==========
# Progress banner only; no state is changed by this task.
- name: "🚀 第六阶段:启动服务"
  ansible.builtin.debug:
    msg: "启动 Nomad 服务..."
# Enable the unit at boot and start it now, reloading systemd first.
- name: "🔥 启用并启动 Nomad 服务"
  ansible.builtin.systemd:
    name: nomad
    daemon_reload: true
    state: started
    enabled: true
# Fixed 15-second delay before the service state is inspected.
- name: "⏰ 等待服务启动"
  ansible.builtin.pause:
    seconds: 15
# Query the unit without changing it; the registered result carries the
# unit's properties, including ActiveState.
- name: "🔍 验证服务状态"
  ansible.builtin.systemd:
    name: nomad
  register: nomad_service_status

# Human-readable summary for this node.
- name: "📊 显示服务状态"
  ansible.builtin.debug:
    msg: |
      ☢️ 核弹级重置完成!
      节点: {{ inventory_hostname }}
      服务状态: {{ nomad_service_status.status.ActiveState }}
      IP 地址: {{ tailscale_ips[inventory_hostname] }}
      {% if nomad_service_status.status.ActiveState == 'active' %}
      ✅ 服务启动成功!
      {% else %}
      ❌ 服务启动失败,请检查日志!
      {% endif %}

# Printing the failure is not enough: without a hard stop the play would
# carry on and wipe the next node even though this one never came back.
# Failing here marks the host as failed so the rolling reset halts.
- name: "🛑 服务未运行则中止重置"
  ansible.builtin.assert:
    that:
      - nomad_service_status.status.ActiveState == 'active'
    fail_msg: "Nomad 服务在 {{ inventory_hostname }} 上未处于 active 状态,已中止重置"
    quiet: true
# Best-effort removal of leftover download artifacts; errors are ignored.
- name: "🧹 清理临时文件"
  ansible.builtin.file:
    state: absent
    path: "{{ item }}"
  loop:
    - "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
    - "/tmp/nomad"
  ignore_errors: true
# Closing message listing the follow-up checks for the operator.
- name: "🎉 核弹级重置完成通知"
  ansible.builtin.debug:
    msg: |
      ☢️☢️☢️ 核弹级重置完成!☢️☢️☢️
      节点 {{ inventory_hostname }} 已经被完全摧毁并重建!
      下一步:
      1. 等待所有节点完成重置
      2. 检查集群状态nomad server members
      3. 检查节点状态nomad node status
      4. 如果还有问题,那就真的没救了... 😅