# Viewer metadata captured with this listing: 375 lines, 11 KiB, YAML
---
# ☢️ NUCLEAR NOMAD RESET ☢️
# A repair script that goes one step beyond the "ultimate" fix.
# WARNING: this completely destroys and rebuilds the Nomad cluster.
#
# For every host in `nomad_cluster`, one host at a time, the play:
#   1. stops Nomad, kills leftover processes, purges the package and removes
#      all Nomad files;
#   2. reinstalls the pinned Nomad version from the HashiCorp APT repository;
#   3. checks the Tailscale address and opens the firewall ports;
#   4. writes /etc/nomad.d/nomad.hcl (combined server + client agent);
#   5. writes /etc/systemd/system/nomad.service;
#   6. starts the service and reports its state.
- name: "☢️ NUCLEAR NOMAD RESET - 核弹级集群重置 ☢️"
  hosts: nomad_cluster
  become: true
  gather_facts: true
  serial: 1  # one node at a time, so all nodes are never blown up together
  vars:
    nomad_version: "1.10.5"
    # NOTE(review): the gossip encryption key is stored in plain text here;
    # consider moving it to Ansible Vault or another secret store.
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    # Tailscale address of each node, keyed by inventory hostname.
    tailscale_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"
    # `ansible_architecture` reports kernel names (x86_64, aarch64) while APT
    # expects Debian architecture names (amd64, arm64).
    nomad_deb_arch_map:
      x86_64: amd64
      aarch64: arm64
      armv7l: armhf

  tasks:
    # Fail before anything destructive happens if this host has no entry in
    # `tailscale_ips`; every later template lookup depends on it.
    - name: "🛡️ 预检:确认当前节点已配置 Tailscale IP"
      assert:
        that:
          - inventory_hostname in tailscale_ips
        fail_msg: "tailscale_ips 中没有节点 {{ inventory_hostname }} 的条目,已中止重置"

    - name: "🚨 警告:即将进行核弹级重置"
      debug:
        msg: |
          ☢️☢️☢️ 警告:即将对 {{ inventory_hostname }} 进行核弹级重置 ☢️☢️☢️
          这将完全摧毁所有 Nomad 相关的数据、配置和进程!
          如果你不确定,请立即按 Ctrl+C 取消!

    - name: "⏰ 等待 10 秒,给你最后的机会取消..."
      pause:
        seconds: 10

    # ========== Phase 1: nuclear cleanup ==========
    - name: "💀 第一阶段:核弹级进程清理"
      debug:
        msg: "开始核弹级进程清理..."

    # Best effort: the unit may not exist on a half-broken node.
    - name: "🔥 停止 Nomad 服务(如果存在)"
      systemd:
        name: nomad
        state: stopped
        enabled: false
        daemon_reload: true
      ignore_errors: true

    - name: "💣 强制杀死所有 Nomad 相关进程"
      shell: |
        # Match the exact process name (-x). Matching the full command line
        # (-f) would also hit the shell running this script, because its own
        # command line contains the word "nomad", and the script would kill
        # itself before finishing.
        pkill -9 -x nomad || true
        # Give the kernel a moment to reap the processes.
        sleep 5
        # Second pass for anything that was respawned in the meantime.
        pkill -9 -x nomad || true
      ignore_errors: true

    # Purge the package before deleting its files. Otherwise dpkg still
    # records the pinned version as installed, the later install task is a
    # no-op, and /usr/bin/nomad never comes back.
    - name: "🗑️ 彻底卸载已安装的 Nomad 软件包"
      apt:
        name: nomad
        state: absent
        purge: true
      ignore_errors: true

    - name: "🧹 清理所有 Nomad 相关文件和目录"
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /opt/nomad
        - /etc/nomad.d
        - /var/log/nomad
        - /etc/systemd/system/nomad.service
        - /usr/local/bin/nomad
        - /usr/bin/nomad
        - /var/lib/nomad
        - /run/nomad
        - /var/run/nomad.pid
      ignore_errors: true

    # The `file` module does not expand wildcards, so a literal
    # "/tmp/nomad*" path would never match anything. Resolve the glob with
    # `find` first and remove each hit.
    - name: "🔎 查找 /tmp 下的 Nomad 残留文件"
      find:
        paths: /tmp
        patterns: "nomad*"
        file_type: any
      register: nomad_tmp_leftovers

    - name: "🧹 清理 /tmp 下的 Nomad 残留文件"
      file:
        path: "{{ item.path }}"
        state: absent
      loop: "{{ nomad_tmp_leftovers.files }}"
      loop_control:
        label: "{{ item.path }}"
      ignore_errors: true

    - name: "🔧 清理 systemd 缓存"
      systemd:
        daemon_reload: true

    # ========== Phase 2: reinstall Nomad ==========
    - name: "🚀 第二阶段:重新安装 Nomad"
      debug:
        msg: "开始重新安装 Nomad..."

    # NOTE(review): `apt_key` is deprecated upstream; consider a keyring file
    # referenced through `signed-by` when the target distributions allow it.
    - name: "🔑 添加 HashiCorp GPG 密钥"
      apt_key:
        url: https://apt.releases.hashicorp.com/gpg
        state: present

    # Unknown architectures fall back to the raw fact value.
    - name: "📦 添加 HashiCorp APT 仓库"
      apt_repository:
        repo: >-
          deb [arch={{ nomad_deb_arch_map.get(ansible_architecture, ansible_architecture) }}]
          https://apt.releases.hashicorp.com
          {{ ansible_distribution_release }} main
        state: present
        update_cache: true

    - name: "🔧 安装 Nomad(自动检测架构)"
      apt:
        name: "nomad={{ nomad_version }}-1"
        state: present
        update_cache: true

    - name: "👤 创建 nomad 用户和组"
      group:
        name: nomad
        state: present

    - name: "👤 创建 nomad 用户"
      user:
        name: nomad
        group: nomad
        system: true
        shell: /bin/false
        home: /opt/nomad
        create_home: false

    # owner/group/mode fall back to nomad/nomad/0755 when an item omits them.
    - name: "📁 创建全新的目录结构"
      file:
        path: "{{ item.path }}"
        state: directory
        owner: "{{ item.owner | default('nomad') }}"
        group: "{{ item.group | default('nomad') }}"
        mode: "{{ item.mode | default('0755') }}"
      loop:
        - path: "/etc/nomad.d"
          mode: "0755"
        - path: "/opt/nomad"
          mode: "0755"
        - path: "/opt/nomad/data"
          mode: "0755"
        - path: "/opt/nomad/alloc_mounts"
          mode: "0755"
        - path: "/var/log/nomad"
          mode: "0755"

    # ========== Phase 3: network and firewall checks ==========
    - name: "🌐 第三阶段:网络配置验证"
      debug:
        msg: "验证网络配置..."

    # Read-only probe: -F matches the address literally (dots are not
    # wildcards) and the task never reports a change. The `|| echo` keeps the
    # return code at 0 so a missing address is reported, not fatal.
    - name: "🔍 检查 Tailscale IP 是否正确绑定"
      shell: |
        ip addr show | grep -F "{{ tailscale_ips[inventory_hostname] }}" || echo "IP_NOT_FOUND"
      register: ip_check
      changed_when: false

    - name: "⚠️ IP 地址检查结果"
      debug:
        msg: |
          节点: {{ inventory_hostname }}
          期望 IP: {{ tailscale_ips[inventory_hostname] }}
          检查结果: {{ ip_check.stdout }}
          {% if 'IP_NOT_FOUND' in ip_check.stdout %}
          ❌ 警告:IP 地址未正确绑定!
          {% else %}
          ✅ IP 地址检查通过
          {% endif %}

    # Serf gossip uses both TCP and UDP on 4648, so both are opened.
    - name: "🔥 确保防火墙端口开放"
      shell: |
        # 检查并开放 Nomad 端口
        if command -v ufw >/dev/null 2>&1; then
          ufw allow 4646/tcp  # HTTP API
          ufw allow 4647/tcp  # RPC
          ufw allow 4648/tcp  # Serf
          ufw allow 4648/udp  # Serf gossip (UDP)
        elif command -v firewall-cmd >/dev/null 2>&1; then
          firewall-cmd --permanent --add-port=4646/tcp
          firewall-cmd --permanent --add-port=4647/tcp
          firewall-cmd --permanent --add-port=4648/tcp
          firewall-cmd --permanent --add-port=4648/udp
          firewall-cmd --reload
        fi
      ignore_errors: true

    # ========== Phase 4: write the agent configuration ==========
    - name: "⚙️ 第四阶段:创建超强配置文件"
      debug:
        msg: "创建超强配置文件..."

    # The agent runs as server and client at once. Servers join each other
    # over Serf (4648), not RPC (4647). `advertise` is set explicitly because
    # the HTTP listener binds to 0.0.0.0.
    - name: "📝 创建核弹级 Nomad 配置"
      copy:
        content: |
          # ☢️ 核弹级 Nomad 配置 - {{ inventory_hostname }}
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"

          # 使用正确的 Tailscale IP
          bind_addr = "{{ tailscale_ips[inventory_hostname] }}"

          # 日志配置
          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
          log_rotate_duration = "24h"
          log_rotate_max_files = 5

          server {
            enabled = true
            bootstrap_expect = 3
            encrypt = "{{ nomad_encrypt_key }}"

            # 更激进的重试配置
            # Server-to-server join goes over the Serf port (4648).
            server_join {
              retry_join = [
                "{{ tailscale_ips.semaphore }}:4648",
                "{{ tailscale_ips.master }}:4648",
                "{{ tailscale_ips.ash3c }}:4648"
              ]
              retry_max = 10
              retry_interval = "15s"
            }

            # 更宽松的心跳配置
            heartbeat_grace = "30s"
            min_heartbeat_ttl = "10s"
            max_heartbeats_per_second = 50.0

            # Raft 配置优化
            raft_protocol = 3
            raft_multiplier = 1
          }

          client {
            enabled = true

            # 网络接口配置
            network_interface = "tailscale0"

            # 更宽松的心跳配置
            max_kill_timeout = "30s"

            # 主机卷配置
            host_volume "docker-sock" {
              path = "/var/run/docker.sock"
              read_only = false
            }
          }

          # 地址和端口配置
          addresses {
            http = "0.0.0.0"
            rpc = "{{ tailscale_ips[inventory_hostname] }}"
            serf = "{{ tailscale_ips[inventory_hostname] }}"
          }

          # Advertise the Tailscale address explicitly; the HTTP listener
          # binds to 0.0.0.0, which cannot be advertised to peers.
          advertise {
            http = "{{ tailscale_ips[inventory_hostname] }}"
            rpc = "{{ tailscale_ips[inventory_hostname] }}"
            serf = "{{ tailscale_ips[inventory_hostname] }}"
          }

          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }

          # Docker 插件配置
          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }

              # 更宽松的资源限制
              gc {
                image = true
                image_delay = "10m"
                container = true
                dangling_containers {
                  enabled = true
                  dry_run = false
                  period = "5m"
                  creation_grace = "5m"
                }
              }
            }
          }

          # 遥测配置
          telemetry {
            collection_interval = "10s"
            disable_hostname = false
            prometheus_metrics = true
            publish_allocation_metrics = true
            publish_node_metrics = true
          }
        dest: "/etc/nomad.d/nomad.hcl"
        owner: nomad
        group: nomad
        mode: "0640"

    # ========== Phase 5: write the systemd unit ==========
    # No User=/Group= lines: this agent is also a Nomad client using the
    # Docker driver with privileged containers, and clients need root.
    - name: "🔧 创建超强 systemd 服务文件"
      copy:
        content: |
          [Unit]
          Description=Nomad - Nuclear Edition
          Documentation=https://www.nomadproject.io/
          Wants=network-online.target
          After=network-online.target
          ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl

          [Service]
          Type=notify
          ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
          ExecReload=/bin/kill -HUP $MAINPID
          KillMode=process
          Restart=always
          RestartSec=10
          LimitNOFILE=65536

          # 更强的重启策略
          StartLimitInterval=0
          StartLimitBurst=10

          # 环境变量
          Environment=NOMAD_DISABLE_UPDATE_CHECK=1

          [Install]
          WantedBy=multi-user.target
        dest: "/etc/systemd/system/nomad.service"
        owner: root
        group: root
        mode: "0644"

    - name: "🔄 重新加载 systemd"
      systemd:
        daemon_reload: true

    # ========== Phase 6: start and verify ==========
    - name: "🚀 第六阶段:启动服务"
      debug:
        msg: "启动 Nomad 服务..."

    - name: "🔥 启用并启动 Nomad 服务"
      systemd:
        name: nomad
        enabled: true
        state: started
        daemon_reload: true

    - name: "⏰ 等待服务启动"
      pause:
        seconds: 15

    # Calling the module with only a name changes nothing; it is used here to
    # read back the unit status for the report below.
    - name: "🔍 验证服务状态"
      systemd:
        name: nomad
      register: nomad_service_status

    - name: "📊 显示服务状态"
      debug:
        msg: |
          ☢️ 核弹级重置完成!
          节点: {{ inventory_hostname }}
          服务状态: {{ nomad_service_status.status.ActiveState }}
          IP 地址: {{ tailscale_ips[inventory_hostname] }}

          {% if nomad_service_status.status.ActiveState == 'active' %}
          ✅ 服务启动成功!
          {% else %}
          ❌ 服务启动失败,请检查日志!
          {% endif %}

    - name: "🧹 清理临时文件"
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
        - "/tmp/nomad"
      ignore_errors: true

    - name: "🎉 核弹级重置完成通知"
      debug:
        msg: |
          ☢️☢️☢️ 核弹级重置完成!☢️☢️☢️

          节点 {{ inventory_hostname }} 已经被完全摧毁并重建!

          下一步:
          1. 等待所有节点完成重置
          2. 检查集群状态:nomad server members
          3. 检查节点状态:nomad node status
          4. 如果还有问题,那就真的没救了... 😅