# Source path: mgmt/scripts/utilities/complete-nomad-reset.yml
# (file-viewer metadata from the original listing: 151 lines, 3.6 KiB, YAML)

---
# Play: wipe and rebuild the Nomad agent on every host in `nomad_cluster`.
#
# For each host, one at a time, this play:
#   1. checks that the host has an entry in `tailscale_ips`,
#   2. stops the nomad service and kills leftover nomad processes,
#   3. deletes the contents of /opt/nomad/data and /var/log/nomad,
#   4. writes a fresh /etc/nomad.d/nomad.hcl bound to the host's Tailscale IP,
#   5. starts nomad, waits for port 4646, and prints `systemctl status`.
#
# WARNING: this is destructive. All Nomad state on the host is removed.
- name: Complete Nomad Cluster Reset and Rebuild
  hosts: nomad_cluster
  become: true
  serial: 1  # process one node at a time
  vars:
    # NOTE(review): this gossip encryption key is stored in plain text.
    # Consider moving it to Ansible Vault (or another secret store) and
    # rotating it if this file has been committed to version control.
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    # Maps inventory hostname -> Tailscale IP. Every host in `nomad_cluster`
    # must have an entry here; the first task enforces that.
    tailscale_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"
  tasks:
    # Fail before anything is stopped or deleted. Without this check a host
    # missing from `tailscale_ips` would have its data wiped and only then
    # fail while rendering the configuration.
    - name: Verify this host has a Tailscale IP mapping before touching state
      assert:
        that:
          - inventory_hostname in tailscale_ips
        fail_msg: "No entry for {{ inventory_hostname }} in tailscale_ips; refusing to reset."

    # Best effort: the unit may be missing or already stopped.
    - name: Stop nomad service completely
      systemd:
        name: nomad
        state: stopped
      ignore_errors: true

    # `pkill -x` matches the exact process name. The previous `pkill -f nomad`
    # run through a shell also matched the wrapping `sh -c` process (its
    # command line contains "nomad") and any unrelated process whose command
    # line mentions nomad.
    # pkill exits 0 when something was signalled and 1 when nothing matched;
    # any other exit status is treated as a real failure.
    - name: Kill any remaining nomad processes
      command: pkill -x nomad
      register: nomad_pkill
      changed_when: nomad_pkill.rc == 0
      failed_when: nomad_pkill.rc not in [0, 1]

    # `find -mindepth 1 -delete` removes everything inside the data directory,
    # hidden entries included, without the `.*` glob that also expands to
    # `.` and `..`. Errors stay ignored so the reset remains best effort.
    - name: Remove all nomad data and state
      shell: |
        if [ -d /opt/nomad/data ]; then
          find /opt/nomad/data -mindepth 1 -delete
        fi
        rm -rf /var/log/nomad/*
      ignore_errors: true

    # Renders the agent configuration. The agent binds to this host's
    # Tailscale IP and lists all three Tailscale IPs in `retry_join`.
    # NOTE(review): Nomad's documented agent block for the web UI is `ui`;
    # `ui_config` is the Consul spelling. Confirm the deployed Nomad version
    # accepts this block. It is left unchanged here.
    - name: Create fresh nomad configuration with correct Tailscale IPs
      copy:
        content: |
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"
          # 使用 Tailscale IP 地址
          bind_addr = "{{ tailscale_ips[inventory_hostname] }}"
          server {
            enabled = true
            bootstrap_expect = 3
            encrypt = "{{ nomad_encrypt_key }}"
            server_join {
              retry_join = [
                "{{ tailscale_ips.semaphore }}",
                "{{ tailscale_ips.master }}",
                "{{ tailscale_ips.ash3c }}"
              ]
            }
          }
          client {
            enabled = true
            network_interface = "tailscale0"
          }
          ui_config {
            enabled = true
          }
          addresses {
            http = "0.0.0.0"
            rpc = "{{ tailscale_ips[inventory_hostname] }}"
            serf = "{{ tailscale_ips[inventory_hostname] }}"
          }
          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }
          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }
          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    # The configuration above sets log_file to /var/log/nomad/nomad.log, so
    # the directory must exist and be writable by the nomad user.
    - name: Ensure log directory exists
      file:
        path: /var/log/nomad
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'

    - name: Start nomad service
      systemd:
        name: nomad
        state: started
        enabled: true

    # Waits 5 seconds, then polls the HTTP port on the Tailscale IP for up to
    # 30 seconds.
    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ tailscale_ips[inventory_hostname] }}"
        delay: 5
        timeout: 30

    # Diagnostic only: never reports "changed", and a non-zero exit from
    # systemctl does not abort the play.
    - name: Check nomad service status
      command: systemctl status nomad --no-pager -l
      register: nomad_status
      changed_when: false
      ignore_errors: true

    - name: Display nomad status
      debug:
        var: nomad_status.stdout_lines
# Play: pause on the control node so the restarted servers have time to form
# a cluster before the next play runs.
- name: Wait for cluster to form
  hosts: localhost
  gather_facts: false
  tasks:
    # Fixed 30-second delay. The prompt text is Chinese for
    # "Waiting for the cluster to form...".
    - name: Wait for cluster formation
      pause:
        prompt: "等待集群形成..."
        seconds: 30
# Play: print the cluster's server membership and node list, queried from the
# `semaphore` host.
#
# Both checks are read-only diagnostics. They run through `command` because
# no shell features are needed, and they never report "changed". Failures are
# shown in the output but do not abort the play.
- name: Verify cluster status
  hosts: semaphore
  become: true
  tasks:
    - name: Check cluster members
      command: nomad server members
      register: cluster_members
      changed_when: false
      ignore_errors: true

    - name: Display cluster members
      debug:
        var: cluster_members.stdout_lines

    - name: Check node status
      command: nomad node status
      register: node_status
      changed_when: false
      ignore_errors: true

    - name: Display node status
      debug:
        var: node_status.stdout_lines