# mgmt/scripts/utilities/ultimate-nomad-fix.yml

---
# Destructive single-node Nomad reset: stops the service, wipes all state,
# writes a fresh minimal server+client config, and restarts. Intended for a
# broken dev/test cluster — running this DELETES all jobs and allocations.
- name: Ultimate Nomad Cluster Fix - Complete Reset
  hosts: nomad_cluster
  become: true
  gather_facts: true
  vars:
    # SECURITY(review): gossip encryption key stored in plaintext in VCS —
    # move to Ansible Vault / a secret store and rotate this key.
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
  tasks:
    - name: Stop and disable nomad service completely
      systemd:
        name: nomad
        state: stopped
        enabled: false
        daemon_reload: true
      # Service may not exist yet on a fresh host — best-effort.
      ignore_errors: true

    - name: Kill any remaining nomad processes
      # `|| true` requires shell (not command); pkill exits 1 when no
      # process matches, which is the expected clean case here.
      shell: pkill -f nomad || true
      ignore_errors: true

    - name: Remove all nomad data and state
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /opt/nomad/data
        - /etc/nomad.d/nomad.hcl
        - /var/log/nomad

    - name: Create clean nomad directories
      file:
        path: "{{ item }}"
        state: directory
        owner: nomad
        group: nomad
        # Octal mode as a quoted string to avoid YAML octal-int surprises.
        mode: "0755"
      loop:
        - /etc/nomad.d
        - /opt/nomad
        - /opt/nomad/data
        - /opt/nomad/alloc_mounts
        - /var/log/nomad

    - name: Create minimal nomad configuration
      copy:
        content: |
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"
          bind_addr = "{{ ansible_default_ipv4.address }}"
          server {
            enabled = true
            bootstrap_expect = 1
            encrypt = "{{ nomad_encrypt_key }}"
          }
          client {
            enabled = true
            alloc_dir = "/opt/nomad/alloc_mounts"
          }
          ui {
            enabled = true
          }
          addresses {
            http = "0.0.0.0"
            rpc = "{{ ansible_default_ipv4.address }}"
            serf = "{{ ansible_default_ipv4.address }}"
          }
          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }
          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: "0640"

    - name: Enable and start nomad service
      systemd:
        name: nomad
        state: started
        enabled: true
        daemon_reload: true

    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ ansible_default_ipv4.address }}"
        delay: 10
        timeout: 60

    - name: Check nomad status
      uri:
        url: "http://{{ ansible_default_ipv4.address }}:4646/v1/status/leader"
        method: GET
      register: nomad_leader
      # BUG fix: `retries` is inert without `until` — the original never
      # retried. Retry until the leader endpoint answers 200.
      until: nomad_leader.status == 200
      retries: 5
      delay: 5
      # Leader election may legitimately still be in progress; report below.
      ignore_errors: true

    - name: Display nomad status
      debug:
        msg: "Nomad leader status: {{ nomad_leader.json if nomad_leader.json is defined else 'No leader elected yet' }}"