# Source path: mgmt/ansible/fix-nomad-nodes.yml
#
# Original listing metadata: 74 lines
# 1.8 KiB
# YAML

---
# Repairs the Nomad client configuration on every host in `nomad_cluster`:
# stop the agent, back up the current config file, render a fresh one from
# the template, bring the agent back up, then probe its HTTP API and report
# OK/ERROR per host.
- name: 修复 Nomad 节点配置
  hosts: nomad_cluster
  become: true
  vars:
    # Neither variable is referenced by a task in this play; presumably both
    # are consumed by nomad-client.hcl.j2 -- TODO confirm against the template.
    nomad_datacenter: "dc1"
    consul_servers:
      - "ash3c.tailnet-68f9.ts.net:8500"
      - "ch4.tailnet-68f9.ts.net:8500"
      - "warden.tailnet-68f9.ts.net:8500"

  tasks:
    - name: 检查节点当前状态
      debug:
        msg: "正在修复节点: {{ inventory_hostname }}"

    # Best effort: errors from stopping the unit are deliberately ignored so
    # the repair continues even when the service cannot be stopped.
    - name: 停止 Nomad 服务
      systemd:
        name: nomad
        state: stopped
      ignore_errors: true

    # Look for an existing config first so the backup can be skipped cleanly
    # when there is nothing to back up, instead of ignoring every copy error.
    - name: 检查现有配置是否存在
      stat:
        path: /etc/nomad.d/nomad.hcl
      register: nomad_config_stat

    # No ignore_errors here: if a config exists but cannot be backed up, the
    # host fails before the template task overwrites the file.
    # ansible_date_time comes from gathered facts (fact gathering is not
    # disabled in this play).
    - name: 备份现有配置
      copy:
        src: /etc/nomad.d/nomad.hcl
        dest: "/etc/nomad.d/nomad.hcl.backup.{{ ansible_date_time.epoch }}"
        remote_src: true
      when: nomad_config_stat.stat.exists

    - name: 创建 Nomad 配置目录
      file:
        path: /etc/nomad.d
        state: directory
        mode: '0755'

    - name: 生成 Nomad 客户端配置
      template:
        src: nomad-client.hcl.j2
        dest: /etc/nomad.d/nomad.hcl
        mode: '0644'
      notify: restart nomad

    # Run the notified restart now rather than at the end of the play.
    # Otherwise the agent would be restarted again after the verification
    # below, and the reported status would not reflect the final state.
    - name: 立即执行已触发的处理程序
      meta: flush_handlers

    # Ensures the unit is running and enabled even when the template did not
    # change (in which case the handler above was never triggered).
    - name: 启动 Nomad 服务
      systemd:
        name: nomad
        state: started
        enabled: true

    # Best effort: a timeout here is ignored; the outcome is reported by the
    # final task instead of aborting the host.
    - name: 等待 Nomad 服务启动
      wait_for:
        port: 4646
        host: "{{ inventory_hostname }}.tailnet-68f9.ts.net"
        delay: 5
        timeout: 30
      ignore_errors: true

    # The result is registered and failures are ignored so the summary task
    # below can print OK or ERROR for every host.
    - name: 验证 Nomad 节点状态
      uri:
        url: "http://{{ inventory_hostname }}.tailnet-68f9.ts.net:4646/v1/agent/self"
        method: GET
      register: nomad_status
      ignore_errors: true

    # default(0) keeps this task from erroring with an undefined attribute
    # when the failed request result carries no `status` key.
    - name: 显示修复结果
      debug:
        msg:
          - "节点 {{ inventory_hostname }} 修复完成"
          - "Nomad 状态: {{ 'OK' if (nomad_status.status | default(0)) == 200 else 'ERROR' }}"

  handlers:
    # Triggered by the template task when the rendered config changes.
    - name: restart nomad
      systemd:
        name: nomad
        state: restarted