FIX: Unify the Ansible inventory and add a playbook to repair Nomad nodes

- Use the ben/3131 credentials everywhere
- Remove the duplicate pve inventory
- Add fix-nomad-nodes.yml to repair the unhealthy nodes (run sketch below)
- Create the Nomad client template from warden's known-good configuration
- Repair targets: ch4, hcp1, warden, ash1d (ash2e times out on connect)
Houzhong Xu 2025-10-09 13:03:03 +00:00
parent 1426d5b526
commit 09dca62603
3 changed files with 254 additions and 5 deletions
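
To apply the fix to just the nodes called out above, the playbook run can be limited to those hosts. A minimal run sketch, assuming the unified inventory file is named inventory.yml (the actual filename is not shown here):

# Run the repair playbook against the four reachable targets only
ansible-playbook -i inventory.yml fix-nomad-nodes.yml --limit "ch4,hcp1,warden,ash1d"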

fix-nomad-nodes.yml

@@ -0,0 +1,74 @@
---
- name: Repair Nomad node configuration
  hosts: nomad_cluster
  become: yes
  vars:
    nomad_datacenter: "dc1"
    consul_servers:
      - "ash3c.tailnet-68f9.ts.net:8500"
      - "ch4.tailnet-68f9.ts.net:8500"
      - "warden.tailnet-68f9.ts.net:8500"

  tasks:
    - name: Check current node status
      debug:
        msg: "Repairing node: {{ inventory_hostname }}"

    - name: Stop the Nomad service
      systemd:
        name: nomad
        state: stopped
      ignore_errors: yes

    - name: Back up the existing configuration
      copy:
        src: /etc/nomad.d/nomad.hcl
        dest: /etc/nomad.d/nomad.hcl.backup.{{ ansible_date_time.epoch }}
        remote_src: yes
      ignore_errors: yes

    - name: Create the Nomad configuration directory
      file:
        path: /etc/nomad.d
        state: directory
        mode: '0755'

    - name: Render the Nomad client configuration
      template:
        src: nomad-client.hcl.j2
        dest: /etc/nomad.d/nomad.hcl
        mode: '0644'
      notify: restart nomad

    - name: Start and enable the Nomad service
      systemd:
        name: nomad
        state: started
        enabled: yes

    - name: Wait for the Nomad service to come up
      wait_for:
        port: 4646
        host: "{{ inventory_hostname }}.tailnet-68f9.ts.net"
        delay: 5
        timeout: 30
      ignore_errors: yes

    - name: Verify the Nomad node status
      uri:
        url: "http://{{ inventory_hostname }}.tailnet-68f9.ts.net:4646/v1/agent/self"
        method: GET
      register: nomad_status
      ignore_errors: yes

    - name: Show the repair result
      debug:
        msg:
          - "Node {{ inventory_hostname }} repair complete"
          - "Nomad status: {{ 'OK' if nomad_status.status == 200 else 'ERROR' }}"

  handlers:
    - name: restart nomad
      systemd:
        name: nomad
        state: restarted
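
Once the play has run, the repaired clients should report as ready on the servers. A quick verification sketch, assuming the hosts in consul_servers also run the Nomad servers (which is what the client template below points at):

# List client nodes as seen by the ash3c server
nomad node status -address=http://ash3c.tailnet-68f9.ts.net:4646

# Query one repaired node's agent endpoint directly (the same check the playbook performs)
curl -s http://ch4.tailnet-68f9.ts.net:4646/v1/agent/self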


@@ -1,10 +1,81 @@
---
all:
children:
warden:
pve_cluster:
hosts:
warden:
ansible_host: 100.122.197.112
nuc12:
ansible_host: nuc12
ansible_user: root
ansible_ssh_pass: "Aa313131@ben"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
xgp:
ansible_host: xgp
ansible_user: root
ansible_ssh_pass: "Aa313131@ben"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
pve:
ansible_host: pve
ansible_user: root
ansible_ssh_pass: "Aa313131@ben"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
vars:
ansible_python_interpreter: /usr/bin/python3
nomad_cluster:
hosts:
ch4:
ansible_host: ch4.tailnet-68f9.ts.net
ansible_user: ben
ansible_password: "3131"
ansible_become_password: "3131"
ansible_ssh_pass: "3131"
ansible_become_pass: "3131"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
hcp1:
ansible_host: hcp1.tailnet-68f9.ts.net
ansible_user: ben
ansible_ssh_pass: "3131"
ansible_become_pass: "3131"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
ash3c:
ansible_host: ash3c.tailnet-68f9.ts.net
ansible_user: ben
ansible_ssh_pass: "3131"
ansible_become_pass: "3131"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
warden:
ansible_host: warden.tailnet-68f9.ts.net
ansible_user: ben
ansible_ssh_pass: "3131"
ansible_become_pass: "3131"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
onecloud1:
ansible_host: onecloud1.tailnet-68f9.ts.net
ansible_user: ben
ansible_ssh_pass: "3131"
ansible_become_pass: "3131"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
influxdb1:
ansible_host: influxdb1.tailnet-68f9.ts.net
ansible_user: ben
ansible_ssh_pass: "3131"
ansible_become_pass: "3131"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
browser:
ansible_host: browser.tailnet-68f9.ts.net
ansible_user: ben
ansible_ssh_pass: "3131"
ansible_become_pass: "3131"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
ash1d:
ansible_host: ash1d.tailnet-68f9.ts.net
ansible_user: ben
ansible_ssh_pass: "3131"
ansible_become_pass: "3131"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
ash2e:
ansible_host: ash2e.tailnet-68f9.ts.net
ansible_user: ben
ansible_ssh_pass: "3131"
ansible_become_pass: "3131"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
vars:
ansible_python_interpreter: /usr/bin/python3
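
All nomad_cluster hosts now share the same ben/3131 connection settings, so the per-host repetition could later be collapsed into group variables. A possible refactor sketch (hypothetical group_vars/nomad_cluster.yml, not part of this commit):

# group_vars/nomad_cluster.yml (hypothetical) -- shared connection settings for the group
ansible_user: ben
ansible_ssh_pass: "3131"
ansible_become_pass: "3131"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
ansible_python_interpreter: /usr/bin/python3

Encrypting these values with ansible-vault would also keep the password out of plain text.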

nomad-client.hcl.j2

@@ -0,0 +1,104 @@
datacenter = "{{ nomad_datacenter }}"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "{{ inventory_hostname }}"

bind_addr = "{{ inventory_hostname }}.tailnet-68f9.ts.net"

addresses {
  http = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
  rpc  = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
  serf = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
}

advertise {
  http = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4646"
  rpc  = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4647"
  serf = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4648"
}

ports {
  http = 4646
  rpc  = 4647
  serf = 4648
}

server {
  enabled = false
}

client {
  enabled = true
  network_interface = "tailscale0"

  # Nomad servers to join (derived from the Consul server hosts, RPC port 4647)
  servers = [
{% for server in consul_servers %}
    "{{ server.split(':')[0] }}:4647"{{ ',' if not loop.last else '' }}
{% endfor %}
  ]

  # Host volumes
  host_volume "fnsync" {
    path = "/mnt/fnsync"
    read_only = false
  }

  host_volume "vault-storage" {
    path = "/opt/nomad/data/vault-storage"
    read_only = false
  }

  # Driver options
  options {
    "driver.raw_exec.enable" = "1"
    "driver.exec.enable" = "1"
  }

  # Node metadata
  meta {
    consul = "true"
    consul_version = "1.21.5"
    node_type = "client"
  }

  # Garbage collection policy
  gc_interval = "5m"
  gc_disk_usage_threshold = 80
  gc_inode_usage_threshold = 70
}

plugin "nomad-driver-podman" {
  config {
    socket_path = "unix:///run/podman/podman.sock"
    volumes {
      enabled = true
    }
  }
}

consul {
  address = "{{ consul_servers | join(',') }}"
  server_service_name = "nomad"
  client_service_name = "nomad-client"
  auto_advertise = true
  server_auto_join = false
  client_auto_join = true
}

vault {
  enabled = true
  address = "http://master.tailnet-68f9.ts.net:8200,http://ash3c.tailnet-68f9.ts.net:8200,http://warden.tailnet-68f9.ts.net:8200"
  token = "hvs.A5Fu4E1oHyezJapVllKPFsWg"
  create_from_role = "nomad-cluster"
  tls_skip_verify = true
}

telemetry {
  collection_interval = "1s"
  disable_hostname = false
  prometheus_metrics = true
  publish_allocation_metrics = true
  publish_node_metrics = true
}