feat(配置): 新增多个Ansible playbook用于Nomad集群管理

添加用于调试、配置和维护Nomad集群的playbook
包括节点连通性检查、配置读取、Tailscale IP获取等功能
修改现有playbook以支持更广泛的节点部署
This commit is contained in:
2025-09-26 13:35:44 +00:00
parent 79b721e165
commit c0d4cf54dc
17 changed files with 304 additions and 128 deletions

View File

@@ -0,0 +1,14 @@
---
# Diagnostic play: scans the kernel journal on the hosts matched by the
# "germany" pattern for AppArmor / SELinux messages and prints the matches.
- name: Check for AppArmor or SELinux denials
  hosts: germany
  become: true

  tasks:
    # The pipeline needs a real shell. grep exits non-zero when nothing
    # matches, so the task is never treated as failed; it only reads logs,
    # so it never reports "changed" either.
    - name: Search journalctl for AppArmor/SELinux messages
      shell: 'journalctl -k | grep -i -e apparmor -e selinux -e "avc: denied"'
      changed_when: false
      failed_when: false
      register: security_logs

    # Prints the captured lines of the search above.
    - name: Display security logs
      debug:
        var: security_logs.stdout_lines

View File

@@ -116,6 +116,7 @@
client {
enabled = true
network_interface = "tailscale0"
cpu_total_compute = 0
servers = [
"100.116.158.95:4647", # semaphore
@@ -162,7 +163,7 @@
Type=notify
User=root
Group=root
ExecStart=/snap/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure

View File

@@ -0,0 +1,33 @@
---
# Diagnostic play: inspects /sys/fs/cgroup/cpuset/ and its "nomad" child on
# the hosts matched by "germany". Every task is read-only.
- name: Debug cgroup permissions
  hosts: germany
  become: true

  tasks:
    # Collect and print stat information for the cpuset directory itself.
    - name: Check permissions of /sys/fs/cgroup/cpuset/
      stat:
        path: /sys/fs/cgroup/cpuset/
      register: cpuset_dir

    - name: Display cpuset dir stats
      debug:
        var: cpuset_dir.stat

    # Same for the nomad sub-directory; errors from this lookup are ignored
    # so the remaining tasks still run.
    - name: Check for nomad subdir in cpuset
      stat:
        path: /sys/fs/cgroup/cpuset/nomad
      ignore_errors: true
      register: nomad_cpuset_dir

    # Only shown when the stat result exists and reports the path as present.
    - name: Display nomad cpuset dir stats
      when: nomad_cpuset_dir.stat.exists is defined and nomad_cpuset_dir.stat.exists
      debug:
        var: nomad_cpuset_dir.stat

    # Plain directory listing; never reported as a change.
    - name: List contents of /sys/fs/cgroup/cpuset/
      command: ls -la /sys/fs/cgroup/cpuset/
      changed_when: false
      register: ls_cpuset

    - name: Display contents of /sys/fs/cgroup/cpuset/
      debug:
        var: ls_cpuset.stdout_lines

View File

@@ -0,0 +1,14 @@
---
# Diagnostic play: lists /sys/fs/cgroup/cpuset/nomad/ on the hosts matched by
# "germany" and prints the listing.
- name: Debug Nomad cgroup subdirectory
  hosts: germany
  become: true

  tasks:
    # Best-effort listing: a failing ls (for example a missing directory) is
    # not treated as a task failure, and the task never reports "changed".
    - name: List contents of /sys/fs/cgroup/cpuset/nomad/
      command: ls -la /sys/fs/cgroup/cpuset/nomad/
      changed_when: false
      failed_when: false
      register: ls_nomad_cpuset

    - name: Display contents of /sys/fs/cgroup/cpuset/nomad/
      debug:
        var: ls_nomad_cpuset.stdout_lines

View File

@@ -0,0 +1,30 @@
---
# Diagnostic play: collects the systemd status and the most recent journal
# entries of the "nomad" unit on every inventory host and prints both.
- name: Gather Nomad debug information from multiple nodes
  hosts: all
  become: true

  tasks:
    # Neither command uses pipes, redirects or shell expansion, so the
    # `command` module is used instead of `shell`.
    # Both tasks are read-only (never "changed") and best-effort (a non-zero
    # exit, e.g. from an inactive unit, is not treated as a failure).
    - name: Get Nomad service status
      command: systemctl status nomad --no-pager -l
      register: nomad_status
      changed_when: false
      failed_when: false

    - name: Get last 50 lines of Nomad journal logs
      command: journalctl -u nomad -n 50 --no-pager
      register: nomad_journal
      changed_when: false
      failed_when: false

    # stdout and stderr are both printed so error text is visible too.
    - name: Display Nomad Status
      debug:
        msg: |
          --- Nomad Status for {{ inventory_hostname }} ---
          {{ nomad_status.stdout }}
          {{ nomad_status.stderr }}

    - name: Display Nomad Journal
      debug:
        msg: |
          --- Nomad Journal for {{ inventory_hostname }} ---
          {{ nomad_journal.stdout }}
          {{ nomad_journal.stderr }}

View File

@@ -0,0 +1,14 @@
---
# Diagnostic play: looks for systemd service unit files whose name contains
# "nomad" (case-insensitive) on the hosts matched by "germany".
- name: Find Nomad service
  hosts: germany
  become: true

  tasks:
    # The pipe requires `shell`. grep exits non-zero when nothing matches,
    # so failures are suppressed; the query never reports "changed".
    - name: List systemd services and filter for nomad
      shell: systemctl list-unit-files --type=service | grep -i nomad
      changed_when: false
      failed_when: false
      register: nomad_services

    - name: Display found services
      debug:
        var: nomad_services.stdout_lines

View File

@@ -0,0 +1,19 @@
---
# Sets root:root ownership on the cpuset cgroup directory and, recursively,
# on its "nomad" child for the hosts matched by "germany".
- name: Fix cgroup permissions for Nomad
  hosts: germany
  become: true

  tasks:
    # NOTE(review): `state: directory` makes the file module create the path
    # when it is absent - confirm that is intended for a cgroupfs path.
    - name: Recursively change ownership of nomad cgroup directory
      file:
        path: /sys/fs/cgroup/cpuset/nomad
        state: directory
        recurse: true
        owner: root
        group: root

    # Not recursive: only the cpuset directory itself is touched.
    - name: Change ownership of the parent cpuset directory
      file:
        path: /sys/fs/cgroup/cpuset/
        state: directory
        owner: root
        group: root

View File

@@ -0,0 +1,45 @@
---
# Overwrites /etc/nomad.d/nomad.hcl on localhost with a server-only Nomad
# configuration (client disabled) that joins the five listed peers.
- name: Fix Nomad server configuration
  hosts: localhost
  gather_facts: false
  become: true

  tasks:
    # The whole file is rendered from the literal block below.
    # NOTE(review): the `encrypt` value is committed here in plaintext.
    # Consider moving it to Ansible Vault and rotating the key.
    - name: Create corrected nomad.hcl
      copy:
        dest: /etc/nomad.d/nomad.hcl
        content: |
          datacenter = "dc1"
          data_dir = "/opt/nomad/data"
          log_level = "INFO"
          bind_addr = "100.116.158.95"
          server {
            enabled = true
            bootstrap_expect = 5
            encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
            retry_join = [
              "100.116.158.95", # semaphore
              "100.81.26.3", # ash1d
              "100.103.147.94", # ash2e
              "100.90.159.68", # ch2
              "100.86.141.112" # ch3
            ]
          }
          client {
            enabled = false
          }
          plugin "podman" {
            config {
              socket_path = "unix:///run/podman/podman.sock"
              volumes {
                enabled = true
              }
            }
          }
          consul {
            address = "100.116.158.95:8500"
          }

View File

@@ -0,0 +1,12 @@
---
# Queries each inventory host for its Tailscale IPv4 address and prints it
# next to the inventory hostname. Fact gathering is skipped.
- name: Get Tailscale IP for specified nodes
  hosts: all
  gather_facts: false

  tasks:
    # A plain command with no shell features, so `command` replaces `shell`.
    # The lookup is read-only, so it must not be reported as a change.
    - name: Get tailscale IP
      command: tailscale ip -4
      register: tailscale_ip
      changed_when: false

    - name: Display Tailscale IP
      debug:
        msg: "Node {{ inventory_hostname }} has IP: {{ tailscale_ip.stdout }}"

View File

@@ -1,10 +1,8 @@
---
- name: Install Nomad by direct download from HashiCorp
hosts: hcs
hosts: all
become: yes
vars:
nomad_version: "1.10.5"
nomad_url: "https://releases.hashicorp.com/nomad/{{ nomad_version }}/nomad_{{ nomad_version }}_linux_amd64.zip"
nomad_user: "nomad"
nomad_group: "nomad"
nomad_home: "/opt/nomad"

View File

@@ -1,17 +1,22 @@
- name: Manually run Nomad agent to capture output
---
- name: Manually run Nomad agent for debugging
hosts: germany
gather_facts: false
become: yes
tasks:
- name: Run nomad agent directly
command: /snap/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
register: nomad_agent_output
ignore_errors: true
- name: Find Nomad binary path
shell: which nomad || find /usr -name nomad 2>/dev/null | head -1
register: nomad_binary_path
failed_when: nomad_binary_path.stdout == ""
- name: Display agent output
- name: Run nomad agent directly
command: "{{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl"
register: nomad_run
failed_when: false
- name: Display Nomad output
debug:
msg: |
--- Nomad Agent STDOUT ---
{{ nomad_agent_output.stdout }}
--- Nomad Agent STDERR ---
{{ nomad_agent_output.stderr }}
var: nomad_run.stdout
- name: Display Nomad error output
debug:
var: nomad_run.stderr

View File

@@ -0,0 +1,7 @@
---
# Connectivity check: runs the Ansible ping module against every inventory
# host. Fact gathering is skipped.
- name: Ping nodes to check connectivity
  hosts: all
  gather_facts: false

  tasks:
    # The module takes no arguments here.
    - name: Ping the host
      ping:

View File

@@ -0,0 +1,13 @@
---
# Prints the current contents of /etc/nomad.d/nomad.hcl on localhost.
- name: Read Nomad config file
  hosts: localhost
  gather_facts: false

  tasks:
    # slurp returns the file body base64-encoded under the "content" key.
    - name: Read nomad.hcl
      slurp:
        src: /etc/nomad.d/nomad.hcl
      register: nomad_config

    # Decode before printing so the output is readable text.
    - name: Display Nomad config
      debug:
        msg: "{{ nomad_config['content'] | b64decode }}"

View File

@@ -0,0 +1,37 @@
---
# Overwrites /etc/nomad.d/nomad.hcl on localhost with a client-only Nomad
# configuration (server disabled) pointing at three server addresses.
- name: Update Nomad config to run as a client
  hosts: localhost
  gather_facts: false
  become: true

  tasks:
    # The whole file is rendered from the literal block below.
    - name: Create new nomad.hcl
      copy:
        dest: /etc/nomad.d/nomad.hcl
        content: |
          datacenter = "dc1"
          data_dir = "/opt/nomad/data"
          log_level = "INFO"
          bind_addr = "100.116.158.95"
          server {
            enabled = false
          }
          client {
            enabled = true
            servers = ["100.81.26.3:4647", "100.103.147.94:4647", "100.90.159.68:4647"]
          }
          plugin "podman" {
            config {
              socket_path = "unix:///run/podman/podman.sock"
              volumes {
                enabled = true
              }
            }
          }
          consul {
            address = "100.116.158.95:8500"
          }