feat(config): add multiple Ansible playbooks for Nomad cluster management

Add playbooks for debugging, configuring, and maintaining the Nomad cluster,
including node connectivity checks, config inspection, and Tailscale IP retrieval.
Update existing playbooks to support deployment across a broader set of nodes.
Houzhong Xu 2025-09-26 13:35:44 +00:00
parent 79b721e165
commit c0d4cf54dc
17 changed files with 304 additions and 128 deletions


@@ -1,17 +0,0 @@
===> Connecting to Nomad Leader: http://100.81.26.3:4646
--- Current node list (Before) ---
ID Node Pool DC Name Class Drain Eligibility Status
ec4bf738 default dc1 pdns <none> false eligible ready
583f1b77 default dc1 semaphore <none> false eligible down
cd121e59 default dc1 influxdb <none> false eligible ready
3edfa5bc default dc1 ash3c <none> false eligible ready
300c11e7 default dc1 hcp1 <none> false eligible ready
5e218d15 default dc1 master <none> false eligible ready
06bb8a3a default dc1 hcs <none> false eligible ready
baea7bb6 default dc1 hcp2 <none> false eligible ready
d2e4ceee default dc1 ch3 <none> false ineligible down
3521e4a1 default dc1 ch2 <none> false eligible down
e6c0cdbf default dc1 ash1d <none> false eligible down
645fbd8b default dc1 ash2e <none> false eligible down
84913d2f default dc1 semaphore <none> false eligible down
a3d0b0e3 default dc1 Syd <none> false eligible ready
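
For reference, this deleted listing appears to be captured Nomad CLI output; a sketch of the command that would produce it, assuming the leader address above:

  nomad node status -address=http://100.81.26.3:4646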


@@ -0,0 +1,12 @@
[consul_servers:children]
nomad_servers

[consul_servers:vars]
consul_cert_dir=/etc/consul.d/certs
consul_ca_src=security/certificates/ca.pem
consul_cert_src=security/certificates/consul-server.pem
consul_key_src=security/certificates/consul-server-key.pem

[nomad_cluster:children]
nomad_servers
nomad_clients
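
These vars presumably feed a certificate-distribution task elsewhere in the repo; a minimal sketch of such a task, assuming a play targeting consul_servers (task name and file ownership are illustrative):

  - name: Install Consul CA certificate
    copy:
      src: "{{ consul_ca_src }}"
      dest: "{{ consul_cert_dir }}/ca.pem"
      owner: consul   # assumed service account; adjust to the actual Consul user
      group: consul
      mode: "0644"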


@@ -0,0 +1,14 @@
---
- name: Check for AppArmor or SELinux denials
  hosts: germany
  become: yes
  tasks:
    - name: Search journalctl for AppArmor/SELinux messages
      shell: 'journalctl -k | grep -i -e apparmor -e selinux -e "avc:  denied"'
      register: security_logs
      changed_when: false
      failed_when: false

    - name: Display security logs
      debug:
        var: security_logs.stdout_lines
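
A usage sketch (inventory and playbook file names are illustrative):

  ansible-playbook -i inventory.ini check-security-logs.yml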


@@ -116,6 +116,7 @@
client {
  enabled           = true
  network_interface = "tailscale0"
  cpu_total_compute = 0
  servers = [
    "100.116.158.95:4647", # semaphore
@@ -162,7 +163,7 @@
Type=notify
User=root
Group=root
ExecStart=/snap/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
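
The nomad_binary_path fact interpolated into ExecStart is presumably registered earlier in the play by a discovery task like the one in the manual-run playbook further down:

  - name: Find Nomad binary path
    shell: which nomad || find /usr -name nomad 2>/dev/null | head -1
    register: nomad_binary_path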


@@ -0,0 +1,33 @@
---
- name: Debug cgroup permissions
  hosts: germany
  become: yes
  tasks:
    - name: Check permissions of /sys/fs/cgroup/cpuset/
      stat:
        path: /sys/fs/cgroup/cpuset/
      register: cpuset_dir

    - name: Display cpuset dir stats
      debug:
        var: cpuset_dir.stat

    - name: Check for nomad subdir in cpuset
      stat:
        path: /sys/fs/cgroup/cpuset/nomad
      register: nomad_cpuset_dir
      ignore_errors: true

    - name: Display nomad cpuset dir stats
      debug:
        var: nomad_cpuset_dir.stat
      when: nomad_cpuset_dir.stat.exists is defined and nomad_cpuset_dir.stat.exists

    - name: List contents of /sys/fs/cgroup/cpuset/
      command: ls -la /sys/fs/cgroup/cpuset/
      register: ls_cpuset
      changed_when: false

    - name: Display contents of /sys/fs/cgroup/cpuset/
      debug:
        var: ls_cpuset.stdout_lines


@@ -0,0 +1,14 @@
---
- name: Debug Nomad cgroup subdirectory
  hosts: germany
  become: yes
  tasks:
    - name: List contents of /sys/fs/cgroup/cpuset/nomad/
      command: ls -la /sys/fs/cgroup/cpuset/nomad/
      register: ls_nomad_cpuset
      changed_when: false
      failed_when: false

    - name: Display contents of /sys/fs/cgroup/cpuset/nomad/
      debug:
        var: ls_nomad_cpuset.stdout_lines


@@ -0,0 +1,30 @@
---
- name: Gather Nomad debug information from multiple nodes
  hosts: all
  become: yes
  tasks:
    - name: Get Nomad service status
      shell: systemctl status nomad --no-pager -l
      register: nomad_status
      changed_when: false
      failed_when: false

    - name: Get last 50 lines of Nomad journal logs
      shell: journalctl -u nomad -n 50 --no-pager
      register: nomad_journal
      changed_when: false
      failed_when: false

    - name: Display Nomad Status
      debug:
        msg: |
          --- Nomad Status for {{ inventory_hostname }} ---
          {{ nomad_status.stdout }}
          {{ nomad_status.stderr }}

    - name: Display Nomad Journal
      debug:
        msg: |
          --- Nomad Journal for {{ inventory_hostname }} ---
          {{ nomad_journal.stdout }}
          {{ nomad_journal.stderr }}
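
A usage sketch for scoping the debug run to the nodes marked down in the listing above (file names are illustrative):

  ansible-playbook -i inventory.ini gather-nomad-debug.yml --limit 'ch2,ch3,ash1d,ash2e'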


@@ -0,0 +1,14 @@
---
- name: Find Nomad service
  hosts: germany
  become: yes
  tasks:
    - name: List systemd services and filter for nomad
      shell: systemctl list-unit-files --type=service | grep -i nomad
      register: nomad_services
      changed_when: false
      failed_when: false

    - name: Display found services
      debug:
        var: nomad_services.stdout_lines


@@ -0,0 +1,19 @@
---
- name: Fix cgroup permissions for Nomad
  hosts: germany
  become: yes
  tasks:
    - name: Recursively change ownership of nomad cgroup directory
      file:
        path: /sys/fs/cgroup/cpuset/nomad
        state: directory
        owner: root
        group: root
        recurse: yes

    - name: Change ownership of the parent cpuset directory
      file:
        path: /sys/fs/cgroup/cpuset/
        state: directory
        owner: root
        group: root


@@ -0,0 +1,45 @@
---
- name: Fix Nomad server configuration
  hosts: localhost
  gather_facts: no
  become: yes
  tasks:
    - name: Create corrected nomad.hcl
      copy:
        dest: /etc/nomad.d/nomad.hcl
        content: |
          datacenter = "dc1"
          data_dir   = "/opt/nomad/data"
          log_level  = "INFO"
          bind_addr  = "100.116.158.95"

          server {
            enabled          = true
            bootstrap_expect = 5
            encrypt          = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
            retry_join = [
              "100.116.158.95", # semaphore
              "100.81.26.3",    # ash1d
              "100.103.147.94", # ash2e
              "100.90.159.68",  # ch2
              "100.86.141.112"  # ch3
            ]
          }

          client {
            enabled = false
          }

          plugin "podman" {
            config {
              socket_path = "unix:///run/podman/podman.sock"
              volumes {
                enabled = true
              }
            }
          }

          consul {
            address = "100.116.158.95:8500"
          }
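
After the corrected config lands, the server presumably needs a restart and a log check; a sketch using standard systemd commands:

  systemctl restart nomad
  journalctl -u nomad -f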


@@ -0,0 +1,12 @@
---
- name: Get Tailscale IP for specified nodes
  hosts: all
  gather_facts: no
  tasks:
    - name: Get tailscale IP
      shell: "tailscale ip -4"
      register: tailscale_ip

    - name: Display Tailscale IP
      debug:
        msg: "Node {{ inventory_hostname }} has IP: {{ tailscale_ip.stdout }}"


@@ -1,10 +1,8 @@
---
- name: Install Nomad by direct download from HashiCorp
  hosts: hcs
  hosts: all
  become: yes
  vars:
    nomad_version: "1.10.5"
    nomad_url: "https://releases.hashicorp.com/nomad/{{ nomad_version }}/nomad_{{ nomad_version }}_linux_amd64.zip"
    nomad_user: "nomad"
    nomad_group: "nomad"
    nomad_home: "/opt/nomad"


@@ -1,17 +1,22 @@
- name: Manually run Nomad agent to capture output
---
- name: Manually run Nomad agent for debugging
  hosts: germany
  gather_facts: false
  become: yes
  tasks:
    - name: Find Nomad binary path
      shell: which nomad || find /usr -name nomad 2>/dev/null | head -1
      register: nomad_binary_path
      failed_when: nomad_binary_path.stdout == ""

    - name: Run nomad agent directly
      command: /snap/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
      register: nomad_agent_output
      ignore_errors: true
      command: "{{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl"
      register: nomad_run
      failed_when: false

    - name: Display agent output
    - name: Display Nomad output
      debug:
        msg: |
          --- Nomad Agent STDOUT ---
          {{ nomad_agent_output.stdout }}
        var: nomad_run.stdout
          --- Nomad Agent STDERR ---
          {{ nomad_agent_output.stderr }}

    - name: Display Nomad error output
      debug:
        var: nomad_run.stderr


@@ -0,0 +1,7 @@
---
- name: Ping nodes to check connectivity
  hosts: all
  gather_facts: no
  tasks:
    - name: Ping the host
      ping:


@@ -0,0 +1,13 @@
---
- name: Read Nomad config file
  hosts: localhost
  gather_facts: no
  tasks:
    - name: Read nomad.hcl
      slurp:
        src: /etc/nomad.d/nomad.hcl
      register: nomad_config

    - name: Display Nomad config
      debug:
        msg: "{{ nomad_config['content'] | b64decode }}"


@@ -0,0 +1,37 @@
---
- name: Update Nomad config to run as a client
  hosts: localhost
  gather_facts: no
  become: yes
  tasks:
    - name: Create new nomad.hcl
      copy:
        dest: /etc/nomad.d/nomad.hcl
        content: |
          datacenter = "dc1"
          data_dir   = "/opt/nomad/data"
          log_level  = "INFO"
          bind_addr  = "100.116.158.95"

          server {
            enabled = false
          }

          client {
            enabled = true
            servers = ["100.81.26.3:4647", "100.103.147.94:4647", "100.90.159.68:4647"]
          }

          plugin "podman" {
            config {
              socket_path = "unix:///run/podman/podman.sock"
              volumes {
                enabled = true
              }
            }
          }

          consul {
            address = "100.116.158.95:8500"
          }


@@ -2,117 +2,56 @@ job "consul-cluster" {
  datacenters = ["dc1"]
  type        = "service"

  # Ensure the job runs on the designated nodes
  constraint {
    attribute = "${node.unique.name}"
    operator  = "regexp"
    value     = "(hcs|master|ash3c)"
  }

  group "consul-servers" {
    count = 3

    # Run only one Consul instance per node
    constraint {
      operator = "distinct_hosts"
      value    = "true"
    }

    # Network configuration
    network {
      mode = "host"
      port "http" {
        static = 8500
      }
      port "rpc" {
        static = 8300
      }
      port "serf_lan" {
        static = 8301
      }
      port "serf_wan" {
        static = 8302
      }
      port "grpc" {
        static = 8502
      }
    }

    # Persistent storage
    volume "consul-data" {
      type      = "host"
      read_only = false
      source    = "consul-data"
    attribute = "${node.unique.name}"
    operator  = "regexp"
    value     = "(master|ash3c|hcp)"
    }

    task "consul" {
      driver = "podman"

      config {
        image = "hashicorp/consul:latest"
        ports = ["server", "serf_lan", "serf_wan", "ui"]
        args = [
          "agent",
          "-server",
          "-bootstrap-expect=3",
          "-data-dir=/consul/data",
          "-ui",
          "-client=0.0.0.0",
          "-bind={{ env `NOMAD_IP_server` }}",
          "-retry-join=100.117.106.136",
          "-retry-join=100.116.80.94",
          "-retry-join=100.76.13.187"
        ]
      }

      volume_mount {
        volume      = "consul-data"
        destination = "/consul/data"
        read_only   = false
      }

      config {
        image = "docker.io/hashicorp/consul:1.17"
        ports = ["http", "rpc", "serf_lan", "serf_wan", "grpc"]
        args = [
          "agent",
          "-server",
          "-bootstrap-expect=3",
          "-datacenter=dc1",
          "-data-dir=/consul/data",
          "-log-level=INFO",
          "-node=${node.unique.name}",
          "-bind=${NOMAD_IP_serf_lan}",
          "-client=0.0.0.0",
          "-retry-join=100.84.197.26",
          "-retry-join=100.117.106.136",
          "-retry-join=100.116.80.94",
          "-ui-config-enabled=true",
          "-connect-enabled=true"
        ]
      }

      # Environment variables
      env {
        CONSUL_ALLOW_PRIVILEGED_PORTS = "true"
      }

      # Resource configuration
      resources {
        cpu    = 500
        memory = 512
        network {
          mbits = 10
          port "server" { static = 8300 }
          port "serf_lan" { static = 8301 }
          port "serf_wan" { static = 8302 }
          port "ui" { static = 8500 }
        }

      # Health check
      service {
        name = "consul"
        port = "http"
        tags = [
          "consul",
          "server",
          "${node.unique.name}"
        ]

        check {
          type     = "http"
          path     = "/v1/status/leader"
          interval = "10s"
          timeout  = "3s"
        }
      }

      # Restart policy
      restart {
        attempts = 3
        interval = "30m"
        delay    = "15s"
        mode     = "fail"
      }

    volume "consul-data" {
      type      = "host"
      read_only = false
      source    = "consul-data"
    }
  }
}
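
A deployment sketch for the revised job (the job file name is illustrative):

  nomad job plan consul-cluster.nomad
  nomad job run consul-cluster.nomad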