feat(config): add multiple Ansible playbooks for Nomad cluster management

Add playbooks for debugging, configuring, and maintaining the Nomad cluster, including node connectivity checks, config inspection, and Tailscale IP retrieval. Modify existing playbooks to target a broader set of nodes.
parent 79b721e165
commit c0d4cf54dc
@@ -1,17 +0,0 @@
===> Connecting to Nomad Leader: http://100.81.26.3:4646
--- Current node list (Before) ---
ID        Node Pool  DC   Name       Class   Drain  Eligibility  Status
ec4bf738  default    dc1  pdns       <none>  false  eligible     ready
583f1b77  default    dc1  semaphore  <none>  false  eligible     down
cd121e59  default    dc1  influxdb   <none>  false  eligible     ready
3edfa5bc  default    dc1  ash3c      <none>  false  eligible     ready
300c11e7  default    dc1  hcp1       <none>  false  eligible     ready
5e218d15  default    dc1  master     <none>  false  eligible     ready
06bb8a3a  default    dc1  hcs        <none>  false  eligible     ready
baea7bb6  default    dc1  hcp2       <none>  false  eligible     ready
d2e4ceee  default    dc1  ch3        <none>  false  ineligible   down
3521e4a1  default    dc1  ch2        <none>  false  eligible     down
e6c0cdbf  default    dc1  ash1d      <none>  false  eligible     down
645fbd8b  default    dc1  ash2e      <none>  false  eligible     down
84913d2f  default    dc1  semaphore  <none>  false  eligible     down
a3d0b0e3  default    dc1  Syd        <none>  false  eligible     ready
@@ -0,0 +1,12 @@
[consul_servers:children]
nomad_servers

[consul_servers:vars]
consul_cert_dir=/etc/consul.d/certs
consul_ca_src=security/certificates/ca.pem
consul_cert_src=security/certificates/consul-server.pem
consul_key_src=security/certificates/consul-server-key.pem

[nomad_cluster:children]
nomad_servers
nomad_clients
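The `consul_servers:vars` entries pair certificate sources on the control node with a target directory on the hosts. A minimal distribution sketch, assuming only the inventory variables above (the play itself is not part of this commit):

---
- name: Distribute Consul TLS material (sketch; consumes the inventory vars above)
  hosts: consul_servers
  become: yes
  tasks:
    - name: Ensure the certificate directory exists
      file:
        path: "{{ consul_cert_dir }}"
        state: directory
        mode: "0750"

    - name: Copy the CA, server certificate, and key from the control node
      copy:
        src: "{{ item }}"
        dest: "{{ consul_cert_dir }}/"
        mode: "0640"
      loop:
        - "{{ consul_ca_src }}"
        - "{{ consul_cert_src }}"
        - "{{ consul_key_src }}"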
@@ -0,0 +1,14 @@
---
- name: Check for AppArmor or SELinux denials
  hosts: germany
  become: yes
  tasks:
    - name: Search journalctl for AppArmor/SELinux messages
      shell: 'journalctl -k | grep -i -e apparmor -e selinux -e "avc: denied"'
      register: security_logs
      changed_when: false
      failed_when: false

    - name: Display security logs
      debug:
        var: security_logs.stdout_lines
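The play above only reports denials. If the goal were to gate later steps on a clean kernel log, a hedged variant (task names are illustrative) could fail fast instead:

---
- name: Fail fast on AppArmor/SELinux denials (sketch)
  hosts: germany
  become: yes
  tasks:
    - name: Search kernel log for denials
      shell: 'journalctl -k | grep -i -e apparmor -e selinux -e "avc: denied"'
      register: security_logs
      changed_when: false
      failed_when: false

    - name: Abort when any denial was logged
      fail:
        msg: "Security denials found on {{ inventory_hostname }}"
      when: security_logs.stdout | length > 0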
@@ -116,6 +116,7 @@
client {
  enabled = true
  network_interface = "tailscale0"
  cpu_total_compute = 0

  servers = [
    "100.116.158.95:4647", # semaphore
@@ -162,7 +163,7 @@
Type=notify
User=root
Group=root
ExecStart=/snap/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
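The new `ExecStart={{ nomad_binary_path.stdout }}` line implies the unit file is rendered by Ansible after a binary-discovery task (the same task appears in the manual-run playbook further down). A sketch of that flow, with a hypothetical template path:

---
- name: Render the Nomad unit with the discovered binary path (sketch)
  hosts: all
  become: yes
  tasks:
    - name: Find Nomad binary path
      shell: which nomad || find /usr -name nomad 2>/dev/null | head -1
      register: nomad_binary_path
      changed_when: false

    - name: Render the systemd unit (template path is hypothetical)
      template:
        src: templates/nomad.service.j2
        dest: /etc/systemd/system/nomad.service
      notify: Reload systemd
  handlers:
    - name: Reload systemd
      systemd:
        daemon_reload: yes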
@@ -0,0 +1,33 @@
---
- name: Debug cgroup permissions
  hosts: germany
  become: yes
  tasks:
    - name: Check permissions of /sys/fs/cgroup/cpuset/
      stat:
        path: /sys/fs/cgroup/cpuset/
      register: cpuset_dir

    - name: Display cpuset dir stats
      debug:
        var: cpuset_dir.stat

    - name: Check for nomad subdir in cpuset
      stat:
        path: /sys/fs/cgroup/cpuset/nomad
      register: nomad_cpuset_dir
      ignore_errors: true

    - name: Display nomad cpuset dir stats
      debug:
        var: nomad_cpuset_dir.stat
      when: nomad_cpuset_dir.stat.exists is defined and nomad_cpuset_dir.stat.exists

    - name: List contents of /sys/fs/cgroup/cpuset/
      command: ls -la /sys/fs/cgroup/cpuset/
      register: ls_cpuset
      changed_when: false

    - name: Display contents of /sys/fs/cgroup/cpuset/
      debug:
        var: ls_cpuset.stdout_lines
@@ -0,0 +1,14 @@
---
- name: Debug Nomad cgroup subdirectory
  hosts: germany
  become: yes
  tasks:
    - name: List contents of /sys/fs/cgroup/cpuset/nomad/
      command: ls -la /sys/fs/cgroup/cpuset/nomad/
      register: ls_nomad_cpuset
      changed_when: false
      failed_when: false

    - name: Display contents of /sys/fs/cgroup/cpuset/nomad/
      debug:
        var: ls_nomad_cpuset.stdout_lines
@@ -0,0 +1,30 @@
---
- name: Gather Nomad debug information from multiple nodes
  hosts: all
  become: yes
  tasks:
    - name: Get Nomad service status
      shell: systemctl status nomad --no-pager -l
      register: nomad_status
      changed_when: false
      failed_when: false

    - name: Get last 50 lines of Nomad journal logs
      shell: journalctl -u nomad -n 50 --no-pager
      register: nomad_journal
      changed_when: false
      failed_when: false

    - name: Display Nomad Status
      debug:
        msg: |
          --- Nomad Status for {{ inventory_hostname }} ---
          {{ nomad_status.stdout }}
          {{ nomad_status.stderr }}

    - name: Display Nomad Journal
      debug:
        msg: |
          --- Nomad Journal for {{ inventory_hostname }} ---
          {{ nomad_journal.stdout }}
          {{ nomad_journal.stderr }}
@@ -0,0 +1,14 @@
---
- name: Find Nomad service
  hosts: germany
  become: yes
  tasks:
    - name: List systemd services and filter for nomad
      shell: systemctl list-unit-files --type=service | grep -i nomad
      register: nomad_services
      changed_when: false
      failed_when: false

    - name: Display found services
      debug:
        var: nomad_services.stdout_lines
@@ -0,0 +1,19 @@
---
- name: Fix cgroup permissions for Nomad
  hosts: germany
  become: yes
  tasks:
    - name: Recursively change ownership of nomad cgroup directory
      file:
        path: /sys/fs/cgroup/cpuset/nomad
        state: directory
        owner: root
        group: root
        recurse: yes

    - name: Change ownership of the parent cpuset directory
      file:
        path: /sys/fs/cgroup/cpuset/
        state: directory
        owner: root
        group: root
@@ -0,0 +1,45 @@
---
- name: Fix Nomad server configuration
  hosts: localhost
  gather_facts: no
  become: yes
  tasks:
    - name: Create corrected nomad.hcl
      copy:
        dest: /etc/nomad.d/nomad.hcl
        content: |
          datacenter = "dc1"
          data_dir = "/opt/nomad/data"
          log_level = "INFO"

          bind_addr = "100.116.158.95"

          server {
            enabled = true
            bootstrap_expect = 5
            encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
            retry_join = [
              "100.116.158.95", # semaphore
              "100.81.26.3",    # ash1d
              "100.103.147.94", # ash2e
              "100.90.159.68",  # ch2
              "100.86.141.112"  # ch3
            ]
          }

          client {
            enabled = false
          }

          plugin "podman" {
            config {
              socket_path = "unix:///run/podman/podman.sock"
              volumes {
                enabled = true
              }
            }
          }

          consul {
            address = "100.116.158.95:8500"
          }
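Since this play rewrites a live server config, validating before restarting may be worthwhile. A sketch, assuming a Nomad release that ships the `nomad config validate` subcommand:

---
- name: Validate nomad.hcl before restarting (sketch)
  hosts: localhost
  become: yes
  tasks:
    - name: Validate the rendered configuration
      command: nomad config validate /etc/nomad.d/nomad.hcl
      register: nomad_validate
      changed_when: false
      failed_when: false

    - name: Restart Nomad only when the config parses cleanly
      systemd:
        name: nomad
        state: restarted
      when: nomad_validate.rc == 0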
@@ -0,0 +1,12 @@
---
- name: Get Tailscale IP for specified nodes
  hosts: all
  gather_facts: no
  tasks:
    - name: Get tailscale IP
      shell: "tailscale ip -4"
      register: tailscale_ip

    - name: Display Tailscale IP
      debug:
        msg: "Node {{ inventory_hostname }} has IP: {{ tailscale_ip.stdout }}"
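With `gather_facts: no`, the registered result is only visible inside this play. If later plays need the address, a variant could persist it as a host fact (the fact name is illustrative, not from the diff):

---
- name: Collect Tailscale IPs and keep them as facts (sketch)
  hosts: all
  gather_facts: no
  tasks:
    - name: Get tailscale IP
      command: tailscale ip -4
      register: tailscale_ip
      changed_when: false

    - name: Persist the address for later plays
      set_fact:
        tailscale_ipv4: "{{ tailscale_ip.stdout | trim }}"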
@@ -1,10 +1,8 @@
---
- name: Install Nomad by direct download from HashiCorp
  hosts: hcs
  hosts: all
  become: yes
  vars:
    nomad_version: "1.10.5"
    nomad_url: "https://releases.hashicorp.com/nomad/{{ nomad_version }}/nomad_{{ nomad_version }}_linux_amd64.zip"
    nomad_user: "nomad"
    nomad_group: "nomad"
    nomad_home: "/opt/nomad"
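The hunk only shows the play header and vars; the download step itself lives elsewhere in the file. For orientation, a minimal sketch of how `nomad_url` is typically consumed (the install destination is an assumption):

---
- name: Download and unpack a Nomad release (sketch)
  hosts: all
  become: yes
  vars:
    nomad_version: "1.10.5"
    nomad_url: "https://releases.hashicorp.com/nomad/{{ nomad_version }}/nomad_{{ nomad_version }}_linux_amd64.zip"
  tasks:
    - name: Fetch and extract the zip straight from HashiCorp
      unarchive:
        src: "{{ nomad_url }}"
        dest: /usr/local/bin  # assumed install path, not from the diff
        remote_src: yes
        mode: "0755"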
@@ -1,17 +1,22 @@
- name: Manually run Nomad agent to capture output
---
- name: Manually run Nomad agent for debugging
  hosts: germany
  gather_facts: false
  become: yes
  tasks:
    - name: Run nomad agent directly
      command: /snap/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
      register: nomad_agent_output
      ignore_errors: true
    - name: Find Nomad binary path
      shell: which nomad || find /usr -name nomad 2>/dev/null | head -1
      register: nomad_binary_path
      failed_when: nomad_binary_path.stdout == ""

    - name: Display agent output
    - name: Run nomad agent directly
      command: "{{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl"
      register: nomad_run
      failed_when: false

    - name: Display Nomad output
      debug:
        msg: |
          --- Nomad Agent STDOUT ---
          {{ nomad_agent_output.stdout }}

          --- Nomad Agent STDERR ---
          {{ nomad_agent_output.stderr }}
        var: nomad_run.stdout

    - name: Display Nomad error output
      debug:
        var: nomad_run.stderr
@@ -0,0 +1,7 @@
---
- name: Ping nodes to check connectivity
  hosts: all
  gather_facts: no
  tasks:
    - name: Ping the host
      ping:
@@ -0,0 +1,13 @@
---
- name: Read Nomad config file
  hosts: localhost
  gather_facts: no
  tasks:
    - name: Read nomad.hcl
      slurp:
        src: /etc/nomad.d/nomad.hcl
      register: nomad_config

    - name: Display Nomad config
      debug:
        msg: "{{ nomad_config['content'] | b64decode }}"
@@ -0,0 +1,37 @@
---
- name: Update Nomad config to run as a client
  hosts: localhost
  gather_facts: no
  become: yes
  tasks:
    - name: Create new nomad.hcl
      copy:
        dest: /etc/nomad.d/nomad.hcl
        content: |
          datacenter = "dc1"
          data_dir = "/opt/nomad/data"
          log_level = "INFO"

          bind_addr = "100.116.158.95"

          server {
            enabled = false
          }

          client {
            enabled = true
            servers = ["100.81.26.3:4647", "100.103.147.94:4647", "100.90.159.68:4647"]
          }

          plugin "podman" {
            config {
              socket_path = "unix:///run/podman/podman.sock"
              volumes {
                enabled = true
              }
            }
          }

          consul {
            address = "100.116.158.95:8500"
          }
@@ -1,118 +1,57 @@
job "consul-cluster" {
  datacenters = ["dc1"]
  type = "service"

  # Ensure the job runs on the specified nodes
  constraint {
    attribute = "${node.unique.name}"
    operator = "regexp"
    value = "(hcs|master|ash3c)"
  }
  type = "service"

  group "consul-servers" {
    count = 3

    # Run only one Consul instance per node

    constraint {
      operator = "distinct_hosts"
      value = "true"
    }

    # Network configuration
    network {
      mode = "host"
      port "http" {
        static = 8500
      }
      port "rpc" {
        static = 8300
      }
      port "serf_lan" {
        static = 8301
      }
      port "serf_wan" {
        static = 8302
      }
      port "grpc" {
        static = 8502
      }
    }

    # Persistent storage
    volume "consul-data" {
      type = "host"
      read_only = false
      source = "consul-data"
      attribute = "${node.unique.name}"
      operator = "regexp"
      value = "(master|ash3c|hcp)"
    }

    task "consul" {
      driver = "podman"

      config {
        image = "hashicorp/consul:latest"
        ports = ["server", "serf_lan", "serf_wan", "ui"]
        args = [
          "agent",
          "-server",
          "-bootstrap-expect=3",
          "-data-dir=/consul/data",
          "-ui",
          "-client=0.0.0.0",
          "-bind={{ env `NOMAD_IP_server` }}",
          "-retry-join=100.117.106.136",
          "-retry-join=100.116.80.94",
          "-retry-join=100.76.13.187"
        ]
      }

      volume_mount {
        volume = "consul-data"
        destination = "/consul/data"
        read_only = false
      }

      config {
        image = "docker.io/hashicorp/consul:1.17"
        ports = ["http", "rpc", "serf_lan", "serf_wan", "grpc"]

        args = [
          "agent",
          "-server",
          "-bootstrap-expect=3",
          "-datacenter=dc1",
          "-data-dir=/consul/data",
          "-log-level=INFO",
          "-node=${node.unique.name}",
          "-bind=${NOMAD_IP_serf_lan}",
          "-client=0.0.0.0",
          "-retry-join=100.84.197.26",
          "-retry-join=100.117.106.136",
          "-retry-join=100.116.80.94",
          "-ui-config-enabled=true",
          "-connect-enabled=true"
        ]
      }

      # Environment variables
      env {
        CONSUL_ALLOW_PRIVILEGED_PORTS = "true"
      }

      # Resource configuration
      resources {
        cpu = 500
        memory = 512
      }

      # Health check
      service {
        name = "consul"
        port = "http"

        tags = [
          "consul",
          "server",
          "${node.unique.name}"
        ]

        check {
          type = "http"
          path = "/v1/status/leader"
          interval = "10s"
          timeout = "3s"
        network {
          mbits = 10
          port "server" { static = 8300 }
          port "serf_lan" { static = 8301 }
          port "serf_wan" { static = 8302 }
          port "ui" { static = 8500 }
        }
      }
    }

    # Restart policy
    restart {
      attempts = 3
      interval = "30m"
      delay = "15s"
      mode = "fail"
    }
    volume "consul-data" {
      type = "host"
      read_only = false
      source = "consul-data"
    }
  }
}
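Submitting the reworked job is not part of this commit; a hedged deployment sketch (the job file path is hypothetical, the leader address is taken from the deleted log at the top of the diff):

---
- name: Submit the consul-cluster job to Nomad (sketch)
  hosts: localhost
  gather_facts: no
  tasks:
    - name: Run the job with the Nomad CLI
      command: nomad job run jobs/consul-cluster.nomad  # hypothetical path
      environment:
        NOMAD_ADDR: "http://100.81.26.3:4646"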