feat(config): add multiple Ansible playbooks for Nomad cluster management

Add playbooks for debugging, configuring, and maintaining the Nomad cluster, including node connectivity checks, config reading, and Tailscale IP retrieval. Update existing playbooks to support deployment to a wider set of nodes.
parent 79b721e165
commit c0d4cf54dc
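As a hedged illustration of how the new playbooks are meant to be used (the inventory path and playbook filenames below are assumptions, not taken from this commit), each one runs standalone against the cluster inventory:

    ansible-playbook -i inventory/hosts.ini ping_nodes.yml
    ansible-playbook -i inventory/hosts.ini get_tailscale_ip.yml --limit nomad_clients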
@@ -1,17 +0,0 @@
-===> Connecting to Nomad Leader: http://100.81.26.3:4646
-\n--- Current node list (Before) ---
-ID        Node Pool  DC   Name       Class   Drain  Eligibility  Status
-ec4bf738  default    dc1  pdns       <none>  false  eligible     ready
-583f1b77  default    dc1  semaphore  <none>  false  eligible     down
-cd121e59  default    dc1  influxdb   <none>  false  eligible     ready
-3edfa5bc  default    dc1  ash3c      <none>  false  eligible     ready
-300c11e7  default    dc1  hcp1       <none>  false  eligible     ready
-5e218d15  default    dc1  master     <none>  false  eligible     ready
-06bb8a3a  default    dc1  hcs        <none>  false  eligible     ready
-baea7bb6  default    dc1  hcp2       <none>  false  eligible     ready
-d2e4ceee  default    dc1  ch3        <none>  false  ineligible   down
-3521e4a1  default    dc1  ch2        <none>  false  eligible     down
-e6c0cdbf  default    dc1  ash1d      <none>  false  eligible     down
-645fbd8b  default    dc1  ash2e      <none>  false  eligible     down
-84913d2f  default    dc1  semaphore  <none>  false  eligible     down
-a3d0b0e3  default    dc1  Syd        <none>  false  eligible     ready
@@ -0,0 +1,12 @@
+[consul_servers:children]
+nomad_servers
+
+[consul_servers:vars]
+consul_cert_dir=/etc/consul.d/certs
+consul_ca_src=security/certificates/ca.pem
+consul_cert_src=security/certificates/consul-server.pem
+consul_key_src=security/certificates/consul-server-key.pem
+
+[nomad_cluster:children]
+nomad_servers
+nomad_clients
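The new consul_servers group variables can be sanity-checked with an ad-hoc run of Ansible's debug module before any playbook relies on them; a minimal sketch, assuming the inventory file path used above:

    ansible -i inventory/hosts.ini consul_servers -m debug -a "var=consul_cert_dir"
    ansible -i inventory/hosts.ini consul_servers -m debug -a "var=consul_ca_src"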
@@ -0,0 +1,14 @@
+---
+- name: Check for AppArmor or SELinux denials
+  hosts: germany
+  become: yes
+  tasks:
+    - name: Search journalctl for AppArmor/SELinux messages
+      shell: 'journalctl -k | grep -i -e apparmor -e selinux -e "avc: denied"'
+      register: security_logs
+      changed_when: false
+      failed_when: false
+
+    - name: Display security logs
+      debug:
+        var: security_logs.stdout_lines
@@ -116,6 +116,7 @@
 client {
   enabled = true
   network_interface = "tailscale0"
+  cpu_total_compute = 0
 
   servers = [
     "100.116.158.95:4647", # semaphore
@@ -162,7 +163,7 @@
 Type=notify
 User=root
 Group=root
-ExecStart=/snap/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
+ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=process
 Restart=on-failure
@@ -0,0 +1,33 @@
+---
+- name: Debug cgroup permissions
+  hosts: germany
+  become: yes
+  tasks:
+    - name: Check permissions of /sys/fs/cgroup/cpuset/
+      stat:
+        path: /sys/fs/cgroup/cpuset/
+      register: cpuset_dir
+
+    - name: Display cpuset dir stats
+      debug:
+        var: cpuset_dir.stat
+
+    - name: Check for nomad subdir in cpuset
+      stat:
+        path: /sys/fs/cgroup/cpuset/nomad
+      register: nomad_cpuset_dir
+      ignore_errors: true
+
+    - name: Display nomad cpuset dir stats
+      debug:
+        var: nomad_cpuset_dir.stat
+      when: nomad_cpuset_dir.stat.exists is defined and nomad_cpuset_dir.stat.exists
+
+    - name: List contents of /sys/fs/cgroup/cpuset/
+      command: ls -la /sys/fs/cgroup/cpuset/
+      register: ls_cpuset
+      changed_when: false
+
+    - name: Display contents of /sys/fs/cgroup/cpuset/
+      debug:
+        var: ls_cpuset.stdout_lines
@@ -0,0 +1,14 @@
+---
+- name: Debug Nomad cgroup subdirectory
+  hosts: germany
+  become: yes
+  tasks:
+    - name: List contents of /sys/fs/cgroup/cpuset/nomad/
+      command: ls -la /sys/fs/cgroup/cpuset/nomad/
+      register: ls_nomad_cpuset
+      changed_when: false
+      failed_when: false
+
+    - name: Display contents of /sys/fs/cgroup/cpuset/nomad/
+      debug:
+        var: ls_nomad_cpuset.stdout_lines
@@ -0,0 +1,30 @@
+---
+- name: Gather Nomad debug information from multiple nodes
+  hosts: all
+  become: yes
+  tasks:
+    - name: Get Nomad service status
+      shell: systemctl status nomad --no-pager -l
+      register: nomad_status
+      changed_when: false
+      failed_when: false
+
+    - name: Get last 50 lines of Nomad journal logs
+      shell: journalctl -u nomad -n 50 --no-pager
+      register: nomad_journal
+      changed_when: false
+      failed_when: false
+
+    - name: Display Nomad Status
+      debug:
+        msg: |
+          --- Nomad Status for {{ inventory_hostname }} ---
+          {{ nomad_status.stdout }}
+          {{ nomad_status.stderr }}
+
+    - name: Display Nomad Journal
+      debug:
+        msg: |
+          --- Nomad Journal for {{ inventory_hostname }} ---
+          {{ nomad_journal.stdout }}
+          {{ nomad_journal.stderr }}
@@ -0,0 +1,14 @@
+---
+- name: Find Nomad service
+  hosts: germany
+  become: yes
+  tasks:
+    - name: List systemd services and filter for nomad
+      shell: systemctl list-unit-files --type=service | grep -i nomad
+      register: nomad_services
+      changed_when: false
+      failed_when: false
+
+    - name: Display found services
+      debug:
+        var: nomad_services.stdout_lines
@@ -0,0 +1,19 @@
+---
+- name: Fix cgroup permissions for Nomad
+  hosts: germany
+  become: yes
+  tasks:
+    - name: Recursively change ownership of nomad cgroup directory
+      file:
+        path: /sys/fs/cgroup/cpuset/nomad
+        state: directory
+        owner: root
+        group: root
+        recurse: yes
+
+    - name: Change ownership of the parent cpuset directory
+      file:
+        path: /sys/fs/cgroup/cpuset/
+        state: directory
+        owner: root
+        group: root
@@ -0,0 +1,45 @@
+---
+- name: Fix Nomad server configuration
+  hosts: localhost
+  gather_facts: no
+  become: yes
+  tasks:
+    - name: Create corrected nomad.hcl
+      copy:
+        dest: /etc/nomad.d/nomad.hcl
+        content: |
+          datacenter = "dc1"
+          data_dir = "/opt/nomad/data"
+          log_level = "INFO"
+
+          bind_addr = "100.116.158.95"
+
+          server {
+            enabled = true
+            bootstrap_expect = 5
+            encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
+            retry_join = [
+              "100.116.158.95", # semaphore
+              "100.81.26.3",    # ash1d
+              "100.103.147.94", # ash2e
+              "100.90.159.68",  # ch2
+              "100.86.141.112"  # ch3
+            ]
+          }
+
+          client {
+            enabled = false
+          }
+
+          plugin "podman" {
+            config {
+              socket_path = "unix:///run/podman/podman.sock"
+              volumes {
+                enabled = true
+              }
+            }
+          }
+
+          consul {
+            address = "100.116.158.95:8500"
+          }
@@ -0,0 +1,12 @@
+---
+- name: Get Tailscale IP for specified nodes
+  hosts: all
+  gather_facts: no
+  tasks:
+    - name: Get tailscale IP
+      shell: "tailscale ip -4"
+      register: tailscale_ip
+
+    - name: Display Tailscale IP
+      debug:
+        msg: "Node {{ inventory_hostname }} has IP: {{ tailscale_ip.stdout }}"
@@ -1,10 +1,8 @@
 ---
 - name: Install Nomad by direct download from HashiCorp
-  hosts: hcs
+  hosts: all
   become: yes
   vars:
-    nomad_version: "1.10.5"
-    nomad_url: "https://releases.hashicorp.com/nomad/{{ nomad_version }}/nomad_{{ nomad_version }}_linux_amd64.zip"
     nomad_user: "nomad"
     nomad_group: "nomad"
     nomad_home: "/opt/nomad"
@@ -1,17 +1,22 @@
-- name: Manually run Nomad agent to capture output
+---
+- name: Manually run Nomad agent for debugging
   hosts: germany
-  gather_facts: false
+  become: yes
   tasks:
-    - name: Run nomad agent directly
-      command: /snap/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
-      register: nomad_agent_output
-      ignore_errors: true
+    - name: Find Nomad binary path
+      shell: which nomad || find /usr -name nomad 2>/dev/null | head -1
+      register: nomad_binary_path
+      failed_when: nomad_binary_path.stdout == ""
 
-    - name: Display agent output
+    - name: Run nomad agent directly
+      command: "{{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl"
+      register: nomad_run
+      failed_when: false
+
+    - name: Display Nomad output
       debug:
-        msg: |
-          --- Nomad Agent STDOUT ---
-          {{ nomad_agent_output.stdout }}
-
-          --- Nomad Agent STDERR ---
-          {{ nomad_agent_output.stderr }}
+        var: nomad_run.stdout
+
+    - name: Display Nomad error output
+      debug:
+        var: nomad_run.stderr
@@ -0,0 +1,7 @@
+---
+- name: Ping nodes to check connectivity
+  hosts: all
+  gather_facts: no
+  tasks:
+    - name: Ping the host
+      ping:
@@ -0,0 +1,13 @@
+---
+- name: Read Nomad config file
+  hosts: localhost
+  gather_facts: no
+  tasks:
+    - name: Read nomad.hcl
+      slurp:
+        src: /etc/nomad.d/nomad.hcl
+      register: nomad_config
+
+    - name: Display Nomad config
+      debug:
+        msg: "{{ nomad_config['content'] | b64decode }}"
@@ -0,0 +1,37 @@
+---
+- name: Update Nomad config to run as a client
+  hosts: localhost
+  gather_facts: no
+  become: yes
+  tasks:
+    - name: Create new nomad.hcl
+      copy:
+        dest: /etc/nomad.d/nomad.hcl
+        content: |
+          datacenter = "dc1"
+          data_dir = "/opt/nomad/data"
+          log_level = "INFO"
+
+          bind_addr = "100.116.158.95"
+
+          server {
+            enabled = false
+          }
+
+          client {
+            enabled = true
+            servers = ["100.81.26.3:4647", "100.103.147.94:4647", "100.90.159.68:4647"]
+          }
+
+          plugin "podman" {
+            config {
+              socket_path = "unix:///run/podman/podman.sock"
+              volumes {
+                enabled = true
+              }
+            }
+          }
+
+          consul {
+            address = "100.116.158.95:8500"
+          }
@@ -1,118 +1,57 @@
 job "consul-cluster" {
   datacenters = ["dc1"]
   type = "service"
 
-  # Ensure the job runs on the specified nodes
-  constraint {
-    attribute = "${node.unique.name}"
-    operator = "regexp"
-    value = "(hcs|master|ash3c)"
-  }
-
   group "consul-servers" {
     count = 3
 
-    # Run only one Consul instance per node
     constraint {
-      operator = "distinct_hosts"
-      value = "true"
-    }
-
-    # Network configuration
-    network {
-      mode = "host"
-      port "http" {
-        static = 8500
-      }
-      port "rpc" {
-        static = 8300
-      }
-      port "serf_lan" {
-        static = 8301
-      }
-      port "serf_wan" {
-        static = 8302
-      }
-      port "grpc" {
-        static = 8502
-      }
-    }
-
-    # Persistent storage
-    volume "consul-data" {
-      type = "host"
-      read_only = false
-      source = "consul-data"
+      attribute = "${node.unique.name}"
+      operator = "regexp"
+      value = "(master|ash3c|hcp)"
     }
 
     task "consul" {
       driver = "podman"
 
+      config {
+        image = "hashicorp/consul:latest"
+        ports = ["server", "serf_lan", "serf_wan", "ui"]
+        args = [
+          "agent",
+          "-server",
+          "-bootstrap-expect=3",
+          "-data-dir=/consul/data",
+          "-ui",
+          "-client=0.0.0.0",
+          "-bind={{ env `NOMAD_IP_server` }}",
+          "-retry-join=100.117.106.136",
+          "-retry-join=100.116.80.94",
+          "-retry-join=100.76.13.187"
+        ]
+      }
+
       volume_mount {
         volume = "consul-data"
         destination = "/consul/data"
        read_only = false
       }
 
-      config {
-        image = "docker.io/hashicorp/consul:1.17"
-        ports = ["http", "rpc", "serf_lan", "serf_wan", "grpc"]
-
-        args = [
-          "agent",
-          "-server",
-          "-bootstrap-expect=3",
-          "-datacenter=dc1",
-          "-data-dir=/consul/data",
-          "-log-level=INFO",
-          "-node=${node.unique.name}",
-          "-bind=${NOMAD_IP_serf_lan}",
-          "-client=0.0.0.0",
-          "-retry-join=100.84.197.26",
-          "-retry-join=100.117.106.136",
-          "-retry-join=100.116.80.94",
-          "-ui-config-enabled=true",
-          "-connect-enabled=true"
-        ]
-      }
-
-      # Environment variables
-      env {
-        CONSUL_ALLOW_PRIVILEGED_PORTS = "true"
-      }
-
-      # Resource configuration
       resources {
-        cpu = 500
-        memory = 512
-      }
-
-      # Health check
-      service {
-        name = "consul"
-        port = "http"
-
-        tags = [
-          "consul",
-          "server",
-          "${node.unique.name}"
-        ]
-
-        check {
-          type = "http"
-          path = "/v1/status/leader"
-          interval = "10s"
-          timeout = "3s"
+        network {
+          mbits = 10
+          port "server" { static = 8300 }
+          port "serf_lan" { static = 8301 }
+          port "serf_wan" { static = 8302 }
+          port "ui" { static = 8500 }
         }
       }
+    }
 
-      # Restart policy
-      restart {
-        attempts = 3
-        interval = "30m"
-        delay = "15s"
-        mode = "fail"
-      }
+    volume "consul-data" {
+      type = "host"
+      read_only = false
+      source = "consul-data"
     }
   }
 }
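A hedged sketch of rolling out the reworked consul-cluster job and re-checking node state afterwards (the job filename is an assumption; the commands are standard Nomad CLI):

    nomad job plan consul-cluster.nomad
    nomad job run consul-cluster.nomad
    nomad node status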