feat: restructure project directory layout and add multiple features

- Add scripts and configuration files for managing Nomad nodes and NFS storage
- Add multiple Ansible playbooks for configuring and debugging the Nomad cluster
- Add Nomad job files for testing Podman and NFS functionality (see the sketch below)
- Restructure the playbooks directory, grouping playbooks by function
- Update the Nomad client and server configuration templates
- Add SSH key distribution and configuration scripts
- Add several playbooks for debugging and fixing issues
2025-09-27 13:05:30 +00:00
parent a06e5e1a00
commit 44b098bd20
98 changed files with 1141 additions and 2 deletions
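
None of the Nomad job files this commit adds for testing Podman and NFS appear in the excerpt below. As a rough sketch only (the job name, image, and file path are hypothetical, not taken from the commit), a minimal play that submits a job exercising the Podman driver could look like this:

- name: Deploy a minimal Podman smoke-test job (hypothetical sketch)
  hosts: nomad_servers
  run_once: yes
  tasks:
    - name: Write the test job file  # path and job body are assumptions
      ansible.builtin.copy:
        dest: /tmp/podman-test.nomad
        content: |
          job "podman-test" {
            datacenters = ["dc1"]
            group "test" {
              task "hello" {
                driver = "podman"  # provided by the nomad-driver-podman plugin
                config {
                  image   = "docker.io/library/alpine:latest"
                  command = "echo"
                  args    = ["hello from podman"]
                }
              }
            }
          }
    - name: Submit the job
      ansible.builtin.command: nomad job run /tmp/podman-test.nomad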

View File

@@ -0,0 +1,16 @@
---
- name: Debug apt repository issues
  hosts: beijing
become: yes
ignore_unreachable: yes
tasks:
- name: Run apt-get update to capture error
ansible.builtin.shell: apt-get update
register: apt_update_result
failed_when: false
changed_when: false
- name: Display apt-get update stderr
ansible.builtin.debug:
var: apt_update_result.stderr
verbosity: 2
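
Note: the debug task above sets verbosity: 2, so the captured stderr is only printed when the playbook runs with at least two -v flags, e.g. ansible-playbook -vv debug-apt.yml (the filename is an assumption; file names are not visible in this view).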

View File

@@ -0,0 +1,126 @@
---
- name: Fix duplicate Podman configuration in Nomad
hosts: nomad_cluster
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Backup current configuration
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.backup-duplicate-fix
remote_src: yes
- name: Read current configuration
slurp:
src: /etc/nomad.d/nomad.hcl
register: current_config
- name: Create clean configuration for clients
copy:
content: |
datacenter = "{{ nomad_datacenter }}"
region = "{{ nomad_region }}"
data_dir = "/opt/nomad/data"
bind_addr = "{{ tailscale_ip }}"
server {
enabled = false
}
client {
enabled = true
servers = ["100.116.158.95:4647", "100.117.106.136:4647", "100.86.141.112:4647", "100.81.26.3:4647", "100.103.147.94:4647"]
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ tailscale_ip }}"
serf = "{{ tailscale_ip }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
recover_stopped = true
}
}
consul {
auto_advertise = false
server_auto_join = false
client_auto_join = false
}
log_level = "INFO"
enable_syslog = true
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
when: nomad_role == "client"
- name: Ensure Podman is installed
package:
name: podman
state: present
- name: Enable and start Podman socket
systemd:
name: podman.socket
enabled: yes
state: started
- name: Set proper permissions on Podman socket
file:
path: /run/podman/podman.sock
mode: '0666'
ignore_errors: yes
- name: Validate Nomad configuration
shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl || /usr/bin/nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
failed_when: config_validation.rc != 0
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60
- name: Wait for drivers to load
pause:
seconds: 20
- name: Check driver status
shell: |
/usr/local/bin/nomad node status -self | grep -A 10 "Driver Status" || /usr/bin/nomad node status -self | grep -A 10 "Driver Status"
register: driver_status
failed_when: false
- name: Display driver status
debug:
var: driver_status.stdout_lines
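
Design note: the chmod 0666 applied to /run/podman/podman.sock above is a quick workaround and does not survive a restart of podman.socket. A more durable sketch, assuming systemd manages the socket as elsewhere in this commit, is a drop-in that sets SocketMode:

- name: Persist Podman socket permissions via a systemd drop-in (hypothetical alternative)
  hosts: nomad_cluster
  become: yes
  tasks:
    - name: Create the drop-in directory
      ansible.builtin.file:
        path: /etc/systemd/system/podman.socket.d
        state: directory
        mode: '0755'
    - name: Override the socket mode  # SocketMode is a standard systemd.socket option
      ansible.builtin.copy:
        dest: /etc/systemd/system/podman.socket.d/override.conf
        content: |
          [Socket]
          SocketMode=0666
        mode: '0644'
    - name: Reload systemd and restart the socket
      ansible.builtin.systemd:
        name: podman.socket
        state: restarted
        daemon_reload: yes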

View File

@@ -0,0 +1,27 @@
---
- name: Copy the correct HashiCorp APT source configuration directly
hosts: nomad_cluster
become: yes
tasks:
    - name: Create the correct HashiCorp APT source configuration
copy:
content: "deb [trusted=yes] http://apt.releases.hashicorp.com {{ ansible_distribution_release }} main\n"
dest: "/etc/apt/sources.list.d/hashicorp.list"
owner: root
group: root
mode: '0644'
    - name: Update the APT cache
apt:
update_cache: yes
ignore_errors: yes
    - name: Verify the configuration
command: cat /etc/apt/sources.list.d/hashicorp.list
register: config_check
changed_when: false
    - name: Display the configuration contents
debug:
msg: "HashiCorp APT 源配置: {{ config_check.stdout }}"

View File

@@ -0,0 +1,83 @@
---
- name: Fix HCP1 and HCP2 Podman Configuration
hosts: hcp1,hcp2
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Ensure nomad user exists
user:
name: nomad
system: yes
shell: /bin/false
home: /home/nomad
create_home: yes
- name: Ensure Podman socket is running
systemd:
name: podman.socket
state: started
enabled: yes
- name: Set proper permissions on Podman socket
file:
path: /run/podman/podman.sock
mode: '0666'
ignore_errors: yes
- name: Create nomad data directory
file:
path: /opt/nomad/data
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Create nomad log directory
file:
path: /var/log/nomad
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Test Podman access for nomad user
shell: sudo -u nomad podman version
register: podman_test
failed_when: false
- name: Display Podman test result
debug:
var: podman_test.stdout_lines
- name: Validate Nomad configuration
shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
failed_when: false
- name: Display configuration validation
debug:
var: config_validation
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
timeout: 60
- name: Check Nomad node status
shell: /usr/local/bin/nomad node status -self
register: node_status
failed_when: false
- name: Display node status
debug:
var: node_status.stdout_lines

View File

@@ -0,0 +1,56 @@
---
- name: Fix dpkg and initramfs issues on hcs
hosts: hcs
become: yes
tasks:
- name: Check current dpkg status
shell: dpkg --audit
register: dpkg_status
ignore_errors: yes
- name: Display dpkg status
debug:
var: dpkg_status.stdout_lines
- name: Fix broken btrfs hook
shell: |
# Remove problematic btrfs hook temporarily
mv /usr/share/initramfs-tools/hooks/btrfs /usr/share/initramfs-tools/hooks/btrfs.bak || true
# Try to reconfigure the failed package
dpkg --configure -a
# If that works, restore the hook
if [ $? -eq 0 ]; then
mv /usr/share/initramfs-tools/hooks/btrfs.bak /usr/share/initramfs-tools/hooks/btrfs || true
fi
register: fix_result
ignore_errors: yes
- name: Display fix result
debug:
var: fix_result
- name: Alternative fix - reinstall initramfs-tools
apt:
name: initramfs-tools
state: latest
force: yes
when: fix_result.rc != 0
ignore_errors: yes
- name: Clean up and update
shell: |
apt autoremove -y
apt update
apt upgrade -y
ignore_errors: yes
- name: Check final dpkg status
shell: dpkg --audit
register: final_status
ignore_errors: yes
- name: Display final status
debug:
var: final_status.stdout_lines
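
The shell-based cleanup above could also be expressed with the apt module, which reports changed/ok status properly; a sketch, equivalent only under the assumption that a plain package upgrade is intended:

- name: Clean up and upgrade via the apt module (hypothetical equivalent)
  ansible.builtin.apt:
    update_cache: yes
    upgrade: yes
    autoremove: yes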

View File

@@ -0,0 +1,98 @@
---
- name: Fix Nomad Cluster Configuration
hosts: nomad_servers
become: yes
vars:
nomad_servers_list:
- "100.116.158.95" # semaphore
- "100.103.147.94" # ash2e
- "100.81.26.3" # ash1d
- "100.90.159.68" # ch2
- "{{ ansible_default_ipv4.address }}" # ch3 (will be determined dynamically)
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
ignore_errors: yes
- name: Create nomad user
user:
name: nomad
system: yes
shell: /bin/false
home: /opt/nomad
create_home: no
- name: Create Nomad configuration directory
file:
path: /etc/nomad.d
state: directory
mode: '0755'
- name: Create Nomad data directory
file:
path: /opt/nomad/data
state: directory
mode: '0755'
owner: nomad
group: nomad
ignore_errors: yes
- name: Create Nomad log directory
file:
path: /var/log/nomad
state: directory
mode: '0755'
owner: nomad
group: nomad
ignore_errors: yes
- name: Generate Nomad server configuration
template:
src: nomad-server.hcl.j2
dest: /etc/nomad.d/nomad.hcl
mode: '0644'
notify: restart nomad
- name: Create Nomad systemd service file
copy:
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
[Service]
Type=notify
User=nomad
Group=nomad
ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/nomad.service
mode: '0644'
- name: Reload systemd daemon
systemd:
daemon_reload: yes
- name: Enable and start Nomad service
systemd:
name: nomad
enabled: yes
state: started
handlers:
- name: restart nomad
systemd:
name: nomad
state: restarted

View File

@@ -0,0 +1,99 @@
---
- name: Update Nomad configuration for Podman and fix issues
hosts: localhost
become: yes
connection: local
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Update Nomad configuration to use Podman and disable Consul
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "100.116.158.95"
server {
enabled = true
bootstrap_expect = 1
encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
}
client {
enabled = true
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "100.116.158.95"
serf = "100.116.158.95"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
# Disable Consul integration for now
consul {
address = ""
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
backup: yes
- name: Enable Podman socket for systemd
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes
- name: Start Nomad service
systemd:
name: nomad
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 5
timeout: 30
- name: Check Nomad status
uri:
url: http://localhost:4646/v1/status/leader
method: GET
register: nomad_status
retries: 3
delay: 5
- name: Display Nomad status
debug:
msg: "Nomad leader: {{ nomad_status.json if nomad_status.json is defined else 'No leader elected' }}"

View File

@@ -0,0 +1,72 @@
---
- name: Fix Nomad Podman Driver Configuration
hosts: all
become: yes
vars:
nomad_user: nomad
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Update Nomad configuration to properly reference Podman plugin
replace:
path: /etc/nomad.d/nomad.hcl
regexp: 'plugin "podman" \{\n config \{\n socket_path = "unix:///run/user/1001/podman/podman.sock"\n volumes \{\n enabled = true\n \}\n \}\n\}'
replace: |
plugin "nomad-driver-podman" {
config {
socket_path = "unix:///run/user/1001/podman/podman.sock"
volumes {
enabled = true
}
}
}
- name: Start Nomad service
systemd:
name: nomad
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60
- name: Wait for plugins to load
pause:
seconds: 15
- name: Check if Podman driver is now loaded
shell: |
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self | grep -A 20 "Driver Status"
register: driver_status
- name: Display driver status
debug:
var: driver_status.stdout_lines
- name: Check Nomad logs for successful plugin loading
shell: journalctl -u nomad -n 20 --no-pager | grep -E "(podman|plugin)"
register: recent_logs
failed_when: false
- name: Display recent plugin logs
debug:
var: recent_logs.stdout_lines
- name: Final verification - Test Podman functionality
shell: |
        sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self -json | jq -r '.Drivers | keys[]' | grep -i podman
register: podman_driver_check
failed_when: false
- name: Display final result
debug:
msg: |
Podman driver status: {{ 'SUCCESS - Driver loaded!' if 'podman' in (podman_driver_check.stdout | default('')) else 'Still checking...' }}
Available drivers: {{ podman_driver_check.stdout_lines | default(['none']) | join(', ') }}
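
Two notes on the play above. First, the replace module only fires when its regexp matches the current file content exactly, including whitespace, so this fix is tightly coupled to the file it expects. Second, the new block label matches the plugin binary's name (nomad-driver-podman), which is the label the upstream driver documentation uses for its plugin block. A more tolerant sketch, assuming the old hand-written block is removed first (markers and the rootless socket path are carried over from the playbook, not verified here):

- name: Manage the Podman plugin block with markers (hypothetical alternative)
  ansible.builtin.blockinfile:
    path: /etc/nomad.d/nomad.hcl
    marker: "# {mark} ANSIBLE MANAGED: podman plugin"
    block: |
      plugin "nomad-driver-podman" {
        config {
          socket_path = "unix:///run/user/1001/podman/podman.sock"
          volumes {
            enabled = true
          }
        }
      }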

View File

@@ -0,0 +1,45 @@
---
- name: Fix Nomad server configuration
hosts: localhost
gather_facts: no
become: yes
tasks:
- name: Create corrected nomad.hcl
copy:
dest: /etc/nomad.d/nomad.hcl
content: |
datacenter = "dc1"
data_dir = "/opt/nomad/data"
log_level = "INFO"
bind_addr = "100.116.158.95"
server {
enabled = true
bootstrap_expect = 5
encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
retry_join = [
"100.116.158.95", # semaphore
"100.81.26.3", # ash1d
"100.103.147.94", # ash2e
"100.90.159.68", # ch2
"100.86.141.112" # ch3
]
}
client {
enabled = false
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "100.116.158.95:8500"
}

View File

@@ -0,0 +1,88 @@
---
- name: Fix Nomad systemd service binary path
hosts: nomad_cluster
become: yes
tasks:
- name: Check Nomad binary location
shell: which nomad
register: nomad_binary_path
- name: Display binary path
debug:
msg: "Nomad binary 位于: {{ nomad_binary_path.stdout }}"
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
ignore_errors: yes
- name: Update Nomad systemd service with correct binary path
copy:
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
[Service]
Type=notify
User=nomad
Group=nomad
ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/nomad.service
mode: '0644'
notify: reload systemd
- name: Reload systemd and start Nomad servers first
systemd:
name: nomad
state: started
enabled: yes
daemon_reload: yes
when: inventory_hostname in groups['nomad_servers']
- name: Wait for servers to be ready
pause:
seconds: 15
when: inventory_hostname in groups['nomad_servers']
- name: Start Nomad clients
systemd:
name: nomad
state: started
enabled: yes
daemon_reload: yes
when: inventory_hostname in groups['nomad_clients']
- name: Wait for clients to connect
pause:
seconds: 10
when: inventory_hostname in groups['nomad_clients']
- name: Check final service status
shell: systemctl status nomad --no-pager -l
register: service_status
ignore_errors: yes
- name: Display service status
debug:
msg: |
          ✅ Node {{ inventory_hostname }} service status:
          📊 Status: {{ 'SUCCESS' if service_status.rc == 0 else 'FAILED' }}
          💾 Binary path: {{ nomad_binary_path.stdout }}
handlers:
- name: reload systemd
systemd:
daemon_reload: yes
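
After a staged restart like this (servers first, then clients), a cluster-level check can confirm that the servers rejoined; a sketch, assuming the nomad binary is on PATH:

- name: Verify server membership after the restart (hypothetical check)
  hosts: nomad_servers
  run_once: yes
  tasks:
    - name: List known Nomad servers
      ansible.builtin.command: nomad server members
      register: members
      changed_when: false
    - name: Show membership
      ansible.builtin.debug:
        var: members.stdout_lines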

View File

@@ -0,0 +1,79 @@
---
- name: Fix Podman installation on remaining nodes
hosts: semaphore,master,ash3c,hcs
become: yes
  serial: 1  # Process nodes one at a time to avoid affecting several at once
tasks:
- name: Current node status
debug:
msg: "🔧 修复节点: {{ inventory_hostname }}"
- name: Check if Podman is already installed
shell: podman --version 2>/dev/null || echo "NOT_INSTALLED"
register: podman_check
- name: Install Podman if not present (semaphore special handling)
apt:
name:
- podman
- buildah
- skopeo
state: present
update_cache: yes
force_apt_get: yes
when: inventory_hostname == 'semaphore' and 'NOT_INSTALLED' in podman_check.stdout
ignore_errors: yes
- name: Install Podman on other nodes
apt:
name:
- podman
- buildah
- skopeo
state: present
when: inventory_hostname != 'semaphore'
ignore_errors: yes
- name: Install Python dependencies for podman-compose
apt:
name:
- python3-pip
- python3-setuptools
- python3-yaml
- python3-dotenv
state: present
ignore_errors: yes
- name: Install podman-compose via pip
pip:
name:
- podman-compose
state: present
executable: pip3
ignore_errors: yes
- name: Alternative podman-compose installation via apt
apt:
name: podman-compose
state: present
ignore_errors: yes
- name: Verify installations
shell: |
echo "Podman: $(podman --version 2>/dev/null || echo 'FAILED')"
echo "Podman Compose: $(podman-compose --version 2>/dev/null || echo 'FAILED')"
register: verify_result
- name: Display verification results
debug:
msg: |
          ✅ Node {{ inventory_hostname }} verification results:
{{ verify_result.stdout }}
- name: Enable Podman socket
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes

View File

@@ -0,0 +1,109 @@
---
- name: Fix Nomad server configuration
hosts: nomad_servers
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Backup current configuration
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.backup-server-fix
remote_src: yes
- name: Create clean server configuration
copy:
content: |
datacenter = "{{ nomad_datacenter }}"
region = "{{ nomad_region }}"
data_dir = "/opt/nomad/data"
bind_addr = "{{ ansible_default_ipv4.address }}"
server {
enabled = true
bootstrap_expect = {{ nomad_bootstrap_expect }}
encrypt = "{{ nomad_encrypt_key }}"
retry_join = [
"100.116.158.95",
"100.103.147.94",
"100.81.26.3",
"100.90.159.68",
"100.86.141.112"
]
}
client {
enabled = true
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ ansible_default_ipv4.address }}"
serf = "{{ ansible_default_ipv4.address }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
recover_stopped = true
}
}
consul {
auto_advertise = false
server_auto_join = false
client_auto_join = false
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Ensure Podman is installed
package:
name: podman
state: present
- name: Enable and start Podman socket
systemd:
name: podman.socket
enabled: yes
state: started
- name: Validate Nomad configuration
shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl || /usr/bin/nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
failed_when: config_validation.rc != 0
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60

View File

@@ -0,0 +1,103 @@
---
- name: Fix Nomad server network configuration
hosts: nomad_servers
become: yes
vars:
server_ips:
semaphore: "100.116.158.95"
ash2e: "100.103.147.94"
ash1d: "100.81.26.3"
ch2: "100.90.159.68"
ch3: "100.86.141.112"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Get server IP for this host
set_fact:
server_ip: "{{ server_ips[inventory_hostname] }}"
- name: Create corrected server configuration
copy:
content: |
datacenter = "{{ nomad_datacenter }}"
region = "{{ nomad_region }}"
data_dir = "/opt/nomad/data"
bind_addr = "{{ server_ip }}"
server {
enabled = true
bootstrap_expect = {{ nomad_bootstrap_expect }}
encrypt = "{{ nomad_encrypt_key }}"
retry_join = [
"100.116.158.95",
"100.103.147.94",
"100.81.26.3",
"100.90.159.68",
"100.86.141.112"
]
}
client {
enabled = true
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ server_ip }}"
serf = "{{ server_ip }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
recover_stopped = true
}
}
consul {
auto_advertise = false
server_auto_join = false
client_auto_join = false
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Validate Nomad configuration
shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl || /usr/bin/nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
failed_when: config_validation.rc != 0
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60
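
Design note: this play differs from the preceding server fix by binding to each host's Tailscale IP from a static map instead of ansible_default_ipv4, which presumably resolved to a non-Tailscale interface on some hosts and broke server-to-server RPC and Serf traffic.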

View File

@@ -0,0 +1,39 @@
---
- name: Fix Warden docker-compose.yml
hosts: warden
become: yes
gather_facts: no
tasks:
- name: Ensure /opt/warden directory exists
file:
path: /opt/warden
state: directory
owner: root
group: root
mode: '0755'
- name: Create or update docker-compose.yml with correct indentation
copy:
dest: /opt/warden/docker-compose.yml
content: |
services:
vaultwarden:
image: hub.git4ta.fun/vaultwarden/server:latest
security_opt:
- "seccomp=unconfined"
env_file:
- .env
volumes:
- ./data:/data
ports:
- "980:80"
restart: always
networks:
- vaultwarden_network
networks:
vaultwarden_network:
owner: root
group: root
mode: '0644'
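
The play writes the compose file but does not start the stack, so presumably another play or a manual step brings it up. A hedged follow-up task (the compose command depends on which runtime warden uses and is an assumption here):

- name: Start the Vaultwarden stack (hypothetical follow-up)
  ansible.builtin.command:
    cmd: podman-compose up -d  # or "docker compose up -d", depending on the runtime
    chdir: /opt/warden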