Clean up intermediate scripts and program files; keep the core configuration files consistent with Nomad v1.10.5

Houzhong Xu 2025-09-28 05:49:32 +00:00
parent 44b098bd20
commit bc529a25fa
No known key found for this signature in database
GPG Key ID: B44BEB1438F1B46F
70 changed files with 20 additions and 4382 deletions

View File

@ -1,15 +1,22 @@
[nomad_nodes]
# Local node (NFS already mounted via PVE)
semaphore ansible_host=100.116.158.95 ansible_user=root
# Cloud server nodes (NFS mount still needs to be configured)
[nomad_servers]
# Server nodes (7 in total)
bj-semaphore ansible_host=100.116.158.95 ansible_user=root
ash1d.global ansible_host=100.81.26.3 ansible_user=ben ansible_password=3131 ansible_become_password=3131
ash2e.global ansible_host=100.103.147.94 ansible_user=ben ansible_password=3131 ansible_become_password=3131
ch2.global ansible_host=100.90.159.68 ansible_user=ben ansible_password=3131 ansible_become_password=3131
ch3.global ansible_host=100.86.141.112 ansible_user=ben ansible_password=3131 ansible_become_password=3131
onecloud1 ansible_host=100.98.209.50 ansible_user=ben ansible_password=3131 ansible_become_password=3131
de ansible_host=100.120.225.29 ansible_user=ben ansible_password=3131 ansible_become_password=3131
[nomad_clients]
# Client nodes
master ansible_host=100.117.106.136 ansible_user=ben ansible_password=3131 ansible_become_password=3131 ansible_port=60022
ash3c ansible_host=100.116.80.94 ansible_user=ben ansible_password=3131 ansible_become_password=3131
[nomad_nodes:children]
nomad_servers
nomad_clients
[nomad_nodes:vars]
# NFS configuration
nfs_server=snail

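The regrouped inventory above is what the remaining playbooks consume. As a quick sanity check it can be exercised with a minimal playbook along the lines of the sketch below; the export path /fs/1000/nfs/Fnsync and mount point /mnt/fnsync are taken from the NFS setup playbooks removed later in this commit and are assumptions about the retained configuration, not something this commit defines.

---
# Hypothetical smoke test for the inventory above; illustrative only, not part of this commit.
- name: Verify connectivity and show the intended NFS source for each node
  hosts: nomad_nodes
  gather_facts: no
  tasks:
    - name: Ping every node in the nomad_nodes group
      ansible.builtin.ping:

    - name: Show the NFS server and assumed export/mount paths per node
      ansible.builtin.debug:
        msg: "{{ inventory_hostname }} -> {{ nfs_server }}:/fs/1000/nfs/Fnsync on /mnt/fnsync (assumed paths)"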
View File

@ -66,7 +66,8 @@ hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=
snail ansible_host=snail ansible_user=houzhongxu ansible_ssh_pass=Aa313131@ben ansible_become=yes ansible_become_pass=Aa313131@ben
[armbian]
onecloud1 ansible_host=onecloud1 ansible_user=ben ansible_ssh_pass=3131 ansible_become=yes ansible_become_pass=3131
onecloud1 ansible_host=100.98.209.50 ansible_user=ben ansible_password=3131 ansible_become_password=3131
de ansible_host=100.120.225.29 ansible_user=ben ansible_password=3131 ansible_become_password=3131
[beijing:children]
nomadlxc
@ -85,6 +86,7 @@ ditigalocean
oci_us
oci_kr
semaphore
armbian
[nomad_cluster:children]
nomad_servers

View File

@ -1,15 +0,0 @@
[nomad_servers]
localhost ansible_connection=local nomad_role=server nomad_bootstrap_expect=1
[nomad_clients]
# Add client nodes here if needed
[nomad_cluster:children]
nomad_servers
nomad_clients
[nomad_cluster:vars]
ansible_user=root
nomad_version=1.6.2
nomad_datacenter=dc1
nomad_region=global

View File

@ -1,30 +0,0 @@
---
- name: Gather Nomad debug information from multiple nodes
hosts: all
become: yes
tasks:
- name: Get Nomad service status
shell: systemctl status nomad --no-pager -l
register: nomad_status
changed_when: false
failed_when: false
- name: Get last 50 lines of Nomad journal logs
shell: journalctl -u nomad -n 50 --no-pager
register: nomad_journal
changed_when: false
failed_when: false
- name: Display Nomad Status
debug:
msg: |
--- Nomad Status for {{ inventory_hostname }} ---
{{ nomad_status.stdout }}
{{ nomad_status.stderr }}
- name: Display Nomad Journal
debug:
msg: |
--- Nomad Journal for {{ inventory_hostname }} ---
{{ nomad_journal.stdout }}
{{ nomad_journal.stderr }}

View File

@ -1,60 +0,0 @@
---
- name: Debug Nomad Podman Driver Issues
hosts: all
become: yes
vars:
nomad_user: nomad
tasks:
- name: Check Nomad configuration
shell: cat /etc/nomad.d/nomad.hcl
register: nomad_config
- name: Display Nomad configuration
debug:
var: nomad_config.stdout_lines
- name: Check plugin directory contents
shell: ls -la /opt/nomad/data/plugins/
register: plugin_dir
- name: Display plugin directory
debug:
var: plugin_dir.stdout_lines
- name: Check Nomad logs for plugin loading
shell: journalctl -u nomad -n 50 --no-pager | grep -E "(plugin|driver|podman)"
register: nomad_logs
failed_when: false
- name: Display relevant Nomad logs
debug:
var: nomad_logs.stdout_lines
- name: Check if plugin is executable
stat:
path: /opt/nomad/data/plugins/nomad-driver-podman
register: plugin_stat
- name: Display plugin file info
debug:
var: plugin_stat
- name: Test plugin directly
shell: /opt/nomad/data/plugins/nomad-driver-podman --version
register: plugin_version
failed_when: false
become_user: "{{ nomad_user }}"
- name: Display plugin version
debug:
msg: "Plugin version test: {{ 'SUCCESS' if plugin_version.rc == 0 else 'FAILED' }} - {{ plugin_version.stdout if plugin_version.rc == 0 else plugin_version.stderr }}"
- name: Check Podman socket accessibility
shell: sudo -u {{ nomad_user }} curl --unix-socket /run/user/1001/podman/podman.sock http://localhost/v1.0.0/libpod/info 2>/dev/null | head -3
register: podman_socket_test
failed_when: false
- name: Display Podman socket test
debug:
msg: "Podman socket test: {{ 'SUCCESS' if podman_socket_test.rc == 0 else 'FAILED' }}"

View File

@ -1,12 +0,0 @@
- name: Distribute new podman binary to syd
hosts: syd
gather_facts: false
tasks:
- name: Copy new podman binary to /usr/local/bin
copy:
src: /root/mgmt/configuration/podman-remote-static-linux_amd64
dest: /usr/local/bin/podman
owner: root
group: root
mode: '0755'
become: yes

View File

@ -1,16 +0,0 @@
---
- name: Debug apt repository issues
hosts: beijing
become: yes
ignore_unreachable: yes
tasks:
- name: Run apt-get update to capture error
ansible.builtin.shell: apt-get update
register: apt_update_result
failed_when: false
changed_when: false
- name: Display apt-get update stderr
ansible.builtin.debug:
var: apt_update_result.stderr
verbosity: 2

View File

@ -1,126 +0,0 @@
---
- name: Fix duplicate Podman configuration in Nomad
hosts: nomad_cluster
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Backup current configuration
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.backup-duplicate-fix
remote_src: yes
- name: Read current configuration
slurp:
src: /etc/nomad.d/nomad.hcl
register: current_config
- name: Create clean configuration for clients
copy:
content: |
datacenter = "{{ nomad_datacenter }}"
region = "{{ nomad_region }}"
data_dir = "/opt/nomad/data"
bind_addr = "{{ tailscale_ip }}"
server {
enabled = false
}
client {
enabled = true
servers = ["100.116.158.95:4647", "100.117.106.136:4647", "100.86.141.112:4647", "100.81.26.3:4647", "100.103.147.94:4647"]
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ tailscale_ip }}"
serf = "{{ tailscale_ip }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
recover_stopped = true
}
}
consul {
auto_advertise = false
server_auto_join = false
client_auto_join = false
}
log_level = "INFO"
enable_syslog = true
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
when: nomad_role == "client"
- name: Ensure Podman is installed
package:
name: podman
state: present
- name: Enable and start Podman socket
systemd:
name: podman.socket
enabled: yes
state: started
- name: Set proper permissions on Podman socket
file:
path: /run/podman/podman.sock
mode: '0666'
ignore_errors: yes
- name: Validate Nomad configuration
shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl || /usr/bin/nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
failed_when: config_validation.rc != 0
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60
- name: Wait for drivers to load
pause:
seconds: 20
- name: Check driver status
shell: |
/usr/local/bin/nomad node status -self | grep -A 10 "Driver Status" || /usr/bin/nomad node status -self | grep -A 10 "Driver Status"
register: driver_status
failed_when: false
- name: Display driver status
debug:
var: driver_status.stdout_lines

View File

@ -1,27 +0,0 @@
---
- name: Directly copy the correct HashiCorp APT source configuration
hosts: nomad_cluster
become: yes
tasks:
- name: Create the correct HashiCorp APT source configuration
copy:
content: "deb [trusted=yes] http://apt.releases.hashicorp.com {{ ansible_distribution_release }} main\n"
dest: "/etc/apt/sources.list.d/hashicorp.list"
owner: root
group: root
mode: '0644'
- name: Update the APT cache
apt:
update_cache: yes
ignore_errors: yes
- name: Verify the configuration
command: cat /etc/apt/sources.list.d/hashicorp.list
register: config_check
changed_when: false
- name: Display the configuration contents
debug:
msg: "HashiCorp APT source configuration: {{ config_check.stdout }}"

View File

@ -1,83 +0,0 @@
---
- name: Fix HCP1 and HCP2 Podman Configuration
hosts: hcp1,hcp2
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Ensure nomad user exists
user:
name: nomad
system: yes
shell: /bin/false
home: /home/nomad
create_home: yes
- name: Ensure Podman socket is running
systemd:
name: podman.socket
state: started
enabled: yes
- name: Set proper permissions on Podman socket
file:
path: /run/podman/podman.sock
mode: '0666'
ignore_errors: yes
- name: Create nomad data directory
file:
path: /opt/nomad/data
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Create nomad log directory
file:
path: /var/log/nomad
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Test Podman access for nomad user
shell: sudo -u nomad podman version
register: podman_test
failed_when: false
- name: Display Podman test result
debug:
var: podman_test.stdout_lines
- name: Validate Nomad configuration
shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
failed_when: false
- name: Display configuration validation
debug:
var: config_validation
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
timeout: 60
- name: Check Nomad node status
shell: /usr/local/bin/nomad node status -self
register: node_status
failed_when: false
- name: Display node status
debug:
var: node_status.stdout_lines

View File

@ -1,56 +0,0 @@
---
- name: Fix dpkg and initramfs issues on hcs
hosts: hcs
become: yes
tasks:
- name: Check current dpkg status
shell: dpkg --audit
register: dpkg_status
ignore_errors: yes
- name: Display dpkg status
debug:
var: dpkg_status.stdout_lines
- name: Fix broken btrfs hook
shell: |
# Remove problematic btrfs hook temporarily
mv /usr/share/initramfs-tools/hooks/btrfs /usr/share/initramfs-tools/hooks/btrfs.bak || true
# Try to reconfigure the failed package
dpkg --configure -a
# If that works, restore the hook
if [ $? -eq 0 ]; then
mv /usr/share/initramfs-tools/hooks/btrfs.bak /usr/share/initramfs-tools/hooks/btrfs || true
fi
register: fix_result
ignore_errors: yes
- name: Display fix result
debug:
var: fix_result
- name: Alternative fix - reinstall initramfs-tools
apt:
name: initramfs-tools
state: latest
force: yes
when: fix_result.rc != 0
ignore_errors: yes
- name: Clean up and update
shell: |
apt autoremove -y
apt update
apt upgrade -y
ignore_errors: yes
- name: Check final dpkg status
shell: dpkg --audit
register: final_status
ignore_errors: yes
- name: Display final status
debug:
var: final_status.stdout_lines

View File

@ -1,98 +0,0 @@
---
- name: Fix Nomad Cluster Configuration
hosts: nomad_servers
become: yes
vars:
nomad_servers_list:
- "100.116.158.95" # semaphore
- "100.103.147.94" # ash2e
- "100.81.26.3" # ash1d
- "100.90.159.68" # ch2
- "{{ ansible_default_ipv4.address }}" # ch3 (will be determined dynamically)
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
ignore_errors: yes
- name: Create nomad user
user:
name: nomad
system: yes
shell: /bin/false
home: /opt/nomad
create_home: no
- name: Create Nomad configuration directory
file:
path: /etc/nomad.d
state: directory
mode: '0755'
- name: Create Nomad data directory
file:
path: /opt/nomad/data
state: directory
mode: '0755'
owner: nomad
group: nomad
ignore_errors: yes
- name: Create Nomad log directory
file:
path: /var/log/nomad
state: directory
mode: '0755'
owner: nomad
group: nomad
ignore_errors: yes
- name: Generate Nomad server configuration
template:
src: nomad-server.hcl.j2
dest: /etc/nomad.d/nomad.hcl
mode: '0644'
notify: restart nomad
- name: Create Nomad systemd service file
copy:
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
[Service]
Type=notify
User=nomad
Group=nomad
ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/nomad.service
mode: '0644'
- name: Reload systemd daemon
systemd:
daemon_reload: yes
- name: Enable and start Nomad service
systemd:
name: nomad
enabled: yes
state: started
handlers:
- name: restart nomad
systemd:
name: nomad
state: restarted

View File

@ -1,99 +0,0 @@
---
- name: Update Nomad configuration for Podman and fix issues
hosts: localhost
become: yes
connection: local
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Update Nomad configuration to use Podman and disable Consul
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "100.116.158.95"
server {
enabled = true
bootstrap_expect = 1
encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
}
client {
enabled = true
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "100.116.158.95"
serf = "100.116.158.95"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
# Disable Consul integration for now
consul {
address = ""
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
backup: yes
- name: Enable Podman socket for systemd
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes
- name: Start Nomad service
systemd:
name: nomad
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 5
timeout: 30
- name: Check Nomad status
uri:
url: http://localhost:4646/v1/status/leader
method: GET
register: nomad_status
retries: 3
delay: 5
- name: Display Nomad status
debug:
msg: "Nomad leader: {{ nomad_status.json if nomad_status.json is defined else 'No leader elected' }}"

View File

@ -1,72 +0,0 @@
---
- name: Fix Nomad Podman Driver Configuration
hosts: all
become: yes
vars:
nomad_user: nomad
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Update Nomad configuration to properly reference Podman plugin
replace:
path: /etc/nomad.d/nomad.hcl
regexp: 'plugin "podman" \{\n config \{\n socket_path = "unix:///run/user/1001/podman/podman.sock"\n volumes \{\n enabled = true\n \}\n \}\n\}'
replace: |
plugin "nomad-driver-podman" {
config {
socket_path = "unix:///run/user/1001/podman/podman.sock"
volumes {
enabled = true
}
}
}
- name: Start Nomad service
systemd:
name: nomad
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60
- name: Wait for plugins to load
pause:
seconds: 15
- name: Check if Podman driver is now loaded
shell: |
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self | grep -A 20 "Driver Status"
register: driver_status
- name: Display driver status
debug:
var: driver_status.stdout_lines
- name: Check Nomad logs for successful plugin loading
shell: journalctl -u nomad -n 20 --no-pager | grep -E "(podman|plugin)"
register: recent_logs
failed_when: false
- name: Display recent plugin logs
debug:
var: recent_logs.stdout_lines
- name: Final verification - Test Podman functionality
shell: |
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers | keys[]' | grep -i podman
register: podman_driver_check
failed_when: false
- name: Display final result
debug:
msg: |
Podman driver status: {{ 'SUCCESS - Driver loaded!' if 'podman' in (podman_driver_check.stdout | default('')) else 'Still checking...' }}
Available drivers: {{ podman_driver_check.stdout_lines | default(['none']) | join(', ') }}

View File

@ -1,45 +0,0 @@
---
- name: Fix Nomad server configuration
hosts: localhost
gather_facts: no
become: yes
tasks:
- name: Create corrected nomad.hcl
copy:
dest: /etc/nomad.d/nomad.hcl
content: |
datacenter = "dc1"
data_dir = "/opt/nomad/data"
log_level = "INFO"
bind_addr = "100.116.158.95"
server {
enabled = true
bootstrap_expect = 5
encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
retry_join = [
"100.116.158.95", # semaphore
"100.81.26.3", # ash1d
"100.103.147.94", # ash2e
"100.90.159.68", # ch2
"100.86.141.112" # ch3
]
}
client {
enabled = false
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "100.116.158.95:8500"
}

View File

@ -1,88 +0,0 @@
---
- name: Fix Nomad systemd service binary path
hosts: nomad_cluster
become: yes
tasks:
- name: Check Nomad binary location
shell: which nomad
register: nomad_binary_path
- name: Display binary path
debug:
msg: "Nomad binary 位于: {{ nomad_binary_path.stdout }}"
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
ignore_errors: yes
- name: Update Nomad systemd service with correct binary path
copy:
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
[Service]
Type=notify
User=nomad
Group=nomad
ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/nomad.service
mode: '0644'
notify: reload systemd
- name: Reload systemd and start Nomad servers first
systemd:
name: nomad
state: started
enabled: yes
daemon_reload: yes
when: inventory_hostname in groups['nomad_servers']
- name: Wait for servers to be ready
pause:
seconds: 15
when: inventory_hostname in groups['nomad_servers']
- name: Start Nomad clients
systemd:
name: nomad
state: started
enabled: yes
daemon_reload: yes
when: inventory_hostname in groups['nomad_clients']
- name: Wait for clients to connect
pause:
seconds: 10
when: inventory_hostname in groups['nomad_clients']
- name: Check final service status
shell: systemctl status nomad --no-pager -l
register: service_status
ignore_errors: yes
- name: Display service status
debug:
msg: |
✅ Node {{ inventory_hostname }} service status:
📊 Status: {{ 'SUCCESS' if service_status.rc == 0 else 'FAILED' }}
💾 Binary path: {{ nomad_binary_path.stdout }}
handlers:
- name: reload systemd
systemd:
daemon_reload: yes

View File

@ -1,79 +0,0 @@
---
- name: Fix Podman installation on remaining nodes
hosts: semaphore,master,ash3c,hcs
become: yes
serial: 1 # Process one node at a time to avoid affecting multiple nodes simultaneously
tasks:
- name: Current node status
debug:
msg: "🔧 修复节点: {{ inventory_hostname }}"
- name: Check if Podman is already installed
shell: podman --version 2>/dev/null || echo "NOT_INSTALLED"
register: podman_check
- name: Install Podman if not present (semaphore special handling)
apt:
name:
- podman
- buildah
- skopeo
state: present
update_cache: yes
force_apt_get: yes
when: inventory_hostname == 'semaphore' and 'NOT_INSTALLED' in podman_check.stdout
ignore_errors: yes
- name: Install Podman on other nodes
apt:
name:
- podman
- buildah
- skopeo
state: present
when: inventory_hostname != 'semaphore'
ignore_errors: yes
- name: Install Python dependencies for podman-compose
apt:
name:
- python3-pip
- python3-setuptools
- python3-yaml
- python3-dotenv
state: present
ignore_errors: yes
- name: Install podman-compose via pip
pip:
name:
- podman-compose
state: present
executable: pip3
ignore_errors: yes
- name: Alternative podman-compose installation via apt
apt:
name: podman-compose
state: present
ignore_errors: yes
- name: Verify installations
shell: |
echo "Podman: $(podman --version 2>/dev/null || echo 'FAILED')"
echo "Podman Compose: $(podman-compose --version 2>/dev/null || echo 'FAILED')"
register: verify_result
- name: Display verification results
debug:
msg: |
✅ Node {{ inventory_hostname }} verification results:
{{ verify_result.stdout }}
- name: Enable Podman socket
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes

View File

@ -1,109 +0,0 @@
---
- name: Fix Nomad server configuration
hosts: nomad_servers
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Backup current configuration
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.backup-server-fix
remote_src: yes
- name: Create clean server configuration
copy:
content: |
datacenter = "{{ nomad_datacenter }}"
region = "{{ nomad_region }}"
data_dir = "/opt/nomad/data"
bind_addr = "{{ ansible_default_ipv4.address }}"
server {
enabled = true
bootstrap_expect = {{ nomad_bootstrap_expect }}
encrypt = "{{ nomad_encrypt_key }}"
retry_join = [
"100.116.158.95",
"100.103.147.94",
"100.81.26.3",
"100.90.159.68",
"100.86.141.112"
]
}
client {
enabled = true
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ ansible_default_ipv4.address }}"
serf = "{{ ansible_default_ipv4.address }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
recover_stopped = true
}
}
consul {
auto_advertise = false
server_auto_join = false
client_auto_join = false
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Ensure Podman is installed
package:
name: podman
state: present
- name: Enable and start Podman socket
systemd:
name: podman.socket
enabled: yes
state: started
- name: Validate Nomad configuration
shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl || /usr/bin/nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
failed_when: config_validation.rc != 0
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60

View File

@ -1,103 +0,0 @@
---
- name: Fix Nomad server network configuration
hosts: nomad_servers
become: yes
vars:
server_ips:
semaphore: "100.116.158.95"
ash2e: "100.103.147.94"
ash1d: "100.81.26.3"
ch2: "100.90.159.68"
ch3: "100.86.141.112"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Get server IP for this host
set_fact:
server_ip: "{{ server_ips[inventory_hostname] }}"
- name: Create corrected server configuration
copy:
content: |
datacenter = "{{ nomad_datacenter }}"
region = "{{ nomad_region }}"
data_dir = "/opt/nomad/data"
bind_addr = "{{ server_ip }}"
server {
enabled = true
bootstrap_expect = {{ nomad_bootstrap_expect }}
encrypt = "{{ nomad_encrypt_key }}"
retry_join = [
"100.116.158.95",
"100.103.147.94",
"100.81.26.3",
"100.90.159.68",
"100.86.141.112"
]
}
client {
enabled = true
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ server_ip }}"
serf = "{{ server_ip }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
recover_stopped = true
}
}
consul {
auto_advertise = false
server_auto_join = false
client_auto_join = false
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Validate Nomad configuration
shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl || /usr/bin/nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
failed_when: config_validation.rc != 0
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60

View File

@ -1,39 +0,0 @@
---
- name: Fix Warden docker-compose.yml
hosts: warden
become: yes
gather_facts: no
tasks:
- name: Ensure /opt/warden directory exists
file:
path: /opt/warden
state: directory
owner: root
group: root
mode: '0755'
- name: Create or update docker-compose.yml with correct indentation
copy:
dest: /opt/warden/docker-compose.yml
content: |
services:
vaultwarden:
image: hub.git4ta.fun/vaultwarden/server:latest
security_opt:
- "seccomp=unconfined"
env_file:
- .env
volumes:
- ./data:/data
ports:
- "980:80"
restart: always
networks:
- vaultwarden_network
networks:
vaultwarden_network:
owner: root
group: root
mode: '0644'

View File

@ -1,15 +0,0 @@
---
- name: Check Podman version
hosts: warden
become: yes
gather_facts: yes
tasks:
- name: Check the current Podman version
shell: podman --version
register: current_podman_version
ignore_errors: yes
- name: Display the current version
debug:
msg: "Current Podman version: {{ current_podman_version.stdout if current_podman_version.rc == 0 else 'not installed or unavailable' }}"

View File

@ -1,22 +0,0 @@
- name: Check podman version on semaphore (local)
hosts: semaphore
connection: local
gather_facts: false
tasks:
- name: Check podman version
command: /usr/local/bin/podman --version
register: podman_version
- name: Display podman version
debug:
msg: "Podman version on {{ inventory_hostname }} is: {{ podman_version.stdout }}"
- name: Check podman version on other beijing nodes
hosts: beijing:!semaphore
gather_facts: false
tasks:
- name: Check podman version
command: /usr/local/bin/podman --version
register: podman_version
- name: Display podman version
debug:
msg: "Podman version on {{ inventory_hostname }} is: {{ podman_version.stdout }}"

View File

@ -1,22 +0,0 @@
---
- name: Clean up HashiCorp APT source backup files
hosts: nomad_cluster
become: yes
tasks:
- name: Find all HashiCorp backup files
find:
paths: "/etc/apt/sources.list.d/"
patterns: "hashicorp.list.backup-*"
register: backup_files
- name: Delete all backup files
file:
path: "{{ item.path }}"
state: absent
loop: "{{ backup_files.files }}"
when: backup_files.files | length > 0
- name: Display the cleanup result
debug:
msg: "Deleted {{ backup_files.files | length }} backup file(s)"

View File

@ -1,89 +0,0 @@
---
- name: Clear all aliases on hcp1 and hcp2
hosts: hcp1,hcp2
become: yes
tasks:
- name: Check current aliases
shell: alias || echo "No aliases found"
register: current_aliases
- name: Display current aliases
debug:
msg: "Current aliases: {{ current_aliases.stdout_lines }}"
- name: Clear aliases from /root/.bashrc
shell: |
sed -i '/^alias /d' /root/.bashrc
sed -i '/^alias\t/d' /root/.bashrc
ignore_errors: yes
- name: Clear aliases from /root/.profile
shell: |
sed -i '/^alias /d' /root/.profile
sed -i '/^alias\t/d' /root/.profile
ignore_errors: yes
- name: Clear aliases from /root/.zshrc
shell: |
sed -i '/^alias /d' /root/.zshrc
sed -i '/^alias\t/d' /root/.zshrc
ignore_errors: yes
- name: Clear aliases from /etc/bash.bashrc
shell: |
sed -i '/^alias /d' /etc/bash.bashrc
sed -i '/^alias\t/d' /etc/bash.bashrc
ignore_errors: yes
- name: Clear aliases from /etc/profile
shell: |
sed -i '/^alias /d' /etc/profile
sed -i '/^alias\t/d' /etc/profile
ignore_errors: yes
- name: Find and clear custom alias files
find:
paths: ["/root", "/etc", "/home"]
patterns: ["*.aliases", ".aliases", "aliases"]
recurse: yes
register: alias_files
- name: Remove found alias files
file:
path: "{{ item.path }}"
state: absent
loop: "{{ alias_files.files }}"
when: alias_files.files is defined
- name: Clear aliases from /etc/profile.d/aliases.sh
ansible.builtin.file:
path: /etc/profile.d/aliases.sh
state: absent
- name: Clear aliases from /root/.bashrc
ansible.builtin.lineinfile:
path: /root/.bashrc
state: absent
regexp: "^alias "
- name: Clear aliases from /root/.bash_aliases
ansible.builtin.file:
path: /root/.bash_aliases
state: absent
- name: Clear history
ansible.builtin.command:
cmd: truncate -s 0 /root/.bash_history
- name: Restart shell to apply changes
ansible.builtin.shell:
cmd: pkill -f bash || true
- name: Test network connectivity after clearing aliases
shell: ping -c 2 8.8.8.8 || echo "Ping failed"
register: ping_test
- name: Display ping test result
debug:
msg: "Ping test: {{ ping_test.stdout_lines }}"

View File

@ -1,32 +0,0 @@
---
- name: Remove all aliases from user shell configuration files
hosts: all
become: yes
gather_facts: false
tasks:
- name: Find all relevant shell configuration files
find:
paths: /home
patterns: .bashrc, .bash_aliases, .profile
register: shell_config_files
- name: Remove aliases from shell configuration files
replace:
path: "{{ item.path }}"
regexp: '^alias .*'
replace: ''
loop: "{{ shell_config_files.files }}"
when: shell_config_files.files is defined
- name: Remove functions from shell configuration files
replace:
path: "{{ item.path }}"
regexp: '^function .*'
replace: ''
loop: "{{ shell_config_files.files }}"
when: shell_config_files.files is defined
- name: Display completion message
debug:
msg: "All aliases and functions have been removed from user shell configuration files."

View File

@ -1,47 +0,0 @@
---
- name: Clear proxy settings from the system
hosts: all
become: yes
gather_facts: false
tasks:
- name: Remove proxy environment file
file:
path: /root/mgmt/configuration/proxy.env
state: absent
ignore_errors: yes
- name: Unset proxy environment variables
shell: |
unset http_proxy
unset https_proxy
unset HTTP_PROXY
unset HTTPS_PROXY
unset no_proxy
unset NO_PROXY
unset ALL_PROXY
unset all_proxy
unset DOCKER_BUILDKIT
unset BUILDKIT_PROGRESS
unset GIT_HTTP_PROXY
unset GIT_HTTPS_PROXY
unset CURL_PROXY
unset WGET_PROXY
ignore_errors: yes
- name: Remove proxy settings from /etc/environment
lineinfile:
path: /etc/environment
state: absent
regexp: '^(http_proxy|https_proxy|no_proxy|ALL_PROXY|DOCKER_BUILDKIT|BUILDKIT_PROGRESS|GIT_HTTP_PROXY|GIT_HTTPS_PROXY|CURL_PROXY|WGET_PROXY)='
ignore_errors: yes
- name: Remove proxy settings from /etc/apt/apt.conf.d/proxy.conf
file:
path: /etc/apt/apt.conf.d/proxy.conf
state: absent
ignore_errors: yes
- name: Display completion message
debug:
msg: "Proxy settings have been cleared from the system."

View File

@ -1,76 +0,0 @@
---
- name: Clear proxy settings on hcp1 and hcp2
hosts: hcp1,hcp2
become: yes
tasks:
- name: Check current proxy environment variables
shell: env | grep -i proxy || echo "No proxy vars found"
register: proxy_env_before
- name: Display current proxy settings
debug:
msg: "Current proxy env: {{ proxy_env_before.stdout_lines }}"
- name: Clear proxy from /etc/environment
lineinfile:
path: /etc/environment
regexp: "{{ item }}"
state: absent
loop:
- "^http_proxy="
- "^https_proxy="
- "^HTTP_PROXY="
- "^HTTPS_PROXY="
- "^ftp_proxy="
- "^FTP_PROXY="
- "^no_proxy="
- "^NO_PROXY="
- name: Clear proxy from /etc/apt/apt.conf.d/
file:
path: "{{ item }}"
state: absent
loop:
- /etc/apt/apt.conf.d/95proxies
- /etc/apt/apt.conf.d/proxy.conf
- /etc/apt/apt.conf.d/00proxy
- name: Clear proxy from user profiles
lineinfile:
path: "{{ item }}"
regexp: ".*proxy.*"
state: absent
loop:
- /root/.bashrc
- /root/.profile
- /home/root/.bashrc
- /home/root/.profile
ignore_errors: yes
- name: Unset proxy variables in current session
shell: |
unset http_proxy
unset https_proxy
unset HTTP_PROXY
unset HTTPS_PROXY
unset ftp_proxy
unset FTP_PROXY
unset no_proxy
unset NO_PROXY
- name: Check APT proxy configuration
shell: apt-config dump | grep -i proxy || echo "No APT proxy found"
register: apt_proxy_check
- name: Display APT proxy status
debug:
msg: "APT proxy config: {{ apt_proxy_check.stdout_lines }}"
- name: Test direct connection to HashiCorp
shell: curl -I --connect-timeout 10 https://releases.hashicorp.com/ || echo "Connection failed"
register: connection_test
- name: Display connection test result
debug:
msg: "Connection test: {{ connection_test.stdout_lines }}"

View File

@ -1,25 +0,0 @@
---
- name: Ensure nomad user and plugin directory exist
hosts: nomad_clients
become: yes
tasks:
- name: Ensure nomad group exists
group:
name: nomad
state: present
- name: Ensure nomad user exists
user:
name: nomad
group: nomad
shell: /usr/sbin/nologin
system: yes
create_home: no
- name: Ensure plugin directory exists with correct ownership
file:
path: /opt/nomad/data/plugins
state: directory
owner: nomad
group: nomad
mode: '0755'

View File

@ -1,105 +0,0 @@
---
- name: Final Podman Permission Fix for Nomad
hosts: all
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Install podman for nomad user (system-wide)
package:
name: podman
state: present
- name: Enable podman socket for nomad user
systemd:
name: podman.socket
enabled: yes
state: started
scope: system
daemon_reload: yes
- name: Create nomad user podman configuration directory
file:
path: /home/nomad/.config/containers
state: directory
owner: nomad
group: nomad
mode: '0755'
recurse: yes
- name: Configure podman for nomad user to use system socket
copy:
content: |
[containers]
[engine]
remote = true
[service_destinations]
[service_destinations.system]
uri = "unix:///run/podman/podman.sock"
dest: /home/nomad/.config/containers/containers.conf
owner: nomad
group: nomad
mode: '0644'
- name: Update Nomad configuration to use system podman socket
replace:
path: /etc/nomad.d/nomad.hcl
regexp: 'socket_path = "unix:///run/user/1001/podman/podman.sock"'
replace: 'socket_path = "unix:///run/podman/podman.sock"'
- name: Add nomad user to necessary groups
user:
name: nomad
groups:
- podman
append: yes
- name: Create podman group if it doesn't exist
group:
name: podman
state: present
- name: Set proper permissions on system podman socket directory
file:
path: /run/podman
state: directory
mode: '0755'
group: podman
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
timeout: 60
- name: Wait for plugins to load
pause:
seconds: 20
- name: Final verification - Check driver status
shell: sudo -u nomad /usr/local/bin/nomad node status -self | grep -A 10 "Driver Status"
register: final_driver_status
failed_when: false
- name: Display final driver status
debug:
var: final_driver_status.stdout_lines
- name: Test podman access for nomad user
shell: sudo -u nomad podman version
register: podman_test
failed_when: false
- name: Display podman test result
debug:
var: podman_test.stdout_lines

View File

@ -1,12 +0,0 @@
---
- name: Get Tailscale IP for specified nodes
hosts: all
gather_facts: no
tasks:
- name: Get tailscale IP
shell: "tailscale ip -4"
register: tailscale_ip
- name: Display Tailscale IP
debug:
msg: "Node {{ inventory_hostname }} has IP: {{ tailscale_ip.stdout }}"

View File

@ -1,67 +0,0 @@
---
- name: Force-upgrade Podman to the latest version
hosts: warden
become: yes
gather_facts: yes
tasks:
- name: Check the current Podman version
shell: podman --version
register: current_podman_version
ignore_errors: yes
- name: Display the current version
debug:
msg: "Version before upgrade: {{ current_podman_version.stdout if current_podman_version.rc == 0 else 'not installed' }}"
- name: Remove the existing Podman
shell: apt-get remove -y --purge podman* containerd* runc*
ignore_errors: yes
- name: Clean up leftover configuration
shell: |
rm -rf /etc/containers
rm -rf /usr/share/containers
rm -rf /var/lib/containers
ignore_errors: yes
- name: Download and install the latest Podman binary directly
shell: |
# Remove any old downloads and binaries
rm -f /tmp/podman-latest.tar.gz
rm -f /usr/local/bin/podman
# Pin the release version
LATEST_VERSION="v5.6.1" # hard-coded latest version to avoid network lookups
echo "Installing version: $LATEST_VERSION"
# Download the binary via the GitHub mirror
echo "Downloading via GitHub mirror..."
wget -O /tmp/podman-latest.tar.gz "https://gh.git4ta.fun/github.com/containers/podman/releases/download/${LATEST_VERSION}/podman-linux-static-amd64.tar.gz"
# If the mirror download failed, fall back to downloading directly
if [ ! -f /tmp/podman-latest.tar.gz ]; then
echo "Mirror download failed, trying direct download..."
wget -O /tmp/podman-latest.tar.gz "https://github.com/containers/podman/releases/download/${LATEST_VERSION}/podman-linux-static-amd64.tar.gz"
fi
# Extract and install
tar -xzf /tmp/podman-latest.tar.gz -C /usr/local/bin/ --strip-components=1
chmod +x /usr/local/bin/podman
# Update PATH
echo 'export PATH=/usr/local/bin:$PATH' >> /etc/profile
. /etc/profile
# Verify the installation
/usr/local/bin/podman --version
ignore_errors: yes
- name: Verify the installation result
shell: podman --version
register: new_podman_version
ignore_errors: yes
- name: Display the final version
debug:
msg: "Version after upgrade: {{ new_podman_version.stdout if new_podman_version.rc == 0 else 'installation failed' }}"

View File

@ -1,218 +0,0 @@
---
- name: Integrated Podman Setup - Remove Docker, Install and Configure Podman with Compose for Nomad
hosts: all
become: yes
gather_facts: yes
tasks:
- name: Display the node being processed
debug:
msg: "🔧 Starting integrated Podman setup on: {{ inventory_hostname }}"
- name: Check Docker service status
shell: systemctl is-active docker 2>/dev/null || echo "inactive"
register: docker_status
changed_when: false
- name: Stop the Docker service
systemd:
name: docker
state: stopped
enabled: no
ignore_errors: yes
when: docker_status.stdout == "active"
- name: Stop the Docker socket
systemd:
name: docker.socket
state: stopped
enabled: no
ignore_errors: yes
- name: Remove Docker-related packages
apt:
name:
- docker-ce
- docker-ce-cli
- containerd.io
- docker-buildx-plugin
- docker-compose-plugin
- docker.io
- docker-doc
- docker-compose
- docker-registry
- containerd
- runc
state: absent
purge: yes
ignore_errors: yes
- name: Clean up Docker data directories
file:
path: "{{ item }}"
state: absent
loop:
- /var/lib/docker
- /var/lib/containerd
- /etc/docker
- /etc/containerd
ignore_errors: yes
- name: Remove the docker group
group:
name: docker
state: absent
ignore_errors: yes
- name: Update the package cache
apt:
update_cache: yes
cache_valid_time: 3600
- name: Install Podman and related tools
apt:
name:
- podman
- buildah
- skopeo
- python3-pip
- python3-setuptools
state: present
retries: 3
delay: 10
- name: Install podman-compose via pip
pip:
name: podman-compose
state: present
ignore_errors: yes
- name: Enable the Podman socket service
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes
- name: Create the Podman user service directory
file:
path: /etc/systemd/user
state: directory
mode: '0755'
- name: Verify the Podman installation
shell: podman --version
register: podman_version
- name: Verify the podman-compose installation
shell: podman-compose --version 2>/dev/null || echo "not installed"
register: podman_compose_version
- name: Check the Docker cleanup status
shell: systemctl is-active docker 2>/dev/null || echo "removed"
register: final_docker_status
- name: Display Docker removal and Podman installation results
debug:
msg: |
✅ Node {{ inventory_hostname }}: Docker removal and Podman installation complete
🐳 Docker status: {{ final_docker_status.stdout }}
📦 Podman version: {{ podman_version.stdout }}
🔧 Compose status: {{ podman_compose_version.stdout }}
- name: Create the Podman system configuration directory
file:
path: /etc/containers
state: directory
mode: '0755'
- name: Configure Podman to use the system socket
copy:
content: |
[engine]
# Use the system-level socket instead of the per-user socket
active_service = "system"
[engine.service_destinations]
[engine.service_destinations.system]
uri = "unix:///run/podman/podman.sock"
dest: /etc/containers/containers.conf
mode: '0644'
- name: Check whether the nomad user exists
getent:
database: passwd
key: nomad
register: nomad_user_check
ignore_errors: yes
- name: Create the configuration directory for the nomad user
file:
path: "/home/nomad/.config/containers"
state: directory
owner: nomad
group: nomad
mode: '0755'
when: nomad_user_check is succeeded
- name: Configure Podman for the nomad user
copy:
content: |
[engine]
active_service = "system"
[engine.service_destinations]
[engine.service_destinations.system]
uri = "unix:///run/podman/podman.sock"
dest: /home/nomad/.config/containers/containers.conf
owner: nomad
group: nomad
mode: '0644'
when: nomad_user_check is succeeded
- name: Add the nomad user to the podman group
user:
name: nomad
groups: podman
append: yes
when: nomad_user_check is succeeded
ignore_errors: yes
- name: Create the podman group (if it does not exist)
group:
name: podman
state: present
ignore_errors: yes
- name: Set permissions on the podman socket directory
file:
path: /run/podman
state: directory
mode: '0755'
group: podman
ignore_errors: yes
- name: Verify Podman socket permissions
file:
path: /run/podman/podman.sock
mode: '0666'
when: nomad_user_check is succeeded
ignore_errors: yes
- name: Test Podman functionality
shell: podman info
register: podman_info
ignore_errors: yes
- name: Clean up the apt cache
apt:
autoclean: yes
autoremove: yes
- name: Display the final configuration results
debug:
msg: |
🎉 Node {{ inventory_hostname }}: integrated Podman setup complete!
📦 Podman version: {{ podman_version.stdout }}
🐳 Podman Compose: {{ podman_compose_version.stdout }}
👤 Nomad user: {{ 'FOUND' if nomad_user_check is succeeded else 'NOT FOUND' }}
🔧 Podman status: {{ 'SUCCESS' if podman_info.rc == 0 else 'WARNING' }}
🚀 Docker removed; Podman configured to integrate with Nomad

View File

@ -1,167 +0,0 @@
---
- name: Migrate Nomad from Docker to Podman (Simple Version)
hosts: all
become: yes
vars:
nomad_user: nomad
nomad_config_dir: /etc/nomad.d
nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Backup current Nomad configuration
copy:
src: "{{ nomad_config_file }}"
dest: "{{ nomad_config_file }}.backup-{{ ansible_date_time.epoch }}"
remote_src: yes
- name: Get nomad user info
getent:
database: passwd
key: "{{ nomad_user }}"
register: nomad_user_info
- name: Set nomad user UID variable
set_fact:
nomad_uid: "{{ nomad_user_info.ansible_facts.getent_passwd[nomad_user][1] }}"
- name: Enable lingering for nomad user
command: loginctl enable-linger {{ nomad_user }}
failed_when: false
- name: Create runtime directory for nomad user
file:
path: "/run/user/{{ nomad_uid }}"
state: directory
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0700'
- name: Start Podman socket as nomad user
shell: |
sudo -u {{ nomad_user }} XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} systemctl --user enable --now podman.socket
args:
creates: "/run/user/{{ nomad_uid }}/podman/podman.sock"
- name: Create new Nomad configuration with Podman
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "0.0.0.0"
client {
enabled = true
servers = [
"100.116.158.95:4647",
]
}
# Docker plugin (disabled)
# plugin "docker" {
# config {
# allow_privileged = true
# volumes {
# enabled = true
# }
# }
# }
plugin "podman" {
config {
socket_path = "unix:///run/user/{{ nomad_uid }}/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "127.0.0.1:8500"
}
dest: "{{ nomad_config_file }}"
owner: root
group: root
mode: '0644'
- name: Update Nomad systemd service to run as nomad user
copy:
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
Wants=network-online.target
[Service]
Type=notify
User={{ nomad_user }}
Group={{ nomad_user }}
ExecReload=/bin/kill -HUP $MAINPID
ExecStart=/usr/local/bin/nomad agent -config={{ nomad_config_dir }}
KillMode=process
Restart=on-failure
LimitNOFILE=65536
Environment=XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }}
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/nomad.service
owner: root
group: root
mode: '0644'
- name: Reload systemd daemon
systemd:
daemon_reload: yes
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready (local check)
wait_for:
port: 4646
host: localhost
delay: 5
timeout: 60
- name: Verify Nomad is running
shell: systemctl is-active nomad
register: nomad_status
- name: Display Nomad status
debug:
msg: "Nomad service status: {{ nomad_status.stdout }}"
- name: Check Podman socket
stat:
path: "/run/user/{{ nomad_uid }}/podman/podman.sock"
register: podman_socket
- name: Display Podman socket status
debug:
msg: "Podman socket exists: {{ podman_socket.stat.exists }}"
- name: Test Podman as nomad user
shell: |
sudo -u {{ nomad_user }} XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} podman version --format json
register: podman_test
failed_when: false
- name: Display Podman test result
debug:
msg: |
Podman test: {{ 'SUCCESS' if podman_test.rc == 0 else 'FAILED' }}
{% if podman_test.rc != 0 %}
Error: {{ podman_test.stderr }}
{% endif %}

View File

@ -1,7 +0,0 @@
---
- name: Ping nodes to check connectivity
hosts: all
gather_facts: no
tasks:
- name: Ping the host
ping:

View File

@ -1,13 +0,0 @@
---
- name: Read Nomad config file
hosts: localhost
gather_facts: no
tasks:
- name: Read nomad.hcl
slurp:
src: /etc/nomad.d/nomad.hcl
register: nomad_config
- name: Display Nomad config
debug:
msg: "{{ nomad_config['content'] | b64decode }}"

View File

@ -1,39 +0,0 @@
---
- name: Restart Tailscale to fix DNS issues
hosts: hcp1,hcp2
become: yes
tasks:
- name: Check current DNS configuration
shell: cat /etc/resolv.conf
register: dns_before
- name: Display current DNS config
debug:
msg: "Current DNS config: {{ dns_before.stdout_lines }}"
- name: Restart tailscaled service
systemd:
name: tailscaled
state: restarted
- name: Wait for tailscale to stabilize
wait_for:
timeout: 10
- name: Check DNS configuration after restart
shell: cat /etc/resolv.conf
register: dns_after
- name: Display new DNS config
debug:
msg: "New DNS config: {{ dns_after.stdout_lines }}"
- name: Test DNS resolution
shell: nslookup apt.releases.hashicorp.com
register: dns_test
ignore_errors: yes
- name: Display DNS test result
debug:
msg: "DNS test result: {{ dns_test.stdout_lines }}"

View File

@ -1,37 +0,0 @@
---
- name: Update Nomad config to run as a client
hosts: localhost
gather_facts: no
become: yes
tasks:
- name: Create new nomad.hcl
copy:
dest: /etc/nomad.d/nomad.hcl
content: |
datacenter = "dc1"
data_dir = "/opt/nomad/data"
log_level = "INFO"
bind_addr = "100.116.158.95"
server {
enabled = false
}
client {
enabled = true
servers = ["100.81.26.3:4647", "100.103.147.94:4647", "100.90.159.68:4647"]
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "100.116.158.95:8500"
}

View File

@ -1,77 +0,0 @@
---
- name: Upgrade Podman to the latest version (test on the warden node)
hosts: warden
become: yes
gather_facts: yes
tasks:
- name: Check the current Podman version
shell: podman --version
register: current_podman_version
ignore_errors: yes
- name: Display the current version
debug:
msg: "Current Podman version: {{ current_podman_version.stdout if current_podman_version.rc == 0 else 'not installed or unavailable' }}"
- name: Back up the existing Podman configuration
shell: |
if [ -d /etc/containers ]; then
cp -r /etc/containers /etc/containers.backup.$(date +%Y%m%d)
fi
if [ -d /usr/share/containers ]; then
cp -r /usr/share/containers /usr/share/containers.backup.$(date +%Y%m%d)
fi
ignore_errors: yes
- name: Add the Kubic repository (HTTP, signature checks skipped)
shell: |
# Add the repository and skip signature verification
echo "deb [trusted=yes] http://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_22.04/ /" > /etc/apt/sources.list.d/kubic-containers.list
- name: Update the package lists (skipping signature verification)
shell: apt-get update -o Acquire::AllowInsecureRepositories=true -o Acquire::AllowDowngradeToInsecureRepositories=true
- name: Check the Podman versions available in the repository
shell: apt-cache policy podman
register: podman_versions
- name: Display the available Podman versions
debug:
msg: "{{ podman_versions.stdout }}"
- name: Install Podman 5.x (forcing signature checks to be skipped)
shell: apt-get install -y --allow-unauthenticated --allow-downgrades --allow-remove-essential --allow-change-held-packages podman
- name: Verify the Podman 5.x installation
shell: |
podman --version
podman info --format json | jq -r '.Version.Version'
register: podman_5_verify
- name: Display the upgrade results
debug:
msg: |
✅ Podman upgrade complete
🚀 New version: {{ podman_5_verify.stdout_lines[0] }}
📊 Detailed version: {{ podman_5_verify.stdout_lines[1] }}
- name: Test basic functionality
shell: |
podman run --rm hello-world
register: podman_test
ignore_errors: yes
- name: Display the test result
debug:
msg: "Podman functional test: {{ 'succeeded' if podman_test.rc == 0 else 'failed - ' + podman_test.stderr }}"
- name: Check the status of related services
shell: |
systemctl status podman.socket 2>/dev/null || echo "podman.socket is not running"
systemctl status containerd 2>/dev/null || echo "containerd is not running"
register: service_status
- name: Display service status
debug:
msg: "{{ service_status.stdout }}"

View File

@ -1,126 +0,0 @@
---
- name: Remove Docker and install Podman with Compose support
hosts: all
become: yes
gather_facts: yes
tasks:
- name: Display the node being processed
debug:
msg: "🔧 Processing node: {{ inventory_hostname }}"
- name: Check Docker service status
shell: systemctl is-active docker 2>/dev/null || echo "inactive"
register: docker_status
changed_when: false
- name: Stop the Docker service
systemd:
name: docker
state: stopped
enabled: no
ignore_errors: yes
when: docker_status.stdout == "active"
- name: Stop the Docker socket
systemd:
name: docker.socket
state: stopped
enabled: no
ignore_errors: yes
- name: Remove Docker-related packages
apt:
name:
- docker-ce
- docker-ce-cli
- containerd.io
- docker-buildx-plugin
- docker-compose-plugin
- docker.io
- docker-doc
- docker-compose
- docker-registry
- containerd
- runc
state: absent
purge: yes
ignore_errors: yes
- name: Clean up Docker data directories
file:
path: "{{ item }}"
state: absent
loop:
- /var/lib/docker
- /var/lib/containerd
- /etc/docker
- /etc/containerd
ignore_errors: yes
- name: Remove the docker group
group:
name: docker
state: absent
ignore_errors: yes
- name: Update the package cache
apt:
update_cache: yes
cache_valid_time: 3600
- name: Install Podman and related tools
apt:
name:
- podman
- buildah
- skopeo
- python3-pip
- python3-setuptools
state: present
retries: 3
delay: 10
- name: Install podman-compose via pip
pip:
name: podman-compose
state: present
ignore_errors: yes
- name: Enable the Podman socket service
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes
- name: Create the Podman user service directory
file:
path: /etc/systemd/user
state: directory
mode: '0755'
- name: Verify the Podman installation
shell: podman --version
register: podman_version
- name: Verify the podman-compose installation
shell: podman-compose --version 2>/dev/null || echo "not installed"
register: podman_compose_version
- name: Check the Docker cleanup status
shell: systemctl is-active docker 2>/dev/null || echo "removed"
register: final_docker_status
- name: Display the node processing results
debug:
msg: |
✅ Node {{ inventory_hostname }} processing complete
🐳 Docker status: {{ final_docker_status.stdout }}
📦 Podman version: {{ podman_version.stdout }}
🔧 Compose status: {{ podman_compose_version.stdout }}
- name: Clean up the apt cache
apt:
autoclean: yes
autoremove: yes

View File

@ -1,120 +0,0 @@
---
- name: Remove Docker and install Podman - new server nodes
hosts: ash2e,ash1d,ch2
become: yes
gather_facts: no
serial: 1 # Process one node at a time to avoid concurrency conflicts
tasks:
- name: Display the node being processed
debug:
msg: "🔧 Processing node: {{ inventory_hostname }}"
- name: Check Docker service status
shell: systemctl is-active docker 2>/dev/null || echo "inactive"
register: docker_status
changed_when: false
- name: Stop the Docker service
systemd:
name: docker
state: stopped
enabled: no
ignore_errors: yes
when: docker_status.stdout == "active"
- name: Stop the Docker socket
systemd:
name: docker.socket
state: stopped
enabled: no
ignore_errors: yes
- name: Remove Docker-related packages
apt:
name:
- docker-ce
- docker-ce-cli
- containerd.io
- docker-buildx-plugin
- docker-compose-plugin
- docker.io
- docker-doc
- docker-compose
- docker-registry
- containerd
- runc
state: absent
purge: yes
ignore_errors: yes
- name: Clean up Docker data directories
file:
path: "{{ item }}"
state: absent
loop:
- /var/lib/docker
- /var/lib/containerd
- /etc/docker
- /etc/containerd
ignore_errors: yes
- name: Remove the docker group
group:
name: docker
state: absent
ignore_errors: yes
- name: Update the package cache
apt:
update_cache: yes
cache_valid_time: 3600
- name: Install Podman and related tools
apt:
name:
- podman
- buildah
- skopeo
- podman-compose
state: present
retries: 3
delay: 10
- name: Enable the Podman socket service
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes
- name: Create the Podman user service directory
file:
path: /etc/systemd/user
state: directory
mode: '0755'
- name: Verify the Podman installation
shell: podman --version
register: podman_version
- name: Verify the podman-compose installation
shell: podman-compose --version 2>/dev/null || echo "not installed"
register: podman_compose_version
- name: Check the Docker cleanup status
shell: systemctl is-active docker 2>/dev/null || echo "removed"
register: final_docker_status
- name: Display the node processing results
debug:
msg: |
✅ Node {{ inventory_hostname }} processing complete
🐳 Docker status: {{ final_docker_status.stdout }}
📦 Podman version: {{ podman_version.stdout }}
🔧 Compose status: {{ podman_compose_version.stdout }}
- name: Clean up the apt cache
apt:
autoclean: yes
autoremove: yes

View File

@ -1,69 +0,0 @@
---
- name: Add Beijing prefix to LXC node names in Nomad configuration
hosts: beijing
become: yes
vars:
node_prefixes:
influxdb: "bj-influxdb"
warden: "bj-warden"
hcp1: "bj-hcp1"
hcp2: "bj-hcp2"
tailscale_ips:
influxdb: "100.100.7.4"
warden: "100.122.197.112"
hcp1: "100.97.62.111"
hcp2: "100.116.112.45"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Get current node name from inventory
set_fact:
current_node_name: "{{ inventory_hostname }}"
new_node_name: "{{ node_prefixes[inventory_hostname] }}"
tailscale_ip: "{{ tailscale_ips[inventory_hostname] }}"
- name: Display node name change
debug:
msg: "Changing node name from {{ current_node_name }} to {{ new_node_name }}, using Tailscale IP {{ tailscale_ip }}"
- name: Update node name in Nomad configuration
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^name\s*='
line: 'name = "{{ new_node_name }}"'
insertafter: 'datacenter = "dc1"'
state: present
- name: Validate Nomad configuration
shell: nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
failed_when: config_validation.rc != 0
- name: Start Nomad service
systemd:
name: nomad
state: started
- name: Wait for Nomad to be ready on Tailscale IP
wait_for:
port: 4646
host: "{{ tailscale_ip }}"
delay: 10
timeout: 60
- name: Wait for node registration
pause:
seconds: 15
- name: Display new configuration
shell: cat /etc/nomad.d/nomad.hcl | grep -E "^(datacenter|name|bind_addr)\s*="
register: nomad_config_check
- name: Show updated configuration
debug:
var: nomad_config_check.stdout_lines

View File

@ -1,56 +0,0 @@
---
- name: Fix duplicate plugin_dir configuration
hosts: nomadlxc,hcp
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Remove duplicate plugin_dir lines
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^plugin_dir = "/opt/nomad/plugins"'
state: absent
- name: Ensure only one plugin_dir configuration exists
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^plugin_dir = "/opt/nomad/data/plugins"'
line: 'plugin_dir = "/opt/nomad/data/plugins"'
insertafter: 'data_dir = "/opt/nomad/data"'
state: present
- name: Validate Nomad configuration
shell: nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
failed_when: config_validation.rc != 0
- name: Start Nomad service
systemd:
name: nomad
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60
- name: Wait for plugins to load
pause:
seconds: 15
- name: Check driver status
shell: |
export NOMAD_ADDR=http://localhost:4646
nomad node status -self | grep -A 10 "Driver Status"
register: driver_status
failed_when: false
- name: Display driver status
debug:
var: driver_status.stdout_lines

View File

@ -1,112 +0,0 @@
---
- name: Fix Nomad Podman Driver Configuration
hosts: nomadlxc,hcp
become: yes
vars:
nomad_user: nomad
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Install Podman driver plugin if missing
block:
- name: Check if plugin exists
stat:
path: /opt/nomad/data/plugins/nomad-driver-podman
register: plugin_exists
- name: Download and install Podman driver plugin
block:
- name: Download Nomad Podman driver
get_url:
url: "https://releases.hashicorp.com/nomad-driver-podman/0.6.1/nomad-driver-podman_0.6.1_linux_amd64.zip"
dest: "/tmp/nomad-driver-podman.zip"
mode: '0644'
- name: Extract Podman driver
unarchive:
src: "/tmp/nomad-driver-podman.zip"
dest: "/tmp"
remote_src: yes
- name: Install Podman driver
copy:
src: "/tmp/nomad-driver-podman"
dest: "/opt/nomad/data/plugins/nomad-driver-podman"
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
remote_src: yes
- name: Clean up temporary files
file:
path: "{{ item }}"
state: absent
loop:
- "/tmp/nomad-driver-podman.zip"
- "/tmp/nomad-driver-podman"
when: not plugin_exists.stat.exists
- name: Update Nomad configuration with correct plugin name and socket path
replace:
path: /etc/nomad.d/nomad.hcl
regexp: 'plugin "podman" \{'
replace: 'plugin "nomad-driver-podman" {'
- name: Update socket path to system socket
replace:
path: /etc/nomad.d/nomad.hcl
regexp: 'socket_path = "unix:///run/user/1001/podman/podman.sock"'
replace: 'socket_path = "unix:///run/podman/podman.sock"'
- name: Add plugin_dir configuration if missing
lineinfile:
path: /etc/nomad.d/nomad.hcl
line: 'plugin_dir = "/opt/nomad/data/plugins"'
insertafter: 'data_dir = "/opt/nomad/data"'
state: present
- name: Ensure Podman socket is enabled and running
systemd:
name: podman.socket
enabled: yes
state: started
- name: Start Nomad service
systemd:
name: nomad
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60
- name: Wait for plugins to load
pause:
seconds: 20
- name: Check driver status
shell: |
export NOMAD_ADDR=http://localhost:4646
nomad node status -self | grep -A 10 "Driver Status"
register: driver_status
failed_when: false
- name: Display driver status
debug:
var: driver_status.stdout_lines
- name: Check for Podman driver in logs
shell: journalctl -u nomad -n 30 --no-pager | grep -E "(podman|plugin)" | tail -10
register: plugin_logs
failed_when: false
- name: Display plugin logs
debug:
var: plugin_logs.stdout_lines

View File

@ -1,46 +0,0 @@
---
- name: Fix NFS mounting on warden node
hosts: warden
become: yes
tasks:
- name: Ensure rpcbind is running
systemd:
name: rpcbind
state: started
enabled: yes
- name: Ensure nfs-client.target is active
systemd:
name: nfs-client.target
state: started
enabled: yes
- name: Create consul-shared directory
file:
path: /opt/consul-shared
state: directory
mode: '0755'
- name: Mount NFS share
mount:
path: /opt/consul-shared
src: snail:/fs/1000/nfs
fstype: nfs
opts: rw,sync,vers=3
state: mounted
- name: Add to fstab for persistence
mount:
path: /opt/consul-shared
src: snail:/fs/1000/nfs
fstype: nfs
opts: rw,sync,vers=3
state: present
- name: Verify mount
command: df -h /opt/consul-shared
register: mount_result
- name: Display mount result
debug:
var: mount_result.stdout
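For a one-off repair on warden, a hand-run equivalent of the tasks above (same server, export path, and mount options; a sketch, not part of the playbook) would be roughly:
sudo systemctl start rpcbind nfs-client.target
sudo mkdir -p /opt/consul-shared
sudo mount -t nfs -o rw,sync,vers=3 snail:/fs/1000/nfs /opt/consul-shared
df -h /opt/consul-shared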

View File

@ -1,82 +0,0 @@
---
- name: Setup NFS for different container types
hosts: all
become: yes
vars:
nfs_server: snail
nfs_export_path: /fs/1000/nfs/Fnsync
nfs_mount_path: /mnt/fnsync
nfs_options_local: "rw,sync,vers=4.2"
nfs_options_overseas: "rw,sync,vers=3,timeo=600,retrans=2"
tasks:
- name: Detect container type and location
set_fact:
container_type: "{{ 'lxc' if inventory_hostname in groups['lxc'] else 'pve' }}"
is_overseas: "{{ inventory_hostname in ['ash1d', 'ash2e', 'ash3c', 'ch2', 'ch3'] }}"
- name: Install NFS client for all nodes
package:
name: nfs-common
state: present
- name: Create mount directory for all nodes
file:
path: "{{ nfs_mount_path }}"
state: directory
owner: root
group: root
mode: '0755'
- name: Mount NFS for local LXC containers (direct mount)
mount:
path: "{{ nfs_mount_path }}"
src: "{{ nfs_server }}:{{ nfs_export_path }}"
fstype: nfs
opts: "{{ nfs_options_local }}"
state: mounted
when: container_type == 'lxc' and not is_overseas
- name: Mount NFS for overseas PVE containers (with retry options)
mount:
path: "{{ nfs_mount_path }}"
src: "{{ nfs_server }}:{{ nfs_export_path }}"
fstype: nfs
opts: "{{ nfs_options_overseas }}"
state: mounted
when: container_type == 'pve' and is_overseas
- name: Ensure NFS mount persists after reboot
mount:
path: "{{ nfs_mount_path }}"
src: "{{ nfs_server }}:{{ nfs_export_path }}"
fstype: nfs
opts: "{{ nfs_options_local if container_type == 'lxc' and not is_overseas else nfs_options_overseas }}"
state: present
- name: Verify NFS mount
command: df -h "{{ nfs_mount_path }}"
register: mount_result
ignore_errors: yes
- name: Display mount status
debug:
msg: "{{ inventory_hostname }} - {{ container_type }} - {{ '海外' if is_overseas else '本地' }} - Mount: {{ '成功' if mount_result.rc == 0 else '失败' }}"
- name: Create Nomad directories for LXC containers
file:
path: "{{ nfs_mount_path }}/nomad/{{ inventory_hostname }}"
state: directory
owner: nomad
group: nomad
mode: '0755'
when: container_type == 'lxc'
- name: Create shared volumes directory for PVE containers
file:
path: "{{ nfs_mount_path }}/nomad/volumes/{{ inventory_hostname }}"
state: directory
owner: nomad
group: nomad
mode: '0755'
when: container_type == 'pve'
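A minimal spot check after a run, assuming the default variables above (server snail, mount point /mnt/fnsync):
df -h /mnt/fnsync
ls -ld /mnt/fnsync/nomad/$(hostname)   # LXC nodes; PVE nodes use nomad/volumes/<hostname>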

View File

@ -1,75 +0,0 @@
---
- name: Setup NFS Storage for Consul Cluster
hosts: localhost
gather_facts: false
vars:
nfs_server: snail
nfs_export_path: /fs/1000/nfs/Fnsync
nfs_mount_path: /mnt/fnsync
tasks:
- name: Install NFS client and mount on master
ansible.builtin.shell: |
ssh -o StrictHostKeyChecking=no -p 60022 ben@master '
echo "3131" | sudo -S apt update &&
echo "3131" | sudo -S apt install -y nfs-common &&
echo "3131" | sudo -S mkdir -p {{ nfs_mount_path }} &&
echo "3131" | sudo -S mount -t nfs {{ nfs_server }}:{{ nfs_export_path }} {{ nfs_mount_path }} &&
echo "{{ nfs_server }}:{{ nfs_export_path }} {{ nfs_mount_path }} nfs defaults 0 0" | echo "3131" | sudo -S tee -a /etc/fstab
'
delegate_to: localhost
register: master_result
- name: Install NFS client and mount on ash3c
ansible.builtin.shell: |
ssh -o StrictHostKeyChecking=no ben@ash3c '
echo "3131" | sudo -S apt update &&
echo "3131" | sudo -S apt install -y nfs-common &&
echo "3131" | sudo -S mkdir -p {{ nfs_mount_path }} &&
echo "3131" | sudo -S mount -t nfs {{ nfs_server }}:{{ nfs_export_path }} {{ nfs_mount_path }} &&
echo "{{ nfs_server }}:{{ nfs_export_path }} {{ nfs_mount_path }} nfs defaults 0 0" | echo "3131" | sudo -S tee -a /etc/fstab
'
delegate_to: localhost
register: ash3c_result
- name: Install NFS client and mount on warden
ansible.builtin.shell: |
ssh -o StrictHostKeyChecking=no ben@warden '
echo "3131" | sudo -S apt update &&
echo "3131" | sudo -S apt install -y nfs-common &&
echo "3131" | sudo -S mkdir -p {{ nfs_mount_path }} &&
echo "3131" | sudo -S mount -t nfs {{ nfs_server }}:{{ nfs_export_path }} {{ nfs_mount_path }} &&
echo "{{ nfs_server }}:{{ nfs_export_path }} {{ nfs_mount_path }} nfs defaults 0 0" | echo "3131" | sudo -S tee -a /etc/fstab
'
delegate_to: localhost
register: warden_result
- name: Test NFS connectivity on all nodes
ansible.builtin.shell: |
ssh -o StrictHostKeyChecking=no -p 60022 ben@master 'echo "3131" | sudo -S touch {{ nfs_mount_path }}/test-master-$(date +%s) && ls -la {{ nfs_mount_path }}/'
ssh -o StrictHostKeyChecking=no ben@ash3c 'echo "3131" | sudo -S touch {{ nfs_mount_path }}/test-ash3c-$(date +%s) && ls -la {{ nfs_mount_path }}/'
ssh -o StrictHostKeyChecking=no ben@warden 'echo "3131" | sudo -S touch {{ nfs_mount_path }}/test-warden-$(date +%s) && ls -la {{ nfs_mount_path }}/'
delegate_to: localhost
register: nfs_test_result
- name: Display NFS test results
ansible.builtin.debug:
var: nfs_test_result.stdout_lines
- name: Create Consul data directories on NFS
ansible.builtin.shell: |
ssh -o StrictHostKeyChecking=no -p 60022 ben@master 'echo "3131" | sudo -S mkdir -p {{ nfs_mount_path }}/consul-master'
ssh -o StrictHostKeyChecking=no ben@ash3c 'echo "3131" | sudo -S mkdir -p {{ nfs_mount_path }}/consul-ash3c'
ssh -o StrictHostKeyChecking=no ben@warden 'echo "3131" | sudo -S mkdir -p {{ nfs_mount_path }}/consul-warden'
delegate_to: localhost
register: consul_dirs_result
- name: Display setup completion
ansible.builtin.debug:
msg:
- "NFS setup completed successfully!"
- "NFS mount point: {{ nfs_mount_path }}"
- "Consul data directories created:"
- " - {{ nfs_mount_path }}/consul-master"
- " - {{ nfs_mount_path }}/consul-ash3c"
- " - {{ nfs_mount_path }}/consul-warden"

View File

@ -1,50 +0,0 @@
---
- name: Configure Nomad client for NFS volumes
hosts: nomad_clients
become: yes
vars:
nfs_mount_path: /mnt/fnsync
tasks:
- name: Create Nomad plugin directory for NFS
file:
path: /opt/nomad/plugins
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Configure Nomad client to use NFS volumes
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} NFS VOLUME CONFIGURATION"
block: |
plugin "nomad-driver-podman" {
config {
volumes {
enabled = true
}
}
}
client {
host_volume "nfs-shared" {
path = "{{ nfs_mount_path }}/nomad/volumes"
read_only = false
}
}
insertafter: 'data_dir = "/opt/nomad/data"'
- name: Restart Nomad service to apply changes
systemd:
name: nomad
state: restarted
- name: Verify Nomad client configuration
command: nomad node status -self
register: nomad_status
ignore_errors: yes
- name: Display Nomad status
debug:
msg: "{{ inventory_hostname }} - Nomad status: {{ '运行中' if nomad_status.rc == 0 else '异常' }}"

View File

@ -1,63 +0,0 @@
---
- name: Setup NFS Storage for Nomad Cluster
hosts: nomad_cluster
become: yes
vars:
nfs_server: snail
nfs_export_path: /fs/1000/nfs/Fnsync
nfs_mount_path: /mnt/fnsync
nfs_options: "rw,sync,vers=4.2"
tasks:
- name: Install NFS client packages
package:
name: nfs-common
state: present
- name: Create NFS mount directory
file:
path: "{{ nfs_mount_path }}"
state: directory
owner: root
group: root
mode: '0755'
- name: Mount NFS share
mount:
path: "{{ nfs_mount_path }}"
src: "{{ nfs_server }}:{{ nfs_export_path }}"
fstype: nfs
opts: "{{ nfs_options }}"
state: mounted
- name: Ensure NFS mount persists after reboot
mount:
path: "{{ nfs_mount_path }}"
src: "{{ nfs_server }}:{{ nfs_export_path }}"
fstype: nfs
opts: "{{ nfs_options }}"
state: present
- name: Verify NFS mount
command: df -h "{{ nfs_mount_path }}"
register: mount_result
- name: Display mount result
debug:
var: mount_result.stdout
- name: Create Nomad data directories on NFS
file:
path: "{{ nfs_mount_path }}/nomad/{{ inventory_hostname }}"
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Create shared volumes directory
file:
path: "{{ nfs_mount_path }}/nomad/volumes"
state: directory
owner: nomad
group: nomad
mode: '0755'
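A quick verification on any nomad_cluster host after this playbook, using the paths from the vars above:
df -h /mnt/fnsync
ls -ld /mnt/fnsync/nomad/volumes /mnt/fnsync/nomad/$(hostname)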

View File

@ -1,100 +0,0 @@
---
- name: Test switching Podman to the Snap version (ch2 node)
hosts: ch2
become: yes
gather_facts: yes
tasks:
- name: Check the current Podman version and how it was installed
shell: |
echo "=== 当前 Podman 信息 ==="
podman --version
echo "安装路径: $(which podman)"
echo "=== Snap 状态 ==="
which snap || echo "snap 未安装"
snap list podman 2>/dev/null || echo "Podman snap 未安装"
echo "=== 包管理器状态 ==="
dpkg -l | grep podman || echo "未通过 apt 安装"
register: current_status
- name: Display current status
debug:
msg: "{{ current_status.stdout }}"
- name: Check whether snap is installed
shell: which snap
register: snap_check
ignore_errors: yes
changed_when: false
- name: Install snapd (if not installed)
apt:
name: snapd
state: present
when: snap_check.rc != 0
- name: Ensure the snapd service is running
systemd:
name: snapd
state: started
enabled: yes
- name: Check the current Podman snap version
shell: snap info podman
register: snap_podman_info
ignore_errors: yes
- name: Display available Podman snap versions
debug:
msg: "{{ snap_podman_info.stdout if snap_podman_info.rc == 0 else 'unable to get snap podman info' }}"
- name: Stop currently running Podman services
systemd:
name: podman
state: stopped
ignore_errors: yes
- name: Remove Podman installed via the package manager
apt:
name: podman
state: absent
purge: yes
ignore_errors: yes
- name: Install the Podman snap (edge channel)
snap:
name: podman
state: present
classic: yes
channel: edge
- name: Create symlink (ensure the podman command is available)
file:
src: /snap/bin/podman
dest: /usr/local/bin/podman
state: link
force: yes
- name: Verify the Snap Podman installation
shell: |
/snap/bin/podman --version
which podman
register: snap_podman_verify
- name: Display installation result
debug:
msg: |
✅ Snap Podman installation complete
🚀 Version: {{ snap_podman_verify.stdout_lines[0] }}
📍 Path: {{ snap_podman_verify.stdout_lines[1] }}
- name: Test basic Podman functionality
shell: |
/snap/bin/podman version
/snap/bin/podman info --format json | jq -r '.host.arch'
register: podman_test
ignore_errors: yes
- name: Display test result
debug:
msg: "Podman test result: {{ podman_test.stdout if podman_test.rc == 0 else 'test failed' }}"

View File

@ -1,87 +0,0 @@
job "consul-cluster-arm64" {
datacenters = ["dc1"]
type = "service"
# Run only on the ARM64 nodes: master and ash3c
constraint {
attribute = "${attr.unique.hostname}"
operator = "regexp"
value = "(master|ash3c)"
}
group "consul" {
count = 2
# Ensure each node runs only one instance
constraint {
operator = "distinct_hosts"
value = "true"
}
network {
port "http" {
static = 8500
}
port "rpc" {
static = 8400
}
port "serf_lan" {
static = 8301
}
port "serf_wan" {
static = 8302
}
port "server" {
static = 8300
}
port "dns" {
static = 8600
}
}
task "consul" {
driver = "exec"
config {
command = "consul"
args = [
"agent",
"-server",
"-bootstrap-expect=2",
"-data-dir=/tmp/consul-cluster-data",
"-bind=${NOMAD_IP_serf_lan}",
"-client=0.0.0.0",
"-retry-join=100.117.106.136", # master Tailscale IP
"-retry-join=100.116.80.94", # ash3c Tailscale IP
"-ui-config-enabled=true",
"-log-level=INFO",
"-node=${node.unique.name}-consul",
"-datacenter=dc1"
]
}
artifact {
source = "https://releases.hashicorp.com/consul/1.17.0/consul_1.17.0_linux_arm64.zip"
destination = "local/"
}
resources {
cpu = 200
memory = 256
}
service {
name = "consul-cluster-arm64"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
port = "http"
interval = "10s"
timeout = "3s"
}
}
}
}
}
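With port 8500 published statically on both hosts, the two-node cluster can be probed from either address above once the job is running, for example:
curl -s http://100.117.106.136:8500/v1/status/leader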

View File

@ -1,88 +0,0 @@
job "consul-cluster" {
datacenters = ["dc1"]
type = "service"
# Run on three nodes: bj-warden, master, ash3c
constraint {
attribute = "${node.unique.name}"
operator = "regexp"
value = "(bj-warden|master|ash3c)"
}
group "consul" {
count = 3
# Ensure each node runs only one instance
constraint {
operator = "distinct_hosts"
value = "true"
}
network {
port "http" {
static = 8500
}
port "rpc" {
static = 8400
}
port "serf_lan" {
static = 8301
}
port "serf_wan" {
static = 8302
}
port "server" {
static = 8300
}
port "dns" {
static = 8600
}
}
task "consul" {
driver = "exec"
config {
command = "consul"
args = [
"agent",
"-server",
"-bootstrap-expect=3",
"-data-dir=/tmp/consul-cluster-data",
"-bind=${NOMAD_IP_serf_lan}",
"-client=0.0.0.0",
"-retry-join=100.122.197.112", # bj-warden Tailscale IP
"-retry-join=100.117.106.136", # master Tailscale IP
"-retry-join=100.116.80.94", # ash3c Tailscale IP
"-ui-config-enabled=true",
"-log-level=INFO",
"-node=${node.unique.name}-consul",
"-datacenter=dc1"
]
}
artifact {
source = "https://releases.hashicorp.com/consul/1.17.0/consul_1.17.0_linux_arm64.zip"
destination = "local/"
}
resources {
cpu = 200
memory = 256
}
service {
name = "consul-cluster"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
port = "http"
interval = "10s"
timeout = "3s"
}
}
}
}
}
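Once all three allocations are placed, leader election can be spot-checked against any of the bound Tailscale addresses above, for example:
curl -s http://100.117.106.136:8500/v1/status/leader
curl -s http://100.116.80.94:8500/v1/status/peers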

View File

@ -1,81 +0,0 @@
job "consul-cluster" {
datacenters = ["dc1"]
type = "service"
constraint {
attribute = "${node.unique.name}"
operator = "regexp"
value = "^(master|ash3c|semaphore)$"
}
group "consul" {
count = 3
network {
port "http" {
static = 8500
}
port "serf_lan" {
static = 8301
}
port "serf_wan" {
static = 8302
}
port "server" {
static = 8300
}
port "dns" {
static = 8600
}
}
service {
name = "consul"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
interval = "10s"
timeout = "2s"
}
}
task "consul" {
driver = "podman"
config {
image = "consul:1.15.4"
network_mode = "host"
args = [
"agent",
"-server",
"-bootstrap-expect=3",
"-ui",
"-data-dir=/consul/data",
"-config-dir=/consul/config",
"-bind={{ env \"attr.unique.network.ip-address\" }}",
"-client=0.0.0.0",
"-retry-join=100.117.106.136",
"-retry-join=100.116.80.94",
"-retry-join=100.116.158.95"
]
volumes = [
"consul-data:/consul/data",
"consul-config:/consul/config"
]
}
resources {
cpu = 500
memory = 512
}
env {
CONSUL_BIND_INTERFACE = "tailscale0"
}
}
}
}

View File

@ -7,7 +7,7 @@ job "consul-cluster-simple" {
constraint {
attribute = "${node.unique.name}"
value = "master"
value = "kr-master"
}
network {
@ -35,7 +35,7 @@ job "consul-cluster-simple" {
"-server",
"-bootstrap-expect=3",
"-data-dir=/opt/nomad/data/consul",
"-client=100.64.0.0/10",
"-client=100.117.106.136",
"-bind=100.117.106.136",
"-advertise=100.117.106.136",
"-retry-join=100.116.80.94",
@ -58,7 +58,7 @@ job "consul-cluster-simple" {
constraint {
attribute = "${node.unique.name}"
value = "ash3c"
value = "us-ash3c"
}
network {
@ -86,7 +86,7 @@ job "consul-cluster-simple" {
"-server",
"-bootstrap-expect=3",
"-data-dir=/opt/nomad/data/consul",
"-client=100.64.0.0/10",
"-client=100.116.80.94",
"-bind=100.116.80.94",
"-advertise=100.116.80.94",
"-retry-join=100.117.106.136",
@ -137,7 +137,7 @@ job "consul-cluster-simple" {
"-server",
"-bootstrap-expect=3",
"-data-dir=/opt/nomad/data/consul",
"-client=100.64.0.0/10",
"-client=100.122.197.112",
"-bind=100.122.197.112",
"-advertise=100.122.197.112",
"-retry-join=100.117.106.136",

View File

@ -1,190 +0,0 @@
job "consul-cluster-three-nodes" {
datacenters = ["dc1"]
type = "service"
group "consul-master" {
count = 1
constraint {
attribute = "${node.unique.name}"
value = "master"
}
network {
port "http" {
static = 8500
}
port "rpc" {
static = 8300
}
port "serf_lan" {
static = 8301
}
port "serf_wan" {
static = 8302
}
}
task "consul" {
driver = "exec"
config {
command = "consul"
args = [
"agent",
"-server",
"-bootstrap-expect=3",
"-data-dir=/opt/nomad/data/consul",
"-client=0.0.0.0",
"-bind=100.117.106.136",
"-advertise=100.117.106.136",
"-retry-join=100.116.80.94",
"-retry-join=100.122.197.112",
"-ui-config-enabled=true"
]
}
resources {
cpu = 300
memory = 512
}
service {
name = "consul-master"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
port = "http"
interval = "10s"
timeout = "3s"
}
}
}
}
group "consul-ash3c" {
count = 1
constraint {
attribute = "${node.unique.name}"
value = "ash3c"
}
network {
port "http" {
static = 8500
}
port "rpc" {
static = 8300
}
port "serf_lan" {
static = 8301
}
port "serf_wan" {
static = 8302
}
}
task "consul" {
driver = "exec"
config {
command = "consul"
args = [
"agent",
"-server",
"-bootstrap-expect=3",
"-data-dir=/opt/nomad/data/consul",
"-client=0.0.0.0",
"-bind=100.116.80.94",
"-advertise=100.116.80.94",
"-retry-join=100.117.106.136",
"-retry-join=100.122.197.112",
"-ui-config-enabled=true"
]
}
resources {
cpu = 300
memory = 512
}
service {
name = "consul-ash3c"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
port = "http"
interval = "10s"
timeout = "3s"
}
}
}
}
group "consul-warden" {
count = 1
constraint {
attribute = "${node.unique.name}"
value = "bj-warden"
}
network {
port "http" {
static = 8500
}
port "rpc" {
static = 8300
}
port "serf_lan" {
static = 8301
}
port "serf_wan" {
static = 8302
}
}
task "consul" {
driver = "exec"
config {
command = "consul"
args = [
"agent",
"-server",
"-bootstrap-expect=3",
"-data-dir=/opt/nomad/data/consul",
"-client=0.0.0.0",
"-bind=100.122.197.112",
"-advertise=100.122.197.112",
"-retry-join=100.117.106.136",
"-retry-join=100.116.80.94",
"-ui-config-enabled=true"
]
}
resources {
cpu = 300
memory = 512
}
service {
name = "consul-warden"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
port = "http"
interval = "10s"
timeout = "3s"
}
}
}
}
}

View File

@ -1,47 +0,0 @@
job "consul-single-member" {
datacenters = ["dc1"]
type = "service"
priority = 50
constraint {
attribute = "${node.unique.name}"
value = "warden"
}
group "consul" {
count = 1
task "consul" {
driver = "exec"
config {
command = "consul"
args = ["agent", "-dev", "-client=0.0.0.0", "-data-dir=/tmp/consul-data"]
}
resources {
cpu = 200
memory = 256
network {
mbits = 10
port "http" {
static = 8500
}
}
}
service {
name = "consul"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
port = "http"
interval = "10s"
timeout = "2s"
}
}
}
}
}

View File

@ -1,47 +0,0 @@
job "consul-single-member" {
datacenters = ["dc1"]
type = "service"
priority = 50
constraint {
attribute = "${node.unique.name}"
value = "warden"
}
group "consul" {
count = 1
task "consul" {
driver = "exec"
config {
command = "consul"
args = ["agent", "-dev", "-client=0.0.0.0", "-data-dir=/tmp/consul-data"]
}
resources {
cpu = 200
memory = 256
network {
mbits = 10
port "http" {
static = 8500
}
}
}
service {
name = "consul"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
port = "http"
interval = "10s"
timeout = "2s"
}
}
}
}
}

View File

@ -1,46 +0,0 @@
job "consul-test-warden" {
datacenters = ["dc1"]
type = "service"
constraint {
attribute = "${node.unique.name}"
value = "bj-warden"
}
group "consul" {
count = 1
network {
port "http" {
static = 8500
}
}
task "consul" {
driver = "exec"
config {
command = "consul"
args = ["agent", "-dev", "-client=0.0.0.0", "-data-dir=/tmp/consul-test"]
}
resources {
cpu = 200
memory = 256
}
service {
name = "consul-test"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
port = "http"
interval = "10s"
timeout = "2s"
}
}
}
}
}

View File

@ -1,46 +0,0 @@
job "consul-warden" {
datacenters = ["dc1"]
type = "service"
priority = 50
constraint {
attribute = "${node.unique.name}"
value = "warden"
}
group "consul" {
count = 1
task "consul" {
driver = "exec"
config {
command = "consul"
args = ["agent", "-dev", "-client=0.0.0.0", "-data-dir=/tmp/consul-data"]
}
resources {
cpu = 200
memory = 256
network {
port "http" {
static = 8500
}
}
}
service {
name = "consul"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
port = "http"
interval = "10s"
timeout = "2s"
}
}
}
}
}

View File

@ -1,46 +0,0 @@
job "service-discovery-warden" {
datacenters = ["dc1"]
type = "service"
constraint {
attribute = "${node.unique.name}"
value = "warden"
}
group "discovery" {
count = 1
network {
port "http" {
static = 8500
}
}
task "discovery" {
driver = "exec"
config {
command = "consul"
args = ["agent", "-dev", "-client=0.0.0.0", "-data-dir=/tmp/discovery-data"]
}
resources {
cpu = 200
memory = 256
}
service {
name = "discovery-service"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
port = "http"
interval = "10s"
timeout = "2s"
}
}
}
}
}

View File

@ -1,52 +0,0 @@
job "simple-consul-test" {
datacenters = ["dc1"]
type = "service"
constraint {
attribute = "${node.unique.name}"
value = "warden"
}
group "consul" {
count = 1
network {
port "http" {
static = 8500
}
}
task "consul" {
driver = "exec"
config {
command = "consul"
args = [
"agent",
"-dev",
"-client=0.0.0.0",
"-bind=100.122.197.112",
"-data-dir=/tmp/consul-test-data"
]
}
resources {
cpu = 200
memory = 256
}
service {
name = "consul-test"
port = "http"
check {
type = "http"
path = "/v1/status/leader"
port = "http"
interval = "10s"
timeout = "2s"
}
}
}
}
}

View File

@ -1,40 +0,0 @@
job "test-nginx" {
datacenters = ["dc1"]
type = "service"
group "web" {
count = 1
network {
port "http" {
static = 8080
}
}
task "nginx" {
driver = "podman"
config {
image = "nginx:alpine"
ports = ["http"]
}
resources {
cpu = 100
memory = 128
}
service {
name = "nginx-test"
port = "http"
check {
type = "http"
path = "/"
interval = "10s"
timeout = "3s"
}
}
}
}
}

View File

@ -1,24 +0,0 @@
job "test-podman" {
datacenters = ["dc1"]
type = "batch"
group "test" {
count = 1
task "hello" {
driver = "podman"
config {
image = "docker.io/library/hello-world:latest"
logging = {
driver = "journald"
}
}
resources {
cpu = 100
memory = 128
}
}
}
}

View File

@ -1,23 +0,0 @@
job "test-podman-simple" {
datacenters = ["dc1"]
type = "batch"
group "test" {
count = 1
task "hello" {
driver = "podman"
config {
image = "alpine:latest"
command = "echo"
args = ["Hello from Podman!"]
}
resources {
cpu = 100
memory = 64
}
}
}
}
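A typical one-off run for the batch job above (the file name here is hypothetical; the job name matches the spec):
nomad job run test-podman-simple.nomad
nomad job status test-podman-simple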

View File

@ -1,31 +0,0 @@
job "test-private-registry" {
datacenters = ["dc1"]
type = "batch"
group "test" {
count = 1
# Constrain to Beijing nodes
constraint {
attribute = "${node.unique.name}"
operator = "regexp"
value = "bj-.*"
}
task "hello" {
driver = "podman"
config {
image = "hello-world:latest"
logging = {
driver = "journald"
}
}
resources {
cpu = 100
memory = 64
}
}
}
}

View File

@ -1,27 +0,0 @@
job "test-simple" {
datacenters = ["dc1"]
type = "service"
constraint {
attribute = "${node.unique.name}"
value = "warden"
}
group "test" {
count = 1
task "hello" {
driver = "exec"
config {
command = "echo"
args = ["Hello from warden node!"]
}
resources {
cpu = 100
memory = 64
}
}
}
}

View File

@ -1,26 +0,0 @@
#!/bin/bash
echo "🔧 使用 HashiCorp 官方脚本修复 master 节点二进制文件..."
# 停止 nomad 服务
echo '3131' | sudo -S systemctl stop nomad || true
echo '3131' | sudo -S pkill -9 -f nomad || true
# Remove the old binary
echo '3131' | sudo -S rm -f /usr/local/bin/nomad /usr/bin/nomad
# Use the official HashiCorp apt repository (architecture detected automatically)
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
echo '3131' | sudo -S apt-add-repository "deb [arch=$(dpkg --print-architecture)] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
echo '3131' | sudo -S apt-get update
echo '3131' | sudo -S apt-get install -y nomad=1.10.5-1
# Verify the installation
nomad version
# Restart the service
echo '3131' | sudo -S systemctl daemon-reload
echo '3131' | sudo -S systemctl enable nomad
echo '3131' | sudo -S systemctl start nomad
echo "✅ Master 节点二进制文件修复完成!"

View File

@ -1,124 +0,0 @@
#!/bin/bash
# 🔍 Nomad cluster quick diagnosis script
echo "🔍 Nomad cluster quick diagnosis"
echo "===================="
echo ""
# Node definitions
declare -A NODES=(
["semaphore"]="local"
["master"]="100.117.106.136:60022"
["ash3c"]="100.116.80.94:22"
)
declare -A TAILSCALE_IPS=(
["semaphore"]="100.116.158.95"
["master"]="100.117.106.136"
["ash3c"]="100.116.80.94"
)
echo "📊 1. 本地 Nomad 服务状态"
echo "------------------------"
systemctl status nomad --no-pager | head -10 || echo "❌ 本地 Nomad 服务异常"
echo ""
echo "📊 2. 集群成员状态"
echo "----------------"
nomad server members 2>/dev/null || echo "❌ 无法获取集群成员状态"
echo ""
echo "📊 3. 节点状态"
echo "------------"
nomad node status 2>/dev/null || echo "❌ 无法获取节点状态"
echo ""
echo "🌐 4. 网络连通性测试"
echo "------------------"
for node in "${!NODES[@]}"; do
ip="${TAILSCALE_IPS[$node]}"
echo "测试 $node ($ip):"
if [[ "$node" == "semaphore" ]]; then
echo " ✅ 本地节点"
else
# Ping test
if ping -c 1 -W 3 "$ip" >/dev/null 2>&1; then
echo " ✅ Ping: OK"
else
echo " ❌ Ping: failed"
fi
# Port tests
if timeout 5 bash -c "</dev/tcp/$ip/4647" 2>/dev/null; then
echo " ✅ RPC端口(4647): 开放"
else
echo " ❌ RPC端口(4647): 关闭"
fi
if timeout 5 bash -c "</dev/tcp/$ip/4646" 2>/dev/null; then
echo " ✅ HTTP端口(4646): 开放"
else
echo " ❌ HTTP端口(4646): 关闭"
fi
fi
echo ""
done
echo "🔧 5. 远程节点服务状态"
echo "-------------------"
for node in "${!NODES[@]}"; do
if [[ "$node" == "semaphore" ]]; then
continue
fi
connection="${NODES[$node]}"
ip=$(echo "$connection" | cut -d: -f1)
port=$(echo "$connection" | cut -d: -f2)
echo "检查 $node ($ip:$port):"
if ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S systemctl is-active nomad" 2>/dev/null; then
status=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S systemctl is-active nomad" 2>/dev/null)
echo " 服务状态: $status"
# 检查配置文件中的 bind_addr
bind_addr=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S grep 'bind_addr' /etc/nomad.d/nomad.hcl 2>/dev/null" | head -1)
echo " 配置绑定地址: $bind_addr"
# 检查实际监听端口
listening=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S netstat -tlnp | grep :464" 2>/dev/null | head -3)
if [[ -n "$listening" ]]; then
echo " 监听端口:"
echo "$listening" | sed 's/^/ /'
else
echo " ❌ 未发现 Nomad 监听端口"
fi
else
echo " ❌ 无法连接或服务未运行"
fi
echo ""
done
echo "📋 6. 问题总结和建议"
echo "=================="
# Check whether the cluster has a leader
if nomad server members 2>/dev/null | grep -q "leader"; then
echo "✅ The cluster has a leader"
else
echo "❌ The cluster has no leader - this is the main problem!"
echo ""
echo "🔧 Suggested fix steps:"
echo "1. First try the ash3c IP fix: ./scripts/utilities/fix-ash3c-ip.sh"
echo "2. If that still fails, use the nuclear reset: ./scripts/utilities/nuclear-reset.sh"
echo "3. Check whether the master node needs a restart"
fi
echo ""
echo "🔗 Useful links:"
echo " Web UI: http://100.116.158.95:4646"
echo " View logs: journalctl -u nomad -f"
echo ""
echo "🔍 Diagnosis complete!"

View File

@ -1,76 +0,0 @@
#!/bin/bash
# ☢️ Nuclear-grade Nomad reset execution script ☢️
set -e
echo "☢️☢️☢️ 核弹级 Nomad 集群重置 ☢️☢️☢️"
echo ""
echo "这个脚本将:"
echo "1. 完全摧毁所有 Nomad 进程和数据"
echo "2. 重新下载并安装 Nomad 二进制文件"
echo "3. 创建全新的配置文件"
echo "4. 重新启动整个集群"
echo ""
echo "⚠️ 警告:这是不可逆的操作!⚠️"
echo ""
# Make sure we are in the right directory
if [[ ! -f "scripts/utilities/NUCLEAR-NOMAD-RESET.yml" ]]; then
echo "❌ 错误:请在 /root/mgmt 目录下运行此脚本"
exit 1
fi
# Confirm the operation
read -p "Are you sure you want to perform a nuclear reset? Type 'NUCLEAR' to confirm: " confirm
if [[ "$confirm" != "NUCLEAR" ]]; then
echo "❌ Operation cancelled"
exit 1
fi
echo ""
echo "🚀 开始核弹级重置..."
echo ""
# 设置 Ansible 配置
export ANSIBLE_HOST_KEY_CHECKING=False
export ANSIBLE_STDOUT_CALLBACK=yaml
# Run the nuclear reset
echo "📡 Running Ansible playbook..."
cd /root/mgmt/configuration
ansible-playbook \
-i inventories/production/nomad-cluster.ini \
../scripts/utilities/NUCLEAR-NOMAD-RESET.yml \
--extra-vars "ansible_ssh_common_args='-o StrictHostKeyChecking=no'" \
-v
echo ""
echo "⏰ 等待集群稳定..."
sleep 30
echo ""
echo "🔍 检查集群状态..."
# 检查集群成员
echo "📊 集群成员状态:"
nomad server members || echo "❌ 无法获取集群成员状态"
echo ""
echo "📊 节点状态:"
nomad node status || echo "❌ 无法获取节点状态"
echo ""
echo "🎯 如果上面显示错误,请等待几分钟后再次检查"
echo "集群可能需要一些时间来完全启动和同步"
echo ""
echo "🔧 有用的命令:"
echo " 检查集群成员: nomad server members"
echo " 检查节点状态: nomad node status"
echo " 查看日志: journalctl -u nomad -f"
echo " Web UI: http://100.116.158.95:4646"
echo ""
echo "☢️ 核弹级重置完成!☢️"

View File

@ -1,113 +0,0 @@
---
- name: Ultimate Nomad Cluster Fix - Complete Reset
hosts: nomad_cluster
become: yes
gather_facts: yes
vars:
nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
tasks:
- name: Stop and disable nomad service completely
systemd:
name: nomad
state: stopped
enabled: no
daemon_reload: yes
ignore_errors: yes
- name: Kill any remaining nomad processes
shell: pkill -f nomad || true
ignore_errors: yes
- name: Remove all nomad data and state
file:
path: "{{ item }}"
state: absent
loop:
- /opt/nomad/data
- /etc/nomad.d/nomad.hcl
- /var/log/nomad
- name: Create clean nomad directories
file:
path: "{{ item }}"
state: directory
owner: nomad
group: nomad
mode: '0755'
loop:
- /etc/nomad.d
- /opt/nomad
- /opt/nomad/data
- /opt/nomad/alloc_mounts
- /var/log/nomad
- name: Create minimal nomad configuration
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "{{ ansible_default_ipv4.address }}"
server {
enabled = true
bootstrap_expect = 1
encrypt = "{{ nomad_encrypt_key }}"
}
client {
enabled = true
alloc_dir = "/opt/nomad/alloc_mounts"
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ ansible_default_ipv4.address }}"
serf = "{{ ansible_default_ipv4.address }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Enable and start nomad service
systemd:
name: nomad
state: started
enabled: yes
daemon_reload: yes
- name: Wait for nomad to start
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 10
timeout: 60
- name: Check nomad status
uri:
url: "http://{{ ansible_default_ipv4.address }}:4646/v1/status/leader"
method: GET
register: nomad_leader
retries: 5
delay: 5
ignore_errors: yes
- name: Display nomad status
debug:
msg: "Nomad leader status: {{ nomad_leader.json if nomad_leader.json is defined else 'No leader elected yet' }}"

View File

@ -1,31 +0,0 @@
#!/bin/bash
echo "=== Nomad Podman Migration Verification ==="
echo
# Check Nomad service status
echo "1. Checking Nomad service status..."
ssh ben@100.84.197.26 "sudo systemctl status nomad --no-pager -l"
echo
# Check Nomad configuration
echo "2. Checking Nomad configuration..."
ssh ben@100.84.197.26 "sudo cat /etc/nomad.d/nomad.hcl | grep -A 10 -B 2 podman"
echo
# Check Podman socket
echo "3. Checking Podman socket..."
ssh ben@100.84.197.26 "ls -la /run/user/*/podman/podman.sock 2>/dev/null || echo 'Podman socket not found'"
echo
# Check Nomad node status
echo "4. Checking Nomad node status..."
ssh ben@100.84.197.26 "sudo -u nomad /usr/local/bin/nomad node status -self | grep -A 10 'Driver Status'" 2>/dev/null || echo "Could not get node status"
echo
# Test Podman functionality
echo "5. Testing Podman as nomad user..."
ssh ben@100.84.197.26 "sudo -u nomad podman version --format '{{.Version}}'" 2>/dev/null || echo "Podman test failed"
echo
echo "=== Verification Complete ==="