This commit is contained in:
2025-10-09 01:22:22 +00:00
parent 1c994f9f60
commit eab95c8c80
136 changed files with 11001 additions and 849 deletions

View File

@@ -0,0 +1,62 @@
---
# Registers the directory at `mount_point` as the Nomad host volume "fnsync"
# on every Nomad client and records the NFS source in the node metadata.
#
# This play does not mount the NFS share; it only edits
# /etc/nomad.d/nomad.hcl and bounces the Nomad agent.
- name: Configure Nomad Dynamic Host Volumes for NFS
  hosts: nomad_clients
  become: true
  vars:
    nfs_server: "snail"
    nfs_share: "/fs/1000/nfs/Fnsync"
    mount_point: "/mnt/fnsync"
  tasks:
    - name: Stop Nomad service
      systemd:
        name: nomad
        state: stopped

    # The managed block is inserted directly after the opening line of the
    # existing `client` stanza, so it must contain only the stanza's inner
    # content. Wrapping it in another `client { ... }` would nest a client
    # stanza inside the client stanza.
    # NOTE(review): if no line matches `insertafter`, blockinfile appends the
    # block at end of file, i.e. outside any client stanza - confirm every
    # targeted host already has a top-level `client {` line.
    # NOTE(review): if the existing client stanza already contains a `meta`
    # block, this adds a second one - confirm Nomad merges them as intended.
    - name: Update Nomad configuration for dynamic host volumes
      blockinfile:
        path: /etc/nomad.d/nomad.hcl
        marker: "# {mark} DYNAMIC HOST VOLUMES CONFIGURATION"
        insertafter: '^client\s*\{'
        block: |
          # 启用动态host volumes
          host_volume "fnsync" {
            path = "{{ mount_point }}"
            read_only = false
          }
          # 添加NFS相关的节点元数据
          meta {
            nfs_server = "{{ nfs_server }}"
            nfs_share = "{{ nfs_share }}"
            nfs_mounted = "true"
          }

    - name: Start Nomad service
      systemd:
        name: nomad
        state: started
        enabled: true

    # Give the agent 10 seconds before probing, then wait up to 60 seconds
    # for the HTTP API port to accept connections.
    - name: Wait for Nomad to start
      wait_for:
        port: 4646
        delay: 10
        timeout: 60

    # Informational only: a failure here is tolerated and the command never
    # changes anything on the host.
    - name: Check Nomad status
      command: nomad node status
      register: nomad_status
      changed_when: false
      ignore_errors: true

    - name: Display Nomad status
      debug:
        var: nomad_status.stdout_lines

View File

@@ -0,0 +1,41 @@
---
# Renders nomad-server.hcl.j2 to /etc/nomad.d/nomad.hcl on every Nomad
# server, restarts the agent, waits for its HTTP port and prints the
# resulting systemd state.
- name: 部署Nomad服务器配置模板
  hosts: nomad_servers
  become: true
  tasks:
    # `backup` keeps a timestamped copy of the previous file next to it.
    - name: 部署Nomad配置文件
      template:
        dest: /etc/nomad.d/nomad.hcl
        src: nomad-server.hcl.j2
        mode: '0644'
        owner: root
        group: root
        backup: true

    # The restart is unconditional: it runs even if the rendered file did
    # not change.
    - name: 重启Nomad服务
      systemd:
        name: nomad
        enabled: true
        state: restarted

    - name: 等待Nomad服务启动
      wait_for:
        host: "{{ ansible_host }}"
        port: 4646
        timeout: 30

    # Calling the module with only `name` changes nothing; it is used here
    # to read the unit's current status into `nomad_status`.
    - name: 显示Nomad服务状态
      systemd:
        name: nomad
      register: nomad_status

    - name: 显示服务状态
      debug:
        msg: "{{ inventory_hostname }} Nomad服务状态: {{ nomad_status.status.ActiveState }}"

View File

@@ -0,0 +1,39 @@
---
# Emergency fix: forces `bootstrap_expect = 3` in the Nomad server
# configuration, restarts the agent and reports its systemd state.
- name: 紧急修复Nomad bootstrap_expect配置
  hosts: nomad_servers
  become: true
  tasks:
    # `backrefs: true` serves two purposes:
    #   * the existing indentation (group 1) and any trailing text such as a
    #     comment (group 2) are preserved, so the match no longer depends on
    #     an exact number of leading spaces;
    #   * when nothing matches, the file is left untouched instead of having
    #     a stray `bootstrap_expect` line appended at end of file, outside
    #     the `server` stanza.
    - name: 修复bootstrap_expect为3
      lineinfile:
        path: /etc/nomad.d/nomad.hcl
        regexp: '^(\s*)bootstrap_expect\s*=\s*\d+(.*)$'
        line: '\1bootstrap_expect = 3\2'
        backrefs: true
        backup: true

    - name: 重启Nomad服务
      systemd:
        name: nomad
        state: restarted
        enabled: true

    - name: 等待Nomad服务启动
      wait_for:
        port: 4646
        host: "{{ ansible_host }}"
        timeout: 30

    # Called with only `name`, the module just reports the unit's status.
    - name: 检查Nomad服务状态
      systemd:
        name: nomad
      register: nomad_status

    - name: 显示Nomad服务状态
      debug:
        msg: "{{ inventory_hostname }} Nomad服务状态: {{ nomad_status.status.ActiveState }}"

View File

@@ -0,0 +1,103 @@
---
# Converts the ch4 node from a Nomad server into a Nomad client:
#   1. back up /etc/nomad.d/nomad.hcl,
#   2. switch the existing `server` stanza to `enabled = false`,
#   3. write one Ansible-managed `client` stanza,
#   4. restart Nomad and verify that a node reports "ready".
- name: Fix ch4 Nomad configuration - convert from server to client
  hosts: ch4
  become: true
  vars:
    ansible_host: "100.117.106.136"
  tasks:
    - name: Backup current Nomad config
      copy:
        src: /etc/nomad.d/nomad.hcl
        dest: /etc/nomad.d/nomad.hcl.backup
        remote_src: true
        backup: true

    # blockinfile has no `replace` parameter, so the server stanza is edited
    # in place with the `replace` module instead. `[^}]*?` keeps the match
    # inside the first brace-delimited section after `server {`, so an
    # `enabled = true` belonging to another stanza is never touched.
    # NOTE(review): this only matches when `enabled` appears before any
    # nested block inside the server stanza - confirm against the real file.
    - name: Disable server mode in the existing server stanza
      replace:
        path: /etc/nomad.d/nomad.hcl
        regexp: '^(server\s*\{[^}]*?\benabled\s*=\s*)true\b'
        replace: '\1false'

    # A single managed block holds the client stanza. Writing the same
    # stanza a second time under a different marker would only duplicate it.
    # NOTE(review): if the file already has an unmanaged `client` stanza it
    # is left in place - confirm that is acceptable on ch4.
    - name: Update Nomad config to client mode
      blockinfile:
        path: /etc/nomad.d/nomad.hcl
        marker: "# {mark} ANSIBLE MANAGED CLIENT CONFIG"
        insertbefore: '^server \{'
        block: |
          client {
            enabled = true
            network_interface = "tailscale0"
            servers = [
              "semaphore.tailnet-68f9.ts.net:4647",
              "ash1d.tailnet-68f9.ts.net:4647",
              "ash2e.tailnet-68f9.ts.net:4647",
              "ch2.tailnet-68f9.ts.net:4647",
              "ch3.tailnet-68f9.ts.net:4647",
              "onecloud1.tailnet-68f9.ts.net:4647",
              "de.tailnet-68f9.ts.net:4647"
            ]
            meta {
              consul = "true"
              consul_version = "1.21.5"
              consul_server = "true"
            }
          }

    - name: Restart Nomad service
      systemd:
        name: nomad
        state: restarted
        enabled: true

    - name: Wait for Nomad to be ready
      wait_for:
        port: 4646
        host: "{{ ansible_default_ipv4.address }}"
        delay: 5
        timeout: 30

    # `until` makes the retry loop explicit, so the check is repeated up to
    # three times, 10 seconds apart, until grep finds "ready". The command
    # is read-only, hence `changed_when: false`.
    - name: Verify Nomad client status
      shell: |
        NOMAD_ADDR=http://localhost:4646 nomad node status | grep -q "ready"
      register: nomad_ready
      until: nomad_ready.rc == 0
      retries: 3
      delay: 10
      changed_when: false

    - name: Display completion message
      debug:
        msg: |
          ✅ Successfully converted ch4 from Nomad server to client
          ✅ Nomad service restarted
          ✅ Configuration updated

View File

@@ -0,0 +1,82 @@
---
# Renames the host `master` to `ch4` (system hostname, /etc/hostname,
# /etc/hosts and Tailscale) and moves sshd back to port 22.
#
# The rename tasks only run while the gathered hostname still equals
# `old_hostname`, so a second run skips them.
- name: Fix master node - rename to ch4 and restore SSH port 22
  hosts: master
  become: true
  vars:
    new_hostname: ch4
    old_hostname: master
  tasks:
    - name: Backup current hostname
      copy:
        content: "{{ old_hostname }}"
        dest: /etc/hostname.backup
        mode: '0644'
      when: ansible_hostname == old_hostname

    - name: Update hostname to ch4
      hostname:
        name: "{{ new_hostname }}"
      when: ansible_hostname == old_hostname

    - name: Update /etc/hostname file
      copy:
        content: "{{ new_hostname }}"
        dest: /etc/hostname
        mode: '0644'
      when: ansible_hostname == old_hostname

    - name: Update /etc/hosts file
      lineinfile:
        path: /etc/hosts
        regexp: '^127\.0\.1\.1.*{{ old_hostname }}'
        line: '127.0.1.1 {{ new_hostname }}'
        state: present
      when: ansible_hostname == old_hostname

    # `command` with `argv` avoids spawning a shell and passes the hostname
    # as a single argument without shell interpretation.
    - name: Update Tailscale hostname
      command:
        argv:
          - tailscale
          - set
          - "--hostname={{ new_hostname }}"
      when: ansible_hostname == old_hostname

    - name: Backup SSH config
      copy:
        src: /etc/ssh/sshd_config
        dest: /etc/ssh/sshd_config.backup
        remote_src: true
        backup: true

    # `validate` runs sshd's own syntax test on the candidate file before it
    # replaces sshd_config, so a broken config is never put in place ahead
    # of the restart below.
    - name: Restore SSH port to 22
      lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^Port '
        line: 'Port 22'
        state: present
        validate: /usr/sbin/sshd -t -f %s

    - name: Restart SSH service
      systemd:
        name: ssh
        state: restarted
        enabled: true

    - name: Wait for SSH to be ready on port 22
      wait_for:
        port: 22
        host: "{{ ansible_default_ipv4.address }}"
        delay: 5
        timeout: 30

    # Overrides the connection port for this one task so the ping is
    # attempted against port 22.
    - name: Test SSH connection on port 22
      ping:
      delegate_to: "{{ inventory_hostname }}"
      vars:
        ansible_port: 22

    - name: Display completion message
      debug:
        msg: |
          ✅ Successfully renamed {{ old_hostname }} to {{ new_hostname }}
          ✅ SSH port restored to 22
          ✅ Tailscale hostname updated
          🔄 Please update your inventory file to use the new hostname and port

View File

@@ -0,0 +1,71 @@
---
# Installs Consul from apt on every inventory host, prepares its data, log
# and config directories, renders consul-client.hcl.j2 and starts the
# service.
#
# NOTE(review): `hosts: all` presumably also covers the three machines
# listed in `consul_servers`; confirm that writing a client configuration to
# /etc/consul.d/consul.hcl on them is intended.
- name: Install and configure Consul clients on all nodes
  hosts: all
  become: true
  vars:
    consul_servers:
      - "100.117.106.136"  # ch4 (Korea)
      - "100.122.197.112"  # warden (Beijing)
      - "100.116.80.94"  # ash3c (USA)
  tasks:
    # The pipeline's exit status is that of `cut`, so it is 0 even when the
    # tailscale0 interface does not exist; the result is checked below.
    - name: Get Tailscale IP address
      shell: ip addr show tailscale0 | grep 'inet ' | awk '{print $2}' | cut -d/ -f1
      register: tailscale_ip_result
      changed_when: false

    # Use only the first reported address, so several `inet` lines cannot
    # produce a multi-line value.
    - name: Set Tailscale IP fact
      set_fact:
        tailscale_ip: "{{ tailscale_ip_result.stdout_lines | first | default('') }}"

    # Stop before anything is installed rather than rendering a config with
    # an empty bind address.
    - name: Fail early when no Tailscale IPv4 address was found
      assert:
        that:
          - tailscale_ip | length > 0
        fail_msg: "No IPv4 address found on tailscale0 for {{ inventory_hostname }}"

    - name: Install Consul
      apt:
        name: consul
        state: present
        update_cache: true

    - name: Create Consul data directory
      file:
        path: /opt/consul/data
        state: directory
        owner: consul
        group: consul
        mode: '0755'

    - name: Create Consul log directory
      file:
        path: /var/log/consul
        state: directory
        owner: consul
        group: consul
        mode: '0755'

    - name: Create Consul config directory
      file:
        path: /etc/consul.d
        state: directory
        owner: consul
        group: consul
        mode: '0755'

    # A change to the rendered file triggers the "restart consul" handler at
    # the end of the play.
    - name: Generate Consul client configuration
      template:
        src: consul-client.hcl.j2
        dest: /etc/consul.d/consul.hcl
        owner: consul
        group: consul
        mode: '0644'
      notify: restart consul

    - name: Enable and start Consul service
      systemd:
        name: consul
        enabled: true
        state: started
        daemon_reload: true

  handlers:
    - name: restart consul
      systemd:
        name: consul
        state: restarted

View File

@@ -0,0 +1,91 @@
---
# Downloads an NFS CSI driver into the Nomad plugin directory, installs the
# NFS client packages, adds a plugin stanza to /etc/nomad.d/nomad.hcl and
# bounces the Nomad agent.
#
# NOTE(review): confirm that the csi-driver-nfs release referenced by
# `csi_driver_url` really publishes an asset named `csi-nfs-driver`.
# NOTE(review): confirm that the Nomad agent accepts a `plugin` stanza with
# `type = "csi"`; CSI plugins are commonly deployed as Nomad jobs instead.
- name: Install NFS CSI Plugin for Nomad
  hosts: nomad_nodes
  become: true
  vars:
    nomad_user: nomad
    nomad_plugins_dir: /opt/nomad/plugins
    csi_driver_version: "v4.0.0"
    csi_driver_url: "https://github.com/kubernetes-csi/csi-driver-nfs/releases/download/{{ csi_driver_version }}/csi-nfs-driver"
  tasks:
    - name: Stop Nomad service
      systemd:
        state: stopped
        name: nomad

    - name: Create plugins directory
      file:
        state: directory
        path: "{{ nomad_plugins_dir }}"
        mode: '0755'
        owner: "{{ nomad_user }}"
        group: "{{ nomad_user }}"

    - name: Download NFS CSI driver
      get_url:
        url: "{{ csi_driver_url }}"
        dest: "{{ nomad_plugins_dir }}/csi-nfs-driver"
        mode: '0755'
        owner: "{{ nomad_user }}"
        group: "{{ nomad_user }}"

    - name: Install required packages for CSI
      package:
        state: present
        name:
          - nfs-common
          - mount

    - name: Create CSI mount directory
      file:
        state: directory
        path: /opt/nomad/csi
        mode: '0755'
        owner: "{{ nomad_user }}"
        group: "{{ nomad_user }}"

    # The managed block is placed after the `data_dir` line; if that line is
    # not found, blockinfile appends the block at end of file.
    # NOTE(review): a nomad.hcl that already sets `plugin_dir` will end up
    # with the key twice - verify on the targeted hosts.
    - name: Update Nomad configuration for CSI plugin
      blockinfile:
        path: /etc/nomad.d/nomad.hcl
        insertafter: 'data_dir = "/opt/nomad/data"'
        marker: "# {mark} CSI PLUGIN CONFIGURATION"
        block: |
          plugin_dir = "{{ nomad_plugins_dir }}"
          plugin "csi-nfs" {
            type = "csi"
            config {
              driver_name = "nfs.csi.k8s.io"
              mount_dir = "/opt/nomad/csi"
              health_timeout = "30s"
              log_level = "INFO"
            }
          }

    - name: Start Nomad service
      systemd:
        enabled: true
        state: started
        name: nomad

    # Pause 10 seconds, then wait up to 60 seconds for the HTTP API port.
    - name: Wait for Nomad to start
      wait_for:
        port: 4646
        timeout: 60
        delay: 10

    # Best-effort status report: a failing command does not fail the play.
    - name: Check Nomad status
      command: nomad node status
      ignore_errors: true
      register: nomad_status

    - name: Display Nomad status
      debug:
        var: nomad_status.stdout_lines

View File

@@ -0,0 +1,33 @@
---
# Makes sure the Nomad agent is running on every server, waits for its HTTP
# port and reports the systemd state as it is after the start attempt.
- name: 启动所有Nomad服务器形成集群
  hosts: nomad_servers
  become: true
  tasks:
    # Called with only `name`, the module just reports the unit's status.
    - name: 检查Nomad服务状态
      systemd:
        name: nomad
      register: nomad_status

    - name: 启动Nomad服务如果未运行
      systemd:
        name: nomad
        state: started
        enabled: true
      when: nomad_status.status.ActiveState != "active"

    - name: 等待Nomad服务启动
      wait_for:
        port: 4646
        host: "{{ ansible_host }}"
        timeout: 30

    # `nomad_status` was captured before the start task ran, so it would
    # still show the old state (for example "inactive"). Read the unit
    # again so the message below reflects the current state.
    - name: 重新检查Nomad服务状态
      systemd:
        name: nomad
      register: nomad_status_after

    - name: 显示Nomad服务状态
      debug:
        msg: "{{ inventory_hostname }} Nomad服务状态: {{ nomad_status_after.status.ActiveState }}"

View File

@@ -0,0 +1,61 @@
{# Jinja2 template for /etc/consul.d/consul.hcl on Consul client nodes.
   Variables read: inventory_hostname, tailscale_ip (host fact),
   consul_servers (list of addresses), group_names. #}
# Consul Client Configuration for {{ inventory_hostname }}
datacenter = "dc1"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "{{ inventory_hostname }}"
bind_addr = "{{ hostvars[inventory_hostname]['tailscale_ip'] }}"
# Client mode (not server)
server = false
# Connect to Consul servers (the three-node cluster)
retry_join = [
{% for server in consul_servers %}
  "{{ server }}"{% if not loop.last %},{% endif %}

{% endfor %}
]
# Performance optimization
performance {
  raft_multiplier = 5
}
# Ports configuration
ports {
  grpc = 8502
  http = 8500
  dns = 8600
}
# Enable Connect for service mesh
connect {
  enabled = true
}
# Cache configuration for performance
cache {
  entry_fetch_max_burst = 42
  entry_fetch_rate = 30
}
# Node metadata
{# group_names is a list of inventory group names, so the test must name the
   group exactly; a bare 'server' never equals a group such as
   'nomad_servers' and would always yield "nomad-client". #}
node_meta = {
  region = "unknown"
  zone = "nomad-{{ 'server' if 'nomad_servers' in group_names else 'client' }}"
}
# UI disabled for clients
ui_config {
  enabled = false
}
# ACL configuration (if needed)
acl = {
  enabled = false
  default_policy = "allow"
}
# Logging
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7

View File

@@ -0,0 +1,106 @@
{# Jinja2 template for /etc/nomad.d/nomad.hcl on Nomad servers.
   Variables read: ansible_hostname, ansible_host.
   The host named hcp1 additionally gets a client stanza and the Podman
   driver plugin configuration. #}
datacenter = "dc1"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "{{ ansible_hostname }}"
bind_addr = "0.0.0.0"
addresses {
  http = "{{ ansible_host }}"
  rpc = "{{ ansible_host }}"
  serf = "{{ ansible_host }}"
}
advertise {
  http = "{{ ansible_host }}:4646"
  rpc = "{{ ansible_host }}:4647"
  serf = "{{ ansible_host }}:4648"
}
ports {
  http = 4646
  rpc = 4647
  serf = 4648
}
# NOTE(review): retry_join lists eight servers while bootstrap_expect is 3;
# confirm the intended server count.
server {
  enabled = true
  bootstrap_expect = 3
  server_join {
    retry_join = [
      "semaphore.tailnet-68f9.ts.net:4648",
      "ash1d.tailnet-68f9.ts.net:4648",
      "ash2e.tailnet-68f9.ts.net:4648",
      "ch2.tailnet-68f9.ts.net:4648",
      "ch3.tailnet-68f9.ts.net:4648",
      "onecloud1.tailnet-68f9.ts.net:4648",
      "de.tailnet-68f9.ts.net:4648",
      "hcp1.tailnet-68f9.ts.net:4648"
    ]
  }
}
{% if ansible_hostname == 'hcp1' %}
client {
  enabled = true
  network_interface = "tailscale0"
  servers = [
    "semaphore.tailnet-68f9.ts.net:4647",
    "ash1d.tailnet-68f9.ts.net:4647",
    "ash2e.tailnet-68f9.ts.net:4647",
    "ch2.tailnet-68f9.ts.net:4647",
    "ch3.tailnet-68f9.ts.net:4647",
    "onecloud1.tailnet-68f9.ts.net:4647",
    "de.tailnet-68f9.ts.net:4647",
    "hcp1.tailnet-68f9.ts.net:4647"
  ]
  host_volume "traefik-certs" {
    path = "/opt/traefik/certs"
    read_only = false
  }
  host_volume "fnsync" {
    path = "/mnt/fnsync"
    read_only = false
  }
  meta {
    consul = "true"
    consul_version = "1.21.5"
    consul_client = "true"
  }
  gc_interval = "5m"
  gc_disk_usage_threshold = 80
  gc_inode_usage_threshold = 70
}
plugin "nomad-driver-podman" {
  config {
    socket_path = "unix:///run/podman/podman.sock"
    volumes {
      enabled = true
    }
  }
}
{% endif %}
# `address` takes a single host:port, not a comma-separated list, so Nomad
# talks to the Consul agent on this machine.
# NOTE(review): assumes a local Consul agent serves HTTP on 127.0.0.1:8500
# on every Nomad server - confirm before rollout.
consul {
  address = "127.0.0.1:8500"
  server_service_name = "nomad"
  client_service_name = "nomad-client"
  auto_advertise = true
  server_auto_join = false
  client_auto_join = true
}
telemetry {
  collection_interval = "1s"
  disable_hostname = false
  prometheus_metrics = true
  publish_allocation_metrics = true
  publish_node_metrics = true
}

View File

@@ -19,7 +19,7 @@
- ip: "100.120.225.29"
hostnames: ["de"]
- ip: "100.117.106.136"
hostnames: ["master"]
hostnames: ["ch4"]
- ip: "100.116.80.94"
hostnames: ["ash3c", "influxdb1"]
- ip: "100.116.112.45"

View File

@@ -0,0 +1,56 @@
---
# Adds hcp1 as a peer on every Nomad server: prepends its address to the
# `retry_join` list, sets `bootstrap_expect`, restarts the agent and reports
# its systemd state.
- name: 更新Nomad服务器配置添加hcp1作为peer
  hosts: nomad_servers
  become: true
  vars:
    hcp1_ip: "100.97.62.111"
    # NOTE(review): other configuration in this commit uses
    # bootstrap_expect = 3; confirm that 8 is really intended here.
    bootstrap_expect: 8
  tasks:
    - name: 备份原配置文件
      copy:
        src: /etc/nomad.d/nomad.hcl
        dest: /etc/nomad.d/nomad.hcl.bak
        remote_src: true
        backup: true

    # With `backrefs: true` the rest of the matched line (group 2) is kept,
    # so a list written on a single line is not truncated, and a file
    # without a `retry_join` line is left untouched instead of receiving a
    # dangling line at end of file. The negative lookahead skips lines that
    # already start with the hcp1 address, which keeps the task idempotent.
    - name: 添加hcp1到retry_join列表
      lineinfile:
        path: /etc/nomad.d/nomad.hcl
        regexp: '^(\s*retry_join\s*=\s*\[)(?!\s*"{{ hcp1_ip | regex_escape }}")(.*)$'
        line: '\1"{{ hcp1_ip }}",\2'
        backrefs: true
        backup: true

    # Same approach: keep the existing indentation (group 1) and trailing
    # text (group 2), and never append when the setting is missing.
    - name: 更新bootstrap_expect为8
      lineinfile:
        path: /etc/nomad.d/nomad.hcl
        regexp: '^(\s*)bootstrap_expect\s*=\s*\d+(.*)$'
        line: '\1bootstrap_expect = {{ bootstrap_expect }}\2'
        backrefs: true
        backup: true

    - name: 重启Nomad服务
      systemd:
        name: nomad
        state: restarted
        enabled: true

    - name: 等待Nomad服务启动
      wait_for:
        port: 4646
        host: "{{ ansible_host }}"
        timeout: 30

    # Called with only `name`, the module just reports the unit's status.
    - name: 检查Nomad服务状态
      systemd:
        name: nomad
      register: nomad_status

    - name: 显示Nomad服务状态
      debug:
        msg: "Nomad服务状态: {{ nomad_status.status.ActiveState }}"