2025-10-09 01:22:22 +00:00
parent 1c994f9f60
commit eab95c8c80
136 changed files with 11001 additions and 849 deletions

View File

@@ -0,0 +1,57 @@
---
- name: Clean up Consul configuration from dedicated clients
hosts: hcp1,influxdb1,browser
become: yes
tasks:
- name: Stop Consul service
systemd:
name: consul
state: stopped
enabled: no
- name: Disable Consul service
systemd:
name: consul
enabled: no
- name: Kill any remaining Consul processes
shell: |
pkill -f consul || true
sleep 2
pkill -9 -f consul || true
ignore_errors: yes
- name: Remove Consul systemd service file
file:
path: /etc/systemd/system/consul.service
state: absent
- name: Remove Consul configuration directory
file:
path: /etc/consul.d
state: absent
- name: Remove Consul data directory
file:
path: /opt/consul
state: absent
- name: Reload systemd daemon
systemd:
daemon_reload: yes
- name: Verify Consul is stopped
shell: |
if pgrep -f consul; then
echo "Consul still running"
exit 1
else
echo "Consul stopped successfully"
fi
register: consul_status
failed_when: consul_status.rc != 0
- name: Display cleanup status
debug:
msg: "Consul cleanup completed on {{ inventory_hostname }}"

View File

@@ -0,0 +1,55 @@
---
- name: Configure Consul Auto-Discovery
hosts: all
become: yes
vars:
consul_servers:
- "warden.tailnet-68f9.ts.net:8301"
- "ch4.tailnet-68f9.ts.net:8301"
- "ash3c.tailnet-68f9.ts.net:8301"
tasks:
- name: Backup current nomad.hcl
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.backup.{{ ansible_date_time.epoch }}
remote_src: yes
backup: yes
- name: Update Consul configuration for auto-discovery
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} ANSIBLE MANAGED CONSUL CONFIG"
block: |
consul {
retry_join = [
"warden.tailnet-68f9.ts.net:8301",
"ch4.tailnet-68f9.ts.net:8301",
"ash3c.tailnet-68f9.ts.net:8301"
]
server_service_name = "nomad"
client_service_name = "nomad-client"
}
insertbefore: '^consul \{'
- name: Restart Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30
- name: Verify Consul connection
shell: |
NOMAD_ADDR=http://localhost:4646 nomad node status | grep -q "ready"
register: nomad_ready
until: nomad_ready.rc == 0
retries: 3
delay: 10
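Because each run stamps its backup with the epoch, rolling back a bad change is straightforward. A recovery sketch (not part of the play; the backup glob and the follow-up restart are assumptions) that restores the newest copy:

- name: Restore the most recent nomad.hcl backup
  shell: |
    latest=$(ls -t /etc/nomad.d/nomad.hcl.backup.* 2>/dev/null | head -n1)
    if [ -n "$latest" ]; then cp "$latest" /etc/nomad.d/nomad.hcl; fi

- name: Restart Nomad after the roll-back
  systemd:
    name: nomad
    state: restarted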

View File

@@ -0,0 +1,75 @@
---
- name: Remove Consul configuration from Nomad servers
hosts: semaphore,ash1d,ash2e,ch2,ch3,onecloud1,de
become: yes
tasks:
- name: Remove entire Consul configuration block
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} ANSIBLE MANAGED CONSUL CONFIG"
state: absent
# A single multiline replace removes the whole consul block without stripping the closing braces of other stanzas
- name: Remove any remaining consul block (including its closing brace)
replace:
path: /etc/nomad.d/nomad.hcl
regexp: '(?ms)^consul \{.*?^\}\n?'
- name: Restart Nomad service
systemd:
name: nomad
state: restarted
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30
- name: Display completion message
debug:
msg: "Removed Consul configuration from {{ inventory_hostname }}"

View File

@@ -0,0 +1,32 @@
---
- name: Enable Nomad Client Mode on Servers
hosts: ch2,ch3,de
become: yes
tasks:
- name: Enable Nomad client mode
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^client \{'
line: 'client {'
state: present
- name: Enable client mode
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ enabled = false'
line: ' enabled = true'
state: present
- name: Restart Nomad service
systemd:
name: nomad
state: restarted
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30

View File

@@ -0,0 +1,62 @@
---
- name: Fix all master references to ch4
hosts: localhost
gather_facts: no
vars:
files_to_fix:
- "scripts/diagnose-consul-sync.sh"
- "scripts/register-traefik-to-all-consul.sh"
- "deployment/ansible/playbooks/update-nomad-consul-config.yml"
- "deployment/ansible/templates/nomad-server.hcl.j2"
- "deployment/ansible/templates/nomad-client.hcl"
- "deployment/ansible/playbooks/fix-nomad-consul-roles.yml"
- "deployment/ansible/onecloud1_nomad.hcl"
- "ansible/templates/consul-client.hcl.j2"
- "ansible/consul-client-deployment.yml"
- "ansible/consul-client-simple.yml"
tasks:
- name: Replace master.tailnet-68f9.ts.net with ch4.tailnet-68f9.ts.net
replace:
path: "{{ item }}"
regexp: 'master\.tailnet-68f9\.ts\.net'
replace: 'ch4.tailnet-68f9.ts.net'
loop: "{{ files_to_fix }}"
when: item is file
- name: Replace master hostname references
replace:
path: "{{ item }}"
regexp: '\bmaster\b'
replace: 'ch4'
loop: "{{ files_to_fix }}"
when: item is file
- name: Replace master IP references in comments
replace:
path: "{{ item }}"
regexp: '# master'
replace: '# ch4'
loop: "{{ files_to_fix }}"
when: item is file
- name: Fix inventory files
replace:
path: "{{ item }}"
regexp: 'master ansible_host=master'
replace: 'ch4 ansible_host=ch4'
loop:
- "deployment/ansible/inventories/production/inventory.ini"
- "deployment/ansible/inventories/production/csol-consul-nodes.ini"
- "deployment/ansible/inventories/production/nomad-clients.ini"
- "deployment/ansible/inventories/production/master-ash3c.ini"
- "deployment/ansible/inventories/production/consul-nodes.ini"
- "deployment/ansible/inventories/production/vault.ini"
- name: Fix IP address references (100.117.106.136 comments)
replace:
path: "{{ item }}"
regexp: '100\.117\.106\.136.*# master'
replace: '100.117.106.136 # ch4'
loop: "{{ files_to_fix }}"
when: item is file

View File

@@ -72,7 +72,7 @@
"description": "Consul客户端节点用于服务发现和健康检查",
"nodes": [
{
"name": "master",
"name": "ch4",
"host": "100.117.106.136",
"user": "ben",
"password": "3131",

View File

@@ -2,21 +2,21 @@
# Server nodes (7 server nodes)
# ⚠️ Warning: greater power means greater responsibility; operate on server nodes with extreme caution!
# ⚠️ Any operation on a server node can affect the stability of the entire cluster!
semaphore ansible_host=semaphore.tailnet-68f9.ts.net ansible_user=root ansible_password=313131 ansible_become_password=313131
semaphore ansible_host=semaphore.tailnet-68f9.ts.net ansible_user=root ansible_password=3131 ansible_become_password=3131
ash1d ansible_host=ash1d.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
ash2e ansible_host=ash2e.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
ch2 ansible_host=ch2.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
ch3 ansible_host=ch3.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
onecloud1 ansible_host=onecloud1.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
de ansible_host=de.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
hcp1 ansible_host=hcp1.tailnet-68f9.ts.net ansible_user=root ansible_password=3131 ansible_become_password=3131
[nomad_clients]
# Client nodes
master ansible_host=master.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131 ansible_port=60022
# Client nodes (5 client nodes)
ch4 ansible_host=ch4.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
ash3c ansible_host=ash3c.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
browser ansible_host=browser.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
influxdb1 ansible_host=influxdb1.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
hcp1 ansible_host=hcp1.tailnet-68f9.ts.net ansible_user=root ansible_password=3131 ansible_become_password=3131
warden ansible_host=warden.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
[nomad_nodes:children]

View File

@@ -11,7 +11,7 @@ ash1d ansible_host=ash1d ansible_user=ben ansible_become=yes ansible_become_pass
ash2e ansible_host=ash2e ansible_user=ben ansible_become=yes ansible_become_pass=3131
[oci_a1]
master ansible_host=master ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131
ch4 ansible_host=ch4 ansible_user=ben ansible_become=yes ansible_become_pass=3131
ash3c ansible_host=ash3c ansible_user=ben ansible_become=yes ansible_become_pass=3131

View File

@@ -0,0 +1,62 @@
---
- name: Configure Nomad Dynamic Host Volumes for NFS
hosts: nomad_clients
become: yes
vars:
nfs_server: "snail"
nfs_share: "/fs/1000/nfs/Fnsync"
mount_point: "/mnt/fnsync"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Update Nomad configuration for dynamic host volumes
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} DYNAMIC HOST VOLUMES CONFIGURATION"
block: |
client {
# Enable dynamic host volumes
host_volume "fnsync" {
path = "{{ mount_point }}"
read_only = false
}
# Add NFS-related node metadata
meta {
nfs_server = "{{ nfs_server }}"
nfs_share = "{{ nfs_share }}"
nfs_mounted = "true"
}
}
insertafter: 'client {'
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to start
wait_for:
port: 4646
delay: 10
timeout: 60
- name: Check Nomad status
command: nomad node status
register: nomad_status
ignore_errors: yes
- name: Display Nomad status
debug:
var: nomad_status.stdout_lines
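Note that the play records the NFS metadata and exposes the mount point as a host volume but never mounts the export itself. If that is meant to happen here, a mount step of roughly this shape (a sketch, assuming the ansible.posix collection is installed) would need to run before Nomad is started:

- name: Mount the Fnsync NFS export
  ansible.posix.mount:
    src: "{{ nfs_server }}:{{ nfs_share }}"
    path: "{{ mount_point }}"
    fstype: nfs
    opts: defaults
    state: mounted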

View File

@@ -0,0 +1,41 @@
---
- name: Deploy the Nomad server configuration template
hosts: nomad_servers
become: yes
tasks:
- name: Deploy the Nomad configuration file
template:
src: nomad-server.hcl.j2
dest: /etc/nomad.d/nomad.hcl
backup: yes
owner: root
group: root
mode: '0644'
- name: Restart the Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
- name: Wait for the Nomad service to start
wait_for:
port: 4646
host: "{{ ansible_host }}"
timeout: 30
- name: Get the Nomad service status
systemd:
name: nomad
register: nomad_status
- name: Display the service status
debug:
msg: "{{ inventory_hostname }} Nomad service status: {{ nomad_status.status.ActiveState }}"

View File

@@ -0,0 +1,39 @@
---
- name: Emergency fix for the Nomad bootstrap_expect setting
hosts: nomad_servers
become: yes
tasks:
- name: Set bootstrap_expect to 3
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ bootstrap_expect = \d+'
line: ' bootstrap_expect = 3'
backup: yes
- name: Restart the Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
- name: Wait for the Nomad service to start
wait_for:
port: 4646
host: "{{ ansible_host }}"
timeout: 30
- name: Check the Nomad service status
systemd:
name: nomad
register: nomad_status
- name: Display the Nomad service status
debug:
msg: "{{ inventory_hostname }} Nomad service status: {{ nomad_status.status.ActiveState }}"

View File

@@ -0,0 +1,103 @@
---
- name: Fix ch4 Nomad configuration - convert from server to client
hosts: ch4
become: yes
vars:
ansible_host: 100.117.106.136
tasks:
- name: Backup current Nomad config
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.backup
remote_src: yes
backup: yes
- name: Update Nomad config to client mode
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} ANSIBLE MANAGED CLIENT CONFIG"
block: |
server {
enabled = false
}
client {
enabled = true
network_interface = "tailscale0"
servers = [
"semaphore.tailnet-68f9.ts.net:4647",
"ash1d.tailnet-68f9.ts.net:4647",
"ash2e.tailnet-68f9.ts.net:4647",
"ch2.tailnet-68f9.ts.net:4647",
"ch3.tailnet-68f9.ts.net:4647",
"onecloud1.tailnet-68f9.ts.net:4647",
"de.tailnet-68f9.ts.net:4647"
]
meta {
consul = "true"
consul_version = "1.21.5"
consul_server = "true"
}
}
insertbefore: '^server \{'
- name: Update client block
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} ANSIBLE MANAGED CLIENT BLOCK"
block: |
client {
enabled = true
network_interface = "tailscale0"
servers = [
"semaphore.tailnet-68f9.ts.net:4647",
"ash1d.tailnet-68f9.ts.net:4647",
"ash2e.tailnet-68f9.ts.net:4647",
"ch2.tailnet-68f9.ts.net:4647",
"ch3.tailnet-68f9.ts.net:4647",
"onecloud1.tailnet-68f9.ts.net:4647",
"de.tailnet-68f9.ts.net:4647"
]
meta {
consul = "true"
consul_version = "1.21.5"
consul_server = "true"
}
}
insertbefore: '^client \{'
- name: Restart Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30
- name: Verify Nomad client status
shell: |
NOMAD_ADDR=http://localhost:4646 nomad node status | grep -q "ready"
register: nomad_ready
until: nomad_ready.rc == 0
retries: 3
delay: 10
- name: Display completion message
debug:
msg: |
✅ Successfully converted ch4 from Nomad server to client
✅ Nomad service restarted
✅ Configuration updated
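A server-side check makes the conversion easier to trust than grepping locally. Something like the task below (a sketch; it assumes the semaphore server is in the same inventory and reachable) confirms ch4 now appears in the cluster's node list:

- name: Confirm ch4 appears in the cluster node list
  shell: NOMAD_ADDR=http://localhost:4646 nomad node status | grep -w ch4
  delegate_to: semaphore
  changed_when: false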

View File

@@ -0,0 +1,82 @@
---
- name: Fix master node - rename to ch4 and restore SSH port 22
hosts: master
become: yes
vars:
new_hostname: ch4
old_hostname: master
tasks:
- name: Backup current hostname
copy:
content: "{{ old_hostname }}"
dest: /etc/hostname.backup
mode: '0644'
when: ansible_hostname == old_hostname
- name: Update hostname to ch4
hostname:
name: "{{ new_hostname }}"
when: ansible_hostname == old_hostname
- name: Update /etc/hostname file
copy:
content: "{{ new_hostname }}"
dest: /etc/hostname
mode: '0644'
when: ansible_hostname == old_hostname
- name: Update /etc/hosts file
lineinfile:
path: /etc/hosts
regexp: '^127\.0\.1\.1.*{{ old_hostname }}'
line: '127.0.1.1 {{ new_hostname }}'
state: present
when: ansible_hostname == old_hostname
- name: Update Tailscale hostname
shell: |
tailscale set --hostname={{ new_hostname }}
when: ansible_hostname == old_hostname
- name: Backup SSH config
copy:
src: /etc/ssh/sshd_config
dest: /etc/ssh/sshd_config.backup
remote_src: yes
backup: yes
- name: Restore SSH port to 22
lineinfile:
path: /etc/ssh/sshd_config
regexp: '^Port '
line: 'Port 22'
state: present
- name: Restart SSH service
systemd:
name: ssh
state: restarted
enabled: yes
- name: Wait for SSH to be ready on port 22
wait_for:
port: 22
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30
- name: Test SSH connection on port 22
ping:
delegate_to: "{{ inventory_hostname }}"
vars:
ansible_port: 22
- name: Display completion message
debug:
msg: |
✅ Successfully renamed {{ old_hostname }} to {{ new_hostname }}
✅ SSH port restored to 22
✅ Tailscale hostname updated
🔄 Please update your inventory file to use the new hostname and port

View File

@@ -0,0 +1,71 @@
---
- name: Install and configure Consul clients on all nodes
hosts: all
become: yes
vars:
consul_servers:
- "100.117.106.136" # ch4 (韩国)
- "100.122.197.112" # warden (北京)
- "100.116.80.94" # ash3c (美国)
tasks:
- name: Get Tailscale IP address
shell: ip addr show tailscale0 | grep 'inet ' | awk '{print $2}' | cut -d/ -f1
register: tailscale_ip_result
changed_when: false
- name: Set Tailscale IP fact
set_fact:
tailscale_ip: "{{ tailscale_ip_result.stdout }}"
- name: Install Consul
apt:
name: consul
state: present
update_cache: yes
- name: Create Consul data directory
file:
path: /opt/consul/data
state: directory
owner: consul
group: consul
mode: '0755'
- name: Create Consul log directory
file:
path: /var/log/consul
state: directory
owner: consul
group: consul
mode: '0755'
- name: Create Consul config directory
file:
path: /etc/consul.d
state: directory
owner: consul
group: consul
mode: '0755'
- name: Generate Consul client configuration
template:
src: consul-client.hcl.j2
dest: /etc/consul.d/consul.hcl
owner: consul
group: consul
mode: '0644'
notify: restart consul
- name: Enable and start Consul service
systemd:
name: consul
enabled: yes
state: started
daemon_reload: yes
handlers:
- name: restart consul
systemd:
name: consul
state: restarted
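The apt task can only resolve the consul package if the HashiCorp repository is already present on every node. Where it is not, a repository step mirroring the Vault playbook later in this commit (a sketch, not part of this play) would have to come first:

- name: Add HashiCorp apt repository (if missing)
  shell: |
    if [ ! -f /etc/apt/sources.list.d/hashicorp.list ]; then
      curl -fsSL https://apt.releases.hashicorp.com/gpg | gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg
      echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" > /etc/apt/sources.list.d/hashicorp.list
    fi
  args:
    creates: /etc/apt/sources.list.d/hashicorp.list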

View File

@@ -0,0 +1,91 @@
---
- name: Install NFS CSI Plugin for Nomad
hosts: nomad_nodes
become: yes
vars:
nomad_user: nomad
nomad_plugins_dir: /opt/nomad/plugins
csi_driver_version: "v4.0.0"
csi_driver_url: "https://github.com/kubernetes-csi/csi-driver-nfs/releases/download/{{ csi_driver_version }}/csi-nfs-driver"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Create plugins directory
file:
path: "{{ nomad_plugins_dir }}"
state: directory
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
- name: Download NFS CSI driver
get_url:
url: "{{ csi_driver_url }}"
dest: "{{ nomad_plugins_dir }}/csi-nfs-driver"
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
- name: Install required packages for CSI
package:
name:
- nfs-common
- mount
state: present
- name: Create CSI mount directory
file:
path: /opt/nomad/csi
state: directory
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
- name: Update Nomad configuration for CSI plugin
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} CSI PLUGIN CONFIGURATION"
block: |
plugin_dir = "{{ nomad_plugins_dir }}"
plugin "csi-nfs" {
type = "csi"
config {
driver_name = "nfs.csi.k8s.io"
mount_dir = "/opt/nomad/csi"
health_timeout = "30s"
log_level = "INFO"
}
}
insertafter: 'data_dir = "/opt/nomad/data"'
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to start
wait_for:
port: 4646
delay: 10
timeout: 60
- name: Check Nomad status
command: nomad node status
register: nomad_status
ignore_errors: yes
- name: Display Nomad status
debug:
var: nomad_status.stdout_lines

View File

@@ -0,0 +1,33 @@
---
- name: Start all Nomad servers to form the cluster
hosts: nomad_servers
become: yes
tasks:
- name: Check the Nomad service status
systemd:
name: nomad
register: nomad_status
- name: Start the Nomad service if it is not running
systemd:
name: nomad
state: started
enabled: yes
when: nomad_status.status.ActiveState != "active"
- name: Wait for the Nomad service to start
wait_for:
port: 4646
host: "{{ ansible_host }}"
timeout: 30
- name: Display the Nomad service status
debug:
msg: "{{ inventory_hostname }} Nomad service status: {{ nomad_status.status.ActiveState }}"

View File

@@ -0,0 +1,61 @@
# Consul Client Configuration for {{ inventory_hostname }}
datacenter = "dc1"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "{{ inventory_hostname }}"
bind_addr = "{{ hostvars[inventory_hostname]['tailscale_ip'] }}"
# Client mode (not server)
server = false
# Connect to Consul servers (points at the three-node server cluster)
retry_join = [
{% for server in consul_servers %}
"{{ server }}"{% if not loop.last %},{% endif %}
{% endfor %}
]
# Performance optimization
performance {
raft_multiplier = 5
}
# Ports configuration
ports {
grpc = 8502
http = 8500
dns = 8600
}
# Enable Connect for service mesh
connect {
enabled = true
}
# Cache configuration for performance
cache {
entry_fetch_max_burst = 42
entry_fetch_rate = 30
}
# Node metadata
node_meta = {
region = "unknown"
zone = "nomad-{{ 'server' if 'server' in group_names else 'client' }}"
}
# UI disabled for clients
ui_config {
enabled = false
}
# ACL configuration (if needed)
acl = {
enabled = false
default_policy = "allow"
}
# Logging
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
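Before an agent restart it is cheap to validate the rendered file. A deployment task along these lines (a sketch, not part of the commit) catches template mistakes early:

- name: Validate the rendered Consul client configuration
  command: consul validate /etc/consul.d
  changed_when: false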

View File

@@ -0,0 +1,106 @@
datacenter = "dc1"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "{{ ansible_hostname }}"
bind_addr = "0.0.0.0"
addresses {
http = "{{ ansible_host }}"
rpc = "{{ ansible_host }}"
serf = "{{ ansible_host }}"
}
advertise {
http = "{{ ansible_host }}:4646"
rpc = "{{ ansible_host }}:4647"
serf = "{{ ansible_host }}:4648"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = true
bootstrap_expect = 3
server_join {
retry_join = [
"semaphore.tailnet-68f9.ts.net:4648",
"ash1d.tailnet-68f9.ts.net:4648",
"ash2e.tailnet-68f9.ts.net:4648",
"ch2.tailnet-68f9.ts.net:4648",
"ch3.tailnet-68f9.ts.net:4648",
"onecloud1.tailnet-68f9.ts.net:4648",
"de.tailnet-68f9.ts.net:4648",
"hcp1.tailnet-68f9.ts.net:4648"
]
}
}
{% if ansible_hostname == 'hcp1' %}
client {
enabled = true
network_interface = "tailscale0"
servers = [
"semaphore.tailnet-68f9.ts.net:4647",
"ash1d.tailnet-68f9.ts.net:4647",
"ash2e.tailnet-68f9.ts.net:4647",
"ch2.tailnet-68f9.ts.net:4647",
"ch3.tailnet-68f9.ts.net:4647",
"onecloud1.tailnet-68f9.ts.net:4647",
"de.tailnet-68f9.ts.net:4647",
"hcp1.tailnet-68f9.ts.net:4647"
]
host_volume "traefik-certs" {
path = "/opt/traefik/certs"
read_only = false
}
host_volume "fnsync" {
path = "/mnt/fnsync"
read_only = false
}
meta {
consul = "true"
consul_version = "1.21.5"
consul_client = "true"
}
gc_interval = "5m"
gc_disk_usage_threshold = 80
gc_inode_usage_threshold = 70
}
plugin "nomad-driver-podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
{% endif %}
consul {
address = "ch4.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500"
server_service_name = "nomad"
client_service_name = "nomad-client"
auto_advertise = true
server_auto_join = false
client_auto_join = true
}
telemetry {
collection_interval = "1s"
disable_hostname = false
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}

View File

@@ -19,7 +19,7 @@
- ip: "100.120.225.29"
hostnames: ["de"]
- ip: "100.117.106.136"
hostnames: ["master"]
hostnames: ["ch4"]
- ip: "100.116.80.94"
hostnames: ["ash3c", "influxdb1"]
- ip: "100.116.112.45"

View File

@@ -0,0 +1,56 @@
---
- name: Update the Nomad server configuration to add hcp1 as a peer
hosts: nomad_servers
become: yes
vars:
hcp1_ip: "100.97.62.111"
bootstrap_expect: 8
tasks:
- name: Back up the original configuration file
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.bak
remote_src: yes
backup: yes
- name: Add hcp1 to the retry_join list
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ retry_join = \['
line: ' retry_join = ["{{ hcp1_ip }}",'
backup: yes
- name: Update bootstrap_expect to 8
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ bootstrap_expect = \d+'
line: ' bootstrap_expect = {{ bootstrap_expect }}'
backup: yes
- name: Restart the Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
- name: Wait for the Nomad service to start
wait_for:
port: 4646
host: "{{ ansible_host }}"
timeout: 30
- name: Check the Nomad service status
systemd:
name: nomad
register: nomad_status
- name: Display the Nomad service status
debug:
msg: "Nomad service status: {{ nomad_status.status.ActiveState }}"

View File

@@ -0,0 +1,72 @@
---
- name: Remove Consul configuration from all Nomad servers
hosts: semaphore,ash1d,ash2e,ch2,ch3,onecloud1,de
become: yes
tasks:
- name: Create clean Nomad server configuration
copy:
content: |
datacenter = "dc1"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "{{ inventory_hostname }}"
bind_addr = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
addresses {
http = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
rpc = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
serf = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
}
advertise {
http = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4646"
rpc = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4647"
serf = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4648"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = true
bootstrap_expect = 7
retry_join = ["ash1d.tailnet-68f9.ts.net","ash2e.tailnet-68f9.ts.net","ch2.tailnet-68f9.ts.net","ch3.tailnet-68f9.ts.net","onecloud1.tailnet-68f9.ts.net","de.tailnet-68f9.ts.net"]
}
client {
enabled = false
}
plugin "nomad-driver-podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
dest: /etc/nomad.d/nomad.hcl
mode: '0644'
- name: Restart Nomad service
systemd:
name: nomad
state: restarted
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30
- name: Display completion message
debug:
msg: "Removed Consul configuration from {{ inventory_hostname }}"

View File

@@ -0,0 +1,62 @@
# Consul Client Configuration for {{ inventory_hostname }}
datacenter = "dc1"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "{{ inventory_hostname }}"
bind_addr = "{{ ansible_host }}"
# Client mode (not server)
server = false
# Connect to Consul servers (points at the three-node server cluster)
retry_join = [
{% for server in consul_servers %}
"{{ server }}"{% if not loop.last %},{% endif %}
{% endfor %}
]
# Performance optimization
performance {
raft_multiplier = 5
}
# Ports configuration
ports {
grpc = 8502
http = 8500
dns = 8600
}
# Enable Connect for service mesh
connect {
enabled = true
}
# Cache configuration for performance
cache {
entry_fetch_max_burst = 42
entry_fetch_rate = 30
}
# Node metadata
node_meta = {
region = "unknown"
zone = "nomad-{{ 'server' if 'server' in group_names else 'client' }}"
}
# UI disabled for clients
ui_config {
enabled = false
}
# ACL configuration (if needed)
acl = {
enabled = false
default_policy = "allow"
}
# Logging
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7

View File

@@ -49,6 +49,11 @@ client {
read_only = false
}
host_volume "vault-storage" {
path = "/opt/nomad/data/vault-storage"
read_only = false
}
# Disable the Docker driver; use Podman only
options {
"driver.raw_exec.enable" = "1"

View File

@@ -2,20 +2,20 @@ datacenter = "dc1"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "{{ server_name }}"
name = "{{ ansible_hostname }}"
bind_addr = "{{ server_name }}.tailnet-68f9.ts.net"
bind_addr = "0.0.0.0"
addresses {
http = "{{ server_name }}.tailnet-68f9.ts.net"
rpc = "{{ server_name }}.tailnet-68f9.ts.net"
serf = "{{ server_name }}.tailnet-68f9.ts.net"
http = "{{ ansible_host }}"
rpc = "{{ ansible_host }}"
serf = "{{ ansible_host }}"
}
advertise {
http = "{{ server_name }}.tailnet-68f9.ts.net:4646"
rpc = "{{ server_name }}.tailnet-68f9.ts.net:4647"
serf = "{{ server_name }}.tailnet-68f9.ts.net:4648"
http = "{{ ansible_host }}:4646"
rpc = "{{ ansible_host }}:4647"
serf = "{{ ansible_host }}:4648"
}
ports {
@@ -26,18 +26,56 @@ ports {
server {
enabled = true
bootstrap_expect = 7
retry_join = [
{%- for server in groups['nomad_servers'] -%}
{%- if server != inventory_hostname -%}
"{{ server }}.tailnet-68f9.ts.net"{% if not loop.last %},{% endif %}
{%- endif -%}
{%- endfor -%}
]
bootstrap_expect = 3
server_join {
retry_join = [
"semaphore.tailnet-68f9.ts.net:4648",
"ash1d.tailnet-68f9.ts.net:4648",
"ash2e.tailnet-68f9.ts.net:4648",
"ch2.tailnet-68f9.ts.net:4648",
"ch3.tailnet-68f9.ts.net:4648",
"onecloud1.tailnet-68f9.ts.net:4648",
"de.tailnet-68f9.ts.net:4648",
"hcp1.tailnet-68f9.ts.net:4648"
]
}
}
{% if ansible_hostname == 'hcp1' %}
client {
enabled = false
enabled = true
network_interface = "tailscale0"
servers = [
"semaphore.tailnet-68f9.ts.net:4647",
"ash1d.tailnet-68f9.ts.net:4647",
"ash2e.tailnet-68f9.ts.net:4647",
"ch2.tailnet-68f9.ts.net:4647",
"ch3.tailnet-68f9.ts.net:4647",
"onecloud1.tailnet-68f9.ts.net:4647",
"de.tailnet-68f9.ts.net:4647",
"hcp1.tailnet-68f9.ts.net:4647"
]
host_volume "traefik-certs" {
path = "/opt/traefik/certs"
read_only = false
}
host_volume "fnsync" {
path = "/mnt/fnsync"
read_only = false
}
meta {
consul = "true"
consul_version = "1.21.5"
consul_client = "true"
}
gc_interval = "5m"
gc_disk_usage_threshold = 80
gc_inode_usage_threshold = 70
}
plugin "nomad-driver-podman" {
@@ -48,20 +86,21 @@ plugin "nomad-driver-podman" {
}
}
}
{% endif %}
consul {
address = "master.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500"
address = "ch4.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500"
server_service_name = "nomad"
client_service_name = "nomad-client"
auto_advertise = true
server_auto_join = true
server_auto_join = false
client_auto_join = true
}
vault {
enabled = true
address = "http://master.tailnet-68f9.ts.net:8200,http://ash3c.tailnet-68f9.ts.net:8200,http://warden.tailnet-68f9.ts.net:8200"
token = "hvs.A5Fu4E1oHyezJapVllKPFsWg"
create_from_role = "nomad-cluster"
tls_skip_verify = true
telemetry {
collection_interval = "1s"
disable_hostname = false
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}

View File

@@ -64,7 +64,7 @@ plugin "nomad-driver-podman" {
}
consul {
address = "master.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500"
address = "ch4.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500"
server_service_name = "nomad"
client_service_name = "nomad-client"
auto_advertise = true
@@ -74,7 +74,7 @@ consul {
vault {
enabled = true
address = "http://master.tailnet-68f9.ts.net:8200,http://ash3c.tailnet-68f9.ts.net:8200,http://warden.tailnet-68f9.ts.net:8200"
address = "http://ch4.tailnet-68f9.ts.net:8200,http://ash3c.tailnet-68f9.ts.net:8200,http://warden.tailnet-68f9.ts.net:8200"
token = "hvs.A5Fu4E1oHyezJapVllKPFsWg"
create_from_role = "nomad-cluster"
tls_skip_verify = true

View File

@@ -0,0 +1,45 @@
# Vault Configuration for {{ inventory_hostname }}
# Storage backend - Consul
storage "consul" {
address = "127.0.0.1:8500"
path = "vault/"
# Consul datacenter
datacenter = "{{ vault_datacenter }}"
# Service registration
service = "vault"
service_tags = "vault-server"
# Session TTL
session_ttl = "15s"
lock_wait_time = "15s"
}
# Listener configuration
listener "tcp" {
address = "0.0.0.0:8200"
tls_disable = 1
}
# API address - use the Tailscale network address
api_addr = "http://{{ ansible_host }}:8200"
# Cluster address - use the Tailscale network address
cluster_addr = "http://{{ ansible_host }}:8201"
# UI
ui = true
# Cluster name
cluster_name = "{{ vault_cluster_name }}"
# Disable mlock for development (remove in production)
disable_mlock = true
# Log level
log_level = "INFO"
# Plugin directory
plugin_directory = "/opt/vault/plugins"

View File

@@ -0,0 +1,34 @@
[Unit]
Description=Vault
Documentation=https://www.vaultproject.io/docs/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/vault.d/vault.hcl
StartLimitIntervalSec=60
StartLimitBurst=3
[Service]
Type=notify
User=vault
Group=vault
ProtectSystem=full
ProtectHome=read-only
PrivateTmp=yes
PrivateDevices=yes
SecureBits=keep-caps
AmbientCapabilities=CAP_IPC_LOCK
CapabilityBoundingSet=CAP_SYSLOG CAP_IPC_LOCK
NoNewPrivileges=yes
ExecStart=/usr/bin/vault server -config=/etc/vault.d/vault.hcl
ExecReload=/bin/kill --signal HUP $MAINPID
KillMode=process
Restart=on-failure
RestartSec=5
TimeoutStopSec=30
StartLimitInterval=60
StartLimitBurst=3
LimitNOFILE=65536
LimitMEMLOCK=infinity
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,66 @@
---
- name: Initialize Vault Cluster
hosts: ch4 # initialize on one node only
become: yes
tasks:
- name: Check if Vault is already initialized
uri:
url: "http://{{ ansible_host }}:8200/v1/sys/health"
method: GET
status_code: [200, 429, 472, 473, 501, 503]
register: vault_health
- name: Initialize Vault (only if not initialized)
uri:
url: "http://{{ ansible_host }}:8200/v1/sys/init"
method: POST
body_format: json
body:
secret_shares: 5
secret_threshold: 3
status_code: 200
register: vault_init_result
when: not vault_health.json.initialized
- name: Save initialization results to local file
copy:
content: |
# Vault Cluster Initialization Results
Generated on: {{ ansible_date_time.iso8601 }}
Initialized by: {{ inventory_hostname }}
## Root Token
{{ vault_init_result.json.root_token }}
## Unseal Keys
{% for key in vault_init_result.json['keys'] %}
Key {{ loop.index }}: {{ key }}
{% endfor %}
## Base64 Unseal Keys
{% for key in vault_init_result.json.keys_base64 %}
Key {{ loop.index }} (base64): {{ key }}
{% endfor %}
## Important Notes
- Store these keys securely and separately
- You need 3 out of 5 keys to unseal Vault
- Root token provides full access to Vault
- Consider revoking root token after initial setup
dest: /tmp/vault-init-results.txt
delegate_to: localhost
when: vault_init_result is defined and vault_init_result.json is defined
- name: Display initialization results
debug:
msg: |
Vault initialized successfully!
Root Token: {{ vault_init_result.json.root_token }}
Unseal Keys: {{ vault_init_result.json['keys'] }}
when: vault_init_result is defined and vault_init_result.json is defined
- name: Display already initialized message
debug:
msg: "Vault is already initialized on {{ inventory_hostname }}"
when: vault_health.json.initialized
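Initialization leaves every node sealed; each one still needs three of the five key shares before it can serve requests. A follow-up task of roughly this shape (a sketch; unseal_keys is an assumed variable holding the saved shares) drives the unseal API:

- name: Unseal Vault with three of the saved key shares
  uri:
    url: "http://{{ ansible_host }}:8200/v1/sys/unseal"
    method: POST
    body_format: json
    body:
      key: "{{ item }}"   # unseal_keys is assumed to be loaded from the saved results file
    status_code: 200
  loop: "{{ unseal_keys[:3] }}"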

View File

@@ -0,0 +1,85 @@
---
- name: Deploy Vault Cluster with Consul Integration
hosts: ch4,ash3c,warden
become: yes
vars:
vault_version: "1.15.2"
vault_datacenter: "dc1"
vault_cluster_name: "vault-cluster"
tasks:
- name: Update apt cache
apt:
update_cache: yes
cache_valid_time: 3600
- name: Add HashiCorp GPG key (if not exists)
shell: |
if [ ! -f /etc/apt/sources.list.d/hashicorp.list ]; then
curl -fsSL https://apt.releases.hashicorp.com/gpg | gpg --dearmor | sudo tee /usr/share/keyrings/hashicorp-archive-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
fi
args:
creates: /etc/apt/sources.list.d/hashicorp.list
- name: Install Vault
apt:
name: vault
state: present
update_cache: yes
allow_downgrade: yes
- name: Create vault user and directories
block:
- name: Create vault data directory
file:
path: /opt/vault/data
state: directory
owner: vault
group: vault
mode: '0755'
- name: Create vault config directory
file:
path: /etc/vault.d
state: directory
owner: vault
group: vault
mode: '0755'
- name: Generate Vault configuration
template:
src: vault.hcl.j2
dest: /etc/vault.d/vault.hcl
owner: vault
group: vault
mode: '0640'
notify: restart vault
- name: Create Vault systemd service
template:
src: vault.service.j2
dest: /etc/systemd/system/vault.service
owner: root
group: root
mode: '0644'
notify:
- reload systemd
- restart vault
- name: Enable and start Vault service
systemd:
name: vault
enabled: yes
state: started
daemon_reload: yes
handlers:
- name: reload systemd
systemd:
daemon_reload: yes
- name: restart vault
systemd:
name: vault
state: restarted

View File

@@ -0,0 +1,67 @@
---
- name: Verify Vault Cluster Status
hosts: ch4,ash3c,warden
become: yes
tasks:
- name: Check Vault service status
systemd:
name: vault
register: vault_service_status
- name: Display Vault service status
debug:
msg: "Vault service on {{ inventory_hostname }}: {{ vault_service_status.status.ActiveState }}"
- name: Check Vault process
shell: ps aux | grep vault | grep -v grep
register: vault_process
ignore_errors: yes
- name: Display Vault process
debug:
msg: "Vault process on {{ inventory_hostname }}: {{ vault_process.stdout_lines }}"
- name: Check Vault port 8200
wait_for:
port: 8200
host: "{{ ansible_default_ipv4.address }}"
timeout: 10
register: vault_port_check
ignore_errors: yes
- name: Display port check result
debug:
msg: "Vault port 8200 on {{ inventory_hostname }}: {{ 'OPEN' if vault_port_check.failed == false else 'CLOSED' }}"
- name: Get Vault status
uri:
url: "http://{{ ansible_default_ipv4.address }}:8200/v1/sys/health"
method: GET
status_code: [200, 429, 472, 473, 501, 503]
register: vault_health
ignore_errors: yes
- name: Display Vault health status
debug:
msg: "Vault health on {{ inventory_hostname }}: {{ vault_health.json if vault_health.json is defined else 'Connection failed' }}"
- name: Check Consul integration
uri:
url: "http://127.0.0.1:8500/v1/kv/vault/?recurse"
method: GET
register: consul_vault_kv
ignore_errors: yes
- name: Display Consul Vault KV
debug:
msg: "Consul Vault KV on {{ inventory_hostname }}: {{ 'Found vault keys' if consul_vault_kv.status == 200 else 'No vault keys found' }}"
- name: Check Vault logs for errors
shell: journalctl -u vault --no-pager -n 10 | grep -i error || echo "No errors found"
register: vault_logs
ignore_errors: yes
- name: Display Vault error logs
debug:
msg: "Vault errors on {{ inventory_hostname }}: {{ vault_logs.stdout_lines }}"