Commit: updated

scripts/utilities/NUCLEAR-NOMAD-RESET.yml (new normal file, 375 lines)
@@ -0,0 +1,375 @@
---
# ☢️ NUCLEAR NOMAD RESET ☢️
# An even stronger repair script than the "ultimate" fix
# Warning: this completely destroys and rebuilds the Nomad cluster
- name: "☢️ NUCLEAR NOMAD RESET - nuclear-grade cluster reset ☢️"
  hosts: nomad_cluster
  become: yes
  gather_facts: yes
  serial: 1  # one node at a time, so we never blow up every node at once
  vars:
    nomad_version: "1.10.5"
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    tailscale_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"

  tasks:
    - name: "🚨 Warning: about to perform a nuclear-grade reset"
      debug:
        msg: |
          ☢️☢️☢️ WARNING: about to nuke and rebuild {{ inventory_hostname }} ☢️☢️☢️
          This will completely destroy all Nomad data, configuration and processes!
          If you are not sure, press Ctrl+C NOW to cancel!

    - name: "⏰ Waiting 10 seconds - your last chance to cancel..."
      pause:
        seconds: 10

    # ========== Phase 1: nuclear-grade cleanup ==========
    - name: "💀 Phase 1: nuclear-grade process cleanup"
      debug:
        msg: "Starting nuclear-grade process cleanup..."

    - name: "🔥 Stop the Nomad service (if present)"
      systemd:
        name: nomad
        state: stopped
        enabled: no
        daemon_reload: yes
      ignore_errors: yes

    - name: "💣 Force-kill all Nomad-related processes"
      shell: |
        # Kill every nomad process
        pkill -9 -f nomad || true
        # Kill any possible child processes
        pkill -9 -f "nomad agent" || true
        pkill -9 -f "nomad server" || true
        pkill -9 -f "nomad client" || true
        # Wait for the processes to die completely
        sleep 5
        # Kill once more, just to be sure
        ps aux | grep nomad | grep -v grep | awk '{print $2}' | xargs -r kill -9 || true
      ignore_errors: yes

    - name: "🧹 Remove all Nomad-related files and directories"
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /opt/nomad
        - /etc/nomad.d
        - /var/log/nomad
        - /etc/systemd/system/nomad.service
        - /usr/local/bin/nomad
        - /usr/bin/nomad
        - /tmp/nomad*  # note: the file module does not expand globs, so only a literal "/tmp/nomad*" path would match
        - /var/lib/nomad
        - /run/nomad
        - /var/run/nomad.pid
      ignore_errors: yes

    - name: "🔧 Refresh the systemd unit cache"
      systemd:
        daemon_reload: yes

    # ========== Phase 2: reinstall Nomad ==========
    - name: "🚀 Phase 2: reinstall Nomad"
      debug:
        msg: "Starting the Nomad reinstall..."

    - name: "🔑 Add the HashiCorp GPG key"
      apt_key:
        url: https://apt.releases.hashicorp.com/gpg
        state: present

    - name: "📦 Add the HashiCorp APT repository"
      apt_repository:
        repo: "deb [arch={{ ansible_architecture }}] https://apt.releases.hashicorp.com {{ ansible_distribution_release }} main"
        state: present
        update_cache: yes

    - name: "🔧 Install Nomad (architecture auto-detected)"
      apt:
        name: "nomad={{ nomad_version }}-1"
        state: present
        update_cache: yes

    - name: "👤 Create the nomad group"
      group:
        name: nomad
        state: present

    - name: "👤 Create the nomad user"
      user:
        name: nomad
        group: nomad
        system: yes
        shell: /bin/false
        home: /opt/nomad
        create_home: no

    - name: "📁 Create a fresh directory layout"
      file:
        path: "{{ item.path }}"
        state: directory
        owner: "{{ item.owner | default('nomad') }}"
        group: "{{ item.group | default('nomad') }}"
        mode: "{{ item.mode | default('0755') }}"
      loop:
        - { path: "/etc/nomad.d", mode: "0755" }
        - { path: "/opt/nomad", mode: "0755" }
        - { path: "/opt/nomad/data", mode: "0755" }
        - { path: "/opt/nomad/alloc_mounts", mode: "0755" }
        - { path: "/var/log/nomad", mode: "0755" }

    # ========== Phase 3: network and firewall checks ==========
    - name: "🌐 Phase 3: network configuration validation"
      debug:
        msg: "Validating the network configuration..."

    - name: "🔍 Check that the Tailscale IP is actually bound"
      shell: |
        ip addr show | grep "{{ tailscale_ips[inventory_hostname] }}" || echo "IP_NOT_FOUND"
      register: ip_check

    - name: "⚠️ IP address check result"
      debug:
        msg: |
          Node: {{ inventory_hostname }}
          Expected IP: {{ tailscale_ips[inventory_hostname] }}
          Check result: {{ ip_check.stdout }}
          {% if 'IP_NOT_FOUND' in ip_check.stdout %}
          ❌ Warning: the IP address is not bound correctly!
          {% else %}
          ✅ IP address check passed
          {% endif %}

    - name: "🔥 Make sure the firewall ports are open"
      shell: |
        # Open the Nomad ports on whichever firewall is installed
        if command -v ufw >/dev/null 2>&1; then
          ufw allow 4646/tcp  # HTTP API
          ufw allow 4647/tcp  # RPC
          ufw allow 4648/tcp  # Serf
        elif command -v firewall-cmd >/dev/null 2>&1; then
          firewall-cmd --permanent --add-port=4646/tcp
          firewall-cmd --permanent --add-port=4647/tcp
          firewall-cmd --permanent --add-port=4648/tcp
          firewall-cmd --reload
        fi
      ignore_errors: yes

    # ========== Phase 4: write the hardened configuration ==========
    - name: "⚙️ Phase 4: create the hardened configuration file"
      debug:
        msg: "Creating the hardened configuration file..."

    - name: "📝 Write the nuclear-grade Nomad configuration"
      copy:
        content: |
          # ☢️ Nuclear-grade Nomad configuration - {{ inventory_hostname }}
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"

          # Bind to the correct Tailscale IP
          bind_addr = "{{ tailscale_ips[inventory_hostname] }}"

          # Logging
          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
          log_rotate_duration = "24h"
          log_rotate_max_files = 5

          server {
            enabled = true
            bootstrap_expect = 3
            encrypt = "{{ nomad_encrypt_key }}"

            # More aggressive retry-join settings
            server_join {
              retry_join = [
                "{{ tailscale_ips.semaphore }}:4647",
                "{{ tailscale_ips.master }}:4647",
                "{{ tailscale_ips.ash3c }}:4647"
              ]
              retry_max = 10
              retry_interval = "15s"
            }

            # More forgiving heartbeat settings
            heartbeat_grace = "30s"
            min_heartbeat_ttl = "10s"
            max_heartbeats_per_second = 50.0

            # Raft tuning
            raft_protocol = 3
            raft_multiplier = 1
          }

          client {
            enabled = true

            # Network interface
            network_interface = "tailscale0"

            # More forgiving kill timeout
            max_kill_timeout = "30s"

            # Host volumes
            host_volume "docker-sock" {
              path = "/var/run/docker.sock"
              read_only = false
            }
          }

          # Address and port configuration
          addresses {
            http = "0.0.0.0"
            rpc = "{{ tailscale_ips[inventory_hostname] }}"
            serf = "{{ tailscale_ips[inventory_hostname] }}"
          }

          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }

          # Docker plugin configuration
          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }

              # More forgiving garbage collection
              gc {
                image = true
                image_delay = "10m"
                container = true
                dangling_containers {
                  enabled = true
                  dry_run = false
                  period = "5m"
                  creation_grace = "5m"
                }
              }
            }
          }

          # Telemetry
          telemetry {
            collection_interval = "10s"
            disable_hostname = false
            prometheus_metrics = true
            publish_allocation_metrics = true
            publish_node_metrics = true
          }
        dest: "/etc/nomad.d/nomad.hcl"
        owner: nomad
        group: nomad
        mode: '0640'

    # ========== Phase 5: create the hardened systemd service ==========
    - name: "🔧 Create the hardened systemd service file"
      copy:
        content: |
          [Unit]
          Description=Nomad - Nuclear Edition
          Documentation=https://www.nomadproject.io/
          Wants=network-online.target
          After=network-online.target
          ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl

          [Service]
          Type=notify
          User=nomad
          Group=nomad
          ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
          ExecReload=/bin/kill -HUP $MAINPID
          KillMode=process
          Restart=always
          RestartSec=10
          LimitNOFILE=65536

          # A stronger restart policy
          StartLimitInterval=0
          StartLimitBurst=10

          # Environment
          Environment=NOMAD_DISABLE_UPDATE_CHECK=1

          [Install]
          WantedBy=multi-user.target
        dest: "/etc/systemd/system/nomad.service"
        owner: root
        group: root
        mode: '0644'

    - name: "🔄 Reload systemd"
      systemd:
        daemon_reload: yes

    # ========== Phase 6: start and verify ==========
    - name: "🚀 Phase 6: start the service"
      debug:
        msg: "Starting the Nomad service..."

    - name: "🔥 Enable and start the Nomad service"
      systemd:
        name: nomad
        enabled: yes
        state: started
        daemon_reload: yes

    - name: "⏰ Wait for the service to come up"
      pause:
        seconds: 15

    - name: "🔍 Verify the service state"
      systemd:
        name: nomad
      register: nomad_service_status

    - name: "📊 Show the service state"
      debug:
        msg: |
          ☢️ Nuclear-grade reset finished!
          Node: {{ inventory_hostname }}
          Service state: {{ nomad_service_status.status.ActiveState }}
          IP address: {{ tailscale_ips[inventory_hostname] }}

          {% if nomad_service_status.status.ActiveState == 'active' %}
          ✅ Service started successfully!
          {% else %}
          ❌ Service failed to start - check the logs!
          {% endif %}

    - name: "🧹 Clean up temporary files"
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
        - "/tmp/nomad"
      ignore_errors: yes

    - name: "🎉 Nuclear-grade reset completion notice"
      debug:
        msg: |
          ☢️☢️☢️ Nuclear-grade reset complete! ☢️☢️☢️

          Node {{ inventory_hostname }} has been completely destroyed and rebuilt!

          Next steps:
          1. Wait for every node to finish its reset
          2. Check cluster status: nomad server members
          3. Check node status: nomad node status
          4. If it still doesn't work after this, nothing will... 😅
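
Note: the playbook above (and most of the files below) ships a hardcoded gossip encryption key. If you rotate it, a fresh 32-byte base64 key can be produced with the Nomad CLI; a minimal sketch, assuming the nomad binary is on the PATH:

#!/bin/bash
# Generate a new gossip encryption key to replace the nomad_encrypt_key
# variable used throughout this commit.
NEW_KEY=$(nomad operator keygen)
echo "new nomad_encrypt_key: ${NEW_KEY}"
# Every server must share the same key, so rerun the playbook on all nodes
# after updating the variable.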

scripts/utilities/check-nomad-cluster.sh (new executable file, 37 lines)
@@ -0,0 +1,37 @@
#!/bin/bash

echo "=== Nomad cluster status check ==="

# Check the service state on every node
echo "1. Checking service state..."
ansible nomad_cluster -i /root/mgmt/configuration/inventories/production/nomad-cluster.ini -m shell -a "systemctl is-active nomad" 2>/dev/null

echo -e "\n2. Checking network connectivity..."
# Probe the Nomad ports on each node
for ip in 100.116.158.95 100.117.106.136 100.116.80.94; do
    echo "Checking connectivity to $ip..."
    timeout 5 nc -zv $ip 4646 2>&1 | grep -E "(succeeded|open)"
    timeout 5 nc -zv $ip 4647 2>&1 | grep -E "(succeeded|open)"
    timeout 5 nc -zv $ip 4648 2>&1 | grep -E "(succeeded|open)"
done

echo -e "\n3. Checking Nomad cluster members..."
# Try to query the cluster members
if nomad server members 2>/dev/null; then
    echo "Cluster member query succeeded"
else
    echo "Could not query cluster members - there may be no leader"
fi

echo -e "\n4. Checking node status..."
if nomad node status 2>/dev/null; then
    echo "Node status query succeeded"
else
    echo "Could not query node status"
fi

echo -e "\n5. Checking recent logs..."
echo "=== Semaphore node log ==="
journalctl -u nomad -n 5 --no-pager 2>/dev/null | tail -5

echo -e "\n=== Check complete ==="
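
The same checks can be made without the nomad CLI by hitting the HTTP API the agents expose on port 4646; a minimal sketch, using the same /v1/status/leader endpoint that ultimate-nomad-fix.yml below queries:

#!/bin/bash
# Query each node's HTTP API directly; an empty leader response means no
# Raft leader has been elected yet.
for ip in 100.116.158.95 100.117.106.136 100.116.80.94; do
    leader=$(curl -s --max-time 5 "http://${ip}:4646/v1/status/leader")
    echo "${ip}: leader=${leader:-<no response>}"
done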

scripts/utilities/complete-nomad-cluster-fix.yml (new normal file, 189 lines)
@@ -0,0 +1,189 @@
---
- name: Complete Nomad Cluster Fix with Ansible
  hosts: nomad_cluster
  become: yes
  gather_facts: yes
  vars:
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    tailscale_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"

  tasks:
    - name: Stop nomad service completely
      systemd:
        name: nomad
        state: stopped
        enabled: yes
      ignore_errors: yes

    - name: Kill any remaining nomad processes
      shell: pkill -f nomad || true
      ignore_errors: yes

    - name: Reset systemd failure state
      shell: systemctl reset-failed nomad
      ignore_errors: yes

    - name: Create nomad user if not exists
      user:
        name: nomad
        system: yes
        shell: /bin/false
        home: /opt/nomad
        create_home: no

    - name: Create all required directories with correct permissions
      file:
        path: "{{ item }}"
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'
      loop:
        - /opt/nomad
        - /opt/nomad/data
        - /opt/nomad/alloc_mounts
        - /var/log/nomad
        - /etc/nomad.d

    - name: Completely clean nomad data directory
      # the ".*" glob also matches "." and "..", which rm refuses; ignore_errors absorbs the noise
      shell: rm -rf /opt/nomad/data/* /opt/nomad/data/.*
      ignore_errors: yes

    - name: Create correct nomad configuration
      copy:
        content: |
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"

          bind_addr = "{{ tailscale_ips[inventory_hostname] }}"

          server {
            enabled = true
            bootstrap_expect = 3
            encrypt = "{{ nomad_encrypt_key }}"

            server_join {
              retry_join = [
                "{{ tailscale_ips.semaphore }}:4647",
                "{{ tailscale_ips.master }}:4647",
                "{{ tailscale_ips.ash3c }}:4647"
              ]
              retry_interval = "15s"
              retry_max = 3
            }
          }

          client {
            enabled = true
            alloc_dir = "/opt/nomad/alloc_mounts"
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc = "{{ tailscale_ips[inventory_hostname] }}"
            serf = "{{ tailscale_ips[inventory_hostname] }}"
          }

          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }

          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
          log_rotate_duration = "24h"
          log_rotate_max_files = 5
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    - name: Set correct ownership for all nomad files
      file:
        path: "{{ item }}"
        owner: nomad
        group: nomad
        recurse: yes
      loop:
        - /opt/nomad
        - /var/log/nomad
        - /etc/nomad.d

    - name: Validate nomad configuration
      shell: nomad config validate /etc/nomad.d/nomad.hcl
      register: config_validation
      ignore_errors: yes

    - name: Show config validation result
      debug:
        var: config_validation

    - name: Start nomad service on first node (semaphore)
      systemd:
        name: nomad
        state: started
        daemon_reload: yes
      when: inventory_hostname == 'semaphore'

    - name: Wait for first node to start
      pause:
        seconds: 30
      when: inventory_hostname == 'semaphore'

    - name: Start nomad service on remaining nodes
      systemd:
        name: nomad
        state: started
        daemon_reload: yes
      when: inventory_hostname != 'semaphore'

    - name: Wait for all services to start
      pause:
        seconds: 20

    - name: Check nomad service status
      shell: systemctl status nomad --no-pager -l
      register: service_status
      ignore_errors: yes

    - name: Show service status
      debug:
        var: service_status.stdout_lines

    - name: Check nomad logs for errors
      shell: journalctl -u nomad -n 10 --no-pager
      register: nomad_logs
      ignore_errors: yes

    - name: Show recent nomad logs
      debug:
        var: nomad_logs.stdout_lines

    - name: Test nomad connectivity
      shell: nomad server members
      register: nomad_members
      ignore_errors: yes
      when: inventory_hostname == 'semaphore'

    - name: Show cluster members
      debug:
        var: nomad_members.stdout_lines
      when: inventory_hostname == 'semaphore'

scripts/utilities/complete-nomad-reset.yml (new normal file, 151 lines)
@@ -0,0 +1,151 @@
---
- name: Complete Nomad Cluster Reset and Rebuild
  hosts: nomad_cluster
  become: yes
  serial: 1  # one node at a time
  vars:
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    tailscale_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"

  tasks:
    - name: Stop nomad service completely
      systemd:
        name: nomad
        state: stopped
      ignore_errors: yes

    - name: Kill any remaining nomad processes
      shell: pkill -f nomad || true
      ignore_errors: yes

    - name: Remove all nomad data and state
      shell: |
        rm -rf /opt/nomad/data/*
        rm -rf /opt/nomad/data/.*
        rm -rf /var/log/nomad/*
      ignore_errors: yes

    - name: Create fresh nomad configuration with correct Tailscale IPs
      copy:
        content: |
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"

          # Use the Tailscale IP address
          bind_addr = "{{ tailscale_ips[inventory_hostname] }}"

          server {
            enabled = true
            bootstrap_expect = 3
            encrypt = "{{ nomad_encrypt_key }}"

            server_join {
              retry_join = [
                "{{ tailscale_ips.semaphore }}",
                "{{ tailscale_ips.master }}",
                "{{ tailscale_ips.ash3c }}"
              ]
            }
          }

          client {
            enabled = true
            network_interface = "tailscale0"
          }

          # note: "ui_config" is Consul's stanza name; Nomad's agent config calls it "ui"
          ui_config {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc = "{{ tailscale_ips[inventory_hostname] }}"
            serf = "{{ tailscale_ips[inventory_hostname] }}"
          }

          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }

          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    - name: Ensure log directory exists
      file:
        path: /var/log/nomad
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'

    - name: Start nomad service
      systemd:
        name: nomad
        state: started
        enabled: yes

    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ tailscale_ips[inventory_hostname] }}"
        delay: 5
        timeout: 30

    - name: Check nomad service status
      shell: systemctl status nomad --no-pager -l
      register: nomad_status
      ignore_errors: yes

    - name: Display nomad status
      debug:
        var: nomad_status.stdout_lines

- name: Wait for cluster to form
  hosts: localhost
  gather_facts: no
  tasks:
    - name: Wait for cluster formation
      pause:
        seconds: 30
        prompt: "Waiting for the cluster to form..."

- name: Verify cluster status
  hosts: semaphore
  become: yes
  tasks:
    - name: Check cluster members
      shell: nomad server members
      register: cluster_members
      ignore_errors: yes

    - name: Display cluster members
      debug:
        var: cluster_members.stdout_lines

    - name: Check node status
      shell: nomad node status
      register: node_status
      ignore_errors: yes

    - name: Display node status
      debug:
        var: node_status.stdout_lines

scripts/utilities/consul-cluster-manager.sh (new executable file, 233 lines)
@@ -0,0 +1,233 @@
#!/bin/bash

# Consul cluster management script
# Provides cluster status checks, restart, stop and related helpers

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
INVENTORY_FILE="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini"

# Color definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Colored message helpers
print_status() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

print_header() {
    echo -e "${BLUE}=== $1 ===${NC}"
}

# Check prerequisites
check_prerequisites() {
    if [[ ! -f "$INVENTORY_FILE" ]]; then
        print_error "Inventory file not found: $INVENTORY_FILE"
        exit 1
    fi

    if ! command -v ansible &> /dev/null; then
        print_error "ansible command not found"
        exit 1
    fi
}

# Show help
show_help() {
    echo "Consul cluster management script"
    echo
    echo "Usage: $0 [command]"
    echo
    echo "Commands:"
    echo "  status   - check cluster status"
    echo "  members  - show cluster members"
    echo "  leader   - show the cluster leader"
    echo "  restart  - restart the Consul service"
    echo "  stop     - stop the Consul service"
    echo "  start    - start the Consul service"
    echo "  logs     - view service logs"
    echo "  health   - health check"
    echo "  cleanup  - wipe Consul data (dangerous)"
    echo "  help     - show this help message"
    echo
}

# Check cluster status
check_status() {
    print_header "Consul service status"
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "systemctl is-active consul" -o

    echo
    print_header "Consul process status"
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "ps aux | grep consul | grep -v grep" -o
}

# Show cluster members
show_members() {
    print_header "Consul cluster members"
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul members" -o
}

# Show the cluster leader
show_leader() {
    print_header "Consul cluster leader"
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul operator raft list-peers" -o

    echo
    print_header "Leader check via the API"
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "curl -s http://localhost:8500/v1/status/leader" -o
}

# Restart the service
restart_service() {
    print_header "Restart Consul service"
    print_warning "About to restart every Consul node..."
    read -p "Continue? (y/N): " confirm
    if [[ $confirm != "y" && $confirm != "Y" ]]; then
        print_status "Operation cancelled"
        return
    fi

    ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=restarted" -b

    print_status "Waiting for the service to start..."
    sleep 10
    check_status
}

# Stop the service
stop_service() {
    print_header "Stop Consul service"
    print_warning "About to stop every Consul node..."
    read -p "Continue? (y/N): " confirm
    if [[ $confirm != "y" && $confirm != "Y" ]]; then
        print_status "Operation cancelled"
        return
    fi

    ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=stopped" -b
}

# Start the service
start_service() {
    print_header "Start Consul service"
    ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=started" -b

    print_status "Waiting for the service to start..."
    sleep 10
    check_status
}

# View logs
show_logs() {
    print_header "Consul service logs"
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "journalctl -u consul --no-pager -n 20" -o
}

# Health check
health_check() {
    print_header "Consul health check"

    # Service state
    print_status "Checking service state..."
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "systemctl is-active consul" -o

    echo
    # Listening ports
    print_status "Checking listening ports..."
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "ss -tlnp | grep :8500" -o

    echo
    # Cluster members
    print_status "Checking cluster members..."
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul members | wc -l" -o

    echo
    # API response
    print_status "Checking API response..."
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "curl -s -o /dev/null -w '%{http_code}' http://localhost:8500/v1/status/leader" -o
}

# Wipe data (dangerous)
cleanup_data() {
    print_header "Wipe Consul data"
    print_error "Warning: this deletes ALL Consul data, including service registrations and the KV store!"
    print_error "This operation cannot be undone!"
    echo
    read -p "Really wipe all data? Type 'YES' to confirm: " confirm
    if [[ $confirm != "YES" ]]; then
        print_status "Operation cancelled"
        return
    fi

    print_status "Stopping the Consul service..."
    ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=stopped" -b

    print_status "Cleaning the data directory..."
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "rm -rf /opt/consul/data/*" -b

    print_status "Starting the Consul service..."
    ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=started" -b

    print_status "Data wipe complete"
}

# Main entry point
main() {
    check_prerequisites

    case "${1:-help}" in
        status)
            check_status
            ;;
        members)
            show_members
            ;;
        leader)
            show_leader
            ;;
        restart)
            restart_service
            ;;
        stop)
            stop_service
            ;;
        start)
            start_service
            ;;
        logs)
            show_logs
            ;;
        health)
            health_check
            ;;
        cleanup)
            cleanup_data
            ;;
        help|--help|-h)
            show_help
            ;;
        *)
            print_error "Unknown command: $1"
            echo
            show_help
            exit 1
            ;;
    esac
}

main "$@"
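
Typical invocations of the manager script, taken from its own help text:

#!/bin/bash
./scripts/utilities/consul-cluster-manager.sh status   # service + process state on every node
./scripts/utilities/consul-cluster-manager.sh leader   # raft peers plus the /v1/status/leader API
./scripts/utilities/consul-cluster-manager.sh health   # ports, member count, API response code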

scripts/utilities/correct-nomad-cluster.yml (new normal file, 115 lines)
@@ -0,0 +1,115 @@
---
- name: Correct Nomad Cluster Configuration
  hosts: nomad_cluster
  become: yes
  gather_facts: yes
  vars:
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    tailscale_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"

  tasks:
    - name: Stop nomad service
      systemd:
        name: nomad
        state: stopped
      ignore_errors: yes

    - name: Clean nomad data
      file:
        path: /opt/nomad/data
        state: absent

    - name: Recreate nomad data directory
      file:
        path: /opt/nomad/data
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'

    - name: Create correct nomad configuration
      copy:
        content: |
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"

          bind_addr = "{{ tailscale_ips[inventory_hostname] }}"

          server {
            enabled = true
            bootstrap_expect = 3
            encrypt = "{{ nomad_encrypt_key }}"

            server_join {
              retry_join = [
                "{{ tailscale_ips.semaphore }}:4647",
                "{{ tailscale_ips.master }}:4647",
                "{{ tailscale_ips.ash3c }}:4647"
              ]
              retry_interval = "15s"
              retry_max = 3
            }
          }

          client {
            enabled = true
            alloc_dir = "/opt/nomad/alloc_mounts"
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc = "{{ tailscale_ips[inventory_hostname] }}"
            serf = "{{ tailscale_ips[inventory_hostname] }}"
          }

          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }

          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

- name: Start nomad services in sequence
  hosts: nomad_cluster
  become: yes
  serial: 1
  vars:
    # vars are play-scoped, so the IP map has to be repeated for this play
    tailscale_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"
  tasks:
    - name: Start nomad service
      systemd:
        name: nomad
        state: started
        daemon_reload: yes

    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ tailscale_ips[inventory_hostname] }}"
        delay: 10
        timeout: 60

    - name: Wait between nodes
      pause:
        seconds: 30

scripts/utilities/deploy-nomad-configs.yml (new normal file, 113 lines)
@@ -0,0 +1,113 @@
---
- name: Deploy Nomad Configurations
  hosts: nomad_cluster
  become: yes
  vars:
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    node_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"

  tasks:
    - name: Create nomad configuration for each node
      copy:
        content: |
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"

          bind_addr = "{{ node_ips[inventory_hostname] }}"

          server {
            enabled = true
            bootstrap_expect = 3
            encrypt = "{{ nomad_encrypt_key }}"

            server_join {
              retry_join = [
                "{{ node_ips.semaphore }}:4647",
                "{{ node_ips.master }}:4647",
                "{{ node_ips.ash3c }}:4647"
              ]
              retry_interval = "15s"
              retry_max = 3
            }
          }

          client {
            enabled = true
            alloc_dir = "/opt/nomad/alloc_mounts"
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc = "{{ node_ips[inventory_hostname] }}"
            serf = "{{ node_ips[inventory_hostname] }}"
          }

          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }

          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    - name: Validate nomad configuration
      shell: nomad config validate /etc/nomad.d/nomad.hcl
      register: config_validation

    - name: Show validation result
      debug:
        var: config_validation.stdout_lines

    - name: Start nomad service on bootstrap node first
      systemd:
        name: nomad
        state: started
        daemon_reload: yes
      when: inventory_hostname == 'semaphore'

    - name: Wait for bootstrap node
      pause:
        seconds: 15
      when: inventory_hostname == 'semaphore'

    - name: Start nomad service on other nodes
      systemd:
        name: nomad
        state: started
        daemon_reload: yes
      when: inventory_hostname != 'semaphore'

    - name: Wait for services to start
      pause:
        seconds: 10

    - name: Check service status
      shell: systemctl status nomad --no-pager
      register: service_status
      ignore_errors: yes

    - name: Show service status
      debug:
        var: service_status.stdout_lines

scripts/utilities/final-nomad-cluster-fix.yml (new normal file, 190 lines)
@@ -0,0 +1,190 @@
---
- name: Final Complete Nomad Cluster Fix
  hosts: nomad_cluster
  become: yes
  gather_facts: yes
  vars:
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    nomad_servers:
      - "100.116.158.95:4647"   # semaphore
      - "100.117.106.136:4647"  # master
      - "100.116.80.94:4647"    # ash3c

  tasks:
    - name: Stop nomad service
      systemd:
        name: nomad
        state: stopped
      ignore_errors: yes

    - name: Reset failed nomad service
      systemd:
        name: nomad
        daemon_reload: yes
      ignore_errors: yes

    - name: Create nomad user if not exists
      user:
        name: nomad
        system: yes
        shell: /bin/false
        home: /opt/nomad
        create_home: no

    - name: Create nomad directories with correct permissions
      file:
        path: "{{ item }}"
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'
      loop:
        - /etc/nomad.d
        - /opt/nomad
        - /opt/nomad/data
        - /opt/nomad/alloc_mounts
        - /var/log/nomad

    - name: Clean old nomad data
      file:
        path: /opt/nomad/data
        state: absent

    - name: Recreate nomad data directory
      file:
        path: /opt/nomad/data
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'

    - name: Get Tailscale IP address
      shell: ip addr show tailscale0 | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1
      register: tailscale_ip
      failed_when: false

    - name: Set bind address (fallback to default interface if tailscale not available)
      set_fact:
        bind_address: "{{ tailscale_ip.stdout if tailscale_ip.stdout != '' else ansible_default_ipv4.address }}"

    # note: the template must already exist next to this playbook; the play at the
    # bottom only writes it to /tmp/nomad-server.hcl.j2 on localhost, so copy it
    # into place before the first run
    - name: Generate nomad configuration
      template:
        src: nomad-server.hcl.j2
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'
      vars:
        nomad_datacenter: "dc1"
        nomad_region: "global"
        nomad_data_dir: "/opt/nomad/data"
        nomad_bind_addr: "{{ bind_address }}"
        nomad_bootstrap_expect: 3
        nomad_encrypt: "{{ nomad_encrypt_key }}"
        nomad_retry_join: "{{ nomad_servers }}"
        nomad_alloc_dir: "/opt/nomad/alloc_mounts"
        nomad_log_file: "/var/log/nomad/nomad.log"

    - name: Create nomad systemd service
      copy:
        content: |
          [Unit]
          Description=Nomad
          Documentation=https://www.nomadproject.io/
          Requires=network-online.target
          After=network-online.target
          ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl

          [Service]
          Type=notify
          User=nomad
          Group=nomad
          ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
          ExecReload=/bin/kill -HUP $MAINPID
          KillMode=process
          Restart=on-failure
          LimitNOFILE=65536

          [Install]
          WantedBy=multi-user.target
        dest: /etc/systemd/system/nomad.service
        mode: '0644'

    - name: Reload systemd daemon
      systemd:
        daemon_reload: yes

    - name: Start nomad service
      systemd:
        name: nomad
        state: started
        enabled: yes

    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ bind_address }}"
        delay: 5
        timeout: 30
      ignore_errors: yes

- name: Create nomad configuration template
  hosts: localhost
  gather_facts: no
  tasks:
    - name: Create nomad server template
      copy:
        # the {% raw %} guard keeps Ansible from rendering the template's own
        # Jinja placeholders while writing the file
        content: |
          {% raw %}
          datacenter = "{{ nomad_datacenter }}"
          region = "{{ nomad_region }}"
          data_dir = "{{ nomad_data_dir }}"

          bind_addr = "{{ nomad_bind_addr }}"

          server {
            enabled = true
            bootstrap_expect = {{ nomad_bootstrap_expect }}
            encrypt = "{{ nomad_encrypt }}"

            server_join {
              retry_join = {{ nomad_retry_join | to_json }}
              retry_interval = "15s"
              retry_max = 3
            }
          }

          client {
            enabled = true
            alloc_dir = "{{ nomad_alloc_dir }}"
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc = "{{ nomad_bind_addr }}"
            serf = "{{ nomad_bind_addr }}"
          }

          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }

          log_level = "INFO"
          log_file = "{{ nomad_log_file }}"
          {% endraw %}
        dest: /tmp/nomad-server.hcl.j2
      delegate_to: localhost
      run_once: true

scripts/utilities/final-nomad-fix.yml (new normal file, 111 lines)
@@ -0,0 +1,111 @@
---
- name: Final Nomad Cluster Fix
  hosts: nomad_cluster
  become: yes
  vars:
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    tailscale_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"

  tasks:
    - name: Stop nomad service
      systemd:
        name: nomad
        state: stopped
      ignore_errors: yes

    - name: Create required directories
      file:
        path: "{{ item }}"
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'
      loop:
        - /opt/nomad/data
        - /opt/nomad/alloc_mounts
        - /var/log/nomad

    - name: Clean nomad data
      shell: rm -rf /opt/nomad/data/*
      ignore_errors: yes

    - name: Create working nomad configuration
      copy:
        content: |
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"

          bind_addr = "{{ tailscale_ips[inventory_hostname] }}"

          server {
            enabled = true
            bootstrap_expect = 3
            encrypt = "{{ nomad_encrypt_key }}"

            server_join {
              retry_join = [
                "{{ tailscale_ips.semaphore }}",
                "{{ tailscale_ips.master }}",
                "{{ tailscale_ips.ash3c }}"
              ]
            }
          }

          client {
            enabled = true
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc = "{{ tailscale_ips[inventory_hostname] }}"
            serf = "{{ tailscale_ips[inventory_hostname] }}"
          }

          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }

          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    - name: Start nomad service
      systemd:
        name: nomad
        state: started
        enabled: yes

    - name: Wait for service to start
      pause:
        seconds: 10

    - name: Check service status
      shell: systemctl status nomad --no-pager -l
      register: service_status
      ignore_errors: yes

    - name: Show service status
      debug:
        var: service_status.stdout_lines

scripts/utilities/fix-ash3c-ip.sh (new executable file, 137 lines)
@@ -0,0 +1,137 @@
#!/bin/bash

# 🔧 ash3c IP address fix script

set -e

echo "🔧 ash3c IP address fix script"
echo ""

# The correct IP addresses
CORRECT_IP="100.116.80.94"
ASH3C_HOST="100.116.80.94"

echo "📡 Checking the network configuration on the ash3c node..."

# Inspect the IP bindings actually present on ash3c
echo "🔍 Checking the IP addresses bound on ash3c..."
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S ip addr show" | grep -E "inet.*100\." || echo "❌ No Tailscale IP found"

echo ""
echo "🔍 Checking Tailscale status..."
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S tailscale status" || echo "❌ Tailscale status check failed"

echo ""
echo "🔧 Fixing the Nomad configuration on ash3c..."

# Write the corrected configuration file
cat > /tmp/ash3c-nomad.hcl << EOF
# 🔧 ash3c corrected Nomad configuration
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"

# Force the correct Tailscale IP
bind_addr = "${CORRECT_IP}"

# Logging
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"

server {
  enabled = true
  bootstrap_expect = 3
  encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="

  server_join {
    retry_join = [
      "100.116.158.95:4647",
      "100.117.106.136:4647",
      "100.116.80.94:4647"
    ]
    retry_max = 10
    retry_interval = "15s"
  }

  # More forgiving heartbeat settings
  heartbeat_grace = "30s"
  min_heartbeat_ttl = "10s"
}

client {
  enabled = true
  network_interface = "tailscale0"
}

ui_config {
  enabled = true
}

addresses {
  http = "0.0.0.0"
  rpc = "${CORRECT_IP}"
  serf = "${CORRECT_IP}"
}

ports {
  http = 4646
  rpc = 4647
  serf = 4648
}

plugin "docker" {
  config {
    allow_privileged = true
    volumes {
      enabled = true
    }
  }
}
EOF

echo "📤 Uploading the corrected configuration to ash3c..."
scp -P 22 -i ~/.ssh/id_ed25519 /tmp/ash3c-nomad.hcl ben@${ASH3C_HOST}:/tmp/

echo "🔧 Applying the fix on ash3c..."
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} << 'REMOTE_SCRIPT'
echo '3131' | sudo -S systemctl stop nomad || true
echo '3131' | sudo -S pkill -f nomad || true
sleep 5

# Back up the old configuration
echo '3131' | sudo -S cp /etc/nomad.d/nomad.hcl /etc/nomad.d/nomad.hcl.backup.$(date +%Y%m%d_%H%M%S) || true

# Apply the new configuration
echo '3131' | sudo -S cp /tmp/ash3c-nomad.hcl /etc/nomad.d/nomad.hcl
echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl
echo '3131' | sudo -S chmod 640 /etc/nomad.d/nomad.hcl

# Clean the data directory
echo '3131' | sudo -S rm -rf /opt/nomad/data/*

# Restart the service
echo '3131' | sudo -S systemctl daemon-reload
echo '3131' | sudo -S systemctl enable nomad
echo '3131' | sudo -S systemctl start nomad

echo "✅ ash3c configuration fix applied"
REMOTE_SCRIPT

echo ""
echo "⏰ Waiting for the ash3c service to start..."
sleep 15

echo ""
echo "🔍 Checking the ash3c service status..."
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S systemctl status nomad --no-pager" || echo "❌ Service status check failed"

echo ""
echo "🧹 Cleaning up temporary files..."
rm -f /tmp/ash3c-nomad.hcl

echo ""
echo "✅ ash3c IP fix complete!"
echo ""
echo "Next steps:"
echo "1. Check cluster status: nomad server members"
echo "2. If problems remain, run the nuclear reset: ./scripts/utilities/nuclear-reset.sh"

scripts/utilities/fix-consul-cluster.sh (new executable file, 151 lines)
@@ -0,0 +1,151 @@
#!/bin/bash

# Consul cluster repair script
# Fixes the "No cluster leader" problem

set -e

echo "=== Consul cluster repair script ==="
echo "Current time: $(date)"
echo

# Check the current Consul service state
echo "1. Checking current Consul service state..."
docker service ls | grep consul || echo "No consul services found"
echo

# Show the current problem
echo "2. Checking the Consul logs for errors..."
echo "Master node log:"
docker service logs consul-cluster_consul-master --tail 5 2>/dev/null || echo "Could not fetch the master log"
echo
echo "Ash3c node log:"
docker service logs consul-cluster_consul-ash3c --tail 5 2>/dev/null || echo "Could not fetch the ash3c log"
echo

# Offer repair options
echo "3. Repair options:"
echo "   a) Use the fixed overlay network configuration (recommended)"
echo "   b) Use the macvlan network configuration"
echo "   c) Only restart the existing services"
echo

read -p "Choose a repair option (a/b/c): " choice

case $choice in
    a)
        echo "Using the fixed overlay network configuration..."

        # Stop the existing services
        echo "Stopping the existing Consul cluster..."
        docker stack rm consul-cluster 2>/dev/null || echo "consul-cluster stack does not exist"

        # Wait for the services to stop completely
        echo "Waiting for the services to stop completely..."
        sleep 10

        # Optionally clean the data volumes
        read -p "Remove the existing data volumes? (y/n): " clean_volumes
        if [[ $clean_volumes == "y" ]]; then
            docker volume rm consul-cluster_consul_master_data 2>/dev/null || true
            docker volume rm consul-cluster_consul_ash3c_data 2>/dev/null || true
            echo "Data volumes removed"
        fi

        # Deploy the fixed configuration
        echo "Deploying the fixed Consul cluster..."
        docker stack deploy -c /root/mgmt/swarm/stacks/consul-cluster-fixed.yml consul-cluster

        echo "Waiting for the services to start..."
        sleep 15

        # Check the service state
        echo "Checking the new service state..."
        docker service ls | grep consul
        ;;

    b)
        echo "Using the macvlan network configuration..."
        echo "Note: adjust the IP addresses and network interface for your environment"

        # Show the network interfaces
        echo "Current network interfaces:"
        ip link show | grep -E "^[0-9]+:" | awk '{print $2}' | sed 's/://'
        echo

        read -p "Network interface to use (e.g. eth0): " interface
        read -p "Subnet (e.g. 192.168.1.0/24): " subnet
        read -p "Gateway (e.g. 192.168.1.1): " gateway

        # Update the macvlan stack file
        sed -i "s/parent: eth0/parent: $interface/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml
        sed -i "s/192.168.1.0\/24/$subnet/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml
        sed -i "s/192.168.1.1/$gateway/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml

        # Stop the existing services
        echo "Stopping the existing Consul cluster..."
        docker stack rm consul-cluster 2>/dev/null || echo "consul-cluster stack does not exist"

        # Wait for the services to stop completely
        echo "Waiting for the services to stop completely..."
        sleep 10

        # Deploy the macvlan configuration
        echo "Deploying the macvlan Consul cluster..."
        docker stack deploy -c /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml consul-cluster

        echo "Waiting for the services to start..."
        sleep 15

        # Check the service state
        echo "Checking the new service state..."
        docker service ls | grep consul
        ;;

    c)
        echo "Restarting the existing services..."

        # Force a rolling restart
        docker service update --force consul-cluster_consul-master
        docker service update --force consul-cluster_consul-ash3c

        echo "Waiting for the services to restart..."
        sleep 10

        # Check the service state
        echo "Checking the service state..."
        docker service ls | grep consul
        ;;

    *)
        echo "Invalid choice, exiting"
        exit 1
        ;;
esac

echo
echo "4. Verifying the repair..."
sleep 5

# Service state
echo "Service state:"
docker service ls | grep consul

echo
echo "Waiting 30 seconds before checking the cluster state..."
sleep 30

# Try to inspect the cluster members
echo "Trying to check the cluster member state..."
timeout 10 docker service logs consul-cluster_consul-master --tail 10 2>/dev/null || echo "Could not fetch the logs"

echo
echo "=== Repair complete ==="
echo "Give the cluster a few minutes to fully start, then visit:"
echo "- Master UI: http://your-master-ip:8500"
echo "- Ash3c UI: http://your-ash3c-ip:8501"
echo
echo "If the problem persists, check:"
echo "1. Network connectivity between the nodes"
echo "2. Firewall settings"
echo "3. The Docker Swarm network configuration"

scripts/utilities/fix-master-binary.sh (new executable file, 26 lines)
@@ -0,0 +1,26 @@
#!/bin/bash

echo "🔧 Repairing the master node binary via the official HashiCorp packages..."

# Stop the nomad service
echo '3131' | sudo -S systemctl stop nomad || true
echo '3131' | sudo -S pkill -9 -f nomad || true

# Remove the old binaries
echo '3131' | sudo -S rm -f /usr/local/bin/nomad /usr/bin/nomad

# Use the official HashiCorp APT repository (architecture auto-detected)
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
echo '3131' | sudo -S apt-add-repository "deb [arch=$(dpkg --print-architecture)] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
echo '3131' | sudo -S apt-get update
echo '3131' | sudo -S apt-get install -y nomad=1.10.5-1

# Verify the install
nomad version

# Restart the service
echo '3131' | sudo -S systemctl daemon-reload
echo '3131' | sudo -S systemctl enable nomad
echo '3131' | sudo -S systemctl start nomad

echo "✅ Master node binary repair complete!"

scripts/utilities/fix-nomad-cluster.yml (new normal file, 92 lines)
@@ -0,0 +1,92 @@
---
- name: Fix Nomad Cluster Issues
  hosts: nomad_cluster
  become: yes
  vars:
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="

  tasks:
    - name: Stop nomad service
      systemd:
        name: nomad
        state: stopped
      ignore_errors: yes

    - name: Clean nomad data directory
      shell: rm -rf /opt/nomad/data/*
      ignore_errors: yes

    - name: Create correct nomad configuration
      copy:
        content: |
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"

          bind_addr = "{{ ansible_host | default(hostvars[inventory_hostname]['ansible_default_ipv4']['address']) }}"

          server {
            enabled = true
            bootstrap_expect = 3
            encrypt = "{{ nomad_encrypt_key }}"

            server_join {
              retry_join = ["100.116.158.95", "100.117.106.136", "100.116.80.94"]
            }
          }

          client {
            enabled = true
            network_interface = "{{ ansible_default_ipv4.interface | default('eth0') }}"
          }

          ui {
            enabled = true
          }

          # rpc/serf on 0.0.0.0 need explicit advertise addresses; the later
          # fixes in this commit bind them to the Tailscale IP instead
          addresses {
            http = "0.0.0.0"
            rpc = "0.0.0.0"
            serf = "0.0.0.0"
          }

          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    - name: Start nomad service
      systemd:
        name: nomad
        state: started
        enabled: yes

    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ ansible_host | default(hostvars[inventory_hostname]['ansible_default_ipv4']['address']) }}"
        delay: 10
        timeout: 60

    - name: Check nomad status
      shell: systemctl status nomad --no-pager -l
      register: nomad_status
      ignore_errors: yes

    - name: Display nomad status
      debug:
        var: nomad_status.stdout_lines

scripts/utilities/nomad-diagnosis.sh (new executable file, 124 lines)
@@ -0,0 +1,124 @@
#!/bin/bash

# 🔍 Quick Nomad cluster diagnosis script

echo "🔍 Quick Nomad cluster diagnosis"
echo "===================="
echo ""

# Node definitions
declare -A NODES=(
    ["semaphore"]="local"
    ["master"]="100.117.106.136:60022"
    ["ash3c"]="100.116.80.94:22"
)

declare -A TAILSCALE_IPS=(
    ["semaphore"]="100.116.158.95"
    ["master"]="100.117.106.136"
    ["ash3c"]="100.116.80.94"
)

echo "📊 1. Local Nomad service status"
echo "------------------------"
systemctl status nomad --no-pager | head -10 || echo "❌ Local Nomad service is unhealthy"
echo ""

echo "📊 2. Cluster member status"
echo "----------------"
nomad server members 2>/dev/null || echo "❌ Could not fetch cluster member status"
echo ""

echo "📊 3. Node status"
echo "------------"
nomad node status 2>/dev/null || echo "❌ Could not fetch node status"
echo ""

echo "🌐 4. Network connectivity tests"
echo "------------------"
for node in "${!NODES[@]}"; do
    ip="${TAILSCALE_IPS[$node]}"
    echo "Testing $node ($ip):"

    if [[ "$node" == "semaphore" ]]; then
        echo "  ✅ local node"
    else
        # Ping test
        if ping -c 1 -W 3 "$ip" >/dev/null 2>&1; then
            echo "  ✅ Ping: ok"
        else
            echo "  ❌ Ping: failed"
        fi

        # Port tests
        if timeout 5 bash -c "</dev/tcp/$ip/4647" 2>/dev/null; then
            echo "  ✅ RPC port (4647): open"
        else
            echo "  ❌ RPC port (4647): closed"
        fi

        if timeout 5 bash -c "</dev/tcp/$ip/4646" 2>/dev/null; then
            echo "  ✅ HTTP port (4646): open"
        else
            echo "  ❌ HTTP port (4646): closed"
        fi
    fi
    echo ""
done

echo "🔧 5. Remote node service status"
echo "-------------------"
for node in "${!NODES[@]}"; do
    if [[ "$node" == "semaphore" ]]; then
        continue
    fi

    connection="${NODES[$node]}"
    ip=$(echo "$connection" | cut -d: -f1)
    port=$(echo "$connection" | cut -d: -f2)

    echo "Checking $node ($ip:$port):"

    if ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S systemctl is-active nomad" 2>/dev/null; then
        status=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S systemctl is-active nomad" 2>/dev/null)
        echo "  Service state: $status"

        # Check bind_addr in the config file
        bind_addr=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S grep 'bind_addr' /etc/nomad.d/nomad.hcl 2>/dev/null" | head -1)
        echo "  Configured bind address: $bind_addr"

        # Check the ports actually being listened on
        listening=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S netstat -tlnp | grep :464" 2>/dev/null | head -3)
        if [[ -n "$listening" ]]; then
            echo "  Listening ports:"
            echo "$listening" | sed 's/^/    /'
        else
            echo "  ❌ No Nomad listening ports found"
        fi
    else
        echo "  ❌ Could not connect, or the service is not running"
    fi
    echo ""
done

echo "📋 6. Summary and suggestions"
echo "=================="

# Check whether the cluster has a leader (the Leader column of
# "nomad server members" prints "true" for the elected leader)
if nomad server members 2>/dev/null | grep -q "true"; then
    echo "✅ The cluster has a leader"
else
    echo "❌ The cluster has NO leader - this is the core problem!"
    echo ""
    echo "🔧 Suggested fix steps:"
    echo "1. Try the ash3c IP fix first: ./scripts/utilities/fix-ash3c-ip.sh"
    echo "2. If that fails, run the nuclear reset: ./scripts/utilities/nuclear-reset.sh"
    echo "3. Check whether the master node needs a restart"
fi

echo ""
echo "🔗 Useful links:"
echo "  Web UI: http://100.116.158.95:4646"
echo "  Logs:   journalctl -u nomad -f"
echo ""
echo "🔍 Diagnosis complete!"

scripts/utilities/nuclear-reset.sh (new executable file, 76 lines)
@@ -0,0 +1,76 @@
#!/bin/bash

# ☢️ Nuclear-grade Nomad reset launcher ☢️

set -e

echo "☢️☢️☢️ Nuclear-grade Nomad cluster reset ☢️☢️☢️"
echo ""
echo "This script will:"
echo "1. Completely destroy all Nomad processes and data"
echo "2. Re-download and install the Nomad binary"
echo "3. Create brand-new configuration files"
echo "4. Restart the whole cluster"
echo ""
echo "⚠️ Warning: this operation is irreversible! ⚠️"
echo ""

# Make sure we are in the right directory
if [[ ! -f "scripts/utilities/NUCLEAR-NOMAD-RESET.yml" ]]; then
    echo "❌ Error: run this script from the /root/mgmt directory"
    exit 1
fi

# Confirm the operation
read -p "Are you sure you want a nuclear reset? Type 'NUCLEAR' to confirm: " confirm
if [[ "$confirm" != "NUCLEAR" ]]; then
    echo "❌ Operation cancelled"
    exit 1
fi

echo ""
echo "🚀 Starting the nuclear reset..."
echo ""

# Ansible settings
export ANSIBLE_HOST_KEY_CHECKING=False
export ANSIBLE_STDOUT_CALLBACK=yaml

# Run the nuclear reset
echo "📡 Running the Ansible playbook..."
cd /root/mgmt/configuration

ansible-playbook \
    -i inventories/production/nomad-cluster.ini \
    ../scripts/utilities/NUCLEAR-NOMAD-RESET.yml \
    --extra-vars "ansible_ssh_common_args='-o StrictHostKeyChecking=no'" \
    -v

echo ""
echo "⏰ Waiting for the cluster to settle..."
sleep 30

echo ""
echo "🔍 Checking cluster status..."

# Cluster members
echo "📊 Cluster member status:"
nomad server members || echo "❌ Could not fetch cluster member status"

echo ""
echo "📊 Node status:"
nomad node status || echo "❌ Could not fetch node status"

echo ""
echo "🎯 If the output above shows errors, wait a few minutes and check again"
echo "The cluster may need some time to fully start and sync"

echo ""
echo "🔧 Useful commands:"
echo "  Check cluster members: nomad server members"
echo "  Check node status:     nomad node status"
echo "  Follow logs:           journalctl -u nomad -f"
echo "  Web UI:                http://100.116.158.95:4646"

echo ""
echo "☢️ Nuclear reset complete! ☢️"

scripts/utilities/simple-nomad-fix.sh (new executable file, 104 lines)
@@ -0,0 +1,104 @@
#!/bin/bash

echo "=== Simple Nomad cluster fix script ==="

# Tailscale IP addresses
SEMAPHORE_IP="100.116.158.95"
MASTER_IP="100.117.106.136"
ASH3C_IP="100.116.80.94"
ENCRYPT_KEY="NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="

# Write a per-node configuration file
create_config() {
    local node_name=$1
    local bind_ip=$2

    cat > /tmp/nomad-${node_name}.hcl << EOF
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"

bind_addr = "${bind_ip}"

server {
  enabled = true
  bootstrap_expect = 3
  encrypt = "${ENCRYPT_KEY}"

  server_join {
    retry_join = ["${SEMAPHORE_IP}", "${MASTER_IP}", "${ASH3C_IP}"]
  }
}

client {
  enabled = true
}

ui_config {
  enabled = true
}

addresses {
  http = "0.0.0.0"
  rpc = "${bind_ip}"
  serf = "${bind_ip}"
}

ports {
  http = 4646
  rpc = 4647
  serf = 4648
}

plugin "docker" {
  config {
    allow_privileged = true
    volumes {
      enabled = true
    }
  }
}

log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
EOF
}

echo "1. Stopping all Nomad services..."
systemctl stop nomad
ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S systemctl stop nomad"
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S systemctl stop nomad"

echo "2. Cleaning data directories..."
rm -rf /opt/nomad/data/*
ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S rm -rf /opt/nomad/data/*"
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S rm -rf /opt/nomad/data/*"

echo "3. Creating new configuration files..."
create_config "semaphore" "${SEMAPHORE_IP}"
create_config "master" "${MASTER_IP}"
create_config "ash3c" "${ASH3C_IP}"

echo "4. Deploying configuration files..."
cp /tmp/nomad-semaphore.hcl /etc/nomad.d/nomad.hcl
chown nomad:nomad /etc/nomad.d/nomad.hcl

scp -P 60022 -i ~/.ssh/id_ed25519 /tmp/nomad-master.hcl ben@${MASTER_IP}:/tmp/
ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S cp /tmp/nomad-master.hcl /etc/nomad.d/nomad.hcl && echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl"

scp -P 22 -i ~/.ssh/id_ed25519 /tmp/nomad-ash3c.hcl ben@${ASH3C_IP}:/tmp/
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S cp /tmp/nomad-ash3c.hcl /etc/nomad.d/nomad.hcl && echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl"

echo "5. Starting services..."
systemctl start nomad
ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S systemctl start nomad"
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S systemctl start nomad"

echo "6. Waiting for the cluster to form..."
sleep 30

echo "7. Checking cluster status..."
nomad server members
nomad node status

echo "=== Fix complete ==="
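
Instead of the fixed "sleep 30" above, cluster formation can be polled; a minimal sketch (not part of the commit, assuming the HTTP API is reachable on the semaphore node):

#!/bin/bash
# Poll the leader endpoint until an election succeeds, up to about 2 minutes.
for attempt in $(seq 1 24); do
    leader=$(curl -s "http://100.116.158.95:4646/v1/status/leader" | tr -d '"')
    if [ -n "$leader" ]; then
        echo "Leader elected: $leader"
        exit 0
    fi
    echo "No leader yet (attempt $attempt); retrying in 5s..."
    sleep 5
done
echo "Cluster did not form within the timeout" >&2
exit 1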

scripts/utilities/ultimate-nomad-fix.yml (new normal file, 113 lines)
@@ -0,0 +1,113 @@
---
- name: Ultimate Nomad Cluster Fix - Complete Reset
  hosts: nomad_cluster
  become: yes
  gather_facts: yes
  vars:
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="

  tasks:
    - name: Stop and disable nomad service completely
      systemd:
        name: nomad
        state: stopped
        enabled: no
        daemon_reload: yes
      ignore_errors: yes

    - name: Kill any remaining nomad processes
      shell: pkill -f nomad || true
      ignore_errors: yes

    - name: Remove all nomad data and state
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /opt/nomad/data
        - /etc/nomad.d/nomad.hcl
        - /var/log/nomad

    - name: Create clean nomad directories
      file:
        path: "{{ item }}"
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'
      loop:
        - /etc/nomad.d
        - /opt/nomad
        - /opt/nomad/data
        - /opt/nomad/alloc_mounts
        - /var/log/nomad

    - name: Create minimal nomad configuration
      copy:
        content: |
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"

          bind_addr = "{{ ansible_default_ipv4.address }}"

          server {
            enabled = true
            # each node bootstraps alone; this yields independent single-node
            # clusters rather than one three-node cluster
            bootstrap_expect = 1
            encrypt = "{{ nomad_encrypt_key }}"
          }

          client {
            enabled = true
            alloc_dir = "/opt/nomad/alloc_mounts"
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc = "{{ ansible_default_ipv4.address }}"
            serf = "{{ ansible_default_ipv4.address }}"
          }

          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }

          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    - name: Enable and start nomad service
      systemd:
        name: nomad
        state: started
        enabled: yes
        daemon_reload: yes

    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ ansible_default_ipv4.address }}"
        delay: 10
        timeout: 60

    - name: Check nomad status
      uri:
        url: "http://{{ ansible_default_ipv4.address }}:4646/v1/status/leader"
        method: GET
      register: nomad_leader
      retries: 5
      delay: 5
      ignore_errors: yes

    - name: Display nomad status
      debug:
        msg: "Nomad leader status: {{ nomad_leader.json if nomad_leader.json is defined else 'No leader elected yet' }}"