清理中间过程脚本和程序文件,保留与Nomad v1.10.5一致的核心配置文件
This commit is contained in:
@@ -1,26 +0,0 @@
|
||||
#!/bin/bash
#
# Repair the Nomad binary on the master node: stop the service, remove the
# existing binaries, reinstall the pinned Nomad package from the HashiCorp apt
# repository, then re-enable and start the service.
#
# Environment:
#   SUDO_PASSWORD  Password handed to `sudo -S`. Defaults to the value this
#                  script has always used.
#
# NOTE(review): the default below is a credential committed to the repository.
# It is kept only so unattended runs behave as before; prefer exporting
# SUDO_PASSWORD (or a NOPASSWD sudoers rule) and dropping the default.

# Abort on the first failed step so a broken install is never reported as
# a success.
set -euo pipefail

readonly NOMAD_PKG_VERSION="1.10.5-1"
readonly HASHICORP_GPG_URL="https://apt.releases.hashicorp.com/gpg"
readonly SUDO_PASSWORD="${SUDO_PASSWORD:-3131}"

#######################################
# Run a command as root, feeding the sudo password on stdin.
# Globals:   SUDO_PASSWORD (read)
# Arguments: the command and its arguments
# Returns:   exit status of sudo / the command
#######################################
as_root() {
  # A here-string keeps the password out of argv and cannot raise SIGPIPE
  # when sudo already holds cached credentials and never reads stdin.
  sudo -S -p '' "$@" <<<"${SUDO_PASSWORD}"
}

echo "🔧 使用 HashiCorp 官方脚本修复 master 节点二进制文件..."

# Stop the nomad service. Best-effort: the unit may be missing or already
# stopped.
as_root systemctl stop nomad || true
# Match the exact process name. `pkill -f nomad` would also match the sudo
# wrapper itself, because "nomad" appears on its own command line.
as_root pkill -KILL -x nomad || true

# Remove the old binaries.
as_root rm -f /usr/local/bin/nomad /usr/bin/nomad

# Register the HashiCorp apt repository for this machine's architecture.
# The key is downloaded to a file first so sudo's stdin stays free for the
# password instead of being occupied by the key stream.
# NOTE(review): apt-key is deprecated on recent Debian/Ubuntu releases; a
# signed-by keyring is the documented replacement, but switching requires
# removing any repository entry previously added by this script.
key_file=$(mktemp)
trap 'rm -f -- "${key_file}"' EXIT
curl -fsSL "${HASHICORP_GPG_URL}" -o "${key_file}"
as_root apt-key add "${key_file}"

arch=$(dpkg --print-architecture)
codename=$(lsb_release -cs)
as_root apt-add-repository -y \
  "deb [arch=${arch}] https://apt.releases.hashicorp.com ${codename} main"
as_root apt-get update
as_root apt-get install -y "nomad=${NOMAD_PKG_VERSION}"

# Verify the installation (aborts the script if the binary is unusable).
nomad version

# Restart the service.
as_root systemctl daemon-reload
as_root systemctl enable nomad
as_root systemctl start nomad

echo "✅ Master 节点二进制文件修复完成!"
|
||||
@@ -1,124 +0,0 @@
|
||||
#!/bin/bash
#
# Quick diagnostic for the Nomad cluster.
#
# Reports, in order: local service status, cluster members, node status,
# network reachability of each node (ping plus the 4647/4646 TCP ports),
# the nomad service state / bind_addr / listening ports on each remote node
# over SSH, and finally whether the cluster currently has a leader.
#
# Environment:
#   SUDO_PASSWORD  Password handed to `sudo -S` on the remote nodes. Defaults
#                  to the value this script has always used.
#
# NOTE(review): the default password is a credential committed to the
# repository, and StrictHostKeyChecking=no disables host key verification.
# Both are kept for compatibility and should be revisited.
#
# `set -e` is deliberately not used: every probe is best-effort and the
# report should always run to the end.

# Node definitions: name -> "ip:ssh_port" ("local" for this machine).
declare -A NODES=(
  ["semaphore"]="local"
  ["master"]="100.117.106.136:60022"
  ["ash3c"]="100.116.80.94:22"
)

declare -A TAILSCALE_IPS=(
  ["semaphore"]="100.116.158.95"
  ["master"]="100.117.106.136"
  ["ash3c"]="100.116.80.94"
)

readonly SSH_USER="ben"
readonly SSH_KEY="${HOME}/.ssh/id_ed25519"
readonly SUDO_PASSWORD="${SUDO_PASSWORD:-3131}"

#######################################
# Run a command under sudo on a remote node.
# The password travels over ssh's stdin rather than inside the remote
# command line, so it does not show up in the remote process list.
# Globals:   SSH_USER, SSH_KEY, SUDO_PASSWORD (read)
# Arguments: $1 - host, $2 - ssh port, $3 - remote command (run after sudo)
# Outputs:   remote stdout; local stderr is discarded
# Returns:   ssh exit status (255 on connection failure, else remote status)
#######################################
remote_sudo() {
  local host=$1 port=$2 remote_cmd=$3
  ssh -p "${port}" -i "${SSH_KEY}" \
    -o ConnectTimeout=10 -o StrictHostKeyChecking=no \
    "${SSH_USER}@${host}" \
    "sudo -S -p '' ${remote_cmd}" <<<"${SUDO_PASSWORD}" 2>/dev/null
}

#######################################
# Report whether a TCP port accepts connections within 5 seconds.
# Arguments: $1 - label for the report line, $2 - host, $3 - port
# Outputs:   one status line on stdout
#######################################
report_port() {
  local label=$1 host=$2 port=$3
  if timeout 5 bash -c "</dev/tcp/${host}/${port}" 2>/dev/null; then
    echo " ✅ ${label}(${port}): 开放"
  else
    echo " ❌ ${label}(${port}): 关闭"
  fi
}

#######################################
# Succeed when `nomad server members` lists a server whose Leader column is
# "true". The column is located from the header row, so it does not depend
# on a fixed column position.
# Returns:   0 if a leader is listed, non-zero otherwise
#######################################
cluster_has_leader() {
  nomad server members 2>/dev/null | awk '
    NR == 1 { for (i = 1; i <= NF; i++) if ($i == "Leader") col = i; next }
    col && $col == "true" { found = 1 }
    END { exit !found }
  '
}

echo "🔍 Nomad 集群快速诊断"
echo "===================="
echo ""

echo "📊 1. 本地 Nomad 服务状态"
echo "------------------------"
# Capture the status first: piping straight into `head` would hide
# systemctl's exit code and the warning below could never fire.
local_status=$(systemctl status nomad --no-pager)
local_rc=$?
if [[ -n "${local_status}" ]]; then
  head -n 10 <<<"${local_status}"
fi
if (( local_rc != 0 )); then
  echo "❌ 本地 Nomad 服务异常"
fi
echo ""

echo "📊 2. 集群成员状态"
echo "----------------"
nomad server members 2>/dev/null || echo "❌ 无法获取集群成员状态"
echo ""

echo "📊 3. 节点状态"
echo "------------"
nomad node status 2>/dev/null || echo "❌ 无法获取节点状态"
echo ""

echo "🌐 4. 网络连通性测试"
echo "------------------"
for node in "${!NODES[@]}"; do
  ip="${TAILSCALE_IPS[$node]}"
  echo "测试 $node ($ip):"

  if [[ "$node" == "semaphore" ]]; then
    echo " ✅ 本地节点"
  else
    # Ping test.
    if ping -c 1 -W 3 "$ip" >/dev/null 2>&1; then
      echo " ✅ Ping: 成功"
    else
      echo " ❌ Ping: 失败"
    fi

    # Port tests.
    report_port "RPC端口" "$ip" 4647
    report_port "HTTP端口" "$ip" 4646
  fi
  echo ""
done

echo "🔧 5. 远程节点服务状态"
echo "-------------------"
for node in "${!NODES[@]}"; do
  if [[ "$node" == "semaphore" ]]; then
    continue
  fi

  connection="${NODES[$node]}"
  ip=${connection%%:*}
  port=${connection##*:}

  echo "检查 $node ($ip:$port):"

  # A single SSH round-trip both decides the branch and captures the state.
  if status=$(remote_sudo "$ip" "$port" "systemctl is-active nomad"); then
    echo " 服务状态: $status"

    # bind_addr as written in the configuration file.
    bind_addr=$(remote_sudo "$ip" "$port" \
      "grep 'bind_addr' /etc/nomad.d/nomad.hcl 2>/dev/null" | head -1)
    echo " 配置绑定地址: $bind_addr"

    # Ports actually being listened on.
    listening=$(remote_sudo "$ip" "$port" \
      "netstat -tlnp | grep :464" | head -3)
    if [[ -n "$listening" ]]; then
      echo " 监听端口:"
      echo "$listening" | sed 's/^/ /'
    else
      echo " ❌ 未发现 Nomad 监听端口"
    fi
  else
    echo " ❌ 无法连接或服务未运行"
  fi
  echo ""
done

echo "📋 6. 问题总结和建议"
echo "=================="

# Check whether the cluster has a leader.
if cluster_has_leader; then
  echo "✅ 集群有 leader"
else
  echo "❌ 集群没有 leader - 这是主要问题!"
  echo ""
  echo "🔧 建议的修复步骤:"
  echo "1. 先尝试 ash3c IP 修复: ./scripts/utilities/fix-ash3c-ip.sh"
  echo "2. 如果还不行,使用核弹级重置: ./scripts/utilities/nuclear-reset.sh"
  echo "3. 检查 master 节点是否需要重启"
fi

echo ""
echo "🔗 有用的链接:"
echo " Web UI: http://100.116.158.95:4646"
echo " 日志查看: journalctl -u nomad -f"
echo ""
echo "🔍 诊断完成!"
|
||||
@@ -1,76 +0,0 @@
|
||||
#!/bin/bash
#
# "Nuclear" Nomad reset driver.
#
# Must be started from the repository root (the directory that contains
# scripts/utilities/NUCLEAR-NOMAD-RESET.yml). After an explicit typed
# confirmation it runs that playbook against the production nomad-cluster
# inventory, waits 30 seconds, and prints the cluster member and node status.

set -euo pipefail

readonly PLAYBOOK_REL="scripts/utilities/NUCLEAR-NOMAD-RESET.yml"

echo "☢️☢️☢️ 核弹级 Nomad 集群重置 ☢️☢️☢️"
echo ""
echo "这个脚本将:"
echo "1. 完全摧毁所有 Nomad 进程和数据"
echo "2. 重新下载并安装 Nomad 二进制文件"
echo "3. 创建全新的配置文件"
echo "4. 重新启动整个集群"
echo ""
echo "⚠️ 警告:这是不可逆的操作!⚠️"
echo ""

# Make sure we are in the right directory.
if [[ ! -f "${PLAYBOOK_REL}" ]]; then
  echo "❌ 错误:请在 /root/mgmt 目录下运行此脚本"
  exit 1
fi
# Remember the directory that was just verified, so the playbook run below
# uses this checkout rather than a separately hard-coded path.
repo_root=${PWD}

# Ask for confirmation. -r keeps backslashes in the reply literal.
read -r -p "你确定要进行核弹级重置吗?输入 'NUCLEAR' 确认: " confirm
if [[ "${confirm}" != "NUCLEAR" ]]; then
  echo "❌ 操作已取消"
  exit 1
fi

echo ""
echo "🚀 开始核弹级重置..."
echo ""

# Ansible settings.
export ANSIBLE_HOST_KEY_CHECKING=False
export ANSIBLE_STDOUT_CALLBACK=yaml

# Run the reset playbook. Under `set -e` a failed run aborts the script here.
echo "📡 执行 Ansible playbook..."
cd "${repo_root}/configuration"

ansible-playbook \
  -i inventories/production/nomad-cluster.ini \
  "../${PLAYBOOK_REL}" \
  --extra-vars "ansible_ssh_common_args='-o StrictHostKeyChecking=no'" \
  -v

echo ""
echo "⏰ 等待集群稳定..."
sleep 30

echo ""
echo "🔍 检查集群状态..."

# Cluster members. Best-effort: the cluster may still be converging.
echo "📊 集群成员状态:"
nomad server members || echo "❌ 无法获取集群成员状态"

echo ""
echo "📊 节点状态:"
nomad node status || echo "❌ 无法获取节点状态"

echo ""
echo "🎯 如果上面显示错误,请等待几分钟后再次检查"
echo "集群可能需要一些时间来完全启动和同步"

echo ""
echo "🔧 有用的命令:"
echo " 检查集群成员: nomad server members"
echo " 检查节点状态: nomad node status"
echo " 查看日志: journalctl -u nomad -f"
echo " Web UI: http://100.116.158.95:4646"

echo ""
echo "☢️ 核弹级重置完成!☢️"
|
||||
@@ -1,113 +0,0 @@
|
||||
---
# Complete reset of Nomad on every host in the `nomad_cluster` group:
# stop the service, wipe data/config/logs, recreate the directories, write a
# minimal configuration, start the service and query the leader endpoint.
- name: Ultimate Nomad Cluster Fix - Complete Reset
  hosts: nomad_cluster
  become: yes
  gather_facts: yes
  vars:
    # NOTE(review): this gossip encryption key is committed in plain text;
    # consider moving it to Ansible Vault.
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="

  tasks:
    - name: Stop and disable nomad service completely
      systemd:
        name: nomad
        state: stopped
        enabled: no
        daemon_reload: yes
      ignore_errors: yes

    # Match the exact process name. With `pkill -f nomad` the pattern also
    # matches the `/bin/sh -c` wrapper running this very command (its command
    # line contains "nomad"), so the task would kill itself.
    - name: Kill any remaining nomad processes
      shell: pkill -x nomad || true
      ignore_errors: yes

    - name: Remove all nomad data and state
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /opt/nomad/data
        - /etc/nomad.d/nomad.hcl
        - /var/log/nomad

    - name: Create clean nomad directories
      file:
        path: "{{ item }}"
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'
      loop:
        - /etc/nomad.d
        - /opt/nomad
        - /opt/nomad/data
        - /opt/nomad/alloc_mounts
        - /var/log/nomad

    # NOTE(review): every host gets `server.enabled = true` with
    # `bootstrap_expect = 1` and no join configuration, so each host will
    # presumably bootstrap as its own single-server cluster. Confirm against
    # the inventory whether a shared bootstrap_expect plus server_join is
    # intended.
    - name: Create minimal nomad configuration
      copy:
        content: |
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"

          bind_addr = "{{ ansible_default_ipv4.address }}"

          server {
            enabled = true
            bootstrap_expect = 1
            encrypt = "{{ nomad_encrypt_key }}"
          }

          client {
            enabled = true
            alloc_dir = "/opt/nomad/alloc_mounts"
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc = "{{ ansible_default_ipv4.address }}"
            serf = "{{ ansible_default_ipv4.address }}"
          }

          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }

          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    - name: Enable and start nomad service
      systemd:
        name: nomad
        state: started
        enabled: yes
        daemon_reload: yes

    # Give the agent up to 60 seconds (after an initial 10 second delay) to
    # open its HTTP port.
    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ ansible_default_ipv4.address }}"
        delay: 10
        timeout: 60

    # Best-effort: a missing leader is reported below rather than failing
    # the play.
    - name: Check nomad status
      uri:
        url: "http://{{ ansible_default_ipv4.address }}:4646/v1/status/leader"
        method: GET
      register: nomad_leader
      retries: 5
      delay: 5
      ignore_errors: yes

    - name: Display nomad status
      debug:
        msg: "Nomad leader status: {{ nomad_leader.json if nomad_leader.json is defined else 'No leader elected yet' }}"
|
||||
@@ -1,31 +0,0 @@
|
||||
#!/bin/bash
#
# Verify the Nomad -> Podman migration on the remote node by running a
# series of read-only checks over SSH: service status, the podman section of
# the Nomad configuration, the Podman socket, the node's driver status and a
# Podman invocation as the nomad user.

target="ben@100.84.197.26"

# Run one command string on the target host over SSH.
on_target() {
  ssh "${target}" "$1"
}

echo "=== Nomad Podman Migration Verification ==="
echo

# Nomad service status.
echo "1. Checking Nomad service status..."
on_target "sudo systemctl status nomad --no-pager -l"
echo

# Podman-related part of the Nomad configuration.
echo "2. Checking Nomad configuration..."
on_target "sudo cat /etc/nomad.d/nomad.hcl | grep -A 10 -B 2 podman"
echo

# Podman socket.
echo "3. Checking Podman socket..."
on_target "ls -la /run/user/*/podman/podman.sock 2>/dev/null || echo 'Podman socket not found'"
echo

# Driver status as reported by the node itself.
echo "4. Checking Nomad node status..."
if ! on_target "sudo -u nomad /usr/local/bin/nomad node status -self | grep -A 10 'Driver Status'" 2>/dev/null; then
  echo "Could not get node status"
fi
echo

# Podman invoked as the nomad user.
echo "5. Testing Podman as nomad user..."
if ! on_target "sudo -u nomad podman version --format '{{.Version}}'" 2>/dev/null; then
  echo "Podman test failed"
fi
echo

echo "=== Verification Complete ==="
|
||||
Reference in New Issue
Block a user