清理中间过程脚本和程序文件,保留与Nomad v1.10.5一致的核心配置文件

This commit is contained in:
2025-09-28 05:49:32 +00:00
parent 44b098bd20
commit bc529a25fa
70 changed files with 20 additions and 4382 deletions

View File

@@ -1,26 +0,0 @@
#!/bin/bash
# Repair the Nomad binary on the master node by reinstalling the pinned
# package (nomad=1.10.5-1) from the HashiCorp apt repository.
#
# Steps: stop the service, remove stale binaries, register the HashiCorp apt
# repository, install the package, verify it, then re-enable and start the
# systemd unit.
#
# Environment:
#   SUDO_PASSWORD  password handed to `sudo -S`; defaults to the value that
#                  was previously repeated inline throughout this script.
#
# NOTE(review): a sudo password stored in a script is a credential leak.
# Prefer a NOPASSWD sudoers rule, or export SUDO_PASSWORD at call time and
# drop the default below.
set -euo pipefail

SUDO_PASSWORD="${SUDO_PASSWORD:-3131}"

# Run a command as root, supplying the password on stdin (sudo -S).
# A here-string is used instead of a pipe so `pipefail` cannot trip on it.
as_root() {
  sudo -S "$@" <<<"$SUDO_PASSWORD"
}

echo "🔧 使用 HashiCorp 官方脚本修复 master 节点二进制文件..."

# Stop the nomad service; it may legitimately be absent or already stopped.
as_root systemctl stop nomad || true
# -x matches the process *name* exactly. The previous `-f nomad` matched any
# command line containing "nomad" -- including the wrapping
# `sudo ... pkill -9 -f nomad` itself and unrelated tools such as
# `journalctl -u nomad`.
as_root pkill -9 -x nomad || true

# Remove the old binaries.
as_root rm -f /usr/local/bin/nomad /usr/bin/nomad

# Register the HashiCorp apt repository (architecture is auto-detected).
# This line relies on the sudo credentials cached by the calls above because
# stdin carries the key material, not the password.
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
# -y: never block on the interactive "press ENTER" confirmation.
as_root apt-add-repository -y "deb [arch=$(dpkg --print-architecture)] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
as_root apt-get update
as_root apt-get install -y nomad=1.10.5-1

# Verify the installation; under `set -e` a missing binary aborts here
# instead of falling through to the success banner.
nomad version

# Restart the service.
as_root systemctl daemon-reload
as_root systemctl enable nomad
as_root systemctl start nomad
echo "✅ Master 节点二进制文件修复完成!"

View File

@@ -1,124 +0,0 @@
#!/bin/bash
# 🔍 Nomad cluster quick-diagnosis script.
#
# Prints, in order:
#   1. local nomad service status
#   2. `nomad server members`
#   3. `nomad node status`
#   4. ping and TCP reachability (4647 / 4646) of every non-local node
#   5. remote service state, configured bind_addr and listening ports (SSH)
#   6. a summary stating whether the cluster currently has a leader
#
# NOTE(review): the remote sudo password is embedded in the SSH command
# strings below, as it was before; move it to a NOPASSWD sudoers rule.

# SSH endpoint ("ip:port") per node; "local" marks the node this script runs on.
declare -A NODES=(
  ["semaphore"]="local"
  ["master"]="100.117.106.136:60022"
  ["ash3c"]="100.116.80.94:22"
)

# Address probed by the ping / TCP checks, keyed by node name.
declare -A TAILSCALE_IPS=(
  ["semaphore"]="100.116.158.95"
  ["master"]="100.117.106.136"
  ["ash3c"]="100.116.80.94"
)

#######################################
# Run a command string on a remote node over SSH.
# Arguments: $1 - ip, $2 - ssh port, $3 - command string run remotely
# Outputs:   remote stdout/stderr
# Returns:   ssh exit status (the remote command's status on success)
#######################################
remote_exec() {
  ssh -p "$2" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 \
    -o StrictHostKeyChecking=no ben@"$1" "$3"
}

#######################################
# Test whether a TCP port accepts a connection within 5 seconds.
# Arguments: $1 - host, $2 - port
# Returns:   0 if the connection opened, non-zero otherwise
#######################################
port_open() {
  timeout 5 bash -c "</dev/tcp/$1/$2" 2>/dev/null
}

#######################################
# Read `nomad server members` output on stdin and report whether any row
# has "true" in the "Leader" column.
# Returns:   0 if a leader row exists, 1 otherwise (including empty input)
#######################################
has_leader() {
  awk '
    NR == 1 { for (i = 1; i <= NF; i++) if ($i == "Leader") col = i; next }
    col && $col == "true" { found = 1 }
    END { exit !found }
  '
}

echo "🔍 Nomad 集群快速诊断"
echo "===================="
echo ""

echo "📊 1. 本地 Nomad 服务状态"
echo "------------------------"
systemctl status nomad --no-pager | head -10
# The pipeline above returns head's status, so the health check is done
# separately; otherwise the failure message could never be printed.
systemctl is-active --quiet nomad || echo "❌ 本地 Nomad 服务异常"
echo ""

echo "📊 2. 集群成员状态"
echo "----------------"
nomad server members 2>/dev/null || echo "❌ 无法获取集群成员状态"
echo ""

echo "📊 3. 节点状态"
echo "------------"
nomad node status 2>/dev/null || echo "❌ 无法获取节点状态"
echo ""

echo "🌐 4. 网络连通性测试"
echo "------------------"
for node in "${!NODES[@]}"; do
  ip="${TAILSCALE_IPS[$node]}"
  echo "测试 $node ($ip):"
  if [[ "$node" == "semaphore" ]]; then
    echo " ✅ 本地节点"
  else
    # Ping test
    if ping -c 1 -W 3 "$ip" >/dev/null 2>&1; then
      echo " ✅ Ping: 成功"
    else
      echo " ❌ Ping: 失败"
    fi
    # Port tests
    if port_open "$ip" 4647; then
      echo " ✅ RPC端口(4647): 开放"
    else
      echo " ❌ RPC端口(4647): 关闭"
    fi
    if port_open "$ip" 4646; then
      echo " ✅ HTTP端口(4646): 开放"
    else
      echo " ❌ HTTP端口(4646): 关闭"
    fi
  fi
  echo ""
done

echo "🔧 5. 远程节点服务状态"
echo "-------------------"
for node in "${!NODES[@]}"; do
  if [[ "$node" == "semaphore" ]]; then
    continue
  fi
  connection="${NODES[$node]}"
  ip="${connection%%:*}"
  port="${connection##*:}"
  echo "检查 $node ($ip:$port):"
  # Query the unit state once and reuse it; a second identical SSH round
  # trip previously also leaked the raw output line before the label.
  if status=$(remote_exec "$ip" "$port" "echo '3131' | sudo -S systemctl is-active nomad" 2>/dev/null); then
    echo " 服务状态: $status"
    # bind_addr as written in the remote configuration file
    bind_addr=$(remote_exec "$ip" "$port" "echo '3131' | sudo -S grep 'bind_addr' /etc/nomad.d/nomad.hcl 2>/dev/null" | head -1)
    echo " 配置绑定地址: $bind_addr"
    # Ports actually being listened on (464x)
    listening=$(remote_exec "$ip" "$port" "echo '3131' | sudo -S netstat -tlnp | grep :464" 2>/dev/null | head -3)
    if [[ -n "$listening" ]]; then
      echo " 监听端口:"
      echo "$listening" | sed 's/^/ /'
    else
      echo " ❌ 未发现 Nomad 监听端口"
    fi
  else
    echo " ❌ 无法连接或服务未运行"
  fi
  echo ""
done

echo "📋 6. 问题总结和建议"
echo "=================="
# Leader check: inspect the "Leader" column. A case-sensitive grep for
# "leader" never matches the capitalised header or the true/false values,
# so it always reported a leaderless cluster.
if nomad server members 2>/dev/null | has_leader; then
  echo "✅ 集群有 leader"
else
  echo "❌ 集群没有 leader - 这是主要问题!"
  echo ""
  echo "🔧 建议的修复步骤:"
  echo "1. 先尝试 ash3c IP 修复: ./scripts/utilities/fix-ash3c-ip.sh"
  echo "2. 如果还不行,使用核弹级重置: ./scripts/utilities/nuclear-reset.sh"
  echo "3. 检查 master 节点是否需要重启"
fi
echo ""
echo "🔗 有用的链接:"
echo " Web UI: http://100.116.158.95:4646"
echo " 日志查看: journalctl -u nomad -f"
echo ""
echo "🔍 诊断完成!"

View File

@@ -1,76 +0,0 @@
#!/bin/bash
# ☢️ Nuclear-grade Nomad reset runner ☢️
#
# Runs the NUCLEAR-NOMAD-RESET.yml Ansible playbook against the
# production nomad-cluster inventory, waits 30 seconds, then prints the
# server-member and node status.
#
# Must be started from the repository root (the directory that contains
# scripts/utilities/NUCLEAR-NOMAD-RESET.yml). Nothing is executed until the
# operator types the literal word NUCLEAR.
set -e

readonly PLAYBOOK_REL="scripts/utilities/NUCLEAR-NOMAD-RESET.yml"

echo "☢️☢️☢️ 核弹级 Nomad 集群重置 ☢️☢️☢️"
echo ""
echo "这个脚本将:"
echo "1. 完全摧毁所有 Nomad 进程和数据"
echo "2. 重新下载并安装 Nomad 二进制文件"
echo "3. 创建全新的配置文件"
echo "4. 重新启动整个集群"
echo ""
echo "⚠️ 警告:这是不可逆的操作!⚠️"
echo ""

# Make sure we are in the repository root.
if [[ ! -f "$PLAYBOOK_REL" ]]; then
  echo "❌ 错误:请在 /root/mgmt 目录下运行此脚本"
  exit 1
fi
# Remember the directory that was just validated so the playbook is run from
# this checkout, not from a hard-coded path that may be a different copy.
repo_root=$PWD

# Confirmation. -r keeps backslashes literal; on EOF fall through to the
# explicit cancel message instead of letting `set -e` exit silently.
confirm=""
read -r -p "你确定要进行核弹级重置吗?输入 'NUCLEAR' 确认: " confirm || confirm=""
if [[ "$confirm" != "NUCLEAR" ]]; then
  echo "❌ 操作已取消"
  exit 1
fi

echo ""
echo "🚀 开始核弹级重置..."
echo ""

# Ansible settings
export ANSIBLE_HOST_KEY_CHECKING=False
export ANSIBLE_STDOUT_CALLBACK=yaml

# Run the reset playbook. The inventory and playbook paths are relative to
# the configuration/ directory, so change into it first.
echo "📡 执行 Ansible playbook..."
cd "$repo_root/configuration"
ansible-playbook \
  -i inventories/production/nomad-cluster.ini \
  "../$PLAYBOOK_REL" \
  --extra-vars "ansible_ssh_common_args='-o StrictHostKeyChecking=no'" \
  -v

echo ""
echo "⏰ 等待集群稳定..."
sleep 30

echo ""
echo "🔍 检查集群状态..."

# Cluster members; a failure is reported but does not abort the script.
echo "📊 集群成员状态:"
nomad server members || echo "❌ 无法获取集群成员状态"

echo ""
echo "📊 节点状态:"
nomad node status || echo "❌ 无法获取节点状态"

echo ""
echo "🎯 如果上面显示错误,请等待几分钟后再次检查"
echo "集群可能需要一些时间来完全启动和同步"
echo ""
echo "🔧 有用的命令:"
echo " 检查集群成员: nomad server members"
echo " 检查节点状态: nomad node status"
echo " 查看日志: journalctl -u nomad -f"
echo " Web UI: http://100.116.158.95:4646"
echo ""
echo "☢️ 核弹级重置完成!☢️"

View File

@@ -1,113 +0,0 @@
---
# Complete reset of Nomad on every host in the `nomad_cluster` group:
# stop the service, kill leftover processes, delete data/config/logs,
# recreate the directories, write a minimal nomad.hcl, start the service,
# wait for the HTTP port and report the leader endpoint's answer.
- name: Ultimate Nomad Cluster Fix - Complete Reset
  hosts: nomad_cluster
  become: yes
  gather_facts: yes
  vars:
    # NOTE(review): gossip encryption key stored in plain text; consider
    # moving it to Ansible Vault and rotating it.
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
  tasks:
    - name: Stop and disable nomad service completely
      systemd:
        name: nomad
        state: stopped
        enabled: no
        daemon_reload: yes
      ignore_errors: yes

    # -x matches the process name exactly. `pkill -f nomad` also matched the
    # `/bin/sh -c 'pkill -f nomad || true'` wrapper running this very task
    # (killing it) and any unrelated process with "nomad" in its arguments.
    - name: Kill any remaining nomad processes
      shell: pkill -x nomad || true
      ignore_errors: yes

    - name: Remove all nomad data and state
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /opt/nomad/data
        - /etc/nomad.d/nomad.hcl
        - /var/log/nomad

    - name: Create clean nomad directories
      file:
        path: "{{ item }}"
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'
      loop:
        - /etc/nomad.d
        - /opt/nomad
        - /opt/nomad/data
        - /opt/nomad/alloc_mounts
        - /var/log/nomad

    # NOTE(review): every host receives bootstrap_expect = 1 and no join
    # configuration, so each server presumably bootstraps on its own rather
    # than forming one cluster -- confirm this is the intended end state.
    - name: Create minimal nomad configuration
      copy:
        content: |
          datacenter = "dc1"
          region = "global"
          data_dir = "/opt/nomad/data"
          bind_addr = "{{ ansible_default_ipv4.address }}"
          server {
            enabled = true
            bootstrap_expect = 1
            encrypt = "{{ nomad_encrypt_key }}"
          }
          client {
            enabled = true
            alloc_dir = "/opt/nomad/alloc_mounts"
          }
          ui {
            enabled = true
          }
          addresses {
            http = "0.0.0.0"
            rpc = "{{ ansible_default_ipv4.address }}"
            serf = "{{ ansible_default_ipv4.address }}"
          }
          ports {
            http = 4646
            rpc = 4647
            serf = 4648
          }
          log_level = "INFO"
          log_file = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    - name: Enable and start nomad service
      systemd:
        name: nomad
        state: started
        enabled: yes
        daemon_reload: yes

    # Block until the HTTP API port accepts connections (10 s initial delay,
    # 60 s timeout).
    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ ansible_default_ipv4.address }}"
        delay: 10
        timeout: 60

    # Best effort: a failure here is ignored and reported by the next task.
    - name: Check nomad status
      uri:
        url: "http://{{ ansible_default_ipv4.address }}:4646/v1/status/leader"
        method: GET
      register: nomad_leader
      retries: 5
      delay: 5
      ignore_errors: yes

    - name: Display nomad status
      debug:
        msg: "Nomad leader status: {{ nomad_leader.json if nomad_leader.json is defined else 'No leader elected yet' }}"

View File

@@ -1,31 +0,0 @@
#!/bin/bash
# Nomad Podman migration verification.
#
# Runs five read-only checks over SSH against a single host and prints the
# results under numbered headings.

readonly TARGET="ben@100.84.197.26"

# Run one command string on the target host over SSH.
on_target() {
  ssh "$TARGET" "$1"
}

echo "=== Nomad Podman Migration Verification ==="
echo

# Step 1: systemd unit state of nomad
echo "1. Checking Nomad service status..."
on_target "sudo systemctl status nomad --no-pager -l"
echo

# Step 2: podman-related lines of the Nomad configuration file
echo "2. Checking Nomad configuration..."
on_target "sudo cat /etc/nomad.d/nomad.hcl | grep -A 10 -B 2 podman"
echo

# Step 3: per-user Podman socket (the fallback message is produced remotely)
echo "3. Checking Podman socket..."
on_target "ls -la /run/user/*/podman/podman.sock 2>/dev/null || echo 'Podman socket not found'"
echo

# Step 4: driver section of this node's status; local stderr is discarded
echo "4. Checking Nomad node status..."
on_target "sudo -u nomad /usr/local/bin/nomad node status -self | grep -A 10 'Driver Status'" 2>/dev/null \
  || echo "Could not get node status"
echo

# Step 5: podman invoked as the nomad user; local stderr is discarded
echo "5. Testing Podman as nomad user..."
on_target "sudo -u nomad podman version --format '{{.Version}}'" 2>/dev/null \
  || echo "Podman test failed"
echo

echo "=== Verification Complete ==="