将infrastructure文件夹中的核心文件整合到tofu文件夹中,并删除infrastructure文件夹

This commit is contained in:
Houzhong Xu 2025-09-28 05:51:27 +00:00
parent bc529a25fa
commit 8d45dec614
No known key found for this signature in database
GPG Key ID: B44BEB1438F1B46F
12 changed files with 0 additions and 1105 deletions

View File

@ -1,52 +0,0 @@
# Root configuration: wires the OCI provider to the shared oracle-cloud module.
terraform {
  required_version = ">= 1.6"

  required_providers {
    # Oracle Cloud Infrastructure
    oci = {
      source  = "oracle/oci"
      version = "~> 5.0"
    }
  }

  # State is stored in a local file.
  backend "local" {
    path = "terraform.tfstate"
  }
}

# Oracle Cloud provider; every credential field is read from var.oci_config.
provider "oci" {
  tenancy_ocid     = var.oci_config.tenancy_ocid
  user_ocid        = var.oci_config.user_ocid
  fingerprint      = var.oci_config.fingerprint
  private_key_path = var.oci_config.private_key_path
  region           = var.oci_config.region
}

# Oracle Cloud infrastructure module.
module "oracle_cloud" {
  source = "../../providers/oracle-cloud"

  # Values forwarded unchanged from this configuration's input variables.
  environment        = var.environment
  project_name       = var.project_name
  owner              = var.owner
  vpc_cidr           = var.vpc_cidr
  availability_zones = var.availability_zones
  common_tags        = var.common_tags
  oci_config         = var.oci_config

  # Instance settings fixed by this configuration.
  instance_count = 1
  instance_size  = "VM.Standard.E2.1.Micro"
}

# Expose the whole module object (all of its outputs) under one name.
output "oracle_cloud_outputs" {
  description = "Oracle Cloud 基础设施输出"
  value       = module.oracle_cloud
}

View File

@ -1,69 +0,0 @@
#!/bin/bash
# Deployment script for the Nomad cluster NFS configuration.
# Nodes are handled differently depending on container type and location.
set -e

echo "🚀 开始部署Nomad集群NFS配置..."

# ANSI colour escapes consumed by the log helpers below.
NC='\033[0m' # reset / no colour
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'

# Shared formatter: prints "<colour>[<LEVEL>]<reset> <message>" on stdout.
#   $1 - colour escape, $2 - level tag, $3 - message
_log_line() {
  echo -e "${1}[${2}]${NC} ${3}"
}

# Level-specific wrappers; each takes the message as $1.
log_info() { _log_line "$GREEN" "INFO" "$1"; }
log_warn() { _log_line "$YELLOW" "WARN" "$1"; }
log_error() { _log_line "$RED" "ERROR" "$1"; }
# Guard: every path below is relative, so the script must run from the
# mgmt project root (detected by the presence of the production inventory).
if [ ! -f "configuration/inventories/production/inventory.ini" ]; then
  log_error "请在mgmt项目根目录运行此脚本"
  exit 1
fi

# 1. Configure the NFS mount on every node
log_info "步骤1: 为所有节点配置NFS挂载 (根据容器类型和地理位置)"
ansible-playbook -i configuration/inventories/production/inventory.ini \
  playbooks/setup-nfs-by-container-type.yml

# 2. Enable NFS volume support on the Nomad clients
log_info "步骤2: 配置Nomad客户端支持NFS卷"
ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \
  playbooks/setup-nomad-nfs-client.yml

# 3. Verify the NFS mount on all nodes (the host "snail" is excluded)
log_info "步骤3: 验证所有节点的NFS挂载状态"
ansible all -i configuration/inventories/production/inventory.ini \
  -m shell -a "df -h /mnt/fnsync 2>/dev/null || echo 'NFS未挂载'" \
  --limit '!snail'

# 4. Verify the Nomad client configuration
log_info "步骤4: 验证Nomad客户端配置"
ansible nomad_clients -i configuration/inventories/production/nomad-cluster.ini \
  -m shell -a "nomad node status -self 2>/dev/null || echo 'Nomad未运行'"

# 5. Optionally deploy the sample NFS job.
# `read` returns non-zero at EOF (e.g. when run without a terminal); under
# `set -e` that used to abort the script before the summary was printed.
# Treat "no answer" as "no" instead. -r keeps backslashes literal.
deploy_example=""
read -r -p "是否部署示例NFS任务(y/n): " deploy_example || true
if [ "$deploy_example" = "y" ] || [ "$deploy_example" = "Y" ]; then
  log_info "部署示例NFS任务..."
  nomad run jobs/nomad-nfs-multi-type.nomad
  echo "等待任务启动..."
  sleep 10
  nomad job status nfs-multi-type-example
fi

log_info "✅ NFS配置部署完成!"
echo ""
echo "📋 使用说明:"
echo "1. NFS挂载点: /mnt/fnsync"
echo "2. 本地LXC容器: 直接使用挂载目录"
echo "3. 海外PVE容器: 使用优化参数挂载"
echo "4. Nomad作业: 使用host volume 'nfs-shared'"
echo ""
echo "🔧 手动验证命令:"
echo " - 检查NFS挂载: df -h /mnt/fnsync"
echo " - 检查Nomad状态: nomad node status"
echo " - 运行NFS任务: nomad run jobs/nomad-nfs-multi-type.nomad"

View File

@ -1,29 +0,0 @@
#!/bin/bash
# Distribute an SSH public key to root@<node> on every Nomad node.
#
# Environment:
#   PUB_KEY_FILE - public key to install
#                  (default: /home/ben/.ssh/id_ed25519.pub)
echo "分发SSH公钥到Nomad节点..."

# Node list
NODES=(
  "100.81.26.3"     # ash1d.global
  "100.103.147.94"  # ash2e.global
  "100.90.159.68"   # ch2.global
  "100.86.141.112"  # ch3.global
  "100.117.106.136" # master
  "100.116.80.94"   # ash3c
)

PUB_KEY_FILE="${PUB_KEY_FILE:-/home/ben/.ssh/id_ed25519.pub}"

# Fail early: with a missing or empty key file the old code appended a blank
# line to authorized_keys on every node and still reported success.
if [[ ! -r "$PUB_KEY_FILE" ]]; then
  echo "无法读取公钥文件: $PUB_KEY_FILE" >&2
  exit 1
fi
PUB_KEY=""
# `read` returns non-zero when the file lacks a trailing newline but still
# fills the variable, hence the `|| true`.
IFS= read -r PUB_KEY < "$PUB_KEY_FILE" || true
if [[ -z "$PUB_KEY" ]]; then
  echo "公钥文件为空: $PUB_KEY_FILE" >&2
  exit 1
fi

mkdir -p ~/.ssh

for NODE in "${NODES[@]}"; do
  echo "正在配置节点: $NODE"

  # Record the host key only once, so re-runs do not pile up duplicates.
  if ! ssh-keygen -F "$NODE" > /dev/null 2>&1; then
    ssh-keyscan -H "$NODE" >> ~/.ssh/known_hosts 2> /dev/null
  fi

  # Authenticate with whatever method already works. The key travels on
  # stdin (not spliced into the remote command line) and is appended only
  # when that exact line is not already present.
  if ssh "root@$NODE" \
    'umask 077; mkdir -p /root/.ssh; IFS= read -r key; grep -qxF -- "$key" /root/.ssh/authorized_keys 2>/dev/null || printf "%s\n" "$key" >> /root/.ssh/authorized_keys' \
    <<< "$PUB_KEY" 2> /dev/null; then
    echo "$NODE 配置成功"
  else
    echo "$NODE 配置失败"
  fi
done

echo "密钥分发完成"

View File

@ -1,22 +0,0 @@
---
# Playbook: install one fixed SSH public key for root on the nomad_nodes group.
- name: 设置Nomad节点SSH密钥认证
  hosts: nomad_nodes
  become: yes

  vars:
    # Public key that is added to root's authorized_keys.
    ssh_public_key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIIddJVPEvFRtzhWwYjr21lKTar+d7R5Kn/6bhd2s231 ben@ch2"

  tasks:
    # The directory must exist before a key can be written into it.
    - name: 确保.ssh目录存在
      file:
        state: directory
        path: /root/.ssh
        mode: '0700'

    - name: 添加SSH公钥到authorized_keys
      authorized_key:
        key: "{{ ssh_public_key }}"
        user: root
        state: present

    # Connectivity check via the ping module.
    - name: 测试SSH连接
      ping:

View File

@ -1,230 +0,0 @@
#!/bin/bash
# Nomad laptop setup script - macOS / Linux version.
# Joins a Mac or Linux laptop to the Nomad cluster as a server.
#
# Environment overrides (defaults are the previously hard-coded values):
#   NOMAD_VERSION, NOMAD_DATACENTER, NOMAD_ENCRYPT_KEY
set -e

# Settings
NOMAD_VERSION="${NOMAD_VERSION:-1.10.5}"
NOMAD_DATACENTER="${NOMAD_DATACENTER:-dc1}"
# NOTE(review): this default is a gossip encryption key committed to source
# control; prefer supplying NOMAD_ENCRYPT_KEY from the environment.
NOMAD_ENCRYPT_KEY="${NOMAD_ENCRYPT_KEY:-NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=}"

# Detect operating system and normalise the CPU architecture name to the
# form used in the release file names (amd64 / arm64).
OS=$(uname -s | tr '[:upper:]' '[:lower:]')
ARCH=$(uname -m)
case "$ARCH" in
  x86_64) ARCH="amd64" ;;
  arm64 | aarch64) ARCH="arm64" ;;
  *)
    echo "不支持的架构: $ARCH"
    exit 1
    ;;
esac

echo "🚀 开始设置 Nomad ($OS-$ARCH)..."

# 1. Check Tailscale: it must be installed and report an address, because
# that address is what the agent binds to later on.
echo "📡 检查 Tailscale 连接..."
if ! command -v tailscale &> /dev/null; then
  echo "❌ 请先安装 Tailscale"
  exit 1
fi

# Only the first reported address is used.
TAILSCALE_IP=$(tailscale ip | head -1)
if [ -z "$TAILSCALE_IP" ]; then
  echo "❌ Tailscale 未连接,请先运行: tailscale up"
  exit 1
fi
echo "✅ Tailscale IP: $TAILSCALE_IP"
# 2. Install Nomad when it is not already on PATH
if ! command -v nomad &> /dev/null; then
  echo "📦 安装 Nomad $NOMAD_VERSION..."
  if [[ "$OS" == "darwin" ]]; then
    # macOS: install through Homebrew
    if command -v brew &> /dev/null; then
      brew install nomad
    else
      echo "❌ 请先安装 Homebrew 或手动安装 Nomad"
      exit 1
    fi
  else
    # Linux: fetch the release archive
    NOMAD_URL="https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_${OS}_${ARCH}.zip"
    # Work in a private scratch directory instead of the caller's working
    # directory, so nothing there is overwritten or left behind.
    NOMAD_TMP_DIR=$(mktemp -d)
    trap 'rm -rf -- "$NOMAD_TMP_DIR"' EXIT
    # -f: fail on HTTP errors so an error page is never handed to unzip.
    curl -fL "$NOMAD_URL" -o "$NOMAD_TMP_DIR/nomad.zip"
    # Extract only the binary from the archive.
    unzip -o "$NOMAD_TMP_DIR/nomad.zip" nomad -d "$NOMAD_TMP_DIR"
    sudo install -m 0755 "$NOMAD_TMP_DIR/nomad" /usr/local/bin/nomad
    rm -rf -- "$NOMAD_TMP_DIR"
    trap - EXIT
  fi
fi
echo "✅ Nomad 版本: $(nomad version)"
# 3. Create the configuration and data directories and hand them to the
# invoking user, so the config can be written below without sudo.
echo "📁 创建配置目录..."
sudo mkdir -p /etc/nomad.d /opt/nomad/data
sudo chown -R "$(whoami):$(id -gn)" /etc/nomad.d /opt/nomad/data

# 4. Generate the Nomad agent configuration.
# The heredoc delimiter is unquoted on purpose: $NOMAD_DATACENTER,
# $TAILSCALE_IP and $NOMAD_ENCRYPT_KEY are expanded into the file.
echo "⚙️ 生成 Nomad 配置..."
cat > /etc/nomad.d/nomad.hcl << EOF
datacenter = "$NOMAD_DATACENTER"
data_dir = "/opt/nomad/data"
log_level = "INFO"
bind_addr = "$TAILSCALE_IP"
addresses {
http = "0.0.0.0"
rpc = "$TAILSCALE_IP"
serf = "$TAILSCALE_IP"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = true
bootstrap_expect = 6
retry_join = [
"100.116.158.95", # semaphore
"100.117.106.136", # master (现在是 client)
"100.116.80.94" # ash3c (现在是 client)
]
encrypt = "$NOMAD_ENCRYPT_KEY"
}
client {
enabled = false
}
# 如果是 macOS可能需要 Docker 插件
plugin "podman" {
config {
volumes {
enabled = true
}
}
}
consul {
address = "$TAILSCALE_IP:8500"
}
EOF
# The file embeds the gossip encryption key, so restrict it to its owner
# (it would otherwise be created with the default, world-readable mode).
chmod 600 /etc/nomad.d/nomad.hcl
echo "✅ 配置文件已生成: /etc/nomad.d/nomad.hcl"
# 5. Register Nomad as a background service (launchd on macOS, systemd on
# Linux).
# Resolve the real binary location instead of assuming /usr/local/bin:
# Homebrew on Apple Silicon and distribution packages install elsewhere,
# and a wrong path makes the service fail to start.
NOMAD_BIN=$(command -v nomad)

if [[ "$OS" == "darwin" ]]; then
  # macOS - create a LaunchDaemon
  echo "🍎 创建 macOS LaunchDaemon..."
  sudo tee /Library/LaunchDaemons/io.nomadproject.nomad.plist > /dev/null << EOF
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Label</key>
<string>io.nomadproject.nomad</string>
<key>ProgramArguments</key>
<array>
<string>$NOMAD_BIN</string>
<string>agent</string>
<string>-config=/etc/nomad.d/nomad.hcl</string>
</array>
<key>RunAtLoad</key>
<true/>
<key>KeepAlive</key>
<true/>
<key>StandardOutPath</key>
<string>/var/log/nomad.log</string>
<key>StandardErrorPath</key>
<string>/var/log/nomad.log</string>
</dict>
</plist>
EOF
  # Load and start the service
  sudo launchctl load /Library/LaunchDaemons/io.nomadproject.nomad.plist
  sudo launchctl start io.nomadproject.nomad
else
  # Linux - create a systemd unit. User/Group are expanded now, i.e. the
  # service runs as the user executing this script; \$MAINPID is escaped so
  # that systemd, not this shell, expands it.
  echo "🐧 创建 systemd 服务..."
  sudo tee /etc/systemd/system/nomad.service > /dev/null << EOF
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
[Service]
Type=notify
User=$(whoami)
Group=$(id -gn)
ExecStart=$NOMAD_BIN agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP \$MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
EOF
  # Start the service
  sudo systemctl daemon-reload
  sudo systemctl enable nomad
  sudo systemctl start nomad
fi
# 6. Verify that the service came up (after a short grace period)
echo "🔍 验证 Nomad 服务..."
sleep 5
case "$OS" in
  darwin)
    if ! sudo launchctl list | grep -q nomad; then
      echo "❌ Nomad 服务启动失败"
      exit 1
    fi
    echo "✅ Nomad 服务已启动"
    ;;
  *)
    if ! systemctl is-active --quiet nomad; then
      echo "❌ Nomad 服务启动失败"
      sudo systemctl status nomad
      exit 1
    fi
    echo "✅ Nomad 服务已启动"
    ;;
esac

# 7. Check whether this agent can already see live cluster members
echo "🌐 检查集群连接..."
sleep 10
if ! nomad server members 2> /dev/null | grep -q alive; then
  echo "⚠️ 正在连接集群,请稍等..."
  echo "可以运行以下命令检查状态:"
  echo " nomad server members"
  echo " nomad node status"
else
  echo "✅ 成功加入 Nomad 集群!"
  nomad server members
fi

# Final summary with pointers for day-to-day use
echo ""
echo "🎉 设置完成!"
echo "📊 Web UI: http://$TAILSCALE_IP:4646"
echo "🔧 配置文件: /etc/nomad.d/nomad.hcl"
echo "📝 日志查看:"
case "$OS" in
  darwin) echo " tail -f /var/log/nomad.log" ;;
  *) echo " sudo journalctl -u nomad -f" ;;
esac

View File

@ -1,212 +0,0 @@
# Nomad Windows setup script.
# Joins a Windows laptop to the Nomad cluster as a server.
param(
    [string]$NomadVersion = "1.10.5",
    [string]$DataCenter = "dc1",
    [string]$EncryptKey = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
)

# Administrator rights are required (service, machine PATH, firewall).
$currentIdentity = [Security.Principal.WindowsIdentity]::GetCurrent()
$currentPrincipal = [Security.Principal.WindowsPrincipal] $currentIdentity
$isAdministrator = $currentPrincipal.IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator")
if (-not $isAdministrator) {
    Write-Host "❌ 此脚本需要管理员权限运行" -ForegroundColor Red
    Write-Host "请以管理员身份运行 PowerShell" -ForegroundColor Yellow
    exit 1
}

Write-Host "🚀 开始设置 Windows Nomad Server..." -ForegroundColor Green

# 1. Check Tailscale: the first address it reports is used as the bind address.
Write-Host "📡 检查 Tailscale 连接..." -ForegroundColor Cyan
try {
    $tailscaleIP = tailscale ip | Select-Object -First 1
    if ([string]::IsNullOrEmpty($tailscaleIP)) {
        throw "Tailscale IP 为空"
    }
    Write-Host "✅ Tailscale IP: $tailscaleIP" -ForegroundColor Green
}
catch {
    # Reached both when the tailscale command is missing and when it
    # returned no address.
    Write-Host "❌ Tailscale 未安装或未连接" -ForegroundColor Red
    Write-Host "请先安装 Tailscale 并运行: tailscale up" -ForegroundColor Yellow
    exit 1
}
# 2. Create the directory layout under C:\nomad
Write-Host "📁 创建 Nomad 目录..." -ForegroundColor Cyan
$nomadDir = "C:\nomad"
$configDir = "$nomadDir\config"
$dataDir = "$nomadDir\data"
$binDir = "$nomadDir\bin"
foreach ($directory in @($configDir, $dataDir, $binDir)) {
    New-Item -ItemType Directory -Force -Path $directory | Out-Null
}

# 3. Download Nomad unless the binary is already in place
$nomadExe = "$binDir\nomad.exe"
if (-not (Test-Path $nomadExe)) {
    Write-Host "📦 下载 Nomad $NomadVersion..." -ForegroundColor Cyan
    $nomadUrl = "https://releases.hashicorp.com/nomad/$NomadVersion/nomad_${NomadVersion}_windows_amd64.zip"
    $zipPath = "$env:TEMP\nomad.zip"
    try {
        Invoke-WebRequest -Uri $nomadUrl -OutFile $zipPath
        Expand-Archive -Path $zipPath -DestinationPath $binDir -Force
        Remove-Item $zipPath
        Write-Host "✅ Nomad 下载完成" -ForegroundColor Green
    }
    catch {
        Write-Host "❌ 下载 Nomad 失败: $_" -ForegroundColor Red
        exit 1
    }
}

# 4. Put the bin directory on the machine-wide PATH (and on this session's
# PATH) when it is not listed yet
$currentPath = [Environment]::GetEnvironmentVariable("PATH", "Machine")
$binDirMissingFromPath = $currentPath -notlike "*$binDir*"
if ($binDirMissingFromPath) {
    Write-Host "🔧 添加 Nomad 到系统 PATH..." -ForegroundColor Cyan
    [Environment]::SetEnvironmentVariable("PATH", "$currentPath;$binDir", "Machine")
    $env:PATH += ";$binDir"
}
# 5. Generate the Nomad agent configuration.
# The expandable here-string interpolates $DataCenter, $dataDir (with
# backslashes turned into forward slashes), $tailscaleIP and $EncryptKey.
Write-Host "⚙️ 生成 Nomad 配置..." -ForegroundColor Cyan
$configContent = @"
datacenter = "$DataCenter"
data_dir = "$($dataDir -replace '\\', '/')"
log_level = "INFO"
bind_addr = "$tailscaleIP"
addresses {
http = "0.0.0.0"
rpc = "$tailscaleIP"
serf = "$tailscaleIP"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = true
bootstrap_expect = 6
retry_join = [
"100.116.158.95", # semaphore
"100.117.106.136", # master
"100.116.80.94" # ash3c
]
encrypt = "$EncryptKey"
}
client {
enabled = false
}
plugin "podman" {
config {
volumes {
enabled = true
}
}
}
consul {
address = "${tailscaleIP}:8500"
}
"@
# Why ${tailscaleIP} in the consul address: inside an expandable string,
# "$tailscaleIP:8500" is read as the namespace-qualified variable
# "tailscaleIP:8500", which is empty, so the address was written without
# the IP. The braces end the variable name before the colon.
$configFile = "$configDir\nomad.hcl"
$configContent | Out-File -FilePath $configFile -Encoding UTF8
Write-Host "✅ 配置文件已生成: $configFile" -ForegroundColor Green
# 6. Create the Windows service
Write-Host "🔧 创建 Windows 服务..." -ForegroundColor Cyan
$serviceName = "Nomad"
$serviceDisplayName = "HashiCorp Nomad"
$serviceDescription = "HashiCorp Nomad Agent"
$serviceCommand = "`"$nomadExe`" agent -config=`"$configFile`""

# Stop and remove a previous installation, but only when one exists.
$existingService = Get-Service -Name $serviceName -ErrorAction SilentlyContinue
if ($existingService) {
    Stop-Service -Name $serviceName -Force -ErrorAction SilentlyContinue
    & sc.exe delete $serviceName | Out-Null
}

# Create the new service.
# New-Service is used instead of `sc.exe create`: a native command never
# throws, so the old try/catch reported success even when sc.exe failed, and
# the embedded quotes of the binary path are not passed reliably to native
# executables.
try {
    New-Service -Name $serviceName `
        -BinaryPathName $serviceCommand `
        -DisplayName $serviceDisplayName `
        -Description $serviceDescription `
        -StartupType Automatic `
        -ErrorAction Stop | Out-Null

    # Recovery options: restart after 5 s on each failure, reset the failure
    # counter after 30 s. Best effort, but no longer silent.
    & sc.exe failure $serviceName reset= 30 actions= restart/5000/restart/5000/restart/5000 | Out-Null
    if ($LASTEXITCODE -ne 0) {
        Write-Host "⚠️ 无法配置服务恢复选项 (sc.exe 退出码: $LASTEXITCODE)" -ForegroundColor Yellow
    }
    Write-Host "✅ Windows 服务已创建" -ForegroundColor Green
}
catch {
    Write-Host "❌ 创建服务失败: $_" -ForegroundColor Red
    exit 1
}

# 7. Start the service
Write-Host "🚀 启动 Nomad 服务..." -ForegroundColor Cyan
try {
    # -ErrorAction Stop turns a failed start into a terminating error so the
    # catch block below actually runs.
    Start-Service -Name $serviceName -ErrorAction Stop
    Write-Host "✅ Nomad 服务已启动" -ForegroundColor Green
}
catch {
    Write-Host "❌ 启动服务失败: $_" -ForegroundColor Red
    Write-Host "检查服务状态: Get-Service Nomad" -ForegroundColor Yellow
    exit 1
}
# 8. Verify that the service is running
Write-Host "🔍 验证 Nomad 服务..." -ForegroundColor Cyan
Start-Sleep -Seconds 10
try {
    # -ErrorAction Stop: a missing service must reach the catch block.
    $serviceStatus = Get-Service -Name $serviceName -ErrorAction Stop
    if ($serviceStatus.Status -eq "Running") {
        Write-Host "✅ Nomad 服务运行正常" -ForegroundColor Green
    }
    else {
        Write-Host "❌ Nomad 服务状态异常: $($serviceStatus.Status)" -ForegroundColor Red
    }
}
catch {
    Write-Host "❌ 检查服务状态失败: $_" -ForegroundColor Red
}

# 9. Check the cluster connection.
# A native command does not throw when it fails, so the exit code has to be
# inspected explicitly; otherwise success is reported unconditionally.
Write-Host "🌐 检查集群连接..." -ForegroundColor Cyan
Start-Sleep -Seconds 15
$joinedCluster = $false
try {
    & $nomadExe server members
    $joinedCluster = ($LASTEXITCODE -eq 0)
}
catch {
    $joinedCluster = $false
}
if ($joinedCluster) {
    Write-Host "✅ 成功加入 Nomad 集群!" -ForegroundColor Green
}
else {
    Write-Host "⚠️ 正在连接集群,请稍等..." -ForegroundColor Yellow
    Write-Host "可以运行以下命令检查状态:" -ForegroundColor Cyan
    Write-Host " nomad server members" -ForegroundColor White
    Write-Host " nomad node status" -ForegroundColor White
}
# 10. Firewall rules.
# Serf gossip uses UDP as well as TCP on 4648, so a UDP rule is added.
# Existing rules are detected by display name so that re-running the script
# does not create duplicates.
Write-Host "🔥 配置防火墙规则..." -ForegroundColor Cyan
$firewallRules = @(
    @{ Name = "Nomad HTTP"; Protocol = "TCP"; Port = 4646 },
    @{ Name = "Nomad RPC"; Protocol = "TCP"; Port = 4647 },
    @{ Name = "Nomad Serf"; Protocol = "TCP"; Port = 4648 },
    @{ Name = "Nomad Serf UDP"; Protocol = "UDP"; Port = 4648 }
)
try {
    foreach ($rule in $firewallRules) {
        $existingRule = Get-NetFirewallRule -DisplayName $rule.Name -ErrorAction SilentlyContinue
        if (-not $existingRule) {
            New-NetFirewallRule -DisplayName $rule.Name -Direction Inbound -Protocol $rule.Protocol -LocalPort $rule.Port -Action Allow -ErrorAction Stop | Out-Null
        }
    }
    Write-Host "✅ 防火墙规则已配置" -ForegroundColor Green
}
catch {
    Write-Host "⚠️ 防火墙规则配置可能失败,请手动检查" -ForegroundColor Yellow
}

# Final summary.
# ${tailscaleIP} is braced because "$tailscaleIP:4646" would be parsed as a
# namespace-qualified variable and print an empty address.
Write-Host ""
Write-Host "🎉 Windows Nomad Server 设置完成!" -ForegroundColor Green
Write-Host "📊 Web UI: http://${tailscaleIP}:4646" -ForegroundColor Cyan
Write-Host "🔧 配置文件: $configFile" -ForegroundColor Cyan
Write-Host "📝 服务管理:" -ForegroundColor Cyan
Write-Host " 启动: Start-Service Nomad" -ForegroundColor White
Write-Host " 停止: Stop-Service Nomad" -ForegroundColor White
Write-Host " 状态: Get-Service Nomad" -ForegroundColor White
Write-Host " 日志: Get-EventLog -LogName Application -Source Nomad" -ForegroundColor White

View File

@ -1,37 +0,0 @@
#!/bin/bash
# Quick health check for the Nomad cluster: service state, port
# reachability, server members, node list and recent local logs.
echo "=== Nomad 集群状态检查 ==="

# Step 1: service state on every node of the inventory group
echo "1. 检查服务状态..."
ansible nomad_cluster \
  -i /root/mgmt/configuration/inventories/production/nomad-cluster.ini \
  -m shell -a "systemctl is-active nomad" 2> /dev/null

# Step 2: probe the three Nomad ports on each listed address; only lines
# reporting an open port are shown.
echo -e "\n2. 检查网络连通性..."
for ip in 100.116.158.95 100.117.106.136 100.116.80.94; do
  echo "检查到 $ip 的连接..."
  for port in 4646 4647 4648; do
    timeout 5 nc -zv "$ip" "$port" 2>&1 | grep -E "(succeeded|open)"
  done
done

# Step 3: try to list the server members
echo -e "\n3. 检查 Nomad 集群成员..."
if ! nomad server members 2> /dev/null; then
  echo "无法查询集群成员 - 可能没有 leader"
else
  echo "集群成员查询成功"
fi

# Step 4: node list
echo -e "\n4. 检查节点状态..."
if ! nomad node status 2> /dev/null; then
  echo "无法查询节点状态"
else
  echo "节点状态查询成功"
fi

# Step 5: last few journal lines of the nomad unit on this machine
echo -e "\n5. 检查最近的日志..."
echo "=== Semaphore 节点日志 ==="
journalctl -u nomad -n 5 --no-pager 2> /dev/null | tail -5

echo -e "\n=== 检查完成 ==="

View File

@ -1,69 +0,0 @@
#!/bin/bash
# Drain and disable retired Nomad nodes.
# Created:        2025-09-27
# Scheduled run:  2025-10-27 (one month later)
#
# Environment:
#   NOMAD_ADDR - Nomad API address (default: http://100.116.158.95:4646)
set -e

# Exported so the nomad CLI child processes actually see the default; a
# plain assignment only set a shell variable and the CLI silently fell back
# to its own default address.
export NOMAD_ADDR="${NOMAD_ADDR:-http://100.116.158.95:4646}"

echo "=== 清理退役节点脚本 ==="
echo "执行时间: $(date)"
echo "Nomad 地址: $NOMAD_ADDR"
echo ""

# Retired nodes, one "id:name:reason" record each
RETIRED_NODES=(
  "583f1b77:semaphore:已转为纯server"
  "06bb8a3a:hcs:华为云节点退役"
)

echo "准备清理以下退役节点:"
for node_info in "${RETIRED_NODES[@]}"; do
  IFS=':' read -r node_id node_name reason <<< "$node_info"
  echo " - $node_name ($node_id): $reason"
done
echo ""

# No answer (EOF) counts as "no" rather than aborting through set -e.
confirm=""
read -r -p "确认要清理这些节点吗? (y/N): " confirm || true
if [[ $confirm != [yY] ]]; then
  echo "操作已取消"
  exit 0
fi

echo "开始清理退役节点..."
for node_info in "${RETIRED_NODES[@]}"; do
  IFS=':' read -r node_id node_name reason <<< "$node_info"
  echo "处理节点: $node_name ($node_id)"

  # Only touch nodes the cluster still knows about
  if nomad node status "$node_id" > /dev/null 2>&1; then
    echo " - 节点存在,开始清理..."
    node_ok=1

    # Make sure the node is drained
    echo " - 确保节点已 drain..."
    nomad node drain -enable -yes "$node_id" || node_ok=0

    # Disable scheduling eligibility
    echo " - 禁用调度资格..."
    nomad node eligibility -disable "$node_id" || node_ok=0

    # Give running allocations some time to migrate
    echo " - 等待任务迁移完成..."
    sleep 10

    # Report honestly: success used to be printed even when both commands
    # above had failed.
    if ((node_ok)); then
      echo " - 节点 $node_name 已成功清理"
    else
      echo " - 节点 $node_name 清理未完全成功,请手动检查"
    fi
  else
    echo " - 节点不存在或已被清理"
  fi
  echo ""
done

echo "=== 清理完成 ==="
echo "请手动验证集群状态:"
echo " nomad node status"
echo " nomad server members"
echo ""
echo "如需彻底删除节点记录,请联系管理员"

View File

@ -1,33 +0,0 @@
#!/bin/bash
# Disk usage monitor.
# Usage: ./disk-monitor.sh [threshold]
#   threshold - usage percentage above which a filesystem is reported
#               (default: 85)
THRESHOLD=${1:-85}
INVENTORY_FILE="configuration/inventories/production/nomad-cluster.ini"

# The threshold is spliced into a command that runs on every node, so only
# plain digits are accepted.
if ! [[ "$THRESHOLD" =~ ^[0-9]+$ ]]; then
  echo "❌ 阈值必须是非负整数: $THRESHOLD" >&2
  exit 2
fi

# Turn raw ad-hoc ansible output (stdin) into the highlighted report (stdout).
# Host header lines are recognised by ">>" (the header printed before command
# output) as well as "=>" (the header printed before a JSON result such as an
# unreachable host). Matching only "=>" never flagged a reachable node.
format_disk_report() {
  local line
  while read -r line; do
    if [[ $line == *">>"* || $line == *"=>"* ]]; then
      echo "🚨 节点: $line"
    elif [[ $line =~ ^/dev ]]; then
      echo " 高使用率磁盘: $line"
    fi
  done
}

echo "🔍 开始磁盘空间监控 (阈值: ${THRESHOLD}%)"
echo "=================================="

# Run the disk analysis playbook
echo "📊 运行磁盘分析..."
ansible-playbook -i "$INVENTORY_FILE" configuration/playbooks/disk-analysis-ncdu.yml
echo ""

echo "⚠️ 检查高磁盘使用率节点..."
# Check disk usage on every node. "+0" forces a numeric comparison: after
# gsub() the field is a string, and a string comparison would rank "9"
# above "85" and "100" below it.
ansible all -i "$INVENTORY_FILE" -m shell \
  -a "df -h | awk 'NR>1 {gsub(/%/, \"\", \$5); if(\$5+0 > $THRESHOLD) print \$0}'" \
  | format_disk_report

echo ""
echo "💡 如需清理,运行:"
echo " ansible-playbook -i $INVENTORY_FILE configuration/playbooks/disk-cleanup.yml"
echo ""
echo "📁 详细报告位置: /tmp/disk-analysis/"
echo " 使用 ncdu -f /tmp/disk-analysis/ncdu-root-<hostname>.json 查看详细信息"

View File

@ -1,227 +0,0 @@
#!/bin/bash
# 🚀 Nomad cluster management script
set -e

# Directory holding this script, and the project root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# ANSI colour escapes
NC='\033[0m' # reset / no colour
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'

# Shared formatter: prints "<colour>[<TAG>]<reset> <message>" on stdout.
#   $1 - colour escape, $2 - tag, $3 - message
_log_tagged() {
  echo -e "${1}[${2}]${NC} ${3}"
}

# Log helpers; each takes the message as $1.
log_info() { _log_tagged "$BLUE" "INFO" "$1"; }
log_success() { _log_tagged "$GREEN" "SUCCESS" "$1"; }
log_warning() { _log_tagged "$YELLOW" "WARNING" "$1"; }
log_error() { _log_tagged "$RED" "ERROR" "$1"; }

# Section banner: "=== <title> ===" in purple.
log_header() {
  echo -e "${PURPLE}=== $1 ===${NC}"
}
# Print a cluster overview: current leader, node list and driver health.
# Data comes from the local agent's HTTP API (http://localhost:4646).
# Outputs: report on stdout.
# Returns: 1 when no leader can be determined, otherwise the status of the
#          last report step.
show_cluster_status() {
  local api="http://localhost:4646/v1"
  local leader nodes_json

  log_header "Nomad 集群状态概览"

  # Leader: the endpoint answers with a JSON string, i.e. a quoted value.
  echo -e "${CYAN}Leader 状态:${NC}"
  leader=$(curl -s "$api/status/leader" 2> /dev/null || echo "无法连接")
  if [[ "$leader" =~ ^\".*\"$ ]]; then
    echo " ✅ Leader: ${leader//\"/}"
  else
    echo " ❌ 无 Leader 或连接失败"
    return 1
  fi
  echo ""

  # One fetch serves both the node list and the driver report.
  nodes_json=$(curl -s "$api/nodes" 2> /dev/null) || nodes_json=""

  # Node status. jq's `and`/`or` yield booleans, so the former
  # `cond and "✅" or "❌"` printed "true"; if/then/else selects the icon.
  echo -e "${CYAN}节点状态:${NC}"
  if [[ -z "$nodes_json" ]] || ! jq -r '
      .[] |
      " \(if .Status == "ready" then "✅" else "❌" end) \(.Name) (\(.Address)) - \(.Status)"
    ' <<< "$nodes_json" 2> /dev/null; then
    log_warning "无法获取节点状态详情"
    nomad node status 2> /dev/null || echo " ❌ 命令执行失败"
  fi
  echo ""

  # Driver status: the node name is printed as a heading (it used to be
  # bound to a jq variable and never shown), followed by one line per driver.
  echo -e "${CYAN}驱动程序状态:${NC}"
  if [[ -z "$nodes_json" ]] || ! jq -r '
      .[] |
      " 节点: \(.Name)",
      ((.Drivers // {}) | to_entries[] |
        " \(if .value.Healthy then "✅" else "❌" end) \(.key): \(.value.HealthDescription // "未知")")
    ' <<< "$nodes_json" 2> /dev/null; then
    log_warning "无法获取驱动状态详情"
  fi
}
# Print one line per job known to the local agent
# (http://localhost:4646/v1/jobs), or a notice when there are none.
# Outputs: report on stdout.
show_jobs_status() {
  local jobs

  log_header "作业状态"

  # A failed request must not abort the caller under `set -e`; it is treated
  # like an empty job list.
  jobs=$(curl -s http://localhost:4646/v1/jobs 2> /dev/null) || jobs=""

  if [[ -n "$jobs" && "$jobs" != "[]" && "$jobs" != "null" ]]; then
    # jq's `and`/`or` yield booleans, so if/then/else selects the icon.
    jq -r '.[] | " \(if .Status == "running" then "✅" else "❌" end) \(.Name) - \(.Status)"' \
      <<< "$jobs" 2> /dev/null || log_warning "无法解析作业列表"
  else
    echo " 📝 当前没有运行的作业"
  fi
}
# Print the Web UI and API addresses plus a cheat sheet of common commands.
show_access_info() {
  local base_url="http://100.116.158.95:4646"

  log_header "访问信息"

  echo -e "${CYAN}Web UI:${NC}"
  printf '%s\n' " 🌐 ${base_url}" ""

  echo -e "${CYAN}API 端点:${NC}"
  printf '%s\n' " 🔗 ${base_url}/v1/" ""

  echo -e "${CYAN}常用命令:${NC}"
  printf '%s\n' \
    " 📊 nomad status # 查看集群概览" \
    " 🖥️ nomad node status # 查看节点状态" \
    " 🔧 nomad server members # 查看服务器成员" \
    " 📋 nomad job status <job-name> # 查看作业状态" \
    " 🚀 nomad job run <job-file> # 运行作业" \
    " 📜 journalctl -u nomad -f # 查看日志"
}
# Run the project's full diagnosis script.
# Returns: 1 when the script is missing, otherwise the script's exit status.
run_diagnosis() {
  local diagnosis_script="$PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh"

  log_header "运行完整诊断"

  # Guard clause: nothing to run without the script.
  if [[ ! -f "$diagnosis_script" ]]; then
    log_error "诊断脚本未找到"
    return 1
  fi

  bash "$diagnosis_script"
}
# Run the playbook that configures the Podman driver on all nodes.
# Returns: 1 when the playbook or the inventory file is missing, otherwise
#          the status of the ansible run.
configure_podman() {
  log_header "配置所有节点使用 Podman 驱动"

  local config_dir="$PROJECT_ROOT/configuration"
  local playbook="$config_dir/playbooks/configure-nomad-podman-cluster.yml"
  local inventory="$config_dir/inventories/production/nomad-cluster.ini"

  # Both inputs must exist before anything is executed.
  [[ -f "$playbook" ]] || {
    log_error "Playbook 文件不存在: $playbook"
    return 1
  }
  [[ -f "$inventory" ]] || {
    log_error "Inventory 文件不存在: $inventory"
    return 1
  }

  # The playbook is run from inside the configuration directory.
  cd "$config_dir"
  python3 -m ansible playbook -i "$inventory" "$playbook" -v
}
# Restart the nomad service on every node of the nomad_cluster group, after
# a single-keystroke confirmation, then show the cluster status again.
restart_cluster() {
  log_header "重启 Nomad 集群"
  log_warning "这将重启整个 Nomad 集群"

  # One character, no Enter needed; the answer lands in $REPLY.
  read -p "确认继续? (y/N): " -n 1 -r
  echo ""

  # Anything other than y/Y cancels.
  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    log_info "操作已取消"
    return
  fi

  local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
  cd "$PROJECT_ROOT/configuration"
  python3 -m ansible adhoc -i "$inventory" nomad_cluster \
    -m systemd -a "name=nomad state=restarted" --become

  log_info "等待集群启动..."
  sleep 15
  show_cluster_status
}
# Print the main menu: a banner followed by the numbered choices.
show_menu() {
  echo ""
  log_header "Nomad 集群管理菜单"
  printf '%s\n' \
    "" \
    "1) 📊 显示集群状态" \
    "2) 📋 显示作业状态" \
    "3) 🔍 运行完整诊断" \
    "4) 🐳 配置 Podman 驱动" \
    "5) 🔄 重启集群" \
    "6) 显示访问信息" \
    "0) ❌ 退出" \
    ""
}
# Interactive entry point: show the menu, run the chosen action, repeat
# until the user picks 0.
# Exits: 0 on "0", 1 when stdin reaches end of input.
main() {
  local choice action

  echo ""
  echo "🚀 Nomad 集群管理工具"
  echo "==================="

  while true; do
    show_menu
    # -r keeps backslashes literal; end of input ends the tool explicitly
    # instead of relying on `set -e`.
    read -r -p "请选择操作 (0-6): " choice || exit 1

    action=""
    case "$choice" in
      1) action=show_cluster_status ;;
      2) action=show_jobs_status ;;
      3) action=run_diagnosis ;;
      4) action=configure_podman ;;
      5) action=restart_cluster ;;
      6) action=show_access_info ;;
      0)
        log_info "再见!"
        exit 0
        ;;
      *)
        log_error "无效选择,请重试"
        ;;
    esac

    if [[ -n "$action" ]]; then
      # A failing action (for example "no leader") used to terminate the
      # whole tool through `set -e`; report it and return to the menu.
      # Running the action on the left of `||` also means errexit is not
      # active inside it.
      "$action" || log_warning "操作未成功完成 (退出码: $?)"
    fi

    echo ""
    read -r -p "按回车键继续..." || exit 1
  done
}
# Run main only when this file is executed directly; when it is sourced,
# BASH_SOURCE[0] differs from $0 and only the functions are defined.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -1,39 +0,0 @@
#!/bin/bash
# Purge stale Nomad nodes: every node that is "down" AND whose name is on
# the fixed list below is removed through the HTTP API.
#
# Environment:
#   NOMAD_TOKEN - optional ACL token, sent as X-Nomad-Token
set -euo pipefail

ADDR="http://100.81.26.3:4646"

# curl arguments are kept in an array so the optional header stays a single
# argument. The former string form (HDR="-H "X-Nomad-Token: ..."") ended the
# quotes early and tried to execute the token as a command, and the request
# was then run through eval.
curl_args=(-sS -X POST -w ' -> HTTP %{http_code}\n')
if [[ -n "${NOMAD_TOKEN:-}" ]]; then
  curl_args+=(-H "X-Nomad-Token: ${NOMAD_TOKEN}")
fi

echo "--- 节点列表 (Before) ---"
nomad node status -address="$ADDR"
echo

echo "--- 开始查找需要清理的旧节点 ---"
# Select precisely with jq on the JSON node list:
# status is "down" and the name matches the list.
IDS_TO_PURGE=$(nomad node status -address="$ADDR" -json \
  | jq -r '.[] | select(.Status == "down" and (.Name | test("^(ch3|ch2|ash1d|ash2e|semaphore)$"))) | .ID')

if [[ -z "$IDS_TO_PURGE" ]]; then
  echo "✅ 未找到符合条件的 'down' 状态节点,无需清理。"
else
  echo "以下是待清理的节点 ID:"
  echo "$IDS_TO_PURGE"
  echo

  # One ID per line -> array; no reliance on word splitting.
  mapfile -t node_ids <<< "$IDS_TO_PURGE"
  for NODE_ID in "${node_ids[@]}"; do
    [[ -n "$NODE_ID" ]] || continue
    echo "===> 正在清理节点: $NODE_ID"
    curl "${curl_args[@]}" "$ADDR/v1/node/$NODE_ID/purge"
  done
fi

echo
echo "--- 节点列表 (After) ---"
nomad node status -address="$ADDR"

View File

@ -1,86 +0,0 @@
#!/bin/bash
# NFS configuration verification script.
set -e

echo "🔍 验证NFS配置状态..."

# ANSI colour escapes consumed by the log helpers.
NC='\033[0m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'

# Shared formatter: prints "<colour>[<LEVEL>]<reset> <message>" on stdout.
#   $1 - colour escape, $2 - level tag, $3 - message
_emit_log() {
  local colour=$1 level=$2 message=$3
  echo -e "${colour}[${level}]${NC} ${message}"
}

# Level-specific wrappers; each takes the message as $1.
log_info() { _emit_log "$GREEN" "INFO" "$1"; }
log_warn() { _emit_log "$YELLOW" "WARN" "$1"; }
log_error() { _emit_log "$RED" "ERROR" "$1"; }
# 1. Local NFS mount: look for the mount point in the df listing
log_info "1. 检查本地NFS挂载状态"
if df -h | grep -q "/mnt/fnsync"; then
  log_info "✅ 本地NFS挂载正常"
  df -h | grep "/mnt/fnsync"
else
  log_error "❌ 本地NFS未挂载"
fi

# 2. Presence of the files that make up the NFS setup
# (paths are relative to the current directory)
log_info "2. 检查配置文件"
config_files=(
  "playbooks/setup-nfs-by-container-type.yml"
  "playbooks/setup-nomad-nfs-client.yml"
  "jobs/nomad-nfs-multi-type.nomad"
  "scripts/deploy-nfs-for-nomad.sh"
  "docs/nomad-nfs-setup.md"
)
for file in "${config_files[@]}"; do
  if [ -f "$file" ]; then
    log_info "$file 存在"
  else
    log_error "$file 不存在"
  fi
done

# 3. Ansible inventory: show the first ten group headers
log_info "3. 检查Ansible配置"
if [ -f "configuration/inventories/production/inventory.ini" ]; then
  log_info "✅ inventory.ini 存在"
  echo "节点分类:"
  grep -E "\[.*\]" configuration/inventories/production/inventory.ini | head -10
else
  log_error "❌ inventory.ini 不存在"
fi

# 4. Nomad service state
log_info "4. 检查Nomad服务"
if command -v nomad &> /dev/null; then
  if nomad node status &> /dev/null; then
    log_info "✅ Nomad服务运行正常"
    # Under `set -e` a non-matching grep (or a failing -self query) used to
    # end the script silently at this line; report it and carry on.
    nomad node status -self | grep -E "(Name|Status|Datacenter)" \
      || log_warn "⚠️ 无法获取本节点信息"
  else
    log_warn "⚠️ Nomad服务未运行或无法连接"
  fi
else
  log_warn "⚠️ Nomad命令未安装"
fi

# 5. Reachability of the NFS server (host "snail")
log_info "5. 检查NFS服务器连通性"
if ping -c 1 -W 3 snail &> /dev/null; then
  log_info "✅ NFS服务器 snail 可达"
  if command -v showmount &> /dev/null; then
    showmount -e snail 2> /dev/null || log_warn "⚠️ 无法获取NFS导出列表"
  fi
else
  log_error "❌ NFS服务器 snail 不可达"
fi

echo ""
echo "📊 验证完成!"
echo ""
echo "🚀 下一步操作:"
echo "1. 运行部署脚本: ./scripts/deploy-nfs-for-nomad.sh"
echo "2. 查看详细文档: cat docs/nomad-nfs-setup.md"
echo "3. 测试NFS功能: nomad run jobs/nomad-nfs-multi-type.nomad"