diff --git a/infrastructure/environments/dev/main.tf b/infrastructure/environments/dev/main.tf deleted file mode 100644 index c4d12bc..0000000 --- a/infrastructure/environments/dev/main.tf +++ /dev/null @@ -1,52 +0,0 @@ -# 开发环境主配置文件 - -# 版本和提供商配置 -terraform { - required_version = ">= 1.6" - - required_providers { - # Oracle Cloud Infrastructure - oci = { - source = "oracle/oci" - version = "~> 5.0" - } - } - - # 后端配置 - backend "local" { - path = "terraform.tfstate" - } -} - -# Oracle Cloud 提供商配置 -provider "oci" { - tenancy_ocid = var.oci_config.tenancy_ocid - user_ocid = var.oci_config.user_ocid - fingerprint = var.oci_config.fingerprint - private_key_path = var.oci_config.private_key_path - region = var.oci_config.region -} - -# Oracle Cloud 基础设施 -module "oracle_cloud" { - source = "../../providers/oracle-cloud" - - # 传递变量 - environment = var.environment - project_name = var.project_name - owner = var.owner - vpc_cidr = var.vpc_cidr - availability_zones = var.availability_zones - common_tags = var.common_tags - oci_config = var.oci_config - - # 开发环境特定配置 - instance_count = 1 - instance_size = "VM.Standard.E2.1.Micro" # 免费层 -} - -# 输出 -output "oracle_cloud_outputs" { - description = "Oracle Cloud 基础设施输出" - value = module.oracle_cloud -} \ No newline at end of file diff --git a/scripts/deploy-nfs-for-nomad.sh b/scripts/deploy-nfs-for-nomad.sh deleted file mode 100755 index be308c8..0000000 --- a/scripts/deploy-nfs-for-nomad.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -# Nomad集群NFS配置部署脚本 -# 根据容器类型和地理位置进行分情况处理 - -set -e - -echo "🚀 开始部署Nomad集群NFS配置..." - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -# 函数:打印带颜色的消息 -log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } -log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } -log_error() { echo -e "${RED}[ERROR]${NC} $1"; } - -# 检查当前目录 -if [ ! -f "configuration/inventories/production/inventory.ini" ]; then - log_error "请在mgmt项目根目录运行此脚本" - exit 1 -fi - -# 1. 为所有节点配置NFS挂载 -log_info "步骤1: 为所有节点配置NFS挂载 (根据容器类型和地理位置)" -ansible-playbook -i configuration/inventories/production/inventory.ini \ - playbooks/setup-nfs-by-container-type.yml - -# 2. 为Nomad客户端配置NFS卷支持 -log_info "步骤2: 配置Nomad客户端支持NFS卷" -ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ - playbooks/setup-nomad-nfs-client.yml - -# 3. 验证NFS挂载状态 -log_info "步骤3: 验证所有节点的NFS挂载状态" -ansible all -i configuration/inventories/production/inventory.ini \ - -m shell -a "df -h /mnt/fnsync 2>/dev/null || echo 'NFS未挂载'" \ - --limit '!snail' - -# 4. 验证Nomad客户端配置 -log_info "步骤4: 验证Nomad客户端配置" -ansible nomad_clients -i configuration/inventories/production/nomad-cluster.ini \ - -m shell -a "nomad node status -self 2>/dev/null || echo 'Nomad未运行'" - -# 5. 部署示例NFS任务(可选) -read -p "是否部署示例NFS任务?(y/n): " deploy_example -if [ "$deploy_example" = "y" ] || [ "$deploy_example" = "Y" ]; then - log_info "部署示例NFS任务..." - nomad run jobs/nomad-nfs-multi-type.nomad - echo "等待任务启动..." - sleep 10 - nomad job status nfs-multi-type-example -fi - -log_info "✅ NFS配置部署完成!" -echo "" -echo "📋 使用说明:" -echo "1. NFS挂载点: /mnt/fnsync" -echo "2. 本地LXC容器: 直接使用挂载目录" -echo "3. 海外PVE容器: 使用优化参数挂载" -echo "4. Nomad作业: 使用host volume 'nfs-shared'" -echo "" -echo "🔧 手动验证命令:" -echo " - 检查NFS挂载: df -h /mnt/fnsync" -echo " - 检查Nomad状态: nomad node status" -echo " - 运行NFS任务: nomad run jobs/nomad-nfs-multi-type.nomad" \ No newline at end of file diff --git a/scripts/distribute-keys.sh b/scripts/distribute-keys.sh deleted file mode 100644 index f626114..0000000 --- a/scripts/distribute-keys.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -# 分发SSH公钥到所有Nomad节点 -echo "分发SSH公钥到Nomad节点..." - -# 节点列表 -NODES=( - "100.81.26.3" # ash1d.global - "100.103.147.94" # ash2e.global - "100.90.159.68" # ch2.global - "100.86.141.112" # ch3.global - "100.117.106.136" # master - "100.116.80.94" # ash3c -) - -PUB_KEY=$(cat /home/ben/.ssh/id_ed25519.pub) - -for NODE in "${NODES[@]}"; do - echo "正在配置节点: $NODE" - - # 尝试使用现有密钥连接并添加新密钥 - ssh-keyscan -H $NODE >> ~/.ssh/known_hosts 2>/dev/null - - # 使用现有认证方式添加密钥 - ssh root@$NODE "echo '$PUB_KEY' >> /root/.ssh/authorized_keys" 2>/dev/null && \ - echo "✓ $NODE 配置成功" || echo "✗ $NODE 配置失败" -done - -echo "密钥分发完成" \ No newline at end of file diff --git a/scripts/setup-ssh-keys.yml b/scripts/setup-ssh-keys.yml deleted file mode 100644 index eeeb595..0000000 --- a/scripts/setup-ssh-keys.yml +++ /dev/null @@ -1,22 +0,0 @@ ---- -- name: 设置Nomad节点SSH密钥认证 - hosts: nomad_nodes - become: yes - vars: - ssh_public_key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIIddJVPEvFRtzhWwYjr21lKTar+d7R5Kn/6bhd2s231 ben@ch2" - - tasks: - - name: 确保.ssh目录存在 - file: - path: /root/.ssh - state: directory - mode: '0700' - - - name: 添加SSH公钥到authorized_keys - authorized_key: - user: root - state: present - key: "{{ ssh_public_key }}" - - - name: 测试SSH连接 - ping: \ No newline at end of file diff --git a/scripts/setup/setup-nomad-laptop.sh b/scripts/setup/setup-nomad-laptop.sh deleted file mode 100755 index 7337e51..0000000 --- a/scripts/setup/setup-nomad-laptop.sh +++ /dev/null @@ -1,230 +0,0 @@ -#!/bin/bash - -# Nomad 笔记本设置脚本 - Mac/Linux 版本 -# 用于将 Mac 或 Linux 笔记本加入 Nomad 集群作为 server - -set -e - -# 配置变量 -NOMAD_VERSION="1.10.5" -NOMAD_DATACENTER="dc1" -NOMAD_ENCRYPT_KEY="NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" - -# 检测操作系统 -OS=$(uname -s | tr '[:upper:]' '[:lower:]') -ARCH=$(uname -m) - -case $ARCH in - x86_64) ARCH="amd64" ;; - arm64|aarch64) ARCH="arm64" ;; - *) echo "不支持的架构: $ARCH"; exit 1 ;; -esac - -echo "🚀 开始设置 Nomad ($OS-$ARCH)..." - -# 1. 检查 Tailscale -echo "📡 检查 Tailscale 连接..." -if ! command -v tailscale &> /dev/null; then - echo "❌ 请先安装 Tailscale" - exit 1 -fi - -TAILSCALE_IP=$(tailscale ip | head -1) -if [ -z "$TAILSCALE_IP" ]; then - echo "❌ Tailscale 未连接,请先运行: tailscale up" - exit 1 -fi - -echo "✅ Tailscale IP: $TAILSCALE_IP" - -# 2. 安装 Nomad(如果需要) -if ! command -v nomad &> /dev/null; then - echo "📦 安装 Nomad $NOMAD_VERSION..." - - if [[ "$OS" == "darwin" ]]; then - # macOS - if command -v brew &> /dev/null; then - brew install nomad - else - echo "❌ 请先安装 Homebrew 或手动安装 Nomad" - exit 1 - fi - else - # Linux - NOMAD_URL="https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_${OS}_${ARCH}.zip" - curl -L "$NOMAD_URL" -o nomad.zip - unzip nomad.zip - sudo mv nomad /usr/local/bin/ - rm nomad.zip - fi -fi - -echo "✅ Nomad 版本: $(nomad version)" - -# 3. 创建配置目录 -echo "📁 创建配置目录..." -sudo mkdir -p /etc/nomad.d /opt/nomad/data -sudo chown -R $(whoami):$(id -gn) /etc/nomad.d /opt/nomad/data - -# 4. 生成 Nomad 配置 -echo "⚙️ 生成 Nomad 配置..." -cat > /etc/nomad.d/nomad.hcl << EOF -datacenter = "$NOMAD_DATACENTER" -data_dir = "/opt/nomad/data" -log_level = "INFO" - -bind_addr = "$TAILSCALE_IP" - -addresses { - http = "0.0.0.0" - rpc = "$TAILSCALE_IP" - serf = "$TAILSCALE_IP" -} - -ports { - http = 4646 - rpc = 4647 - serf = 4648 -} - -server { - enabled = true - bootstrap_expect = 6 - - retry_join = [ - "100.116.158.95", # semaphore - "100.117.106.136", # master (现在是 client) - "100.116.80.94" # ash3c (现在是 client) - ] - - encrypt = "$NOMAD_ENCRYPT_KEY" -} - -client { - enabled = false -} - -# 如果是 macOS,可能需要 Docker 插件 -plugin "podman" { - config { - volumes { - enabled = true - } - } -} - -consul { - address = "$TAILSCALE_IP:8500" -} -EOF - -echo "✅ 配置文件已生成: /etc/nomad.d/nomad.hcl" - -# 5. 创建启动脚本(macOS 不使用 systemd) -if [[ "$OS" == "darwin" ]]; then - # macOS - 创建 LaunchDaemon - echo "🍎 创建 macOS LaunchDaemon..." - sudo tee /Library/LaunchDaemons/io.nomadproject.nomad.plist > /dev/null << EOF - - - - - Label - io.nomadproject.nomad - ProgramArguments - - /usr/local/bin/nomad - agent - -config=/etc/nomad.d/nomad.hcl - - RunAtLoad - - KeepAlive - - StandardOutPath - /var/log/nomad.log - StandardErrorPath - /var/log/nomad.log - - -EOF - - # 加载并启动服务 - sudo launchctl load /Library/LaunchDaemons/io.nomadproject.nomad.plist - sudo launchctl start io.nomadproject.nomad - -else - # Linux - 创建 systemd 服务 - echo "🐧 创建 systemd 服务..." - sudo tee /etc/systemd/system/nomad.service > /dev/null << EOF -[Unit] -Description=Nomad -Documentation=https://www.nomadproject.io/ -Requires=network-online.target -After=network-online.target - -[Service] -Type=notify -User=$(whoami) -Group=$(id -gn) -ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d/nomad.hcl -ExecReload=/bin/kill -HUP \$MAINPID -KillMode=process -Restart=on-failure -LimitNOFILE=65536 - -[Install] -WantedBy=multi-user.target -EOF - - # 启动服务 - sudo systemctl daemon-reload - sudo systemctl enable nomad - sudo systemctl start nomad -fi - -# 6. 验证安装 -echo "🔍 验证 Nomad 服务..." -sleep 5 - -if [[ "$OS" == "darwin" ]]; then - if sudo launchctl list | grep -q nomad; then - echo "✅ Nomad 服务已启动" - else - echo "❌ Nomad 服务启动失败" - exit 1 - fi -else - if systemctl is-active --quiet nomad; then - echo "✅ Nomad 服务已启动" - else - echo "❌ Nomad 服务启动失败" - sudo systemctl status nomad - exit 1 - fi -fi - -# 7. 检查集群状态 -echo "🌐 检查集群连接..." -sleep 10 - -if nomad server members 2>/dev/null | grep -q alive; then - echo "✅ 成功加入 Nomad 集群!" - nomad server members -else - echo "⚠️ 正在连接集群,请稍等..." - echo "可以运行以下命令检查状态:" - echo " nomad server members" - echo " nomad node status" -fi - -echo "" -echo "🎉 设置完成!" -echo "📊 Web UI: http://$TAILSCALE_IP:4646" -echo "🔧 配置文件: /etc/nomad.d/nomad.hcl" -echo "📝 日志查看:" -if [[ "$OS" == "darwin" ]]; then - echo " tail -f /var/log/nomad.log" -else - echo " sudo journalctl -u nomad -f" -fi \ No newline at end of file diff --git a/scripts/setup/setup-nomad-windows.ps1 b/scripts/setup/setup-nomad-windows.ps1 deleted file mode 100644 index 241e9cd..0000000 --- a/scripts/setup/setup-nomad-windows.ps1 +++ /dev/null @@ -1,212 +0,0 @@ -# Nomad Windows 设置脚本 -# 用于将 Windows 笔记本加入 Nomad 集群作为 server - -param( - [string]$NomadVersion = "1.10.5", - [string]$DataCenter = "dc1", - [string]$EncryptKey = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" -) - -# 需要管理员权限 -if (-NOT ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator")) { - Write-Host "❌ 此脚本需要管理员权限运行" -ForegroundColor Red - Write-Host "请以管理员身份运行 PowerShell" -ForegroundColor Yellow - exit 1 -} - -Write-Host "🚀 开始设置 Windows Nomad Server..." -ForegroundColor Green - -# 1. 检查 Tailscale -Write-Host "📡 检查 Tailscale 连接..." -ForegroundColor Cyan -try { - $tailscaleIP = (tailscale ip) | Select-Object -First 1 - if ([string]::IsNullOrEmpty($tailscaleIP)) { - throw "Tailscale IP 为空" - } - Write-Host "✅ Tailscale IP: $tailscaleIP" -ForegroundColor Green -} catch { - Write-Host "❌ Tailscale 未安装或未连接" -ForegroundColor Red - Write-Host "请先安装 Tailscale 并运行: tailscale up" -ForegroundColor Yellow - exit 1 -} - -# 2. 创建目录 -Write-Host "📁 创建 Nomad 目录..." -ForegroundColor Cyan -$nomadDir = "C:\nomad" -$configDir = "$nomadDir\config" -$dataDir = "$nomadDir\data" -$binDir = "$nomadDir\bin" - -New-Item -ItemType Directory -Force -Path $configDir | Out-Null -New-Item -ItemType Directory -Force -Path $dataDir | Out-Null -New-Item -ItemType Directory -Force -Path $binDir | Out-Null - -# 3. 下载 Nomad(如果需要) -$nomadExe = "$binDir\nomad.exe" -if (-not (Test-Path $nomadExe)) { - Write-Host "📦 下载 Nomad $NomadVersion..." -ForegroundColor Cyan - $nomadUrl = "https://releases.hashicorp.com/nomad/$NomadVersion/nomad_${NomadVersion}_windows_amd64.zip" - $zipPath = "$env:TEMP\nomad.zip" - - try { - Invoke-WebRequest -Uri $nomadUrl -OutFile $zipPath - Expand-Archive -Path $zipPath -DestinationPath $binDir -Force - Remove-Item $zipPath - Write-Host "✅ Nomad 下载完成" -ForegroundColor Green - } catch { - Write-Host "❌ 下载 Nomad 失败: $_" -ForegroundColor Red - exit 1 - } -} - -# 4. 添加到 PATH(如果需要) -$currentPath = [Environment]::GetEnvironmentVariable("PATH", "Machine") -if ($currentPath -notlike "*$binDir*") { - Write-Host "🔧 添加 Nomad 到系统 PATH..." -ForegroundColor Cyan - [Environment]::SetEnvironmentVariable("PATH", "$currentPath;$binDir", "Machine") - $env:PATH += ";$binDir" -} - -# 5. 生成配置文件 -Write-Host "⚙️ 生成 Nomad 配置..." -ForegroundColor Cyan -$configContent = @" -datacenter = "$DataCenter" -data_dir = "$($dataDir -replace '\\', '/')" -log_level = "INFO" - -bind_addr = "$tailscaleIP" - -addresses { - http = "0.0.0.0" - rpc = "$tailscaleIP" - serf = "$tailscaleIP" -} - -ports { - http = 4646 - rpc = 4647 - serf = 4648 -} - -server { - enabled = true - bootstrap_expect = 6 - - retry_join = [ - "100.116.158.95", # semaphore - "100.117.106.136", # master - "100.116.80.94" # ash3c - ] - - encrypt = "$EncryptKey" -} - -client { - enabled = false -} - -plugin "podman" { - config { - volumes { - enabled = true - } - } -} - -consul { - address = "$tailscaleIP:8500" -} -"@ - -$configFile = "$configDir\nomad.hcl" -$configContent | Out-File -FilePath $configFile -Encoding UTF8 -Write-Host "✅ 配置文件已生成: $configFile" -ForegroundColor Green - -# 6. 创建 Windows 服务 -Write-Host "🔧 创建 Windows 服务..." -ForegroundColor Cyan - -# 先停止并删除现有服务(如果存在) -try { - Stop-Service -Name "Nomad" -ErrorAction SilentlyContinue - & sc.exe delete "Nomad" 2>$null -} catch {} - -# 创建新服务 -$serviceName = "Nomad" -$serviceDisplayName = "HashiCorp Nomad" -$serviceDescription = "HashiCorp Nomad Agent" -$serviceCommand = "`"$nomadExe`" agent -config=`"$configFile`"" - -try { - & sc.exe create $serviceName binPath= $serviceCommand DisplayName= $serviceDisplayName start= auto - & sc.exe description $serviceName $serviceDescription - - # 配置服务恢复选项 - & sc.exe failure $serviceName reset= 30 actions= restart/5000/restart/5000/restart/5000 - - Write-Host "✅ Windows 服务已创建" -ForegroundColor Green -} catch { - Write-Host "❌ 创建服务失败: $_" -ForegroundColor Red - exit 1 -} - -# 7. 启动服务 -Write-Host "🚀 启动 Nomad 服务..." -ForegroundColor Cyan -try { - Start-Service -Name $serviceName - Write-Host "✅ Nomad 服务已启动" -ForegroundColor Green -} catch { - Write-Host "❌ 启动服务失败: $_" -ForegroundColor Red - Write-Host "检查服务状态: Get-Service Nomad" -ForegroundColor Yellow - exit 1 -} - -# 8. 验证安装 -Write-Host "🔍 验证 Nomad 服务..." -ForegroundColor Cyan -Start-Sleep -Seconds 10 - -try { - $serviceStatus = Get-Service -Name $serviceName - if ($serviceStatus.Status -eq "Running") { - Write-Host "✅ Nomad 服务运行正常" -ForegroundColor Green - } else { - Write-Host "❌ Nomad 服务状态异常: $($serviceStatus.Status)" -ForegroundColor Red - } -} catch { - Write-Host "❌ 检查服务状态失败: $_" -ForegroundColor Red -} - -# 9. 检查集群连接 -Write-Host "🌐 检查集群连接..." -ForegroundColor Cyan -Start-Sleep -Seconds 15 - -try { - & $nomadExe server members - Write-Host "✅ 成功加入 Nomad 集群!" -ForegroundColor Green -} catch { - Write-Host "⚠️ 正在连接集群,请稍等..." -ForegroundColor Yellow - Write-Host "可以运行以下命令检查状态:" -ForegroundColor Cyan - Write-Host " nomad server members" -ForegroundColor White - Write-Host " nomad node status" -ForegroundColor White -} - -# 10. 防火墙规则 -Write-Host "🔥 配置防火墙规则..." -ForegroundColor Cyan -try { - New-NetFirewallRule -DisplayName "Nomad HTTP" -Direction Inbound -Protocol TCP -LocalPort 4646 -Action Allow -ErrorAction SilentlyContinue - New-NetFirewallRule -DisplayName "Nomad RPC" -Direction Inbound -Protocol TCP -LocalPort 4647 -Action Allow -ErrorAction SilentlyContinue - New-NetFirewallRule -DisplayName "Nomad Serf" -Direction Inbound -Protocol TCP -LocalPort 4648 -Action Allow -ErrorAction SilentlyContinue - Write-Host "✅ 防火墙规则已配置" -ForegroundColor Green -} catch { - Write-Host "⚠️ 防火墙规则配置可能失败,请手动检查" -ForegroundColor Yellow -} - -Write-Host "" -Write-Host "🎉 Windows Nomad Server 设置完成!" -ForegroundColor Green -Write-Host "📊 Web UI: http://$tailscaleIP:4646" -ForegroundColor Cyan -Write-Host "🔧 配置文件: $configFile" -ForegroundColor Cyan -Write-Host "📝 服务管理:" -ForegroundColor Cyan -Write-Host " 启动: Start-Service Nomad" -ForegroundColor White -Write-Host " 停止: Stop-Service Nomad" -ForegroundColor White -Write-Host " 状态: Get-Service Nomad" -ForegroundColor White -Write-Host " 日志: Get-EventLog -LogName Application -Source Nomad" -ForegroundColor White \ No newline at end of file diff --git a/scripts/utilities/check-nomad-cluster.sh b/scripts/utilities/check-nomad-cluster.sh deleted file mode 100755 index 7286a83..0000000 --- a/scripts/utilities/check-nomad-cluster.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -echo "=== Nomad 集群状态检查 ===" - -# 检查所有节点的服务状态 -echo "1. 检查服务状态..." -ansible nomad_cluster -i /root/mgmt/configuration/inventories/production/nomad-cluster.ini -m shell -a "systemctl is-active nomad" 2>/dev/null - -echo -e "\n2. 检查网络连通性..." -# 检查网络连通性 -for ip in 100.116.158.95 100.117.106.136 100.116.80.94; do - echo "检查到 $ip 的连接..." - timeout 5 nc -zv $ip 4646 2>&1 | grep -E "(succeeded|open)" - timeout 5 nc -zv $ip 4647 2>&1 | grep -E "(succeeded|open)" - timeout 5 nc -zv $ip 4648 2>&1 | grep -E "(succeeded|open)" -done - -echo -e "\n3. 检查 Nomad 集群成员..." -# 尝试查询集群成员 -if nomad server members 2>/dev/null; then - echo "集群成员查询成功" -else - echo "无法查询集群成员 - 可能没有 leader" -fi - -echo -e "\n4. 检查节点状态..." -if nomad node status 2>/dev/null; then - echo "节点状态查询成功" -else - echo "无法查询节点状态" -fi - -echo -e "\n5. 检查最近的日志..." -echo "=== Semaphore 节点日志 ===" -journalctl -u nomad -n 5 --no-pager 2>/dev/null | tail -5 - -echo -e "\n=== 检查完成 ===" \ No newline at end of file diff --git a/scripts/utilities/cleanup-retired-nodes.sh b/scripts/utilities/cleanup-retired-nodes.sh deleted file mode 100644 index d2f3cde..0000000 --- a/scripts/utilities/cleanup-retired-nodes.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# 清理退役节点脚本 -# 创建日期: 2025-09-27 -# 执行日期: 2025-10-27 (一个月后) - -set -e - -NOMAD_ADDR=${NOMAD_ADDR:-"http://100.116.158.95:4646"} - -echo "=== 清理退役节点脚本 ===" -echo "执行时间: $(date)" -echo "Nomad 地址: $NOMAD_ADDR" -echo "" - -# 退役节点列表 -RETIRED_NODES=( - "583f1b77:semaphore:已转为纯server" - "06bb8a3a:hcs:华为云节点退役" -) - -echo "准备清理以下退役节点:" -for node_info in "${RETIRED_NODES[@]}"; do - IFS=':' read -r node_id node_name reason <<< "$node_info" - echo " - $node_name ($node_id): $reason" -done -echo "" - -read -p "确认要清理这些节点吗? (y/N): " confirm -if [[ $confirm != [yY] ]]; then - echo "操作已取消" - exit 0 -fi - -echo "开始清理退役节点..." - -for node_info in "${RETIRED_NODES[@]}"; do - IFS=':' read -r node_id node_name reason <<< "$node_info" - - echo "处理节点: $node_name ($node_id)" - - # 检查节点状态 - if nomad node status "$node_id" >/dev/null 2>&1; then - echo " - 节点存在,开始清理..." - - # 确保节点已 drain - echo " - 确保节点已 drain..." - nomad node drain -enable -yes "$node_id" || true - - # 禁用调度 - echo " - 禁用调度资格..." - nomad node eligibility -disable "$node_id" || true - - # 等待一段时间确保所有任务已迁移 - echo " - 等待任务迁移完成..." - sleep 10 - - echo " - 节点 $node_name 已成功清理" - else - echo " - 节点不存在或已被清理" - fi - echo "" -done - -echo "=== 清理完成 ===" -echo "请手动验证集群状态:" -echo " nomad node status" -echo " nomad server members" -echo "" -echo "如需彻底删除节点记录,请联系管理员" \ No newline at end of file diff --git a/scripts/utilities/disk-monitor.sh b/scripts/utilities/disk-monitor.sh deleted file mode 100755 index 799838c..0000000 --- a/scripts/utilities/disk-monitor.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -# 磁盘监控脚本 -# 使用方法: ./disk-monitor.sh [threshold] - -THRESHOLD=${1:-85} # 默认阈值 85% -INVENTORY_FILE="configuration/inventories/production/nomad-cluster.ini" - -echo "🔍 开始磁盘空间监控 (阈值: ${THRESHOLD}%)" -echo "==================================" - -# 运行磁盘分析 -echo "📊 运行磁盘分析..." -ansible-playbook -i "$INVENTORY_FILE" configuration/playbooks/disk-analysis-ncdu.yml - -echo "" -echo "⚠️ 检查高磁盘使用率节点..." - -# 检查所有节点的磁盘使用情况 -ansible all -i "$INVENTORY_FILE" -m shell -a "df -h | awk 'NR>1 {gsub(/%/, \"\", \$5); if(\$5 > $THRESHOLD) print \$0}'" | while read line; do - if [[ $line == *"=>"* ]]; then - echo "🚨 节点: $line" - elif [[ $line =~ ^/dev ]]; then - echo " 高使用率磁盘: $line" - fi -done - -echo "" -echo "💡 如需清理,运行:" -echo " ansible-playbook -i $INVENTORY_FILE configuration/playbooks/disk-cleanup.yml" -echo "" -echo "📁 详细报告位置: /tmp/disk-analysis/" -echo " 使用 ncdu -f /tmp/disk-analysis/ncdu-root-.json 查看详细信息" \ No newline at end of file diff --git a/scripts/utilities/nomad-cluster-manager.sh b/scripts/utilities/nomad-cluster-manager.sh deleted file mode 100755 index 9a71e99..0000000 --- a/scripts/utilities/nomad-cluster-manager.sh +++ /dev/null @@ -1,227 +0,0 @@ -#!/bin/bash - -# 🚀 Nomad 集群管理脚本 -# Nomad Cluster Management Script - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -PURPLE='\033[0;35m' -CYAN='\033[0;36m' -NC='\033[0m' # No Color - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -log_header() { - echo -e "${PURPLE}=== $1 ===${NC}" -} - -# 显示集群状态 -show_cluster_status() { - log_header "Nomad 集群状态概览" - - # 检查 Leader - echo -e "${CYAN}Leader 状态:${NC}" - LEADER=$(curl -s http://localhost:4646/v1/status/leader 2>/dev/null || echo "无法连接") - if [[ "$LEADER" =~ ^\".*\"$ ]]; then - echo " ✅ Leader: $(echo $LEADER | tr -d '\"')" - else - echo " ❌ 无 Leader 或连接失败" - return 1 - fi - - echo "" - - # 节点状态 - echo -e "${CYAN}节点状态:${NC}" - curl -s http://localhost:4646/v1/nodes 2>/dev/null | jq -r '.[] | " \(.Status == "ready" and "✅" or "❌") \(.Name) (\(.Address)) - \(.Status)"' 2>/dev/null || { - log_warning "无法获取节点状态详情" - nomad node status 2>/dev/null || echo " ❌ 命令执行失败" - } - - echo "" - - # 驱动状态 - echo -e "${CYAN}驱动程序状态:${NC}" - curl -s http://localhost:4646/v1/nodes 2>/dev/null | jq -r ' - .[] | - " 节点: \(.Name)" as $node | - .Drivers | - to_entries[] | - " \(.value.Healthy and "✅" or "❌") \(.key): \(.value.HealthDescription // "未知")" - ' 2>/dev/null || { - log_warning "无法获取驱动状态详情" - } -} - -# 显示作业状态 -show_jobs_status() { - log_header "作业状态" - - JOBS=$(curl -s http://localhost:4646/v1/jobs 2>/dev/null) - if [[ "$?" -eq 0 ]] && [[ "$JOBS" != "[]" ]] && [[ "$JOBS" != "null" ]]; then - echo "$JOBS" | jq -r '.[] | " \(.Status == "running" and "✅" or "❌") \(.Name) - \(.Status)"' 2>/dev/null - else - echo " 📝 当前没有运行的作业" - fi -} - -# 显示访问信息 -show_access_info() { - log_header "访问信息" - - echo -e "${CYAN}Web UI:${NC}" - echo " 🌐 http://100.116.158.95:4646" - echo "" - - echo -e "${CYAN}API 端点:${NC}" - echo " 🔗 http://100.116.158.95:4646/v1/" - echo "" - - echo -e "${CYAN}常用命令:${NC}" - echo " 📊 nomad status # 查看集群概览" - echo " 🖥️ nomad node status # 查看节点状态" - echo " 🔧 nomad server members # 查看服务器成员" - echo " 📋 nomad job status # 查看作业状态" - echo " 🚀 nomad job run # 运行作业" - echo " 📜 journalctl -u nomad -f # 查看日志" -} - -# 运行诊断 -run_diagnosis() { - log_header "运行完整诊断" - - if [[ -f "$PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh" ]]; then - bash "$PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh" - else - log_error "诊断脚本未找到" - return 1 - fi -} - -# 配置 Podman 驱动 -configure_podman() { - log_header "配置所有节点使用 Podman 驱动" - - local playbook="$PROJECT_ROOT/configuration/playbooks/configure-nomad-podman-cluster.yml" - local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" - - if [[ ! -f "$playbook" ]]; then - log_error "Playbook 文件不存在: $playbook" - return 1 - fi - - if [[ ! -f "$inventory" ]]; then - log_error "Inventory 文件不存在: $inventory" - return 1 - fi - - cd "$PROJECT_ROOT/configuration" - python3 -m ansible playbook -i "$inventory" "$playbook" -v -} - -# 重启集群 -restart_cluster() { - log_header "重启 Nomad 集群" - - log_warning "这将重启整个 Nomad 集群" - read -p "确认继续? (y/N): " -n 1 -r - echo "" - - if [[ $REPLY =~ ^[Yy]$ ]]; then - local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" - cd "$PROJECT_ROOT/configuration" - python3 -m ansible adhoc -i "$inventory" nomad_cluster -m systemd -a "name=nomad state=restarted" --become - - log_info "等待集群启动..." - sleep 15 - show_cluster_status - else - log_info "操作已取消" - fi -} - -# 主菜单 -show_menu() { - echo "" - log_header "Nomad 集群管理菜单" - echo "" - echo "1) 📊 显示集群状态" - echo "2) 📋 显示作业状态" - echo "3) 🔍 运行完整诊断" - echo "4) 🐳 配置 Podman 驱动" - echo "5) 🔄 重启集群" - echo "6) ℹ️ 显示访问信息" - echo "0) ❌ 退出" - echo "" -} - -# 主函数 -main() { - echo "" - echo "🚀 Nomad 集群管理工具" - echo "===================" - - while true; do - show_menu - read -p "请选择操作 (0-6): " choice - - case $choice in - 1) - show_cluster_status - ;; - 2) - show_jobs_status - ;; - 3) - run_diagnosis - ;; - 4) - configure_podman - ;; - 5) - restart_cluster - ;; - 6) - show_access_info - ;; - 0) - log_info "再见!" - exit 0 - ;; - *) - log_error "无效选择,请重试" - ;; - esac - - echo "" - read -p "按回车键继续..." -r - done -} - -# 如果直接运行脚本 -if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - main "$@" -fi \ No newline at end of file diff --git a/scripts/utilities/purge_stale_nodes.sh b/scripts/utilities/purge_stale_nodes.sh deleted file mode 100755 index 4ff0f6b..0000000 --- a/scripts/utilities/purge_stale_nodes.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -set -euo pipefail - -ADDR="http://100.81.26.3:4646" -# 检查 NOMAD_TOKEN 是否设置,如果设置了,则准备好 Header -HDR="" -if [ -n "${NOMAD_TOKEN:-}" ]; then - HDR="-H "X-Nomad-Token: $NOMAD_TOKEN"" -fi - -echo "--- 节点列表 (Before) ---" -nomad node status -address="$ADDR" - -echo -echo "--- 开始查找需要清理的旧节点 ---" - -# 使用 jq 从 nomad node status 的 json 输出中精确查找 -# 条件: 状态为 "down" 且 名称匹配列表 -IDS_TO_PURGE=$(nomad node status -address="$ADDR" -json | jq -r '.[] | select(.Status == "down" and (.Name | test("^(ch3|ch2|ash1d|ash2e|semaphore)$"))) | .ID') - -if [[ -z "$IDS_TO_PURGE" ]]; then - echo "✅ 未找到符合条件的 'down' 状态节点,无需清理。" -else - echo "以下是待清理的节点 ID:" - echo "$IDS_TO_PURGE" - echo - - # 循环遍历 ID,使用 curl 调用 HTTP API 进行 purge - for NODE_ID in $IDS_TO_PURGE; do - echo "===> 正在清理节点: $NODE_ID" - # 构造 curl 命令,并使用 eval 来正确处理可能为空的 $HDR - cmd="curl -sS -XPOST $HDR -w ' -> HTTP %{http_code}\n' '$ADDR/v1/node/$NODE_ID/purge'" - eval $cmd - done -fi - -echo -echo "--- 节点列表 (After) ---" -nomad node status -address="$ADDR" \ No newline at end of file diff --git a/scripts/verify-nfs-config.sh b/scripts/verify-nfs-config.sh deleted file mode 100755 index bcb41bb..0000000 --- a/scripts/verify-nfs-config.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash - -# NFS配置验证脚本 - -set -e - -echo "🔍 验证NFS配置状态..." - -# 颜色定义 -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -RED='\033[0;31m' -NC='\033[0m' - -log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } -log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } -log_error() { echo -e "${RED}[ERROR]${NC} $1"; } - -# 1. 检查本地NFS挂载 -log_info "1. 检查本地NFS挂载状态" -if df -h | grep -q "/mnt/fnsync"; then - log_info "✅ 本地NFS挂载正常" - df -h | grep "/mnt/fnsync" -else - log_error "❌ 本地NFS未挂载" -fi - -# 2. 检查配置文件存在 -log_info "2. 检查配置文件" -config_files=( - "playbooks/setup-nfs-by-container-type.yml" - "playbooks/setup-nomad-nfs-client.yml" - "jobs/nomad-nfs-multi-type.nomad" - "scripts/deploy-nfs-for-nomad.sh" - "docs/nomad-nfs-setup.md" -) - -for file in "${config_files[@]}"; do - if [ -f "$file" ]; then - log_info "✅ $file 存在" - else - log_error "❌ $file 不存在" - fi -done - -# 3. 检查Ansible inventory -log_info "3. 检查Ansible配置" -if [ -f "configuration/inventories/production/inventory.ini" ]; then - log_info "✅ inventory.ini 存在" - echo "节点分类:" - grep -E "\[.*\]" configuration/inventories/production/inventory.ini | head -10 -else - log_error "❌ inventory.ini 不存在" -fi - -# 4. 检查Nomad服务状态 -log_info "4. 检查Nomad服务" -if command -v nomad &> /dev/null; then - if nomad node status &> /dev/null; then - log_info "✅ Nomad服务运行正常" - nomad node status -self | grep -E "(Name|Status|Datacenter)" - else - log_warn "⚠️ Nomad服务未运行或无法连接" - fi -else - log_warn "⚠️ Nomad命令未安装" -fi - -# 5. 检查NFS服务器连通性 -log_info "5. 检查NFS服务器连通性" -if ping -c 1 -W 3 snail &> /dev/null; then - log_info "✅ NFS服务器 snail 可达" - if command -v showmount &> /dev/null; then - showmount -e snail 2>/dev/null || log_warn "⚠️ 无法获取NFS导出列表" - fi -else - log_error "❌ NFS服务器 snail 不可达" -fi - -echo "" -echo "📊 验证完成!" -echo "" -echo "🚀 下一步操作:" -echo "1. 运行部署脚本: ./scripts/deploy-nfs-for-nomad.sh" -echo "2. 查看详细文档: cat docs/nomad-nfs-setup.md" -echo "3. 测试NFS功能: nomad run jobs/nomad-nfs-multi-type.nomad" \ No newline at end of file