diff --git a/infrastructure/environments/dev/main.tf b/infrastructure/environments/dev/main.tf
deleted file mode 100644
index c4d12bc..0000000
--- a/infrastructure/environments/dev/main.tf
+++ /dev/null
@@ -1,52 +0,0 @@
-# 开发环境主配置文件
-
-# 版本和提供商配置
-terraform {
- required_version = ">= 1.6"
-
- required_providers {
- # Oracle Cloud Infrastructure
- oci = {
- source = "oracle/oci"
- version = "~> 5.0"
- }
- }
-
- # 后端配置
- backend "local" {
- path = "terraform.tfstate"
- }
-}
-
-# Oracle Cloud 提供商配置
-provider "oci" {
- tenancy_ocid = var.oci_config.tenancy_ocid
- user_ocid = var.oci_config.user_ocid
- fingerprint = var.oci_config.fingerprint
- private_key_path = var.oci_config.private_key_path
- region = var.oci_config.region
-}
-
-# Oracle Cloud 基础设施
-module "oracle_cloud" {
- source = "../../providers/oracle-cloud"
-
- # 传递变量
- environment = var.environment
- project_name = var.project_name
- owner = var.owner
- vpc_cidr = var.vpc_cidr
- availability_zones = var.availability_zones
- common_tags = var.common_tags
- oci_config = var.oci_config
-
- # 开发环境特定配置
- instance_count = 1
- instance_size = "VM.Standard.E2.1.Micro" # 免费层
-}
-
-# 输出
-output "oracle_cloud_outputs" {
- description = "Oracle Cloud 基础设施输出"
- value = module.oracle_cloud
-}
\ No newline at end of file
diff --git a/scripts/deploy-nfs-for-nomad.sh b/scripts/deploy-nfs-for-nomad.sh
deleted file mode 100755
index be308c8..0000000
--- a/scripts/deploy-nfs-for-nomad.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-
-# Nomad集群NFS配置部署脚本
-# 根据容器类型和地理位置进行分情况处理
-
-set -e
-
-echo "🚀 开始部署Nomad集群NFS配置..."
-
-# 颜色定义
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-NC='\033[0m' # No Color
-
-# 函数:打印带颜色的消息
-log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
-log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
-log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
-
-# 检查当前目录
-if [ ! -f "configuration/inventories/production/inventory.ini" ]; then
- log_error "请在mgmt项目根目录运行此脚本"
- exit 1
-fi
-
-# 1. 为所有节点配置NFS挂载
-log_info "步骤1: 为所有节点配置NFS挂载 (根据容器类型和地理位置)"
-ansible-playbook -i configuration/inventories/production/inventory.ini \
- playbooks/setup-nfs-by-container-type.yml
-
-# 2. 为Nomad客户端配置NFS卷支持
-log_info "步骤2: 配置Nomad客户端支持NFS卷"
-ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \
- playbooks/setup-nomad-nfs-client.yml
-
-# 3. 验证NFS挂载状态
-log_info "步骤3: 验证所有节点的NFS挂载状态"
-ansible all -i configuration/inventories/production/inventory.ini \
- -m shell -a "df -h /mnt/fnsync 2>/dev/null || echo 'NFS未挂载'" \
- --limit '!snail'
-
-# 4. 验证Nomad客户端配置
-log_info "步骤4: 验证Nomad客户端配置"
-ansible nomad_clients -i configuration/inventories/production/nomad-cluster.ini \
- -m shell -a "nomad node status -self 2>/dev/null || echo 'Nomad未运行'"
-
-# 5. 部署示例NFS任务(可选)
-read -p "是否部署示例NFS任务?(y/n): " deploy_example
-if [ "$deploy_example" = "y" ] || [ "$deploy_example" = "Y" ]; then
- log_info "部署示例NFS任务..."
- nomad run jobs/nomad-nfs-multi-type.nomad
- echo "等待任务启动..."
- sleep 10
- nomad job status nfs-multi-type-example
-fi
-
-log_info "✅ NFS配置部署完成!"
-echo ""
-echo "📋 使用说明:"
-echo "1. NFS挂载点: /mnt/fnsync"
-echo "2. 本地LXC容器: 直接使用挂载目录"
-echo "3. 海外PVE容器: 使用优化参数挂载"
-echo "4. Nomad作业: 使用host volume 'nfs-shared'"
-echo ""
-echo "🔧 手动验证命令:"
-echo " - 检查NFS挂载: df -h /mnt/fnsync"
-echo " - 检查Nomad状态: nomad node status"
-echo " - 运行NFS任务: nomad run jobs/nomad-nfs-multi-type.nomad"
\ No newline at end of file
diff --git a/scripts/distribute-keys.sh b/scripts/distribute-keys.sh
deleted file mode 100644
index f626114..0000000
--- a/scripts/distribute-keys.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-# 分发SSH公钥到所有Nomad节点
-echo "分发SSH公钥到Nomad节点..."
-
-# 节点列表
-NODES=(
- "100.81.26.3" # ash1d.global
- "100.103.147.94" # ash2e.global
- "100.90.159.68" # ch2.global
- "100.86.141.112" # ch3.global
- "100.117.106.136" # master
- "100.116.80.94" # ash3c
-)
-
-PUB_KEY=$(cat /home/ben/.ssh/id_ed25519.pub)
-
-for NODE in "${NODES[@]}"; do
- echo "正在配置节点: $NODE"
-
- # 尝试使用现有密钥连接并添加新密钥
- ssh-keyscan -H $NODE >> ~/.ssh/known_hosts 2>/dev/null
-
- # 使用现有认证方式添加密钥
- ssh root@$NODE "echo '$PUB_KEY' >> /root/.ssh/authorized_keys" 2>/dev/null && \
- echo "✓ $NODE 配置成功" || echo "✗ $NODE 配置失败"
-done
-
-echo "密钥分发完成"
\ No newline at end of file
diff --git a/scripts/setup-ssh-keys.yml b/scripts/setup-ssh-keys.yml
deleted file mode 100644
index eeeb595..0000000
--- a/scripts/setup-ssh-keys.yml
+++ /dev/null
@@ -1,22 +0,0 @@
----
-- name: 设置Nomad节点SSH密钥认证
- hosts: nomad_nodes
- become: yes
- vars:
- ssh_public_key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIIddJVPEvFRtzhWwYjr21lKTar+d7R5Kn/6bhd2s231 ben@ch2"
-
- tasks:
- - name: 确保.ssh目录存在
- file:
- path: /root/.ssh
- state: directory
- mode: '0700'
-
- - name: 添加SSH公钥到authorized_keys
- authorized_key:
- user: root
- state: present
- key: "{{ ssh_public_key }}"
-
- - name: 测试SSH连接
- ping:
\ No newline at end of file
diff --git a/scripts/setup/setup-nomad-laptop.sh b/scripts/setup/setup-nomad-laptop.sh
deleted file mode 100755
index 7337e51..0000000
--- a/scripts/setup/setup-nomad-laptop.sh
+++ /dev/null
@@ -1,230 +0,0 @@
-#!/bin/bash
-
-# Nomad 笔记本设置脚本 - Mac/Linux 版本
-# 用于将 Mac 或 Linux 笔记本加入 Nomad 集群作为 server
-
-set -e
-
-# 配置变量
-NOMAD_VERSION="1.10.5"
-NOMAD_DATACENTER="dc1"
-NOMAD_ENCRYPT_KEY="NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
-
-# 检测操作系统
-OS=$(uname -s | tr '[:upper:]' '[:lower:]')
-ARCH=$(uname -m)
-
-case $ARCH in
- x86_64) ARCH="amd64" ;;
- arm64|aarch64) ARCH="arm64" ;;
- *) echo "不支持的架构: $ARCH"; exit 1 ;;
-esac
-
-echo "🚀 开始设置 Nomad ($OS-$ARCH)..."
-
-# 1. 检查 Tailscale
-echo "📡 检查 Tailscale 连接..."
-if ! command -v tailscale &> /dev/null; then
- echo "❌ 请先安装 Tailscale"
- exit 1
-fi
-
-TAILSCALE_IP=$(tailscale ip | head -1)
-if [ -z "$TAILSCALE_IP" ]; then
- echo "❌ Tailscale 未连接,请先运行: tailscale up"
- exit 1
-fi
-
-echo "✅ Tailscale IP: $TAILSCALE_IP"
-
-# 2. 安装 Nomad(如果需要)
-if ! command -v nomad &> /dev/null; then
- echo "📦 安装 Nomad $NOMAD_VERSION..."
-
- if [[ "$OS" == "darwin" ]]; then
- # macOS
- if command -v brew &> /dev/null; then
- brew install nomad
- else
- echo "❌ 请先安装 Homebrew 或手动安装 Nomad"
- exit 1
- fi
- else
- # Linux
- NOMAD_URL="https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_${OS}_${ARCH}.zip"
- curl -L "$NOMAD_URL" -o nomad.zip
- unzip nomad.zip
- sudo mv nomad /usr/local/bin/
- rm nomad.zip
- fi
-fi
-
-echo "✅ Nomad 版本: $(nomad version)"
-
-# 3. 创建配置目录
-echo "📁 创建配置目录..."
-sudo mkdir -p /etc/nomad.d /opt/nomad/data
-sudo chown -R $(whoami):$(id -gn) /etc/nomad.d /opt/nomad/data
-
-# 4. 生成 Nomad 配置
-echo "⚙️ 生成 Nomad 配置..."
-cat > /etc/nomad.d/nomad.hcl << EOF
-datacenter = "$NOMAD_DATACENTER"
-data_dir = "/opt/nomad/data"
-log_level = "INFO"
-
-bind_addr = "$TAILSCALE_IP"
-
-addresses {
- http = "0.0.0.0"
- rpc = "$TAILSCALE_IP"
- serf = "$TAILSCALE_IP"
-}
-
-ports {
- http = 4646
- rpc = 4647
- serf = 4648
-}
-
-server {
- enabled = true
- bootstrap_expect = 6
-
- retry_join = [
- "100.116.158.95", # semaphore
- "100.117.106.136", # master (现在是 client)
- "100.116.80.94" # ash3c (现在是 client)
- ]
-
- encrypt = "$NOMAD_ENCRYPT_KEY"
-}
-
-client {
- enabled = false
-}
-
-# 如果是 macOS,可能需要 Docker 插件
-plugin "podman" {
- config {
- volumes {
- enabled = true
- }
- }
-}
-
-consul {
- address = "$TAILSCALE_IP:8500"
-}
-EOF
-
-echo "✅ 配置文件已生成: /etc/nomad.d/nomad.hcl"
-
-# 5. 创建启动脚本(macOS 不使用 systemd)
-if [[ "$OS" == "darwin" ]]; then
- # macOS - 创建 LaunchDaemon
- echo "🍎 创建 macOS LaunchDaemon..."
- sudo tee /Library/LaunchDaemons/io.nomadproject.nomad.plist > /dev/null << EOF
-
-
-
-
- Label
- io.nomadproject.nomad
- ProgramArguments
-
- /usr/local/bin/nomad
- agent
- -config=/etc/nomad.d/nomad.hcl
-
- RunAtLoad
-
- KeepAlive
-
- StandardOutPath
- /var/log/nomad.log
- StandardErrorPath
- /var/log/nomad.log
-
-
-EOF
-
- # 加载并启动服务
- sudo launchctl load /Library/LaunchDaemons/io.nomadproject.nomad.plist
- sudo launchctl start io.nomadproject.nomad
-
-else
- # Linux - 创建 systemd 服务
- echo "🐧 创建 systemd 服务..."
- sudo tee /etc/systemd/system/nomad.service > /dev/null << EOF
-[Unit]
-Description=Nomad
-Documentation=https://www.nomadproject.io/
-Requires=network-online.target
-After=network-online.target
-
-[Service]
-Type=notify
-User=$(whoami)
-Group=$(id -gn)
-ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
-ExecReload=/bin/kill -HUP \$MAINPID
-KillMode=process
-Restart=on-failure
-LimitNOFILE=65536
-
-[Install]
-WantedBy=multi-user.target
-EOF
-
- # 启动服务
- sudo systemctl daemon-reload
- sudo systemctl enable nomad
- sudo systemctl start nomad
-fi
-
-# 6. 验证安装
-echo "🔍 验证 Nomad 服务..."
-sleep 5
-
-if [[ "$OS" == "darwin" ]]; then
- if sudo launchctl list | grep -q nomad; then
- echo "✅ Nomad 服务已启动"
- else
- echo "❌ Nomad 服务启动失败"
- exit 1
- fi
-else
- if systemctl is-active --quiet nomad; then
- echo "✅ Nomad 服务已启动"
- else
- echo "❌ Nomad 服务启动失败"
- sudo systemctl status nomad
- exit 1
- fi
-fi
-
-# 7. 检查集群状态
-echo "🌐 检查集群连接..."
-sleep 10
-
-if nomad server members 2>/dev/null | grep -q alive; then
- echo "✅ 成功加入 Nomad 集群!"
- nomad server members
-else
- echo "⚠️ 正在连接集群,请稍等..."
- echo "可以运行以下命令检查状态:"
- echo " nomad server members"
- echo " nomad node status"
-fi
-
-echo ""
-echo "🎉 设置完成!"
-echo "📊 Web UI: http://$TAILSCALE_IP:4646"
-echo "🔧 配置文件: /etc/nomad.d/nomad.hcl"
-echo "📝 日志查看:"
-if [[ "$OS" == "darwin" ]]; then
- echo " tail -f /var/log/nomad.log"
-else
- echo " sudo journalctl -u nomad -f"
-fi
\ No newline at end of file
diff --git a/scripts/setup/setup-nomad-windows.ps1 b/scripts/setup/setup-nomad-windows.ps1
deleted file mode 100644
index 241e9cd..0000000
--- a/scripts/setup/setup-nomad-windows.ps1
+++ /dev/null
@@ -1,212 +0,0 @@
-# Nomad Windows 设置脚本
-# 用于将 Windows 笔记本加入 Nomad 集群作为 server
-
-param(
- [string]$NomadVersion = "1.10.5",
- [string]$DataCenter = "dc1",
- [string]$EncryptKey = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
-)
-
-# 需要管理员权限
-if (-NOT ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator")) {
- Write-Host "❌ 此脚本需要管理员权限运行" -ForegroundColor Red
- Write-Host "请以管理员身份运行 PowerShell" -ForegroundColor Yellow
- exit 1
-}
-
-Write-Host "🚀 开始设置 Windows Nomad Server..." -ForegroundColor Green
-
-# 1. 检查 Tailscale
-Write-Host "📡 检查 Tailscale 连接..." -ForegroundColor Cyan
-try {
- $tailscaleIP = (tailscale ip) | Select-Object -First 1
- if ([string]::IsNullOrEmpty($tailscaleIP)) {
- throw "Tailscale IP 为空"
- }
- Write-Host "✅ Tailscale IP: $tailscaleIP" -ForegroundColor Green
-} catch {
- Write-Host "❌ Tailscale 未安装或未连接" -ForegroundColor Red
- Write-Host "请先安装 Tailscale 并运行: tailscale up" -ForegroundColor Yellow
- exit 1
-}
-
-# 2. 创建目录
-Write-Host "📁 创建 Nomad 目录..." -ForegroundColor Cyan
-$nomadDir = "C:\nomad"
-$configDir = "$nomadDir\config"
-$dataDir = "$nomadDir\data"
-$binDir = "$nomadDir\bin"
-
-New-Item -ItemType Directory -Force -Path $configDir | Out-Null
-New-Item -ItemType Directory -Force -Path $dataDir | Out-Null
-New-Item -ItemType Directory -Force -Path $binDir | Out-Null
-
-# 3. 下载 Nomad(如果需要)
-$nomadExe = "$binDir\nomad.exe"
-if (-not (Test-Path $nomadExe)) {
- Write-Host "📦 下载 Nomad $NomadVersion..." -ForegroundColor Cyan
- $nomadUrl = "https://releases.hashicorp.com/nomad/$NomadVersion/nomad_${NomadVersion}_windows_amd64.zip"
- $zipPath = "$env:TEMP\nomad.zip"
-
- try {
- Invoke-WebRequest -Uri $nomadUrl -OutFile $zipPath
- Expand-Archive -Path $zipPath -DestinationPath $binDir -Force
- Remove-Item $zipPath
- Write-Host "✅ Nomad 下载完成" -ForegroundColor Green
- } catch {
- Write-Host "❌ 下载 Nomad 失败: $_" -ForegroundColor Red
- exit 1
- }
-}
-
-# 4. 添加到 PATH(如果需要)
-$currentPath = [Environment]::GetEnvironmentVariable("PATH", "Machine")
-if ($currentPath -notlike "*$binDir*") {
- Write-Host "🔧 添加 Nomad 到系统 PATH..." -ForegroundColor Cyan
- [Environment]::SetEnvironmentVariable("PATH", "$currentPath;$binDir", "Machine")
- $env:PATH += ";$binDir"
-}
-
-# 5. 生成配置文件
-Write-Host "⚙️ 生成 Nomad 配置..." -ForegroundColor Cyan
-$configContent = @"
-datacenter = "$DataCenter"
-data_dir = "$($dataDir -replace '\\', '/')"
-log_level = "INFO"
-
-bind_addr = "$tailscaleIP"
-
-addresses {
- http = "0.0.0.0"
- rpc = "$tailscaleIP"
- serf = "$tailscaleIP"
-}
-
-ports {
- http = 4646
- rpc = 4647
- serf = 4648
-}
-
-server {
- enabled = true
- bootstrap_expect = 6
-
- retry_join = [
- "100.116.158.95", # semaphore
- "100.117.106.136", # master
- "100.116.80.94" # ash3c
- ]
-
- encrypt = "$EncryptKey"
-}
-
-client {
- enabled = false
-}
-
-plugin "podman" {
- config {
- volumes {
- enabled = true
- }
- }
-}
-
-consul {
- address = "$tailscaleIP:8500"
-}
-"@
-
-$configFile = "$configDir\nomad.hcl"
-$configContent | Out-File -FilePath $configFile -Encoding UTF8
-Write-Host "✅ 配置文件已生成: $configFile" -ForegroundColor Green
-
-# 6. 创建 Windows 服务
-Write-Host "🔧 创建 Windows 服务..." -ForegroundColor Cyan
-
-# 先停止并删除现有服务(如果存在)
-try {
- Stop-Service -Name "Nomad" -ErrorAction SilentlyContinue
- & sc.exe delete "Nomad" 2>$null
-} catch {}
-
-# 创建新服务
-$serviceName = "Nomad"
-$serviceDisplayName = "HashiCorp Nomad"
-$serviceDescription = "HashiCorp Nomad Agent"
-$serviceCommand = "`"$nomadExe`" agent -config=`"$configFile`""
-
-try {
- & sc.exe create $serviceName binPath= $serviceCommand DisplayName= $serviceDisplayName start= auto
- & sc.exe description $serviceName $serviceDescription
-
- # 配置服务恢复选项
- & sc.exe failure $serviceName reset= 30 actions= restart/5000/restart/5000/restart/5000
-
- Write-Host "✅ Windows 服务已创建" -ForegroundColor Green
-} catch {
- Write-Host "❌ 创建服务失败: $_" -ForegroundColor Red
- exit 1
-}
-
-# 7. 启动服务
-Write-Host "🚀 启动 Nomad 服务..." -ForegroundColor Cyan
-try {
- Start-Service -Name $serviceName
- Write-Host "✅ Nomad 服务已启动" -ForegroundColor Green
-} catch {
- Write-Host "❌ 启动服务失败: $_" -ForegroundColor Red
- Write-Host "检查服务状态: Get-Service Nomad" -ForegroundColor Yellow
- exit 1
-}
-
-# 8. 验证安装
-Write-Host "🔍 验证 Nomad 服务..." -ForegroundColor Cyan
-Start-Sleep -Seconds 10
-
-try {
- $serviceStatus = Get-Service -Name $serviceName
- if ($serviceStatus.Status -eq "Running") {
- Write-Host "✅ Nomad 服务运行正常" -ForegroundColor Green
- } else {
- Write-Host "❌ Nomad 服务状态异常: $($serviceStatus.Status)" -ForegroundColor Red
- }
-} catch {
- Write-Host "❌ 检查服务状态失败: $_" -ForegroundColor Red
-}
-
-# 9. 检查集群连接
-Write-Host "🌐 检查集群连接..." -ForegroundColor Cyan
-Start-Sleep -Seconds 15
-
-try {
- & $nomadExe server members
- Write-Host "✅ 成功加入 Nomad 集群!" -ForegroundColor Green
-} catch {
- Write-Host "⚠️ 正在连接集群,请稍等..." -ForegroundColor Yellow
- Write-Host "可以运行以下命令检查状态:" -ForegroundColor Cyan
- Write-Host " nomad server members" -ForegroundColor White
- Write-Host " nomad node status" -ForegroundColor White
-}
-
-# 10. 防火墙规则
-Write-Host "🔥 配置防火墙规则..." -ForegroundColor Cyan
-try {
- New-NetFirewallRule -DisplayName "Nomad HTTP" -Direction Inbound -Protocol TCP -LocalPort 4646 -Action Allow -ErrorAction SilentlyContinue
- New-NetFirewallRule -DisplayName "Nomad RPC" -Direction Inbound -Protocol TCP -LocalPort 4647 -Action Allow -ErrorAction SilentlyContinue
- New-NetFirewallRule -DisplayName "Nomad Serf" -Direction Inbound -Protocol TCP -LocalPort 4648 -Action Allow -ErrorAction SilentlyContinue
- Write-Host "✅ 防火墙规则已配置" -ForegroundColor Green
-} catch {
- Write-Host "⚠️ 防火墙规则配置可能失败,请手动检查" -ForegroundColor Yellow
-}
-
-Write-Host ""
-Write-Host "🎉 Windows Nomad Server 设置完成!" -ForegroundColor Green
-Write-Host "📊 Web UI: http://$tailscaleIP:4646" -ForegroundColor Cyan
-Write-Host "🔧 配置文件: $configFile" -ForegroundColor Cyan
-Write-Host "📝 服务管理:" -ForegroundColor Cyan
-Write-Host " 启动: Start-Service Nomad" -ForegroundColor White
-Write-Host " 停止: Stop-Service Nomad" -ForegroundColor White
-Write-Host " 状态: Get-Service Nomad" -ForegroundColor White
-Write-Host " 日志: Get-EventLog -LogName Application -Source Nomad" -ForegroundColor White
\ No newline at end of file
diff --git a/scripts/utilities/check-nomad-cluster.sh b/scripts/utilities/check-nomad-cluster.sh
deleted file mode 100755
index 7286a83..0000000
--- a/scripts/utilities/check-nomad-cluster.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-echo "=== Nomad 集群状态检查 ==="
-
-# 检查所有节点的服务状态
-echo "1. 检查服务状态..."
-ansible nomad_cluster -i /root/mgmt/configuration/inventories/production/nomad-cluster.ini -m shell -a "systemctl is-active nomad" 2>/dev/null
-
-echo -e "\n2. 检查网络连通性..."
-# 检查网络连通性
-for ip in 100.116.158.95 100.117.106.136 100.116.80.94; do
- echo "检查到 $ip 的连接..."
- timeout 5 nc -zv $ip 4646 2>&1 | grep -E "(succeeded|open)"
- timeout 5 nc -zv $ip 4647 2>&1 | grep -E "(succeeded|open)"
- timeout 5 nc -zv $ip 4648 2>&1 | grep -E "(succeeded|open)"
-done
-
-echo -e "\n3. 检查 Nomad 集群成员..."
-# 尝试查询集群成员
-if nomad server members 2>/dev/null; then
- echo "集群成员查询成功"
-else
- echo "无法查询集群成员 - 可能没有 leader"
-fi
-
-echo -e "\n4. 检查节点状态..."
-if nomad node status 2>/dev/null; then
- echo "节点状态查询成功"
-else
- echo "无法查询节点状态"
-fi
-
-echo -e "\n5. 检查最近的日志..."
-echo "=== Semaphore 节点日志 ==="
-journalctl -u nomad -n 5 --no-pager 2>/dev/null | tail -5
-
-echo -e "\n=== 检查完成 ==="
\ No newline at end of file
diff --git a/scripts/utilities/cleanup-retired-nodes.sh b/scripts/utilities/cleanup-retired-nodes.sh
deleted file mode 100644
index d2f3cde..0000000
--- a/scripts/utilities/cleanup-retired-nodes.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-# 清理退役节点脚本
-# 创建日期: 2025-09-27
-# 执行日期: 2025-10-27 (一个月后)
-
-set -e
-
-NOMAD_ADDR=${NOMAD_ADDR:-"http://100.116.158.95:4646"}
-
-echo "=== 清理退役节点脚本 ==="
-echo "执行时间: $(date)"
-echo "Nomad 地址: $NOMAD_ADDR"
-echo ""
-
-# 退役节点列表
-RETIRED_NODES=(
- "583f1b77:semaphore:已转为纯server"
- "06bb8a3a:hcs:华为云节点退役"
-)
-
-echo "准备清理以下退役节点:"
-for node_info in "${RETIRED_NODES[@]}"; do
- IFS=':' read -r node_id node_name reason <<< "$node_info"
- echo " - $node_name ($node_id): $reason"
-done
-echo ""
-
-read -p "确认要清理这些节点吗? (y/N): " confirm
-if [[ $confirm != [yY] ]]; then
- echo "操作已取消"
- exit 0
-fi
-
-echo "开始清理退役节点..."
-
-for node_info in "${RETIRED_NODES[@]}"; do
- IFS=':' read -r node_id node_name reason <<< "$node_info"
-
- echo "处理节点: $node_name ($node_id)"
-
- # 检查节点状态
- if nomad node status "$node_id" >/dev/null 2>&1; then
- echo " - 节点存在,开始清理..."
-
- # 确保节点已 drain
- echo " - 确保节点已 drain..."
- nomad node drain -enable -yes "$node_id" || true
-
- # 禁用调度
- echo " - 禁用调度资格..."
- nomad node eligibility -disable "$node_id" || true
-
- # 等待一段时间确保所有任务已迁移
- echo " - 等待任务迁移完成..."
- sleep 10
-
- echo " - 节点 $node_name 已成功清理"
- else
- echo " - 节点不存在或已被清理"
- fi
- echo ""
-done
-
-echo "=== 清理完成 ==="
-echo "请手动验证集群状态:"
-echo " nomad node status"
-echo " nomad server members"
-echo ""
-echo "如需彻底删除节点记录,请联系管理员"
\ No newline at end of file
diff --git a/scripts/utilities/disk-monitor.sh b/scripts/utilities/disk-monitor.sh
deleted file mode 100755
index 799838c..0000000
--- a/scripts/utilities/disk-monitor.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-# 磁盘监控脚本
-# 使用方法: ./disk-monitor.sh [threshold]
-
-THRESHOLD=${1:-85} # 默认阈值 85%
-INVENTORY_FILE="configuration/inventories/production/nomad-cluster.ini"
-
-echo "🔍 开始磁盘空间监控 (阈值: ${THRESHOLD}%)"
-echo "=================================="
-
-# 运行磁盘分析
-echo "📊 运行磁盘分析..."
-ansible-playbook -i "$INVENTORY_FILE" configuration/playbooks/disk-analysis-ncdu.yml
-
-echo ""
-echo "⚠️ 检查高磁盘使用率节点..."
-
-# 检查所有节点的磁盘使用情况
-ansible all -i "$INVENTORY_FILE" -m shell -a "df -h | awk 'NR>1 {gsub(/%/, \"\", \$5); if(\$5 > $THRESHOLD) print \$0}'" | while read line; do
- if [[ $line == *"=>"* ]]; then
- echo "🚨 节点: $line"
- elif [[ $line =~ ^/dev ]]; then
- echo " 高使用率磁盘: $line"
- fi
-done
-
-echo ""
-echo "💡 如需清理,运行:"
-echo " ansible-playbook -i $INVENTORY_FILE configuration/playbooks/disk-cleanup.yml"
-echo ""
-echo "📁 详细报告位置: /tmp/disk-analysis/"
-echo " 使用 ncdu -f /tmp/disk-analysis/ncdu-root-.json 查看详细信息"
\ No newline at end of file
diff --git a/scripts/utilities/nomad-cluster-manager.sh b/scripts/utilities/nomad-cluster-manager.sh
deleted file mode 100755
index 9a71e99..0000000
--- a/scripts/utilities/nomad-cluster-manager.sh
+++ /dev/null
@@ -1,227 +0,0 @@
-#!/bin/bash
-
-# 🚀 Nomad 集群管理脚本
-# Nomad Cluster Management Script
-
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
-
-# 颜色定义
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-BLUE='\033[0;34m'
-PURPLE='\033[0;35m'
-CYAN='\033[0;36m'
-NC='\033[0m' # No Color
-
-# 日志函数
-log_info() {
- echo -e "${BLUE}[INFO]${NC} $1"
-}
-
-log_success() {
- echo -e "${GREEN}[SUCCESS]${NC} $1"
-}
-
-log_warning() {
- echo -e "${YELLOW}[WARNING]${NC} $1"
-}
-
-log_error() {
- echo -e "${RED}[ERROR]${NC} $1"
-}
-
-log_header() {
- echo -e "${PURPLE}=== $1 ===${NC}"
-}
-
-# 显示集群状态
-show_cluster_status() {
- log_header "Nomad 集群状态概览"
-
- # 检查 Leader
- echo -e "${CYAN}Leader 状态:${NC}"
- LEADER=$(curl -s http://localhost:4646/v1/status/leader 2>/dev/null || echo "无法连接")
- if [[ "$LEADER" =~ ^\".*\"$ ]]; then
- echo " ✅ Leader: $(echo $LEADER | tr -d '\"')"
- else
- echo " ❌ 无 Leader 或连接失败"
- return 1
- fi
-
- echo ""
-
- # 节点状态
- echo -e "${CYAN}节点状态:${NC}"
- curl -s http://localhost:4646/v1/nodes 2>/dev/null | jq -r '.[] | " \(.Status == "ready" and "✅" or "❌") \(.Name) (\(.Address)) - \(.Status)"' 2>/dev/null || {
- log_warning "无法获取节点状态详情"
- nomad node status 2>/dev/null || echo " ❌ 命令执行失败"
- }
-
- echo ""
-
- # 驱动状态
- echo -e "${CYAN}驱动程序状态:${NC}"
- curl -s http://localhost:4646/v1/nodes 2>/dev/null | jq -r '
- .[] |
- " 节点: \(.Name)" as $node |
- .Drivers |
- to_entries[] |
- " \(.value.Healthy and "✅" or "❌") \(.key): \(.value.HealthDescription // "未知")"
- ' 2>/dev/null || {
- log_warning "无法获取驱动状态详情"
- }
-}
-
-# 显示作业状态
-show_jobs_status() {
- log_header "作业状态"
-
- JOBS=$(curl -s http://localhost:4646/v1/jobs 2>/dev/null)
- if [[ "$?" -eq 0 ]] && [[ "$JOBS" != "[]" ]] && [[ "$JOBS" != "null" ]]; then
- echo "$JOBS" | jq -r '.[] | " \(.Status == "running" and "✅" or "❌") \(.Name) - \(.Status)"' 2>/dev/null
- else
- echo " 📝 当前没有运行的作业"
- fi
-}
-
-# 显示访问信息
-show_access_info() {
- log_header "访问信息"
-
- echo -e "${CYAN}Web UI:${NC}"
- echo " 🌐 http://100.116.158.95:4646"
- echo ""
-
- echo -e "${CYAN}API 端点:${NC}"
- echo " 🔗 http://100.116.158.95:4646/v1/"
- echo ""
-
- echo -e "${CYAN}常用命令:${NC}"
- echo " 📊 nomad status # 查看集群概览"
- echo " 🖥️ nomad node status # 查看节点状态"
- echo " 🔧 nomad server members # 查看服务器成员"
- echo " 📋 nomad job status # 查看作业状态"
- echo " 🚀 nomad job run # 运行作业"
- echo " 📜 journalctl -u nomad -f # 查看日志"
-}
-
-# 运行诊断
-run_diagnosis() {
- log_header "运行完整诊断"
-
- if [[ -f "$PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh" ]]; then
- bash "$PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh"
- else
- log_error "诊断脚本未找到"
- return 1
- fi
-}
-
-# 配置 Podman 驱动
-configure_podman() {
- log_header "配置所有节点使用 Podman 驱动"
-
- local playbook="$PROJECT_ROOT/configuration/playbooks/configure-nomad-podman-cluster.yml"
- local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
-
- if [[ ! -f "$playbook" ]]; then
- log_error "Playbook 文件不存在: $playbook"
- return 1
- fi
-
- if [[ ! -f "$inventory" ]]; then
- log_error "Inventory 文件不存在: $inventory"
- return 1
- fi
-
- cd "$PROJECT_ROOT/configuration"
- python3 -m ansible playbook -i "$inventory" "$playbook" -v
-}
-
-# 重启集群
-restart_cluster() {
- log_header "重启 Nomad 集群"
-
- log_warning "这将重启整个 Nomad 集群"
- read -p "确认继续? (y/N): " -n 1 -r
- echo ""
-
- if [[ $REPLY =~ ^[Yy]$ ]]; then
- local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
- cd "$PROJECT_ROOT/configuration"
- python3 -m ansible adhoc -i "$inventory" nomad_cluster -m systemd -a "name=nomad state=restarted" --become
-
- log_info "等待集群启动..."
- sleep 15
- show_cluster_status
- else
- log_info "操作已取消"
- fi
-}
-
-# 主菜单
-show_menu() {
- echo ""
- log_header "Nomad 集群管理菜单"
- echo ""
- echo "1) 📊 显示集群状态"
- echo "2) 📋 显示作业状态"
- echo "3) 🔍 运行完整诊断"
- echo "4) 🐳 配置 Podman 驱动"
- echo "5) 🔄 重启集群"
- echo "6) ℹ️ 显示访问信息"
- echo "0) ❌ 退出"
- echo ""
-}
-
-# 主函数
-main() {
- echo ""
- echo "🚀 Nomad 集群管理工具"
- echo "==================="
-
- while true; do
- show_menu
- read -p "请选择操作 (0-6): " choice
-
- case $choice in
- 1)
- show_cluster_status
- ;;
- 2)
- show_jobs_status
- ;;
- 3)
- run_diagnosis
- ;;
- 4)
- configure_podman
- ;;
- 5)
- restart_cluster
- ;;
- 6)
- show_access_info
- ;;
- 0)
- log_info "再见!"
- exit 0
- ;;
- *)
- log_error "无效选择,请重试"
- ;;
- esac
-
- echo ""
- read -p "按回车键继续..." -r
- done
-}
-
-# 如果直接运行脚本
-if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
- main "$@"
-fi
\ No newline at end of file
diff --git a/scripts/utilities/purge_stale_nodes.sh b/scripts/utilities/purge_stale_nodes.sh
deleted file mode 100755
index 4ff0f6b..0000000
--- a/scripts/utilities/purge_stale_nodes.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-ADDR="http://100.81.26.3:4646"
-# 检查 NOMAD_TOKEN 是否设置,如果设置了,则准备好 Header
-HDR=""
-if [ -n "${NOMAD_TOKEN:-}" ]; then
- HDR="-H "X-Nomad-Token: $NOMAD_TOKEN""
-fi
-
-echo "--- 节点列表 (Before) ---"
-nomad node status -address="$ADDR"
-
-echo
-echo "--- 开始查找需要清理的旧节点 ---"
-
-# 使用 jq 从 nomad node status 的 json 输出中精确查找
-# 条件: 状态为 "down" 且 名称匹配列表
-IDS_TO_PURGE=$(nomad node status -address="$ADDR" -json | jq -r '.[] | select(.Status == "down" and (.Name | test("^(ch3|ch2|ash1d|ash2e|semaphore)$"))) | .ID')
-
-if [[ -z "$IDS_TO_PURGE" ]]; then
- echo "✅ 未找到符合条件的 'down' 状态节点,无需清理。"
-else
- echo "以下是待清理的节点 ID:"
- echo "$IDS_TO_PURGE"
- echo
-
- # 循环遍历 ID,使用 curl 调用 HTTP API 进行 purge
- for NODE_ID in $IDS_TO_PURGE; do
- echo "===> 正在清理节点: $NODE_ID"
- # 构造 curl 命令,并使用 eval 来正确处理可能为空的 $HDR
- cmd="curl -sS -XPOST $HDR -w ' -> HTTP %{http_code}\n' '$ADDR/v1/node/$NODE_ID/purge'"
- eval $cmd
- done
-fi
-
-echo
-echo "--- 节点列表 (After) ---"
-nomad node status -address="$ADDR"
\ No newline at end of file
diff --git a/scripts/verify-nfs-config.sh b/scripts/verify-nfs-config.sh
deleted file mode 100755
index bcb41bb..0000000
--- a/scripts/verify-nfs-config.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-
-# NFS配置验证脚本
-
-set -e
-
-echo "🔍 验证NFS配置状态..."
-
-# 颜色定义
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-RED='\033[0;31m'
-NC='\033[0m'
-
-log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
-log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
-log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
-
-# 1. 检查本地NFS挂载
-log_info "1. 检查本地NFS挂载状态"
-if df -h | grep -q "/mnt/fnsync"; then
- log_info "✅ 本地NFS挂载正常"
- df -h | grep "/mnt/fnsync"
-else
- log_error "❌ 本地NFS未挂载"
-fi
-
-# 2. 检查配置文件存在
-log_info "2. 检查配置文件"
-config_files=(
- "playbooks/setup-nfs-by-container-type.yml"
- "playbooks/setup-nomad-nfs-client.yml"
- "jobs/nomad-nfs-multi-type.nomad"
- "scripts/deploy-nfs-for-nomad.sh"
- "docs/nomad-nfs-setup.md"
-)
-
-for file in "${config_files[@]}"; do
- if [ -f "$file" ]; then
- log_info "✅ $file 存在"
- else
- log_error "❌ $file 不存在"
- fi
-done
-
-# 3. 检查Ansible inventory
-log_info "3. 检查Ansible配置"
-if [ -f "configuration/inventories/production/inventory.ini" ]; then
- log_info "✅ inventory.ini 存在"
- echo "节点分类:"
- grep -E "\[.*\]" configuration/inventories/production/inventory.ini | head -10
-else
- log_error "❌ inventory.ini 不存在"
-fi
-
-# 4. 检查Nomad服务状态
-log_info "4. 检查Nomad服务"
-if command -v nomad &> /dev/null; then
- if nomad node status &> /dev/null; then
- log_info "✅ Nomad服务运行正常"
- nomad node status -self | grep -E "(Name|Status|Datacenter)"
- else
- log_warn "⚠️ Nomad服务未运行或无法连接"
- fi
-else
- log_warn "⚠️ Nomad命令未安装"
-fi
-
-# 5. 检查NFS服务器连通性
-log_info "5. 检查NFS服务器连通性"
-if ping -c 1 -W 3 snail &> /dev/null; then
- log_info "✅ NFS服务器 snail 可达"
- if command -v showmount &> /dev/null; then
- showmount -e snail 2>/dev/null || log_warn "⚠️ 无法获取NFS导出列表"
- fi
-else
- log_error "❌ NFS服务器 snail 不可达"
-fi
-
-echo ""
-echo "📊 验证完成!"
-echo ""
-echo "🚀 下一步操作:"
-echo "1. 运行部署脚本: ./scripts/deploy-nfs-for-nomad.sh"
-echo "2. 查看详细文档: cat docs/nomad-nfs-setup.md"
-echo "3. 测试NFS功能: nomad run jobs/nomad-nfs-multi-type.nomad"
\ No newline at end of file