#!/bin/bash
# 🚀 Nomad Cluster Management Script
#
# Interactive menu for inspecting and operating a Nomad cluster: cluster,
# node, driver and job status (via the Nomad HTTP API and jq), a diagnosis
# script, Podman driver configuration and a cluster restart (via Ansible).
#
# Environment:
#   NOMAD_ADDR     Base URL of the Nomad HTTP API queried by the status
#                  views (default: http://localhost:4646).
#   NOMAD_UI_ADDR  Base URL printed by the "access info" view
#                  (default: http://100.116.158.95:4646).

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# API address used for status queries; a trailing slash is stripped so that
# paths can be appended directly.
NOMAD_API_ADDR="${NOMAD_ADDR:-http://localhost:4646}"
NOMAD_API_ADDR="${NOMAD_API_ADDR%/}"

# Address advertised to the user in show_access_info.
NOMAD_UI_ADDR="${NOMAD_UI_ADDR:-http://100.116.158.95:4646}"
NOMAD_UI_ADDR="${NOMAD_UI_ADDR%/}"

# Colour definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Logging helpers: each prints $1 to stdout behind a coloured tag.
log_info() {
  echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
  echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
  echo -e "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
  echo -e "${RED}[ERROR]${NC} $1"
}

log_header() {
  echo -e "${PURPLE}=== $1 ===${NC}"
}

#######################################
# Fetch a Nomad HTTP API path and print the response body.
# Globals:   NOMAD_API_ADDR (read)
# Arguments: $1 - API path beginning with "/", e.g. /v1/nodes
# Outputs:   response body on stdout
# Returns:   curl's exit status; non-zero on connection failure or on an
#            HTTP error status (-f), so error bodies never reach jq.
#######################################
nomad_api_get() {
  curl -sf --connect-timeout 5 "${NOMAD_API_ADDR}$1" 2>/dev/null
}

#######################################
# Extract the leader address from a /v1/status/leader response body.
# Arguments: $1 - response body, expected to be a JSON string ("host:port")
# Outputs:   the address without the surrounding quotes
# Returns:   0 when $1 is a non-empty quoted string, 1 otherwise
#            (an empty string "" is treated as "no leader").
#######################################
parse_leader() {
  if [[ "$1" =~ ^\"(.+)\"$ ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  else
    return 1
  fi
}

#######################################
# Show the cluster overview: leader, per-node status and per-node drivers.
# Outputs: human-readable report on stdout
# Returns: 1 when no leader could be determined, 0 otherwise
#######################################
show_cluster_status() {
  local leader_json leader nodes_json

  log_header "Nomad 集群状态概览"

  # Leader check
  echo -e "${CYAN}Leader 状态:${NC}"
  leader_json=$(nomad_api_get /v1/status/leader) || leader_json=""
  if leader=$(parse_leader "$leader_json"); then
    echo " ✅ Leader: $leader"
  else
    echo " ❌ 无 Leader 或连接失败"
    return 1
  fi

  echo ""

  # The node list feeds both the node view and the driver view; fetch once.
  nodes_json=$(nomad_api_get /v1/nodes) || nodes_json=""

  # Node status. jq's and/or yield booleans, so the icon must be chosen
  # with if/then/else.
  echo -e "${CYAN}节点状态:${NC}"
  if [[ -z "$nodes_json" ]] ||
    ! jq -r '.[] | " \(if .Status == "ready" then "✅" else "❌" end) \(.Name) (\(.Address)) - \(.Status)"' \
      <<<"$nodes_json" 2>/dev/null; then
    log_warning "无法获取节点状态详情"
    # Fall back to the CLI when the API/jq path is unavailable.
    nomad node status 2>/dev/null || echo " ❌ 命令执行失败"
  fi

  echo ""

  # Driver status: one header line per node followed by its drivers.
  echo -e "${CYAN}驱动程序状态:${NC}"
  if [[ -z "$nodes_json" ]] ||
    ! jq -r '
        .[]
        | " 节点: \(.Name)",
          ((.Drivers // {})
            | to_entries[]
            | " \(if .value.Healthy then "✅" else "❌" end) \(.key): \(.value.HealthDescription // "未知")")
      ' <<<"$nodes_json" 2>/dev/null; then
    log_warning "无法获取驱动状态详情"
  fi
}

#######################################
# Show the status of all registered jobs.
# Outputs: one line per job, or a notice when there are none
# Returns: 1 when the API cannot be reached, 0 otherwise
#######################################
show_jobs_status() {
  local jobs_json

  log_header "作业状态"

  # Check curl's status explicitly: under `set -e` a bare assignment would
  # abort the script before any `$?` test could run.
  if ! jobs_json=$(nomad_api_get /v1/jobs); then
    log_error "无法连接 Nomad API: ${NOMAD_API_ADDR}"
    return 1
  fi

  if [[ -z "$jobs_json" || "$jobs_json" == "[]" || "$jobs_json" == "null" ]]; then
    echo " 📝 当前没有运行的作业"
    return 0
  fi

  jq -r '.[] | " \(if .Status == "running" then "✅" else "❌" end) \(.Name) - \(.Status)"' \
    <<<"$jobs_json" 2>/dev/null || log_warning "无法解析作业状态"
}

#######################################
# Print the Web UI / API addresses and a cheat sheet of common commands.
# Globals: NOMAD_UI_ADDR (read)
#######################################
show_access_info() {
  log_header "访问信息"

  echo -e "${CYAN}Web UI:${NC}"
  echo " 🌐 ${NOMAD_UI_ADDR}"
  echo ""

  echo -e "${CYAN}API 端点:${NC}"
  echo " 🔗 ${NOMAD_UI_ADDR}/v1/"
  echo ""

  echo -e "${CYAN}常用命令:${NC}"
  echo " 📊 nomad status # 查看集群概览"
  echo " 🖥️ nomad node status # 查看节点状态"
  echo " 🔧 nomad server members # 查看服务器成员"
  echo " 📋 nomad job status # 查看作业状态"
  echo " 🚀 nomad job run # 运行作业"
  echo " 📜 journalctl -u nomad -f # 查看日志"
}

#######################################
# Run the project's diagnosis script.
# Globals: PROJECT_ROOT (read)
# Returns: 1 when the script is missing, otherwise the script's exit status
#######################################
run_diagnosis() {
  local diagnosis_script="$PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh"

  log_header "运行完整诊断"

  if [[ ! -f "$diagnosis_script" ]]; then
    log_error "诊断脚本未找到"
    return 1
  fi
  bash "$diagnosis_script"
}

#######################################
# Configure every node to use the Podman driver via the Ansible playbook.
# Globals: PROJECT_ROOT (read)
# Returns: 1 when the playbook or inventory is missing, otherwise the
#          exit status of the Ansible run
#######################################
configure_podman() {
  local playbook="$PROJECT_ROOT/configuration/playbooks/configure-nomad-podman-cluster.yml"
  local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"

  log_header "配置所有节点使用 Podman 驱动"

  if [[ ! -f "$playbook" ]]; then
    log_error "Playbook 文件不存在: $playbook"
    return 1
  fi

  if [[ ! -f "$inventory" ]]; then
    log_error "Inventory 文件不存在: $inventory"
    return 1
  fi

  # Subshell keeps the directory change from leaking into the menu loop.
  (
    cd "$PROJECT_ROOT/configuration" &&
      python3 -m ansible playbook -i "$inventory" "$playbook" -v
  )
}

#######################################
# Restart the nomad service on all cluster hosts after confirmation,
# wait 15 seconds, then show the cluster status.
# Globals: PROJECT_ROOT (read)
# Returns: 0 when cancelled; 1 when the Ansible run fails; otherwise the
#          status of show_cluster_status
#######################################
restart_cluster() {
  local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"

  log_header "重启 Nomad 集群"

  log_warning "这将重启整个 Nomad 集群"
  # EOF counts as "no" rather than aborting the script.
  read -r -n 1 -p "确认继续? (y/N): " || REPLY=""
  echo ""

  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    log_info "操作已取消"
    return 0
  fi

  if ! (
    cd "$PROJECT_ROOT/configuration" &&
      python3 -m ansible adhoc -i "$inventory" nomad_cluster \
        -m systemd -a "name=nomad state=restarted" --become
  ); then
    log_error "重启命令执行失败"
    return 1
  fi

  log_info "等待集群启动..."
  sleep 15
  show_cluster_status
}

# Print the main menu.
show_menu() {
  echo ""
  log_header "Nomad 集群管理菜单"
  echo ""
  echo "1) 📊 显示集群状态"
  echo "2) 📋 显示作业状态"
  echo "3) 🔍 运行完整诊断"
  echo "4) 🐳 配置 Podman 驱动"
  echo "5) 🔄 重启集群"
  echo "6) ℹ️ 显示访问信息"
  echo "0) ❌ 退出"
  echo ""
}

#######################################
# Run one menu action without letting its failure end the script.
# Under `set -e` a bare call that returns non-zero would exit the menu.
# Arguments: $@ - command (function name and arguments) to run
# Returns:   always 0; a failure is reported as a warning
#######################################
run_action() {
  local status=0
  "$@" || status=$?
  if ((status != 0)); then
    log_warning "操作未成功完成 (退出码: ${status})"
  fi
  return 0
}

#######################################
# Interactive menu loop.
# Returns: 0 when input ends (EOF); choosing 0 exits the script with 0.
#######################################
main() {
  local choice

  echo ""
  echo "🚀 Nomad 集群管理工具"
  echo "==================="

  while true; do
    show_menu
    # End the loop cleanly on EOF (e.g. Ctrl-D or closed stdin).
    if ! read -r -p "请选择操作 (0-6): " choice; then
      echo ""
      return 0
    fi

    case "$choice" in
      1) run_action show_cluster_status ;;
      2) run_action show_jobs_status ;;
      3) run_action run_diagnosis ;;
      4) run_action configure_podman ;;
      5) run_action restart_cluster ;;
      6) run_action show_access_info ;;
      0)
        log_info "再见!"
        exit 0
        ;;
      *)
        log_error "无效选择,请重试"
        ;;
    esac

    echo ""
    read -r -p "按回车键继续..." || return 0
  done
}

# Run the menu only when executed directly, not when sourced.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  main "$@"
fi