mgmt/scripts/utilities/nomad-cluster-manager.sh

227 lines
5.8 KiB
Bash
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# 🚀 Nomad Cluster Management Script
#
# Interactive menu for inspecting and operating a Nomad cluster: cluster/job
# status via the local HTTP API, diagnosis, Podman driver configuration and
# cluster restart via Ansible.
#
# errexit is enabled for the whole script: any unguarded command that returns
# non-zero terminates it.
set -e

# Absolute directory containing this script, and the project root two levels
# above it. The stdout of `cd` is discarded because, when CDPATH is set, `cd`
# prints the directory it resolved, which would otherwise be captured together
# with the output of `pwd`. `--` protects against paths starting with `-`.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
PROJECT_ROOT="$(cd -- "$SCRIPT_DIR/../.." >/dev/null && pwd)"

# Colour definitions. These hold literal backslash sequences; they become real
# escape codes only when printed through `echo -e` / `printf %b`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# Logging helpers
#
# log_info MESSAGE
# Print MESSAGE to stdout prefixed with a [INFO] tag wrapped in $BLUE / $NC.
# The colour variables are expanded with %b (they contain backslash escapes);
# the message is printed with %s so backslashes in it (for example in a path)
# are shown literally instead of being interpreted as escapes.
log_info() {
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1-}"
}
# log_success MESSAGE
# Print MESSAGE to stdout prefixed with a [SUCCESS] tag wrapped in $GREEN / $NC.
# Only the colour variables are escape-expanded (%b); the message is printed
# verbatim (%s) so backslashes in it are not interpreted.
log_success() {
  printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "${1-}"
}
# log_warning MESSAGE
# Print MESSAGE to stdout prefixed with a [WARNING] tag wrapped in $YELLOW / $NC.
# Only the colour variables are escape-expanded (%b); the message is printed
# verbatim (%s) so backslashes in it are not interpreted.
log_warning() {
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "${1-}"
}
# log_error MESSAGE
# Print MESSAGE to stdout prefixed with an [ERROR] tag wrapped in $RED / $NC.
# Only the colour variables are escape-expanded (%b); the message is printed
# verbatim (%s) so backslashes in it (e.g. in a reported file path) are not
# interpreted.
log_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}"
}
# log_header TITLE
# Print a section heading of the form "=== TITLE ===" to stdout, wrapped in
# $PURPLE / $NC. Only the colour variables are escape-expanded (%b); the title
# is printed verbatim (%s).
log_header() {
  printf '%b=== %s ===%b\n' "$PURPLE" "${1-}" "$NC"
}
# Show a cluster overview read from the local Nomad HTTP API
# (http://localhost:4646): the current leader, the status of every node, and
# the health of every driver grouped by node.
#
# Returns 1 when no leader can be determined (connection failure, unexpected
# reply, or an empty leader string); 0 otherwise. Requires curl and jq; if the
# node list cannot be fetched or parsed, falls back to `nomad node status`.
show_cluster_status() {
  local api_base="http://localhost:4646"
  local leader nodes_json
  # A usable reply is a JSON string with at least one character between the
  # quotes. NOTE(review): an empty "" is presumably what the API returns when
  # no leader is elected — confirm against the Nomad API docs.
  local leader_re='^".+"$'

  log_header "Nomad 集群状态概览"

  # Leader check
  echo -e "${CYAN}Leader 状态:${NC}"
  leader=$(curl -s "${api_base}/v1/status/leader" 2>/dev/null || echo "无法连接")
  if [[ "$leader" =~ $leader_re ]]; then
    leader=${leader//\"/}
    echo " ✅ Leader: ${leader}"
  else
    echo " ❌ 无 Leader 或连接失败"
    return 1
  fi
  echo ""

  # Fetch the node list once; both sections below render from it. The `||`
  # keeps a curl failure from terminating the script under `set -e`.
  nodes_json=$(curl -s "${api_base}/v1/nodes" 2>/dev/null) || nodes_json=""

  # Node status. jq's `and`/`or` yield booleans, so the icon must be chosen
  # with if/then/else.
  echo -e "${CYAN}节点状态:${NC}"
  if [[ -z "$nodes_json" ]] || ! jq -r '
      .[]
      | " \(if .Status == "ready" then "✅" else "❌" end) \(.Name) (\(.Address)) - \(.Status)"
    ' <<<"$nodes_json" 2>/dev/null; then
    log_warning "无法获取节点状态详情"
    nomad node status 2>/dev/null || echo " ❌ 命令执行失败"
  fi
  echo ""

  # Driver status: one heading line per node followed by its drivers. A node
  # whose Drivers field is null is treated as having no drivers.
  echo -e "${CYAN}驱动程序状态:${NC}"
  if [[ -z "$nodes_json" ]] || ! jq -r '
      .[]
      | (" 节点: \(.Name)"),
        ((.Drivers // {})
         | to_entries[]
         | " \(if .value.Healthy then "✅" else "❌" end) \(.key): \(.value.HealthDescription // "未知")")
    ' <<<"$nodes_json" 2>/dev/null; then
    log_warning "无法获取驱动状态详情"
  fi
}
# List all jobs known to the local Nomad HTTP API with a running / not-running
# icon, or print a "no jobs" notice when the API is unreachable or returns an
# empty list. Always returns 0. Requires curl and jq.
show_jobs_status() {
  local jobs

  log_header "作业状态"

  # The assignment sits inside the `if` condition so a curl failure is handled
  # here instead of terminating the script under `set -e`.
  if jobs=$(curl -s http://localhost:4646/v1/jobs 2>/dev/null) \
    && [[ -n "$jobs" && "$jobs" != "[]" && "$jobs" != "null" ]]; then
    # jq's `and`/`or` yield booleans, so the icon is chosen with if/then/else.
    jq -r '
      .[]
      | " \(if .Status == "running" then "✅" else "❌" end) \(.Name) - \(.Status)"
    ' <<<"$jobs" 2>/dev/null || log_warning "无法获取作业状态详情"
  else
    echo " 📝 当前没有运行的作业"
  fi
}
# Print static access information: the Web UI address, the API endpoint and a
# cheat-sheet of frequently used nomad / journalctl commands.
show_access_info() {
  log_header "访问信息"

  # Section titles carry colour escapes and are expanded with %b; everything
  # else is plain text printed as-is.
  printf '%b\n' "${CYAN}Web UI:${NC}"
  printf '%s\n' \
    " 🌐 http://100.116.158.95:4646" \
    ""

  printf '%b\n' "${CYAN}API 端点:${NC}"
  printf '%s\n' \
    " 🔗 http://100.116.158.95:4646/v1/" \
    ""

  printf '%b\n' "${CYAN}常用命令:${NC}"
  printf '%s\n' \
    " 📊 nomad status # 查看集群概览" \
    " 🖥️ nomad node status # 查看节点状态" \
    " 🔧 nomad server members # 查看服务器成员" \
    " 📋 nomad job status <job-name> # 查看作业状态" \
    " 🚀 nomad job run <job-file> # 运行作业" \
    " 📜 journalctl -u nomad -f # 查看日志"
}
# Run the companion diagnosis script located at
# $PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh.
# Returns 1 (after logging an error) when the script file does not exist;
# otherwise returns the exit status of the diagnosis script.
run_diagnosis() {
  local diag_script="$PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh"

  log_header "运行完整诊断"

  # Guard clause: nothing to run.
  if [[ ! -f "$diag_script" ]]; then
    log_error "诊断脚本未找到"
    return 1
  fi

  bash "$diag_script"
}
# Run the Ansible playbook that configures the Podman driver, using the
# production inventory under $PROJECT_ROOT/configuration.
#
# Returns 1 (after logging an error) when the playbook or the inventory file is
# missing; otherwise returns the exit status of the Ansible run.
configure_podman() {
  log_header "配置所有节点使用 Podman 驱动"

  local playbook="$PROJECT_ROOT/configuration/playbooks/configure-nomad-podman-cluster.yml"
  local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"

  if [[ ! -f "$playbook" ]]; then
    log_error "Playbook 文件不存在: $playbook"
    return 1
  fi
  if [[ ! -f "$inventory" ]]; then
    log_error "Inventory 文件不存在: $inventory"
    return 1
  fi

  # Ansible is started from the configuration directory. The subshell keeps
  # that `cd` from changing the working directory of the long-running menu
  # (or of a shell that sourced this file).
  (
    cd "$PROJECT_ROOT/configuration" || exit 1
    python3 -m ansible playbook -i "$inventory" "$playbook" -v
  )
}
# Restart the nomad systemd unit on every host of the `nomad_cluster` inventory
# group via an Ansible ad-hoc command, after interactive confirmation, then
# wait 15 seconds and show the cluster status.
#
# Returns 0 when the user declines; 1 when the inventory file is missing or the
# Ansible run fails; otherwise the status of show_cluster_status.
restart_cluster() {
  local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"

  log_header "重启 Nomad 集群"
  log_warning "这将重启整个 Nomad 集群"

  # Single-keystroke confirmation. `|| true` keeps end-of-input from
  # terminating the script under `set -e`; it is then treated as "no".
  read -p "确认继续? (y/N): " -n 1 -r || true
  echo ""
  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    log_info "操作已取消"
    return 0
  fi

  # Same pre-flight check configure_podman performs on its inventory.
  if [[ ! -f "$inventory" ]]; then
    log_error "Inventory 文件不存在: $inventory"
    return 1
  fi

  # Run from the configuration directory inside a subshell so the caller's
  # working directory is not changed.
  if ! (
    cd "$PROJECT_ROOT/configuration" \
      && python3 -m ansible adhoc -i "$inventory" nomad_cluster \
        -m systemd -a "name=nomad state=restarted" --become
  ); then
    log_error "集群重启命令执行失败"
    return 1
  fi

  log_info "等待集群启动..."
  sleep 15
  show_cluster_status
}
# Print the main menu: a blank line, the menu heading, then the numbered
# choices (1-6 and 0) framed by blank lines.
show_menu() {
  echo ""
  log_header "Nomad 集群管理菜单"

  # One argument per output line; empty arguments produce the blank lines.
  printf '%s\n' \
    "" \
    "1) 📊 显示集群状态" \
    "2) 📋 显示作业状态" \
    "3) 🔍 运行完整诊断" \
    "4) 🐳 配置 Podman 驱动" \
    "5) 🔄 重启集群" \
    "6) 显示访问信息" \
    "0) ❌ 退出" \
    ""
}
# Interactive entry point: print a banner, then loop showing the menu, reading
# a choice and dispatching to the matching action until the user picks 0
# (which exits the script with status 0) or input reaches end-of-file
# (which returns 0).
#
# A failing action is reported and the loop continues. Without the
# `|| status=$?` guards, `set -e` would silently terminate the whole tool as
# soon as any action returned non-zero (for example when no leader is found).
main() {
  local choice status

  echo ""
  echo "🚀 Nomad 集群管理工具"
  echo "==================="

  while true; do
    show_menu

    # -r keeps backslashes literal; a failed read means end of input.
    if ! read -r -p "请选择操作 (0-6): " choice; then
      echo ""
      return 0
    fi

    status=0
    case "$choice" in
      1) show_cluster_status || status=$? ;;
      2) show_jobs_status || status=$? ;;
      3) run_diagnosis || status=$? ;;
      4) configure_podman || status=$? ;;
      5) restart_cluster || status=$? ;;
      6) show_access_info || status=$? ;;
      0)
        log_info "再见!"
        exit 0
        ;;
      *)
        log_error "无效选择,请重试"
        ;;
    esac

    if ((status != 0)); then
      log_warning "操作未成功完成 (退出码: ${status})"
    fi

    echo ""
    read -r -p "按回车键继续..." || return 0
  done
}
# Start the interactive menu only when this file is executed directly; when it
# is sourced, only the definitions above are loaded.
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi