#!/bin/bash
# 🚀 Nomad cluster management script
#
# Interactive, menu-driven helper for inspecting and operating a Nomad
# cluster (status overview, job list, diagnosis, Podman driver rollout,
# cluster restart).

# Abort on the first unhandled command failure.
set -e

# Absolute directory of this script, and the project root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Color definitions. The values are stored as literal backslash sequences
# (single-quoted), so they only turn into ANSI escapes when printed through
# something that interprets them, e.g. `echo -e` or `printf '%b'`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Logging helpers.

# Print an informational message to stdout, prefixed with a "[INFO]" tag.
# Globals:   BLUE, NC (read) - color sequences, expanded via printf %b
# Arguments: $1 - message text; printed verbatim (backslashes in the message
#                 are NOT interpreted, only the color variables are)
log_info() {
    printf '%b[INFO]%b %s\n' "${BLUE-}" "${NC-}" "${1-}"
}

# Print a success message to stdout, prefixed with a "[SUCCESS]" tag.
# Globals:   GREEN, NC (read) - color sequences, expanded via printf %b
# Arguments: $1 - message text; printed verbatim (backslashes in the message
#                 are NOT interpreted, only the color variables are)
log_success() {
    printf '%b[SUCCESS]%b %s\n' "${GREEN-}" "${NC-}" "${1-}"
}

# Print a warning, prefixed with a "[WARNING]" tag, to stderr so that
# diagnostics stay out of captured or piped stdout.
# Globals:   YELLOW, NC (read) - color sequences, expanded via printf %b
# Arguments: $1 - message text; printed verbatim (backslashes in the message
#                 are NOT interpreted, only the color variables are)
log_warning() {
    printf '%b[WARNING]%b %s\n' "${YELLOW-}" "${NC-}" "${1-}" >&2
}

# Print an error, prefixed with an "[ERROR]" tag, to stderr so that
# diagnostics stay out of captured or piped stdout.
# Globals:   RED, NC (read) - color sequences, expanded via printf %b
# Arguments: $1 - message text; printed verbatim (backslashes in the message
#                 are NOT interpreted, only the color variables are)
log_error() {
    printf '%b[ERROR]%b %s\n' "${RED-}" "${NC-}" "${1-}" >&2
}

# Print a section header of the form "=== <title> ===" to stdout.
# Globals:   PURPLE, NC (read) - color sequences, expanded via printf %b
# Arguments: $1 - header title; printed verbatim (backslashes in the title
#                 are NOT interpreted, only the color variables are)
log_header() {
    printf '%b=== %s ===%b\n' "${PURPLE-}" "${1-}" "${NC-}"
}

# Show cluster status: leader, per-node status and per-node driver health,
# all read from the Nomad HTTP API on localhost:4646.
# Globals:   CYAN, NC (read)
# Outputs:   human-readable report on stdout
# Returns:   1 when the leader endpoint does not answer with a non-empty
#            quoted value (connection failure or no leader); 0 otherwise
show_cluster_status() {
    local api_base="http://localhost:4646/v1"
    # The leader endpoint is expected to answer with a JSON string, i.e. a
    # double-quoted value. An empty pair of quotes is treated as "no leader".
    local leader_re='^"(.+)"$'
    local leader=""
    local nodes_json=""

    log_header "Nomad 集群状态概览"

    # Leader check
    echo -e "${CYAN}Leader 状态:${NC}"
    leader=$(curl -s "$api_base/status/leader" 2>/dev/null) || leader=""
    if [[ "$leader" =~ $leader_re ]]; then
        echo " ✅ Leader: ${BASH_REMATCH[1]}"
    else
        echo " ❌ 无 Leader 或连接失败"
        return 1
    fi

    echo ""

    # Fetch the node list once; both sections below render from it.
    nodes_json=$(curl -s "$api_base/nodes" 2>/dev/null) || nodes_json=""

    # Node status.
    # jq's `and` / `or` always produce booleans, so the icon has to be picked
    # with if/then/else (an `X and "✅" or "❌"` expression prints "true").
    echo -e "${CYAN}节点状态:${NC}"
    if [[ -z "$nodes_json" ]] || ! jq -r '
        .[]
        | " \(if .Status == "ready" then "✅" else "❌" end) \(.Name) (\(.Address)) - \(.Status)"
    ' <<<"$nodes_json" 2>/dev/null; then
        log_warning "无法获取节点状态详情"
        # Fall back to the CLI when the API output cannot be rendered.
        nomad node status 2>/dev/null || echo " ❌ 命令执行失败"
    fi

    echo ""

    # Driver status: one heading line per node, followed by one line per
    # driver reported for that node.
    echo -e "${CYAN}驱动程序状态:${NC}"
    if [[ -z "$nodes_json" ]] || ! jq -r '
        .[]
        | " 节点: \(.Name)",
          ((.Drivers // {})
           | to_entries[]
           | " \(if .value.Healthy then "✅" else "❌" end) \(.key): \(.value.HealthDescription // "未知")")
    ' <<<"$nodes_json" 2>/dev/null; then
        log_warning "无法获取驱动状态详情"
    fi
}

# Show job status: list every job known to the Nomad HTTP API on
# localhost:4646 with a running / not-running icon.
# Outputs:   one line per job on stdout, or a "no jobs" notice
# Returns:   0 in all handled cases (fetch failure is reported as "no jobs")
show_jobs_status() {
    local jobs_json=""

    log_header "作业状态"

    # The fetch is tested directly inside the `if`: checking "$?" on a later
    # line never works under `set -e`, because a failed assignment aborts the
    # script before the check is reached.
    if jobs_json=$(curl -s http://localhost:4646/v1/jobs 2>/dev/null) \
        && [[ -n "$jobs_json" && "$jobs_json" != "[]" && "$jobs_json" != "null" ]]; then
        # jq's `and` / `or` always produce booleans, so the icon has to be
        # picked with if/then/else.
        jq -r '
            .[]
            | " \(if .Status == "running" then "✅" else "❌" end) \(.Name) - \(.Status)"
        ' <<<"$jobs_json" 2>/dev/null || log_warning "无法解析作业状态详情"
    else
        echo " 📝 当前没有运行的作业"
    fi
}

# Show access information: Web UI URL, API endpoint and a cheat sheet of
# common commands.
# Globals:   CYAN, NC (read)
#            NOMAD_ADDR (read, optional) - base URL to display; defaults to
#            http://100.116.158.95:4646 when unset or empty
# Outputs:   the information block on stdout
show_access_info() {
    local base_url="${NOMAD_ADDR:-http://100.116.158.95:4646}"
    # Drop one trailing slash so the API line never contains "//v1/".
    base_url="${base_url%/}"

    log_header "访问信息"

    echo -e "${CYAN}Web UI:${NC}"
    printf '%s\n' " 🌐 ${base_url}" ""

    echo -e "${CYAN}API 端点:${NC}"
    printf '%s\n' " 🔗 ${base_url}/v1/" ""

    echo -e "${CYAN}常用命令:${NC}"
    printf '%s\n' \
        " 📊 nomad status # 查看集群概览" \
        " 🖥️ nomad node status # 查看节点状态" \
        " 🔧 nomad server members # 查看服务器成员" \
        " 📋 nomad job status <job-name> # 查看作业状态" \
        " 🚀 nomad job run <job-file> # 运行作业" \
        " 📜 journalctl -u nomad -f # 查看日志"
}

# Run the full diagnosis by delegating to the project's diagnosis script.
# Globals:   PROJECT_ROOT (read)
# Returns:   1 if scripts/utilities/nomad-diagnosis.sh is missing under
#            PROJECT_ROOT; otherwise the exit status of that script
run_diagnosis() {
    local diagnosis_script="$PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh"

    log_header "运行完整诊断"

    if [[ ! -f "$diagnosis_script" ]]; then
        log_error "诊断脚本未找到"
        return 1
    fi

    bash "$diagnosis_script"
}

# Configure the Podman driver on all nodes by running the project's Ansible
# playbook against the production Nomad inventory.
# Globals:   PROJECT_ROOT (read)
# Returns:   1 if the playbook or the inventory file is missing; otherwise
#            the exit status of the Ansible run (non-zero also when the
#            configuration directory cannot be entered)
configure_podman() {
    local playbook="$PROJECT_ROOT/configuration/playbooks/configure-nomad-podman-cluster.yml"
    local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"

    log_header "配置所有节点使用 Podman 驱动"

    if [[ ! -f "$playbook" ]]; then
        log_error "Playbook 文件不存在: $playbook"
        return 1
    fi

    if [[ ! -f "$inventory" ]]; then
        log_error "Inventory 文件不存在: $inventory"
        return 1
    fi

    # Run from the configuration directory, but inside a subshell so the
    # long-running menu shell keeps its own working directory, and so a
    # failed `cd` prevents the playbook from running at all.
    (
        cd "$PROJECT_ROOT/configuration" \
            && python3 -m ansible playbook -i "$inventory" "$playbook" -v
    )
}

# Restart the cluster: after an interactive y/N confirmation, restart the
# `nomad` systemd unit on every host of the `nomad_cluster` inventory group
# via an Ansible ad-hoc command, wait, then print the cluster status.
# Globals:   PROJECT_ROOT (read)
# Arguments: $1 - optional: seconds to wait after the restart before
#                 checking status (default: 15)
# Returns:   0 when cancelled; 1 when the Ansible run fails; otherwise the
#            status of show_cluster_status
restart_cluster() {
    local wait_secs="${1:-15}"
    local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
    local answer=""

    log_header "重启 Nomad 集群"

    log_warning "这将重启整个 Nomad 集群"
    # A failed read (EOF) leaves the answer empty, which counts as "no".
    read -p "确认继续? (y/N): " -n 1 -r answer || true
    echo ""

    if [[ ! "$answer" =~ ^[Yy]$ ]]; then
        log_info "操作已取消"
        return 0
    fi

    # Subshell: keep the caller's working directory untouched and skip the
    # restart entirely if the configuration directory cannot be entered.
    if ! (
        cd "$PROJECT_ROOT/configuration" \
            && python3 -m ansible adhoc -i "$inventory" nomad_cluster \
                -m systemd -a "name=nomad state=restarted" --become
    ); then
        log_error "集群重启失败"
        return 1
    fi

    log_info "等待集群启动..."
    sleep "$wait_secs"
    show_cluster_status
}

# Main menu: print the header and the numbered list of available actions.
# Outputs:   the menu on stdout, framed by blank lines
show_menu() {
    echo ""
    log_header "Nomad 集群管理菜单"
    # One printf argument per output line; the empty arguments are the blank
    # lines before and after the option list.
    printf '%s\n' \
        "" \
        "1) 📊 显示集群状态" \
        "2) 📋 显示作业状态" \
        "3) 🔍 运行完整诊断" \
        "4) 🐳 配置 Podman 驱动" \
        "5) 🔄 重启集群" \
        "6) ℹ️ 显示访问信息" \
        "0) ❌ 退出" \
        ""
}

# Main function: interactive menu loop. Prints a banner, then repeatedly
# shows the menu, reads a choice and dispatches to the matching action until
# the user picks 0 or stdin reaches end-of-file.
# Outputs:   banner, menu and action output on stdout
# Exits:     0 on choice "0" and on end-of-file
main() {
    local choice=""

    echo ""
    echo "🚀 Nomad 集群管理工具"
    echo "==================="

    while true; do
        show_menu

        # End-of-file with nothing typed (Ctrl-D, exhausted pipe): leave
        # instead of spinning on an empty choice.
        if ! read -r -p "请选择操作 (0-6): " choice && [[ -z "$choice" ]]; then
            echo ""
            exit 0
        fi

        # Each action reports its own errors. `|| true` is deliberate: the
        # script runs under `set -e`, and without it a failing action (e.g.
        # the cluster being unreachable) would terminate the whole menu.
        # Note that calling a function on the left of `||` also suspends
        # errexit inside that function.
        case "$choice" in
            1) show_cluster_status || true ;;
            2) show_jobs_status || true ;;
            3) run_diagnosis || true ;;
            4) configure_podman || true ;;
            5) restart_cluster || true ;;
            6) show_access_info || true ;;
            0)
                log_info "再见!"
                exit 0
                ;;
            *)
                log_error "无效选择,请重试"
                ;;
        esac

        echo ""
        # EOF while waiting for Enter also ends the session.
        read -r -p "按回车键继续..." || exit 0
    done
}

# Start the interactive menu only when the script is executed directly;
# when the file is sourced, nothing is run here.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    main "$@"
fi