mgmt/configuration/deploy-monitoring.sh

#!/bin/bash
# Nomad cluster disk-monitoring deployment script.
# Uses the existing InfluxDB + Grafana monitoring stack.
#
# Flow: verify the group_vars file exists, print the monitoring-related
# settings from it, ask the operator for confirmation, then run the
# disk-monitoring playbook against the production inventory.
#
# All paths are relative to the current working directory, so the script
# must be started from the directory that contains inventories/ and playbooks/.
#
# Exit status: 0 on successful deployment; 1 if the config file is missing,
# the operator declines, or the playbook fails.

set -euo pipefail

# Single source of truth for the paths used below.
readonly GROUP_VARS_FILE="inventories/production/group_vars/all.yml"
readonly INVENTORY_FILE="inventories/production/nomad-cluster.ini"
readonly PLAYBOOK_FILE="playbooks/setup-disk-monitoring.yml"

echo "🚀 开始部署 Nomad 集群硬盘监控..."

# Check that the configuration file exists.
if [[ ! -f "$GROUP_VARS_FILE" ]]; then
    echo "❌ 配置文件不存在，请先配置 InfluxDB 连接信息" >&2
    exit 1
fi

# Show the monitoring-related settings so the operator can review them.
echo "📋 当前监控配置："
if ! grep -E "influxdb_|disk_usage_|collection_interval" "$GROUP_VARS_FILE"; then
    # Nothing matched (or grep failed): warn instead of silently asking the
    # operator to confirm a configuration that was never displayed.
    echo "⚠️ 未在配置文件中找到监控相关配置项" >&2
fi

echo ""
confirm=""
# -r keeps backslashes literal. read returns non-zero on EOF (e.g. stdin is
# not a terminal); '|| true' keeps strict mode from aborting here so the
# answer check below decides, exactly as it would without strict mode.
read -r -p "🤔 确认配置正确吗？(y/N): " confirm || true
# Only a single 'y' or 'Y' counts as consent; anything else cancels.
if [[ "$confirm" != [yY] ]]; then
    echo "❌ 部署取消，请修改配置后重试" >&2
    exit 1
fi

# Deploy to all nodes. Testing the command directly (rather than inspecting
# $? afterwards) ties the success/failure branches to the playbook run itself.
echo "📦 开始安装 Telegraf 到所有节点..."
if ansible-playbook -i "$INVENTORY_FILE" "$PLAYBOOK_FILE"; then
    echo "✅ 硬盘监控部署完成！"
    echo ""
    echo "📊 监控信息："
    echo "- 数据将发送到你现有的 InfluxDB"
    echo "- 可以在 Grafana 中创建仪表板查看数据"
    echo "- 已禁用本地日志文件以节省硬盘空间"
    # NOTE(review): the 30-second interval below is a fixed message; confirm it
    # matches collection_interval in the group_vars file.
    echo "- 监控数据每30秒收集一次"
    echo ""
    echo "🔧 下一步："
    echo "1. 在 Grafana 中创建 Nomad 集群监控仪表板"
    echo "2. 设置硬盘使用率告警规则"
    echo "3. 可以运行以下命令检查监控状态："
    echo "   ansible all -i ${INVENTORY_FILE} -m shell -a 'systemctl status telegraf'"
else
    echo "❌ 部署失败，请检查错误信息" >&2
    exit 1
fi