---
# Playbook: install Telegraf on every host and ship disk/system metrics to an
# existing InfluxDB 2.x + Grafana stack.
#
# Two configuration modes are supported:
#   * remote mode - telegraf_config_url is non-empty: a custom systemd unit is
#     rendered from telegraf.service.j2 (NOTE(review): presumably that unit
#     makes Telegraf fetch its config from the URL - confirm in the template).
#   * local mode  - telegraf_config_url is empty: /etc/telegraf/telegraf.conf
#     is rendered from telegraf.conf.j2.
#
# Required variable: influxdb_token (pass with -e, or define it in
# inventory / vault). All other variables below have defaults that can be
# overridden with -e.
- name: 部署 Telegraf 硬盘监控到 Nomad 集群
  hosts: all
  become: true

  vars:
    # Connection to the existing InfluxDB 2.x + Grafana monitoring stack.
    # These are literal defaults on purpose: a play var that references itself
    # (e.g. influxdb_url: "{{ influxdb_url | default(...) }}") raises
    # "recursive loop detected in template string" whenever it is not
    # overridden by an extra var. Extra vars (-e) still take precedence.
    influxdb_url: 'http://influxdb1.tailnet-68f9.ts.net:8086'
    influxdb_org: 'nomad'
    influxdb_bucket: 'nomad_monitoring'

    # Remote Telegraf configuration mode (preferred).
    use_remote_config: true
    telegraf_config_url: ''

    # Disk usage thresholds, in percent.
    disk_usage_warning: 80   # warning at 80% usage
    disk_usage_critical: 90  # critical alert at 90% usage

    # Collection interval, in seconds.
    collection_interval: 30

  tasks:
    # Fail early with a clear message instead of an undefined-variable error
    # deep inside a template.
    - name: 校验必需变量 influxdb_token
      assert:
        that:
          - influxdb_token is defined
          - influxdb_token | string | length > 0
        fail_msg: "缺少 influxdb_token,请通过 -e influxdb_token=... 或 inventory/vault 提供"
        quiet: true

    - name: 显示正在处理的节点
      debug:
        msg: "🔧 正在为节点 {{ inventory_hostname }} 安装硬盘监控"

    # NOTE(review): apt_key is deprecated upstream in favour of a keyring file
    # plus "signed-by"; left unchanged here to keep the repository layout as is.
    # An explicit `until` is used because older ansible-core releases ignore
    # `retries` when no `until` condition is given.
    - name: 添加 InfluxData 仓库密钥
      apt_key:
        url: https://repos.influxdata.com/influxdata-archive_compat.key
        state: present
      register: influxdata_key_result
      until: influxdata_key_result is succeeded
      retries: 3
      delay: 5

    - name: 添加 InfluxData 仓库
      apt_repository:
        repo: "deb https://repos.influxdata.com/ubuntu {{ ansible_distribution_release }} stable"
        state: present
        update_cache: true
      register: influxdata_repo_result
      until: influxdata_repo_result is succeeded
      retries: 3
      delay: 5

    - name: 安装 Telegraf
      apt:
        name: telegraf
        state: present
        update_cache: true
      register: telegraf_install_result
      until: telegraf_install_result is succeeded
      retries: 3
      delay: 10

    - name: 创建 Telegraf 配置目录
      file:
        path: /etc/telegraf/telegraf.d
        state: directory
        owner: telegraf
        group: telegraf
        mode: '0755'

    # Best-effort clean-up to save disk space; failures are deliberately
    # ignored. This also covers the former separate task that removed
    # /var/log/telegraf a second time (it was an exact duplicate).
    - name: 清理旧的 Telegraf 日志文件(节省硬盘空间)
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /var/log/telegraf
        - /var/log/telegraf.log
      ignore_errors: true

    # Mode 0600 / root-owned: NOTE(review): this file presumably carries the
    # InfluxDB token - confirm in telegraf-env.j2.
    - name: 创建 Telegraf 环境变量文件
      template:
        src: telegraf-env.j2
        dest: /etc/default/telegraf
        owner: root
        group: root
        mode: '0600'
        backup: true
      notify: restart telegraf

    # Remote mode only: install the custom unit file.
    - name: 创建 Telegraf systemd 服务文件(支持远程配置)
      template:
        src: telegraf.service.j2
        dest: /etc/systemd/system/telegraf.service
        owner: root
        group: root
        mode: '0644'
        backup: true
      notify:
        - reload systemd
        - restart telegraf
      when: telegraf_config_url | default('') | string | length > 0

    # Local mode only: render the main configuration file on the host.
    - name: 生成 Telegraf 主配置文件(本地配置模式)
      template:
        src: telegraf.conf.j2
        dest: /etc/telegraf/telegraf.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: true
      notify: restart telegraf
      when: telegraf_config_url | default('') | string | length == 0

    - name: 生成硬盘监控配置
      template:
        src: disk-monitoring.conf.j2
        dest: /etc/telegraf/telegraf.d/disk-monitoring.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: true
      notify: restart telegraf

    - name: 生成系统监控配置
      template:
        src: system-monitoring.conf.j2
        dest: /etc/telegraf/telegraf.d/system-monitoring.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: true
      notify: restart telegraf

    # Run pending reload/restart handlers now, so the status reported below
    # reflects the service running with the configuration just written rather
    # than the state before the end-of-play restart.
    - name: 立即执行待处理的 handlers
      meta: flush_handlers

    - name: 启用并启动 Telegraf 服务
      systemd:
        name: telegraf
        state: started
        enabled: true
        daemon_reload: true

    # Calling the systemd module with only a name makes no change; it is used
    # here to register the unit's status.
    - name: 验证 Telegraf 状态
      systemd:
        name: telegraf
      register: telegraf_status

    # The check expects HTTP 204 from /ping; uri only accepts 200 by default,
    # so 204 must be listed explicitly or the task is reported as failed.
    # Runs once from the controller, without privilege escalation.
    - name: 检查 InfluxDB 连接
      uri:
        url: "{{ influxdb_url }}/ping"
        method: GET
        timeout: 5
        status_code: 204
      register: influxdb_ping
      ignore_errors: true
      delegate_to: localhost
      become: false
      run_once: true

    - name: 显示 InfluxDB 连接状态
      debug:
        msg: "{{ '✅ InfluxDB 连接正常' if influxdb_ping is succeeded else '❌ InfluxDB 连接失败,请检查配置' }}"
      run_once: true

    - name: 显示 Telegraf 状态
      debug:
        msg: "✅ Telegraf 状态: {{ telegraf_status.status.ActiveState }}"

    # Read-only report of per-filesystem usage against disk_usage_warning.
    # `df -P` guarantees one line per filesystem; the header is skipped by
    # line number (not by its text, which is locale-dependent), and rows whose
    # usage column is not numeric are skipped instead of breaking the compare.
    - name: 检查硬盘使用情况
      shell: |
        df -P | awk -v warn={{ disk_usage_warning | int }} '
          NR > 1 && $0 !~ /tmpfs|cdrom|udev/ {
            usage = $5
            sub(/%/, "", usage)
            if (usage !~ /^[0-9]+$/) next
            if (usage + 0 >= warn)
              printf "⚠️ 警告: %s (%s) 使用率 %s%%\n", $6, $1, usage
            else
              printf "✅ %s (%s) 使用率 %s%%\n", $6, $1, usage
          }'
      register: disk_check
      changed_when: false

    - name: 显示硬盘检查结果
      debug:
        msg: "{{ disk_check.stdout_lines }}"

  # Handlers run in the order listed here, so systemd is reloaded before
  # Telegraf is restarted when both are notified.
  handlers:
    - name: reload systemd
      systemd:
        daemon_reload: true

    - name: restart telegraf
      systemd:
        name: telegraf
        state: restarted