187 lines
5.2 KiB
YAML
187 lines
5.2 KiB
YAML
---
|
|
# Play: install and configure Telegraf disk monitoring on every inventory host.
- name: 部署 Telegraf 硬盘监控到 Nomad 集群
  hosts: all
  become: true
  vars:
    # Connection settings for the existing InfluxDB 2.x + Grafana stack.
    # These are plain defaults: a play var must not be defined in terms of
    # itself ("{{ x | default(...) }}"), because Ansible then fails with a
    # recursive-loop templating error. Override any of them with extra vars (-e).
    influxdb_url: "http://influxdb1.tailnet-68f9.ts.net:8086"
    # influxdb_token intentionally has no default here; supply it through
    # extra vars, inventory, or a vault file.
    influxdb_org: "nomad"
    influxdb_bucket: "nomad_monitoring"

    # Remote Telegraf configuration mode (preferred).
    # An empty telegraf_config_url selects the local-configuration tasks below.
    use_remote_config: true
    telegraf_config_url: ""

    # Disk usage thresholds, in percent.
    disk_usage_warning: 80   # warn at 80% usage
    disk_usage_critical: 90  # critical alert at 90% usage

    # Collection interval, in seconds.
    collection_interval: 30
|
|
|
|
tasks:
|
|
# Print which inventory host is currently being processed.
- name: 显示正在处理的节点
  debug:
    msg: '🔧 正在为节点 {{ inventory_hostname }} 安装硬盘监控'
|
|
|
|
# Import the InfluxData APT signing key.
# `until` is required for the retry to take effect on older Ansible
# releases, which ignore `retries` when no `until` condition is given.
- name: 添加 InfluxData 仓库密钥
  apt_key:
    url: https://repos.influxdata.com/influxdata-archive_compat.key
    state: present
  register: influxdata_key_result
  until: influxdata_key_result is succeeded
  retries: 3
  delay: 5
|
|
|
|
# Register the InfluxData APT repository for the host's release codename
# and refresh the package cache.
# `until` makes the retry effective on Ansible releases that ignore
# `retries` without an explicit condition.
- name: 添加 InfluxData 仓库
  apt_repository:
    repo: "deb https://repos.influxdata.com/ubuntu {{ ansible_distribution_release }} stable"
    state: present
    update_cache: true
  register: influxdata_repo_result
  until: influxdata_repo_result is succeeded
  retries: 3
  delay: 5
|
|
|
|
# Install the telegraf package, retrying on failure (e.g. a transient
# mirror or lock error).
# `until` makes the retry effective on Ansible releases that ignore
# `retries` without an explicit condition.
- name: 安装 Telegraf
  apt:
    name: telegraf
    state: present
    update_cache: true
  register: telegraf_install_result
  until: telegraf_install_result is succeeded
  retries: 3
  delay: 10
|
|
|
|
# Ensure the drop-in configuration directory exists, owned by telegraf.
- name: 创建 Telegraf 配置目录
  file:
    state: directory
    path: /etc/telegraf/telegraf.d
    mode: "0755"
    owner: telegraf
    group: telegraf
|
|
|
|
# Best-effort removal of old Telegraf log paths to free disk space;
# failures are ignored so the play continues.
- name: 清理旧的 Telegraf 日志文件(节省硬盘空间)
  file:
    state: absent
    path: "{{ item }}"
  loop: ["/var/log/telegraf", "/var/log/telegraf.log"]
  ignore_errors: true
|
|
|
|
# Remove /var/log/telegraf; errors are ignored.
# NOTE(review): this only deletes the path (it does not by itself stop the
# directory from being recreated), and the preceding cleanup task already
# removes the same path — confirm whether this task is still needed.
- name: 禁用 Telegraf 日志目录创建
  file:
    state: absent
    path: /var/log/telegraf
  ignore_errors: true
|
|
|
|
# Render the Telegraf environment file from telegraf-env.j2.
# Mode 0600 and root ownership keep it readable by root only; a backup of
# any previous file is kept, and Telegraf is restarted on change.
- name: 创建 Telegraf 环境变量文件
  template:
    dest: /etc/default/telegraf
    src: telegraf-env.j2
    mode: "0600"
    owner: root
    group: root
    backup: true
  notify:
    - restart telegraf
|
|
|
|
# Remote-config mode only: install a custom systemd unit rendered from
# telegraf.service.j2, then reload systemd and restart Telegraf on change.
# Runs only when telegraf_config_url is defined and non-empty.
- name: 创建 Telegraf systemd 服务文件(支持远程配置)
  when:
    - telegraf_config_url is defined
    - telegraf_config_url != ''
  template:
    dest: /etc/systemd/system/telegraf.service
    src: telegraf.service.j2
    mode: "0644"
    owner: root
    group: root
    backup: true
  notify:
    - reload systemd
    - restart telegraf
|
|
|
|
# Local-config mode only: render the main telegraf.conf from
# telegraf.conf.j2. Skipped whenever a non-empty telegraf_config_url is set.
- name: 生成 Telegraf 主配置文件(本地配置模式)
  when: not (telegraf_config_url is defined and telegraf_config_url != '')
  template:
    dest: /etc/telegraf/telegraf.conf
    src: telegraf.conf.j2
    mode: "0644"
    owner: telegraf
    group: telegraf
    backup: true
  notify:
    - restart telegraf
|
|
|
|
# Render the disk-monitoring drop-in from disk-monitoring.conf.j2 and
# restart Telegraf when it changes.
- name: 生成硬盘监控配置
  template:
    dest: /etc/telegraf/telegraf.d/disk-monitoring.conf
    src: disk-monitoring.conf.j2
    mode: "0644"
    owner: telegraf
    group: telegraf
    backup: true
  notify:
    - restart telegraf
|
|
|
|
# Render the system-monitoring drop-in from system-monitoring.conf.j2 and
# restart Telegraf when it changes.
- name: 生成系统监控配置
  template:
    dest: /etc/telegraf/telegraf.d/system-monitoring.conf
    src: system-monitoring.conf.j2
    mode: "0644"
    owner: telegraf
    group: telegraf
    backup: true
  notify:
    - restart telegraf
|
|
|
|
# Reload systemd unit definitions, enable Telegraf at boot, and make sure
# it is running.
- name: 启用并启动 Telegraf 服务
  systemd:
    daemon_reload: true
    name: telegraf
    enabled: true
    state: started
|
|
|
|
# Run any pending handlers (systemd reload / Telegraf restart) now, so the
# status captured below reflects the service after the new configuration
# has been applied rather than the state from before the restart.
- name: Flush handlers before verifying Telegraf
  meta: flush_handlers

# Query the telegraf unit without changing it; the result is stored in
# telegraf_status for later reporting.
- name: 验证 Telegraf 状态
  systemd:
    name: telegraf
  register: telegraf_status
|
|
|
|
# Probe the InfluxDB /ping endpoint once, from the control node.
# - status_code: the uri module accepts only 200 unless told otherwise,
#   while the follow-up task treats 204 as the healthy answer, so 204
#   must be listed or a healthy ping is reported as a failed task.
# - become: false: a plain HTTP GET on the control node needs no privilege
#   escalation, and the play-level `become` would otherwise apply here.
# - ignore_errors: the probe is informational; an unreachable server must
#   not abort the deployment.
- name: 检查 InfluxDB 连接
  uri:
    url: "{{ influxdb_url }}/ping"
    method: GET
    timeout: 5
    status_code: [200, 204]
  register: influxdb_ping
  ignore_errors: true
  become: false
  delegate_to: localhost
  run_once: true
|
|
|
|
# Report the ping outcome once for the whole play.
# `default(-1)` guards against the ping result lacking a `status` key
# (the ping task ignores errors, so it may have failed before any HTTP
# exchange); the failure message is then shown instead of an
# undefined-variable error.
- name: 显示 InfluxDB 连接状态
  debug:
    msg: "{{ '✅ InfluxDB 连接正常' if (influxdb_ping.status | default(-1)) == 204 else '❌ InfluxDB 连接失败,请检查配置' }}"
  run_once: true
|
|
|
|
# Print the ActiveState field of the registered telegraf_status result.
- name: 显示 Telegraf 状态
  debug:
    msg: '✅ Telegraf 状态: {{ telegraf_status.status.ActiveState }}'
|
|
|
|
# Print one line per filesystem, flagging those at or above
# disk_usage_warning percent. Read-only, hence changed_when: false.
# - `df -P` keeps every filesystem on a single line, so long device names
#   cannot shift the columns.
# - The header is skipped by row number rather than by matching its text,
#   which differs between locales.
# - Rows mentioning tmpfs, cdrom or udev are excluded.
# - Rows whose usage field is not a plain number (e.g. "-") are skipped
#   instead of breaking the numeric comparison.
- name: 检查硬盘使用情况
  shell: |
    df -P | awk 'NR > 1 && !/tmpfs|cdrom|udev/ { sub(/%/, "", $5); print $5, $1, $6 }' | while read -r usage partition mount
    do
      case "$usage" in
        ''|*[!0-9]*) continue ;;
      esac
      if [ "$usage" -ge {{ disk_usage_warning }} ]; then
        echo "⚠️ 警告: $mount ($partition) 使用率 $usage%"
      else
        echo "✅ $mount ($partition) 使用率 $usage%"
      fi
    done
  register: disk_check
  changed_when: false
|
|
|
|
# Print the lines produced by the disk usage check.
- name: 显示硬盘检查结果
  debug:
    msg: '{{ disk_check.stdout_lines }}'
|
|
|
|
# Handlers triggered through `notify` by the tasks of this play.
handlers:
  # Make systemd re-read unit files.
  - name: reload systemd
    systemd:
      daemon_reload: true

  # Restart the Telegraf service.
  - name: restart telegraf
    systemd:
      state: restarted
      name: telegraf