# File listing metadata: 187 lines, 5.2 KiB, YAML
---
# Installs Telegraf on every inventory host and sets up disk and system
# monitoring that connects to an existing InfluxDB 2.x + Grafana stack.
#
# The entries under `vars` are plain defaults; override any of them with
# extra vars (`-e name=value`). Do not write them as
# `name: "{{ name | default(...) }}"`: a play var that references itself fails
# with "recursive loop detected in template string" whenever no extra var
# overrides it.
#
# `influxdb_token` intentionally has no default and is not declared here.
# Supply it with `-e influxdb_token=...`, inventory variables or a vault file.
# NOTE(review): presumably consumed by the Jinja templates — confirm.
- name: 部署 Telegraf 硬盘监控到 Nomad 集群
  hosts: all
  become: true
  vars:
    # Existing InfluxDB 2.x + Grafana monitoring stack to connect to.
    influxdb_url: "http://influxdb1.tailnet-68f9.ts.net:8086"
    influxdb_org: "nomad"
    influxdb_bucket: "nomad_monitoring"

    # Remote Telegraf configuration mode (preferred).
    use_remote_config: true
    # An empty URL selects the local configuration mode: the tasks in this
    # play switch on whether this value is empty.
    telegraf_config_url: ""

    # Disk usage thresholds, in percent.
    disk_usage_warning: 80  # warning at 80% usage
    disk_usage_critical: 90  # critical alert at 90% usage

    # Collection interval, in seconds.
    collection_interval: 30

  tasks:
    # Progress banner: prints which inventory host the play is working on.
    - name: 显示正在处理的节点
      ansible.builtin.debug:
        msg: "🔧 正在为节点 {{ inventory_hostname }} 安装硬盘监控"
    # Add the InfluxData archive signing key to apt's trusted keys.
    # The download is retried up to 3 times, 5 seconds apart. `until` is set
    # explicitly because Ansible releases without implicit retry-on-failure
    # ignore `retries` when no `until` condition is given.
    - name: 添加 InfluxData 仓库密钥
      ansible.builtin.apt_key:
        url: https://repos.influxdata.com/influxdata-archive_compat.key
        state: present
      register: influxdata_key_result
      until: influxdata_key_result is succeeded
      retries: 3
      delay: 5
    # Register the InfluxData apt repository for this host's release codename
    # and refresh the package cache.
    # NOTE(review): the repository path is hard-coded to `ubuntu`; confirm that
    # every targeted host is an Ubuntu system.
    # `until` is set explicitly because Ansible releases without implicit
    # retry-on-failure ignore `retries` when no `until` condition is given.
    - name: 添加 InfluxData 仓库
      ansible.builtin.apt_repository:
        repo: "deb https://repos.influxdata.com/ubuntu {{ ansible_distribution_release }} stable"
        state: present
        update_cache: true
      register: influxdata_repo_result
      until: influxdata_repo_result is succeeded
      retries: 3
      delay: 5
    # Install the telegraf package after refreshing the apt cache.
    # Retried up to 3 times, 10 seconds apart. `until` is set explicitly
    # because Ansible releases without implicit retry-on-failure ignore
    # `retries` when no `until` condition is given.
    - name: 安装 Telegraf
      ansible.builtin.apt:
        name: telegraf
        state: present
        update_cache: true
      register: telegraf_install_result
      until: telegraf_install_result is succeeded
      retries: 3
      delay: 10
    # Ensure the drop-in directory for the monitoring config fragments exists,
    # owned by the telegraf user and group.
    - name: 创建 Telegraf 配置目录
      ansible.builtin.file:
        state: directory
        path: /etc/telegraf/telegraf.d
        mode: "0755"
        owner: telegraf
        group: telegraf
    # Delete old Telegraf log locations to save disk space. Best effort: a
    # failure here is ignored and the play continues.
    - name: 清理旧的 Telegraf 日志文件(节省硬盘空间)
      ansible.builtin.file:
        state: absent
        path: "{{ log_path }}"
      loop:
        - /var/log/telegraf
        - /var/log/telegraf.log
      loop_control:
        loop_var: log_path
      ignore_errors: true
    # Make sure /var/log/telegraf does not exist. This path is also removed by
    # the log clean-up task in this play, so this is normally a no-op.
    # Failures are ignored.
    - name: 禁用 Telegraf 日志目录创建
      ansible.builtin.file:
        state: absent
        path: /var/log/telegraf
      ignore_errors: true
    # Render the Telegraf environment file. It is readable by root only
    # (mode 0600); the previous file is kept as a backup, and a change
    # schedules a Telegraf restart.
    - name: 创建 Telegraf 环境变量文件
      ansible.builtin.template:
        dest: /etc/default/telegraf
        src: telegraf-env.j2
        mode: "0600"
        owner: root
        group: root
        backup: true
      notify:
        - restart telegraf
    # Remote configuration mode only: install a custom telegraf.service unit.
    # Runs when telegraf_config_url is defined and non-empty (list items under
    # `when` are AND-ed). A change reloads systemd and restarts Telegraf.
    - name: 创建 Telegraf systemd 服务文件(支持远程配置)
      ansible.builtin.template:
        dest: /etc/systemd/system/telegraf.service
        src: telegraf.service.j2
        mode: "0644"
        owner: root
        group: root
        backup: true
      when:
        - telegraf_config_url is defined
        - telegraf_config_url != ''
      notify:
        - reload systemd
        - restart telegraf
    # Local configuration mode only: render the main telegraf.conf.
    # Runs when telegraf_config_url is undefined or empty, i.e. exactly when
    # the remote-mode unit file task is skipped.
    - name: 生成 Telegraf 主配置文件(本地配置模式)
      ansible.builtin.template:
        dest: /etc/telegraf/telegraf.conf
        src: telegraf.conf.j2
        mode: "0644"
        owner: telegraf
        group: telegraf
        backup: true
      when: not (telegraf_config_url is defined and telegraf_config_url != '')
      notify:
        - restart telegraf
    # Render the disk monitoring fragment into telegraf.d; a change schedules
    # a Telegraf restart and the previous file is kept as a backup.
    - name: 生成硬盘监控配置
      ansible.builtin.template:
        dest: /etc/telegraf/telegraf.d/disk-monitoring.conf
        src: disk-monitoring.conf.j2
        mode: "0644"
        owner: telegraf
        group: telegraf
        backup: true
      notify:
        - restart telegraf
    # Render the system monitoring fragment into telegraf.d; a change
    # schedules a Telegraf restart and the previous file is kept as a backup.
    - name: 生成系统监控配置
      ansible.builtin.template:
        dest: /etc/telegraf/telegraf.d/system-monitoring.conf
        src: system-monitoring.conf.j2
        mode: "0644"
        owner: telegraf
        group: telegraf
        backup: true
      notify:
        - restart telegraf
    # Reload systemd unit definitions, then make sure the telegraf unit is
    # enabled at boot and currently running.
    - name: 启用并启动 Telegraf 服务
      ansible.builtin.systemd:
        name: telegraf
        daemon_reload: true
        enabled: true
        state: started
    # Run any pending "reload systemd" / "restart telegraf" handlers now.
    # Without this they only run at the end of the play, so the state captured
    # below would describe the service from before it picked up the newly
    # rendered configuration.
    - name: 立即执行待处理的 Telegraf 处理器
      ansible.builtin.meta: flush_handlers

    # Query the telegraf unit without changing it (no `state` is requested) and
    # keep the result in `telegraf_status` for the status report task.
    - name: 验证 Telegraf 状态
      ansible.builtin.systemd:
        name: telegraf
      register: telegraf_status
    # Probe the InfluxDB /ping endpoint once for the whole play, from the
    # control node (delegate_to: localhost), and store the result in
    # `influxdb_ping`. Failures are ignored so that an unreachable InfluxDB
    # is reported by the follow-up task instead of aborting the play.
    #
    # status_code: the uri module accepts only 200 unless told otherwise, while
    # the follow-up report treats 204 as healthy; accept 204 so a healthy
    # server is not shown as a failed (ignored) task.
    # become: an HTTP GET needs no privilege escalation, and the play-level
    # `become` would otherwise require sudo on the control node.
    # NOTE(review): this checks reachability from the control node, not from
    # the monitored hosts — confirm that is the intent.
    - name: 检查 InfluxDB 连接
      ansible.builtin.uri:
        url: "{{ influxdb_url }}/ping"
        method: GET
        timeout: 5
        status_code: 204
      register: influxdb_ping
      ignore_errors: true
      delegate_to: localhost
      become: false
      run_once: true
    # Report the outcome of the InfluxDB ping once: healthy only when the
    # recorded HTTP status is 204. `default(-1)` covers an ignored failure that
    # left no `status` key in the registered result, so the report prints the
    # failure message instead of aborting on an undefined attribute.
    - name: 显示 InfluxDB 连接状态
      ansible.builtin.debug:
        msg: "{{ '✅ InfluxDB 连接正常' if (influxdb_ping.status | default(-1)) == 204 else '❌ InfluxDB 连接失败,请检查配置' }}"
      run_once: true
    # Print the ActiveState that the status query stored in `telegraf_status`.
    - name: 显示 Telegraf 状态
      ansible.builtin.debug:
        msg: "✅ Telegraf 状态: {{ telegraf_status.status.ActiveState }}"
    # Print one line per mounted filesystem with its usage percentage, marked
    # as a warning when usage reaches `disk_usage_warning`. Read-only check, so
    # it never reports "changed"; the output is stored in `disk_check`.
    #
    # - `df -P` keeps every filesystem on a single line; without it long device
    #   names wrap and shift the columns.
    # - The header is skipped by position (NR > 1) instead of by matching the
    #   English word "Filesystem", so the filter works in any locale.
    # - Rows mentioning tmpfs, cdrom or udev are excluded.
    # - awk strips the percent sign; rows whose usage is not a plain number
    #   (for example "-") are skipped rather than fed to the numeric test.
    - name: 检查硬盘使用情况
      ansible.builtin.shell: |
        df -hP | awk 'NR > 1 && $0 !~ /tmpfs|cdrom|udev/ {sub(/%/, "", $5); print $5, $1, $6}' |
        while read -r usage partition mount; do
          case "$usage" in
            ''|*[!0-9]*) continue ;;
          esac
          if [ "$usage" -ge {{ disk_usage_warning }} ]; then
            echo "⚠️  警告: $mount ($partition) 使用率 $usage%"
          else
            echo "✅ $mount ($partition) 使用率 $usage%"
          fi
        done
      register: disk_check
      changed_when: false
    # Show the per-filesystem lines collected in `disk_check`.
    - name: 显示硬盘检查结果
      ansible.builtin.debug:
        msg: "{{ disk_check.stdout_lines }}"
  # Handlers run only when a task that notifies them reports a change.
  handlers:
    # Make systemd re-read its unit files (notified when telegraf.service is
    # rewritten).
    - name: reload systemd
      ansible.builtin.systemd:
        daemon_reload: true

    # Restart Telegraf so it picks up changed configuration. The state must be
    # exactly `restarted`; any trailing characters make it an invalid value.
    - name: restart telegraf
      ansible.builtin.systemd:
        name: telegraf
        state: restarted