# mgmt/ansible/service-health-check.yml
#
# 135 lines
# 4.0 KiB
# YAML

---
# Service health report.
#
# Queries systemd for the state of several groups of units, then prints
# system load, file systems above 80% usage, memory usage, and the most
# recent error-level journal entries. Every task only queries; nothing is
# started, stopped, or reconfigured, and probe failures on individual units
# are tolerated so one missing service does not abort the report.
- name: Service Health Check and Monitoring
  hosts: all
  become: true
  gather_facts: true

  vars:
    # Units that are reported whenever they are not in the "active" state.
    critical_services:
      - ssh
      - systemd-resolved
      - cron
    # The remaining groups are informational: their state is printed as-is.
    web_services:
      - nginx
      - apache2
    database_services:
      - mysql
      - mariadb
      - postgresql
    container_services:
      - docker
      - containerd
    network_services:
      - tailscale
      - cloudflared

  tasks:
    # Check critical system services
    - name: Check critical system services
      systemd:
        name: "{{ item }}"
      register: critical_service_status
      loop: "{{ critical_services }}"
      # A failed probe must not stop the play; it is reported below instead.
      failed_when: false

    - name: Report critical service issues
      debug:
        msg: "⚠️ Critical service {{ item.item }} is {{ (item.status | default({})).ActiveState | default('not found') }}"
      loop: "{{ critical_service_status.results }}"
      loop_control:
        # Keep the per-item header short instead of dumping the whole result.
        label: "{{ item.item }}"
      # Report both "not active" and "probe returned no status at all";
      # the latter previously slipped through unreported.
      when: >-
        item.status is not defined
        or (item.status.ActiveState | default('')) != "active"

    # Check web services
    - name: Check web services
      systemd:
        name: "{{ item }}"
      register: web_service_status
      loop: "{{ web_services }}"
      failed_when: false

    - name: Report web service status
      debug:
        msg: "🌐 Web service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
      loop: "{{ web_service_status.results }}"
      loop_control:
        label: "{{ item.item }}"
      when: item.status is defined

    # Check database services
    - name: Check database services
      systemd:
        name: "{{ item }}"
      register: db_service_status
      loop: "{{ database_services }}"
      failed_when: false

    - name: Report database service status
      debug:
        msg: "🗄️ Database service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
      loop: "{{ db_service_status.results }}"
      loop_control:
        label: "{{ item.item }}"
      when: item.status is defined

    # Check container services
    - name: Check container services
      systemd:
        name: "{{ item }}"
      register: container_service_status
      loop: "{{ container_services }}"
      failed_when: false

    - name: Report container service status
      debug:
        msg: "📦 Container service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
      loop: "{{ container_service_status.results }}"
      loop_control:
        label: "{{ item.item }}"
      when: item.status is defined

    # Check network services
    - name: Check network services
      systemd:
        name: "{{ item }}"
      register: network_service_status
      loop: "{{ network_services }}"
      failed_when: false

    - name: Report network service status
      debug:
        msg: "🌐 Network service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
      loop: "{{ network_service_status.results }}"
      loop_control:
        label: "{{ item.item }}"
      when: item.status is defined

    # Check system load
    - name: Check system load
      # No shell features are needed, so run the binary directly.
      command: uptime
      register: system_load
      # Read-only probe: never report "changed".
      changed_when: false

    - name: Display system load
      debug:
        msg: "📊 System Load: {{ system_load.stdout }}"

    # Check disk space warnings
    - name: Check disk space usage
      # -P keeps each file system on one line. "$5 + 0" turns "85%" into the
      # number 85; comparing the raw field is a string comparison, which
      # always matches the header row and misorders values such as "100%".
      shell: df -hP | awk 'NR > 1 && $5 + 0 > 80'
      register: disk_warning
      changed_when: false

    - name: Warn about high disk usage
      debug:
        msg: "⚠️ High disk usage detected: {{ disk_warning.stdout_lines }}"
      when: disk_warning.stdout_lines | length > 0

    # Check memory usage
    - name: Check memory usage percentage
      # Second line of `free` is the "Mem:" row: used ($3) over total ($2).
      shell: free | awk 'NR==2{printf "%.2f%%", $3*100/$2}'
      register: memory_percent
      changed_when: false

    - name: Display memory usage
      debug:
        msg: "🧠 Memory Usage: {{ memory_percent.stdout }}"

    # Check recent system errors
    - name: Check recent system errors
      # --quiet drops journalctl's informational lines (e.g. "-- No entries --")
      # so an empty result really is empty and the report below is skipped.
      shell: journalctl --since "1 hour ago" --priority=err --no-pager --quiet | tail -10
      register: recent_errors
      changed_when: false

    - name: Display recent errors
      debug:
        msg: "🚨 Recent system errors: {{ recent_errors.stdout_lines }}"
      when: recent_errors.stdout_lines | length > 0