---
# Service health check and monitoring play.
#
# Every task here is a read-only probe: it queries unit state, load, disk,
# memory and the journal, then prints findings with `debug`. Nothing is
# started, stopped or reconfigured, so the shell/command probes are marked
# `changed_when: false`.
#
# The service lists under `vars` can be overridden per host/group; each list
# is probed with `systemd` and the registered results feed the report task
# that follows it.
- name: Service Health Check and Monitoring
  hosts: all
  become: true
  gather_facts: true

  vars:
    critical_services:
      - ssh
      - systemd-resolved
      - cron
    web_services:
      - nginx
      - apache2
    database_services:
      - mysql
      - mariadb
      - postgresql
    container_services:
      - docker
      - containerd
    network_services:
      - tailscale
      - cloudflared

  tasks:
    # Check critical system services.
    # Only `name` is passed (no state/enabled), so the module is used purely
    # to collect the unit's status; `failed_when: false` keeps the play going
    # when a unit cannot be queried on a given host.
    - name: Check critical system services
      systemd:
        name: "{{ item }}"
      register: critical_service_status
      loop: "{{ critical_services }}"
      failed_when: false

    # Only units whose ActiveState is something other than "active" are
    # reported. The label keeps the per-item header to the unit name instead
    # of the full registered status dictionary.
    - name: Report critical service issues
      debug:
        msg: "⚠️ Critical service {{ item.item }} is {{ item.status.ActiveState | default('not found') }}"
      loop: "{{ critical_service_status.results }}"
      loop_control:
        label: "{{ item.item }}"
      when: item.status is defined and item.status.ActiveState != "active"

    # Check web services.
    - name: Check web services
      systemd:
        name: "{{ item }}"
      register: web_service_status
      loop: "{{ web_services }}"
      failed_when: false

    - name: Report web service status
      debug:
        msg: "🌐 Web service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
      loop: "{{ web_service_status.results }}"
      loop_control:
        label: "{{ item.item }}"
      when: item.status is defined

    # Check database services.
    - name: Check database services
      systemd:
        name: "{{ item }}"
      register: db_service_status
      loop: "{{ database_services }}"
      failed_when: false

    - name: Report database service status
      debug:
        msg: "🗄️ Database service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
      loop: "{{ db_service_status.results }}"
      loop_control:
        label: "{{ item.item }}"
      when: item.status is defined

    # Check container services.
    - name: Check container services
      systemd:
        name: "{{ item }}"
      register: container_service_status
      loop: "{{ container_services }}"
      failed_when: false

    - name: Report container service status
      debug:
        msg: "📦 Container service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
      loop: "{{ container_service_status.results }}"
      loop_control:
        label: "{{ item.item }}"
      when: item.status is defined

    # Check network services.
    - name: Check network services
      systemd:
        name: "{{ item }}"
      register: network_service_status
      loop: "{{ network_services }}"
      failed_when: false

    - name: Report network service status
      debug:
        msg: "🌐 Network service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
      loop: "{{ network_service_status.results }}"
      loop_control:
        label: "{{ item.item }}"
      when: item.status is defined

    # Check system load.
    # `uptime` needs no shell features, so it runs through `command`.
    - name: Check system load
      command: uptime
      register: system_load
      changed_when: false

    - name: Display system load
      debug:
        msg: "📊 System Load: {{ system_load.stdout }}"

    # Check for disk space warnings (usage above 80%).
    # The usage column is text such as "85%". Comparing it to 80 directly is
    # a string comparison in awk, which matches the header row and values
    # like "9%". `$5 + 0` forces a numeric comparison and `NR > 1` skips the
    # header. `-P` keeps every filesystem on a single output line so the
    # column position is stable.
    - name: Check disk space usage
      shell: >-
        df -hP | awk 'NR > 1 && $5 + 0 > 80'
      register: disk_warning
      changed_when: false

    - name: Warn about high disk usage
      debug:
        msg: "⚠️ High disk usage detected: {{ disk_warning.stdout_lines }}"
      when: disk_warning.stdout_lines | length > 0

    # Check memory usage percentage.
    # Row 2 of `free` output is used: column 3 divided by column 2, printed
    # as a percentage with two decimals.
    - name: Check memory usage percentage
      shell: free | awk 'NR==2{printf "%.2f%%", $3*100/$2}'
      register: memory_percent
      changed_when: false

    - name: Display memory usage
      debug:
        msg: "🧠 Memory Usage: {{ memory_percent.stdout }}"

    # Check recent system errors (last hour, priority err, newest 10 lines).
    # `--quiet` suppresses journalctl's informational lines so that an hour
    # without errors yields empty output and the report below is skipped.
    - name: Check recent system errors
      shell: journalctl --quiet --since "1 hour ago" --priority=err --no-pager | tail -n 10
      register: recent_errors
      changed_when: false

    - name: Display recent errors
      debug:
        msg: "🚨 Recent system errors: {{ recent_errors.stdout_lines }}"
      when: recent_errors.stdout_lines | length > 0