# mgmt/configuration/playbooks/maintenance/ops-toolkit.yml
#
# 131 lines
# 5.0 KiB
# YAML

---
# Operations toolkit dashboard.
#
# For every targeted host this play prints a system overview built from
# gathered facts, runs a few read-only shell probes (disk, memory, load,
# processes, journal errors, network), reports the state of a fixed list of
# services, and finally prints the catalogue of companion playbooks together
# with maintenance recommendations. No task in this play requests a state
# change on the host.
- name: Operations Toolkit - Unified Management Dashboard
  hosts: all
  # Facts are required: the overview task reads ansible_* fact variables.
  gather_facts: true
  vars:
    # Available operations scripts
    available_scripts:
      - name: system-update
        desc: System package updates
        file: system-update.yml
      - name: system-cleanup
        desc: System cleanup and maintenance
        file: system-cleanup.yml
      - name: service-health
        desc: Service health monitoring
        file: service-health-check.yml
      - name: security-hardening
        desc: Security hardening and backup
        file: security-hardening.yml
      - name: docker-management
        desc: Docker container management
        file: docker-management.yml
      - name: network-connectivity
        desc: Network connectivity check
        file: network-connectivity.yml
      - name: certificate-management
        desc: SSL certificate monitoring
        file: certificate-management.yml

  tasks:
    # Display system overview
    - name: Display system overview
      debug:
        # The IP line falls back to N/A so a host without a default IPv4
        # route does not abort the overview with an undefined-variable error.
        msg: |
          🖥️ System Overview for {{ inventory_hostname }}:
          📊 OS: {{ ansible_distribution }} {{ ansible_distribution_version }}
          💾 Memory: {{ (ansible_memtotal_mb/1024)|round(1) }}GB total, {{ (ansible_memfree_mb/1024)|round(1) }}GB free
          💿 CPU: {{ ansible_processor_vcpus }} cores
          🏠 Architecture: {{ ansible_architecture }}
          🌐 IP: {{ (ansible_default_ipv4 | default({})).address | default('N/A') }}
          ⏰ Uptime: {{ ansible_uptime_seconds//86400 }}d {{ (ansible_uptime_seconds%86400)//3600 }}h {{ ((ansible_uptime_seconds%3600)//60) }}m

    # Quick system status check
    - name: Quick system status check
      shell: |
        echo "=== DISK USAGE ==="
        df -h | grep -E "(Filesystem|/dev/)"
        echo ""
        echo "=== MEMORY USAGE ==="
        free -h
        echo ""
        echo "=== LOAD AVERAGE ==="
        uptime
        echo ""
        echo "=== TOP PROCESSES ==="
        ps aux --sort=-%cpu | head -6
      register: quick_status
      # The commands above only read state; do not report the host as changed.
      changed_when: false

    - name: Display quick status
      debug:
        # default([]) keeps this task usable when the probe was skipped
        # (for example in check mode) and registered no stdout.
        msg: "{{ quick_status.stdout_lines | default([]) }}"

    # Check status of critical services
    - name: Check critical services
      # Only the unit name is passed (no state/enabled); the registered
      # result is consumed by the display task below for its status.
      systemd:
        name: "{{ item }}"
      register: service_status
      loop:
        - ssh
        - systemd-resolved
        - cron
      # A failing lookup must not stop the dashboard; the display task
      # prints NOT FOUND when no status was returned for a unit.
      failed_when: false

    - name: Display service status
      debug:
        msg: "🔧 {{ item.item }}: {{ item.status.ActiveState if item.status is defined else 'NOT FOUND' }}"
      loop: "{{ service_status.results }}"
      loop_control:
        # Label each iteration with the unit name instead of the full
        # registered result to keep the output readable.
        label: "{{ item.item }}"

    # Check recent system log errors
    - name: Check recent system errors
      shell: journalctl --since "1 hour ago" --priority=err --no-pager | tail -10
      register: recent_errors
      # Reading the journal changes nothing on the host.
      changed_when: false
      failed_when: false

    - name: Display recent errors
      debug:
        # default(..., true) substitutes the placeholder both when stdout is
        # empty and when the probe registered no stdout at all.
        msg: "🚨 Recent Errors: {{ recent_errors.stdout_lines | default(['No recent errors found'], true) }}"

    # Check network connectivity
    - name: Quick network check
      shell: |
        echo "=== NETWORK INTERFACES ==="
        ip -br addr show
        echo ""
        echo "=== DEFAULT ROUTE ==="
        ip route | grep default
        echo ""
        echo "=== DNS TEST ==="
        nslookup google.com | grep -A1 "Name:" || echo "DNS resolution failed"
      register: network_check
      # Inspection only; never report a change.
      changed_when: false
      failed_when: false

    - name: Display network status
      debug:
        msg: "🌐 Network Status: {{ network_check.stdout_lines | default([]) }}"

    # Display available operations scripts
    - name: Display available operations scripts
      debug:
        msg: |
          🛠️ Available Operations Scripts:
          {% for script in available_scripts %}
          {{ loop.index }}. {{ script.name }}: {{ script.desc }}
          {% endfor %}
          💡 Usage Examples:
          ansible-playbook -i inventory.ini system-cleanup.yml --limit {{ inventory_hostname }}
          ansible-playbook -i inventory.ini docker-management.yml --limit lxc
          ansible-playbook -i inventory.ini network-connectivity.yml --limit proxmox

    # Generate maintenance recommendations
    - name: Generate maintenance recommendations
      debug:
        msg: |
          💡 Maintenance Recommendations for {{ inventory_hostname }}:
          🔄 Regular Tasks (Weekly):
          - Run system-cleanup.yml to free up disk space
          - Check service-health-check.yml for service status
          - Review certificate-management.yml for expiring certificates
          🔒 Security Tasks (Monthly):
          - Execute security-hardening.yml for security updates
          - Review network-connectivity.yml for network security
          🐳 Container Tasks (As needed):
          - Use docker-management.yml for Docker maintenance
          📊 Monitoring Tasks (Daily):
          - Quick check with ops-toolkit.yml (this script)
          ⚡ Emergency Tasks:
          - Use system-update.yml for critical security patches
          - Run network-connectivity.yml for connectivity issues