feat(监控): 添加Telegraf监控配置和磁盘监控脚本
refactor(容器): 从Docker迁移到Podman并更新Nomad配置 fix(配置): 修复代理和别名配置问题 docs(文档): 更新配置文件和脚本注释 chore(清理): 移除不再使用的Consul和Docker相关文件
This commit is contained in:
46
configuration/deploy-monitoring.sh
Executable file
46
configuration/deploy-monitoring.sh
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/bin/bash
# Nomad cluster disk-monitoring deployment script.
# Reuses the existing InfluxDB + Grafana monitoring stack.
#
# Usage: ./deploy-monitoring.sh   (run from the configuration/ directory)
# Requires: ansible-playbook on PATH, and a populated
#   inventories/production/group_vars/all.yml with InfluxDB settings.
set -euo pipefail

readonly config_file="inventories/production/group_vars/all.yml"
readonly inventory="inventories/production/nomad-cluster.ini"
readonly playbook="playbooks/setup-disk-monitoring.yml"

echo "🚀 开始部署 Nomad 集群硬盘监控..."

# Abort early when the InfluxDB connection settings have not been created yet.
if [[ ! -f "$config_file" ]]; then
    echo "❌ 配置文件不存在,请先配置 InfluxDB 连接信息"
    exit 1
fi

# Show the relevant monitoring settings so the operator can confirm them
# before anything is pushed to the nodes.
echo "📋 当前监控配置:"
grep -E "influxdb_|disk_usage_|collection_interval" "$config_file"

echo ""
# -r: do not let backslashes in the answer be interpreted (SC2162).
read -r -p "🤔 确认配置正确吗?(y/N): " confirm
if [[ $confirm != [yY] ]]; then
    echo "❌ 部署取消,请修改配置后重试"
    exit 1
fi

# Roll Telegraf out to every node. Test the playbook's exit status directly
# in the `if` instead of inspecting $? afterwards — also keeps `set -e`
# from aborting before we can print a useful failure message.
echo "📦 开始安装 Telegraf 到所有节点..."
if ansible-playbook -i "$inventory" "$playbook"; then
    echo "✅ 硬盘监控部署完成!"
    echo ""
    echo "📊 监控信息:"
    echo "- 数据将发送到你现有的 InfluxDB"
    echo "- 可以在 Grafana 中创建仪表板查看数据"
    echo "- 已禁用本地日志文件以节省硬盘空间"
    echo "- 监控数据每30秒收集一次"
    echo ""
    echo "🔧 下一步:"
    echo "1. 在 Grafana 中创建 Nomad 集群监控仪表板"
    echo "2. 设置硬盘使用率告警规则"
    echo "3. 可以运行以下命令检查监控状态:"
    echo "   ansible all -i inventories/production/nomad-cluster.ini -m shell -a 'systemctl status telegraf'"
else
    echo "❌ 部署失败,请检查错误信息"
    exit 1
fi
40
configuration/deploy-telegraf-remote.sh
Executable file
40
configuration/deploy-telegraf-remote.sh
Executable file
@@ -0,0 +1,40 @@
|
||||
#!/bin/bash
# Quickly deploy Telegraf monitoring using a remote InfluxDB 2.x
# telegraf configuration (fetched by each node from the InfluxDB API).
#
# Usage: ./deploy-telegraf-remote.sh   (run from the configuration/ directory)
# Env:   INFLUX_TOKEN may be exported to override the built-in token.
set -euo pipefail

# SECURITY NOTE(review): this API token was previously hardcoded and has been
# committed to version control — it should be rotated. The environment
# variable takes precedence; the literal remains only for backward
# compatibility with existing invocations.
INFLUX_TOKEN="${INFLUX_TOKEN:-VU_dOCVZzqEHb9jSFsDe0bJlEBaVbiG4LqfoczlnmcbfrbmklSt904HJPL4idYGvVi0c2eHkYDi2zCTni7Ay4w==}"
readonly influx_host="http://influxdb1.tailnet-68f9.ts.net:8086"
TELEGRAF_CONFIG_URL="${influx_host}/api/v2/telegrafs/0f8a73496790c000"

echo "🚀 使用 InfluxDB 2.x 远程配置部署 Telegraf 监控..."

# Fail fast when InfluxDB is unreachable — the nodes would not be able to
# fetch their remote config either.
echo "🔍 检查 InfluxDB 连接..."
if curl -s --max-time 5 "${influx_host}/health" > /dev/null; then
    echo "✅ InfluxDB 连接正常"
else
    echo "❌ 无法连接到 InfluxDB,请检查网络"
    exit 1
fi

# Deploy using the remote config; test the playbook's exit status directly
# instead of the fragile `$?` check, which also plays well with `set -e`.
echo "📦 开始部署到所有节点..."
if ansible-playbook -i inventories/production/nomad-cluster.ini playbooks/setup-disk-monitoring.yml \
    -e "use_remote_config=true" \
    -e "telegraf_config_url=$TELEGRAF_CONFIG_URL" \
    -e "influxdb_token=$INFLUX_TOKEN"; then
    echo "✅ Telegraf 监控部署完成!"
    echo ""
    echo "📊 配置信息:"
    echo "- 使用远程配置: $TELEGRAF_CONFIG_URL"
    echo "- InfluxDB 服务器: influxdb1.tailnet-68f9.ts.net:8086"
    echo "- 已禁用本地日志文件"
    echo ""
    echo "🔧 验证部署:"
    echo "ansible all -i inventories/production/nomad-cluster.ini -m shell -a 'systemctl status telegraf --no-pager'"
else
    echo "❌ 部署失败,请检查错误信息"
    exit 1
fi
@@ -1,14 +0,0 @@
|
||||
{
|
||||
"proxies": {
|
||||
"http-proxy": "http://istoreos.tailnet-68f9.ts.net:7891",
|
||||
"https-proxy": "http://istoreos.tailnet-68f9.ts.net:7891",
|
||||
"no-proxy": "localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net"
|
||||
},
|
||||
"registry-mirrors": [],
|
||||
"insecure-registries": [],
|
||||
"debug": false,
|
||||
"experimental": false,
|
||||
"features": {
|
||||
"buildkit": true
|
||||
}
|
||||
}
|
||||
20
configuration/inventories/production/group_vars/all.yml
Normal file
20
configuration/inventories/production/group_vars/all.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Nomad 集群全局配置
|
||||
# InfluxDB 2.x + Grafana 监控配置
|
||||
|
||||
# InfluxDB 2.x 连接配置
|
||||
influxdb_url: "http://influxdb1.tailnet-68f9.ts.net:8086"
|
||||
influxdb_token: "VU_dOCVZzqEHb9jSFsDe0bJlEBaVbiG4LqfoczlnmcbfrbmklSt904HJPL4idYGvVi0c2eHkYDi2zCTni7Ay4w=="
|
||||
influxdb_org: "nomad" # 组织名称
|
||||
influxdb_bucket: "nomad_monitoring" # Bucket 名称
|
||||
|
||||
# 远程 Telegraf 配置 URL
|
||||
telegraf_config_url: "http://influxdb1.tailnet-68f9.ts.net:8086/api/v2/telegrafs/0f8a73496790c000"
|
||||
|
||||
# 监控配置
|
||||
disk_usage_warning: 80 # 硬盘使用率警告阈值
|
||||
disk_usage_critical: 90 # 硬盘使用率严重告警阈值
|
||||
collection_interval: 30 # 数据收集间隔(秒)
|
||||
|
||||
# Telegraf 优化配置
|
||||
telegraf_log_level: "ERROR" # 只记录错误日志
|
||||
telegraf_disable_local_logs: true # 禁用本地日志文件
|
||||
@@ -1,10 +1,20 @@
|
||||
[nomad_servers]
|
||||
master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
|
||||
semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3
|
||||
ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
|
||||
# FIXME: duplicate host entry — 'semaphore' is already defined above with nomad_bootstrap_expect=3; Ansible keeps only the last definition, so these conflicting values (3 vs 6) must be reconciled.
semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=6
|
||||
ash2e ansible_host=ash2e ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6
|
||||
ash1d ansible_host=ash1d ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6
|
||||
ch2 ansible_host=ch2 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6
|
||||
ch3 ansible_host=ch3 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6
|
||||
# 新增的 Mac 和 Windows 节点(请替换为实际的 Tailscale IP)
|
||||
mac-laptop ansible_host=100.xxx.xxx.xxx ansible_user=your_mac_user nomad_role=server nomad_bootstrap_expect=6
|
||||
win-laptop ansible_host=100.xxx.xxx.xxx ansible_user=your_win_user nomad_role=server nomad_bootstrap_expect=6
|
||||
|
||||
[nomad_clients]
|
||||
# 如果需要客户端节点,可以在这里添加
|
||||
master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
|
||||
ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
|
||||
hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
|
||||
hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
|
||||
hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
|
||||
syd ansible_host=100.117.137.105 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
|
||||
|
||||
[nomad_cluster:children]
|
||||
nomad_servers
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
[nomad_servers]
|
||||
master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
|
||||
semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3
|
||||
ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
|
||||
|
||||
[nomad_clients]
|
||||
hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
|
||||
hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
|
||||
hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
|
||||
|
||||
[nomad_cluster:children]
|
||||
nomad_servers
|
||||
nomad_clients
|
||||
|
||||
[nomad_cluster:vars]
|
||||
ansible_ssh_private_key_file=~/.ssh/id_ed25519
|
||||
ansible_user=ben
|
||||
ansible_become=yes
|
||||
nomad_version=1.10.5
|
||||
nomad_datacenter=dc1
|
||||
nomad_region=global
|
||||
nomad_encrypt_key=NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=
|
||||
@@ -0,0 +1,23 @@
|
||||
[nomad_servers]
|
||||
master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
|
||||
semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3
|
||||
ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
|
||||
|
||||
[nomad_clients]
|
||||
hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
|
||||
hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
|
||||
hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
|
||||
syd ansible_host=100.117.137.105 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
|
||||
|
||||
[nomad_cluster:children]
|
||||
nomad_servers
|
||||
nomad_clients
|
||||
|
||||
[nomad_cluster:vars]
|
||||
ansible_ssh_private_key_file=~/.ssh/id_ed25519
|
||||
ansible_user=ben
|
||||
ansible_become=yes
|
||||
nomad_version=1.10.5
|
||||
nomad_datacenter=dc1
|
||||
nomad_region=global
|
||||
nomad_encrypt_key=NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=
|
||||
@@ -1,183 +0,0 @@
|
||||
---
|
||||
- name: Setup Automated Maintenance Cron Jobs
|
||||
hosts: localhost
|
||||
gather_facts: no
|
||||
|
||||
vars:
|
||||
# 定时任务配置
|
||||
cron_jobs:
|
||||
# 每日快速检查
|
||||
- name: "Daily system health check"
|
||||
job: "cd /root/mgmt && ./scripts/ops-manager.sh toolkit all --check > /var/log/daily-health-check.log 2>&1"
|
||||
minute: "0"
|
||||
hour: "8"
|
||||
day: "*"
|
||||
month: "*"
|
||||
weekday: "*"
|
||||
|
||||
# 每周系统清理
|
||||
- name: "Weekly system cleanup"
|
||||
job: "cd /root/mgmt && ./scripts/ops-manager.sh cleanup all > /var/log/weekly-cleanup.log 2>&1"
|
||||
minute: "0"
|
||||
hour: "2"
|
||||
day: "*"
|
||||
month: "*"
|
||||
weekday: "0" # Sunday
|
||||
|
||||
# 每月安全检查
|
||||
- name: "Monthly security hardening check"
|
||||
job: "cd /root/mgmt && ./scripts/ops-manager.sh security all --check > /var/log/monthly-security-check.log 2>&1"
|
||||
minute: "0"
|
||||
hour: "3"
|
||||
day: "1"
|
||||
month: "*"
|
||||
weekday: "*"
|
||||
|
||||
# 每周证书检查
|
||||
- name: "Weekly certificate check"
|
||||
job: "cd /root/mgmt && ./scripts/ops-manager.sh cert all > /var/log/weekly-cert-check.log 2>&1"
|
||||
minute: "30"
|
||||
hour: "4"
|
||||
day: "*"
|
||||
month: "*"
|
||||
weekday: "1" # Monday
|
||||
|
||||
# 每日 Docker 清理 (仅 LXC 组)
|
||||
- name: "Daily Docker cleanup for LXC"
|
||||
job: "cd /root/mgmt && ansible lxc -i ansible/inventory.ini -m shell -a 'docker system prune -f' --become -e 'ansible_ssh_pass=313131' > /var/log/daily-docker-cleanup.log 2>&1"
|
||||
minute: "0"
|
||||
hour: "1"
|
||||
day: "*"
|
||||
month: "*"
|
||||
weekday: "*"
|
||||
|
||||
# 每周网络连通性检查
|
||||
- name: "Weekly network connectivity check"
|
||||
job: "cd /root/mgmt && ./scripts/ops-manager.sh network all > /var/log/weekly-network-check.log 2>&1"
|
||||
minute: "0"
|
||||
hour: "6"
|
||||
day: "*"
|
||||
month: "*"
|
||||
weekday: "2" # Tuesday
|
||||
|
||||
tasks:
|
||||
# 创建日志目录
|
||||
- name: Create log directory
|
||||
file:
|
||||
path: /var/log/ansible-automation
|
||||
state: directory
|
||||
mode: '0755'
|
||||
become: yes
|
||||
|
||||
# 设置脚本执行权限
|
||||
- name: Make ops-manager.sh executable
|
||||
file:
|
||||
path: /root/mgmt/scripts/ops-manager.sh
|
||||
mode: '0755'
|
||||
|
||||
# 创建定时任务
|
||||
- name: Setup cron jobs for automated maintenance
|
||||
cron:
|
||||
name: "{{ item.name }}"
|
||||
job: "{{ item.job }}"
|
||||
minute: "{{ item.minute }}"
|
||||
hour: "{{ item.hour }}"
|
||||
day: "{{ item.day }}"
|
||||
month: "{{ item.month }}"
|
||||
weekday: "{{ item.weekday }}"
|
||||
user: root
|
||||
loop: "{{ cron_jobs }}"
|
||||
become: yes
|
||||
|
||||
# 创建日志轮转配置
|
||||
- name: Setup log rotation for automation logs
|
||||
copy:
|
||||
content: |
|
||||
/var/log/*-health-check.log
|
||||
/var/log/*-cleanup.log
|
||||
/var/log/*-security-check.log
|
||||
/var/log/*-cert-check.log
|
||||
/var/log/*-docker-cleanup.log
|
||||
/var/log/*-network-check.log {
|
||||
daily
|
||||
missingok
|
||||
rotate 30
|
||||
compress
|
||||
delaycompress
|
||||
notifempty
|
||||
copytruncate
|
||||
}
|
||||
dest: /etc/logrotate.d/ansible-automation
|
||||
mode: '0644'
|
||||
become: yes
|
||||
|
||||
# 创建监控脚本
|
||||
- name: Create monitoring dashboard script
|
||||
copy:
|
||||
content: |
|
||||
#!/bin/bash
|
||||
# Automation Monitoring Dashboard
|
||||
|
||||
echo "🤖 Ansible Automation Status Dashboard"
|
||||
echo "======================================"
|
||||
echo ""
|
||||
|
||||
echo "📅 Last Execution Times:"
|
||||
echo "------------------------"
|
||||
for log in /var/log/*-check.log /var/log/*-cleanup.log; do
|
||||
if [ -f "$log" ]; then
|
||||
echo "$(basename "$log" .log): $(stat -c %y "$log" | cut -d. -f1)"
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
|
||||
echo "📊 Recent Log Summary:"
|
||||
echo "---------------------"
|
||||
for log in /var/log/daily-health-check.log /var/log/weekly-cleanup.log; do
|
||||
if [ -f "$log" ]; then
|
||||
echo "=== $(basename "$log") ==="
|
||||
tail -5 "$log" | grep -E "(TASK|PLAY RECAP|ERROR|WARNING)" || echo "No recent activity"
|
||||
echo ""
|
||||
fi
|
||||
done
|
||||
|
||||
echo "⏰ Next Scheduled Jobs:"
|
||||
echo "----------------------"
|
||||
crontab -l | grep -E "(health|cleanup|security|cert|docker|network)" | while read line; do
|
||||
echo "$line"
|
||||
done
|
||||
echo ""
|
||||
|
||||
echo "💾 Log File Sizes:"
|
||||
echo "-----------------"
|
||||
ls -lh /var/log/*-*.log 2>/dev/null | awk '{print $5, $9}' || echo "No log files found"
|
||||
dest: /usr/local/bin/automation-status
|
||||
mode: '0755'
|
||||
become: yes
|
||||
|
||||
# 显示设置完成信息
|
||||
- name: Display setup completion info
|
||||
debug:
|
||||
msg: |
|
||||
🎉 自动化定时任务设置完成!
|
||||
|
||||
📋 已配置的定时任务:
|
||||
• 每日 08:00 - 系统健康检查
|
||||
• 每日 01:00 - Docker 清理 (LXC 组)
|
||||
• 每周日 02:00 - 系统清理
|
||||
• 每周一 04:30 - 证书检查
|
||||
• 每周二 06:00 - 网络连通性检查
|
||||
• 每月1日 03:00 - 安全检查
|
||||
|
||||
📊 监控命令:
|
||||
• 查看状态: automation-status
|
||||
• 查看定时任务: crontab -l
|
||||
• 查看日志: tail -f /var/log/daily-health-check.log
|
||||
|
||||
📁 日志位置: /var/log/
|
||||
🔄 日志轮转: 30天自动清理
|
||||
|
||||
💡 手动执行示例:
|
||||
• ./scripts/ops-manager.sh toolkit all
|
||||
• ./scripts/ops-manager.sh cleanup lxc
|
||||
• ./scripts/ops-manager.sh health proxmox
|
||||
@@ -1,175 +0,0 @@
|
||||
---
|
||||
- name: Bootstrap Infrastructure
|
||||
hosts: all
|
||||
become: yes
|
||||
gather_facts: yes
|
||||
|
||||
vars:
|
||||
# 基础软件包
|
||||
base_packages:
|
||||
- curl
|
||||
- wget
|
||||
- git
|
||||
- vim
|
||||
- htop
|
||||
- tree
|
||||
- unzip
|
||||
- jq
|
||||
- python3
|
||||
- python3-pip
|
||||
- apt-transport-https
|
||||
- ca-certificates
|
||||
- gnupg
|
||||
- lsb-release
|
||||
|
||||
# Docker 配置
|
||||
docker_users:
|
||||
- "{{ ansible_user }}"
|
||||
|
||||
# 系统配置
|
||||
timezone: "Asia/Shanghai"
|
||||
|
||||
tasks:
|
||||
- name: Update package cache
|
||||
apt:
|
||||
update_cache: yes
|
||||
cache_valid_time: 3600
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
- name: Install base packages
|
||||
package:
|
||||
name: "{{ base_packages }}"
|
||||
state: present
|
||||
|
||||
- name: Set timezone
|
||||
timezone:
|
||||
name: "{{ timezone }}"
|
||||
|
||||
- name: Create system users
|
||||
user:
|
||||
name: "{{ ansible_user }}"
|
||||
groups: sudo
|
||||
shell: /bin/bash
|
||||
create_home: yes
|
||||
when: ansible_user != "root"
|
||||
|
||||
- name: Configure SSH
|
||||
lineinfile:
|
||||
path: /etc/ssh/sshd_config
|
||||
regexp: "{{ item.regexp }}"
|
||||
line: "{{ item.line }}"
|
||||
backup: yes
|
||||
loop:
|
||||
- { regexp: '^#?PermitRootLogin', line: 'PermitRootLogin no' }
|
||||
- { regexp: '^#?PasswordAuthentication', line: 'PasswordAuthentication no' }
|
||||
- { regexp: '^#?PubkeyAuthentication', line: 'PubkeyAuthentication yes' }
|
||||
notify: restart ssh
|
||||
when: ansible_user != "root"
|
||||
|
||||
- name: Install Docker
|
||||
block:
|
||||
- name: Add Docker GPG key
|
||||
apt_key:
|
||||
url: https://download.docker.com/linux/ubuntu/gpg
|
||||
state: present
|
||||
|
||||
- name: Add Docker repository
|
||||
apt_repository:
|
||||
repo: "deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
|
||||
state: present
|
||||
|
||||
- name: Install Docker
|
||||
package:
|
||||
name:
|
||||
- docker-ce
|
||||
- docker-ce-cli
|
||||
- containerd.io
|
||||
- docker-compose-plugin
|
||||
state: present
|
||||
|
||||
- name: Add users to docker group
|
||||
user:
|
||||
name: "{{ item }}"
|
||||
groups: docker
|
||||
append: yes
|
||||
loop: "{{ docker_users }}"
|
||||
|
||||
- name: Start and enable Docker
|
||||
systemd:
|
||||
name: docker
|
||||
state: started
|
||||
enabled: yes
|
||||
|
||||
- name: Install Docker Compose (standalone)
|
||||
get_url:
|
||||
url: "https://github.com/docker/compose/releases/latest/download/docker-compose-linux-x86_64"
|
||||
dest: /usr/local/bin/docker-compose
|
||||
mode: '0755'
|
||||
|
||||
- name: Configure firewall
|
||||
ufw:
|
||||
rule: "{{ item.rule }}"
|
||||
port: "{{ item.port }}"
|
||||
proto: "{{ item.proto | default('tcp') }}"
|
||||
loop:
|
||||
- { rule: 'allow', port: '22' }
|
||||
- { rule: 'allow', port: '80' }
|
||||
- { rule: 'allow', port: '443' }
|
||||
notify: enable ufw
|
||||
|
||||
- name: Create application directories
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: directory
|
||||
owner: "{{ ansible_user }}"
|
||||
group: "{{ ansible_user }}"
|
||||
mode: '0755'
|
||||
loop:
|
||||
- /opt/apps
|
||||
- /opt/data
|
||||
- /opt/logs
|
||||
- /opt/backups
|
||||
- /opt/scripts
|
||||
|
||||
- name: Install monitoring tools
|
||||
package:
|
||||
name:
|
||||
- htop
|
||||
- iotop
|
||||
- nethogs
|
||||
- ncdu
|
||||
- tmux
|
||||
state: present
|
||||
|
||||
- name: Configure system limits
|
||||
pam_limits:
|
||||
domain: '*'
|
||||
limit_type: "{{ item.type }}"
|
||||
limit_item: "{{ item.item }}"
|
||||
value: "{{ item.value }}"
|
||||
loop:
|
||||
- { type: 'soft', item: 'nofile', value: '65536' }
|
||||
- { type: 'hard', item: 'nofile', value: '65536' }
|
||||
- { type: 'soft', item: 'nproc', value: '32768' }
|
||||
- { type: 'hard', item: 'nproc', value: '32768' }
|
||||
|
||||
- name: Configure sysctl
|
||||
sysctl:
|
||||
name: "{{ item.name }}"
|
||||
value: "{{ item.value }}"
|
||||
state: present
|
||||
reload: yes
|
||||
loop:
|
||||
- { name: 'vm.max_map_count', value: '262144' }
|
||||
- { name: 'fs.file-max', value: '2097152' }
|
||||
- { name: 'net.core.somaxconn', value: '32768' }
|
||||
|
||||
handlers:
|
||||
- name: restart ssh
|
||||
systemd:
|
||||
name: ssh
|
||||
state: restarted
|
||||
|
||||
- name: enable ufw
|
||||
ufw:
|
||||
state: enabled
|
||||
@@ -1,83 +0,0 @@
|
||||
---
|
||||
- name: System Cleanup and Maintenance
|
||||
hosts: all
|
||||
become: yes
|
||||
gather_facts: yes
|
||||
|
||||
tasks:
|
||||
# 清理包缓存和孤立包
|
||||
- name: Clean package cache (Debian/Ubuntu)
|
||||
apt:
|
||||
autoclean: yes
|
||||
autoremove: yes
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
- name: Remove orphaned packages (Debian/Ubuntu)
|
||||
shell: apt-get autoremove --purge -y
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
# 清理日志文件
|
||||
- name: Clean old journal logs (keep 7 days)
|
||||
shell: journalctl --vacuum-time=7d
|
||||
|
||||
- name: Clean old log files
|
||||
find:
|
||||
paths: /var/log
|
||||
patterns: "*.log.*,*.gz"
|
||||
age: "7d"
|
||||
recurse: yes
|
||||
register: old_logs
|
||||
|
||||
- name: Remove old log files
|
||||
file:
|
||||
path: "{{ item.path }}"
|
||||
state: absent
|
||||
loop: "{{ old_logs.files }}"
|
||||
when: old_logs.files is defined
|
||||
|
||||
# 清理临时文件
|
||||
- name: Clean /tmp directory (files older than 7 days)
|
||||
find:
|
||||
paths: /tmp
|
||||
age: "7d"
|
||||
recurse: yes
|
||||
register: tmp_files
|
||||
|
||||
- name: Remove old temp files
|
||||
file:
|
||||
path: "{{ item.path }}"
|
||||
state: absent
|
||||
loop: "{{ tmp_files.files }}"
|
||||
when: tmp_files.files is defined
|
||||
|
||||
# Docker 清理 (如果存在)
|
||||
- name: Check if Docker is installed
|
||||
command: which docker
|
||||
register: docker_check
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Clean Docker system
|
||||
shell: |
|
||||
docker system prune -f
|
||||
docker image prune -f
|
||||
docker volume prune -f
|
||||
when: docker_check.rc == 0
|
||||
|
||||
# 磁盘空间检查
|
||||
- name: Check disk usage
|
||||
shell: df -h
|
||||
register: disk_usage
|
||||
|
||||
- name: Display disk usage
|
||||
debug:
|
||||
msg: "{{ disk_usage.stdout_lines }}"
|
||||
|
||||
# 内存使用检查
|
||||
- name: Check memory usage
|
||||
shell: free -h
|
||||
register: memory_usage
|
||||
|
||||
- name: Display memory usage
|
||||
debug:
|
||||
msg: "{{ memory_usage.stdout_lines }}"
|
||||
@@ -1,43 +0,0 @@
|
||||
---
|
||||
- name: System Update Playbook
|
||||
hosts: all
|
||||
become: yes
|
||||
gather_facts: yes
|
||||
|
||||
tasks:
|
||||
- name: Wait for automatic system updates to complete
|
||||
shell: while fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do sleep 5; done
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
- name: Update apt cache
|
||||
apt:
|
||||
update_cache: yes
|
||||
cache_valid_time: 3600
|
||||
when: ansible_os_family == "Debian"
|
||||
retries: 3
|
||||
delay: 10
|
||||
|
||||
- name: Upgrade all packages
|
||||
apt:
|
||||
upgrade: yes
|
||||
autoremove: yes
|
||||
autoclean: yes
|
||||
when: ansible_os_family == "Debian"
|
||||
register: upgrade_result
|
||||
retries: 3
|
||||
delay: 10
|
||||
|
||||
- name: Display upgrade results
|
||||
debug:
|
||||
msg: "System upgrade completed. {{ upgrade_result.changed }} packages were updated."
|
||||
|
||||
- name: Check if reboot is required
|
||||
stat:
|
||||
path: /var/run/reboot-required
|
||||
register: reboot_required
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
- name: Notify if reboot is required
|
||||
debug:
|
||||
msg: "System reboot is required to complete the update."
|
||||
when: reboot_required.stat.exists is defined and reboot_required.stat.exists
|
||||
81
configuration/playbooks/clear-aliases.yml
Normal file
81
configuration/playbooks/clear-aliases.yml
Normal file
@@ -0,0 +1,81 @@
|
||||
---
|
||||
- name: Clear all aliases on hcp1 and hcp2
|
||||
hosts: hcp1,hcp2
|
||||
become: yes
|
||||
|
||||
tasks:
|
||||
- name: Check current aliases
|
||||
shell: alias || echo "No aliases found"
|
||||
register: current_aliases
|
||||
|
||||
- name: Display current aliases
|
||||
debug:
|
||||
msg: "Current aliases: {{ current_aliases.stdout_lines }}"
|
||||
|
||||
- name: Clear aliases from /root/.bashrc
|
||||
shell: |
|
||||
sed -i '/^alias /d' /root/.bashrc
|
||||
sed -i '/^alias\t/d' /root/.bashrc
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Clear aliases from /root/.profile
|
||||
shell: |
|
||||
sed -i '/^alias /d' /root/.profile
|
||||
sed -i '/^alias\t/d' /root/.profile
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Clear aliases from /root/.zshrc
|
||||
shell: |
|
||||
sed -i '/^alias /d' /root/.zshrc
|
||||
sed -i '/^alias\t/d' /root/.zshrc
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Clear aliases from /etc/bash.bashrc
|
||||
shell: |
|
||||
sed -i '/^alias /d' /etc/bash.bashrc
|
||||
sed -i '/^alias\t/d' /etc/bash.bashrc
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Clear aliases from /etc/profile
|
||||
shell: |
|
||||
sed -i '/^alias /d' /etc/profile
|
||||
sed -i '/^alias\t/d' /etc/profile
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Find and clear custom alias files
|
||||
find:
|
||||
paths: ["/root", "/etc", "/home"]
|
||||
patterns: ["*.aliases", ".aliases", "aliases"]
|
||||
recurse: yes
|
||||
register: alias_files
|
||||
|
||||
- name: Remove found alias files
|
||||
file:
|
||||
path: "{{ item.path }}"
|
||||
state: absent
|
||||
loop: "{{ alias_files.files }}"
|
||||
when: alias_files.files is defined
|
||||
|
||||
- name: Clear shell history to remove alias commands
|
||||
shell: |
|
||||
> /root/.bash_history
|
||||
> /root/.zsh_history
|
||||
history -c
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Unalias all current aliases
|
||||
shell: unalias -a
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Restart shell services
|
||||
shell: |
|
||||
pkill -f bash || true
|
||||
pkill -f zsh || true
|
||||
|
||||
- name: Test network connectivity after clearing aliases
|
||||
shell: ping -c 2 8.8.8.8 || echo "Ping failed"
|
||||
register: ping_test
|
||||
|
||||
- name: Display ping test result
|
||||
debug:
|
||||
msg: "Ping test: {{ ping_test.stdout_lines }}"
|
||||
76
configuration/playbooks/clear-proxy.yml
Normal file
76
configuration/playbooks/clear-proxy.yml
Normal file
@@ -0,0 +1,76 @@
|
||||
---
|
||||
- name: Clear proxy settings on hcp1 and hcp2
|
||||
hosts: hcp1,hcp2
|
||||
become: yes
|
||||
|
||||
tasks:
|
||||
- name: Check current proxy environment variables
|
||||
shell: env | grep -i proxy || echo "No proxy vars found"
|
||||
register: proxy_env_before
|
||||
|
||||
- name: Display current proxy settings
|
||||
debug:
|
||||
msg: "Current proxy env: {{ proxy_env_before.stdout_lines }}"
|
||||
|
||||
- name: Clear proxy from /etc/environment
|
||||
lineinfile:
|
||||
path: /etc/environment
|
||||
regexp: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- "^http_proxy="
|
||||
- "^https_proxy="
|
||||
- "^HTTP_PROXY="
|
||||
- "^HTTPS_PROXY="
|
||||
- "^ftp_proxy="
|
||||
- "^FTP_PROXY="
|
||||
- "^no_proxy="
|
||||
- "^NO_PROXY="
|
||||
|
||||
- name: Clear proxy from /etc/apt/apt.conf.d/
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- /etc/apt/apt.conf.d/95proxies
|
||||
- /etc/apt/apt.conf.d/proxy.conf
|
||||
- /etc/apt/apt.conf.d/00proxy
|
||||
|
||||
- name: Clear proxy from user profiles
|
||||
lineinfile:
|
||||
path: "{{ item }}"
|
||||
regexp: ".*proxy.*"
|
||||
state: absent
|
||||
loop:
|
||||
- /root/.bashrc
|
||||
- /root/.profile
|
||||
- /home/root/.bashrc
|
||||
- /home/root/.profile
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Unset proxy variables in current session
|
||||
shell: |
|
||||
unset http_proxy
|
||||
unset https_proxy
|
||||
unset HTTP_PROXY
|
||||
unset HTTPS_PROXY
|
||||
unset ftp_proxy
|
||||
unset FTP_PROXY
|
||||
unset no_proxy
|
||||
unset NO_PROXY
|
||||
|
||||
- name: Check APT proxy configuration
|
||||
shell: apt-config dump | grep -i proxy || echo "No APT proxy found"
|
||||
register: apt_proxy_check
|
||||
|
||||
- name: Display APT proxy status
|
||||
debug:
|
||||
msg: "APT proxy config: {{ apt_proxy_check.stdout_lines }}"
|
||||
|
||||
- name: Test direct connection to HashiCorp
|
||||
shell: curl -I --connect-timeout 10 https://releases.hashicorp.com/ || echo "Connection failed"
|
||||
register: connection_test
|
||||
|
||||
- name: Display connection test result
|
||||
debug:
|
||||
msg: "Connection test: {{ connection_test.stdout_lines }}"
|
||||
57
configuration/playbooks/configure-nomad-podman-cluster.yml
Normal file
57
configuration/playbooks/configure-nomad-podman-cluster.yml
Normal file
@@ -0,0 +1,57 @@
|
||||
---
|
||||
- name: Configure Podman driver for all Nomad client nodes
|
||||
hosts: nomad_clients,nomad_servers
|
||||
become: yes
|
||||
|
||||
tasks:
|
||||
- name: Stop Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: stopped
|
||||
|
||||
- name: Install Podman if not present
|
||||
package:
|
||||
name: podman
|
||||
state: present
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Enable Podman socket
|
||||
systemd:
|
||||
name: podman.socket
|
||||
enabled: yes
|
||||
state: started
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Update Nomad configuration to use Podman
|
||||
lineinfile:
|
||||
path: /etc/nomad.d/nomad.hcl
|
||||
regexp: '^plugin "docker"'
|
||||
line: 'plugin "podman" {'
|
||||
state: present
|
||||
|
||||
- name: Add Podman plugin configuration
|
||||
blockinfile:
|
||||
path: /etc/nomad.d/nomad.hcl
|
||||
marker: "# {mark} PODMAN PLUGIN CONFIG"
|
||||
block: |
|
||||
plugin "podman" {
|
||||
config {
|
||||
socket_path = "unix:///run/podman/podman.sock"
|
||||
volumes {
|
||||
enabled = true
|
||||
}
|
||||
}
|
||||
}
|
||||
insertafter: 'client {'
|
||||
|
||||
- name: Start Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: started
|
||||
|
||||
- name: Wait for Nomad to be ready
|
||||
wait_for:
|
||||
port: 4646
|
||||
host: localhost
|
||||
delay: 5
|
||||
timeout: 30
|
||||
217
configuration/playbooks/configure-nomad-tailscale.yml
Normal file
217
configuration/playbooks/configure-nomad-tailscale.yml
Normal file
@@ -0,0 +1,217 @@
|
||||
---
|
||||
- name: 配置 Nomad 集群使用 Tailscale 网络通讯
|
||||
hosts: nomad_cluster
|
||||
become: yes
|
||||
gather_facts: no
|
||||
vars:
|
||||
nomad_config_dir: "/etc/nomad.d"
|
||||
nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl"
|
||||
|
||||
tasks:
|
||||
- name: 获取当前节点的 Tailscale IP
|
||||
shell: tailscale ip | head -1
|
||||
register: current_tailscale_ip
|
||||
failed_when: current_tailscale_ip.rc != 0
|
||||
|
||||
- name: 确保 Nomad 配置目录存在
|
||||
file:
|
||||
path: "{{ nomad_config_dir }}"
|
||||
state: directory
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: 生成 Nomad 服务器配置(使用 Tailscale)
|
||||
copy:
|
||||
dest: "{{ nomad_config_file }}"
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
content: |
|
||||
datacenter = "{{ nomad_datacenter | default('dc1') }}"
|
||||
data_dir = "/opt/nomad/data"
|
||||
log_level = "INFO"
|
||||
|
||||
bind_addr = "{{ current_tailscale_ip.stdout }}"
|
||||
|
||||
addresses {
|
||||
http = "0.0.0.0"
|
||||
rpc = "{{ current_tailscale_ip.stdout }}"
|
||||
serf = "{{ current_tailscale_ip.stdout }}"
|
||||
}
|
||||
|
||||
ports {
|
||||
http = 4646
|
||||
rpc = 4647
|
||||
serf = 4648
|
||||
}
|
||||
|
||||
server {
|
||||
enabled = true
|
||||
bootstrap_expect = {{ nomad_bootstrap_expect | default(4) }}
|
||||
|
||||
retry_join = [
|
||||
"100.116.158.95", # semaphore
|
||||
"100.103.147.94", # ash2e
|
||||
"100.81.26.3", # ash1d
|
||||
"100.90.159.68" # ch2
|
||||
]
|
||||
|
||||
encrypt = "{{ nomad_encrypt_key }}"
|
||||
}
|
||||
|
||||
client {
|
||||
enabled = false
|
||||
}
|
||||
|
||||
plugin "podman" {
|
||||
config {
|
||||
socket_path = "unix:///run/podman/podman.sock"
|
||||
volumes {
|
||||
enabled = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
consul {
|
||||
address = "{{ current_tailscale_ip.stdout }}:8500"
|
||||
}
|
||||
when: nomad_role == "server"
|
||||
|
||||
- name: 生成 Nomad 客户端配置(使用 Tailscale)
|
||||
copy:
|
||||
dest: "{{ nomad_config_file }}"
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
content: |
|
||||
datacenter = "{{ nomad_datacenter | default('dc1') }}"
|
||||
data_dir = "/opt/nomad/data"
|
||||
log_level = "INFO"
|
||||
|
||||
bind_addr = "{{ current_tailscale_ip.stdout }}"
|
||||
|
||||
addresses {
|
||||
http = "0.0.0.0"
|
||||
rpc = "{{ current_tailscale_ip.stdout }}"
|
||||
serf = "{{ current_tailscale_ip.stdout }}"
|
||||
}
|
||||
|
||||
ports {
|
||||
http = 4646
|
||||
rpc = 4647
|
||||
serf = 4648
|
||||
}
|
||||
|
||||
server {
|
||||
enabled = false
|
||||
}
|
||||
|
||||
client {
|
||||
enabled = true
|
||||
|
||||
servers = [
|
||||
"100.116.158.95:4647", # semaphore
|
||||
"100.103.147.94:4647", # ash2e
|
||||
"100.81.26.3:4647", # ash1d
|
||||
"100.90.159.68:4647" # ch2
|
||||
]
|
||||
}
|
||||
|
||||
plugin "podman" {
|
||||
config {
|
||||
socket_path = "unix:///run/podman/podman.sock"
|
||||
volumes {
|
||||
enabled = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
consul {
|
||||
address = "{{ current_tailscale_ip.stdout }}:8500"
|
||||
}
|
||||
when: nomad_role == "client"
|
||||
|
||||
- name: 检查 Nomad 二进制文件位置
|
||||
shell: which nomad || find /usr -name nomad 2>/dev/null | head -1
|
||||
register: nomad_binary_path
|
||||
failed_when: nomad_binary_path.stdout == ""
|
||||
|
||||
- name: 创建/更新 Nomad systemd 服务文件
|
||||
copy:
|
||||
dest: "/etc/systemd/system/nomad.service"
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Nomad
|
||||
Documentation=https://www.nomadproject.io/
|
||||
Requires=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=notify
|
||||
User=root
|
||||
Group=root
|
||||
ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=process
|
||||
Restart=on-failure
|
||||
LimitNOFILE=65536
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
notify: restart nomad
|
||||
|
||||
- name: 确保 Nomad 数据目录存在
|
||||
file:
|
||||
path: "/opt/nomad/data"
|
||||
state: directory
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: 重新加载 systemd daemon
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
|
||||
- name: 启用并启动 Nomad 服务
|
||||
systemd:
|
||||
name: nomad
|
||||
enabled: yes
|
||||
state: started
|
||||
|
||||
- name: 等待 Nomad 服务启动
|
||||
wait_for:
|
||||
port: 4646
|
||||
host: "{{ current_tailscale_ip.stdout }}"
|
||||
delay: 5
|
||||
timeout: 30
|
||||
ignore_errors: yes
|
||||
|
||||
- name: 检查 Nomad 服务状态
|
||||
shell: systemctl status nomad --no-pager -l
|
||||
register: nomad_status
|
||||
ignore_errors: yes
|
||||
|
||||
- name: 显示配置结果
|
||||
debug:
|
||||
msg: |
|
||||
✅ 节点 {{ inventory_hostname }} 配置完成
|
||||
🌐 Tailscale IP: {{ current_tailscale_ip.stdout }}
|
||||
🎯 角色: {{ nomad_role }}
|
||||
🔧 Nomad 二进制: {{ nomad_binary_path.stdout }}
|
||||
📊 服务状态: {{ 'active' if nomad_status.rc == 0 else 'failed' }}
|
||||
{% if nomad_status.rc != 0 %}
|
||||
❌ 错误信息:
|
||||
{{ nomad_status.stdout }}
|
||||
{{ nomad_status.stderr }}
|
||||
{% endif %}
|
||||
|
||||
handlers:
|
||||
- name: restart nomad
|
||||
systemd:
|
||||
name: nomad
|
||||
state: restarted
|
||||
daemon_reload: yes
|
||||
60
configuration/playbooks/debug-nomad-podman.yml
Normal file
60
configuration/playbooks/debug-nomad-podman.yml
Normal file
@@ -0,0 +1,60 @@
|
||||
---
|
||||
- name: Debug Nomad Podman Driver Issues
|
||||
hosts: all
|
||||
become: yes
|
||||
vars:
|
||||
nomad_user: nomad
|
||||
|
||||
tasks:
|
||||
- name: Check Nomad configuration
|
||||
shell: cat /etc/nomad.d/nomad.hcl
|
||||
register: nomad_config
|
||||
|
||||
- name: Display Nomad configuration
|
||||
debug:
|
||||
var: nomad_config.stdout_lines
|
||||
|
||||
- name: Check plugin directory contents
|
||||
shell: ls -la /opt/nomad/data/plugins/
|
||||
register: plugin_dir
|
||||
|
||||
- name: Display plugin directory
|
||||
debug:
|
||||
var: plugin_dir.stdout_lines
|
||||
|
||||
- name: Check Nomad logs for plugin loading
|
||||
shell: journalctl -u nomad -n 50 --no-pager | grep -E "(plugin|driver|podman)"
|
||||
register: nomad_logs
|
||||
failed_when: false
|
||||
|
||||
- name: Display relevant Nomad logs
|
||||
debug:
|
||||
var: nomad_logs.stdout_lines
|
||||
|
||||
- name: Check if plugin is executable
|
||||
stat:
|
||||
path: /opt/nomad/data/plugins/nomad-driver-podman
|
||||
register: plugin_stat
|
||||
|
||||
- name: Display plugin file info
|
||||
debug:
|
||||
var: plugin_stat
|
||||
|
||||
- name: Test plugin directly
|
||||
shell: /opt/nomad/data/plugins/nomad-driver-podman --version
|
||||
register: plugin_version
|
||||
failed_when: false
|
||||
become_user: "{{ nomad_user }}"
|
||||
|
||||
- name: Display plugin version
|
||||
debug:
|
||||
msg: "Plugin version test: {{ 'SUCCESS' if plugin_version.rc == 0 else 'FAILED' }} - {{ plugin_version.stdout if plugin_version.rc == 0 else plugin_version.stderr }}"
|
||||
|
||||
- name: Check Podman socket accessibility
|
||||
shell: sudo -u {{ nomad_user }} curl --unix-socket /run/user/1001/podman/podman.sock http://localhost/v1.0.0/libpod/info 2>/dev/null | head -3
|
||||
register: podman_socket_test
|
||||
failed_when: false
|
||||
|
||||
- name: Display Podman socket test
|
||||
debug:
|
||||
msg: "Podman socket test: {{ 'SUCCESS' if podman_socket_test.rc == 0 else 'FAILED' }}"
|
||||
168
configuration/playbooks/disk-analysis-ncdu.yml
Normal file
168
configuration/playbooks/disk-analysis-ncdu.yml
Normal file
@@ -0,0 +1,168 @@
|
||||
---
|
||||
- name: 磁盘空间分析 - 使用 ncdu 工具
|
||||
hosts: all
|
||||
become: yes
|
||||
vars:
|
||||
ncdu_scan_paths:
|
||||
- "/"
|
||||
- "/var"
|
||||
- "/opt"
|
||||
- "/home"
|
||||
output_dir: "/tmp/disk-analysis"
|
||||
|
||||
tasks:
|
||||
- name: 安装 ncdu 工具
|
||||
package:
|
||||
name: ncdu
|
||||
state: present
|
||||
register: ncdu_install
|
||||
|
||||
- name: 创建输出目录
|
||||
file:
|
||||
path: "{{ output_dir }}"
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: 检查磁盘空间使用情况
|
||||
shell: df -h
|
||||
register: disk_usage
|
||||
|
||||
- name: 显示当前磁盘使用情况
|
||||
debug:
|
||||
msg: |
|
||||
=== {{ inventory_hostname }} 磁盘使用情况 ===
|
||||
{{ disk_usage.stdout }}
|
||||
|
||||
- name: 使用 ncdu 扫描根目录并生成报告
|
||||
shell: |
|
||||
ncdu -x -o {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json /
|
||||
async: 300
|
||||
poll: 0
|
||||
register: ncdu_root_scan
|
||||
|
||||
- name: 使用 ncdu 扫描 /var 目录
|
||||
shell: |
|
||||
ncdu -x -o {{ output_dir }}/ncdu-var-{{ inventory_hostname }}.json /var
|
||||
async: 180
|
||||
poll: 0
|
||||
register: ncdu_var_scan
|
||||
when: ansible_mounts | selectattr('mount', 'equalto', '/var') | list | length > 0 or '/var' in ansible_mounts | map(attribute='mount') | list
|
||||
|
||||
- name: 使用 ncdu 扫描 /opt 目录
|
||||
shell: |
|
||||
ncdu -x -o {{ output_dir }}/ncdu-opt-{{ inventory_hostname }}.json /opt
|
||||
async: 120
|
||||
poll: 0
|
||||
register: ncdu_opt_scan
|
||||
when: ansible_mounts | selectattr('mount', 'equalto', '/opt') | list | length > 0 or '/opt' in ansible_mounts | map(attribute='mount') | list
|
||||
|
||||
- name: 等待根目录扫描完成
|
||||
async_status:
|
||||
jid: "{{ ncdu_root_scan.ansible_job_id }}"
|
||||
register: ncdu_root_result
|
||||
until: ncdu_root_result.finished
|
||||
retries: 60
|
||||
delay: 5
|
||||
|
||||
- name: 等待 /var 目录扫描完成
|
||||
async_status:
|
||||
jid: "{{ ncdu_var_scan.ansible_job_id }}"
|
||||
register: ncdu_var_result
|
||||
until: ncdu_var_result.finished
|
||||
retries: 36
|
||||
delay: 5
|
||||
when: ncdu_var_scan is defined and ncdu_var_scan.ansible_job_id is defined
|
||||
|
||||
- name: 等待 /opt 目录扫描完成
|
||||
async_status:
|
||||
jid: "{{ ncdu_opt_scan.ansible_job_id }}"
|
||||
register: ncdu_opt_result
|
||||
until: ncdu_opt_result.finished
|
||||
retries: 24
|
||||
delay: 5
|
||||
when: ncdu_opt_scan is defined and ncdu_opt_scan.ansible_job_id is defined
|
||||
|
||||
- name: 生成磁盘使用分析报告
|
||||
shell: |
|
||||
echo "=== {{ inventory_hostname }} 磁盘分析报告 ===" > {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
echo "生成时间: $(date)" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
echo "=== 磁盘使用情况 ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
df -h >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
echo "=== 最大的目录 (前10个) ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
du -h --max-depth=2 / 2>/dev/null | sort -hr | head -10 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
echo "=== /var 目录最大文件 ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
find /var -type f -size +100M -exec ls -lh {} \; 2>/dev/null | head -10 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
echo "=== /tmp 目录使用情况 ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
du -sh /tmp/* 2>/dev/null | sort -hr | head -5 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
echo "=== 日志文件大小 ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
find /var/log -name "*.log" -type f -size +50M -exec ls -lh {} \; 2>/dev/null >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
|
||||
- name: 显示分析报告
|
||||
shell: cat {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
register: disk_report
|
||||
|
||||
- name: 输出磁盘分析结果
|
||||
debug:
|
||||
msg: "{{ disk_report.stdout }}"
|
||||
|
||||
- name: 检查是否有磁盘使用率超过 80%
|
||||
shell: df -h | awk 'NR>1 {gsub(/%/, "", $5); if($5 > 80) print $0}'
|
||||
register: high_usage_disks
|
||||
|
||||
- name: 警告高磁盘使用率
|
||||
debug:
|
||||
msg: |
|
||||
⚠️ 警告: {{ inventory_hostname }} 发现高磁盘使用率!
|
||||
{{ high_usage_disks.stdout }}
|
||||
when: high_usage_disks.stdout != ""
|
||||
|
||||
- name: 创建清理建议
|
||||
shell: |
|
||||
echo "=== {{ inventory_hostname }} 清理建议 ===" > {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
echo "1. 检查日志文件:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
find /var/log -name "*.log" -type f -size +100M -exec echo " 大日志文件: {}" \; 2>/dev/null >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
echo "2. 检查临时文件:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
find /tmp -type f -size +50M -exec echo " 大临时文件: {}" \; 2>/dev/null >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
echo "3. 检查包缓存:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
if [ -d /var/cache/apt ]; then
|
||||
echo " APT 缓存大小: $(du -sh /var/cache/apt 2>/dev/null | cut -f1)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
fi
|
||||
if [ -d /var/cache/yum ]; then
|
||||
echo " YUM 缓存大小: $(du -sh /var/cache/yum 2>/dev/null | cut -f1)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
fi
|
||||
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
echo "4. 检查容器相关:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
if command -v podman >/dev/null 2>&1; then
|
||||
echo " Podman 镜像: $(podman images --format 'table {{.Repository}} {{.Tag}} {{.Size}}' 2>/dev/null | wc -l) 个" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
echo " Podman 容器: $(podman ps -a --format 'table {{.Names}} {{.Status}}' 2>/dev/null | wc -l) 个" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
fi
|
||||
|
||||
- name: 显示清理建议
|
||||
shell: cat {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
register: cleanup_suggestions
|
||||
|
||||
- name: 输出清理建议
|
||||
debug:
|
||||
msg: "{{ cleanup_suggestions.stdout }}"
|
||||
|
||||
- name: 保存 ncdu 文件位置信息
|
||||
debug:
|
||||
msg: |
|
||||
📁 ncdu 扫描文件已保存到:
|
||||
- 根目录: {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json
|
||||
- /var 目录: {{ output_dir }}/ncdu-var-{{ inventory_hostname }}.json (如果存在)
|
||||
- /opt 目录: {{ output_dir }}/ncdu-opt-{{ inventory_hostname }}.json (如果存在)
|
||||
|
||||
💡 使用方法:
|
||||
ncdu -f {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json
|
||||
|
||||
📊 完整报告: {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
|
||||
🧹 清理建议: {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
|
||||
96
configuration/playbooks/disk-cleanup.yml
Normal file
96
configuration/playbooks/disk-cleanup.yml
Normal file
@@ -0,0 +1,96 @@
|
||||
---
|
||||
- name: 磁盘清理工具
|
||||
hosts: all
|
||||
become: yes
|
||||
vars:
|
||||
cleanup_logs: true
|
||||
cleanup_cache: true
|
||||
cleanup_temp: true
|
||||
cleanup_containers: false # 谨慎操作
|
||||
|
||||
tasks:
|
||||
- name: 检查磁盘使用情况 (清理前)
|
||||
shell: df -h
|
||||
register: disk_before
|
||||
|
||||
- name: 显示清理前磁盘使用情况
|
||||
debug:
|
||||
msg: |
|
||||
=== {{ inventory_hostname }} 清理前磁盘使用情况 ===
|
||||
{{ disk_before.stdout }}
|
||||
|
||||
- name: 清理系统日志 (保留最近7天)
|
||||
shell: |
|
||||
journalctl --vacuum-time=7d
|
||||
find /var/log -name "*.log" -type f -mtime +7 -exec truncate -s 0 {} \;
|
||||
find /var/log -name "*.log.*" -type f -mtime +7 -delete
|
||||
when: cleanup_logs | bool
|
||||
register: log_cleanup
|
||||
|
||||
- name: 清理包管理器缓存
|
||||
block:
|
||||
- name: 清理 APT 缓存 (Debian/Ubuntu)
|
||||
shell: |
|
||||
apt-get clean
|
||||
apt-get autoclean
|
||||
apt-get autoremove -y
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
- name: 清理 YUM/DNF 缓存 (RedHat/CentOS)
|
||||
shell: |
|
||||
if command -v dnf >/dev/null 2>&1; then
|
||||
dnf clean all
|
||||
elif command -v yum >/dev/null 2>&1; then
|
||||
yum clean all
|
||||
fi
|
||||
when: ansible_os_family == "RedHat"
|
||||
when: cleanup_cache | bool
|
||||
|
||||
- name: 清理临时文件
|
||||
shell: |
|
||||
find /tmp -type f -atime +7 -delete 2>/dev/null || true
|
||||
find /var/tmp -type f -atime +7 -delete 2>/dev/null || true
|
||||
rm -rf /tmp/.* 2>/dev/null || true
|
||||
when: cleanup_temp | bool
|
||||
|
||||
- name: 清理 Podman 资源 (谨慎操作)
|
||||
block:
|
||||
- name: 停止所有容器
|
||||
shell: podman stop --all
|
||||
ignore_errors: yes
|
||||
|
||||
- name: 删除未使用的容器
|
||||
shell: podman container prune -f
|
||||
ignore_errors: yes
|
||||
|
||||
- name: 删除未使用的镜像
|
||||
shell: podman image prune -f
|
||||
ignore_errors: yes
|
||||
|
||||
- name: 删除未使用的卷
|
||||
shell: podman volume prune -f
|
||||
ignore_errors: yes
|
||||
when: cleanup_containers | bool
|
||||
|
||||
- name: 清理核心转储文件
|
||||
shell: |
|
||||
find /var/crash -name "core.*" -type f -delete 2>/dev/null || true
|
||||
find / -name "core" -type f -size +10M -delete 2>/dev/null || true
|
||||
ignore_errors: yes
|
||||
|
||||
- name: 检查磁盘使用情况 (清理后)
|
||||
shell: df -h
|
||||
register: disk_after
|
||||
|
||||
- name: 显示清理结果
|
||||
debug:
|
||||
msg: |
|
||||
=== {{ inventory_hostname }} 清理完成 ===
|
||||
|
||||
清理前:
|
||||
{{ disk_before.stdout }}
|
||||
|
||||
清理后:
|
||||
{{ disk_after.stdout }}
|
||||
|
||||
🧹 清理操作完成!
|
||||
105
configuration/playbooks/final-podman-fix.yml
Normal file
105
configuration/playbooks/final-podman-fix.yml
Normal file
@@ -0,0 +1,105 @@
|
||||
---
|
||||
- name: Final Podman Permission Fix for Nomad
|
||||
hosts: all
|
||||
become: yes
|
||||
tasks:
|
||||
- name: Stop Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: stopped
|
||||
|
||||
- name: Install podman for nomad user (system-wide)
|
||||
package:
|
||||
name: podman
|
||||
state: present
|
||||
|
||||
- name: Enable podman socket for nomad user
|
||||
systemd:
|
||||
name: podman.socket
|
||||
enabled: yes
|
||||
state: started
|
||||
scope: system
|
||||
daemon_reload: yes
|
||||
|
||||
- name: Create nomad user podman configuration directory
|
||||
file:
|
||||
path: /home/nomad/.config/containers
|
||||
state: directory
|
||||
owner: nomad
|
||||
group: nomad
|
||||
mode: '0755'
|
||||
recurse: yes
|
||||
|
||||
- name: Configure podman for nomad user to use system socket
|
||||
copy:
|
||||
content: |
|
||||
[containers]
|
||||
|
||||
[engine]
|
||||
remote = true
|
||||
|
||||
[service_destinations]
|
||||
[service_destinations.system]
|
||||
uri = "unix:///run/podman/podman.sock"
|
||||
dest: /home/nomad/.config/containers/containers.conf
|
||||
owner: nomad
|
||||
group: nomad
|
||||
mode: '0644'
|
||||
|
||||
- name: Update Nomad configuration to use system podman socket
|
||||
replace:
|
||||
path: /etc/nomad.d/nomad.hcl
|
||||
regexp: 'socket_path = "unix:///run/user/1001/podman/podman.sock"'
|
||||
replace: 'socket_path = "unix:///run/podman/podman.sock"'
|
||||
|
||||
- name: Add nomad user to necessary groups
|
||||
user:
|
||||
name: nomad
|
||||
groups:
|
||||
- podman
|
||||
append: yes
|
||||
|
||||
- name: Create podman group if it doesn't exist
|
||||
group:
|
||||
name: podman
|
||||
state: present
|
||||
|
||||
- name: Set proper permissions on system podman socket directory
|
||||
file:
|
||||
path: /run/podman
|
||||
state: directory
|
||||
mode: '0755'
|
||||
group: podman
|
||||
|
||||
- name: Start Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: started
|
||||
enabled: yes
|
||||
|
||||
- name: Wait for Nomad to be ready
|
||||
wait_for:
|
||||
port: 4646
|
||||
timeout: 60
|
||||
|
||||
- name: Wait for plugins to load
|
||||
pause:
|
||||
seconds: 20
|
||||
|
||||
- name: Final verification - Check driver status
|
||||
shell: sudo -u nomad /usr/local/bin/nomad node status -self | grep -A 10 "Driver Status"
|
||||
register: final_driver_status
|
||||
failed_when: false
|
||||
|
||||
- name: Display final driver status
|
||||
debug:
|
||||
var: final_driver_status.stdout_lines
|
||||
|
||||
- name: Test podman access for nomad user
|
||||
shell: sudo -u nomad podman version
|
||||
register: podman_test
|
||||
failed_when: false
|
||||
|
||||
- name: Display podman test result
|
||||
debug:
|
||||
var: podman_test.stdout_lines
|
||||
83
configuration/playbooks/fix-hcp-podman.yml
Normal file
83
configuration/playbooks/fix-hcp-podman.yml
Normal file
@@ -0,0 +1,83 @@
|
||||
---
|
||||
- name: Fix HCP1 and HCP2 Podman Configuration
|
||||
hosts: hcp1,hcp2
|
||||
become: yes
|
||||
tasks:
|
||||
- name: Stop Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: stopped
|
||||
|
||||
- name: Ensure nomad user exists
|
||||
user:
|
||||
name: nomad
|
||||
system: yes
|
||||
shell: /bin/false
|
||||
home: /home/nomad
|
||||
create_home: yes
|
||||
|
||||
- name: Ensure Podman socket is running
|
||||
systemd:
|
||||
name: podman.socket
|
||||
state: started
|
||||
enabled: yes
|
||||
|
||||
- name: Set proper permissions on Podman socket
|
||||
file:
|
||||
path: /run/podman/podman.sock
|
||||
mode: '0666'
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Create nomad data directory
|
||||
file:
|
||||
path: /opt/nomad/data
|
||||
state: directory
|
||||
owner: nomad
|
||||
group: nomad
|
||||
mode: '0755'
|
||||
|
||||
- name: Create nomad log directory
|
||||
file:
|
||||
path: /var/log/nomad
|
||||
state: directory
|
||||
owner: nomad
|
||||
group: nomad
|
||||
mode: '0755'
|
||||
|
||||
- name: Test Podman access for nomad user
|
||||
shell: sudo -u nomad podman version
|
||||
register: podman_test
|
||||
failed_when: false
|
||||
|
||||
- name: Display Podman test result
|
||||
debug:
|
||||
var: podman_test.stdout_lines
|
||||
|
||||
- name: Validate Nomad configuration
|
||||
shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl
|
||||
register: config_validation
|
||||
failed_when: false
|
||||
|
||||
- name: Display configuration validation
|
||||
debug:
|
||||
var: config_validation
|
||||
|
||||
- name: Start Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: started
|
||||
enabled: yes
|
||||
|
||||
- name: Wait for Nomad to be ready
|
||||
wait_for:
|
||||
port: 4646
|
||||
timeout: 60
|
||||
|
||||
- name: Check Nomad node status
|
||||
shell: /usr/local/bin/nomad node status -self
|
||||
register: node_status
|
||||
failed_when: false
|
||||
|
||||
- name: Display node status
|
||||
debug:
|
||||
var: node_status.stdout_lines
|
||||
56
configuration/playbooks/fix-hcs-dpkg-issue.yml
Normal file
56
configuration/playbooks/fix-hcs-dpkg-issue.yml
Normal file
@@ -0,0 +1,56 @@
|
||||
---
|
||||
- name: Fix dpkg and initramfs issues on hcs
|
||||
hosts: hcs
|
||||
become: yes
|
||||
tasks:
|
||||
- name: Check current dpkg status
|
||||
shell: dpkg --audit
|
||||
register: dpkg_status
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Display dpkg status
|
||||
debug:
|
||||
var: dpkg_status.stdout_lines
|
||||
|
||||
- name: Fix broken btrfs hook
|
||||
shell: |
|
||||
# Remove problematic btrfs hook temporarily
|
||||
mv /usr/share/initramfs-tools/hooks/btrfs /usr/share/initramfs-tools/hooks/btrfs.bak || true
|
||||
|
||||
# Try to reconfigure the failed package
|
||||
dpkg --configure -a
|
||||
|
||||
# If that works, restore the hook
|
||||
if [ $? -eq 0 ]; then
|
||||
mv /usr/share/initramfs-tools/hooks/btrfs.bak /usr/share/initramfs-tools/hooks/btrfs || true
|
||||
fi
|
||||
register: fix_result
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Display fix result
|
||||
debug:
|
||||
var: fix_result
|
||||
|
||||
- name: Alternative fix - reinstall initramfs-tools
|
||||
apt:
|
||||
name: initramfs-tools
|
||||
state: latest
|
||||
force: yes
|
||||
when: fix_result.rc != 0
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Clean up and update
|
||||
shell: |
|
||||
apt autoremove -y
|
||||
apt update
|
||||
apt upgrade -y
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Check final dpkg status
|
||||
shell: dpkg --audit
|
||||
register: final_status
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Display final status
|
||||
debug:
|
||||
var: final_status.stdout_lines
|
||||
99
configuration/playbooks/fix-nomad-local.yml
Normal file
99
configuration/playbooks/fix-nomad-local.yml
Normal file
@@ -0,0 +1,99 @@
|
||||
---
|
||||
- name: Update Nomad configuration for Podman and fix issues
|
||||
hosts: localhost
|
||||
become: yes
|
||||
connection: local
|
||||
|
||||
tasks:
|
||||
- name: Stop Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: stopped
|
||||
|
||||
- name: Update Nomad configuration to use Podman and disable Consul
|
||||
copy:
|
||||
content: |
|
||||
datacenter = "dc1"
|
||||
region = "global"
|
||||
data_dir = "/opt/nomad/data"
|
||||
|
||||
bind_addr = "100.116.158.95"
|
||||
|
||||
server {
|
||||
enabled = true
|
||||
bootstrap_expect = 1
|
||||
encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
|
||||
}
|
||||
|
||||
client {
|
||||
enabled = true
|
||||
}
|
||||
|
||||
ui {
|
||||
enabled = true
|
||||
}
|
||||
|
||||
addresses {
|
||||
http = "0.0.0.0"
|
||||
rpc = "100.116.158.95"
|
||||
serf = "100.116.158.95"
|
||||
}
|
||||
|
||||
ports {
|
||||
http = 4646
|
||||
rpc = 4647
|
||||
serf = 4648
|
||||
}
|
||||
|
||||
plugin "podman" {
|
||||
config {
|
||||
socket_path = "unix:///run/podman/podman.sock"
|
||||
volumes {
|
||||
enabled = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Disable Consul integration for now
|
||||
consul {
|
||||
address = ""
|
||||
}
|
||||
|
||||
log_level = "INFO"
|
||||
log_file = "/var/log/nomad/nomad.log"
|
||||
dest: /etc/nomad.d/nomad.hcl
|
||||
owner: nomad
|
||||
group: nomad
|
||||
mode: '0640'
|
||||
backup: yes
|
||||
|
||||
- name: Enable Podman socket for systemd
|
||||
systemd:
|
||||
name: podman.socket
|
||||
enabled: yes
|
||||
state: started
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Start Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: started
|
||||
|
||||
- name: Wait for Nomad to be ready
|
||||
wait_for:
|
||||
port: 4646
|
||||
host: localhost
|
||||
delay: 5
|
||||
timeout: 30
|
||||
|
||||
- name: Check Nomad status
|
||||
uri:
|
||||
url: http://localhost:4646/v1/status/leader
|
||||
method: GET
|
||||
register: nomad_status
|
||||
retries: 3
|
||||
delay: 5
|
||||
|
||||
- name: Display Nomad status
|
||||
debug:
|
||||
msg: "Nomad leader: {{ nomad_status.json if nomad_status.json is defined else 'No leader elected' }}"
|
||||
72
configuration/playbooks/fix-nomad-podman-config.yml
Normal file
72
configuration/playbooks/fix-nomad-podman-config.yml
Normal file
@@ -0,0 +1,72 @@
|
||||
---
|
||||
- name: Fix Nomad Podman Driver Configuration
|
||||
hosts: all
|
||||
become: yes
|
||||
vars:
|
||||
nomad_user: nomad
|
||||
|
||||
tasks:
|
||||
- name: Stop Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: stopped
|
||||
|
||||
- name: Update Nomad configuration to properly reference Podman plugin
|
||||
replace:
|
||||
path: /etc/nomad.d/nomad.hcl
|
||||
regexp: 'plugin "podman" \{\n config \{\n socket_path = "unix:///run/user/1001/podman/podman.sock"\n volumes \{\n enabled = true\n \}\n \}\n\}'
|
||||
replace: |
|
||||
plugin "nomad-driver-podman" {
|
||||
config {
|
||||
socket_path = "unix:///run/user/1001/podman/podman.sock"
|
||||
volumes {
|
||||
enabled = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
- name: Start Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: started
|
||||
|
||||
- name: Wait for Nomad to be ready
|
||||
wait_for:
|
||||
port: 4646
|
||||
host: localhost
|
||||
delay: 10
|
||||
timeout: 60
|
||||
|
||||
- name: Wait for plugins to load
|
||||
pause:
|
||||
seconds: 15
|
||||
|
||||
- name: Check if Podman driver is now loaded
|
||||
shell: |
|
||||
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self | grep -A 20 "Driver Status"
|
||||
register: driver_status
|
||||
|
||||
- name: Display driver status
|
||||
debug:
|
||||
var: driver_status.stdout_lines
|
||||
|
||||
- name: Check Nomad logs for successful plugin loading
|
||||
shell: journalctl -u nomad -n 20 --no-pager | grep -E "(podman|plugin)"
|
||||
register: recent_logs
|
||||
failed_when: false
|
||||
|
||||
- name: Display recent plugin logs
|
||||
debug:
|
||||
var: recent_logs.stdout_lines
|
||||
|
||||
- name: Final verification - Test Podman functionality
|
||||
shell: |
|
||||
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers | keys[]' | grep -i podman
|
||||
register: podman_driver_check
|
||||
failed_when: false
|
||||
|
||||
- name: Display final result
|
||||
debug:
|
||||
msg: |
|
||||
Podman driver status: {{ 'SUCCESS - Driver loaded!' if 'podman' in (podman_driver_check.stdout | default('')) else 'Still checking...' }}
|
||||
Available drivers: {{ podman_driver_check.stdout_lines | default(['none']) | join(', ') }}
|
||||
88
configuration/playbooks/fix-nomad-systemd.yml
Normal file
88
configuration/playbooks/fix-nomad-systemd.yml
Normal file
@@ -0,0 +1,88 @@
|
||||
---
|
||||
- name: Fix Nomad systemd service binary path
|
||||
hosts: nomad_cluster
|
||||
become: yes
|
||||
|
||||
tasks:
|
||||
- name: Check Nomad binary location
|
||||
shell: which nomad
|
||||
register: nomad_binary_path
|
||||
|
||||
- name: Display binary path
|
||||
debug:
|
||||
msg: "Nomad binary 位于: {{ nomad_binary_path.stdout }}"
|
||||
|
||||
- name: Stop Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: stopped
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Update Nomad systemd service with correct binary path
|
||||
copy:
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Nomad
|
||||
Documentation=https://www.nomadproject.io/
|
||||
Requires=network-online.target
|
||||
After=network-online.target
|
||||
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
|
||||
|
||||
[Service]
|
||||
Type=notify
|
||||
User=nomad
|
||||
Group=nomad
|
||||
ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=process
|
||||
Restart=on-failure
|
||||
LimitNOFILE=65536
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
dest: /etc/systemd/system/nomad.service
|
||||
mode: '0644'
|
||||
notify: reload systemd
|
||||
|
||||
- name: Reload systemd and start Nomad servers first
|
||||
systemd:
|
||||
name: nomad
|
||||
state: started
|
||||
enabled: yes
|
||||
daemon_reload: yes
|
||||
when: inventory_hostname in groups['nomad_servers']
|
||||
|
||||
- name: Wait for servers to be ready
|
||||
pause:
|
||||
seconds: 15
|
||||
when: inventory_hostname in groups['nomad_servers']
|
||||
|
||||
- name: Start Nomad clients
|
||||
systemd:
|
||||
name: nomad
|
||||
state: started
|
||||
enabled: yes
|
||||
daemon_reload: yes
|
||||
when: inventory_hostname in groups['nomad_clients']
|
||||
|
||||
- name: Wait for clients to connect
|
||||
pause:
|
||||
seconds: 10
|
||||
when: inventory_hostname in groups['nomad_clients']
|
||||
|
||||
- name: Check final service status
|
||||
shell: systemctl status nomad --no-pager -l
|
||||
register: service_status
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Display service status
|
||||
debug:
|
||||
msg: |
|
||||
✅ 节点 {{ inventory_hostname }} 服务状态:
|
||||
📊 状态: {{ 'SUCCESS' if service_status.rc == 0 else 'FAILED' }}
|
||||
💾 二进制路径: {{ nomad_binary_path.stdout }}
|
||||
|
||||
handlers:
|
||||
- name: reload systemd
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
79
configuration/playbooks/fix-podman-installation.yml
Normal file
79
configuration/playbooks/fix-podman-installation.yml
Normal file
@@ -0,0 +1,79 @@
|
||||
---
|
||||
- name: Fix Podman installation on remaining nodes
|
||||
hosts: semaphore,master,ash3c,hcs
|
||||
become: yes
|
||||
serial: 1 # 逐个处理,避免同时影响多个节点
|
||||
|
||||
tasks:
|
||||
- name: Current node status
|
||||
debug:
|
||||
msg: "🔧 修复节点: {{ inventory_hostname }}"
|
||||
|
||||
- name: Check if Podman is already installed
|
||||
shell: podman --version 2>/dev/null || echo "NOT_INSTALLED"
|
||||
register: podman_check
|
||||
|
||||
- name: Install Podman if not present (semaphore special handling)
|
||||
apt:
|
||||
name:
|
||||
- podman
|
||||
- buildah
|
||||
- skopeo
|
||||
state: present
|
||||
update_cache: yes
|
||||
force_apt_get: yes
|
||||
when: inventory_hostname == 'semaphore' and 'NOT_INSTALLED' in podman_check.stdout
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Install Podman on other nodes
|
||||
apt:
|
||||
name:
|
||||
- podman
|
||||
- buildah
|
||||
- skopeo
|
||||
state: present
|
||||
when: inventory_hostname != 'semaphore'
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Install Python dependencies for podman-compose
|
||||
apt:
|
||||
name:
|
||||
- python3-pip
|
||||
- python3-setuptools
|
||||
- python3-yaml
|
||||
- python3-dotenv
|
||||
state: present
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Install podman-compose via pip
|
||||
pip:
|
||||
name:
|
||||
- podman-compose
|
||||
state: present
|
||||
executable: pip3
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Alternative podman-compose installation via apt
|
||||
apt:
|
||||
name: podman-compose
|
||||
state: present
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Verify installations
|
||||
shell: |
|
||||
echo "Podman: $(podman --version 2>/dev/null || echo 'FAILED')"
|
||||
echo "Podman Compose: $(podman-compose --version 2>/dev/null || echo 'FAILED')"
|
||||
register: verify_result
|
||||
|
||||
- name: Display verification results
|
||||
debug:
|
||||
msg: |
|
||||
✅ 节点 {{ inventory_hostname }} 验证结果:
|
||||
{{ verify_result.stdout }}
|
||||
|
||||
- name: Enable Podman socket
|
||||
systemd:
|
||||
name: podman.socket
|
||||
enabled: yes
|
||||
state: started
|
||||
ignore_errors: yes
|
||||
133
configuration/playbooks/install-nomad-direct-download.yml
Normal file
133
configuration/playbooks/install-nomad-direct-download.yml
Normal file
@@ -0,0 +1,133 @@
|
||||
---
|
||||
- name: Install Nomad by direct download from HashiCorp
|
||||
hosts: hcs
|
||||
become: yes
|
||||
vars:
|
||||
nomad_version: "1.10.5"
|
||||
nomad_url: "https://releases.hashicorp.com/nomad/{{ nomad_version }}/nomad_{{ nomad_version }}_linux_amd64.zip"
|
||||
nomad_user: "nomad"
|
||||
nomad_group: "nomad"
|
||||
nomad_home: "/opt/nomad"
|
||||
nomad_data_dir: "/opt/nomad/data"
|
||||
nomad_config_dir: "/etc/nomad.d"
|
||||
nomad_datacenter: "dc1"
|
||||
nomad_region: "global"
|
||||
nomad_server_addresses:
|
||||
- "100.116.158.95:4647" # semaphore server address
|
||||
|
||||
tasks:
|
||||
- name: Create nomad user
|
||||
user:
|
||||
name: "{{ nomad_user }}"
|
||||
group: "{{ nomad_group }}"
|
||||
system: yes
|
||||
shell: /bin/false
|
||||
home: "{{ nomad_home }}"
|
||||
create_home: yes
|
||||
|
||||
- name: Create nomad directories
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: directory
|
||||
owner: "{{ nomad_user }}"
|
||||
group: "{{ nomad_group }}"
|
||||
mode: '0755'
|
||||
loop:
|
||||
- "{{ nomad_home }}"
|
||||
- "{{ nomad_data_dir }}"
|
||||
- "{{ nomad_config_dir }}"
|
||||
- /var/log/nomad
|
||||
|
||||
- name: Install unzip package
|
||||
apt:
|
||||
name: unzip
|
||||
state: present
|
||||
update_cache: yes
|
||||
|
||||
- name: Download Nomad binary
|
||||
get_url:
|
||||
url: "{{ nomad_url }}"
|
||||
dest: "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
|
||||
mode: '0644'
|
||||
timeout: 300
|
||||
|
||||
- name: Extract Nomad binary
|
||||
unarchive:
|
||||
src: "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
|
||||
dest: /tmp
|
||||
remote_src: yes
|
||||
|
||||
- name: Copy Nomad binary to /usr/local/bin
|
||||
copy:
|
||||
src: /tmp/nomad
|
||||
dest: /usr/local/bin/nomad
|
||||
mode: '0755'
|
||||
owner: root
|
||||
group: root
|
||||
remote_src: yes
|
||||
|
||||
- name: Create Nomad client configuration
|
||||
template:
|
||||
src: templates/nomad-client.hcl.j2
|
||||
dest: "{{ nomad_config_dir }}/nomad.hcl"
|
||||
owner: "{{ nomad_user }}"
|
||||
group: "{{ nomad_group }}"
|
||||
mode: '0640'
|
||||
|
||||
- name: Create Nomad systemd service
|
||||
copy:
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Nomad
|
||||
Documentation=https://www.nomadproject.io/
|
||||
Requires=network-online.target
|
||||
After=network-online.target
|
||||
ConditionFileNotEmpty={{ nomad_config_dir }}/nomad.hcl
|
||||
|
||||
[Service]
|
||||
Type=notify
|
||||
User={{ nomad_user }}
|
||||
Group={{ nomad_group }}
|
||||
ExecStart=/usr/local/bin/nomad agent -config={{ nomad_config_dir }}
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=process
|
||||
Restart=on-failure
|
||||
LimitNOFILE=65536
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
dest: /etc/systemd/system/nomad.service
|
||||
mode: '0644'
|
||||
|
||||
- name: Reload systemd daemon
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
|
||||
- name: Enable and start Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
enabled: yes
|
||||
state: started
|
||||
|
||||
- name: Wait for Nomad to be ready
|
||||
wait_for:
|
||||
port: 4646
|
||||
host: localhost
|
||||
delay: 5
|
||||
timeout: 60
|
||||
|
||||
- name: Verify Nomad installation
|
||||
command: /usr/local/bin/nomad version
|
||||
register: nomad_version_output
|
||||
|
||||
- name: Display Nomad version
|
||||
debug:
|
||||
msg: "{{ nomad_version_output.stdout }}"
|
||||
|
||||
- name: Clean up downloaded files
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
|
||||
- /tmp/nomad
|
||||
131
configuration/playbooks/install-nomad-podman-driver.yml
Normal file
131
configuration/playbooks/install-nomad-podman-driver.yml
Normal file
@@ -0,0 +1,131 @@
|
||||
---
|
||||
- name: Install Nomad Podman Driver Plugin
|
||||
hosts: all
|
||||
become: yes
|
||||
vars:
|
||||
nomad_user: nomad
|
||||
nomad_data_dir: /opt/nomad/data
|
||||
nomad_plugins_dir: "{{ nomad_data_dir }}/plugins"
|
||||
podman_driver_version: "0.6.1"
|
||||
podman_driver_url: "https://releases.hashicorp.com/nomad-driver-podman/{{ podman_driver_version }}/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
|
||||
|
||||
tasks:
|
||||
- name: Stop Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: stopped
|
||||
|
||||
- name: Create plugins directory
|
||||
file:
|
||||
path: "{{ nomad_plugins_dir }}"
|
||||
state: directory
|
||||
owner: "{{ nomad_user }}"
|
||||
group: "{{ nomad_user }}"
|
||||
mode: '0755'
|
||||
|
||||
- name: Download Nomad Podman driver
|
||||
get_url:
|
||||
url: "{{ podman_driver_url }}"
|
||||
dest: "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
|
||||
mode: '0644'
|
||||
|
||||
- name: Extract Nomad Podman driver
|
||||
unarchive:
|
||||
src: "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
|
||||
dest: "/tmp"
|
||||
remote_src: yes
|
||||
|
||||
- name: Install Nomad Podman driver
|
||||
copy:
|
||||
src: "/tmp/nomad-driver-podman"
|
||||
dest: "{{ nomad_plugins_dir }}/nomad-driver-podman"
|
||||
owner: "{{ nomad_user }}"
|
||||
group: "{{ nomad_user }}"
|
||||
mode: '0755'
|
||||
remote_src: yes
|
||||
|
||||
- name: Update Nomad configuration for plugin directory
|
||||
blockinfile:
|
||||
path: /etc/nomad.d/nomad.hcl
|
||||
marker: "# {mark} PLUGIN DIRECTORY CONFIGURATION"
|
||||
block: |
|
||||
plugin_dir = "{{ nomad_plugins_dir }}"
|
||||
insertafter: 'data_dir = "/opt/nomad/data"'
|
||||
|
||||
- name: Fix Podman socket permissions
|
||||
file:
|
||||
path: /run/user/1001/podman/podman.sock
|
||||
mode: '0666'
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Ensure nomad user can access Podman socket
|
||||
user:
|
||||
name: "{{ nomad_user }}"
|
||||
groups: ben
|
||||
append: yes
|
||||
|
||||
- name: Start Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: started
|
||||
enabled: yes
|
||||
|
||||
- name: Wait for Nomad to be ready
|
||||
wait_for:
|
||||
port: 4646
|
||||
host: localhost
|
||||
delay: 10
|
||||
timeout: 60
|
||||
|
||||
- name: Verify Nomad is running
|
||||
systemd:
|
||||
name: nomad
|
||||
register: nomad_service_status
|
||||
|
||||
- name: Display Nomad service status
|
||||
debug:
|
||||
msg: "Nomad service is {{ nomad_service_status.status.ActiveState }}"
|
||||
|
||||
- name: Wait for plugins to load
|
||||
pause:
|
||||
seconds: 15
|
||||
|
||||
- name: Check available drivers
|
||||
shell: |
|
||||
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self | grep -A 20 "Driver Status"
|
||||
register: driver_status
|
||||
failed_when: false
|
||||
|
||||
- name: Display driver status
|
||||
debug:
|
||||
var: driver_status.stdout_lines
|
||||
|
||||
- name: Test Podman driver functionality
|
||||
shell: |
|
||||
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers | keys[]'
|
||||
register: available_drivers
|
||||
failed_when: false
|
||||
|
||||
- name: Display available drivers
|
||||
debug:
|
||||
msg: "Available drivers: {{ available_drivers.stdout_lines | join(', ') }}"
|
||||
|
||||
- name: Clean up downloaded files
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
|
||||
- "/tmp/nomad-driver-podman"
|
||||
|
||||
- name: Final verification - Check if Podman driver is loaded
|
||||
shell: |
|
||||
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers.podman.Detected'
|
||||
register: podman_driver_detected
|
||||
failed_when: false
|
||||
|
||||
- name: Display final result
|
||||
debug:
|
||||
msg: |
|
||||
Podman driver installation: {{ 'SUCCESS' if podman_driver_detected.stdout == 'true' else 'NEEDS VERIFICATION' }}
|
||||
Driver detected: {{ podman_driver_detected.stdout | default('unknown') }}
|
||||
61
configuration/playbooks/install-podman-compose.yml
Normal file
61
configuration/playbooks/install-podman-compose.yml
Normal file
@@ -0,0 +1,61 @@
|
||||
---
|
||||
- name: Install Podman Compose on all Nomad cluster nodes
|
||||
hosts: nomad_cluster
|
||||
become: yes
|
||||
|
||||
tasks:
|
||||
- name: Display target node
|
||||
debug:
|
||||
msg: "正在安装 Podman Compose 到节点: {{ inventory_hostname }}"
|
||||
|
||||
- name: Update package cache
|
||||
apt:
|
||||
update_cache: yes
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Install Podman and related tools
|
||||
apt:
|
||||
name:
|
||||
- podman
|
||||
- podman-compose
|
||||
- buildah
|
||||
- skopeo
|
||||
state: present
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Install additional dependencies
|
||||
apt:
|
||||
name:
|
||||
- python3-pip
|
||||
- python3-setuptools
|
||||
state: present
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Install podman-compose via pip if package manager failed
|
||||
pip:
|
||||
name: podman-compose
|
||||
state: present
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Verify Podman installation
|
||||
shell: podman --version
|
||||
register: podman_version
|
||||
|
||||
- name: Verify Podman Compose installation
|
||||
shell: podman-compose --version
|
||||
register: podman_compose_version
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Display installation results
|
||||
debug:
|
||||
msg: |
|
||||
✅ 节点 {{ inventory_hostname }} 安装结果:
|
||||
📦 Podman: {{ podman_version.stdout }}
|
||||
🐳 Podman Compose: {{ podman_compose_version.stdout if podman_compose_version.rc == 0 else '安装失败或不可用' }}
|
||||
|
||||
- name: Ensure Podman socket is enabled
|
||||
systemd:
|
||||
name: podman.socket
|
||||
enabled: yes
|
||||
state: started
|
||||
ignore_errors: yes
|
||||
@@ -1,131 +0,0 @@
|
||||
---
|
||||
- name: Operations Toolkit - Unified Management Dashboard
|
||||
hosts: all
|
||||
gather_facts: yes
|
||||
|
||||
vars:
|
||||
# 可用的运维脚本
|
||||
available_scripts:
|
||||
- { name: "system-update", desc: "System package updates", file: "system-update.yml" }
|
||||
- { name: "system-cleanup", desc: "System cleanup and maintenance", file: "system-cleanup.yml" }
|
||||
- { name: "service-health", desc: "Service health monitoring", file: "service-health-check.yml" }
|
||||
- { name: "security-hardening", desc: "Security hardening and backup", file: "security-hardening.yml" }
|
||||
- { name: "docker-management", desc: "Docker container management", file: "docker-management.yml" }
|
||||
- { name: "network-connectivity", desc: "Network connectivity check", file: "network-connectivity.yml" }
|
||||
- { name: "certificate-management", desc: "SSL certificate monitoring", file: "certificate-management.yml" }
|
||||
|
||||
tasks:
|
||||
# 显示系统概览
|
||||
- name: Display system overview
|
||||
debug:
|
||||
msg: |
|
||||
🖥️ System Overview for {{ inventory_hostname }}:
|
||||
📊 OS: {{ ansible_distribution }} {{ ansible_distribution_version }}
|
||||
💾 Memory: {{ (ansible_memtotal_mb/1024)|round(1) }}GB total, {{ (ansible_memfree_mb/1024)|round(1) }}GB free
|
||||
💿 CPU: {{ ansible_processor_vcpus }} cores
|
||||
🏠 Architecture: {{ ansible_architecture }}
|
||||
🌐 IP: {{ ansible_default_ipv4.address }}
|
||||
⏰ Uptime: {{ ansible_uptime_seconds//86400 }}d {{ (ansible_uptime_seconds%86400)//3600 }}h {{ ((ansible_uptime_seconds%3600)//60) }}m
|
||||
|
||||
# 快速系统状态检查
|
||||
- name: Quick system status check
|
||||
shell: |
|
||||
echo "=== DISK USAGE ==="
|
||||
df -h | grep -E "(Filesystem|/dev/)"
|
||||
echo ""
|
||||
echo "=== MEMORY USAGE ==="
|
||||
free -h
|
||||
echo ""
|
||||
echo "=== LOAD AVERAGE ==="
|
||||
uptime
|
||||
echo ""
|
||||
echo "=== TOP PROCESSES ==="
|
||||
ps aux --sort=-%cpu | head -6
|
||||
register: quick_status
|
||||
|
||||
- name: Display quick status
|
||||
debug:
|
||||
msg: "{{ quick_status.stdout_lines }}"
|
||||
|
||||
# 检查关键服务状态
|
||||
- name: Check critical services
|
||||
systemd:
|
||||
name: "{{ item }}"
|
||||
register: service_status
|
||||
loop:
|
||||
- ssh
|
||||
- systemd-resolved
|
||||
- cron
|
||||
failed_when: false
|
||||
|
||||
- name: Display service status
|
||||
debug:
|
||||
msg: "🔧 {{ item.item }}: {{ item.status.ActiveState if item.status is defined else 'NOT FOUND' }}"
|
||||
loop: "{{ service_status.results }}"
|
||||
|
||||
# 检查最近的系统日志错误
|
||||
- name: Check recent system errors
|
||||
shell: journalctl --since "1 hour ago" --priority=err --no-pager | tail -10
|
||||
register: recent_errors
|
||||
failed_when: false
|
||||
|
||||
- name: Display recent errors
|
||||
debug:
|
||||
msg: "🚨 Recent Errors: {{ recent_errors.stdout_lines if recent_errors.stdout_lines else ['No recent errors found'] }}"
|
||||
|
||||
# 检查网络连接
|
||||
- name: Quick network check
|
||||
shell: |
|
||||
echo "=== NETWORK INTERFACES ==="
|
||||
ip -br addr show
|
||||
echo ""
|
||||
echo "=== DEFAULT ROUTE ==="
|
||||
ip route | grep default
|
||||
echo ""
|
||||
echo "=== DNS TEST ==="
|
||||
nslookup google.com | grep -A1 "Name:" || echo "DNS resolution failed"
|
||||
register: network_check
|
||||
failed_when: false
|
||||
|
||||
- name: Display network status
|
||||
debug:
|
||||
msg: "🌐 Network Status: {{ network_check.stdout_lines }}"
|
||||
|
||||
# 显示可用的运维脚本
|
||||
- name: Display available operations scripts
|
||||
debug:
|
||||
msg: |
|
||||
🛠️ Available Operations Scripts:
|
||||
{% for script in available_scripts %}
|
||||
{{ loop.index }}. {{ script.name }}: {{ script.desc }}
|
||||
{% endfor %}
|
||||
|
||||
💡 Usage Examples:
|
||||
ansible-playbook -i inventory.ini system-cleanup.yml --limit {{ inventory_hostname }}
|
||||
ansible-playbook -i inventory.ini docker-management.yml --limit lxc
|
||||
ansible-playbook -i inventory.ini network-connectivity.yml --limit proxmox
|
||||
|
||||
# 生成运维建议
|
||||
- name: Generate maintenance recommendations
|
||||
debug:
|
||||
msg: |
|
||||
💡 Maintenance Recommendations for {{ inventory_hostname }}:
|
||||
|
||||
🔄 Regular Tasks (Weekly):
|
||||
- Run system-cleanup.yml to free up disk space
|
||||
- Check service-health-check.yml for service status
|
||||
- Review certificate-management.yml for expiring certificates
|
||||
|
||||
🔒 Security Tasks (Monthly):
|
||||
- Execute security-hardening.yml for security updates
|
||||
- Review network-connectivity.yml for network security
|
||||
|
||||
🐳 Container Tasks (As needed):
|
||||
- Use docker-management.yml for Docker maintenance
|
||||
|
||||
📊 Monitoring Tasks (Daily):
|
||||
- Quick check with ops-toolkit.yml (this script)
|
||||
|
||||
⚡ Emergency Tasks:
|
||||
- Use system-update.yml for critical security patches
|
||||
- Run network-connectivity.yml for connectivity issues
|
||||
167
configuration/playbooks/migrate-to-podman-simple.yml
Normal file
167
configuration/playbooks/migrate-to-podman-simple.yml
Normal file
@@ -0,0 +1,167 @@
|
||||
---
|
||||
- name: Migrate Nomad from Docker to Podman (Simple Version)
|
||||
hosts: all
|
||||
become: yes
|
||||
vars:
|
||||
nomad_user: nomad
|
||||
nomad_config_dir: /etc/nomad.d
|
||||
nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl"
|
||||
|
||||
tasks:
|
||||
- name: Stop Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: stopped
|
||||
|
||||
- name: Backup current Nomad configuration
|
||||
copy:
|
||||
src: "{{ nomad_config_file }}"
|
||||
dest: "{{ nomad_config_file }}.backup-{{ ansible_date_time.epoch }}"
|
||||
remote_src: yes
|
||||
|
||||
- name: Get nomad user info
|
||||
getent:
|
||||
database: passwd
|
||||
key: "{{ nomad_user }}"
|
||||
register: nomad_user_info
|
||||
|
||||
- name: Set nomad user UID variable
|
||||
set_fact:
|
||||
nomad_uid: "{{ nomad_user_info.ansible_facts.getent_passwd[nomad_user][1] }}"
|
||||
|
||||
- name: Enable lingering for nomad user
|
||||
command: loginctl enable-linger {{ nomad_user }}
|
||||
failed_when: false
|
||||
|
||||
- name: Create runtime directory for nomad user
|
||||
file:
|
||||
path: "/run/user/{{ nomad_uid }}"
|
||||
state: directory
|
||||
owner: "{{ nomad_user }}"
|
||||
group: "{{ nomad_user }}"
|
||||
mode: '0700'
|
||||
|
||||
- name: Start Podman socket as nomad user
|
||||
shell: |
|
||||
sudo -u {{ nomad_user }} XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} systemctl --user enable --now podman.socket
|
||||
args:
|
||||
creates: "/run/user/{{ nomad_uid }}/podman/podman.sock"
|
||||
|
||||
- name: Create new Nomad configuration with Podman
|
||||
copy:
|
||||
content: |
|
||||
datacenter = "dc1"
|
||||
region = "global"
|
||||
data_dir = "/opt/nomad/data"
|
||||
|
||||
bind_addr = "0.0.0.0"
|
||||
|
||||
client {
|
||||
enabled = true
|
||||
servers = [
|
||||
"100.116.158.95:4647",
|
||||
]
|
||||
}
|
||||
|
||||
# Docker plugin (disabled)
|
||||
# plugin "docker" {
|
||||
# config {
|
||||
# allow_privileged = true
|
||||
# volumes {
|
||||
# enabled = true
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
|
||||
plugin "podman" {
|
||||
config {
|
||||
socket_path = "unix:///run/user/{{ nomad_uid }}/podman/podman.sock"
|
||||
volumes {
|
||||
enabled = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
consul {
|
||||
address = "127.0.0.1:8500"
|
||||
}
|
||||
dest: "{{ nomad_config_file }}"
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Update Nomad systemd service to run as nomad user
|
||||
copy:
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Nomad
|
||||
Documentation=https://www.nomadproject.io/
|
||||
Requires=network-online.target
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=notify
|
||||
User={{ nomad_user }}
|
||||
Group={{ nomad_user }}
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
ExecStart=/usr/local/bin/nomad agent -config={{ nomad_config_dir }}
|
||||
KillMode=process
|
||||
Restart=on-failure
|
||||
LimitNOFILE=65536
|
||||
Environment=XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }}
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
dest: /etc/systemd/system/nomad.service
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Reload systemd daemon
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
|
||||
- name: Start Nomad service
|
||||
systemd:
|
||||
name: nomad
|
||||
state: started
|
||||
enabled: yes
|
||||
|
||||
- name: Wait for Nomad to be ready (local check)
|
||||
wait_for:
|
||||
port: 4646
|
||||
host: localhost
|
||||
delay: 5
|
||||
timeout: 60
|
||||
|
||||
- name: Verify Nomad is running
|
||||
shell: systemctl is-active nomad
|
||||
register: nomad_status
|
||||
|
||||
- name: Display Nomad status
|
||||
debug:
|
||||
msg: "Nomad service status: {{ nomad_status.stdout }}"
|
||||
|
||||
- name: Check Podman socket
|
||||
stat:
|
||||
path: "/run/user/{{ nomad_uid }}/podman/podman.sock"
|
||||
register: podman_socket
|
||||
|
||||
- name: Display Podman socket status
|
||||
debug:
|
||||
msg: "Podman socket exists: {{ podman_socket.stat.exists }}"
|
||||
|
||||
- name: Test Podman as nomad user
|
||||
shell: |
|
||||
sudo -u {{ nomad_user }} XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} podman version --format json
|
||||
register: podman_test
|
||||
failed_when: false
|
||||
|
||||
- name: Display Podman test result
|
||||
debug:
|
||||
msg: |
|
||||
Podman test: {{ 'SUCCESS' if podman_test.rc == 0 else 'FAILED' }}
|
||||
{% if podman_test.rc != 0 %}
|
||||
Error: {{ podman_test.stderr }}
|
||||
{% endif %}
|
||||
@@ -1,143 +0,0 @@
|
||||
---
|
||||
- name: Network Connectivity and Performance Check
|
||||
hosts: all
|
||||
gather_facts: yes
|
||||
|
||||
vars:
|
||||
test_domains:
|
||||
- google.com
|
||||
- github.com
|
||||
- docker.io
|
||||
- tailscale.com
|
||||
test_ports:
|
||||
- { host: "8.8.8.8", port: 53, name: "Google DNS" }
|
||||
- { host: "1.1.1.1", port: 53, name: "Cloudflare DNS" }
|
||||
- { host: "github.com", port: 443, name: "GitHub HTTPS" }
|
||||
- { host: "docker.io", port: 443, name: "Docker Hub" }
|
||||
|
||||
tasks:
|
||||
# 基本网络信息
|
||||
- name: Get network interfaces
|
||||
shell: ip addr show | grep -E "^[0-9]+:|inet "
|
||||
register: network_interfaces
|
||||
|
||||
- name: Display network interfaces
|
||||
debug:
|
||||
msg: "🌐 Network Interfaces: {{ network_interfaces.stdout_lines }}"
|
||||
|
||||
# 检查默认路由
|
||||
- name: Check default route
|
||||
shell: ip route | grep default
|
||||
register: default_route
|
||||
|
||||
- name: Display default route
|
||||
debug:
|
||||
msg: "🛣️ Default Route: {{ default_route.stdout }}"
|
||||
|
||||
# DNS 解析测试
|
||||
- name: Test DNS resolution
|
||||
shell: nslookup {{ item }} | grep -A2 "Name:"
|
||||
register: dns_test
|
||||
loop: "{{ test_domains }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Display DNS test results
|
||||
debug:
|
||||
msg: "🔍 DNS Test for {{ item.item }}: {{ 'SUCCESS' if item.rc == 0 else 'FAILED' }}"
|
||||
loop: "{{ dns_test.results }}"
|
||||
|
||||
# 网络连通性测试
|
||||
- name: Test network connectivity (ping)
|
||||
shell: ping -c 3 {{ item }}
|
||||
register: ping_test
|
||||
loop: "{{ test_domains }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Display ping test results
|
||||
debug:
|
||||
msg: "🏓 Ping to {{ item.item }}: {{ 'SUCCESS' if item.rc == 0 else 'FAILED' }}"
|
||||
loop: "{{ ping_test.results }}"
|
||||
|
||||
# 端口连通性测试
|
||||
- name: Test port connectivity
|
||||
wait_for:
|
||||
host: "{{ item.host }}"
|
||||
port: "{{ item.port }}"
|
||||
timeout: 5
|
||||
register: port_test
|
||||
loop: "{{ test_ports }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Display port test results
|
||||
debug:
|
||||
msg: "🔌 {{ item.item.name }} ({{ item.item.host }}:{{ item.item.port }}): {{ 'SUCCESS' if not item.failed else 'FAILED' }}"
|
||||
loop: "{{ port_test.results }}"
|
||||
|
||||
# 检查 Tailscale 状态
|
||||
- name: Check Tailscale status
|
||||
shell: tailscale status
|
||||
register: tailscale_status
|
||||
failed_when: false
|
||||
|
||||
- name: Display Tailscale status
|
||||
debug:
|
||||
msg: "🔗 Tailscale Status: {{ 'CONNECTED' if tailscale_status.rc == 0 else 'NOT CONNECTED' }}"
|
||||
|
||||
- name: Show Tailscale details
|
||||
debug:
|
||||
msg: "{{ tailscale_status.stdout_lines }}"
|
||||
when: tailscale_status.rc == 0
|
||||
|
||||
# 检查防火墙状态
|
||||
- name: Check UFW status (Ubuntu/Debian)
|
||||
shell: ufw status
|
||||
register: ufw_status
|
||||
failed_when: false
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
- name: Display UFW status
|
||||
debug:
|
||||
msg: "🛡️ UFW Firewall: {{ ufw_status.stdout_lines }}"
|
||||
when: ansible_os_family == "Debian" and ufw_status.rc == 0
|
||||
|
||||
# 检查 iptables 规则
|
||||
- name: Check iptables rules
|
||||
shell: iptables -L -n | head -20
|
||||
register: iptables_rules
|
||||
failed_when: false
|
||||
become: yes
|
||||
|
||||
- name: Display iptables summary
|
||||
debug:
|
||||
msg: "🔥 Iptables Rules: {{ iptables_rules.stdout_lines[:10] }}"
|
||||
when: iptables_rules.rc == 0
|
||||
|
||||
# 网络性能测试
|
||||
- name: Test download speed (small file)
|
||||
shell: curl -o /dev/null -s -w "%{time_total}" http://speedtest.wdc01.softlayer.com/downloads/test10.zip
|
||||
register: download_speed
|
||||
failed_when: false
|
||||
|
||||
- name: Display download speed test
|
||||
debug:
|
||||
msg: "⚡ Download Speed Test: {{ download_speed.stdout }}s for 10MB file"
|
||||
when: download_speed.rc == 0
|
||||
|
||||
# 检查网络统计
|
||||
- name: Get network statistics
|
||||
shell: cat /proc/net/dev | grep -v "lo:" | grep ":"
|
||||
register: network_stats
|
||||
|
||||
- name: Display network statistics
|
||||
debug:
|
||||
msg: "📊 Network Stats: {{ network_stats.stdout_lines }}"
|
||||
|
||||
# 生成网络健康报告
|
||||
- name: Generate network health summary
|
||||
debug:
|
||||
msg: |
|
||||
🌐 Network Health Summary for {{ inventory_hostname }}:
|
||||
✅ DNS Resolution: {{ (dns_test.results | selectattr('rc', 'equalto', 0) | list | length) }}/{{ test_domains | length }} domains
|
||||
✅ Ping Connectivity: {{ (ping_test.results | selectattr('rc', 'equalto', 0) | list | length) }}/{{ test_domains | length }} hosts
|
||||
✅ Port Connectivity: {{ (port_test.results | rejectattr('failed', 'defined') | list | length) }}/{{ test_ports | length }} ports
|
||||
✅ Tailscale: {{ 'Connected' if tailscale_status.rc == 0 else 'Disconnected' }}
|
||||
@@ -1,135 +0,0 @@
|
||||
---
|
||||
- name: Service Health Check and Monitoring
|
||||
hosts: all
|
||||
become: yes
|
||||
gather_facts: yes
|
||||
|
||||
vars:
|
||||
critical_services:
|
||||
- ssh
|
||||
- systemd-resolved
|
||||
- cron
|
||||
web_services:
|
||||
- nginx
|
||||
- apache2
|
||||
database_services:
|
||||
- mysql
|
||||
- mariadb
|
||||
- postgresql
|
||||
container_services:
|
||||
- docker
|
||||
- containerd
|
||||
network_services:
|
||||
- tailscale
|
||||
- cloudflared
|
||||
|
||||
tasks:
|
||||
# 检查关键系统服务
|
||||
- name: Check critical system services
|
||||
systemd:
|
||||
name: "{{ item }}"
|
||||
register: critical_service_status
|
||||
loop: "{{ critical_services }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Report critical service issues
|
||||
debug:
|
||||
msg: "⚠️ Critical service {{ item.item }} is {{ item.status.ActiveState | default('not found') }}"
|
||||
loop: "{{ critical_service_status.results }}"
|
||||
when: item.status is defined and item.status.ActiveState != "active"
|
||||
|
||||
# 检查 Web 服务
|
||||
- name: Check web services
|
||||
systemd:
|
||||
name: "{{ item }}"
|
||||
register: web_service_status
|
||||
loop: "{{ web_services }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Report web service status
|
||||
debug:
|
||||
msg: "🌐 Web service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
|
||||
loop: "{{ web_service_status.results }}"
|
||||
when: item.status is defined
|
||||
|
||||
# 检查数据库服务
|
||||
- name: Check database services
|
||||
systemd:
|
||||
name: "{{ item }}"
|
||||
register: db_service_status
|
||||
loop: "{{ database_services }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Report database service status
|
||||
debug:
|
||||
msg: "🗄️ Database service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
|
||||
loop: "{{ db_service_status.results }}"
|
||||
when: item.status is defined
|
||||
|
||||
# 检查容器服务
|
||||
- name: Check container services
|
||||
systemd:
|
||||
name: "{{ item }}"
|
||||
register: container_service_status
|
||||
loop: "{{ container_services }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Report container service status
|
||||
debug:
|
||||
msg: "📦 Container service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
|
||||
loop: "{{ container_service_status.results }}"
|
||||
when: item.status is defined
|
||||
|
||||
# 检查网络服务
|
||||
- name: Check network services
|
||||
systemd:
|
||||
name: "{{ item }}"
|
||||
register: network_service_status
|
||||
loop: "{{ network_services }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Report network service status
|
||||
debug:
|
||||
msg: "🌐 Network service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
|
||||
loop: "{{ network_service_status.results }}"
|
||||
when: item.status is defined
|
||||
|
||||
# 检查系统负载
|
||||
- name: Check system load
|
||||
shell: uptime
|
||||
register: system_load
|
||||
|
||||
- name: Display system load
|
||||
debug:
|
||||
msg: "📊 System Load: {{ system_load.stdout }}"
|
||||
|
||||
# 检查磁盘空间警告
|
||||
- name: Check disk space usage
|
||||
shell: df -h | awk '$5 > 80 {print $0}'
|
||||
register: disk_warning
|
||||
changed_when: false
|
||||
|
||||
- name: Warn about high disk usage
|
||||
debug:
|
||||
msg: "⚠️ High disk usage detected: {{ disk_warning.stdout_lines }}"
|
||||
when: disk_warning.stdout_lines | length > 0
|
||||
|
||||
# 检查内存使用率
|
||||
- name: Check memory usage percentage
|
||||
shell: free | awk 'NR==2{printf "%.2f%%", $3*100/$2}'
|
||||
register: memory_percent
|
||||
|
||||
- name: Display memory usage
|
||||
debug:
|
||||
msg: "🧠 Memory Usage: {{ memory_percent.stdout }}"
|
||||
|
||||
# 检查最近的系统错误
|
||||
- name: Check recent system errors
|
||||
shell: journalctl --since "1 hour ago" --priority=err --no-pager | tail -10
|
||||
register: recent_errors
|
||||
changed_when: false
|
||||
|
||||
- name: Display recent errors
|
||||
debug:
|
||||
msg: "🚨 Recent system errors: {{ recent_errors.stdout_lines }}"
|
||||
when: recent_errors.stdout_lines | length > 0
|
||||
120
configuration/playbooks/remove-docker-install-podman.yml
Normal file
120
configuration/playbooks/remove-docker-install-podman.yml
Normal file
@@ -0,0 +1,120 @@
|
||||
---
|
||||
- name: 移除 Docker 并安装 Podman - 新 Server 节点
|
||||
hosts: ash2e,ash1d,ch2
|
||||
become: yes
|
||||
gather_facts: no
|
||||
serial: 1 # 逐个节点处理,避免并发冲突
|
||||
|
||||
tasks:
|
||||
- name: 显示当前处理的节点
|
||||
debug:
|
||||
msg: "🔧 正在处理节点: {{ inventory_hostname }}"
|
||||
|
||||
- name: 检查 Docker 服务状态
|
||||
shell: systemctl is-active docker 2>/dev/null || echo "inactive"
|
||||
register: docker_status
|
||||
changed_when: false
|
||||
|
||||
- name: 停止 Docker 服务
|
||||
systemd:
|
||||
name: docker
|
||||
state: stopped
|
||||
enabled: no
|
||||
ignore_errors: yes
|
||||
when: docker_status.stdout == "active"
|
||||
|
||||
- name: 停止 Docker socket
|
||||
systemd:
|
||||
name: docker.socket
|
||||
state: stopped
|
||||
enabled: no
|
||||
ignore_errors: yes
|
||||
|
||||
- name: 移除 Docker 相关包
|
||||
apt:
|
||||
name:
|
||||
- docker-ce
|
||||
- docker-ce-cli
|
||||
- containerd.io
|
||||
- docker-buildx-plugin
|
||||
- docker-compose-plugin
|
||||
- docker.io
|
||||
- docker-doc
|
||||
- docker-compose
|
||||
- docker-registry
|
||||
- containerd
|
||||
- runc
|
||||
state: absent
|
||||
purge: yes
|
||||
ignore_errors: yes
|
||||
|
||||
- name: 清理 Docker 数据目录
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- /var/lib/docker
|
||||
- /var/lib/containerd
|
||||
- /etc/docker
|
||||
- /etc/containerd
|
||||
ignore_errors: yes
|
||||
|
||||
- name: 清理 Docker 用户组
|
||||
group:
|
||||
name: docker
|
||||
state: absent
|
||||
ignore_errors: yes
|
||||
|
||||
- name: 更新包缓存
|
||||
apt:
|
||||
update_cache: yes
|
||||
cache_valid_time: 3600
|
||||
|
||||
- name: 安装 Podman 及相关工具
|
||||
apt:
|
||||
name:
|
||||
- podman
|
||||
- buildah
|
||||
- skopeo
|
||||
- podman-compose
|
||||
state: present
|
||||
retries: 3
|
||||
delay: 10
|
||||
|
||||
- name: 启用 Podman socket 服务
|
||||
systemd:
|
||||
name: podman.socket
|
||||
enabled: yes
|
||||
state: started
|
||||
ignore_errors: yes
|
||||
|
||||
- name: 创建 Podman 用户服务目录
|
||||
file:
|
||||
path: /etc/systemd/user
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: 验证 Podman 安装
|
||||
shell: podman --version
|
||||
register: podman_version
|
||||
|
||||
- name: 验证 Podman Compose 安装
|
||||
shell: podman-compose --version 2>/dev/null || echo "未安装"
|
||||
register: podman_compose_version
|
||||
|
||||
- name: 检查 Docker 清理状态
|
||||
shell: systemctl is-active docker 2>/dev/null || echo "已移除"
|
||||
register: final_docker_status
|
||||
|
||||
- name: 显示节点处理结果
|
||||
debug:
|
||||
msg: |
|
||||
✅ 节点 {{ inventory_hostname }} 处理完成
|
||||
🐳 Docker 状态: {{ final_docker_status.stdout }}
|
||||
📦 Podman 版本: {{ podman_version.stdout }}
|
||||
🔧 Compose 状态: {{ podman_compose_version.stdout }}
|
||||
|
||||
- name: 清理 apt 缓存
|
||||
apt:
|
||||
autoclean: yes
|
||||
autoremove: yes
|
||||
39
configuration/playbooks/restart-tailscale.yml
Normal file
39
configuration/playbooks/restart-tailscale.yml
Normal file
@@ -0,0 +1,39 @@
|
||||
---
|
||||
- name: Restart Tailscale to fix DNS issues
|
||||
hosts: hcp1,hcp2
|
||||
become: yes
|
||||
|
||||
tasks:
|
||||
- name: Check current DNS configuration
|
||||
shell: cat /etc/resolv.conf
|
||||
register: dns_before
|
||||
|
||||
- name: Display current DNS config
|
||||
debug:
|
||||
msg: "Current DNS config: {{ dns_before.stdout_lines }}"
|
||||
|
||||
- name: Restart tailscaled service
|
||||
systemd:
|
||||
name: tailscaled
|
||||
state: restarted
|
||||
|
||||
- name: Wait for tailscale to stabilize
|
||||
wait_for:
|
||||
timeout: 10
|
||||
|
||||
- name: Check DNS configuration after restart
|
||||
shell: cat /etc/resolv.conf
|
||||
register: dns_after
|
||||
|
||||
- name: Display new DNS config
|
||||
debug:
|
||||
msg: "New DNS config: {{ dns_after.stdout_lines }}"
|
||||
|
||||
- name: Test DNS resolution
|
||||
shell: nslookup apt.releases.hashicorp.com
|
||||
register: dns_test
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Display DNS test result
|
||||
debug:
|
||||
msg: "DNS test result: {{ dns_test.stdout_lines }}"
|
||||
@@ -1,152 +0,0 @@
|
||||
---
|
||||
- name: SSL Certificate Management and Monitoring
|
||||
hosts: all
|
||||
gather_facts: yes
|
||||
|
||||
vars:
|
||||
# 常见证书路径
|
||||
cert_paths:
|
||||
- /etc/ssl/certs
|
||||
- /etc/letsencrypt/live
|
||||
- /etc/nginx/ssl
|
||||
- /etc/apache2/ssl
|
||||
- /usr/local/share/ca-certificates
|
||||
|
||||
# 需要检查的服务端口
|
||||
ssl_services:
|
||||
- { name: "HTTPS", port: 443 }
|
||||
- { name: "SMTPS", port: 465 }
|
||||
- { name: "IMAPS", port: 993 }
|
||||
- { name: "LDAPS", port: 636 }
|
||||
|
||||
tasks:
|
||||
# 检查证书目录
|
||||
- name: Check certificate directories
|
||||
stat:
|
||||
path: "{{ item }}"
|
||||
register: cert_dirs
|
||||
loop: "{{ cert_paths }}"
|
||||
|
||||
- name: List existing certificate directories
|
||||
debug:
|
||||
msg: "📁 Certificate directory {{ item.item }}: {{ 'EXISTS' if item.stat.exists else 'NOT FOUND' }}"
|
||||
loop: "{{ cert_dirs.results }}"
|
||||
|
||||
# 查找证书文件
|
||||
- name: Find certificate files
|
||||
find:
|
||||
paths: "{{ cert_paths }}"
|
||||
patterns: "*.crt,*.pem,*.cert"
|
||||
recurse: yes
|
||||
register: cert_files
|
||||
|
||||
- name: Display found certificates
|
||||
debug:
|
||||
msg: "🔐 Found {{ cert_files.files | length }} certificate files"
|
||||
|
||||
# 检查证书过期时间
|
||||
- name: Check certificate expiration
|
||||
shell: |
|
||||
if [ -f "{{ item.path }}" ]; then
|
||||
openssl x509 -in "{{ item.path }}" -noout -enddate 2>/dev/null | cut -d= -f2
|
||||
fi
|
||||
register: cert_expiry
|
||||
loop: "{{ cert_files.files[:10] }}" # 限制检查前10个证书
|
||||
failed_when: false
|
||||
|
||||
- name: Display certificate expiration dates
|
||||
debug:
|
||||
msg: "📅 {{ item.item.path | basename }}: expires {{ item.stdout if item.stdout else 'INVALID/UNREADABLE' }}"
|
||||
loop: "{{ cert_expiry.results }}"
|
||||
when: item.stdout != ""
|
||||
|
||||
# 检查即将过期的证书 (30天内)
|
||||
- name: Check certificates expiring soon
|
||||
shell: |
|
||||
if [ -f "{{ item.path }}" ]; then
|
||||
exp_date=$(openssl x509 -in "{{ item.path }}" -noout -enddate 2>/dev/null | cut -d= -f2)
|
||||
if [ ! -z "$exp_date" ]; then
|
||||
exp_epoch=$(date -d "$exp_date" +%s 2>/dev/null)
|
||||
now_epoch=$(date +%s)
|
||||
days_left=$(( (exp_epoch - now_epoch) / 86400 ))
|
||||
if [ $days_left -lt 30 ]; then
|
||||
echo "WARNING: $days_left days left"
|
||||
else
|
||||
echo "OK: $days_left days left"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
register: cert_warnings
|
||||
loop: "{{ cert_files.files[:10] }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Display certificate warnings
|
||||
debug:
|
||||
msg: "⚠️ {{ item.item.path | basename }}: {{ item.stdout }}"
|
||||
loop: "{{ cert_warnings.results }}"
|
||||
when: item.stdout != "" and "WARNING" in item.stdout
|
||||
|
||||
# 检查 Let's Encrypt 证书
|
||||
- name: Check Let's Encrypt certificates
|
||||
shell: certbot certificates 2>/dev/null || echo "Certbot not installed"
|
||||
register: letsencrypt_certs
|
||||
failed_when: false
|
||||
|
||||
- name: Display Let's Encrypt status
|
||||
debug:
|
||||
msg: "🔒 Let's Encrypt: {{ letsencrypt_certs.stdout_lines }}"
|
||||
when: "'not installed' not in letsencrypt_certs.stdout"
|
||||
|
||||
# 检查 SSL 服务端口
|
||||
- name: Check SSL service ports
|
||||
wait_for:
|
||||
port: "{{ item.port }}"
|
||||
timeout: 3
|
||||
register: ssl_ports
|
||||
loop: "{{ ssl_services }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Display SSL service status
|
||||
debug:
|
||||
msg: "🔌 {{ item.item.name }} (port {{ item.item.port }}): {{ 'LISTENING' if not item.failed else 'NOT AVAILABLE' }}"
|
||||
loop: "{{ ssl_ports.results }}"
|
||||
|
||||
# 测试 HTTPS 连接
|
||||
- name: Test HTTPS connection to localhost
|
||||
uri:
|
||||
url: "https://{{ ansible_default_ipv4.address }}"
|
||||
method: GET
|
||||
validate_certs: no
|
||||
timeout: 5
|
||||
register: https_test
|
||||
failed_when: false
|
||||
when: ssl_ports.results[0] is defined and not ssl_ports.results[0].failed
|
||||
|
||||
- name: Display HTTPS test result
|
||||
debug:
|
||||
msg: "🌐 HTTPS Test: {{ 'SUCCESS' if https_test.status is defined else 'FAILED' }}"
|
||||
when: https_test is defined
|
||||
|
||||
# 检查证书链
|
||||
- name: Check certificate chain for HTTPS
|
||||
shell: |
|
||||
echo | openssl s_client -connect {{ ansible_default_ipv4.address }}:443 -servername {{ ansible_hostname }} 2>/dev/null | openssl x509 -noout -subject -issuer
|
||||
register: cert_chain
|
||||
failed_when: false
|
||||
when: ssl_ports.results[0] is defined and not ssl_ports.results[0].failed
|
||||
|
||||
- name: Display certificate chain info
|
||||
debug:
|
||||
msg: "🔗 Certificate Chain: {{ cert_chain.stdout_lines }}"
|
||||
when: cert_chain is defined and cert_chain.rc == 0
|
||||
|
||||
# 生成证书健康报告
|
||||
- name: Generate certificate health summary
|
||||
debug:
|
||||
msg: |
|
||||
🔐 Certificate Health Summary for {{ inventory_hostname }}:
|
||||
📁 Certificate directories found: {{ (cert_dirs.results | selectattr('stat.exists') | list | length) }}
|
||||
📄 Certificate files found: {{ cert_files.files | length }}
|
||||
⚠️ Certificates expiring soon: {{ (cert_warnings.results | selectattr('stdout', 'search', 'WARNING') | list | length) }}
|
||||
🔒 Let's Encrypt: {{ 'Configured' if 'not installed' not in letsencrypt_certs.stdout else 'Not installed' }}
|
||||
🌐 SSL Services: {{ (ssl_ports.results | rejectattr('failed') | list | length) }}/{{ ssl_services | length }} available
|
||||
@@ -1,119 +0,0 @@
|
||||
---
# Security hardening and configuration backup for all nodes.
# Read-only audit tasks are marked changed_when/failed_when so the play is
# idempotent and survives missing log files (e.g. containers without auth.log).
- name: Security Hardening and Backup
  hosts: all
  become: yes
  gather_facts: yes

  tasks:
    # SSH hardening: disable root login, password auth and X11 forwarding,
    # and cap auth attempts.  lineinfile keeps a backup of sshd_config.
    - name: Check SSH configuration security
      lineinfile:
        path: /etc/ssh/sshd_config
        regexp: "{{ item.regexp }}"
        line: "{{ item.line }}"
        backup: yes
      loop:
        - { regexp: '^#?PermitRootLogin', line: 'PermitRootLogin no' }
        - { regexp: '^#?PasswordAuthentication', line: 'PasswordAuthentication no' }
        - { regexp: '^#?X11Forwarding', line: 'X11Forwarding no' }
        - { regexp: '^#?MaxAuthTries', line: 'MaxAuthTries 3' }
      notify: restart ssh
      when: ansible_os_family == "Debian"

    # Firewall status (read-only; ufw may be absent, hence failed_when: false)
    - name: Check UFW firewall status
      shell: ufw status
      register: ufw_status
      changed_when: false
      failed_when: false
      when: ansible_os_family == "Debian"

    - name: Display firewall status
      debug:
        msg: "🔥 Firewall Status: {{ ufw_status.stdout_lines }}"
      when: ansible_os_family == "Debian" and ufw_status.stdout_lines is defined

    # Recent failed logins (grep exits non-zero on no match / missing file)
    - name: Check for failed login attempts
      shell: grep "Failed password" /var/log/auth.log | tail -10
      register: failed_logins
      changed_when: false
      failed_when: false

    - name: Report suspicious login attempts
      debug:
        msg: "🚨 Recent failed logins: {{ failed_logins.stdout_lines }}"
      when: failed_logins.stdout_lines is defined and failed_logins.stdout_lines | length > 0

    # Recent sudo-to-root activity
    - name: Check recent root activity
      shell: grep "sudo.*root" /var/log/auth.log | tail -5
      register: root_activity
      changed_when: false
      failed_when: false

    - name: Display root activity
      debug:
        msg: "👑 Recent root activity: {{ root_activity.stdout_lines }}"
      when: root_activity.stdout_lines is defined and root_activity.stdout_lines | length > 0

    # Backup important configuration files with an epoch-stamped suffix
    - name: Create backup directory
      file:
        path: /backup/configs
        state: directory
        mode: '0700'

    - name: Backup important configuration files
      copy:
        src: "{{ item }}"
        dest: "/backup/configs/{{ item | basename }}.{{ ansible_date_time.epoch }}"
        remote_src: yes
        backup: yes
      loop:
        - /etc/ssh/sshd_config
        - /etc/hosts
        - /etc/fstab
        - /etc/crontab
      failed_when: false  # best-effort: some files may not exist on every host

    # System integrity: world-writable files (read-only audit)
    - name: Check for world-writable files
      shell: find /etc /usr /bin /sbin -type f -perm -002 2>/dev/null | head -10
      register: world_writable
      changed_when: false
      failed_when: false

    - name: Report world-writable files
      debug:
        msg: "⚠️ World-writable files found: {{ world_writable.stdout_lines }}"
      when: world_writable.stdout_lines is defined and world_writable.stdout_lines | length > 0

    # SUID inventory (read-only audit)
    - name: Check for SUID files
      shell: find /usr /bin /sbin -type f -perm -4000 2>/dev/null
      register: suid_files
      changed_when: false
      failed_when: false

    - name: Display SUID files count
      debug:
        msg: "🔐 Found {{ suid_files.stdout_lines | length }} SUID files"

    # Time synchronization (timedatectl may be unavailable in containers)
    - name: Sync system time
      shell: timedatectl set-ntp true
      failed_when: false

    - name: Check time synchronization
      shell: timedatectl status
      register: time_status
      changed_when: false  # read-only query; was previously reported as "changed"
      failed_when: false

    - name: Display time sync status
      debug:
        msg: "🕐 Time sync: {{ time_status.stdout_lines | select('match', '.*synchronized.*') | list }}"
      when: time_status.stdout_lines is defined

  handlers:
    - name: restart ssh
      systemd:
        name: ssh
        state: restarted
      when: ansible_os_family == "Debian"
|
||||
187
configuration/playbooks/setup-disk-monitoring.yml
Normal file
187
configuration/playbooks/setup-disk-monitoring.yml
Normal file
@@ -0,0 +1,187 @@
|
||||
---
# Deploy Telegraf disk monitoring to every Nomad node, shipping metrics to an
# existing InfluxDB 2.x + Grafana stack.
#
# Connection settings (influxdb_url, influxdb_token, influxdb_org,
# influxdb_bucket, telegraf_config_url) are supplied by inventory group_vars.
# They were previously re-declared here as self-referential play vars
# ("influxdb_url: {{ influxdb_url | default(...) }}"), which Ansible rejects
# at runtime with "recursive loop detected in template"; those aliases were
# removed and defaults are applied with the default() filter at the point of
# use instead.  The never-referenced use_remote_config flag was dropped too.
- name: 部署 Telegraf 硬盘监控到 Nomad 集群
  hosts: all
  become: yes
  vars:
    # Disk-usage alert thresholds (percent of capacity)
    disk_usage_warning: 80
    disk_usage_critical: 90
    # Metric collection interval in seconds (consumed by telegraf.conf.j2)
    collection_interval: 30

  tasks:
    - name: 显示正在处理的节点
      debug:
        msg: "🔧 正在为节点 {{ inventory_hostname }} 安装硬盘监控"

    # apt_key is deprecated in recent ansible-core but still functional;
    # kept for consistency with the rest of this repository.
    - name: 添加 InfluxData 仓库密钥
      apt_key:
        url: https://repos.influxdata.com/influxdata-archive_compat.key
        state: present
      retries: 3
      delay: 5

    - name: 添加 InfluxData 仓库
      apt_repository:
        repo: "deb https://repos.influxdata.com/ubuntu {{ ansible_distribution_release }} stable"
        state: present
        update_cache: yes
      retries: 3
      delay: 5

    - name: 安装 Telegraf
      apt:
        name: telegraf
        state: present
        update_cache: yes
      retries: 3
      delay: 10

    # The telegraf user/group exist once the package is installed.
    - name: 创建 Telegraf 配置目录
      file:
        path: /etc/telegraf/telegraf.d
        state: directory
        owner: telegraf
        group: telegraf
        mode: '0755'

    # Remove local log files/directories to save disk space; Telegraf logs to
    # the journal instead (see telegraf.conf.j2).  A second task that removed
    # /var/log/telegraf again was redundant and has been merged into this loop.
    - name: 清理旧的 Telegraf 日志文件(节省硬盘空间)
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /var/log/telegraf
        - /var/log/telegraf.log
      ignore_errors: yes

    - name: 创建 Telegraf 环境变量文件
      template:
        src: telegraf-env.j2
        dest: /etc/default/telegraf
        owner: root
        group: root
        mode: '0600'  # contains the InfluxDB token -- keep root-only
        backup: yes
      notify: restart telegraf

    # Remote-config mode: the systemd unit passes the config URL to telegraf.
    - name: 创建 Telegraf systemd 服务文件(支持远程配置)
      template:
        src: telegraf.service.j2
        dest: /etc/systemd/system/telegraf.service
        owner: root
        group: root
        mode: '0644'
        backup: yes
      notify:
        - reload systemd
        - restart telegraf
      when: telegraf_config_url is defined and telegraf_config_url != ''

    # Local-config mode: render the full configuration on the node.
    - name: 生成 Telegraf 主配置文件(本地配置模式)
      template:
        src: telegraf.conf.j2
        dest: /etc/telegraf/telegraf.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: yes
      notify: restart telegraf
      when: telegraf_config_url is not defined or telegraf_config_url == ''

    - name: 生成硬盘监控配置
      template:
        src: disk-monitoring.conf.j2
        dest: /etc/telegraf/telegraf.d/disk-monitoring.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: yes
      notify: restart telegraf

    - name: 生成系统监控配置
      template:
        src: system-monitoring.conf.j2
        dest: /etc/telegraf/telegraf.d/system-monitoring.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: yes
      notify: restart telegraf

    - name: 启用并启动 Telegraf 服务
      systemd:
        name: telegraf
        state: started
        enabled: yes
        daemon_reload: yes

    - name: 验证 Telegraf 状态
      systemd:
        name: telegraf
      register: telegraf_status

    # Connectivity probe from the control node; failure is informational only.
    - name: 检查 InfluxDB 连接
      uri:
        url: "{{ influxdb_url | default('http://influxdb1.tailnet-68f9.ts.net:8086') }}/ping"
        method: GET
        timeout: 5
      register: influxdb_ping
      ignore_errors: yes
      delegate_to: localhost
      run_once: true

    # default(0) guard: when the ping errors out, .status may be -1 or absent,
    # which previously made this debug task itself fail.
    - name: 显示 InfluxDB 连接状态
      debug:
        msg: "{{ '✅ InfluxDB 连接正常' if influxdb_ping.status | default(0) == 204 else '❌ InfluxDB 连接失败,请检查配置' }}"
      run_once: true

    - name: 显示 Telegraf 状态
      debug:
        msg: "✅ Telegraf 状态: {{ telegraf_status.status.ActiveState }}"

    # Immediate disk-usage report against the warning threshold.
    - name: 检查硬盘使用情况
      shell: |
        df -h | grep -vE '^Filesystem|tmpfs|cdrom|udev' | awk '{print $5 " " $1 " " $6}' | while read output;
        do
          usage=$(echo $output | awk '{print $1}' | sed 's/%//g')
          partition=$(echo $output | awk '{print $2}')
          mount=$(echo $output | awk '{print $3}')
          if [ $usage -ge {{ disk_usage_warning }} ]; then
            echo "⚠️ 警告: $mount ($partition) 使用率 $usage%"
          else
            echo "✅ $mount ($partition) 使用率 $usage%"
          fi
        done
      register: disk_check
      changed_when: false

    - name: 显示硬盘检查结果
      debug:
        msg: "{{ disk_check.stdout_lines }}"

  handlers:
    - name: reload systemd
      systemd:
        daemon_reload: yes

    - name: restart telegraf
      systemd:
        name: telegraf
        state: restarted
|
||||
76
configuration/playbooks/setup-new-nomad-nodes.yml
Normal file
76
configuration/playbooks/setup-new-nomad-nodes.yml
Normal file
@@ -0,0 +1,76 @@
|
||||
---
# Install and prepare new Nomad server nodes (binary + Podman toolchain only;
# full cluster configuration is applied by a follow-up playbook).
- name: 安装并配置新的 Nomad Server 节点
  hosts: ash2e,ash1d,ch2
  become: yes
  gather_facts: no

  tasks:
    - name: 更新包缓存
      apt:
        update_cache: yes
        cache_valid_time: 3600
      retries: 3
      delay: 10

    - name: 安装依赖包
      apt:
        name:
          - wget
          - curl
          - unzip
          - podman
          - buildah
          - skopeo
        state: present
      retries: 3
      delay: 10

    # Idempotency guard: skip the download block when nomad already exists.
    - name: 检查 Nomad 是否已安装
      shell: which nomad || echo "not_found"
      register: nomad_check
      changed_when: false

    - name: 下载并安装 Nomad
      block:
        - name: 下载 Nomad 1.10.5
          get_url:
            url: "https://releases.hashicorp.com/nomad/1.10.5/nomad_1.10.5_linux_amd64.zip"
            dest: "/tmp/nomad.zip"
            mode: '0644'

        - name: 解压 Nomad
          unarchive:
            src: "/tmp/nomad.zip"
            dest: "/usr/bin/"
            remote_src: yes
            owner: root
            group: root
            mode: '0755'

        - name: 清理临时文件
          file:
            path: "/tmp/nomad.zip"
            state: absent
      when: nomad_check.stdout == "not_found"

    # Read-only verification; changed_when: false keeps the play idempotent
    # (this task previously reported "changed" on every run).
    - name: 验证 Nomad 安装
      shell: nomad version
      register: nomad_version_output
      changed_when: false

    # NOTE(review): ansible_facts.packages is only populated by the
    # package_facts module, which this play never runs (and gather_facts is
    # off), so the Podman line always falls back to 'checking...'.  Kept for
    # callers that gather package facts beforehand.
    - name: 显示安装结果
      debug:
        msg: |
          ✅ 节点 {{ inventory_hostname }} 软件安装完成
          📦 Podman: {{ ansible_facts.packages.podman[0].version if ansible_facts.packages.podman is defined else 'checking...' }}
          🎯 Nomad: {{ nomad_version_output.stdout.split('\n')[0] }}

    # Nomad's podman driver talks to the rootful Podman API socket.
    - name: 启用 Podman socket
      systemd:
        name: podman.socket
        enabled: yes
        state: started
      ignore_errors: yes

    - name: 继续完整配置
      debug:
        msg: "软件安装完成,现在将运行完整的 Nomad 配置..."
|
||||
68
configuration/templates/disk-monitoring.conf.j2
Normal file
68
configuration/templates/disk-monitoring.conf.j2
Normal file
@@ -0,0 +1,68 @@
|
||||
# Disk monitoring configuration for Telegraf.
# Collects filesystem usage, inode and I/O metrics for all mounted filesystems.

# Filesystem usage.  The disk input also reports inode fields (inodes_total,
# inodes_used, inodes_free), so the former second [[inputs.disk]] block
# labelled "inode-monitoring" duplicated every metric and has been removed.
[[inputs.disk]]
  ## Pseudo/volatile filesystems to skip
  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
  ## No mount_points filter: monitor ALL mount points, as the original comment
  ## intended.  The previous hard-coded list ("/", "/var", ...) silently
  ## skipped any mount not named in it.

  [inputs.disk.tags]
    service = "disk-monitoring"

# Disk I/O statistics
[[inputs.diskio]]
  ## No devices filter: monitor all block devices.  The previous hard-coded
  ## list (sda..sdd, nvme0n1, nvme1n1) missed nodes with other device names
  ## (vda, xvda, mmcblk0, ...).
  ## Skip serial-number collection for performance
  skip_serial_number = true

  [inputs.diskio.tags]
    service = "disk-io-monitoring"

# Process monitoring for services that can consume significant disk space.
[[inputs.procstat]]
  ## Docker (legacy nodes that have not finished the Podman migration)
  pattern = "docker"

  [inputs.procstat.tags]
    service = "docker-process"

[[inputs.procstat]]
  ## Podman
  pattern = "podman"

  [inputs.procstat.tags]
    service = "podman-process"

[[inputs.procstat]]
  ## Nomad
  pattern = "nomad"

  [inputs.procstat.tags]
    service = "nomad-process"

# Log file size monitoring
[[inputs.filestat]]
  files = [
    "/var/log/nomad/*.log",
    "/var/log/syslog",
    "/var/log/kern.log",
    "/var/log/auth.log"
  ]

  [inputs.filestat.tags]
    service = "log-monitoring"
|
||||
68
configuration/templates/system-monitoring.conf.j2
Normal file
68
configuration/templates/system-monitoring.conf.j2
Normal file
@@ -0,0 +1,68 @@
|
||||
# System monitoring configuration: CPU, memory, network, load and services.

# CPU
[[inputs.cpu]]
  ## Collect per-core metrics
  percpu = true
  ## Collect aggregate CPU metrics
  totalcpu = true
  ## Raw cpu-time counters are not needed
  collect_cpu_time = false
  ## Do not report the "active" summary field
  report_active = false

  [inputs.cpu.tags]
    service = "cpu-monitoring"

# Memory
[[inputs.mem]]
  [inputs.mem.tags]
    service = "memory-monitoring"

# Network interfaces (physical + Tailscale)
[[inputs.net]]
  interfaces = ["eth*", "en*", "tailscale*"]

  [inputs.net.tags]
    service = "network-monitoring"

# System load / uptime
[[inputs.system]]
  [inputs.system.tags]
    service = "system-load"

# Kernel statistics
[[inputs.kernel]]
  [inputs.kernel.tags]
    service = "kernel-stats"

# Network protocol statistics
[[inputs.netstat]]
  [inputs.netstat.tags]
    service = "network-stats"

# Swap usage
[[inputs.swap]]
  [inputs.swap.tags]
    service = "swap-monitoring"

# Service state monitoring.  docker.service is kept alongside podman for
# nodes that have not finished the Docker -> Podman migration.
[[inputs.systemd_units]]
  units = ["nomad.service", "docker.service", "podman.service", "telegraf.service", "tailscaled.service"]

  [inputs.systemd_units.tags]
    service = "service-monitoring"

# SMART disk-health monitoring -- DISABLED by default.
# The smart input shells out to smartctl, which (a) is not installed by the
# monitoring playbook (package smartmontools) and (b) requires root/sudo,
# which the telegraf service user does not have.  As shipped, the plugin
# failed on every collection interval and spammed the journal.  To enable:
# install smartmontools, grant telegraf passwordless sudo for smartctl,
# then uncomment:
# [[inputs.smart]]
#   path_smartctl = "/usr/sbin/smartctl"
#   use_sudo = true
#   timeout = "30s"
#
#   [inputs.smart.tags]
#     service = "smart-monitoring"
|
||||
7
configuration/templates/telegraf-env.j2
Normal file
7
configuration/templates/telegraf-env.j2
Normal file
@@ -0,0 +1,7 @@
|
||||
# Telegraf environment variables: InfluxDB 2.x credentials.
# Rendered to /etc/default/telegraf (mode 0600, root-only -- contains the API
# token) and loaded by the service via EnvironmentFile=.
# Values are double-quoted so tokens containing '=', '+' or spaces survive
# systemd's EnvironmentFile parsing unchanged.

INFLUX_TOKEN="{{ influxdb_token }}"
INFLUX_ORG="{{ influxdb_org }}"
INFLUX_BUCKET="{{ influxdb_bucket }}"
INFLUX_URL="{{ influxdb_url }}"
|
||||
53
configuration/templates/telegraf.conf.j2
Normal file
53
configuration/templates/telegraf.conf.j2
Normal file
@@ -0,0 +1,53 @@
|
||||
# Telegraf main configuration for Nomad cluster disk monitoring.

# Tags attached to every metric emitted by this node
[global_tags]
  nomad_cluster = "production"
  node_role = "{{ nomad_role | default('unknown') }}"
  hostname = "{{ inventory_hostname }}"

# Agent configuration
[agent]
  interval = "{{ collection_interval | default(30) }}s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "2s"
  flush_interval = "10s"
  flush_jitter = "0s"
  precision = ""
  hostname = "{{ inventory_hostname }}"
  omit_hostname = false

  ## Logging -- configured here because Telegraf has no top-level [log]
  ## table; the previous "[log]" section (with non-existent "level" and
  ## "logrotate" keys, and the invalid logtarget value "syslog") was never
  ## applied.  Valid logtargets are "file", "stderr" and (Windows)
  ## "eventlog".  Logging to stderr sends messages to journald/syslog and
  ## writes no local file, which keeps disk usage down; quiet=true limits
  ## output to errors only.
  logtarget = "stderr"
  logfile = ""
  quiet = true

# Output: InfluxDB 2.x
[[outputs.influxdb_v2]]
  urls = ["{{ influxdb_url }}"]
  token = "{{ influxdb_token }}"
  organization = "{{ influxdb_org | default('nomad') }}"
  bucket = "{{ influxdb_bucket | default('nomad_monitoring') }}"

  ## HTTP timeout for writes
  timeout = "10s"

  ## NOTE(review): "max_retries", "retry_timeout" and "precision" are not
  ## options of the influxdb_v2 output plugin (retries are handled by the
  ## agent's metric buffer; precision is an [agent] setting) -- left
  ## commented to avoid unrecognized-option warnings.  Confirm against the
  ## deployed Telegraf version before re-enabling.
  # max_retries = 3
  # retry_timeout = "5s"
  # precision = "s"

  ## TLS (enable if the endpoint uses HTTPS with a private CA)
  # tls_ca = "/etc/telegraf/ca.pem"
  # tls_cert = "/etc/telegraf/cert.pem"
  # tls_key = "/etc/telegraf/key.pem"
  # insecure_skip_verify = false
|
||||
29
configuration/templates/telegraf.service.j2
Normal file
29
configuration/templates/telegraf.service.j2
Normal file
@@ -0,0 +1,29 @@
|
||||
[Unit]
Description=Telegraf - 节点监控服务
Documentation=https://github.com/influxdata/telegraf
# The configuration is fetched from a remote URL at startup, so the unit must
# wait for actual network connectivity -- network.target alone does not
# guarantee routes/DNS are up and caused start failures on boot.
Wants=network-online.target
After=network-online.target

[Service]
Type=notify
User=telegraf
Group=telegraf
ExecStart=/usr/bin/telegraf --config {{ telegraf_config_url }}
ExecReload=/bin/kill -HUP $MAINPID
KillMode=control-group
Restart=on-failure
RestartSec=5
TimeoutStopSec=20
# '-' prefix: do not fail startup if the env file is missing
EnvironmentFile=-/etc/default/telegraf

# Sandboxing
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
# StateDirectory creates /var/lib/telegraf with correct ownership if absent;
# under ProtectSystem=strict the ReadWritePaths entry alone fails when the
# directory does not exist.
StateDirectory=telegraf
ReadWritePaths=/var/lib/telegraf
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true

[Install]
WantedBy=multi-user.target
|
||||
Reference in New Issue
Block a user