feat(monitoring): add Telegraf monitoring config and disk monitoring scripts

refactor(containers): migrate from Docker to Podman and update Nomad config

fix(config): fix proxy and alias configuration issues

docs: update configuration files and script comments

chore(cleanup): remove unused Consul and Docker related files
2025-09-24 03:46:30 +00:00
parent 3f45ad8361
commit d0e7f64c1d
92 changed files with 3552 additions and 7737 deletions

View File

@@ -0,0 +1,46 @@
#!/bin/bash
# Nomad cluster disk monitoring deployment script
# Uses the existing InfluxDB + Grafana monitoring stack
echo "🚀 Starting Nomad cluster disk monitoring deployment..."
# Check that the configuration file exists
if [[ ! -f "inventories/production/group_vars/all.yml" ]]; then
echo "❌ Configuration file not found; set up the InfluxDB connection first"
exit 1
fi
# Show the current configuration
echo "📋 Current monitoring configuration:"
grep -E "influxdb_|disk_usage_|collection_interval" inventories/production/group_vars/all.yml
echo ""
read -p "🤔 Proceed with this configuration? (y/N): " confirm
if [[ $confirm != [yY] ]]; then
echo "❌ Deployment cancelled; adjust the configuration and retry"
exit 1
fi
# Deploy to all nodes
echo "📦 Installing Telegraf on all nodes..."
ansible-playbook -i inventories/production/nomad-cluster.ini playbooks/setup-disk-monitoring.yml
# Check the deployment result
if [[ $? -eq 0 ]]; then
echo "✅ Disk monitoring deployed!"
echo ""
echo "📊 Monitoring details:"
echo "- Data is sent to your existing InfluxDB"
echo "- Dashboards can be built in Grafana to view the data"
echo "- Local log files are disabled to save disk space"
echo "- Metrics are collected every 30 seconds"
echo ""
echo "🔧 Next steps:"
echo "1. Create a Nomad cluster monitoring dashboard in Grafana"
echo "2. Set up disk usage alert rules"
echo "3. Check monitoring status with:"
echo "   ansible all -i inventories/production/nomad-cluster.ini -m shell -a 'systemctl status telegraf'"
else
echo "❌ Deployment failed; check the error output"
exit 1
fi
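A quick end-to-end check after a successful run is to query the InfluxDB 2.x API for fresh disk points; a minimal sketch, assuming the org "nomad" and bucket "nomad_monitoring" from inventories/production/group_vars/all.yml:

# Pull the last 5 minutes of disk metrics straight from the query API (returns annotated CSV)
INFLUX_TOKEN="<token from group_vars/all.yml>"
curl -s --request POST "http://influxdb1.tailnet-68f9.ts.net:8086/api/v2/query?org=nomad" \
  --header "Authorization: Token $INFLUX_TOKEN" \
  --header "Content-Type: application/vnd.flux" \
  --data 'from(bucket: "nomad_monitoring") |> range(start: -5m) |> filter(fn: (r) => r._measurement == "disk") |> limit(n: 5)'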

View File

@@ -0,0 +1,40 @@
#!/bin/bash
# Quick-deploy Telegraf monitoring using a remote InfluxDB 2.x configuration
echo "🚀 Deploying Telegraf monitoring from the remote InfluxDB 2.x configuration..."
# Variables
INFLUX_TOKEN="VU_dOCVZzqEHb9jSFsDe0bJlEBaVbiG4LqfoczlnmcbfrbmklSt904HJPL4idYGvVi0c2eHkYDi2zCTni7Ay4w=="
TELEGRAF_CONFIG_URL="http://influxdb1.tailnet-68f9.ts.net:8086/api/v2/telegrafs/0f8a73496790c000"
# Check network connectivity
echo "🔍 Checking the InfluxDB connection..."
if curl -s --max-time 5 "http://influxdb1.tailnet-68f9.ts.net:8086/health" > /dev/null; then
echo "✅ InfluxDB is reachable"
else
echo "❌ Cannot reach InfluxDB; check the network"
exit 1
fi
# Deploy using the remote configuration
echo "📦 Deploying to all nodes..."
ansible-playbook -i inventories/production/nomad-cluster.ini playbooks/setup-disk-monitoring.yml \
-e "use_remote_config=true" \
-e "telegraf_config_url=$TELEGRAF_CONFIG_URL" \
-e "influxdb_token=$INFLUX_TOKEN"
# Check the deployment result
if [[ $? -eq 0 ]]; then
echo "✅ Telegraf monitoring deployed!"
echo ""
echo "📊 Configuration details:"
echo "- Remote config: $TELEGRAF_CONFIG_URL"
echo "- InfluxDB server: influxdb1.tailnet-68f9.ts.net:8086"
echo "- Local log files disabled"
echo ""
echo "🔧 Verify the deployment:"
echo "ansible all -i inventories/production/nomad-cluster.ini -m shell -a 'systemctl status telegraf --no-pager'"
else
echo "❌ Deployment failed; check the error output"
exit 1
fi
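Before trusting the service unit, the same remote config can be exercised by hand on any node; a sketch of the manual equivalent (Telegraf reads INFLUX_TOKEN when fetching an InfluxDB-hosted config URL; assumes telegraf is already on PATH):

# Fetch the remote config and run the inputs once, printing metrics to stdout instead of InfluxDB
export INFLUX_TOKEN="VU_dOCVZzqEHb9jSFsDe0bJlEBaVbiG4LqfoczlnmcbfrbmklSt904HJPL4idYGvVi0c2eHkYDi2zCTni7Ay4w=="
telegraf --config "http://influxdb1.tailnet-68f9.ts.net:8086/api/v2/telegrafs/0f8a73496790c000" --test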

View File

@@ -1,14 +0,0 @@
{
"proxies": {
"http-proxy": "http://istoreos.tailnet-68f9.ts.net:7891",
"https-proxy": "http://istoreos.tailnet-68f9.ts.net:7891",
"no-proxy": "localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net"
},
"registry-mirrors": [],
"insecure-registries": [],
"debug": false,
"experimental": false,
"features": {
"buildkit": true
}
}

View File

@@ -0,0 +1,20 @@
# Nomad cluster global configuration
# InfluxDB 2.x + Grafana monitoring settings
# InfluxDB 2.x connection settings
influxdb_url: "http://influxdb1.tailnet-68f9.ts.net:8086"
influxdb_token: "VU_dOCVZzqEHb9jSFsDe0bJlEBaVbiG4LqfoczlnmcbfrbmklSt904HJPL4idYGvVi0c2eHkYDi2zCTni7Ay4w=="
influxdb_org: "nomad" # organization name
influxdb_bucket: "nomad_monitoring" # bucket name
# Remote Telegraf configuration URL
telegraf_config_url: "http://influxdb1.tailnet-68f9.ts.net:8086/api/v2/telegrafs/0f8a73496790c000"
# Monitoring thresholds
disk_usage_warning: 80 # disk usage warning threshold (%)
disk_usage_critical: 90 # disk usage critical threshold (%)
collection_interval: 30 # collection interval (seconds)
# Telegraf tuning
telegraf_log_level: "ERROR" # log errors only
telegraf_disable_local_logs: true # disable local log files
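These values can also be overridden for a single run without editing the file, e.g.:

ansible-playbook -i inventories/production/nomad-cluster.ini playbooks/setup-disk-monitoring.yml \
  -e "disk_usage_warning=85" -e "collection_interval=60"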

View File

@@ -1,10 +1,20 @@
[nomad_servers]
master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3
ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=6
ash2e ansible_host=ash2e ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6
ash1d ansible_host=ash1d ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6
ch2 ansible_host=ch2 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6
ch3 ansible_host=ch3 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6
# Newly added Mac and Windows nodes (replace with the actual Tailscale IPs)
mac-laptop ansible_host=100.xxx.xxx.xxx ansible_user=your_mac_user nomad_role=server nomad_bootstrap_expect=6
win-laptop ansible_host=100.xxx.xxx.xxx ansible_user=your_win_user nomad_role=server nomad_bootstrap_expect=6
[nomad_clients]
# Add client nodes here if needed
master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
syd ansible_host=100.117.137.105 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
[nomad_cluster:children]
nomad_servers

View File

@@ -0,0 +1,22 @@
[nomad_servers]
master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3
ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
[nomad_clients]
hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
[nomad_cluster:children]
nomad_servers
nomad_clients
[nomad_cluster:vars]
ansible_ssh_private_key_file=~/.ssh/id_ed25519
ansible_user=ben
ansible_become=yes
nomad_version=1.10.5
nomad_datacenter=dc1
nomad_region=global
nomad_encrypt_key=NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=

View File

@@ -0,0 +1,23 @@
[nomad_servers]
master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3
ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
[nomad_clients]
hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client
hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
syd ansible_host=100.117.137.105 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client
[nomad_cluster:children]
nomad_servers
nomad_clients
[nomad_cluster:vars]
ansible_ssh_private_key_file=~/.ssh/id_ed25519
ansible_user=ben
ansible_become=yes
nomad_version=1.10.5
nomad_datacenter=dc1
nomad_region=global
nomad_encrypt_key=NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=
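Assuming this is the nomad-cluster.ini the deploy scripts reference, a quick reachability check across the whole inventory before running anything heavier:

ansible nomad_cluster -i inventories/production/nomad-cluster.ini -m ping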

View File

@@ -1,183 +0,0 @@
---
- name: Setup Automated Maintenance Cron Jobs
hosts: localhost
gather_facts: no
vars:
# Scheduled job definitions
cron_jobs:
# Daily quick health check
- name: "Daily system health check"
job: "cd /root/mgmt && ./scripts/ops-manager.sh toolkit all --check > /var/log/daily-health-check.log 2>&1"
minute: "0"
hour: "8"
day: "*"
month: "*"
weekday: "*"
# Weekly system cleanup
- name: "Weekly system cleanup"
job: "cd /root/mgmt && ./scripts/ops-manager.sh cleanup all > /var/log/weekly-cleanup.log 2>&1"
minute: "0"
hour: "2"
day: "*"
month: "*"
weekday: "0" # Sunday
# Monthly security check
- name: "Monthly security hardening check"
job: "cd /root/mgmt && ./scripts/ops-manager.sh security all --check > /var/log/monthly-security-check.log 2>&1"
minute: "0"
hour: "3"
day: "1"
month: "*"
weekday: "*"
# Weekly certificate check
- name: "Weekly certificate check"
job: "cd /root/mgmt && ./scripts/ops-manager.sh cert all > /var/log/weekly-cert-check.log 2>&1"
minute: "30"
hour: "4"
day: "*"
month: "*"
weekday: "1" # Monday
# Daily Docker cleanup (LXC group only)
- name: "Daily Docker cleanup for LXC"
job: "cd /root/mgmt && ansible lxc -i ansible/inventory.ini -m shell -a 'docker system prune -f' --become -e 'ansible_ssh_pass=313131' > /var/log/daily-docker-cleanup.log 2>&1"
minute: "0"
hour: "1"
day: "*"
month: "*"
weekday: "*"
# Weekly network connectivity check
- name: "Weekly network connectivity check"
job: "cd /root/mgmt && ./scripts/ops-manager.sh network all > /var/log/weekly-network-check.log 2>&1"
minute: "0"
hour: "6"
day: "*"
month: "*"
weekday: "2" # Tuesday
tasks:
# Create the log directory
- name: Create log directory
file:
path: /var/log/ansible-automation
state: directory
mode: '0755'
become: yes
# Make the ops script executable
- name: Make ops-manager.sh executable
file:
path: /root/mgmt/scripts/ops-manager.sh
mode: '0755'
# Create the cron jobs
- name: Setup cron jobs for automated maintenance
cron:
name: "{{ item.name }}"
job: "{{ item.job }}"
minute: "{{ item.minute }}"
hour: "{{ item.hour }}"
day: "{{ item.day }}"
month: "{{ item.month }}"
weekday: "{{ item.weekday }}"
user: root
loop: "{{ cron_jobs }}"
become: yes
# Configure log rotation
- name: Setup log rotation for automation logs
copy:
content: |
/var/log/*-health-check.log
/var/log/*-cleanup.log
/var/log/*-security-check.log
/var/log/*-cert-check.log
/var/log/*-docker-cleanup.log
/var/log/*-network-check.log {
daily
missingok
rotate 30
compress
delaycompress
notifempty
copytruncate
}
dest: /etc/logrotate.d/ansible-automation
mode: '0644'
become: yes
# Create the monitoring dashboard script
- name: Create monitoring dashboard script
copy:
content: |
#!/bin/bash
# Automation Monitoring Dashboard
echo "🤖 Ansible Automation Status Dashboard"
echo "======================================"
echo ""
echo "📅 Last Execution Times:"
echo "------------------------"
for log in /var/log/*-check.log /var/log/*-cleanup.log; do
if [ -f "$log" ]; then
echo "$(basename "$log" .log): $(stat -c %y "$log" | cut -d. -f1)"
fi
done
echo ""
echo "📊 Recent Log Summary:"
echo "---------------------"
for log in /var/log/daily-health-check.log /var/log/weekly-cleanup.log; do
if [ -f "$log" ]; then
echo "=== $(basename "$log") ==="
tail -5 "$log" | grep -E "(TASK|PLAY RECAP|ERROR|WARNING)" || echo "No recent activity"
echo ""
fi
done
echo "⏰ Next Scheduled Jobs:"
echo "----------------------"
crontab -l | grep -E "(health|cleanup|security|cert|docker|network)" | while read line; do
echo "$line"
done
echo ""
echo "💾 Log File Sizes:"
echo "-----------------"
ls -lh /var/log/*-*.log 2>/dev/null | awk '{print $5, $9}' || echo "No log files found"
dest: /usr/local/bin/automation-status
mode: '0755'
become: yes
# Show the completion summary
- name: Display setup completion info
debug:
msg: |
🎉 Automated maintenance cron jobs are set up!
📋 Configured schedules:
• Daily 08:00 - system health check
• Daily 01:00 - Docker cleanup (LXC group)
• Sunday 02:00 - system cleanup
• Monday 04:30 - certificate check
• Tuesday 06:00 - network connectivity check
• 1st of each month 03:00 - security check
📊 Monitoring commands:
• Status: automation-status
• Cron jobs: crontab -l
• Logs: tail -f /var/log/daily-health-check.log
📁 Log location: /var/log/
🔄 Log rotation: purged automatically after 30 days
💡 Manual run examples:
• ./scripts/ops-manager.sh toolkit all
• ./scripts/ops-manager.sh cleanup lxc
• ./scripts/ops-manager.sh health proxmox

View File

@@ -1,175 +0,0 @@
---
- name: Bootstrap Infrastructure
hosts: all
become: yes
gather_facts: yes
vars:
# Base packages
base_packages:
- curl
- wget
- git
- vim
- htop
- tree
- unzip
- jq
- python3
- python3-pip
- apt-transport-https
- ca-certificates
- gnupg
- lsb-release
# Docker settings
docker_users:
- "{{ ansible_user }}"
# System settings
timezone: "Asia/Shanghai"
tasks:
- name: Update package cache
apt:
update_cache: yes
cache_valid_time: 3600
when: ansible_os_family == "Debian"
- name: Install base packages
package:
name: "{{ base_packages }}"
state: present
- name: Set timezone
timezone:
name: "{{ timezone }}"
- name: Create system users
user:
name: "{{ ansible_user }}"
groups: sudo
shell: /bin/bash
create_home: yes
when: ansible_user != "root"
- name: Configure SSH
lineinfile:
path: /etc/ssh/sshd_config
regexp: "{{ item.regexp }}"
line: "{{ item.line }}"
backup: yes
loop:
- { regexp: '^#?PermitRootLogin', line: 'PermitRootLogin no' }
- { regexp: '^#?PasswordAuthentication', line: 'PasswordAuthentication no' }
- { regexp: '^#?PubkeyAuthentication', line: 'PubkeyAuthentication yes' }
notify: restart ssh
when: ansible_user != "root"
- name: Install Docker
block:
- name: Add Docker GPG key
apt_key:
url: https://download.docker.com/linux/ubuntu/gpg
state: present
- name: Add Docker repository
apt_repository:
repo: "deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
state: present
- name: Install Docker
package:
name:
- docker-ce
- docker-ce-cli
- containerd.io
- docker-compose-plugin
state: present
- name: Add users to docker group
user:
name: "{{ item }}"
groups: docker
append: yes
loop: "{{ docker_users }}"
- name: Start and enable Docker
systemd:
name: docker
state: started
enabled: yes
- name: Install Docker Compose (standalone)
get_url:
url: "https://github.com/docker/compose/releases/latest/download/docker-compose-linux-x86_64"
dest: /usr/local/bin/docker-compose
mode: '0755'
- name: Configure firewall
ufw:
rule: "{{ item.rule }}"
port: "{{ item.port }}"
proto: "{{ item.proto | default('tcp') }}"
loop:
- { rule: 'allow', port: '22' }
- { rule: 'allow', port: '80' }
- { rule: 'allow', port: '443' }
notify: enable ufw
- name: Create application directories
file:
path: "{{ item }}"
state: directory
owner: "{{ ansible_user }}"
group: "{{ ansible_user }}"
mode: '0755'
loop:
- /opt/apps
- /opt/data
- /opt/logs
- /opt/backups
- /opt/scripts
- name: Install monitoring tools
package:
name:
- htop
- iotop
- nethogs
- ncdu
- tmux
state: present
- name: Configure system limits
pam_limits:
domain: '*'
limit_type: "{{ item.type }}"
limit_item: "{{ item.item }}"
value: "{{ item.value }}"
loop:
- { type: 'soft', item: 'nofile', value: '65536' }
- { type: 'hard', item: 'nofile', value: '65536' }
- { type: 'soft', item: 'nproc', value: '32768' }
- { type: 'hard', item: 'nproc', value: '32768' }
- name: Configure sysctl
sysctl:
name: "{{ item.name }}"
value: "{{ item.value }}"
state: present
reload: yes
loop:
- { name: 'vm.max_map_count', value: '262144' }
- { name: 'fs.file-max', value: '2097152' }
- { name: 'net.core.somaxconn', value: '32768' }
handlers:
- name: restart ssh
systemd:
name: ssh
state: restarted
- name: enable ufw
ufw:
state: enabled

View File

@@ -1,83 +0,0 @@
---
- name: System Cleanup and Maintenance
hosts: all
become: yes
gather_facts: yes
tasks:
# Clean package cache and orphaned packages
- name: Clean package cache (Debian/Ubuntu)
apt:
autoclean: yes
autoremove: yes
when: ansible_os_family == "Debian"
- name: Remove orphaned packages (Debian/Ubuntu)
shell: apt-get autoremove --purge -y
when: ansible_os_family == "Debian"
# Clean up log files
- name: Clean old journal logs (keep 7 days)
shell: journalctl --vacuum-time=7d
- name: Clean old log files
find:
paths: /var/log
patterns: "*.log.*,*.gz"
age: "7d"
recurse: yes
register: old_logs
- name: Remove old log files
file:
path: "{{ item.path }}"
state: absent
loop: "{{ old_logs.files }}"
when: old_logs.files is defined
# Clean temporary files
- name: Clean /tmp directory (files older than 7 days)
find:
paths: /tmp
age: "7d"
recurse: yes
register: tmp_files
- name: Remove old temp files
file:
path: "{{ item.path }}"
state: absent
loop: "{{ tmp_files.files }}"
when: tmp_files.files is defined
# Docker cleanup (if installed)
- name: Check if Docker is installed
command: which docker
register: docker_check
failed_when: false
changed_when: false
- name: Clean Docker system
shell: |
docker system prune -f
docker image prune -f
docker volume prune -f
when: docker_check.rc == 0
# Disk space check
- name: Check disk usage
shell: df -h
register: disk_usage
- name: Display disk usage
debug:
msg: "{{ disk_usage.stdout_lines }}"
# Memory usage check
- name: Check memory usage
shell: free -h
register: memory_usage
- name: Display memory usage
debug:
msg: "{{ memory_usage.stdout_lines }}"

View File

@@ -1,43 +0,0 @@
---
- name: System Update Playbook
hosts: all
become: yes
gather_facts: yes
tasks:
- name: Wait for automatic system updates to complete
shell: while fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do sleep 5; done
when: ansible_os_family == "Debian"
- name: Update apt cache
apt:
update_cache: yes
cache_valid_time: 3600
when: ansible_os_family == "Debian"
retries: 3
delay: 10
- name: Upgrade all packages
apt:
upgrade: yes
autoremove: yes
autoclean: yes
when: ansible_os_family == "Debian"
register: upgrade_result
retries: 3
delay: 10
- name: Display upgrade results
debug:
msg: "System upgrade completed. {{ upgrade_result.changed }} packages were updated."
- name: Check if reboot is required
stat:
path: /var/run/reboot-required
register: reboot_required
when: ansible_os_family == "Debian"
- name: Notify if reboot is required
debug:
msg: "System reboot is required to complete the update."
when: reboot_required.stat.exists is defined and reboot_required.stat.exists

View File

@@ -0,0 +1,81 @@
---
- name: Clear all aliases on hcp1 and hcp2
hosts: hcp1,hcp2
become: yes
tasks:
- name: Check current aliases
shell: alias || echo "No aliases found"
register: current_aliases
- name: Display current aliases
debug:
msg: "Current aliases: {{ current_aliases.stdout_lines }}"
- name: Clear aliases from /root/.bashrc
shell: |
sed -i '/^alias /d' /root/.bashrc
sed -i '/^alias\t/d' /root/.bashrc
ignore_errors: yes
- name: Clear aliases from /root/.profile
shell: |
sed -i '/^alias /d' /root/.profile
sed -i '/^alias\t/d' /root/.profile
ignore_errors: yes
- name: Clear aliases from /root/.zshrc
shell: |
sed -i '/^alias /d' /root/.zshrc
sed -i '/^alias\t/d' /root/.zshrc
ignore_errors: yes
- name: Clear aliases from /etc/bash.bashrc
shell: |
sed -i '/^alias /d' /etc/bash.bashrc
sed -i '/^alias\t/d' /etc/bash.bashrc
ignore_errors: yes
- name: Clear aliases from /etc/profile
shell: |
sed -i '/^alias /d' /etc/profile
sed -i '/^alias\t/d' /etc/profile
ignore_errors: yes
- name: Find and clear custom alias files
find:
paths: ["/root", "/etc", "/home"]
patterns: ["*.aliases", ".aliases", "aliases"]
recurse: yes
register: alias_files
- name: Remove found alias files
file:
path: "{{ item.path }}"
state: absent
loop: "{{ alias_files.files }}"
when: alias_files.files is defined
- name: Clear shell history to remove alias commands
shell: |
> /root/.bash_history
> /root/.zsh_history
history -c
ignore_errors: yes
- name: Unalias all current aliases
shell: unalias -a
ignore_errors: yes
- name: Restart shell services
shell: |
pkill -f bash || true
pkill -f zsh || true
- name: Test network connectivity after clearing aliases
shell: ping -c 2 8.8.8.8 || echo "Ping failed"
register: ping_test
- name: Display ping test result
debug:
msg: "Ping test: {{ ping_test.stdout_lines }}"

View File

@@ -0,0 +1,76 @@
---
- name: Clear proxy settings on hcp1 and hcp2
hosts: hcp1,hcp2
become: yes
tasks:
- name: Check current proxy environment variables
shell: env | grep -i proxy || echo "No proxy vars found"
register: proxy_env_before
- name: Display current proxy settings
debug:
msg: "Current proxy env: {{ proxy_env_before.stdout_lines }}"
- name: Clear proxy from /etc/environment
lineinfile:
path: /etc/environment
regexp: "{{ item }}"
state: absent
loop:
- "^http_proxy="
- "^https_proxy="
- "^HTTP_PROXY="
- "^HTTPS_PROXY="
- "^ftp_proxy="
- "^FTP_PROXY="
- "^no_proxy="
- "^NO_PROXY="
- name: Clear proxy from /etc/apt/apt.conf.d/
file:
path: "{{ item }}"
state: absent
loop:
- /etc/apt/apt.conf.d/95proxies
- /etc/apt/apt.conf.d/proxy.conf
- /etc/apt/apt.conf.d/00proxy
- name: Clear proxy from user profiles
lineinfile:
path: "{{ item }}"
regexp: ".*proxy.*"
state: absent
loop:
- /root/.bashrc
- /root/.profile
- /home/root/.bashrc
- /home/root/.profile
ignore_errors: yes
- name: Unset proxy variables in current session
shell: |
unset http_proxy
unset https_proxy
unset HTTP_PROXY
unset HTTPS_PROXY
unset ftp_proxy
unset FTP_PROXY
unset no_proxy
unset NO_PROXY
- name: Check APT proxy configuration
shell: apt-config dump | grep -i proxy || echo "No APT proxy found"
register: apt_proxy_check
- name: Display APT proxy status
debug:
msg: "APT proxy config: {{ apt_proxy_check.stdout_lines }}"
- name: Test direct connection to HashiCorp
shell: curl -I --connect-timeout 10 https://releases.hashicorp.com/ || echo "Connection failed"
register: connection_test
- name: Display connection test result
debug:
msg: "Connection test: {{ connection_test.stdout_lines }}"

View File

@@ -0,0 +1,57 @@
---
- name: Configure Podman driver for all Nomad client nodes
hosts: nomad_clients,nomad_servers
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Install Podman if not present
package:
name: podman
state: present
ignore_errors: yes
- name: Enable Podman socket
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes
- name: Update Nomad configuration to use Podman
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^plugin "docker"'
line: 'plugin "podman" {'
state: present
- name: Add Podman plugin configuration
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} PODMAN PLUGIN CONFIG"
block: |
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
insertafter: 'client {'
- name: Start Nomad service
systemd:
name: nomad
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 5
timeout: 30
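Once the play finishes, driver health can be confirmed from the CLI on any node; a small check (assumes jq is installed, as the later playbooks do):

# "podman" should appear among the detected drivers on the local node
nomad node status -self -json | jq -r '.Drivers | keys[]'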

View File

@@ -0,0 +1,217 @@
---
- name: Configure the Nomad cluster to communicate over the Tailscale network
hosts: nomad_cluster
become: yes
gather_facts: no
vars:
nomad_config_dir: "/etc/nomad.d"
nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl"
tasks:
- name: Get the current node's Tailscale IP
shell: tailscale ip | head -1
register: current_tailscale_ip
failed_when: current_tailscale_ip.rc != 0
- name: Ensure the Nomad config directory exists
file:
path: "{{ nomad_config_dir }}"
state: directory
owner: root
group: root
mode: '0755'
- name: Generate the Nomad server configuration (Tailscale)
copy:
dest: "{{ nomad_config_file }}"
owner: root
group: root
mode: '0644'
content: |
datacenter = "{{ nomad_datacenter | default('dc1') }}"
data_dir = "/opt/nomad/data"
log_level = "INFO"
bind_addr = "{{ current_tailscale_ip.stdout }}"
addresses {
http = "0.0.0.0"
rpc = "{{ current_tailscale_ip.stdout }}"
serf = "{{ current_tailscale_ip.stdout }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = true
bootstrap_expect = {{ nomad_bootstrap_expect | default(4) }}
retry_join = [
"100.116.158.95", # semaphore
"100.103.147.94", # ash2e
"100.81.26.3", # ash1d
"100.90.159.68" # ch2
]
encrypt = "{{ nomad_encrypt_key }}"
}
client {
enabled = false
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "{{ current_tailscale_ip.stdout }}:8500"
}
when: nomad_role == "server"
- name: Generate the Nomad client configuration (Tailscale)
copy:
dest: "{{ nomad_config_file }}"
owner: root
group: root
mode: '0644'
content: |
datacenter = "{{ nomad_datacenter | default('dc1') }}"
data_dir = "/opt/nomad/data"
log_level = "INFO"
bind_addr = "{{ current_tailscale_ip.stdout }}"
addresses {
http = "0.0.0.0"
rpc = "{{ current_tailscale_ip.stdout }}"
serf = "{{ current_tailscale_ip.stdout }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = false
}
client {
enabled = true
servers = [
"100.116.158.95:4647", # semaphore
"100.103.147.94:4647", # ash2e
"100.81.26.3:4647", # ash1d
"100.90.159.68:4647" # ch2
]
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "{{ current_tailscale_ip.stdout }}:8500"
}
when: nomad_role == "client"
- name: Locate the Nomad binary
shell: which nomad || find /usr -name nomad 2>/dev/null | head -1
register: nomad_binary_path
failed_when: nomad_binary_path.stdout == ""
- name: Create/update the Nomad systemd service file
copy:
dest: "/etc/systemd/system/nomad.service"
owner: root
group: root
mode: '0644'
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
[Service]
Type=notify
User=root
Group=root
ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
notify: restart nomad
- name: Ensure the Nomad data directory exists
file:
path: "/opt/nomad/data"
state: directory
owner: root
group: root
mode: '0755'
- name: Reload the systemd daemon
systemd:
daemon_reload: yes
- name: Enable and start the Nomad service
systemd:
name: nomad
enabled: yes
state: started
- name: Wait for the Nomad service to start
wait_for:
port: 4646
host: "{{ current_tailscale_ip.stdout }}"
delay: 5
timeout: 30
ignore_errors: yes
- name: Check the Nomad service status
shell: systemctl status nomad --no-pager -l
register: nomad_status
ignore_errors: yes
- name: Show the configuration result
debug:
msg: |
✅ Node {{ inventory_hostname }} configured
🌐 Tailscale IP: {{ current_tailscale_ip.stdout }}
🎯 Role: {{ nomad_role }}
🔧 Nomad binary: {{ nomad_binary_path.stdout }}
📊 Service status: {{ 'active' if nomad_status.rc == 0 else 'failed' }}
{% if nomad_status.rc != 0 %}
❌ Error details:
{{ nomad_status.stdout }}
{{ nomad_status.stderr }}
{% endif %}
handlers:
- name: restart nomad
systemd:
name: nomad
state: restarted
daemon_reload: yes
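A sanity check that the agent really bound to the Tailscale address and that the servers formed a quorum; a sketch to run on any server node:

TS_IP=$(tailscale ip | head -1)
curl -s "http://${TS_IP}:4646/v1/status/leader"         # should print the elected leader's RPC address
nomad server members -address="http://${TS_IP}:4646"    # every server should report status "alive"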

View File

@@ -0,0 +1,60 @@
---
- name: Debug Nomad Podman Driver Issues
hosts: all
become: yes
vars:
nomad_user: nomad
tasks:
- name: Check Nomad configuration
shell: cat /etc/nomad.d/nomad.hcl
register: nomad_config
- name: Display Nomad configuration
debug:
var: nomad_config.stdout_lines
- name: Check plugin directory contents
shell: ls -la /opt/nomad/data/plugins/
register: plugin_dir
- name: Display plugin directory
debug:
var: plugin_dir.stdout_lines
- name: Check Nomad logs for plugin loading
shell: journalctl -u nomad -n 50 --no-pager | grep -E "(plugin|driver|podman)"
register: nomad_logs
failed_when: false
- name: Display relevant Nomad logs
debug:
var: nomad_logs.stdout_lines
- name: Check if plugin is executable
stat:
path: /opt/nomad/data/plugins/nomad-driver-podman
register: plugin_stat
- name: Display plugin file info
debug:
var: plugin_stat
- name: Test plugin directly
shell: /opt/nomad/data/plugins/nomad-driver-podman --version
register: plugin_version
failed_when: false
become_user: "{{ nomad_user }}"
- name: Display plugin version
debug:
msg: "Plugin version test: {{ 'SUCCESS' if plugin_version.rc == 0 else 'FAILED' }} - {{ plugin_version.stdout if plugin_version.rc == 0 else plugin_version.stderr }}"
- name: Check Podman socket accessibility
shell: sudo -u {{ nomad_user }} curl --unix-socket /run/user/1001/podman/podman.sock http://localhost/v1.0.0/libpod/info 2>/dev/null | head -3
register: podman_socket_test
failed_when: false
- name: Display Podman socket test
debug:
msg: "Podman socket test: {{ 'SUCCESS' if podman_socket_test.rc == 0 else 'FAILED' }}"

View File

@@ -0,0 +1,168 @@
---
- name: Disk space analysis with ncdu
hosts: all
become: yes
vars:
ncdu_scan_paths:
- "/"
- "/var"
- "/opt"
- "/home"
output_dir: "/tmp/disk-analysis"
tasks:
- name: Install ncdu
package:
name: ncdu
state: present
register: ncdu_install
- name: Create the output directory
file:
path: "{{ output_dir }}"
state: directory
mode: '0755'
- name: Check disk usage
shell: df -h
register: disk_usage
- name: Display current disk usage
debug:
msg: |
=== Disk usage on {{ inventory_hostname }} ===
{{ disk_usage.stdout }}
- name: Scan the root filesystem with ncdu and export the result
shell: |
ncdu -x -o {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json /
async: 300
poll: 0
register: ncdu_root_scan
- name: Scan /var with ncdu
shell: |
ncdu -x -o {{ output_dir }}/ncdu-var-{{ inventory_hostname }}.json /var
async: 180
poll: 0
register: ncdu_var_scan
when: ansible_mounts | selectattr('mount', 'equalto', '/var') | list | length > 0 or '/var' in ansible_mounts | map(attribute='mount') | list
- name: Scan /opt with ncdu
shell: |
ncdu -x -o {{ output_dir }}/ncdu-opt-{{ inventory_hostname }}.json /opt
async: 120
poll: 0
register: ncdu_opt_scan
when: ansible_mounts | selectattr('mount', 'equalto', '/opt') | list | length > 0 or '/opt' in ansible_mounts | map(attribute='mount') | list
- name: Wait for the root scan to finish
async_status:
jid: "{{ ncdu_root_scan.ansible_job_id }}"
register: ncdu_root_result
until: ncdu_root_result.finished
retries: 60
delay: 5
- name: Wait for the /var scan to finish
async_status:
jid: "{{ ncdu_var_scan.ansible_job_id }}"
register: ncdu_var_result
until: ncdu_var_result.finished
retries: 36
delay: 5
when: ncdu_var_scan is defined and ncdu_var_scan.ansible_job_id is defined
- name: Wait for the /opt scan to finish
async_status:
jid: "{{ ncdu_opt_scan.ansible_job_id }}"
register: ncdu_opt_result
until: ncdu_opt_result.finished
retries: 24
delay: 5
when: ncdu_opt_scan is defined and ncdu_opt_scan.ansible_job_id is defined
- name: Generate the disk usage report
shell: |
echo "=== Disk analysis report for {{ inventory_hostname }} ===" > {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "Generated: $(date)" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "=== Disk usage ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
df -h >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "=== Largest directories (top 10) ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
du -h --max-depth=2 / 2>/dev/null | sort -hr | head -10 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "=== Largest files under /var ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
find /var -type f -size +100M -exec ls -lh {} \; 2>/dev/null | head -10 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "=== /tmp usage ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
du -sh /tmp/* 2>/dev/null | sort -hr | head -5 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "=== Large log files ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
find /var/log -name "*.log" -type f -size +50M -exec ls -lh {} \; 2>/dev/null >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
- name: Show the analysis report
shell: cat {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
register: disk_report
- name: Output the disk analysis result
debug:
msg: "{{ disk_report.stdout }}"
- name: Check for disks above 80% usage
shell: df -h | awk 'NR>1 {gsub(/%/, "", $5); if($5 > 80) print $0}'
register: high_usage_disks
- name: Warn about high disk usage
debug:
msg: |
⚠️ Warning: high disk usage detected on {{ inventory_hostname }}!
{{ high_usage_disks.stdout }}
when: high_usage_disks.stdout != ""
- name: Create cleanup suggestions
shell: |
echo "=== Cleanup suggestions for {{ inventory_hostname }} ===" > {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "1. Check log files:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
find /var/log -name "*.log" -type f -size +100M -exec echo "  Large log file: {}" \; 2>/dev/null >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "2. Check temporary files:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
find /tmp -type f -size +50M -exec echo "  Large temp file: {}" \; 2>/dev/null >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "3. Check package caches:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
if [ -d /var/cache/apt ]; then
echo "  APT cache size: $(du -sh /var/cache/apt 2>/dev/null | cut -f1)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
fi
if [ -d /var/cache/yum ]; then
echo "  YUM cache size: $(du -sh /var/cache/yum 2>/dev/null | cut -f1)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
fi
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "4. Check containers:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
if command -v podman >/dev/null 2>&1; then
echo "  Podman images: $(podman images --format 'table {% raw %}{{.Repository}} {{.Tag}} {{.Size}}{% endraw %}' 2>/dev/null | wc -l)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "  Podman containers: $(podman ps -a --format 'table {% raw %}{{.Names}} {{.Status}}{% endraw %}' 2>/dev/null | wc -l)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
fi
- name: Show cleanup suggestions
shell: cat {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
register: cleanup_suggestions
- name: Output cleanup suggestions
debug:
msg: "{{ cleanup_suggestions.stdout }}"
- name: Report where the ncdu files were saved
debug:
msg: |
📁 ncdu scan files saved to:
- root: {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json
- /var: {{ output_dir }}/ncdu-var-{{ inventory_hostname }}.json (if present)
- /opt: {{ output_dir }}/ncdu-opt-{{ inventory_hostname }}.json (if present)
💡 To browse a scan:
ncdu -f {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json
📊 Full report: {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
🧹 Cleanup suggestions: {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
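The reports stay on each host under /tmp/disk-analysis; an ad-hoc fetch pulls them back to the control node for side-by-side review:

ansible all -i inventories/production/nomad-cluster.ini -m fetch \
  -a "src=/tmp/disk-analysis/disk-report-{{ inventory_hostname }}.txt dest=./reports/"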

View File

@@ -0,0 +1,96 @@
---
- name: Disk cleanup utility
hosts: all
become: yes
vars:
cleanup_logs: true
cleanup_cache: true
cleanup_temp: true
cleanup_containers: false # destructive; use with caution
tasks:
- name: Check disk usage (before cleanup)
shell: df -h
register: disk_before
- name: Display disk usage before cleanup
debug:
msg: |
=== Disk usage on {{ inventory_hostname }} before cleanup ===
{{ disk_before.stdout }}
- name: Clean system logs (keep the last 7 days)
shell: |
journalctl --vacuum-time=7d
find /var/log -name "*.log" -type f -mtime +7 -exec truncate -s 0 {} \;
find /var/log -name "*.log.*" -type f -mtime +7 -delete
when: cleanup_logs | bool
register: log_cleanup
- name: Clean package manager caches
block:
- name: Clean the APT cache (Debian/Ubuntu)
shell: |
apt-get clean
apt-get autoclean
apt-get autoremove -y
when: ansible_os_family == "Debian"
- name: Clean the YUM/DNF cache (RedHat/CentOS)
shell: |
if command -v dnf >/dev/null 2>&1; then
dnf clean all
elif command -v yum >/dev/null 2>&1; then
yum clean all
fi
when: ansible_os_family == "RedHat"
when: cleanup_cache | bool
- name: Clean temporary files
shell: |
find /tmp -type f -atime +7 -delete 2>/dev/null || true
find /var/tmp -type f -atime +7 -delete 2>/dev/null || true
find /tmp -mindepth 1 -maxdepth 1 -name ".*" -atime +7 -exec rm -rf {} + 2>/dev/null || true
when: cleanup_temp | bool
- name: Clean Podman resources (use with caution)
block:
- name: Stop all containers
shell: podman stop --all
ignore_errors: yes
- name: Remove unused containers
shell: podman container prune -f
ignore_errors: yes
- name: Remove unused images
shell: podman image prune -f
ignore_errors: yes
- name: Remove unused volumes
shell: podman volume prune -f
ignore_errors: yes
when: cleanup_containers | bool
- name: Clean core dump files
shell: |
find /var/crash -name "core.*" -type f -delete 2>/dev/null || true
find / -name "core" -type f -size +10M -delete 2>/dev/null || true
ignore_errors: yes
- name: Check disk usage (after cleanup)
shell: df -h
register: disk_after
- name: Show cleanup results
debug:
msg: |
=== Cleanup finished on {{ inventory_hostname }} ===
Before:
{{ disk_before.stdout }}
After:
{{ disk_after.stdout }}
🧹 Cleanup complete!
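Container pruning is destructive, so it stays off by default and has to be opted into per run; a usage sketch (the playbook filename is an assumption):

ansible-playbook -i inventories/production/nomad-cluster.ini playbooks/disk-cleanup.yml \
  -e "cleanup_containers=true" --limit hcp1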

View File

@@ -0,0 +1,105 @@
---
- name: Final Podman Permission Fix for Nomad
hosts: all
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Install podman for nomad user (system-wide)
package:
name: podman
state: present
- name: Enable podman socket for nomad user
systemd:
name: podman.socket
enabled: yes
state: started
scope: system
daemon_reload: yes
- name: Create nomad user podman configuration directory
file:
path: /home/nomad/.config/containers
state: directory
owner: nomad
group: nomad
mode: '0755'
recurse: yes
- name: Configure podman for nomad user to use system socket
copy:
content: |
[containers]
[engine]
remote = true
[service_destinations]
[service_destinations.system]
uri = "unix:///run/podman/podman.sock"
dest: /home/nomad/.config/containers/containers.conf
owner: nomad
group: nomad
mode: '0644'
- name: Update Nomad configuration to use system podman socket
replace:
path: /etc/nomad.d/nomad.hcl
regexp: 'socket_path = "unix:///run/user/1001/podman/podman.sock"'
replace: 'socket_path = "unix:///run/podman/podman.sock"'
- name: Add nomad user to necessary groups
user:
name: nomad
groups:
- podman
append: yes
- name: Create podman group if it doesn't exist
group:
name: podman
state: present
- name: Set proper permissions on system podman socket directory
file:
path: /run/podman
state: directory
mode: '0755'
group: podman
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
timeout: 60
- name: Wait for plugins to load
pause:
seconds: 20
- name: Final verification - Check driver status
shell: sudo -u nomad /usr/local/bin/nomad node status -self | grep -A 10 "Driver Status"
register: final_driver_status
failed_when: false
- name: Display final driver status
debug:
var: final_driver_status.stdout_lines
- name: Test podman access for nomad user
shell: sudo -u nomad podman version
register: podman_test
failed_when: false
- name: Display podman test result
debug:
var: podman_test.stdout_lines
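If the driver still reports unhealthy after this, querying the system socket directly (the same path the Nomad config now points at) separates Podman-side failures from Nomad-side ones:

sudo -u nomad curl -s --unix-socket /run/podman/podman.sock http://localhost/v1.0.0/libpod/info | head -5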

View File

@@ -0,0 +1,83 @@
---
- name: Fix HCP1 and HCP2 Podman Configuration
hosts: hcp1,hcp2
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Ensure nomad user exists
user:
name: nomad
system: yes
shell: /bin/false
home: /home/nomad
create_home: yes
- name: Ensure Podman socket is running
systemd:
name: podman.socket
state: started
enabled: yes
- name: Set proper permissions on Podman socket
file:
path: /run/podman/podman.sock
mode: '0666'
ignore_errors: yes
- name: Create nomad data directory
file:
path: /opt/nomad/data
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Create nomad log directory
file:
path: /var/log/nomad
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Test Podman access for nomad user
shell: sudo -u nomad podman version
register: podman_test
failed_when: false
- name: Display Podman test result
debug:
var: podman_test.stdout_lines
- name: Validate Nomad configuration
shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
failed_when: false
- name: Display configuration validation
debug:
var: config_validation
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
timeout: 60
- name: Check Nomad node status
shell: /usr/local/bin/nomad node status -self
register: node_status
failed_when: false
- name: Display node status
debug:
var: node_status.stdout_lines

View File

@@ -0,0 +1,56 @@
---
- name: Fix dpkg and initramfs issues on hcs
hosts: hcs
become: yes
tasks:
- name: Check current dpkg status
shell: dpkg --audit
register: dpkg_status
ignore_errors: yes
- name: Display dpkg status
debug:
var: dpkg_status.stdout_lines
- name: Fix broken btrfs hook
shell: |
# Remove problematic btrfs hook temporarily
mv /usr/share/initramfs-tools/hooks/btrfs /usr/share/initramfs-tools/hooks/btrfs.bak || true
# Try to reconfigure the failed package
dpkg --configure -a
# If that works, restore the hook
if [ $? -eq 0 ]; then
mv /usr/share/initramfs-tools/hooks/btrfs.bak /usr/share/initramfs-tools/hooks/btrfs || true
fi
register: fix_result
ignore_errors: yes
- name: Display fix result
debug:
var: fix_result
- name: Alternative fix - reinstall initramfs-tools
apt:
name: initramfs-tools
state: latest
force: yes
when: fix_result.rc != 0
ignore_errors: yes
- name: Clean up and update
shell: |
apt autoremove -y
apt update
apt upgrade -y
ignore_errors: yes
- name: Check final dpkg status
shell: dpkg --audit
register: final_status
ignore_errors: yes
- name: Display final status
debug:
var: final_status.stdout_lines

View File

@@ -0,0 +1,99 @@
---
- name: Update Nomad configuration for Podman and fix issues
hosts: localhost
become: yes
connection: local
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Update Nomad configuration to use Podman and disable Consul
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "100.116.158.95"
server {
enabled = true
bootstrap_expect = 1
encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
}
client {
enabled = true
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "100.116.158.95"
serf = "100.116.158.95"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
# Disable Consul integration for now
consul {
address = ""
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
backup: yes
- name: Enable Podman socket for systemd
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes
- name: Start Nomad service
systemd:
name: nomad
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 5
timeout: 30
- name: Check Nomad status
uri:
url: http://localhost:4646/v1/status/leader
method: GET
register: nomad_status
retries: 3
delay: 5
- name: Display Nomad status
debug:
msg: "Nomad leader: {{ nomad_status.json if nomad_status.json is defined else 'No leader elected' }}"

View File

@@ -0,0 +1,72 @@
---
- name: Fix Nomad Podman Driver Configuration
hosts: all
become: yes
vars:
nomad_user: nomad
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Update Nomad configuration to properly reference Podman plugin
replace:
path: /etc/nomad.d/nomad.hcl
regexp: 'plugin "podman" \{\n config \{\n socket_path = "unix:///run/user/1001/podman/podman.sock"\n volumes \{\n enabled = true\n \}\n \}\n\}'
replace: |
plugin "nomad-driver-podman" {
config {
socket_path = "unix:///run/user/1001/podman/podman.sock"
volumes {
enabled = true
}
}
}
- name: Start Nomad service
systemd:
name: nomad
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60
- name: Wait for plugins to load
pause:
seconds: 15
- name: Check if Podman driver is now loaded
shell: |
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self | grep -A 20 "Driver Status"
register: driver_status
- name: Display driver status
debug:
var: driver_status.stdout_lines
- name: Check Nomad logs for successful plugin loading
shell: journalctl -u nomad -n 20 --no-pager | grep -E "(podman|plugin)"
register: recent_logs
failed_when: false
- name: Display recent plugin logs
debug:
var: recent_logs.stdout_lines
- name: Final verification - Test Podman functionality
shell: |
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers | keys[]' | grep -i podman
register: podman_driver_check
failed_when: false
- name: Display final result
debug:
msg: |
Podman driver status: {{ 'SUCCESS - Driver loaded!' if 'podman' in (podman_driver_check.stdout | default('')) else 'Still checking...' }}
Available drivers: {{ podman_driver_check.stdout_lines | default(['none']) | join(', ') }}

View File

@@ -0,0 +1,88 @@
---
- name: Fix Nomad systemd service binary path
hosts: nomad_cluster
become: yes
tasks:
- name: Check Nomad binary location
shell: which nomad
register: nomad_binary_path
- name: Display binary path
debug:
msg: "Nomad binary 位于: {{ nomad_binary_path.stdout }}"
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
ignore_errors: yes
- name: Update Nomad systemd service with correct binary path
copy:
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
[Service]
Type=notify
User=nomad
Group=nomad
ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/nomad.service
mode: '0644'
notify: reload systemd
- name: Reload systemd and start Nomad servers first
systemd:
name: nomad
state: started
enabled: yes
daemon_reload: yes
when: inventory_hostname in groups['nomad_servers']
- name: Wait for servers to be ready
pause:
seconds: 15
when: inventory_hostname in groups['nomad_servers']
- name: Start Nomad clients
systemd:
name: nomad
state: started
enabled: yes
daemon_reload: yes
when: inventory_hostname in groups['nomad_clients']
- name: Wait for clients to connect
pause:
seconds: 10
when: inventory_hostname in groups['nomad_clients']
- name: Check final service status
shell: systemctl status nomad --no-pager -l
register: service_status
ignore_errors: yes
- name: Display service status
debug:
msg: |
✅ Service status on node {{ inventory_hostname }}:
📊 Status: {{ 'SUCCESS' if service_status.rc == 0 else 'FAILED' }}
💾 Binary path: {{ nomad_binary_path.stdout }}
handlers:
- name: reload systemd
systemd:
daemon_reload: yes
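The rendered unit can also be linted before the rolling restart touches every node:

systemd-analyze verify /etc/systemd/system/nomad.service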

View File

@@ -0,0 +1,79 @@
---
- name: Fix Podman installation on remaining nodes
hosts: semaphore,master,ash3c,hcs
become: yes
serial: 1 # process nodes one at a time so a failure cannot hit several at once
tasks:
- name: Current node status
debug:
msg: "🔧 修复节点: {{ inventory_hostname }}"
- name: Check if Podman is already installed
shell: podman --version 2>/dev/null || echo "NOT_INSTALLED"
register: podman_check
- name: Install Podman if not present (semaphore special handling)
apt:
name:
- podman
- buildah
- skopeo
state: present
update_cache: yes
force_apt_get: yes
when: inventory_hostname == 'semaphore' and 'NOT_INSTALLED' in podman_check.stdout
ignore_errors: yes
- name: Install Podman on other nodes
apt:
name:
- podman
- buildah
- skopeo
state: present
when: inventory_hostname != 'semaphore'
ignore_errors: yes
- name: Install Python dependencies for podman-compose
apt:
name:
- python3-pip
- python3-setuptools
- python3-yaml
- python3-dotenv
state: present
ignore_errors: yes
- name: Install podman-compose via pip
pip:
name:
- podman-compose
state: present
executable: pip3
ignore_errors: yes
- name: Alternative podman-compose installation via apt
apt:
name: podman-compose
state: present
ignore_errors: yes
- name: Verify installations
shell: |
echo "Podman: $(podman --version 2>/dev/null || echo 'FAILED')"
echo "Podman Compose: $(podman-compose --version 2>/dev/null || echo 'FAILED')"
register: verify_result
- name: Display verification results
debug:
msg: |
✅ Verification results for node {{ inventory_hostname }}:
{{ verify_result.stdout }}
- name: Enable Podman socket
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes

View File

@@ -0,0 +1,133 @@
---
- name: Install Nomad by direct download from HashiCorp
hosts: hcs
become: yes
vars:
nomad_version: "1.10.5"
nomad_url: "https://releases.hashicorp.com/nomad/{{ nomad_version }}/nomad_{{ nomad_version }}_linux_amd64.zip"
nomad_user: "nomad"
nomad_group: "nomad"
nomad_home: "/opt/nomad"
nomad_data_dir: "/opt/nomad/data"
nomad_config_dir: "/etc/nomad.d"
nomad_datacenter: "dc1"
nomad_region: "global"
nomad_server_addresses:
- "100.116.158.95:4647" # semaphore server address
tasks:
- name: Create nomad user
user:
name: "{{ nomad_user }}"
group: "{{ nomad_group }}"
system: yes
shell: /bin/false
home: "{{ nomad_home }}"
create_home: yes
- name: Create nomad directories
file:
path: "{{ item }}"
state: directory
owner: "{{ nomad_user }}"
group: "{{ nomad_group }}"
mode: '0755'
loop:
- "{{ nomad_home }}"
- "{{ nomad_data_dir }}"
- "{{ nomad_config_dir }}"
- /var/log/nomad
- name: Install unzip package
apt:
name: unzip
state: present
update_cache: yes
- name: Download Nomad binary
get_url:
url: "{{ nomad_url }}"
dest: "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
mode: '0644'
timeout: 300
- name: Extract Nomad binary
unarchive:
src: "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
dest: /tmp
remote_src: yes
- name: Copy Nomad binary to /usr/local/bin
copy:
src: /tmp/nomad
dest: /usr/local/bin/nomad
mode: '0755'
owner: root
group: root
remote_src: yes
- name: Create Nomad client configuration
template:
src: templates/nomad-client.hcl.j2
dest: "{{ nomad_config_dir }}/nomad.hcl"
owner: "{{ nomad_user }}"
group: "{{ nomad_group }}"
mode: '0640'
- name: Create Nomad systemd service
copy:
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty={{ nomad_config_dir }}/nomad.hcl
[Service]
Type=notify
User={{ nomad_user }}
Group={{ nomad_group }}
ExecStart=/usr/local/bin/nomad agent -config={{ nomad_config_dir }}
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/nomad.service
mode: '0644'
- name: Reload systemd daemon
systemd:
daemon_reload: yes
- name: Enable and start Nomad service
systemd:
name: nomad
enabled: yes
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 5
timeout: 60
- name: Verify Nomad installation
command: /usr/local/bin/nomad version
register: nomad_version_output
- name: Display Nomad version
debug:
msg: "{{ nomad_version_output.stdout }}"
- name: Clean up downloaded files
file:
path: "{{ item }}"
state: absent
loop:
- "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
- /tmp/nomad
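With the service up, the new client should show in the server's node list; checked from anywhere that can reach a server's HTTP API (address taken from nomad_server_addresses above):

nomad node status -address="http://100.116.158.95:4646"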

View File

@@ -0,0 +1,131 @@
---
- name: Install Nomad Podman Driver Plugin
hosts: all
become: yes
vars:
nomad_user: nomad
nomad_data_dir: /opt/nomad/data
nomad_plugins_dir: "{{ nomad_data_dir }}/plugins"
podman_driver_version: "0.6.1"
podman_driver_url: "https://releases.hashicorp.com/nomad-driver-podman/{{ podman_driver_version }}/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Create plugins directory
file:
path: "{{ nomad_plugins_dir }}"
state: directory
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
- name: Download Nomad Podman driver
get_url:
url: "{{ podman_driver_url }}"
dest: "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
mode: '0644'
- name: Extract Nomad Podman driver
unarchive:
src: "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
dest: "/tmp"
remote_src: yes
- name: Install Nomad Podman driver
copy:
src: "/tmp/nomad-driver-podman"
dest: "{{ nomad_plugins_dir }}/nomad-driver-podman"
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
remote_src: yes
- name: Update Nomad configuration for plugin directory
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} PLUGIN DIRECTORY CONFIGURATION"
block: |
plugin_dir = "{{ nomad_plugins_dir }}"
insertafter: 'data_dir = "/opt/nomad/data"'
- name: Fix Podman socket permissions
file:
path: /run/user/1001/podman/podman.sock
mode: '0666'
ignore_errors: yes
- name: Ensure nomad user can access Podman socket
user:
name: "{{ nomad_user }}"
groups: ben
append: yes
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60
- name: Verify Nomad is running
systemd:
name: nomad
register: nomad_service_status
- name: Display Nomad service status
debug:
msg: "Nomad service is {{ nomad_service_status.status.ActiveState }}"
- name: Wait for plugins to load
pause:
seconds: 15
- name: Check available drivers
shell: |
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self | grep -A 20 "Driver Status"
register: driver_status
failed_when: false
- name: Display driver status
debug:
var: driver_status.stdout_lines
- name: Test Podman driver functionality
shell: |
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers | keys[]'
register: available_drivers
failed_when: false
- name: Display available drivers
debug:
msg: "Available drivers: {{ available_drivers.stdout_lines | join(', ') }}"
- name: Clean up downloaded files
file:
path: "{{ item }}"
state: absent
loop:
- "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
- "/tmp/nomad-driver-podman"
- name: Final verification - Check if Podman driver is loaded
shell: |
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers.podman.Detected'
register: podman_driver_detected
failed_when: false
- name: Display final result
debug:
msg: |
Podman driver installation: {{ 'SUCCESS' if podman_driver_detected.stdout == 'true' else 'NEEDS VERIFICATION' }}
Driver detected: {{ podman_driver_detected.stdout | default('unknown') }}
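A one-container batch job makes a good end-to-end smoke test of the freshly installed driver; a minimal sketch (the alpine image is an assumption, any small image works):

cat > /tmp/podman-smoke.nomad <<'EOF'
job "podman-smoke" {
  datacenters = ["dc1"]
  type        = "batch"
  group "smoke" {
    task "hello" {
      driver = "podman"
      config {
        image   = "docker.io/library/alpine:latest"
        command = "echo"
        args    = ["podman driver works"]
      }
    }
  }
}
EOF
nomad job run /tmp/podman-smoke.nomad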

View File

@@ -0,0 +1,61 @@
---
- name: Install Podman Compose on all Nomad cluster nodes
hosts: nomad_cluster
become: yes
tasks:
- name: Display target node
debug:
msg: "正在安装 Podman Compose 到节点: {{ inventory_hostname }}"
- name: Update package cache
apt:
update_cache: yes
ignore_errors: yes
- name: Install Podman and related tools
apt:
name:
- podman
- podman-compose
- buildah
- skopeo
state: present
ignore_errors: yes
- name: Install additional dependencies
apt:
name:
- python3-pip
- python3-setuptools
state: present
ignore_errors: yes
- name: Install podman-compose via pip if package manager failed
pip:
name: podman-compose
state: present
ignore_errors: yes
- name: Verify Podman installation
shell: podman --version
register: podman_version
- name: Verify Podman Compose installation
shell: podman-compose --version
register: podman_compose_version
ignore_errors: yes
- name: Display installation results
debug:
msg: |
✅ Installation results for node {{ inventory_hostname }}:
📦 Podman: {{ podman_version.stdout }}
🐳 Podman Compose: {{ podman_compose_version.stdout if podman_compose_version.rc == 0 else 'install failed or unavailable' }}
- name: Ensure Podman socket is enabled
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes

View File

@@ -1,131 +0,0 @@
---
- name: Operations Toolkit - Unified Management Dashboard
hosts: all
gather_facts: yes
vars:
# Available operations scripts
available_scripts:
- { name: "system-update", desc: "System package updates", file: "system-update.yml" }
- { name: "system-cleanup", desc: "System cleanup and maintenance", file: "system-cleanup.yml" }
- { name: "service-health", desc: "Service health monitoring", file: "service-health-check.yml" }
- { name: "security-hardening", desc: "Security hardening and backup", file: "security-hardening.yml" }
- { name: "docker-management", desc: "Docker container management", file: "docker-management.yml" }
- { name: "network-connectivity", desc: "Network connectivity check", file: "network-connectivity.yml" }
- { name: "certificate-management", desc: "SSL certificate monitoring", file: "certificate-management.yml" }
tasks:
# Show a system overview
- name: Display system overview
debug:
msg: |
🖥️ System Overview for {{ inventory_hostname }}:
📊 OS: {{ ansible_distribution }} {{ ansible_distribution_version }}
💾 Memory: {{ (ansible_memtotal_mb/1024)|round(1) }}GB total, {{ (ansible_memfree_mb/1024)|round(1) }}GB free
💿 CPU: {{ ansible_processor_vcpus }} cores
🏠 Architecture: {{ ansible_architecture }}
🌐 IP: {{ ansible_default_ipv4.address }}
⏰ Uptime: {{ ansible_uptime_seconds//86400 }}d {{ (ansible_uptime_seconds%86400)//3600 }}h {{ ((ansible_uptime_seconds%3600)//60) }}m
# Quick system status check
- name: Quick system status check
shell: |
echo "=== DISK USAGE ==="
df -h | grep -E "(Filesystem|/dev/)"
echo ""
echo "=== MEMORY USAGE ==="
free -h
echo ""
echo "=== LOAD AVERAGE ==="
uptime
echo ""
echo "=== TOP PROCESSES ==="
ps aux --sort=-%cpu | head -6
register: quick_status
- name: Display quick status
debug:
msg: "{{ quick_status.stdout_lines }}"
# Check critical service status
- name: Check critical services
systemd:
name: "{{ item }}"
register: service_status
loop:
- ssh
- systemd-resolved
- cron
failed_when: false
- name: Display service status
debug:
msg: "🔧 {{ item.item }}: {{ item.status.ActiveState if item.status is defined else 'NOT FOUND' }}"
loop: "{{ service_status.results }}"
# Check recent system log errors
- name: Check recent system errors
shell: journalctl --since "1 hour ago" --priority=err --no-pager | tail -10
register: recent_errors
failed_when: false
- name: Display recent errors
debug:
msg: "🚨 Recent Errors: {{ recent_errors.stdout_lines if recent_errors.stdout_lines else ['No recent errors found'] }}"
# Check network connectivity
- name: Quick network check
shell: |
echo "=== NETWORK INTERFACES ==="
ip -br addr show
echo ""
echo "=== DEFAULT ROUTE ==="
ip route | grep default
echo ""
echo "=== DNS TEST ==="
nslookup google.com | grep -A1 "Name:" || echo "DNS resolution failed"
register: network_check
failed_when: false
- name: Display network status
debug:
msg: "🌐 Network Status: {{ network_check.stdout_lines }}"
# Show available operations scripts
- name: Display available operations scripts
debug:
msg: |
🛠️ Available Operations Scripts:
{% for script in available_scripts %}
{{ loop.index }}. {{ script.name }}: {{ script.desc }}
{% endfor %}
💡 Usage Examples:
ansible-playbook -i inventory.ini system-cleanup.yml --limit {{ inventory_hostname }}
ansible-playbook -i inventory.ini docker-management.yml --limit lxc
ansible-playbook -i inventory.ini network-connectivity.yml --limit proxmox
# Generate maintenance recommendations
- name: Generate maintenance recommendations
debug:
msg: |
💡 Maintenance Recommendations for {{ inventory_hostname }}:
🔄 Regular Tasks (Weekly):
- Run system-cleanup.yml to free up disk space
- Check service-health-check.yml for service status
- Review certificate-management.yml for expiring certificates
🔒 Security Tasks (Monthly):
- Execute security-hardening.yml for security updates
- Review network-connectivity.yml for network security
🐳 Container Tasks (As needed):
- Use docker-management.yml for Docker maintenance
📊 Monitoring Tasks (Daily):
- Quick check with ops-toolkit.yml (this script)
⚡ Emergency Tasks:
- Use system-update.yml for critical security patches
- Run network-connectivity.yml for connectivity issues

View File

@@ -0,0 +1,167 @@
---
- name: Migrate Nomad from Docker to Podman (Simple Version)
hosts: all
become: yes
vars:
nomad_user: nomad
nomad_config_dir: /etc/nomad.d
nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Backup current Nomad configuration
copy:
src: "{{ nomad_config_file }}"
dest: "{{ nomad_config_file }}.backup-{{ ansible_date_time.epoch }}"
remote_src: yes
- name: Get nomad user info
getent:
database: passwd
key: "{{ nomad_user }}"
register: nomad_user_info
- name: Set nomad user UID variable
set_fact:
nomad_uid: "{{ nomad_user_info.ansible_facts.getent_passwd[nomad_user][1] }}"
- name: Enable lingering for nomad user
command: loginctl enable-linger {{ nomad_user }}
failed_when: false
- name: Create runtime directory for nomad user
file:
path: "/run/user/{{ nomad_uid }}"
state: directory
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0700'
- name: Start Podman socket as nomad user
shell: |
sudo -u {{ nomad_user }} XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} systemctl --user enable --now podman.socket
args:
creates: "/run/user/{{ nomad_uid }}/podman/podman.sock"
- name: Create new Nomad configuration with Podman
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "0.0.0.0"
client {
enabled = true
servers = [
"100.116.158.95:4647",
]
}
# Docker plugin (disabled)
# plugin "docker" {
# config {
# allow_privileged = true
# volumes {
# enabled = true
# }
# }
# }
plugin "podman" {
config {
socket_path = "unix:///run/user/{{ nomad_uid }}/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "127.0.0.1:8500"
}
dest: "{{ nomad_config_file }}"
owner: root
group: root
mode: '0644'
- name: Update Nomad systemd service to run as nomad user
copy:
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
Wants=network-online.target
[Service]
Type=notify
User={{ nomad_user }}
Group={{ nomad_user }}
ExecReload=/bin/kill -HUP $MAINPID
ExecStart=/usr/local/bin/nomad agent -config={{ nomad_config_dir }}
KillMode=process
Restart=on-failure
LimitNOFILE=65536
Environment=XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }}
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/nomad.service
owner: root
group: root
mode: '0644'
- name: Reload systemd daemon
systemd:
daemon_reload: yes
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready (local check)
wait_for:
port: 4646
host: localhost
delay: 5
timeout: 60
- name: Verify Nomad is running
shell: systemctl is-active nomad
register: nomad_status
- name: Display Nomad status
debug:
msg: "Nomad service status: {{ nomad_status.stdout }}"
- name: Check Podman socket
stat:
path: "/run/user/{{ nomad_uid }}/podman/podman.sock"
register: podman_socket
- name: Display Podman socket status
debug:
msg: "Podman socket exists: {{ podman_socket.stat.exists }}"
- name: Test Podman as nomad user
shell: |
sudo -u {{ nomad_user }} XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} podman version --format json
register: podman_test
failed_when: false
- name: Display Podman test result
debug:
msg: |
Podman test: {{ 'SUCCESS' if podman_test.rc == 0 else 'FAILED' }}
{% if podman_test.rc != 0 %}
Error: {{ podman_test.stderr }}
{% endif %}
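A minimal post-migration check, assuming the default Nomad HTTP port and the nomad user created above:

# Confirm the agent is up and rootless Podman answers for the nomad user
systemctl is-active nomad
curl -s http://localhost:4646/v1/agent/self >/dev/null && echo "Nomad API OK"
sudo -u nomad XDG_RUNTIME_DIR=/run/user/$(id -u nomad) podman version >/dev/null && echo "Podman OK"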

View File

@@ -1,143 +0,0 @@
---
- name: Network Connectivity and Performance Check
hosts: all
gather_facts: yes
vars:
test_domains:
- google.com
- github.com
- docker.io
- tailscale.com
test_ports:
- { host: "8.8.8.8", port: 53, name: "Google DNS" }
- { host: "1.1.1.1", port: 53, name: "Cloudflare DNS" }
- { host: "github.com", port: 443, name: "GitHub HTTPS" }
- { host: "docker.io", port: 443, name: "Docker Hub" }
tasks:
# Basic network information
- name: Get network interfaces
shell: ip addr show | grep -E "^[0-9]+:|inet "
register: network_interfaces
- name: Display network interfaces
debug:
msg: "🌐 Network Interfaces: {{ network_interfaces.stdout_lines }}"
# Check default route
- name: Check default route
shell: ip route | grep default
register: default_route
- name: Display default route
debug:
msg: "🛣️ Default Route: {{ default_route.stdout }}"
# DNS resolution test
- name: Test DNS resolution
shell: nslookup {{ item }} | grep -A2 "Name:"
register: dns_test
loop: "{{ test_domains }}"
failed_when: false
- name: Display DNS test results
debug:
msg: "🔍 DNS Test for {{ item.item }}: {{ 'SUCCESS' if item.rc == 0 else 'FAILED' }}"
loop: "{{ dns_test.results }}"
# Network connectivity test (ping)
- name: Test network connectivity (ping)
shell: ping -c 3 {{ item }}
register: ping_test
loop: "{{ test_domains }}"
failed_when: false
- name: Display ping test results
debug:
msg: "🏓 Ping to {{ item.item }}: {{ 'SUCCESS' if item.rc == 0 else 'FAILED' }}"
loop: "{{ ping_test.results }}"
# Port connectivity test
- name: Test port connectivity
wait_for:
host: "{{ item.host }}"
port: "{{ item.port }}"
timeout: 5
register: port_test
loop: "{{ test_ports }}"
failed_when: false
- name: Display port test results
debug:
msg: "🔌 {{ item.item.name }} ({{ item.item.host }}:{{ item.item.port }}): {{ 'SUCCESS' if not item.failed else 'FAILED' }}"
loop: "{{ port_test.results }}"
# Check Tailscale status
- name: Check Tailscale status
shell: tailscale status
register: tailscale_status
failed_when: false
- name: Display Tailscale status
debug:
msg: "🔗 Tailscale Status: {{ 'CONNECTED' if tailscale_status.rc == 0 else 'NOT CONNECTED' }}"
- name: Show Tailscale details
debug:
msg: "{{ tailscale_status.stdout_lines }}"
when: tailscale_status.rc == 0
# Check firewall status
- name: Check UFW status (Ubuntu/Debian)
shell: ufw status
register: ufw_status
failed_when: false
when: ansible_os_family == "Debian"
- name: Display UFW status
debug:
msg: "🛡️ UFW Firewall: {{ ufw_status.stdout_lines }}"
when: ansible_os_family == "Debian" and ufw_status.rc == 0
# Check iptables rules
- name: Check iptables rules
shell: iptables -L -n | head -20
register: iptables_rules
failed_when: false
become: yes
- name: Display iptables summary
debug:
msg: "🔥 Iptables Rules: {{ iptables_rules.stdout_lines[:10] }}"
when: iptables_rules.rc == 0
# Network performance test
- name: Test download speed (small file)
shell: curl -o /dev/null -s -w "%{time_total}" http://speedtest.wdc01.softlayer.com/downloads/test10.zip
register: download_speed
failed_when: false
- name: Display download speed test
debug:
msg: "⚡ Download Speed Test: {{ download_speed.stdout }}s for 10MB file"
when: download_speed.rc == 0
# Check network statistics
- name: Get network statistics
shell: cat /proc/net/dev | grep -v "lo:" | grep ":"
register: network_stats
- name: Display network statistics
debug:
msg: "📊 Network Stats: {{ network_stats.stdout_lines }}"
# Generate network health report
- name: Generate network health summary
debug:
msg: |
🌐 Network Health Summary for {{ inventory_hostname }}:
✅ DNS Resolution: {{ (dns_test.results | selectattr('rc', 'equalto', 0) | list | length) }}/{{ test_domains | length }} domains
✅ Ping Connectivity: {{ (ping_test.results | selectattr('rc', 'equalto', 0) | list | length) }}/{{ test_domains | length }} hosts
✅ Port Connectivity: {{ (port_test.results | rejectattr('failed', 'defined') | list | length) }}/{{ test_ports | length }} ports
✅ Tailscale: {{ 'Connected' if tailscale_status.rc == 0 else 'Disconnected' }}

View File

@@ -1,135 +0,0 @@
---
- name: Service Health Check and Monitoring
hosts: all
become: yes
gather_facts: yes
vars:
critical_services:
- ssh
- systemd-resolved
- cron
web_services:
- nginx
- apache2
database_services:
- mysql
- mariadb
- postgresql
container_services:
- docker
- containerd
network_services:
- tailscale
- cloudflared
tasks:
# Check critical system services
- name: Check critical system services
systemd:
name: "{{ item }}"
register: critical_service_status
loop: "{{ critical_services }}"
failed_when: false
- name: Report critical service issues
debug:
msg: "⚠️ Critical service {{ item.item }} is {{ item.status.ActiveState | default('not found') }}"
loop: "{{ critical_service_status.results }}"
when: item.status is defined and item.status.ActiveState != "active"
# Check web services
- name: Check web services
systemd:
name: "{{ item }}"
register: web_service_status
loop: "{{ web_services }}"
failed_when: false
- name: Report web service status
debug:
msg: "🌐 Web service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
loop: "{{ web_service_status.results }}"
when: item.status is defined
# Check database services
- name: Check database services
systemd:
name: "{{ item }}"
register: db_service_status
loop: "{{ database_services }}"
failed_when: false
- name: Report database service status
debug:
msg: "🗄️ Database service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
loop: "{{ db_service_status.results }}"
when: item.status is defined
# Check container services
- name: Check container services
systemd:
name: "{{ item }}"
register: container_service_status
loop: "{{ container_services }}"
failed_when: false
- name: Report container service status
debug:
msg: "📦 Container service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
loop: "{{ container_service_status.results }}"
when: item.status is defined
# Check network services
- name: Check network services
systemd:
name: "{{ item }}"
register: network_service_status
loop: "{{ network_services }}"
failed_when: false
- name: Report network service status
debug:
msg: "🌐 Network service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}"
loop: "{{ network_service_status.results }}"
when: item.status is defined
# Check system load
- name: Check system load
shell: uptime
register: system_load
- name: Display system load
debug:
msg: "📊 System Load: {{ system_load.stdout }}"
# Check disk space warnings
- name: Check disk space usage
shell: df -h | awk '$5 > 80 {print $0}'
register: disk_warning
changed_when: false
- name: Warn about high disk usage
debug:
msg: "⚠️ High disk usage detected: {{ disk_warning.stdout_lines }}"
when: disk_warning.stdout_lines | length > 0
# Check memory usage percentage
- name: Check memory usage percentage
shell: free | awk 'NR==2{printf "%.2f%%", $3*100/$2}'
register: memory_percent
- name: Display memory usage
debug:
msg: "🧠 Memory Usage: {{ memory_percent.stdout }}"
# Check recent system errors
- name: Check recent system errors
shell: journalctl --since "1 hour ago" --priority=err --no-pager | tail -10
register: recent_errors
changed_when: false
- name: Display recent errors
debug:
msg: "🚨 Recent system errors: {{ recent_errors.stdout_lines }}"
when: recent_errors.stdout_lines | length > 0

View File

@@ -0,0 +1,120 @@
---
- name: Remove Docker and install Podman - new server nodes
  hosts: ash2e,ash1d,ch2
  become: yes
  gather_facts: no
  serial: 1  # Process nodes one at a time to avoid concurrent conflicts
  tasks:
    - name: Show the node being processed
      debug:
        msg: "🔧 Processing node: {{ inventory_hostname }}"

    - name: Check Docker service status
      shell: systemctl is-active docker 2>/dev/null || echo "inactive"
      register: docker_status
      changed_when: false

    - name: Stop Docker service
      systemd:
        name: docker
        state: stopped
        enabled: no
      ignore_errors: yes
      when: docker_status.stdout == "active"

    - name: Stop Docker socket
      systemd:
        name: docker.socket
        state: stopped
        enabled: no
      ignore_errors: yes

    - name: Remove Docker-related packages
      apt:
        name:
          - docker-ce
          - docker-ce-cli
          - containerd.io
          - docker-buildx-plugin
          - docker-compose-plugin
          - docker.io
          - docker-doc
          - docker-compose
          - docker-registry
          - containerd
          - runc
        state: absent
        purge: yes
      ignore_errors: yes

    - name: Clean up Docker data directories
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /var/lib/docker
        - /var/lib/containerd
        - /etc/docker
        - /etc/containerd
      ignore_errors: yes

    - name: Remove the docker group
      group:
        name: docker
        state: absent
      ignore_errors: yes

    - name: Update package cache
      apt:
        update_cache: yes
        cache_valid_time: 3600

    - name: Install Podman and related tools
      apt:
        name:
          - podman
          - buildah
          - skopeo
          - podman-compose
        state: present
      retries: 3
      delay: 10

    - name: Enable the Podman socket service
      systemd:
        name: podman.socket
        enabled: yes
        state: started
      ignore_errors: yes

    - name: Create the Podman user service directory
      file:
        path: /etc/systemd/user
        state: directory
        mode: '0755'

    - name: Verify Podman installation
      shell: podman --version
      register: podman_version
      changed_when: false

    - name: Verify Podman Compose installation
      shell: podman-compose --version 2>/dev/null || echo "not installed"
      register: podman_compose_version
      changed_when: false

    - name: Check Docker cleanup status
      shell: systemctl is-active docker 2>/dev/null || echo "removed"
      register: final_docker_status
      changed_when: false

    - name: Show node results
      debug:
        msg: |
          ✅ Node {{ inventory_hostname }} done
          🐳 Docker status: {{ final_docker_status.stdout }}
          📦 Podman version: {{ podman_version.stdout }}
          🔧 Compose status: {{ podman_compose_version.stdout }}

    - name: Clean apt cache
      apt:
        autoclean: yes
        autoremove: yes
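Before touching all three nodes at once, a dry run against a single node is cheap insurance; serial: 1 already limits the blast radius, and --check previews most of the changes (the playbook filename is an assumption):

# Preview on one node, then run for real
ansible-playbook -i inventories/production/nomad-cluster.ini remove-docker-install-podman.yml --limit ash2e --check
ansible-playbook -i inventories/production/nomad-cluster.ini remove-docker-install-podman.yml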

View File

@@ -0,0 +1,39 @@
---
- name: Restart Tailscale to fix DNS issues
hosts: hcp1,hcp2
become: yes
tasks:
- name: Check current DNS configuration
shell: cat /etc/resolv.conf
register: dns_before
- name: Display current DNS config
debug:
msg: "Current DNS config: {{ dns_before.stdout_lines }}"
- name: Restart tailscaled service
systemd:
name: tailscaled
state: restarted
- name: Wait for tailscale to stabilize
wait_for:
timeout: 10
- name: Check DNS configuration after restart
shell: cat /etc/resolv.conf
register: dns_after
- name: Display new DNS config
debug:
msg: "New DNS config: {{ dns_after.stdout_lines }}"
- name: Test DNS resolution
shell: nslookup apt.releases.hashicorp.com
register: dns_test
ignore_errors: yes
- name: Display DNS test result
debug:
msg: "DNS test result: {{ dns_test.stdout_lines }}"
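If the restart does not fix resolution, the same checks can be run ad hoc across both hosts (inventory path assumed from this repo's layout):

# Compare resolv.conf and DNS state on both nodes in one shot
ansible hcp1,hcp2 -i inventories/production/nomad-cluster.ini -m shell \
  -a 'cat /etc/resolv.conf; nslookup apt.releases.hashicorp.com'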

View File

@@ -1,152 +0,0 @@
---
- name: SSL Certificate Management and Monitoring
hosts: all
gather_facts: yes
vars:
# Common certificate paths
cert_paths:
- /etc/ssl/certs
- /etc/letsencrypt/live
- /etc/nginx/ssl
- /etc/apache2/ssl
- /usr/local/share/ca-certificates
# Service ports to check
ssl_services:
- { name: "HTTPS", port: 443 }
- { name: "SMTPS", port: 465 }
- { name: "IMAPS", port: 993 }
- { name: "LDAPS", port: 636 }
tasks:
# Check certificate directories
- name: Check certificate directories
stat:
path: "{{ item }}"
register: cert_dirs
loop: "{{ cert_paths }}"
- name: List existing certificate directories
debug:
msg: "📁 Certificate directory {{ item.item }}: {{ 'EXISTS' if item.stat.exists else 'NOT FOUND' }}"
loop: "{{ cert_dirs.results }}"
# Find certificate files
- name: Find certificate files
find:
paths: "{{ cert_paths }}"
patterns: "*.crt,*.pem,*.cert"
recurse: yes
register: cert_files
- name: Display found certificates
debug:
msg: "🔐 Found {{ cert_files.files | length }} certificate files"
# Check certificate expiration
- name: Check certificate expiration
shell: |
if [ -f "{{ item.path }}" ]; then
openssl x509 -in "{{ item.path }}" -noout -enddate 2>/dev/null | cut -d= -f2
fi
register: cert_expiry
loop: "{{ cert_files.files[:10] }}" # limit the check to the first 10 certificates
failed_when: false
- name: Display certificate expiration dates
debug:
msg: "📅 {{ item.item.path | basename }}: expires {{ item.stdout if item.stdout else 'INVALID/UNREADABLE' }}"
loop: "{{ cert_expiry.results }}"
when: item.stdout != ""
# Check certificates expiring soon (within 30 days)
- name: Check certificates expiring soon
shell: |
if [ -f "{{ item.path }}" ]; then
exp_date=$(openssl x509 -in "{{ item.path }}" -noout -enddate 2>/dev/null | cut -d= -f2)
if [ ! -z "$exp_date" ]; then
exp_epoch=$(date -d "$exp_date" +%s 2>/dev/null)
now_epoch=$(date +%s)
days_left=$(( (exp_epoch - now_epoch) / 86400 ))
if [ $days_left -lt 30 ]; then
echo "WARNING: $days_left days left"
else
echo "OK: $days_left days left"
fi
fi
fi
register: cert_warnings
loop: "{{ cert_files.files[:10] }}"
failed_when: false
- name: Display certificate warnings
debug:
msg: "⚠️ {{ item.item.path | basename }}: {{ item.stdout }}"
loop: "{{ cert_warnings.results }}"
when: item.stdout != "" and "WARNING" in item.stdout
# Check Let's Encrypt certificates
- name: Check Let's Encrypt certificates
shell: certbot certificates 2>/dev/null || echo "Certbot not installed"
register: letsencrypt_certs
failed_when: false
- name: Display Let's Encrypt status
debug:
msg: "🔒 Let's Encrypt: {{ letsencrypt_certs.stdout_lines }}"
when: "'not installed' not in letsencrypt_certs.stdout"
# Check SSL service ports
- name: Check SSL service ports
wait_for:
port: "{{ item.port }}"
timeout: 3
register: ssl_ports
loop: "{{ ssl_services }}"
failed_when: false
- name: Display SSL service status
debug:
msg: "🔌 {{ item.item.name }} (port {{ item.item.port }}): {{ 'LISTENING' if not item.failed else 'NOT AVAILABLE' }}"
loop: "{{ ssl_ports.results }}"
# Test HTTPS connection
- name: Test HTTPS connection to localhost
uri:
url: "https://{{ ansible_default_ipv4.address }}"
method: GET
validate_certs: no
timeout: 5
register: https_test
failed_when: false
when: ssl_ports.results[0] is defined and not ssl_ports.results[0].failed
- name: Display HTTPS test result
debug:
msg: "🌐 HTTPS Test: {{ 'SUCCESS' if https_test.status is defined else 'FAILED' }}"
when: https_test is defined
# Check the certificate chain
- name: Check certificate chain for HTTPS
shell: |
echo | openssl s_client -connect {{ ansible_default_ipv4.address }}:443 -servername {{ ansible_hostname }} 2>/dev/null | openssl x509 -noout -subject -issuer
register: cert_chain
failed_when: false
when: ssl_ports.results[0] is defined and not ssl_ports.results[0].failed
- name: Display certificate chain info
debug:
msg: "🔗 Certificate Chain: {{ cert_chain.stdout_lines }}"
when: cert_chain is defined and cert_chain.rc == 0
# Generate certificate health report
- name: Generate certificate health summary
debug:
msg: |
🔐 Certificate Health Summary for {{ inventory_hostname }}:
📁 Certificate directories found: {{ (cert_dirs.results | selectattr('stat.exists') | list | length) }}
📄 Certificate files found: {{ cert_files.files | length }}
⚠️ Certificates expiring soon: {{ (cert_warnings.results | selectattr('stdout', 'search', 'WARNING') | list | length) }}
🔒 Let's Encrypt: {{ 'Configured' if 'not installed' not in letsencrypt_certs.stdout else 'Not installed' }}
🌐 SSL Services: {{ (ssl_ports.results | rejectattr('failed') | list | length) }}/{{ ssl_services | length }} available

View File

@@ -1,119 +0,0 @@
---
- name: Security Hardening and Backup
hosts: all
become: yes
gather_facts: yes
tasks:
# SSH security configuration check
- name: Check SSH configuration security
lineinfile:
path: /etc/ssh/sshd_config
regexp: "{{ item.regexp }}"
line: "{{ item.line }}"
backup: yes
loop:
- { regexp: '^#?PermitRootLogin', line: 'PermitRootLogin no' }
- { regexp: '^#?PasswordAuthentication', line: 'PasswordAuthentication no' }
- { regexp: '^#?X11Forwarding', line: 'X11Forwarding no' }
- { regexp: '^#?MaxAuthTries', line: 'MaxAuthTries 3' }
notify: restart ssh
when: ansible_os_family == "Debian"
# Firewall status check
- name: Check UFW firewall status
shell: ufw status
register: ufw_status
changed_when: false
failed_when: false
when: ansible_os_family == "Debian"
- name: Display firewall status
debug:
msg: "🔥 Firewall Status: {{ ufw_status.stdout_lines }}"
when: ansible_os_family == "Debian" and ufw_status.stdout_lines is defined
# Check for suspicious logins
- name: Check for failed login attempts
shell: grep "Failed password" /var/log/auth.log | tail -10
register: failed_logins
changed_when: false
failed_when: false
- name: Report suspicious login attempts
debug:
msg: "🚨 Recent failed logins: {{ failed_logins.stdout_lines }}"
when: failed_logins.stdout_lines | length > 0
# Check root user activity
- name: Check recent root activity
shell: grep "sudo.*root" /var/log/auth.log | tail -5
register: root_activity
changed_when: false
failed_when: false
- name: Display root activity
debug:
msg: "👑 Recent root activity: {{ root_activity.stdout_lines }}"
when: root_activity.stdout_lines | length > 0
# Back up important configuration files
- name: Create backup directory
file:
path: /backup/configs
state: directory
mode: '0700'
- name: Backup important configuration files
copy:
src: "{{ item }}"
dest: "/backup/configs/{{ item | basename }}.{{ ansible_date_time.epoch }}"
remote_src: yes
backup: yes
loop:
- /etc/ssh/sshd_config
- /etc/hosts
- /etc/fstab
- /etc/crontab
failed_when: false
# Check system integrity
- name: Check for world-writable files
shell: find /etc /usr /bin /sbin -type f -perm -002 2>/dev/null | head -10
register: world_writable
changed_when: false
- name: Report world-writable files
debug:
msg: "⚠️ World-writable files found: {{ world_writable.stdout_lines }}"
when: world_writable.stdout_lines | length > 0
# Check SUID files
- name: Check for SUID files
shell: find /usr /bin /sbin -type f -perm -4000 2>/dev/null
register: suid_files
changed_when: false
- name: Display SUID files count
debug:
msg: "🔐 Found {{ suid_files.stdout_lines | length }} SUID files"
# Sync system time
- name: Sync system time
shell: timedatectl set-ntp true
failed_when: false
- name: Check time synchronization
shell: timedatectl status
register: time_status
- name: Display time sync status
debug:
msg: "🕐 Time sync: {{ time_status.stdout_lines | select('match', '.*synchronized.*') | list }}"
handlers:
- name: restart ssh
systemd:
name: ssh
state: restarted
when: ansible_os_family == "Debian"

View File

@@ -0,0 +1,187 @@
---
- name: Deploy Telegraf disk monitoring to the Nomad cluster
  hosts: all
  become: yes
  vars:
    # InfluxDB 2.x / Grafana connection settings (influxdb_url, influxdb_token,
    # influxdb_org, influxdb_bucket, telegraf_config_url) come from
    # inventories/production/group_vars/all.yml and can be overridden with -e.
    # Remote-config mode takes priority whenever telegraf_config_url is set.
    # Note: redefining them here as "{{ influxdb_url | default(...) }}" would
    # template a variable in terms of itself and trip Ansible's recursive-loop
    # error, so only the plain defaults live in this play.
    # Disk usage thresholds
    disk_usage_warning: 80   # warn at 80% usage
    disk_usage_critical: 90  # critical alert at 90% usage
    # Collection interval (seconds)
    collection_interval: 30
  tasks:
    - name: Show the node being processed
      debug:
        msg: "🔧 Installing disk monitoring on node {{ inventory_hostname }}"

    - name: Add the InfluxData repository key
      apt_key:
        url: https://repos.influxdata.com/influxdata-archive_compat.key
        state: present
      retries: 3
      delay: 5

    - name: Add the InfluxData repository
      apt_repository:
        repo: "deb https://repos.influxdata.com/ubuntu {{ ansible_distribution_release }} stable"
        state: present
        update_cache: yes
      retries: 3
      delay: 5

    - name: Install Telegraf
      apt:
        name: telegraf
        state: present
        update_cache: yes
      retries: 3
      delay: 10

    - name: Create the Telegraf config directory
      file:
        path: /etc/telegraf/telegraf.d
        state: directory
        owner: telegraf
        group: telegraf
        mode: '0755'

    - name: Remove local Telegraf log files (saves disk space)
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /var/log/telegraf
        - /var/log/telegraf.log
      ignore_errors: yes

    - name: Create the Telegraf environment file
      template:
        src: telegraf-env.j2
        dest: /etc/default/telegraf
        owner: root
        group: root
        mode: '0600'
        backup: yes
      notify: restart telegraf

    - name: Create the Telegraf systemd unit (remote-config mode)
      template:
        src: telegraf.service.j2
        dest: /etc/systemd/system/telegraf.service
        owner: root
        group: root
        mode: '0644'
        backup: yes
      notify:
        - reload systemd
        - restart telegraf
      when: telegraf_config_url is defined and telegraf_config_url != ''

    - name: Generate the main Telegraf config (local-config mode)
      template:
        src: telegraf.conf.j2
        dest: /etc/telegraf/telegraf.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: yes
      notify: restart telegraf
      when: telegraf_config_url is not defined or telegraf_config_url == ''

    - name: Generate the disk monitoring config
      template:
        src: disk-monitoring.conf.j2
        dest: /etc/telegraf/telegraf.d/disk-monitoring.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: yes
      notify: restart telegraf

    - name: Generate the system monitoring config
      template:
        src: system-monitoring.conf.j2
        dest: /etc/telegraf/telegraf.d/system-monitoring.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: yes
      notify: restart telegraf

    - name: Enable and start the Telegraf service
      systemd:
        name: telegraf
        state: started
        enabled: yes
        daemon_reload: yes

    - name: Check Telegraf status
      systemd:
        name: telegraf
      register: telegraf_status

    - name: Check the InfluxDB connection
      uri:
        url: "{{ influxdb_url }}/ping"
        method: GET
        timeout: 5
      register: influxdb_ping
      ignore_errors: yes
      delegate_to: localhost
      run_once: true

    - name: Show InfluxDB connection status
      debug:
        msg: "{{ '✅ InfluxDB reachable' if influxdb_ping.status == 204 else '❌ InfluxDB unreachable - check the connection settings' }}"
      run_once: true

    - name: Show Telegraf status
      debug:
        msg: "✅ Telegraf state: {{ telegraf_status.status.ActiveState }}"

    - name: Check disk usage
      shell: |
        df -h | grep -vE '^Filesystem|tmpfs|cdrom|udev' | awk '{print $5 " " $1 " " $6}' | while read output;
        do
          usage=$(echo $output | awk '{print $1}' | sed 's/%//g')
          partition=$(echo $output | awk '{print $2}')
          mount=$(echo $output | awk '{print $3}')
          if [ $usage -ge {{ disk_usage_warning }} ]; then
            echo "⚠️ WARNING: $mount ($partition) at ${usage}% usage"
          else
            echo "✅ $mount ($partition) at ${usage}% usage"
          fi
        done
      register: disk_check
      changed_when: false

    - name: Show disk check results
      debug:
        msg: "{{ disk_check.stdout_lines }}"

  handlers:
    - name: reload systemd
      systemd:
        daemon_reload: yes

    - name: restart telegraf
      systemd:
        name: telegraf
        state: restarted
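Once a node has reported for a few intervals, data arrival can be confirmed with the influx CLI from any machine that can reach the server (org, bucket, and host match group_vars/all.yml; the token comes from the environment):

# Minimal sketch: show disk points written in the last 5 minutes
influx query --host http://influxdb1.tailnet-68f9.ts.net:8086 --org nomad --token "$INFLUX_TOKEN" \
  'from(bucket: "nomad_monitoring") |> range(start: -5m) |> filter(fn: (r) => r._measurement == "disk") |> limit(n: 5)'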

View File

@@ -0,0 +1,76 @@
---
- name: Install and configure new Nomad server nodes
  hosts: ash2e,ash1d,ch2
  become: yes
  gather_facts: no
  tasks:
    - name: Update package cache
      apt:
        update_cache: yes
        cache_valid_time: 3600
      retries: 3
      delay: 10

    - name: Install dependency packages
      apt:
        name:
          - wget
          - curl
          - unzip
          - podman
          - buildah
          - skopeo
        state: present
      retries: 3
      delay: 10

    - name: Check whether Nomad is already installed
      shell: which nomad || echo "not_found"
      register: nomad_check
      changed_when: false

    - name: Download and install Nomad
      block:
        - name: Download Nomad 1.10.5
          get_url:
            url: "https://releases.hashicorp.com/nomad/1.10.5/nomad_1.10.5_linux_amd64.zip"
            dest: "/tmp/nomad.zip"
            mode: '0644'

        - name: Unpack Nomad
          unarchive:
            src: "/tmp/nomad.zip"
            dest: "/usr/bin/"
            remote_src: yes
            owner: root
            group: root
            mode: '0755'

        - name: Clean up the temporary archive
          file:
            path: "/tmp/nomad.zip"
            state: absent
      when: nomad_check.stdout == "not_found"

    - name: Verify the Nomad installation
      shell: nomad version
      register: nomad_version_output
      changed_when: false

    - name: Show installation results
      debug:
        msg: |
          ✅ Node {{ inventory_hostname }} software installation complete
          📦 Podman: {{ ansible_facts.packages.podman[0].version if ansible_facts.packages.podman is defined else 'not collected (requires package_facts)' }}
          🎯 Nomad: {{ nomad_version_output.stdout.split('\n')[0] }}

    - name: Enable the Podman socket
      systemd:
        name: podman.socket
        enabled: yes
        state: started
      ignore_errors: yes

    - name: Hand off to full configuration
      debug:
        msg: "Software installation complete; now run the full Nomad configuration playbook..."
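Once the follow-up configuration play has the agents running, membership is easy to confirm (server address as used elsewhere in this repo; any reachable agent works):

# List the server members the new nodes should appear in
nomad server members -address=http://100.116.158.95:4646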

View File

@@ -0,0 +1,68 @@
# Disk monitoring configuration
# Tracks usage and inode consumption for the listed mount points.

# Disk usage (the disk input also reports inode fields, so no separate
# inode-only input block is needed - a duplicate would double-write metrics)
[[inputs.disk]]
  ## File system types to ignore
  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
  ## Restrict to these mount points; comment out to monitor everything mounted
  mount_points = ["/", "/var", "/tmp", "/opt", "/home"]
  ## Tags
  [inputs.disk.tags]
    service = "disk-monitoring"

# Disk I/O
[[inputs.diskio]]
  ## Devices to monitor; comment out to monitor all block devices
  devices = ["sda", "sdb", "sdc", "sdd", "nvme0n1", "nvme1n1"]
  ## Skip serial number collection for performance
  skip_serial_number = true
  [inputs.diskio.tags]
    service = "disk-io-monitoring"

# Process monitoring (optional - watch processes likely to consume disk)
[[inputs.procstat]]
  ## Docker processes (if any remain)
  pattern = "docker"
  [inputs.procstat.tags]
    service = "docker-process"

[[inputs.procstat]]
  ## Podman processes
  pattern = "podman"
  [inputs.procstat.tags]
    service = "podman-process"

[[inputs.procstat]]
  ## Nomad processes
  pattern = "nomad"
  [inputs.procstat.tags]
    service = "nomad-process"

# Log file size monitoring
[[inputs.filestat]]
  files = [
    "/var/log/nomad/*.log",
    "/var/log/syslog",
    "/var/log/kern.log",
    "/var/log/auth.log"
  ]
  [inputs.filestat.tags]
    service = "log-monitoring"
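Before trusting the dashboards, the rendered inputs can be dry-run on a node; Telegraf prints the metrics it would send and exits without writing anything:

# Sample only the disk inputs once, without touching InfluxDB
telegraf --config /etc/telegraf/telegraf.conf --config-directory /etc/telegraf/telegraf.d \
  --input-filter disk:diskio --test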

View File

@@ -0,0 +1,68 @@
# System monitoring configuration
# CPU, memory, network, and other system resources.

# CPU
[[inputs.cpu]]
  ## Collect per-core metrics
  percpu = true
  ## Collect aggregate CPU metrics
  totalcpu = true
  ## Field collection
  collect_cpu_time = false
  ## Report active CPU time
  report_active = false
  [inputs.cpu.tags]
    service = "cpu-monitoring"

# Memory
[[inputs.mem]]
  [inputs.mem.tags]
    service = "memory-monitoring"

# Network interfaces
[[inputs.net]]
  ## Interfaces to watch
  interfaces = ["eth*", "en*", "tailscale*"]
  [inputs.net.tags]
    service = "network-monitoring"

# System load
[[inputs.system]]
  [inputs.system.tags]
    service = "system-load"

# Kernel statistics
[[inputs.kernel]]
  [inputs.kernel.tags]
    service = "kernel-stats"

# Network statistics
[[inputs.netstat]]
  [inputs.netstat.tags]
    service = "network-stats"

# Swap
[[inputs.swap]]
  [inputs.swap.tags]
    service = "swap-monitoring"

# Service state
[[inputs.systemd_units]]
  ## Units to watch
  units = ["nomad.service", "docker.service", "podman.service", "telegraf.service", "tailscaled.service"]
  [inputs.systemd_units.tags]
    service = "service-monitoring"

# Disk health (where SMART is supported)
[[inputs.smart]]
  ## Path to smartctl
  path_smartctl = "/usr/sbin/smartctl"
  ## Timeout
  timeout = "30s"
  [inputs.smart.tags]
    service = "smart-monitoring"
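One caveat: smartctl normally requires root, while this Telegraf runs as the telegraf user, so the smart input will stay empty until that is addressed. A common approach (an assumption, not part of this change) is a narrowly scoped sudoers entry plus use_sudo = true under [[inputs.smart]]:

# Let the telegraf user run smartctl - and nothing else - as root
echo 'telegraf ALL=(root) NOPASSWD: /usr/sbin/smartctl' | sudo tee /etc/sudoers.d/telegraf-smartctl
sudo chmod 0440 /etc/sudoers.d/telegraf-smartctl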

View File

@@ -0,0 +1,7 @@
# Telegraf environment variables
# InfluxDB 2.x credentials
INFLUX_TOKEN={{ influxdb_token }}
INFLUX_ORG={{ influxdb_org }}
INFLUX_BUCKET={{ influxdb_bucket }}
INFLUX_URL={{ influxdb_url }}
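The token in this file authenticates both the metric writes and the remote-config fetch. A quick sanity check of the rendered file before restarting Telegraf (a sketch; the buckets endpoint is standard InfluxDB 2.x API):

# Source the env file and ask InfluxDB whether the token can see the bucket
set -a; . /etc/default/telegraf; set +a
curl -s -o /dev/null -w "%{http_code}\n" -H "Authorization: Token ${INFLUX_TOKEN}" \
  "${INFLUX_URL}/api/v2/buckets?org=${INFLUX_ORG}&name=${INFLUX_BUCKET}"
# 200 = token, org, and bucket line up; 401/404 = recheck the values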

View File

@@ -0,0 +1,53 @@
# Telegraf main configuration
# Disk monitoring for the Nomad cluster.

# Global tags
[global_tags]
  nomad_cluster = "production"
  node_role = "{{ nomad_role | default('unknown') }}"
  hostname = "{{ inventory_hostname }}"

# Agent settings
[agent]
  interval = "{{ collection_interval | default(30) }}s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "2s"
  flush_interval = "10s"
  flush_jitter = "0s"
  precision = ""
  hostname = "{{ inventory_hostname }}"
  omit_hostname = false
  ## Logging - Telegraf has no standalone [log] section, so these agent
  ## options keep local disk usage at zero: errors only, written to stderr
  ## where journald captures them. No local log file, nothing to rotate.
  quiet = true
  logtarget = "stderr"

# Output - InfluxDB 2.x
[[outputs.influxdb_v2]]
  urls = ["{{ influxdb_url }}"]
  token = "{{ influxdb_token }}"
  organization = "{{ influxdb_org | default('nomad') }}"
  bucket = "{{ influxdb_bucket | default('nomad_monitoring') }}"
  ## Connection settings
  timeout = "10s"
  max_retries = 3
  retry_timeout = "5s"
  ## Timestamp precision
  precision = "s"
  ## TLS settings (if ever needed)
  # tls_ca = "/etc/telegraf/ca.pem"
  # tls_cert = "/etc/telegraf/cert.pem"
  # tls_key = "/etc/telegraf/key.pem"
  # insecure_skip_verify = false

View File

@@ -0,0 +1,29 @@
[Unit]
Description=Telegraf - node monitoring agent
Documentation=https://github.com/influxdata/telegraf
# The config is fetched over HTTP at startup, so wait for the network to be up
Wants=network-online.target
After=network-online.target

[Service]
Type=notify
User=telegraf
Group=telegraf
ExecStart=/usr/bin/telegraf --config {{ telegraf_config_url }}
ExecReload=/bin/kill -HUP $MAINPID
KillMode=control-group
Restart=on-failure
RestartSec=5
TimeoutStopSec=20
EnvironmentFile=/etc/default/telegraf

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=/var/lib/telegraf
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true

[Install]
WantedBy=multi-user.target
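When started with a config URL, Telegraf uses the INFLUX_TOKEN environment variable as the authorization token for the fetch, so this unit works as long as the token in /etc/default/telegraf can read the stored config. The fetch can be reproduced by hand (a sketch; Accept header per the InfluxDB 2.x telegrafs API):

# Fetch the stored Telegraf config the same way the service does
curl -fsS -H "Authorization: Token $INFLUX_TOKEN" -H "Accept: application/toml" \
  "http://influxdb1.tailnet-68f9.ts.net:8086/api/v2/telegrafs/0f8a73496790c000" | head -20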