feat(监控): 添加Telegraf监控配置和磁盘监控脚本
refactor(容器): 从Docker迁移到Podman并更新Nomad配置
fix(配置): 修复代理和别名配置问题
docs(文档): 更新配置文件和脚本注释
chore(清理): 移除不再使用的Consul和Docker相关文件
This commit is contained in:
		
							parent
							
								
									3f45ad8361
								
							
						
					
					
						commit
						d0e7f64c1d
					
				|  | @ -0,0 +1,46 @@ | |||
#!/usr/bin/env bash
# Deploy disk monitoring (Telegraf) to every node of the Nomad cluster,
# reporting into the existing InfluxDB + Grafana stack.
#
# Usage: run from the directory that contains inventories/ and playbooks/
# (all paths below are relative). The script prints the monitoring-related
# settings, asks for confirmation, then runs the disk-monitoring playbook
# against the production inventory.
#
# Exit status: 0 on success; 1 if the config file is missing, the operator
# declines, or the playbook fails.

set -euo pipefail

readonly CONFIG_FILE="inventories/production/group_vars/all.yml"
readonly INVENTORY="inventories/production/nomad-cluster.ini"
readonly PLAYBOOK="playbooks/setup-disk-monitoring.yml"

# Print a diagnostic to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

echo "🚀 开始部署 Nomad 集群硬盘监控..."

# Refuse to start when the monitoring settings file is absent.
[[ -f "$CONFIG_FILE" ]] || die "❌ 配置文件不存在,请先配置 InfluxDB 连接信息"

# Show the relevant settings for review. The value of influxdb_token is masked
# so the secret never lands on screen or in terminal logs. `|| true` keeps a
# config without any matching key from aborting under set -e / pipefail.
echo "📋 当前监控配置:"
grep -E "influxdb_|disk_usage_|collection_interval" -- "$CONFIG_FILE" \
  | sed -E 's/^([[:space:]]*influxdb_token:[[:space:]]*).*/\1"********"/' \
  || true

echo ""
# -r keeps backslashes literal; a failed read (EOF, no TTY) is treated as "no".
confirm=""
read -r -p "🤔 确认配置正确吗?(y/N): " confirm || true
if [[ "$confirm" != [yY] ]]; then
  die "❌ 部署取消,请修改配置后重试"
fi

echo "📦 开始安装 Telegraf 到所有节点..."
# Test the command itself rather than inspecting $? afterwards, so no statement
# can slip in between and clobber the status.
if ! ansible-playbook -i "$INVENTORY" "$PLAYBOOK"; then
  die "❌ 部署失败,请检查错误信息"
fi

echo "✅ 硬盘监控部署完成!"
echo ""
echo "📊 监控信息:"
echo "- 数据将发送到你现有的 InfluxDB"
echo "- 可以在 Grafana 中创建仪表板查看数据"
echo "- 已禁用本地日志文件以节省硬盘空间"
echo "- 监控数据每30秒收集一次"
echo ""
echo "🔧 下一步:"
echo "1. 在 Grafana 中创建 Nomad 集群监控仪表板"
echo "2. 设置硬盘使用率告警规则"
echo "3. 可以运行以下命令检查监控状态:"
echo "   ansible all -i ${INVENTORY} -m shell -a 'systemctl status telegraf'"
|  | @ -0,0 +1,40 @@ | |||
#!/usr/bin/env bash
# Deploy Telegraf monitoring to all nodes using a Telegraf configuration that
# is served remotely by InfluxDB 2.x.
#
# Required environment:
#   INFLUX_TOKEN  InfluxDB API token; handed to the playbook as `influxdb_token`.
#                 It is deliberately NOT stored in this script. The token that
#                 was previously committed here should be rotated.
#
# Usage: run from the directory that contains inventories/ and playbooks/.
# Exit status: 0 on success; 1 if the token is missing, InfluxDB is
# unreachable/unhealthy, or the playbook fails.

set -euo pipefail

readonly INFLUX_HOST="influxdb1.tailnet-68f9.ts.net:8086"
readonly INFLUX_URL="http://${INFLUX_HOST}"
readonly TELEGRAF_CONFIG_URL="${INFLUX_URL}/api/v2/telegrafs/0f8a73496790c000"
readonly INVENTORY="inventories/production/nomad-cluster.ini"
readonly PLAYBOOK="playbooks/setup-disk-monitoring.yml"

# Print a diagnostic to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

echo "🚀 使用 InfluxDB 2.x 远程配置部署 Telegraf 监控..."

[[ -n "${INFLUX_TOKEN:-}" ]] \
  || die "❌ 未设置 INFLUX_TOKEN 环境变量,请先导出 InfluxDB 令牌"

# Connectivity check. -f makes curl fail on HTTP >= 400, so an unhealthy
# InfluxDB answering 5xx is no longer reported as reachable.
echo "🔍 检查 InfluxDB 连接..."
if curl -fsS --max-time 5 "${INFLUX_URL}/health" > /dev/null; then
  echo "✅ InfluxDB 连接正常"
else
  die "❌ 无法连接到 InfluxDB,请检查网络"
fi

# Hand the token to Ansible through a private temp file (-e @file) instead of
# argv, where it would be visible to every local user via `ps`. printf is a
# shell builtin, so the token never appears on a process command line.
vars_file=$(mktemp) || die "❌ 无法创建临时文件"
trap 'rm -f -- "$vars_file"' EXIT
# Escape backslashes and double quotes so the value is a valid JSON string.
escaped_token=${INFLUX_TOKEN//\\/\\\\}
escaped_token=${escaped_token//\"/\\\"}
printf '{"influxdb_token": "%s"}\n' "$escaped_token" > "$vars_file"

echo "📦 开始部署到所有节点..."
if ! ansible-playbook -i "$INVENTORY" "$PLAYBOOK" \
  -e "use_remote_config=true" \
  -e "telegraf_config_url=$TELEGRAF_CONFIG_URL" \
  -e "@$vars_file"; then
  die "❌ 部署失败,请检查错误信息"
fi

echo "✅ Telegraf 监控部署完成!"
echo ""
echo "📊 配置信息:"
echo "- 使用远程配置: $TELEGRAF_CONFIG_URL"
echo "- InfluxDB 服务器: ${INFLUX_HOST}"
echo "- 已禁用本地日志文件"
echo ""
echo "🔧 验证部署:"
echo "ansible all -i ${INVENTORY} -m shell -a 'systemctl status telegraf --no-pager'"
|  | @ -1,14 +0,0 @@ | |||
| { | ||||
|   "proxies": { | ||||
|     "http-proxy": "http://istoreos.tailnet-68f9.ts.net:7891", | ||||
|     "https-proxy": "http://istoreos.tailnet-68f9.ts.net:7891", | ||||
|     "no-proxy": "localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net" | ||||
|   }, | ||||
|   "registry-mirrors": [], | ||||
|   "insecure-registries": [], | ||||
|   "debug": false, | ||||
|   "experimental": false, | ||||
|   "features": { | ||||
|     "buildkit": true | ||||
|   } | ||||
| } | ||||
|  | @ -0,0 +1,20 @@ | |||
| # Global configuration for the Nomad cluster | ||||
| # Monitoring settings for InfluxDB 2.x + Grafana | ||||
| 
 | ||||
| # InfluxDB 2.x connection settings | ||||
| # NOTE(review): influxdb_token below is committed in plaintext — consider Ansible Vault or an environment lookup, and rotate this token. | ||||
| influxdb_url: "http://influxdb1.tailnet-68f9.ts.net:8086" | ||||
| influxdb_token: "VU_dOCVZzqEHb9jSFsDe0bJlEBaVbiG4LqfoczlnmcbfrbmklSt904HJPL4idYGvVi0c2eHkYDi2zCTni7Ay4w==" | ||||
| influxdb_org: "nomad"                     # organization name | ||||
| influxdb_bucket: "nomad_monitoring"        # bucket name | ||||
| 
 | ||||
| # URL of the remotely hosted Telegraf configuration | ||||
| telegraf_config_url: "http://influxdb1.tailnet-68f9.ts.net:8086/api/v2/telegrafs/0f8a73496790c000" | ||||
| 
 | ||||
| # Monitoring settings | ||||
| disk_usage_warning: 80               # disk-usage warning threshold | ||||
| disk_usage_critical: 90              # disk-usage critical-alert threshold | ||||
| collection_interval: 30              # data collection interval (seconds) | ||||
| 
 | ||||
| # Telegraf tuning | ||||
| telegraf_log_level: "ERROR"          # log errors only | ||||
| telegraf_disable_local_logs: true    # disable local log files | ||||
|  | @ -1,10 +1,20 @@ | |||
| [nomad_servers] | ||||
| master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 | ||||
| semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3 | ||||
| ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 | ||||
| semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=6 | ||||
| ash2e ansible_host=ash2e ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6 | ||||
| ash1d ansible_host=ash1d ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6 | ||||
| ch2 ansible_host=ch2 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6 | ||||
| ch3 ansible_host=ch3 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6 | ||||
| # 新增的 Mac 和 Windows 节点(请替换为实际的 Tailscale IP) | ||||
| mac-laptop ansible_host=100.xxx.xxx.xxx ansible_user=your_mac_user nomad_role=server nomad_bootstrap_expect=6 | ||||
| win-laptop ansible_host=100.xxx.xxx.xxx ansible_user=your_win_user nomad_role=server nomad_bootstrap_expect=6 | ||||
| 
 | ||||
| [nomad_clients] | ||||
| # 如果需要客户端节点,可以在这里添加 | ||||
| master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client | ||||
| ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client | ||||
| hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client | ||||
| hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client | ||||
| hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client | ||||
| syd ansible_host=100.117.137.105 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client | ||||
| 
 | ||||
| [nomad_cluster:children] | ||||
| nomad_servers | ||||
|  |  | |||
|  | @ -0,0 +1,22 @@ | |||
| # Ansible inventory for the Nomad cluster: server and client hosts, combined | ||||
| # under nomad_cluster, which also carries the shared connection and Nomad variables. | ||||
| # NOTE(review): ansible_become_pass values and nomad_encrypt_key are stored in | ||||
| # plaintext here — consider moving them to Ansible Vault; TODO confirm with the repo owner. | ||||
| [nomad_servers] | ||||
| master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 | ||||
| semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3 | ||||
| ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 | ||||
| 
 | ||||
| [nomad_clients] | ||||
| hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client | ||||
| hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client | ||||
| hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client | ||||
| 
 | ||||
| [nomad_cluster:children] | ||||
| nomad_servers | ||||
| nomad_clients | ||||
| 
 | ||||
| # Defaults for every host in nomad_cluster; the per-host ansible_user values above take precedence. | ||||
| [nomad_cluster:vars] | ||||
| ansible_ssh_private_key_file=~/.ssh/id_ed25519 | ||||
| ansible_user=ben | ||||
| ansible_become=yes | ||||
| nomad_version=1.10.5 | ||||
| nomad_datacenter=dc1 | ||||
| nomad_region=global | ||||
| nomad_encrypt_key=NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ= | ||||
|  | @ -0,0 +1,23 @@ | |||
| # Ansible inventory for the Nomad cluster: server and client hosts (including syd), | ||||
| # combined under nomad_cluster, which also carries the shared connection and Nomad variables. | ||||
| # NOTE(review): ansible_become_pass values and nomad_encrypt_key are stored in | ||||
| # plaintext here — consider moving them to Ansible Vault; TODO confirm with the repo owner. | ||||
| [nomad_servers] | ||||
| master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 | ||||
| semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3 | ||||
| ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 | ||||
| 
 | ||||
| [nomad_clients] | ||||
| hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client | ||||
| hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client | ||||
| hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client | ||||
| syd ansible_host=100.117.137.105 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client | ||||
| 
 | ||||
| [nomad_cluster:children] | ||||
| nomad_servers | ||||
| nomad_clients | ||||
| 
 | ||||
| # Defaults for every host in nomad_cluster; the per-host ansible_user values above take precedence. | ||||
| [nomad_cluster:vars] | ||||
| ansible_ssh_private_key_file=~/.ssh/id_ed25519 | ||||
| ansible_user=ben | ||||
| ansible_become=yes | ||||
| nomad_version=1.10.5 | ||||
| nomad_datacenter=dc1 | ||||
| nomad_region=global | ||||
| nomad_encrypt_key=NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ= | ||||
|  | @ -1,183 +0,0 @@ | |||
| --- | ||||
| - name: Setup Automated Maintenance Cron Jobs | ||||
|   hosts: localhost | ||||
|   gather_facts: no | ||||
|    | ||||
|   vars: | ||||
|     # 定时任务配置 | ||||
|     cron_jobs: | ||||
|       # 每日快速检查 | ||||
|       - name: "Daily system health check" | ||||
|         job: "cd /root/mgmt && ./scripts/ops-manager.sh toolkit all --check > /var/log/daily-health-check.log 2>&1" | ||||
|         minute: "0" | ||||
|         hour: "8" | ||||
|         day: "*" | ||||
|         month: "*" | ||||
|         weekday: "*" | ||||
|          | ||||
|       # 每周系统清理 | ||||
|       - name: "Weekly system cleanup" | ||||
|         job: "cd /root/mgmt && ./scripts/ops-manager.sh cleanup all > /var/log/weekly-cleanup.log 2>&1" | ||||
|         minute: "0" | ||||
|         hour: "2" | ||||
|         day: "*" | ||||
|         month: "*" | ||||
|         weekday: "0"  # Sunday | ||||
|          | ||||
|       # 每月安全检查 | ||||
|       - name: "Monthly security hardening check" | ||||
|         job: "cd /root/mgmt && ./scripts/ops-manager.sh security all --check > /var/log/monthly-security-check.log 2>&1" | ||||
|         minute: "0" | ||||
|         hour: "3" | ||||
|         day: "1" | ||||
|         month: "*" | ||||
|         weekday: "*" | ||||
|          | ||||
|       # 每周证书检查 | ||||
|       - name: "Weekly certificate check" | ||||
|         job: "cd /root/mgmt && ./scripts/ops-manager.sh cert all > /var/log/weekly-cert-check.log 2>&1" | ||||
|         minute: "30" | ||||
|         hour: "4" | ||||
|         day: "*" | ||||
|         month: "*" | ||||
|         weekday: "1"  # Monday | ||||
|          | ||||
|       # 每日 Docker 清理 (仅 LXC 组) | ||||
|       - name: "Daily Docker cleanup for LXC" | ||||
|         job: "cd /root/mgmt && ansible lxc -i ansible/inventory.ini -m shell -a 'docker system prune -f' --become -e 'ansible_ssh_pass=313131' > /var/log/daily-docker-cleanup.log 2>&1" | ||||
|         minute: "0" | ||||
|         hour: "1" | ||||
|         day: "*" | ||||
|         month: "*" | ||||
|         weekday: "*" | ||||
|          | ||||
|       # 每周网络连通性检查 | ||||
|       - name: "Weekly network connectivity check" | ||||
|         job: "cd /root/mgmt && ./scripts/ops-manager.sh network all > /var/log/weekly-network-check.log 2>&1" | ||||
|         minute: "0" | ||||
|         hour: "6" | ||||
|         day: "*" | ||||
|         month: "*" | ||||
|         weekday: "2"  # Tuesday | ||||
|    | ||||
|   tasks: | ||||
|     # 创建日志目录 | ||||
|     - name: Create log directory | ||||
|       file: | ||||
|         path: /var/log/ansible-automation | ||||
|         state: directory | ||||
|         mode: '0755' | ||||
|       become: yes | ||||
|        | ||||
|     # 设置脚本执行权限 | ||||
|     - name: Make ops-manager.sh executable | ||||
|       file: | ||||
|         path: /root/mgmt/scripts/ops-manager.sh | ||||
|         mode: '0755' | ||||
|          | ||||
|     # 创建定时任务 | ||||
|     - name: Setup cron jobs for automated maintenance | ||||
|       cron: | ||||
|         name: "{{ item.name }}" | ||||
|         job: "{{ item.job }}" | ||||
|         minute: "{{ item.minute }}" | ||||
|         hour: "{{ item.hour }}" | ||||
|         day: "{{ item.day }}" | ||||
|         month: "{{ item.month }}" | ||||
|         weekday: "{{ item.weekday }}" | ||||
|         user: root | ||||
|       loop: "{{ cron_jobs }}" | ||||
|       become: yes | ||||
|        | ||||
|     # 创建日志轮转配置 | ||||
|     - name: Setup log rotation for automation logs | ||||
|       copy: | ||||
|         content: | | ||||
|           /var/log/*-health-check.log | ||||
|           /var/log/*-cleanup.log | ||||
|           /var/log/*-security-check.log | ||||
|           /var/log/*-cert-check.log | ||||
|           /var/log/*-docker-cleanup.log | ||||
|           /var/log/*-network-check.log { | ||||
|               daily | ||||
|               missingok | ||||
|               rotate 30 | ||||
|               compress | ||||
|               delaycompress | ||||
|               notifempty | ||||
|               copytruncate | ||||
|           } | ||||
|         dest: /etc/logrotate.d/ansible-automation | ||||
|         mode: '0644' | ||||
|       become: yes | ||||
|        | ||||
|     # 创建监控脚本 | ||||
|     - name: Create monitoring dashboard script | ||||
|       copy: | ||||
|         content: | | ||||
|           #!/bin/bash | ||||
|           # Automation Monitoring Dashboard | ||||
|            | ||||
|           echo "🤖 Ansible Automation Status Dashboard" | ||||
|           echo "======================================" | ||||
|           echo "" | ||||
|            | ||||
|           echo "📅 Last Execution Times:" | ||||
|           echo "------------------------" | ||||
|           for log in /var/log/*-check.log /var/log/*-cleanup.log; do | ||||
|               if [ -f "$log" ]; then | ||||
|                   echo "$(basename "$log" .log): $(stat -c %y "$log" | cut -d. -f1)" | ||||
|               fi | ||||
|           done | ||||
|           echo "" | ||||
|            | ||||
|           echo "📊 Recent Log Summary:" | ||||
|           echo "---------------------" | ||||
|           for log in /var/log/daily-health-check.log /var/log/weekly-cleanup.log; do | ||||
|               if [ -f "$log" ]; then | ||||
|                   echo "=== $(basename "$log") ===" | ||||
|                   tail -5 "$log" | grep -E "(TASK|PLAY RECAP|ERROR|WARNING)" || echo "No recent activity" | ||||
|                   echo "" | ||||
|               fi | ||||
|           done | ||||
|            | ||||
|           echo "⏰ Next Scheduled Jobs:" | ||||
|           echo "----------------------" | ||||
|           crontab -l | grep -E "(health|cleanup|security|cert|docker|network)" | while read line; do | ||||
|               echo "$line" | ||||
|           done | ||||
|           echo "" | ||||
|            | ||||
|           echo "💾 Log File Sizes:" | ||||
|           echo "-----------------" | ||||
|           ls -lh /var/log/*-*.log 2>/dev/null | awk '{print $5, $9}' || echo "No log files found" | ||||
|         dest: /usr/local/bin/automation-status | ||||
|         mode: '0755' | ||||
|       become: yes | ||||
|        | ||||
|     # 显示设置完成信息 | ||||
|     - name: Display setup completion info | ||||
|       debug: | ||||
|         msg: | | ||||
|           🎉 自动化定时任务设置完成! | ||||
|            | ||||
|           📋 已配置的定时任务: | ||||
|           • 每日 08:00 - 系统健康检查 | ||||
|           • 每日 01:00 - Docker 清理 (LXC 组) | ||||
|           • 每周日 02:00 - 系统清理 | ||||
|           • 每周一 04:30 - 证书检查 | ||||
|           • 每周二 06:00 - 网络连通性检查 | ||||
|           • 每月1日 03:00 - 安全检查 | ||||
|            | ||||
|           📊 监控命令: | ||||
|           • 查看状态: automation-status | ||||
|           • 查看定时任务: crontab -l | ||||
|           • 查看日志: tail -f /var/log/daily-health-check.log | ||||
|            | ||||
|           📁 日志位置: /var/log/ | ||||
|           🔄 日志轮转: 30天自动清理 | ||||
|            | ||||
|           💡 手动执行示例: | ||||
|           • ./scripts/ops-manager.sh toolkit all | ||||
|           • ./scripts/ops-manager.sh cleanup lxc | ||||
|           • ./scripts/ops-manager.sh health proxmox | ||||
|  | @ -1,175 +0,0 @@ | |||
| --- | ||||
| - name: Bootstrap Infrastructure | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   gather_facts: yes | ||||
|    | ||||
|   vars: | ||||
|     # 基础软件包 | ||||
|     base_packages: | ||||
|       - curl | ||||
|       - wget | ||||
|       - git | ||||
|       - vim | ||||
|       - htop | ||||
|       - tree | ||||
|       - unzip | ||||
|       - jq | ||||
|       - python3 | ||||
|       - python3-pip | ||||
|       - apt-transport-https | ||||
|       - ca-certificates | ||||
|       - gnupg | ||||
|       - lsb-release | ||||
|      | ||||
|     # Docker 配置 | ||||
|     docker_users: | ||||
|       - "{{ ansible_user }}" | ||||
|      | ||||
|     # 系统配置 | ||||
|     timezone: "Asia/Shanghai" | ||||
|      | ||||
|   tasks: | ||||
|     - name: Update package cache | ||||
|       apt: | ||||
|         update_cache: yes | ||||
|         cache_valid_time: 3600 | ||||
|       when: ansible_os_family == "Debian" | ||||
|        | ||||
|     - name: Install base packages | ||||
|       package: | ||||
|         name: "{{ base_packages }}" | ||||
|         state: present | ||||
|          | ||||
|     - name: Set timezone | ||||
|       timezone: | ||||
|         name: "{{ timezone }}" | ||||
|          | ||||
|     - name: Create system users | ||||
|       user: | ||||
|         name: "{{ ansible_user }}" | ||||
|         groups: sudo | ||||
|         shell: /bin/bash | ||||
|         create_home: yes | ||||
|       when: ansible_user != "root" | ||||
|        | ||||
|     - name: Configure SSH | ||||
|       lineinfile: | ||||
|         path: /etc/ssh/sshd_config | ||||
|         regexp: "{{ item.regexp }}" | ||||
|         line: "{{ item.line }}" | ||||
|         backup: yes | ||||
|       loop: | ||||
|         - { regexp: '^#?PermitRootLogin', line: 'PermitRootLogin no' } | ||||
|         - { regexp: '^#?PasswordAuthentication', line: 'PasswordAuthentication no' } | ||||
|         - { regexp: '^#?PubkeyAuthentication', line: 'PubkeyAuthentication yes' } | ||||
|       notify: restart ssh | ||||
|       when: ansible_user != "root" | ||||
|        | ||||
|     - name: Install Docker | ||||
|       block: | ||||
|         - name: Add Docker GPG key | ||||
|           apt_key: | ||||
|             url: https://download.docker.com/linux/ubuntu/gpg | ||||
|             state: present | ||||
|              | ||||
|         - name: Add Docker repository | ||||
|           apt_repository: | ||||
|             repo: "deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable" | ||||
|             state: present | ||||
|              | ||||
|         - name: Install Docker | ||||
|           package: | ||||
|             name: | ||||
|               - docker-ce | ||||
|               - docker-ce-cli | ||||
|               - containerd.io | ||||
|               - docker-compose-plugin | ||||
|             state: present | ||||
|              | ||||
|         - name: Add users to docker group | ||||
|           user: | ||||
|             name: "{{ item }}" | ||||
|             groups: docker | ||||
|             append: yes | ||||
|           loop: "{{ docker_users }}" | ||||
|            | ||||
|         - name: Start and enable Docker | ||||
|           systemd: | ||||
|             name: docker | ||||
|             state: started | ||||
|             enabled: yes | ||||
|              | ||||
|     - name: Install Docker Compose (standalone) | ||||
|       get_url: | ||||
|         url: "https://github.com/docker/compose/releases/latest/download/docker-compose-linux-x86_64" | ||||
|         dest: /usr/local/bin/docker-compose | ||||
|         mode: '0755' | ||||
|          | ||||
|     - name: Configure firewall | ||||
|       ufw: | ||||
|         rule: "{{ item.rule }}" | ||||
|         port: "{{ item.port }}" | ||||
|         proto: "{{ item.proto | default('tcp') }}" | ||||
|       loop: | ||||
|         - { rule: 'allow', port: '22' } | ||||
|         - { rule: 'allow', port: '80' } | ||||
|         - { rule: 'allow', port: '443' } | ||||
|       notify: enable ufw | ||||
|        | ||||
|     - name: Create application directories | ||||
|       file: | ||||
|         path: "{{ item }}" | ||||
|         state: directory | ||||
|         owner: "{{ ansible_user }}" | ||||
|         group: "{{ ansible_user }}" | ||||
|         mode: '0755' | ||||
|       loop: | ||||
|         - /opt/apps | ||||
|         - /opt/data | ||||
|         - /opt/logs | ||||
|         - /opt/backups | ||||
|         - /opt/scripts | ||||
|          | ||||
|     - name: Install monitoring tools | ||||
|       package: | ||||
|         name: | ||||
|           - htop | ||||
|           - iotop | ||||
|           - nethogs | ||||
|           - ncdu | ||||
|           - tmux | ||||
|         state: present | ||||
|          | ||||
|     - name: Configure system limits | ||||
|       pam_limits: | ||||
|         domain: '*' | ||||
|         limit_type: "{{ item.type }}" | ||||
|         limit_item: "{{ item.item }}" | ||||
|         value: "{{ item.value }}" | ||||
|       loop: | ||||
|         - { type: 'soft', item: 'nofile', value: '65536' } | ||||
|         - { type: 'hard', item: 'nofile', value: '65536' } | ||||
|         - { type: 'soft', item: 'nproc', value: '32768' } | ||||
|         - { type: 'hard', item: 'nproc', value: '32768' } | ||||
|          | ||||
|     - name: Configure sysctl | ||||
|       sysctl: | ||||
|         name: "{{ item.name }}" | ||||
|         value: "{{ item.value }}" | ||||
|         state: present | ||||
|         reload: yes | ||||
|       loop: | ||||
|         - { name: 'vm.max_map_count', value: '262144' } | ||||
|         - { name: 'fs.file-max', value: '2097152' } | ||||
|         - { name: 'net.core.somaxconn', value: '32768' } | ||||
|          | ||||
|   handlers: | ||||
|     - name: restart ssh | ||||
|       systemd: | ||||
|         name: ssh | ||||
|         state: restarted | ||||
|          | ||||
|     - name: enable ufw | ||||
|       ufw: | ||||
|         state: enabled | ||||
|  | @ -1,83 +0,0 @@ | |||
| --- | ||||
| - name: System Cleanup and Maintenance | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   gather_facts: yes | ||||
|    | ||||
|   tasks: | ||||
|     # 清理包缓存和孤立包 | ||||
|     - name: Clean package cache (Debian/Ubuntu) | ||||
|       apt: | ||||
|         autoclean: yes | ||||
|         autoremove: yes | ||||
|       when: ansible_os_family == "Debian" | ||||
|        | ||||
|     - name: Remove orphaned packages (Debian/Ubuntu) | ||||
|       shell: apt-get autoremove --purge -y | ||||
|       when: ansible_os_family == "Debian" | ||||
|        | ||||
|     # 清理日志文件 | ||||
|     - name: Clean old journal logs (keep 7 days) | ||||
|       shell: journalctl --vacuum-time=7d | ||||
|        | ||||
|     - name: Clean old log files | ||||
|       find: | ||||
|         paths: /var/log | ||||
|         patterns: "*.log.*,*.gz" | ||||
|         age: "7d" | ||||
|         recurse: yes | ||||
|       register: old_logs | ||||
|        | ||||
|     - name: Remove old log files | ||||
|       file: | ||||
|         path: "{{ item.path }}" | ||||
|         state: absent | ||||
|       loop: "{{ old_logs.files }}" | ||||
|       when: old_logs.files is defined | ||||
|        | ||||
|     # 清理临时文件 | ||||
|     - name: Clean /tmp directory (files older than 7 days) | ||||
|       find: | ||||
|         paths: /tmp | ||||
|         age: "7d" | ||||
|         recurse: yes | ||||
|       register: tmp_files | ||||
|        | ||||
|     - name: Remove old temp files | ||||
|       file: | ||||
|         path: "{{ item.path }}" | ||||
|         state: absent | ||||
|       loop: "{{ tmp_files.files }}" | ||||
|       when: tmp_files.files is defined | ||||
|        | ||||
|     # Docker 清理 (如果存在) | ||||
|     - name: Check if Docker is installed | ||||
|       command: which docker | ||||
|       register: docker_check | ||||
|       failed_when: false | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: Clean Docker system | ||||
|       shell: | | ||||
|         docker system prune -f | ||||
|         docker image prune -f | ||||
|         docker volume prune -f | ||||
|       when: docker_check.rc == 0 | ||||
|        | ||||
|     # 磁盘空间检查 | ||||
|     - name: Check disk usage | ||||
|       shell: df -h | ||||
|       register: disk_usage | ||||
|        | ||||
|     - name: Display disk usage | ||||
|       debug: | ||||
|         msg: "{{ disk_usage.stdout_lines }}" | ||||
|          | ||||
|     # 内存使用检查 | ||||
|     - name: Check memory usage | ||||
|       shell: free -h | ||||
|       register: memory_usage | ||||
|        | ||||
|     - name: Display memory usage | ||||
|       debug: | ||||
|         msg: "{{ memory_usage.stdout_lines }}" | ||||
|  | @ -1,43 +0,0 @@ | |||
| --- | ||||
| - name: System Update Playbook | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   gather_facts: yes | ||||
|    | ||||
|   tasks: | ||||
|     - name: Wait for automatic system updates to complete | ||||
|       shell: while fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do sleep 5; done | ||||
|       when: ansible_os_family == "Debian" | ||||
|        | ||||
|     - name: Update apt cache | ||||
|       apt: | ||||
|         update_cache: yes | ||||
|         cache_valid_time: 3600 | ||||
|       when: ansible_os_family == "Debian" | ||||
|       retries: 3 | ||||
|       delay: 10 | ||||
|        | ||||
|     - name: Upgrade all packages | ||||
|       apt: | ||||
|         upgrade: yes | ||||
|         autoremove: yes | ||||
|         autoclean: yes | ||||
|       when: ansible_os_family == "Debian" | ||||
|       register: upgrade_result | ||||
|       retries: 3 | ||||
|       delay: 10 | ||||
|        | ||||
|     - name: Display upgrade results | ||||
|       debug: | ||||
|         msg: "System upgrade completed. {{ upgrade_result.changed }} packages were updated." | ||||
|          | ||||
|     - name: Check if reboot is required | ||||
|       stat: | ||||
|         path: /var/run/reboot-required | ||||
|       register: reboot_required | ||||
|       when: ansible_os_family == "Debian" | ||||
|        | ||||
|     - name: Notify if reboot is required | ||||
|       debug: | ||||
|         msg: "System reboot is required to complete the update." | ||||
|       when: reboot_required.stat.exists is defined and reboot_required.stat.exists | ||||
|  | @ -0,0 +1,81 @@ | |||
---
# Remove shell alias definitions from root's and the system-wide shell startup
# files on hcp1 and hcp2, delete dedicated alias files, then check outbound
# connectivity.
#
# Notes on what this play intentionally does NOT do:
#   * It does not run `alias` / `unalias -a` / `history -c`: every Ansible task
#     runs in its own short-lived non-interactive shell, so those commands can
#     neither see nor change the aliases of any login session.
#   * It does not `pkill -f bash|zsh`: that pattern matches any process with
#     "bash"/"zsh" anywhere in its command line, including the task's own
#     `/bin/sh -c "... pkill -f bash ..."` wrapper and unrelated services.
#     Existing sessions pick up the cleaned files on their next login.
#   * It does not search /etc for files named "aliases": /etc/aliases is the
#     mail alias database, not a shell alias file.
- name: Clear all aliases on hcp1 and hcp2
  hosts: hcp1,hcp2
  become: yes

  vars:
    # Startup files that are scanned for lines beginning with "alias".
    alias_rc_files:
      - /root/.bashrc
      - /root/.profile
      - /root/.zshrc
      - /etc/bash.bashrc
      - /etc/profile

  tasks:
    # Read-only report of the alias lines that are about to be removed.
    - name: Check current aliases
      shell: >-
        grep -HnE '^alias[[:blank:]]' {{ alias_rc_files | join(' ') }}
        2>/dev/null || echo "No aliases found"
      register: current_aliases
      changed_when: false

    - name: Display current aliases
      debug:
        msg: "Current aliases: {{ current_aliases.stdout_lines }}"

    # lineinfile with state=absent removes every matching line and reports
    # "file not present" (no change) for missing files, so no ignore_errors is
    # needed and genuine failures (permissions, read-only FS) stay visible.
    - name: Clear alias definitions from shell startup files
      lineinfile:
        path: "{{ item }}"
        regexp: '^alias[ \t]'
        state: absent
        backup: yes
      loop: "{{ alias_rc_files }}"

    # hidden: yes is required for the ".aliases" pattern to match at all.
    - name: Find custom alias files
      find:
        paths: ["/root", "/home"]
        patterns: ["*.aliases", ".aliases", "aliases"]
        hidden: yes
        recurse: yes
      register: alias_files

    - name: Remove found alias files
      file:
        path: "{{ item.path }}"
        state: absent
      loop: "{{ alias_files.files }}"
      loop_control:
        label: "{{ item.path }}"

    # NOTE(review): truncating root's history destroys the audit trail and does
    # not affect alias definitions — confirm this step is still wanted.
    - name: Clear shell history to remove alias commands
      shell: |
        : > /root/.bash_history
        : > /root/.zsh_history

    - name: Test network connectivity after clearing aliases
      shell: ping -c 2 8.8.8.8 || echo "Ping failed"
      register: ping_test
      changed_when: false

    - name: Display ping test result
      debug:
        msg: "Ping test: {{ ping_test.stdout_lines }}"
|  | @ -0,0 +1,76 @@ | |||
---
# Remove proxy configuration from hcp1 and hcp2 (environment file, APT drop-ins
# and root's shell profiles), then verify that APT sees no proxy and that
# releases.hashicorp.com is reachable directly.
#
# There is no "unset *_proxy" task: each Ansible task runs in its own shell, so
# unsetting variables there cannot affect any other task or login session.
# Sessions that are already open keep their proxy variables until re-login.
- name: Clear proxy settings on hcp1 and hcp2
  hosts: hcp1,hcp2
  become: yes

  tasks:
    # Read-only probe; never reports "changed".
    - name: Check current proxy environment variables
      shell: env | grep -i proxy || echo "No proxy vars found"
      register: proxy_env_before
      changed_when: false

    - name: Display current proxy settings
      debug:
        msg: "Current proxy env: {{ proxy_env_before.stdout_lines }}"

    # One case-insensitive expression covers http/https/ftp/no in both the
    # lower-case and upper-case spellings.
    - name: Clear proxy from /etc/environment
      lineinfile:
        path: /etc/environment
        regexp: '(?i)^(https?|ftp|no)_proxy='
        state: absent

    - name: Clear proxy from /etc/apt/apt.conf.d/
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /etc/apt/apt.conf.d/95proxies
        - /etc/apt/apt.conf.d/proxy.conf
        - /etc/apt/apt.conf.d/00proxy

    # (?i) makes the match case-insensitive so HTTP_PROXY / HTTPS_PROXY lines
    # are removed as well. Missing files are reported as "file not present"
    # without failing, so real errors are no longer hidden by ignore_errors.
    - name: Clear proxy from user profiles
      lineinfile:
        path: "{{ item }}"
        regexp: '(?i).*proxy.*'
        state: absent
      loop:
        - /root/.bashrc
        - /root/.profile
        - /home/root/.bashrc
        - /home/root/.profile

    - name: Check APT proxy configuration
      shell: apt-config dump | grep -i proxy || echo "No APT proxy found"
      register: apt_proxy_check
      changed_when: false

    - name: Display APT proxy status
      debug:
        msg: "APT proxy config: {{ apt_proxy_check.stdout_lines }}"

    - name: Test direct connection to HashiCorp
      shell: curl -I --connect-timeout 10 https://releases.hashicorp.com/ || echo "Connection failed"
      register: connection_test
      changed_when: false

    - name: Display connection test result
      debug:
        msg: "Connection test: {{ connection_test.stdout_lines }}"
|  | @ -0,0 +1,57 @@ | |||
| --- | ||||
| # Switches Nomad nodes to the Podman task driver: stops Nomad, installs Podman and | ||||
| # enables its socket, rewrites /etc/nomad.d/nomad.hcl and starts Nomad again. | ||||
| # NOTE(review): Nomad is stopped on every targeted host in the same run; consider | ||||
| # `serial` if the cluster must stay available. | ||||
| - name: Configure Podman driver for all Nomad client nodes | ||||
|   hosts: nomad_clients,nomad_servers | ||||
|   become: yes | ||||
|    | ||||
|   tasks: | ||||
|     - name: Stop Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: stopped | ||||
|          | ||||
|     # Best effort: a failed install does not stop the play. | ||||
|     - name: Install Podman if not present | ||||
|       package: | ||||
|         name: podman | ||||
|         state: present | ||||
|       ignore_errors: yes | ||||
|          | ||||
|     - name: Enable Podman socket | ||||
|       systemd: | ||||
|         name: podman.socket | ||||
|         enabled: yes | ||||
|         state: started | ||||
|       ignore_errors: yes | ||||
|          | ||||
|     # backrefs: yes turns this into a pure "rename if present" step. Without it, | ||||
|     # lineinfile appends the bare line `plugin "podman" {` to the end of the file | ||||
|     # whenever no docker stanza exists, leaving an unbalanced brace in the HCL. | ||||
|     # NOTE(review): when a docker stanza IS renamed, the file ends up with two | ||||
|     # `plugin "podman"` stanzas (this one and the managed block below) - confirm | ||||
|     # that this is intended. | ||||
|     - name: Update Nomad configuration to use Podman | ||||
|       lineinfile: | ||||
|         path: /etc/nomad.d/nomad.hcl | ||||
|         regexp: '^plugin "docker"' | ||||
|         line: 'plugin "podman" {' | ||||
|         backrefs: yes | ||||
|         state: present | ||||
|          | ||||
|     # The managed block is appended at the end of the file. Inserting it after the | ||||
|     # line `client {` placed the plugin stanza inside the client stanza. | ||||
|     - name: Add Podman plugin configuration | ||||
|       blockinfile: | ||||
|         path: /etc/nomad.d/nomad.hcl | ||||
|         marker: "# {mark} PODMAN PLUGIN CONFIG" | ||||
|         block: | | ||||
|           plugin "podman" { | ||||
|             config { | ||||
|               socket_path = "unix:///run/podman/podman.sock" | ||||
|               volumes { | ||||
|                 enabled = true | ||||
|               } | ||||
|             } | ||||
|           } | ||||
|         insertafter: EOF | ||||
|          | ||||
|     - name: Start Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
|          | ||||
|     # Waits up to 30 s (after an initial 5 s delay) for the HTTP port to accept | ||||
|     # connections on localhost. | ||||
|     - name: Wait for Nomad to be ready | ||||
|       wait_for: | ||||
|         port: 4646 | ||||
|         host: localhost | ||||
|         delay: 5 | ||||
|         timeout: 30 | ||||
|  | @ -0,0 +1,217 @@ | |||
| --- | ||||
| - name: 配置 Nomad 集群使用 Tailscale 网络通讯 | ||||
|   hosts: nomad_cluster | ||||
|   become: yes | ||||
|   gather_facts: no | ||||
|   vars: | ||||
|     nomad_config_dir: "/etc/nomad.d" | ||||
|     nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl" | ||||
|      | ||||
|   tasks: | ||||
|     # Reads this node's Tailscale IPv4 address; later tasks use `.stdout` as the | ||||
|     # Nomad bind/advertise address. | ||||
|     # `tailscale ip -4` prints only the IPv4 address, so no `| head -1` is needed. | ||||
|     # With the pipe the task's rc was head's (always 0) and a tailscale failure | ||||
|     # went unnoticed; empty output is now treated as a failure as well. | ||||
|     - name: 获取当前节点的 Tailscale IP | ||||
|       shell: tailscale ip -4 | ||||
|       register: current_tailscale_ip | ||||
|       changed_when: false | ||||
|       failed_when: current_tailscale_ip.rc != 0 or (current_tailscale_ip.stdout | trim) == "" | ||||
| 
 | ||||
|     - name: 确保 Nomad 配置目录存在 | ||||
|       file: | ||||
|         path: "{{ nomad_config_dir }}" | ||||
|         state: directory | ||||
|         owner: root | ||||
|         group: root | ||||
|         mode: '0755' | ||||
| 
 | ||||
|     # Renders the complete server-side nomad.hcl, bound to this node's Tailscale IP. | ||||
|     # Only runs on hosts whose `nomad_role` is "server". | ||||
|     # - mode 0640: the file embeds the gossip encryption key, so it must not be | ||||
|     #   world-readable (the unit written by this play runs Nomad as root). | ||||
|     # - notify: a changed configuration restarts Nomad through the play's handler; | ||||
|     #   `state: started` alone does not reload an already running agent. | ||||
|     - name: 生成 Nomad 服务器配置(使用 Tailscale) | ||||
|       copy: | ||||
|         dest: "{{ nomad_config_file }}" | ||||
|         owner: root | ||||
|         group: root | ||||
|         mode: '0640' | ||||
|         content: | | ||||
|           datacenter = "{{ nomad_datacenter | default('dc1') }}" | ||||
|           data_dir = "/opt/nomad/data" | ||||
|           log_level = "INFO" | ||||
|            | ||||
|           bind_addr = "{{ current_tailscale_ip.stdout }}" | ||||
|            | ||||
|           addresses { | ||||
|             http = "0.0.0.0" | ||||
|             rpc  = "{{ current_tailscale_ip.stdout }}" | ||||
|             serf = "{{ current_tailscale_ip.stdout }}" | ||||
|           } | ||||
|            | ||||
|           ports { | ||||
|             http = 4646 | ||||
|             rpc  = 4647 | ||||
|             serf = 4648 | ||||
|           } | ||||
|            | ||||
|           server { | ||||
|             enabled = true | ||||
|             bootstrap_expect = {{ nomad_bootstrap_expect | default(4) }} | ||||
|              | ||||
|             retry_join = [ | ||||
|               "100.116.158.95",  # semaphore | ||||
|               "100.103.147.94", # ash2e | ||||
|               "100.81.26.3",    # ash1d | ||||
|               "100.90.159.68"   # ch2 | ||||
|             ] | ||||
|              | ||||
|             encrypt = "{{ nomad_encrypt_key }}" | ||||
|           } | ||||
|            | ||||
|           client { | ||||
|             enabled = false | ||||
|           } | ||||
|            | ||||
|           plugin "podman" { | ||||
|             config { | ||||
|               socket_path = "unix:///run/podman/podman.sock" | ||||
|               volumes { | ||||
|                 enabled = true | ||||
|               } | ||||
|             } | ||||
|           } | ||||
|            | ||||
|           consul { | ||||
|             address = "{{ current_tailscale_ip.stdout }}:8500" | ||||
|           } | ||||
|       when: nomad_role == "server" | ||||
|       notify: restart nomad | ||||
| 
 | ||||
|     # Renders the complete client-side nomad.hcl, bound to this node's Tailscale IP | ||||
|     # and pointing at the four server RPC endpoints. Only runs on hosts whose | ||||
|     # `nomad_role` is "client". | ||||
|     # notify: a changed configuration restarts Nomad through the play's handler; | ||||
|     # `state: started` alone does not reload an already running agent. | ||||
|     - name: 生成 Nomad 客户端配置(使用 Tailscale) | ||||
|       copy: | ||||
|         dest: "{{ nomad_config_file }}" | ||||
|         owner: root | ||||
|         group: root | ||||
|         mode: '0644' | ||||
|         content: | | ||||
|           datacenter = "{{ nomad_datacenter | default('dc1') }}" | ||||
|           data_dir = "/opt/nomad/data" | ||||
|           log_level = "INFO" | ||||
|            | ||||
|           bind_addr = "{{ current_tailscale_ip.stdout }}" | ||||
|            | ||||
|           addresses { | ||||
|             http = "0.0.0.0" | ||||
|             rpc  = "{{ current_tailscale_ip.stdout }}" | ||||
|             serf = "{{ current_tailscale_ip.stdout }}" | ||||
|           } | ||||
|            | ||||
|           ports { | ||||
|             http = 4646 | ||||
|             rpc  = 4647 | ||||
|             serf = 4648 | ||||
|           } | ||||
|            | ||||
|           server { | ||||
|             enabled = false | ||||
|           } | ||||
|            | ||||
|           client { | ||||
|             enabled = true | ||||
|              | ||||
|             servers = [ | ||||
|               "100.116.158.95:4647",  # semaphore | ||||
|               "100.103.147.94:4647", # ash2e | ||||
|               "100.81.26.3:4647",    # ash1d | ||||
|               "100.90.159.68:4647"   # ch2 | ||||
|             ] | ||||
|           } | ||||
|            | ||||
|           plugin "podman" { | ||||
|             config { | ||||
|               socket_path = "unix:///run/podman/podman.sock" | ||||
|               volumes { | ||||
|                 enabled = true | ||||
|               } | ||||
|             } | ||||
|           } | ||||
|            | ||||
|           consul { | ||||
|             address = "{{ current_tailscale_ip.stdout }}:8500" | ||||
|           } | ||||
|       when: nomad_role == "client" | ||||
|       notify: restart nomad | ||||
| 
 | ||||
|     # Locates the nomad executable: first via PATH, then by searching /usr. | ||||
|     # The fallback is limited to regular, executable files; a bare `-name nomad` | ||||
|     # could also return a directory called "nomad", which would then be written | ||||
|     # into the unit's ExecStart. The task fails when nothing is found. | ||||
|     - name: 检查 Nomad 二进制文件位置 | ||||
|       shell: command -v nomad || find /usr -type f -name nomad -perm -u=x 2>/dev/null | head -1 | ||||
|       register: nomad_binary_path | ||||
|       changed_when: false | ||||
|       failed_when: nomad_binary_path.stdout == "" | ||||
| 
 | ||||
|     # Writes /etc/systemd/system/nomad.service. ExecStart uses the binary path found | ||||
|     # by the preceding lookup task and runs the agent as root with the single | ||||
|     # config file /etc/nomad.d/nomad.hcl. A change to the unit file triggers the | ||||
|     # "restart nomad" handler. | ||||
|     # NOTE(review): the config path is hard-coded here rather than taken from | ||||
|     # `nomad_config_file`; the two match only while that variable keeps its default. | ||||
|     - name: 创建/更新 Nomad systemd 服务文件 | ||||
|       copy: | ||||
|         dest: "/etc/systemd/system/nomad.service" | ||||
|         owner: root | ||||
|         group: root | ||||
|         mode: '0644' | ||||
|         content: | | ||||
|           [Unit] | ||||
|           Description=Nomad | ||||
|           Documentation=https://www.nomadproject.io/ | ||||
|           Requires=network-online.target | ||||
|           After=network-online.target | ||||
|            | ||||
|           [Service] | ||||
|           Type=notify | ||||
|           User=root | ||||
|           Group=root | ||||
|           ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl | ||||
|           ExecReload=/bin/kill -HUP $MAINPID | ||||
|           KillMode=process | ||||
|           Restart=on-failure | ||||
|           LimitNOFILE=65536 | ||||
|            | ||||
|           [Install] | ||||
|           WantedBy=multi-user.target | ||||
|       notify: restart nomad | ||||
| 
 | ||||
|     - name: 确保 Nomad 数据目录存在 | ||||
|       file: | ||||
|         path: "/opt/nomad/data" | ||||
|         state: directory | ||||
|         owner: root | ||||
|         group: root | ||||
|         mode: '0755' | ||||
| 
 | ||||
|     - name: 重新加载 systemd daemon | ||||
|       systemd: | ||||
|         daemon_reload: yes | ||||
| 
 | ||||
|     - name: 启用并启动 Nomad 服务 | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         enabled: yes | ||||
|         state: started | ||||
| 
 | ||||
|     - name: 等待 Nomad 服务启动 | ||||
|       wait_for: | ||||
|         port: 4646 | ||||
|         host: "{{ current_tailscale_ip.stdout }}" | ||||
|         delay: 5 | ||||
|         timeout: 30 | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: 检查 Nomad 服务状态 | ||||
|       shell: systemctl status nomad --no-pager -l | ||||
|       register: nomad_status | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: 显示配置结果 | ||||
|       debug: | ||||
|         msg: | | ||||
|           ✅ 节点 {{ inventory_hostname }} 配置完成 | ||||
|           🌐 Tailscale IP: {{ current_tailscale_ip.stdout }} | ||||
|           🎯 角色: {{ nomad_role }} | ||||
|           🔧 Nomad 二进制: {{ nomad_binary_path.stdout }} | ||||
|           📊 服务状态: {{ 'active' if nomad_status.rc == 0 else 'failed' }} | ||||
|           {% if nomad_status.rc != 0 %} | ||||
|           ❌ 错误信息: | ||||
|           {{ nomad_status.stdout }} | ||||
|           {{ nomad_status.stderr }} | ||||
|           {% endif %} | ||||
| 
 | ||||
|   handlers: | ||||
|     - name: restart nomad | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: restarted | ||||
|         daemon_reload: yes | ||||
|  | @ -0,0 +1,60 @@ | |||
| --- | ||||
| - name: Debug Nomad Podman Driver Issues | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   vars: | ||||
|     nomad_user: nomad | ||||
| 
 | ||||
|   tasks: | ||||
|     - name: Check Nomad configuration | ||||
|       shell: cat /etc/nomad.d/nomad.hcl | ||||
|       register: nomad_config | ||||
| 
 | ||||
|     - name: Display Nomad configuration | ||||
|       debug: | ||||
|         var: nomad_config.stdout_lines | ||||
| 
 | ||||
|     # Lists the Nomad plugin directory. In this diagnostic play a missing directory | ||||
|     # is a finding, not a reason to abort, so the task never fails (same approach | ||||
|     # as the other probes in this play). | ||||
|     - name: Check plugin directory contents | ||||
|       shell: ls -la /opt/nomad/data/plugins/ | ||||
|       register: plugin_dir | ||||
|       changed_when: false | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display plugin directory | ||||
|       debug: | ||||
|         var: plugin_dir.stdout_lines | ||||
| 
 | ||||
|     - name: Check Nomad logs for plugin loading | ||||
|       shell: journalctl -u nomad -n 50 --no-pager | grep -E "(plugin|driver|podman)" | ||||
|       register: nomad_logs | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display relevant Nomad logs | ||||
|       debug: | ||||
|         var: nomad_logs.stdout_lines | ||||
| 
 | ||||
|     - name: Check if plugin is executable | ||||
|       stat: | ||||
|         path: /opt/nomad/data/plugins/nomad-driver-podman | ||||
|       register: plugin_stat | ||||
| 
 | ||||
|     - name: Display plugin file info | ||||
|       debug: | ||||
|         var: plugin_stat | ||||
| 
 | ||||
|     - name: Test plugin directly | ||||
|       shell: /opt/nomad/data/plugins/nomad-driver-podman --version | ||||
|       register: plugin_version | ||||
|       failed_when: false | ||||
|       become_user: "{{ nomad_user }}" | ||||
| 
 | ||||
|     - name: Display plugin version | ||||
|       debug: | ||||
|         msg: "Plugin version test: {{ 'SUCCESS' if plugin_version.rc == 0 else 'FAILED' }} - {{ plugin_version.stdout if plugin_version.rc == 0 else plugin_version.stderr }}" | ||||
| 
 | ||||
|     # Queries the Podman REST API over the unix socket as the nomad user. | ||||
|     # The command is no longer piped into `head`: with the pipe the registered rc | ||||
|     # was head's (always 0), so the following task printed SUCCESS even when curl | ||||
|     # could not reach the socket. `-f` makes HTTP error responses count as failures, | ||||
|     # `-s` silences the progress meter. | ||||
|     # NOTE(review): the socket path hard-codes UID 1001 - confirm it matches the | ||||
|     # nomad user's UID (or the system socket) on every host. | ||||
|     - name: Check Podman socket accessibility | ||||
|       shell: sudo -u {{ nomad_user }} curl -sf --unix-socket /run/user/1001/podman/podman.sock http://localhost/v1.0.0/libpod/info 2>/dev/null | ||||
|       register: podman_socket_test | ||||
|       changed_when: false | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display Podman socket test | ||||
|       debug: | ||||
|         msg: "Podman socket test: {{ 'SUCCESS' if podman_socket_test.rc == 0 else 'FAILED' }}" | ||||
|  | @ -0,0 +1,168 @@ | |||
| --- | ||||
| - name: 磁盘空间分析 - 使用 ncdu 工具 | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   vars: | ||||
|     ncdu_scan_paths: | ||||
|       - "/" | ||||
|       - "/var" | ||||
|       - "/opt" | ||||
|       - "/home" | ||||
|     output_dir: "/tmp/disk-analysis" | ||||
|      | ||||
|   tasks: | ||||
|     - name: 安装 ncdu 工具 | ||||
|       package: | ||||
|         name: ncdu | ||||
|         state: present | ||||
|       register: ncdu_install | ||||
|        | ||||
|     - name: 创建输出目录 | ||||
|       file: | ||||
|         path: "{{ output_dir }}" | ||||
|         state: directory | ||||
|         mode: '0755' | ||||
|          | ||||
|     - name: 检查磁盘空间使用情况 | ||||
|       shell: df -h | ||||
|       register: disk_usage | ||||
|        | ||||
|     - name: 显示当前磁盘使用情况 | ||||
|       debug: | ||||
|         msg: | | ||||
|           === {{ inventory_hostname }} 磁盘使用情况 === | ||||
|           {{ disk_usage.stdout }} | ||||
|            | ||||
|     - name: 使用 ncdu 扫描根目录并生成报告 | ||||
|       shell: | | ||||
|         ncdu -x -o {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json / | ||||
|       async: 300 | ||||
|       poll: 0 | ||||
|       register: ncdu_root_scan | ||||
|        | ||||
|     # Starts a background ncdu export of /var (up to 180 s, not polled here). | ||||
|     # Runs only when /var is one of the mount points reported in ansible_mounts. | ||||
|     - name: 使用 ncdu 扫描 /var 目录 | ||||
|       shell: "ncdu -x -o {{ output_dir }}/ncdu-var-{{ inventory_hostname }}.json /var" | ||||
|       register: ncdu_var_scan | ||||
|       async: 180 | ||||
|       poll: 0 | ||||
|       when: "'/var' in (ansible_mounts | map(attribute='mount') | list)" | ||||
|        | ||||
|     # Starts a background ncdu export of /opt (up to 120 s, not polled here). | ||||
|     # Runs only when /opt is one of the mount points reported in ansible_mounts. | ||||
|     - name: 使用 ncdu 扫描 /opt 目录 | ||||
|       shell: "ncdu -x -o {{ output_dir }}/ncdu-opt-{{ inventory_hostname }}.json /opt" | ||||
|       register: ncdu_opt_scan | ||||
|       async: 120 | ||||
|       poll: 0 | ||||
|       when: "'/opt' in (ansible_mounts | map(attribute='mount') | list)" | ||||
|        | ||||
|     - name: 等待根目录扫描完成 | ||||
|       async_status: | ||||
|         jid: "{{ ncdu_root_scan.ansible_job_id }}" | ||||
|       register: ncdu_root_result | ||||
|       until: ncdu_root_result.finished | ||||
|       retries: 60 | ||||
|       delay: 5 | ||||
|        | ||||
|     - name: 等待 /var 目录扫描完成 | ||||
|       async_status: | ||||
|         jid: "{{ ncdu_var_scan.ansible_job_id }}" | ||||
|       register: ncdu_var_result | ||||
|       until: ncdu_var_result.finished | ||||
|       retries: 36 | ||||
|       delay: 5 | ||||
|       when: ncdu_var_scan is defined and ncdu_var_scan.ansible_job_id is defined | ||||
|        | ||||
|     - name: 等待 /opt 目录扫描完成 | ||||
|       async_status: | ||||
|         jid: "{{ ncdu_opt_scan.ansible_job_id }}" | ||||
|       register: ncdu_opt_result | ||||
|       until: ncdu_opt_result.finished | ||||
|       retries: 24 | ||||
|       delay: 5 | ||||
|       when: ncdu_opt_scan is defined and ncdu_opt_scan.ansible_job_id is defined | ||||
|        | ||||
|     # Builds the plain-text disk report for this host. All sections are produced | ||||
|     # inside one command group whose output is redirected to the report file once. | ||||
|     - name: 生成磁盘使用分析报告 | ||||
|       shell: | | ||||
|         report="{{ output_dir }}/disk-report-{{ inventory_hostname }}.txt" | ||||
|         { | ||||
|           echo "=== {{ inventory_hostname }} 磁盘分析报告 ===" | ||||
|           echo "生成时间: $(date)" | ||||
|           echo "" | ||||
|           echo "=== 磁盘使用情况 ===" | ||||
|           df -h | ||||
|           echo "" | ||||
|           echo "=== 最大的目录 (前10个) ===" | ||||
|           du -h --max-depth=2 / 2>/dev/null | sort -hr | head -10 | ||||
|           echo "" | ||||
|           echo "=== /var 目录最大文件 ===" | ||||
|           find /var -type f -size +100M -exec ls -lh {} \; 2>/dev/null | head -10 | ||||
|           echo "" | ||||
|           echo "=== /tmp 目录使用情况 ===" | ||||
|           du -sh /tmp/* 2>/dev/null | sort -hr | head -5 | ||||
|           echo "" | ||||
|           echo "=== 日志文件大小 ===" | ||||
|           find /var/log -name "*.log" -type f -size +50M -exec ls -lh {} \; 2>/dev/null | ||||
|         } > "$report" | ||||
|          | ||||
|     - name: 显示分析报告 | ||||
|       shell: cat {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt | ||||
|       register: disk_report | ||||
|        | ||||
|     - name: 输出磁盘分析结果 | ||||
|       debug: | ||||
|         msg: "{{ disk_report.stdout }}" | ||||
|          | ||||
|     # Prints every df line whose Use% is above 80. | ||||
|     # `$5+0` converts "85%" to the number 85. The earlier gsub()-then-compare left | ||||
|     # $5 as a string, so awk compared text: "9" > "80" was true and "100" > "80" | ||||
|     # was false. -P keeps each filesystem on a single line so $5 is always Use%. | ||||
|     - name: 检查是否有磁盘使用率超过 80% | ||||
|       shell: df -hP | awk 'NR>1 && $5+0 > 80' | ||||
|       register: high_usage_disks | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: 警告高磁盘使用率 | ||||
|       debug: | ||||
|         msg: | | ||||
|           ⚠️  警告: {{ inventory_hostname }} 发现高磁盘使用率! | ||||
|           {{ high_usage_disks.stdout }} | ||||
|       when: high_usage_disks.stdout != "" | ||||
|        | ||||
|     # Writes a per-host file with clean-up hints: large logs, large temp files, | ||||
|     # package-cache sizes and Podman image/container counts. | ||||
|     # The Podman counts use `--noheading` instead of a Go template: `{{ "{{" }}.Repository{{ "}}" }}` | ||||
|     # inside an Ansible task is parsed by Jinja2 and aborts the task with a template | ||||
|     # error, and the `table` format also counted the header row. | ||||
|     - name: 创建清理建议 | ||||
|       shell: | | ||||
|         out="{{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt" | ||||
|         { | ||||
|           echo "=== {{ inventory_hostname }} 清理建议 ===" | ||||
|           echo "" | ||||
|           echo "1. 检查日志文件:" | ||||
|           find /var/log -name "*.log" -type f -size +100M -exec echo "   大日志文件: {}" \; 2>/dev/null | ||||
|           echo "" | ||||
|           echo "2. 检查临时文件:" | ||||
|           find /tmp -type f -size +50M -exec echo "   大临时文件: {}" \; 2>/dev/null | ||||
|           echo "" | ||||
|           echo "3. 检查包缓存:" | ||||
|           if [ -d /var/cache/apt ]; then | ||||
|             echo "   APT 缓存大小: $(du -sh /var/cache/apt 2>/dev/null | cut -f1)" | ||||
|           fi | ||||
|           if [ -d /var/cache/yum ]; then | ||||
|             echo "   YUM 缓存大小: $(du -sh /var/cache/yum 2>/dev/null | cut -f1)" | ||||
|           fi | ||||
|           echo "" | ||||
|           echo "4. 检查容器相关:" | ||||
|           if command -v podman >/dev/null 2>&1; then | ||||
|             echo "   Podman 镜像: $(podman images --noheading 2>/dev/null | wc -l) 个" | ||||
|             echo "   Podman 容器: $(podman ps -a --noheading 2>/dev/null | wc -l) 个" | ||||
|           fi | ||||
|         } > "$out" | ||||
|         fi | ||||
|          | ||||
|     - name: 显示清理建议 | ||||
|       shell: cat {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt | ||||
|       register: cleanup_suggestions | ||||
|        | ||||
|     - name: 输出清理建议 | ||||
|       debug: | ||||
|         msg: "{{ cleanup_suggestions.stdout }}" | ||||
|          | ||||
|     - name: 保存 ncdu 文件位置信息 | ||||
|       debug: | ||||
|         msg: | | ||||
|           📁 ncdu 扫描文件已保存到: | ||||
|           - 根目录: {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json | ||||
|           - /var 目录: {{ output_dir }}/ncdu-var-{{ inventory_hostname }}.json (如果存在) | ||||
|           - /opt 目录: {{ output_dir }}/ncdu-opt-{{ inventory_hostname }}.json (如果存在) | ||||
|            | ||||
|           💡 使用方法: | ||||
|           ncdu -f {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json | ||||
|            | ||||
|           📊 完整报告: {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt | ||||
|           🧹 清理建议: {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt | ||||
|  | @ -0,0 +1,96 @@ | |||
| --- | ||||
| - name: 磁盘清理工具 | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   vars: | ||||
|     cleanup_logs: true | ||||
|     cleanup_cache: true | ||||
|     cleanup_temp: true | ||||
|     cleanup_containers: false  # 谨慎操作 | ||||
|      | ||||
|   tasks: | ||||
|     - name: 检查磁盘使用情况 (清理前) | ||||
|       shell: df -h | ||||
|       register: disk_before | ||||
|        | ||||
|     - name: 显示清理前磁盘使用情况 | ||||
|       debug: | ||||
|         msg: | | ||||
|           === {{ inventory_hostname }} 清理前磁盘使用情况 === | ||||
|           {{ disk_before.stdout }} | ||||
|            | ||||
|     - name: 清理系统日志 (保留最近7天) | ||||
|       shell: | | ||||
|         journalctl --vacuum-time=7d | ||||
|         find /var/log -name "*.log" -type f -mtime +7 -exec truncate -s 0 {} \; | ||||
|         find /var/log -name "*.log.*" -type f -mtime +7 -delete | ||||
|       when: cleanup_logs | bool | ||||
|       register: log_cleanup | ||||
|        | ||||
|     - name: 清理包管理器缓存 | ||||
|       block: | ||||
|         - name: 清理 APT 缓存 (Debian/Ubuntu) | ||||
|           shell: | | ||||
|             apt-get clean | ||||
|             apt-get autoclean | ||||
|             apt-get autoremove -y | ||||
|           when: ansible_os_family == "Debian" | ||||
|            | ||||
|         - name: 清理 YUM/DNF 缓存 (RedHat/CentOS) | ||||
|           shell: | | ||||
|             if command -v dnf >/dev/null 2>&1; then | ||||
|               dnf clean all | ||||
|             elif command -v yum >/dev/null 2>&1; then | ||||
|               yum clean all | ||||
|             fi | ||||
|           when: ansible_os_family == "RedHat" | ||||
|       when: cleanup_cache | bool | ||||
|        | ||||
|     # Removes temp files that have not been accessed for more than 7 days. | ||||
|     # Hidden top-level entries in /tmp follow the same 7-day rule and the `*-unix` | ||||
|     # socket directories (.X11-unix, .ICE-unix, ...) are kept. The former | ||||
|     # `rm -rf /tmp/.*` expanded to `/tmp/.` and `/tmp/..` and deleted every hidden | ||||
|     # entry, including sockets of running sessions, regardless of age. | ||||
|     - name: 清理临时文件 | ||||
|       shell: | | ||||
|         find /tmp -type f -atime +7 -delete 2>/dev/null || true | ||||
|         find /var/tmp -type f -atime +7 -delete 2>/dev/null || true | ||||
|         find /tmp -mindepth 1 -maxdepth 1 -name '.*' ! -name '.*-unix' -atime +7 -exec rm -rf -- {} + 2>/dev/null || true | ||||
|       when: cleanup_temp | bool | ||||
|        | ||||
|     # Runs only when `cleanup_containers` is true. | ||||
|     # WARNING: the first step stops ALL containers, so the container prune that | ||||
|     # follows removes every container on the host (they are all stopped by then), | ||||
|     # not just ones that were already unused. Each step ignores its own errors. | ||||
|     - name: 清理 Podman 资源 (谨慎操作) | ||||
|       block: | ||||
|         - name: 停止所有容器 | ||||
|           shell: podman stop --all | ||||
|           ignore_errors: yes | ||||
|            | ||||
|         - name: 删除未使用的容器 | ||||
|           shell: podman container prune -f | ||||
|           ignore_errors: yes | ||||
|            | ||||
|         - name: 删除未使用的镜像 | ||||
|           shell: podman image prune -f | ||||
|           ignore_errors: yes | ||||
|            | ||||
|         - name: 删除未使用的卷 | ||||
|           shell: podman volume prune -f | ||||
|           ignore_errors: yes | ||||
|       when: cleanup_containers | bool | ||||
|        | ||||
|     # Deletes `core.*` files under /var/crash and, anywhere on the system, regular | ||||
|     # files named exactly "core" that are larger than 10 MiB. Errors are discarded. | ||||
|     # NOTE(review): the second find walks every mounted filesystem and matches on | ||||
|     # name and size only, so a large non-dump file called "core" would be removed | ||||
|     # too - confirm this is acceptable. | ||||
|     - name: 清理核心转储文件 | ||||
|       shell: | | ||||
|         find /var/crash -name "core.*" -type f -delete 2>/dev/null || true | ||||
|         find / -name "core" -type f -size +10M -delete 2>/dev/null || true | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: 检查磁盘使用情况 (清理后) | ||||
|       shell: df -h | ||||
|       register: disk_after | ||||
|        | ||||
|     - name: 显示清理结果 | ||||
|       debug: | ||||
|         msg: | | ||||
|           === {{ inventory_hostname }} 清理完成 === | ||||
|            | ||||
|           清理前: | ||||
|           {{ disk_before.stdout }} | ||||
|            | ||||
|           清理后: | ||||
|           {{ disk_after.stdout }} | ||||
|            | ||||
|           🧹 清理操作完成! | ||||
|  | @ -0,0 +1,105 @@ | |||
| --- | ||||
| - name: Final Podman Permission Fix for Nomad | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   tasks: | ||||
|     - name: Stop Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: stopped | ||||
| 
 | ||||
|     - name: Install podman for nomad user (system-wide) | ||||
|       package: | ||||
|         name: podman | ||||
|         state: present | ||||
| 
 | ||||
|     - name: Enable podman socket for nomad user | ||||
|       systemd: | ||||
|         name: podman.socket | ||||
|         enabled: yes | ||||
|         state: started | ||||
|         scope: system | ||||
|         daemon_reload: yes | ||||
| 
 | ||||
|     - name: Create nomad user podman configuration directory | ||||
|       file: | ||||
|         path: /home/nomad/.config/containers | ||||
|         state: directory | ||||
|         owner: nomad | ||||
|         group: nomad | ||||
|         mode: '0755' | ||||
|         recurse: yes | ||||
| 
 | ||||
|     - name: Configure podman for nomad user to use system socket | ||||
|       copy: | ||||
|         content: | | ||||
|           [containers] | ||||
|            | ||||
|           [engine] | ||||
|           remote = true | ||||
|            | ||||
|           [service_destinations] | ||||
|           [service_destinations.system] | ||||
|           uri = "unix:///run/podman/podman.sock" | ||||
|         dest: /home/nomad/.config/containers/containers.conf | ||||
|         owner: nomad | ||||
|         group: nomad | ||||
|         mode: '0644' | ||||
| 
 | ||||
|     - name: Update Nomad configuration to use system podman socket | ||||
|       replace: | ||||
|         path: /etc/nomad.d/nomad.hcl | ||||
|         regexp: 'socket_path = "unix:///run/user/1001/podman/podman.sock"' | ||||
|         replace: 'socket_path = "unix:///run/podman/podman.sock"' | ||||
| 
 | ||||
|     # The group must exist before the user can be added to it, so it is created | ||||
|     # first; in the opposite order the user task fails on hosts without a | ||||
|     # `podman` group. | ||||
|     - name: Create podman group if it doesn't exist | ||||
|       group: | ||||
|         name: podman | ||||
|         state: present | ||||
| 
 | ||||
|     # append: yes keeps the user's existing supplementary groups. | ||||
|     - name: Add nomad user to necessary groups | ||||
|       user: | ||||
|         name: nomad | ||||
|         groups: | ||||
|           - podman | ||||
|         append: yes | ||||
| 
 | ||||
|     # Ensures /run/podman exists with mode 0755 and group `podman`. | ||||
|     # NOTE(review): this changes the directory only, not podman.sock itself, and | ||||
|     # /run is presumably a tmpfs that is recreated at boot - verify that the nomad | ||||
|     # user can actually open the socket after a reboot. | ||||
|     - name: Set proper permissions on system podman socket directory | ||||
|       file: | ||||
|         path: /run/podman | ||||
|         state: directory | ||||
|         mode: '0755' | ||||
|         group: podman | ||||
| 
 | ||||
|     - name: Start Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
|         enabled: yes | ||||
| 
 | ||||
|     - name: Wait for Nomad to be ready | ||||
|       wait_for: | ||||
|         port: 4646 | ||||
|         timeout: 60 | ||||
| 
 | ||||
|     - name: Wait for plugins to load | ||||
|       pause: | ||||
|         seconds: 20 | ||||
| 
 | ||||
|     - name: Final verification - Check driver status | ||||
|       shell: sudo -u nomad /usr/local/bin/nomad node status -self | grep -A 10 "Driver Status" | ||||
|       register: final_driver_status | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display final driver status | ||||
|       debug: | ||||
|         var: final_driver_status.stdout_lines | ||||
| 
 | ||||
|     - name: Test podman access for nomad user | ||||
|       shell: sudo -u nomad podman version | ||||
|       register: podman_test | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display podman test result | ||||
|       debug: | ||||
|         var: podman_test.stdout_lines | ||||
|  | @ -0,0 +1,83 @@ | |||
| --- | ||||
| - name: Fix HCP1 and HCP2 Podman Configuration | ||||
|   hosts: hcp1,hcp2 | ||||
|   become: yes | ||||
|   tasks: | ||||
|     - name: Stop Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: stopped | ||||
| 
 | ||||
|     - name: Ensure nomad user exists | ||||
|       user: | ||||
|         name: nomad | ||||
|         system: yes | ||||
|         shell: /bin/false | ||||
|         home: /home/nomad | ||||
|         create_home: yes | ||||
| 
 | ||||
|     - name: Ensure Podman socket is running | ||||
|       systemd: | ||||
|         name: podman.socket | ||||
|         state: started | ||||
|         enabled: yes | ||||
| 
 | ||||
|     # Grants the nomad group access to the root Podman API socket. The socket is no | ||||
|     # longer made world-writable: with mode 0666 any local user could start | ||||
|     # containers as root through it. | ||||
|     # NOTE(review): the socket is presumably recreated by podman.socket on restart; | ||||
|     # a systemd drop-in (SocketGroup=/SocketMode=) would make this permanent. | ||||
|     - name: Set proper permissions on Podman socket | ||||
|       file: | ||||
|         path: /run/podman/podman.sock | ||||
|         owner: root | ||||
|         group: nomad | ||||
|         mode: '0660' | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Create nomad data directory | ||||
|       file: | ||||
|         path: /opt/nomad/data | ||||
|         state: directory | ||||
|         owner: nomad | ||||
|         group: nomad | ||||
|         mode: '0755' | ||||
| 
 | ||||
|     - name: Create nomad log directory | ||||
|       file: | ||||
|         path: /var/log/nomad | ||||
|         state: directory | ||||
|         owner: nomad | ||||
|         group: nomad | ||||
|         mode: '0755' | ||||
| 
 | ||||
|     - name: Test Podman access for nomad user | ||||
|       shell: sudo -u nomad podman version | ||||
|       register: podman_test | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display Podman test result | ||||
|       debug: | ||||
|         var: podman_test.stdout_lines | ||||
| 
 | ||||
|     - name: Validate Nomad configuration | ||||
|       shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl | ||||
|       register: config_validation | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display configuration validation | ||||
|       debug: | ||||
|         var: config_validation | ||||
| 
 | ||||
|     - name: Start Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
|         enabled: yes | ||||
| 
 | ||||
|     - name: Wait for Nomad to be ready | ||||
|       wait_for: | ||||
|         port: 4646 | ||||
|         timeout: 60 | ||||
| 
 | ||||
|     - name: Check Nomad node status | ||||
|       shell: /usr/local/bin/nomad node status -self | ||||
|       register: node_status | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display node status | ||||
|       debug: | ||||
|         var: node_status.stdout_lines | ||||
|  | @ -0,0 +1,56 @@ | |||
| --- | ||||
| - name: Fix dpkg and initramfs issues on hcs | ||||
|   hosts: hcs | ||||
|   become: yes | ||||
|   tasks: | ||||
|     - name: Check current dpkg status | ||||
|       shell: dpkg --audit | ||||
|       register: dpkg_status | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Display dpkg status | ||||
|       debug: | ||||
|         var: dpkg_status.stdout_lines | ||||
| 
 | ||||
|     - name: Fix broken btrfs hook | ||||
|       # Parks the btrfs initramfs hook, retries `dpkg --configure -a`, and exits | ||||
|       # with dpkg's own status so the registered fix_result.rc reflects whether | ||||
|       # the repair actually worked (the reinstall fallback is keyed on that rc). | ||||
|       shell: | | ||||
|         # Remove problematic btrfs hook temporarily | ||||
|         mv /usr/share/initramfs-tools/hooks/btrfs /usr/share/initramfs-tools/hooks/btrfs.bak || true | ||||
|          | ||||
|         # Try to reconfigure the failed package and remember its exit status | ||||
|         dpkg --configure -a | ||||
|         dpkg_rc=$? | ||||
|          | ||||
|         # If that works, restore the hook; on failure it stays parked as .bak | ||||
|         if [ "$dpkg_rc" -eq 0 ]; then | ||||
|           mv /usr/share/initramfs-tools/hooks/btrfs.bak /usr/share/initramfs-tools/hooks/btrfs || true | ||||
|         fi | ||||
|          | ||||
|         # An `if` whose condition is false exits 0, which would hide a dpkg | ||||
|         # failure from the caller, so return dpkg's status explicitly. | ||||
|         exit "$dpkg_rc" | ||||
|       register: fix_result | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Display fix result | ||||
|       debug: | ||||
|         var: fix_result | ||||
| 
 | ||||
|     - name: Alternative fix - reinstall initramfs-tools | ||||
|       apt: | ||||
|         name: initramfs-tools | ||||
|         state: latest | ||||
|         force: yes | ||||
|       when: fix_result.rc != 0 | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Clean up and update | ||||
|       shell: | | ||||
|         apt autoremove -y | ||||
|         apt update | ||||
|         apt upgrade -y | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Check final dpkg status | ||||
|       shell: dpkg --audit | ||||
|       register: final_status | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Display final status | ||||
|       debug: | ||||
|         var: final_status.stdout_lines | ||||
|  | @ -0,0 +1,99 @@ | |||
| --- | ||||
| - name: Update Nomad configuration for Podman and fix issues | ||||
|   hosts: localhost | ||||
|   become: yes | ||||
|   connection: local | ||||
|    | ||||
|   tasks: | ||||
|     - name: Stop Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: stopped | ||||
|          | ||||
|     - name: Update Nomad configuration to use Podman and disable Consul | ||||
|       # Writes the complete agent configuration (server + client on one node), | ||||
|       # keeping a timestamped backup of the previous file. | ||||
|       # NOTE(review): the gossip key below lives in version control. It can now be | ||||
|       # overridden with `nomad_gossip_key` (e.g. from Ansible Vault); the default | ||||
|       # only preserves current behaviour and should be rotated and removed. | ||||
|       copy: | ||||
|         content: | | ||||
|           datacenter = "dc1" | ||||
|           region     = "global" | ||||
|           data_dir   = "/opt/nomad/data" | ||||
| 
 | ||||
|           bind_addr = "100.116.158.95" | ||||
| 
 | ||||
|           server { | ||||
|             enabled          = true | ||||
|             bootstrap_expect = 1 | ||||
|             encrypt          = "{{ nomad_gossip_key | default('NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=') }}" | ||||
|           } | ||||
| 
 | ||||
|           client { | ||||
|             enabled = true | ||||
|           } | ||||
| 
 | ||||
|           ui { | ||||
|             enabled = true | ||||
|           } | ||||
| 
 | ||||
|           addresses { | ||||
|             http = "0.0.0.0" | ||||
|             rpc  = "100.116.158.95" | ||||
|             serf = "100.116.158.95" | ||||
|           } | ||||
| 
 | ||||
|           ports { | ||||
|             http = 4646 | ||||
|             rpc  = 4647 | ||||
|             serf = 4648 | ||||
|           } | ||||
| 
 | ||||
|           plugin "podman" { | ||||
|             config { | ||||
|               socket_path = "unix:///run/podman/podman.sock" | ||||
|               volumes { | ||||
|                 enabled = true | ||||
|               } | ||||
|             } | ||||
|           } | ||||
| 
 | ||||
|           # Disable Consul integration for now | ||||
|           consul { | ||||
|             address = "" | ||||
|           } | ||||
| 
 | ||||
|           log_level = "INFO" | ||||
|           log_file  = "/var/log/nomad/nomad.log" | ||||
|         dest: /etc/nomad.d/nomad.hcl | ||||
|         owner: nomad | ||||
|         group: nomad | ||||
|         # 0640: the file carries the gossip key, so keep it unreadable to others | ||||
|         mode: '0640' | ||||
|         backup: yes | ||||
|          | ||||
|     - name: Enable Podman socket for systemd | ||||
|       systemd: | ||||
|         name: podman.socket | ||||
|         enabled: yes | ||||
|         state: started | ||||
|       ignore_errors: yes | ||||
|          | ||||
|     - name: Start Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
|          | ||||
|     - name: Wait for Nomad to be ready | ||||
|       wait_for: | ||||
|         port: 4646 | ||||
|         host: localhost | ||||
|         delay: 5 | ||||
|         timeout: 30 | ||||
|          | ||||
|     - name: Check Nomad status | ||||
|       uri: | ||||
|         url: http://localhost:4646/v1/status/leader | ||||
|         method: GET | ||||
|       register: nomad_status | ||||
|       retries: 3 | ||||
|       delay: 5 | ||||
|        | ||||
|     - name: Display Nomad status | ||||
|       debug: | ||||
|         msg: "Nomad leader: {{ nomad_status.json if nomad_status.json is defined else 'No leader elected' }}" | ||||
|  | @ -0,0 +1,72 @@ | |||
| --- | ||||
| - name: Fix Nomad Podman Driver Configuration | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   vars: | ||||
|     nomad_user: nomad | ||||
| 
 | ||||
|   tasks: | ||||
|     - name: Stop Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: stopped | ||||
| 
 | ||||
|     - name: Update Nomad configuration to properly reference Podman plugin | ||||
|       replace: | ||||
|         path: /etc/nomad.d/nomad.hcl | ||||
|         regexp: 'plugin "podman" \{\n  config \{\n    socket_path = "unix:///run/user/1001/podman/podman.sock"\n    volumes \{\n      enabled = true\n    \}\n  \}\n\}' | ||||
|         replace: | | ||||
|           plugin "nomad-driver-podman" { | ||||
|             config { | ||||
|               socket_path = "unix:///run/user/1001/podman/podman.sock" | ||||
|               volumes { | ||||
|                 enabled = true | ||||
|               } | ||||
|             } | ||||
|           } | ||||
| 
 | ||||
|     - name: Start Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
| 
 | ||||
|     - name: Wait for Nomad to be ready | ||||
|       wait_for: | ||||
|         port: 4646 | ||||
|         host: localhost | ||||
|         delay: 10 | ||||
|         timeout: 60 | ||||
| 
 | ||||
|     - name: Wait for plugins to load | ||||
|       pause: | ||||
|         seconds: 15 | ||||
| 
 | ||||
|     - name: Check if Podman driver is now loaded | ||||
|       # Diagnostic probe only: grep exits non-zero when the "Driver Status" | ||||
|       # section is missing, which must not abort the play before the log and | ||||
|       # final-verification tasks below get a chance to report what happened. | ||||
|       shell: | | ||||
|         sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self | grep -A 20 "Driver Status" | ||||
|       register: driver_status | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display driver status | ||||
|       debug: | ||||
|         var: driver_status.stdout_lines | ||||
| 
 | ||||
|     - name: Check Nomad logs for successful plugin loading | ||||
|       shell: journalctl -u nomad -n 20 --no-pager | grep -E "(podman|plugin)" | ||||
|       register: recent_logs | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display recent plugin logs | ||||
|       debug: | ||||
|         var: recent_logs.stdout_lines | ||||
| 
 | ||||
|     - name: Final verification - Test Podman functionality | ||||
|       shell: | | ||||
|         sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers | keys[]' | grep -i podman | ||||
|       register: podman_driver_check | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display final result | ||||
|       debug: | ||||
|         msg: | | ||||
|           Podman driver status: {{ 'SUCCESS - Driver loaded!' if 'podman' in (podman_driver_check.stdout | default('')) else 'Still checking...' }} | ||||
|           Available drivers: {{ podman_driver_check.stdout_lines | default(['none']) | join(', ') }} | ||||
|  | @ -0,0 +1,88 @@ | |||
| --- | ||||
| - name: Fix Nomad systemd service binary path | ||||
|   hosts: nomad_cluster | ||||
|   become: yes | ||||
|    | ||||
|   tasks: | ||||
|     - name: Check Nomad binary location | ||||
|       # Resolves the absolute path of the nomad executable; the result is | ||||
|       # interpolated into ExecStart of the systemd unit written further down. | ||||
|       # `command -v` is the POSIX lookup and, unlike `which`, is always available. | ||||
|       shell: command -v nomad | ||||
|       register: nomad_binary_path | ||||
|       # Pure lookup - never report the host as changed | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: Display binary path | ||||
|       debug: | ||||
|         msg: "Nomad binary 位于: {{ nomad_binary_path.stdout }}" | ||||
|          | ||||
|     - name: Stop Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: stopped | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Update Nomad systemd service with correct binary path | ||||
|       copy: | ||||
|         content: | | ||||
|           [Unit] | ||||
|           Description=Nomad | ||||
|           Documentation=https://www.nomadproject.io/ | ||||
|           Requires=network-online.target | ||||
|           After=network-online.target | ||||
|           ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl | ||||
| 
 | ||||
|           [Service] | ||||
|           Type=notify | ||||
|           User=nomad | ||||
|           Group=nomad | ||||
|           ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl | ||||
|           ExecReload=/bin/kill -HUP $MAINPID | ||||
|           KillMode=process | ||||
|           Restart=on-failure | ||||
|           LimitNOFILE=65536 | ||||
| 
 | ||||
|           [Install] | ||||
|           WantedBy=multi-user.target | ||||
|         dest: /etc/systemd/system/nomad.service | ||||
|         mode: '0644' | ||||
|       notify: reload systemd | ||||
|        | ||||
|     - name: Reload systemd and start Nomad servers first | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
|         enabled: yes | ||||
|         daemon_reload: yes | ||||
|       when: inventory_hostname in groups['nomad_servers'] | ||||
|        | ||||
|     - name: Wait for servers to be ready | ||||
|       pause: | ||||
|         seconds: 15 | ||||
|       when: inventory_hostname in groups['nomad_servers'] | ||||
|        | ||||
|     - name: Start Nomad clients | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
|         enabled: yes | ||||
|         daemon_reload: yes | ||||
|       when: inventory_hostname in groups['nomad_clients'] | ||||
|        | ||||
|     - name: Wait for clients to connect | ||||
|       pause: | ||||
|         seconds: 10 | ||||
|       when: inventory_hostname in groups['nomad_clients'] | ||||
|        | ||||
|     - name: Check final service status | ||||
|       shell: systemctl status nomad --no-pager -l | ||||
|       register: service_status | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Display service status | ||||
|       debug: | ||||
|         msg: | | ||||
|           ✅ 节点 {{ inventory_hostname }} 服务状态: | ||||
|           📊 状态: {{ 'SUCCESS' if service_status.rc == 0 else 'FAILED' }} | ||||
|           💾 二进制路径: {{ nomad_binary_path.stdout }} | ||||
|            | ||||
|   handlers: | ||||
|     - name: reload systemd | ||||
|       systemd: | ||||
|         daemon_reload: yes | ||||
|  | @ -0,0 +1,79 @@ | |||
| --- | ||||
| - name: Fix Podman installation on remaining nodes | ||||
|   hosts: semaphore,master,ash3c,hcs | ||||
|   become: yes | ||||
|   serial: 1  # 逐个处理,避免同时影响多个节点 | ||||
|    | ||||
|   tasks: | ||||
|     - name: Current node status | ||||
|       debug: | ||||
|         msg: "🔧 修复节点: {{ inventory_hostname }}" | ||||
|      | ||||
|     - name: Check if Podman is already installed | ||||
|       shell: podman --version 2>/dev/null || echo "NOT_INSTALLED" | ||||
|       register: podman_check | ||||
|        | ||||
|     - name: Install Podman if not present (semaphore special handling) | ||||
|       apt: | ||||
|         name: | ||||
|           - podman | ||||
|           - buildah | ||||
|           - skopeo | ||||
|         state: present | ||||
|         update_cache: yes | ||||
|         force_apt_get: yes | ||||
|       when: inventory_hostname == 'semaphore' and 'NOT_INSTALLED' in podman_check.stdout | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Install Podman on other nodes | ||||
|       apt: | ||||
|         name: | ||||
|           - podman | ||||
|           - buildah   | ||||
|           - skopeo | ||||
|         state: present | ||||
|       when: inventory_hostname != 'semaphore' | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Install Python dependencies for podman-compose | ||||
|       apt: | ||||
|         name: | ||||
|           - python3-pip | ||||
|           - python3-setuptools | ||||
|           - python3-yaml | ||||
|           - python3-dotenv | ||||
|         state: present | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Install podman-compose via pip | ||||
|       # First choice: install from PyPI. The result is registered so the apt | ||||
|       # alternative below runs only when this attempt did not succeed. | ||||
|       pip: | ||||
|         name:  | ||||
|           - podman-compose | ||||
|         state: present | ||||
|         executable: pip3 | ||||
|       register: podman_compose_pip | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Alternative podman-compose installation via apt | ||||
|       # Fallback only: avoids installing a second, shadowing copy when pip worked | ||||
|       apt: | ||||
|         name: podman-compose | ||||
|         state: present | ||||
|       when: podman_compose_pip is failed | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Verify installations | ||||
|       shell: | | ||||
|         echo "Podman: $(podman --version 2>/dev/null || echo 'FAILED')" | ||||
|         echo "Podman Compose: $(podman-compose --version 2>/dev/null || echo 'FAILED')" | ||||
|       register: verify_result | ||||
|        | ||||
|     - name: Display verification results | ||||
|       debug: | ||||
|         msg: | | ||||
|           ✅ 节点 {{ inventory_hostname }} 验证结果: | ||||
|           {{ verify_result.stdout }} | ||||
|            | ||||
|     - name: Enable Podman socket | ||||
|       systemd: | ||||
|         name: podman.socket | ||||
|         enabled: yes | ||||
|         state: started | ||||
|       ignore_errors: yes | ||||
|  | @ -0,0 +1,133 @@ | |||
| --- | ||||
| - name: Install Nomad by direct download from HashiCorp | ||||
|   hosts: hcs | ||||
|   become: yes | ||||
|   vars: | ||||
|     nomad_version: "1.10.5" | ||||
|     nomad_url: "https://releases.hashicorp.com/nomad/{{ nomad_version }}/nomad_{{ nomad_version }}_linux_amd64.zip" | ||||
|     nomad_user: "nomad" | ||||
|     nomad_group: "nomad" | ||||
|     nomad_home: "/opt/nomad" | ||||
|     nomad_data_dir: "/opt/nomad/data" | ||||
|     nomad_config_dir: "/etc/nomad.d" | ||||
|     nomad_datacenter: "dc1" | ||||
|     nomad_region: "global" | ||||
|     nomad_server_addresses: | ||||
|       - "100.116.158.95:4647"  # semaphore server address | ||||
| 
 | ||||
|   tasks: | ||||
|     # The user module does not create a missing primary group, so the group | ||||
|     # has to exist before the account that references it is added. | ||||
|     - name: Create nomad group | ||||
|       group: | ||||
|         name: "{{ nomad_group }}" | ||||
|         system: yes | ||||
|         state: present | ||||
| 
 | ||||
|     # System account without a login shell; its home is the Nomad base directory | ||||
|     - name: Create nomad user | ||||
|       user: | ||||
|         name: "{{ nomad_user }}" | ||||
|         group: "{{ nomad_group }}" | ||||
|         system: yes | ||||
|         shell: /bin/false | ||||
|         home: "{{ nomad_home }}" | ||||
|         create_home: yes | ||||
| 
 | ||||
|     - name: Create nomad directories | ||||
|       file: | ||||
|         path: "{{ item }}" | ||||
|         state: directory | ||||
|         owner: "{{ nomad_user }}" | ||||
|         group: "{{ nomad_group }}" | ||||
|         mode: '0755' | ||||
|       loop: | ||||
|         - "{{ nomad_home }}" | ||||
|         - "{{ nomad_data_dir }}" | ||||
|         - "{{ nomad_config_dir }}" | ||||
|         - /var/log/nomad | ||||
| 
 | ||||
|     - name: Install unzip package | ||||
|       apt: | ||||
|         name: unzip | ||||
|         state: present | ||||
|         update_cache: yes | ||||
| 
 | ||||
|     - name: Download Nomad binary | ||||
|       # Fetches the release archive and verifies it against the SHA256SUMS file | ||||
|       # published next to it, so a truncated or tampered download is rejected | ||||
|       # before the binary is installed into /usr/local/bin. | ||||
|       get_url: | ||||
|         url: "{{ nomad_url }}" | ||||
|         dest: "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip" | ||||
|         checksum: "sha256:https://releases.hashicorp.com/nomad/{{ nomad_version }}/nomad_{{ nomad_version }}_SHA256SUMS" | ||||
|         mode: '0644' | ||||
|         timeout: 300 | ||||
| 
 | ||||
|     - name: Extract Nomad binary | ||||
|       unarchive: | ||||
|         src: "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip" | ||||
|         dest: /tmp | ||||
|         remote_src: yes | ||||
| 
 | ||||
|     - name: Copy Nomad binary to /usr/local/bin | ||||
|       copy: | ||||
|         src: /tmp/nomad | ||||
|         dest: /usr/local/bin/nomad | ||||
|         mode: '0755' | ||||
|         owner: root | ||||
|         group: root | ||||
|         remote_src: yes | ||||
| 
 | ||||
|     - name: Create Nomad client configuration | ||||
|       template: | ||||
|         src: templates/nomad-client.hcl.j2 | ||||
|         dest: "{{ nomad_config_dir }}/nomad.hcl" | ||||
|         owner: "{{ nomad_user }}" | ||||
|         group: "{{ nomad_group }}" | ||||
|         mode: '0640' | ||||
| 
 | ||||
|     - name: Create Nomad systemd service | ||||
|       copy: | ||||
|         content: | | ||||
|           [Unit] | ||||
|           Description=Nomad | ||||
|           Documentation=https://www.nomadproject.io/ | ||||
|           Requires=network-online.target | ||||
|           After=network-online.target | ||||
|           ConditionFileNotEmpty={{ nomad_config_dir }}/nomad.hcl | ||||
| 
 | ||||
|           [Service] | ||||
|           Type=notify | ||||
|           User={{ nomad_user }} | ||||
|           Group={{ nomad_group }} | ||||
|           ExecStart=/usr/local/bin/nomad agent -config={{ nomad_config_dir }} | ||||
|           ExecReload=/bin/kill -HUP $MAINPID | ||||
|           KillMode=process | ||||
|           Restart=on-failure | ||||
|           LimitNOFILE=65536 | ||||
| 
 | ||||
|           [Install] | ||||
|           WantedBy=multi-user.target | ||||
|         dest: /etc/systemd/system/nomad.service | ||||
|         mode: '0644' | ||||
| 
 | ||||
|     - name: Reload systemd daemon | ||||
|       systemd: | ||||
|         daemon_reload: yes | ||||
| 
 | ||||
|     - name: Enable and start Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         enabled: yes | ||||
|         state: started | ||||
| 
 | ||||
|     - name: Wait for Nomad to be ready | ||||
|       wait_for: | ||||
|         port: 4646 | ||||
|         host: localhost | ||||
|         delay: 5 | ||||
|         timeout: 60 | ||||
| 
 | ||||
|     - name: Verify Nomad installation | ||||
|       command: /usr/local/bin/nomad version | ||||
|       register: nomad_version_output | ||||
| 
 | ||||
|     - name: Display Nomad version | ||||
|       debug: | ||||
|         msg: "{{ nomad_version_output.stdout }}" | ||||
| 
 | ||||
|     - name: Clean up downloaded files | ||||
|       file: | ||||
|         path: "{{ item }}" | ||||
|         state: absent | ||||
|       loop: | ||||
|         - "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip" | ||||
|         - /tmp/nomad | ||||
|  | @ -0,0 +1,131 @@ | |||
| --- | ||||
| - name: Install Nomad Podman Driver Plugin | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   vars: | ||||
|     nomad_user: nomad | ||||
|     nomad_data_dir: /opt/nomad/data | ||||
|     nomad_plugins_dir: "{{ nomad_data_dir }}/plugins" | ||||
|     podman_driver_version: "0.6.1" | ||||
|     podman_driver_url: "https://releases.hashicorp.com/nomad-driver-podman/{{ podman_driver_version }}/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip" | ||||
| 
 | ||||
|   tasks: | ||||
|     - name: Stop Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: stopped | ||||
| 
 | ||||
|     - name: Create plugins directory | ||||
|       file: | ||||
|         path: "{{ nomad_plugins_dir }}" | ||||
|         state: directory | ||||
|         owner: "{{ nomad_user }}" | ||||
|         group: "{{ nomad_user }}" | ||||
|         mode: '0755' | ||||
| 
 | ||||
|     - name: Download Nomad Podman driver | ||||
|       # Fetches the plugin archive and verifies it against the SHA256SUMS file | ||||
|       # published next to it; the plugin is executed by the Nomad agent, so an | ||||
|       # unverified download must not reach the plugin directory. | ||||
|       get_url: | ||||
|         url: "{{ podman_driver_url }}" | ||||
|         dest: "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip" | ||||
|         checksum: "sha256:https://releases.hashicorp.com/nomad-driver-podman/{{ podman_driver_version }}/nomad-driver-podman_{{ podman_driver_version }}_SHA256SUMS" | ||||
|         mode: '0644' | ||||
| 
 | ||||
|     - name: Extract Nomad Podman driver | ||||
|       unarchive: | ||||
|         src: "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip" | ||||
|         dest: "/tmp" | ||||
|         remote_src: yes | ||||
| 
 | ||||
|     - name: Install Nomad Podman driver | ||||
|       copy: | ||||
|         src: "/tmp/nomad-driver-podman" | ||||
|         dest: "{{ nomad_plugins_dir }}/nomad-driver-podman" | ||||
|         owner: "{{ nomad_user }}" | ||||
|         group: "{{ nomad_user }}" | ||||
|         mode: '0755' | ||||
|         remote_src: yes | ||||
| 
 | ||||
|     - name: Update Nomad configuration for plugin directory | ||||
|       # Adds a marker-delimited `plugin_dir` setting right after the data_dir line. | ||||
|       # The anchor is a regex that tolerates any spacing around `=` (configs that | ||||
|       # column-align their keys would not match a single-space literal) and is | ||||
|       # built from nomad_data_dir instead of repeating the path. If nothing | ||||
|       # matches, blockinfile appends the block at end of file. | ||||
|       blockinfile: | ||||
|         path: /etc/nomad.d/nomad.hcl | ||||
|         marker: "# {mark} PLUGIN DIRECTORY CONFIGURATION" | ||||
|         block: | | ||||
|           plugin_dir = "{{ nomad_plugins_dir }}" | ||||
|         insertafter: '^\s*data_dir\s*=\s*"{{ nomad_data_dir | regex_escape }}"' | ||||
| 
 | ||||
|     - name: Fix Podman socket permissions | ||||
|       file: | ||||
|         path: /run/user/1001/podman/podman.sock | ||||
|         mode: '0666' | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Ensure nomad user can access Podman socket | ||||
|       user: | ||||
|         name: "{{ nomad_user }}" | ||||
|         groups: ben | ||||
|         append: yes | ||||
| 
 | ||||
|     - name: Start Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
|         enabled: yes | ||||
| 
 | ||||
|     - name: Wait for Nomad to be ready | ||||
|       wait_for: | ||||
|         port: 4646 | ||||
|         host: localhost | ||||
|         delay: 10 | ||||
|         timeout: 60 | ||||
| 
 | ||||
|     - name: Verify Nomad is running | ||||
|       systemd: | ||||
|         name: nomad | ||||
|       register: nomad_service_status | ||||
| 
 | ||||
|     - name: Display Nomad service status | ||||
|       debug: | ||||
|         msg: "Nomad service is {{ nomad_service_status.status.ActiveState }}" | ||||
| 
 | ||||
|     - name: Wait for plugins to load | ||||
|       pause: | ||||
|         seconds: 15 | ||||
| 
 | ||||
|     - name: Check available drivers | ||||
|       shell: | | ||||
|         sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self | grep -A 20 "Driver Status" | ||||
|       register: driver_status | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display driver status | ||||
|       debug: | ||||
|         var: driver_status.stdout_lines | ||||
| 
 | ||||
|     - name: Test Podman driver functionality | ||||
|       shell: | | ||||
|         sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers | keys[]' | ||||
|       register: available_drivers | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display available drivers | ||||
|       debug: | ||||
|         msg: "Available drivers: {{ available_drivers.stdout_lines | join(', ') }}" | ||||
| 
 | ||||
|     - name: Clean up downloaded files | ||||
|       file: | ||||
|         path: "{{ item }}" | ||||
|         state: absent | ||||
|       loop: | ||||
|         - "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip" | ||||
|         - "/tmp/nomad-driver-podman" | ||||
| 
 | ||||
|     - name: Final verification - Check if Podman driver is loaded | ||||
|       shell: | | ||||
|         sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers.podman.Detected' | ||||
|       register: podman_driver_detected | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display final result | ||||
|       debug: | ||||
|         msg: | | ||||
|           Podman driver installation: {{ 'SUCCESS' if podman_driver_detected.stdout == 'true' else 'NEEDS VERIFICATION' }} | ||||
|           Driver detected: {{ podman_driver_detected.stdout | default('unknown') }} | ||||
|  | @ -0,0 +1,61 @@ | |||
| --- | ||||
| - name: Install Podman Compose on all Nomad cluster nodes | ||||
|   hosts: nomad_cluster | ||||
|   become: yes | ||||
|    | ||||
|   tasks: | ||||
|     - name: Display target node | ||||
|       debug: | ||||
|         msg: "正在安装 Podman Compose 到节点: {{ inventory_hostname }}" | ||||
|      | ||||
|     - name: Update package cache | ||||
|       apt: | ||||
|         update_cache: yes | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Install Podman and related tools | ||||
|       # Preferred path: distribution packages. The outcome is registered so the | ||||
|       # pip fallback further down only runs when this step really failed. | ||||
|       apt: | ||||
|         name: | ||||
|           - podman | ||||
|           - podman-compose | ||||
|           - buildah | ||||
|           - skopeo | ||||
|         state: present | ||||
|       register: podman_apt_install | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Install additional dependencies | ||||
|       # pip and setuptools are needed by the pip-based fallback below | ||||
|       apt: | ||||
|         name: | ||||
|           - python3-pip | ||||
|           - python3-setuptools | ||||
|         state: present | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Install podman-compose via pip if package manager failed | ||||
|       # Fallback only, as the task name says: skipped when apt already installed | ||||
|       # podman-compose, so no second copy ends up shadowing the packaged one. | ||||
|       pip: | ||||
|         name: podman-compose | ||||
|         state: present | ||||
|       when: podman_apt_install is failed | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Verify Podman installation | ||||
|       shell: podman --version | ||||
|       register: podman_version | ||||
|        | ||||
|     - name: Verify Podman Compose installation | ||||
|       shell: podman-compose --version | ||||
|       register: podman_compose_version | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Display installation results | ||||
|       debug: | ||||
|         msg: | | ||||
|           ✅ 节点 {{ inventory_hostname }} 安装结果: | ||||
|           📦 Podman: {{ podman_version.stdout }} | ||||
|           🐳 Podman Compose: {{ podman_compose_version.stdout if podman_compose_version.rc == 0 else '安装失败或不可用' }} | ||||
|            | ||||
|     - name: Ensure Podman socket is enabled | ||||
|       systemd: | ||||
|         name: podman.socket | ||||
|         enabled: yes | ||||
|         state: started | ||||
|       ignore_errors: yes | ||||
|  | @ -1,131 +0,0 @@ | |||
| --- | ||||
| - name: Operations Toolkit - Unified Management Dashboard | ||||
|   hosts: all | ||||
|   gather_facts: yes | ||||
|    | ||||
|   vars: | ||||
|     # 可用的运维脚本 | ||||
|     available_scripts: | ||||
|       - { name: "system-update", desc: "System package updates", file: "system-update.yml" } | ||||
|       - { name: "system-cleanup", desc: "System cleanup and maintenance", file: "system-cleanup.yml" } | ||||
|       - { name: "service-health", desc: "Service health monitoring", file: "service-health-check.yml" } | ||||
|       - { name: "security-hardening", desc: "Security hardening and backup", file: "security-hardening.yml" } | ||||
|       - { name: "docker-management", desc: "Docker container management", file: "docker-management.yml" } | ||||
|       - { name: "network-connectivity", desc: "Network connectivity check", file: "network-connectivity.yml" } | ||||
|       - { name: "certificate-management", desc: "SSL certificate monitoring", file: "certificate-management.yml" } | ||||
|    | ||||
|   tasks: | ||||
|     # 显示系统概览 | ||||
|     - name: Display system overview | ||||
|       debug: | ||||
|         msg: | | ||||
|           🖥️  System Overview for {{ inventory_hostname }}: | ||||
|           📊 OS: {{ ansible_distribution }} {{ ansible_distribution_version }} | ||||
|           💾 Memory: {{ (ansible_memtotal_mb/1024)|round(1) }}GB total, {{ (ansible_memfree_mb/1024)|round(1) }}GB free | ||||
|           💿 CPU: {{ ansible_processor_vcpus }} cores | ||||
|           🏠 Architecture: {{ ansible_architecture }} | ||||
|           🌐 IP: {{ ansible_default_ipv4.address }} | ||||
|           ⏰ Uptime: {{ ansible_uptime_seconds//86400 }}d {{ (ansible_uptime_seconds%86400)//3600 }}h {{ ((ansible_uptime_seconds%3600)//60) }}m | ||||
|            | ||||
|     # 快速系统状态检查 | ||||
|     - name: Quick system status check | ||||
|       shell: | | ||||
|         echo "=== DISK USAGE ===" | ||||
|         df -h | grep -E "(Filesystem|/dev/)" | ||||
|         echo "" | ||||
|         echo "=== MEMORY USAGE ===" | ||||
|         free -h | ||||
|         echo "" | ||||
|         echo "=== LOAD AVERAGE ===" | ||||
|         uptime | ||||
|         echo "" | ||||
|         echo "=== TOP PROCESSES ===" | ||||
|         ps aux --sort=-%cpu | head -6 | ||||
|       register: quick_status | ||||
|        | ||||
|     - name: Display quick status | ||||
|       debug: | ||||
|         msg: "{{ quick_status.stdout_lines }}" | ||||
|          | ||||
|     # 检查关键服务状态 | ||||
|     - name: Check critical services | ||||
|       systemd: | ||||
|         name: "{{ item }}" | ||||
|       register: service_status | ||||
|       loop: | ||||
|         - ssh | ||||
|         - systemd-resolved | ||||
|         - cron | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Display service status | ||||
|       debug: | ||||
|         msg: "🔧 {{ item.item }}: {{ item.status.ActiveState if item.status is defined else 'NOT FOUND' }}" | ||||
|       loop: "{{ service_status.results }}" | ||||
|        | ||||
|     # 检查最近的系统日志错误 | ||||
|     - name: Check recent system errors | ||||
|       shell: journalctl --since "1 hour ago" --priority=err --no-pager | tail -10 | ||||
|       register: recent_errors | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Display recent errors | ||||
|       debug: | ||||
|         msg: "🚨 Recent Errors: {{ recent_errors.stdout_lines if recent_errors.stdout_lines else ['No recent errors found'] }}" | ||||
|          | ||||
|     # 检查网络连接 | ||||
|     - name: Quick network check | ||||
|       shell: | | ||||
|         echo "=== NETWORK INTERFACES ===" | ||||
|         ip -br addr show | ||||
|         echo "" | ||||
|         echo "=== DEFAULT ROUTE ===" | ||||
|         ip route | grep default | ||||
|         echo "" | ||||
|         echo "=== DNS TEST ===" | ||||
|         nslookup google.com | grep -A1 "Name:" || echo "DNS resolution failed" | ||||
|       register: network_check | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Display network status | ||||
|       debug: | ||||
|         msg: "🌐 Network Status: {{ network_check.stdout_lines }}" | ||||
|          | ||||
|     # 显示可用的运维脚本 | ||||
|     - name: Display available operations scripts | ||||
|       debug: | ||||
|         msg: | | ||||
|           🛠️  Available Operations Scripts: | ||||
|           {% for script in available_scripts %} | ||||
|           {{ loop.index }}. {{ script.name }}: {{ script.desc }} | ||||
|           {% endfor %} | ||||
|            | ||||
|           💡 Usage Examples: | ||||
|           ansible-playbook -i inventory.ini system-cleanup.yml --limit {{ inventory_hostname }} | ||||
|           ansible-playbook -i inventory.ini docker-management.yml --limit lxc | ||||
|           ansible-playbook -i inventory.ini network-connectivity.yml --limit proxmox | ||||
|            | ||||
|     # 生成运维建议 | ||||
|     - name: Generate maintenance recommendations | ||||
|       debug: | ||||
|         msg: | | ||||
|           💡 Maintenance Recommendations for {{ inventory_hostname }}: | ||||
|            | ||||
|           🔄 Regular Tasks (Weekly): | ||||
|           - Run system-cleanup.yml to free up disk space | ||||
|           - Check service-health-check.yml for service status | ||||
|           - Review certificate-management.yml for expiring certificates | ||||
|            | ||||
|           🔒 Security Tasks (Monthly): | ||||
|           - Execute security-hardening.yml for security updates | ||||
|           - Review network-connectivity.yml for network security | ||||
|            | ||||
|           🐳 Container Tasks (As needed): | ||||
|           - Use docker-management.yml for Docker maintenance | ||||
|            | ||||
|           📊 Monitoring Tasks (Daily): | ||||
|           - Quick check with ops-toolkit.yml (this script) | ||||
|            | ||||
|           ⚡ Emergency Tasks: | ||||
|           - Use system-update.yml for critical security patches | ||||
|           - Run network-connectivity.yml for connectivity issues | ||||
|  | @ -0,0 +1,167 @@ | |||
| --- | ||||
| - name: Migrate Nomad from Docker to Podman (Simple Version) | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   vars: | ||||
|     nomad_user: nomad | ||||
|     nomad_config_dir: /etc/nomad.d | ||||
|     nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl" | ||||
| 
 | ||||
|   tasks: | ||||
|     - name: Stop Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: stopped | ||||
| 
 | ||||
|     - name: Backup current Nomad configuration | ||||
|       copy: | ||||
|         src: "{{ nomad_config_file }}" | ||||
|         dest: "{{ nomad_config_file }}.backup-{{ ansible_date_time.epoch }}" | ||||
|         remote_src: yes | ||||
| 
 | ||||
|     - name: Get nomad user info | ||||
|       getent: | ||||
|         database: passwd | ||||
|         key: "{{ nomad_user }}" | ||||
|       register: nomad_user_info | ||||
| 
 | ||||
|     - name: Set nomad user UID variable | ||||
|       set_fact: | ||||
|         nomad_uid: "{{ nomad_user_info.ansible_facts.getent_passwd[nomad_user][1] }}" | ||||
| 
 | ||||
|     # Lingering keeps the nomad user's systemd instance running without an | ||||
|     # interactive login. systemd-logind records it as a marker file under | ||||
|     # /var/lib/systemd/linger, so `creates` lets re-runs report "ok" instead of | ||||
|     # "changed". Kept best-effort (failed_when: false) as in the original. | ||||
|     - name: Enable lingering for nomad user | ||||
|       command: loginctl enable-linger {{ nomad_user }} | ||||
|       args: | ||||
|         creates: "/var/lib/systemd/linger/{{ nomad_user }}" | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Create runtime directory for nomad user | ||||
|       file: | ||||
|         path: "/run/user/{{ nomad_uid }}" | ||||
|         state: directory | ||||
|         owner: "{{ nomad_user }}" | ||||
|         group: "{{ nomad_user }}" | ||||
|         mode: '0700' | ||||
| 
 | ||||
|     # Enable and start podman.socket in the nomad user's systemd instance. | ||||
|     # Skipped when the socket file named in `creates` already exists. | ||||
|     - name: Start Podman socket as nomad user | ||||
|       shell: | ||||
|         cmd: sudo -u {{ nomad_user }} XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} systemctl --user enable --now podman.socket | ||||
|         creates: "/run/user/{{ nomad_uid }}/podman/podman.sock" | ||||
| 
 | ||||
|     - name: Create new Nomad configuration with Podman | ||||
|       copy: | ||||
|         content: | | ||||
|           datacenter = "dc1" | ||||
|           region = "global" | ||||
|           data_dir = "/opt/nomad/data" | ||||
| 
 | ||||
|           bind_addr = "0.0.0.0" | ||||
| 
 | ||||
|           client { | ||||
|             enabled = true | ||||
|             servers = [ | ||||
|               "100.116.158.95:4647", | ||||
|             ] | ||||
|           } | ||||
| 
 | ||||
|           # Docker plugin (disabled) | ||||
|           # plugin "docker" { | ||||
|           #   config { | ||||
|           #     allow_privileged = true | ||||
|           #     volumes { | ||||
|           #       enabled = true | ||||
|           #     } | ||||
|           #   } | ||||
|           # } | ||||
| 
 | ||||
|           plugin "podman" { | ||||
|             config { | ||||
|               socket_path = "unix:///run/user/{{ nomad_uid }}/podman/podman.sock" | ||||
|               volumes { | ||||
|                 enabled = true | ||||
|               } | ||||
|             } | ||||
|           } | ||||
| 
 | ||||
|           consul { | ||||
|             address = "127.0.0.1:8500" | ||||
|           } | ||||
|         dest: "{{ nomad_config_file }}" | ||||
|         owner: root | ||||
|         group: root | ||||
|         mode: '0644' | ||||
| 
 | ||||
|     - name: Update Nomad systemd service to run as nomad user | ||||
|       copy: | ||||
|         content: | | ||||
|           [Unit] | ||||
|           Description=Nomad | ||||
|           Documentation=https://www.nomadproject.io/ | ||||
|           Requires=network-online.target | ||||
|           After=network-online.target | ||||
|           Wants=network-online.target | ||||
| 
 | ||||
|           [Service] | ||||
|           Type=notify | ||||
|           User={{ nomad_user }} | ||||
|           Group={{ nomad_user }} | ||||
|           ExecReload=/bin/kill -HUP $MAINPID | ||||
|           ExecStart=/usr/local/bin/nomad agent -config={{ nomad_config_dir }} | ||||
|           KillMode=process | ||||
|           Restart=on-failure | ||||
|           LimitNOFILE=65536 | ||||
|           Environment=XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} | ||||
| 
 | ||||
|           [Install] | ||||
|           WantedBy=multi-user.target | ||||
|         dest: /etc/systemd/system/nomad.service | ||||
|         owner: root | ||||
|         group: root | ||||
|         mode: '0644' | ||||
| 
 | ||||
|     - name: Reload systemd daemon | ||||
|       systemd: | ||||
|         daemon_reload: yes | ||||
| 
 | ||||
|     - name: Start Nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
|         enabled: yes | ||||
| 
 | ||||
|     - name: Wait for Nomad to be ready (local check) | ||||
|       wait_for: | ||||
|         port: 4646 | ||||
|         host: localhost | ||||
|         delay: 5 | ||||
|         timeout: 60 | ||||
| 
 | ||||
|     # Read-only probe: `systemctl is-active` exits non-zero unless the unit is | ||||
|     # active, so the play still fails here when Nomad did not come up. | ||||
|     # No shell features are needed, and the task never changes the host. | ||||
|     - name: Verify Nomad is running | ||||
|       command: systemctl is-active nomad | ||||
|       register: nomad_status | ||||
|       changed_when: false | ||||
| 
 | ||||
|     - name: Display Nomad status | ||||
|       debug: | ||||
|         msg: "Nomad service status: {{ nomad_status.stdout }}" | ||||
| 
 | ||||
|     - name: Check Podman socket | ||||
|       stat: | ||||
|         path: "/run/user/{{ nomad_uid }}/podman/podman.sock" | ||||
|       register: podman_socket | ||||
| 
 | ||||
|     - name: Display Podman socket status | ||||
|       debug: | ||||
|         msg: "Podman socket exists: {{ podman_socket.stat.exists }}" | ||||
| 
 | ||||
|     # Smoke test: run the podman CLI as the nomad user with the same | ||||
|     # XDG_RUNTIME_DIR the Nomad service uses. The result is only reported by the | ||||
|     # next task, so it never fails the play and never counts as a change. | ||||
|     - name: Test Podman as nomad user | ||||
|       shell: | | ||||
|         sudo -u {{ nomad_user }} XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} podman version --format json | ||||
|       register: podman_test | ||||
|       changed_when: false | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: Display Podman test result | ||||
|       debug: | ||||
|         msg: | | ||||
|           Podman test: {{ 'SUCCESS' if podman_test.rc == 0 else 'FAILED' }} | ||||
|           {% if podman_test.rc != 0 %} | ||||
|           Error: {{ podman_test.stderr }} | ||||
|           {% endif %} | ||||
|  | @ -1,143 +0,0 @@ | |||
| --- | ||||
| - name: Network Connectivity and Performance Check | ||||
|   hosts: all | ||||
|   gather_facts: yes | ||||
|    | ||||
|   vars: | ||||
|     test_domains: | ||||
|       - google.com | ||||
|       - github.com | ||||
|       - docker.io | ||||
|       - tailscale.com | ||||
|     test_ports: | ||||
|       - { host: "8.8.8.8", port: 53, name: "Google DNS" } | ||||
|       - { host: "1.1.1.1", port: 53, name: "Cloudflare DNS" } | ||||
|       - { host: "github.com", port: 443, name: "GitHub HTTPS" } | ||||
|       - { host: "docker.io", port: 443, name: "Docker Hub" } | ||||
|    | ||||
|   tasks: | ||||
|     # 基本网络信息 | ||||
|     - name: Get network interfaces | ||||
|       shell: ip addr show | grep -E "^[0-9]+:|inet " | ||||
|       register: network_interfaces | ||||
|        | ||||
|     - name: Display network interfaces | ||||
|       debug: | ||||
|         msg: "🌐 Network Interfaces: {{ network_interfaces.stdout_lines }}" | ||||
|          | ||||
|     # 检查默认路由 | ||||
|     - name: Check default route | ||||
|       shell: ip route | grep default | ||||
|       register: default_route | ||||
|        | ||||
|     - name: Display default route | ||||
|       debug: | ||||
|         msg: "🛣️  Default Route: {{ default_route.stdout }}" | ||||
|          | ||||
|     # DNS 解析测试 | ||||
|     - name: Test DNS resolution | ||||
|       shell: nslookup {{ item }} | grep -A2 "Name:" | ||||
|       register: dns_test | ||||
|       loop: "{{ test_domains }}" | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Display DNS test results | ||||
|       debug: | ||||
|         msg: "🔍 DNS Test for {{ item.item }}: {{ 'SUCCESS' if item.rc == 0 else 'FAILED' }}" | ||||
|       loop: "{{ dns_test.results }}" | ||||
|        | ||||
|     # 网络连通性测试 | ||||
|     - name: Test network connectivity (ping) | ||||
|       shell: ping -c 3 {{ item }} | ||||
|       register: ping_test | ||||
|       loop: "{{ test_domains }}" | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Display ping test results | ||||
|       debug: | ||||
|         msg: "🏓 Ping to {{ item.item }}: {{ 'SUCCESS' if item.rc == 0 else 'FAILED' }}" | ||||
|       loop: "{{ ping_test.results }}" | ||||
|        | ||||
|     # Port connectivity test. | ||||
|     # ignore_errors (instead of failed_when: false) lets the play continue while | ||||
|     # keeping each item's real `failed` flag; failed_when: false would overwrite | ||||
|     # it with false and every port would be displayed as SUCCESS. | ||||
|     - name: Test port connectivity | ||||
|       wait_for: | ||||
|         host: "{{ item.host }}" | ||||
|         port: "{{ item.port }}" | ||||
|         timeout: 5 | ||||
|       register: port_test | ||||
|       loop: "{{ test_ports }}" | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Display port test results | ||||
|       debug: | ||||
|         msg: "🔌 {{ item.item.name }} ({{ item.item.host }}:{{ item.item.port }}): {{ 'SUCCESS' if not (item.failed | default(false)) else 'FAILED' }}" | ||||
|       loop: "{{ port_test.results }}" | ||||
|        | ||||
|     # 检查 Tailscale 状态 | ||||
|     - name: Check Tailscale status | ||||
|       shell: tailscale status | ||||
|       register: tailscale_status | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Display Tailscale status | ||||
|       debug: | ||||
|         msg: "🔗 Tailscale Status: {{ 'CONNECTED' if tailscale_status.rc == 0 else 'NOT CONNECTED' }}" | ||||
|          | ||||
|     - name: Show Tailscale details | ||||
|       debug: | ||||
|         msg: "{{ tailscale_status.stdout_lines }}" | ||||
|       when: tailscale_status.rc == 0 | ||||
|        | ||||
|     # 检查防火墙状态 | ||||
|     - name: Check UFW status (Ubuntu/Debian) | ||||
|       shell: ufw status | ||||
|       register: ufw_status | ||||
|       failed_when: false | ||||
|       when: ansible_os_family == "Debian" | ||||
|        | ||||
|     - name: Display UFW status | ||||
|       debug: | ||||
|         msg: "🛡️  UFW Firewall: {{ ufw_status.stdout_lines }}" | ||||
|       when: ansible_os_family == "Debian" and ufw_status.rc == 0 | ||||
|        | ||||
|     # 检查 iptables 规则 | ||||
|     - name: Check iptables rules | ||||
|       shell: iptables -L -n | head -20 | ||||
|       register: iptables_rules | ||||
|       failed_when: false | ||||
|       become: yes | ||||
|        | ||||
|     - name: Display iptables summary | ||||
|       debug: | ||||
|         msg: "🔥 Iptables Rules: {{ iptables_rules.stdout_lines[:10] }}" | ||||
|       when: iptables_rules.rc == 0 | ||||
|        | ||||
|     # 网络性能测试 | ||||
|     - name: Test download speed (small file) | ||||
|       shell: curl -o /dev/null -s -w "%{time_total}" http://speedtest.wdc01.softlayer.com/downloads/test10.zip | ||||
|       register: download_speed | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Display download speed test | ||||
|       debug: | ||||
|         msg: "⚡ Download Speed Test: {{ download_speed.stdout }}s for 10MB file" | ||||
|       when: download_speed.rc == 0 | ||||
|        | ||||
|     # 检查网络统计 | ||||
|     - name: Get network statistics | ||||
|       shell: cat /proc/net/dev | grep -v "lo:" | grep ":" | ||||
|       register: network_stats | ||||
|        | ||||
|     - name: Display network statistics | ||||
|       debug: | ||||
|         msg: "📊 Network Stats: {{ network_stats.stdout_lines }}" | ||||
|          | ||||
|     # Generate the network health report. | ||||
|     # Port count: every loop result carries a `failed` key, so rejecting on | ||||
|     # "failed is defined" always produced 0. wait_for only returns `msg` when it | ||||
|     # times out, so results without `msg` are the reachable ports. | ||||
|     # NOTE(review): relies on wait_for's return shape; confirm against the module docs. | ||||
|     - name: Generate network health summary | ||||
|       debug: | ||||
|         msg: | | ||||
|           🌐 Network Health Summary for {{ inventory_hostname }}: | ||||
|           ✅ DNS Resolution: {{ (dns_test.results | selectattr('rc', 'equalto', 0) | list | length) }}/{{ test_domains | length }} domains | ||||
|           ✅ Ping Connectivity: {{ (ping_test.results | selectattr('rc', 'equalto', 0) | list | length) }}/{{ test_domains | length }} hosts | ||||
|           ✅ Port Connectivity: {{ (port_test.results | rejectattr('msg', 'defined') | list | length) }}/{{ test_ports | length }} ports | ||||
|           ✅ Tailscale: {{ 'Connected' if tailscale_status.rc == 0 else 'Disconnected' }} | ||||
|  | @ -1,135 +0,0 @@ | |||
| --- | ||||
| - name: Service Health Check and Monitoring | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   gather_facts: yes | ||||
|    | ||||
|   vars: | ||||
|     critical_services: | ||||
|       - ssh | ||||
|       - systemd-resolved | ||||
|       - cron | ||||
|     web_services: | ||||
|       - nginx | ||||
|       - apache2 | ||||
|     database_services: | ||||
|       - mysql | ||||
|       - mariadb | ||||
|       - postgresql | ||||
|     container_services: | ||||
|       - docker | ||||
|       - containerd | ||||
|     network_services: | ||||
|       - tailscale | ||||
|       - cloudflared | ||||
|        | ||||
|   tasks: | ||||
|     # 检查关键系统服务 | ||||
|     - name: Check critical system services | ||||
|       systemd: | ||||
|         name: "{{ item }}" | ||||
|       register: critical_service_status | ||||
|       loop: "{{ critical_services }}" | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Report critical service issues | ||||
|       debug: | ||||
|         msg: "⚠️  Critical service {{ item.item }} is {{ item.status.ActiveState | default('not found') }}" | ||||
|       loop: "{{ critical_service_status.results }}" | ||||
|       when: item.status is defined and item.status.ActiveState != "active" | ||||
|        | ||||
|     # 检查 Web 服务 | ||||
|     - name: Check web services | ||||
|       systemd: | ||||
|         name: "{{ item }}" | ||||
|       register: web_service_status | ||||
|       loop: "{{ web_services }}" | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Report web service status | ||||
|       debug: | ||||
|         msg: "🌐 Web service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}" | ||||
|       loop: "{{ web_service_status.results }}" | ||||
|       when: item.status is defined | ||||
|        | ||||
|     # 检查数据库服务 | ||||
|     - name: Check database services | ||||
|       systemd: | ||||
|         name: "{{ item }}" | ||||
|       register: db_service_status | ||||
|       loop: "{{ database_services }}" | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Report database service status | ||||
|       debug: | ||||
|         msg: "🗄️  Database service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}" | ||||
|       loop: "{{ db_service_status.results }}" | ||||
|       when: item.status is defined | ||||
|        | ||||
|     # 检查容器服务 | ||||
|     - name: Check container services | ||||
|       systemd: | ||||
|         name: "{{ item }}" | ||||
|       register: container_service_status | ||||
|       loop: "{{ container_services }}" | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Report container service status | ||||
|       debug: | ||||
|         msg: "📦 Container service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}" | ||||
|       loop: "{{ container_service_status.results }}" | ||||
|       when: item.status is defined | ||||
|        | ||||
|     # 检查网络服务 | ||||
|     - name: Check network services | ||||
|       systemd: | ||||
|         name: "{{ item }}" | ||||
|       register: network_service_status | ||||
|       loop: "{{ network_services }}" | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Report network service status | ||||
|       debug: | ||||
|         msg: "🌐 Network service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}" | ||||
|       loop: "{{ network_service_status.results }}" | ||||
|       when: item.status is defined | ||||
|        | ||||
|     # Check system load. Read-only, so it must not be reported as a change | ||||
|     # (matches the other read-only checks in this playbook). | ||||
|     - name: Check system load | ||||
|       command: uptime | ||||
|       register: system_load | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: Display system load | ||||
|       debug: | ||||
|         msg: "📊 System Load: {{ system_load.stdout }}" | ||||
|          | ||||
|     # Disk space warning: list filesystems whose usage exceeds 80%. | ||||
|     # `$5 + 0` forces a numeric comparison ("85%" -> 85). A bare `$5 > 80` | ||||
|     # compares strings, which matches the header row and "9%" but misses "100%", | ||||
|     # so the warning below fired on every host. NR > 1 skips the header, and | ||||
|     # -P keeps each filesystem on one line so the column index stays stable. | ||||
|     - name: Check disk space usage | ||||
|       shell: df -hP | awk 'NR > 1 && $5 + 0 > 80 {print $0}' | ||||
|       register: disk_warning | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: Warn about high disk usage | ||||
|       debug: | ||||
|         msg: "⚠️  High disk usage detected: {{ disk_warning.stdout_lines }}" | ||||
|       when: disk_warning.stdout_lines | length > 0 | ||||
|        | ||||
|     # Check memory usage: used/total from the second line of `free`, as a | ||||
|     # percentage. Read-only, so it is never reported as a change. | ||||
|     - name: Check memory usage percentage | ||||
|       shell: free | awk 'NR==2{printf "%.2f%%", $3*100/$2}' | ||||
|       register: memory_percent | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: Display memory usage | ||||
|       debug: | ||||
|         msg: "🧠 Memory Usage: {{ memory_percent.stdout }}" | ||||
|          | ||||
|     # 检查最近的系统错误 | ||||
|     - name: Check recent system errors | ||||
|       shell: journalctl --since "1 hour ago" --priority=err --no-pager | tail -10 | ||||
|       register: recent_errors | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: Display recent errors | ||||
|       debug: | ||||
|         msg: "🚨 Recent system errors: {{ recent_errors.stdout_lines }}" | ||||
|       when: recent_errors.stdout_lines | length > 0 | ||||
|  | @ -0,0 +1,120 @@ | |||
| --- | ||||
| - name: 移除 Docker 并安装 Podman - 新 Server 节点 | ||||
|   hosts: ash2e,ash1d,ch2 | ||||
|   become: yes | ||||
|   gather_facts: no | ||||
|   serial: 1  # 逐个节点处理,避免并发冲突 | ||||
|    | ||||
|   tasks: | ||||
|     - name: 显示当前处理的节点 | ||||
|       debug: | ||||
|         msg: "🔧 正在处理节点: {{ inventory_hostname }}" | ||||
| 
 | ||||
|     # Probe the docker unit without a shell. stdout is compared against "active" | ||||
|     # by the next task; a non-zero exit code (unit not active) is deliberately | ||||
|     # ignored instead of being masked with `|| echo`, which duplicated the output. | ||||
|     - name: 检查 Docker 服务状态 | ||||
|       command: systemctl is-active docker | ||||
|       register: docker_status | ||||
|       changed_when: false | ||||
|       failed_when: false | ||||
| 
 | ||||
|     - name: 停止 Docker 服务 | ||||
|       systemd: | ||||
|         name: docker | ||||
|         state: stopped | ||||
|         enabled: no | ||||
|       ignore_errors: yes | ||||
|       when: docker_status.stdout == "active" | ||||
| 
 | ||||
|     - name: 停止 Docker socket | ||||
|       systemd: | ||||
|         name: docker.socket | ||||
|         state: stopped | ||||
|         enabled: no | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: 移除 Docker 相关包 | ||||
|       apt: | ||||
|         name: | ||||
|           - docker-ce | ||||
|           - docker-ce-cli | ||||
|           - containerd.io | ||||
|           - docker-buildx-plugin | ||||
|           - docker-compose-plugin | ||||
|           - docker.io | ||||
|           - docker-doc | ||||
|           - docker-compose | ||||
|           - docker-registry | ||||
|           - containerd | ||||
|           - runc | ||||
|         state: absent | ||||
|         purge: yes | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: 清理 Docker 数据目录 | ||||
|       file: | ||||
|         path: "{{ item }}" | ||||
|         state: absent | ||||
|       loop: | ||||
|         - /var/lib/docker | ||||
|         - /var/lib/containerd | ||||
|         - /etc/docker | ||||
|         - /etc/containerd | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: 清理 Docker 用户组 | ||||
|       group: | ||||
|         name: docker | ||||
|         state: absent | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: 更新包缓存 | ||||
|       apt: | ||||
|         update_cache: yes | ||||
|         cache_valid_time: 3600 | ||||
| 
 | ||||
|     - name: 安装 Podman 及相关工具 | ||||
|       apt: | ||||
|         name: | ||||
|           - podman | ||||
|           - buildah | ||||
|           - skopeo | ||||
|           - podman-compose | ||||
|         state: present | ||||
|       retries: 3 | ||||
|       delay: 10 | ||||
| 
 | ||||
|     - name: 启用 Podman socket 服务 | ||||
|       systemd: | ||||
|         name: podman.socket | ||||
|         enabled: yes | ||||
|         state: started | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: 创建 Podman 用户服务目录 | ||||
|       file: | ||||
|         path: /etc/systemd/user | ||||
|         state: directory | ||||
|         mode: '0755' | ||||
| 
 | ||||
|     # Verification only: both probes are read-only, so neither is reported as a | ||||
|     # change. A missing podman binary still fails the play; podman-compose is | ||||
|     # optional and falls back to a placeholder string. | ||||
|     - name: 验证 Podman 安装 | ||||
|       command: podman --version | ||||
|       register: podman_version | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: 验证 Podman Compose 安装 | ||||
|       shell: podman-compose --version 2>/dev/null || echo "未安装" | ||||
|       register: podman_compose_version | ||||
|       changed_when: false | ||||
|        | ||||
|     # Final Docker state for the per-node summary. Emits exactly one line: | ||||
|     # the original `is-active || echo` printed both systemctl's own state and | ||||
|     # the fallback text, giving a two-line value in the report. Read-only. | ||||
|     - name: 检查 Docker 清理状态 | ||||
|       shell: | | ||||
|         if systemctl is-active --quiet docker 2>/dev/null; then | ||||
|           echo "active" | ||||
|         else | ||||
|           echo "已移除" | ||||
|         fi | ||||
|       register: final_docker_status | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: 显示节点处理结果 | ||||
|       debug: | ||||
|         msg: | | ||||
|           ✅ 节点 {{ inventory_hostname }} 处理完成 | ||||
|           🐳 Docker 状态: {{ final_docker_status.stdout }} | ||||
|           📦 Podman 版本: {{ podman_version.stdout }} | ||||
|           🔧 Compose 状态: {{ podman_compose_version.stdout }} | ||||
| 
 | ||||
|     - name: 清理 apt 缓存 | ||||
|       apt: | ||||
|         autoclean: yes | ||||
|         autoremove: yes | ||||
|  | @ -0,0 +1,39 @@ | |||
| --- | ||||
| - name: Restart Tailscale to fix DNS issues | ||||
|   hosts: hcp1,hcp2 | ||||
|   become: yes | ||||
|    | ||||
|   tasks: | ||||
|     # Snapshot resolv.conf before the restart. Read-only, so no shell is needed | ||||
|     # and the task is never reported as a change. | ||||
|     - name: Check current DNS configuration | ||||
|       command: cat /etc/resolv.conf | ||||
|       register: dns_before | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: Display current DNS config | ||||
|       debug: | ||||
|         msg: "Current DNS config: {{ dns_before.stdout_lines }}" | ||||
|      | ||||
|     - name: Restart tailscaled service | ||||
|       systemd: | ||||
|         name: tailscaled | ||||
|         state: restarted | ||||
|          | ||||
|     - name: Wait for tailscale to stabilize | ||||
|       wait_for: | ||||
|         timeout: 10 | ||||
|          | ||||
|     # Snapshot resolv.conf after the restart, for comparison with the earlier | ||||
|     # output. Read-only, so it is never reported as a change. | ||||
|     - name: Check DNS configuration after restart | ||||
|       command: cat /etc/resolv.conf | ||||
|       register: dns_after | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: Display new DNS config | ||||
|       debug: | ||||
|         msg: "New DNS config: {{ dns_after.stdout_lines }}" | ||||
|          | ||||
|     # Resolve a known hostname to confirm DNS works after the restart. | ||||
|     # A failed lookup is shown but ignored so the result can still be displayed. | ||||
|     - name: Test DNS resolution | ||||
|       command: nslookup apt.releases.hashicorp.com | ||||
|       register: dns_test | ||||
|       changed_when: false | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Display DNS test result | ||||
|       debug: | ||||
|         msg: "DNS test result: {{ dns_test.stdout_lines }}" | ||||
|  | @ -1,152 +0,0 @@ | |||
| --- | ||||
| - name: SSL Certificate Management and Monitoring | ||||
|   hosts: all | ||||
|   gather_facts: yes | ||||
|    | ||||
|   vars: | ||||
|     # 常见证书路径 | ||||
|     cert_paths: | ||||
|       - /etc/ssl/certs | ||||
|       - /etc/letsencrypt/live | ||||
|       - /etc/nginx/ssl | ||||
|       - /etc/apache2/ssl | ||||
|       - /usr/local/share/ca-certificates | ||||
|      | ||||
|     # 需要检查的服务端口 | ||||
|     ssl_services: | ||||
|       - { name: "HTTPS", port: 443 } | ||||
|       - { name: "SMTPS", port: 465 } | ||||
|       - { name: "IMAPS", port: 993 } | ||||
|       - { name: "LDAPS", port: 636 } | ||||
|    | ||||
|   tasks: | ||||
|     # 检查证书目录 | ||||
|     - name: Check certificate directories | ||||
|       stat: | ||||
|         path: "{{ item }}" | ||||
|       register: cert_dirs | ||||
|       loop: "{{ cert_paths }}" | ||||
|        | ||||
|     - name: List existing certificate directories | ||||
|       debug: | ||||
|         msg: "📁 Certificate directory {{ item.item }}: {{ 'EXISTS' if item.stat.exists else 'NOT FOUND' }}" | ||||
|       loop: "{{ cert_dirs.results }}" | ||||
|        | ||||
|     # 查找证书文件 | ||||
|     - name: Find certificate files | ||||
|       find: | ||||
|         paths: "{{ cert_paths }}" | ||||
|         patterns: "*.crt,*.pem,*.cert" | ||||
|         recurse: yes | ||||
|       register: cert_files | ||||
|        | ||||
|     - name: Display found certificates | ||||
|       debug: | ||||
|         msg: "🔐 Found {{ cert_files.files | length }} certificate files" | ||||
|          | ||||
|     # 检查证书过期时间 | ||||
|     - name: Check certificate expiration | ||||
|       shell: | | ||||
|         if [ -f "{{ item.path }}" ]; then | ||||
|           openssl x509 -in "{{ item.path }}" -noout -enddate 2>/dev/null | cut -d= -f2 | ||||
|         fi | ||||
|       register: cert_expiry | ||||
|       loop: "{{ cert_files.files[:10] }}"  # 限制检查前10个证书 | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Display certificate expiration dates | ||||
|       debug: | ||||
|         msg: "📅 {{ item.item.path | basename }}: expires {{ item.stdout if item.stdout else 'INVALID/UNREADABLE' }}" | ||||
|       loop: "{{ cert_expiry.results }}" | ||||
|       when: item.stdout != "" | ||||
|        | ||||
|     # 检查即将过期的证书 (30天内) | ||||
|     - name: Check certificates expiring soon | ||||
|       shell: | | ||||
|         if [ -f "{{ item.path }}" ]; then | ||||
|           exp_date=$(openssl x509 -in "{{ item.path }}" -noout -enddate 2>/dev/null | cut -d= -f2) | ||||
|           if [ ! -z "$exp_date" ]; then | ||||
|             exp_epoch=$(date -d "$exp_date" +%s 2>/dev/null) | ||||
|             now_epoch=$(date +%s) | ||||
|             days_left=$(( (exp_epoch - now_epoch) / 86400 )) | ||||
|             if [ $days_left -lt 30 ]; then | ||||
|               echo "WARNING: $days_left days left" | ||||
|             else | ||||
|               echo "OK: $days_left days left" | ||||
|             fi | ||||
|           fi | ||||
|         fi | ||||
|       register: cert_warnings | ||||
|       loop: "{{ cert_files.files[:10] }}" | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Display certificate warnings | ||||
|       debug: | ||||
|         msg: "⚠️  {{ item.item.path | basename }}: {{ item.stdout }}" | ||||
|       loop: "{{ cert_warnings.results }}" | ||||
|       when: item.stdout != "" and "WARNING" in item.stdout | ||||
|        | ||||
|     # 检查 Let's Encrypt 证书 | ||||
|     - name: Check Let's Encrypt certificates | ||||
|       shell: certbot certificates 2>/dev/null || echo "Certbot not installed" | ||||
|       register: letsencrypt_certs | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Display Let's Encrypt status | ||||
|       debug: | ||||
|         msg: "🔒 Let's Encrypt: {{ letsencrypt_certs.stdout_lines }}" | ||||
|       when: "'not installed' not in letsencrypt_certs.stdout" | ||||
|        | ||||
|     # Check SSL service ports. | ||||
|     # ignore_errors (instead of failed_when: false) lets the play continue while | ||||
|     # keeping each item's real `failed` flag, which the tasks below depend on; | ||||
|     # failed_when: false would reset it and report every port as LISTENING. | ||||
|     - name: Check SSL service ports | ||||
|       wait_for: | ||||
|         port: "{{ item.port }}" | ||||
|         timeout: 3 | ||||
|       register: ssl_ports | ||||
|       loop: "{{ ssl_services }}" | ||||
|       ignore_errors: yes | ||||
|        | ||||
|     - name: Display SSL service status | ||||
|       debug: | ||||
|         msg: "🔌 {{ item.item.name }} (port {{ item.item.port }}): {{ 'LISTENING' if not item.failed else 'NOT AVAILABLE' }}" | ||||
|       loop: "{{ ssl_ports.results }}" | ||||
|        | ||||
|     # Test the HTTPS connection; only runs when the first entry of ssl_services | ||||
|     # (port 443) was found listening. | ||||
|     - name: Test HTTPS connection to localhost | ||||
|       uri: | ||||
|         url: "https://{{ ansible_default_ipv4.address }}" | ||||
|         method: GET | ||||
|         validate_certs: no | ||||
|         timeout: 5 | ||||
|       register: https_test | ||||
|       failed_when: false | ||||
|       when: ssl_ports.results[0] is defined and not ssl_ports.results[0].failed | ||||
|        | ||||
|     # A registered variable is always defined, even when its task was skipped, | ||||
|     # so test for "not skipped" instead. A positive status means an HTTP | ||||
|     # response was received. | ||||
|     # NOTE(review): uri is expected to report status -1 on connection errors; confirm. | ||||
|     - name: Display HTTPS test result | ||||
|       debug: | ||||
|         msg: "🌐 HTTPS Test: {{ 'SUCCESS' if (https_test.status | default(-1) | int) > 0 else 'FAILED' }}" | ||||
|       when: https_test is not skipped | ||||
|        | ||||
|     # Check the certificate presented on port 443 (subject and issuer). | ||||
|     - name: Check certificate chain for HTTPS | ||||
|       shell: | | ||||
|         echo | openssl s_client -connect {{ ansible_default_ipv4.address }}:443 -servername {{ ansible_hostname }} 2>/dev/null | openssl x509 -noout -subject -issuer | ||||
|       register: cert_chain | ||||
|       changed_when: false | ||||
|       failed_when: false | ||||
|       when: ssl_ports.results[0] is defined and not ssl_ports.results[0].failed | ||||
|        | ||||
|     # `rc` is absent when the task above was skipped, so guard on it explicitly | ||||
|     # rather than on the always-defined registered variable. | ||||
|     - name: Display certificate chain info | ||||
|       debug: | ||||
|         msg: "🔗 Certificate Chain: {{ cert_chain.stdout_lines }}" | ||||
|       when: cert_chain.rc is defined and cert_chain.rc == 0 | ||||
|        | ||||
|     # 生成证书健康报告 | ||||
|     - name: Generate certificate health summary | ||||
|       debug: | ||||
|         msg: | | ||||
|           🔐 Certificate Health Summary for {{ inventory_hostname }}: | ||||
|           📁 Certificate directories found: {{ (cert_dirs.results | selectattr('stat.exists') | list | length) }} | ||||
|           📄 Certificate files found: {{ cert_files.files | length }} | ||||
|           ⚠️  Certificates expiring soon: {{ (cert_warnings.results | selectattr('stdout', 'search', 'WARNING') | list | length) }} | ||||
|           🔒 Let's Encrypt: {{ 'Configured' if 'not installed' not in letsencrypt_certs.stdout else 'Not installed' }} | ||||
|           🌐 SSL Services: {{ (ssl_ports.results | rejectattr('failed') | list | length) }}/{{ ssl_services | length }} available | ||||
|  | @ -1,119 +0,0 @@ | |||
| --- | ||||
| - name: Security Hardening and Backup | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   gather_facts: yes | ||||
|    | ||||
|   tasks: | ||||
|     # SSH 安全配置检查 | ||||
|     - name: Check SSH configuration security | ||||
|       lineinfile: | ||||
|         path: /etc/ssh/sshd_config | ||||
|         regexp: "{{ item.regexp }}" | ||||
|         line: "{{ item.line }}" | ||||
|         backup: yes | ||||
|       loop: | ||||
|         - { regexp: '^#?PermitRootLogin', line: 'PermitRootLogin no' } | ||||
|         - { regexp: '^#?PasswordAuthentication', line: 'PasswordAuthentication no' } | ||||
|         - { regexp: '^#?X11Forwarding', line: 'X11Forwarding no' } | ||||
|         - { regexp: '^#?MaxAuthTries', line: 'MaxAuthTries 3' } | ||||
|       notify: restart ssh | ||||
|       when: ansible_os_family == "Debian" | ||||
|        | ||||
|     # 防火墙状态检查 | ||||
|     - name: Check UFW firewall status | ||||
|       shell: ufw status | ||||
|       register: ufw_status | ||||
|       changed_when: false | ||||
|       failed_when: false | ||||
|       when: ansible_os_family == "Debian" | ||||
|        | ||||
|     - name: Display firewall status | ||||
|       debug: | ||||
|         msg: "🔥 Firewall Status: {{ ufw_status.stdout_lines }}" | ||||
|       when: ansible_os_family == "Debian" and ufw_status.stdout_lines is defined | ||||
|        | ||||
|     # 检查可疑登录 | ||||
|     - name: Check for failed login attempts | ||||
|       shell: grep "Failed password" /var/log/auth.log | tail -10 | ||||
|       register: failed_logins | ||||
|       changed_when: false | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Report suspicious login attempts | ||||
|       debug: | ||||
|         msg: "🚨 Recent failed logins: {{ failed_logins.stdout_lines }}" | ||||
|       when: failed_logins.stdout_lines | length > 0 | ||||
|        | ||||
|     # 检查 root 用户活动 | ||||
|     - name: Check recent root activity | ||||
|       shell: grep "sudo.*root" /var/log/auth.log | tail -5 | ||||
|       register: root_activity | ||||
|       changed_when: false | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Display root activity | ||||
|       debug: | ||||
|         msg: "👑 Recent root activity: {{ root_activity.stdout_lines }}" | ||||
|       when: root_activity.stdout_lines | length > 0 | ||||
|        | ||||
|     # Back up important configuration files into a root-only directory. | ||||
|     - name: Create backup directory | ||||
|       file: | ||||
|         path: /backup/configs | ||||
|         state: directory | ||||
|         mode: '0700' | ||||
|          | ||||
|     # Each file is copied on the managed host itself (remote_src) to | ||||
|     # /backup/configs/<basename>.<epoch>, so every run writes a new copy. | ||||
|     # NOTE(review): ansible_date_time presumably requires gathered facts - confirm | ||||
|     # the play (header not shown here) does not disable fact gathering. | ||||
|     - name: Backup important configuration files | ||||
|       copy: | ||||
|         src: "{{ item }}" | ||||
|         dest: "/backup/configs/{{ item | basename }}.{{ ansible_date_time.epoch }}" | ||||
|         remote_src: yes | ||||
|         backup: yes | ||||
|       loop: | ||||
|         - /etc/ssh/sshd_config | ||||
|         - /etc/hosts | ||||
|         - /etc/fstab | ||||
|         - /etc/crontab | ||||
|       # Best effort: a file that cannot be copied does not fail the play. | ||||
|       failed_when: false | ||||
|        | ||||
|     # System integrity check: list at most 10 world-writable regular files | ||||
|     # under /etc, /usr, /bin and /sbin (find errors are discarded). | ||||
|     - name: Check for world-writable files | ||||
|       shell: find /etc /usr /bin /sbin -type f -perm -002 2>/dev/null | head -10 | ||||
|       register: world_writable | ||||
|       changed_when: false | ||||
|        | ||||
|     - name: Report world-writable files | ||||
|       debug: | ||||
|         msg: "⚠️  World-writable files found: {{ world_writable.stdout_lines }}" | ||||
|       when: world_writable.stdout_lines | length > 0 | ||||
|        | ||||
|     # SUID check: collect every setuid regular file under /usr, /bin and /sbin. | ||||
|     - name: Check for SUID files | ||||
|       shell: find /usr /bin /sbin -type f -perm -4000 2>/dev/null | ||||
|       register: suid_files | ||||
|       changed_when: false | ||||
|        | ||||
|     # Only the count is printed, not the file names. | ||||
|     - name: Display SUID files count | ||||
|       debug: | ||||
|         msg: "🔐 Found {{ suid_files.stdout_lines | length }} SUID files" | ||||
|          | ||||
|     # Time synchronisation: enable NTP, then report the synchronisation state. | ||||
|     - name: Sync system time | ||||
|       shell: timedatectl set-ntp true | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: Check time synchronization | ||||
|       shell: timedatectl status | ||||
|       register: time_status | ||||
|       # Read-only query: never report "changed", and tolerate failure in the | ||||
|       # same way as the set-ntp task above so the play is not aborted. | ||||
|       changed_when: false | ||||
|       failed_when: false | ||||
|        | ||||
|     # Print only the status lines that mention "synchronized". | ||||
|     - name: Display time sync status | ||||
|       debug: | ||||
|         msg: "🕐 Time sync: {{ time_status.stdout_lines | default([]) | select('match', '.*synchronized.*') | list }}" | ||||
|          | ||||
|   handlers: | ||||
|     # Restarts the "ssh" unit when a task notifies "restart ssh"; Debian family only. | ||||
|     - name: restart ssh | ||||
|       systemd: | ||||
|         name: ssh | ||||
|         state: restarted | ||||
|       when: ansible_os_family == "Debian" | ||||
|  | @ -0,0 +1,187 @@ | |||
| --- | ||||
| - name: 部署 Telegraf 硬盘监控到 Nomad 集群 | ||||
|   hosts: all | ||||
|   become: yes | ||||
|   vars: | ||||
|     # Connection to the existing InfluxDB 2.x + Grafana monitoring stack. | ||||
|     # A play var must not reference itself ("x: {{ x | default(...) }}"): play | ||||
|     # vars outrank inventory vars, so the lookup resolves to the play var again | ||||
|     # and Ansible aborts with "recursive loop detected". Inventory / extra-var | ||||
|     # values are therefore read through hostvars, which holds no play vars. | ||||
|     # influxdb_token has no default: it must be supplied by the inventory | ||||
|     # (group_vars) or with -e. | ||||
|     influxdb_url: "{{ hostvars[inventory_hostname].influxdb_url | default('http://influxdb1.tailnet-68f9.ts.net:8086') }}" | ||||
|     influxdb_org: "{{ hostvars[inventory_hostname].influxdb_org | default('nomad') }}" | ||||
|     influxdb_bucket: "{{ hostvars[inventory_hostname].influxdb_bucket | default('nomad_monitoring') }}" | ||||
|      | ||||
|     # Remote Telegraf configuration mode (preferred); an empty URL selects | ||||
|     # the locally rendered telegraf.conf instead. | ||||
|     use_remote_config: "{{ hostvars[inventory_hostname].use_remote_config | default(true) }}" | ||||
|     telegraf_config_url: "{{ hostvars[inventory_hostname].telegraf_config_url | default('') }}" | ||||
|      | ||||
|     # Disk usage thresholds (percent) | ||||
|     disk_usage_warning: 80  # warning at 80% usage | ||||
|     disk_usage_critical: 90 # critical at 90% usage | ||||
|      | ||||
|     # Collection interval (seconds) | ||||
|     collection_interval: 30 | ||||
|      | ||||
|   tasks: | ||||
|     # Announce which inventory host is being processed. | ||||
|     - name: 显示正在处理的节点 | ||||
|       debug: | ||||
|         msg: "🔧 正在为节点 {{ inventory_hostname }} 安装硬盘监控" | ||||
| 
 | ||||
|     # Trust the InfluxData package signing key; retried up to 3 times, 5 s apart. | ||||
|     # NOTE(review): apt_key relies on the deprecated apt-key tool - confirm it | ||||
|     # is still available on the target releases. | ||||
|     - name: 添加 InfluxData 仓库密钥 | ||||
|       apt_key: | ||||
|         url: https://repos.influxdata.com/influxdata-archive_compat.key | ||||
|         state: present | ||||
|       retries: 3 | ||||
|       delay: 5 | ||||
| 
 | ||||
|     # Register the InfluxData APT repository. The path segment follows the | ||||
|     # host's distribution (ubuntu / debian) instead of being hard-coded to | ||||
|     # "ubuntu", so a Debian codename is not looked up in the Ubuntu tree. | ||||
|     - name: 添加 InfluxData 仓库 | ||||
|       apt_repository: | ||||
|         repo: "deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable" | ||||
|         state: present | ||||
|         update_cache: yes | ||||
|       retries: 3 | ||||
|       delay: 5 | ||||
| 
 | ||||
|     # Install the telegraf package; retried up to 3 times, 10 s apart. | ||||
|     - name: 安装 Telegraf | ||||
|       apt: | ||||
|         name: telegraf | ||||
|         state: present | ||||
|         update_cache: yes | ||||
|       retries: 3 | ||||
|       delay: 10 | ||||
| 
 | ||||
|     # Drop-in directory that receives the plugin configuration files rendered below. | ||||
|     - name: 创建 Telegraf 配置目录 | ||||
|       file: | ||||
|         path: /etc/telegraf/telegraf.d | ||||
|         state: directory | ||||
|         owner: telegraf | ||||
|         group: telegraf | ||||
|         mode: '0755' | ||||
| 
 | ||||
|     # Remove any existing Telegraf log directory / log file to free disk space. | ||||
|     # "state: absent" is idempotent, so a single pass over both paths suffices; | ||||
|     # removal alone does not stop Telegraf from recreating them - that depends | ||||
|     # on the logging settings in its configuration. | ||||
|     - name: 清理旧的 Telegraf 日志文件(节省硬盘空间) | ||||
|       file: | ||||
|         path: "{{ item }}" | ||||
|         state: absent | ||||
|       loop: | ||||
|         - /var/log/telegraf | ||||
|         - /var/log/telegraf.log | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     # Environment file with the InfluxDB credentials; root-only (0600). | ||||
|     - name: 创建 Telegraf 环境变量文件 | ||||
|       template: | ||||
|         src: telegraf-env.j2 | ||||
|         dest: /etc/default/telegraf | ||||
|         owner: root | ||||
|         group: root | ||||
|         mode: '0600' | ||||
|         backup: yes | ||||
|       notify: restart telegraf | ||||
| 
 | ||||
|     # Remote-config mode: install a custom unit file. Only runs when | ||||
|     # telegraf_config_url is set to a non-empty value. | ||||
|     - name: 创建 Telegraf systemd 服务文件(支持远程配置) | ||||
|       template: | ||||
|         src: telegraf.service.j2 | ||||
|         dest: /etc/systemd/system/telegraf.service | ||||
|         owner: root | ||||
|         group: root | ||||
|         mode: '0644' | ||||
|         backup: yes | ||||
|       notify: | ||||
|         - reload systemd | ||||
|         - restart telegraf | ||||
|       when: telegraf_config_url is defined and telegraf_config_url != '' | ||||
| 
 | ||||
|     # Local-config mode: render the main telegraf.conf. Only runs when no | ||||
|     # remote config URL is set (the exact opposite of the task above). | ||||
|     - name: 生成 Telegraf 主配置文件(本地配置模式) | ||||
|       template: | ||||
|         src: telegraf.conf.j2 | ||||
|         dest: /etc/telegraf/telegraf.conf | ||||
|         owner: telegraf | ||||
|         group: telegraf | ||||
|         mode: '0644' | ||||
|         backup: yes | ||||
|       notify: restart telegraf | ||||
|       when: telegraf_config_url is not defined or telegraf_config_url == '' | ||||
| 
 | ||||
|     # Disk monitoring plugin configuration (rendered in both modes). | ||||
|     - name: 生成硬盘监控配置 | ||||
|       template: | ||||
|         src: disk-monitoring.conf.j2 | ||||
|         dest: /etc/telegraf/telegraf.d/disk-monitoring.conf | ||||
|         owner: telegraf | ||||
|         group: telegraf | ||||
|         mode: '0644' | ||||
|         backup: yes | ||||
|       notify: restart telegraf | ||||
| 
 | ||||
|     # System monitoring plugin configuration (rendered in both modes). | ||||
|     - name: 生成系统监控配置 | ||||
|       template: | ||||
|         src: system-monitoring.conf.j2 | ||||
|         dest: /etc/telegraf/telegraf.d/system-monitoring.conf | ||||
|         owner: telegraf | ||||
|         group: telegraf | ||||
|         mode: '0644' | ||||
|         backup: yes | ||||
|       notify: restart telegraf | ||||
| 
 | ||||
|     # Enable the unit at boot and make sure it is running; systemd is | ||||
|     # reloaded first so a freshly written unit file is picked up. | ||||
|     - name: 启用并启动 Telegraf 服务 | ||||
|       systemd: | ||||
|         name: telegraf | ||||
|         state: started | ||||
|         enabled: yes | ||||
|         daemon_reload: yes | ||||
| 
 | ||||
|     # No state is requested here: the task only registers the unit's current | ||||
|     # properties, which a later task reads via telegraf_status.status. | ||||
|     - name: 验证 Telegraf 状态 | ||||
|       systemd: | ||||
|         name: telegraf | ||||
|       register: telegraf_status | ||||
| 
 | ||||
|     # Probe InfluxDB's /ping endpoint once, from the control node. | ||||
|     # /ping answers 204 No Content when healthy, while the uri module accepts | ||||
|     # only 200 unless status_code says otherwise - without it a healthy server | ||||
|     # would be reported as a failed task. | ||||
|     - name: 检查 InfluxDB 连接 | ||||
|       uri: | ||||
|         url: "{{ influxdb_url }}/ping" | ||||
|         method: GET | ||||
|         timeout: 5 | ||||
|         status_code: [200, 204] | ||||
|       register: influxdb_ping | ||||
|       # Purely informational: an unreachable server must not abort the play. | ||||
|       ignore_errors: yes | ||||
|       delegate_to: localhost | ||||
|       run_once: true | ||||
| 
 | ||||
|     # default(-1) keeps the message working even if no status was recorded. | ||||
|     - name: 显示 InfluxDB 连接状态 | ||||
|       debug: | ||||
|         msg: "{{ '✅ InfluxDB 连接正常' if (influxdb_ping.status | default(-1)) in [200, 204] else '❌ InfluxDB 连接失败,请检查配置' }}" | ||||
|       run_once: true | ||||
| 
 | ||||
|     # Print the ActiveState registered by the status task earlier in this play. | ||||
|     - name: 显示 Telegraf 状态 | ||||
|       debug: | ||||
|         msg: "✅ Telegraf 状态: {{ telegraf_status.status.ActiveState }}" | ||||
| 
 | ||||
|     # One-off disk usage report per filesystem, flagged against disk_usage_warning. | ||||
|     # -P keeps every filesystem on a single line (long device names would | ||||
|     # otherwise wrap and shift the columns); read -r splits the three fields | ||||
|     # directly; rows whose usage is not a plain number are skipped so the | ||||
|     # numeric test never sees a non-integer. | ||||
|     - name: 检查硬盘使用情况 | ||||
|       shell: | | ||||
|         df -hP | grep -vE '^Filesystem|tmpfs|cdrom|udev' | awk '{print $5 " " $1 " " $6}' | while read -r usage partition mount; | ||||
|         do | ||||
|           usage=$(printf '%s' "$usage" | tr -d '%') | ||||
|           case "$usage" in | ||||
|             ''|*[!0-9]*) continue ;; | ||||
|           esac | ||||
|           if [ "$usage" -ge {{ disk_usage_warning }} ]; then | ||||
|             echo "⚠️  警告: $mount ($partition) 使用率 $usage%" | ||||
|           else | ||||
|             echo "✅ $mount ($partition) 使用率 $usage%" | ||||
|           fi | ||||
|         done | ||||
|       register: disk_check | ||||
|       changed_when: false | ||||
| 
 | ||||
|     # Print the per-filesystem lines produced by the disk check task. | ||||
|     - name: 显示硬盘检查结果 | ||||
|       debug: | ||||
|         msg: "{{ disk_check.stdout_lines }}" | ||||
| 
 | ||||
|   handlers: | ||||
|     # Re-read unit files; notified when the custom telegraf.service is written. | ||||
|     - name: reload systemd | ||||
|       systemd: | ||||
|         daemon_reload: yes | ||||
| 
 | ||||
|     # Restart Telegraf; notified whenever one of its config files changes. | ||||
|     - name: restart telegraf | ||||
|       systemd: | ||||
|         name: telegraf | ||||
|         state: restarted | ||||
|  | @ -0,0 +1,76 @@ | |||
| --- | ||||
| # Installs the software prerequisites for new Nomad server nodes on the | ||||
| # hosts ash2e, ash1d and ch2. Fact gathering is disabled for this play. | ||||
| - name: 安装并配置新的 Nomad Server 节点 | ||||
|   hosts: ash2e,ash1d,ch2 | ||||
|   become: yes | ||||
|   gather_facts: no | ||||
|    | ||||
|   tasks: | ||||
|     # Refresh the APT cache unless it is younger than one hour. | ||||
|     - name: 更新包缓存 | ||||
|       apt: | ||||
|         update_cache: yes | ||||
|         cache_valid_time: 3600 | ||||
|       retries: 3 | ||||
|       delay: 10 | ||||
| 
 | ||||
|     # Download tools plus the Podman container tooling. | ||||
|     - name: 安装依赖包 | ||||
|       apt: | ||||
|         name: | ||||
|           - wget | ||||
|           - curl | ||||
|           - unzip | ||||
|           - podman | ||||
|           - buildah | ||||
|           - skopeo | ||||
|         state: present | ||||
|       retries: 3 | ||||
|       delay: 10 | ||||
| 
 | ||||
|     # Detect an existing nomad binary; stdout is the literal "not_found" | ||||
|     # when `which` finds nothing. | ||||
|     - name: 检查 Nomad 是否已安装 | ||||
|       shell: which nomad || echo "not_found" | ||||
|       register: nomad_check | ||||
|       changed_when: false | ||||
| 
 | ||||
|     # Runs only when no nomad binary was found: download the 1.10.5 release | ||||
|     # archive, unpack it into /usr/bin and delete the archive afterwards. | ||||
|     # NOTE(review): the download is not checksum-verified, and everything in | ||||
|     # the archive is extracted into /usr/bin - confirm that is acceptable. | ||||
|     - name: 下载并安装 Nomad | ||||
|       block: | ||||
|         - name: 下载 Nomad 1.10.5 | ||||
|           get_url: | ||||
|             url: "https://releases.hashicorp.com/nomad/1.10.5/nomad_1.10.5_linux_amd64.zip" | ||||
|             dest: "/tmp/nomad.zip" | ||||
|             mode: '0644' | ||||
| 
 | ||||
|         - name: 解压 Nomad | ||||
|           unarchive: | ||||
|             src: "/tmp/nomad.zip" | ||||
|             dest: "/usr/bin/" | ||||
|             remote_src: yes | ||||
|             owner: root | ||||
|             group: root | ||||
|             mode: '0755' | ||||
| 
 | ||||
|         - name: 清理临时文件 | ||||
|           file: | ||||
|             path: "/tmp/nomad.zip" | ||||
|             state: absent | ||||
|       when: nomad_check.stdout == "not_found" | ||||
| 
 | ||||
|     # Verify the install by asking the binaries for their versions. Both | ||||
|     # queries are read-only, so they never report "changed". | ||||
|     - name: 验证 Nomad 安装 | ||||
|       shell: nomad version | ||||
|       register: nomad_version_output | ||||
|       changed_when: false | ||||
|        | ||||
|     # This play gathers no facts and never runs package_facts, so | ||||
|     # ansible_facts.packages is never populated; query Podman directly. | ||||
|     # A missing podman binary is tolerated and reported as "unknown". | ||||
|     - name: 获取 Podman 版本 | ||||
|       command: podman --version | ||||
|       register: podman_version_output | ||||
|       changed_when: false | ||||
|       failed_when: false | ||||
|        | ||||
|     - name: 显示安装结果 | ||||
|       debug: | ||||
|         msg: | | ||||
|           ✅ 节点 {{ inventory_hostname }} 软件安装完成 | ||||
|           📦 Podman: {{ (podman_version_output.stdout | default('') | trim) or 'unknown' }} | ||||
|           🎯 Nomad: {{ nomad_version_output.stdout.split('\n')[0] }} | ||||
| 
 | ||||
|     # Enable and start the Podman API socket; failure is tolerated. | ||||
|     - name: 启用 Podman socket | ||||
|       systemd: | ||||
|         name: podman.socket | ||||
|         enabled: yes | ||||
|         state: started | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     # Informational message only; this play performs no further configuration. | ||||
|     - name: 继续完整配置 | ||||
|       debug: | ||||
|         msg: "软件安装完成,现在将运行完整的 Nomad 配置..." | ||||
|  | @ -0,0 +1,68 @@ | |||
| # Disk monitoring configuration | ||||
| # Disk usage, disk I/O, selected processes and log file sizes | ||||
| 
 | ||||
| # Disk usage | ||||
| [[inputs.disk]] | ||||
|   ## Filesystem types to ignore | ||||
|   ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] | ||||
|    | ||||
|   ## Mount points to monitor (a fixed list, not every mount) | ||||
|   mount_points = ["/", "/var", "/tmp", "/opt", "/home"] | ||||
|    | ||||
|   ## Tags | ||||
|   [inputs.disk.tags] | ||||
|     service = "disk-monitoring" | ||||
|      | ||||
| # Disk I/O | ||||
| [[inputs.diskio]] | ||||
|   ## Block devices to monitor (a fixed list, not every device) | ||||
|   devices = ["sda", "sdb", "sdc", "sdd", "nvme0n1", "nvme1n1"] | ||||
|    | ||||
|   ## Skip serial number collection for better performance | ||||
|   skip_serial_number = true | ||||
|    | ||||
|   [inputs.diskio.tags] | ||||
|     service = "disk-io-monitoring" | ||||
| 
 | ||||
| # Filesystem inode monitoring | ||||
| [[inputs.disk]] | ||||
|   ## Second disk instance, intended for inode usage; same ignore list as above | ||||
|   ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] | ||||
|    | ||||
|   ## No mount_points list is set for this instance | ||||
|   [inputs.disk.tags] | ||||
|     service = "inode-monitoring" | ||||
| 
 | ||||
| # Process monitoring (optional; processes that may use a lot of disk) | ||||
| [[inputs.procstat]] | ||||
|   ## Docker processes (if present) | ||||
|   pattern = "docker" | ||||
|    | ||||
|   [inputs.procstat.tags] | ||||
|     service = "docker-process" | ||||
| 
 | ||||
| [[inputs.procstat]] | ||||
|   ## Podman processes | ||||
|   pattern = "podman" | ||||
|    | ||||
|   [inputs.procstat.tags] | ||||
|     service = "podman-process" | ||||
| 
 | ||||
| [[inputs.procstat]] | ||||
|   ## Nomad processes | ||||
|   pattern = "nomad" | ||||
|    | ||||
|   [inputs.procstat.tags] | ||||
|     service = "nomad-process" | ||||
| 
 | ||||
| # Log file size monitoring | ||||
| [[inputs.filestat]] | ||||
|   files = [ | ||||
|     "/var/log/nomad/*.log", | ||||
|     "/var/log/syslog", | ||||
|     "/var/log/kern.log", | ||||
|     "/var/log/auth.log" | ||||
|   ] | ||||
|    | ||||
|   [inputs.filestat.tags] | ||||
|     service = "log-monitoring" | ||||
|  | @ -0,0 +1,68 @@ | |||
| # System monitoring configuration | ||||
| # CPU, memory, network and other system resources | ||||
| 
 | ||||
| # CPU | ||||
| [[inputs.cpu]] | ||||
|   ## Collect per-core metrics | ||||
|   percpu = true | ||||
|   ## Collect the aggregate (total) CPU metrics | ||||
|   totalcpu = true | ||||
|   ## Do not collect raw CPU time fields | ||||
|   collect_cpu_time = false | ||||
|   ## Do not report the "active" CPU figure | ||||
|   report_active = false | ||||
|    | ||||
|   [inputs.cpu.tags] | ||||
|     service = "cpu-monitoring" | ||||
| 
 | ||||
| # Memory | ||||
| [[inputs.mem]] | ||||
|   [inputs.mem.tags] | ||||
|     service = "memory-monitoring" | ||||
| 
 | ||||
| # Network interfaces | ||||
| [[inputs.net]] | ||||
|   ## Interface name patterns to collect | ||||
|   interfaces = ["eth*", "en*", "tailscale*"] | ||||
|    | ||||
|   [inputs.net.tags] | ||||
|     service = "network-monitoring" | ||||
| 
 | ||||
| # System load | ||||
| [[inputs.system]] | ||||
|   [inputs.system.tags] | ||||
|     service = "system-load" | ||||
| 
 | ||||
| # Kernel statistics | ||||
| [[inputs.kernel]] | ||||
|   [inputs.kernel.tags] | ||||
|     service = "kernel-stats" | ||||
| 
 | ||||
| # Network statistics | ||||
| [[inputs.netstat]] | ||||
|   [inputs.netstat.tags] | ||||
|     service = "network-stats" | ||||
| 
 | ||||
| # Swap | ||||
| [[inputs.swap]] | ||||
|   [inputs.swap.tags] | ||||
|     service = "swap-monitoring" | ||||
| 
 | ||||
| # Service state monitoring | ||||
| [[inputs.systemd_units]] | ||||
|   ## Units to monitor. The plugin takes a single space-separated "pattern" | ||||
|   ## string (wildcards allowed); it has no "units" option, and Telegraf | ||||
|   ## rejects configuration fields that a plugin does not use. | ||||
|   pattern = "nomad.service docker.service podman.service telegraf.service tailscaled.service" | ||||
|    | ||||
|   [inputs.systemd_units.tags] | ||||
|     service = "service-monitoring" | ||||
| 
 | ||||
| # Disk health monitoring (where SMART is supported) | ||||
| [[inputs.smart]] | ||||
|   ## Path to the smartctl binary | ||||
|   ## NOTE(review): smartctl usually needs elevated privileges, while the | ||||
|   ## service runs as the telegraf user - confirm this input can read the disks. | ||||
|   path_smartctl = "/usr/sbin/smartctl" | ||||
|    | ||||
|   ## Timeout | ||||
|   timeout = "30s" | ||||
|    | ||||
|   [inputs.smart.tags] | ||||
|     service = "smart-monitoring" | ||||
|  | @ -0,0 +1,7 @@ | |||
| # Telegraf environment variables | ||||
| # InfluxDB 2.x credentials and target, rendered from the Ansible variables | ||||
| # of the same names. The playbook installs this file as /etc/default/telegraf. | ||||
| 
 | ||||
| INFLUX_TOKEN={{ influxdb_token }} | ||||
| INFLUX_ORG={{ influxdb_org }} | ||||
| INFLUX_BUCKET={{ influxdb_bucket }} | ||||
| INFLUX_URL={{ influxdb_url }} | ||||
|  | @ -0,0 +1,53 @@ | |||
| # Telegraf main configuration file | ||||
| # Disk monitoring for the Nomad cluster | ||||
| 
 | ||||
| # Global tags added to every metric | ||||
| [global_tags] | ||||
|   nomad_cluster = "production" | ||||
|   node_role = "{{ nomad_role | default('unknown') }}" | ||||
|   hostname = "{{ inventory_hostname }}" | ||||
| 
 | ||||
| # Agent settings | ||||
| [agent] | ||||
|   interval = "{{ collection_interval | default(30) }}s" | ||||
|   round_interval = true | ||||
|   metric_batch_size = 1000 | ||||
|   metric_buffer_limit = 10000 | ||||
|   collection_jitter = "2s" | ||||
|   flush_interval = "10s" | ||||
|   flush_jitter = "0s" | ||||
|   precision = "" | ||||
|   hostname = "{{ inventory_hostname }}" | ||||
|   omit_hostname = false | ||||
|    | ||||
|   ## Logging. Telegraf has no top-level [log] table (an unknown top-level | ||||
|   ## table makes the config fail to load); log options belong to [agent]. | ||||
|   ## quiet = true logs error messages only. An empty logfile sends the log | ||||
|   ## to stderr, which systemd captures in the journal, so no log file is | ||||
|   ## written to disk. | ||||
|   quiet = true | ||||
|   logfile = "" | ||||
| 
 | ||||
| # Output - InfluxDB 2.x | ||||
| [[outputs.influxdb_v2]] | ||||
|   urls = ["{{ influxdb_url }}"] | ||||
|   token = "{{ influxdb_token }}" | ||||
|   organization = "{{ influxdb_org | default('nomad') }}" | ||||
|   bucket = "{{ influxdb_bucket | default('nomad_monitoring') }}" | ||||
|    | ||||
|   ## Connection settings. | ||||
|   ## max_retries, retry_timeout and precision are not options of this | ||||
|   ## output plugin and Telegraf rejects unused fields, so they are not set; | ||||
|   ## failed writes are retried from the metric buffer on the next flush. | ||||
|   timeout = "10s" | ||||
|    | ||||
|   ## TLS settings (if needed) | ||||
|   # tls_ca = "/etc/telegraf/ca.pem" | ||||
|   # tls_cert = "/etc/telegraf/cert.pem" | ||||
|   # tls_key = "/etc/telegraf/key.pem" | ||||
|   # insecure_skip_verify = false | ||||
|  | @ -0,0 +1,29 @@ | |||
| # systemd unit for Telegraf in remote-configuration mode: the configuration | ||||
| # is loaded from the URL rendered into ExecStart. | ||||
| # NOTE(review): no --config-directory is passed, so files under | ||||
| # /etc/telegraf/telegraf.d are not loaded in this mode - confirm this is intended. | ||||
| [Unit] | ||||
| Description=Telegraf - 节点监控服务 | ||||
| Documentation=https://github.com/influxdata/telegraf | ||||
| # The remote configuration is fetched at start-up, so wait until the | ||||
| # network is actually online, not merely until networking has been started. | ||||
| After=network-online.target | ||||
| Wants=network-online.target | ||||
| 
 | ||||
| [Service] | ||||
| Type=notify | ||||
| User=telegraf | ||||
| Group=telegraf | ||||
| ExecStart=/usr/bin/telegraf --config {{ telegraf_config_url }} | ||||
| ExecReload=/bin/kill -HUP $MAINPID | ||||
| KillMode=control-group | ||||
| Restart=on-failure | ||||
| RestartSec=5 | ||||
| TimeoutStopSec=20 | ||||
| EnvironmentFile=/etc/default/telegraf | ||||
| 
 | ||||
| # Sandboxing | ||||
| NoNewPrivileges=true | ||||
| PrivateTmp=true | ||||
| ProtectSystem=strict | ||||
| ProtectHome=true | ||||
| # The leading "-" makes systemd ignore the path when it does not exist, | ||||
| # instead of failing to set up the mount namespace and refusing to start. | ||||
| ReadWritePaths=-/var/lib/telegraf | ||||
| ProtectKernelTunables=true | ||||
| ProtectKernelModules=true | ||||
| ProtectControlGroups=true | ||||
| 
 | ||||
| [Install] | ||||
| WantedBy=multi-user.target | ||||
|  | @ -0,0 +1,169 @@ | |||
| # 磁盘管理工具使用指南 | ||||
| 
 | ||||
| ## 🔧 工具概览 | ||||
| 
 | ||||
| 我们提供了三个主要的磁盘管理工具来解决磁盘空间不足的问题: | ||||
| 
 | ||||
| ### 1. 磁盘分析工具 (`disk-analysis-ncdu.yml`) | ||||
| 使用 `ncdu` 工具深度分析磁盘使用情况,生成详细报告。 | ||||
| 
 | ||||
| ### 2. 磁盘清理工具 (`disk-cleanup.yml`) | ||||
| 自动清理系统垃圾文件、日志、缓存等。 | ||||
| 
 | ||||
| ### 3. 磁盘监控脚本 (`disk-monitor.sh`) | ||||
| 一键监控所有节点的磁盘使用情况。 | ||||
| 
 | ||||
| ## 🚀 快速使用 | ||||
| 
 | ||||
| ### 监控所有节点磁盘使用情况 | ||||
| ```bash | ||||
| # 使用默认阈值 85% | ||||
| ./scripts/utilities/disk-monitor.sh | ||||
| 
 | ||||
| # 使用自定义阈值 90% | ||||
| ./scripts/utilities/disk-monitor.sh 90 | ||||
| ``` | ||||
| 
 | ||||
| ### 分析特定节点磁盘使用 | ||||
| ```bash | ||||
| # 分析所有节点 | ||||
| ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ | ||||
|   configuration/playbooks/disk-analysis-ncdu.yml | ||||
| 
 | ||||
| # 分析特定节点 | ||||
| ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ | ||||
|   configuration/playbooks/disk-analysis-ncdu.yml --limit semaphore | ||||
| ``` | ||||
| 
 | ||||
| ### 清理磁盘空间 | ||||
| ```bash | ||||
| # 清理所有节点 (安全模式) | ||||
| ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ | ||||
|   configuration/playbooks/disk-cleanup.yml | ||||
| 
 | ||||
| # 清理特定节点 | ||||
| ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ | ||||
|   configuration/playbooks/disk-cleanup.yml --limit ash3c | ||||
| 
 | ||||
| # 包含容器清理 (谨慎使用) | ||||
| ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ | ||||
|   configuration/playbooks/disk-cleanup.yml -e cleanup_containers=true | ||||
| ``` | ||||
| 
 | ||||
| ## 📊 分析报告说明 | ||||
| 
 | ||||
| ### ncdu 文件位置 | ||||
| 分析完成后,ncdu 扫描文件保存在各节点的 `/tmp/disk-analysis/` 目录: | ||||
| 
 | ||||
| - `ncdu-root-<hostname>.json` - 根目录扫描结果 | ||||
| - `ncdu-var-<hostname>.json` - /var 目录扫描结果   | ||||
| - `ncdu-opt-<hostname>.json` - /opt 目录扫描结果 | ||||
| 
 | ||||
| ### 查看 ncdu 报告 | ||||
| ```bash | ||||
| # 在目标节点上查看交互式报告 | ||||
| ncdu -f /tmp/disk-analysis/ncdu-root-semaphore.json | ||||
| 
 | ||||
| # 查看文本报告 | ||||
| cat /tmp/disk-analysis/disk-report-semaphore.txt | ||||
| 
 | ||||
| # 查看清理建议 | ||||
| cat /tmp/disk-analysis/cleanup-suggestions-semaphore.txt | ||||
| ``` | ||||
| 
 | ||||
| ## 🧹 清理选项说明 | ||||
| 
 | ||||
| ### 默认清理项目 | ||||
| - ✅ **系统日志**: 清理7天前的日志文件 | ||||
| - ✅ **包缓存**: 清理 APT/YUM 缓存 | ||||
| - ✅ **临时文件**: 清理7天前的临时文件 | ||||
| - ✅ **核心转储**: 删除 core dump 文件 | ||||
| 
 | ||||
| ### 可选清理项目 | ||||
| - ⚠️ **容器清理**: 需要手动启用 (`cleanup_containers=true`) | ||||
|   - 停止所有容器 | ||||
|   - 删除未使用的容器、镜像、卷 | ||||
| 
 | ||||
| ### 自定义清理参数 | ||||
| ```bash | ||||
| ansible-playbook configuration/playbooks/disk-cleanup.yml \ | ||||
|   -e cleanup_logs=false \ | ||||
|   -e cleanup_cache=true \ | ||||
|   -e cleanup_temp=true \ | ||||
|   -e cleanup_containers=false | ||||
| ``` | ||||
| 
 | ||||
| ## 🚨 紧急情况处理 | ||||
| 
 | ||||
| ### 磁盘使用率 > 95% | ||||
| ```bash | ||||
| # 1. 立即检查最大文件 | ||||
| ansible all -i configuration/inventories/production/nomad-cluster.ini \ | ||||
|   -m shell -a "find / -type f -size +1G -exec ls -lh {} \; 2>/dev/null | head -5" | ||||
| 
 | ||||
| # 2. 紧急清理 | ||||
| ansible-playbook configuration/playbooks/disk-cleanup.yml \ | ||||
|   -e cleanup_containers=true | ||||
| 
 | ||||
| # 3. 手动清理大文件 | ||||
| ansible all -m shell -a "truncate -s 0 /var/log/large.log" | ||||
| ``` | ||||
| 
 | ||||
| ### 常见大文件位置 | ||||
| - `/var/log/` - 系统日志 | ||||
| - `/tmp/` - 临时文件 | ||||
| - `/var/cache/` - 包管理器缓存 | ||||
| - `/opt/nomad/data/` - Nomad 数据 | ||||
| - `~/.local/share/containers/` - Podman 数据 | ||||
| 
 | ||||
| ## 📈 定期维护建议 | ||||
| 
 | ||||
| ### 每日监控 | ||||
| ```bash | ||||
| # 添加到 crontab | ||||
| 0 9 * * * /root/mgmt/scripts/utilities/disk-monitor.sh 85 | ||||
| ``` | ||||
| 
 | ||||
| ### 每周清理 | ||||
| ```bash | ||||
| # 每周日凌晨2点自动清理 | ||||
| 0 2 * * 0 cd /root/mgmt && ansible-playbook configuration/playbooks/disk-cleanup.yml | ||||
| ``` | ||||
| 
 | ||||
| ### 每月深度分析 | ||||
| ```bash | ||||
| # 每月1号生成详细报告 | ||||
| 0 3 1 * * cd /root/mgmt && ansible-playbook configuration/playbooks/disk-analysis-ncdu.yml | ||||
| ``` | ||||
| 
 | ||||
| ## 🔍 故障排除 | ||||
| 
 | ||||
| ### ncdu 安装失败 | ||||
| ```bash | ||||
| # 手动安装 | ||||
| ansible all -m package -a "name=ncdu state=present" --become | ||||
| ``` | ||||
| 
 | ||||
| ### 扫描超时 | ||||
| ```bash | ||||
| # 增加超时时间 | ||||
| ansible-playbook disk-analysis-ncdu.yml -e ansible_timeout=600 | ||||
| ``` | ||||
| 
 | ||||
| ### 权限问题 | ||||
| ```bash | ||||
| # 确保使用 sudo | ||||
| ansible-playbook disk-analysis-ncdu.yml --become | ||||
| ``` | ||||
| 
 | ||||
| ## 💡 最佳实践 | ||||
| 
 | ||||
| 1. **定期监控**: 每天检查磁盘使用情况 | ||||
| 2. **预防性清理**: 使用率超过80%时主动清理 | ||||
| 3. **日志轮转**: 配置合适的日志轮转策略 | ||||
| 4. **容器管理**: 定期清理未使用的容器镜像 | ||||
| 5. **监控告警**: 设置磁盘使用率告警阈值 | ||||
| 
 | ||||
| --- | ||||
| 
 | ||||
| 💡 **提示**: 使用 `./scripts/utilities/disk-monitor.sh` 可以快速检查所有节点状态! | ||||
							
								
								
									
										162
									
								
								mgmt.sh
								
								
								
								
							
							
						
						
									
										162
									
								
								mgmt.sh
								
								
								
								
							|  | @ -1,162 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| # Main project management script | ||||
| # Strict mode: exit on errors, on unset variables and on failing pipeline stages. | ||||
| set -euo pipefail | ||||
| 
 | ||||
| # Color definitions (escape sequences used by the log_* helpers below) | ||||
| RED='\033[0;31m' | ||||
| GREEN='\033[0;32m' | ||||
| YELLOW='\033[1;33m' | ||||
| BLUE='\033[0;34m' | ||||
| NC='\033[0m' | ||||
| 
 | ||||
| # Project root: the PARENT of the directory containing this script. | ||||
| # NOTE(review): if this script lives at the repository root, the "/.." makes | ||||
| # PROJECT_ROOT point one level above the repository - confirm its location. | ||||
| PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" | ||||
| 
 | ||||
| # Logging helpers: print "$1" behind a colored level tag. %b expands the | ||||
| # backslash escapes in the color variables and in the message. | ||||
| log_info() { | ||||
|     printf '%b\n' "${BLUE}[INFO]${NC} $1" | ||||
| } | ||||
| 
 | ||||
| log_success() { | ||||
|     printf '%b\n' "${GREEN}[SUCCESS]${NC} $1" | ||||
| } | ||||
| 
 | ||||
| log_warning() { | ||||
|     printf '%b\n' "${YELLOW}[WARNING]${NC} $1" | ||||
| } | ||||
| 
 | ||||
| log_error() { | ||||
|     printf '%b\n' "${RED}[ERROR]${NC} $1" | ||||
| } | ||||
| 
 | ||||
| # Print a project status overview: Docker Swarm state, OpenTofu | ||||
| # installation and the deployed Docker stacks. | ||||
| # Outputs: status report on stdout. | ||||
| show_status() { | ||||
|     local swarm_state tofu_version | ||||
|      | ||||
|     log_info "=== 项目状态总览 ===" | ||||
|     echo "" | ||||
|      | ||||
|     # Query the swarm state once; empty when docker is unavailable. | ||||
|     # It is compared exactly below: a substring match on "active" would | ||||
|     # also accept "inactive". | ||||
|     swarm_state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null) || swarm_state="" | ||||
|      | ||||
|     # Docker Swarm state | ||||
|     log_info "Docker Swarm 状态:" | ||||
|     if [[ "$swarm_state" == "active" ]]; then | ||||
|         log_success "✓ Docker Swarm 已激活" | ||||
|         # docker node ls can fail (e.g. on a non-manager node); with | ||||
|         # pipefail + set -e that would otherwise abort the whole overview. | ||||
|         docker node ls 2>/dev/null | head -n 5 || log_warning "无法列出 Swarm 节点" | ||||
|     else | ||||
|         log_warning "✗ Docker Swarm 未激活" | ||||
|     fi | ||||
|     echo "" | ||||
|      | ||||
|     # OpenTofu state | ||||
|     log_info "OpenTofu 状态:" | ||||
|     if command -v tofu &> /dev/null; then | ||||
|         # Assigned separately from `local` so a failure is not masked. | ||||
|         tofu_version=$(tofu version | head -n1) || tofu_version="unknown" | ||||
|         log_success "✓ OpenTofu 已安装: $tofu_version" | ||||
|     else | ||||
|         log_warning "✗ OpenTofu 未安装" | ||||
|     fi | ||||
|     echo "" | ||||
|      | ||||
|     # Deployed stacks | ||||
|     log_info "已部署的 Docker Stack:" | ||||
|     if [[ "$swarm_state" == "active" ]]; then | ||||
|         docker stack ls 2>/dev/null || log_warning "无堆栈部署" | ||||
|     else | ||||
|         log_warning "Swarm 未激活,无法查看堆栈" | ||||
|     fi | ||||
|     echo "" | ||||
| } | ||||
| 
 | ||||
| # Quick deployment: initialise Docker Swarm if needed, then deploy the | ||||
| # Traefik stack followed by the demo services stack. | ||||
| quick_deploy() { | ||||
|     local swarm_manager="${PROJECT_ROOT}/swarm/scripts/swarm-manager.sh" | ||||
|      | ||||
|     log_info "=== 快速部署 ===" | ||||
|      | ||||
|     # Initialise Swarm unless it is already active. -x matches the whole | ||||
|     # line: a plain substring match on "active" also matches "inactive", | ||||
|     # which would skip the initialisation exactly when it is needed. | ||||
|     if ! docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null | grep -qx "active"; then | ||||
|         log_info "初始化 Docker Swarm..." | ||||
|         "$swarm_manager" init | ||||
|     fi | ||||
|      | ||||
|     # Deploy Traefik | ||||
|     log_info "部署 Traefik 反向代理..." | ||||
|     "$swarm_manager" deploy traefik "${PROJECT_ROOT}/swarm/stacks/traefik-swarm-stack.yml" | ||||
|      | ||||
|     # Give Traefik time to start | ||||
|     log_info "等待 Traefik 启动..." | ||||
|     sleep 10 | ||||
|      | ||||
|     # Deploy the demo services | ||||
|     log_info "部署示例服务..." | ||||
|     "$swarm_manager" deploy demo "${PROJECT_ROOT}/swarm/stacks/demo-services-stack.yml" | ||||
|      | ||||
|     log_success "快速部署完成!" | ||||
|     echo "" | ||||
|     log_info "访问地址:" | ||||
|     echo "  - Traefik Dashboard: http://localhost:8080" | ||||
|     echo "  - 示例应用: 请查看 demo 堆栈的服务配置" | ||||
| } | ||||
| 
 | ||||
| # Clean up the environment: remove every deployed Docker stack, then | ||||
| # pause briefly so the services can be torn down. | ||||
| cleanup() { | ||||
|     local stack_name | ||||
|      | ||||
|     log_info "=== 清理环境 ===" | ||||
|     log_info "停止所有 Docker Stack..." | ||||
|      | ||||
|     # One stack name per line; empty lines are skipped. | ||||
|     docker stack ls --format "{{.Name}}" 2>/dev/null | while read -r stack_name; do | ||||
|         [[ -n "$stack_name" ]] || continue | ||||
|         log_info "删除堆栈: $stack_name" | ||||
|         docker stack rm "$stack_name" | ||||
|     done | ||||
|      | ||||
|     log_info "等待服务清理..." | ||||
|     sleep 5 | ||||
|      | ||||
|     log_success "环境清理完成" | ||||
| } | ||||
| 
 | ||||
| # Print the usage text: available commands and the helper scripts. | ||||
| show_help() { | ||||
|     # One argument per output line; "" produces an empty line. | ||||
|     printf '%s\n' \ | ||||
|         "项目管理脚本" \ | ||||
|         "" \ | ||||
|         "用法: $0 [命令]" \ | ||||
|         "" \ | ||||
|         "命令:" \ | ||||
|         "  status      - 显示项目状态总览" \ | ||||
|         "  deploy      - 快速部署所有服务" \ | ||||
|         "  cleanup     - 清理所有部署的服务" \ | ||||
|         "  swarm       - 打开 Swarm 管理工具" \ | ||||
|         "  tofu        - 打开 OpenTofu 管理工具" \ | ||||
|         "  help        - 显示此帮助信息" \ | ||||
|         "" \ | ||||
|         "子工具:" \ | ||||
|         "  ./swarm/scripts/swarm-manager.sh   - Docker Swarm 管理" \ | ||||
|         "  ./scripts/setup/setup-opentofu.sh - OpenTofu 设置" \ | ||||
|         "" | ||||
| } | ||||
| 
 | ||||
| # Entry point: change into the project root and dispatch on the first | ||||
| # argument. A missing or unknown command prints the help text. | ||||
| main() { | ||||
|     local subcommand="${1:-help}" | ||||
|      | ||||
|     cd "$PROJECT_ROOT" | ||||
|      | ||||
|     case "$subcommand" in | ||||
|         status)  show_status ;; | ||||
|         deploy)  quick_deploy ;; | ||||
|         cleanup) cleanup ;; | ||||
|         # The two sub-tools replace this process and receive the remaining arguments. | ||||
|         swarm)   exec "${PROJECT_ROOT}/swarm/scripts/swarm-manager.sh" "${@:2}" ;; | ||||
|         tofu)    exec "${PROJECT_ROOT}/scripts/setup/setup-opentofu.sh" "${@:2}" ;; | ||||
|         *)       show_help ;; | ||||
|     esac | ||||
| } | ||||
| 
 | ||||
| main "$@" | ||||
|  | @ -1,137 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| set -e | ||||
| 
 | ||||
| # 颜色定义 | ||||
| RED='\033[0;31m' | ||||
| GREEN='\033[0;32m' | ||||
| YELLOW='\033[1;33m' | ||||
| BLUE='\033[0;34m' | ||||
| NC='\033[0m' # No Color | ||||
| 
 | ||||
| # Logging helpers: print "$1" behind a colored level tag. %b expands the | ||||
| # backslash escapes in the color variables and in the message. | ||||
| log_info() { | ||||
|     printf '%b\n' "${BLUE}[INFO]${NC} $1" | ||||
| } | ||||
| 
 | ||||
| log_success() { | ||||
|     printf '%b\n' "${GREEN}[SUCCESS]${NC} $1" | ||||
| } | ||||
| 
 | ||||
| log_warning() { | ||||
|     printf '%b\n' "${YELLOW}[WARNING]${NC} $1" | ||||
| } | ||||
| 
 | ||||
| log_error() { | ||||
|     printf '%b\n' "${RED}[ERROR]${NC} $1" | ||||
| } | ||||
| 
 | ||||
| # Verify that the inventory and the playbook exist (paths are relative to | ||||
| # the current directory); exit with status 1 if either is missing. | ||||
| check_prerequisites() { | ||||
|     local inventory="configuration/inventories/production/nomad-cluster.ini" | ||||
|     local playbook="configuration/playbooks/applications/configure-nomad-cluster.yml" | ||||
|      | ||||
|     log_info "检查前置条件..." | ||||
|      | ||||
|     if ! [ -f "$inventory" ]; then | ||||
|         log_error "找不到 Nomad 集群配置文件" | ||||
|         exit 1 | ||||
|     fi | ||||
|      | ||||
|     if ! [ -f "$playbook" ]; then | ||||
|         log_error "找不到 Nomad 配置 playbook" | ||||
|         exit 1 | ||||
|     fi | ||||
|      | ||||
|     log_success "前置条件检查完成" | ||||
| } | ||||
| 
 | ||||
| # Generate the Nomad gossip encryption key with the local nomad binary and | ||||
| # substitute it for the placeholder in the inventory file. | ||||
| # Globals: ENCRYPT_KEY (written) | ||||
| # Returns: 0 when the key was written or when nomad is not installed | ||||
| #          locally; exits with status 1 when key generation fails. | ||||
| generate_encrypt_key() { | ||||
|     log_info "生成 Nomad 加密密钥..." | ||||
|      | ||||
|     if ! command -v nomad >/dev/null 2>&1; then | ||||
|         log_warning "本地未安装 Nomad,将在远程节点生成密钥" | ||||
|         return 0 | ||||
|     fi | ||||
|      | ||||
|     # Never write an empty key into the inventory. | ||||
|     if ! ENCRYPT_KEY=$(nomad operator gossip keyring generate) || [ -z "$ENCRYPT_KEY" ]; then | ||||
|         log_error "生成加密密钥失败" | ||||
|         exit 1 | ||||
|     fi | ||||
|     # The key is a secret: report success without printing its value. | ||||
|     log_success "已生成加密密钥" | ||||
|      | ||||
|     # Replace the placeholder in the inventory with the generated key. | ||||
|     sed -i "s|YOUR_NOMAD_ENCRYPT_KEY_HERE|$ENCRYPT_KEY|g" configuration/inventories/production/nomad-cluster.ini | ||||
|     log_success "已更新配置文件中的加密密钥" | ||||
| } | ||||
| 
 | ||||
| # Ping every host of the nomad_cluster group; exit with status 1 if any | ||||
| # host is unreachable. | ||||
| test_connectivity() { | ||||
|     log_info "测试目标主机连接性..." | ||||
|      | ||||
|     # The command is tested directly: under `set -e` a separate `$?` check | ||||
|     # is never reached on failure, so the error message would never print. | ||||
|     if ansible -i configuration/inventories/production/nomad-cluster.ini nomad_cluster -m ping; then | ||||
|         log_success "所有主机连接正常" | ||||
|     else | ||||
|         log_error "部分主机连接失败,请检查网络和SSH配置" | ||||
|         exit 1 | ||||
|     fi | ||||
| } | ||||
| 
 | ||||
| # Run the Nomad cluster configuration playbook; exit with status 1 if it fails. | ||||
| configure_cluster() { | ||||
|     log_info "开始配置 Nomad 集群..." | ||||
|      | ||||
|     # The command is tested directly: under `set -e` a separate `$?` check | ||||
|     # is never reached on failure, so the error message would never print. | ||||
|     if ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ | ||||
|                         configuration/playbooks/applications/configure-nomad-cluster.yml \ | ||||
|                         -v; then | ||||
|         log_success "Nomad 集群配置完成" | ||||
|     else | ||||
|         log_error "Nomad 集群配置失败" | ||||
|         exit 1 | ||||
|     fi | ||||
| } | ||||
| 
 | ||||
| # Report the cluster state after configuration: service status on every | ||||
| # server, then server members and node status queried on one server. | ||||
| # The checks are informational: a failing check is reported as a warning | ||||
| # and the remaining checks still run, instead of `set -e` ending the script. | ||||
| verify_cluster() { | ||||
|     local inventory="configuration/inventories/production/nomad-cluster.ini" | ||||
|      | ||||
|     log_info "验证集群状态..." | ||||
|      | ||||
|     # Give the services time to start | ||||
|     sleep 10 | ||||
|      | ||||
|     log_info "检查 Nomad 服务状态..." | ||||
|     ansible -i "$inventory" nomad_servers \ | ||||
|             -m shell -a "systemctl status nomad --no-pager" \ | ||||
|         || log_warning "Nomad 服务状态检查失败" | ||||
|      | ||||
|     # --limit takes a host pattern, not a count: "1" matches no host. | ||||
|     # 'nomad_servers[0]' selects the first host of the group. | ||||
|     log_info "检查集群成员..." | ||||
|     ansible -i "$inventory" nomad_servers \ | ||||
|             -m shell -a "nomad server members" --limit 'nomad_servers[0]' \ | ||||
|         || log_warning "集群成员检查失败" | ||||
|      | ||||
|     log_info "检查节点状态..." | ||||
|     ansible -i "$inventory" nomad_servers \ | ||||
|             -m shell -a "nomad node status" --limit 'nomad_servers[0]' \ | ||||
|         || log_warning "节点状态检查失败" | ||||
| } | ||||
| 
 | ||||
| # Entry point: run all configuration steps in order, then print where the | ||||
| # Nomad UI is reachable and a few common commands. | ||||
| main() { | ||||
|     local separator="==================================" | ||||
|      | ||||
|     printf '%s\n' "🚀 开始配置 Nomad 集群..." "$separator" | ||||
|      | ||||
|     check_prerequisites | ||||
|     generate_encrypt_key | ||||
|     test_connectivity | ||||
|     configure_cluster | ||||
|     verify_cluster | ||||
|      | ||||
|     printf '%s\n' "$separator" | ||||
|     log_success "Nomad 集群配置完成!" | ||||
|     # One argument per output line; "" produces an empty line. | ||||
|     printf '%s\n' \ | ||||
|         "" \ | ||||
|         "访问 Nomad UI:" \ | ||||
|         "- Master: http://100.117.106.136:4646" \ | ||||
|         "- Semaphore: http://100.116.158.95:4646" \ | ||||
|         "" \ | ||||
|         "常用命令:" \ | ||||
|         "- 查看集群状态: nomad server members" \ | ||||
|         "- 查看节点状态: nomad node status" \ | ||||
|         "- 运行作业: nomad job run <job-file>" | ||||
| } | ||||
| 
 | ||||
| # Entry point: invoke main, forwarding every command-line argument unchanged | ||||
| main "$@" | ||||
|  | @ -1,104 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
# Consul cluster deployment script
# Deploys a Consul cluster onto physical machines with Ansible.
#
# Flow: validate the input files, make sure the inventory carries an
# encryption key, ping the target hosts, show what will be deployed, ask for
# confirmation, then run the playbook.

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
INVENTORY_FILE="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini"
PLAYBOOK_FILE="$PROJECT_ROOT/configuration/playbooks/applications/consul-cluster.yml"

echo "=== Consul 集群部署脚本 ==="
echo "项目根目录: $PROJECT_ROOT"
echo "清单文件: $INVENTORY_FILE"
echo "Playbook: $PLAYBOOK_FILE"
echo

# Check the required files
if [[ ! -f "$INVENTORY_FILE" ]]; then
    echo "错误: 清单文件不存在: $INVENTORY_FILE"
    exit 1
fi

if [[ ! -f "$PLAYBOOK_FILE" ]]; then
    echo "错误: Playbook 文件不存在: $PLAYBOOK_FILE"
    exit 1
fi

# Generate the Consul encryption key (if the placeholder is still present)
echo "1. 检查 Consul 加密密钥..."
if grep -q "YOUR_BASE64_ENCRYPT_KEY_HERE" "$INVENTORY_FILE"; then
    echo "需要生成 Consul 加密密钥..."

    # Try to generate the key with a locally installed consul binary
    if command -v consul &> /dev/null; then
        ENCRYPT_KEY=$(consul keygen)
        # NOTE(review): this prints the secret to the terminal / captured
        # logs -- confirm that is intended.
        echo "生成的加密密钥: $ENCRYPT_KEY"

        # Replace the placeholder in the inventory file. The key is base64 and
        # may contain "/", so "/" cannot serve as the sed delimiter; "|" is
        # not part of the base64 alphabet.
        sed -i "s|YOUR_BASE64_ENCRYPT_KEY_HERE|$ENCRYPT_KEY|" "$INVENTORY_FILE"
        echo "已更新清单文件中的加密密钥"
    else
        echo "警告: 未找到 consul 命令,请手动生成加密密钥并更新清单文件"
        echo "可以使用以下命令生成: consul keygen"
        echo "或者使用在线工具生成 32 字节的 base64 编码密钥"
    fi
fi

# Test connectivity
echo
echo "2. 测试目标主机连接..."
# Branch on the command itself: under "set -e" a separate "$?" check would
# never run because the script aborts on the failing command first.
if ! ansible -i "$INVENTORY_FILE" consul_cluster -m ping; then
    echo "错误: 无法连接到目标主机,请检查清单文件中的连接信息"
    exit 1
fi

# Show deployment information
echo
echo "3. 部署信息:"
echo "目标主机:"
ansible -i "$INVENTORY_FILE" consul_cluster --list-hosts

echo
echo "Consul 版本: $(grep consul_version "$INVENTORY_FILE" | cut -d'=' -f2)"
echo "数据中心: $(grep consul_datacenter "$INVENTORY_FILE" | cut -d'=' -f2)"

# Confirm the deployment
echo
# -r keeps backslashes in the answer literal.
read -r -p "确认部署 Consul 集群到上述主机? (y/N): " confirm
if [[ $confirm != "y" && $confirm != "Y" ]]; then
    echo "部署已取消"
    exit 0
fi

# Run the deployment
echo
echo "4. 开始部署 Consul 集群..."
if ansible-playbook -i "$INVENTORY_FILE" "$PLAYBOOK_FILE" -v; then
    echo
    echo "=== 部署完成 ==="
    echo
    echo "验证集群状态:"
    echo "1. 检查服务状态:"
    echo "   ansible -i $INVENTORY_FILE consul_cluster -m shell -a 'systemctl status consul'"
    echo
    echo "2. 检查集群成员:"
    echo "   ansible -i $INVENTORY_FILE consul_cluster -m shell -a 'consul members'"
    echo
    echo "3. 访问 Web UI:"
    echo "   - Master: http://master:8500"
    echo "   - Ash3c: http://ash3c:8500"
    echo
    echo "4. 检查集群领导者:"
    echo "   curl http://master:8500/v1/status/leader"
    echo
else
    echo "部署失败,请检查错误信息"
    exit 1
fi
|  | @ -1,132 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| # Consul Cluster Simple Deployment Script | ||||
| # 简化版 Consul 集群部署脚本 | ||||
| 
 | ||||
set -e

# Resolve this script's directory and the project root two levels above it,
# independent of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Color definitions (ANSI escape sequences, expanded by printf's %b)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Logging functions
# Each takes the message as $1. Only the color codes pass through %b; the
# message is printed with %s so backslashes inside it are kept literal.

# Informational message, written to stdout.
log_info() {
    printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "${1-}"
}

# Success message, written to stdout.
log_success() {
    printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "${1-}"
}

# Warning, written to stderr so it is not mixed into captured stdout.
log_warning() {
    printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "${1-}" >&2
}

# Error, written to stderr so it is not mixed into captured stdout.
log_error() {
    printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "${1-}" >&2
}
| 
 | ||||
# Check dependencies
# Confirms that ansible-playbook and python3 can be resolved by the shell.
# Exits the script with status 1 at the first one that is missing.
check_dependencies() {
    log_info "检查依赖项..."

    command -v ansible-playbook > /dev/null 2>&1 || {
        log_error "ansible-playbook 未找到,请安装 Ansible"
        exit 1
    }

    command -v python3 > /dev/null 2>&1 || {
        log_error "python3 未找到"
        exit 1
    }

    log_success "依赖检查完成"
}
| 
 | ||||
# Check network connectivity
# Pings the consul_cluster group. A missing inventory file is fatal (exit 1);
# a failed ping only produces a warning and the deployment carries on.
check_connectivity() {
    local inv="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini"

    log_info "检查目标主机连接性..."

    [[ -f "$inv" ]] || {
        log_error "清单文件不存在: $inv"
        exit 1
    }

    # Connectivity test
    if ! ansible consul_cluster -i "$inv" -m ping --one-line; then
        log_warning "部分主机连接失败,但继续部署..."
        return
    fi
    log_success "所有主机连接正常"
}
| 
 | ||||
# Deploy the Consul cluster
# Runs the consul-cluster-simple playbook (verbose) against the production
# inventory. Exits with status 1 when the playbook file is missing or the
# playbook run fails.
deploy_consul() {
    local cfg_dir="$PROJECT_ROOT/configuration"
    local playbook="$cfg_dir/playbooks/applications/consul-cluster-simple.yml"
    local inventory="$cfg_dir/inventories/production/consul-cluster.ini"

    log_info "开始部署 Consul 集群..."

    [[ -f "$playbook" ]] || {
        log_error "Playbook 文件不存在: $playbook"
        exit 1
    }

    # Run the Ansible playbook
    ansible-playbook -i "$inventory" "$playbook" -v || {
        log_error "Consul 集群部署失败"
        exit 1
    }
    log_success "Consul 集群部署完成"
}
| 
 | ||||
# Verify cluster state
# Shows the Consul service status on every node, then lists the members and
# the raft peers from a single node. Every check is best-effort: a failing
# ansible call does not abort the script.
verify_cluster() {
    log_info "验证 Consul 集群状态..."

    local inventory_file="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini"
    # Cluster-wide queries need only one node. Select it with Ansible's group
    # subscript pattern rather than grepping the inventory: a raw inventory
    # line also carries host variables and is not a usable --limit pattern.
    local first_node='consul_cluster[0]'

    # Check the service status
    log_info "检查 Consul 服务状态..."
    ansible consul_cluster -i "$inventory_file" -m shell -a "systemctl status consul --no-pager" || true

    # Check the cluster members
    log_info "检查集群成员..."
    ansible consul_cluster -i "$inventory_file" -m shell -a "/usr/local/bin/consul members" -l "$first_node" || true

    # Check the leader
    log_info "检查集群领导者..."
    ansible consul_cluster -i "$inventory_file" -m shell -a "/usr/local/bin/consul operator raft list-peers" -l "$first_node" || true
}
| 
 | ||||
# Main function
# Runs the deployment stages in order and then prints follow-up hints.
main() {
    local stage

    log_info "开始 Consul 集群简化部署..."

    for stage in check_dependencies check_connectivity deploy_consul verify_cluster; do
        "$stage"
    done

    log_success "Consul 集群部署流程完成!"

    echo
    log_info "后续步骤:"
    printf '%s\n' \
        "1. 检查集群状态: consul members" \
        "2. 访问 Web UI: http://<node-ip>:8500" \
        "3. 检查日志: journalctl -u consul -f"
}
| 
 | ||||
# Script entry point
# Run main only when this file is executed directly; when it is sourced the
# functions are defined but nothing runs.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"
|  | @ -1,146 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| # Nomad Cluster Deployment Script | ||||
| # Nomad 集群部署脚本 | ||||
| 
 | ||||
set -e

# Resolve this script's directory and the project root two levels above it,
# independent of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Color definitions (ANSI escape sequences, expanded by printf's %b)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Logging functions
# Each takes the message as $1. Only the color codes pass through %b; the
# message is printed with %s so backslashes inside it are kept literal.

# Informational message, written to stdout.
log_info() {
    printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "${1-}"
}

# Success message, written to stdout.
log_success() {
    printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "${1-}"
}

# Warning, written to stderr so it is not mixed into captured stdout.
log_warning() {
    printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "${1-}" >&2
}

# Error, written to stderr so it is not mixed into captured stdout.
log_error() {
    printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "${1-}" >&2
}
| 
 | ||||
# Check dependencies
# Confirms that ansible-playbook can be resolved by the shell; exits the
# script with status 1 when it cannot.
check_dependencies() {
    log_info "检查依赖项..."

    command -v ansible-playbook > /dev/null 2>&1 || {
        log_error "ansible-playbook 未找到,请安装 Ansible"
        exit 1
    }

    log_success "依赖检查完成"
}
| 
 | ||||
# Check network connectivity
# Pings the nomad_cluster group. A missing inventory file is fatal (exit 1);
# a failed ping only produces a warning and the deployment carries on.
check_connectivity() {
    local inv="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"

    log_info "检查目标主机连接性..."

    [[ -f "$inv" ]] || {
        log_error "清单文件不存在: $inv"
        exit 1
    }

    # Connectivity test
    if ! ansible nomad_cluster -i "$inv" -m ping --one-line; then
        log_warning "部分主机连接失败,但继续部署..."
        return
    fi
    log_success "所有主机连接正常"
}
| 
 | ||||
# Deploy the Nomad cluster
# Runs the nomad-cluster playbook (verbose) against the production inventory.
# Exits with status 1 when the playbook file is missing or the run fails.
deploy_nomad() {
    local cfg_dir="$PROJECT_ROOT/configuration"
    local playbook="$cfg_dir/playbooks/applications/nomad-cluster.yml"
    local inventory="$cfg_dir/inventories/production/nomad-cluster.ini"

    log_info "开始部署 Nomad 集群..."

    [[ -f "$playbook" ]] || {
        log_error "Playbook 文件不存在: $playbook"
        exit 1
    }

    # Run the Ansible playbook
    ansible-playbook -i "$inventory" "$playbook" -v || {
        log_error "Nomad 集群部署失败"
        exit 1
    }
    log_success "Nomad 集群部署完成"
}
| 
 | ||||
# Verify cluster state
# Shows the Nomad service status on every cluster host, then queries server
# membership, node status and job status from a single server. Every check is
# best-effort: a failing ansible call does not abort the script.
verify_cluster() {
    log_info "验证 Nomad 集群状态..."

    local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
    # Cluster-wide queries need only one server. Select it with Ansible's
    # group subscript pattern rather than grepping the inventory: a raw
    # inventory line also carries host variables and is not a usable --limit
    # pattern.
    local first_server='nomad_servers[0]'

    # Check the service status
    log_info "检查 Nomad 服务状态..."
    ansible nomad_cluster -i "$inventory_file" -m shell -a "systemctl status nomad --no-pager" || true

    # Check the cluster servers
    log_info "检查集群服务器..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad server members" -l "$first_server" || true

    # Check the node status
    log_info "检查节点状态..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad node status" -l "$first_server" || true

    # Show cluster information
    log_info "集群信息..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad status" -l "$first_server" || true
}
| 
 | ||||
# Show access information
# Prints the Web UI / API addresses of the cluster and a list of common
# nomad commands.
show_access_info() {
    log_info "Nomad 集群访问信息:"
    printf '%s\n' \
        "" \
        "Web UI 访问地址:" \
        "  - http://10.0.0.232:4646" \
        "  - http://10.0.0.179:4646" \
        "" \
        "API 访问地址:" \
        "  - http://10.0.0.232:4646/v1/" \
        "  - http://10.0.0.179:4646/v1/" \
        "" \
        "常用命令:" \
        "  - 查看集群状态: nomad status" \
        "  - 查看节点: nomad node status" \
        "  - 查看服务器: nomad server members" \
        "  - 提交作业: nomad job run <job-file>" \
        ""
}
| 
 | ||||
# Main function
# Runs the deployment stages in order, framed by a start and a completion
# message.
main() {
    local stage

    log_info "开始 Nomad 集群部署..."

    for stage in check_dependencies check_connectivity deploy_nomad \
                 verify_cluster show_access_info; do
        "$stage"
    done

    log_success "Nomad 集群部署流程完成!"
}
| 
 | ||||
# Script entry point
# Run main only when this file is executed directly; when it is sourced the
# functions are defined but nothing runs.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"
|  | @ -1,136 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| # Nomad Local Deployment Script | ||||
| # Nomad 本地部署脚本 | ||||
| 
 | ||||
set -e

# Resolve this script's directory and the project root two levels above it,
# independent of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Color definitions (ANSI escape sequences, expanded by printf's %b)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Logging functions
# Each takes the message as $1. Only the color codes pass through %b; the
# message is printed with %s so backslashes inside it are kept literal.

# Informational message, written to stdout.
log_info() {
    printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "${1-}"
}

# Success message, written to stdout.
log_success() {
    printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "${1-}"
}

# Warning, written to stderr so it is not mixed into captured stdout.
log_warning() {
    printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "${1-}" >&2
}

# Error, written to stderr so it is not mixed into captured stdout.
log_error() {
    printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "${1-}" >&2
}
| 
 | ||||
# Check dependencies
# Confirms that ansible-playbook and docker can be resolved by the shell.
# Exits the script with status 1 at the first one that is missing.
check_dependencies() {
    log_info "检查依赖项..."

    command -v ansible-playbook > /dev/null 2>&1 || {
        log_error "ansible-playbook 未找到,请安装 Ansible"
        exit 1
    }

    command -v docker > /dev/null 2>&1 || {
        log_error "docker 未找到,请安装 Docker"
        exit 1
    }

    log_success "依赖检查完成"
}
| 
 | ||||
# Deploy Nomad
# Runs the nomad-local playbook (verbose) without an explicit inventory.
# Exits with status 1 when the playbook file is missing or the run fails.
deploy_nomad() {
    local playbook="$PROJECT_ROOT/configuration/playbooks/applications/nomad-local.yml"

    log_info "开始部署 Nomad (本地单节点)..."

    [[ -f "$playbook" ]] || {
        log_error "Playbook 文件不存在: $playbook"
        exit 1
    }

    # Run the Ansible playbook
    ansible-playbook "$playbook" -v || {
        log_error "Nomad 本地部署失败"
        exit 1
    }
    log_success "Nomad 本地部署完成"
}
| 
 | ||||
# Verify the deployment
# After a short pause, shows the nomad service status and asks the local
# nomad binary for its version, node status and server members. Each check is
# best-effort: a failing command does not abort the script.
verify_deployment() {
    local nomad_bin="/usr/local/bin/nomad"

    log_info "验证 Nomad 部署..."

    # Wait for the service to start
    sleep 5

    # Service status
    log_info "检查 Nomad 服务状态..."
    systemctl status nomad --no-pager || :

    # Nomad version
    log_info "检查 Nomad 版本..."
    "$nomad_bin" version || :

    # Node status
    log_info "检查节点状态..."
    "$nomad_bin" node status || :

    # Server status
    log_info "检查服务器状态..."
    "$nomad_bin" server members || :
}
| 
 | ||||
# Show access information
# Prints the Web UI / API addresses (localhost plus, when it can be
# determined, the first address reported by `hostname -I`), common nomad
# commands and the location of the example job files.
show_access_info() {
    # Declare first, assign second: "local var=$(cmd)" reports the status of
    # "local" and hides the status of the command.
    local current_ip
    # NOTE(review): `hostname -I` is assumed to be unavailable on some
    # platforms; a failure or empty answer simply drops the IP-based lines
    # instead of printing a broken "http://:4646" URL.
    current_ip=$(hostname -I 2>/dev/null | awk '{print $1}') || current_ip=""

    log_info "Nomad 访问信息:"
    echo ""
    echo "Web UI 访问地址:"
    echo "  - http://localhost:4646"
    if [[ -n "$current_ip" ]]; then
        echo "  - http://${current_ip}:4646"
    fi
    echo ""
    echo "API 访问地址:"
    echo "  - http://localhost:4646/v1/"
    if [[ -n "$current_ip" ]]; then
        echo "  - http://${current_ip}:4646/v1/"
    fi
    echo ""
    echo "常用命令:"
    echo "  - 查看集群状态: nomad status"
    echo "  - 查看节点: nomad node status"
    echo "  - 查看服务器: nomad server members"
    echo "  - 提交作业: nomad job run <job-file>"
    echo ""
    echo "示例作业文件位置:"
    echo "  - $PROJECT_ROOT/examples/nomad-jobs/"
    echo ""
}
| 
 | ||||
# Main function
# Runs the local deployment stages in order, framed by a start and a
# completion message.
main() {
    local stage

    log_info "开始 Nomad 本地部署..."

    for stage in check_dependencies deploy_nomad verify_deployment show_access_info; do
        "$stage"
    done

    log_success "Nomad 本地部署流程完成!"
}
| 
 | ||||
# Script entry point
# Run main only when this file is executed directly; when it is sourced the
# functions are defined but nothing runs.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"
|  | @ -1,149 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| # Install Nomad Cluster via APT | ||||
| # 通过 APT 安装 Nomad 集群 | ||||
| 
 | ||||
set -e

# Resolve this script's directory and the project root two levels above it,
# independent of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Color definitions (ANSI escape sequences, expanded by printf's %b)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Logging functions
# Each takes the message as $1. Only the color codes pass through %b; the
# message is printed with %s so backslashes inside it are kept literal.

# Informational message, written to stdout.
log_info() {
    printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "${1-}"
}

# Success message, written to stdout.
log_success() {
    printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "${1-}"
}

# Warning, written to stderr so it is not mixed into captured stdout.
log_warning() {
    printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "${1-}" >&2
}

# Error, written to stderr so it is not mixed into captured stdout.
log_error() {
    printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "${1-}" >&2
}
| 
 | ||||
# Check dependencies
# Confirms that ansible-playbook can be resolved by the shell; exits the
# script with status 1 when it cannot.
check_dependencies() {
    log_info "检查依赖项..."

    command -v ansible-playbook > /dev/null 2>&1 || {
        log_error "ansible-playbook 未找到,请安装 Ansible"
        exit 1
    }

    log_success "依赖检查完成"
}
| 
 | ||||
# Check network connectivity
# Pings the nomad_servers group. A missing inventory file is fatal (exit 1);
# a failed ping only produces a warning and the installation carries on.
check_connectivity() {
    local inv="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"

    log_info "检查目标主机连接性..."

    [[ -f "$inv" ]] || {
        log_error "清单文件不存在: $inv"
        exit 1
    }

    # Connectivity test
    if ! ansible nomad_servers -i "$inv" -m ping --one-line; then
        log_warning "部分主机连接失败,但继续安装..."
        return
    fi
    log_success "所有主机连接正常"
}
| 
 | ||||
# Install Nomad
# Runs the install-nomad-apt playbook (verbose) against the production
# inventory. Exits with status 1 when the playbook file is missing or the
# run fails.
install_nomad() {
    local cfg_dir="$PROJECT_ROOT/configuration"
    local playbook="$cfg_dir/playbooks/applications/install-nomad-apt.yml"
    local inventory="$cfg_dir/inventories/production/nomad-cluster.ini"

    log_info "开始在远程主机安装 Nomad..."

    [[ -f "$playbook" ]] || {
        log_error "Playbook 文件不存在: $playbook"
        exit 1
    }

    # Run the Ansible playbook
    ansible-playbook -i "$inventory" "$playbook" -v || {
        log_error "Nomad 集群安装失败"
        exit 1
    }
    log_success "Nomad 集群安装完成"
}
| 
 | ||||
# Verify the installation
# Shows the Nomad service status and version on every server, then queries
# server membership and node status from a single server. Every check is
# best-effort: a failing ansible call does not abort the script.
verify_installation() {
    log_info "验证 Nomad 安装..."

    local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
    # Cluster-wide queries need only one server. Select it with Ansible's
    # group subscript pattern rather than grepping the inventory: a raw
    # inventory line also carries host variables and is not a usable --limit
    # pattern.
    local first_server='nomad_servers[0]'

    # Check the service status
    log_info "检查 Nomad 服务状态..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "systemctl status nomad --no-pager" || true

    # Check the Nomad version
    log_info "检查 Nomad 版本..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "nomad version" || true

    # Check the cluster servers
    log_info "检查集群服务器..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "nomad server members" -l "$first_server" || true

    # Check the node status
    log_info "检查节点状态..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "nomad node status" -l "$first_server" || true
}
| 
 | ||||
# Show access information
# Prints the Web UI / API addresses of both servers, common nomad commands
# and the location of the example job files under PROJECT_ROOT.
show_access_info() {
    log_info "Nomad 集群访问信息:"
    printf '%s\n' \
        "" \
        "Web UI 访问地址:" \
        "  - http://100.117.106.136:4646 (master)" \
        "  - http://100.116.158.95:4646 (semaphore)" \
        "" \
        "API 访问地址:" \
        "  - http://100.117.106.136:4646/v1/ (master)" \
        "  - http://100.116.158.95:4646/v1/ (semaphore)" \
        "" \
        "常用命令:" \
        "  - 查看集群状态: nomad status" \
        "  - 查看节点: nomad node status" \
        "  - 查看服务器: nomad server members" \
        "  - 提交作业: nomad job run <job-file>" \
        "" \
        "示例作业文件位置:" \
        "  - $PROJECT_ROOT/examples/nomad-jobs/" \
        ""
}
| 
 | ||||
# Main function
# Runs the installation stages in order, framed by a start and a completion
# message.
main() {
    local stage

    log_info "开始 Nomad 集群安装..."

    for stage in check_dependencies check_connectivity install_nomad \
                 verify_installation show_access_info; do
        "$stage"
    done

    log_success "Nomad 集群安装流程完成!"
}
| 
 | ||||
# Script entry point
# Run main only when this file is executed directly; when it is sourced the
# functions are defined but nothing runs.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"
|  | @ -1,467 +0,0 @@ | |||
| #!/bin/bash | ||||
| # Gitea 集成设置脚本 | ||||
| 
 | ||||
set -e

echo "🔗 设置 Gitea 集成..."

# Configuration variables
GITEA_HOST="gitea"
GITEA_USER="ben"
REPO_NAME="mgmt"
GITEA_SSH_URL="git@${GITEA_HOST}"
GITEA_HTTP_URL="http://${GITEA_HOST}:3000"

# Check the SSH connection
# The reply of a non-interactive login is captured and searched for the
# authentication notice. The exit status of ssh itself is deliberately
# ignored; only the text of the reply decides.
echo "🔍 检查 Gitea SSH 连接..."
ssh_reply=$(ssh -o ConnectTimeout=5 -o BatchMode=yes "${GITEA_SSH_URL}" 2>&1) || true
if [[ "$ssh_reply" != *"successfully authenticated"* ]]; then
    echo "❌ SSH 连接失败,请检查:"
    echo "   1. Gitea 服务是否运行"
    echo "   2. SSH 密钥是否已添加到 Gitea"
    echo "   3. 网络连接是否正常"
    exit 1
fi
echo "✅ SSH 连接正常"
| 
 | ||||
# Initialise a Git repository here unless one already exists
if [ -d ".git" ]; then
    echo "✅ Git 仓库已存在"
else
    echo "📦 初始化 Git 仓库..."
    git init
    git config user.name "${GITEA_USER}"
    git config user.email "${GITEA_USER}@example.com"
fi

# Make sure the "origin" remote points at the Gitea repository: add it when
# missing, rewrite it when the current URL does not mention the Gitea host.
if ! git remote get-url origin >/dev/null 2>&1; then
    echo "➕ 添加远程仓库..."
    git remote add origin "${GITEA_SSH_URL}:${GITEA_USER}/${REPO_NAME}.git"
else
    CURRENT_ORIGIN=$(git remote get-url origin)
    echo "ℹ️  当前远程仓库: $CURRENT_ORIGIN"

    [[ "$CURRENT_ORIGIN" == *"${GITEA_HOST}"* ]] || {
        echo "🔄 更新远程仓库地址..."
        git remote set-url origin "${GITEA_SSH_URL}:${GITEA_USER}/${REPO_NAME}.git"
    }
fi
| 
 | ||||
| # Write the project .gitignore (replaces any existing file) | ||||
| echo "📝 创建 .gitignore..." | ||||
| cat > .gitignore << 'EOF' | ||||
| # OpenTofu/Terraform | ||||
| *.tfstate | ||||
| *.tfstate.* | ||||
| *.tfvars | ||||
| !*.tfvars.example | ||||
| .terraform/ | ||||
| .terraform.lock.hcl | ||||
| crash.log | ||||
| crash.*.log | ||||
| 
 | ||||
| # Ansible | ||||
| *.retry | ||||
| .vault_pass | ||||
| host_vars/*/vault.yml | ||||
| group_vars/*/vault.yml | ||||
| 
 | ||||
| # Docker | ||||
| .env | ||||
| docker-compose.override.yml | ||||
| 
 | ||||
| # IDE | ||||
| .vscode/ | ||||
| .idea/ | ||||
| *.swp | ||||
| *.swo | ||||
| *~ | ||||
| 
 | ||||
| # OS | ||||
| .DS_Store | ||||
| Thumbs.db | ||||
| 
 | ||||
| # Logs | ||||
| *.log | ||||
| logs/ | ||||
| 
 | ||||
| # Temporary files | ||||
| tmp/ | ||||
| temp/ | ||||
| .tmp/ | ||||
| 
 | ||||
| # Backup files | ||||
| backup-*/ | ||||
| *.bak | ||||
| 
 | ||||
| # Secrets | ||||
| secrets/ | ||||
| *.pem | ||||
| *.key | ||||
| *.crt | ||||
| !*.example.* | ||||
| 
 | ||||
| # Node modules (if any) | ||||
| node_modules/ | ||||
| 
 | ||||
| # Python | ||||
| __pycache__/ | ||||
| *.pyc | ||||
| *.pyo | ||||
| *.pyd | ||||
| .Python | ||||
| env/ | ||||
| venv/ | ||||
| .venv/ | ||||
| pip-log.txt | ||||
| pip-delete-this-directory.txt | ||||
| .tox/ | ||||
| .coverage | ||||
| .coverage.* | ||||
| .cache | ||||
| nosetests.xml | ||||
| coverage.xml | ||||
| *.cover | ||||
| *.log | ||||
| .git | ||||
| .mypy_cache | ||||
| .pytest_cache | ||||
| .hypothesis | ||||
| 
 | ||||
| # Local development | ||||
| .local/ | ||||
| local-* | ||||
| EOF | ||||
| 
 | ||||
# Create the Gitea Actions workflows
echo "🔄 创建 Gitea Actions 工作流..."

# The workflow directory does not exist in a fresh checkout; without it the
# redirection below fails and, under "set -e", aborts the whole script.
mkdir -p .gitea/workflows

# Infrastructure CI/CD
# The quoted 'EOF' delimiter keeps every $ and ${{ }} in the template literal.
cat > .gitea/workflows/infrastructure.yml << 'EOF'
name: Infrastructure CI/CD

on:
  push:
    branches: [ main, develop ]
    paths:
      - 'infrastructure/**'
      - '.gitea/workflows/infrastructure.yml'
  pull_request:
    branches: [ main ]
    paths:
      - 'infrastructure/**'

jobs:
  validate:
    runs-on: ubuntu-latest
    name: Validate Infrastructure
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup OpenTofu
        uses: opentofu/setup-opentofu@v1
        with:
          tofu_version: 1.10.6

      - name: Validate OpenTofu configurations
        run: |
          for dir in infrastructure/providers/*/; do
            if [ -d "$dir" ]; then
              echo "Validating $dir"
              cd "$dir"
              tofu init -backend=false
              tofu validate
              cd - > /dev/null
            fi
          done

      - name: Check formatting
        run: |
          tofu fmt -check -recursive infrastructure/

      - name: Security scan
        run: |
          # 这里可以添加 tfsec 或 checkov 扫描
          echo "Security scan placeholder"

  plan:
    runs-on: ubuntu-latest
    name: Plan Infrastructure
    needs: validate
    if: github.event_name == 'pull_request'
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup OpenTofu
        uses: opentofu/setup-opentofu@v1
        with:
          tofu_version: 1.10.6

      - name: Plan infrastructure changes
        run: |
          cd infrastructure/environments/dev
          tofu init
          tofu plan -var-file="terraform.tfvars" -out=tfplan
        env:
          # 这里需要配置云服务商的环境变量
          TF_VAR_environment: dev

  apply:
    runs-on: ubuntu-latest
    name: Apply Infrastructure
    needs: validate
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup OpenTofu
        uses: opentofu/setup-opentofu@v1
        with:
          tofu_version: 1.10.6

      - name: Apply infrastructure changes
        run: |
          cd infrastructure/environments/dev
          tofu init
          tofu apply -var-file="terraform.tfvars" -auto-approve
        env:
          TF_VAR_environment: dev
EOF
| 
 | ||||
# Application deployment workflow
# The target environment reaches the deploy step through an environment
# variable rather than being expanded into the shell script text, so its
# value can never be parsed as shell code.
# mkdir -p keeps this step working on its own when the directory is absent.
mkdir -p .gitea/workflows
cat > .gitea/workflows/deploy.yml << 'EOF'
name: Application Deployment

on:
  push:
    branches: [ main ]
    paths:
      - 'configuration/**'
      - 'containers/**'
      - '.gitea/workflows/deploy.yml'
  workflow_dispatch:
    inputs:
      environment:
        description: 'Target environment'
        required: true
        default: 'dev'
        type: choice
        options:
          - dev
          - staging
          - production

jobs:
  ansible-check:
    runs-on: ubuntu-latest
    name: Ansible Syntax Check
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install Ansible
        run: |
          pip install ansible ansible-core
          ansible-galaxy collection install community.general
          ansible-galaxy collection install ansible.posix
          ansible-galaxy collection install community.docker

      - name: Ansible syntax check
        run: |
          cd configuration
          for playbook in playbooks/*/*.yml; do
            if [ -f "$playbook" ]; then
              echo "Checking $playbook"
              ansible-playbook --syntax-check "$playbook"
            fi
          done

  deploy:
    runs-on: ubuntu-latest
    name: Deploy Applications
    needs: ansible-check
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install Ansible
        run: |
          pip install ansible ansible-core
          ansible-galaxy collection install community.general
          ansible-galaxy collection install ansible.posix
          ansible-galaxy collection install community.docker

      - name: Deploy applications
        run: |
          cd configuration
          ansible-playbook -i "inventories/${TARGET_ENV}/inventory.ini" playbooks/bootstrap/main.yml
        env:
          TARGET_ENV: ${{ github.event.inputs.environment || 'dev' }}
          ANSIBLE_HOST_KEY_CHECKING: False
EOF
| 
 | ||||
# Docker build workflow
# The generated build loop quotes the nested command substitution, so an
# application directory whose name contains whitespace is still handled as a
# single word.
# mkdir -p keeps this step working on its own when the directory is absent.
mkdir -p .gitea/workflows
cat > .gitea/workflows/docker.yml << 'EOF'
name: Docker Build and Deploy

on:
  push:
    branches: [ main ]
    paths:
      - 'containers/**'
      - 'Dockerfile*'
      - '.gitea/workflows/docker.yml'

jobs:
  build:
    runs-on: ubuntu-latest
    name: Build Docker Images
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ secrets.REGISTRY_URL }}
          username: ${{ secrets.REGISTRY_USERNAME }}
          password: ${{ secrets.REGISTRY_PASSWORD }}

      - name: Build and push images
        run: |
          # 构建应用镜像
          for dockerfile in containers/applications/*/Dockerfile; do
            if [ -f "$dockerfile" ]; then
              app_name=$(basename "$(dirname "$dockerfile")")
              echo "Building $app_name"
              docker build -t "${{ secrets.REGISTRY_URL }}/$app_name:${{ github.sha }}" -f "$dockerfile" .
              docker push "${{ secrets.REGISTRY_URL }}/$app_name:${{ github.sha }}"
            fi
          done

  deploy-swarm:
    runs-on: ubuntu-latest
    name: Deploy to Docker Swarm
    needs: build
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Deploy to Swarm
        run: |
          # 这里可以通过 SSH 连接到 Swarm 管理节点进行部署
          echo "Deploy to Swarm placeholder"
EOF
| 
 | ||||
| # 创建项目配置文件 | ||||
| echo "⚙️ 创建项目配置文件..." | ||||
| 
 | ||||
# Gitea repository configuration.
# Writes .gitea/settings.yml verbatim (quoted heredoc delimiter, no expansion).
# The branch-protection mapping lists require_signed_commits exactly once;
# a repeated key in a YAML mapping is rejected or silently overridden
# depending on the parser.
cat > .gitea/settings.yml << 'EOF'
# Gitea 仓库设置
repository:
  name: mgmt
  description: "基础设施管理项目 - OpenTofu + Ansible + Docker Swarm"
  website: ""
  default_branch: main

  # 功能开关
  has_issues: true
  has_wiki: true
  has_projects: true
  has_actions: true

  # 权限设置
  private: false
  allow_merge_commits: true
  allow_squash_merge: true
  allow_rebase_merge: true
  delete_branch_on_merge: true

# Actions 设置
actions:
  enabled: true
  allow_fork_pull_request_run: true
  default_actions_url: "https://gitea.com"

# 分支保护
branch_protection:
  main:
    enable_push: false
    enable_push_whitelist: true
    push_whitelist_usernames: ["ben"]
    require_signed_commits: false
    enable_merge_whitelist: true
    merge_whitelist_usernames: ["ben"]
    enable_status_check: true
    status_check_contexts: ["validate", "plan"]
    enable_approvals_whitelist: false
    approvals_whitelist_usernames: []
    block_on_rejected_reviews: true
    dismiss_stale_approvals: true
EOF
| 
 | ||||
# Stage every change in the working tree.
echo "📦 添加文件到 Git..."
git add .

# Commit only when the index differs from HEAD.
if ! git diff --staged --quiet; then
    echo "💾 提交变更..."
    commit_message="feat: 集成 OpenTofu + Ansible + Gitea CI/CD

- 重构项目目录结构
- 添加 OpenTofu 多云支持
- 配置 Ansible 自动化部署
- 集成 Gitea Actions CI/CD 流水线
- 添加 Docker Swarm 管理
- 完善监控和安全配置"
    git commit -m "$commit_message"
else
    echo "ℹ️  没有新的变更需要提交"
fi

# Push to the remote; a failed push only prints guidance, the script goes on.
echo "🚀 推送到 Gitea..."
if ! git push -u origin main; then
    echo "⚠️  推送失败,可能需要先在 Gitea 创建仓库"
    echo "   请访问: ${GITEA_HTTP_URL}/repo/create"
    echo "   创建名为 '${REPO_NAME}' 的仓库"
else
    echo "✅ 成功推送到 Gitea"
fi

# Closing summary: next steps and handy commands.
cat << EOF

🎉 Gitea 集成设置完成!

📋 下一步操作:
1. 访问 Gitea: ${GITEA_HTTP_URL}/${GITEA_USER}/${REPO_NAME}
2. 配置 Actions Secrets (如果需要):
   - REGISTRY_URL: 容器镜像仓库地址
   - REGISTRY_USERNAME: 仓库用户名
   - REGISTRY_PASSWORD: 仓库密码
3. 配置云服务商凭据 (通过 Secrets 或环境变量)
4. 测试 CI/CD 流水线

🔗 有用的命令:
  git status                    - 查看仓库状态
  git log --oneline            - 查看提交历史
  git push                     - 推送变更
  make help                    - 查看项目命令
EOF
|  | @ -0,0 +1,230 @@ | |||
#!/bin/bash
#
# Nomad laptop setup script (macOS / Linux).
# Joins this machine to the Nomad cluster as a server, reachable over Tailscale.
#
# Optional environment overrides (defaults are the values previously
# hard-coded in this script):
#   NOMAD_VERSION      Nomad release to download on Linux
#   NOMAD_DATACENTER   datacenter name written to the config
#   NOMAD_ENCRYPT_KEY  gossip encryption key written to the config
#
# Requires: tailscale, sudo, and either Homebrew (macOS) or curl + unzip (Linux).

set -euo pipefail

# Configuration variables.
NOMAD_VERSION="${NOMAD_VERSION:-1.10.5}"
NOMAD_DATACENTER="${NOMAD_DATACENTER:-dc1}"
# NOTE(review): the default gossip key is committed in plain text; prefer
# supplying NOMAD_ENCRYPT_KEY from the environment and rotating this value.
NOMAD_ENCRYPT_KEY="${NOMAD_ENCRYPT_KEY:-NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=}"

# Detect operating system and CPU architecture.
OS=$(uname -s | tr '[:upper:]' '[:lower:]')
ARCH=$(uname -m)

case "$ARCH" in
    x86_64) ARCH="amd64" ;;
    arm64|aarch64) ARCH="arm64" ;;
    *) echo "不支持的架构: $ARCH"; exit 1 ;;
esac

echo "🚀 开始设置 Nomad ($OS-$ARCH)..."

# 1. Check Tailscale.
echo "📡 检查 Tailscale 连接..."
if ! command -v tailscale &> /dev/null; then
    echo "❌ 请先安装 Tailscale"
    exit 1
fi

# `|| true`: with pipefail a failing `tailscale ip` would abort the script
# before the friendly message below can be printed.
TAILSCALE_IP=$(tailscale ip | head -n 1) || true
if [[ -z "$TAILSCALE_IP" ]]; then
    echo "❌ Tailscale 未连接,请先运行: tailscale up"
    exit 1
fi

echo "✅ Tailscale IP: $TAILSCALE_IP"

# 2. Install Nomad (only when it is not already on PATH).
if ! command -v nomad &> /dev/null; then
    echo "📦 安装 Nomad $NOMAD_VERSION..."

    if [[ "$OS" == "darwin" ]]; then
        # macOS
        if command -v brew &> /dev/null; then
            brew install nomad
        else
            echo "❌ 请先安装 Homebrew 或手动安装 Nomad"
            exit 1
        fi
    else
        # Linux: download into a private temp dir so the current directory is
        # left untouched and partial downloads are always cleaned up.
        NOMAD_URL="https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_${OS}_${ARCH}.zip"
        WORK_DIR=$(mktemp -d)
        trap 'rm -rf -- "$WORK_DIR"' EXIT
        # -f: fail on HTTP errors instead of saving an error page as nomad.zip.
        curl -fL "$NOMAD_URL" -o "$WORK_DIR/nomad.zip"
        unzip -o "$WORK_DIR/nomad.zip" nomad -d "$WORK_DIR"
        sudo install -m 0755 "$WORK_DIR/nomad" /usr/local/bin/nomad
    fi
fi

# Resolve the real binary location. Homebrew does not always install to
# /usr/local/bin, so the service definitions below must not assume that path.
NOMAD_BIN=$(command -v nomad)

echo "✅ Nomad 版本: $("$NOMAD_BIN" version)"

# 3. Create configuration and data directories owned by the current user.
echo "📁 创建配置目录..."
sudo mkdir -p /etc/nomad.d /opt/nomad/data
sudo chown -R "$(whoami):$(id -gn)" /etc/nomad.d /opt/nomad/data

# 4. Generate the Nomad configuration (unquoted heredoc: variables expand).
echo "⚙️  生成 Nomad 配置..."
cat > /etc/nomad.d/nomad.hcl << EOF
datacenter = "$NOMAD_DATACENTER"
data_dir = "/opt/nomad/data"
log_level = "INFO"

bind_addr = "$TAILSCALE_IP"

addresses {
  http = "0.0.0.0"
  rpc  = "$TAILSCALE_IP"
  serf = "$TAILSCALE_IP"
}

ports {
  http = 4646
  rpc  = 4647
  serf = 4648
}

server {
  enabled = true
  bootstrap_expect = 6

  retry_join = [
    "100.116.158.95",   # semaphore
    "100.117.106.136",  # master (现在是 client)
    "100.116.80.94"     # ash3c (现在是 client)
  ]

  encrypt = "$NOMAD_ENCRYPT_KEY"
}

client {
  enabled = false
}

# 如果是 macOS,可能需要 Docker 插件
plugin "podman" {
  config {
    volumes {
      enabled = true
    }
  }
}

consul {
  address = "$TAILSCALE_IP:8500"
}
EOF

echo "✅ 配置文件已生成: /etc/nomad.d/nomad.hcl"

# 5. Register and start the service (launchd on macOS, systemd on Linux).
if [[ "$OS" == "darwin" ]]; then
    # macOS - create a LaunchDaemon.
    echo "🍎 创建 macOS LaunchDaemon..."
    # Unload any previous instance so re-running the script does not abort on
    # `launchctl load`; failure here just means nothing was loaded yet.
    sudo launchctl unload /Library/LaunchDaemons/io.nomadproject.nomad.plist 2> /dev/null || true
    sudo tee /Library/LaunchDaemons/io.nomadproject.nomad.plist > /dev/null << EOF
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>io.nomadproject.nomad</string>
    <key>ProgramArguments</key>
    <array>
        <string>$NOMAD_BIN</string>
        <string>agent</string>
        <string>-config=/etc/nomad.d/nomad.hcl</string>
    </array>
    <key>RunAtLoad</key>
    <true/>
    <key>KeepAlive</key>
    <true/>
    <key>StandardOutPath</key>
    <string>/var/log/nomad.log</string>
    <key>StandardErrorPath</key>
    <string>/var/log/nomad.log</string>
</dict>
</plist>
EOF

    # Load and start the service.
    sudo launchctl load /Library/LaunchDaemons/io.nomadproject.nomad.plist
    sudo launchctl start io.nomadproject.nomad

else
    # Linux - create a systemd unit. \$MAINPID is escaped so systemd, not this
    # shell, expands it.
    echo "🐧 创建 systemd 服务..."
    sudo tee /etc/systemd/system/nomad.service > /dev/null << EOF
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target

[Service]
Type=notify
User=$(whoami)
Group=$(id -gn)
ExecStart=$NOMAD_BIN agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP \$MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536

[Install]
WantedBy=multi-user.target
EOF

    # Start the service.
    sudo systemctl daemon-reload
    sudo systemctl enable nomad
    sudo systemctl start nomad
fi

# 6. Verify that the service came up.
echo "🔍 验证 Nomad 服务..."
sleep 5

if [[ "$OS" == "darwin" ]]; then
    if sudo launchctl list | grep -q nomad; then
        echo "✅ Nomad 服务已启动"
    else
        echo "❌ Nomad 服务启动失败"
        exit 1
    fi
else
    if systemctl is-active --quiet nomad; then
        echo "✅ Nomad 服务已启动"
    else
        echo "❌ Nomad 服务启动失败"
        sudo systemctl status nomad || true
        exit 1
    fi
fi

# 7. Check cluster membership.
echo "🌐 检查集群连接..."
sleep 10

# Capture once instead of piping into `grep -q`: under pipefail an early grep
# exit could make the pipeline report failure even though a member is alive.
MEMBERS=$("$NOMAD_BIN" server members 2> /dev/null) || MEMBERS=""
if grep -q alive <<< "$MEMBERS"; then
    echo "✅ 成功加入 Nomad 集群!"
    printf '%s\n' "$MEMBERS"
else
    echo "⚠️  正在连接集群,请稍等..."
    echo "可以运行以下命令检查状态:"
    echo "  nomad server members"
    echo "  nomad node status"
fi

echo ""
echo "🎉 设置完成!"
echo "📊 Web UI: http://$TAILSCALE_IP:4646"
echo "🔧 配置文件: /etc/nomad.d/nomad.hcl"
echo "📝 日志查看:"
if [[ "$OS" == "darwin" ]]; then
    echo "  tail -f /var/log/nomad.log"
else
    echo "  sudo journalctl -u nomad -f"
fi
|  | @ -0,0 +1,212 @@ | |||
# Nomad Windows setup script.
# Joins a Windows laptop to the Nomad cluster as a server, reachable over
# Tailscale. Must be run from an elevated (Administrator) PowerShell session.
#
# Parameters:
#   -NomadVersion  Nomad release to download when nomad.exe is not present
#   -DataCenter    datacenter name written to the config
#   -EncryptKey    gossip encryption key written to the config

param(
    [string]$NomadVersion = "1.10.5",
    [string]$DataCenter = "dc1",
    [string]$EncryptKey = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
)

# Administrator rights are required (service creation, machine PATH, firewall).
if (-NOT ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator")) {
    Write-Host "❌ 此脚本需要管理员权限运行" -ForegroundColor Red
    Write-Host "请以管理员身份运行 PowerShell" -ForegroundColor Yellow
    exit 1
}

Write-Host "🚀 开始设置 Windows Nomad Server..." -ForegroundColor Green

# 1. Check Tailscale.
Write-Host "📡 检查 Tailscale 连接..." -ForegroundColor Cyan
try {
    $tailscaleIP = (tailscale ip) | Select-Object -First 1
    # Native commands do not throw on failure; inspect the exit code as well.
    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrEmpty($tailscaleIP)) {
        throw "Tailscale IP 为空"
    }
    Write-Host "✅ Tailscale IP: $tailscaleIP" -ForegroundColor Green
} catch {
    Write-Host "❌ Tailscale 未安装或未连接" -ForegroundColor Red
    Write-Host "请先安装 Tailscale 并运行: tailscale up" -ForegroundColor Yellow
    exit 1
}

# 2. Create directories.
Write-Host "📁 创建 Nomad 目录..." -ForegroundColor Cyan
$nomadDir = "C:\nomad"
$configDir = "$nomadDir\config"
$dataDir = "$nomadDir\data"
$binDir = "$nomadDir\bin"

foreach ($dir in @($configDir, $dataDir, $binDir)) {
    New-Item -ItemType Directory -Force -Path $dir | Out-Null
}

# 3. Download Nomad (only when nomad.exe is missing).
$nomadExe = "$binDir\nomad.exe"
if (-not (Test-Path $nomadExe)) {
    Write-Host "📦 下载 Nomad $NomadVersion..." -ForegroundColor Cyan
    $nomadUrl = "https://releases.hashicorp.com/nomad/$NomadVersion/nomad_${NomadVersion}_windows_amd64.zip"
    $zipPath = "$env:TEMP\nomad.zip"

    try {
        Invoke-WebRequest -Uri $nomadUrl -OutFile $zipPath
        Expand-Archive -Path $zipPath -DestinationPath $binDir -Force
        Remove-Item $zipPath
        Write-Host "✅ Nomad 下载完成" -ForegroundColor Green
    } catch {
        Write-Host "❌ 下载 Nomad 失败: $_" -ForegroundColor Red
        exit 1
    }
}

# 4. Add the bin directory to the machine PATH (only when missing).
$currentPath = [Environment]::GetEnvironmentVariable("PATH", "Machine")
if ($currentPath -notlike "*$binDir*") {
    Write-Host "🔧 添加 Nomad 到系统 PATH..." -ForegroundColor Cyan
    [Environment]::SetEnvironmentVariable("PATH", "$currentPath;$binDir", "Machine")
    $env:PATH += ";$binDir"
}

# 5. Generate the configuration file.
Write-Host "⚙️  生成 Nomad 配置..." -ForegroundColor Cyan
# NOTE: `${tailscaleIP}:8500` must use braces. Written as "$tailscaleIP:8500"
# PowerShell parses the colon as a drive/scope qualifier and the IP is lost.
$configContent = @"
datacenter = "$DataCenter"
data_dir = "$($dataDir -replace '\\', '/')"
log_level = "INFO"

bind_addr = "$tailscaleIP"

addresses {
  http = "0.0.0.0"
  rpc  = "$tailscaleIP"
  serf = "$tailscaleIP"
}

ports {
  http = 4646
  rpc  = 4647
  serf = 4648
}

server {
  enabled = true
  bootstrap_expect = 6

  retry_join = [
    "100.116.158.95",   # semaphore
    "100.117.106.136",  # master
    "100.116.80.94"     # ash3c
  ]

  encrypt = "$EncryptKey"
}

client {
  enabled = false
}

plugin "podman" {
  config {
    volumes {
      enabled = true
    }
  }
}

consul {
  address = "${tailscaleIP}:8500"
}
"@

$configFile = "$configDir\nomad.hcl"
$configContent | Out-File -FilePath $configFile -Encoding UTF8
Write-Host "✅ 配置文件已生成: $configFile" -ForegroundColor Green

# 6. Firewall rules. Opened before the service starts so cluster peers can
# reach this node during the join check. Serf gossip uses both TCP and UDP.
Write-Host "🔥 配置防火墙规则..." -ForegroundColor Cyan
$firewallRules = @(
    @{ Name = "Nomad HTTP";     Protocol = "TCP"; Port = 4646 },
    @{ Name = "Nomad RPC";      Protocol = "TCP"; Port = 4647 },
    @{ Name = "Nomad Serf";     Protocol = "TCP"; Port = 4648 },
    @{ Name = "Nomad Serf UDP"; Protocol = "UDP"; Port = 4648 }
)
try {
    foreach ($rule in $firewallRules) {
        # Skip rules that already exist so re-runs do not pile up duplicates.
        if (-not (Get-NetFirewallRule -DisplayName $rule.Name -ErrorAction SilentlyContinue)) {
            New-NetFirewallRule -DisplayName $rule.Name -Direction Inbound -Protocol $rule.Protocol -LocalPort $rule.Port -Action Allow -ErrorAction Stop | Out-Null
        }
    }
    Write-Host "✅ 防火墙规则已配置" -ForegroundColor Green
} catch {
    Write-Host "⚠️  防火墙规则配置可能失败,请手动检查" -ForegroundColor Yellow
}

# 7. Create the Windows service.
Write-Host "🔧 创建 Windows 服务..." -ForegroundColor Cyan

$serviceName = "Nomad"
$serviceDisplayName = "HashiCorp Nomad"
$serviceDescription = "HashiCorp Nomad Agent"
$serviceCommand = "`"$nomadExe`" agent -config=`"$configFile`""

# Stop and delete an existing service first (if any).
if (Get-Service -Name $serviceName -ErrorAction SilentlyContinue) {
    Stop-Service -Name $serviceName -ErrorAction SilentlyContinue
    & sc.exe delete $serviceName 2>$null | Out-Null
}

try {
    # New-Service takes the binary path as a single string, avoiding the
    # embedded-quote argument passing that sc.exe would need, and
    # -ErrorAction Stop makes a failure reach the catch block.
    New-Service -Name $serviceName -BinaryPathName $serviceCommand -DisplayName $serviceDisplayName -Description $serviceDescription -StartupType Automatic -ErrorAction Stop | Out-Null

    # Configure service recovery: restart after 5s, failure count resets after 30s.
    & sc.exe failure $serviceName reset= 30 actions= restart/5000/restart/5000/restart/5000 | Out-Null
    if ($LASTEXITCODE -ne 0) {
        Write-Host "⚠️  服务恢复选项配置失败" -ForegroundColor Yellow
    }

    Write-Host "✅ Windows 服务已创建" -ForegroundColor Green
} catch {
    Write-Host "❌ 创建服务失败: $_" -ForegroundColor Red
    exit 1
}

# 8. Start the service.
Write-Host "🚀 启动 Nomad 服务..." -ForegroundColor Cyan
try {
    Start-Service -Name $serviceName -ErrorAction Stop
    Write-Host "✅ Nomad 服务已启动" -ForegroundColor Green
} catch {
    Write-Host "❌ 启动服务失败: $_" -ForegroundColor Red
    Write-Host "检查服务状态: Get-Service Nomad" -ForegroundColor Yellow
    exit 1
}

# 9. Verify the service state.
Write-Host "🔍 验证 Nomad 服务..." -ForegroundColor Cyan
Start-Sleep -Seconds 10

try {
    $serviceStatus = Get-Service -Name $serviceName -ErrorAction Stop
    if ($serviceStatus.Status -eq "Running") {
        Write-Host "✅ Nomad 服务运行正常" -ForegroundColor Green
    } else {
        Write-Host "❌ Nomad 服务状态异常: $($serviceStatus.Status)" -ForegroundColor Red
    }
} catch {
    Write-Host "❌ 检查服务状态失败: $_" -ForegroundColor Red
}

# 10. Check cluster membership.
Write-Host "🌐 检查集群连接..." -ForegroundColor Cyan
Start-Sleep -Seconds 15

$joined = $false
try {
    & $nomadExe server members
    # A non-zero exit from a native command does not raise an exception.
    $joined = ($LASTEXITCODE -eq 0)
} catch {
    $joined = $false
}

if ($joined) {
    Write-Host "✅ 成功加入 Nomad 集群!" -ForegroundColor Green
} else {
    Write-Host "⚠️  正在连接集群,请稍等..." -ForegroundColor Yellow
    Write-Host "可以运行以下命令检查状态:" -ForegroundColor Cyan
    Write-Host "  nomad server members" -ForegroundColor White
    Write-Host "  nomad node status" -ForegroundColor White
}

Write-Host ""
Write-Host "🎉 Windows Nomad Server 设置完成!" -ForegroundColor Green
Write-Host "📊 Web UI: http://${tailscaleIP}:4646" -ForegroundColor Cyan
Write-Host "🔧 配置文件: $configFile" -ForegroundColor Cyan
Write-Host "📝 服务管理:" -ForegroundColor Cyan
Write-Host "  启动: Start-Service Nomad" -ForegroundColor White
Write-Host "  停止: Stop-Service Nomad" -ForegroundColor White
Write-Host "  状态: Get-Service Nomad" -ForegroundColor White
Write-Host "  日志: Get-EventLog -LogName Application -Source Nomad" -ForegroundColor White
|  | @ -1,174 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| # OpenTofu 设置脚本 | ||||
| set -euo pipefail | ||||
| 
 | ||||
# ANSI color escape sequences used by the log_* helpers.
# Declared readonly so later code cannot overwrite them by accident.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
| 
 | ||||
| # 日志函数 | ||||
# Print an informational message prefixed with a blue [INFO] tag.
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout
log_info() {
    # %b expands the escape sequences in the color codes only; the message
    # goes through %s so backslashes in it are never interpreted.
    printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$1"
}
| 
 | ||||
# Print a success message prefixed with a green [SUCCESS] tag.
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout
log_success() {
    # %b expands the escape sequences in the color codes only; the message
    # goes through %s so backslashes in it are never interpreted.
    printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "$1"
}
| 
 | ||||
# Print a warning message prefixed with a yellow [WARNING] tag.
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout
log_warning() {
    # %b expands the escape sequences in the color codes only; the message
    # goes through %s so backslashes in it are never interpreted.
    printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "$1"
}
| 
 | ||||
# Print an error message prefixed with a red [ERROR] tag.
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout
log_error() {
    # %b expands the escape sequences in the color codes only; the message
    # goes through %s so backslashes in it are never interpreted.
    printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1"
}
| 
 | ||||
| # 检查 OpenTofu 是否已安装 | ||||
# Check whether the OpenTofu CLI (`tofu`) is available.
# Outputs: status messages via log_info / log_success / log_error
# Returns: 0 when `tofu` is found, 1 otherwise
check_opentofu() {
    log_info "检查 OpenTofu 安装状态..."

    if ! command -v tofu &> /dev/null; then
        log_error "OpenTofu 未安装"
        return 1
    fi

    # Declaration and assignment are separate so `local` does not hide the
    # pipeline's status. The version string is informational only, so a
    # failure to read it is deliberately tolerated.
    local version
    version=$(tofu version | head -n1) || true
    log_success "OpenTofu 已安装: $version"
    return 0
}
| 
 | ||||
| # 检查配置文件 | ||||
# Verify that the tfvars file exists and no longer contains placeholder values.
# Arguments: $1 - optional path to the tfvars file
#                 (default: tofu/environments/dev/terraform.tfvars)
# Outputs:   status messages via the log_* helpers, plus a list of the
#            required settings on stdout when placeholders are found
# Returns:   0 when the file exists and has no placeholders, 1 otherwise
check_config() {
    local config_file="${1:-tofu/environments/dev/terraform.tfvars}"

    log_info "检查配置文件..."

    if [[ ! -f "$config_file" ]]; then
        log_error "配置文件不存在: $config_file"
        log_info "请复制 terraform.tfvars.example 并填入实际配置"
        return 1
    fi

    # Look for the example placeholders. Fixed-string matching with one -e per
    # placeholder avoids the non-portable `\|` alternation in basic regexes.
    if grep -qF \
        -e "your_tenancy_id_here" \
        -e "your_user_id_here" \
        -e "your:key:fingerprint:here" \
        -- "$config_file"; then
        log_warning "配置文件包含示例值,请填入实际的 Oracle Cloud 配置"
        log_info "需要配置以下项目:"
        echo "  - tenancy_ocid: Oracle Cloud 租户 OCID"
        echo "  - user_ocid: 用户 OCID"
        echo "  - fingerprint: API 密钥指纹"
        echo "  - private_key_path: 私钥文件路径"
        echo "  - compartment_ocid: 区间 OCID"
        return 1
    fi

    log_success "配置文件检查通过"
    return 0
}
| 
 | ||||
| # 初始化 OpenTofu | ||||
# Run `tofu init` in tofu/environments/dev after removing any existing
# .terraform directory there.
# Outputs: status messages via the log_* helpers
# Returns: 0 on success, 1 when the directory cannot be entered or init fails
# The caller's working directory is restored on both success and failure.
init_opentofu() {
    local env_dir="tofu/environments/dev"
    local rc=0

    log_info "初始化 OpenTofu..."

    if ! cd "$env_dir"; then
        log_error "无法进入目录: $env_dir"
        return 1
    fi

    # Remove the previous .terraform directory so init starts clean.
    if [[ -d ".terraform" ]]; then
        log_info "清理旧的 .terraform 目录..."
        rm -rf .terraform
    fi

    # Record the outcome instead of returning early, so the `cd -` below
    # always runs.
    if tofu init; then
        log_success "OpenTofu 初始化成功"
    else
        log_error "OpenTofu 初始化失败"
        rc=1
    fi

    cd - > /dev/null || return 1
    return "$rc"
}
| 
 | ||||
| # 验证配置 | ||||
# Run `tofu validate` in tofu/environments/dev.
# Outputs: status messages via the log_* helpers
# Returns: 0 on success, 1 when the directory cannot be entered or
#          validation fails
# The caller's working directory is restored on both success and failure.
validate_config() {
    local env_dir="tofu/environments/dev"
    local rc=0

    log_info "验证 OpenTofu 配置..."

    if ! cd "$env_dir"; then
        log_error "无法进入目录: $env_dir"
        return 1
    fi

    # Record the outcome instead of returning early, so the `cd -` below
    # always runs.
    if tofu validate; then
        log_success "配置验证通过"
    else
        log_error "配置验证失败"
        rc=1
    fi

    cd - > /dev/null || return 1
    return "$rc"
}
| 
 | ||||
| # 生成计划 | ||||
# Run `tofu plan` in tofu/environments/dev using terraform.tfvars and save
# the resulting plan to the file `tfplan` in that directory.
# Outputs: status messages via the log_* helpers
# Returns: 0 on success, 1 when the directory cannot be entered or
#          planning fails
# The caller's working directory is restored on both success and failure.
plan_infrastructure() {
    local env_dir="tofu/environments/dev"
    local rc=0

    log_info "生成基础设施计划..."

    if ! cd "$env_dir"; then
        log_error "无法进入目录: $env_dir"
        return 1
    fi

    # Record the outcome instead of returning early, so the `cd -` below
    # always runs.
    if tofu plan -var-file="terraform.tfvars" -out=tfplan; then
        log_success "计划生成成功"
        log_info "计划文件已保存为 tfplan"
    else
        log_error "计划生成失败"
        rc=1
    fi

    cd - > /dev/null || return 1
    return "$rc"
}
| 
 | ||||
| # 显示帮助信息 | ||||
# Print usage information for this script to stdout.
# The heredoc delimiter is unquoted so $0 expands to the invoked script name.
show_help() {
    cat << EOF
OpenTofu 设置脚本

用法: $0 [选项]

选项:
  init     - 初始化 OpenTofu
  validate - 验证配置
  plan     - 生成执行计划
  check    - 检查环境和配置
  help     - 显示此帮助信息

示例:
  $0 check    # 检查环境
  $0 init     # 初始化项目
  $0 plan     # 生成计划
EOF
}
| 
 | ||||
| # 主函数 | ||||
# Entry point: dispatch on the first command-line argument.
# Arguments: $1 - one of check | init | validate | plan | help
#                 (default: help; unknown values also show help)
# Returns:   the status of the selected action
main() {
    case "${1:-help}" in
        "check")
            # Run both checks even when the first one fails, so a single
            # `check` run reports every problem; fail if either failed.
            local rc=0
            check_opentofu || rc=1
            check_config || rc=1
            return "$rc"
            ;;
        "init")
            check_opentofu || exit 1
            check_config || exit 1
            init_opentofu
            ;;
        "validate")
            # Same tool check as init/plan, so a missing `tofu` is reported
            # clearly instead of as a validation failure.
            check_opentofu || exit 1
            validate_config
            ;;
        "plan")
            check_opentofu || exit 1
            check_config || exit 1
            plan_infrastructure
            ;;
        "help"|*)
            show_help
            ;;
    esac
}
| 
 | ||||
| # 运行主函数 | ||||
| main "$@" | ||||
|  | @ -1,375 +0,0 @@ | |||
| --- | ||||
| # ☢️ NUCLEAR NOMAD RESET ☢️ | ||||
| # 这是比终极还要强的修复脚本 | ||||
| # 警告:这将完全摧毁并重建 Nomad 集群 | ||||
| - name: "☢️ NUCLEAR NOMAD RESET - 核弹级集群重置 ☢️" | ||||
|   hosts: nomad_cluster | ||||
|   become: yes | ||||
|   gather_facts: yes | ||||
|   serial: 1  # 一次处理一个节点,避免同时炸掉所有节点 | ||||
|   vars: | ||||
|     nomad_version: "1.10.5" | ||||
|     nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" | ||||
|     tailscale_ips: | ||||
|       semaphore: "100.116.158.95" | ||||
|       master: "100.117.106.136"  | ||||
|       ash3c: "100.116.80.94" | ||||
|      | ||||
|   tasks: | ||||
|     - name: "🚨 警告:即将进行核弹级重置" | ||||
|       debug: | ||||
|         msg: | | ||||
|           ☢️☢️☢️ 警告:即将对 {{ inventory_hostname }} 进行核弹级重置 ☢️☢️☢️ | ||||
|           这将完全摧毁所有 Nomad 相关的数据、配置和进程! | ||||
|           如果你不确定,请立即按 Ctrl+C 取消! | ||||
|            | ||||
|     - name: "⏰ 等待 10 秒,给你最后的机会取消..." | ||||
|       pause: | ||||
|         seconds: 10 | ||||
| 
 | ||||
|     # ========== 第一阶段:核弹级清理 ========== | ||||
|     - name: "💀 第一阶段:核弹级进程清理" | ||||
|       debug: | ||||
|         msg: "开始核弹级进程清理..." | ||||
| 
 | ||||
|     - name: "🔥 停止 Nomad 服务(如果存在)" | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: stopped | ||||
|         enabled: no | ||||
|         daemon_reload: yes | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: "💣 强制杀死所有 Nomad 相关进程" | ||||
|       shell: | | ||||
|         # 杀死所有 nomad 进程 | ||||
|         pkill -9 -f nomad || true | ||||
|         # 杀死所有可能的子进程 | ||||
|         pkill -9 -f "nomad agent" || true | ||||
|         pkill -9 -f "nomad server" || true | ||||
|         pkill -9 -f "nomad client" || true | ||||
|         # 等待进程完全死亡 | ||||
|         sleep 5 | ||||
|         # 再次确认杀死 | ||||
|         ps aux | grep nomad | grep -v grep | awk '{print $2}' | xargs -r kill -9 || true | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: "🧹 清理所有 Nomad 相关文件和目录" | ||||
|       file: | ||||
|         path: "{{ item }}" | ||||
|         state: absent | ||||
|       loop: | ||||
|         - /opt/nomad | ||||
|         - /etc/nomad.d | ||||
|         - /var/log/nomad | ||||
|         - /etc/systemd/system/nomad.service | ||||
|         - /usr/local/bin/nomad | ||||
|         - /usr/bin/nomad | ||||
|         - /tmp/nomad* | ||||
|         - /var/lib/nomad | ||||
|         - /run/nomad | ||||
|         - /var/run/nomad.pid | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: "🔧 清理 systemd 缓存" | ||||
|       systemd: | ||||
|         daemon_reload: yes | ||||
| 
 | ||||
|     # ========== 第二阶段:重新安装 Nomad ========== | ||||
|     - name: "🚀 第二阶段:重新安装 Nomad" | ||||
|       debug: | ||||
|         msg: "开始重新安装 Nomad..." | ||||
| 
 | ||||
|     - name: "🔑 添加 HashiCorp GPG 密钥" | ||||
|       apt_key: | ||||
|         url: https://apt.releases.hashicorp.com/gpg | ||||
|         state: present | ||||
| 
 | ||||
|     - name: "📦 添加 HashiCorp APT 仓库" | ||||
|       apt_repository: | ||||
|         repo: "deb [arch={{ ansible_architecture }}] https://apt.releases.hashicorp.com {{ ansible_distribution_release }} main" | ||||
|         state: present | ||||
|         update_cache: yes | ||||
| 
 | ||||
|     - name: "🔧 安装 Nomad(自动检测架构)" | ||||
|       apt: | ||||
|         name: "nomad={{ nomad_version }}-1" | ||||
|         state: present | ||||
|         update_cache: yes | ||||
| 
 | ||||
|     - name: "👤 创建 nomad 用户和组" | ||||
|       group: | ||||
|         name: nomad | ||||
|         state: present | ||||
|        | ||||
|     - name: "👤 创建 nomad 用户" | ||||
|       user: | ||||
|         name: nomad | ||||
|         group: nomad | ||||
|         system: yes | ||||
|         shell: /bin/false | ||||
|         home: /opt/nomad | ||||
|         create_home: no | ||||
| 
 | ||||
|     - name: "📁 创建全新的目录结构" | ||||
|       file: | ||||
|         path: "{{ item.path }}" | ||||
|         state: directory | ||||
|         owner: "{{ item.owner | default('nomad') }}" | ||||
|         group: "{{ item.group | default('nomad') }}" | ||||
|         mode: "{{ item.mode | default('0755') }}" | ||||
|       loop: | ||||
|         - { path: "/etc/nomad.d", mode: "0755" } | ||||
|         - { path: "/opt/nomad", mode: "0755" } | ||||
|         - { path: "/opt/nomad/data", mode: "0755" } | ||||
|         - { path: "/opt/nomad/alloc_mounts", mode: "0755" } | ||||
|         - { path: "/var/log/nomad", mode: "0755" } | ||||
| 
 | ||||
|     # ========== 第三阶段:网络和防火墙检查 ========== | ||||
|     - name: "🌐 第三阶段:网络配置验证" | ||||
|       debug: | ||||
|         msg: "验证网络配置..." | ||||
| 
 | ||||
|     - name: "🔍 检查 Tailscale IP 是否正确绑定" | ||||
|       shell: | | ||||
|         ip addr show | grep "{{ tailscale_ips[inventory_hostname] }}" || echo "IP_NOT_FOUND" | ||||
|       register: ip_check | ||||
|        | ||||
|     - name: "⚠️ IP 地址检查结果" | ||||
|       debug: | ||||
|         msg: | | ||||
|           节点: {{ inventory_hostname }} | ||||
|           期望 IP: {{ tailscale_ips[inventory_hostname] }} | ||||
|           检查结果: {{ ip_check.stdout }} | ||||
|           {% if 'IP_NOT_FOUND' in ip_check.stdout %} | ||||
|           ❌ 警告:IP 地址未正确绑定! | ||||
|           {% else %} | ||||
|           ✅ IP 地址检查通过 | ||||
|           {% endif %} | ||||
| 
 | ||||
|     - name: "🔥 确保防火墙端口开放" | ||||
|       shell: | | ||||
|         # 检查并开放 Nomad 端口 | ||||
|         if command -v ufw >/dev/null 2>&1; then | ||||
|           ufw allow 4646/tcp  # HTTP API | ||||
|           ufw allow 4647/tcp  # RPC | ||||
|           ufw allow 4648/tcp  # Serf | ||||
|         elif command -v firewall-cmd >/dev/null 2>&1; then | ||||
|           firewall-cmd --permanent --add-port=4646/tcp | ||||
|           firewall-cmd --permanent --add-port=4647/tcp | ||||
|           firewall-cmd --permanent --add-port=4648/tcp | ||||
|           firewall-cmd --reload | ||||
|         fi | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     # ========== 第四阶段:创建超强配置 ========== | ||||
|     - name: "⚙️ 第四阶段:创建超强配置文件" | ||||
|       debug: | ||||
|         msg: "创建超强配置文件..." | ||||
| 
 | ||||
|     - name: "📝 创建核弹级 Nomad 配置" | ||||
|       copy: | ||||
|         content: | | ||||
|           # ☢️ 核弹级 Nomad 配置 - {{ inventory_hostname }} | ||||
|           datacenter = "dc1" | ||||
|           region     = "global" | ||||
|           data_dir   = "/opt/nomad/data" | ||||
|            | ||||
|           # 使用正确的 Tailscale IP | ||||
|           bind_addr = "{{ tailscale_ips[inventory_hostname] }}" | ||||
|            | ||||
|           # 日志配置 | ||||
|           log_level = "INFO" | ||||
|           log_file = "/var/log/nomad/nomad.log" | ||||
|           log_rotate_duration = "24h" | ||||
|           log_rotate_max_files = 5 | ||||
|            | ||||
|           server { | ||||
|             enabled          = true | ||||
|             bootstrap_expect = 3 | ||||
|             encrypt          = "{{ nomad_encrypt_key }}" | ||||
|              | ||||
|             # 更激进的重试配置 | ||||
|             server_join { | ||||
|               retry_join = [ | ||||
|                 "{{ tailscale_ips.semaphore }}:4647", | ||||
|                 "{{ tailscale_ips.master }}:4647", | ||||
|                 "{{ tailscale_ips.ash3c }}:4647" | ||||
|               ] | ||||
|               retry_max = 10 | ||||
|               retry_interval = "15s" | ||||
|             } | ||||
|              | ||||
|             # 更宽松的心跳配置 | ||||
|             heartbeat_grace = "30s" | ||||
|             min_heartbeat_ttl = "10s" | ||||
|             max_heartbeats_per_second = 50.0 | ||||
|              | ||||
|             # Raft 配置优化 | ||||
|             raft_protocol = 3 | ||||
|             raft_multiplier = 1 | ||||
|           } | ||||
|            | ||||
|           client { | ||||
|             enabled = true | ||||
|              | ||||
|             # 网络接口配置 | ||||
|             network_interface = "tailscale0" | ||||
|              | ||||
|             # 更宽松的心跳配置 | ||||
|             max_kill_timeout = "30s" | ||||
|              | ||||
|             # 主机卷配置 | ||||
|             host_volume "docker-sock" { | ||||
|               path      = "/var/run/docker.sock" | ||||
|               read_only = false | ||||
|             } | ||||
|           } | ||||
|            | ||||
| 
 | ||||
|            | ||||
|           # 地址和端口配置 | ||||
|           addresses { | ||||
|             http = "0.0.0.0" | ||||
|             rpc  = "{{ tailscale_ips[inventory_hostname] }}" | ||||
|             serf = "{{ tailscale_ips[inventory_hostname] }}" | ||||
|           } | ||||
|            | ||||
|           ports { | ||||
|             http = 4646 | ||||
|             rpc  = 4647 | ||||
|             serf = 4648 | ||||
|           } | ||||
|            | ||||
|           # Docker 插件配置 | ||||
|           plugin "docker" { | ||||
|             config { | ||||
|               allow_privileged = true | ||||
|               volumes { | ||||
|                 enabled = true | ||||
|               } | ||||
|                | ||||
|               # 更宽松的资源限制 | ||||
|               gc { | ||||
|                 image       = true | ||||
|                 image_delay = "10m" | ||||
|                 container   = true | ||||
|                 dangling_containers { | ||||
|                   enabled        = true | ||||
|                   dry_run        = false | ||||
|                   period         = "5m" | ||||
|                   creation_grace = "5m" | ||||
|                 } | ||||
|               } | ||||
|             } | ||||
|           } | ||||
|            | ||||
|           # 遥测配置 | ||||
|           telemetry { | ||||
|             collection_interval = "10s" | ||||
|             disable_hostname = false | ||||
|             prometheus_metrics = true | ||||
|             publish_allocation_metrics = true | ||||
|             publish_node_metrics = true | ||||
|           } | ||||
|         dest: "/etc/nomad.d/nomad.hcl" | ||||
|         owner: nomad | ||||
|         group: nomad | ||||
|         mode: '0640' | ||||
| 
 | ||||
|     # ========== 第五阶段:创建超强 systemd 服务 ========== | ||||
|     - name: "🔧 创建超强 systemd 服务文件" | ||||
|       copy: | ||||
|         content: | | ||||
|           [Unit] | ||||
|           Description=Nomad - Nuclear Edition | ||||
|           Documentation=https://www.nomadproject.io/ | ||||
|           Wants=network-online.target | ||||
|           After=network-online.target | ||||
|           ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl | ||||
|            | ||||
|           [Service] | ||||
|           Type=notify | ||||
|           User=nomad | ||||
|           Group=nomad | ||||
|           ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl | ||||
|           ExecReload=/bin/kill -HUP $MAINPID | ||||
|           KillMode=process | ||||
|           Restart=always | ||||
|           RestartSec=10 | ||||
|           LimitNOFILE=65536 | ||||
|            | ||||
|           # 更强的重启策略 | ||||
|           StartLimitInterval=0 | ||||
|           StartLimitBurst=10 | ||||
|            | ||||
|           # 环境变量 | ||||
|           Environment=NOMAD_DISABLE_UPDATE_CHECK=1 | ||||
|            | ||||
|           [Install] | ||||
|           WantedBy=multi-user.target | ||||
|         dest: "/etc/systemd/system/nomad.service" | ||||
|         owner: root | ||||
|         group: root | ||||
|         mode: '0644' | ||||
| 
 | ||||
|     - name: "🔄 重新加载 systemd" | ||||
|       systemd: | ||||
|         daemon_reload: yes | ||||
| 
 | ||||
|     # ========== 第六阶段:启动和验证 ========== | ||||
|     - name: "🚀 第六阶段:启动服务" | ||||
|       debug: | ||||
|         msg: "启动 Nomad 服务..." | ||||
| 
 | ||||
|     - name: "🔥 启用并启动 Nomad 服务" | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         enabled: yes | ||||
|         state: started | ||||
|         daemon_reload: yes | ||||
| 
 | ||||
|     - name: "⏰ 等待服务启动" | ||||
|       pause: | ||||
|         seconds: 15 | ||||
| 
 | ||||
|     - name: "🔍 验证服务状态" | ||||
|       systemd: | ||||
|         name: nomad | ||||
|       register: nomad_service_status | ||||
| 
 | ||||
|     - name: "📊 显示服务状态" | ||||
|       debug: | ||||
|         msg: | | ||||
|           ☢️ 核弹级重置完成! | ||||
|           节点: {{ inventory_hostname }} | ||||
|           服务状态: {{ nomad_service_status.status.ActiveState }} | ||||
|           IP 地址: {{ tailscale_ips[inventory_hostname] }} | ||||
|            | ||||
|           {% if nomad_service_status.status.ActiveState == 'active' %} | ||||
|           ✅ 服务启动成功! | ||||
|           {% else %} | ||||
|           ❌ 服务启动失败,请检查日志! | ||||
|           {% endif %} | ||||
| 
 | ||||
|     - name: "🧹 清理临时文件" | ||||
|       file: | ||||
|         path: "{{ item }}" | ||||
|         state: absent | ||||
|       loop: | ||||
|         - "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip" | ||||
|         - "/tmp/nomad" | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: "🎉 核弹级重置完成通知" | ||||
|       debug: | ||||
|         msg: | | ||||
|           ☢️☢️☢️ 核弹级重置完成!☢️☢️☢️ | ||||
|            | ||||
|           节点 {{ inventory_hostname }} 已经被完全摧毁并重建! | ||||
|            | ||||
|           下一步: | ||||
|           1. 等待所有节点完成重置 | ||||
|           2. 检查集群状态:nomad server members | ||||
|           3. 检查节点状态:nomad node status | ||||
|           4. 如果还有问题,那就真的没救了... 😅 | ||||
|  | @ -1,189 +0,0 @@ | |||
| --- | ||||
| - name: Complete Nomad Cluster Fix with Ansible | ||||
|   hosts: nomad_cluster | ||||
|   become: yes | ||||
|   gather_facts: yes | ||||
|   vars: | ||||
|     nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" | ||||
|     tailscale_ips: | ||||
|       semaphore: "100.116.158.95" | ||||
|       master: "100.117.106.136"  | ||||
|       ash3c: "100.116.80.94" | ||||
|      | ||||
|   tasks: | ||||
|     - name: Stop nomad service completely | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: stopped | ||||
|         enabled: yes | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Kill any remaining nomad processes | ||||
|       shell: pkill -f nomad || true | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Reset systemd failure state | ||||
|       shell: systemctl reset-failed nomad | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Create nomad user if not exists | ||||
|       user: | ||||
|         name: nomad | ||||
|         system: yes | ||||
|         shell: /bin/false | ||||
|         home: /opt/nomad | ||||
|         create_home: no | ||||
| 
 | ||||
|     - name: Create all required directories with correct permissions | ||||
|       file: | ||||
|         path: "{{ item }}" | ||||
|         state: directory | ||||
|         owner: nomad | ||||
|         group: nomad | ||||
|         mode: '0755' | ||||
|       loop: | ||||
|         - /opt/nomad | ||||
|         - /opt/nomad/data | ||||
|         - /opt/nomad/alloc_mounts | ||||
|         - /var/log/nomad | ||||
|         - /etc/nomad.d | ||||
| 
 | ||||
|     - name: Completely clean nomad data directory | ||||
|       shell: rm -rf /opt/nomad/data/* /opt/nomad/data/.* | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Create correct nomad configuration | ||||
|       copy: | ||||
|         content: | | ||||
|           datacenter = "dc1" | ||||
|           region     = "global" | ||||
|           data_dir   = "/opt/nomad/data" | ||||
|            | ||||
|           bind_addr = "{{ tailscale_ips[inventory_hostname] }}" | ||||
|            | ||||
|           server { | ||||
|             enabled          = true | ||||
|             bootstrap_expect = 3 | ||||
|             encrypt          = "{{ nomad_encrypt_key }}" | ||||
|              | ||||
|             server_join { | ||||
|               retry_join = [ | ||||
|                 "{{ tailscale_ips.semaphore }}:4647", | ||||
|                 "{{ tailscale_ips.master }}:4647", | ||||
|                 "{{ tailscale_ips.ash3c }}:4647" | ||||
|               ] | ||||
|               retry_interval = "15s" | ||||
|               retry_max      = 3 | ||||
|             } | ||||
|           } | ||||
|            | ||||
|           client { | ||||
|             enabled = true | ||||
|             alloc_dir = "/opt/nomad/alloc_mounts" | ||||
|           } | ||||
|            | ||||
|           ui { | ||||
|             enabled = true | ||||
|           } | ||||
|            | ||||
|           addresses { | ||||
|             http = "0.0.0.0" | ||||
|             rpc  = "{{ tailscale_ips[inventory_hostname] }}" | ||||
|             serf = "{{ tailscale_ips[inventory_hostname] }}" | ||||
|           } | ||||
|            | ||||
|           ports { | ||||
|             http = 4646 | ||||
|             rpc  = 4647 | ||||
|             serf = 4648 | ||||
|           } | ||||
|            | ||||
|           plugin "docker" { | ||||
|             config { | ||||
|               allow_privileged = true | ||||
|               volumes { | ||||
|                 enabled = true | ||||
|               } | ||||
|             } | ||||
|           } | ||||
|            | ||||
|           log_level = "INFO" | ||||
|           log_file  = "/var/log/nomad/nomad.log" | ||||
|           log_rotate_duration = "24h" | ||||
|           log_rotate_max_files = 5 | ||||
|         dest: /etc/nomad.d/nomad.hcl | ||||
|         owner: nomad | ||||
|         group: nomad | ||||
|         mode: '0640' | ||||
| 
 | ||||
|     - name: Set correct ownership for all nomad files | ||||
|       file: | ||||
|         path: "{{ item }}" | ||||
|         owner: nomad | ||||
|         group: nomad | ||||
|         recurse: yes | ||||
|       loop: | ||||
|         - /opt/nomad | ||||
|         - /var/log/nomad | ||||
|         - /etc/nomad.d | ||||
| 
 | ||||
|     - name: Validate nomad configuration | ||||
|       shell: nomad config validate /etc/nomad.d/nomad.hcl | ||||
|       register: config_validation | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Show config validation result | ||||
|       debug: | ||||
|         var: config_validation | ||||
| 
 | ||||
|     - name: Start nomad service on first node (semaphore) | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
|         daemon_reload: yes | ||||
|       when: inventory_hostname == 'semaphore' | ||||
| 
 | ||||
|     - name: Wait for first node to start | ||||
|       pause: | ||||
|         seconds: 30 | ||||
|       when: inventory_hostname == 'semaphore' | ||||
| 
 | ||||
|     - name: Start nomad service on remaining nodes | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
|         daemon_reload: yes | ||||
|       when: inventory_hostname != 'semaphore' | ||||
| 
 | ||||
|     - name: Wait for all services to start | ||||
|       pause: | ||||
|         seconds: 20 | ||||
| 
 | ||||
|     - name: Check nomad service status | ||||
|       shell: systemctl status nomad --no-pager -l | ||||
|       register: service_status | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Show service status | ||||
|       debug: | ||||
|         var: service_status.stdout_lines | ||||
| 
 | ||||
|     - name: Check nomad logs for errors | ||||
|       shell: journalctl -u nomad -n 10 --no-pager | ||||
|       register: nomad_logs | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Show recent nomad logs | ||||
|       debug: | ||||
|         var: nomad_logs.stdout_lines | ||||
| 
 | ||||
|     - name: Test nomad connectivity | ||||
|       shell: nomad server members | ||||
|       register: nomad_members | ||||
|       ignore_errors: yes | ||||
|       when: inventory_hostname == 'semaphore' | ||||
| 
 | ||||
|     - name: Show cluster members | ||||
|       debug: | ||||
|         var: nomad_members.stdout_lines | ||||
|       when: inventory_hostname == 'semaphore' | ||||
|  | @ -1,151 +0,0 @@ | |||
| --- | ||||
| - name: Complete Nomad Cluster Reset and Rebuild | ||||
|   hosts: nomad_cluster | ||||
|   become: yes | ||||
|   serial: 1  # 一次处理一个节点 | ||||
|   vars: | ||||
|     nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" | ||||
|     tailscale_ips: | ||||
|       semaphore: "100.116.158.95" | ||||
|       master: "100.117.106.136"  | ||||
|       ash3c: "100.116.80.94" | ||||
|      | ||||
|   tasks: | ||||
|     - name: Stop nomad service completely | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: stopped | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Kill any remaining nomad processes | ||||
|       shell: pkill -f nomad || true | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Remove all nomad data and state | ||||
|       shell: | | ||||
|         rm -rf /opt/nomad/data/* | ||||
|         rm -rf /opt/nomad/data/.* | ||||
|         rm -rf /var/log/nomad/* | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Create fresh nomad configuration with correct Tailscale IPs | ||||
|       copy: | ||||
|         content: | | ||||
|           datacenter = "dc1" | ||||
|           region     = "global" | ||||
|           data_dir   = "/opt/nomad/data" | ||||
|            | ||||
|           # 使用 Tailscale IP 地址 | ||||
|           bind_addr = "{{ tailscale_ips[inventory_hostname] }}" | ||||
|            | ||||
|           server { | ||||
|             enabled          = true | ||||
|             bootstrap_expect = 3 | ||||
|             encrypt          = "{{ nomad_encrypt_key }}" | ||||
|              | ||||
|             server_join { | ||||
|               retry_join = [ | ||||
|                 "{{ tailscale_ips.semaphore }}", | ||||
|                 "{{ tailscale_ips.master }}", | ||||
|                 "{{ tailscale_ips.ash3c }}" | ||||
|               ] | ||||
|             } | ||||
|           } | ||||
|            | ||||
|           client { | ||||
|             enabled = true | ||||
|             network_interface = "tailscale0" | ||||
|           } | ||||
|            | ||||
|           ui_config { | ||||
|             enabled = true | ||||
|           } | ||||
|            | ||||
|           addresses { | ||||
|             http = "0.0.0.0" | ||||
|             rpc  = "{{ tailscale_ips[inventory_hostname] }}" | ||||
|             serf = "{{ tailscale_ips[inventory_hostname] }}" | ||||
|           } | ||||
|            | ||||
|           ports { | ||||
|             http = 4646 | ||||
|             rpc  = 4647 | ||||
|             serf = 4648 | ||||
|           } | ||||
|            | ||||
|           plugin "docker" { | ||||
|             config { | ||||
|               allow_privileged = true | ||||
|               volumes { | ||||
|                 enabled = true | ||||
|               } | ||||
|             } | ||||
|           } | ||||
|            | ||||
|           log_level = "INFO" | ||||
|           log_file  = "/var/log/nomad/nomad.log" | ||||
|         dest: /etc/nomad.d/nomad.hcl | ||||
|         owner: nomad | ||||
|         group: nomad | ||||
|         mode: '0640' | ||||
| 
 | ||||
|     - name: Ensure log directory exists | ||||
|       file: | ||||
|         path: /var/log/nomad | ||||
|         state: directory | ||||
|         owner: nomad | ||||
|         group: nomad | ||||
|         mode: '0755' | ||||
| 
 | ||||
|     - name: Start nomad service | ||||
|       systemd: | ||||
|         name: nomad | ||||
|         state: started | ||||
|         enabled: yes | ||||
| 
 | ||||
|     - name: Wait for nomad to start | ||||
|       wait_for: | ||||
|         port: 4646 | ||||
|         host: "{{ tailscale_ips[inventory_hostname] }}" | ||||
|         delay: 5 | ||||
|         timeout: 30 | ||||
| 
 | ||||
|     - name: Check nomad service status | ||||
|       shell: systemctl status nomad --no-pager -l | ||||
|       register: nomad_status | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Display nomad status | ||||
|       debug: | ||||
|         var: nomad_status.stdout_lines | ||||
| 
 | ||||
| - name: Wait for cluster to form | ||||
|   hosts: localhost | ||||
|   gather_facts: no | ||||
|   tasks: | ||||
|     - name: Wait for cluster formation | ||||
|       pause: | ||||
|         seconds: 30 | ||||
|         prompt: "等待集群形成..." | ||||
| 
 | ||||
| - name: Verify cluster status | ||||
|   hosts: semaphore | ||||
|   become: yes | ||||
|   tasks: | ||||
|     - name: Check cluster members | ||||
|       shell: nomad server members | ||||
|       register: cluster_members | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Display cluster members | ||||
|       debug: | ||||
|         var: cluster_members.stdout_lines | ||||
| 
 | ||||
|     - name: Check node status | ||||
|       shell: nomad node status | ||||
|       register: node_status | ||||
|       ignore_errors: yes | ||||
| 
 | ||||
|     - name: Display node status | ||||
|       debug: | ||||
|         var: node_status.stdout_lines | ||||
|  | @ -1,233 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| # Consul 集群管理脚本 | ||||
| # 提供集群状态检查、重启、停止等功能 | ||||
| 
 | ||||
| set -e | ||||
| 
 | ||||
| SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | ||||
| PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" | ||||
| INVENTORY_FILE="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini" | ||||
| 
 | ||||
| # 颜色定义 | ||||
| RED='\033[0;31m' | ||||
| GREEN='\033[0;32m' | ||||
| YELLOW='\033[1;33m' | ||||
| BLUE='\033[0;34m' | ||||
| NC='\033[0m' # No Color | ||||
| 
 | ||||
| # 打印带颜色的消息 | ||||
# Print an informational message prefixed with a green [INFO] tag.
# Arguments: $1 - message text (backslash escapes are expanded)
print_status() {
    printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
| 
 | ||||
# Print a warning message prefixed with a yellow [WARN] tag.
# Arguments: $1 - message text (backslash escapes are expanded)
print_warning() {
    printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
| 
 | ||||
# Print an error message prefixed with a red [ERROR] tag.
# Written to stderr so diagnostics never mix with command output that a
# caller may capture or pipe.
# Arguments: $1 - message text (backslash escapes are expanded)
print_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
}
| 
 | ||||
# Print a blue section banner of the form "=== title ===".
# Arguments: $1 - banner title (backslash escapes are expanded)
print_header() {
    printf '%b\n' "${BLUE}=== $1 ===${NC}"
}
| 
 | ||||
| # 检查必要文件 | ||||
# Abort with exit status 1 unless the inventory file exists and the ansible
# command is available.
# Globals: INVENTORY_FILE (read)
check_prerequisites() {
    [[ -f "$INVENTORY_FILE" ]] || {
        print_error "清单文件不存在: $INVENTORY_FILE"
        exit 1
    }

    command -v ansible > /dev/null 2>&1 || {
        print_error "未找到 ansible 命令"
        exit 1
    }
}
| 
 | ||||
| # 显示帮助信息 | ||||
# Print the usage text for the Consul cluster management script, followed by
# one trailing blank line.
show_help() {
    cat << EOF
Consul 集群管理脚本

用法: $0 [命令]

命令:
  status      - 检查集群状态
  members     - 显示集群成员
  leader      - 显示集群领导者
  restart     - 重启 Consul 服务
  stop        - 停止 Consul 服务
  start       - 启动 Consul 服务
  logs        - 查看服务日志
  health      - 健康检查
  cleanup     - 清理 Consul 数据(危险操作)
  help        - 显示此帮助信息

EOF
}
| 
 | ||||
| # 检查集群状态 | ||||
# Report, for every host in the consul_cluster group, whether the consul unit
# is active and which consul processes are running.
# Globals: INVENTORY_FILE (read)
check_status() {
    local -a shell_on_cluster=(-i "$INVENTORY_FILE" consul_cluster -m shell)

    print_header "Consul 服务状态"
    ansible "${shell_on_cluster[@]}" -a "systemctl is-active consul" -o

    echo
    print_header "Consul 进程状态"
    ansible "${shell_on_cluster[@]}" -a "ps aux | grep consul | grep -v grep" -o
}
| 
 | ||||
| # 显示集群成员 | ||||
# Run `consul members` on every host in the consul_cluster group.
# Globals: INVENTORY_FILE (read)
show_members() {
    local remote_cmd="consul members"

    print_header "Consul 集群成员"
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "$remote_cmd" -o
}
| 
 | ||||
| # 显示集群领导者 | ||||
# Show the Raft peer list and the leader reported by the local HTTP API on
# every host in the consul_cluster group.
# Globals: INVENTORY_FILE (read)
show_leader() {
    local -a shell_on_cluster=(-i "$INVENTORY_FILE" consul_cluster -m shell)

    print_header "Consul 集群领导者"
    ansible "${shell_on_cluster[@]}" -a "consul operator raft list-peers" -o

    echo
    print_header "通过 API 检查领导者"
    ansible "${shell_on_cluster[@]}" -a "curl -s http://localhost:8500/v1/status/leader" -o
}
| 
 | ||||
| # 重启服务 | ||||
# Restart the consul unit on every host in the consul_cluster group after an
# interactive y/Y confirmation, wait 10 seconds, then print the cluster status.
# Anything other than y/Y (including end-of-input) cancels the operation.
# Globals: INVENTORY_FILE (read)
restart_service() {
    local confirm

    print_header "重启 Consul 服务"
    print_warning "即将重启所有 Consul 节点..."
    # -r keeps backslashes literal; EOF on stdin is treated as "no" instead of
    # aborting the script under `set -e`.
    read -r -p "确认继续? (y/N): " confirm || confirm=""
    if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
        print_status "操作已取消"
        return
    fi

    ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=restarted" -b

    print_status "等待服务启动..."
    sleep 10
    check_status
}
| 
 | ||||
| # 停止服务 | ||||
# Stop the consul unit on every host in the consul_cluster group after an
# interactive y/Y confirmation. Anything else (including end-of-input) cancels.
# Globals: INVENTORY_FILE (read)
stop_service() {
    local confirm

    print_header "停止 Consul 服务"
    print_warning "即将停止所有 Consul 节点..."
    # -r keeps backslashes literal; EOF on stdin is treated as "no" instead of
    # aborting the script under `set -e`.
    read -r -p "确认继续? (y/N): " confirm || confirm=""
    if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
        print_status "操作已取消"
        return
    fi

    ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=stopped" -b
}
| 
 | ||||
| # 启动服务 | ||||
# Start the consul unit on every host in the consul_cluster group, wait
# 10 seconds, then print the cluster status.
# Globals: INVENTORY_FILE (read)
start_service() {
    local -a start_args=(
        -i "$INVENTORY_FILE" consul_cluster
        -m systemd -a "name=consul state=started" -b
    )

    print_header "启动 Consul 服务"
    ansible "${start_args[@]}"

    print_status "等待服务启动..."
    sleep 10
    check_status
}
| 
 | ||||
| # 查看日志 | ||||
# Print the last 20 journal lines of the consul unit from every host in the
# consul_cluster group.
# Globals: INVENTORY_FILE (read)
show_logs() {
    local remote_cmd="journalctl -u consul --no-pager -n 20"

    print_header "Consul 服务日志"
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "$remote_cmd" -o
}
| 
 | ||||
| # 健康检查 | ||||
# Run four probes on every host in the consul_cluster group: unit state,
# listener on :8500, member-list line count, and HTTP status of the leader
# endpoint. A blank line separates consecutive probes.
# Globals: INVENTORY_FILE (read)
health_check() {
    local -a labels=(
        "检查服务状态..."
        "检查端口监听..."
        "检查集群成员..."
        "检查 API 响应..."
    )
    local -a probes=(
        "systemctl is-active consul"
        "ss -tlnp | grep :8500"
        "consul members | wc -l"
        "curl -s -o /dev/null -w '%{http_code}' http://localhost:8500/v1/status/leader"
    )
    local idx

    print_header "Consul 健康检查"

    for idx in "${!probes[@]}"; do
        if (( idx > 0 )); then
            echo
        fi
        print_status "${labels[idx]}"
        ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "${probes[idx]}" -o
    done
}
| 
 | ||||
| # 清理数据(危险操作) | ||||
# DANGEROUS: stop consul on every host in the consul_cluster group, delete
# everything under /opt/consul/data, and start consul again.
# Proceeds only when the operator types exactly YES; anything else (including
# end-of-input) cancels.
# Globals: INVENTORY_FILE (read)
cleanup_data() {
    local confirm

    print_header "清理 Consul 数据"
    print_error "警告: 此操作将删除所有 Consul 数据,包括服务注册、KV 存储等!"
    print_error "此操作不可逆!"
    echo
    # -r keeps backslashes literal; EOF on stdin is treated as "no" instead of
    # aborting the script under `set -e`.
    read -r -p "确认要清理所有数据? 请输入 'YES' 确认: " confirm || confirm=""
    if [[ "$confirm" != "YES" ]]; then
        print_status "操作已取消"
        return
    fi

    print_status "停止 Consul 服务..."
    ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=stopped" -b

    print_status "清理数据目录..."
    ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "rm -rf /opt/consul/data/*" -b

    print_status "启动 Consul 服务..."
    ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=started" -b

    print_status "数据清理完成"
}
| 
 | ||||
| # 主函数 | ||||
# Entry point: map the first argument to an action and run it.
# The command is validated before the prerequisite check, so `help` and the
# unknown-command error work even when ansible or the inventory file is
# missing. With no argument the help text is shown.
# Arguments: $1 - command name (default: help)
# Exits with status 1 on an unknown command.
main() {
    local action

    case "${1:-help}" in
        status)  action=check_status ;;
        members) action=show_members ;;
        leader)  action=show_leader ;;
        restart) action=restart_service ;;
        stop)    action=stop_service ;;
        start)   action=start_service ;;
        logs)    action=show_logs ;;
        health)  action=health_check ;;
        cleanup) action=cleanup_data ;;
        help|--help|-h)
            show_help
            return 0
            ;;
        *)
            print_error "未知命令: $1"
            echo
            show_help
            exit 1
            ;;
    esac

    # Only commands that actually talk to the cluster need ansible + inventory.
    check_prerequisites
    "$action"
}
| 
 | ||||
| main "$@" | ||||
|  | @ -1,228 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| # Consul 密钥管理脚本 | ||||
| # 用于安全地管理 Oracle Cloud 和其他云服务商的敏感配置 | ||||
| 
 | ||||
| set -euo pipefail | ||||
| 
 | ||||
| # 配置 | ||||
| CONSUL_ADDR="${CONSUL_ADDR:-http://localhost:8500}" | ||||
| CONSUL_TOKEN="${CONSUL_TOKEN:-}" | ||||
| ENVIRONMENT="${ENVIRONMENT:-dev}" | ||||
| 
 | ||||
| # 颜色输出 | ||||
| RED='\033[0;31m' | ||||
| GREEN='\033[0;32m' | ||||
| YELLOW='\033[1;33m' | ||||
| BLUE='\033[0;34m' | ||||
| NC='\033[0m' # No Color | ||||
| 
 | ||||
| # 日志函数 | ||||
# Print an informational message prefixed with a blue [INFO] tag.
# Arguments: $1 - message text (backslash escapes are expanded)
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
| 
 | ||||
# Print a success message prefixed with a green [SUCCESS] tag.
# Arguments: $1 - message text (backslash escapes are expanded)
log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
| 
 | ||||
# Print a warning message prefixed with a yellow [WARNING] tag.
# Arguments: $1 - message text (backslash escapes are expanded)
log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
| 
 | ||||
# Print an error message prefixed with a red [ERROR] tag.
# Written to stderr so diagnostics never mix with data a caller may capture
# from stdout.
# Arguments: $1 - message text (backslash escapes are expanded)
log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
}
| 
 | ||||
| # 检查 Consul 连接 | ||||
# Verify that the Consul HTTP API answers at CONSUL_ADDR; exit 1 otherwise.
# `--fail` makes curl return non-zero on HTTP error responses as well, so a
# 4xx/5xx reply is not mistaken for a healthy connection.
# Globals: CONSUL_ADDR (read)
check_consul() {
    log_info "检查 Consul 连接..."
    if ! curl --silent --fail "${CONSUL_ADDR}/v1/status/leader" > /dev/null; then
        log_error "无法连接到 Consul: ${CONSUL_ADDR}"
        exit 1
    fi
    log_success "Consul 连接正常"
}
| 
 | ||||
| # 设置 Oracle Cloud 配置 | ||||
# Prompt for the Oracle Cloud credentials and store them in the Consul KV store
# under config/<ENVIRONMENT>/oracle/.
# - All prompted values are function-local.
# - The private key is uploaded straight from the file (`--data-binary @path`)
#   so its content never appears in the curl argument list.
# - Every write uses `--fail`; the first rejected write aborts with exit 1
#   instead of being reported as success.
# NOTE(review): CONSUL_TOKEN is configured at the top of the script but is not
# sent with any request in this file.
# Globals: CONSUL_ADDR, ENVIRONMENT (read)
set_oracle_config() {
    local tenancy_ocid user_ocid fingerprint private_key_path compartment_ocid
    local base_path="config/${ENVIRONMENT}/oracle"
    local kv_url="${CONSUL_ADDR}/v1/kv/${base_path}"
    local key

    log_info "设置 Oracle Cloud 配置..."

    echo "请输入 Oracle Cloud 配置信息:"

    read -r -p "租户 OCID: " tenancy_ocid
    read -r -p "用户 OCID: " user_ocid
    read -r -p "API 密钥指纹: " fingerprint
    read -r -p "私钥文件路径: " private_key_path
    read -r -p "区间 OCID: " compartment_ocid

    # The key file must exist before anything is written to Consul.
    if [[ ! -f "$private_key_path" ]]; then
        log_error "私钥文件不存在: $private_key_path"
        exit 1
    fi

    # Plain values: the KV key name equals the local variable name.
    for key in tenancy_ocid user_ocid fingerprint compartment_ocid; do
        if ! curl --silent --fail -X PUT "${kv_url}/${key}" -d "${!key}" > /dev/null; then
            log_error "写入 Consul 失败: ${base_path}/${key}"
            exit 1
        fi
    done

    # Private key: sent byte-for-byte from the file.
    if ! curl --silent --fail -X PUT "${kv_url}/private_key" \
        --data-binary "@${private_key_path}" > /dev/null; then
        log_error "写入 Consul 失败: ${base_path}/private_key"
        exit 1
    fi

    log_success "Oracle Cloud 配置已存储到 Consul"
}
| 
 | ||||
| # 获取 Oracle Cloud 配置 | ||||
# Print the Oracle Cloud settings stored under config/<ENVIRONMENT>/oracle/.
# Each value is fetched with `--fail`, so a missing key (HTTP 404) or an
# unreachable server shows up as "未设置" rather than as an empty value.
# Only the first line of the private key is displayed.
# Globals: CONSUL_ADDR, ENVIRONMENT (read)
get_oracle_config() {
    local base_path="config/${ENVIRONMENT}/oracle"
    local -a labels=("租户 OCID" "用户 OCID" "指纹" "区间 OCID" "私钥")
    local -a keys=(tenancy_ocid user_ocid fingerprint compartment_ocid private_key)
    local idx value

    log_info "从 Consul 获取 Oracle Cloud 配置..."

    echo "Oracle Cloud 配置:"
    for idx in "${!keys[@]}"; do
        if ! value=$(curl --silent --fail \
            "${CONSUL_ADDR}/v1/kv/${base_path}/${keys[idx]}?raw" 2>/dev/null); then
            value="未设置"
        elif [[ "${keys[idx]}" == "private_key" ]]; then
            # Show the PEM header line only, never the key material.
            value=${value%%$'\n'*}
        fi
        echo "${labels[idx]}: ${value}"
    done
}
| 
 | ||||
| # 删除 Oracle Cloud 配置 | ||||
# Delete every key under config/<ENVIRONMENT>/oracle/ after a y/Y confirmation.
# - The prefix ends in "/" so a recursive delete cannot also remove sibling
#   prefixes that merely start with "oracle" (e.g. "oracle_backup/...").
# - `--fail` turns a rejected delete into exit 1 instead of a false success.
# - Anything other than y/Y (including end-of-input) cancels.
# Globals: CONSUL_ADDR, ENVIRONMENT (read)
delete_oracle_config() {
    local confirm
    local base_path="config/${ENVIRONMENT}/oracle"

    log_warning "删除 Oracle Cloud 配置..."

    read -r -p "确定要删除所有 Oracle Cloud 配置吗?(y/N): " confirm || confirm=""
    if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
        log_info "操作已取消"
        return
    fi

    if ! curl --silent --fail -X DELETE \
        "${CONSUL_ADDR}/v1/kv/${base_path}/?recurse" > /dev/null; then
        log_error "删除 Oracle Cloud 配置失败"
        exit 1
    fi

    log_success "Oracle Cloud 配置已删除"
}
| 
 | ||||
| # 生成 Terraform 变量文件 | ||||
# Build infrastructure/environments/<ENVIRONMENT>/terraform.tfvars.consul from
# the values stored in Consul, and write the private key to
# /tmp/oci_private_key_<ENVIRONMENT>.pem (the path cleanup_temp_files removes).
# - Declarations are separated from the curl calls so a failed fetch is not
#   masked by `local`; a failed fetch yields an empty value.
# - The key file is created under umask 077, so it is never readable by other
#   users, not even briefly before the chmod.
# - A failed key download aborts instead of leaving an empty key file behind.
# - The output directory is created when missing.
# NOTE(review): CONSUL_TOKEN is configured at the top of the script but is not
# sent with any request in this file.
# Globals: CONSUL_ADDR, ENVIRONMENT (read)
generate_terraform_vars() {
    local base_path="config/${ENVIRONMENT}/oracle"
    local kv_url="${CONSUL_ADDR}/v1/kv/${base_path}"
    local output_file="infrastructure/environments/${ENVIRONMENT}/terraform.tfvars.consul"
    local temp_key_file="/tmp/oci_private_key_${ENVIRONMENT}.pem"
    local tenancy_ocid user_ocid fingerprint compartment_ocid

    log_info "生成 Terraform 变量文件..."

    # Fetch the plain values from Consul.
    tenancy_ocid=$(curl --silent --fail "${kv_url}/tenancy_ocid?raw" 2>/dev/null) || tenancy_ocid=""
    user_ocid=$(curl --silent --fail "${kv_url}/user_ocid?raw" 2>/dev/null) || user_ocid=""
    fingerprint=$(curl --silent --fail "${kv_url}/fingerprint?raw" 2>/dev/null) || fingerprint=""
    compartment_ocid=$(curl --silent --fail "${kv_url}/compartment_ocid?raw" 2>/dev/null) || compartment_ocid=""

    if [[ -z "$tenancy_ocid" ]]; then
        log_error "Consul 中没有找到 Oracle Cloud 配置"
        exit 1
    fi

    # Recreate the key file from scratch with owner-only permissions.
    rm -f -- "$temp_key_file"
    if ! (umask 077 && curl --silent --fail "${kv_url}/private_key?raw" > "$temp_key_file"); then
        rm -f -- "$temp_key_file"
        log_error "无法从 Consul 读取私钥"
        exit 1
    fi
    chmod 600 "$temp_key_file"

    mkdir -p -- "$(dirname -- "$output_file")"

    # Generate the Terraform variable file.
    cat > "$output_file" << EOF
# 从 Consul 生成的 Oracle Cloud 配置
# 生成时间: $(date)
# 环境: ${ENVIRONMENT}

oci_config = {
  tenancy_ocid     = "$tenancy_ocid"
  user_ocid        = "$user_ocid"
  fingerprint      = "$fingerprint"
  private_key_path = "$temp_key_file"
  region           = "ap-seoul-1"
  compartment_ocid = "$compartment_ocid"
}
EOF

    log_success "Terraform 变量文件已生成: $output_file"
    log_warning "私钥文件位置: $temp_key_file"
    log_warning "请在使用完毕后删除临时私钥文件"
}
| 
 | ||||
| # 清理临时文件 | ||||
# Remove the temporary private-key files under /tmp and every generated
# terraform.tfvars.consul file below infrastructure/environments/.
cleanup_temp_files() {
    log_info "清理临时文件..."

    rm -f /tmp/oci_private_key_*.pem \
        infrastructure/environments/*/terraform.tfvars.consul

    log_success "临时文件已清理"
}
| 
 | ||||
| # 显示帮助信息 | ||||
# Print the usage text for the Consul secrets management script: available
# options, the environment variables it reads, and example invocations.
show_help() {
    local -a help_lines=(
        "Consul 密钥管理脚本"
        ""
        "用法: $0 [选项]"
        ""
        "选项:"
        "    set-oracle      设置 Oracle Cloud 配置到 Consul"
        "    get-oracle      从 Consul 获取 Oracle Cloud 配置"
        "    delete-oracle   从 Consul 删除 Oracle Cloud 配置"
        "    generate-vars   从 Consul 生成 Terraform 变量文件"
        "    cleanup         清理临时文件"
        "    help           显示此帮助信息"
        ""
        "环境变量:"
        "    CONSUL_ADDR     Consul 地址 (默认: http://localhost:8500)"
        "    CONSUL_TOKEN    Consul ACL Token (可选)"
        "    ENVIRONMENT     环境名称 (默认: dev)"
        ""
        "示例:"
        "    # 设置 Oracle Cloud 配置"
        "    $0 set-oracle"
        "    "
        "    # 生成 Terraform 变量文件"
        "    $0 generate-vars"
        "    "
        "    # 查看配置"
        "    $0 get-oracle"
        "    "
        "    # 清理临时文件"
        "    $0 cleanup"
    )

    printf '%s\n' "${help_lines[@]}"
}
| 
 | ||||
# Entry point: dispatch the first command-line argument to its sub-command.
#
# Arguments:
#   $1 - sub-command name; defaults to "help" when omitted.
# Returns:
#   The status of the selected sub-command, or 1 when the sub-command is not
#   recognised (previously a typo silently printed the help text and
#   succeeded, which hid mistakes in automation).
main() {
    local subcommand="${1:-help}"

    case "$subcommand" in
        set-oracle)
            check_consul
            set_oracle_config
            ;;
        get-oracle)
            check_consul
            get_oracle_config
            ;;
        delete-oracle)
            check_consul
            delete_oracle_config
            ;;
        generate-vars)
            check_consul
            generate_terraform_vars
            ;;
        cleanup)
            # Purely local; does not need a reachable Consul.
            cleanup_temp_files
            ;;
        help|-h|--help)
            show_help
            ;;
        *)
            log_error "未知选项: ${subcommand}"
            show_help
            return 1
            ;;
    esac
}
| 
 | ||||
| main "$@" | ||||
|  | @ -1,115 +0,0 @@ | |||
---
# Resets every node of the Nomad cluster to a known configuration.
# Play 1 stops Nomad, wipes its data directory and rewrites
# /etc/nomad.d/nomad.hcl; play 2 starts the agents one node at a time.
- name: Correct Nomad Cluster Configuration
  hosts: nomad_cluster
  become: true
  gather_facts: true
  vars:
    # NOTE(review): the encryption key is committed in plain text; consider
    # moving it to Ansible Vault and rotating it.
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    # Address of each node, keyed by inventory hostname. Anchored so the
    # second play can reuse the same map: play-level vars are scoped to the
    # play that declares them.
    tailscale_ips: &tailscale_ips
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"

  tasks:
    - name: Stop nomad service
      systemd:
        name: nomad
        state: stopped
      # Best effort: the unit may not be running or installed yet.
      ignore_errors: true

    - name: Clean nomad data
      file:
        path: /opt/nomad/data
        state: absent

    - name: Recreate nomad data directory
      file:
        path: /opt/nomad/data
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'

    - name: Create correct nomad configuration
      copy:
        content: |
          datacenter = "dc1"
          region     = "global"
          data_dir   = "/opt/nomad/data"

          bind_addr = "{{ tailscale_ips[inventory_hostname] }}"

          server {
            enabled          = true
            bootstrap_expect = 3
            encrypt          = "{{ nomad_encrypt_key }}"

            server_join {
              retry_join = [
                "{{ tailscale_ips.semaphore }}:4647",
                "{{ tailscale_ips.master }}:4647",
                "{{ tailscale_ips.ash3c }}:4647"
              ]
              retry_interval = "15s"
              retry_max      = 3
            }
          }

          client {
            enabled = true
            alloc_dir = "/opt/nomad/alloc_mounts"
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc  = "{{ tailscale_ips[inventory_hostname] }}"
            serf = "{{ tailscale_ips[inventory_hostname] }}"
          }

          ports {
            http = 4646
            rpc  = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }

          log_level = "INFO"
          log_file  = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

- name: Start nomad services in sequence
  hosts: nomad_cluster
  become: true
  # One node at a time.
  serial: 1
  vars:
    # Same map as in the first play; without it the wait_for task below
    # would reference an undefined variable.
    tailscale_ips: *tailscale_ips
  tasks:
    - name: Start nomad service
      systemd:
        name: nomad
        state: started
        daemon_reload: true

    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ tailscale_ips[inventory_hostname] }}"
        delay: 10
        timeout: 60

    - name: Wait between nodes
      pause:
        seconds: 30
|  | @ -1,113 +0,0 @@ | |||
---
# Writes the Nomad agent configuration to every cluster node, validates it,
# then starts the "semaphore" node before the remaining nodes.
- name: Deploy Nomad Configurations
  hosts: nomad_cluster
  become: true
  vars:
    # NOTE(review): the encryption key is committed in plain text; consider
    # moving it to Ansible Vault and rotating it.
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    # Address of each node, keyed by inventory hostname.
    node_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"

  tasks:
    - name: Create nomad configuration for each node
      copy:
        content: |
          datacenter = "dc1"
          region     = "global"
          data_dir   = "/opt/nomad/data"

          bind_addr = "{{ node_ips[inventory_hostname] }}"

          server {
            enabled          = true
            bootstrap_expect = 3
            encrypt          = "{{ nomad_encrypt_key }}"

            server_join {
              retry_join = [
                "{{ node_ips.semaphore }}:4647",
                "{{ node_ips.master }}:4647",
                "{{ node_ips.ash3c }}:4647"
              ]
              retry_interval = "15s"
              retry_max      = 3
            }
          }

          client {
            enabled = true
            alloc_dir = "/opt/nomad/alloc_mounts"
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc  = "{{ node_ips[inventory_hostname] }}"
            serf = "{{ node_ips[inventory_hostname] }}"
          }

          ports {
            http = 4646
            rpc  = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }

          log_level = "INFO"
          log_file  = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    # No shell features are needed, so run the binary directly; the check is
    # read-only and must not be reported as a change.
    - name: Validate nomad configuration
      command: nomad config validate /etc/nomad.d/nomad.hcl
      register: config_validation
      changed_when: false

    - name: Show validation result
      debug:
        var: config_validation.stdout_lines

    - name: Start nomad service on bootstrap node first
      systemd:
        name: nomad
        state: started
        daemon_reload: true
      when: inventory_hostname == 'semaphore'

    # No per-host condition here. NOTE(review): pause appears to run once for
    # the whole batch rather than once per host, so a `when` on
    # inventory_hostname would only be checked against the first host and
    # could skip the wait entirely - confirm against the pause module docs.
    - name: Wait for bootstrap node
      pause:
        seconds: 15

    - name: Start nomad service on other nodes
      systemd:
        name: nomad
        state: started
        daemon_reload: true
      when: inventory_hostname != 'semaphore'

    - name: Wait for services to start
      pause:
        seconds: 10

    # Informational only: a non-zero status must not fail the play and the
    # command changes nothing on the host.
    - name: Check service status
      command: systemctl status nomad --no-pager
      register: service_status
      changed_when: false
      ignore_errors: true

    - name: Show service status
      debug:
        var: service_status.stdout_lines
|  | @ -0,0 +1,33 @@ | |||
#!/bin/bash

# Disk monitoring script
# Usage: ./disk-monitor.sh [threshold]
#
# Runs the ncdu disk-analysis playbook against the production inventory, then
# asks every node for the file systems whose usage exceeds the threshold
# (an integer percentage, default 85) and prints them grouped by node.

THRESHOLD=${1:-85}  # default threshold: 85%
INVENTORY_FILE="configuration/inventories/production/nomad-cluster.ini"

# The threshold is interpolated into a command that runs on every node, so
# accept nothing but a plain integer percentage.
if ! [[ "$THRESHOLD" =~ ^[0-9]+$ ]]; then
    echo "❌ 阈值必须是 0-100 之间的整数: ${THRESHOLD}" >&2
    exit 1
fi
THRESHOLD=$((10#$THRESHOLD))  # force base 10 so "085" is not read as octal
if (( THRESHOLD > 100 )); then
    echo "❌ 阈值必须是 0-100 之间的整数: ${THRESHOLD}" >&2
    exit 1
fi

# awk filter applied to `df` output: skip the header and keep rows whose fifth
# column exceeds the threshold. "$5+0" forces a numeric comparison ("90%"
# converts to 90); comparing the field after gsub() may be done as strings,
# where "100" sorts before "85". Rows are printed unmodified.
USAGE_FILTER="awk 'NR>1 && \$5+0 > ${THRESHOLD}'"
# -P keeps every file system on a single line even for long device names.
REMOTE_CMD="df -hP | ${USAGE_FILTER}"

# Read ansible ad-hoc output on stdin and print the high-usage disks.
#   "<host> | ... >>"  header of a host's command output: remember the host
#                      and announce it only if a /dev line follows
#   "... => ..."       failure / unreachable report: always shown
#   "/dev..."          a file system above the threshold
report_high_usage() {
    local line host="" announced=0
    while IFS= read -r line; do
        if [[ $line == *"=>"* ]]; then
            echo "🚨 节点: $line"
        elif [[ $line == *" | "*">>"* ]]; then
            host=${line%% | *}
            announced=0
        elif [[ $line =~ ^/dev ]]; then
            if [[ -n $host ]] && (( ! announced )); then
                echo "🚨 节点: $host"
                announced=1
            fi
            echo "   高使用率磁盘: $line"
        fi
    done
}

echo "🔍 开始磁盘空间监控 (阈值: ${THRESHOLD}%)"
echo "=================================="

# Run the disk analysis; a failure is reported but does not stop the usage
# check below.
echo "📊 运行磁盘分析..."
if ! ansible-playbook -i "$INVENTORY_FILE" configuration/playbooks/disk-analysis-ncdu.yml; then
    echo "⚠️  磁盘分析失败,继续检查磁盘使用率" >&2
fi

echo ""
echo "⚠️  检查高磁盘使用率节点..."

# Check disk usage on all nodes.
ansible all -i "$INVENTORY_FILE" -m shell -a "$REMOTE_CMD" | report_high_usage

echo ""
echo "💡 如需清理,运行:"
echo "   ansible-playbook -i $INVENTORY_FILE configuration/playbooks/disk-cleanup.yml"
echo ""
echo "📁 详细报告位置: /tmp/disk-analysis/"
echo "   使用 ncdu -f /tmp/disk-analysis/ncdu-root-<hostname>.json 查看详细信息"
|  | @ -1,190 +0,0 @@ | |||
---
# Rebuilds the Nomad agent on every cluster node from a shared template.
# The template play comes first because the `template` task in the second
# play reads /tmp/nomad-server.hcl.j2 from the control node.
- name: Create nomad configuration template
  hosts: localhost
  gather_facts: false
  tasks:
    - name: Create nomad server template
      copy:
        # The body is wrapped in raw/endraw so the Jinja expressions are
        # written out literally here; they are rendered later by the
        # `template` task, which is where the nomad_* variables are defined.
        content: |
          {% raw %}
          datacenter = "{{ nomad_datacenter }}"
          region     = "{{ nomad_region }}"
          data_dir   = "{{ nomad_data_dir }}"

          bind_addr = "{{ nomad_bind_addr }}"

          server {
            enabled          = true
            bootstrap_expect = {{ nomad_bootstrap_expect }}
            encrypt          = "{{ nomad_encrypt }}"

            server_join {
              retry_join = {{ nomad_retry_join | to_json }}
              retry_interval = "15s"
              retry_max      = 3
            }
          }

          client {
            enabled = true
            alloc_dir = "{{ nomad_alloc_dir }}"
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc  = "{{ nomad_bind_addr }}"
            serf = "{{ nomad_bind_addr }}"
          }

          ports {
            http = 4646
            rpc  = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }

          log_level = "INFO"
          log_file  = "{{ nomad_log_file }}"
          {% endraw %}
        dest: /tmp/nomad-server.hcl.j2

- name: Final Complete Nomad Cluster Fix
  hosts: nomad_cluster
  become: true
  gather_facts: true
  vars:
    # NOTE(review): the encryption key is committed in plain text; consider
    # moving it to Ansible Vault and rotating it.
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    nomad_servers:
      - "100.116.158.95:4647"  # semaphore
      - "100.117.106.136:4647" # master
      - "100.116.80.94:4647"   # ash3c

  tasks:
    - name: Stop nomad service
      systemd:
        name: nomad
        state: stopped
      # Best effort: the unit may not be running or installed yet.
      ignore_errors: true

    # Clear a previous "failed" state of the unit; harmless when the unit is
    # not in that state or does not exist yet.
    - name: Reset failed nomad service
      command: systemctl reset-failed nomad
      changed_when: false
      failed_when: false

    - name: Create nomad user if not exists
      user:
        name: nomad
        system: true
        shell: /bin/false
        home: /opt/nomad
        create_home: false

    # Wipe old state first so the directory loop below recreates
    # /opt/nomad/data with the right owner and mode.
    - name: Clean old nomad data
      file:
        path: /opt/nomad/data
        state: absent

    - name: Create nomad directories with correct permissions
      file:
        path: "{{ item }}"
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'
      loop:
        - /etc/nomad.d
        - /opt/nomad
        - /opt/nomad/data
        - /opt/nomad/alloc_mounts
        - /var/log/nomad

    # Read-only probe; the pipeline needs a shell. An absent tailscale0
    # interface yields empty output, handled by the fallback below.
    - name: Get Tailscale IP address
      shell: ip addr show tailscale0 | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1
      register: tailscale_ip
      changed_when: false
      failed_when: false

    - name: Set bind address (fallback to default interface if tailscale not available)
      set_fact:
        bind_address: "{{ tailscale_ip.stdout if tailscale_ip.stdout != '' else ansible_default_ipv4.address }}"

    - name: Generate nomad configuration
      template:
        # Absolute path on the control node, written by the first play.
        src: /tmp/nomad-server.hcl.j2
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'
      vars:
        nomad_datacenter: "dc1"
        nomad_region: "global"
        nomad_data_dir: "/opt/nomad/data"
        nomad_bind_addr: "{{ bind_address }}"
        nomad_bootstrap_expect: 3
        nomad_encrypt: "{{ nomad_encrypt_key }}"
        nomad_retry_join: "{{ nomad_servers }}"
        nomad_alloc_dir: "/opt/nomad/alloc_mounts"
        nomad_log_file: "/var/log/nomad/nomad.log"

    - name: Create nomad systemd service
      copy:
        content: |
          [Unit]
          Description=Nomad
          Documentation=https://www.nomadproject.io/
          Requires=network-online.target
          After=network-online.target
          ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl

          [Service]
          Type=notify
          User=nomad
          Group=nomad
          ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
          ExecReload=/bin/kill -HUP $MAINPID
          KillMode=process
          Restart=on-failure
          LimitNOFILE=65536

          [Install]
          WantedBy=multi-user.target
        dest: /etc/systemd/system/nomad.service
        mode: '0644'

    - name: Reload systemd daemon
      systemd:
        daemon_reload: true

    - name: Start nomad service
      systemd:
        name: nomad
        state: started
        enabled: true

    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ bind_address }}"
        delay: 5
        timeout: 30
      # Informational: a slow start should not fail the whole run.
      ignore_errors: true
|  | @ -1,111 +0,0 @@ | |||
---
# Stops Nomad on every cluster node, wipes its state, writes a fresh
# /etc/nomad.d/nomad.hcl bound to the node's Tailscale address and starts the
# service again.
- name: Final Nomad Cluster Fix
  hosts: nomad_cluster
  become: true
  vars:
    # NOTE(review): the encryption key is committed in plain text; consider
    # moving it to Ansible Vault and rotating it.
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
    # Address of each node, keyed by inventory hostname.
    tailscale_ips:
      semaphore: "100.116.158.95"
      master: "100.117.106.136"
      ash3c: "100.116.80.94"

  tasks:
    - name: Stop nomad service
      systemd:
        name: nomad
        state: stopped
      # Best effort: the unit may not be running or installed yet.
      ignore_errors: true

    # Remove the data directory with the file module instead of
    # `rm -rf ... /*` through a shell with errors ignored: hidden entries are
    # removed too and a real failure is no longer swallowed. The directory is
    # recreated by the next task.
    - name: Clean nomad data
      file:
        path: /opt/nomad/data
        state: absent

    - name: Create required directories
      file:
        path: "{{ item }}"
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'
      loop:
        - /opt/nomad/data
        - /opt/nomad/alloc_mounts
        - /var/log/nomad

    - name: Create working nomad configuration
      copy:
        content: |
          datacenter = "dc1"
          region     = "global"
          data_dir   = "/opt/nomad/data"

          bind_addr = "{{ tailscale_ips[inventory_hostname] }}"

          server {
            enabled          = true
            bootstrap_expect = 3
            encrypt          = "{{ nomad_encrypt_key }}"

            server_join {
              retry_join = [
                "{{ tailscale_ips.semaphore }}",
                "{{ tailscale_ips.master }}",
                "{{ tailscale_ips.ash3c }}"
              ]
            }
          }

          client {
            enabled = true
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc  = "{{ tailscale_ips[inventory_hostname] }}"
            serf = "{{ tailscale_ips[inventory_hostname] }}"
          }

          ports {
            http = 4646
            rpc  = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }

          log_level = "INFO"
          log_file  = "/var/log/nomad/nomad.log"
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    - name: Start nomad service
      systemd:
        name: nomad
        state: started
        enabled: true

    - name: Wait for service to start
      pause:
        seconds: 10

    # Informational only: a non-zero status must not fail the play and the
    # command changes nothing on the host.
    - name: Check service status
      command: systemctl status nomad --no-pager -l
      register: service_status
      changed_when: false
      ignore_errors: true

    - name: Show service status
      debug:
        var: service_status.stdout_lines
|  | @ -1,137 +0,0 @@ | |||
#!/bin/bash

# ash3c IP address repair script
#
# Inspects the network state of the ash3c node over SSH, renders a Nomad
# agent configuration pinned to the node's expected address, uploads it,
# wipes the Nomad data directory and restarts the service.
#
# Optional environment overrides (the defaults keep the historical values):
#   ASH3C_IP           address written into the Nomad configuration
#   ASH3C_HOST         SSH target host
#   ASH3C_SSH_USER     SSH user
#   ASH3C_SSH_KEY      SSH private key file
#   ASH3C_SUDO_PASS    sudo password of the SSH user on the node
#   NOMAD_ENCRYPT_KEY  value of the `encrypt` setting

set -e

echo "🔧 ash3c IP 地址问题修复脚本"
echo ""

# Expected address of the node and SSH connection details.
CORRECT_IP="${ASH3C_IP:-100.116.80.94}"
ASH3C_HOST="${ASH3C_HOST:-100.116.80.94}"
SSH_USER="${ASH3C_SSH_USER:-ben}"
SSH_KEY="${ASH3C_SSH_KEY:-$HOME/.ssh/id_ed25519}"

# NOTE(review): both defaults below are secrets committed in plain text. They
# are kept only so existing invocations keep working; supply them through the
# environment and rotate the committed values.
SUDO_PASS="${ASH3C_SUDO_PASS:-3131}"
NOMAD_ENCRYPT_KEY="${NOMAD_ENCRYPT_KEY:-NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=}"

REMOTE="${SSH_USER}@${ASH3C_HOST}"
REMOTE_CONF="/tmp/ash3c-nomad.hcl"
# Shell-quoted form of the password, safe to embed in a remote command line.
printf -v SUDO_PASS_Q '%q' "$SUDO_PASS"

# Private, unpredictable local temp file; removed on every exit path,
# including an abort triggered by `set -e`.
LOCAL_CONF=$(mktemp "${TMPDIR:-/tmp}/ash3c-nomad.XXXXXX")
trap 'rm -f -- "$LOCAL_CONF"' EXIT

# Run a command through sudo on the node, feeding the password on stdin.
remote_sudo() {
    ssh -p 22 -i "$SSH_KEY" "$REMOTE" "printf '%s\n' ${SUDO_PASS_Q} | sudo -S $*"
}

echo "📡 检查 ash3c 节点的网络配置..."

# Check which addresses are actually bound on the node.
echo "🔍 检查 ash3c 节点的 IP 地址绑定..."
remote_sudo ip addr show | grep -E "inet.*100\." || echo "❌ 未找到 Tailscale IP"

echo ""
echo "🔍 检查 Tailscale 状态..."
remote_sudo tailscale status || echo "❌ Tailscale 状态检查失败"

echo ""
echo "🔧 修复 ash3c 的 Nomad 配置..."

# Render the corrected configuration.
# NOTE(review): the other cluster configurations in this repository use a
# `ui` block; confirm that `ui_config` is accepted by the deployed Nomad
# version.
cat > "$LOCAL_CONF" << EOF
# 🔧 ash3c 修复后的 Nomad 配置
datacenter = "dc1"
region     = "global"
data_dir   = "/opt/nomad/data"

# 强制使用正确的 Tailscale IP
bind_addr = "${CORRECT_IP}"

# 日志配置
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"

server {
  enabled          = true
  bootstrap_expect = 3
  encrypt          = "${NOMAD_ENCRYPT_KEY}"
  
  server_join {
    retry_join = [
      "100.116.158.95:4647",
      "100.117.106.136:4647", 
      "100.116.80.94:4647"
    ]
    retry_max = 10
    retry_interval = "15s"
  }
  
  # 更宽松的心跳配置
  heartbeat_grace = "30s"
  min_heartbeat_ttl = "10s"
}

client {
  enabled = true
  network_interface = "tailscale0"
}

ui_config {
  enabled = true
}

addresses {
  http = "0.0.0.0"
  rpc  = "${CORRECT_IP}"
  serf = "${CORRECT_IP}"
}

ports {
  http = 4646
  rpc  = 4647
  serf = 4648
}

plugin "docker" {
  config {
    allow_privileged = true
    volumes {
      enabled = true
    }
  }
}
EOF

echo "📤 上传修复后的配置到 ash3c..."
scp -P 22 -i "$SSH_KEY" "$LOCAL_CONF" "${REMOTE}:${REMOTE_CONF}"

echo "🔧 在 ash3c 上应用修复..."
# The here-document is expanded locally (password, remote path); everything
# that must be evaluated on the node is escaped with a backslash.
ssh -p 22 -i "$SSH_KEY" "$REMOTE" bash -s << REMOTE_SCRIPT
set -e
run_sudo() { printf '%s\n' ${SUDO_PASS_Q} | sudo -S "\$@"; }

# Best effort: the service may already be stopped. -x matches the process
# name exactly, so the sudo/pkill wrapper itself is not a target.
run_sudo systemctl stop nomad || true
run_sudo pkill -x nomad || true
sleep 5

# Back up the old configuration (it may not exist).
run_sudo cp /etc/nomad.d/nomad.hcl /etc/nomad.d/nomad.hcl.backup.\$(date +%Y%m%d_%H%M%S) || true

# Install the new configuration.
run_sudo cp ${REMOTE_CONF} /etc/nomad.d/nomad.hcl
run_sudo chown nomad:nomad /etc/nomad.d/nomad.hcl
run_sudo chmod 640 /etc/nomad.d/nomad.hcl
rm -f ${REMOTE_CONF}

# Wipe the data directory; the glob is expanded by the root shell so it also
# works when the directory is not readable by the SSH user.
run_sudo sh -c 'rm -rf /opt/nomad/data/*'

# Restart the service.
run_sudo systemctl daemon-reload
run_sudo systemctl enable nomad
run_sudo systemctl start nomad

echo "✅ ash3c 配置修复完成"
REMOTE_SCRIPT

echo ""
echo "⏰ 等待 ash3c 服务启动..."
sleep 15

echo ""
echo "🔍 检查 ash3c 服务状态..."
remote_sudo systemctl status nomad --no-pager || echo "❌ 服务状态检查失败"

echo ""
echo "🧹 清理临时文件..."
rm -f -- "$LOCAL_CONF"

echo ""
echo "✅ ash3c IP 修复完成!"
echo ""
echo "下一步:"
echo "1. 检查集群状态: nomad server members"
echo "2. 如果还有问题,运行核弹级重置: ./scripts/utilities/nuclear-reset.sh"
|  | @ -1,151 +0,0 @@ | |||
#!/bin/bash

# Consul cluster repair script
# Works through the "No cluster leader" problem of the consul-cluster Docker
# Swarm stack: shows the current state, then redeploys the stack with the
# fixed overlay file (a), redeploys it with the macvlan file (b), or
# force-restarts the existing services (c).

set -e

STACK_NAME="consul-cluster"
STACK_DIR="/root/mgmt/swarm/stacks"

# List the consul services. grep exits non-zero when nothing matches, which
# would abort the script under `set -e`, so report that case instead.
show_consul_services() {
    docker service ls | grep consul || echo "未找到 consul 服务"
}

# Remove the running stack (if any) and give Swarm time to tear it down.
remove_stack() {
    echo "停止现有 Consul 集群..."
    docker stack rm "$STACK_NAME" 2>/dev/null || echo "consul-cluster stack 不存在"

    echo "等待服务完全停止..."
    sleep 10
}

# Deploy the stack from the compose file given as $1 and show the result.
deploy_stack() {
    docker stack deploy -c "$1" "$STACK_NAME"

    echo "等待服务启动..."
    sleep 15

    echo "检查新服务状态..."
    show_consul_services
}

echo "=== Consul 集群修复脚本 ==="
echo "当前时间: $(date)"
echo

# Current service state
echo "1. 检查当前 Consul 服务状态..."
show_consul_services
echo

# Recent log lines of both nodes
echo "2. 检查 Consul 日志中的错误..."
echo "Master 节点日志:"
docker service logs "${STACK_NAME}_consul-master" --tail 5 2>/dev/null || echo "无法获取 master 日志"
echo
echo "Ash3c 节点日志:"
docker service logs "${STACK_NAME}_consul-ash3c" --tail 5 2>/dev/null || echo "无法获取 ash3c 日志"
echo

# Repair options
echo "3. 修复选项:"
echo "   a) 使用修复后的 overlay 网络配置 (推荐)"
echo "   b) 使用 macvlan 网络配置"
echo "   c) 仅重启现有服务"
echo

read -r -p "请选择修复方案 (a/b/c): " choice

case "$choice" in
    a)
        echo "使用修复后的 overlay 网络配置..."

        remove_stack

        # Optionally drop the data volumes
        read -r -p "是否清理现有数据卷? (y/n): " clean_volumes
        if [[ $clean_volumes == "y" ]]; then
            docker volume rm "${STACK_NAME}_consul_master_data" 2>/dev/null || true
            docker volume rm "${STACK_NAME}_consul_ash3c_data" 2>/dev/null || true
            echo "数据卷已清理"
        fi

        echo "部署修复后的 Consul 集群..."
        deploy_stack "${STACK_DIR}/consul-cluster-fixed.yml"
        ;;

    b)
        echo "使用 macvlan 网络配置..."
        echo "注意: 需要根据你的网络环境调整 IP 地址和网络接口"

        echo "当前网络接口:"
        ip link show | grep -E "^[0-9]+:" | awk '{print $2}' | sed 's/://'
        echo

        read -r -p "请输入要使用的网络接口 (如 eth0): " interface
        read -r -p "请输入子网 (如 192.168.1.0/24): " subnet
        read -r -p "请输入网关 (如 192.168.1.1): " gateway

        # The answers are spliced into sed expressions below; reject anything
        # that is not a plain interface name / CIDR / IPv4 address.
        if ! [[ $interface =~ ^[A-Za-z0-9_.:-]+$ ]]; then
            echo "无效的网络接口: $interface" >&2
            exit 1
        fi
        if ! [[ $subnet =~ ^[0-9.]+/[0-9]+$ ]]; then
            echo "无效的子网: $subnet" >&2
            exit 1
        fi
        if ! [[ $gateway =~ ^[0-9.]+$ ]]; then
            echo "无效的网关: $gateway" >&2
            exit 1
        fi

        # Update the macvlan compose file in place. '#' is the sed delimiter
        # because the subnet contains '/', which broke the former
        # '/'-delimited expression; the dots are escaped and the gateway
        # pattern ends at a word boundary so 192.168.1.10 is left alone.
        macvlan_file="${STACK_DIR}/consul-cluster-macvlan.yml"
        sed -i "s#parent: eth0#parent: ${interface}#" "$macvlan_file"
        sed -i "s#192\.168\.1\.0/24#${subnet}#" "$macvlan_file"
        sed -i "s#192\.168\.1\.1\b#${gateway}#" "$macvlan_file"

        remove_stack

        echo "部署 macvlan Consul 集群..."
        deploy_stack "$macvlan_file"
        ;;

    c)
        echo "重启现有服务..."

        docker service update --force "${STACK_NAME}_consul-master"
        docker service update --force "${STACK_NAME}_consul-ash3c"

        echo "等待服务重启..."
        sleep 10

        echo "检查服务状态..."
        show_consul_services
        ;;

    *)
        echo "无效选择,退出"
        exit 1
        ;;
esac

echo
echo "4. 验证修复结果..."
sleep 5

echo "服务状态:"
show_consul_services

echo
echo "等待 30 秒后检查集群状态..."
sleep 30

# Show the latest master log lines as an indication of cluster state
echo "尝试检查集群成员状态..."
timeout 10 docker service logs "${STACK_NAME}_consul-master" --tail 10 2>/dev/null || echo "无法获取日志"

echo
echo "=== 修复完成 ==="
echo "请等待几分钟让集群完全启动,然后访问:"
echo "- Master UI: http://your-master-ip:8500"
echo "- Ash3c UI: http://your-ash3c-ip:8501"
echo
echo "如果问题仍然存在,请检查:"
echo "1. 节点间网络连通性"
echo "2. 防火墙设置"
echo "3. Docker Swarm 网络配置"
|  | @ -1,92 +0,0 @@ | |||
---
# Stops Nomad on every cluster node, wipes its state, writes a fresh
# /etc/nomad.d/nomad.hcl bound to the node's inventory address (falling back
# to its default IPv4 address) and starts the service again.
- name: Fix Nomad Cluster Issues
  hosts: nomad_cluster
  become: true
  vars:
    # NOTE(review): the encryption key is committed in plain text; consider
    # moving it to Ansible Vault and rotating it.
    nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="

  tasks:
    - name: Stop nomad service
      systemd:
        name: nomad
        state: stopped
      # Best effort: the unit may not be running or installed yet.
      ignore_errors: true

    # Remove and recreate the data directory with the file module instead of
    # `rm -rf ... /*` through a shell with errors ignored: hidden entries are
    # removed too and a real failure is no longer swallowed.
    - name: Clean nomad data directory
      file:
        path: /opt/nomad/data
        state: absent

    - name: Recreate nomad data directory
      file:
        path: /opt/nomad/data
        state: directory
        owner: nomad
        group: nomad
        mode: '0755'

    - name: Create correct nomad configuration
      copy:
        content: |
          datacenter = "dc1"
          region     = "global"
          data_dir   = "/opt/nomad/data"

          bind_addr = "{{ ansible_host | default(hostvars[inventory_hostname]['ansible_default_ipv4']['address']) }}"

          server {
            enabled          = true
            bootstrap_expect = 3
            encrypt          = "{{ nomad_encrypt_key }}"

            server_join {
              retry_join = ["100.116.158.95", "100.117.106.136", "100.116.80.94"]
            }
          }

          client {
            enabled = true
            network_interface = "{{ ansible_default_ipv4.interface | default('eth0') }}"
          }

          ui {
            enabled = true
          }

          addresses {
            http = "0.0.0.0"
            rpc  = "0.0.0.0"
            serf = "0.0.0.0"
          }

          ports {
            http = 4646
            rpc  = 4647
            serf = 4648
          }

          plugin "docker" {
            config {
              allow_privileged = true
              volumes {
                enabled = true
              }
            }
          }
        dest: /etc/nomad.d/nomad.hcl
        owner: nomad
        group: nomad
        mode: '0640'

    - name: Start nomad service
      systemd:
        name: nomad
        state: started
        enabled: true

    - name: Wait for nomad to start
      wait_for:
        port: 4646
        host: "{{ ansible_host | default(hostvars[inventory_hostname]['ansible_default_ipv4']['address']) }}"
        delay: 10
        timeout: 60

    # Informational only: a non-zero status must not fail the play and the
    # command changes nothing on the host.
    - name: Check nomad status
      command: systemctl status nomad --no-pager -l
      register: nomad_status
      changed_when: false
      ignore_errors: true

    - name: Display nomad status
      debug:
        var: nomad_status.stdout_lines
|  | @ -1,242 +0,0 @@ | |||
| #!/bin/bash | ||||
| # Gitea 仓库管理脚本 | ||||
| 
 | ||||
| set -e | ||||
| 
 | ||||
| # 配置 | ||||
| GITEA_HOST="gitea" | ||||
| GITEA_USER="ben" | ||||
| GITEA_HTTP_URL="http://${GITEA_HOST}:3000" | ||||
| GITEA_SSH_URL="git@${GITEA_HOST}" | ||||
| REPO_NAME="mgmt" | ||||
| 
 | ||||
| # 颜色定义 | ||||
| RED='\033[0;31m' | ||||
| GREEN='\033[0;32m' | ||||
| YELLOW='\033[1;33m' | ||||
| BLUE='\033[0;34m' | ||||
| NC='\033[0m' # No Color | ||||
| 
 | ||||
# Print a message wrapped in an ANSI colour sequence.
# Globals:   NC (read) - colour reset sequence
# Arguments: $1 - colour escape sequence, $2 - message text
# Outputs:   the coloured message and a newline on stdout
print_message() {
    local color=$1
    local message=$2
    # %b expands the escapes of the colour codes only; the message goes
    # through %s so backslashes in URLs or branch names are printed verbatim.
    printf '%b%s%b\n' "$color" "$message" "$NC"
}
| 
 | ||||
# Probe key-based SSH access to the Gitea host.
# Returns 0 when the server greeting contains "successfully authenticated",
# 1 otherwise. The exit status of ssh itself is ignored; only the greeting
# text is inspected.
check_ssh_connection() {
    print_message "$BLUE" "🔍 检查 Gitea SSH 连接..."

    if ! ssh -o ConnectTimeout=5 -o BatchMode=yes "${GITEA_SSH_URL}" 2>&1 \
        | grep -q "successfully authenticated"; then
        print_message "$RED" "❌ SSH 连接失败"
        return 1
    fi

    print_message "$GREEN" "✅ SSH 连接正常"
    return 0
}
| 
 | ||||
# Report the state of the Git repository containing the current directory:
# whether it is a work tree, its origin URL, current branch and whether the
# working tree is clean. Always returns 0; findings are printed only.
check_repo_status() {
    print_message "$BLUE" "📊 检查仓库状态..."

    # Ask git instead of testing for ./.git: that test fails in
    # subdirectories and in worktrees/submodules where .git is a file.
    if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
        print_message "$RED" "❌ 不是 Git 仓库"
        return 0
    fi
    print_message "$GREEN" "✅ Git 仓库已初始化"

    # Declaration and assignment are split so `local` cannot mask failures.
    local origin_url
    if origin_url=$(git remote get-url origin 2>/dev/null); then
        print_message "$GREEN" "✅ 远程仓库: $origin_url"
    else
        print_message "$YELLOW" "⚠️  未配置远程仓库"
    fi

    local branch
    branch=$(git branch --show-current) || branch=""
    print_message "$BLUE" "📍 当前分支: $branch"

    local pending
    pending=$(git status --porcelain) || pending=""
    if [[ -z "$pending" ]]; then
        print_message "$GREEN" "✅ 工作目录干净"
    else
        print_message "$YELLOW" "⚠️  有未提交的变更"
    fi
}
| 
 | ||||
# Initialise a Git repository in the current directory (if it is not already
# inside one) and make sure an `origin` remote pointing at Gitea exists.
# Globals: GITEA_USER, GITEA_SSH_URL, REPO_NAME (read)
init_repo() {
    print_message "$BLUE" "📦 初始化 Git 仓库..."

    # `git rev-parse` also recognises subdirectories and worktrees; the old
    # ./.git directory test would create a nested repository there.
    if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
        git init
        git config user.name "${GITEA_USER}"
        git config user.email "${GITEA_USER}@example.com"
        print_message "$GREEN" "✅ Git 仓库初始化完成"
    fi

    # Configure the remote only when none is set yet.
    if ! git remote get-url origin >/dev/null 2>&1; then
        git remote add origin "${GITEA_SSH_URL}:${GITEA_USER}/${REPO_NAME}.git"
        print_message "$GREEN" "✅ 远程仓库配置完成"
    fi
}
| 
 | ||||
# Optionally commit local changes (after confirmation) and push to origin.
# Arguments: $1 - branch to push (optional, defaults to "main")
# Returns:   0 on successful push; 1 when the user declines to commit, the
#            commit message is empty, or a git step fails.
sync_code() {
    local branch=${1:-main}
    print_message "$BLUE" "🔄 同步代码..."

    # `git status --porcelain` also lists untracked files, which
    # `git diff --quiet` misses although `git add .` would stage them.
    local pending
    pending=$(git status --porcelain) || return 1
    if [[ -n "$pending" ]]; then
        print_message "$YELLOW" "⚠️  发现未提交的变更"
        git status --short

        read -p "是否提交这些变更? (y/N): " -n 1 -r
        echo
        if [[ ! $REPLY =~ ^[Yy]$ ]]; then
            print_message "$YELLOW" "⚠️  跳过提交"
            return 1
        fi

        local commit_message=""
        read -r -p "请输入提交消息: " commit_message || true
        # git rejects an empty message; fail with a clear error instead of
        # letting `set -e` abort the script mid-way.
        if [[ -z "$commit_message" ]]; then
            print_message "$RED" "❌ 提交消息不能为空"
            return 1
        fi

        git add . || return 1
        git commit -m "$commit_message" || return 1
        print_message "$GREEN" "✅ 变更已提交"
    fi

    if git push origin "$branch"; then
        print_message "$GREEN" "✅ 代码推送成功"
    else
        print_message "$RED" "❌ 代码推送失败"
        return 1
    fi
}
| 
 | ||||
# Pull the latest commits of `main` from origin.
# Returns 1 when `git pull` fails, 0 otherwise.
pull_code() {
    print_message "$BLUE" "⬇️  拉取最新代码..."

    if ! git pull origin main; then
        print_message "$RED" "❌ 代码拉取失败"
        return 1
    fi

    print_message "$GREEN" "✅ 代码拉取成功"
}
| 
 | ||||
# Print the ten most recent commits as a decorated one-line graph.
show_history() {
    local -a log_args=(--oneline --graph --decorate -10)

    print_message "$BLUE" "📜 提交历史:"
    git log "${log_args[@]}"
}
| 
 | ||||
# List every local and remote-tracking branch.
show_branches() {
    local heading="🌿 分支状态:"

    print_message "$BLUE" "$heading"
    git branch -a
}
| 
 | ||||
# Create a new branch and switch to it.
# Arguments: $1 - branch name (optional; prompted for when omitted)
# Returns:   0 on success, 1 when no name was given or checkout failed
create_branch() {
    local branch_name=${1:-}
    if [[ -z "$branch_name" ]]; then
        # -r keeps backslashes literal; EOF simply leaves the name empty.
        read -r -p "请输入分支名称: " branch_name || true
    fi

    if [[ -z "$branch_name" ]]; then
        print_message "$RED" "❌ 分支名称不能为空"
        return 1   # previously reported success despite doing nothing
    fi

    git checkout -b "$branch_name" || return 1
    print_message "$GREEN" "✅ 分支 '$branch_name' 创建成功"
}
| 
 | ||||
# Switch to an existing branch.
# Arguments: $1 - branch name (optional; the branch list is shown and the
#                 name is prompted for when omitted)
# Returns:   0 on success, 1 when no name was given or checkout failed
switch_branch() {
    local branch_name=${1:-}
    if [[ -z "$branch_name" ]]; then
        print_message "$BLUE" "可用分支:"
        git branch -a
        # -r keeps backslashes literal; EOF simply leaves the name empty.
        read -r -p "请输入要切换的分支名称: " branch_name || true
    fi

    if [[ -z "$branch_name" ]]; then
        print_message "$RED" "❌ 分支名称不能为空"
        return 1   # previously reported success despite doing nothing
    fi

    git checkout "$branch_name" || return 1
    print_message "$GREEN" "✅ 已切换到分支 '$branch_name'"
}
| 
 | ||||
# Print the usage text for this script on stdout.
show_help() {
    # Unquoted delimiter on purpose: $0 must expand to the script name.
    cat <<EOF
Gitea 仓库管理脚本

用法: $0 [命令]

命令:
  check       检查连接和仓库状态
  init        初始化仓库
  sync        同步代码到远程仓库
  pull        拉取最新代码
  history     查看提交历史
  branches    查看分支状态
  create-branch [name]  创建新分支
  switch-branch [name]  切换分支
  status      查看仓库状态
  help        显示帮助信息

示例:
  $0 check                    # 检查状态
  $0 sync                     # 同步代码
  $0 create-branch feature-x  # 创建功能分支
EOF
}
| 
 | ||||
# Entry point: dispatch the sub-command given as $1 (default: help).
# Arguments: $1 - command name, $2 - optional branch name
# Returns:   the status of the selected action; exits 1 on unknown commands
main() {
    local command=${1:-help}

    case $command in
        check)
            # Run both checks even when the SSH probe fails; under `set -e`
            # a failing first check used to abort before the repository
            # status was shown. The failure is still reported via the
            # return code.
            local rc=0
            check_ssh_connection || rc=$?
            check_repo_status || rc=$?
            return "$rc"
            ;;
        init)
            init_repo
            ;;
        sync)
            sync_code
            ;;
        pull)
            pull_code
            ;;
        history)
            show_history
            ;;
        branches)
            show_branches
            ;;
        create-branch)
            create_branch "$2"
            ;;
        switch-branch)
            switch_branch "$2"
            ;;
        status)
            check_repo_status
            ;;
        help|--help|-h)
            show_help
            ;;
        *)
            print_message "$RED" "❌ 未知命令: $command"
            show_help
            exit 1
            ;;
    esac
}
| 
 | ||||
| # 执行主函数 | ||||
| main "$@" | ||||
|  | @ -0,0 +1,227 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| # 🚀 Nomad 集群管理脚本 | ||||
| # Nomad Cluster Management Script | ||||
| 
 | ||||
| set -e | ||||
| 
 | ||||
| SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | ||||
| PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" | ||||
| 
 | ||||
| # 颜色定义 | ||||
| RED='\033[0;31m' | ||||
| GREEN='\033[0;32m' | ||||
| YELLOW='\033[1;33m' | ||||
| BLUE='\033[0;34m' | ||||
| PURPLE='\033[0;35m' | ||||
| CYAN='\033[0;36m' | ||||
| NC='\033[0m' # No Color | ||||
| 
 | ||||
# Logging helpers. Each prints "$1" on stdout behind a coloured tag.
# Globals: BLUE, GREEN, YELLOW, RED, PURPLE, NC (read)
# The colour codes go through %b (escapes expanded); the message goes through
# %s so backslashes in paths or command output are printed verbatim instead
# of being interpreted as they were by `echo -e`.
log_info() {
    printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1-}"
}

log_success() {
    printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "${1-}"
}

log_warning() {
    printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "${1-}"
}

log_error() {
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}"
}

# Section header: "=== <title> ===" in purple.
log_header() {
    printf '%b=== %s ===%b\n' "$PURPLE" "${1-}" "$NC"
}
| 
 | ||||
# Print an overview of the local Nomad agent's cluster: leader, node list
# and per-node driver health, queried from http://localhost:4646.
# Returns: 1 when no leader is reported or the API is unreachable, else 0.
show_cluster_status() {
    log_header "Nomad 集群状态概览"

    # Leader: the endpoint answers with a JSON string; `""` means that no
    # leader is elected, so an empty value must not count as success.
    echo -e "${CYAN}Leader 状态:${NC}"
    local leader leader_re='^"(.+)"$'
    leader=$(curl -s http://localhost:4646/v1/status/leader 2>/dev/null) || leader=""
    if [[ "$leader" =~ $leader_re ]]; then
        echo "  ✅ Leader: ${BASH_REMATCH[1]}"
    else
        echo "  ❌ 无 Leader 或连接失败"
        return 1
    fi

    echo ""

    # Fetch the node list once and reuse it for both sections.
    local nodes_json
    nodes_json=$(curl -s http://localhost:4646/v1/nodes 2>/dev/null) || nodes_json=""

    # Node status. jq's `and`/`or` yield booleans, so the marker needs an
    # explicit if/then/else (the old filter always printed "true").
    echo -e "${CYAN}节点状态:${NC}"
    if [[ -z "$nodes_json" ]] || ! jq -r '
        .[]
        | "  \(if .Status == "ready" then "✅" else "❌" end) \(.Name) (\(.Address)) - \(.Status)"
    ' <<<"$nodes_json" 2>/dev/null; then
        log_warning "无法获取节点状态详情"
        nomad node status 2>/dev/null || echo "  ❌ 命令执行失败"
    fi

    echo ""

    # Driver status: emit the node line, then one line per driver. A missing
    # or null Drivers map is treated as empty.
    echo -e "${CYAN}驱动程序状态:${NC}"
    if [[ -z "$nodes_json" ]] || ! jq -r '
        .[]
        | "  节点: \(.Name)",
          ((.Drivers // {})
           | to_entries[]
           | "    \(if .value.Healthy then "✅" else "❌" end) \(.key): \(.value.HealthDescription // "未知")")
    ' <<<"$nodes_json" 2>/dev/null; then
        log_warning "无法获取驱动状态详情"
    fi
}
| 
 | ||||
# List all jobs known to the local Nomad agent with a running/not-running
# marker. Always returns 0; problems are reported through the log helpers.
show_jobs_status() {
    log_header "作业状态"

    # Test curl inside the `if`: as a bare assignment a connection failure
    # would trip `set -e` and kill the script before any status check ran.
    local jobs_json
    if ! jobs_json=$(curl -sf http://localhost:4646/v1/jobs 2>/dev/null); then
        log_error "无法连接到 Nomad API"
        return 0
    fi

    if [[ -z "$jobs_json" || "$jobs_json" == "[]" || "$jobs_json" == "null" ]]; then
        echo "  📝 当前没有运行的作业"
        return 0
    fi

    # jq's `and`/`or` yield booleans; use if/then/else to pick the marker.
    jq -r '
        .[]
        | "  \(if .Status == "running" then "✅" else "❌" end) \(.Name) - \(.Status)"
    ' <<<"$jobs_json" 2>/dev/null || log_warning "无法解析作业状态"
}
| 
 | ||||
# Print the Web UI / API URLs of the cluster and a cheat sheet of commands.
show_access_info() {
    local base_url="http://100.116.158.95:4646"

    log_header "访问信息"

    echo -e "${CYAN}Web UI:${NC}"
    printf '  🌐 %s\n\n' "$base_url"

    echo -e "${CYAN}API 端点:${NC}"
    printf '  🔗 %s/v1/\n\n' "$base_url"

    echo -e "${CYAN}常用命令:${NC}"
    # Quoted delimiter: the text is emitted literally.
    cat <<'EOF'
  📊 nomad status                    # 查看集群概览
  🖥️  nomad node status              # 查看节点状态
  🔧 nomad server members           # 查看服务器成员
  📋 nomad job status <job-name>    # 查看作业状态
  🚀 nomad job run <job-file>       # 运行作业
  📜 journalctl -u nomad -f         # 查看日志
EOF
}
| 
 | ||||
# Run the stand-alone diagnosis script shipped under scripts/utilities.
# Returns 1 when the script is missing, otherwise the script's own status.
run_diagnosis() {
    local diagnosis_script="$PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh"

    log_header "运行完整诊断"

    if [[ ! -f "$diagnosis_script" ]]; then
        log_error "诊断脚本未找到"
        return 1
    fi

    bash "$diagnosis_script"
}
| 
 | ||||
# Apply the Podman driver playbook to every node of the production inventory.
# Returns 1 when the playbook or inventory file is missing; otherwise the
# status of the ansible run. Changes the working directory to
# $PROJECT_ROOT/configuration before invoking ansible.
configure_podman() {
    log_header "配置所有节点使用 Podman 驱动"

    local config_dir="$PROJECT_ROOT/configuration"
    local playbook="$config_dir/playbooks/configure-nomad-podman-cluster.yml"
    local inventory="$config_dir/inventories/production/nomad-cluster.ini"

    [[ -f "$playbook" ]] || {
        log_error "Playbook 文件不存在: $playbook"
        return 1
    }

    [[ -f "$inventory" ]] || {
        log_error "Inventory 文件不存在: $inventory"
        return 1
    }

    cd "$config_dir"
    python3 -m ansible playbook -i "$inventory" "$playbook" -v
}
| 
 | ||||
# Restart the nomad service on every cluster node after an interactive
# confirmation, wait 15 seconds and show the cluster status.
restart_cluster() {
    log_header "重启 Nomad 集群"

    log_warning "这将重启整个 Nomad 集群"
    read -p "确认继续? (y/N): " -n 1 -r
    echo ""

    # Anything other than a single y/Y cancels the operation.
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        log_info "操作已取消"
        return
    fi

    local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
    cd "$PROJECT_ROOT/configuration"
    python3 -m ansible adhoc -i "$inventory" nomad_cluster -m systemd -a "name=nomad state=restarted" --become

    log_info "等待集群启动..."
    sleep 15
    show_cluster_status
}
| 
 | ||||
# Print the interactive main menu.
show_menu() {
    echo ""
    log_header "Nomad 集群管理菜单"
    # Quoted delimiter: the menu text is emitted literally, including the
    # blank lines before and after the entries.
    cat <<'EOF'

1) 📊 显示集群状态
2) 📋 显示作业状态
3) 🔍 运行完整诊断
4) 🐳 配置 Podman 驱动
5) 🔄 重启集群
6) ℹ️  显示访问信息
0) ❌ 退出

EOF
}
| 
 | ||||
# Run one menu action isolated from the menu loop.
# The action runs in a subshell with errexit re-enabled, so it still stops
# at its first failing command, but a failure (or a `cd`) no longer
# terminates or alters the interactive tool. Always returns 0.
_run_menu_action() {
    local rc=0 had_errexit=0
    if [[ $- == *e* ]]; then
        had_errexit=1
    fi

    set +e
    ( set -e; "$@" )
    rc=$?
    if (( had_errexit )); then
        set -e
    fi

    if (( rc != 0 )); then
        log_warning "操作未成功完成 (退出码: ${rc})"
    fi
    return 0
}

# Entry point: show the menu in a loop and dispatch the chosen action until
# the user selects 0. Exits 0 on "0", 1 when stdin reaches EOF.
main() {
    echo ""
    echo "🚀 Nomad 集群管理工具"
    echo "==================="

    local choice
    while true; do
        show_menu
        # EOF on stdin must end the loop even when errexit is off.
        read -r -p "请选择操作 (0-6): " choice || exit 1

        case $choice in
            1)
                _run_menu_action show_cluster_status
                ;;
            2)
                _run_menu_action show_jobs_status
                ;;
            3)
                _run_menu_action run_diagnosis
                ;;
            4)
                _run_menu_action configure_podman
                ;;
            5)
                _run_menu_action restart_cluster
                ;;
            6)
                _run_menu_action show_access_info
                ;;
            0)
                log_info "再见!"
                exit 0
                ;;
            *)
                log_error "无效选择,请重试"
                ;;
        esac

        echo ""
        read -p "按回车键继续..." -r || exit 1
    done
}
| 
 | ||||
| # 如果直接运行脚本 | ||||
| if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then | ||||
|     main "$@" | ||||
| fi | ||||
|  | @ -1,304 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| # 代理开关脚本 | ||||
| # 用于一键开启/关闭 istoreos.tailnet-68f9.ts.net:1082 代理 | ||||
| 
 | ||||
| set -euo pipefail | ||||
| 
 | ||||
| # 颜色定义 | ||||
| RED='\033[0;31m' | ||||
| GREEN='\033[0;32m' | ||||
| YELLOW='\033[1;33m' | ||||
| BLUE='\033[0;34m' | ||||
| NC='\033[0m' | ||||
| 
 | ||||
| # 代理配置 | ||||
| PROXY_HOST="istoreos.tailnet-68f9.ts.net" | ||||
| PROXY_PORT="1082" | ||||
| PROXY_URL="http://${PROXY_HOST}:${PROXY_PORT}" | ||||
| 
 | ||||
| # 配置文件路径 | ||||
| PROXY_ENV_FILE="/root/mgmt/configuration/proxy.env" | ||||
| SHELL_RC_FILE="$HOME/.zshrc" | ||||
| BASH_RC_FILE="$HOME/.bashrc" | ||||
| 
 | ||||
# Logging helpers. Each prints "$1" on stdout behind a coloured tag.
# Globals: BLUE, GREEN, YELLOW, RED, NC (read)
# The colour codes go through %b (escapes expanded); the message goes through
# %s so backslashes in file paths or URLs are printed verbatim instead of
# being interpreted as they were by `echo -e`.
log_info() {
    printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
}

log_success() {
    printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "$1"
}

log_warning() {
    printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1"
}

log_error() {
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1"
}
| 
 | ||||
# Print "on" when http_proxy or HTTP_PROXY is set to a non-empty value in the
# current environment, "off" otherwise.
check_proxy_status() {
    local state="off"

    if [[ -n "${http_proxy:-}${HTTP_PROXY:-}" ]]; then
        state="on"
    fi

    echo "$state"
}
| 
 | ||||
# Check that an HTTPS request through $PROXY_URL succeeds within 5 seconds.
# Returns 0 when the request succeeds, 1 otherwise.
test_proxy() {
    log_info "测试代理连接..."

    if ! curl -s --connect-timeout 5 --proxy "$PROXY_URL" https://httpbin.org/ip >/dev/null 2>&1; then
        log_error "代理连接失败"
        return 1
    fi

    log_success "代理连接正常"
    return 0
}
| 
 | ||||
# Export the proxy variables into the current shell process, verify the
# proxy with test_proxy and print the resulting public IP.
# Returns 0 when the proxy is reachable, 1 otherwise (the variables stay
# exported in either case).
enable_proxy() {
    log_info "开启代理..."

    local bypass="localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net"
    local var_name
    for var_name in http_proxy https_proxy HTTP_PROXY HTTPS_PROXY ALL_PROXY all_proxy; do
        export "${var_name}=${PROXY_URL}"
    done
    export no_proxy="$bypass"
    export NO_PROXY="$bypass"

    if ! test_proxy; then
        log_error "代理开启失败"
        return 1
    fi

    log_success "代理已开启: $PROXY_URL"

    # Falls back to "未知" when the lookup or the JSON parsing fails.
    local current_ip
    current_ip=$(curl -s --connect-timeout 5 --proxy "$PROXY_URL" https://httpbin.org/ip | jq -r .origin 2>/dev/null || echo "未知")
    log_info "当前IP: $current_ip"

    return 0
}
| 
 | ||||
# Remove every proxy variable from the current shell process and print the
# public IP seen without a proxy.
disable_proxy() {
    log_info "关闭代理..."

    local var_name
    for var_name in http_proxy https_proxy HTTP_PROXY HTTPS_PROXY \
        no_proxy NO_PROXY ALL_PROXY all_proxy; do
        unset "$var_name"
    done

    log_success "代理已关闭"

    # Falls back to "未知" when the lookup or the JSON parsing fails.
    local current_ip
    current_ip=$(curl -s --connect-timeout 5 https://httpbin.org/ip | jq -r .origin 2>/dev/null || echo "未知")
    log_info "当前IP: $current_ip"
}
| 
 | ||||
# Flip the proxy: disable it when check_proxy_status reports "on",
# enable it in every other case.
toggle_proxy() {
    case "$(check_proxy_status)" in
        on)
            disable_proxy
            ;;
        *)
            enable_proxy
            ;;
    esac
}
| 
 | ||||
# Persist the proxy settings: write them to $PROXY_ENV_FILE, make the
# existing shell rc files source that file, load it into the current process
# and verify the proxy.
# Globals: PROXY_HOST, PROXY_PORT, PROXY_URL, PROXY_ENV_FILE,
#          SHELL_RC_FILE, BASH_RC_FILE (read)
# Returns: 0 when the proxy test succeeds, 1 otherwise
enable_proxy_permanent() {
    log_info "永久开启代理..."

    # The target directory may not exist yet on a fresh machine; without it
    # the redirect below fails and `set -e` aborts the script.
    mkdir -p -- "$(dirname -- "$PROXY_ENV_FILE")"

    # Create the proxy environment file.
    cat > "$PROXY_ENV_FILE" << EOF
# Proxy Configuration for ${PROXY_HOST}:${PROXY_PORT}
# This file contains proxy environment variables for the management system

# HTTP/HTTPS Proxy Settings
export http_proxy=${PROXY_URL}
export https_proxy=${PROXY_URL}
export HTTP_PROXY=${PROXY_URL}
export HTTPS_PROXY=${PROXY_URL}

# No Proxy Settings (local networks and services)
export no_proxy=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net
export NO_PROXY=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net

# Additional proxy settings for various tools
export ALL_PROXY=${PROXY_URL}
export all_proxy=${PROXY_URL}

# Docker proxy settings
export DOCKER_BUILDKIT=1
export BUILDKIT_PROGRESS=plain

# Git proxy settings
export GIT_HTTP_PROXY=${PROXY_URL}
export GIT_HTTPS_PROXY=${PROXY_URL}

# Curl proxy settings
export CURL_PROXY=${PROXY_URL}

# Wget proxy settings
export WGET_PROXY=${PROXY_URL}
EOF

    # Hook the file into each rc file that exists and does not load it yet.
    local rc_file
    for rc_file in "$SHELL_RC_FILE" "$BASH_RC_FILE"; do
        [[ -f "$rc_file" ]] || continue
        if grep -q "source.*proxy.env" "$rc_file"; then
            continue
        fi

        log_info "在 $rc_file 中添加代理配置加载..."
        # The path is written quoted so it survives spaces. The marker
        # comment and the closing `fi` delimit the block that
        # disable_proxy_permanent removes again.
        {
            echo ""
            echo "# Load proxy configuration if exists"
            echo "if [[ -f \"$PROXY_ENV_FILE\" ]]; then"
            echo "    source \"$PROXY_ENV_FILE\""
            echo "fi"
        } >> "$rc_file"
    done

    # Load the configuration into the current process right away.
    if [[ -f "$PROXY_ENV_FILE" ]]; then
        source "$PROXY_ENV_FILE"
    fi

    if test_proxy; then
        log_success "代理已永久开启"
        log_info "配置已保存到: $PROXY_ENV_FILE"
        log_info "请重新登录或运行: source ~/.zshrc"
    else
        log_error "代理永久开启失败"
        return 1
    fi
}
| 
 | ||||
# Undo enable_proxy_permanent: back up and delete $PROXY_ENV_FILE, strip the
# loader block from the shell rc files and clear the variables of the
# current process.
disable_proxy_permanent() {
    log_info "永久关闭代理..."

    # Keep a timestamped backup of the settings before deleting them.
    if [[ -f "$PROXY_ENV_FILE" ]]; then
        local stamp
        stamp=$(date +%Y%m%d_%H%M%S)
        cp "$PROXY_ENV_FILE" "${PROXY_ENV_FILE}.backup.${stamp}"
        rm -f "$PROXY_ENV_FILE"
    fi

    # Drop the loader block (marker comment through the closing `fi`) from
    # each rc file that references proxy.env.
    local rc_file
    for rc_file in "$SHELL_RC_FILE" "$BASH_RC_FILE"; do
        [[ -f "$rc_file" ]] || continue
        grep -q "source.*proxy.env" "$rc_file" || continue

        log_info "从 $rc_file 中移除代理配置加载..."
        sed -i '/# Load proxy configuration if exists/,/^fi$/d' "$rc_file"
    done

    # Clear the variables of the running process as well.
    disable_proxy

    log_success "代理已永久关闭"
    log_info "请重新登录或运行: source ~/.zshrc"
}
| 
 | ||||
# Print whether the proxy is active in this process, the public IP seen
# with the current setting, and whether $PROXY_ENV_FILE exists.
show_status() {
    local current_status
    current_status=$(check_proxy_status) || true

    echo ""
    log_info "=== 代理状态 ==="

    # The IP lookup differs only by the --proxy option, so build the curl
    # arguments per state and issue a single request afterwards.
    local -a curl_args=(-s --connect-timeout 5)
    if [[ "$current_status" == "on" ]]; then
        log_success "代理状态: 开启"
        log_info "代理地址: $PROXY_URL"
        curl_args+=(--proxy "$PROXY_URL")
    else
        log_warning "代理状态: 关闭"
    fi

    # Falls back to "未知" when the lookup or the JSON parsing fails.
    local current_ip
    current_ip=$(curl "${curl_args[@]}" https://httpbin.org/ip | jq -r .origin 2>/dev/null || echo "未知")
    log_info "当前IP: $current_ip"

    # Report on the persisted configuration.
    if [[ -f "$PROXY_ENV_FILE" ]]; then
        log_info "配置文件: 存在 ($PROXY_ENV_FILE)"
    else
        log_info "配置文件: 不存在"
    fi

    echo ""
}
| 
 | ||||
# Print the usage text for this script on stdout.
show_help() {
    # Unquoted delimiter on purpose: $0 must expand to the script name.
    cat <<EOF
代理开关脚本 - 管理 istoreos.tailnet-68f9.ts.net:1082 代理

用法: $0 [命令]

命令:
  on           - 临时开启代理(仅当前会话)
  off          - 临时关闭代理(仅当前会话)
  toggle       - 切换代理状态
  enable       - 永久开启代理(写入配置文件)
  disable      - 永久关闭代理(从配置文件移除)
  status       - 显示代理状态
  test         - 测试代理连接
  help         - 显示此帮助信息

示例:
  $0 on        # 临时开启代理
  $0 enable    # 永久开启代理
  $0 status    # 查看代理状态
  $0 toggle    # 切换代理状态

EOF
}
| 
 | ||||
# Entry point: map the sub-command in $1 to its handler and run it.
# A missing or unrecognised command shows the help text.
main() {
    local handler

    case "${1:-help}" in
        on)      handler=enable_proxy ;;
        off)     handler=disable_proxy ;;
        toggle)  handler=toggle_proxy ;;
        enable)  handler=enable_proxy_permanent ;;
        disable) handler=disable_proxy_permanent ;;
        status)  handler=show_status ;;
        test)    handler=test_proxy ;;
        *)       handler=show_help ;;
    esac

    "$handler"
}
| 
 | ||||
| main "$@" | ||||
|  | @ -1,114 +0,0 @@ | |||
| #!/bin/bash | ||||
| # 快速启动脚本 | ||||
| 
 | ||||
| set -e | ||||
| 
 | ||||
| echo "🚀 欢迎使用基础设施管理平台!" | ||||
| echo "" | ||||
| 
 | ||||
# Check that the command named by $1 is available on PATH.
# Prints a hint and returns 1 when it is missing, returns 0 otherwise.
check_tool() {
    command -v "$1" &> /dev/null && return 0

    echo "❌ $1 未安装,请先运行 'make setup'"
    return 1
}
| 
 | ||||
# Pre-flight: every required CLI tool must be installed.
echo "🔍 检查必要工具..."
for required_tool in tofu ansible docker; do
    check_tool "$required_tool" || exit 1
done

echo "✅ 工具检查通过"
echo ""

# Pre-flight: the dev tfvars file must exist. On the first run it is seeded
# from the .example template and the script stops so it can be edited.
CONFIG_FILE="infrastructure/environments/dev/terraform.tfvars"
if [ ! -f "$CONFIG_FILE" ]; then
    echo "⚠️  配置文件不存在,正在创建..."
    cp "${CONFIG_FILE}.example" "$CONFIG_FILE"
    echo "📝 请编辑配置文件: $CONFIG_FILE"
    echo "   填入你的云服务商凭据后再次运行此脚本"
    exit 1
fi

echo "✅ 配置文件存在"
echo ""
| 
 | ||||
# Show the action menu, read the selection and run the matching make target.
cat <<'EOF'
请选择要执行的操作:
1) 初始化基础设施
2) 查看执行计划
3) 应用基础设施变更
4) 部署应用
5) 启动开发环境
6) 查看监控
7) 完整部署流程

EOF

read -p "请输入选项 (1-7): " choice

case $choice in
    1)
        echo "🏗️  初始化基础设施..."
        make init
        ;;
    2)
        echo "📋 查看执行计划..."
        make plan
        ;;
    3)
        echo "🚀 应用基础设施变更..."
        make apply
        ;;
    4)
        echo "📦 部署应用..."
        make ansible-deploy
        ;;
    5)
        echo "🐳 启动开发环境..."
        make docker-up
        ;;
    6)
        echo "📊 启动监控..."
        make monitor
        ;;
    7)
        # Full pipeline: init and plan always run; apply and deploy only
        # after an explicit confirmation.
        echo "🎯 执行完整部署流程..."
        echo ""
        echo "步骤 1/4: 初始化基础设施..."
        make init
        echo ""
        echo "步骤 2/4: 查看执行计划..."
        make plan
        echo ""
        read -p "是否继续应用基础设施变更? (y/N): " -n 1 -r
        echo
        if [[ ! $REPLY =~ ^[Yy]$ ]]; then
            echo "ℹ️  部署流程已取消"
        else
            echo "步骤 3/4: 应用基础设施变更..."
            make apply
            echo ""
            echo "步骤 4/4: 部署应用..."
            make ansible-deploy
            echo ""
            echo "🎉 完整部署流程完成!"
        fi
        ;;
    *)
        echo "❌ 无效选项"
        exit 1
        ;;
esac
| 
 | ||||
# Closing summary with a reminder of the most useful make targets.
cat <<'EOF'

🎉 操作完成!

📋 有用的命令:
  make help          - 查看所有可用命令
  make plan          - 查看基础设施变更计划
  make apply         - 应用基础设施变更
  make ansible-deploy - 部署应用
  make monitor       - 启动监控
  make clean         - 清理临时文件
EOF
|  | @ -1,104 +0,0 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| echo "=== 简单的 Nomad 集群修复脚本 ===" | ||||
| 
 | ||||
| # 定义 Tailscale IP 地址 | ||||
| SEMAPHORE_IP="100.116.158.95" | ||||
| MASTER_IP="100.117.106.136" | ||||
| ASH3C_IP="100.116.80.94" | ||||
| ENCRYPT_KEY="NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" | ||||
| 
 | ||||
# Render a Nomad agent configuration to /tmp/nomad-<node_name>.hcl.
# Globals:   ENCRYPT_KEY, SEMAPHORE_IP, MASTER_IP, ASH3C_IP (read)
# Arguments: $1 - node name (used in the output file name)
#            $2 - address the agent binds to
# Returns:   non-zero when the file cannot be written
create_config() {
    local node_name=$1
    local bind_ip=$2
    local config_path="/tmp/nomad-${node_name}.hcl"

    # The file embeds the gossip encryption key: create it owner-only
    # instead of world-readable in /tmp, and restore the umask afterwards.
    local previous_umask
    previous_umask=$(umask)
    umask 077

    # The web UI block is named `ui` in Nomad (as in the Ansible playbook of
    # this repository); the former `ui_config` is not a Nomad block name.
    cat > "$config_path" << EOF
datacenter = "dc1"
region     = "global"
data_dir   = "/opt/nomad/data"

bind_addr = "${bind_ip}"

server {
  enabled          = true
  bootstrap_expect = 3
  encrypt          = "${ENCRYPT_KEY}"

  server_join {
    retry_join = ["${SEMAPHORE_IP}", "${MASTER_IP}", "${ASH3C_IP}"]
  }
}

client {
  enabled = true
}

ui {
  enabled = true
}

addresses {
  http = "0.0.0.0"
  rpc  = "${bind_ip}"
  serf = "${bind_ip}"
}

ports {
  http = 4646
  rpc  = 4647
  serf = 4648
}

plugin "docker" {
  config {
    allow_privileged = true
    volumes {
      enabled = true
    }
  }
}

log_level = "INFO"
log_file  = "/var/log/nomad/nomad.log"
EOF
    local rc=$?

    umask "$previous_umask"
    if (( rc != 0 )); then
        return "$rc"
    fi
    # Also tighten a file left over from an earlier run with a wider mode.
    chmod 600 "$config_path"
}
| 
 | ||||
# --- Cluster repair steps -------------------------------------------------
# Stops Nomad on all three nodes, wipes the data directories, deploys the
# generated configuration files and starts the services again.

SSH_KEY=~/.ssh/id_ed25519
MASTER_SSH_PORT=60022
ASH3C_SSH_PORT=22

# NOTE(review): the sudo password used to be hard-coded in every command.
# Supply it through REMOTE_SUDO_PASSWORD; the old value is kept only as a
# backward-compatible fallback and should be rotated and removed.
REMOTE_SUDO_PASSWORD="${REMOTE_SUDO_PASSWORD:-3131}"

# Print an error on stderr and stop the script.
abort() {
    echo "❌ $*" >&2
    exit 1
}

# remote_sudo PORT HOST COMMAND...
# Run COMMAND under sudo on HOST. The password is fed to `sudo -S` through
# ssh's stdin so it does not appear in the remote process list.
remote_sudo() {
    local port=$1 host=$2
    shift 2
    printf '%s\n' "$REMOTE_SUDO_PASSWORD" \
        | ssh -p "$port" -i "$SSH_KEY" "ben@${host}" "sudo -S -p '' $*"
}

# push_config PORT HOST LOCAL_FILE
# Copy LOCAL_FILE to HOST and install it as /etc/nomad.d/nomad.hcl.
push_config() {
    local port=$1 host=$2 local_file=$3
    local remote_tmp="/tmp/${local_file##*/}"
    scp -P "$port" -i "$SSH_KEY" "$local_file" "ben@${host}:/tmp/" || return 1
    remote_sudo "$port" "$host" "cp $remote_tmp /etc/nomad.d/nomad.hcl" || return 1
    remote_sudo "$port" "$host" "chown nomad:nomad /etc/nomad.d/nomad.hcl"
}

# The data directories are wiped next, so a node that could not be stopped
# must abort the run instead of being ignored.
echo "1. 停止所有 Nomad 服务..."
systemctl stop nomad || abort "无法停止本地 Nomad 服务"
remote_sudo "$MASTER_SSH_PORT" "$MASTER_IP" "systemctl stop nomad" \
    || abort "无法停止 ${MASTER_IP} 上的 Nomad 服务"
remote_sudo "$ASH3C_SSH_PORT" "$ASH3C_IP" "systemctl stop nomad" \
    || abort "无法停止 ${ASH3C_IP} 上的 Nomad 服务"

# `sh -c` makes the glob expand with root privileges on the remote host;
# expanded by the login user it silently matches nothing when that user
# cannot read the data directory.
echo "2. 清理数据目录..."
rm -rf /opt/nomad/data/*
remote_sudo "$MASTER_SSH_PORT" "$MASTER_IP" "sh -c 'rm -rf /opt/nomad/data/*'"
remote_sudo "$ASH3C_SSH_PORT" "$ASH3C_IP" "sh -c 'rm -rf /opt/nomad/data/*'"

echo "3. 创建新配置文件..."
create_config "semaphore" "${SEMAPHORE_IP}"
create_config "master" "${MASTER_IP}"
create_config "ash3c" "${ASH3C_IP}"

# Starting agents with a missing or stale configuration would recreate the
# broken cluster, so a failed deployment aborts the run.
echo "4. 部署配置文件..."
cp /tmp/nomad-semaphore.hcl /etc/nomad.d/nomad.hcl || abort "本地配置部署失败"
chown nomad:nomad /etc/nomad.d/nomad.hcl
push_config "$MASTER_SSH_PORT" "$MASTER_IP" /tmp/nomad-master.hcl \
    || abort "${MASTER_IP} 配置部署失败"
push_config "$ASH3C_SSH_PORT" "$ASH3C_IP" /tmp/nomad-ash3c.hcl \
    || abort "${ASH3C_IP} 配置部署失败"

echo "5. 启动服务..."
systemctl start nomad
remote_sudo "$MASTER_SSH_PORT" "$MASTER_IP" "systemctl start nomad"
remote_sudo "$ASH3C_SSH_PORT" "$ASH3C_IP" "systemctl start nomad"

echo "6. 等待集群形成..."
sleep 30

echo "7. 检查集群状态..."
nomad server members
nomad node status

echo "=== 修复完成 ==="
|  | @ -1,311 +0,0 @@ | |||
#!/bin/bash

# Terraform Consul provider bootstrap script.
# Generates the Terraform files that let a configuration read its sensitive
# settings from Consul.

set -euo pipefail

# Both settings can be overridden from the environment.
: "${ENVIRONMENT:=dev}"
: "${CONSUL_ADDR:=http://localhost:8500}"

# Colour escape sequences used by the log helpers (interpreted at print time)
NC='\033[0m'
BLUE='\033[0;34m'
GREEN='\033[0;32m'
| 
 | ||||
# Print message $1 to stdout behind a blue "[INFO]" tag.
# Backslash escapes in the message are interpreted, as with `echo -e`.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
| 
 | ||||
# Print message $1 to stdout behind a green "[SUCCESS]" tag.
# Backslash escapes in the message are interpreted, as with `echo -e`.
log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
| 
 | ||||
# Write consul-provider.tf for the current ENVIRONMENT.
#
# The generated file configures the Consul provider, reads the Oracle Cloud
# credentials from Consul KV (config/<environment>/oracle/*), materialises the
# private key as a local file and exposes everything as
# local.oci_config_from_consul.
#
# Globals:  ENVIRONMENT (read)
# Outputs:  infrastructure/environments/<ENVIRONMENT>/consul-provider.tf
#
# The file intentionally carries no `terraform { required_providers { ... } }`
# block: main.tf generated by this script already declares one, and Terraform
# rejects a module with two required_providers blocks. The consul and local
# providers therefore resolve to their implied hashicorp/* sources.
# NOTE(review): if the "~> 2.18" pin on hashicorp/consul is still wanted, add
# it to the module's single required_providers block.
create_consul_provider() {
    local tf_dir="infrastructure/environments/${ENVIRONMENT}"

    log_info "创建 Terraform Consul Provider 配置..."

    # Create the target directory so a fresh checkout does not fail on the
    # redirection below.
    mkdir -p -- "${tf_dir}"

    cat > "${tf_dir}/consul-provider.tf" << 'EOF'
# Consul Provider 配置
provider "consul" {
  address = var.consul_config.address
  token   = lookup(var.consul_config, "token", null)
}

# 从 Consul 读取 Oracle Cloud 配置
data "consul_keys" "oracle_config" {
  key {
    name = "tenancy_ocid"
    path = "config/${var.environment}/oracle/tenancy_ocid"
  }

  key {
    name = "user_ocid"
    path = "config/${var.environment}/oracle/user_ocid"
  }

  key {
    name = "fingerprint"
    path = "config/${var.environment}/oracle/fingerprint"
  }

  key {
    name = "private_key"
    path = "config/${var.environment}/oracle/private_key"
  }

  key {
    name = "compartment_ocid"
    path = "config/${var.environment}/oracle/compartment_ocid"
  }
}

# 创建临时私钥文件
resource "local_file" "oci_private_key" {
  content         = data.consul_keys.oracle_config.var.private_key
  filename        = "/tmp/oci_private_key_${var.environment}.pem"
  file_permission = "0600"

  lifecycle {
    ignore_changes = [content]
  }
}

# 本地变量,用于构建完整的 OCI 配置
locals {
  oci_config_from_consul = {
    tenancy_ocid     = data.consul_keys.oracle_config.var.tenancy_ocid
    user_ocid        = data.consul_keys.oracle_config.var.user_ocid
    fingerprint      = data.consul_keys.oracle_config.var.fingerprint
    private_key_path = local_file.oci_private_key.filename
    region           = var.oci_config.region
    compartment_ocid = data.consul_keys.oracle_config.var.compartment_ocid
  }
}
EOF

    log_success "Consul Provider 配置已创建: ${tf_dir}/consul-provider.tf"
}
| 
 | ||||
# Write variables.tf for the current ENVIRONMENT.
#
# Declares the general project variables, the Consul connection settings and
# one config object per cloud provider. Any existing variables.tf in the
# target directory is overwritten.
#
# Globals:  ENVIRONMENT (read)
# Outputs:  infrastructure/environments/<ENVIRONMENT>/variables.tf
create_variables() {
    local tf_dir="infrastructure/environments/${ENVIRONMENT}"

    log_info "更新 Terraform 变量定义..."

    # Create the target directory so a fresh checkout does not fail on the
    # redirection below.
    mkdir -p -- "${tf_dir}"

    cat > "${tf_dir}/variables.tf" << 'EOF'
# 基本变量
variable "environment" {
  description = "环境名称"
  type        = string
}

variable "project_name" {
  description = "项目名称"
  type        = string
}

variable "owner" {
  description = "项目所有者"
  type        = string
}

variable "cloud_providers" {
  description = "要启用的云服务商"
  type        = list(string)
  default     = []
}

variable "vpc_cidr" {
  description = "VPC CIDR 块"
  type        = string
}

variable "availability_zones" {
  description = "可用区列表"
  type        = list(string)
}

variable "common_tags" {
  description = "通用标签"
  type        = map(string)
  default     = {}
}

# Consul 配置
variable "consul_config" {
  description = "Consul 配置"
  type = object({
    address = string
    token   = optional(string)
  })
}

# Oracle Cloud 配置(基本信息)
variable "oci_config" {
  description = "Oracle Cloud 基本配置"
  type = object({
    region           = string
    tenancy_ocid     = optional(string, "FROM_CONSUL")
    user_ocid        = optional(string, "FROM_CONSUL")
    fingerprint      = optional(string, "FROM_CONSUL")
    private_key_path = optional(string, "FROM_CONSUL")
    compartment_ocid = optional(string, "FROM_CONSUL")
  })
}

# 其他云服务商配置
variable "huawei_config" {
  description = "华为云配置"
  type = object({
    access_key = string
    secret_key = string
    region     = string
    project_id = string
  })
  default = {
    access_key = ""
    secret_key = ""
    region     = "cn-north-4"
    project_id = ""
  }
}

variable "gcp_config" {
  description = "Google Cloud 配置"
  type = object({
    project_id       = string
    region           = string
    zone             = string
    credentials_file = string
  })
  default = {
    project_id       = ""
    region           = "asia-northeast3"
    zone             = "asia-northeast3-a"
    credentials_file = ""
  }
}

variable "aws_config" {
  description = "AWS 配置"
  type = object({
    region     = string
    access_key = string
    secret_key = string
  })
  default = {
    region     = "ap-northeast-2"
    access_key = ""
    secret_key = ""
  }
}

variable "do_config" {
  description = "DigitalOcean 配置"
  type = object({
    token  = string
    region = string
  })
  default = {
    token  = ""
    region = "sgp1"
  }
}
EOF

    log_success "变量定义已更新: ${tf_dir}/variables.tf"
}
| 
 | ||||
# Write an example main.tf for the current ENVIRONMENT.
#
# The generated file declares the OCI provider (fed from
# local.oci_config_from_consul), an example VCN that is only created when
# "oracle" is listed in var.cloud_providers, and two outputs.
#
# Globals:  ENVIRONMENT (read)
# Outputs:  infrastructure/environments/<ENVIRONMENT>/main.tf
#
# required_version is ">= 1.3" because the variables.tf generated by this
# script uses optional() object attributes with default values, which older
# Terraform releases do not accept.
create_main_tf() {
    local tf_dir="infrastructure/environments/${ENVIRONMENT}"

    log_info "创建示例 main.tf..."

    # Create the target directory so a fresh checkout does not fail on the
    # redirection below.
    mkdir -p -- "${tf_dir}"

    cat > "${tf_dir}/main.tf" << 'EOF'
# 主要 Terraform 配置文件

terraform {
  required_version = ">= 1.3"

  required_providers {
    oci = {
      source  = "oracle/oci"
      version = "~> 5.0"
    }
  }
}

# Oracle Cloud Provider
provider "oci" {
  tenancy_ocid     = local.oci_config_from_consul.tenancy_ocid
  user_ocid        = local.oci_config_from_consul.user_ocid
  fingerprint      = local.oci_config_from_consul.fingerprint
  private_key_path = local.oci_config_from_consul.private_key_path
  region           = local.oci_config_from_consul.region
}

# 示例:创建 VCN
resource "oci_core_vcn" "main" {
  count          = contains(var.cloud_providers, "oracle") ? 1 : 0
  compartment_id = local.oci_config_from_consul.compartment_ocid
  cidr_block     = var.vpc_cidr
  display_name   = "${var.project_name}-${var.environment}-vcn"

  freeform_tags = var.common_tags
}

# 输出
output "vcn_id" {
  description = "VCN ID"
  value       = try(oci_core_vcn.main[0].id, null)
}

output "oci_config_source" {
  description = "OCI 配置来源"
  value       = "consul"
}
EOF

    log_success "示例 main.tf 已创建: ${tf_dir}/main.tf"
}
| 
 | ||||
# Entry point.
# Arguments: $1 - "setup" generates all three Terraform files; anything else
#                 (including no argument or "help") prints the usage text.
main() {
    local action="${1:-help}"

    if [[ "${action}" == "setup" ]]; then
        create_consul_provider
        create_variables
        create_main_tf
        return
    fi

    # Unquoted delimiter: $0 is expanded in the usage text.
    cat << EOF
Terraform Consul Provider 配置脚本

用法: $0 [选项]

选项:
    setup    创建 Terraform Consul Provider 配置
    help     显示此帮助信息

环境变量:
    ENVIRONMENT     环境名称 (默认: dev)
    CONSUL_ADDR     Consul 地址 (默认: http://localhost:8500)
EOF
}
| 
 | ||||
| main "$@" | ||||
|  | @ -1,128 +0,0 @@ | |||
#!/bin/bash

# Simplified OpenTofu secret upload script.
set -euo pipefail

# Settings; CONSUL_ADDR and ENVIRONMENT can be overridden from the
# environment, TFVARS_FILE is always derived from ENVIRONMENT.
: "${CONSUL_ADDR:=http://master:8500}"
: "${ENVIRONMENT:=dev}"
TFVARS_FILE="tofu/environments/${ENVIRONMENT}/terraform.tfvars"

# Colour escape sequences used by the log helpers (interpreted at print time)
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
| 
 | ||||
# Tagged, coloured log helpers. Each prints message $1 to stdout; backslash
# escapes in the message are interpreted, as with `echo -e`.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}
| 
 | ||||
# Verify that Consul answers on CONSUL_ADDR by requesting the leader status
# endpoint. Exits the script with status 1 when curl reports a failure.
check_consul() {
    log_info "检查 Consul 连接..."
    curl -s "${CONSUL_ADDR}/v1/status/leader" > /dev/null || {
        log_error "无法连接到 Consul: ${CONSUL_ADDR}"
        exit 1
    }
    log_success "Consul 连接正常"
}
| 
 | ||||
# Parse TFVARS_FILE line by line and upload every simple `name = value`
# assignment to Consul KV.
#
# Globals:  TFVARS_FILE, ENVIRONMENT, CONSUL_ADDR (read)
# Outputs:  one log line per uploaded or failed key, plus a final total
#
# Key layout: a variable prefixed oci_/huawei_/aws_/do_/gcp_ is stored under
# config/<env>/<provider>/<name without prefix>; everything else goes to
# config/<env>/general/<name>. Empty values and the literal `null` are skipped.
#
# Changes from the previous implementation:
#   * the counter uses an arithmetic assignment; `((uploaded_count++))`
#     returns status 1 while the counter is 0, which aborted the whole script
#     under `set -e` right after the first upload;
#   * quoted and bare values are matched by two separate patterns, so the
#     captured value no longer depends on how the regex engine resolves the
#     alternation;
#   * a last line without trailing newline is processed too;
#   * curl runs with -f so HTTP error responses count as failed uploads.
upload_configs() {
    local uploaded_count=0
    local line var_name var_value consul_path
    local quoted_re='^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*"([^"]*)"'
    local bare_re='^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*([^[:space:]#]+)'

    log_info "开始解析并上传配置..."

    while IFS= read -r line || [[ -n "$line" ]]; do
        # Skip comments and blank lines
        if [[ "$line" =~ ^[[:space:]]*# ]] || [[ -z "${line// }" ]]; then
            continue
        fi

        # Match a variable assignment: quoted value first, bare value second
        if [[ "$line" =~ $quoted_re ]]; then
            var_name="${BASH_REMATCH[1]}"
            var_value="${BASH_REMATCH[2]}"
        elif [[ "$line" =~ $bare_re ]]; then
            var_name="${BASH_REMATCH[1]}"
            var_value="${BASH_REMATCH[2]}"
        else
            continue
        fi

        # Skip empty values
        if [[ -z "$var_value" || "$var_value" == "null" ]]; then
            continue
        fi

        # Derive the Consul path from the variable's provider prefix
        case "$var_name" in
            oci_*)    consul_path="config/${ENVIRONMENT}/oracle/${var_name#oci_}" ;;
            huawei_*) consul_path="config/${ENVIRONMENT}/huawei/${var_name#huawei_}" ;;
            aws_*)    consul_path="config/${ENVIRONMENT}/aws/${var_name#aws_}" ;;
            do_*)     consul_path="config/${ENVIRONMENT}/digitalocean/${var_name#do_}" ;;
            gcp_*)    consul_path="config/${ENVIRONMENT}/gcp/${var_name#gcp_}" ;;
            *)        consul_path="config/${ENVIRONMENT}/general/${var_name}" ;;
        esac

        # Upload to Consul
        if curl -s -f -X PUT "${CONSUL_ADDR}/v1/kv/${consul_path}" -d "$var_value" > /dev/null; then
            log_info "上传: ${consul_path}"
            uploaded_count=$((uploaded_count + 1))
        else
            log_error "上传失败: ${consul_path}"
        fi
    done < "$TFVARS_FILE"

    log_success "总共上传了 $uploaded_count 个配置项到 Consul"
}
| 
 | ||||
# List every key stored under config/<ENVIRONMENT>/ in Consul.
#
# Globals:  CONSUL_ADDR, ENVIRONMENT (read)
# Outputs:  "<key>: <value>" per key on stdout; keys whose name contains
#           secret, key, token, password or ocid are printed as hidden.
#
# Values of hidden keys are no longer requested at all: the previous version
# downloaded every secret from Consul and then threw it away.
list_configs() {
    local keys key value

    log_info "列出 Consul 中的配置..."

    # Declaration and assignment are separate so a failing pipeline is not
    # masked by `local`; any failure is treated as "no keys".
    keys=$(curl -s "${CONSUL_ADDR}/v1/kv/config/${ENVIRONMENT}/?keys" | jq -r '.[]' 2>/dev/null) || keys=""

    if [[ -z "$keys" ]]; then
        log_error "没有找到配置"
        return
    fi

    echo "=== 环境 ${ENVIRONMENT} 的配置 ==="
    while read -r key; do
        # Hide sensitive entries
        if [[ "$key" =~ (secret|key|token|password|ocid) ]]; then
            echo "$key: [已隐藏]"
        else
            value=$(curl -s "${CONSUL_ADDR}/v1/kv/${key}?raw" 2>/dev/null) || value="无法读取"
            echo "$key: $value"
        fi
    done <<< "$keys"
}
| 
 | ||||
# Entry point.
# Exits with status 1 when TFVARS_FILE does not exist; otherwise checks the
# Consul connection and dispatches on $1 ("upload" by default, or "list").
# Any other argument prints the usage line.
main() {
    local action="${1:-upload}"

    [[ -f "$TFVARS_FILE" ]] || {
        log_error "找不到配置文件: $TFVARS_FILE"
        exit 1
    }

    check_consul

    if [[ "$action" == "upload" ]]; then
        upload_configs
    elif [[ "$action" == "list" ]]; then
        list_configs
    else
        echo "用法: $0 [upload|list]"
    fi
}
| 
 | ||||
| main "$@" | ||||
|  | @ -1,495 +0,0 @@ | |||
#!/bin/bash

# OpenTofu secret upload script.
# Bulk-uploads the sensitive settings found in terraform.tfvars to Consul.

set -euo pipefail

# Settings; every one of them can be overridden from the environment.
# Order matters: later defaults are derived from earlier values.
: "${CONSUL_ADDR:=http://master:8500}"
: "${CONSUL_TOKEN:=}"
: "${ENVIRONMENT:=dev}"
: "${TOFU_DIR:=tofu/environments/${ENVIRONMENT}}"
: "${TFVARS_FILE:=${TOFU_DIR}/terraform.tfvars}"

# Colour escape sequences used by the log helpers (interpreted at print time)
NC='\033[0m' # No Color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
| 
 | ||||
# Log helpers. Each prints message $1 to stdout behind a coloured level tag;
# backslash escapes in the message are interpreted, as with `echo -e`.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}
| 
 | ||||
# Ensure the external tools this script relies on (curl, jq) are available.
# Logs every missing tool, not only the first one, and then exits the script
# with status 1. Returns 0 when nothing is missing.
check_dependencies() {
    local dep                  # kept local so the loop variable does not leak
    local -a missing=()

    for dep in curl jq; do
        command -v "$dep" > /dev/null 2>&1 || missing+=("$dep")
    done

    if (( ${#missing[@]} > 0 )); then
        for dep in "${missing[@]}"; do
            log_error "缺少依赖: $dep"
        done
        exit 1
    fi
}
| 
 | ||||
# Verify that Consul answers on CONSUL_ADDR by requesting the leader status
# endpoint. Exits the script with status 1 when curl reports a failure.
check_consul() {
    log_info "检查 Consul 连接..."
    curl -s "${CONSUL_ADDR}/v1/status/leader" > /dev/null || {
        log_error "无法连接到 Consul: ${CONSUL_ADDR}"
        exit 1
    }
    log_success "Consul 连接正常"
}
| 
 | ||||
# Make sure TFVARS_FILE exists as a regular file; exits the script with
# status 1 otherwise.
check_tfvars_file() {
    [[ -f "$TFVARS_FILE" ]] || {
        log_error "找不到 terraform.tfvars 文件: $TFVARS_FILE"
        exit 1
    }
    log_info "找到配置文件: $TFVARS_FILE"
}
| 
 | ||||
# Convert a tfvars file into a JSON file and print the JSON file's path.
#
# Arguments: $1 - path of the tfvars file
# Outputs:   stdout carries ONLY the path of the generated JSON file; all log
#            messages go to stderr so callers can capture the path with $(...)
# Returns:   non-zero when the scratch directory cannot be created
#
# Changes from the previous implementation:
#   * log output no longer pollutes the stdout "return value";
#   * terraform runs in a subshell, so the caller's working directory is left
#     untouched (relative paths used later keep resolving);
#   * scratch files live in a fresh mktemp directory instead of fixed names
#     directly under /tmp;
#   * when terraform produces no outputs the manual parser is used instead.
#
# NOTE(review): `terraform output` presumably reports nothing for a
# configuration that was never applied, and its JSON wraps each value in an
# object -- confirm whether the terraform branch is worth keeping at all.
# The scratch directory is not removed here because its path is handed back
# to the caller.
parse_hcl_to_json() {
    local tfvars_file="$1"
    local work_dir temp_tf_file temp_json_file line var_name var_value
    local parsed=false

    work_dir=$(mktemp -d "${TMPDIR:-/tmp}/tfvars-parse.XXXXXX") || return 1
    temp_tf_file="${work_dir}/temp_config.tf"
    temp_json_file="${work_dir}/temp_config.json"

    log_info "解析 HCL 配置..." >&2

    # Start the scratch .tf file; every assignment becomes an output block
    printf '%s\n' '# 临时配置文件,用于解析 tfvars' > "$temp_tf_file"

    while IFS= read -r line || [[ -n "$line" ]]; do
        # Skip comments and blank lines
        if [[ "$line" =~ ^[[:space:]]*# ]] || [[ -z "${line// }" ]]; then
            continue
        fi

        # Extract the variable name and the raw right-hand side
        if [[ "$line" =~ ^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*(.+)$ ]]; then
            var_name="${BASH_REMATCH[1]}"
            var_value="${BASH_REMATCH[2]}"
            {
                echo "output \"$var_name\" {"
                echo "  value = $var_value"
                echo "}"
            } >> "$temp_tf_file"
        fi
    done < "$tfvars_file"

    if command -v terraform &> /dev/null; then
        # Subshell: the cd must not leak into the caller.
        if (
            cd "$work_dir" || exit 1
            terraform init -backend=false > /dev/null 2>&1 || true
            terraform output -json > "$temp_json_file" 2> /dev/null
        ) && jq -e 'type == "object" and length > 0' "$temp_json_file" > /dev/null 2>&1; then
            parsed=true
        else
            log_warning "无法使用 terraform 解析,尝试手动解析..." >&2
        fi
    else
        log_warning "未找到 terraform,使用手动解析..." >&2
    fi

    if [[ "$parsed" == false ]]; then
        manual_parse_tfvars "$tfvars_file" "$temp_json_file" >&2
    fi

    echo "$temp_json_file"
}
| 
 | ||||
# Hand-rolled tfvars -> JSON converter, used when terraform cannot do it.
#
# Arguments: $1 - tfvars file to read, $2 - JSON file to write
#
# Supported input: `name = "value"` / `name = value` assignments and a single
# level of `name = { ... }` blocks containing such assignments. Every value is
# emitted as a JSON string. Nested blocks, lists and multi-line values are not
# handled.
#
# Changes from the previous implementation:
#   * block members are joined with real newlines; previously a literal
#     backslash-n sequence was written, which made the output invalid JSON as
#     soon as the input contained a block;
#   * backslashes and double quotes inside values are JSON-escaped;
#   * quoted and bare values are matched by separate patterns, so the captured
#     value does not depend on how the regex engine resolves an alternation;
#   * a last line without trailing newline is processed too.
manual_parse_tfvars() {
    local tfvars_file="$1"
    local output_file="$2"

    local name_re='^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*'
    local quoted_re="${name_re}"'"([^"]*)"'
    local bare_re="${name_re}"'([^[:space:]]+)'
    local block_open_re="${name_re}"'\{[[:space:]]*$'
    local block_close_re='^[[:space:]]*\}[[:space:]]*$'
    local nl=$'\n'
    local sep=",${nl}"

    local line key value
    local in_block=false
    local block_name=""
    local block_body=""     # members of the block currently being read
    local document=""       # top-level members collected so far

    log_info "手动解析 tfvars 文件..."

    while IFS= read -r line || [[ -n "$line" ]]; do
        # Skip comments and blank lines
        if [[ "$line" =~ ^[[:space:]]*# ]] || [[ -z "${line// }" ]]; then
            continue
        fi

        # Start of a block
        if [[ "$line" =~ $block_open_re ]]; then
            block_name="${BASH_REMATCH[1]}"
            in_block=true
            block_body=""
            continue
        fi

        # End of a block: emit it as one top-level member
        if [[ "$in_block" == true && "$line" =~ $block_close_re ]]; then
            document+="${document:+${sep}}  \"${block_name}\": {${nl}${block_body}${block_body:+${nl}}  }"
            in_block=false
            continue
        fi

        # Plain assignment, inside or outside a block
        if [[ "$line" =~ $quoted_re ]]; then
            key="${BASH_REMATCH[1]}"
            value="${BASH_REMATCH[2]}"
        elif [[ "$line" =~ $bare_re ]]; then
            key="${BASH_REMATCH[1]}"
            value="${BASH_REMATCH[2]}"
        else
            continue
        fi

        # JSON-escape backslashes first, then double quotes
        value=${value//\\/\\\\}
        value=${value//\"/\\\"}

        if [[ "$in_block" == true ]]; then
            block_body+="${block_body:+${sep}}    \"${key}\": \"${value}\""
        else
            document+="${document:+${sep}}  \"${key}\": \"${value}\""
        fi
    done < "$tfvars_file"

    {
        printf '{\n'
        if [[ -n "$document" ]]; then
            printf '%s\n' "$document"
        fi
        printf '}\n'
    } > "$output_file"
}
| 
 | ||||
# PUT one value into Consul KV and count it.
# Arguments: $1 - KV path below /v1/kv/, $2 - value
# Globals:   CONSUL_ADDR (read), uploaded_count (incremented; owned by caller)
# Returns:   non-zero (after logging) when curl fails
_consul_kv_put() {
    local kv_path=$1 value=$2

    if ! curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${kv_path}" -d "$value" > /dev/null; then
        log_error "上传失败: ${kv_path}"
        return 1
    fi
    # Plain assignment: `((uploaded_count++))` yields status 1 while the
    # counter is 0 and would abort the script under `set -e`.
    uploaded_count=$((uploaded_count + 1))
}

# Upload several fields of one provider section.
# Arguments: $1 - JSON config file, $2 - KV base path,
#            then pairs of <key name> <jq filter>.
# Fields whose filter yields nothing, an empty string or "null" are skipped.
_consul_upload_fields() {
    local config_file=$1 base_path=$2 key filter value
    shift 2

    while (( $# >= 2 )); do
        key=$1
        filter=$2
        shift 2
        value=$(jq -r "$filter" "$config_file") || value=""
        if [[ -z "$value" || "$value" == "null" ]]; then
            continue
        fi
        _consul_kv_put "${base_path}/${key}" "$value" || return 1
    done
}

# Upload the CONTENT of a file whose path is stored in the config.
# Arguments: $1 - JSON config file, $2 - KV path, $3 - jq filter giving the
#            file path. Silently skipped when the path is empty or the file
#            does not exist.
_consul_upload_file() {
    local config_file=$1 kv_path=$2 filter=$3 file_path content

    file_path=$(jq -r "$filter" "$config_file") || file_path=""
    if [[ -z "$file_path" || "$file_path" == "null" || ! -f "$file_path" ]]; then
        return 0
    fi
    content=$(< "$file_path")
    _consul_kv_put "$kv_path" "$content"
}

# Upload the provider sections of a parsed configuration to Consul KV.
#
# Arguments: $1 - JSON file produced from the tfvars file
# Globals:   ENVIRONMENT, CONSUL_ADDR (read)
# Returns:   1 when the JSON file is missing or an upload fails
#
# Each provider section (oci_config, huawei_config, aws_config, do_config,
# gcp_config) is stored under config/<ENVIRONMENT>/<provider>/<field>.
#
# Changes from the previous implementation:
#   * a leftover merge-conflict marker and a duplicated first half of this
#     function were removed; they left the script unparsable;
#   * the Oracle section was gated on a top-level `oci_tenancy_ocid` value
#     while every field was read from `.oci_config.*`; the gate now accepts
#     either layout and each field falls back to its flat `oci_*` name;
#   * the upload counter no longer aborts the script under `set -e`;
#   * a failed curl call is reported instead of terminating silently.
upload_config_to_consul() {
    local config_file="$1"
    local uploaded_count=0
    local base_path

    log_info "开始上传配置到 Consul..."

    # The parsed JSON configuration must exist
    if [[ ! -f "$config_file" ]]; then
        log_error "配置文件不存在: $config_file"
        return 1
    fi

    # Oracle Cloud
    if jq -e '.oci_config != null or ((.oci_tenancy_ocid // "") != "")' "$config_file" > /dev/null 2>&1; then
        log_info "上传 Oracle Cloud 配置..."
        base_path="config/${ENVIRONMENT}/oracle"

        _consul_upload_fields "$config_file" "$base_path" \
            tenancy_ocid     '.oci_config.tenancy_ocid // .oci_tenancy_ocid // empty' \
            user_ocid        '.oci_config.user_ocid // .oci_user_ocid // empty' \
            fingerprint      '.oci_config.fingerprint // .oci_fingerprint // empty' \
            compartment_ocid '.oci_config.compartment_ocid // .oci_compartment_ocid // empty' \
            region           '.oci_config.region // .oci_region // "ap-seoul-1"' \
            || return 1

        # The private key is uploaded by content, not by path
        _consul_upload_file "$config_file" "${base_path}/private_key" \
            '.oci_config.private_key_path // .oci_private_key_path // empty' \
            || return 1

        log_success "Oracle Cloud 配置已上传"
    fi

    # Huawei Cloud
    if jq -e '.huawei_config' "$config_file" > /dev/null 2>&1; then
        log_info "上传华为云配置..."
        base_path="config/${ENVIRONMENT}/huawei"

        _consul_upload_fields "$config_file" "$base_path" \
            access_key '.huawei_config.access_key // empty' \
            secret_key '.huawei_config.secret_key // empty' \
            region     '.huawei_config.region // "cn-north-4"' \
            project_id '.huawei_config.project_id // empty' \
            || return 1

        log_success "华为云配置已上传"
    fi

    # AWS
    if jq -e '.aws_config' "$config_file" > /dev/null 2>&1; then
        log_info "上传 AWS 配置..."
        base_path="config/${ENVIRONMENT}/aws"

        _consul_upload_fields "$config_file" "$base_path" \
            access_key '.aws_config.access_key // empty' \
            secret_key '.aws_config.secret_key // empty' \
            region     '.aws_config.region // "ap-northeast-2"' \
            || return 1

        log_success "AWS 配置已上传"
    fi

    # DigitalOcean
    if jq -e '.do_config' "$config_file" > /dev/null 2>&1; then
        log_info "上传 DigitalOcean 配置..."
        base_path="config/${ENVIRONMENT}/digitalocean"

        _consul_upload_fields "$config_file" "$base_path" \
            token  '.do_config.token // empty' \
            region '.do_config.region // "sgp1"' \
            || return 1

        log_success "DigitalOcean 配置已上传"
    fi

    # Google Cloud
    if jq -e '.gcp_config' "$config_file" > /dev/null 2>&1; then
        log_info "上传 Google Cloud 配置..."
        base_path="config/${ENVIRONMENT}/gcp"

        _consul_upload_fields "$config_file" "$base_path" \
            project_id '.gcp_config.project_id // empty' \
            region     '.gcp_config.region // "asia-northeast3"' \
            zone       '.gcp_config.zone // "asia-northeast3-a"' \
            || return 1

        # The credentials file is uploaded by content, not by path
        _consul_upload_file "$config_file" "${base_path}/credentials" \
            '.gcp_config.credentials_file // empty' \
            || return 1

        log_success "Google Cloud 配置已上传"
    fi

    log_success "总共上传了 $uploaded_count 个配置项到 Consul"
}
| 
 | ||||
#######################################
# Print every key stored under config/${ENVIRONMENT} in the Consul KV store.
# Keys whose name looks sensitive (secret/key/token/password) are masked and
# their values are never requested from Consul at all.
# Globals:   CONSUL_ADDR (read), ENVIRONMENT (read)
# Outputs:   a banner plus one "key: value" line per key on stdout
# Returns:   0 (also when no keys were found; a warning is logged instead)
#######################################
list_consul_configs() {
    log_info "列出 Consul 中的配置..."

    local base_path="config/${ENVIRONMENT}"

    echo "=== Consul 中的配置 ==="

    # Declaration is split from the assignment so the pipeline status is
    # not masked by `local`; any failure simply yields an empty key list.
    local keys
    keys=$(curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/?keys" | jq -r '.[]' 2>/dev/null) || keys=""

    if [[ -z "$keys" ]]; then
        log_warning "Consul 中没有找到配置"
        return
    fi

    local key value
    while IFS= read -r key; do
        [[ -n "$key" ]] || continue

        # Mask sensitive entries before touching the network: the value is
        # not needed, so do not pull the secret into this process.
        if [[ "$key" =~ (secret|key|token|password) ]]; then
            echo "$key: [已隐藏]"
            continue
        fi

        # --fail makes HTTP errors (404, 403, ...) reach the fallback
        # instead of printing the error body as if it were the value.
        value=$(curl -sf "${CONSUL_ADDR}/v1/kv/${key}?raw" 2>/dev/null) || value="无法读取"
        echo "$key: $value"
    done <<< "$keys"
}
| 
 | ||||
#######################################
# Interactively delete every KV entry stored under config/${ENVIRONMENT}/.
# Globals:   CONSUL_ADDR (read), ENVIRONMENT (read, must be non-empty)
# Inputs:    a confirmation line on stdin; only "y" or "Y" proceeds
# Returns:   0 when the keys were deleted or the user declined,
#            1 when the DELETE request failed
#######################################
cleanup_consul_configs() {
    # An empty ENVIRONMENT would turn the request below into a recursive
    # delete of the whole config/ tree, so refuse to continue without it.
    : "${ENVIRONMENT:?ENVIRONMENT must be set}"

    log_warning "清理 Consul 配置..."

    local confirm
    read -r -p "确定要删除环境 '$ENVIRONMENT' 的所有配置吗?(y/N): " confirm
    if [[ "$confirm" != [yY] ]]; then
        log_info "操作已取消"
        return
    fi

    local base_path="config/${ENVIRONMENT}"
    # ?recurse is a *prefix* delete. The trailing slash keeps "dev" from
    # also wiping sibling environments such as "dev2" or "development".
    if ! curl -sf -X DELETE "${CONSUL_ADDR}/v1/kv/${base_path}/?recurse" > /dev/null; then
        log_warning "环境 '$ENVIRONMENT' 的配置清理失败"
        return 1
    fi

    log_success "环境 '$ENVIRONMENT' 的配置已清理"
}
| 
 | ||||
#######################################
# Print the usage text for this script.
# Outputs:   the help text on stdout; every "$0" is replaced by the path
#            the script was invoked with, while the ${...} placeholders in
#            the environment-variable section are printed literally
#######################################
show_help() {
    local self="$0"

    # One argument per output line; single-quoted entries must stay literal.
    printf '%s\n' \
        "OpenTofu 密钥上传脚本" \
        "" \
        "用法: ${self} [选项]" \
        "" \
        "选项:" \
        "    upload      上传 terraform.tfvars 中的配置到 Consul" \
        "    list        列出 Consul 中的配置" \
        "    cleanup     清理 Consul 中的配置" \
        "    help        显示此帮助信息" \
        "" \
        "环境变量:" \
        "    CONSUL_ADDR     Consul 地址 (默认: http://localhost:8500)" \
        "    CONSUL_TOKEN    Consul ACL Token (可选)" \
        "    ENVIRONMENT     环境名称 (默认: dev)" \
        '    TOFU_DIR        OpenTofu 目录 (默认: tofu/environments/${ENVIRONMENT})' \
        '    TFVARS_FILE     变量文件路径 (默认: ${TOFU_DIR}/terraform.tfvars)' \
        "" \
        "示例:" \
        "    # 上传配置到 Consul" \
        "    ${self} upload" \
        "    " \
        "    # 列出 Consul 中的配置" \
        "    ${self} list" \
        "    " \
        "    # 清理配置" \
        "    ${self} cleanup" \
        "    " \
        "    # 指定不同环境" \
        "    ENVIRONMENT=production ${self} upload"
}
| 
 | ||||
#######################################
# Entry point: dispatch on the first argument (upload | list | cleanup | help).
# Anything else, including no argument, prints the help text.
# Globals:   TFVARS_FILE (read)
# Arguments: $1 - sub-command name (optional, defaults to "help")
# Returns:   status of the selected action; for "upload", 1 when the
#            parser produced no output or the temp file could not be created
#######################################
main() {
    check_dependencies

    case "${1:-help}" in
        "upload")
            check_consul
            check_tfvars_file

            log_info "解析配置文件: $TFVARS_FILE"

            # The parsed file contains credentials, so use a private,
            # unpredictable temp file (mktemp creates it with mode 0600)
            # instead of a fixed, guessable name under /tmp.
            local parsed_json
            parsed_json=$(mktemp "${TMPDIR:-/tmp}/parsed_config.XXXXXX") || return 1

            # The parser's stdout was never used (it was captured into an
            # unused variable), so it is discarded here. Its exit status was
            # not acted on before either; success is judged by whether it
            # actually produced output.
            manual_parse_tfvars "$TFVARS_FILE" "$parsed_json" > /dev/null || true

            local status=0
            if [[ -s "$parsed_json" ]]; then
                upload_config_to_consul "$parsed_json" || status=$?
            else
                log_warning "配置解析失败,未生成任何内容: $TFVARS_FILE"
                status=1
            fi

            # Remove temp files on success and failure alike.
            # /tmp/temp_config.tf was already being removed here, presumably
            # a by-product of the parser; verify against manual_parse_tfvars.
            rm -f -- "$parsed_json" /tmp/temp_config.tf
            return "$status"
            ;;
        "list")
            check_consul
            list_consul_configs
            ;;
        "cleanup")
            check_consul
            cleanup_consul_configs
            ;;
        "help"|*)
            show_help
            ;;
    esac
}
| 
 | ||||
| main "$@" | ||||
|  | @ -0,0 +1,31 @@ | |||
#!/bin/bash
#
# Verify over SSH that a Nomad node runs with the Podman driver.
#
# Usage:  <script> [user@host]
# Env:    NOMAD_HOST  SSH target used when no argument is given
#
# The target defaults to the node this script was written for, so running
# it without arguments behaves exactly as before.

# SSH target, resolved once: first argument > NOMAD_HOST > built-in default.
NOMAD_HOST="${1:-${NOMAD_HOST:-ben@100.84.197.26}}"

# Run a single command line on the target node.
# Arguments: $1 - command line handed to the remote shell
remote() {
    ssh "$NOMAD_HOST" "$1"
}

echo "=== Nomad Podman Migration Verification ==="
echo

# Check Nomad service status
echo "1. Checking Nomad service status..."
remote "sudo systemctl status nomad --no-pager -l"
echo

# Check Nomad configuration: show the podman section with some context
echo "2. Checking Nomad configuration..."
remote "sudo cat /etc/nomad.d/nomad.hcl | grep -A 10 -B 2 podman"
echo

# Check Podman socket (the fallback message is produced on the remote side)
echo "3. Checking Podman socket..."
remote "ls -la /run/user/*/podman/podman.sock 2>/dev/null || echo 'Podman socket not found'"
echo

# Check Nomad node status; local ssh errors are hidden and replaced by a
# short message
echo "4. Checking Nomad node status..."
remote "sudo -u nomad /usr/local/bin/nomad node status -self | grep -A 10 'Driver Status'" 2>/dev/null || echo "Could not get node status"
echo

# Test Podman functionality as the nomad user
echo "5. Testing Podman as nomad user..."
remote "sudo -u nomad podman version --format '{{.Version}}'" 2>/dev/null || echo "Podman test failed"
echo

echo "=== Verification Complete ==="
|  | @ -1,138 +0,0 @@ | |||
# Local demo stack: Traefik in front of a three-node Consul cluster and two
# sample applications, all attached to one bridge network.
version: '3.8'

services:
  # Traefik load balancer / reverse proxy
  traefik:
    image: traefik:v3.0
    container_name: traefik
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
      - "8080:8080"  # Traefik Dashboard
    volumes:
      # Read-only Docker socket for label discovery
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./traefik.yml:/etc/traefik/traefik.yml:ro
      - ./certs:/certs:ro
    environment:
      - CONSUL_ENDPOINTS=consul1:8500,consul2:8500,consul3:8500
    depends_on:
      - consul1
      - consul2
      - consul3
    networks:
      - traefik-net
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.dashboard.rule=Host(`traefik.local`)"
      - "traefik.http.routers.dashboard.service=api@internal"

  # Consul cluster node 1 (the only node that publishes port 8500)
  consul1:
    # Consul 1.16+ is published only as hashicorp/consul; the Docker library
    # image "consul" stops at 1.15.x, so the old reference cannot be pulled.
    image: hashicorp/consul:1.16.1
    container_name: consul1
    hostname: consul1
    restart: unless-stopped
    ports:
      - "8500:8500"
    volumes:
      - consul1_data:/consul/data
    # "-ui" is the CLI switch for the web UI; ui_config exists only as a
    # configuration-file block, not as a command-line flag.
    command: >
      consul agent -server -bootstrap-expect=3
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul1 -bind=0.0.0.0 -client=0.0.0.0
      -retry-join=consul2 -retry-join=consul3
      -ui
      -log-level=INFO
    networks:
      - traefik-net
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.consul.rule=Host(`consul.local`)"
      - "traefik.http.services.consul.loadbalancer.server.port=8500"

  # Consul cluster node 2
  consul2:
    image: hashicorp/consul:1.16.1
    container_name: consul2
    hostname: consul2
    restart: unless-stopped
    volumes:
      - consul2_data:/consul/data
    command: >
      consul agent -server -bootstrap-expect=3
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul2 -bind=0.0.0.0 -client=0.0.0.0
      -retry-join=consul1 -retry-join=consul3
      -log-level=INFO
    networks:
      - traefik-net

  # Consul cluster node 3
  consul3:
    image: hashicorp/consul:1.16.1
    container_name: consul3
    hostname: consul3
    restart: unless-stopped
    volumes:
      - consul3_data:/consul/data
    command: >
      consul agent -server -bootstrap-expect=3
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul3 -bind=0.0.0.0 -client=0.0.0.0
      -retry-join=consul1 -retry-join=consul2
      -log-level=INFO
    networks:
      - traefik-net

  # Sample web application (static content behind basic auth)
  web-app:
    image: nginx:alpine
    container_name: web-app
    restart: unless-stopped
    volumes:
      - ./web-content:/usr/share/nginx/html:ro
    environment:
      - CONSUL_URL=http://consul1:8500
    networks:
      - traefik-net
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.web.rule=Host(`app.local`)"
      - "traefik.http.services.web.loadbalancer.server.port=80"
      - "traefik.http.routers.web.middlewares=web-auth"
      # "$$" is Compose escaping for a literal "$" in the password hash.
      # NOTE(review): the hash is a placeholder and must be replaced.
      - "traefik.http.middlewares.web-auth.basicauth.users=admin:$$2y$$10$$..."

  # Sample API application
  api-app:
    image: node:18-alpine
    container_name: api-app
    restart: unless-stopped
    working_dir: /app
    volumes:
      - ./api:/app
    command: ["node", "server.js"]
    environment:
      - CONSUL_URL=http://consul1:8500
      - NODE_ENV=production
    networks:
      - traefik-net
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.api.rule=Host(`api.local`) && PathPrefix(`/api`)"
      - "traefik.http.services.api.loadbalancer.server.port=3000"
      - "traefik.http.routers.api.middlewares=api-cors"
      - "traefik.http.middlewares.api-cors.headers.accesscontrolallowmethods=GET,POST,PUT,DELETE"
      # The headers middleware option is accessControlAllowOriginList; the
      # singular accessControlAllowOrigin is not accepted by Traefik v3.
      - "traefik.http.middlewares.api-cors.headers.accesscontrolalloworiginlist=*"

volumes:
  consul1_data:
  consul2_data:
  consul3_data:

networks:
  traefik-net:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/16
|  | @ -1,60 +0,0 @@ | |||
# Traefik static configuration
api:
  dashboard: true
  insecure: true  # development only; serve the dashboard over HTTPS in production

# Entry points
entryPoints:
  web:
    address: ":80"
  websecure:
    address: ":443"

# Providers
providers:
  # Discovery through Docker container labels
  docker:
    endpoint: "unix:///var/run/docker.sock"
    exposedByDefault: false
    watch: true

  # Consul KV store provider: reads dynamic configuration from Consul keys.
  # This is not catalog/service discovery (that would be "consulCatalog").
  # The KV provider has no "watch" option, and Traefik rejects unknown
  # static-configuration fields at start-up, so the key is not set here.
  consul:
    endpoints:
      - "consul1:8500"
      - "consul2:8500"
      - "consul3:8500"

  # File provider for additional dynamic configuration
  file:
    filename: /etc/traefik/dynamic.yml
    watch: true

# Certificate resolver (Let's Encrypt, HTTP-01 challenge on entry point "web")
certificatesResolvers:
  letsencrypt:
    acme:
      email: admin@example.com
      storage: /certs/acme.json
      httpChallenge:
        entryPoint: web

# Logging
log:
  level: INFO
  filePath: "/var/log/traefik.log"

accessLog:
  filePath: "/var/log/access.log"

# Metrics
metrics:
  prometheus:
    addEntryPointsLabels: true
    addServicesLabels: true

# Global settings
global:
  checkNewVersion: false
  sendAnonymousUsage: false
|  | @ -1,184 +0,0 @@ | |||
#!/bin/bash

# Docker Swarm management script
set -euo pipefail

# ANSI colour sequences, stored as backslash escapes that the log helpers
# expand when printing. They are constants, so mark them read-only.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
| 
 | ||||
# Logging helpers.
#
# Each prints "<colour>[LEVEL]<reset> message" for the message in $1.
# Only the colour variables are escape-expanded (%b); the message itself is
# printed verbatim (%s), so backslashes in file names or command output are
# not mangled. A missing argument prints an empty message instead of
# tripping `set -u`.
# Globals: BLUE, GREEN, YELLOW, RED, NC (read)

# Informational message on stdout.
log_info() {
    printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1-}"
}

# Success message on stdout.
log_success() {
    printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "${1-}"
}

# Warning on stderr, so it never pollutes captured command output.
log_warning() {
    printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "${1-}" >&2
}

# Error on stderr.
log_error() {
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}" >&2
}
| 
 | ||||
#######################################
# Report whether the local Docker engine is an active swarm member.
# Outputs:   a success or error log line
# Returns:   0 when LocalNodeState is exactly "active", 1 otherwise
#            (including when `docker info` itself fails)
#######################################
check_swarm_mode() {
    local state
    state=$(docker info --format '{{.Swarm.LocalNodeState}}') || state=""

    # Compare the whole value: a substring match such as `grep -q active`
    # also accepts "inactive", i.e. reports a node that is NOT in a swarm
    # as active.
    if [[ "$state" == "active" ]]; then
        log_success "Docker Swarm 模式已激活"
        return 0
    else
        log_error "Docker Swarm 模式未激活"
        return 1
    fi
}
| 
 | ||||
#######################################
# Initialise a new swarm on this host and print the worker join command.
# Returns:   1 when `docker swarm init` fails, otherwise the status of
#            `docker swarm join-token worker`
#######################################
init_swarm() {
    log_info "初始化 Docker Swarm..."

    # Bail out early on failure so the happy path reads straight down.
    if ! docker swarm init; then
        log_error "Docker Swarm 初始化失败"
        return 1
    fi

    log_success "Docker Swarm 初始化成功"
    log_info "要添加工作节点,请在其他主机上运行:"
    docker swarm join-token worker
}
| 
 | ||||
#######################################
# Deploy (or update) a stack from a compose file.
# Arguments: $1 - stack name
#            $2 - path to the compose file
# Returns:   0 on success, 1 when the file is missing or the deploy fails
#######################################
deploy_stack() {
    local name="$1"
    local file="$2"

    log_info "部署堆栈: $name"

    # Guard: the compose file has to exist before docker is invoked.
    [[ -f "$file" ]] || {
        log_error "Compose 文件不存在: $file"
        return 1
    }

    if ! docker stack deploy -c "$file" "$name"; then
        log_error "堆栈 $name 部署失败"
        return 1
    fi

    log_success "堆栈 $name 部署成功"
}
| 
 | ||||
#######################################
# Show all stacks currently deployed on the swarm.
# Outputs:   a heading line followed by the `docker stack ls` table
# Returns:   status of `docker stack ls`
#######################################
list_stacks() {
    log_info "当前部署的堆栈:"

    docker stack ls
}
| 
 | ||||
#######################################
# List the services that belong to one stack.
# Arguments: $1 - stack name
# Returns:   status of `docker stack services`
#######################################
show_stack_services() {
    local stack="$1"

    log_info "堆栈 $stack 的服务:"

    docker stack services "$stack"
}
| 
 | ||||
#######################################
# Remove a stack from the swarm.
# Arguments: $1 - stack name
# Returns:   0 on success, 1 when `docker stack rm` fails
#######################################
remove_stack() {
    local stack="$1"

    log_info "删除堆栈: $stack"

    if ! docker stack rm "$stack"; then
        log_error "堆栈 $stack 删除失败"
        return 1
    fi

    log_success "堆栈 $stack 删除成功"
}
| 
 | ||||
#######################################
# Show the nodes that are part of the swarm.
# Outputs:   a heading line followed by the `docker node ls` table
# Returns:   status of `docker node ls`
#######################################
show_nodes() {
    log_info "Swarm 节点信息:"

    docker node ls
}
| 
 | ||||
#######################################
# Print the usage text: available commands followed by example invocations.
# Outputs:   help text on stdout; "$0" expands to the invoked script path
#######################################
show_help() {
    # The here-document body starts at column 0 on purpose: any indentation
    # would become part of the printed text.
    cat << EOF
Docker Swarm 管理脚本

用法: $0 [命令] [参数]

命令:
  init                          - 初始化 Docker Swarm
  deploy <stack> <compose-file> - 部署堆栈
  list                          - 列出所有堆栈
  services <stack>              - 查看堆栈服务
  remove <stack>                - 删除堆栈
  nodes                         - 显示节点信息
  check                         - 检查 Swarm 状态
  help                          - 显示此帮助信息

示例:
  $0 init                                    # 初始化 Swarm
  $0 deploy traefik stacks/traefik-swarm-stack.yml
  $0 deploy demo stacks/demo-services-stack.yml
  $0 list                                    # 列出堆栈
  $0 services traefik                       # 查看 traefik 堆栈服务
EOF
}
| 
 | ||||
#######################################
# Entry point: run the sub-command named by the first argument.
# Processing happens in three phases: argument-count validation, the swarm
# pre-check for commands that need an active swarm, and finally the action.
# Arguments: $1   - command (defaults to "help"; unknown commands show help)
#            $2.. - command-specific arguments
# Returns:   status of the selected action; exits 1 on missing arguments
#            or when the swarm pre-check fails
#######################################
main() {
    local cmd="${1:-help}"

    # Phase 1: commands with mandatory arguments.
    case "$cmd" in
        deploy)
            if (( $# < 3 )); then
                log_error "部署命令需要堆栈名称和 compose 文件"
                echo "用法: $0 deploy <stack-name> <compose-file>"
                exit 1
            fi
            ;;
        services|remove)
            if (( $# < 2 )); then
                log_error "需要指定堆栈名称"
                echo "用法: $0 $cmd <stack-name>"
                exit 1
            fi
            ;;
    esac

    # Phase 2: everything except init/check/help requires an active swarm.
    case "$cmd" in
        deploy|list|services|remove|nodes)
            check_swarm_mode || exit 1
            ;;
    esac

    # Phase 3: perform the action.
    case "$cmd" in
        init)     init_swarm ;;
        deploy)   deploy_stack "$2" "$3" ;;
        list)     list_stacks ;;
        services) show_stack_services "$2" ;;
        remove)   remove_stack "$2" ;;
        nodes)    show_nodes ;;
        check)    check_swarm_mode ;;
        *)        show_help ;;
    esac
}
| 
 | ||||
| # 运行主函数 | ||||
| main "$@" | ||||
|  | @ -1,41 +0,0 @@ | |||
# Swarm stack: one Consul server pinned to node "ash3c". It binds to the
# container IP, advertises 100.116.80.94 and joins the server at
# 100.117.106.136.
version: '3.8'

services:
  consul:
    # Pinned HashiCorp image: the Docker library image "consul" is no longer
    # updated (it stops at 1.15.x) and ":latest" makes deployments
    # unreproducible.
    image: hashicorp/consul:1.16.1
    hostname: consul-ash3c
    # "$$" is Compose escaping for a literal "$" evaluated inside the container.
    command: >
      sh -c "
      IP=$$(hostname -i | awk '{print $$1}');
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-ash3c -bind=$$IP -advertise=100.116.80.94 -client=0.0.0.0
      -retry-join=100.117.106.136
      -ui
      -log-level=INFO
      "
    ports:
      - "8500:8500"
      - "8600:8600/udp"
    volumes:
      - consul_data:/consul/data
    networks:
      - consul-net
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == ash3c
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

volumes:
  consul_data:

networks:
  consul-net:
    driver: overlay
    attachable: true
|  | @ -1,76 +0,0 @@ | |||
# Swarm stack: two Consul servers (nodes "master" and "ash3c") on a shared
# overlay network; the ash3c server joins via the alias "consul-master".
version: '3.8'

services:
  consul-master:
    # Pinned HashiCorp image: the Docker library image "consul" is no longer
    # updated (it stops at 1.15.x) and ":latest" makes deployments
    # unreproducible.
    image: hashicorp/consul:1.16.1
    hostname: consul-master
    # "$$" is Compose escaping for a literal "$" evaluated inside the container.
    command: >
      sh -c "
      IP=$$(hostname -i | awk '{print $$1}');
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0
      -ui
      -log-level=INFO
      "
    ports:
      - "8500:8500"
      - "8600:8600/udp"
    volumes:
      - consul_master_data:/consul/data
    networks:
      consul-net:
        aliases:
          - consul-master
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == master
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  consul-ash3c:
    image: hashicorp/consul:1.16.1
    hostname: consul-ash3c
    command: >
      sh -c "
      IP=$$(hostname -i | awk '{print $$1}');
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-ash3c -bind=$$IP -advertise=$$IP -client=0.0.0.0
      -retry-join=consul-master
      -ui
      -log-level=INFO
      "
    # Host ports are shifted by one so both servers can publish their
    # HTTP and DNS ports through the swarm at the same time.
    ports:
      - "8501:8500"
      - "8601:8600/udp"
    volumes:
      - consul_ash3c_data:/consul/data
    networks:
      consul-net:
        aliases:
          - consul-ash3c
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == ash3c
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

volumes:
  consul_master_data:
  consul_ash3c_data:

networks:
  consul-net:
    driver: overlay
    attachable: true
|  | @ -1,68 +0,0 @@ | |||
# Swarm stack: two Consul servers addressed by fixed/host IPs, with no
# dedicated stack network. The master binds and advertises 100.117.106.136;
# the ash3c server resolves the name "ash3c" at start-up and joins the master.
version: '3.8'

services:
  consul-master:
    # Pinned HashiCorp image: the Docker library image "consul" is no longer
    # updated (it stops at 1.15.x) and ":latest" makes deployments
    # unreproducible.
    image: hashicorp/consul:1.16.1
    hostname: consul-master
    command: >
      sh -c "
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-master -bind=100.117.106.136 -advertise=100.117.106.136 -client=0.0.0.0
      -ui
      -log-level=INFO
      "
    ports:
      - "8500:8500"
      - "8600:8600/udp"
      - "8301:8301"
      - "8302:8302"
    volumes:
      - consul_master_data:/consul/data
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == master
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  consul-ash3c:
    image: hashicorp/consul:1.16.1
    hostname: consul-ash3c
    # "$$" is Compose escaping for a literal "$" evaluated inside the container.
    command: >
      sh -c "
      ASH3C_IP=$$(getent hosts ash3c | awk '{print $$1}');
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-ash3c -bind=$$ASH3C_IP -advertise=$$ASH3C_IP -client=0.0.0.0
      -retry-join=100.117.106.136
      -ui
      -log-level=INFO
      "
    ports:
      - "8501:8500"
      - "8601:8600/udp"
      - "8311:8301"
      - "8312:8302"
    volumes:
      - consul_ash3c_data:/consul/data
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == ash3c
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
    # No depends_on here: `docker stack deploy` ignores it, so it never
    # ordered start-up. -retry-join handles the master coming up later.

volumes:
  consul_master_data:
  consul_ash3c_data:
|  | @ -1,78 +0,0 @@ | |||
# Swarm stack: two Consul servers on an overlay network; the ash3c server
# joins the fixed address 10.0.5.5.
# NOTE(review): 10.0.5.5 is presumably the master's overlay address; nothing
# in this file pins that address, so confirm it is stable.
version: '3.8'

services:
  consul-master:
    # Pinned HashiCorp image: the Docker library image "consul" is no longer
    # updated (it stops at 1.15.x) and ":latest" makes deployments
    # unreproducible.
    image: hashicorp/consul:1.16.1
    hostname: consul-master
    # "$$" is Compose escaping for a literal "$" evaluated inside the container.
    command: >
      sh -c "
      IP=$$(hostname -i | awk '{print $$1}');
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0
      -ui
      -log-level=INFO
      "
    ports:
      - "8500:8500"
      - "8600:8600/udp"
    volumes:
      - consul_master_data:/consul/data
    networks:
      consul-net:
        aliases:
          - consul-master
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == master
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  consul-ash3c:
    image: hashicorp/consul:1.16.1
    hostname: consul-ash3c
    command: >
      sh -c "
      IP=$$(hostname -i | awk '{print $$1}');
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-ash3c -bind=$$IP -advertise=$$IP -client=0.0.0.0
      -retry-join=10.0.5.5
      -ui
      -log-level=INFO
      "
    ports:
      - "8501:8500"
      - "8601:8600/udp"
    volumes:
      - consul_ash3c_data:/consul/data
    networks:
      consul-net:
        aliases:
          - consul-ash3c
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == ash3c
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
    # No depends_on here: `docker stack deploy` ignores it, so it never
    # ordered start-up. -retry-join handles the master coming up later.

volumes:
  consul_master_data:
  consul_ash3c_data:

networks:
  consul-net:
    driver: overlay
    attachable: true
|  | @ -1,78 +0,0 @@ | |||
# Swarm stack: two Consul servers with static LAN addresses on a macvlan
# network (192.168.1.100 on "master", 192.168.1.101 on "ash3c").
# NOTE(review): static `ipv4_address` assignments and node-local macvlan
# networks may not be honoured by `docker stack deploy`; verify on the
# target engine.
version: '3.8'

services:
  consul-master:
    # Pinned HashiCorp image: the Docker library image "consul" is no longer
    # updated (it stops at 1.15.x) and ":latest" makes deployments
    # unreproducible.
    image: hashicorp/consul:1.16.1
    hostname: consul-master
    command: >
      sh -c "
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-master -bind=192.168.1.100 -advertise=192.168.1.100 -client=0.0.0.0
      -ui
      -log-level=INFO
      "
    ports:
      - "8500:8500"
      - "8600:8600/udp"
    volumes:
      - consul_master_data:/consul/data
    networks:
      consul-macvlan:
        ipv4_address: 192.168.1.100
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == master
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  consul-ash3c:
    image: hashicorp/consul:1.16.1
    hostname: consul-ash3c
    command: >
      sh -c "
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-ash3c -bind=192.168.1.101 -advertise=192.168.1.101 -client=0.0.0.0
      -retry-join=192.168.1.100
      -ui
      -log-level=INFO
      "
    ports:
      - "8501:8500"
      - "8601:8600/udp"
    volumes:
      - consul_ash3c_data:/consul/data
    networks:
      consul-macvlan:
        ipv4_address: 192.168.1.101
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == ash3c
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

volumes:
  consul_master_data:
  consul_ash3c_data:

networks:
  consul-macvlan:
    driver: macvlan
    driver_opts:
      parent: eth0  # adjust to the host's network interface
    ipam:
      config:
        - subnet: 192.168.1.0/24
          gateway: 192.168.1.1
          ip_range: 192.168.1.100/30  # hand out only .100-.103
|  | @ -1,76 +0,0 @@ | |||
# Swarm stack: two Consul servers on an overlay network; the ash3c server
# joins through the name "consul-cluster_consul-master".
# NOTE(review): that name looks like <stack>_<service>, which would require
# deploying this stack as "consul-cluster"; confirm.
version: '3.8'

services:
  consul-master:
    # Pinned HashiCorp image: the Docker library image "consul" is no longer
    # updated (it stops at 1.15.x) and ":latest" makes deployments
    # unreproducible.
    image: hashicorp/consul:1.16.1
    hostname: consul-master
    # "$$" is Compose escaping for a literal "$" evaluated inside the container.
    command: >
      sh -c "
      IP=$$(hostname -i | awk '{print $$1}');
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0
      -ui
      -log-level=INFO
      "
    ports:
      - "8500:8500"
      - "8600:8600/udp"
    volumes:
      - consul_master_data:/consul/data
    networks:
      consul-net:
        aliases:
          - consul-master
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == master
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  consul-ash3c:
    image: hashicorp/consul:1.16.1
    hostname: consul-ash3c
    command: >
      sh -c "
      IP=$$(hostname -i | awk '{print $$1}');
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-ash3c -bind=$$IP -advertise=$$IP -client=0.0.0.0
      -retry-join=consul-cluster_consul-master
      -ui
      -log-level=INFO
      "
    ports:
      - "8501:8500"
      - "8601:8600/udp"
    volumes:
      - consul_ash3c_data:/consul/data
    networks:
      consul-net:
        aliases:
          - consul-ash3c
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == ash3c
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

volumes:
  consul_master_data:
  consul_ash3c_data:

networks:
  consul-net:
    driver: overlay
    attachable: true
|  | @ -1,40 +0,0 @@ | |||
# Swarm stack: one Consul server pinned to node "master". It binds to the
# container IP and advertises 100.117.106.136; with -bootstrap-expect=2 it
# waits for a second server before a leader is elected.
version: '3.8'

services:
  consul:
    # Pinned HashiCorp image: the Docker library image "consul" is no longer
    # updated (it stops at 1.15.x) and ":latest" makes deployments
    # unreproducible.
    image: hashicorp/consul:1.16.1
    hostname: consul-master
    # "$$" is Compose escaping for a literal "$" evaluated inside the container.
    command: >
      sh -c "
      IP=$$(hostname -i | awk '{print $$1}');
      consul agent -server -bootstrap-expect=2
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul-master -bind=$$IP -advertise=100.117.106.136 -client=0.0.0.0
      -ui
      -log-level=INFO
      "
    ports:
      - "8500:8500"
      - "8600:8600/udp"
    volumes:
      - consul_data:/consul/data
    networks:
      - consul-net
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == master
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

volumes:
  consul_data:

networks:
  consul-net:
    driver: overlay
    attachable: true
|  | @ -1,39 +0,0 @@ | |||
# Swarm stack: a single self-bootstrapping Consul server
# (-bootstrap-expect=1) exposed through Traefik as consul.local.
version: '3.8'

services:
  consul:
    # Pinned HashiCorp image: the Docker library image "consul" is no longer
    # updated (it stops at 1.15.x) and ":latest" makes deployments
    # unreproducible.
    image: hashicorp/consul:1.16.1
    hostname: consul
    command: >
      consul agent -server -bootstrap-expect=1
      -datacenter=dc1 -data-dir=/consul/data
      -node=consul -client=0.0.0.0
      -ui
      -log-level=INFO
    ports:
      - "8500:8500"
      - "8600:8600/udp"
    volumes:
      - consul_data:/consul/data
    networks:
      - consul-net
    deploy:
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
      # These are service-level labels (under `deploy`), not container labels.
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.consul.rule=Host(`consul.local`)"
        - "traefik.http.services.consul.loadbalancer.server.port=8500"
        - "traefik.docker.network=consul-net"

volumes:
  consul_data:

networks:
  consul-net:
    driver: overlay
    attachable: true
|  | @ -1,40 +0,0 @@ | |||
| version: '3.8' | ||||
| 
 | ||||
| services: | ||||
|   consul: | ||||
|     image: consul:latest | ||||
|     hostname: consul-master | ||||
|     command: > | ||||
|       sh -c " | ||||
|       IP=$$(hostname -i | awk '{print $$1}'); | ||||
|       consul agent -server -bootstrap-expect=1 | ||||
|       -datacenter=dc1 -data-dir=/consul/data | ||||
|       -node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0 | ||||
|       -ui | ||||
|       -log-level=INFO | ||||
|       " | ||||
|     ports: | ||||
|       - "8500:8500" | ||||
|       - "8600:8600/udp" | ||||
|     volumes: | ||||
|       - consul_data:/consul/data | ||||
|     networks: | ||||
|       - consul-net | ||||
|     deploy: | ||||
|       mode: replicated | ||||
|       replicas: 1 | ||||
|       placement: | ||||
|         constraints: | ||||
|           - node.hostname == master | ||||
|       restart_policy: | ||||
|         condition: on-failure | ||||
|         delay: 5s | ||||
|         max_attempts: 3 | ||||
| 
 | ||||
| volumes: | ||||
|   consul_data: | ||||
| 
 | ||||
| networks: | ||||
|   consul-net: | ||||
|     driver: overlay | ||||
|     attachable: true | ||||
|  | @ -1,166 +0,0 @@ | |||
| version: '3.8' | ||||
| 
 | ||||
| services: | ||||
|   # Web 应用示例 | ||||
|   webapp: | ||||
|     image: nginx:alpine | ||||
|     networks: | ||||
|       - traefik-public | ||||
|     configs: | ||||
|       - source: webapp-html | ||||
|         target: /usr/share/nginx/html/index.html | ||||
|     deploy: | ||||
|       replicas: 2 | ||||
|       labels: | ||||
|         - traefik.enable=true | ||||
|         - traefik.http.routers.webapp.rule=Host(`app.local`) | ||||
|         - traefik.http.routers.webapp.entrypoints=web | ||||
|         - traefik.http.services.webapp.loadbalancer.server.port=80 | ||||
|       update_config: | ||||
|         parallelism: 1 | ||||
|         delay: 10s | ||||
|       restart_policy: | ||||
|         condition: on-failure | ||||
| 
 | ||||
|   # API 服务示例 | ||||
|   api: | ||||
|     image: httpd:alpine | ||||
|     networks: | ||||
|       - traefik-public | ||||
|     configs: | ||||
|       - source: api-html | ||||
|         target: /usr/local/apache2/htdocs/index.html | ||||
|     deploy: | ||||
|       replicas: 2 | ||||
|       labels: | ||||
|         - traefik.enable=true | ||||
|         - traefik.http.routers.api.rule=Host(`api.local`) | ||||
|         - traefik.http.routers.api.entrypoints=web | ||||
|         - traefik.http.services.api.loadbalancer.server.port=80 | ||||
|         # 添加路径前缀 | ||||
|         - traefik.http.routers.api-path.rule=Host(`app.local`) && PathPrefix(`/api`) | ||||
|         - traefik.http.routers.api-path.entrypoints=web | ||||
|         - traefik.http.routers.api-path.service=api | ||||
|       update_config: | ||||
|         parallelism: 1 | ||||
|         delay: 10s | ||||
|       restart_policy: | ||||
|         condition: on-failure | ||||
| 
 | ||||
|   # 监控服务示例 | ||||
|   monitor: | ||||
|     image: nginx:alpine | ||||
|     networks: | ||||
|       - traefik-public | ||||
|     configs: | ||||
|       - source: monitor-html | ||||
|         target: /usr/share/nginx/html/index.html | ||||
|     deploy: | ||||
|       replicas: 1 | ||||
|       labels: | ||||
|         - traefik.enable=true | ||||
|         - traefik.http.routers.monitor.rule=Host(`monitor.local`) | ||||
|         - traefik.http.routers.monitor.entrypoints=web | ||||
|         - traefik.http.services.monitor.loadbalancer.server.port=80 | ||||
|         # 添加基本认证 (可选) | ||||
|         - traefik.http.routers.monitor.middlewares=auth | ||||
|         - traefik.http.middlewares.auth.basicauth.users=admin:$$2y$$10$$DLKjKQKQKQKQKQKQKQKQKe | ||||
|       restart_policy: | ||||
|         condition: on-failure | ||||
| 
 | ||||
| networks: | ||||
|   traefik-public: | ||||
|     external: true | ||||
| 
 | ||||
| configs: | ||||
|   webapp-html: | ||||
|     content: | | ||||
|       <!DOCTYPE html> | ||||
|       <html> | ||||
|       <head> | ||||
|           <title>Web App - Traefik Swarm Demo</title> | ||||
|           <style> | ||||
|               body { font-family: Arial, sans-serif; margin: 40px; background: #f5f5f5; } | ||||
|               .container { background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); } | ||||
|               h1 { color: #2c3e50; } | ||||
|               .info { background: #e8f4fd; padding: 15px; border-radius: 4px; margin: 20px 0; } | ||||
|           </style> | ||||
|       </head> | ||||
|       <body> | ||||
|           <div class="container"> | ||||
|               <h1>🚀 Web Application</h1> | ||||
|               <div class="info"> | ||||
|                   <p><strong>服务:</strong> webapp</p> | ||||
|                   <p><strong>访问地址:</strong> http://app.local</p> | ||||
|                   <p><strong>负载均衡:</strong> Traefik + Docker Swarm</p> | ||||
|                   <p><strong>时间:</strong> <span id="time"></span></p> | ||||
|               </div> | ||||
|               <p>这是通过 Traefik 路由的 Web 应用示例。</p> | ||||
|           </div> | ||||
|           <script> | ||||
|               document.getElementById('time').textContent = new Date().toLocaleString(); | ||||
|           </script> | ||||
|       </body> | ||||
|       </html> | ||||
| 
 | ||||
|   api-html: | ||||
|     content: | | ||||
|       <!DOCTYPE html> | ||||
|       <html> | ||||
|       <head> | ||||
|           <title>API Service - Traefik Swarm Demo</title> | ||||
|           <style> | ||||
|               body { font-family: Arial, sans-serif; margin: 40px; background: #f5f5f5; } | ||||
|               .container { background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); } | ||||
|               h1 { color: #27ae60; } | ||||
|               .info { background: #e8f8f5; padding: 15px; border-radius: 4px; margin: 20px 0; } | ||||
|           </style> | ||||
|       </head> | ||||
|       <body> | ||||
|           <div class="container"> | ||||
|               <h1>🔌 API Service</h1> | ||||
|               <div class="info"> | ||||
|                   <p><strong>服务:</strong> api</p> | ||||
|                   <p><strong>访问地址:</strong> http://api.local</p> | ||||
|                   <p><strong>路径路由:</strong> http://app.local/api</p> | ||||
|                   <p><strong>负载均衡:</strong> Traefik + Docker Swarm</p> | ||||
|                   <p><strong>时间:</strong> <span id="time"></span></p> | ||||
|               </div> | ||||
|               <p>这是通过 Traefik 路由的 API 服务示例。</p> | ||||
|           </div> | ||||
|           <script> | ||||
|               document.getElementById('time').textContent = new Date().toLocaleString(); | ||||
|           </script> | ||||
|       </body> | ||||
|       </html> | ||||
| 
 | ||||
|   monitor-html: | ||||
|     content: | | ||||
|       <!DOCTYPE html> | ||||
|       <html> | ||||
|       <head> | ||||
|           <title>Monitor Service - Traefik Swarm Demo</title> | ||||
|           <style> | ||||
|               body { font-family: Arial, sans-serif; margin: 40px; background: #f5f5f5; } | ||||
|               .container { background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); } | ||||
|               h1 { color: #e74c3c; } | ||||
|               .info { background: #fdf2e9; padding: 15px; border-radius: 4px; margin: 20px 0; } | ||||
|           </style> | ||||
|       </head> | ||||
|       <body> | ||||
|           <div class="container"> | ||||
|               <h1>📊 Monitor Service</h1> | ||||
|               <div class="info"> | ||||
|                   <p><strong>服务:</strong> monitor</p> | ||||
|                   <p><strong>访问地址:</strong> http://monitor.local</p> | ||||
|                   <p><strong>认证:</strong> 基本认证保护</p> | ||||
|                   <p><strong>负载均衡:</strong> Traefik + Docker Swarm</p> | ||||
|                   <p><strong>时间:</strong> <span id="time"></span></p> | ||||
|               </div> | ||||
|               <p>这是通过 Traefik 路由的监控服务示例。</p> | ||||
|           </div> | ||||
|           <script> | ||||
|               document.getElementById('time').textContent = new Date().toLocaleString(); | ||||
|           </script> | ||||
|       </body> | ||||
|       </html> | ||||
|  | @ -1,70 +0,0 @@ | |||
| version: '3.8' | ||||
| 
 | ||||
| services: | ||||
|   traefik: | ||||
|     image: traefik:v3.0 | ||||
|     command: | ||||
|       # API 和 Dashboard | ||||
|       - --api.dashboard=true | ||||
|       - --api.insecure=true | ||||
|        | ||||
|       # 入口点 | ||||
|       - --entrypoints.web.address=:80 | ||||
|       - --entrypoints.websecure.address=:443 | ||||
|        | ||||
|       # Docker Swarm Provider | ||||
|       - --providers.swarm=true | ||||
|       - --providers.swarm.endpoint=unix:///var/run/docker.sock | ||||
|       - --providers.swarm.exposedByDefault=false | ||||
|       - --providers.swarm.network=traefik-public | ||||
|        | ||||
|       # 日志 | ||||
|       - --log.level=INFO | ||||
|       - --accesslog=true | ||||
|        | ||||
|       # 指标 | ||||
|       - --metrics.prometheus=true | ||||
|       - --metrics.prometheus.addEntryPointsLabels=true | ||||
|       - --metrics.prometheus.addServicesLabels=true | ||||
|        | ||||
|       # 证书解析器 (可选) | ||||
|       - --certificatesresolvers.letsencrypt.acme.httpchallenge=true | ||||
|       - --certificatesresolvers.letsencrypt.acme.httpchallenge.entrypoint=web | ||||
|       - --certificatesresolvers.letsencrypt.acme.email=admin@example.com | ||||
|       - --certificatesresolvers.letsencrypt.acme.storage=/certificates/acme.json | ||||
|      | ||||
|     ports: | ||||
|       - "80:80" | ||||
|       - "443:443" | ||||
|       - "8080:8080"  # Dashboard | ||||
|      | ||||
|     volumes: | ||||
|       - /var/run/docker.sock:/var/run/docker.sock:ro | ||||
|       - traefik-certificates:/certificates | ||||
|      | ||||
|     networks: | ||||
|       - traefik-public | ||||
|      | ||||
|     deploy: | ||||
|       mode: global | ||||
|       placement: | ||||
|         constraints: | ||||
|           - node.role == manager | ||||
|       labels: | ||||
|         # Traefik Dashboard 路由 | ||||
|         - traefik.enable=true | ||||
|         - traefik.http.routers.traefik-dashboard.rule=Host(`traefik.local`) | ||||
|         - traefik.http.routers.traefik-dashboard.service=api@internal | ||||
|         - traefik.http.services.traefik-dashboard.loadbalancer.server.port=8080 | ||||
|       update_config: | ||||
|         parallelism: 1 | ||||
|         delay: 10s | ||||
|       restart_policy: | ||||
|         condition: on-failure | ||||
| 
 | ||||
| networks: | ||||
|   traefik-public: | ||||
|     external: true | ||||
| 
 | ||||
| volumes: | ||||
|   traefik-certificates: | ||||
|  | @ -23,17 +23,16 @@ apt-get install -y \ | |||
|     wget \ | ||||
|     unzip \ | ||||
|     jq \ | ||||
|     docker.io \ | ||||
|     docker-compose \ | ||||
|     podman \ | ||||
|     htop \ | ||||
|     net-tools \ | ||||
|     vim | ||||
| 
 | ||||
| # 启动 Docker | ||||
| log "启动 Docker 服务..." | ||||
| systemctl enable docker | ||||
| systemctl start docker | ||||
| usermod -aG docker ubuntu | ||||
| # 启动 Podman | ||||
| log "启动 Podman 服务..." | ||||
| systemctl enable podman | ||||
| systemctl start podman | ||||
| usermod -aG podman ubuntu | ||||
| 
 | ||||
| # 安装 Nomad | ||||
| log "安装 Nomad ${nomad_version}..." | ||||
|  | @ -85,8 +84,8 @@ server { | |||
| client { | ||||
|   enabled = true | ||||
|    | ||||
|   host_volume "docker-sock" { | ||||
|     path      = "/var/run/docker.sock" | ||||
|   host_volume "podman-sock" { | ||||
|     path      = "/run/podman/podman.sock" | ||||
|     read_only = false | ||||
|   } | ||||
| } | ||||
|  | @ -108,9 +107,8 @@ ports { | |||
|   serf = 4648 | ||||
| } | ||||
| 
 | ||||
| plugin "docker" { | ||||
| plugin "podman" { | ||||
|   config { | ||||
|     allow_privileged = true | ||||
|     volumes { | ||||
|       enabled = true | ||||
|     } | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue