From d0e7f64c1db1ba4f2fca36ed8c3c29bc93eb74e5 Mon Sep 17 00:00:00 2001 From: Houzhong Xu Date: Wed, 24 Sep 2025 03:46:30 +0000 Subject: [PATCH] =?UTF-8?q?feat(=E7=9B=91=E6=8E=A7):=20=E6=B7=BB=E5=8A=A0T?= =?UTF-8?q?elegraf=E7=9B=91=E6=8E=A7=E9=85=8D=E7=BD=AE=E5=92=8C=E7=A3=81?= =?UTF-8?q?=E7=9B=98=E7=9B=91=E6=8E=A7=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit refactor(容器): 从Docker迁移到Podman并更新Nomad配置 fix(配置): 修复代理和别名配置问题 docs(文档): 更新配置文件和脚本注释 chore(清理): 移除不再使用的Consul和Docker相关文件 --- configuration/deploy-monitoring.sh | 46 ++ configuration/deploy-telegraf-remote.sh | 40 ++ configuration/docker-daemon.json | 14 - .../inventories/production/group_vars/all.yml | 20 + .../inventories/production/nomad-cluster.ini | 18 +- .../production/nomad-cluster.ini.backup | 22 + .../nomad-cluster.ini.backup-20250924-025928 | 23 + .../playbooks/bootstrap/cron-setup.yml | 183 ------- configuration/playbooks/bootstrap/main.yml | 175 ------- .../playbooks/bootstrap/system-cleanup.yml | 83 --- .../playbooks/bootstrap/system-update.yml | 43 -- configuration/playbooks/clear-aliases.yml | 81 +++ configuration/playbooks/clear-proxy.yml | 76 +++ .../configure-nomad-podman-cluster.yml | 57 ++ .../playbooks/configure-nomad-tailscale.yml | 217 ++++++++ .../playbooks/debug-nomad-podman.yml | 60 +++ .../playbooks/disk-analysis-ncdu.yml | 168 ++++++ configuration/playbooks/disk-cleanup.yml | 96 ++++ configuration/playbooks/final-podman-fix.yml | 105 ++++ configuration/playbooks/fix-hcp-podman.yml | 83 +++ .../playbooks/fix-hcs-dpkg-issue.yml | 56 ++ configuration/playbooks/fix-nomad-local.yml | 99 ++++ .../playbooks/fix-nomad-podman-config.yml | 72 +++ configuration/playbooks/fix-nomad-systemd.yml | 88 ++++ .../playbooks/fix-podman-installation.yml | 79 +++ .../install-nomad-direct-download.yml | 133 +++++ .../playbooks/install-nomad-podman-driver.yml | 131 +++++ .../playbooks/install-podman-compose.yml | 61 +++ .../playbooks/maintenance/ops-toolkit.yml | 131 ----- .../playbooks/migrate-to-podman-simple.yml | 167 ++++++ .../monitoring/network-connectivity.yml | 143 ----- .../monitoring/service-health-check.yml | 135 ----- .../remove-docker-install-podman.yml | 120 +++++ configuration/playbooks/restart-tailscale.yml | 39 ++ .../security/certificate-management.yml | 152 ------ .../playbooks/security/security-hardening.yml | 119 ----- .../playbooks/setup-disk-monitoring.yml | 187 +++++++ .../playbooks/setup-new-nomad-nodes.yml | 76 +++ .../templates/disk-monitoring.conf.j2 | 68 +++ .../templates/system-monitoring.conf.j2 | 68 +++ configuration/templates/telegraf-env.j2 | 7 + configuration/templates/telegraf.conf.j2 | 53 ++ configuration/templates/telegraf.service.j2 | 29 + docs/disk-management.md | 169 ++++++ mgmt.sh | 162 ------ scripts/deployment/configure-nomad-cluster.sh | 137 ----- scripts/deployment/deploy-consul-cluster.sh | 104 ---- scripts/deployment/deploy-consul-simple.sh | 132 ----- scripts/deployment/deploy-nomad-cluster.sh | 146 ------ scripts/deployment/deploy-nomad-local.sh | 136 ----- scripts/deployment/install-nomad-cluster.sh | 149 ------ scripts/setup/setup-gitea-integration.sh | 467 ----------------- scripts/setup/setup-nomad-laptop.sh | 230 ++++++++ scripts/setup/setup-nomad-windows.ps1 | 212 ++++++++ scripts/setup/setup-opentofu.sh | 174 ------ scripts/utilities/NUCLEAR-NOMAD-RESET.yml | 375 ------------- .../utilities/complete-nomad-cluster-fix.yml | 189 ------- scripts/utilities/complete-nomad-reset.yml | 151 ------ 
scripts/utilities/consul-cluster-manager.sh | 233 --------- scripts/utilities/consul-secrets-manager.sh | 228 -------- scripts/utilities/correct-nomad-cluster.yml | 115 ---- scripts/utilities/deploy-nomad-configs.yml | 113 ---- scripts/utilities/disk-monitor.sh | 33 ++ scripts/utilities/final-nomad-cluster-fix.yml | 190 ------- scripts/utilities/final-nomad-fix.yml | 111 ---- scripts/utilities/fix-ash3c-ip.sh | 137 ----- scripts/utilities/fix-consul-cluster.sh | 151 ------ scripts/utilities/fix-nomad-cluster.yml | 92 ---- scripts/utilities/gitea-repo-manager.sh | 242 --------- scripts/utilities/nomad-cluster-manager.sh | 227 ++++++++ scripts/utilities/proxy-toggle.sh | 304 ----------- scripts/utilities/quick-start.sh | 114 ---- scripts/utilities/simple-nomad-fix.sh | 104 ---- .../utilities/terraform-consul-provider.sh | 311 ----------- .../utilities/tofu-secrets-uploader-simple.sh | 128 ----- scripts/utilities/tofu-secrets-uploader.sh | 495 ------------------ scripts/utilities/verify-podman-migration.sh | 31 ++ swarm/configs/traefik-consul-setup.yml | 138 ----- swarm/configs/traefik.yml | 60 --- swarm/scripts/swarm-manager.sh | 184 ------- swarm/stacks/consul-ash3c-stack.yml | 41 -- swarm/stacks/consul-cluster-fixed.yml | 76 --- swarm/stacks/consul-cluster-host-network.yml | 68 --- swarm/stacks/consul-cluster-ip-based.yml | 78 --- swarm/stacks/consul-cluster-macvlan.yml | 78 --- swarm/stacks/consul-cluster-stack.yml | 76 --- swarm/stacks/consul-master-stack.yml | 40 -- swarm/stacks/consul-simple-stack.yml | 39 -- swarm/stacks/consul-single-node.yml | 40 -- swarm/stacks/demo-services-stack.yml | 166 ------ swarm/stacks/traefik-swarm-stack.yml | 70 --- .../nomad-cluster/templates/nomad-userdata.sh | 20 +- 92 files changed, 3552 insertions(+), 7737 deletions(-) create mode 100755 configuration/deploy-monitoring.sh create mode 100755 configuration/deploy-telegraf-remote.sh delete mode 100644 configuration/docker-daemon.json create mode 100644 configuration/inventories/production/group_vars/all.yml create mode 100644 configuration/inventories/production/nomad-cluster.ini.backup create mode 100644 configuration/inventories/production/nomad-cluster.ini.backup-20250924-025928 delete mode 100644 configuration/playbooks/bootstrap/cron-setup.yml delete mode 100644 configuration/playbooks/bootstrap/main.yml delete mode 100644 configuration/playbooks/bootstrap/system-cleanup.yml delete mode 100644 configuration/playbooks/bootstrap/system-update.yml create mode 100644 configuration/playbooks/clear-aliases.yml create mode 100644 configuration/playbooks/clear-proxy.yml create mode 100644 configuration/playbooks/configure-nomad-podman-cluster.yml create mode 100644 configuration/playbooks/configure-nomad-tailscale.yml create mode 100644 configuration/playbooks/debug-nomad-podman.yml create mode 100644 configuration/playbooks/disk-analysis-ncdu.yml create mode 100644 configuration/playbooks/disk-cleanup.yml create mode 100644 configuration/playbooks/final-podman-fix.yml create mode 100644 configuration/playbooks/fix-hcp-podman.yml create mode 100644 configuration/playbooks/fix-hcs-dpkg-issue.yml create mode 100644 configuration/playbooks/fix-nomad-local.yml create mode 100644 configuration/playbooks/fix-nomad-podman-config.yml create mode 100644 configuration/playbooks/fix-nomad-systemd.yml create mode 100644 configuration/playbooks/fix-podman-installation.yml create mode 100644 configuration/playbooks/install-nomad-direct-download.yml create mode 100644 
configuration/playbooks/install-nomad-podman-driver.yml create mode 100644 configuration/playbooks/install-podman-compose.yml delete mode 100644 configuration/playbooks/maintenance/ops-toolkit.yml create mode 100644 configuration/playbooks/migrate-to-podman-simple.yml delete mode 100644 configuration/playbooks/monitoring/network-connectivity.yml delete mode 100644 configuration/playbooks/monitoring/service-health-check.yml create mode 100644 configuration/playbooks/remove-docker-install-podman.yml create mode 100644 configuration/playbooks/restart-tailscale.yml delete mode 100644 configuration/playbooks/security/certificate-management.yml delete mode 100644 configuration/playbooks/security/security-hardening.yml create mode 100644 configuration/playbooks/setup-disk-monitoring.yml create mode 100644 configuration/playbooks/setup-new-nomad-nodes.yml create mode 100644 configuration/templates/disk-monitoring.conf.j2 create mode 100644 configuration/templates/system-monitoring.conf.j2 create mode 100644 configuration/templates/telegraf-env.j2 create mode 100644 configuration/templates/telegraf.conf.j2 create mode 100644 configuration/templates/telegraf.service.j2 create mode 100644 docs/disk-management.md delete mode 100755 mgmt.sh delete mode 100755 scripts/deployment/configure-nomad-cluster.sh delete mode 100755 scripts/deployment/deploy-consul-cluster.sh delete mode 100755 scripts/deployment/deploy-consul-simple.sh delete mode 100755 scripts/deployment/deploy-nomad-cluster.sh delete mode 100755 scripts/deployment/deploy-nomad-local.sh delete mode 100755 scripts/deployment/install-nomad-cluster.sh delete mode 100755 scripts/setup/setup-gitea-integration.sh create mode 100755 scripts/setup/setup-nomad-laptop.sh create mode 100644 scripts/setup/setup-nomad-windows.ps1 delete mode 100755 scripts/setup/setup-opentofu.sh delete mode 100644 scripts/utilities/NUCLEAR-NOMAD-RESET.yml delete mode 100644 scripts/utilities/complete-nomad-cluster-fix.yml delete mode 100644 scripts/utilities/complete-nomad-reset.yml delete mode 100755 scripts/utilities/consul-cluster-manager.sh delete mode 100755 scripts/utilities/consul-secrets-manager.sh delete mode 100644 scripts/utilities/correct-nomad-cluster.yml delete mode 100644 scripts/utilities/deploy-nomad-configs.yml create mode 100755 scripts/utilities/disk-monitor.sh delete mode 100644 scripts/utilities/final-nomad-cluster-fix.yml delete mode 100644 scripts/utilities/final-nomad-fix.yml delete mode 100755 scripts/utilities/fix-ash3c-ip.sh delete mode 100755 scripts/utilities/fix-consul-cluster.sh delete mode 100644 scripts/utilities/fix-nomad-cluster.yml delete mode 100755 scripts/utilities/gitea-repo-manager.sh create mode 100755 scripts/utilities/nomad-cluster-manager.sh delete mode 100755 scripts/utilities/proxy-toggle.sh delete mode 100755 scripts/utilities/quick-start.sh delete mode 100755 scripts/utilities/simple-nomad-fix.sh delete mode 100755 scripts/utilities/terraform-consul-provider.sh delete mode 100755 scripts/utilities/tofu-secrets-uploader-simple.sh delete mode 100755 scripts/utilities/tofu-secrets-uploader.sh create mode 100755 scripts/utilities/verify-podman-migration.sh delete mode 100644 swarm/configs/traefik-consul-setup.yml delete mode 100644 swarm/configs/traefik.yml delete mode 100755 swarm/scripts/swarm-manager.sh delete mode 100644 swarm/stacks/consul-ash3c-stack.yml delete mode 100644 swarm/stacks/consul-cluster-fixed.yml delete mode 100644 swarm/stacks/consul-cluster-host-network.yml delete mode 100644 
swarm/stacks/consul-cluster-ip-based.yml delete mode 100644 swarm/stacks/consul-cluster-macvlan.yml delete mode 100644 swarm/stacks/consul-cluster-stack.yml delete mode 100644 swarm/stacks/consul-master-stack.yml delete mode 100644 swarm/stacks/consul-simple-stack.yml delete mode 100644 swarm/stacks/consul-single-node.yml delete mode 100644 swarm/stacks/demo-services-stack.yml delete mode 100644 swarm/stacks/traefik-swarm-stack.yml diff --git a/configuration/deploy-monitoring.sh b/configuration/deploy-monitoring.sh new file mode 100755 index 0000000..dbc68df --- /dev/null +++ b/configuration/deploy-monitoring.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Nomad 集群硬盘监控部署脚本 +# 使用现有的 InfluxDB + Grafana 监控栈 + +echo "🚀 开始部署 Nomad 集群硬盘监控..." + +# 检查配置文件 +if [[ ! -f "inventories/production/group_vars/all.yml" ]]; then + echo "❌ 配置文件不存在,请先配置 InfluxDB 连接信息" + exit 1 +fi + +# 显示配置信息 +echo "📋 当前监控配置:" +grep -E "influxdb_|disk_usage_|collection_interval" inventories/production/group_vars/all.yml + +echo "" +read -p "🤔 确认配置正确吗?(y/N): " confirm +if [[ $confirm != [yY] ]]; then + echo "❌ 部署取消,请修改配置后重试" + exit 1 +fi + +# 部署到所有节点 +echo "📦 开始安装 Telegraf 到所有节点..." +ansible-playbook -i inventories/production/nomad-cluster.ini playbooks/setup-disk-monitoring.yml + +# 检查部署结果 +if [[ $? -eq 0 ]]; then + echo "✅ 硬盘监控部署完成!" + echo "" + echo "📊 监控信息:" + echo "- 数据将发送到你现有的 InfluxDB" + echo "- 可以在 Grafana 中创建仪表板查看数据" + echo "- 已禁用本地日志文件以节省硬盘空间" + echo "- 监控数据每30秒收集一次" + echo "" + echo "🔧 下一步:" + echo "1. 在 Grafana 中创建 Nomad 集群监控仪表板" + echo "2. 设置硬盘使用率告警规则" + echo "3. 可以运行以下命令检查监控状态:" + echo " ansible all -i inventories/production/nomad-cluster.ini -m shell -a 'systemctl status telegraf'" +else + echo "❌ 部署失败,请检查错误信息" + exit 1 +fi \ No newline at end of file diff --git a/configuration/deploy-telegraf-remote.sh b/configuration/deploy-telegraf-remote.sh new file mode 100755 index 0000000..4b20096 --- /dev/null +++ b/configuration/deploy-telegraf-remote.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# 使用远程 InfluxDB 2.x 配置快速部署 Telegraf 监控 + +echo "🚀 使用 InfluxDB 2.x 远程配置部署 Telegraf 监控..." + +# 设置变量 +INFLUX_TOKEN="VU_dOCVZzqEHb9jSFsDe0bJlEBaVbiG4LqfoczlnmcbfrbmklSt904HJPL4idYGvVi0c2eHkYDi2zCTni7Ay4w==" +TELEGRAF_CONFIG_URL="http://influxdb1.tailnet-68f9.ts.net:8086/api/v2/telegrafs/0f8a73496790c000" + +# 检查网络连接 +echo "🔍 检查 InfluxDB 连接..." +if curl -s --max-time 5 "http://influxdb1.tailnet-68f9.ts.net:8086/health" > /dev/null; then + echo "✅ InfluxDB 连接正常" +else + echo "❌ 无法连接到 InfluxDB,请检查网络" + exit 1 +fi + +# 使用远程配置部署 +echo "📦 开始部署到所有节点..." +ansible-playbook -i inventories/production/nomad-cluster.ini playbooks/setup-disk-monitoring.yml \ + -e "use_remote_config=true" \ + -e "telegraf_config_url=$TELEGRAF_CONFIG_URL" \ + -e "influxdb_token=$INFLUX_TOKEN" + +# 检查部署结果 +if [[ $? -eq 0 ]]; then + echo "✅ Telegraf 监控部署完成!" 
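# A minimal post-run spot check — a sketch only, reusing the inventory path and InfluxDB host above;
# the /etc/telegraf/telegraf.conf path is an assumption (the package default) and may differ when the
# remote config URL is the only configuration source:
curl -s --max-time 5 http://influxdb1.tailnet-68f9.ts.net:8086/health
ansible all -i inventories/production/nomad-cluster.ini -m shell -a 'systemctl is-active telegraf'
ansible all -i inventories/production/nomad-cluster.ini -b -m shell \
  -a 'telegraf --test --config /etc/telegraf/telegraf.conf 2>&1 | head -n 5'
# A healthy node reports "active" for the service and prints one batch of disk/mem metrics from --test.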
+ echo "" + echo "📊 配置信息:" + echo "- 使用远程配置: $TELEGRAF_CONFIG_URL" + echo "- InfluxDB 服务器: influxdb1.tailnet-68f9.ts.net:8086" + echo "- 已禁用本地日志文件" + echo "" + echo "🔧 验证部署:" + echo "ansible all -i inventories/production/nomad-cluster.ini -m shell -a 'systemctl status telegraf --no-pager'" +else + echo "❌ 部署失败,请检查错误信息" + exit 1 +fi \ No newline at end of file diff --git a/configuration/docker-daemon.json b/configuration/docker-daemon.json deleted file mode 100644 index 5564836..0000000 --- a/configuration/docker-daemon.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "proxies": { - "http-proxy": "http://istoreos.tailnet-68f9.ts.net:7891", - "https-proxy": "http://istoreos.tailnet-68f9.ts.net:7891", - "no-proxy": "localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net" - }, - "registry-mirrors": [], - "insecure-registries": [], - "debug": false, - "experimental": false, - "features": { - "buildkit": true - } -} diff --git a/configuration/inventories/production/group_vars/all.yml b/configuration/inventories/production/group_vars/all.yml new file mode 100644 index 0000000..b5c6cbe --- /dev/null +++ b/configuration/inventories/production/group_vars/all.yml @@ -0,0 +1,20 @@ +# Nomad 集群全局配置 +# InfluxDB 2.x + Grafana 监控配置 + +# InfluxDB 2.x 连接配置 +influxdb_url: "http://influxdb1.tailnet-68f9.ts.net:8086" +influxdb_token: "VU_dOCVZzqEHb9jSFsDe0bJlEBaVbiG4LqfoczlnmcbfrbmklSt904HJPL4idYGvVi0c2eHkYDi2zCTni7Ay4w==" +influxdb_org: "nomad" # 组织名称 +influxdb_bucket: "nomad_monitoring" # Bucket 名称 + +# 远程 Telegraf 配置 URL +telegraf_config_url: "http://influxdb1.tailnet-68f9.ts.net:8086/api/v2/telegrafs/0f8a73496790c000" + +# 监控配置 +disk_usage_warning: 80 # 硬盘使用率警告阈值 +disk_usage_critical: 90 # 硬盘使用率严重告警阈值 +collection_interval: 30 # 数据收集间隔(秒) + +# Telegraf 优化配置 +telegraf_log_level: "ERROR" # 只记录错误日志 +telegraf_disable_local_logs: true # 禁用本地日志文件 \ No newline at end of file diff --git a/configuration/inventories/production/nomad-cluster.ini b/configuration/inventories/production/nomad-cluster.ini index 184ac27..a1aca42 100644 --- a/configuration/inventories/production/nomad-cluster.ini +++ b/configuration/inventories/production/nomad-cluster.ini @@ -1,10 +1,20 @@ [nomad_servers] -master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 -semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3 -ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 +semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=6 +ash2e ansible_host=ash2e ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6 +ash1d ansible_host=ash1d ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6 +ch2 ansible_host=ch2 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6 +ch3 ansible_host=ch3 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=6 +# 新增的 Mac 和 Windows 节点(请替换为实际的 Tailscale IP) +mac-laptop ansible_host=100.xxx.xxx.xxx ansible_user=your_mac_user nomad_role=server nomad_bootstrap_expect=6 +win-laptop ansible_host=100.xxx.xxx.xxx ansible_user=your_win_user nomad_role=server nomad_bootstrap_expect=6 [nomad_clients] -# 如果需要客户端节点,可以在这里添加 +master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben 
ansible_become=yes ansible_become_pass=3131 nomad_role=client +ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client +hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client +hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client +hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client +syd ansible_host=100.117.137.105 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client [nomad_cluster:children] nomad_servers diff --git a/configuration/inventories/production/nomad-cluster.ini.backup b/configuration/inventories/production/nomad-cluster.ini.backup new file mode 100644 index 0000000..07d02ad --- /dev/null +++ b/configuration/inventories/production/nomad-cluster.ini.backup @@ -0,0 +1,22 @@ +[nomad_servers] +master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 +semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3 +ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 + +[nomad_clients] +hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client +hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client +hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client + +[nomad_cluster:children] +nomad_servers +nomad_clients + +[nomad_cluster:vars] +ansible_ssh_private_key_file=~/.ssh/id_ed25519 +ansible_user=ben +ansible_become=yes +nomad_version=1.10.5 +nomad_datacenter=dc1 +nomad_region=global +nomad_encrypt_key=NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ= \ No newline at end of file diff --git a/configuration/inventories/production/nomad-cluster.ini.backup-20250924-025928 b/configuration/inventories/production/nomad-cluster.ini.backup-20250924-025928 new file mode 100644 index 0000000..b51ddd6 --- /dev/null +++ b/configuration/inventories/production/nomad-cluster.ini.backup-20250924-025928 @@ -0,0 +1,23 @@ +[nomad_servers] +master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 +semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3 +ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3 + +[nomad_clients] +hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client +hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131 nomad_role=client +hcs ansible_host=hcs ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client +syd ansible_host=100.117.137.105 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=client + +[nomad_cluster:children] +nomad_servers +nomad_clients + +[nomad_cluster:vars] +ansible_ssh_private_key_file=~/.ssh/id_ed25519 +ansible_user=ben +ansible_become=yes +nomad_version=1.10.5 +nomad_datacenter=dc1 +nomad_region=global +nomad_encrypt_key=NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ= \ No newline at end of file diff --git 
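# Once the servers listed in the inventory above are up, quorum can be sanity-checked from any
# server node — a sketch, assuming the nomad CLI is on PATH and NOMAD_ADDR points at the local agent:
nomad server members
nomad operator raft list-peers
nomad node status
# With bootstrap_expect=6, the servers will not perform the initial leader election until six of them have joined.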
a/configuration/playbooks/bootstrap/cron-setup.yml b/configuration/playbooks/bootstrap/cron-setup.yml deleted file mode 100644 index 4512919..0000000 --- a/configuration/playbooks/bootstrap/cron-setup.yml +++ /dev/null @@ -1,183 +0,0 @@ ---- -- name: Setup Automated Maintenance Cron Jobs - hosts: localhost - gather_facts: no - - vars: - # 定时任务配置 - cron_jobs: - # 每日快速检查 - - name: "Daily system health check" - job: "cd /root/mgmt && ./scripts/ops-manager.sh toolkit all --check > /var/log/daily-health-check.log 2>&1" - minute: "0" - hour: "8" - day: "*" - month: "*" - weekday: "*" - - # 每周系统清理 - - name: "Weekly system cleanup" - job: "cd /root/mgmt && ./scripts/ops-manager.sh cleanup all > /var/log/weekly-cleanup.log 2>&1" - minute: "0" - hour: "2" - day: "*" - month: "*" - weekday: "0" # Sunday - - # 每月安全检查 - - name: "Monthly security hardening check" - job: "cd /root/mgmt && ./scripts/ops-manager.sh security all --check > /var/log/monthly-security-check.log 2>&1" - minute: "0" - hour: "3" - day: "1" - month: "*" - weekday: "*" - - # 每周证书检查 - - name: "Weekly certificate check" - job: "cd /root/mgmt && ./scripts/ops-manager.sh cert all > /var/log/weekly-cert-check.log 2>&1" - minute: "30" - hour: "4" - day: "*" - month: "*" - weekday: "1" # Monday - - # 每日 Docker 清理 (仅 LXC 组) - - name: "Daily Docker cleanup for LXC" - job: "cd /root/mgmt && ansible lxc -i ansible/inventory.ini -m shell -a 'docker system prune -f' --become -e 'ansible_ssh_pass=313131' > /var/log/daily-docker-cleanup.log 2>&1" - minute: "0" - hour: "1" - day: "*" - month: "*" - weekday: "*" - - # 每周网络连通性检查 - - name: "Weekly network connectivity check" - job: "cd /root/mgmt && ./scripts/ops-manager.sh network all > /var/log/weekly-network-check.log 2>&1" - minute: "0" - hour: "6" - day: "*" - month: "*" - weekday: "2" # Tuesday - - tasks: - # 创建日志目录 - - name: Create log directory - file: - path: /var/log/ansible-automation - state: directory - mode: '0755' - become: yes - - # 设置脚本执行权限 - - name: Make ops-manager.sh executable - file: - path: /root/mgmt/scripts/ops-manager.sh - mode: '0755' - - # 创建定时任务 - - name: Setup cron jobs for automated maintenance - cron: - name: "{{ item.name }}" - job: "{{ item.job }}" - minute: "{{ item.minute }}" - hour: "{{ item.hour }}" - day: "{{ item.day }}" - month: "{{ item.month }}" - weekday: "{{ item.weekday }}" - user: root - loop: "{{ cron_jobs }}" - become: yes - - # 创建日志轮转配置 - - name: Setup log rotation for automation logs - copy: - content: | - /var/log/*-health-check.log - /var/log/*-cleanup.log - /var/log/*-security-check.log - /var/log/*-cert-check.log - /var/log/*-docker-cleanup.log - /var/log/*-network-check.log { - daily - missingok - rotate 30 - compress - delaycompress - notifempty - copytruncate - } - dest: /etc/logrotate.d/ansible-automation - mode: '0644' - become: yes - - # 创建监控脚本 - - name: Create monitoring dashboard script - copy: - content: | - #!/bin/bash - # Automation Monitoring Dashboard - - echo "🤖 Ansible Automation Status Dashboard" - echo "======================================" - echo "" - - echo "📅 Last Execution Times:" - echo "------------------------" - for log in /var/log/*-check.log /var/log/*-cleanup.log; do - if [ -f "$log" ]; then - echo "$(basename "$log" .log): $(stat -c %y "$log" | cut -d. 
-f1)" - fi - done - echo "" - - echo "📊 Recent Log Summary:" - echo "---------------------" - for log in /var/log/daily-health-check.log /var/log/weekly-cleanup.log; do - if [ -f "$log" ]; then - echo "=== $(basename "$log") ===" - tail -5 "$log" | grep -E "(TASK|PLAY RECAP|ERROR|WARNING)" || echo "No recent activity" - echo "" - fi - done - - echo "⏰ Next Scheduled Jobs:" - echo "----------------------" - crontab -l | grep -E "(health|cleanup|security|cert|docker|network)" | while read line; do - echo "$line" - done - echo "" - - echo "💾 Log File Sizes:" - echo "-----------------" - ls -lh /var/log/*-*.log 2>/dev/null | awk '{print $5, $9}' || echo "No log files found" - dest: /usr/local/bin/automation-status - mode: '0755' - become: yes - - # 显示设置完成信息 - - name: Display setup completion info - debug: - msg: | - 🎉 自动化定时任务设置完成! - - 📋 已配置的定时任务: - • 每日 08:00 - 系统健康检查 - • 每日 01:00 - Docker 清理 (LXC 组) - • 每周日 02:00 - 系统清理 - • 每周一 04:30 - 证书检查 - • 每周二 06:00 - 网络连通性检查 - • 每月1日 03:00 - 安全检查 - - 📊 监控命令: - • 查看状态: automation-status - • 查看定时任务: crontab -l - • 查看日志: tail -f /var/log/daily-health-check.log - - 📁 日志位置: /var/log/ - 🔄 日志轮转: 30天自动清理 - - 💡 手动执行示例: - • ./scripts/ops-manager.sh toolkit all - • ./scripts/ops-manager.sh cleanup lxc - • ./scripts/ops-manager.sh health proxmox \ No newline at end of file diff --git a/configuration/playbooks/bootstrap/main.yml b/configuration/playbooks/bootstrap/main.yml deleted file mode 100644 index 250e45e..0000000 --- a/configuration/playbooks/bootstrap/main.yml +++ /dev/null @@ -1,175 +0,0 @@ ---- -- name: Bootstrap Infrastructure - hosts: all - become: yes - gather_facts: yes - - vars: - # 基础软件包 - base_packages: - - curl - - wget - - git - - vim - - htop - - tree - - unzip - - jq - - python3 - - python3-pip - - apt-transport-https - - ca-certificates - - gnupg - - lsb-release - - # Docker 配置 - docker_users: - - "{{ ansible_user }}" - - # 系统配置 - timezone: "Asia/Shanghai" - - tasks: - - name: Update package cache - apt: - update_cache: yes - cache_valid_time: 3600 - when: ansible_os_family == "Debian" - - - name: Install base packages - package: - name: "{{ base_packages }}" - state: present - - - name: Set timezone - timezone: - name: "{{ timezone }}" - - - name: Create system users - user: - name: "{{ ansible_user }}" - groups: sudo - shell: /bin/bash - create_home: yes - when: ansible_user != "root" - - - name: Configure SSH - lineinfile: - path: /etc/ssh/sshd_config - regexp: "{{ item.regexp }}" - line: "{{ item.line }}" - backup: yes - loop: - - { regexp: '^#?PermitRootLogin', line: 'PermitRootLogin no' } - - { regexp: '^#?PasswordAuthentication', line: 'PasswordAuthentication no' } - - { regexp: '^#?PubkeyAuthentication', line: 'PubkeyAuthentication yes' } - notify: restart ssh - when: ansible_user != "root" - - - name: Install Docker - block: - - name: Add Docker GPG key - apt_key: - url: https://download.docker.com/linux/ubuntu/gpg - state: present - - - name: Add Docker repository - apt_repository: - repo: "deb [arch=amd64] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable" - state: present - - - name: Install Docker - package: - name: - - docker-ce - - docker-ce-cli - - containerd.io - - docker-compose-plugin - state: present - - - name: Add users to docker group - user: - name: "{{ item }}" - groups: docker - append: yes - loop: "{{ docker_users }}" - - - name: Start and enable Docker - systemd: - name: docker - state: started - enabled: yes - - - name: Install Docker Compose (standalone) - get_url: - url: 
"https://github.com/docker/compose/releases/latest/download/docker-compose-linux-x86_64" - dest: /usr/local/bin/docker-compose - mode: '0755' - - - name: Configure firewall - ufw: - rule: "{{ item.rule }}" - port: "{{ item.port }}" - proto: "{{ item.proto | default('tcp') }}" - loop: - - { rule: 'allow', port: '22' } - - { rule: 'allow', port: '80' } - - { rule: 'allow', port: '443' } - notify: enable ufw - - - name: Create application directories - file: - path: "{{ item }}" - state: directory - owner: "{{ ansible_user }}" - group: "{{ ansible_user }}" - mode: '0755' - loop: - - /opt/apps - - /opt/data - - /opt/logs - - /opt/backups - - /opt/scripts - - - name: Install monitoring tools - package: - name: - - htop - - iotop - - nethogs - - ncdu - - tmux - state: present - - - name: Configure system limits - pam_limits: - domain: '*' - limit_type: "{{ item.type }}" - limit_item: "{{ item.item }}" - value: "{{ item.value }}" - loop: - - { type: 'soft', item: 'nofile', value: '65536' } - - { type: 'hard', item: 'nofile', value: '65536' } - - { type: 'soft', item: 'nproc', value: '32768' } - - { type: 'hard', item: 'nproc', value: '32768' } - - - name: Configure sysctl - sysctl: - name: "{{ item.name }}" - value: "{{ item.value }}" - state: present - reload: yes - loop: - - { name: 'vm.max_map_count', value: '262144' } - - { name: 'fs.file-max', value: '2097152' } - - { name: 'net.core.somaxconn', value: '32768' } - - handlers: - - name: restart ssh - systemd: - name: ssh - state: restarted - - - name: enable ufw - ufw: - state: enabled \ No newline at end of file diff --git a/configuration/playbooks/bootstrap/system-cleanup.yml b/configuration/playbooks/bootstrap/system-cleanup.yml deleted file mode 100644 index b7c741c..0000000 --- a/configuration/playbooks/bootstrap/system-cleanup.yml +++ /dev/null @@ -1,83 +0,0 @@ ---- -- name: System Cleanup and Maintenance - hosts: all - become: yes - gather_facts: yes - - tasks: - # 清理包缓存和孤立包 - - name: Clean package cache (Debian/Ubuntu) - apt: - autoclean: yes - autoremove: yes - when: ansible_os_family == "Debian" - - - name: Remove orphaned packages (Debian/Ubuntu) - shell: apt-get autoremove --purge -y - when: ansible_os_family == "Debian" - - # 清理日志文件 - - name: Clean old journal logs (keep 7 days) - shell: journalctl --vacuum-time=7d - - - name: Clean old log files - find: - paths: /var/log - patterns: "*.log.*,*.gz" - age: "7d" - recurse: yes - register: old_logs - - - name: Remove old log files - file: - path: "{{ item.path }}" - state: absent - loop: "{{ old_logs.files }}" - when: old_logs.files is defined - - # 清理临时文件 - - name: Clean /tmp directory (files older than 7 days) - find: - paths: /tmp - age: "7d" - recurse: yes - register: tmp_files - - - name: Remove old temp files - file: - path: "{{ item.path }}" - state: absent - loop: "{{ tmp_files.files }}" - when: tmp_files.files is defined - - # Docker 清理 (如果存在) - - name: Check if Docker is installed - command: which docker - register: docker_check - failed_when: false - changed_when: false - - - name: Clean Docker system - shell: | - docker system prune -f - docker image prune -f - docker volume prune -f - when: docker_check.rc == 0 - - # 磁盘空间检查 - - name: Check disk usage - shell: df -h - register: disk_usage - - - name: Display disk usage - debug: - msg: "{{ disk_usage.stdout_lines }}" - - # 内存使用检查 - - name: Check memory usage - shell: free -h - register: memory_usage - - - name: Display memory usage - debug: - msg: "{{ memory_usage.stdout_lines }}" \ No newline at end of file diff --git 
a/configuration/playbooks/bootstrap/system-update.yml b/configuration/playbooks/bootstrap/system-update.yml deleted file mode 100644 index ebadf3a..0000000 --- a/configuration/playbooks/bootstrap/system-update.yml +++ /dev/null @@ -1,43 +0,0 @@ ---- -- name: System Update Playbook - hosts: all - become: yes - gather_facts: yes - - tasks: - - name: Wait for automatic system updates to complete - shell: while fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do sleep 5; done - when: ansible_os_family == "Debian" - - - name: Update apt cache - apt: - update_cache: yes - cache_valid_time: 3600 - when: ansible_os_family == "Debian" - retries: 3 - delay: 10 - - - name: Upgrade all packages - apt: - upgrade: yes - autoremove: yes - autoclean: yes - when: ansible_os_family == "Debian" - register: upgrade_result - retries: 3 - delay: 10 - - - name: Display upgrade results - debug: - msg: "System upgrade completed. {{ upgrade_result.changed }} packages were updated." - - - name: Check if reboot is required - stat: - path: /var/run/reboot-required - register: reboot_required - when: ansible_os_family == "Debian" - - - name: Notify if reboot is required - debug: - msg: "System reboot is required to complete the update." - when: reboot_required.stat.exists is defined and reboot_required.stat.exists \ No newline at end of file diff --git a/configuration/playbooks/clear-aliases.yml b/configuration/playbooks/clear-aliases.yml new file mode 100644 index 0000000..d299355 --- /dev/null +++ b/configuration/playbooks/clear-aliases.yml @@ -0,0 +1,81 @@ +--- +- name: Clear all aliases on hcp1 and hcp2 + hosts: hcp1,hcp2 + become: yes + + tasks: + - name: Check current aliases + shell: alias || echo "No aliases found" + register: current_aliases + + - name: Display current aliases + debug: + msg: "Current aliases: {{ current_aliases.stdout_lines }}" + + - name: Clear aliases from /root/.bashrc + shell: | + sed -i '/^alias /d' /root/.bashrc + sed -i '/^alias\t/d' /root/.bashrc + ignore_errors: yes + + - name: Clear aliases from /root/.profile + shell: | + sed -i '/^alias /d' /root/.profile + sed -i '/^alias\t/d' /root/.profile + ignore_errors: yes + + - name: Clear aliases from /root/.zshrc + shell: | + sed -i '/^alias /d' /root/.zshrc + sed -i '/^alias\t/d' /root/.zshrc + ignore_errors: yes + + - name: Clear aliases from /etc/bash.bashrc + shell: | + sed -i '/^alias /d' /etc/bash.bashrc + sed -i '/^alias\t/d' /etc/bash.bashrc + ignore_errors: yes + + - name: Clear aliases from /etc/profile + shell: | + sed -i '/^alias /d' /etc/profile + sed -i '/^alias\t/d' /etc/profile + ignore_errors: yes + + - name: Find and clear custom alias files + find: + paths: ["/root", "/etc", "/home"] + patterns: ["*.aliases", ".aliases", "aliases"] + recurse: yes + register: alias_files + + - name: Remove found alias files + file: + path: "{{ item.path }}" + state: absent + loop: "{{ alias_files.files }}" + when: alias_files.files is defined + + - name: Clear shell history to remove alias commands + shell: | + > /root/.bash_history + > /root/.zsh_history + history -c + ignore_errors: yes + + - name: Unalias all current aliases + shell: unalias -a + ignore_errors: yes + + - name: Restart shell services + shell: | + pkill -f bash || true + pkill -f zsh || true + + - name: Test network connectivity after clearing aliases + shell: ping -c 2 8.8.8.8 || echo "Ping failed" + register: ping_test + + - name: Display ping test result + debug: + msg: "Ping test: {{ ping_test.stdout_lines }}" \ No newline at end of file diff --git 
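# A quick ad-hoc check that the alias/proxy cleanup actually took effect on both hosts — a sketch
# reusing the production inventory from this patch, not part of either playbook:
ansible hcp1,hcp2 -i inventories/production/nomad-cluster.ini -b -m shell \
  -a "grep -c '^alias ' /root/.bashrc || true; grep -i proxy /etc/environment || true"
# A count of 0 and no proxy lines from /etc/environment means both cleanups succeeded.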
a/configuration/playbooks/clear-proxy.yml b/configuration/playbooks/clear-proxy.yml new file mode 100644 index 0000000..be77bcb --- /dev/null +++ b/configuration/playbooks/clear-proxy.yml @@ -0,0 +1,76 @@ +--- +- name: Clear proxy settings on hcp1 and hcp2 + hosts: hcp1,hcp2 + become: yes + + tasks: + - name: Check current proxy environment variables + shell: env | grep -i proxy || echo "No proxy vars found" + register: proxy_env_before + + - name: Display current proxy settings + debug: + msg: "Current proxy env: {{ proxy_env_before.stdout_lines }}" + + - name: Clear proxy from /etc/environment + lineinfile: + path: /etc/environment + regexp: "{{ item }}" + state: absent + loop: + - "^http_proxy=" + - "^https_proxy=" + - "^HTTP_PROXY=" + - "^HTTPS_PROXY=" + - "^ftp_proxy=" + - "^FTP_PROXY=" + - "^no_proxy=" + - "^NO_PROXY=" + + - name: Clear proxy from /etc/apt/apt.conf.d/ + file: + path: "{{ item }}" + state: absent + loop: + - /etc/apt/apt.conf.d/95proxies + - /etc/apt/apt.conf.d/proxy.conf + - /etc/apt/apt.conf.d/00proxy + + - name: Clear proxy from user profiles + lineinfile: + path: "{{ item }}" + regexp: ".*proxy.*" + state: absent + loop: + - /root/.bashrc + - /root/.profile + - /home/root/.bashrc + - /home/root/.profile + ignore_errors: yes + + - name: Unset proxy variables in current session + shell: | + unset http_proxy + unset https_proxy + unset HTTP_PROXY + unset HTTPS_PROXY + unset ftp_proxy + unset FTP_PROXY + unset no_proxy + unset NO_PROXY + + - name: Check APT proxy configuration + shell: apt-config dump | grep -i proxy || echo "No APT proxy found" + register: apt_proxy_check + + - name: Display APT proxy status + debug: + msg: "APT proxy config: {{ apt_proxy_check.stdout_lines }}" + + - name: Test direct connection to HashiCorp + shell: curl -I --connect-timeout 10 https://releases.hashicorp.com/ || echo "Connection failed" + register: connection_test + + - name: Display connection test result + debug: + msg: "Connection test: {{ connection_test.stdout_lines }}" \ No newline at end of file diff --git a/configuration/playbooks/configure-nomad-podman-cluster.yml b/configuration/playbooks/configure-nomad-podman-cluster.yml new file mode 100644 index 0000000..01430dd --- /dev/null +++ b/configuration/playbooks/configure-nomad-podman-cluster.yml @@ -0,0 +1,57 @@ +--- +- name: Configure Podman driver for all Nomad client nodes + hosts: nomad_clients,nomad_servers + become: yes + + tasks: + - name: Stop Nomad service + systemd: + name: nomad + state: stopped + + - name: Install Podman if not present + package: + name: podman + state: present + ignore_errors: yes + + - name: Enable Podman socket + systemd: + name: podman.socket + enabled: yes + state: started + ignore_errors: yes + + - name: Update Nomad configuration to use Podman + lineinfile: + path: /etc/nomad.d/nomad.hcl + regexp: '^plugin "docker"' + line: 'plugin "podman" {' + state: present + + - name: Add Podman plugin configuration + blockinfile: + path: /etc/nomad.d/nomad.hcl + marker: "# {mark} PODMAN PLUGIN CONFIG" + block: | + plugin "podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } + } + insertafter: 'client {' + + - name: Start Nomad service + systemd: + name: nomad + state: started + + - name: Wait for Nomad to be ready + wait_for: + port: 4646 + host: localhost + delay: 5 + timeout: 30 \ No newline at end of file diff --git a/configuration/playbooks/configure-nomad-tailscale.yml b/configuration/playbooks/configure-nomad-tailscale.yml new file mode 
100644 index 0000000..45f3a49 --- /dev/null +++ b/configuration/playbooks/configure-nomad-tailscale.yml @@ -0,0 +1,217 @@ +--- +- name: 配置 Nomad 集群使用 Tailscale 网络通讯 + hosts: nomad_cluster + become: yes + gather_facts: no + vars: + nomad_config_dir: "/etc/nomad.d" + nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl" + + tasks: + - name: 获取当前节点的 Tailscale IP + shell: tailscale ip | head -1 + register: current_tailscale_ip + failed_when: current_tailscale_ip.rc != 0 + + - name: 确保 Nomad 配置目录存在 + file: + path: "{{ nomad_config_dir }}" + state: directory + owner: root + group: root + mode: '0755' + + - name: 生成 Nomad 服务器配置(使用 Tailscale) + copy: + dest: "{{ nomad_config_file }}" + owner: root + group: root + mode: '0644' + content: | + datacenter = "{{ nomad_datacenter | default('dc1') }}" + data_dir = "/opt/nomad/data" + log_level = "INFO" + + bind_addr = "{{ current_tailscale_ip.stdout }}" + + addresses { + http = "0.0.0.0" + rpc = "{{ current_tailscale_ip.stdout }}" + serf = "{{ current_tailscale_ip.stdout }}" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + server { + enabled = true + bootstrap_expect = {{ nomad_bootstrap_expect | default(4) }} + + retry_join = [ + "100.116.158.95", # semaphore + "100.103.147.94", # ash2e + "100.81.26.3", # ash1d + "100.90.159.68" # ch2 + ] + + encrypt = "{{ nomad_encrypt_key }}" + } + + client { + enabled = false + } + + plugin "podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } + } + + consul { + address = "{{ current_tailscale_ip.stdout }}:8500" + } + when: nomad_role == "server" + + - name: 生成 Nomad 客户端配置(使用 Tailscale) + copy: + dest: "{{ nomad_config_file }}" + owner: root + group: root + mode: '0644' + content: | + datacenter = "{{ nomad_datacenter | default('dc1') }}" + data_dir = "/opt/nomad/data" + log_level = "INFO" + + bind_addr = "{{ current_tailscale_ip.stdout }}" + + addresses { + http = "0.0.0.0" + rpc = "{{ current_tailscale_ip.stdout }}" + serf = "{{ current_tailscale_ip.stdout }}" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + server { + enabled = false + } + + client { + enabled = true + + servers = [ + "100.116.158.95:4647", # semaphore + "100.103.147.94:4647", # ash2e + "100.81.26.3:4647", # ash1d + "100.90.159.68:4647" # ch2 + ] + } + + plugin "podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } + } + + consul { + address = "{{ current_tailscale_ip.stdout }}:8500" + } + when: nomad_role == "client" + + - name: 检查 Nomad 二进制文件位置 + shell: which nomad || find /usr -name nomad 2>/dev/null | head -1 + register: nomad_binary_path + failed_when: nomad_binary_path.stdout == "" + + - name: 创建/更新 Nomad systemd 服务文件 + copy: + dest: "/etc/systemd/system/nomad.service" + owner: root + group: root + mode: '0644' + content: | + [Unit] + Description=Nomad + Documentation=https://www.nomadproject.io/ + Requires=network-online.target + After=network-online.target + + [Service] + Type=notify + User=root + Group=root + ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl + ExecReload=/bin/kill -HUP $MAINPID + KillMode=process + Restart=on-failure + LimitNOFILE=65536 + + [Install] + WantedBy=multi-user.target + notify: restart nomad + + - name: 确保 Nomad 数据目录存在 + file: + path: "/opt/nomad/data" + state: directory + owner: root + group: root + mode: '0755' + + - name: 重新加载 systemd daemon + systemd: + daemon_reload: yes + + - name: 启用并启动 Nomad 服务 + systemd: + name: nomad + enabled: yes + 
state: started + + - name: 等待 Nomad 服务启动 + wait_for: + port: 4646 + host: "{{ current_tailscale_ip.stdout }}" + delay: 5 + timeout: 30 + ignore_errors: yes + + - name: 检查 Nomad 服务状态 + shell: systemctl status nomad --no-pager -l + register: nomad_status + ignore_errors: yes + + - name: 显示配置结果 + debug: + msg: | + ✅ 节点 {{ inventory_hostname }} 配置完成 + 🌐 Tailscale IP: {{ current_tailscale_ip.stdout }} + 🎯 角色: {{ nomad_role }} + 🔧 Nomad 二进制: {{ nomad_binary_path.stdout }} + 📊 服务状态: {{ 'active' if nomad_status.rc == 0 else 'failed' }} + {% if nomad_status.rc != 0 %} + ❌ 错误信息: + {{ nomad_status.stdout }} + {{ nomad_status.stderr }} + {% endif %} + + handlers: + - name: restart nomad + systemd: + name: nomad + state: restarted + daemon_reload: yes \ No newline at end of file diff --git a/configuration/playbooks/debug-nomad-podman.yml b/configuration/playbooks/debug-nomad-podman.yml new file mode 100644 index 0000000..368f9fb --- /dev/null +++ b/configuration/playbooks/debug-nomad-podman.yml @@ -0,0 +1,60 @@ +--- +- name: Debug Nomad Podman Driver Issues + hosts: all + become: yes + vars: + nomad_user: nomad + + tasks: + - name: Check Nomad configuration + shell: cat /etc/nomad.d/nomad.hcl + register: nomad_config + + - name: Display Nomad configuration + debug: + var: nomad_config.stdout_lines + + - name: Check plugin directory contents + shell: ls -la /opt/nomad/data/plugins/ + register: plugin_dir + + - name: Display plugin directory + debug: + var: plugin_dir.stdout_lines + + - name: Check Nomad logs for plugin loading + shell: journalctl -u nomad -n 50 --no-pager | grep -E "(plugin|driver|podman)" + register: nomad_logs + failed_when: false + + - name: Display relevant Nomad logs + debug: + var: nomad_logs.stdout_lines + + - name: Check if plugin is executable + stat: + path: /opt/nomad/data/plugins/nomad-driver-podman + register: plugin_stat + + - name: Display plugin file info + debug: + var: plugin_stat + + - name: Test plugin directly + shell: /opt/nomad/data/plugins/nomad-driver-podman --version + register: plugin_version + failed_when: false + become_user: "{{ nomad_user }}" + + - name: Display plugin version + debug: + msg: "Plugin version test: {{ 'SUCCESS' if plugin_version.rc == 0 else 'FAILED' }} - {{ plugin_version.stdout if plugin_version.rc == 0 else plugin_version.stderr }}" + + - name: Check Podman socket accessibility + shell: sudo -u {{ nomad_user }} curl --unix-socket /run/user/1001/podman/podman.sock http://localhost/v1.0.0/libpod/info 2>/dev/null | head -3 + register: podman_socket_test + failed_when: false + + - name: Display Podman socket test + debug: + msg: "Podman socket test: {{ 'SUCCESS' if podman_socket_test.rc == 0 else 'FAILED' }}" \ No newline at end of file diff --git a/configuration/playbooks/disk-analysis-ncdu.yml b/configuration/playbooks/disk-analysis-ncdu.yml new file mode 100644 index 0000000..437dfc8 --- /dev/null +++ b/configuration/playbooks/disk-analysis-ncdu.yml @@ -0,0 +1,168 @@ +--- +- name: 磁盘空间分析 - 使用 ncdu 工具 + hosts: all + become: yes + vars: + ncdu_scan_paths: + - "/" + - "/var" + - "/opt" + - "/home" + output_dir: "/tmp/disk-analysis" + + tasks: + - name: 安装 ncdu 工具 + package: + name: ncdu + state: present + register: ncdu_install + + - name: 创建输出目录 + file: + path: "{{ output_dir }}" + state: directory + mode: '0755' + + - name: 检查磁盘空间使用情况 + shell: df -h + register: disk_usage + + - name: 显示当前磁盘使用情况 + debug: + msg: | + === {{ inventory_hostname }} 磁盘使用情况 === + {{ disk_usage.stdout }} + + - name: 使用 ncdu 扫描根目录并生成报告 + shell: | + ncdu -x -o {{ 
output_dir }}/ncdu-root-{{ inventory_hostname }}.json / + async: 300 + poll: 0 + register: ncdu_root_scan + + - name: 使用 ncdu 扫描 /var 目录 + shell: | + ncdu -x -o {{ output_dir }}/ncdu-var-{{ inventory_hostname }}.json /var + async: 180 + poll: 0 + register: ncdu_var_scan + when: ansible_mounts | selectattr('mount', 'equalto', '/var') | list | length > 0 or '/var' in ansible_mounts | map(attribute='mount') | list + + - name: 使用 ncdu 扫描 /opt 目录 + shell: | + ncdu -x -o {{ output_dir }}/ncdu-opt-{{ inventory_hostname }}.json /opt + async: 120 + poll: 0 + register: ncdu_opt_scan + when: ansible_mounts | selectattr('mount', 'equalto', '/opt') | list | length > 0 or '/opt' in ansible_mounts | map(attribute='mount') | list + + - name: 等待根目录扫描完成 + async_status: + jid: "{{ ncdu_root_scan.ansible_job_id }}" + register: ncdu_root_result + until: ncdu_root_result.finished + retries: 60 + delay: 5 + + - name: 等待 /var 目录扫描完成 + async_status: + jid: "{{ ncdu_var_scan.ansible_job_id }}" + register: ncdu_var_result + until: ncdu_var_result.finished + retries: 36 + delay: 5 + when: ncdu_var_scan is defined and ncdu_var_scan.ansible_job_id is defined + + - name: 等待 /opt 目录扫描完成 + async_status: + jid: "{{ ncdu_opt_scan.ansible_job_id }}" + register: ncdu_opt_result + until: ncdu_opt_result.finished + retries: 24 + delay: 5 + when: ncdu_opt_scan is defined and ncdu_opt_scan.ansible_job_id is defined + + - name: 生成磁盘使用分析报告 + shell: | + echo "=== {{ inventory_hostname }} 磁盘分析报告 ===" > {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + echo "生成时间: $(date)" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + echo "=== 磁盘使用情况 ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + df -h >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + echo "=== 最大的目录 (前10个) ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + du -h --max-depth=2 / 2>/dev/null | sort -hr | head -10 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + echo "=== /var 目录最大文件 ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + find /var -type f -size +100M -exec ls -lh {} \; 2>/dev/null | head -10 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + echo "=== /tmp 目录使用情况 ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + du -sh /tmp/* 2>/dev/null | sort -hr | head -5 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + echo "=== 日志文件大小 ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + find /var/log -name "*.log" -type f -size +50M -exec ls -lh {} \; 2>/dev/null >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + + - name: 显示分析报告 + shell: cat {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + register: disk_report + + - name: 输出磁盘分析结果 + debug: + msg: "{{ disk_report.stdout }}" + + - name: 检查是否有磁盘使用率超过 80% + shell: df -h | awk 'NR>1 {gsub(/%/, "", $5); if($5 > 80) print $0}' + register: high_usage_disks + + - name: 警告高磁盘使用率 + debug: + msg: | + ⚠️ 警告: {{ inventory_hostname }} 发现高磁盘使用率! 
+ {{ high_usage_disks.stdout }} + when: high_usage_disks.stdout != "" + + - name: 创建清理建议 + shell: | + echo "=== {{ inventory_hostname }} 清理建议 ===" > {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + echo "1. 检查日志文件:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + find /var/log -name "*.log" -type f -size +100M -exec echo " 大日志文件: {}" \; 2>/dev/null >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + echo "2. 检查临时文件:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + find /tmp -type f -size +50M -exec echo " 大临时文件: {}" \; 2>/dev/null >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + echo "3. 检查包缓存:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + if [ -d /var/cache/apt ]; then + echo " APT 缓存大小: $(du -sh /var/cache/apt 2>/dev/null | cut -f1)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + fi + if [ -d /var/cache/yum ]; then + echo " YUM 缓存大小: $(du -sh /var/cache/yum 2>/dev/null | cut -f1)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + fi + echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + echo "4. 检查容器相关:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + if command -v podman >/dev/null 2>&1; then + echo " Podman 镜像: $(podman images --format 'table {{.Repository}} {{.Tag}} {{.Size}}' 2>/dev/null | wc -l) 个" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + echo " Podman 容器: $(podman ps -a --format 'table {{.Names}} {{.Status}}' 2>/dev/null | wc -l) 个" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + fi + + - name: 显示清理建议 + shell: cat {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt + register: cleanup_suggestions + + - name: 输出清理建议 + debug: + msg: "{{ cleanup_suggestions.stdout }}" + + - name: 保存 ncdu 文件位置信息 + debug: + msg: | + 📁 ncdu 扫描文件已保存到: + - 根目录: {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json + - /var 目录: {{ output_dir }}/ncdu-var-{{ inventory_hostname }}.json (如果存在) + - /opt 目录: {{ output_dir }}/ncdu-opt-{{ inventory_hostname }}.json (如果存在) + + 💡 使用方法: + ncdu -f {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json + + 📊 完整报告: {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt + 🧹 清理建议: {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt \ No newline at end of file diff --git a/configuration/playbooks/disk-cleanup.yml b/configuration/playbooks/disk-cleanup.yml new file mode 100644 index 0000000..1d0d881 --- /dev/null +++ b/configuration/playbooks/disk-cleanup.yml @@ -0,0 +1,96 @@ +--- +- name: 磁盘清理工具 + hosts: all + become: yes + vars: + cleanup_logs: true + cleanup_cache: true + cleanup_temp: true + cleanup_containers: false # 谨慎操作 + + tasks: + - name: 检查磁盘使用情况 (清理前) + shell: df -h + register: disk_before + + - name: 显示清理前磁盘使用情况 + debug: + msg: | + === {{ inventory_hostname }} 清理前磁盘使用情况 === + {{ disk_before.stdout }} + + - name: 清理系统日志 (保留最近7天) + shell: | + journalctl --vacuum-time=7d + find /var/log -name "*.log" -type f -mtime +7 -exec truncate -s 0 {} \; + find /var/log -name "*.log.*" -type f -mtime +7 -delete + when: cleanup_logs | bool + register: log_cleanup + + - name: 清理包管理器缓存 + block: + - name: 清理 APT 缓存 (Debian/Ubuntu) + 
shell: | + apt-get clean + apt-get autoclean + apt-get autoremove -y + when: ansible_os_family == "Debian" + + - name: 清理 YUM/DNF 缓存 (RedHat/CentOS) + shell: | + if command -v dnf >/dev/null 2>&1; then + dnf clean all + elif command -v yum >/dev/null 2>&1; then + yum clean all + fi + when: ansible_os_family == "RedHat" + when: cleanup_cache | bool + + - name: 清理临时文件 + shell: | + find /tmp -type f -atime +7 -delete 2>/dev/null || true + find /var/tmp -type f -atime +7 -delete 2>/dev/null || true + rm -rf /tmp/.* 2>/dev/null || true + when: cleanup_temp | bool + + - name: 清理 Podman 资源 (谨慎操作) + block: + - name: 停止所有容器 + shell: podman stop --all + ignore_errors: yes + + - name: 删除未使用的容器 + shell: podman container prune -f + ignore_errors: yes + + - name: 删除未使用的镜像 + shell: podman image prune -f + ignore_errors: yes + + - name: 删除未使用的卷 + shell: podman volume prune -f + ignore_errors: yes + when: cleanup_containers | bool + + - name: 清理核心转储文件 + shell: | + find /var/crash -name "core.*" -type f -delete 2>/dev/null || true + find / -name "core" -type f -size +10M -delete 2>/dev/null || true + ignore_errors: yes + + - name: 检查磁盘使用情况 (清理后) + shell: df -h + register: disk_after + + - name: 显示清理结果 + debug: + msg: | + === {{ inventory_hostname }} 清理完成 === + + 清理前: + {{ disk_before.stdout }} + + 清理后: + {{ disk_after.stdout }} + + 🧹 清理操作完成! \ No newline at end of file diff --git a/configuration/playbooks/final-podman-fix.yml b/configuration/playbooks/final-podman-fix.yml new file mode 100644 index 0000000..c0832ef --- /dev/null +++ b/configuration/playbooks/final-podman-fix.yml @@ -0,0 +1,105 @@ +--- +- name: Final Podman Permission Fix for Nomad + hosts: all + become: yes + tasks: + - name: Stop Nomad service + systemd: + name: nomad + state: stopped + + - name: Install podman for nomad user (system-wide) + package: + name: podman + state: present + + - name: Enable podman socket for nomad user + systemd: + name: podman.socket + enabled: yes + state: started + scope: system + daemon_reload: yes + + - name: Create nomad user podman configuration directory + file: + path: /home/nomad/.config/containers + state: directory + owner: nomad + group: nomad + mode: '0755' + recurse: yes + + - name: Configure podman for nomad user to use system socket + copy: + content: | + [containers] + + [engine] + remote = true + + [service_destinations] + [service_destinations.system] + uri = "unix:///run/podman/podman.sock" + dest: /home/nomad/.config/containers/containers.conf + owner: nomad + group: nomad + mode: '0644' + + - name: Update Nomad configuration to use system podman socket + replace: + path: /etc/nomad.d/nomad.hcl + regexp: 'socket_path = "unix:///run/user/1001/podman/podman.sock"' + replace: 'socket_path = "unix:///run/podman/podman.sock"' + + - name: Add nomad user to necessary groups + user: + name: nomad + groups: + - podman + append: yes + + - name: Create podman group if it doesn't exist + group: + name: podman + state: present + + - name: Set proper permissions on system podman socket directory + file: + path: /run/podman + state: directory + mode: '0755' + group: podman + + - name: Start Nomad service + systemd: + name: nomad + state: started + enabled: yes + + - name: Wait for Nomad to be ready + wait_for: + port: 4646 + timeout: 60 + + - name: Wait for plugins to load + pause: + seconds: 20 + + - name: Final verification - Check driver status + shell: sudo -u nomad /usr/local/bin/nomad node status -self | grep -A 10 "Driver Status" + register: final_driver_status + failed_when: false + + - name: 
Display final driver status + debug: + var: final_driver_status.stdout_lines + + - name: Test podman access for nomad user + shell: sudo -u nomad podman version + register: podman_test + failed_when: false + + - name: Display podman test result + debug: + var: podman_test.stdout_lines \ No newline at end of file diff --git a/configuration/playbooks/fix-hcp-podman.yml b/configuration/playbooks/fix-hcp-podman.yml new file mode 100644 index 0000000..d76a533 --- /dev/null +++ b/configuration/playbooks/fix-hcp-podman.yml @@ -0,0 +1,83 @@ +--- +- name: Fix HCP1 and HCP2 Podman Configuration + hosts: hcp1,hcp2 + become: yes + tasks: + - name: Stop Nomad service + systemd: + name: nomad + state: stopped + + - name: Ensure nomad user exists + user: + name: nomad + system: yes + shell: /bin/false + home: /home/nomad + create_home: yes + + - name: Ensure Podman socket is running + systemd: + name: podman.socket + state: started + enabled: yes + + - name: Set proper permissions on Podman socket + file: + path: /run/podman/podman.sock + mode: '0666' + ignore_errors: yes + + - name: Create nomad data directory + file: + path: /opt/nomad/data + state: directory + owner: nomad + group: nomad + mode: '0755' + + - name: Create nomad log directory + file: + path: /var/log/nomad + state: directory + owner: nomad + group: nomad + mode: '0755' + + - name: Test Podman access for nomad user + shell: sudo -u nomad podman version + register: podman_test + failed_when: false + + - name: Display Podman test result + debug: + var: podman_test.stdout_lines + + - name: Validate Nomad configuration + shell: /usr/local/bin/nomad config validate /etc/nomad.d/nomad.hcl + register: config_validation + failed_when: false + + - name: Display configuration validation + debug: + var: config_validation + + - name: Start Nomad service + systemd: + name: nomad + state: started + enabled: yes + + - name: Wait for Nomad to be ready + wait_for: + port: 4646 + timeout: 60 + + - name: Check Nomad node status + shell: /usr/local/bin/nomad node status -self + register: node_status + failed_when: false + + - name: Display node status + debug: + var: node_status.stdout_lines \ No newline at end of file diff --git a/configuration/playbooks/fix-hcs-dpkg-issue.yml b/configuration/playbooks/fix-hcs-dpkg-issue.yml new file mode 100644 index 0000000..7db31b8 --- /dev/null +++ b/configuration/playbooks/fix-hcs-dpkg-issue.yml @@ -0,0 +1,56 @@ +--- +- name: Fix dpkg and initramfs issues on hcs + hosts: hcs + become: yes + tasks: + - name: Check current dpkg status + shell: dpkg --audit + register: dpkg_status + ignore_errors: yes + + - name: Display dpkg status + debug: + var: dpkg_status.stdout_lines + + - name: Fix broken btrfs hook + shell: | + # Remove problematic btrfs hook temporarily + mv /usr/share/initramfs-tools/hooks/btrfs /usr/share/initramfs-tools/hooks/btrfs.bak || true + + # Try to reconfigure the failed package + dpkg --configure -a + + # If that works, restore the hook + if [ $? 
-eq 0 ]; then + mv /usr/share/initramfs-tools/hooks/btrfs.bak /usr/share/initramfs-tools/hooks/btrfs || true + fi + register: fix_result + ignore_errors: yes + + - name: Display fix result + debug: + var: fix_result + + - name: Alternative fix - reinstall initramfs-tools + apt: + name: initramfs-tools + state: latest + force: yes + when: fix_result.rc != 0 + ignore_errors: yes + + - name: Clean up and update + shell: | + apt autoremove -y + apt update + apt upgrade -y + ignore_errors: yes + + - name: Check final dpkg status + shell: dpkg --audit + register: final_status + ignore_errors: yes + + - name: Display final status + debug: + var: final_status.stdout_lines \ No newline at end of file diff --git a/configuration/playbooks/fix-nomad-local.yml b/configuration/playbooks/fix-nomad-local.yml new file mode 100644 index 0000000..b75fdff --- /dev/null +++ b/configuration/playbooks/fix-nomad-local.yml @@ -0,0 +1,99 @@ +--- +- name: Update Nomad configuration for Podman and fix issues + hosts: localhost + become: yes + connection: local + + tasks: + - name: Stop Nomad service + systemd: + name: nomad + state: stopped + + - name: Update Nomad configuration to use Podman and disable Consul + copy: + content: | + datacenter = "dc1" + region = "global" + data_dir = "/opt/nomad/data" + + bind_addr = "100.116.158.95" + + server { + enabled = true + bootstrap_expect = 1 + encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + } + + client { + enabled = true + } + + ui { + enabled = true + } + + addresses { + http = "0.0.0.0" + rpc = "100.116.158.95" + serf = "100.116.158.95" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + plugin "podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } + } + + # Disable Consul integration for now + consul { + address = "" + } + + log_level = "INFO" + log_file = "/var/log/nomad/nomad.log" + dest: /etc/nomad.d/nomad.hcl + owner: nomad + group: nomad + mode: '0640' + backup: yes + + - name: Enable Podman socket for systemd + systemd: + name: podman.socket + enabled: yes + state: started + ignore_errors: yes + + - name: Start Nomad service + systemd: + name: nomad + state: started + + - name: Wait for Nomad to be ready + wait_for: + port: 4646 + host: localhost + delay: 5 + timeout: 30 + + - name: Check Nomad status + uri: + url: http://localhost:4646/v1/status/leader + method: GET + register: nomad_status + retries: 3 + delay: 5 + + - name: Display Nomad status + debug: + msg: "Nomad leader: {{ nomad_status.json if nomad_status.json is defined else 'No leader elected' }}" \ No newline at end of file diff --git a/configuration/playbooks/fix-nomad-podman-config.yml b/configuration/playbooks/fix-nomad-podman-config.yml new file mode 100644 index 0000000..d8e498c --- /dev/null +++ b/configuration/playbooks/fix-nomad-podman-config.yml @@ -0,0 +1,72 @@ +--- +- name: Fix Nomad Podman Driver Configuration + hosts: all + become: yes + vars: + nomad_user: nomad + + tasks: + - name: Stop Nomad service + systemd: + name: nomad + state: stopped + + - name: Update Nomad configuration to properly reference Podman plugin + replace: + path: /etc/nomad.d/nomad.hcl + regexp: 'plugin "podman" \{\n config \{\n socket_path = "unix:///run/user/1001/podman/podman.sock"\n volumes \{\n enabled = true\n \}\n \}\n\}' + replace: | + plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/user/1001/podman/podman.sock" + volumes { + enabled = true + } + } + } + + - name: Start Nomad service + systemd: + name: nomad 
+ state: started + + - name: Wait for Nomad to be ready + wait_for: + port: 4646 + host: localhost + delay: 10 + timeout: 60 + + - name: Wait for plugins to load + pause: + seconds: 15 + + - name: Check if Podman driver is now loaded + shell: | + sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self | grep -A 20 "Driver Status" + register: driver_status + + - name: Display driver status + debug: + var: driver_status.stdout_lines + + - name: Check Nomad logs for successful plugin loading + shell: journalctl -u nomad -n 20 --no-pager | grep -E "(podman|plugin)" + register: recent_logs + failed_when: false + + - name: Display recent plugin logs + debug: + var: recent_logs.stdout_lines + + - name: Final verification - Test Podman functionality + shell: | + sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers | keys[]' | grep -i podman + register: podman_driver_check + failed_when: false + + - name: Display final result + debug: + msg: | + Podman driver status: {{ 'SUCCESS - Driver loaded!' if 'podman' in (podman_driver_check.stdout | default('')) else 'Still checking...' }} + Available drivers: {{ podman_driver_check.stdout_lines | default(['none']) | join(', ') }} \ No newline at end of file diff --git a/configuration/playbooks/fix-nomad-systemd.yml b/configuration/playbooks/fix-nomad-systemd.yml new file mode 100644 index 0000000..959ab7b --- /dev/null +++ b/configuration/playbooks/fix-nomad-systemd.yml @@ -0,0 +1,88 @@ +--- +- name: Fix Nomad systemd service binary path + hosts: nomad_cluster + become: yes + + tasks: + - name: Check Nomad binary location + shell: which nomad + register: nomad_binary_path + + - name: Display binary path + debug: + msg: "Nomad binary 位于: {{ nomad_binary_path.stdout }}" + + - name: Stop Nomad service + systemd: + name: nomad + state: stopped + ignore_errors: yes + + - name: Update Nomad systemd service with correct binary path + copy: + content: | + [Unit] + Description=Nomad + Documentation=https://www.nomadproject.io/ + Requires=network-online.target + After=network-online.target + ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl + + [Service] + Type=notify + User=nomad + Group=nomad + ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl + ExecReload=/bin/kill -HUP $MAINPID + KillMode=process + Restart=on-failure + LimitNOFILE=65536 + + [Install] + WantedBy=multi-user.target + dest: /etc/systemd/system/nomad.service + mode: '0644' + notify: reload systemd + + - name: Reload systemd and start Nomad servers first + systemd: + name: nomad + state: started + enabled: yes + daemon_reload: yes + when: inventory_hostname in groups['nomad_servers'] + + - name: Wait for servers to be ready + pause: + seconds: 15 + when: inventory_hostname in groups['nomad_servers'] + + - name: Start Nomad clients + systemd: + name: nomad + state: started + enabled: yes + daemon_reload: yes + when: inventory_hostname in groups['nomad_clients'] + + - name: Wait for clients to connect + pause: + seconds: 10 + when: inventory_hostname in groups['nomad_clients'] + + - name: Check final service status + shell: systemctl status nomad --no-pager -l + register: service_status + ignore_errors: yes + + - name: Display service status + debug: + msg: | + ✅ 节点 {{ inventory_hostname }} 服务状态: + 📊 状态: {{ 'SUCCESS' if service_status.rc == 0 else 'FAILED' }} + 💾 二进制路径: {{ nomad_binary_path.stdout }} + + handlers: + - name: reload systemd + systemd: + daemon_reload: yes \ No newline at end of file diff --git 
a/configuration/playbooks/fix-podman-installation.yml b/configuration/playbooks/fix-podman-installation.yml new file mode 100644 index 0000000..27aa892 --- /dev/null +++ b/configuration/playbooks/fix-podman-installation.yml @@ -0,0 +1,79 @@ +--- +- name: Fix Podman installation on remaining nodes + hosts: semaphore,master,ash3c,hcs + become: yes + serial: 1 # 逐个处理,避免同时影响多个节点 + + tasks: + - name: Current node status + debug: + msg: "🔧 修复节点: {{ inventory_hostname }}" + + - name: Check if Podman is already installed + shell: podman --version 2>/dev/null || echo "NOT_INSTALLED" + register: podman_check + + - name: Install Podman if not present (semaphore special handling) + apt: + name: + - podman + - buildah + - skopeo + state: present + update_cache: yes + force_apt_get: yes + when: inventory_hostname == 'semaphore' and 'NOT_INSTALLED' in podman_check.stdout + ignore_errors: yes + + - name: Install Podman on other nodes + apt: + name: + - podman + - buildah + - skopeo + state: present + when: inventory_hostname != 'semaphore' + ignore_errors: yes + + - name: Install Python dependencies for podman-compose + apt: + name: + - python3-pip + - python3-setuptools + - python3-yaml + - python3-dotenv + state: present + ignore_errors: yes + + - name: Install podman-compose via pip + pip: + name: + - podman-compose + state: present + executable: pip3 + ignore_errors: yes + + - name: Alternative podman-compose installation via apt + apt: + name: podman-compose + state: present + ignore_errors: yes + + - name: Verify installations + shell: | + echo "Podman: $(podman --version 2>/dev/null || echo 'FAILED')" + echo "Podman Compose: $(podman-compose --version 2>/dev/null || echo 'FAILED')" + register: verify_result + + - name: Display verification results + debug: + msg: | + ✅ 节点 {{ inventory_hostname }} 验证结果: + {{ verify_result.stdout }} + + - name: Enable Podman socket + systemd: + name: podman.socket + enabled: yes + state: started + ignore_errors: yes \ No newline at end of file diff --git a/configuration/playbooks/install-nomad-direct-download.yml b/configuration/playbooks/install-nomad-direct-download.yml new file mode 100644 index 0000000..50b2783 --- /dev/null +++ b/configuration/playbooks/install-nomad-direct-download.yml @@ -0,0 +1,133 @@ +--- +- name: Install Nomad by direct download from HashiCorp + hosts: hcs + become: yes + vars: + nomad_version: "1.10.5" + nomad_url: "https://releases.hashicorp.com/nomad/{{ nomad_version }}/nomad_{{ nomad_version }}_linux_amd64.zip" + nomad_user: "nomad" + nomad_group: "nomad" + nomad_home: "/opt/nomad" + nomad_data_dir: "/opt/nomad/data" + nomad_config_dir: "/etc/nomad.d" + nomad_datacenter: "dc1" + nomad_region: "global" + nomad_server_addresses: + - "100.116.158.95:4647" # semaphore server address + + tasks: + - name: Create nomad user + user: + name: "{{ nomad_user }}" + group: "{{ nomad_group }}" + system: yes + shell: /bin/false + home: "{{ nomad_home }}" + create_home: yes + + - name: Create nomad directories + file: + path: "{{ item }}" + state: directory + owner: "{{ nomad_user }}" + group: "{{ nomad_group }}" + mode: '0755' + loop: + - "{{ nomad_home }}" + - "{{ nomad_data_dir }}" + - "{{ nomad_config_dir }}" + - /var/log/nomad + + - name: Install unzip package + apt: + name: unzip + state: present + update_cache: yes + + - name: Download Nomad binary + get_url: + url: "{{ nomad_url }}" + dest: "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip" + mode: '0644' + timeout: 300 + + - name: Extract Nomad binary + unarchive: + src: "/tmp/nomad_{{ 
nomad_version }}_linux_amd64.zip" + dest: /tmp + remote_src: yes + + - name: Copy Nomad binary to /usr/local/bin + copy: + src: /tmp/nomad + dest: /usr/local/bin/nomad + mode: '0755' + owner: root + group: root + remote_src: yes + + - name: Create Nomad client configuration + template: + src: templates/nomad-client.hcl.j2 + dest: "{{ nomad_config_dir }}/nomad.hcl" + owner: "{{ nomad_user }}" + group: "{{ nomad_group }}" + mode: '0640' + + - name: Create Nomad systemd service + copy: + content: | + [Unit] + Description=Nomad + Documentation=https://www.nomadproject.io/ + Requires=network-online.target + After=network-online.target + ConditionFileNotEmpty={{ nomad_config_dir }}/nomad.hcl + + [Service] + Type=notify + User={{ nomad_user }} + Group={{ nomad_group }} + ExecStart=/usr/local/bin/nomad agent -config={{ nomad_config_dir }} + ExecReload=/bin/kill -HUP $MAINPID + KillMode=process + Restart=on-failure + LimitNOFILE=65536 + + [Install] + WantedBy=multi-user.target + dest: /etc/systemd/system/nomad.service + mode: '0644' + + - name: Reload systemd daemon + systemd: + daemon_reload: yes + + - name: Enable and start Nomad service + systemd: + name: nomad + enabled: yes + state: started + + - name: Wait for Nomad to be ready + wait_for: + port: 4646 + host: localhost + delay: 5 + timeout: 60 + + - name: Verify Nomad installation + command: /usr/local/bin/nomad version + register: nomad_version_output + + - name: Display Nomad version + debug: + msg: "{{ nomad_version_output.stdout }}" + + - name: Clean up downloaded files + file: + path: "{{ item }}" + state: absent + loop: + - "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip" + - /tmp/nomad \ No newline at end of file diff --git a/configuration/playbooks/install-nomad-podman-driver.yml b/configuration/playbooks/install-nomad-podman-driver.yml new file mode 100644 index 0000000..c308872 --- /dev/null +++ b/configuration/playbooks/install-nomad-podman-driver.yml @@ -0,0 +1,131 @@ +--- +- name: Install Nomad Podman Driver Plugin + hosts: all + become: yes + vars: + nomad_user: nomad + nomad_data_dir: /opt/nomad/data + nomad_plugins_dir: "{{ nomad_data_dir }}/plugins" + podman_driver_version: "0.6.1" + podman_driver_url: "https://releases.hashicorp.com/nomad-driver-podman/{{ podman_driver_version }}/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip" + + tasks: + - name: Stop Nomad service + systemd: + name: nomad + state: stopped + + - name: Create plugins directory + file: + path: "{{ nomad_plugins_dir }}" + state: directory + owner: "{{ nomad_user }}" + group: "{{ nomad_user }}" + mode: '0755' + + - name: Download Nomad Podman driver + get_url: + url: "{{ podman_driver_url }}" + dest: "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip" + mode: '0644' + + - name: Extract Nomad Podman driver + unarchive: + src: "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip" + dest: "/tmp" + remote_src: yes + + - name: Install Nomad Podman driver + copy: + src: "/tmp/nomad-driver-podman" + dest: "{{ nomad_plugins_dir }}/nomad-driver-podman" + owner: "{{ nomad_user }}" + group: "{{ nomad_user }}" + mode: '0755' + remote_src: yes + + - name: Update Nomad configuration for plugin directory + blockinfile: + path: /etc/nomad.d/nomad.hcl + marker: "# {mark} PLUGIN DIRECTORY CONFIGURATION" + block: | + plugin_dir = "{{ nomad_plugins_dir }}" + insertafter: 'data_dir = "/opt/nomad/data"' + + - name: Fix Podman socket permissions + file: + path: /run/user/1001/podman/podman.sock + mode: '0666' + ignore_errors: yes 
+ + - name: Ensure nomad user can access Podman socket + user: + name: "{{ nomad_user }}" + groups: ben + append: yes + + - name: Start Nomad service + systemd: + name: nomad + state: started + enabled: yes + + - name: Wait for Nomad to be ready + wait_for: + port: 4646 + host: localhost + delay: 10 + timeout: 60 + + - name: Verify Nomad is running + systemd: + name: nomad + register: nomad_service_status + + - name: Display Nomad service status + debug: + msg: "Nomad service is {{ nomad_service_status.status.ActiveState }}" + + - name: Wait for plugins to load + pause: + seconds: 15 + + - name: Check available drivers + shell: | + sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self | grep -A 20 "Driver Status" + register: driver_status + failed_when: false + + - name: Display driver status + debug: + var: driver_status.stdout_lines + + - name: Test Podman driver functionality + shell: | + sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers | keys[]' + register: available_drivers + failed_when: false + + - name: Display available drivers + debug: + msg: "Available drivers: {{ available_drivers.stdout_lines | join(', ') }}" + + - name: Clean up downloaded files + file: + path: "{{ item }}" + state: absent + loop: + - "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip" + - "/tmp/nomad-driver-podman" + + - name: Final verification - Check if Podman driver is loaded + shell: | + sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers.podman.Detected' + register: podman_driver_detected + failed_when: false + + - name: Display final result + debug: + msg: | + Podman driver installation: {{ 'SUCCESS' if podman_driver_detected.stdout == 'true' else 'NEEDS VERIFICATION' }} + Driver detected: {{ podman_driver_detected.stdout | default('unknown') }} \ No newline at end of file diff --git a/configuration/playbooks/install-podman-compose.yml b/configuration/playbooks/install-podman-compose.yml new file mode 100644 index 0000000..7a1cb8b --- /dev/null +++ b/configuration/playbooks/install-podman-compose.yml @@ -0,0 +1,61 @@ +--- +- name: Install Podman Compose on all Nomad cluster nodes + hosts: nomad_cluster + become: yes + + tasks: + - name: Display target node + debug: + msg: "正在安装 Podman Compose 到节点: {{ inventory_hostname }}" + + - name: Update package cache + apt: + update_cache: yes + ignore_errors: yes + + - name: Install Podman and related tools + apt: + name: + - podman + - podman-compose + - buildah + - skopeo + state: present + ignore_errors: yes + + - name: Install additional dependencies + apt: + name: + - python3-pip + - python3-setuptools + state: present + ignore_errors: yes + + - name: Install podman-compose via pip if package manager failed + pip: + name: podman-compose + state: present + ignore_errors: yes + + - name: Verify Podman installation + shell: podman --version + register: podman_version + + - name: Verify Podman Compose installation + shell: podman-compose --version + register: podman_compose_version + ignore_errors: yes + + - name: Display installation results + debug: + msg: | + ✅ 节点 {{ inventory_hostname }} 安装结果: + 📦 Podman: {{ podman_version.stdout }} + 🐳 Podman Compose: {{ podman_compose_version.stdout if podman_compose_version.rc == 0 else '安装失败或不可用' }} + + - name: Ensure Podman socket is enabled + systemd: + name: podman.socket + enabled: yes + state: started + ignore_errors: yes \ No newline at end of file diff --git a/configuration/playbooks/maintenance/ops-toolkit.yml 
b/configuration/playbooks/maintenance/ops-toolkit.yml deleted file mode 100644 index 46428c8..0000000 --- a/configuration/playbooks/maintenance/ops-toolkit.yml +++ /dev/null @@ -1,131 +0,0 @@ ---- -- name: Operations Toolkit - Unified Management Dashboard - hosts: all - gather_facts: yes - - vars: - # 可用的运维脚本 - available_scripts: - - { name: "system-update", desc: "System package updates", file: "system-update.yml" } - - { name: "system-cleanup", desc: "System cleanup and maintenance", file: "system-cleanup.yml" } - - { name: "service-health", desc: "Service health monitoring", file: "service-health-check.yml" } - - { name: "security-hardening", desc: "Security hardening and backup", file: "security-hardening.yml" } - - { name: "docker-management", desc: "Docker container management", file: "docker-management.yml" } - - { name: "network-connectivity", desc: "Network connectivity check", file: "network-connectivity.yml" } - - { name: "certificate-management", desc: "SSL certificate monitoring", file: "certificate-management.yml" } - - tasks: - # 显示系统概览 - - name: Display system overview - debug: - msg: | - 🖥️ System Overview for {{ inventory_hostname }}: - 📊 OS: {{ ansible_distribution }} {{ ansible_distribution_version }} - 💾 Memory: {{ (ansible_memtotal_mb/1024)|round(1) }}GB total, {{ (ansible_memfree_mb/1024)|round(1) }}GB free - 💿 CPU: {{ ansible_processor_vcpus }} cores - 🏠 Architecture: {{ ansible_architecture }} - 🌐 IP: {{ ansible_default_ipv4.address }} - ⏰ Uptime: {{ ansible_uptime_seconds//86400 }}d {{ (ansible_uptime_seconds%86400)//3600 }}h {{ ((ansible_uptime_seconds%3600)//60) }}m - - # 快速系统状态检查 - - name: Quick system status check - shell: | - echo "=== DISK USAGE ===" - df -h | grep -E "(Filesystem|/dev/)" - echo "" - echo "=== MEMORY USAGE ===" - free -h - echo "" - echo "=== LOAD AVERAGE ===" - uptime - echo "" - echo "=== TOP PROCESSES ===" - ps aux --sort=-%cpu | head -6 - register: quick_status - - - name: Display quick status - debug: - msg: "{{ quick_status.stdout_lines }}" - - # 检查关键服务状态 - - name: Check critical services - systemd: - name: "{{ item }}" - register: service_status - loop: - - ssh - - systemd-resolved - - cron - failed_when: false - - - name: Display service status - debug: - msg: "🔧 {{ item.item }}: {{ item.status.ActiveState if item.status is defined else 'NOT FOUND' }}" - loop: "{{ service_status.results }}" - - # 检查最近的系统日志错误 - - name: Check recent system errors - shell: journalctl --since "1 hour ago" --priority=err --no-pager | tail -10 - register: recent_errors - failed_when: false - - - name: Display recent errors - debug: - msg: "🚨 Recent Errors: {{ recent_errors.stdout_lines if recent_errors.stdout_lines else ['No recent errors found'] }}" - - # 检查网络连接 - - name: Quick network check - shell: | - echo "=== NETWORK INTERFACES ===" - ip -br addr show - echo "" - echo "=== DEFAULT ROUTE ===" - ip route | grep default - echo "" - echo "=== DNS TEST ===" - nslookup google.com | grep -A1 "Name:" || echo "DNS resolution failed" - register: network_check - failed_when: false - - - name: Display network status - debug: - msg: "🌐 Network Status: {{ network_check.stdout_lines }}" - - # 显示可用的运维脚本 - - name: Display available operations scripts - debug: - msg: | - 🛠️ Available Operations Scripts: - {% for script in available_scripts %} - {{ loop.index }}. 
{{ script.name }}: {{ script.desc }} - {% endfor %} - - 💡 Usage Examples: - ansible-playbook -i inventory.ini system-cleanup.yml --limit {{ inventory_hostname }} - ansible-playbook -i inventory.ini docker-management.yml --limit lxc - ansible-playbook -i inventory.ini network-connectivity.yml --limit proxmox - - # 生成运维建议 - - name: Generate maintenance recommendations - debug: - msg: | - 💡 Maintenance Recommendations for {{ inventory_hostname }}: - - 🔄 Regular Tasks (Weekly): - - Run system-cleanup.yml to free up disk space - - Check service-health-check.yml for service status - - Review certificate-management.yml for expiring certificates - - 🔒 Security Tasks (Monthly): - - Execute security-hardening.yml for security updates - - Review network-connectivity.yml for network security - - 🐳 Container Tasks (As needed): - - Use docker-management.yml for Docker maintenance - - 📊 Monitoring Tasks (Daily): - - Quick check with ops-toolkit.yml (this script) - - ⚡ Emergency Tasks: - - Use system-update.yml for critical security patches - - Run network-connectivity.yml for connectivity issues \ No newline at end of file diff --git a/configuration/playbooks/migrate-to-podman-simple.yml b/configuration/playbooks/migrate-to-podman-simple.yml new file mode 100644 index 0000000..7688caa --- /dev/null +++ b/configuration/playbooks/migrate-to-podman-simple.yml @@ -0,0 +1,167 @@ +--- +- name: Migrate Nomad from Docker to Podman (Simple Version) + hosts: all + become: yes + vars: + nomad_user: nomad + nomad_config_dir: /etc/nomad.d + nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl" + + tasks: + - name: Stop Nomad service + systemd: + name: nomad + state: stopped + + - name: Backup current Nomad configuration + copy: + src: "{{ nomad_config_file }}" + dest: "{{ nomad_config_file }}.backup-{{ ansible_date_time.epoch }}" + remote_src: yes + + - name: Get nomad user info + getent: + database: passwd + key: "{{ nomad_user }}" + register: nomad_user_info + + - name: Set nomad user UID variable + set_fact: + nomad_uid: "{{ nomad_user_info.ansible_facts.getent_passwd[nomad_user][1] }}" + + - name: Enable lingering for nomad user + command: loginctl enable-linger {{ nomad_user }} + failed_when: false + + - name: Create runtime directory for nomad user + file: + path: "/run/user/{{ nomad_uid }}" + state: directory + owner: "{{ nomad_user }}" + group: "{{ nomad_user }}" + mode: '0700' + + - name: Start Podman socket as nomad user + shell: | + sudo -u {{ nomad_user }} XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} systemctl --user enable --now podman.socket + args: + creates: "/run/user/{{ nomad_uid }}/podman/podman.sock" + + - name: Create new Nomad configuration with Podman + copy: + content: | + datacenter = "dc1" + region = "global" + data_dir = "/opt/nomad/data" + + bind_addr = "0.0.0.0" + + client { + enabled = true + servers = [ + "100.116.158.95:4647", + ] + } + + # Docker plugin (disabled) + # plugin "docker" { + # config { + # allow_privileged = true + # volumes { + # enabled = true + # } + # } + # } + + plugin "podman" { + config { + socket_path = "unix:///run/user/{{ nomad_uid }}/podman/podman.sock" + volumes { + enabled = true + } + } + } + + consul { + address = "127.0.0.1:8500" + } + dest: "{{ nomad_config_file }}" + owner: root + group: root + mode: '0644' + + - name: Update Nomad systemd service to run as nomad user + copy: + content: | + [Unit] + Description=Nomad + Documentation=https://www.nomadproject.io/ + Requires=network-online.target + After=network-online.target + Wants=network-online.target + + 
[Service] + Type=notify + User={{ nomad_user }} + Group={{ nomad_user }} + ExecReload=/bin/kill -HUP $MAINPID + ExecStart=/usr/local/bin/nomad agent -config={{ nomad_config_dir }} + KillMode=process + Restart=on-failure + LimitNOFILE=65536 + Environment=XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} + + [Install] + WantedBy=multi-user.target + dest: /etc/systemd/system/nomad.service + owner: root + group: root + mode: '0644' + + - name: Reload systemd daemon + systemd: + daemon_reload: yes + + - name: Start Nomad service + systemd: + name: nomad + state: started + enabled: yes + + - name: Wait for Nomad to be ready (local check) + wait_for: + port: 4646 + host: localhost + delay: 5 + timeout: 60 + + - name: Verify Nomad is running + shell: systemctl is-active nomad + register: nomad_status + + - name: Display Nomad status + debug: + msg: "Nomad service status: {{ nomad_status.stdout }}" + + - name: Check Podman socket + stat: + path: "/run/user/{{ nomad_uid }}/podman/podman.sock" + register: podman_socket + + - name: Display Podman socket status + debug: + msg: "Podman socket exists: {{ podman_socket.stat.exists }}" + + - name: Test Podman as nomad user + shell: | + sudo -u {{ nomad_user }} XDG_RUNTIME_DIR=/run/user/{{ nomad_uid }} podman version --format json + register: podman_test + failed_when: false + + - name: Display Podman test result + debug: + msg: | + Podman test: {{ 'SUCCESS' if podman_test.rc == 0 else 'FAILED' }} + {% if podman_test.rc != 0 %} + Error: {{ podman_test.stderr }} + {% endif %} \ No newline at end of file diff --git a/configuration/playbooks/monitoring/network-connectivity.yml b/configuration/playbooks/monitoring/network-connectivity.yml deleted file mode 100644 index 989749b..0000000 --- a/configuration/playbooks/monitoring/network-connectivity.yml +++ /dev/null @@ -1,143 +0,0 @@ ---- -- name: Network Connectivity and Performance Check - hosts: all - gather_facts: yes - - vars: - test_domains: - - google.com - - github.com - - docker.io - - tailscale.com - test_ports: - - { host: "8.8.8.8", port: 53, name: "Google DNS" } - - { host: "1.1.1.1", port: 53, name: "Cloudflare DNS" } - - { host: "github.com", port: 443, name: "GitHub HTTPS" } - - { host: "docker.io", port: 443, name: "Docker Hub" } - - tasks: - # 基本网络信息 - - name: Get network interfaces - shell: ip addr show | grep -E "^[0-9]+:|inet " - register: network_interfaces - - - name: Display network interfaces - debug: - msg: "🌐 Network Interfaces: {{ network_interfaces.stdout_lines }}" - - # 检查默认路由 - - name: Check default route - shell: ip route | grep default - register: default_route - - - name: Display default route - debug: - msg: "🛣️ Default Route: {{ default_route.stdout }}" - - # DNS 解析测试 - - name: Test DNS resolution - shell: nslookup {{ item }} | grep -A2 "Name:" - register: dns_test - loop: "{{ test_domains }}" - failed_when: false - - - name: Display DNS test results - debug: - msg: "🔍 DNS Test for {{ item.item }}: {{ 'SUCCESS' if item.rc == 0 else 'FAILED' }}" - loop: "{{ dns_test.results }}" - - # 网络连通性测试 - - name: Test network connectivity (ping) - shell: ping -c 3 {{ item }} - register: ping_test - loop: "{{ test_domains }}" - failed_when: false - - - name: Display ping test results - debug: - msg: "🏓 Ping to {{ item.item }}: {{ 'SUCCESS' if item.rc == 0 else 'FAILED' }}" - loop: "{{ ping_test.results }}" - - # 端口连通性测试 - - name: Test port connectivity - wait_for: - host: "{{ item.host }}" - port: "{{ item.port }}" - timeout: 5 - register: port_test - loop: "{{ test_ports }}" - failed_when: false - 
- - name: Display port test results - debug: - msg: "🔌 {{ item.item.name }} ({{ item.item.host }}:{{ item.item.port }}): {{ 'SUCCESS' if not item.failed else 'FAILED' }}" - loop: "{{ port_test.results }}" - - # 检查 Tailscale 状态 - - name: Check Tailscale status - shell: tailscale status - register: tailscale_status - failed_when: false - - - name: Display Tailscale status - debug: - msg: "🔗 Tailscale Status: {{ 'CONNECTED' if tailscale_status.rc == 0 else 'NOT CONNECTED' }}" - - - name: Show Tailscale details - debug: - msg: "{{ tailscale_status.stdout_lines }}" - when: tailscale_status.rc == 0 - - # 检查防火墙状态 - - name: Check UFW status (Ubuntu/Debian) - shell: ufw status - register: ufw_status - failed_when: false - when: ansible_os_family == "Debian" - - - name: Display UFW status - debug: - msg: "🛡️ UFW Firewall: {{ ufw_status.stdout_lines }}" - when: ansible_os_family == "Debian" and ufw_status.rc == 0 - - # 检查 iptables 规则 - - name: Check iptables rules - shell: iptables -L -n | head -20 - register: iptables_rules - failed_when: false - become: yes - - - name: Display iptables summary - debug: - msg: "🔥 Iptables Rules: {{ iptables_rules.stdout_lines[:10] }}" - when: iptables_rules.rc == 0 - - # 网络性能测试 - - name: Test download speed (small file) - shell: curl -o /dev/null -s -w "%{time_total}" http://speedtest.wdc01.softlayer.com/downloads/test10.zip - register: download_speed - failed_when: false - - - name: Display download speed test - debug: - msg: "⚡ Download Speed Test: {{ download_speed.stdout }}s for 10MB file" - when: download_speed.rc == 0 - - # 检查网络统计 - - name: Get network statistics - shell: cat /proc/net/dev | grep -v "lo:" | grep ":" - register: network_stats - - - name: Display network statistics - debug: - msg: "📊 Network Stats: {{ network_stats.stdout_lines }}" - - # 生成网络健康报告 - - name: Generate network health summary - debug: - msg: | - 🌐 Network Health Summary for {{ inventory_hostname }}: - ✅ DNS Resolution: {{ (dns_test.results | selectattr('rc', 'equalto', 0) | list | length) }}/{{ test_domains | length }} domains - ✅ Ping Connectivity: {{ (ping_test.results | selectattr('rc', 'equalto', 0) | list | length) }}/{{ test_domains | length }} hosts - ✅ Port Connectivity: {{ (port_test.results | rejectattr('failed', 'defined') | list | length) }}/{{ test_ports | length }} ports - ✅ Tailscale: {{ 'Connected' if tailscale_status.rc == 0 else 'Disconnected' }} \ No newline at end of file diff --git a/configuration/playbooks/monitoring/service-health-check.yml b/configuration/playbooks/monitoring/service-health-check.yml deleted file mode 100644 index 51e36d9..0000000 --- a/configuration/playbooks/monitoring/service-health-check.yml +++ /dev/null @@ -1,135 +0,0 @@ ---- -- name: Service Health Check and Monitoring - hosts: all - become: yes - gather_facts: yes - - vars: - critical_services: - - ssh - - systemd-resolved - - cron - web_services: - - nginx - - apache2 - database_services: - - mysql - - mariadb - - postgresql - container_services: - - docker - - containerd - network_services: - - tailscale - - cloudflared - - tasks: - # 检查关键系统服务 - - name: Check critical system services - systemd: - name: "{{ item }}" - register: critical_service_status - loop: "{{ critical_services }}" - failed_when: false - - - name: Report critical service issues - debug: - msg: "⚠️ Critical service {{ item.item }} is {{ item.status.ActiveState | default('not found') }}" - loop: "{{ critical_service_status.results }}" - when: item.status is defined and item.status.ActiveState != "active" - - # 检查 Web 
服务 - - name: Check web services - systemd: - name: "{{ item }}" - register: web_service_status - loop: "{{ web_services }}" - failed_when: false - - - name: Report web service status - debug: - msg: "🌐 Web service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}" - loop: "{{ web_service_status.results }}" - when: item.status is defined - - # 检查数据库服务 - - name: Check database services - systemd: - name: "{{ item }}" - register: db_service_status - loop: "{{ database_services }}" - failed_when: false - - - name: Report database service status - debug: - msg: "🗄️ Database service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}" - loop: "{{ db_service_status.results }}" - when: item.status is defined - - # 检查容器服务 - - name: Check container services - systemd: - name: "{{ item }}" - register: container_service_status - loop: "{{ container_services }}" - failed_when: false - - - name: Report container service status - debug: - msg: "📦 Container service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}" - loop: "{{ container_service_status.results }}" - when: item.status is defined - - # 检查网络服务 - - name: Check network services - systemd: - name: "{{ item }}" - register: network_service_status - loop: "{{ network_services }}" - failed_when: false - - - name: Report network service status - debug: - msg: "🌐 Network service {{ item.item }}: {{ item.status.ActiveState | default('not installed') }}" - loop: "{{ network_service_status.results }}" - when: item.status is defined - - # 检查系统负载 - - name: Check system load - shell: uptime - register: system_load - - - name: Display system load - debug: - msg: "📊 System Load: {{ system_load.stdout }}" - - # 检查磁盘空间警告 - - name: Check disk space usage - shell: df -h | awk '$5 > 80 {print $0}' - register: disk_warning - changed_when: false - - - name: Warn about high disk usage - debug: - msg: "⚠️ High disk usage detected: {{ disk_warning.stdout_lines }}" - when: disk_warning.stdout_lines | length > 0 - - # 检查内存使用率 - - name: Check memory usage percentage - shell: free | awk 'NR==2{printf "%.2f%%", $3*100/$2}' - register: memory_percent - - - name: Display memory usage - debug: - msg: "🧠 Memory Usage: {{ memory_percent.stdout }}" - - # 检查最近的系统错误 - - name: Check recent system errors - shell: journalctl --since "1 hour ago" --priority=err --no-pager | tail -10 - register: recent_errors - changed_when: false - - - name: Display recent errors - debug: - msg: "🚨 Recent system errors: {{ recent_errors.stdout_lines }}" - when: recent_errors.stdout_lines | length > 0 \ No newline at end of file diff --git a/configuration/playbooks/remove-docker-install-podman.yml b/configuration/playbooks/remove-docker-install-podman.yml new file mode 100644 index 0000000..09ff808 --- /dev/null +++ b/configuration/playbooks/remove-docker-install-podman.yml @@ -0,0 +1,120 @@ +--- +- name: 移除 Docker 并安装 Podman - 新 Server 节点 + hosts: ash2e,ash1d,ch2 + become: yes + gather_facts: no + serial: 1 # 逐个节点处理,避免并发冲突 + + tasks: + - name: 显示当前处理的节点 + debug: + msg: "🔧 正在处理节点: {{ inventory_hostname }}" + + - name: 检查 Docker 服务状态 + shell: systemctl is-active docker 2>/dev/null || echo "inactive" + register: docker_status + changed_when: false + + - name: 停止 Docker 服务 + systemd: + name: docker + state: stopped + enabled: no + ignore_errors: yes + when: docker_status.stdout == "active" + + - name: 停止 Docker socket + systemd: + name: docker.socket + state: stopped + enabled: no + ignore_errors: yes + + - name: 移除 Docker 相关包 + apt: + 
name: + - docker-ce + - docker-ce-cli + - containerd.io + - docker-buildx-plugin + - docker-compose-plugin + - docker.io + - docker-doc + - docker-compose + - docker-registry + - containerd + - runc + state: absent + purge: yes + ignore_errors: yes + + - name: 清理 Docker 数据目录 + file: + path: "{{ item }}" + state: absent + loop: + - /var/lib/docker + - /var/lib/containerd + - /etc/docker + - /etc/containerd + ignore_errors: yes + + - name: 清理 Docker 用户组 + group: + name: docker + state: absent + ignore_errors: yes + + - name: 更新包缓存 + apt: + update_cache: yes + cache_valid_time: 3600 + + - name: 安装 Podman 及相关工具 + apt: + name: + - podman + - buildah + - skopeo + - podman-compose + state: present + retries: 3 + delay: 10 + + - name: 启用 Podman socket 服务 + systemd: + name: podman.socket + enabled: yes + state: started + ignore_errors: yes + + - name: 创建 Podman 用户服务目录 + file: + path: /etc/systemd/user + state: directory + mode: '0755' + + - name: 验证 Podman 安装 + shell: podman --version + register: podman_version + + - name: 验证 Podman Compose 安装 + shell: podman-compose --version 2>/dev/null || echo "未安装" + register: podman_compose_version + + - name: 检查 Docker 清理状态 + shell: systemctl is-active docker 2>/dev/null || echo "已移除" + register: final_docker_status + + - name: 显示节点处理结果 + debug: + msg: | + ✅ 节点 {{ inventory_hostname }} 处理完成 + 🐳 Docker 状态: {{ final_docker_status.stdout }} + 📦 Podman 版本: {{ podman_version.stdout }} + 🔧 Compose 状态: {{ podman_compose_version.stdout }} + + - name: 清理 apt 缓存 + apt: + autoclean: yes + autoremove: yes \ No newline at end of file diff --git a/configuration/playbooks/restart-tailscale.yml b/configuration/playbooks/restart-tailscale.yml new file mode 100644 index 0000000..46688b7 --- /dev/null +++ b/configuration/playbooks/restart-tailscale.yml @@ -0,0 +1,39 @@ +--- +- name: Restart Tailscale to fix DNS issues + hosts: hcp1,hcp2 + become: yes + + tasks: + - name: Check current DNS configuration + shell: cat /etc/resolv.conf + register: dns_before + + - name: Display current DNS config + debug: + msg: "Current DNS config: {{ dns_before.stdout_lines }}" + + - name: Restart tailscaled service + systemd: + name: tailscaled + state: restarted + + - name: Wait for tailscale to stabilize + wait_for: + timeout: 10 + + - name: Check DNS configuration after restart + shell: cat /etc/resolv.conf + register: dns_after + + - name: Display new DNS config + debug: + msg: "New DNS config: {{ dns_after.stdout_lines }}" + + - name: Test DNS resolution + shell: nslookup apt.releases.hashicorp.com + register: dns_test + ignore_errors: yes + + - name: Display DNS test result + debug: + msg: "DNS test result: {{ dns_test.stdout_lines }}" \ No newline at end of file diff --git a/configuration/playbooks/security/certificate-management.yml b/configuration/playbooks/security/certificate-management.yml deleted file mode 100644 index 600bbe9..0000000 --- a/configuration/playbooks/security/certificate-management.yml +++ /dev/null @@ -1,152 +0,0 @@ ---- -- name: SSL Certificate Management and Monitoring - hosts: all - gather_facts: yes - - vars: - # 常见证书路径 - cert_paths: - - /etc/ssl/certs - - /etc/letsencrypt/live - - /etc/nginx/ssl - - /etc/apache2/ssl - - /usr/local/share/ca-certificates - - # 需要检查的服务端口 - ssl_services: - - { name: "HTTPS", port: 443 } - - { name: "SMTPS", port: 465 } - - { name: "IMAPS", port: 993 } - - { name: "LDAPS", port: 636 } - - tasks: - # 检查证书目录 - - name: Check certificate directories - stat: - path: "{{ item }}" - register: cert_dirs - loop: "{{ cert_paths }}" - - - name: 
List existing certificate directories - debug: - msg: "📁 Certificate directory {{ item.item }}: {{ 'EXISTS' if item.stat.exists else 'NOT FOUND' }}" - loop: "{{ cert_dirs.results }}" - - # 查找证书文件 - - name: Find certificate files - find: - paths: "{{ cert_paths }}" - patterns: "*.crt,*.pem,*.cert" - recurse: yes - register: cert_files - - - name: Display found certificates - debug: - msg: "🔐 Found {{ cert_files.files | length }} certificate files" - - # 检查证书过期时间 - - name: Check certificate expiration - shell: | - if [ -f "{{ item.path }}" ]; then - openssl x509 -in "{{ item.path }}" -noout -enddate 2>/dev/null | cut -d= -f2 - fi - register: cert_expiry - loop: "{{ cert_files.files[:10] }}" # 限制检查前10个证书 - failed_when: false - - - name: Display certificate expiration dates - debug: - msg: "📅 {{ item.item.path | basename }}: expires {{ item.stdout if item.stdout else 'INVALID/UNREADABLE' }}" - loop: "{{ cert_expiry.results }}" - when: item.stdout != "" - - # 检查即将过期的证书 (30天内) - - name: Check certificates expiring soon - shell: | - if [ -f "{{ item.path }}" ]; then - exp_date=$(openssl x509 -in "{{ item.path }}" -noout -enddate 2>/dev/null | cut -d= -f2) - if [ ! -z "$exp_date" ]; then - exp_epoch=$(date -d "$exp_date" +%s 2>/dev/null) - now_epoch=$(date +%s) - days_left=$(( (exp_epoch - now_epoch) / 86400 )) - if [ $days_left -lt 30 ]; then - echo "WARNING: $days_left days left" - else - echo "OK: $days_left days left" - fi - fi - fi - register: cert_warnings - loop: "{{ cert_files.files[:10] }}" - failed_when: false - - - name: Display certificate warnings - debug: - msg: "⚠️ {{ item.item.path | basename }}: {{ item.stdout }}" - loop: "{{ cert_warnings.results }}" - when: item.stdout != "" and "WARNING" in item.stdout - - # 检查 Let's Encrypt 证书 - - name: Check Let's Encrypt certificates - shell: certbot certificates 2>/dev/null || echo "Certbot not installed" - register: letsencrypt_certs - failed_when: false - - - name: Display Let's Encrypt status - debug: - msg: "🔒 Let's Encrypt: {{ letsencrypt_certs.stdout_lines }}" - when: "'not installed' not in letsencrypt_certs.stdout" - - # 检查 SSL 服务端口 - - name: Check SSL service ports - wait_for: - port: "{{ item.port }}" - timeout: 3 - register: ssl_ports - loop: "{{ ssl_services }}" - failed_when: false - - - name: Display SSL service status - debug: - msg: "🔌 {{ item.item.name }} (port {{ item.item.port }}): {{ 'LISTENING' if not item.failed else 'NOT AVAILABLE' }}" - loop: "{{ ssl_ports.results }}" - - # 测试 HTTPS 连接 - - name: Test HTTPS connection to localhost - uri: - url: "https://{{ ansible_default_ipv4.address }}" - method: GET - validate_certs: no - timeout: 5 - register: https_test - failed_when: false - when: ssl_ports.results[0] is defined and not ssl_ports.results[0].failed - - - name: Display HTTPS test result - debug: - msg: "🌐 HTTPS Test: {{ 'SUCCESS' if https_test.status is defined else 'FAILED' }}" - when: https_test is defined - - # 检查证书链 - - name: Check certificate chain for HTTPS - shell: | - echo | openssl s_client -connect {{ ansible_default_ipv4.address }}:443 -servername {{ ansible_hostname }} 2>/dev/null | openssl x509 -noout -subject -issuer - register: cert_chain - failed_when: false - when: ssl_ports.results[0] is defined and not ssl_ports.results[0].failed - - - name: Display certificate chain info - debug: - msg: "🔗 Certificate Chain: {{ cert_chain.stdout_lines }}" - when: cert_chain is defined and cert_chain.rc == 0 - - # 生成证书健康报告 - - name: Generate certificate health summary - debug: - msg: | - 🔐 Certificate Health 
Summary for {{ inventory_hostname }}: - 📁 Certificate directories found: {{ (cert_dirs.results | selectattr('stat.exists') | list | length) }} - 📄 Certificate files found: {{ cert_files.files | length }} - ⚠️ Certificates expiring soon: {{ (cert_warnings.results | selectattr('stdout', 'search', 'WARNING') | list | length) }} - 🔒 Let's Encrypt: {{ 'Configured' if 'not installed' not in letsencrypt_certs.stdout else 'Not installed' }} - 🌐 SSL Services: {{ (ssl_ports.results | rejectattr('failed') | list | length) }}/{{ ssl_services | length }} available \ No newline at end of file diff --git a/configuration/playbooks/security/security-hardening.yml b/configuration/playbooks/security/security-hardening.yml deleted file mode 100644 index f5cdd20..0000000 --- a/configuration/playbooks/security/security-hardening.yml +++ /dev/null @@ -1,119 +0,0 @@ ---- -- name: Security Hardening and Backup - hosts: all - become: yes - gather_facts: yes - - tasks: - # SSH 安全配置检查 - - name: Check SSH configuration security - lineinfile: - path: /etc/ssh/sshd_config - regexp: "{{ item.regexp }}" - line: "{{ item.line }}" - backup: yes - loop: - - { regexp: '^#?PermitRootLogin', line: 'PermitRootLogin no' } - - { regexp: '^#?PasswordAuthentication', line: 'PasswordAuthentication no' } - - { regexp: '^#?X11Forwarding', line: 'X11Forwarding no' } - - { regexp: '^#?MaxAuthTries', line: 'MaxAuthTries 3' } - notify: restart ssh - when: ansible_os_family == "Debian" - - # 防火墙状态检查 - - name: Check UFW firewall status - shell: ufw status - register: ufw_status - changed_when: false - failed_when: false - when: ansible_os_family == "Debian" - - - name: Display firewall status - debug: - msg: "🔥 Firewall Status: {{ ufw_status.stdout_lines }}" - when: ansible_os_family == "Debian" and ufw_status.stdout_lines is defined - - # 检查可疑登录 - - name: Check for failed login attempts - shell: grep "Failed password" /var/log/auth.log | tail -10 - register: failed_logins - changed_when: false - failed_when: false - - - name: Report suspicious login attempts - debug: - msg: "🚨 Recent failed logins: {{ failed_logins.stdout_lines }}" - when: failed_logins.stdout_lines | length > 0 - - # 检查 root 用户活动 - - name: Check recent root activity - shell: grep "sudo.*root" /var/log/auth.log | tail -5 - register: root_activity - changed_when: false - failed_when: false - - - name: Display root activity - debug: - msg: "👑 Recent root activity: {{ root_activity.stdout_lines }}" - when: root_activity.stdout_lines | length > 0 - - # 备份重要配置文件 - - name: Create backup directory - file: - path: /backup/configs - state: directory - mode: '0700' - - - name: Backup important configuration files - copy: - src: "{{ item }}" - dest: "/backup/configs/{{ item | basename }}.{{ ansible_date_time.epoch }}" - remote_src: yes - backup: yes - loop: - - /etc/ssh/sshd_config - - /etc/hosts - - /etc/fstab - - /etc/crontab - failed_when: false - - # 检查系统完整性 - - name: Check for world-writable files - shell: find /etc /usr /bin /sbin -type f -perm -002 2>/dev/null | head -10 - register: world_writable - changed_when: false - - - name: Report world-writable files - debug: - msg: "⚠️ World-writable files found: {{ world_writable.stdout_lines }}" - when: world_writable.stdout_lines | length > 0 - - # 检查 SUID 文件 - - name: Check for SUID files - shell: find /usr /bin /sbin -type f -perm -4000 2>/dev/null - register: suid_files - changed_when: false - - - name: Display SUID files count - debug: - msg: "🔐 Found {{ suid_files.stdout_lines | length }} SUID files" - - # 更新系统时间 - - name: Sync 
system time - shell: timedatectl set-ntp true - failed_when: false - - - name: Check time synchronization - shell: timedatectl status - register: time_status - - - name: Display time sync status - debug: - msg: "🕐 Time sync: {{ time_status.stdout_lines | select('match', '.*synchronized.*') | list }}" - - handlers: - - name: restart ssh - systemd: - name: ssh - state: restarted - when: ansible_os_family == "Debian" \ No newline at end of file diff --git a/configuration/playbooks/setup-disk-monitoring.yml b/configuration/playbooks/setup-disk-monitoring.yml new file mode 100644 index 0000000..f513dba --- /dev/null +++ b/configuration/playbooks/setup-disk-monitoring.yml @@ -0,0 +1,187 @@ +--- +- name: 部署 Telegraf 硬盘监控到 Nomad 集群 + hosts: all + become: yes + vars: + # 连接现有的 InfluxDB 2.x + Grafana 监控栈 + influxdb_url: "{{ influxdb_url | default('http://influxdb1.tailnet-68f9.ts.net:8086') }}" + influxdb_token: "{{ influxdb_token }}" + influxdb_org: "{{ influxdb_org | default('nomad') }}" + influxdb_bucket: "{{ influxdb_bucket | default('nomad_monitoring') }}" + + # 远程 Telegraf 配置模式(优先) + use_remote_config: "{{ use_remote_config | default(true) }}" + telegraf_config_url: "{{ telegraf_config_url | default('') }}" + + # 硬盘监控阈值 + disk_usage_warning: 80 # 80% 使用率警告 + disk_usage_critical: 90 # 90% 使用率严重告警 + + # 监控间隔(秒) + collection_interval: 30 + + tasks: + - name: 显示正在处理的节点 + debug: + msg: "🔧 正在为节点 {{ inventory_hostname }} 安装硬盘监控" + + - name: 添加 InfluxData 仓库密钥 + apt_key: + url: https://repos.influxdata.com/influxdata-archive_compat.key + state: present + retries: 3 + delay: 5 + + - name: 添加 InfluxData 仓库 + apt_repository: + repo: "deb https://repos.influxdata.com/ubuntu {{ ansible_distribution_release }} stable" + state: present + update_cache: yes + retries: 3 + delay: 5 + + - name: 安装 Telegraf + apt: + name: telegraf + state: present + update_cache: yes + retries: 3 + delay: 10 + + - name: 创建 Telegraf 配置目录 + file: + path: /etc/telegraf/telegraf.d + state: directory + owner: telegraf + group: telegraf + mode: '0755' + + - name: 清理旧的 Telegraf 日志文件(节省硬盘空间) + file: + path: "{{ item }}" + state: absent + loop: + - /var/log/telegraf + - /var/log/telegraf.log + ignore_errors: yes + + - name: 禁用 Telegraf 日志目录创建 + file: + path: /var/log/telegraf + state: absent + ignore_errors: yes + + - name: 创建 Telegraf 环境变量文件 + template: + src: telegraf-env.j2 + dest: /etc/default/telegraf + owner: root + group: root + mode: '0600' + backup: yes + notify: restart telegraf + + - name: 创建 Telegraf systemd 服务文件(支持远程配置) + template: + src: telegraf.service.j2 + dest: /etc/systemd/system/telegraf.service + owner: root + group: root + mode: '0644' + backup: yes + notify: + - reload systemd + - restart telegraf + when: telegraf_config_url is defined and telegraf_config_url != '' + + - name: 生成 Telegraf 主配置文件(本地配置模式) + template: + src: telegraf.conf.j2 + dest: /etc/telegraf/telegraf.conf + owner: telegraf + group: telegraf + mode: '0644' + backup: yes + notify: restart telegraf + when: telegraf_config_url is not defined or telegraf_config_url == '' + + - name: 生成硬盘监控配置 + template: + src: disk-monitoring.conf.j2 + dest: /etc/telegraf/telegraf.d/disk-monitoring.conf + owner: telegraf + group: telegraf + mode: '0644' + backup: yes + notify: restart telegraf + + - name: 生成系统监控配置 + template: + src: system-monitoring.conf.j2 + dest: /etc/telegraf/telegraf.d/system-monitoring.conf + owner: telegraf + group: telegraf + mode: '0644' + backup: yes + notify: restart telegraf + + - name: 启用并启动 Telegraf 服务 + systemd: + name: telegraf + state: 
started + enabled: yes + daemon_reload: yes + + - name: 验证 Telegraf 状态 + systemd: + name: telegraf + register: telegraf_status + + - name: 检查 InfluxDB 连接 + uri: + url: "{{ influxdb_url }}/ping" + method: GET + timeout: 5 + register: influxdb_ping + ignore_errors: yes + delegate_to: localhost + run_once: true + + - name: 显示 InfluxDB 连接状态 + debug: + msg: "{{ '✅ InfluxDB 连接正常' if influxdb_ping.status == 204 else '❌ InfluxDB 连接失败,请检查配置' }}" + run_once: true + + - name: 显示 Telegraf 状态 + debug: + msg: "✅ Telegraf 状态: {{ telegraf_status.status.ActiveState }}" + + - name: 检查硬盘使用情况 + shell: | + df -h | grep -vE '^Filesystem|tmpfs|cdrom|udev' | awk '{print $5 " " $1 " " $6}' | while read output; + do + usage=$(echo $output | awk '{print $1}' | sed 's/%//g') + partition=$(echo $output | awk '{print $2}') + mount=$(echo $output | awk '{print $3}') + if [ $usage -ge {{ disk_usage_warning }} ]; then + echo "⚠️ 警告: $mount ($partition) 使用率 $usage%" + else + echo "✅ $mount ($partition) 使用率 $usage%" + fi + done + register: disk_check + changed_when: false + + - name: 显示硬盘检查结果 + debug: + msg: "{{ disk_check.stdout_lines }}" + + handlers: + - name: reload systemd + systemd: + daemon_reload: yes + + - name: restart telegraf + systemd: + name: telegraf + state: restarted \ No newline at end of file diff --git a/configuration/playbooks/setup-new-nomad-nodes.yml b/configuration/playbooks/setup-new-nomad-nodes.yml new file mode 100644 index 0000000..802587d --- /dev/null +++ b/configuration/playbooks/setup-new-nomad-nodes.yml @@ -0,0 +1,76 @@ +--- +- name: 安装并配置新的 Nomad Server 节点 + hosts: ash2e,ash1d,ch2 + become: yes + gather_facts: no + + tasks: + - name: 更新包缓存 + apt: + update_cache: yes + cache_valid_time: 3600 + retries: 3 + delay: 10 + + - name: 安装依赖包 + apt: + name: + - wget + - curl + - unzip + - podman + - buildah + - skopeo + state: present + retries: 3 + delay: 10 + + - name: 检查 Nomad 是否已安装 + shell: which nomad || echo "not_found" + register: nomad_check + changed_when: false + + - name: 下载并安装 Nomad + block: + - name: 下载 Nomad 1.10.5 + get_url: + url: "https://releases.hashicorp.com/nomad/1.10.5/nomad_1.10.5_linux_amd64.zip" + dest: "/tmp/nomad.zip" + mode: '0644' + + - name: 解压 Nomad + unarchive: + src: "/tmp/nomad.zip" + dest: "/usr/bin/" + remote_src: yes + owner: root + group: root + mode: '0755' + + - name: 清理临时文件 + file: + path: "/tmp/nomad.zip" + state: absent + when: nomad_check.stdout == "not_found" + + - name: 验证 Nomad 安装 + shell: nomad version + register: nomad_version_output + + - name: 显示安装结果 + debug: + msg: | + ✅ 节点 {{ inventory_hostname }} 软件安装完成 + 📦 Podman: {{ ansible_facts.packages.podman[0].version if ansible_facts.packages.podman is defined else 'checking...' }} + 🎯 Nomad: {{ nomad_version_output.stdout.split('\n')[0] }} + + - name: 启用 Podman socket + systemd: + name: podman.socket + enabled: yes + state: started + ignore_errors: yes + + - name: 继续完整配置 + debug: + msg: "软件安装完成,现在将运行完整的 Nomad 配置..." 
\ No newline at end of file diff --git a/configuration/templates/disk-monitoring.conf.j2 b/configuration/templates/disk-monitoring.conf.j2 new file mode 100644 index 0000000..3a2ef44 --- /dev/null +++ b/configuration/templates/disk-monitoring.conf.j2 @@ -0,0 +1,68 @@ +# 硬盘监控配置 +# 监控所有挂载点的硬盘使用情况 + +# 硬盘使用率监控 +[[inputs.disk]] + ## 忽略的文件系统类型 + ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] + + ## 监控所有挂载点 + mount_points = ["/", "/var", "/tmp", "/opt", "/home"] + + ## 标签配置 + [inputs.disk.tags] + service = "disk-monitoring" + +# 硬盘 I/O 监控 +[[inputs.diskio]] + ## 监控所有设备 + devices = ["sda", "sdb", "sdc", "sdd", "nvme0n1", "nvme1n1"] + + ## 跳过序列号收集以提高性能 + skip_serial_number = true + + [inputs.diskio.tags] + service = "disk-io-monitoring" + +# 文件系统 inode 监控 +[[inputs.disk]] + ## 监控 inode 使用情况 + ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] + + ## 收集 inode 信息 + [inputs.disk.tags] + service = "inode-monitoring" + +# 进程监控(可选,用于监控可能占用大量硬盘的进程) +[[inputs.procstat]] + ## 监控 Docker 进程(如果存在) + pattern = "docker" + + [inputs.procstat.tags] + service = "docker-process" + +[[inputs.procstat]] + ## 监控 Podman 进程 + pattern = "podman" + + [inputs.procstat.tags] + service = "podman-process" + +[[inputs.procstat]] + ## 监控 Nomad 进程 + pattern = "nomad" + + [inputs.procstat.tags] + service = "nomad-process" + +# 日志文件大小监控 +[[inputs.filestat]] + files = [ + "/var/log/nomad/*.log", + "/var/log/syslog", + "/var/log/kern.log", + "/var/log/auth.log" + ] + + [inputs.filestat.tags] + service = "log-monitoring" \ No newline at end of file diff --git a/configuration/templates/system-monitoring.conf.j2 b/configuration/templates/system-monitoring.conf.j2 new file mode 100644 index 0000000..245315f --- /dev/null +++ b/configuration/templates/system-monitoring.conf.j2 @@ -0,0 +1,68 @@ +# 系统监控配置 +# CPU、内存、网络等系统资源监控 + +# CPU 监控 +[[inputs.cpu]] + ## 是否收集每个 CPU 核心的信息 + percpu = true + ## 是否收集总 CPU 信息 + totalcpu = true + ## 收集字段 + collect_cpu_time = false + ## 报告活跃的 CPU + report_active = false + + [inputs.cpu.tags] + service = "cpu-monitoring" + +# 内存监控 +[[inputs.mem]] + [inputs.mem.tags] + service = "memory-monitoring" + +# 网络接口监控 +[[inputs.net]] + ## 接口配置 + interfaces = ["eth*", "en*", "tailscale*"] + + [inputs.net.tags] + service = "network-monitoring" + +# 系统负载监控 +[[inputs.system]] + [inputs.system.tags] + service = "system-load" + +# 内核统计 +[[inputs.kernel]] + [inputs.kernel.tags] + service = "kernel-stats" + +# 网络统计 +[[inputs.netstat]] + [inputs.netstat.tags] + service = "network-stats" + +# 交换分区监控 +[[inputs.swap]] + [inputs.swap.tags] + service = "swap-monitoring" + +# 服务状态监控 +[[inputs.systemd_units]] + ## 监控的服务 + units = ["nomad.service", "docker.service", "podman.service", "telegraf.service", "tailscaled.service"] + + [inputs.systemd_units.tags] + service = "service-monitoring" + +# 硬盘健康状态监控(如果支持 SMART) +[[inputs.smart]] + ## SMART 监控路径 + path_smartctl = "/usr/sbin/smartctl" + + ## 超时设置 + timeout = "30s" + + [inputs.smart.tags] + service = "smart-monitoring" \ No newline at end of file diff --git a/configuration/templates/telegraf-env.j2 b/configuration/templates/telegraf-env.j2 new file mode 100644 index 0000000..e7a9be7 --- /dev/null +++ b/configuration/templates/telegraf-env.j2 @@ -0,0 +1,7 @@ +# Telegraf 环境变量配置 +# InfluxDB 2.x 认证信息 + +INFLUX_TOKEN={{ influxdb_token }} +INFLUX_ORG={{ influxdb_org }} +INFLUX_BUCKET={{ influxdb_bucket }} +INFLUX_URL={{ influxdb_url }} \ No newline at end of file diff --git a/configuration/templates/telegraf.conf.j2 
b/configuration/templates/telegraf.conf.j2 new file mode 100644 index 0000000..62342b2 --- /dev/null +++ b/configuration/templates/telegraf.conf.j2 @@ -0,0 +1,53 @@ +# Telegraf 主配置文件 +# Nomad 集群硬盘监控配置 + +# 全局设置 +[global_tags] + nomad_cluster = "production" + node_role = "{{ nomad_role | default('unknown') }}" + hostname = "{{ inventory_hostname }}" + +# Agent 配置 +[agent] + interval = "{{ collection_interval | default(30) }}s" + round_interval = true + metric_batch_size = 1000 + metric_buffer_limit = 10000 + collection_jitter = "2s" + flush_interval = "10s" + flush_jitter = "0s" + precision = "" + hostname = "{{ inventory_hostname }}" + omit_hostname = false + +# 输出配置 - InfluxDB 2.x +[[outputs.influxdb_v2]] + urls = ["{{ influxdb_url }}"] + token = "{{ influxdb_token }}" + organization = "{{ influxdb_org | default('nomad') }}" + bucket = "{{ influxdb_bucket | default('nomad_monitoring') }}" + + ## 连接配置 + timeout = "10s" + max_retries = 3 + retry_timeout = "5s" + + ## 数据精度 + precision = "s" + + ## TLS 配置(如果需要) + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + # insecure_skip_verify = false + +# 日志配置 - 禁用本地日志以节省硬盘空间 +[log] + ## 只输出错误日志到 syslog,不生成本地文件 + level = "ERROR" + ## 禁用本地日志文件 + # file = "/var/log/telegraf/telegraf.log" + ## 使用 syslog 替代本地文件 + logtarget = "syslog" + ## 禁用日志轮转 + logrotate = false \ No newline at end of file diff --git a/configuration/templates/telegraf.service.j2 b/configuration/templates/telegraf.service.j2 new file mode 100644 index 0000000..da400d5 --- /dev/null +++ b/configuration/templates/telegraf.service.j2 @@ -0,0 +1,29 @@ +[Unit] +Description=Telegraf - 节点监控服务 +Documentation=https://github.com/influxdata/telegraf +After=network.target + +[Service] +Type=notify +User=telegraf +Group=telegraf +ExecStart=/usr/bin/telegraf --config {{ telegraf_config_url }} +ExecReload=/bin/kill -HUP $MAINPID +KillMode=control-group +Restart=on-failure +RestartSec=5 +TimeoutStopSec=20 +EnvironmentFile=/etc/default/telegraf + +# 安全配置 +NoNewPrivileges=true +PrivateTmp=true +ProtectSystem=strict +ProtectHome=true +ReadWritePaths=/var/lib/telegraf +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectControlGroups=true + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/docs/disk-management.md b/docs/disk-management.md new file mode 100644 index 0000000..54a419b --- /dev/null +++ b/docs/disk-management.md @@ -0,0 +1,169 @@ +# 磁盘管理工具使用指南 + +## 🔧 工具概览 + +我们提供了三个主要的磁盘管理工具来解决磁盘空间不足的问题: + +### 1. 磁盘分析工具 (`disk-analysis-ncdu.yml`) +使用 `ncdu` 工具深度分析磁盘使用情况,生成详细报告。 + +### 2. 磁盘清理工具 (`disk-cleanup.yml`) +自动清理系统垃圾文件、日志、缓存等。 + +### 3. 
磁盘监控脚本 (`disk-monitor.sh`) +一键监控所有节点的磁盘使用情况。 + +## 🚀 快速使用 + +### 监控所有节点磁盘使用情况 +```bash +# 使用默认阈值 85% +./scripts/utilities/disk-monitor.sh + +# 使用自定义阈值 90% +./scripts/utilities/disk-monitor.sh 90 +``` + +### 分析特定节点磁盘使用 +```bash +# 分析所有节点 +ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ + configuration/playbooks/disk-analysis-ncdu.yml + +# 分析特定节点 +ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ + configuration/playbooks/disk-analysis-ncdu.yml --limit semaphore +``` + +### 清理磁盘空间 +```bash +# 清理所有节点 (安全模式) +ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ + configuration/playbooks/disk-cleanup.yml + +# 清理特定节点 +ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ + configuration/playbooks/disk-cleanup.yml --limit ash3c + +# 包含容器清理 (谨慎使用) +ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ + configuration/playbooks/disk-cleanup.yml -e cleanup_containers=true +``` + +## 📊 分析报告说明 + +### ncdu 文件位置 +分析完成后,ncdu 扫描文件保存在各节点的 `/tmp/disk-analysis/` 目录: + +- `ncdu-root-.json` - 根目录扫描结果 +- `ncdu-var-.json` - /var 目录扫描结果 +- `ncdu-opt-.json` - /opt 目录扫描结果 + +### 查看 ncdu 报告 +```bash +# 在目标节点上查看交互式报告 +ncdu -f /tmp/disk-analysis/ncdu-root-semaphore.json + +# 查看文本报告 +cat /tmp/disk-analysis/disk-report-semaphore.txt + +# 查看清理建议 +cat /tmp/disk-analysis/cleanup-suggestions-semaphore.txt +``` + +## 🧹 清理选项说明 + +### 默认清理项目 +- ✅ **系统日志**: 清理7天前的日志文件 +- ✅ **包缓存**: 清理 APT/YUM 缓存 +- ✅ **临时文件**: 清理7天前的临时文件 +- ✅ **核心转储**: 删除 core dump 文件 + +### 可选清理项目 +- ⚠️ **容器清理**: 需要手动启用 (`cleanup_containers=true`) + - 停止所有容器 + - 删除未使用的容器、镜像、卷 + +### 自定义清理参数 +```bash +ansible-playbook configuration/playbooks/disk-cleanup.yml \ + -e cleanup_logs=false \ + -e cleanup_cache=true \ + -e cleanup_temp=true \ + -e cleanup_containers=false +``` + +## 🚨 紧急情况处理 + +### 磁盘使用率 > 95% +```bash +# 1. 立即检查最大文件 +ansible all -i configuration/inventories/production/nomad-cluster.ini \ + -m shell -a "find / -type f -size +1G -exec ls -lh {} \; 2>/dev/null | head -5" + +# 2. 紧急清理 +ansible-playbook configuration/playbooks/disk-cleanup.yml \ + -e cleanup_containers=true + +# 3. 手动清理大文件 +ansible all -m shell -a "truncate -s 0 /var/log/large.log" +``` + +### 常见大文件位置 +- `/var/log/` - 系统日志 +- `/tmp/` - 临时文件 +- `/var/cache/` - 包管理器缓存 +- `/opt/nomad/data/` - Nomad 数据 +- `~/.local/share/containers/` - Podman 数据 + +## 📈 定期维护建议 + +### 每日监控 +```bash +# 添加到 crontab +0 9 * * * /root/mgmt/scripts/utilities/disk-monitor.sh 85 +``` + +### 每周清理 +```bash +# 每周日凌晨2点自动清理 +0 2 * * 0 cd /root/mgmt && ansible-playbook configuration/playbooks/disk-cleanup.yml +``` + +### 每月深度分析 +```bash +# 每月1号生成详细报告 +0 3 1 * * cd /root/mgmt && ansible-playbook configuration/playbooks/disk-analysis-ncdu.yml +``` + +## 🔍 故障排除 + +### ncdu 安装失败 +```bash +# 手动安装 +ansible all -m package -a "name=ncdu state=present" --become +``` + +### 扫描超时 +```bash +# 增加超时时间 +ansible-playbook disk-analysis-ncdu.yml -e ansible_timeout=600 +``` + +### 权限问题 +```bash +# 确保使用 sudo +ansible-playbook disk-analysis-ncdu.yml --become +``` + +## 💡 最佳实践 + +1. **定期监控**: 每天检查磁盘使用情况 +2. **预防性清理**: 使用率超过80%时主动清理 +3. **日志轮转**: 配置合适的日志轮转策略 +4. **容器管理**: 定期清理未使用的容器镜像 +5. **监控告警**: 设置磁盘使用率告警阈值 + +--- + +💡 **提示**: 使用 `./scripts/utilities/disk-monitor.sh` 可以快速检查所有节点状态! 
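+
+If Ansible is temporarily unavailable, the same threshold check can be done locally with plain Bash. This is a minimal sketch assuming a default threshold of 85% and excluding tmpfs/devtmpfs mounts; adjust the threshold and filters to match your environment:
+
+```bash
+#!/usr/bin/env bash
+# Quick local check: warn for any mount point above the usage threshold (default 85%)
+THRESHOLD="${1:-85}"
+df -P -x tmpfs -x devtmpfs | awk -v t="$THRESHOLD" 'NR>1 {
+    use = $5; gsub("%", "", use)
+    if (use + 0 > t) printf "WARNING: %s is at %s%%\n", $6, use
+}'
+```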
\ No newline at end of file diff --git a/mgmt.sh b/mgmt.sh deleted file mode 100755 index a5e3701..0000000 --- a/mgmt.sh +++ /dev/null @@ -1,162 +0,0 @@ -#!/bin/bash - -# 项目管理主脚本 -set -euo pipefail - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -# 项目根目录 -PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 显示项目状态 -show_status() { - log_info "=== 项目状态总览 ===" - echo "" - - # Docker Swarm 状态 - log_info "Docker Swarm 状态:" - if docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null | grep -q "active"; then - log_success "✓ Docker Swarm 已激活" - docker node ls 2>/dev/null | head -n 5 - else - log_warning "✗ Docker Swarm 未激活" - fi - echo "" - - # OpenTofu 状态 - log_info "OpenTofu 状态:" - if command -v tofu &> /dev/null; then - local version=$(tofu version | head -n1) - log_success "✓ OpenTofu 已安装: $version" - else - log_warning "✗ OpenTofu 未安装" - fi - echo "" - - # 部署的堆栈 - log_info "已部署的 Docker Stack:" - if docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null | grep -q "active"; then - docker stack ls 2>/dev/null || log_warning "无堆栈部署" - else - log_warning "Swarm 未激活,无法查看堆栈" - fi - echo "" -} - -# 快速部署 -quick_deploy() { - log_info "=== 快速部署 ===" - - # 检查 Swarm - if ! docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null | grep -q "active"; then - log_info "初始化 Docker Swarm..." - "${PROJECT_ROOT}/swarm/scripts/swarm-manager.sh" init - fi - - # 部署 Traefik - log_info "部署 Traefik 反向代理..." - "${PROJECT_ROOT}/swarm/scripts/swarm-manager.sh" deploy traefik "${PROJECT_ROOT}/swarm/stacks/traefik-swarm-stack.yml" - - # 等待 Traefik 启动 - log_info "等待 Traefik 启动..." - sleep 10 - - # 部署示例服务 - log_info "部署示例服务..." - "${PROJECT_ROOT}/swarm/scripts/swarm-manager.sh" deploy demo "${PROJECT_ROOT}/swarm/stacks/demo-services-stack.yml" - - log_success "快速部署完成!" - echo "" - log_info "访问地址:" - echo " - Traefik Dashboard: http://localhost:8080" - echo " - 示例应用: 请查看 demo 堆栈的服务配置" -} - -# 清理环境 -cleanup() { - log_info "=== 清理环境 ===" - - # 停止所有堆栈 - log_info "停止所有 Docker Stack..." - docker stack ls --format "{{.Name}}" 2>/dev/null | while read -r stack; do - if [[ -n "$stack" ]]; then - log_info "删除堆栈: $stack" - docker stack rm "$stack" - fi - done - - # 等待服务清理 - log_info "等待服务清理..." 
- sleep 5 - - log_success "环境清理完成" -} - -# 显示帮助 -show_help() { - echo "项目管理脚本" - echo "" - echo "用法: $0 [命令]" - echo "" - echo "命令:" - echo " status - 显示项目状态总览" - echo " deploy - 快速部署所有服务" - echo " cleanup - 清理所有部署的服务" - echo " swarm - 打开 Swarm 管理工具" - echo " tofu - 打开 OpenTofu 管理工具" - echo " help - 显示此帮助信息" - echo "" - echo "子工具:" - echo " ./swarm/scripts/swarm-manager.sh - Docker Swarm 管理" - echo " ./scripts/setup/setup-opentofu.sh - OpenTofu 设置" - echo "" -} - -# 主函数 -main() { - cd "$PROJECT_ROOT" - - case "${1:-help}" in - "status") - show_status - ;; - "deploy") - quick_deploy - ;; - "cleanup") - cleanup - ;; - "swarm") - exec "${PROJECT_ROOT}/swarm/scripts/swarm-manager.sh" "${@:2}" - ;; - "tofu") - exec "${PROJECT_ROOT}/scripts/setup/setup-opentofu.sh" "${@:2}" - ;; - "help"|*) - show_help - ;; - esac -} - -main "$@" \ No newline at end of file diff --git a/scripts/deployment/configure-nomad-cluster.sh b/scripts/deployment/configure-nomad-cluster.sh deleted file mode 100755 index 76fbbd1..0000000 --- a/scripts/deployment/configure-nomad-cluster.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash - -set -e - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 检查必要的文件 -check_prerequisites() { - log_info "检查前置条件..." - - if [ ! -f "configuration/inventories/production/nomad-cluster.ini" ]; then - log_error "找不到 Nomad 集群配置文件" - exit 1 - fi - - if [ ! -f "configuration/playbooks/applications/configure-nomad-cluster.yml" ]; then - log_error "找不到 Nomad 配置 playbook" - exit 1 - fi - - log_success "前置条件检查完成" -} - -# 生成加密密钥 -generate_encrypt_key() { - log_info "生成 Nomad 加密密钥..." - - if command -v nomad >/dev/null 2>&1; then - ENCRYPT_KEY=$(nomad operator gossip keyring generate) - log_success "生成加密密钥: $ENCRYPT_KEY" - - # 更新配置文件中的加密密钥 - sed -i "s|YOUR_NOMAD_ENCRYPT_KEY_HERE|$ENCRYPT_KEY|g" configuration/inventories/production/nomad-cluster.ini - log_success "已更新配置文件中的加密密钥" - else - log_warning "本地未安装 Nomad,将在远程节点生成密钥" - fi -} - -# 测试连接 -test_connectivity() { - log_info "测试目标主机连接性..." - - ansible -i configuration/inventories/production/nomad-cluster.ini nomad_cluster -m ping - - if [ $? -eq 0 ]; then - log_success "所有主机连接正常" - else - log_error "部分主机连接失败,请检查网络和SSH配置" - exit 1 - fi -} - -# 配置 Nomad 集群 -configure_cluster() { - log_info "开始配置 Nomad 集群..." - - ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \ - configuration/playbooks/applications/configure-nomad-cluster.yml \ - -v - - if [ $? -eq 0 ]; then - log_success "Nomad 集群配置完成" - else - log_error "Nomad 集群配置失败" - exit 1 - fi -} - -# 验证集群状态 -verify_cluster() { - log_info "验证集群状态..." - - # 等待服务启动 - sleep 10 - - log_info "检查 Nomad 服务状态..." - ansible -i configuration/inventories/production/nomad-cluster.ini nomad_servers \ - -m shell -a "systemctl status nomad --no-pager" - - log_info "检查集群成员..." - ansible -i configuration/inventories/production/nomad-cluster.ini nomad_servers \ - -m shell -a "nomad server members" --limit 1 - - log_info "检查节点状态..." - ansible -i configuration/inventories/production/nomad-cluster.ini nomad_servers \ - -m shell -a "nomad node status" --limit 1 -} - -# 主函数 -main() { - echo "🚀 开始配置 Nomad 集群..." 
- echo "==================================" - - check_prerequisites - generate_encrypt_key - test_connectivity - configure_cluster - verify_cluster - - echo "==================================" - log_success "Nomad 集群配置完成!" - echo "" - echo "访问 Nomad UI:" - echo "- Master: http://100.117.106.136:4646" - echo "- Semaphore: http://100.116.158.95:4646" - echo "" - echo "常用命令:" - echo "- 查看集群状态: nomad server members" - echo "- 查看节点状态: nomad node status" - echo "- 运行作业: nomad job run " -} - -# 运行主函数 -main "$@" \ No newline at end of file diff --git a/scripts/deployment/deploy-consul-cluster.sh b/scripts/deployment/deploy-consul-cluster.sh deleted file mode 100755 index e6c7d25..0000000 --- a/scripts/deployment/deploy-consul-cluster.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/bin/bash - -# Consul 集群部署脚本 -# 使用 Ansible 在物理机上部署 Consul 集群 - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" -INVENTORY_FILE="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini" -PLAYBOOK_FILE="$PROJECT_ROOT/configuration/playbooks/applications/consul-cluster.yml" - -echo "=== Consul 集群部署脚本 ===" -echo "项目根目录: $PROJECT_ROOT" -echo "清单文件: $INVENTORY_FILE" -echo "Playbook: $PLAYBOOK_FILE" -echo - -# 检查必要文件 -if [[ ! -f "$INVENTORY_FILE" ]]; then - echo "错误: 清单文件不存在: $INVENTORY_FILE" - exit 1 -fi - -if [[ ! -f "$PLAYBOOK_FILE" ]]; then - echo "错误: Playbook 文件不存在: $PLAYBOOK_FILE" - exit 1 -fi - -# 生成 Consul 加密密钥(如果需要) -echo "1. 检查 Consul 加密密钥..." -if grep -q "YOUR_BASE64_ENCRYPT_KEY_HERE" "$INVENTORY_FILE"; then - echo "需要生成 Consul 加密密钥..." - - # 尝试使用已安装的 consul 生成密钥 - if command -v consul &> /dev/null; then - ENCRYPT_KEY=$(consul keygen) - echo "生成的加密密钥: $ENCRYPT_KEY" - - # 替换清单文件中的占位符 - sed -i "s/YOUR_BASE64_ENCRYPT_KEY_HERE/$ENCRYPT_KEY/" "$INVENTORY_FILE" - echo "已更新清单文件中的加密密钥" - else - echo "警告: 未找到 consul 命令,请手动生成加密密钥并更新清单文件" - echo "可以使用以下命令生成: consul keygen" - echo "或者使用在线工具生成 32 字节的 base64 编码密钥" - fi -fi - -# 测试连接 -echo -echo "2. 测试目标主机连接..." -ansible -i "$INVENTORY_FILE" consul_cluster -m ping - -if [[ $? -ne 0 ]]; then - echo "错误: 无法连接到目标主机,请检查清单文件中的连接信息" - exit 1 -fi - -# 显示部署信息 -echo -echo "3. 部署信息:" -echo "目标主机:" -ansible -i "$INVENTORY_FILE" consul_cluster --list-hosts - -echo -echo "Consul 版本: $(grep consul_version "$INVENTORY_FILE" | cut -d'=' -f2)" -echo "数据中心: $(grep consul_datacenter "$INVENTORY_FILE" | cut -d'=' -f2)" - -# 确认部署 -echo -read -p "确认部署 Consul 集群到上述主机? (y/N): " confirm -if [[ $confirm != "y" && $confirm != "Y" ]]; then - echo "部署已取消" - exit 0 -fi - -# 执行部署 -echo -echo "4. 开始部署 Consul 集群..." -ansible-playbook -i "$INVENTORY_FILE" "$PLAYBOOK_FILE" -v - -if [[ $? -eq 0 ]]; then - echo - echo "=== 部署完成 ===" - echo - echo "验证集群状态:" - echo "1. 检查服务状态:" - echo " ansible -i $INVENTORY_FILE consul_cluster -m shell -a 'systemctl status consul'" - echo - echo "2. 检查集群成员:" - echo " ansible -i $INVENTORY_FILE consul_cluster -m shell -a 'consul members'" - echo - echo "3. 访问 Web UI:" - echo " - Master: http://master:8500" - echo " - Ash3c: http://ash3c:8500" - echo - echo "4. 
检查集群领导者:" - echo " curl http://master:8500/v1/status/leader" - echo -else - echo "部署失败,请检查错误信息" - exit 1 -fi \ No newline at end of file diff --git a/scripts/deployment/deploy-consul-simple.sh b/scripts/deployment/deploy-consul-simple.sh deleted file mode 100755 index b140cd3..0000000 --- a/scripts/deployment/deploy-consul-simple.sh +++ /dev/null @@ -1,132 +0,0 @@ -#!/bin/bash - -# Consul Cluster Simple Deployment Script -# 简化版 Consul 集群部署脚本 - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 检查依赖 -check_dependencies() { - log_info "检查依赖项..." - - if ! command -v ansible-playbook &> /dev/null; then - log_error "ansible-playbook 未找到,请安装 Ansible" - exit 1 - fi - - if ! command -v python3 &> /dev/null; then - log_error "python3 未找到" - exit 1 - fi - - log_success "依赖检查完成" -} - -# 检查网络连接 -check_connectivity() { - log_info "检查目标主机连接性..." - - local inventory_file="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini" - - if [[ ! -f "$inventory_file" ]]; then - log_error "清单文件不存在: $inventory_file" - exit 1 - fi - - # 测试连接 - if ansible consul_cluster -i "$inventory_file" -m ping --one-line; then - log_success "所有主机连接正常" - else - log_warning "部分主机连接失败,但继续部署..." - fi -} - -# 部署 Consul 集群 -deploy_consul() { - log_info "开始部署 Consul 集群..." - - local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/consul-cluster-simple.yml" - local inventory_file="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini" - - if [[ ! -f "$playbook_file" ]]; then - log_error "Playbook 文件不存在: $playbook_file" - exit 1 - fi - - # 运行 Ansible playbook - if ansible-playbook -i "$inventory_file" "$playbook_file" -v; then - log_success "Consul 集群部署完成" - else - log_error "Consul 集群部署失败" - exit 1 - fi -} - -# 验证集群状态 -verify_cluster() { - log_info "验证 Consul 集群状态..." - - local inventory_file="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini" - - # 检查服务状态 - log_info "检查 Consul 服务状态..." - ansible consul_cluster -i "$inventory_file" -m shell -a "systemctl status consul --no-pager" || true - - # 检查集群成员 - log_info "检查集群成员..." - ansible consul_cluster -i "$inventory_file" -m shell -a "/usr/local/bin/consul members" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true - - # 检查领导者 - log_info "检查集群领导者..." - ansible consul_cluster -i "$inventory_file" -m shell -a "/usr/local/bin/consul operator raft list-peers" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true -} - -# 主函数 -main() { - log_info "开始 Consul 集群简化部署..." - - check_dependencies - check_connectivity - deploy_consul - verify_cluster - - log_success "Consul 集群部署流程完成!" - - echo "" - log_info "后续步骤:" - echo "1. 检查集群状态: consul members" - echo "2. 访问 Web UI: http://:8500" - echo "3. 
检查日志: journalctl -u consul -f" -} - -# 脚本入口 -if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - main "$@" -fi \ No newline at end of file diff --git a/scripts/deployment/deploy-nomad-cluster.sh b/scripts/deployment/deploy-nomad-cluster.sh deleted file mode 100755 index 2d4e852..0000000 --- a/scripts/deployment/deploy-nomad-cluster.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash - -# Nomad Cluster Deployment Script -# Nomad 集群部署脚本 - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 检查依赖 -check_dependencies() { - log_info "检查依赖项..." - - if ! command -v ansible-playbook &> /dev/null; then - log_error "ansible-playbook 未找到,请安装 Ansible" - exit 1 - fi - - log_success "依赖检查完成" -} - -# 检查网络连接 -check_connectivity() { - log_info "检查目标主机连接性..." - - local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" - - if [[ ! -f "$inventory_file" ]]; then - log_error "清单文件不存在: $inventory_file" - exit 1 - fi - - # 测试连接 - if ansible nomad_cluster -i "$inventory_file" -m ping --one-line; then - log_success "所有主机连接正常" - else - log_warning "部分主机连接失败,但继续部署..." - fi -} - -# 部署 Nomad 集群 -deploy_nomad() { - log_info "开始部署 Nomad 集群..." - - local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/nomad-cluster.yml" - local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" - - if [[ ! -f "$playbook_file" ]]; then - log_error "Playbook 文件不存在: $playbook_file" - exit 1 - fi - - # 运行 Ansible playbook - if ansible-playbook -i "$inventory_file" "$playbook_file" -v; then - log_success "Nomad 集群部署完成" - else - log_error "Nomad 集群部署失败" - exit 1 - fi -} - -# 验证集群状态 -verify_cluster() { - log_info "验证 Nomad 集群状态..." - - local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" - - # 检查服务状态 - log_info "检查 Nomad 服务状态..." - ansible nomad_cluster -i "$inventory_file" -m shell -a "systemctl status nomad --no-pager" || true - - # 检查集群成员 - log_info "检查集群服务器..." - ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad server members" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true - - # 检查节点状态 - log_info "检查节点状态..." - ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad node status" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true - - # 显示集群信息 - log_info "集群信息..." - ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad status" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true -} - -# 显示访问信息 -show_access_info() { - log_info "Nomad 集群访问信息:" - echo "" - echo "Web UI 访问地址:" - echo " - http://10.0.0.232:4646" - echo " - http://10.0.0.179:4646" - echo "" - echo "API 访问地址:" - echo " - http://10.0.0.232:4646/v1/" - echo " - http://10.0.0.179:4646/v1/" - echo "" - echo "常用命令:" - echo " - 查看集群状态: nomad status" - echo " - 查看节点: nomad node status" - echo " - 查看服务器: nomad server members" - echo " - 提交作业: nomad job run " - echo "" -} - -# 主函数 -main() { - log_info "开始 Nomad 集群部署..." 
- - check_dependencies - check_connectivity - deploy_nomad - verify_cluster - show_access_info - - log_success "Nomad 集群部署流程完成!" -} - -# 脚本入口 -if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - main "$@" -fi \ No newline at end of file diff --git a/scripts/deployment/deploy-nomad-local.sh b/scripts/deployment/deploy-nomad-local.sh deleted file mode 100755 index fcdbf2c..0000000 --- a/scripts/deployment/deploy-nomad-local.sh +++ /dev/null @@ -1,136 +0,0 @@ -#!/bin/bash - -# Nomad Local Deployment Script -# Nomad 本地部署脚本 - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 检查依赖 -check_dependencies() { - log_info "检查依赖项..." - - if ! command -v ansible-playbook &> /dev/null; then - log_error "ansible-playbook 未找到,请安装 Ansible" - exit 1 - fi - - if ! command -v docker &> /dev/null; then - log_error "docker 未找到,请安装 Docker" - exit 1 - fi - - log_success "依赖检查完成" -} - -# 部署 Nomad -deploy_nomad() { - log_info "开始部署 Nomad (本地单节点)..." - - local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/nomad-local.yml" - - if [[ ! -f "$playbook_file" ]]; then - log_error "Playbook 文件不存在: $playbook_file" - exit 1 - fi - - # 运行 Ansible playbook - if ansible-playbook "$playbook_file" -v; then - log_success "Nomad 本地部署完成" - else - log_error "Nomad 本地部署失败" - exit 1 - fi -} - -# 验证部署 -verify_deployment() { - log_info "验证 Nomad 部署..." - - # 等待服务启动 - sleep 5 - - # 检查服务状态 - log_info "检查 Nomad 服务状态..." - systemctl status nomad --no-pager || true - - # 检查 Nomad 版本 - log_info "检查 Nomad 版本..." - /usr/local/bin/nomad version || true - - # 检查节点状态 - log_info "检查节点状态..." - /usr/local/bin/nomad node status || true - - # 检查服务器状态 - log_info "检查服务器状态..." - /usr/local/bin/nomad server members || true -} - -# 显示访问信息 -show_access_info() { - local current_ip=$(hostname -I | awk '{print $1}') - - log_info "Nomad 访问信息:" - echo "" - echo "Web UI 访问地址:" - echo " - http://localhost:4646" - echo " - http://${current_ip}:4646" - echo "" - echo "API 访问地址:" - echo " - http://localhost:4646/v1/" - echo " - http://${current_ip}:4646/v1/" - echo "" - echo "常用命令:" - echo " - 查看集群状态: nomad status" - echo " - 查看节点: nomad node status" - echo " - 查看服务器: nomad server members" - echo " - 提交作业: nomad job run " - echo "" - echo "示例作业文件位置:" - echo " - $PROJECT_ROOT/examples/nomad-jobs/" - echo "" -} - -# 主函数 -main() { - log_info "开始 Nomad 本地部署..." - - check_dependencies - deploy_nomad - verify_deployment - show_access_info - - log_success "Nomad 本地部署流程完成!" -} - -# 脚本入口 -if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - main "$@" -fi \ No newline at end of file diff --git a/scripts/deployment/install-nomad-cluster.sh b/scripts/deployment/install-nomad-cluster.sh deleted file mode 100755 index aa5aadc..0000000 --- a/scripts/deployment/install-nomad-cluster.sh +++ /dev/null @@ -1,149 +0,0 @@ -#!/bin/bash - -# Install Nomad Cluster via APT -# 通过 APT 安装 Nomad 集群 - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 检查依赖 -check_dependencies() { - log_info "检查依赖项..." - - if ! command -v ansible-playbook &> /dev/null; then - log_error "ansible-playbook 未找到,请安装 Ansible" - exit 1 - fi - - log_success "依赖检查完成" -} - -# 检查网络连接 -check_connectivity() { - log_info "检查目标主机连接性..." - - local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" - - if [[ ! -f "$inventory_file" ]]; then - log_error "清单文件不存在: $inventory_file" - exit 1 - fi - - # 测试连接 - if ansible nomad_servers -i "$inventory_file" -m ping --one-line; then - log_success "所有主机连接正常" - else - log_warning "部分主机连接失败,但继续安装..." - fi -} - -# 安装 Nomad -install_nomad() { - log_info "开始在远程主机安装 Nomad..." - - local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/install-nomad-apt.yml" - local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" - - if [[ ! -f "$playbook_file" ]]; then - log_error "Playbook 文件不存在: $playbook_file" - exit 1 - fi - - # 运行 Ansible playbook - if ansible-playbook -i "$inventory_file" "$playbook_file" -v; then - log_success "Nomad 集群安装完成" - else - log_error "Nomad 集群安装失败" - exit 1 - fi -} - -# 验证安装 -verify_installation() { - log_info "验证 Nomad 安装..." - - local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" - - # 检查服务状态 - log_info "检查 Nomad 服务状态..." - ansible nomad_servers -i "$inventory_file" -m shell -a "systemctl status nomad --no-pager" || true - - # 检查 Nomad 版本 - log_info "检查 Nomad 版本..." - ansible nomad_servers -i "$inventory_file" -m shell -a "nomad version" || true - - # 检查集群成员 - log_info "检查集群服务器..." - ansible nomad_servers -i "$inventory_file" -m shell -a "nomad server members" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true - - # 检查节点状态 - log_info "检查节点状态..." - ansible nomad_servers -i "$inventory_file" -m shell -a "nomad node status" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true -} - -# 显示访问信息 -show_access_info() { - log_info "Nomad 集群访问信息:" - echo "" - echo "Web UI 访问地址:" - echo " - http://100.117.106.136:4646 (master)" - echo " - http://100.116.158.95:4646 (semaphore)" - echo "" - echo "API 访问地址:" - echo " - http://100.117.106.136:4646/v1/ (master)" - echo " - http://100.116.158.95:4646/v1/ (semaphore)" - echo "" - echo "常用命令:" - echo " - 查看集群状态: nomad status" - echo " - 查看节点: nomad node status" - echo " - 查看服务器: nomad server members" - echo " - 提交作业: nomad job run " - echo "" - echo "示例作业文件位置:" - echo " - $PROJECT_ROOT/examples/nomad-jobs/" - echo "" -} - -# 主函数 -main() { - log_info "开始 Nomad 集群安装..." - - check_dependencies - check_connectivity - install_nomad - verify_installation - show_access_info - - log_success "Nomad 集群安装流程完成!" -} - -# 脚本入口 -if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - main "$@" -fi \ No newline at end of file diff --git a/scripts/setup/setup-gitea-integration.sh b/scripts/setup/setup-gitea-integration.sh deleted file mode 100755 index 7526a6e..0000000 --- a/scripts/setup/setup-gitea-integration.sh +++ /dev/null @@ -1,467 +0,0 @@ -#!/bin/bash -# Gitea 集成设置脚本 - -set -e - -echo "🔗 设置 Gitea 集成..." 
- -# 配置变量 -GITEA_HOST="gitea" -GITEA_USER="ben" -GITEA_SSH_URL="git@${GITEA_HOST}" -REPO_NAME="mgmt" -GITEA_HTTP_URL="http://${GITEA_HOST}:3000" - -# 检查 SSH 连接 -echo "🔍 检查 Gitea SSH 连接..." -if ssh -o ConnectTimeout=5 -o BatchMode=yes "${GITEA_SSH_URL}" 2>&1 | grep -q "successfully authenticated"; then - echo "✅ SSH 连接正常" -else - echo "❌ SSH 连接失败,请检查:" - echo " 1. Gitea 服务是否运行" - echo " 2. SSH 密钥是否已添加到 Gitea" - echo " 3. 网络连接是否正常" - exit 1 -fi - -# 检查是否已经是 Git 仓库 -if [ ! -d ".git" ]; then - echo "📦 初始化 Git 仓库..." - git init - git config user.name "${GITEA_USER}" - git config user.email "${GITEA_USER}@example.com" -else - echo "✅ Git 仓库已存在" -fi - -# 检查远程仓库配置 -if git remote get-url origin >/dev/null 2>&1; then - CURRENT_ORIGIN=$(git remote get-url origin) - echo "ℹ️ 当前远程仓库: $CURRENT_ORIGIN" - - if [[ "$CURRENT_ORIGIN" != *"${GITEA_HOST}"* ]]; then - echo "🔄 更新远程仓库地址..." - git remote set-url origin "${GITEA_SSH_URL}:${GITEA_USER}/${REPO_NAME}.git" - fi -else - echo "➕ 添加远程仓库..." - git remote add origin "${GITEA_SSH_URL}:${GITEA_USER}/${REPO_NAME}.git" -fi - -# 创建 .gitignore -echo "📝 创建 .gitignore..." -cat > .gitignore << 'EOF' -# OpenTofu/Terraform -*.tfstate -*.tfstate.* -*.tfvars -!*.tfvars.example -.terraform/ -.terraform.lock.hcl -crash.log -crash.*.log - -# Ansible -*.retry -.vault_pass -host_vars/*/vault.yml -group_vars/*/vault.yml - -# Docker -.env -docker-compose.override.yml - -# IDE -.vscode/ -.idea/ -*.swp -*.swo -*~ - -# OS -.DS_Store -Thumbs.db - -# Logs -*.log -logs/ - -# Temporary files -tmp/ -temp/ -.tmp/ - -# Backup files -backup-*/ -*.bak - -# Secrets -secrets/ -*.pem -*.key -*.crt -!*.example.* - -# Node modules (if any) -node_modules/ - -# Python -__pycache__/ -*.pyc -*.pyo -*.pyd -.Python -env/ -venv/ -.venv/ -pip-log.txt -pip-delete-this-directory.txt -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.log -.git -.mypy_cache -.pytest_cache -.hypothesis - -# Local development -.local/ -local-* -EOF - -# 创建 Gitea Actions 工作流 -echo "🔄 创建 Gitea Actions 工作流..." 
- -# 基础设施 CI/CD -cat > .gitea/workflows/infrastructure.yml << 'EOF' -name: Infrastructure CI/CD - -on: - push: - branches: [ main, develop ] - paths: - - 'infrastructure/**' - - '.gitea/workflows/infrastructure.yml' - pull_request: - branches: [ main ] - paths: - - 'infrastructure/**' - -jobs: - validate: - runs-on: ubuntu-latest - name: Validate Infrastructure - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup OpenTofu - uses: opentofu/setup-opentofu@v1 - with: - tofu_version: 1.10.6 - - - name: Validate OpenTofu configurations - run: | - for dir in infrastructure/providers/*/; do - if [ -d "$dir" ]; then - echo "Validating $dir" - cd "$dir" - tofu init -backend=false - tofu validate - cd - > /dev/null - fi - done - - - name: Check formatting - run: | - tofu fmt -check -recursive infrastructure/ - - - name: Security scan - run: | - # 这里可以添加 tfsec 或 checkov 扫描 - echo "Security scan placeholder" - - plan: - runs-on: ubuntu-latest - name: Plan Infrastructure - needs: validate - if: github.event_name == 'pull_request' - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup OpenTofu - uses: opentofu/setup-opentofu@v1 - with: - tofu_version: 1.10.6 - - - name: Plan infrastructure changes - run: | - cd infrastructure/environments/dev - tofu init - tofu plan -var-file="terraform.tfvars" -out=tfplan - env: - # 这里需要配置云服务商的环境变量 - TF_VAR_environment: dev - - apply: - runs-on: ubuntu-latest - name: Apply Infrastructure - needs: validate - if: github.ref == 'refs/heads/main' && github.event_name == 'push' - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup OpenTofu - uses: opentofu/setup-opentofu@v1 - with: - tofu_version: 1.10.6 - - - name: Apply infrastructure changes - run: | - cd infrastructure/environments/dev - tofu init - tofu apply -var-file="terraform.tfvars" -auto-approve - env: - TF_VAR_environment: dev -EOF - -# 应用部署工作流 -cat > .gitea/workflows/deploy.yml << 'EOF' -name: Application Deployment - -on: - push: - branches: [ main ] - paths: - - 'configuration/**' - - 'containers/**' - - '.gitea/workflows/deploy.yml' - workflow_dispatch: - inputs: - environment: - description: 'Target environment' - required: true - default: 'dev' - type: choice - options: - - dev - - staging - - production - -jobs: - ansible-check: - runs-on: ubuntu-latest - name: Ansible Syntax Check - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Install Ansible - run: | - pip install ansible ansible-core - ansible-galaxy collection install community.general - ansible-galaxy collection install ansible.posix - ansible-galaxy collection install community.docker - - - name: Ansible syntax check - run: | - cd configuration - for playbook in playbooks/*/*.yml; do - if [ -f "$playbook" ]; then - echo "Checking $playbook" - ansible-playbook --syntax-check "$playbook" - fi - done - - deploy: - runs-on: ubuntu-latest - name: Deploy Applications - needs: ansible-check - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Install Ansible - run: | - pip install ansible ansible-core - ansible-galaxy collection install community.general - ansible-galaxy collection install ansible.posix - ansible-galaxy collection install community.docker - - - name: Deploy applications - run: | - cd configuration - ENV="${{ github.event.inputs.environment || 'dev' }}" - 
ansible-playbook -i "inventories/${ENV}/inventory.ini" playbooks/bootstrap/main.yml - env: - ANSIBLE_HOST_KEY_CHECKING: False -EOF - -# Docker 构建工作流 -cat > .gitea/workflows/docker.yml << 'EOF' -name: Docker Build and Deploy - -on: - push: - branches: [ main ] - paths: - - 'containers/**' - - 'Dockerfile*' - - '.gitea/workflows/docker.yml' - -jobs: - build: - runs-on: ubuntu-latest - name: Build Docker Images - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Container Registry - uses: docker/login-action@v3 - with: - registry: ${{ secrets.REGISTRY_URL }} - username: ${{ secrets.REGISTRY_USERNAME }} - password: ${{ secrets.REGISTRY_PASSWORD }} - - - name: Build and push images - run: | - # 构建应用镜像 - for dockerfile in containers/applications/*/Dockerfile; do - if [ -f "$dockerfile" ]; then - app_name=$(basename $(dirname "$dockerfile")) - echo "Building $app_name" - docker build -t "${{ secrets.REGISTRY_URL }}/$app_name:${{ github.sha }}" -f "$dockerfile" . - docker push "${{ secrets.REGISTRY_URL }}/$app_name:${{ github.sha }}" - fi - done - - deploy-swarm: - runs-on: ubuntu-latest - name: Deploy to Docker Swarm - needs: build - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Deploy to Swarm - run: | - # 这里可以通过 SSH 连接到 Swarm 管理节点进行部署 - echo "Deploy to Swarm placeholder" -EOF - -# 创建项目配置文件 -echo "⚙️ 创建项目配置文件..." - -# Gitea 仓库配置 -cat > .gitea/settings.yml << 'EOF' -# Gitea 仓库设置 -repository: - name: mgmt - description: "基础设施管理项目 - OpenTofu + Ansible + Docker Swarm" - website: "" - default_branch: main - - # 功能开关 - has_issues: true - has_wiki: true - has_projects: true - has_actions: true - - # 权限设置 - private: false - allow_merge_commits: true - allow_squash_merge: true - allow_rebase_merge: true - delete_branch_on_merge: true - -# Actions 设置 -actions: - enabled: true - allow_fork_pull_request_run: true - default_actions_url: "https://gitea.com" - -# 分支保护 -branch_protection: - main: - enable_push: false - enable_push_whitelist: true - push_whitelist_usernames: ["ben"] - require_signed_commits: false - enable_merge_whitelist: true - merge_whitelist_usernames: ["ben"] - enable_status_check: true - status_check_contexts: ["validate", "plan"] - enable_approvals_whitelist: false - approvals_whitelist_usernames: [] - block_on_rejected_reviews: true - dismiss_stale_approvals: true - require_signed_commits: false -EOF - -# 添加所有文件到 Git -echo "📦 添加文件到 Git..." -git add . - -# 检查是否有变更需要提交 -if git diff --staged --quiet; then - echo "ℹ️ 没有新的变更需要提交" -else - echo "💾 提交变更..." - git commit -m "feat: 集成 OpenTofu + Ansible + Gitea CI/CD - -- 重构项目目录结构 -- 添加 OpenTofu 多云支持 -- 配置 Ansible 自动化部署 -- 集成 Gitea Actions CI/CD 流水线 -- 添加 Docker Swarm 管理 -- 完善监控和安全配置" -fi - -# 推送到远程仓库 -echo "🚀 推送到 Gitea..." -if git push -u origin main; then - echo "✅ 成功推送到 Gitea" -else - echo "⚠️ 推送失败,可能需要先在 Gitea 创建仓库" - echo " 请访问: ${GITEA_HTTP_URL}/repo/create" - echo " 创建名为 '${REPO_NAME}' 的仓库" -fi - -echo "" -echo "🎉 Gitea 集成设置完成!" -echo "" -echo "📋 下一步操作:" -echo "1. 访问 Gitea: ${GITEA_HTTP_URL}/${GITEA_USER}/${REPO_NAME}" -echo "2. 配置 Actions Secrets (如果需要):" -echo " - REGISTRY_URL: 容器镜像仓库地址" -echo " - REGISTRY_USERNAME: 仓库用户名" -echo " - REGISTRY_PASSWORD: 仓库密码" -echo "3. 配置云服务商凭据 (通过 Secrets 或环境变量)" -echo "4. 
测试 CI/CD 流水线" -echo "" -echo "🔗 有用的命令:" -echo " git status - 查看仓库状态" -echo " git log --oneline - 查看提交历史" -echo " git push - 推送变更" -echo " make help - 查看项目命令" \ No newline at end of file diff --git a/scripts/setup/setup-nomad-laptop.sh b/scripts/setup/setup-nomad-laptop.sh new file mode 100755 index 0000000..7337e51 --- /dev/null +++ b/scripts/setup/setup-nomad-laptop.sh @@ -0,0 +1,230 @@ +#!/bin/bash + +# Nomad 笔记本设置脚本 - Mac/Linux 版本 +# 用于将 Mac 或 Linux 笔记本加入 Nomad 集群作为 server + +set -e + +# 配置变量 +NOMAD_VERSION="1.10.5" +NOMAD_DATACENTER="dc1" +NOMAD_ENCRYPT_KEY="NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + +# 检测操作系统 +OS=$(uname -s | tr '[:upper:]' '[:lower:]') +ARCH=$(uname -m) + +case $ARCH in + x86_64) ARCH="amd64" ;; + arm64|aarch64) ARCH="arm64" ;; + *) echo "不支持的架构: $ARCH"; exit 1 ;; +esac + +echo "🚀 开始设置 Nomad ($OS-$ARCH)..." + +# 1. 检查 Tailscale +echo "📡 检查 Tailscale 连接..." +if ! command -v tailscale &> /dev/null; then + echo "❌ 请先安装 Tailscale" + exit 1 +fi + +TAILSCALE_IP=$(tailscale ip | head -1) +if [ -z "$TAILSCALE_IP" ]; then + echo "❌ Tailscale 未连接,请先运行: tailscale up" + exit 1 +fi + +echo "✅ Tailscale IP: $TAILSCALE_IP" + +# 2. 安装 Nomad(如果需要) +if ! command -v nomad &> /dev/null; then + echo "📦 安装 Nomad $NOMAD_VERSION..." + + if [[ "$OS" == "darwin" ]]; then + # macOS + if command -v brew &> /dev/null; then + brew install nomad + else + echo "❌ 请先安装 Homebrew 或手动安装 Nomad" + exit 1 + fi + else + # Linux + NOMAD_URL="https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_${OS}_${ARCH}.zip" + curl -L "$NOMAD_URL" -o nomad.zip + unzip nomad.zip + sudo mv nomad /usr/local/bin/ + rm nomad.zip + fi +fi + +echo "✅ Nomad 版本: $(nomad version)" + +# 3. 创建配置目录 +echo "📁 创建配置目录..." +sudo mkdir -p /etc/nomad.d /opt/nomad/data +sudo chown -R $(whoami):$(id -gn) /etc/nomad.d /opt/nomad/data + +# 4. 生成 Nomad 配置 +echo "⚙️ 生成 Nomad 配置..." +cat > /etc/nomad.d/nomad.hcl << EOF +datacenter = "$NOMAD_DATACENTER" +data_dir = "/opt/nomad/data" +log_level = "INFO" + +bind_addr = "$TAILSCALE_IP" + +addresses { + http = "0.0.0.0" + rpc = "$TAILSCALE_IP" + serf = "$TAILSCALE_IP" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true + bootstrap_expect = 6 + + retry_join = [ + "100.116.158.95", # semaphore + "100.117.106.136", # master (现在是 client) + "100.116.80.94" # ash3c (现在是 client) + ] + + encrypt = "$NOMAD_ENCRYPT_KEY" +} + +client { + enabled = false +} + +# 如果是 macOS,可能需要 Docker 插件 +plugin "podman" { + config { + volumes { + enabled = true + } + } +} + +consul { + address = "$TAILSCALE_IP:8500" +} +EOF + +echo "✅ 配置文件已生成: /etc/nomad.d/nomad.hcl" + +# 5. 创建启动脚本(macOS 不使用 systemd) +if [[ "$OS" == "darwin" ]]; then + # macOS - 创建 LaunchDaemon + echo "🍎 创建 macOS LaunchDaemon..." + sudo tee /Library/LaunchDaemons/io.nomadproject.nomad.plist > /dev/null << EOF + + + + + Label + io.nomadproject.nomad + ProgramArguments + + /usr/local/bin/nomad + agent + -config=/etc/nomad.d/nomad.hcl + + RunAtLoad + + KeepAlive + + StandardOutPath + /var/log/nomad.log + StandardErrorPath + /var/log/nomad.log + + +EOF + + # 加载并启动服务 + sudo launchctl load /Library/LaunchDaemons/io.nomadproject.nomad.plist + sudo launchctl start io.nomadproject.nomad + +else + # Linux - 创建 systemd 服务 + echo "🐧 创建 systemd 服务..." 
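+    # Note: the unit below runs Nomad as the current login user (User=$(whoami)),
+    # and Type=notify means systemd only reports the service as started once the agent signals readiness.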
+ sudo tee /etc/systemd/system/nomad.service > /dev/null << EOF +[Unit] +Description=Nomad +Documentation=https://www.nomadproject.io/ +Requires=network-online.target +After=network-online.target + +[Service] +Type=notify +User=$(whoami) +Group=$(id -gn) +ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d/nomad.hcl +ExecReload=/bin/kill -HUP \$MAINPID +KillMode=process +Restart=on-failure +LimitNOFILE=65536 + +[Install] +WantedBy=multi-user.target +EOF + + # 启动服务 + sudo systemctl daemon-reload + sudo systemctl enable nomad + sudo systemctl start nomad +fi + +# 6. 验证安装 +echo "🔍 验证 Nomad 服务..." +sleep 5 + +if [[ "$OS" == "darwin" ]]; then + if sudo launchctl list | grep -q nomad; then + echo "✅ Nomad 服务已启动" + else + echo "❌ Nomad 服务启动失败" + exit 1 + fi +else + if systemctl is-active --quiet nomad; then + echo "✅ Nomad 服务已启动" + else + echo "❌ Nomad 服务启动失败" + sudo systemctl status nomad + exit 1 + fi +fi + +# 7. 检查集群状态 +echo "🌐 检查集群连接..." +sleep 10 + +if nomad server members 2>/dev/null | grep -q alive; then + echo "✅ 成功加入 Nomad 集群!" + nomad server members +else + echo "⚠️ 正在连接集群,请稍等..." + echo "可以运行以下命令检查状态:" + echo " nomad server members" + echo " nomad node status" +fi + +echo "" +echo "🎉 设置完成!" +echo "📊 Web UI: http://$TAILSCALE_IP:4646" +echo "🔧 配置文件: /etc/nomad.d/nomad.hcl" +echo "📝 日志查看:" +if [[ "$OS" == "darwin" ]]; then + echo " tail -f /var/log/nomad.log" +else + echo " sudo journalctl -u nomad -f" +fi \ No newline at end of file diff --git a/scripts/setup/setup-nomad-windows.ps1 b/scripts/setup/setup-nomad-windows.ps1 new file mode 100644 index 0000000..241e9cd --- /dev/null +++ b/scripts/setup/setup-nomad-windows.ps1 @@ -0,0 +1,212 @@ +# Nomad Windows 设置脚本 +# 用于将 Windows 笔记本加入 Nomad 集群作为 server + +param( + [string]$NomadVersion = "1.10.5", + [string]$DataCenter = "dc1", + [string]$EncryptKey = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" +) + +# 需要管理员权限 +if (-NOT ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator")) { + Write-Host "❌ 此脚本需要管理员权限运行" -ForegroundColor Red + Write-Host "请以管理员身份运行 PowerShell" -ForegroundColor Yellow + exit 1 +} + +Write-Host "🚀 开始设置 Windows Nomad Server..." -ForegroundColor Green + +# 1. 检查 Tailscale +Write-Host "📡 检查 Tailscale 连接..." -ForegroundColor Cyan +try { + $tailscaleIP = (tailscale ip) | Select-Object -First 1 + if ([string]::IsNullOrEmpty($tailscaleIP)) { + throw "Tailscale IP 为空" + } + Write-Host "✅ Tailscale IP: $tailscaleIP" -ForegroundColor Green +} catch { + Write-Host "❌ Tailscale 未安装或未连接" -ForegroundColor Red + Write-Host "请先安装 Tailscale 并运行: tailscale up" -ForegroundColor Yellow + exit 1 +} + +# 2. 创建目录 +Write-Host "📁 创建 Nomad 目录..." -ForegroundColor Cyan +$nomadDir = "C:\nomad" +$configDir = "$nomadDir\config" +$dataDir = "$nomadDir\data" +$binDir = "$nomadDir\bin" + +New-Item -ItemType Directory -Force -Path $configDir | Out-Null +New-Item -ItemType Directory -Force -Path $dataDir | Out-Null +New-Item -ItemType Directory -Force -Path $binDir | Out-Null + +# 3. 下载 Nomad(如果需要) +$nomadExe = "$binDir\nomad.exe" +if (-not (Test-Path $nomadExe)) { + Write-Host "📦 下载 Nomad $NomadVersion..." 
-ForegroundColor Cyan + $nomadUrl = "https://releases.hashicorp.com/nomad/$NomadVersion/nomad_${NomadVersion}_windows_amd64.zip" + $zipPath = "$env:TEMP\nomad.zip" + + try { + Invoke-WebRequest -Uri $nomadUrl -OutFile $zipPath + Expand-Archive -Path $zipPath -DestinationPath $binDir -Force + Remove-Item $zipPath + Write-Host "✅ Nomad 下载完成" -ForegroundColor Green + } catch { + Write-Host "❌ 下载 Nomad 失败: $_" -ForegroundColor Red + exit 1 + } +} + +# 4. 添加到 PATH(如果需要) +$currentPath = [Environment]::GetEnvironmentVariable("PATH", "Machine") +if ($currentPath -notlike "*$binDir*") { + Write-Host "🔧 添加 Nomad 到系统 PATH..." -ForegroundColor Cyan + [Environment]::SetEnvironmentVariable("PATH", "$currentPath;$binDir", "Machine") + $env:PATH += ";$binDir" +} + +# 5. 生成配置文件 +Write-Host "⚙️ 生成 Nomad 配置..." -ForegroundColor Cyan +$configContent = @" +datacenter = "$DataCenter" +data_dir = "$($dataDir -replace '\\', '/')" +log_level = "INFO" + +bind_addr = "$tailscaleIP" + +addresses { + http = "0.0.0.0" + rpc = "$tailscaleIP" + serf = "$tailscaleIP" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true + bootstrap_expect = 6 + + retry_join = [ + "100.116.158.95", # semaphore + "100.117.106.136", # master + "100.116.80.94" # ash3c + ] + + encrypt = "$EncryptKey" +} + +client { + enabled = false +} + +plugin "podman" { + config { + volumes { + enabled = true + } + } +} + +consul { + address = "$tailscaleIP:8500" +} +"@ + +$configFile = "$configDir\nomad.hcl" +$configContent | Out-File -FilePath $configFile -Encoding UTF8 +Write-Host "✅ 配置文件已生成: $configFile" -ForegroundColor Green + +# 6. 创建 Windows 服务 +Write-Host "🔧 创建 Windows 服务..." -ForegroundColor Cyan + +# 先停止并删除现有服务(如果存在) +try { + Stop-Service -Name "Nomad" -ErrorAction SilentlyContinue + & sc.exe delete "Nomad" 2>$null +} catch {} + +# 创建新服务 +$serviceName = "Nomad" +$serviceDisplayName = "HashiCorp Nomad" +$serviceDescription = "HashiCorp Nomad Agent" +$serviceCommand = "`"$nomadExe`" agent -config=`"$configFile`"" + +try { + & sc.exe create $serviceName binPath= $serviceCommand DisplayName= $serviceDisplayName start= auto + & sc.exe description $serviceName $serviceDescription + + # 配置服务恢复选项 + & sc.exe failure $serviceName reset= 30 actions= restart/5000/restart/5000/restart/5000 + + Write-Host "✅ Windows 服务已创建" -ForegroundColor Green +} catch { + Write-Host "❌ 创建服务失败: $_" -ForegroundColor Red + exit 1 +} + +# 7. 启动服务 +Write-Host "🚀 启动 Nomad 服务..." -ForegroundColor Cyan +try { + Start-Service -Name $serviceName + Write-Host "✅ Nomad 服务已启动" -ForegroundColor Green +} catch { + Write-Host "❌ 启动服务失败: $_" -ForegroundColor Red + Write-Host "检查服务状态: Get-Service Nomad" -ForegroundColor Yellow + exit 1 +} + +# 8. 验证安装 +Write-Host "🔍 验证 Nomad 服务..." -ForegroundColor Cyan +Start-Sleep -Seconds 10 + +try { + $serviceStatus = Get-Service -Name $serviceName + if ($serviceStatus.Status -eq "Running") { + Write-Host "✅ Nomad 服务运行正常" -ForegroundColor Green + } else { + Write-Host "❌ Nomad 服务状态异常: $($serviceStatus.Status)" -ForegroundColor Red + } +} catch { + Write-Host "❌ 检查服务状态失败: $_" -ForegroundColor Red +} + +# 9. 检查集群连接 +Write-Host "🌐 检查集群连接..." -ForegroundColor Cyan +Start-Sleep -Seconds 15 + +try { + & $nomadExe server members + Write-Host "✅ 成功加入 Nomad 集群!" -ForegroundColor Green +} catch { + Write-Host "⚠️ 正在连接集群,请稍等..." -ForegroundColor Yellow + Write-Host "可以运行以下命令检查状态:" -ForegroundColor Cyan + Write-Host " nomad server members" -ForegroundColor White + Write-Host " nomad node status" -ForegroundColor White +} + +# 10. 
防火墙规则 +Write-Host "🔥 配置防火墙规则..." -ForegroundColor Cyan +try { + New-NetFirewallRule -DisplayName "Nomad HTTP" -Direction Inbound -Protocol TCP -LocalPort 4646 -Action Allow -ErrorAction SilentlyContinue + New-NetFirewallRule -DisplayName "Nomad RPC" -Direction Inbound -Protocol TCP -LocalPort 4647 -Action Allow -ErrorAction SilentlyContinue + New-NetFirewallRule -DisplayName "Nomad Serf" -Direction Inbound -Protocol TCP -LocalPort 4648 -Action Allow -ErrorAction SilentlyContinue + Write-Host "✅ 防火墙规则已配置" -ForegroundColor Green +} catch { + Write-Host "⚠️ 防火墙规则配置可能失败,请手动检查" -ForegroundColor Yellow +} + +Write-Host "" +Write-Host "🎉 Windows Nomad Server 设置完成!" -ForegroundColor Green +Write-Host "📊 Web UI: http://$tailscaleIP:4646" -ForegroundColor Cyan +Write-Host "🔧 配置文件: $configFile" -ForegroundColor Cyan +Write-Host "📝 服务管理:" -ForegroundColor Cyan +Write-Host " 启动: Start-Service Nomad" -ForegroundColor White +Write-Host " 停止: Stop-Service Nomad" -ForegroundColor White +Write-Host " 状态: Get-Service Nomad" -ForegroundColor White +Write-Host " 日志: Get-EventLog -LogName Application -Source Nomad" -ForegroundColor White \ No newline at end of file diff --git a/scripts/setup/setup-opentofu.sh b/scripts/setup/setup-opentofu.sh deleted file mode 100755 index b115ad3..0000000 --- a/scripts/setup/setup-opentofu.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/bin/bash - -# OpenTofu 设置脚本 -set -euo pipefail - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 检查 OpenTofu 是否已安装 -check_opentofu() { - log_info "检查 OpenTofu 安装状态..." - - if command -v tofu &> /dev/null; then - local version=$(tofu version | head -n1) - log_success "OpenTofu 已安装: $version" - return 0 - else - log_error "OpenTofu 未安装" - return 1 - fi -} - -# 检查配置文件 -check_config() { - log_info "检查配置文件..." - - local config_file="tofu/environments/dev/terraform.tfvars" - - if [[ ! -f "$config_file" ]]; then - log_error "配置文件不存在: $config_file" - log_info "请复制 terraform.tfvars.example 并填入实际配置" - return 1 - fi - - # 检查是否包含示例值 - if grep -q "your_tenancy_id_here\|your_user_id_here\|your:key:fingerprint:here" "$config_file"; then - log_warning "配置文件包含示例值,请填入实际的 Oracle Cloud 配置" - log_info "需要配置以下项目:" - echo " - tenancy_ocid: Oracle Cloud 租户 OCID" - echo " - user_ocid: 用户 OCID" - echo " - fingerprint: API 密钥指纹" - echo " - private_key_path: 私钥文件路径" - echo " - compartment_ocid: 区间 OCID" - return 1 - fi - - log_success "配置文件检查通过" - return 0 -} - -# 初始化 OpenTofu -init_opentofu() { - log_info "初始化 OpenTofu..." - - cd tofu/environments/dev - - # 清理旧的状态文件 - if [[ -d ".terraform" ]]; then - log_info "清理旧的 .terraform 目录..." - rm -rf .terraform - fi - - # 初始化 - if tofu init; then - log_success "OpenTofu 初始化成功" - else - log_error "OpenTofu 初始化失败" - return 1 - fi - - cd - > /dev/null -} - -# 验证配置 -validate_config() { - log_info "验证 OpenTofu 配置..." - - cd tofu/environments/dev - - if tofu validate; then - log_success "配置验证通过" - else - log_error "配置验证失败" - return 1 - fi - - cd - > /dev/null -} - -# 生成计划 -plan_infrastructure() { - log_info "生成基础设施计划..." 
- - cd tofu/environments/dev - - if tofu plan -var-file="terraform.tfvars" -out=tfplan; then - log_success "计划生成成功" - log_info "计划文件已保存为 tfplan" - else - log_error "计划生成失败" - return 1 - fi - - cd - > /dev/null -} - -# 显示帮助信息 -show_help() { - echo "OpenTofu 设置脚本" - echo "" - echo "用法: $0 [选项]" - echo "" - echo "选项:" - echo " init - 初始化 OpenTofu" - echo " validate - 验证配置" - echo " plan - 生成执行计划" - echo " check - 检查环境和配置" - echo " help - 显示此帮助信息" - echo "" - echo "示例:" - echo " $0 check # 检查环境" - echo " $0 init # 初始化项目" - echo " $0 plan # 生成计划" -} - -# 主函数 -main() { - case "${1:-help}" in - "check") - check_opentofu - check_config - ;; - "init") - check_opentofu || exit 1 - check_config || exit 1 - init_opentofu - ;; - "validate") - validate_config - ;; - "plan") - check_opentofu || exit 1 - check_config || exit 1 - plan_infrastructure - ;; - "help"|*) - show_help - ;; - esac -} - -# 运行主函数 -main "$@" \ No newline at end of file diff --git a/scripts/utilities/NUCLEAR-NOMAD-RESET.yml b/scripts/utilities/NUCLEAR-NOMAD-RESET.yml deleted file mode 100644 index f080662..0000000 --- a/scripts/utilities/NUCLEAR-NOMAD-RESET.yml +++ /dev/null @@ -1,375 +0,0 @@ ---- -# ☢️ NUCLEAR NOMAD RESET ☢️ -# 这是比终极还要强的修复脚本 -# 警告:这将完全摧毁并重建 Nomad 集群 -- name: "☢️ NUCLEAR NOMAD RESET - 核弹级集群重置 ☢️" - hosts: nomad_cluster - become: yes - gather_facts: yes - serial: 1 # 一次处理一个节点,避免同时炸掉所有节点 - vars: - nomad_version: "1.10.5" - nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" - tailscale_ips: - semaphore: "100.116.158.95" - master: "100.117.106.136" - ash3c: "100.116.80.94" - - tasks: - - name: "🚨 警告:即将进行核弹级重置" - debug: - msg: | - ☢️☢️☢️ 警告:即将对 {{ inventory_hostname }} 进行核弹级重置 ☢️☢️☢️ - 这将完全摧毁所有 Nomad 相关的数据、配置和进程! - 如果你不确定,请立即按 Ctrl+C 取消! - - - name: "⏰ 等待 10 秒,给你最后的机会取消..." - pause: - seconds: 10 - - # ========== 第一阶段:核弹级清理 ========== - - name: "💀 第一阶段:核弹级进程清理" - debug: - msg: "开始核弹级进程清理..." - - - name: "🔥 停止 Nomad 服务(如果存在)" - systemd: - name: nomad - state: stopped - enabled: no - daemon_reload: yes - ignore_errors: yes - - - name: "💣 强制杀死所有 Nomad 相关进程" - shell: | - # 杀死所有 nomad 进程 - pkill -9 -f nomad || true - # 杀死所有可能的子进程 - pkill -9 -f "nomad agent" || true - pkill -9 -f "nomad server" || true - pkill -9 -f "nomad client" || true - # 等待进程完全死亡 - sleep 5 - # 再次确认杀死 - ps aux | grep nomad | grep -v grep | awk '{print $2}' | xargs -r kill -9 || true - ignore_errors: yes - - - name: "🧹 清理所有 Nomad 相关文件和目录" - file: - path: "{{ item }}" - state: absent - loop: - - /opt/nomad - - /etc/nomad.d - - /var/log/nomad - - /etc/systemd/system/nomad.service - - /usr/local/bin/nomad - - /usr/bin/nomad - - /tmp/nomad* - - /var/lib/nomad - - /run/nomad - - /var/run/nomad.pid - ignore_errors: yes - - - name: "🔧 清理 systemd 缓存" - systemd: - daemon_reload: yes - - # ========== 第二阶段:重新安装 Nomad ========== - - name: "🚀 第二阶段:重新安装 Nomad" - debug: - msg: "开始重新安装 Nomad..." 
- - - name: "🔑 添加 HashiCorp GPG 密钥" - apt_key: - url: https://apt.releases.hashicorp.com/gpg - state: present - - - name: "📦 添加 HashiCorp APT 仓库" - apt_repository: - repo: "deb [arch={{ ansible_architecture }}] https://apt.releases.hashicorp.com {{ ansible_distribution_release }} main" - state: present - update_cache: yes - - - name: "🔧 安装 Nomad(自动检测架构)" - apt: - name: "nomad={{ nomad_version }}-1" - state: present - update_cache: yes - - - name: "👤 创建 nomad 用户和组" - group: - name: nomad - state: present - - - name: "👤 创建 nomad 用户" - user: - name: nomad - group: nomad - system: yes - shell: /bin/false - home: /opt/nomad - create_home: no - - - name: "📁 创建全新的目录结构" - file: - path: "{{ item.path }}" - state: directory - owner: "{{ item.owner | default('nomad') }}" - group: "{{ item.group | default('nomad') }}" - mode: "{{ item.mode | default('0755') }}" - loop: - - { path: "/etc/nomad.d", mode: "0755" } - - { path: "/opt/nomad", mode: "0755" } - - { path: "/opt/nomad/data", mode: "0755" } - - { path: "/opt/nomad/alloc_mounts", mode: "0755" } - - { path: "/var/log/nomad", mode: "0755" } - - # ========== 第三阶段:网络和防火墙检查 ========== - - name: "🌐 第三阶段:网络配置验证" - debug: - msg: "验证网络配置..." - - - name: "🔍 检查 Tailscale IP 是否正确绑定" - shell: | - ip addr show | grep "{{ tailscale_ips[inventory_hostname] }}" || echo "IP_NOT_FOUND" - register: ip_check - - - name: "⚠️ IP 地址检查结果" - debug: - msg: | - 节点: {{ inventory_hostname }} - 期望 IP: {{ tailscale_ips[inventory_hostname] }} - 检查结果: {{ ip_check.stdout }} - {% if 'IP_NOT_FOUND' in ip_check.stdout %} - ❌ 警告:IP 地址未正确绑定! - {% else %} - ✅ IP 地址检查通过 - {% endif %} - - - name: "🔥 确保防火墙端口开放" - shell: | - # 检查并开放 Nomad 端口 - if command -v ufw >/dev/null 2>&1; then - ufw allow 4646/tcp # HTTP API - ufw allow 4647/tcp # RPC - ufw allow 4648/tcp # Serf - elif command -v firewall-cmd >/dev/null 2>&1; then - firewall-cmd --permanent --add-port=4646/tcp - firewall-cmd --permanent --add-port=4647/tcp - firewall-cmd --permanent --add-port=4648/tcp - firewall-cmd --reload - fi - ignore_errors: yes - - # ========== 第四阶段:创建超强配置 ========== - - name: "⚙️ 第四阶段:创建超强配置文件" - debug: - msg: "创建超强配置文件..." 
- - - name: "📝 创建核弹级 Nomad 配置" - copy: - content: | - # ☢️ 核弹级 Nomad 配置 - {{ inventory_hostname }} - datacenter = "dc1" - region = "global" - data_dir = "/opt/nomad/data" - - # 使用正确的 Tailscale IP - bind_addr = "{{ tailscale_ips[inventory_hostname] }}" - - # 日志配置 - log_level = "INFO" - log_file = "/var/log/nomad/nomad.log" - log_rotate_duration = "24h" - log_rotate_max_files = 5 - - server { - enabled = true - bootstrap_expect = 3 - encrypt = "{{ nomad_encrypt_key }}" - - # 更激进的重试配置 - server_join { - retry_join = [ - "{{ tailscale_ips.semaphore }}:4647", - "{{ tailscale_ips.master }}:4647", - "{{ tailscale_ips.ash3c }}:4647" - ] - retry_max = 10 - retry_interval = "15s" - } - - # 更宽松的心跳配置 - heartbeat_grace = "30s" - min_heartbeat_ttl = "10s" - max_heartbeats_per_second = 50.0 - - # Raft 配置优化 - raft_protocol = 3 - raft_multiplier = 1 - } - - client { - enabled = true - - # 网络接口配置 - network_interface = "tailscale0" - - # 更宽松的心跳配置 - max_kill_timeout = "30s" - - # 主机卷配置 - host_volume "docker-sock" { - path = "/var/run/docker.sock" - read_only = false - } - } - - - - # 地址和端口配置 - addresses { - http = "0.0.0.0" - rpc = "{{ tailscale_ips[inventory_hostname] }}" - serf = "{{ tailscale_ips[inventory_hostname] }}" - } - - ports { - http = 4646 - rpc = 4647 - serf = 4648 - } - - # Docker 插件配置 - plugin "docker" { - config { - allow_privileged = true - volumes { - enabled = true - } - - # 更宽松的资源限制 - gc { - image = true - image_delay = "10m" - container = true - dangling_containers { - enabled = true - dry_run = false - period = "5m" - creation_grace = "5m" - } - } - } - } - - # 遥测配置 - telemetry { - collection_interval = "10s" - disable_hostname = false - prometheus_metrics = true - publish_allocation_metrics = true - publish_node_metrics = true - } - dest: "/etc/nomad.d/nomad.hcl" - owner: nomad - group: nomad - mode: '0640' - - # ========== 第五阶段:创建超强 systemd 服务 ========== - - name: "🔧 创建超强 systemd 服务文件" - copy: - content: | - [Unit] - Description=Nomad - Nuclear Edition - Documentation=https://www.nomadproject.io/ - Wants=network-online.target - After=network-online.target - ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl - - [Service] - Type=notify - User=nomad - Group=nomad - ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl - ExecReload=/bin/kill -HUP $MAINPID - KillMode=process - Restart=always - RestartSec=10 - LimitNOFILE=65536 - - # 更强的重启策略 - StartLimitInterval=0 - StartLimitBurst=10 - - # 环境变量 - Environment=NOMAD_DISABLE_UPDATE_CHECK=1 - - [Install] - WantedBy=multi-user.target - dest: "/etc/systemd/system/nomad.service" - owner: root - group: root - mode: '0644' - - - name: "🔄 重新加载 systemd" - systemd: - daemon_reload: yes - - # ========== 第六阶段:启动和验证 ========== - - name: "🚀 第六阶段:启动服务" - debug: - msg: "启动 Nomad 服务..." - - - name: "🔥 启用并启动 Nomad 服务" - systemd: - name: nomad - enabled: yes - state: started - daemon_reload: yes - - - name: "⏰ 等待服务启动" - pause: - seconds: 15 - - - name: "🔍 验证服务状态" - systemd: - name: nomad - register: nomad_service_status - - - name: "📊 显示服务状态" - debug: - msg: | - ☢️ 核弹级重置完成! - 节点: {{ inventory_hostname }} - 服务状态: {{ nomad_service_status.status.ActiveState }} - IP 地址: {{ tailscale_ips[inventory_hostname] }} - - {% if nomad_service_status.status.ActiveState == 'active' %} - ✅ 服务启动成功! - {% else %} - ❌ 服务启动失败,请检查日志! 
- {% endif %} - - - name: "🧹 清理临时文件" - file: - path: "{{ item }}" - state: absent - loop: - - "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip" - - "/tmp/nomad" - ignore_errors: yes - - - name: "🎉 核弹级重置完成通知" - debug: - msg: | - ☢️☢️☢️ 核弹级重置完成!☢️☢️☢️ - - 节点 {{ inventory_hostname }} 已经被完全摧毁并重建! - - 下一步: - 1. 等待所有节点完成重置 - 2. 检查集群状态:nomad server members - 3. 检查节点状态:nomad node status - 4. 如果还有问题,那就真的没救了... 😅 \ No newline at end of file diff --git a/scripts/utilities/complete-nomad-cluster-fix.yml b/scripts/utilities/complete-nomad-cluster-fix.yml deleted file mode 100644 index 08274ab..0000000 --- a/scripts/utilities/complete-nomad-cluster-fix.yml +++ /dev/null @@ -1,189 +0,0 @@ ---- -- name: Complete Nomad Cluster Fix with Ansible - hosts: nomad_cluster - become: yes - gather_facts: yes - vars: - nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" - tailscale_ips: - semaphore: "100.116.158.95" - master: "100.117.106.136" - ash3c: "100.116.80.94" - - tasks: - - name: Stop nomad service completely - systemd: - name: nomad - state: stopped - enabled: yes - ignore_errors: yes - - - name: Kill any remaining nomad processes - shell: pkill -f nomad || true - ignore_errors: yes - - - name: Reset systemd failure state - shell: systemctl reset-failed nomad - ignore_errors: yes - - - name: Create nomad user if not exists - user: - name: nomad - system: yes - shell: /bin/false - home: /opt/nomad - create_home: no - - - name: Create all required directories with correct permissions - file: - path: "{{ item }}" - state: directory - owner: nomad - group: nomad - mode: '0755' - loop: - - /opt/nomad - - /opt/nomad/data - - /opt/nomad/alloc_mounts - - /var/log/nomad - - /etc/nomad.d - - - name: Completely clean nomad data directory - shell: rm -rf /opt/nomad/data/* /opt/nomad/data/.* - ignore_errors: yes - - - name: Create correct nomad configuration - copy: - content: | - datacenter = "dc1" - region = "global" - data_dir = "/opt/nomad/data" - - bind_addr = "{{ tailscale_ips[inventory_hostname] }}" - - server { - enabled = true - bootstrap_expect = 3 - encrypt = "{{ nomad_encrypt_key }}" - - server_join { - retry_join = [ - "{{ tailscale_ips.semaphore }}:4647", - "{{ tailscale_ips.master }}:4647", - "{{ tailscale_ips.ash3c }}:4647" - ] - retry_interval = "15s" - retry_max = 3 - } - } - - client { - enabled = true - alloc_dir = "/opt/nomad/alloc_mounts" - } - - ui { - enabled = true - } - - addresses { - http = "0.0.0.0" - rpc = "{{ tailscale_ips[inventory_hostname] }}" - serf = "{{ tailscale_ips[inventory_hostname] }}" - } - - ports { - http = 4646 - rpc = 4647 - serf = 4648 - } - - plugin "docker" { - config { - allow_privileged = true - volumes { - enabled = true - } - } - } - - log_level = "INFO" - log_file = "/var/log/nomad/nomad.log" - log_rotate_duration = "24h" - log_rotate_max_files = 5 - dest: /etc/nomad.d/nomad.hcl - owner: nomad - group: nomad - mode: '0640' - - - name: Set correct ownership for all nomad files - file: - path: "{{ item }}" - owner: nomad - group: nomad - recurse: yes - loop: - - /opt/nomad - - /var/log/nomad - - /etc/nomad.d - - - name: Validate nomad configuration - shell: nomad config validate /etc/nomad.d/nomad.hcl - register: config_validation - ignore_errors: yes - - - name: Show config validation result - debug: - var: config_validation - - - name: Start nomad service on first node (semaphore) - systemd: - name: nomad - state: started - daemon_reload: yes - when: inventory_hostname == 'semaphore' - - - name: Wait for first node to start - pause: - seconds: 30 - 
when: inventory_hostname == 'semaphore' - - - name: Start nomad service on remaining nodes - systemd: - name: nomad - state: started - daemon_reload: yes - when: inventory_hostname != 'semaphore' - - - name: Wait for all services to start - pause: - seconds: 20 - - - name: Check nomad service status - shell: systemctl status nomad --no-pager -l - register: service_status - ignore_errors: yes - - - name: Show service status - debug: - var: service_status.stdout_lines - - - name: Check nomad logs for errors - shell: journalctl -u nomad -n 10 --no-pager - register: nomad_logs - ignore_errors: yes - - - name: Show recent nomad logs - debug: - var: nomad_logs.stdout_lines - - - name: Test nomad connectivity - shell: nomad server members - register: nomad_members - ignore_errors: yes - when: inventory_hostname == 'semaphore' - - - name: Show cluster members - debug: - var: nomad_members.stdout_lines - when: inventory_hostname == 'semaphore' \ No newline at end of file diff --git a/scripts/utilities/complete-nomad-reset.yml b/scripts/utilities/complete-nomad-reset.yml deleted file mode 100644 index 7b3633f..0000000 --- a/scripts/utilities/complete-nomad-reset.yml +++ /dev/null @@ -1,151 +0,0 @@ ---- -- name: Complete Nomad Cluster Reset and Rebuild - hosts: nomad_cluster - become: yes - serial: 1 # 一次处理一个节点 - vars: - nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" - tailscale_ips: - semaphore: "100.116.158.95" - master: "100.117.106.136" - ash3c: "100.116.80.94" - - tasks: - - name: Stop nomad service completely - systemd: - name: nomad - state: stopped - ignore_errors: yes - - - name: Kill any remaining nomad processes - shell: pkill -f nomad || true - ignore_errors: yes - - - name: Remove all nomad data and state - shell: | - rm -rf /opt/nomad/data/* - rm -rf /opt/nomad/data/.* - rm -rf /var/log/nomad/* - ignore_errors: yes - - - name: Create fresh nomad configuration with correct Tailscale IPs - copy: - content: | - datacenter = "dc1" - region = "global" - data_dir = "/opt/nomad/data" - - # 使用 Tailscale IP 地址 - bind_addr = "{{ tailscale_ips[inventory_hostname] }}" - - server { - enabled = true - bootstrap_expect = 3 - encrypt = "{{ nomad_encrypt_key }}" - - server_join { - retry_join = [ - "{{ tailscale_ips.semaphore }}", - "{{ tailscale_ips.master }}", - "{{ tailscale_ips.ash3c }}" - ] - } - } - - client { - enabled = true - network_interface = "tailscale0" - } - - ui_config { - enabled = true - } - - addresses { - http = "0.0.0.0" - rpc = "{{ tailscale_ips[inventory_hostname] }}" - serf = "{{ tailscale_ips[inventory_hostname] }}" - } - - ports { - http = 4646 - rpc = 4647 - serf = 4648 - } - - plugin "docker" { - config { - allow_privileged = true - volumes { - enabled = true - } - } - } - - log_level = "INFO" - log_file = "/var/log/nomad/nomad.log" - dest: /etc/nomad.d/nomad.hcl - owner: nomad - group: nomad - mode: '0640' - - - name: Ensure log directory exists - file: - path: /var/log/nomad - state: directory - owner: nomad - group: nomad - mode: '0755' - - - name: Start nomad service - systemd: - name: nomad - state: started - enabled: yes - - - name: Wait for nomad to start - wait_for: - port: 4646 - host: "{{ tailscale_ips[inventory_hostname] }}" - delay: 5 - timeout: 30 - - - name: Check nomad service status - shell: systemctl status nomad --no-pager -l - register: nomad_status - ignore_errors: yes - - - name: Display nomad status - debug: - var: nomad_status.stdout_lines - -- name: Wait for cluster to form - hosts: localhost - gather_facts: no - tasks: - - name: 
Wait for cluster formation - pause: - seconds: 30 - prompt: "等待集群形成..." - -- name: Verify cluster status - hosts: semaphore - become: yes - tasks: - - name: Check cluster members - shell: nomad server members - register: cluster_members - ignore_errors: yes - - - name: Display cluster members - debug: - var: cluster_members.stdout_lines - - - name: Check node status - shell: nomad node status - register: node_status - ignore_errors: yes - - - name: Display node status - debug: - var: node_status.stdout_lines \ No newline at end of file diff --git a/scripts/utilities/consul-cluster-manager.sh b/scripts/utilities/consul-cluster-manager.sh deleted file mode 100755 index 5503ced..0000000 --- a/scripts/utilities/consul-cluster-manager.sh +++ /dev/null @@ -1,233 +0,0 @@ -#!/bin/bash - -# Consul 集群管理脚本 -# 提供集群状态检查、重启、停止等功能 - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" -INVENTORY_FILE="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini" - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# 打印带颜色的消息 -print_status() { - echo -e "${GREEN}[INFO]${NC} $1" -} - -print_warning() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -print_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -print_header() { - echo -e "${BLUE}=== $1 ===${NC}" -} - -# 检查必要文件 -check_prerequisites() { - if [[ ! -f "$INVENTORY_FILE" ]]; then - print_error "清单文件不存在: $INVENTORY_FILE" - exit 1 - fi - - if ! command -v ansible &> /dev/null; then - print_error "未找到 ansible 命令" - exit 1 - fi -} - -# 显示帮助信息 -show_help() { - echo "Consul 集群管理脚本" - echo - echo "用法: $0 [命令]" - echo - echo "命令:" - echo " status - 检查集群状态" - echo " members - 显示集群成员" - echo " leader - 显示集群领导者" - echo " restart - 重启 Consul 服务" - echo " stop - 停止 Consul 服务" - echo " start - 启动 Consul 服务" - echo " logs - 查看服务日志" - echo " health - 健康检查" - echo " cleanup - 清理 Consul 数据(危险操作)" - echo " help - 显示此帮助信息" - echo -} - -# 检查集群状态 -check_status() { - print_header "Consul 服务状态" - ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "systemctl is-active consul" -o - - echo - print_header "Consul 进程状态" - ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "ps aux | grep consul | grep -v grep" -o -} - -# 显示集群成员 -show_members() { - print_header "Consul 集群成员" - ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul members" -o -} - -# 显示集群领导者 -show_leader() { - print_header "Consul 集群领导者" - ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul operator raft list-peers" -o - - echo - print_header "通过 API 检查领导者" - ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "curl -s http://localhost:8500/v1/status/leader" -o -} - -# 重启服务 -restart_service() { - print_header "重启 Consul 服务" - print_warning "即将重启所有 Consul 节点..." - read -p "确认继续? (y/N): " confirm - if [[ $confirm != "y" && $confirm != "Y" ]]; then - print_status "操作已取消" - return - fi - - ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=restarted" -b - - print_status "等待服务启动..." - sleep 10 - check_status -} - -# 停止服务 -stop_service() { - print_header "停止 Consul 服务" - print_warning "即将停止所有 Consul 节点..." - read -p "确认继续? 
(y/N): " confirm - if [[ $confirm != "y" && $confirm != "Y" ]]; then - print_status "操作已取消" - return - fi - - ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=stopped" -b -} - -# 启动服务 -start_service() { - print_header "启动 Consul 服务" - ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=started" -b - - print_status "等待服务启动..." - sleep 10 - check_status -} - -# 查看日志 -show_logs() { - print_header "Consul 服务日志" - ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "journalctl -u consul --no-pager -n 20" -o -} - -# 健康检查 -health_check() { - print_header "Consul 健康检查" - - # 检查服务状态 - print_status "检查服务状态..." - ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "systemctl is-active consul" -o - - echo - # 检查端口监听 - print_status "检查端口监听..." - ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "ss -tlnp | grep :8500" -o - - echo - # 检查集群成员 - print_status "检查集群成员..." - ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul members | wc -l" -o - - echo - # 检查 API 响应 - print_status "检查 API 响应..." - ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "curl -s -o /dev/null -w '%{http_code}' http://localhost:8500/v1/status/leader" -o -} - -# 清理数据(危险操作) -cleanup_data() { - print_header "清理 Consul 数据" - print_error "警告: 此操作将删除所有 Consul 数据,包括服务注册、KV 存储等!" - print_error "此操作不可逆!" - echo - read -p "确认要清理所有数据? 请输入 'YES' 确认: " confirm - if [[ $confirm != "YES" ]]; then - print_status "操作已取消" - return - fi - - print_status "停止 Consul 服务..." - ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=stopped" -b - - print_status "清理数据目录..." - ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "rm -rf /opt/consul/data/*" -b - - print_status "启动 Consul 服务..." - ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=started" -b - - print_status "数据清理完成" -} - -# 主函数 -main() { - check_prerequisites - - case "${1:-help}" in - status) - check_status - ;; - members) - show_members - ;; - leader) - show_leader - ;; - restart) - restart_service - ;; - stop) - stop_service - ;; - start) - start_service - ;; - logs) - show_logs - ;; - health) - health_check - ;; - cleanup) - cleanup_data - ;; - help|--help|-h) - show_help - ;; - *) - print_error "未知命令: $1" - echo - show_help - exit 1 - ;; - esac -} - -main "$@" \ No newline at end of file diff --git a/scripts/utilities/consul-secrets-manager.sh b/scripts/utilities/consul-secrets-manager.sh deleted file mode 100755 index f317eec..0000000 --- a/scripts/utilities/consul-secrets-manager.sh +++ /dev/null @@ -1,228 +0,0 @@ -#!/bin/bash - -# Consul 密钥管理脚本 -# 用于安全地管理 Oracle Cloud 和其他云服务商的敏感配置 - -set -euo pipefail - -# 配置 -CONSUL_ADDR="${CONSUL_ADDR:-http://localhost:8500}" -CONSUL_TOKEN="${CONSUL_TOKEN:-}" -ENVIRONMENT="${ENVIRONMENT:-dev}" - -# 颜色输出 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 检查 Consul 连接 -check_consul() { - log_info "检查 Consul 连接..." - if ! curl -s "${CONSUL_ADDR}/v1/status/leader" > /dev/null; then - log_error "无法连接到 Consul: ${CONSUL_ADDR}" - exit 1 - fi - log_success "Consul 连接正常" -} - -# 设置 Oracle Cloud 配置 -set_oracle_config() { - log_info "设置 Oracle Cloud 配置..." 
- - echo "请输入 Oracle Cloud 配置信息:" - - read -p "租户 OCID: " tenancy_ocid - read -p "用户 OCID: " user_ocid - read -p "API 密钥指纹: " fingerprint - read -p "私钥文件路径: " private_key_path - read -p "区间 OCID: " compartment_ocid - - # 验证私钥文件是否存在 - if [[ ! -f "$private_key_path" ]]; then - log_error "私钥文件不存在: $private_key_path" - exit 1 - fi - - # 读取私钥内容 - private_key_content=$(cat "$private_key_path") - - # 存储到 Consul - local base_path="config/${ENVIRONMENT}/oracle" - - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/tenancy_ocid" -d "$tenancy_ocid" > /dev/null - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/user_ocid" -d "$user_ocid" > /dev/null - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/fingerprint" -d "$fingerprint" > /dev/null - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/private_key" -d "$private_key_content" > /dev/null - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/compartment_ocid" -d "$compartment_ocid" > /dev/null - - log_success "Oracle Cloud 配置已存储到 Consul" -} - -# 获取 Oracle Cloud 配置 -get_oracle_config() { - log_info "从 Consul 获取 Oracle Cloud 配置..." - - local base_path="config/${ENVIRONMENT}/oracle" - - echo "Oracle Cloud 配置:" - echo "租户 OCID: $(curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/tenancy_ocid?raw" 2>/dev/null || echo "未设置")" - echo "用户 OCID: $(curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/user_ocid?raw" 2>/dev/null || echo "未设置")" - echo "指纹: $(curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/fingerprint?raw" 2>/dev/null || echo "未设置")" - echo "区间 OCID: $(curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/compartment_ocid?raw" 2>/dev/null || echo "未设置")" - echo "私钥: $(curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/private_key?raw" 2>/dev/null | head -1 || echo "未设置")" -} - -# 删除 Oracle Cloud 配置 -delete_oracle_config() { - log_warning "删除 Oracle Cloud 配置..." - - read -p "确定要删除所有 Oracle Cloud 配置吗?(y/N): " confirm - if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then - log_info "操作已取消" - return - fi - - local base_path="config/${ENVIRONMENT}/oracle" - - curl -s -X DELETE "${CONSUL_ADDR}/v1/kv/${base_path}?recurse" > /dev/null - - log_success "Oracle Cloud 配置已删除" -} - -# 生成 Terraform 变量文件 -generate_terraform_vars() { - log_info "生成 Terraform 变量文件..." 
- - local base_path="config/${ENVIRONMENT}/oracle" - local output_file="infrastructure/environments/${ENVIRONMENT}/terraform.tfvars.consul" - - # 从 Consul 获取配置 - local tenancy_ocid=$(curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/tenancy_ocid?raw" 2>/dev/null || echo "") - local user_ocid=$(curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/user_ocid?raw" 2>/dev/null || echo "") - local fingerprint=$(curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/fingerprint?raw" 2>/dev/null || echo "") - local compartment_ocid=$(curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/compartment_ocid?raw" 2>/dev/null || echo "") - - if [[ -z "$tenancy_ocid" ]]; then - log_error "Consul 中没有找到 Oracle Cloud 配置" - exit 1 - fi - - # 创建临时私钥文件 - local temp_key_file="/tmp/oci_private_key_${ENVIRONMENT}.pem" - curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/private_key?raw" > "$temp_key_file" - chmod 600 "$temp_key_file" - - # 生成 Terraform 变量文件 - cat > "$output_file" << EOF -# 从 Consul 生成的 Oracle Cloud 配置 -# 生成时间: $(date) -# 环境: ${ENVIRONMENT} - -oci_config = { - tenancy_ocid = "$tenancy_ocid" - user_ocid = "$user_ocid" - fingerprint = "$fingerprint" - private_key_path = "$temp_key_file" - region = "ap-seoul-1" - compartment_ocid = "$compartment_ocid" -} -EOF - - log_success "Terraform 变量文件已生成: $output_file" - log_warning "私钥文件位置: $temp_key_file" - log_warning "请在使用完毕后删除临时私钥文件" -} - -# 清理临时文件 -cleanup_temp_files() { - log_info "清理临时文件..." - - rm -f /tmp/oci_private_key_*.pem - rm -f infrastructure/environments/*/terraform.tfvars.consul - - log_success "临时文件已清理" -} - -# 显示帮助信息 -show_help() { - cat << EOF -Consul 密钥管理脚本 - -用法: $0 [选项] - -选项: - set-oracle 设置 Oracle Cloud 配置到 Consul - get-oracle 从 Consul 获取 Oracle Cloud 配置 - delete-oracle 从 Consul 删除 Oracle Cloud 配置 - generate-vars 从 Consul 生成 Terraform 变量文件 - cleanup 清理临时文件 - help 显示此帮助信息 - -环境变量: - CONSUL_ADDR Consul 地址 (默认: http://localhost:8500) - CONSUL_TOKEN Consul ACL Token (可选) - ENVIRONMENT 环境名称 (默认: dev) - -示例: - # 设置 Oracle Cloud 配置 - $0 set-oracle - - # 生成 Terraform 变量文件 - $0 generate-vars - - # 查看配置 - $0 get-oracle - - # 清理临时文件 - $0 cleanup -EOF -} - -# 主函数 -main() { - case "${1:-help}" in - "set-oracle") - check_consul - set_oracle_config - ;; - "get-oracle") - check_consul - get_oracle_config - ;; - "delete-oracle") - check_consul - delete_oracle_config - ;; - "generate-vars") - check_consul - generate_terraform_vars - ;; - "cleanup") - cleanup_temp_files - ;; - "help"|*) - show_help - ;; - esac -} - -main "$@" \ No newline at end of file diff --git a/scripts/utilities/correct-nomad-cluster.yml b/scripts/utilities/correct-nomad-cluster.yml deleted file mode 100644 index cbe1717..0000000 --- a/scripts/utilities/correct-nomad-cluster.yml +++ /dev/null @@ -1,115 +0,0 @@ ---- -- name: Correct Nomad Cluster Configuration - hosts: nomad_cluster - become: yes - gather_facts: yes - vars: - nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" - tailscale_ips: - semaphore: "100.116.158.95" - master: "100.117.106.136" - ash3c: "100.116.80.94" - - tasks: - - name: Stop nomad service - systemd: - name: nomad - state: stopped - ignore_errors: yes - - - name: Clean nomad data - file: - path: /opt/nomad/data - state: absent - - - name: Recreate nomad data directory - file: - path: /opt/nomad/data - state: directory - owner: nomad - group: nomad - mode: '0755' - - - name: Create correct nomad configuration - copy: - content: | - datacenter = "dc1" - region = "global" - data_dir = "/opt/nomad/data" - - bind_addr = "{{ tailscale_ips[inventory_hostname] }}" - - server { - enabled = true - 
bootstrap_expect = 3 - encrypt = "{{ nomad_encrypt_key }}" - - server_join { - retry_join = [ - "{{ tailscale_ips.semaphore }}:4647", - "{{ tailscale_ips.master }}:4647", - "{{ tailscale_ips.ash3c }}:4647" - ] - retry_interval = "15s" - retry_max = 3 - } - } - - client { - enabled = true - alloc_dir = "/opt/nomad/alloc_mounts" - } - - ui { - enabled = true - } - - addresses { - http = "0.0.0.0" - rpc = "{{ tailscale_ips[inventory_hostname] }}" - serf = "{{ tailscale_ips[inventory_hostname] }}" - } - - ports { - http = 4646 - rpc = 4647 - serf = 4648 - } - - plugin "docker" { - config { - allow_privileged = true - volumes { - enabled = true - } - } - } - - log_level = "INFO" - log_file = "/var/log/nomad/nomad.log" - dest: /etc/nomad.d/nomad.hcl - owner: nomad - group: nomad - mode: '0640' - -- name: Start nomad services in sequence - hosts: nomad_cluster - become: yes - serial: 1 - tasks: - - name: Start nomad service - systemd: - name: nomad - state: started - daemon_reload: yes - - - name: Wait for nomad to start - wait_for: - port: 4646 - host: "{{ tailscale_ips[inventory_hostname] }}" - delay: 10 - timeout: 60 - - - name: Wait between nodes - pause: - seconds: 30 \ No newline at end of file diff --git a/scripts/utilities/deploy-nomad-configs.yml b/scripts/utilities/deploy-nomad-configs.yml deleted file mode 100644 index 6336b9b..0000000 --- a/scripts/utilities/deploy-nomad-configs.yml +++ /dev/null @@ -1,113 +0,0 @@ ---- -- name: Deploy Nomad Configurations - hosts: nomad_cluster - become: yes - vars: - nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" - node_ips: - semaphore: "100.116.158.95" - master: "100.117.106.136" - ash3c: "100.116.80.94" - - tasks: - - name: Create nomad configuration for each node - copy: - content: | - datacenter = "dc1" - region = "global" - data_dir = "/opt/nomad/data" - - bind_addr = "{{ node_ips[inventory_hostname] }}" - - server { - enabled = true - bootstrap_expect = 3 - encrypt = "{{ nomad_encrypt_key }}" - - server_join { - retry_join = [ - "{{ node_ips.semaphore }}:4647", - "{{ node_ips.master }}:4647", - "{{ node_ips.ash3c }}:4647" - ] - retry_interval = "15s" - retry_max = 3 - } - } - - client { - enabled = true - alloc_dir = "/opt/nomad/alloc_mounts" - } - - ui { - enabled = true - } - - addresses { - http = "0.0.0.0" - rpc = "{{ node_ips[inventory_hostname] }}" - serf = "{{ node_ips[inventory_hostname] }}" - } - - ports { - http = 4646 - rpc = 4647 - serf = 4648 - } - - plugin "docker" { - config { - allow_privileged = true - volumes { - enabled = true - } - } - } - - log_level = "INFO" - log_file = "/var/log/nomad/nomad.log" - dest: /etc/nomad.d/nomad.hcl - owner: nomad - group: nomad - mode: '0640' - - - name: Validate nomad configuration - shell: nomad config validate /etc/nomad.d/nomad.hcl - register: config_validation - - - name: Show validation result - debug: - var: config_validation.stdout_lines - - - name: Start nomad service on bootstrap node first - systemd: - name: nomad - state: started - daemon_reload: yes - when: inventory_hostname == 'semaphore' - - - name: Wait for bootstrap node - pause: - seconds: 15 - when: inventory_hostname == 'semaphore' - - - name: Start nomad service on other nodes - systemd: - name: nomad - state: started - daemon_reload: yes - when: inventory_hostname != 'semaphore' - - - name: Wait for services to start - pause: - seconds: 10 - - - name: Check service status - shell: systemctl status nomad --no-pager - register: service_status - ignore_errors: yes - - - name: Show service status - debug: 
- var: service_status.stdout_lines \ No newline at end of file diff --git a/scripts/utilities/disk-monitor.sh b/scripts/utilities/disk-monitor.sh new file mode 100755 index 0000000..799838c --- /dev/null +++ b/scripts/utilities/disk-monitor.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# 磁盘监控脚本 +# 使用方法: ./disk-monitor.sh [threshold] + +THRESHOLD=${1:-85} # 默认阈值 85% +INVENTORY_FILE="configuration/inventories/production/nomad-cluster.ini" + +echo "🔍 开始磁盘空间监控 (阈值: ${THRESHOLD}%)" +echo "==================================" + +# 运行磁盘分析 +echo "📊 运行磁盘分析..." +ansible-playbook -i "$INVENTORY_FILE" configuration/playbooks/disk-analysis-ncdu.yml + +echo "" +echo "⚠️ 检查高磁盘使用率节点..." + +# 检查所有节点的磁盘使用情况 +ansible all -i "$INVENTORY_FILE" -m shell -a "df -h | awk 'NR>1 {gsub(/%/, \"\", \$5); if(\$5 > $THRESHOLD) print \$0}'" | while read line; do + if [[ $line == *"=>"* ]]; then + echo "🚨 节点: $line" + elif [[ $line =~ ^/dev ]]; then + echo " 高使用率磁盘: $line" + fi +done + +echo "" +echo "💡 如需清理,运行:" +echo " ansible-playbook -i $INVENTORY_FILE configuration/playbooks/disk-cleanup.yml" +echo "" +echo "📁 详细报告位置: /tmp/disk-analysis/" +echo " 使用 ncdu -f /tmp/disk-analysis/ncdu-root-.json 查看详细信息" \ No newline at end of file diff --git a/scripts/utilities/final-nomad-cluster-fix.yml b/scripts/utilities/final-nomad-cluster-fix.yml deleted file mode 100644 index 46080a9..0000000 --- a/scripts/utilities/final-nomad-cluster-fix.yml +++ /dev/null @@ -1,190 +0,0 @@ ---- -- name: Final Complete Nomad Cluster Fix - hosts: nomad_cluster - become: yes - gather_facts: yes - vars: - nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" - nomad_servers: - - "100.116.158.95:4647" # semaphore - - "100.117.106.136:4647" # master - - "100.116.80.94:4647" # ash3c - - tasks: - - name: Stop nomad service - systemd: - name: nomad - state: stopped - ignore_errors: yes - - - name: Reset failed nomad service - systemd: - name: nomad - daemon_reload: yes - ignore_errors: yes - - - name: Create nomad user if not exists - user: - name: nomad - system: yes - shell: /bin/false - home: /opt/nomad - create_home: no - - - name: Create nomad directories with correct permissions - file: - path: "{{ item }}" - state: directory - owner: nomad - group: nomad - mode: '0755' - loop: - - /etc/nomad.d - - /opt/nomad - - /opt/nomad/data - - /opt/nomad/alloc_mounts - - /var/log/nomad - - - name: Clean old nomad data - file: - path: /opt/nomad/data - state: absent - - - name: Recreate nomad data directory - file: - path: /opt/nomad/data - state: directory - owner: nomad - group: nomad - mode: '0755' - - - name: Get Tailscale IP address - shell: ip addr show tailscale0 | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1 - register: tailscale_ip - failed_when: false - - - name: Set bind address (fallback to default interface if tailscale not available) - set_fact: - bind_address: "{{ tailscale_ip.stdout if tailscale_ip.stdout != '' else ansible_default_ipv4.address }}" - - - name: Generate nomad configuration - template: - src: nomad-server.hcl.j2 - dest: /etc/nomad.d/nomad.hcl - owner: nomad - group: nomad - mode: '0640' - vars: - nomad_datacenter: "dc1" - nomad_region: "global" - nomad_data_dir: "/opt/nomad/data" - nomad_bind_addr: "{{ bind_address }}" - nomad_bootstrap_expect: 3 - nomad_encrypt: "{{ nomad_encrypt_key }}" - nomad_retry_join: "{{ nomad_servers }}" - nomad_alloc_dir: "/opt/nomad/alloc_mounts" - nomad_log_file: "/var/log/nomad/nomad.log" - - - name: Create nomad systemd service - copy: - content: | - [Unit] - Description=Nomad - 
Documentation=https://www.nomadproject.io/ - Requires=network-online.target - After=network-online.target - ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl - - [Service] - Type=notify - User=nomad - Group=nomad - ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl - ExecReload=/bin/kill -HUP $MAINPID - KillMode=process - Restart=on-failure - LimitNOFILE=65536 - - [Install] - WantedBy=multi-user.target - dest: /etc/systemd/system/nomad.service - mode: '0644' - - - name: Reload systemd daemon - systemd: - daemon_reload: yes - - - name: Start nomad service - systemd: - name: nomad - state: started - enabled: yes - - - name: Wait for nomad to start - wait_for: - port: 4646 - host: "{{ bind_address }}" - delay: 5 - timeout: 30 - ignore_errors: yes - -- name: Create nomad configuration template - hosts: localhost - gather_facts: no - tasks: - - name: Create nomad server template - copy: - content: | - datacenter = "{{ nomad_datacenter }}" - region = "{{ nomad_region }}" - data_dir = "{{ nomad_data_dir }}" - - bind_addr = "{{ nomad_bind_addr }}" - - server { - enabled = true - bootstrap_expect = {{ nomad_bootstrap_expect }} - encrypt = "{{ nomad_encrypt }}" - - server_join { - retry_join = {{ nomad_retry_join | to_json }} - retry_interval = "15s" - retry_max = 3 - } - } - - client { - enabled = true - alloc_dir = "{{ nomad_alloc_dir }}" - } - - ui { - enabled = true - } - - addresses { - http = "0.0.0.0" - rpc = "{{ nomad_bind_addr }}" - serf = "{{ nomad_bind_addr }}" - } - - ports { - http = 4646 - rpc = 4647 - serf = 4648 - } - - plugin "docker" { - config { - allow_privileged = true - volumes { - enabled = true - } - } - } - - log_level = "INFO" - log_file = "{{ nomad_log_file }}" - dest: /tmp/nomad-server.hcl.j2 - delegate_to: localhost - run_once: true \ No newline at end of file diff --git a/scripts/utilities/final-nomad-fix.yml b/scripts/utilities/final-nomad-fix.yml deleted file mode 100644 index ed51095..0000000 --- a/scripts/utilities/final-nomad-fix.yml +++ /dev/null @@ -1,111 +0,0 @@ ---- -- name: Final Nomad Cluster Fix - hosts: nomad_cluster - become: yes - vars: - nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" - tailscale_ips: - semaphore: "100.116.158.95" - master: "100.117.106.136" - ash3c: "100.116.80.94" - - tasks: - - name: Stop nomad service - systemd: - name: nomad - state: stopped - ignore_errors: yes - - - name: Create required directories - file: - path: "{{ item }}" - state: directory - owner: nomad - group: nomad - mode: '0755' - loop: - - /opt/nomad/data - - /opt/nomad/alloc_mounts - - /var/log/nomad - - - name: Clean nomad data - shell: rm -rf /opt/nomad/data/* - ignore_errors: yes - - - name: Create working nomad configuration - copy: - content: | - datacenter = "dc1" - region = "global" - data_dir = "/opt/nomad/data" - - bind_addr = "{{ tailscale_ips[inventory_hostname] }}" - - server { - enabled = true - bootstrap_expect = 3 - encrypt = "{{ nomad_encrypt_key }}" - - server_join { - retry_join = [ - "{{ tailscale_ips.semaphore }}", - "{{ tailscale_ips.master }}", - "{{ tailscale_ips.ash3c }}" - ] - } - } - - client { - enabled = true - } - - ui { - enabled = true - } - - addresses { - http = "0.0.0.0" - rpc = "{{ tailscale_ips[inventory_hostname] }}" - serf = "{{ tailscale_ips[inventory_hostname] }}" - } - - ports { - http = 4646 - rpc = 4647 - serf = 4648 - } - - plugin "docker" { - config { - allow_privileged = true - volumes { - enabled = true - } - } - } - - log_level = "INFO" - log_file = "/var/log/nomad/nomad.log" - dest: 
/etc/nomad.d/nomad.hcl - owner: nomad - group: nomad - mode: '0640' - - - name: Start nomad service - systemd: - name: nomad - state: started - enabled: yes - - - name: Wait for service to start - pause: - seconds: 10 - - - name: Check service status - shell: systemctl status nomad --no-pager -l - register: service_status - ignore_errors: yes - - - name: Show service status - debug: - var: service_status.stdout_lines \ No newline at end of file diff --git a/scripts/utilities/fix-ash3c-ip.sh b/scripts/utilities/fix-ash3c-ip.sh deleted file mode 100755 index 8428d89..0000000 --- a/scripts/utilities/fix-ash3c-ip.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash - -# 🔧 ash3c IP 地址修复脚本 - -set -e - -echo "🔧 ash3c IP 地址问题修复脚本" -echo "" - -# 定义正确的 IP 地址 -CORRECT_IP="100.116.80.94" -ASH3C_HOST="100.116.80.94" - -echo "📡 检查 ash3c 节点的网络配置..." - -# 检查 ash3c 的实际 IP 配置 -echo "🔍 检查 ash3c 节点的 IP 地址绑定..." -ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S ip addr show" | grep -E "inet.*100\." || echo "❌ 未找到 Tailscale IP" - -echo "" -echo "🔍 检查 Tailscale 状态..." -ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S tailscale status" || echo "❌ Tailscale 状态检查失败" - -echo "" -echo "🔧 修复 ash3c 的 Nomad 配置..." - -# 创建正确的配置文件 -cat > /tmp/ash3c-nomad.hcl << EOF -# 🔧 ash3c 修复后的 Nomad 配置 -datacenter = "dc1" -region = "global" -data_dir = "/opt/nomad/data" - -# 强制使用正确的 Tailscale IP -bind_addr = "${CORRECT_IP}" - -# 日志配置 -log_level = "INFO" -log_file = "/var/log/nomad/nomad.log" - -server { - enabled = true - bootstrap_expect = 3 - encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" - - server_join { - retry_join = [ - "100.116.158.95:4647", - "100.117.106.136:4647", - "100.116.80.94:4647" - ] - retry_max = 10 - retry_interval = "15s" - } - - # 更宽松的心跳配置 - heartbeat_grace = "30s" - min_heartbeat_ttl = "10s" -} - -client { - enabled = true - network_interface = "tailscale0" -} - -ui_config { - enabled = true -} - -addresses { - http = "0.0.0.0" - rpc = "${CORRECT_IP}" - serf = "${CORRECT_IP}" -} - -ports { - http = 4646 - rpc = 4647 - serf = 4648 -} - -plugin "docker" { - config { - allow_privileged = true - volumes { - enabled = true - } - } -} -EOF - -echo "📤 上传修复后的配置到 ash3c..." -scp -P 22 -i ~/.ssh/id_ed25519 /tmp/ash3c-nomad.hcl ben@${ASH3C_HOST}:/tmp/ - -echo "🔧 在 ash3c 上应用修复..." -ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} << 'REMOTE_SCRIPT' -echo '3131' | sudo -S systemctl stop nomad || true -echo '3131' | sudo -S pkill -f nomad || true -sleep 5 - -# 备份旧配置 -echo '3131' | sudo -S cp /etc/nomad.d/nomad.hcl /etc/nomad.d/nomad.hcl.backup.$(date +%Y%m%d_%H%M%S) || true - -# 应用新配置 -echo '3131' | sudo -S cp /tmp/ash3c-nomad.hcl /etc/nomad.d/nomad.hcl -echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl -echo '3131' | sudo -S chmod 640 /etc/nomad.d/nomad.hcl - -# 清理数据目录 -echo '3131' | sudo -S rm -rf /opt/nomad/data/* - -# 重启服务 -echo '3131' | sudo -S systemctl daemon-reload -echo '3131' | sudo -S systemctl enable nomad -echo '3131' | sudo -S systemctl start nomad - -echo "✅ ash3c 配置修复完成" -REMOTE_SCRIPT - -echo "" -echo "⏰ 等待 ash3c 服务启动..." -sleep 15 - -echo "" -echo "🔍 检查 ash3c 服务状态..." -ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S systemctl status nomad --no-pager" || echo "❌ 服务状态检查失败" - -echo "" -echo "🧹 清理临时文件..." -rm -f /tmp/ash3c-nomad.hcl - -echo "" -echo "✅ ash3c IP 修复完成!" -echo "" -echo "下一步:" -echo "1. 检查集群状态: nomad server members" -echo "2. 
如果还有问题,运行核弹级重置: ./scripts/utilities/nuclear-reset.sh" \ No newline at end of file diff --git a/scripts/utilities/fix-consul-cluster.sh b/scripts/utilities/fix-consul-cluster.sh deleted file mode 100755 index 35c07a4..0000000 --- a/scripts/utilities/fix-consul-cluster.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/bash - -# Consul 集群修复脚本 -# 解决 "No cluster leader" 问题 - -set -e - -echo "=== Consul 集群修复脚本 ===" -echo "当前时间: $(date)" -echo - -# 检查当前 Consul 服务状态 -echo "1. 检查当前 Consul 服务状态..." -docker service ls | grep consul || echo "未找到 consul 服务" -echo - -# 显示当前问题 -echo "2. 检查 Consul 日志中的错误..." -echo "Master 节点日志:" -docker service logs consul-cluster_consul-master --tail 5 2>/dev/null || echo "无法获取 master 日志" -echo -echo "Ash3c 节点日志:" -docker service logs consul-cluster_consul-ash3c --tail 5 2>/dev/null || echo "无法获取 ash3c 日志" -echo - -# 提供修复选项 -echo "3. 修复选项:" -echo " a) 使用修复后的 overlay 网络配置 (推荐)" -echo " b) 使用 macvlan 网络配置" -echo " c) 仅重启现有服务" -echo - -read -p "请选择修复方案 (a/b/c): " choice - -case $choice in - a) - echo "使用修复后的 overlay 网络配置..." - - # 停止现有服务 - echo "停止现有 Consul 集群..." - docker stack rm consul-cluster 2>/dev/null || echo "consul-cluster stack 不存在" - - # 等待服务完全停止 - echo "等待服务完全停止..." - sleep 10 - - # 清理数据卷 (可选) - read -p "是否清理现有数据卷? (y/n): " clean_volumes - if [[ $clean_volumes == "y" ]]; then - docker volume rm consul-cluster_consul_master_data 2>/dev/null || true - docker volume rm consul-cluster_consul_ash3c_data 2>/dev/null || true - echo "数据卷已清理" - fi - - # 部署修复后的配置 - echo "部署修复后的 Consul 集群..." - docker stack deploy -c /root/mgmt/swarm/stacks/consul-cluster-fixed.yml consul-cluster - - echo "等待服务启动..." - sleep 15 - - # 检查服务状态 - echo "检查新服务状态..." - docker service ls | grep consul - ;; - - b) - echo "使用 macvlan 网络配置..." - echo "注意: 需要根据你的网络环境调整 IP 地址和网络接口" - - # 检查网络接口 - echo "当前网络接口:" - ip link show | grep -E "^[0-9]+:" | awk '{print $2}' | sed 's/://' - echo - - read -p "请输入要使用的网络接口 (如 eth0): " interface - read -p "请输入子网 (如 192.168.1.0/24): " subnet - read -p "请输入网关 (如 192.168.1.1): " gateway - - # 更新 macvlan 配置文件 - sed -i "s/parent: eth0/parent: $interface/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml - sed -i "s/192.168.1.0\/24/$subnet/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml - sed -i "s/192.168.1.1/$gateway/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml - - # 停止现有服务 - echo "停止现有 Consul 集群..." - docker stack rm consul-cluster 2>/dev/null || echo "consul-cluster stack 不存在" - - # 等待服务完全停止 - echo "等待服务完全停止..." - sleep 10 - - # 部署 macvlan 配置 - echo "部署 macvlan Consul 集群..." - docker stack deploy -c /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml consul-cluster - - echo "等待服务启动..." - sleep 15 - - # 检查服务状态 - echo "检查新服务状态..." - docker service ls | grep consul - ;; - - c) - echo "重启现有服务..." - - # 重启服务 - docker service update --force consul-cluster_consul-master - docker service update --force consul-cluster_consul-ash3c - - echo "等待服务重启..." - sleep 10 - - # 检查服务状态 - echo "检查服务状态..." - docker service ls | grep consul - ;; - - *) - echo "无效选择,退出" - exit 1 - ;; -esac - -echo -echo "4. 验证修复结果..." -sleep 5 - -# 检查服务状态 -echo "服务状态:" -docker service ls | grep consul - -echo -echo "等待 30 秒后检查集群状态..." -sleep 30 - -# 尝试检查集群成员 -echo "尝试检查集群成员状态..." -timeout 10 docker service logs consul-cluster_consul-master --tail 10 2>/dev/null || echo "无法获取日志" - -echo -echo "=== 修复完成 ===" -echo "请等待几分钟让集群完全启动,然后访问:" -echo "- Master UI: http://your-master-ip:8500" -echo "- Ash3c UI: http://your-ash3c-ip:8501" -echo -echo "如果问题仍然存在,请检查:" -echo "1. 节点间网络连通性" -echo "2. 防火墙设置" -echo "3. 
Docker Swarm 网络配置" \ No newline at end of file diff --git a/scripts/utilities/fix-nomad-cluster.yml b/scripts/utilities/fix-nomad-cluster.yml deleted file mode 100644 index 335295c..0000000 --- a/scripts/utilities/fix-nomad-cluster.yml +++ /dev/null @@ -1,92 +0,0 @@ ---- -- name: Fix Nomad Cluster Issues - hosts: nomad_cluster - become: yes - vars: - nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" - - tasks: - - name: Stop nomad service - systemd: - name: nomad - state: stopped - ignore_errors: yes - - - name: Clean nomad data directory - shell: rm -rf /opt/nomad/data/* - ignore_errors: yes - - - name: Create correct nomad configuration - copy: - content: | - datacenter = "dc1" - region = "global" - data_dir = "/opt/nomad/data" - - bind_addr = "{{ ansible_host | default(hostvars[inventory_hostname]['ansible_default_ipv4']['address']) }}" - - server { - enabled = true - bootstrap_expect = 3 - encrypt = "{{ nomad_encrypt_key }}" - - server_join { - retry_join = ["100.116.158.95", "100.117.106.136", "100.116.80.94"] - } - } - - client { - enabled = true - network_interface = "{{ ansible_default_ipv4.interface | default('eth0') }}" - } - - ui { - enabled = true - } - - addresses { - http = "0.0.0.0" - rpc = "0.0.0.0" - serf = "0.0.0.0" - } - - ports { - http = 4646 - rpc = 4647 - serf = 4648 - } - - plugin "docker" { - config { - allow_privileged = true - volumes { - enabled = true - } - } - } - dest: /etc/nomad.d/nomad.hcl - owner: nomad - group: nomad - mode: '0640' - - - name: Start nomad service - systemd: - name: nomad - state: started - enabled: yes - - - name: Wait for nomad to start - wait_for: - port: 4646 - host: "{{ ansible_host | default(hostvars[inventory_hostname]['ansible_default_ipv4']['address']) }}" - delay: 10 - timeout: 60 - - - name: Check nomad status - shell: systemctl status nomad --no-pager -l - register: nomad_status - ignore_errors: yes - - - name: Display nomad status - debug: - var: nomad_status.stdout_lines \ No newline at end of file diff --git a/scripts/utilities/gitea-repo-manager.sh b/scripts/utilities/gitea-repo-manager.sh deleted file mode 100755 index 29e49b6..0000000 --- a/scripts/utilities/gitea-repo-manager.sh +++ /dev/null @@ -1,242 +0,0 @@ -#!/bin/bash -# Gitea 仓库管理脚本 - -set -e - -# 配置 -GITEA_HOST="gitea" -GITEA_USER="ben" -GITEA_HTTP_URL="http://${GITEA_HOST}:3000" -GITEA_SSH_URL="git@${GITEA_HOST}" -REPO_NAME="mgmt" - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# 打印带颜色的消息 -print_message() { - local color=$1 - local message=$2 - echo -e "${color}${message}${NC}" -} - -# 检查 SSH 连接 -check_ssh_connection() { - print_message $BLUE "🔍 检查 Gitea SSH 连接..." - - if ssh -o ConnectTimeout=5 -o BatchMode=yes "${GITEA_SSH_URL}" 2>&1 | grep -q "successfully authenticated"; then - print_message $GREEN "✅ SSH 连接正常" - return 0 - else - print_message $RED "❌ SSH 连接失败" - return 1 - fi -} - -# 检查仓库状态 -check_repo_status() { - print_message $BLUE "📊 检查仓库状态..." 
- - if [ -d ".git" ]; then - print_message $GREEN "✅ Git 仓库已初始化" - - if git remote get-url origin >/dev/null 2>&1; then - local origin_url=$(git remote get-url origin) - print_message $GREEN "✅ 远程仓库: $origin_url" - else - print_message $YELLOW "⚠️ 未配置远程仓库" - fi - - local branch=$(git branch --show-current) - print_message $BLUE "📍 当前分支: $branch" - - local status=$(git status --porcelain) - if [ -z "$status" ]; then - print_message $GREEN "✅ 工作目录干净" - else - print_message $YELLOW "⚠️ 有未提交的变更" - fi - else - print_message $RED "❌ 不是 Git 仓库" - fi -} - -# 初始化仓库 -init_repo() { - print_message $BLUE "📦 初始化 Git 仓库..." - - if [ ! -d ".git" ]; then - git init - git config user.name "${GITEA_USER}" - git config user.email "${GITEA_USER}@example.com" - print_message $GREEN "✅ Git 仓库初始化完成" - fi - - # 配置远程仓库 - if ! git remote get-url origin >/dev/null 2>&1; then - git remote add origin "${GITEA_SSH_URL}:${GITEA_USER}/${REPO_NAME}.git" - print_message $GREEN "✅ 远程仓库配置完成" - fi -} - -# 同步代码 -sync_code() { - print_message $BLUE "🔄 同步代码..." - - # 检查是否有未提交的变更 - if ! git diff --quiet || ! git diff --staged --quiet; then - print_message $YELLOW "⚠️ 发现未提交的变更" - git status --short - - read -p "是否提交这些变更? (y/N): " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - git add . - read -p "请输入提交消息: " commit_message - git commit -m "$commit_message" - print_message $GREEN "✅ 变更已提交" - else - print_message $YELLOW "⚠️ 跳过提交" - return 1 - fi - fi - - # 推送到远程仓库 - if git push origin main; then - print_message $GREEN "✅ 代码推送成功" - else - print_message $RED "❌ 代码推送失败" - return 1 - fi -} - -# 拉取最新代码 -pull_code() { - print_message $BLUE "⬇️ 拉取最新代码..." - - if git pull origin main; then - print_message $GREEN "✅ 代码拉取成功" - else - print_message $RED "❌ 代码拉取失败" - return 1 - fi -} - -# 查看提交历史 -show_history() { - print_message $BLUE "📜 提交历史:" - git log --oneline --graph --decorate -10 -} - -# 查看分支状态 -show_branches() { - print_message $BLUE "🌿 分支状态:" - git branch -a -} - -# 创建新分支 -create_branch() { - local branch_name=$1 - if [ -z "$branch_name" ]; then - read -p "请输入分支名称: " branch_name - fi - - if [ -n "$branch_name" ]; then - git checkout -b "$branch_name" - print_message $GREEN "✅ 分支 '$branch_name' 创建成功" - else - print_message $RED "❌ 分支名称不能为空" - fi -} - -# 切换分支 -switch_branch() { - local branch_name=$1 - if [ -z "$branch_name" ]; then - print_message $BLUE "可用分支:" - git branch -a - read -p "请输入要切换的分支名称: " branch_name - fi - - if [ -n "$branch_name" ]; then - git checkout "$branch_name" - print_message $GREEN "✅ 已切换到分支 '$branch_name'" - else - print_message $RED "❌ 分支名称不能为空" - fi -} - -# 显示帮助 -show_help() { - echo "Gitea 仓库管理脚本" - echo "" - echo "用法: $0 [命令]" - echo "" - echo "命令:" - echo " check 检查连接和仓库状态" - echo " init 初始化仓库" - echo " sync 同步代码到远程仓库" - echo " pull 拉取最新代码" - echo " history 查看提交历史" - echo " branches 查看分支状态" - echo " create-branch [name] 创建新分支" - echo " switch-branch [name] 切换分支" - echo " status 查看仓库状态" - echo " help 显示帮助信息" - echo "" - echo "示例:" - echo " $0 check # 检查状态" - echo " $0 sync # 同步代码" - echo " $0 create-branch feature-x # 创建功能分支" -} - -# 主函数 -main() { - local command=${1:-help} - - case $command in - check) - check_ssh_connection - check_repo_status - ;; - init) - init_repo - ;; - sync) - sync_code - ;; - pull) - pull_code - ;; - history) - show_history - ;; - branches) - show_branches - ;; - create-branch) - create_branch "$2" - ;; - switch-branch) - switch_branch "$2" - ;; - status) - check_repo_status - ;; - help|--help|-h) - show_help - ;; - *) - print_message $RED "❌ 未知命令: $command" - show_help - exit 1 
- ;; - esac -} - -# 执行主函数 -main "$@" \ No newline at end of file diff --git a/scripts/utilities/nomad-cluster-manager.sh b/scripts/utilities/nomad-cluster-manager.sh new file mode 100755 index 0000000..9a71e99 --- /dev/null +++ b/scripts/utilities/nomad-cluster-manager.sh @@ -0,0 +1,227 @@ +#!/bin/bash + +# 🚀 Nomad 集群管理脚本 +# Nomad Cluster Management Script + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +PURPLE='\033[0;35m' +CYAN='\033[0;36m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +log_header() { + echo -e "${PURPLE}=== $1 ===${NC}" +} + +# 显示集群状态 +show_cluster_status() { + log_header "Nomad 集群状态概览" + + # 检查 Leader + echo -e "${CYAN}Leader 状态:${NC}" + LEADER=$(curl -s http://localhost:4646/v1/status/leader 2>/dev/null || echo "无法连接") + if [[ "$LEADER" =~ ^\".*\"$ ]]; then + echo " ✅ Leader: $(echo $LEADER | tr -d '\"')" + else + echo " ❌ 无 Leader 或连接失败" + return 1 + fi + + echo "" + + # 节点状态 + echo -e "${CYAN}节点状态:${NC}" + curl -s http://localhost:4646/v1/nodes 2>/dev/null | jq -r '.[] | " \(.Status == "ready" and "✅" or "❌") \(.Name) (\(.Address)) - \(.Status)"' 2>/dev/null || { + log_warning "无法获取节点状态详情" + nomad node status 2>/dev/null || echo " ❌ 命令执行失败" + } + + echo "" + + # 驱动状态 + echo -e "${CYAN}驱动程序状态:${NC}" + curl -s http://localhost:4646/v1/nodes 2>/dev/null | jq -r ' + .[] | + " 节点: \(.Name)" as $node | + .Drivers | + to_entries[] | + " \(.value.Healthy and "✅" or "❌") \(.key): \(.value.HealthDescription // "未知")" + ' 2>/dev/null || { + log_warning "无法获取驱动状态详情" + } +} + +# 显示作业状态 +show_jobs_status() { + log_header "作业状态" + + JOBS=$(curl -s http://localhost:4646/v1/jobs 2>/dev/null) + if [[ "$?" -eq 0 ]] && [[ "$JOBS" != "[]" ]] && [[ "$JOBS" != "null" ]]; then + echo "$JOBS" | jq -r '.[] | " \(.Status == "running" and "✅" or "❌") \(.Name) - \(.Status)"' 2>/dev/null + else + echo " 📝 当前没有运行的作业" + fi +} + +# 显示访问信息 +show_access_info() { + log_header "访问信息" + + echo -e "${CYAN}Web UI:${NC}" + echo " 🌐 http://100.116.158.95:4646" + echo "" + + echo -e "${CYAN}API 端点:${NC}" + echo " 🔗 http://100.116.158.95:4646/v1/" + echo "" + + echo -e "${CYAN}常用命令:${NC}" + echo " 📊 nomad status # 查看集群概览" + echo " 🖥️ nomad node status # 查看节点状态" + echo " 🔧 nomad server members # 查看服务器成员" + echo " 📋 nomad job status # 查看作业状态" + echo " 🚀 nomad job run # 运行作业" + echo " 📜 journalctl -u nomad -f # 查看日志" +} + +# 运行诊断 +run_diagnosis() { + log_header "运行完整诊断" + + if [[ -f "$PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh" ]]; then + bash "$PROJECT_ROOT/scripts/utilities/nomad-diagnosis.sh" + else + log_error "诊断脚本未找到" + return 1 + fi +} + +# 配置 Podman 驱动 +configure_podman() { + log_header "配置所有节点使用 Podman 驱动" + + local playbook="$PROJECT_ROOT/configuration/playbooks/configure-nomad-podman-cluster.yml" + local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" + + if [[ ! -f "$playbook" ]]; then + log_error "Playbook 文件不存在: $playbook" + return 1 + fi + + if [[ ! 
-f "$inventory" ]]; then + log_error "Inventory 文件不存在: $inventory" + return 1 + fi + + cd "$PROJECT_ROOT/configuration" + python3 -m ansible playbook -i "$inventory" "$playbook" -v +} + +# 重启集群 +restart_cluster() { + log_header "重启 Nomad 集群" + + log_warning "这将重启整个 Nomad 集群" + read -p "确认继续? (y/N): " -n 1 -r + echo "" + + if [[ $REPLY =~ ^[Yy]$ ]]; then + local inventory="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" + cd "$PROJECT_ROOT/configuration" + python3 -m ansible adhoc -i "$inventory" nomad_cluster -m systemd -a "name=nomad state=restarted" --become + + log_info "等待集群启动..." + sleep 15 + show_cluster_status + else + log_info "操作已取消" + fi +} + +# 主菜单 +show_menu() { + echo "" + log_header "Nomad 集群管理菜单" + echo "" + echo "1) 📊 显示集群状态" + echo "2) 📋 显示作业状态" + echo "3) 🔍 运行完整诊断" + echo "4) 🐳 配置 Podman 驱动" + echo "5) 🔄 重启集群" + echo "6) ℹ️ 显示访问信息" + echo "0) ❌ 退出" + echo "" +} + +# 主函数 +main() { + echo "" + echo "🚀 Nomad 集群管理工具" + echo "===================" + + while true; do + show_menu + read -p "请选择操作 (0-6): " choice + + case $choice in + 1) + show_cluster_status + ;; + 2) + show_jobs_status + ;; + 3) + run_diagnosis + ;; + 4) + configure_podman + ;; + 5) + restart_cluster + ;; + 6) + show_access_info + ;; + 0) + log_info "再见!" + exit 0 + ;; + *) + log_error "无效选择,请重试" + ;; + esac + + echo "" + read -p "按回车键继续..." -r + done +} + +# 如果直接运行脚本 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file diff --git a/scripts/utilities/proxy-toggle.sh b/scripts/utilities/proxy-toggle.sh deleted file mode 100755 index 777b01c..0000000 --- a/scripts/utilities/proxy-toggle.sh +++ /dev/null @@ -1,304 +0,0 @@ -#!/bin/bash - -# 代理开关脚本 -# 用于一键开启/关闭 istoreos.tailnet-68f9.ts.net:1082 代理 - -set -euo pipefail - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -# 代理配置 -PROXY_HOST="istoreos.tailnet-68f9.ts.net" -PROXY_PORT="1082" -PROXY_URL="http://${PROXY_HOST}:${PROXY_PORT}" - -# 配置文件路径 -PROXY_ENV_FILE="/root/mgmt/configuration/proxy.env" -SHELL_RC_FILE="$HOME/.zshrc" -BASH_RC_FILE="$HOME/.bashrc" - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 检查代理状态 -check_proxy_status() { - if [[ -n "${http_proxy:-}" ]] || [[ -n "${HTTP_PROXY:-}" ]]; then - echo "on" - else - echo "off" - fi -} - -# 测试代理连接 -test_proxy() { - log_info "测试代理连接..." - if curl -s --connect-timeout 5 --proxy "$PROXY_URL" https://httpbin.org/ip >/dev/null 2>&1; then - log_success "代理连接正常" - return 0 - else - log_error "代理连接失败" - return 1 - fi -} - -# 开启代理 -enable_proxy() { - log_info "开启代理..." - - # 设置环境变量 - export http_proxy="$PROXY_URL" - export https_proxy="$PROXY_URL" - export HTTP_PROXY="$PROXY_URL" - export HTTPS_PROXY="$PROXY_URL" - export no_proxy="localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net" - export NO_PROXY="localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net" - export ALL_PROXY="$PROXY_URL" - export all_proxy="$PROXY_URL" - - # 测试连接 - if test_proxy; then - log_success "代理已开启: $PROXY_URL" - - # 显示当前IP - local current_ip=$(curl -s --connect-timeout 5 --proxy "$PROXY_URL" https://httpbin.org/ip | jq -r .origin 2>/dev/null || echo "未知") - log_info "当前IP: $current_ip" - - return 0 - else - log_error "代理开启失败" - return 1 - fi -} - -# 关闭代理 -disable_proxy() { - log_info "关闭代理..." 
- - # 清除环境变量 - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - unset no_proxy NO_PROXY ALL_PROXY all_proxy - - log_success "代理已关闭" - - # 显示当前IP - local current_ip=$(curl -s --connect-timeout 5 https://httpbin.org/ip | jq -r .origin 2>/dev/null || echo "未知") - log_info "当前IP: $current_ip" -} - -# 切换代理状态 -toggle_proxy() { - local current_status=$(check_proxy_status) - - if [[ "$current_status" == "on" ]]; then - disable_proxy - else - enable_proxy - fi -} - -# 永久开启代理(写入配置文件) -enable_proxy_permanent() { - log_info "永久开启代理..." - - # 创建代理环境文件 - cat > "$PROXY_ENV_FILE" << EOF -# Proxy Configuration for ${PROXY_HOST}:${PROXY_PORT} -# This file contains proxy environment variables for the management system - -# HTTP/HTTPS Proxy Settings -export http_proxy=${PROXY_URL} -export https_proxy=${PROXY_URL} -export HTTP_PROXY=${PROXY_URL} -export HTTPS_PROXY=${PROXY_URL} - -# No Proxy Settings (local networks and services) -export no_proxy=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net -export NO_PROXY=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net - -# Additional proxy settings for various tools -export ALL_PROXY=${PROXY_URL} -export all_proxy=${PROXY_URL} - -# Docker proxy settings -export DOCKER_BUILDKIT=1 -export BUILDKIT_PROGRESS=plain - -# Git proxy settings -export GIT_HTTP_PROXY=${PROXY_URL} -export GIT_HTTPS_PROXY=${PROXY_URL} - -# Curl proxy settings -export CURL_PROXY=${PROXY_URL} - -# Wget proxy settings -export WGET_PROXY=${PROXY_URL} -EOF - - # 在 shell 配置文件中加载代理配置 - local shell_files=("$SHELL_RC_FILE" "$BASH_RC_FILE") - - for shell_file in "${shell_files[@]}"; do - if [[ -f "$shell_file" ]]; then - # 检查是否已经加载了代理配置 - if ! grep -q "source.*proxy.env" "$shell_file"; then - log_info "在 $shell_file 中添加代理配置加载..." - echo "" >> "$shell_file" - echo "# Load proxy configuration if exists" >> "$shell_file" - echo "if [[ -f $PROXY_ENV_FILE ]]; then" >> "$shell_file" - echo " source $PROXY_ENV_FILE" >> "$shell_file" - echo "fi" >> "$shell_file" - fi - fi - done - - # 立即加载配置 - if [[ -f "$PROXY_ENV_FILE" ]]; then - source "$PROXY_ENV_FILE" - fi - - if test_proxy; then - log_success "代理已永久开启" - log_info "配置已保存到: $PROXY_ENV_FILE" - log_info "请重新登录或运行: source ~/.zshrc" - else - log_error "代理永久开启失败" - return 1 - fi -} - -# 永久关闭代理(从配置文件移除) -disable_proxy_permanent() { - log_info "永久关闭代理..." - - # 备份现有配置 - if [[ -f "$PROXY_ENV_FILE" ]]; then - cp "$PROXY_ENV_FILE" "${PROXY_ENV_FILE}.backup.$(date +%Y%m%d_%H%M%S)" - rm -f "$PROXY_ENV_FILE" - fi - - # 从 shell 配置文件中移除代理配置加载 - local shell_files=("$SHELL_RC_FILE" "$BASH_RC_FILE") - - for shell_file in "${shell_files[@]}"; do - if [[ -f "$shell_file" ]]; then - # 移除代理配置加载行 - if grep -q "source.*proxy.env" "$shell_file"; then - log_info "从 $shell_file 中移除代理配置加载..." 
- sed -i '/# Load proxy configuration if exists/,/^fi$/d' "$shell_file" - fi - fi - done - - # 立即清除环境变量 - disable_proxy - - log_success "代理已永久关闭" - log_info "请重新登录或运行: source ~/.zshrc" -} - -# 显示代理状态 -show_status() { - local current_status=$(check_proxy_status) - - echo "" - log_info "=== 代理状态 ===" - - if [[ "$current_status" == "on" ]]; then - log_success "代理状态: 开启" - log_info "代理地址: $PROXY_URL" - - # 显示当前IP - local current_ip=$(curl -s --connect-timeout 5 --proxy "$PROXY_URL" https://httpbin.org/ip | jq -r .origin 2>/dev/null || echo "未知") - log_info "当前IP: $current_ip" - else - log_warning "代理状态: 关闭" - - # 显示当前IP - local current_ip=$(curl -s --connect-timeout 5 https://httpbin.org/ip | jq -r .origin 2>/dev/null || echo "未知") - log_info "当前IP: $current_ip" - fi - - # 检查配置文件状态 - if [[ -f "$PROXY_ENV_FILE" ]]; then - log_info "配置文件: 存在 ($PROXY_ENV_FILE)" - else - log_info "配置文件: 不存在" - fi - - echo "" -} - -# 显示帮助信息 -show_help() { - echo "代理开关脚本 - 管理 istoreos.tailnet-68f9.ts.net:1082 代理" - echo "" - echo "用法: $0 [命令]" - echo "" - echo "命令:" - echo " on - 临时开启代理(仅当前会话)" - echo " off - 临时关闭代理(仅当前会话)" - echo " toggle - 切换代理状态" - echo " enable - 永久开启代理(写入配置文件)" - echo " disable - 永久关闭代理(从配置文件移除)" - echo " status - 显示代理状态" - echo " test - 测试代理连接" - echo " help - 显示此帮助信息" - echo "" - echo "示例:" - echo " $0 on # 临时开启代理" - echo " $0 enable # 永久开启代理" - echo " $0 status # 查看代理状态" - echo " $0 toggle # 切换代理状态" - echo "" -} - -# 主函数 -main() { - case "${1:-help}" in - "on") - enable_proxy - ;; - "off") - disable_proxy - ;; - "toggle") - toggle_proxy - ;; - "enable") - enable_proxy_permanent - ;; - "disable") - disable_proxy_permanent - ;; - "status") - show_status - ;; - "test") - test_proxy - ;; - "help"|*) - show_help - ;; - esac -} - -main "$@" diff --git a/scripts/utilities/quick-start.sh b/scripts/utilities/quick-start.sh deleted file mode 100755 index c6366d7..0000000 --- a/scripts/utilities/quick-start.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/bin/bash -# 快速启动脚本 - -set -e - -echo "🚀 欢迎使用基础设施管理平台!" -echo "" - -# 检查必要工具 -check_tool() { - if ! command -v "$1" &> /dev/null; then - echo "❌ $1 未安装,请先运行 'make setup'" - return 1 - fi -} - -echo "🔍 检查必要工具..." -check_tool "tofu" || exit 1 -check_tool "ansible" || exit 1 -check_tool "docker" || exit 1 - -echo "✅ 工具检查通过" -echo "" - -# 检查配置文件 -CONFIG_FILE="infrastructure/environments/dev/terraform.tfvars" -if [ ! -f "$CONFIG_FILE" ]; then - echo "⚠️ 配置文件不存在,正在创建..." - cp "${CONFIG_FILE}.example" "$CONFIG_FILE" - echo "📝 请编辑配置文件: $CONFIG_FILE" - echo " 填入你的云服务商凭据后再次运行此脚本" - exit 1 -fi - -echo "✅ 配置文件存在" -echo "" - -# 选择操作 -echo "请选择要执行的操作:" -echo "1) 初始化基础设施" -echo "2) 查看执行计划" -echo "3) 应用基础设施变更" -echo "4) 部署应用" -echo "5) 启动开发环境" -echo "6) 查看监控" -echo "7) 完整部署流程" -echo "" - -read -p "请输入选项 (1-7): " choice - -case $choice in - 1) - echo "🏗️ 初始化基础设施..." - make init - ;; - 2) - echo "📋 查看执行计划..." - make plan - ;; - 3) - echo "🚀 应用基础设施变更..." - make apply - ;; - 4) - echo "📦 部署应用..." - make ansible-deploy - ;; - 5) - echo "🐳 启动开发环境..." - make docker-up - ;; - 6) - echo "📊 启动监控..." - make monitor - ;; - 7) - echo "🎯 执行完整部署流程..." - echo "" - echo "步骤 1/4: 初始化基础设施..." - make init - echo "" - echo "步骤 2/4: 查看执行计划..." - make plan - echo "" - read -p "是否继续应用基础设施变更? (y/N): " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - echo "步骤 3/4: 应用基础设施变更..." - make apply - echo "" - echo "步骤 4/4: 部署应用..." - make ansible-deploy - echo "" - echo "🎉 完整部署流程完成!" - else - echo "ℹ️ 部署流程已取消" - fi - ;; - *) - echo "❌ 无效选项" - exit 1 - ;; -esac - -echo "" -echo "🎉 操作完成!" 
-echo "" -echo "📋 有用的命令:" -echo " make help - 查看所有可用命令" -echo " make plan - 查看基础设施变更计划" -echo " make apply - 应用基础设施变更" -echo " make ansible-deploy - 部署应用" -echo " make monitor - 启动监控" -echo " make clean - 清理临时文件" \ No newline at end of file diff --git a/scripts/utilities/simple-nomad-fix.sh b/scripts/utilities/simple-nomad-fix.sh deleted file mode 100755 index f0feaa3..0000000 --- a/scripts/utilities/simple-nomad-fix.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/bin/bash - -echo "=== 简单的 Nomad 集群修复脚本 ===" - -# 定义 Tailscale IP 地址 -SEMAPHORE_IP="100.116.158.95" -MASTER_IP="100.117.106.136" -ASH3C_IP="100.116.80.94" -ENCRYPT_KEY="NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" - -# 创建配置文件函数 -create_config() { - local node_name=$1 - local bind_ip=$2 - - cat > /tmp/nomad-${node_name}.hcl << EOF -datacenter = "dc1" -region = "global" -data_dir = "/opt/nomad/data" - -bind_addr = "${bind_ip}" - -server { - enabled = true - bootstrap_expect = 3 - encrypt = "${ENCRYPT_KEY}" - - server_join { - retry_join = ["${SEMAPHORE_IP}", "${MASTER_IP}", "${ASH3C_IP}"] - } -} - -client { - enabled = true -} - -ui_config { - enabled = true -} - -addresses { - http = "0.0.0.0" - rpc = "${bind_ip}" - serf = "${bind_ip}" -} - -ports { - http = 4646 - rpc = 4647 - serf = 4648 -} - -plugin "docker" { - config { - allow_privileged = true - volumes { - enabled = true - } - } -} - -log_level = "INFO" -log_file = "/var/log/nomad/nomad.log" -EOF -} - -echo "1. 停止所有 Nomad 服务..." -systemctl stop nomad -ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S systemctl stop nomad" -ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S systemctl stop nomad" - -echo "2. 清理数据目录..." -rm -rf /opt/nomad/data/* -ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S rm -rf /opt/nomad/data/*" -ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S rm -rf /opt/nomad/data/*" - -echo "3. 创建新配置文件..." -create_config "semaphore" "${SEMAPHORE_IP}" -create_config "master" "${MASTER_IP}" -create_config "ash3c" "${ASH3C_IP}" - -echo "4. 部署配置文件..." -cp /tmp/nomad-semaphore.hcl /etc/nomad.d/nomad.hcl -chown nomad:nomad /etc/nomad.d/nomad.hcl - -scp -P 60022 -i ~/.ssh/id_ed25519 /tmp/nomad-master.hcl ben@${MASTER_IP}:/tmp/ -ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S cp /tmp/nomad-master.hcl /etc/nomad.d/nomad.hcl && echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl" - -scp -P 22 -i ~/.ssh/id_ed25519 /tmp/nomad-ash3c.hcl ben@${ASH3C_IP}:/tmp/ -ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S cp /tmp/nomad-ash3c.hcl /etc/nomad.d/nomad.hcl && echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl" - -echo "5. 启动服务..." -systemctl start nomad -ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S systemctl start nomad" -ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S systemctl start nomad" - -echo "6. 等待集群形成..." -sleep 30 - -echo "7. 检查集群状态..." 
-nomad server members -nomad node status - -echo "=== 修复完成 ===" \ No newline at end of file diff --git a/scripts/utilities/terraform-consul-provider.sh b/scripts/utilities/terraform-consul-provider.sh deleted file mode 100755 index 511696a..0000000 --- a/scripts/utilities/terraform-consul-provider.sh +++ /dev/null @@ -1,311 +0,0 @@ -#!/bin/bash - -# Terraform Consul Provider 配置脚本 -# 用于配置 Terraform 从 Consul 读取敏感配置 - -set -euo pipefail - -ENVIRONMENT="${ENVIRONMENT:-dev}" -CONSUL_ADDR="${CONSUL_ADDR:-http://localhost:8500}" - -# 颜色输出 -GREEN='\033[0;32m' -BLUE='\033[0;34m' -NC='\033[0m' - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -# 创建 Terraform Consul Provider 配置 -create_consul_provider() { - local tf_dir="infrastructure/environments/${ENVIRONMENT}" - - log_info "创建 Terraform Consul Provider 配置..." - - cat > "${tf_dir}/consul-provider.tf" << 'EOF' -# Consul Provider 配置 -terraform { - required_providers { - consul = { - source = "hashicorp/consul" - version = "~> 2.18" - } - } -} - -provider "consul" { - address = var.consul_config.address - token = lookup(var.consul_config, "token", null) -} - -# 从 Consul 读取 Oracle Cloud 配置 -data "consul_keys" "oracle_config" { - key { - name = "tenancy_ocid" - path = "config/${var.environment}/oracle/tenancy_ocid" - } - - key { - name = "user_ocid" - path = "config/${var.environment}/oracle/user_ocid" - } - - key { - name = "fingerprint" - path = "config/${var.environment}/oracle/fingerprint" - } - - key { - name = "private_key" - path = "config/${var.environment}/oracle/private_key" - } - - key { - name = "compartment_ocid" - path = "config/${var.environment}/oracle/compartment_ocid" - } -} - -# 创建临时私钥文件 -resource "local_file" "oci_private_key" { - content = data.consul_keys.oracle_config.var.private_key - filename = "/tmp/oci_private_key_${var.environment}.pem" - file_permission = "0600" - - lifecycle { - ignore_changes = [content] - } -} - -# 本地变量,用于构建完整的 OCI 配置 -locals { - oci_config_from_consul = { - tenancy_ocid = data.consul_keys.oracle_config.var.tenancy_ocid - user_ocid = data.consul_keys.oracle_config.var.user_ocid - fingerprint = data.consul_keys.oracle_config.var.fingerprint - private_key_path = local_file.oci_private_key.filename - region = var.oci_config.region - compartment_ocid = data.consul_keys.oracle_config.var.compartment_ocid - } -} -EOF - - log_success "Consul Provider 配置已创建: ${tf_dir}/consul-provider.tf" -} - -# 创建变量定义文件 -create_variables() { - local tf_dir="infrastructure/environments/${ENVIRONMENT}" - - log_info "更新 Terraform 变量定义..." 
- - cat > "${tf_dir}/variables.tf" << 'EOF' -# 基本变量 -variable "environment" { - description = "环境名称" - type = string -} - -variable "project_name" { - description = "项目名称" - type = string -} - -variable "owner" { - description = "项目所有者" - type = string -} - -variable "cloud_providers" { - description = "要启用的云服务商" - type = list(string) - default = [] -} - -variable "vpc_cidr" { - description = "VPC CIDR 块" - type = string -} - -variable "availability_zones" { - description = "可用区列表" - type = list(string) -} - -variable "common_tags" { - description = "通用标签" - type = map(string) - default = {} -} - -# Consul 配置 -variable "consul_config" { - description = "Consul 配置" - type = object({ - address = string - token = optional(string) - }) -} - -# Oracle Cloud 配置(基本信息) -variable "oci_config" { - description = "Oracle Cloud 基本配置" - type = object({ - region = string - tenancy_ocid = optional(string, "FROM_CONSUL") - user_ocid = optional(string, "FROM_CONSUL") - fingerprint = optional(string, "FROM_CONSUL") - private_key_path = optional(string, "FROM_CONSUL") - compartment_ocid = optional(string, "FROM_CONSUL") - }) -} - -# 其他云服务商配置 -variable "huawei_config" { - description = "华为云配置" - type = object({ - access_key = string - secret_key = string - region = string - project_id = string - }) - default = { - access_key = "" - secret_key = "" - region = "cn-north-4" - project_id = "" - } -} - -variable "gcp_config" { - description = "Google Cloud 配置" - type = object({ - project_id = string - region = string - zone = string - credentials_file = string - }) - default = { - project_id = "" - region = "asia-northeast3" - zone = "asia-northeast3-a" - credentials_file = "" - } -} - -variable "aws_config" { - description = "AWS 配置" - type = object({ - region = string - access_key = string - secret_key = string - }) - default = { - region = "ap-northeast-2" - access_key = "" - secret_key = "" - } -} - -variable "do_config" { - description = "DigitalOcean 配置" - type = object({ - token = string - region = string - }) - default = { - token = "" - region = "sgp1" - } -} -EOF - - log_success "变量定义已更新: ${tf_dir}/variables.tf" -} - -# 创建示例 main.tf -create_main_tf() { - local tf_dir="infrastructure/environments/${ENVIRONMENT}" - - log_info "创建示例 main.tf..." - - cat > "${tf_dir}/main.tf" << 'EOF' -# 主要 Terraform 配置文件 - -terraform { - required_version = ">= 1.0" - - required_providers { - oci = { - source = "oracle/oci" - version = "~> 5.0" - } - } -} - -# Oracle Cloud Provider -provider "oci" { - tenancy_ocid = local.oci_config_from_consul.tenancy_ocid - user_ocid = local.oci_config_from_consul.user_ocid - fingerprint = local.oci_config_from_consul.fingerprint - private_key_path = local.oci_config_from_consul.private_key_path - region = local.oci_config_from_consul.region -} - -# 示例:创建 VCN -resource "oci_core_vcn" "main" { - count = contains(var.cloud_providers, "oracle") ? 
1 : 0 - compartment_id = local.oci_config_from_consul.compartment_ocid - cidr_block = var.vpc_cidr - display_name = "${var.project_name}-${var.environment}-vcn" - - freeform_tags = var.common_tags -} - -# 输出 -output "vcn_id" { - description = "VCN ID" - value = try(oci_core_vcn.main[0].id, null) -} - -output "oci_config_source" { - description = "OCI 配置来源" - value = "consul" -} -EOF - - log_success "示例 main.tf 已创建: ${tf_dir}/main.tf" -} - -# 主函数 -main() { - case "${1:-help}" in - "setup") - create_consul_provider - create_variables - create_main_tf - ;; - "help"|*) - cat << EOF -Terraform Consul Provider 配置脚本 - -用法: $0 [选项] - -选项: - setup 创建 Terraform Consul Provider 配置 - help 显示此帮助信息 - -环境变量: - ENVIRONMENT 环境名称 (默认: dev) - CONSUL_ADDR Consul 地址 (默认: http://localhost:8500) -EOF - ;; - esac -} - -main "$@" \ No newline at end of file diff --git a/scripts/utilities/tofu-secrets-uploader-simple.sh b/scripts/utilities/tofu-secrets-uploader-simple.sh deleted file mode 100755 index 2c8f722..0000000 --- a/scripts/utilities/tofu-secrets-uploader-simple.sh +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/bash - -# 简化版 OpenTofu 密钥上传脚本 -set -euo pipefail - -# 配置 -CONSUL_ADDR="${CONSUL_ADDR:-http://master:8500}" -ENVIRONMENT="${ENVIRONMENT:-dev}" -TFVARS_FILE="tofu/environments/${ENVIRONMENT}/terraform.tfvars" - -# 颜色输出 -RED='\033[0;31m' -GREEN='\033[0;32m' -BLUE='\033[0;34m' -NC='\033[0m' - -log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } -log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; } -log_error() { echo -e "${RED}[ERROR]${NC} $1"; } - -# 检查 Consul 连接 -check_consul() { - log_info "检查 Consul 连接..." - if ! curl -s "${CONSUL_ADDR}/v1/status/leader" > /dev/null; then - log_error "无法连接到 Consul: ${CONSUL_ADDR}" - exit 1 - fi - log_success "Consul 连接正常" -} - -# 上传配置 -upload_configs() { - local uploaded_count=0 - - log_info "开始解析并上传配置..." - - # 直接解析 tfvars 文件 - while IFS= read -r line; do - # 跳过注释和空行 - if [[ "$line" =~ ^[[:space:]]*# ]] || [[ -z "${line// }" ]]; then - continue - fi - - # 匹配变量赋值 - if [[ "$line" =~ ^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*\"([^\"]*)\"|^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*([^[:space:]#]+) ]]; then - local var_name="${BASH_REMATCH[1]:-${BASH_REMATCH[3]}}" - local var_value="${BASH_REMATCH[2]:-${BASH_REMATCH[4]}}" - - # 跳过空值 - if [[ -z "$var_value" || "$var_value" == "null" ]]; then - continue - fi - - # 确定配置分类和路径 - local consul_path="" - if [[ "$var_name" =~ ^oci_ ]]; then - consul_path="config/${ENVIRONMENT}/oracle/${var_name#oci_}" - elif [[ "$var_name" =~ ^huawei_ ]]; then - consul_path="config/${ENVIRONMENT}/huawei/${var_name#huawei_}" - elif [[ "$var_name" =~ ^aws_ ]]; then - consul_path="config/${ENVIRONMENT}/aws/${var_name#aws_}" - elif [[ "$var_name" =~ ^do_ ]]; then - consul_path="config/${ENVIRONMENT}/digitalocean/${var_name#do_}" - elif [[ "$var_name" =~ ^gcp_ ]]; then - consul_path="config/${ENVIRONMENT}/gcp/${var_name#gcp_}" - else - consul_path="config/${ENVIRONMENT}/general/${var_name}" - fi - - # 上传到 Consul - if curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${consul_path}" -d "$var_value" > /dev/null; then - log_info "上传: ${consul_path}" - ((uploaded_count++)) - else - log_error "上传失败: ${consul_path}" - fi - fi - done < "$TFVARS_FILE" - - log_success "总共上传了 $uploaded_count 个配置项到 Consul" -} - -# 列出配置 -list_configs() { - log_info "列出 Consul 中的配置..." 
- - local keys=$(curl -s "${CONSUL_ADDR}/v1/kv/config/${ENVIRONMENT}/?keys" | jq -r '.[]' 2>/dev/null || echo "") - - if [[ -z "$keys" ]]; then - log_error "没有找到配置" - return - fi - - echo "=== 环境 ${ENVIRONMENT} 的配置 ===" - echo "$keys" | while read -r key; do - local value=$(curl -s "${CONSUL_ADDR}/v1/kv/${key}?raw" 2>/dev/null || echo "无法读取") - # 隐藏敏感信息 - if [[ "$key" =~ (secret|key|token|password|ocid) ]]; then - echo "$key: [已隐藏]" - else - echo "$key: $value" - fi - done -} - -# 主函数 -main() { - if [[ ! -f "$TFVARS_FILE" ]]; then - log_error "找不到配置文件: $TFVARS_FILE" - exit 1 - fi - - check_consul - - case "${1:-upload}" in - "upload") - upload_configs - ;; - "list") - list_configs - ;; - *) - echo "用法: $0 [upload|list]" - ;; - esac -} - -main "$@" \ No newline at end of file diff --git a/scripts/utilities/tofu-secrets-uploader.sh b/scripts/utilities/tofu-secrets-uploader.sh deleted file mode 100755 index 4d2a94d..0000000 --- a/scripts/utilities/tofu-secrets-uploader.sh +++ /dev/null @@ -1,495 +0,0 @@ -#!/bin/bash - -# OpenTofu 密钥上传脚本 -# 用于将 terraform.tfvars 中的敏感配置批量上传到 Consul - -set -euo pipefail - -# 配置 -CONSUL_ADDR="${CONSUL_ADDR:-http://master:8500}" -CONSUL_TOKEN="${CONSUL_TOKEN:-}" -ENVIRONMENT="${ENVIRONMENT:-dev}" -TOFU_DIR="${TOFU_DIR:-tofu/environments/${ENVIRONMENT}}" -TFVARS_FILE="${TFVARS_FILE:-${TOFU_DIR}/terraform.tfvars}" - -# 颜色输出 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 检查依赖 -check_dependencies() { - local deps=("curl" "jq") - for dep in "${deps[@]}"; do - if ! command -v "$dep" &> /dev/null; then - log_error "缺少依赖: $dep" - exit 1 - fi - done -} - -# 检查 Consul 连接 -check_consul() { - log_info "检查 Consul 连接..." - if ! curl -s "${CONSUL_ADDR}/v1/status/leader" > /dev/null; then - log_error "无法连接到 Consul: ${CONSUL_ADDR}" - exit 1 - fi - log_success "Consul 连接正常" -} - -# 检查 tfvars 文件 -check_tfvars_file() { - if [[ ! -f "$TFVARS_FILE" ]]; then - log_error "找不到 terraform.tfvars 文件: $TFVARS_FILE" - exit 1 - fi - log_info "找到配置文件: $TFVARS_FILE" -} - -# 解析 HCL 配置并转换为 JSON -parse_hcl_to_json() { - local tfvars_file="$1" - local temp_tf_file="/tmp/temp_config.tf" - local temp_json_file="/tmp/temp_config.json" - - # 创建临时 .tf 文件,将变量赋值转换为输出 - log_info "解析 HCL 配置..." - - # 读取 tfvars 文件并转换为 output 格式 - cat > "$temp_tf_file" << 'EOF' -# 临时配置文件,用于解析 tfvars -EOF - - # 解析每个配置块 - while IFS= read -r line; do - # 跳过注释和空行 - if [[ "$line" =~ ^[[:space:]]*# ]] || [[ -z "${line// }" ]]; then - continue - fi - - # 提取变量名和值 - if [[ "$line" =~ ^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*(.+)$ ]]; then - local var_name="${BASH_REMATCH[1]}" - local var_value="${BASH_REMATCH[2]}" - - echo "output \"$var_name\" {" >> "$temp_tf_file" - echo " value = $var_value" >> "$temp_tf_file" - echo "}" >> "$temp_tf_file" - fi - done < "$tfvars_file" - - # 使用 terraform 解析配置 - if command -v terraform &> /dev/null; then - cd "$(dirname "$temp_tf_file")" - terraform init -backend=false > /dev/null 2>&1 || true - terraform output -json > "$temp_json_file" 2>/dev/null || { - log_warning "无法使用 terraform 解析,尝试手动解析..." - manual_parse_tfvars "$tfvars_file" "$temp_json_file" - } - else - log_warning "未找到 terraform,使用手动解析..." 
- manual_parse_tfvars "$tfvars_file" "$temp_json_file" - fi - - echo "$temp_json_file" -} - -# 手动解析 tfvars 文件 -manual_parse_tfvars() { - local tfvars_file="$1" - local output_file="$2" - - log_info "手动解析 tfvars 文件..." - - # 创建基础 JSON 结构 - echo "{" > "$output_file" - - local first_item=true - local in_block=false - local block_name="" - local block_content="" - - while IFS= read -r line; do - # 跳过注释和空行 - if [[ "$line" =~ ^[[:space:]]*# ]] || [[ -z "${line// }" ]]; then - continue - fi - - # 检测配置块开始 - if [[ "$line" =~ ^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*\{[[:space:]]*$ ]]; then - block_name="${BASH_REMATCH[1]}" - in_block=true - block_content="" - continue - fi - - # 检测配置块结束 - if [[ "$in_block" == true && "$line" =~ ^[[:space:]]*\}[[:space:]]*$ ]]; then - if [[ "$first_item" == false ]]; then - echo "," >> "$output_file" - fi - echo " \"$block_name\": {" >> "$output_file" - echo "$block_content" >> "$output_file" - echo " }" >> "$output_file" - first_item=false - in_block=false - continue - fi - - # 处理块内容 - if [[ "$in_block" == true ]]; then - if [[ "$line" =~ ^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*\"([^\"]*)\"|^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*([^[:space:]]+) ]]; then - local key="${BASH_REMATCH[1]:-${BASH_REMATCH[3]}}" - local value="${BASH_REMATCH[2]:-${BASH_REMATCH[4]}}" - - if [[ -n "$block_content" ]]; then - block_content+="," - fi - block_content+="\n \"$key\": \"$value\"" - fi - continue - fi - - # 处理简单变量 - if [[ "$line" =~ ^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*\"([^\"]*)\"|^[[:space:]]*([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*([^[:space:]]+) ]]; then - local var_name="${BASH_REMATCH[1]:-${BASH_REMATCH[3]}}" - local var_value="${BASH_REMATCH[2]:-${BASH_REMATCH[4]}}" - - if [[ "$first_item" == false ]]; then - echo "," >> "$output_file" - fi - echo " \"$var_name\": \"$var_value\"" >> "$output_file" - first_item=false - fi - done < "$tfvars_file" - - echo "}" >> "$output_file" -} - -# 上传配置到 Consul -upload_config_to_consul() { - local config_file="$1" - local uploaded_count=0 - - log_info "开始上传配置到 Consul..." - - # 读取 JSON 配置 - if [[ ! -f "$config_file" ]]; then - log_error "配置文件不存在: $config_file" - return 1 - fi - - # 上传 Oracle Cloud 配置 - local oci_tenancy=$(jq -r '.oci_tenancy_ocid // empty' "$config_file") - local oci_user=$(jq -r '.oci_user_ocid // empty' "$config_file") - local oci_fingerprint=$(jq -r '.oci_fingerprint // empty' "$config_file") - local oci_private_key_path=$(jq -r '.oci_private_key_path // empty' "$config_file") - local oci_compartment=$(jq -r '.oci_compartment_ocid // empty' "$config_file") - local oci_region=$(jq -r '.oci_region // empty' "$config_file") - - if [[ -n "$oci_tenancy" && "$oci_tenancy" != "null" && "$oci_tenancy" != "" ]]; then -======= -# 上传配置到 Consul -upload_config_to_consul() { - local config_file="$1" - local uploaded_count=0 - - log_info "开始上传配置到 Consul..." - - # 读取 JSON 配置 - if [[ ! 
-f "$config_file" ]]; then - log_error "配置文件不存在: $config_file" - return 1 - fi - - # 上传 Oracle Cloud 配置 - local oci_tenancy=$(jq -r '.oci_tenancy_ocid // empty' "$config_file") - local oci_user=$(jq -r '.oci_user_ocid // empty' "$config_file") - local oci_fingerprint=$(jq -r '.oci_fingerprint // empty' "$config_file") - local oci_private_key_path=$(jq -r '.oci_private_key_path // empty' "$config_file") - local oci_compartment=$(jq -r '.oci_compartment_ocid // empty' "$config_file") - local oci_region=$(jq -r '.oci_region // empty' "$config_file") - - if [[ -n "$oci_tenancy" && "$oci_tenancy" != "null" && "$oci_tenancy" != "" ]]; then - log_info "上传 Oracle Cloud 配置..." - local base_path="config/${ENVIRONMENT}/oracle" - - local tenancy_ocid=$(jq -r '.oci_config.tenancy_ocid // empty' "$config_file") - local user_ocid=$(jq -r '.oci_config.user_ocid // empty' "$config_file") - local fingerprint=$(jq -r '.oci_config.fingerprint // empty' "$config_file") - local private_key_path=$(jq -r '.oci_config.private_key_path // empty' "$config_file") - local compartment_ocid=$(jq -r '.oci_config.compartment_ocid // empty' "$config_file") - local region=$(jq -r '.oci_config.region // "ap-seoul-1"' "$config_file") - - # 上传非空配置 - [[ -n "$tenancy_ocid" && "$tenancy_ocid" != "null" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/tenancy_ocid" -d "$tenancy_ocid" > /dev/null - ((uploaded_count++)) - } - [[ -n "$user_ocid" && "$user_ocid" != "null" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/user_ocid" -d "$user_ocid" > /dev/null - ((uploaded_count++)) - } - [[ -n "$fingerprint" && "$fingerprint" != "null" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/fingerprint" -d "$fingerprint" > /dev/null - ((uploaded_count++)) - } - [[ -n "$compartment_ocid" && "$compartment_ocid" != "null" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/compartment_ocid" -d "$compartment_ocid" > /dev/null - ((uploaded_count++)) - } - [[ -n "$region" && "$region" != "null" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/region" -d "$region" > /dev/null - ((uploaded_count++)) - } - - # 上传私钥文件内容 - if [[ -n "$private_key_path" && "$private_key_path" != "null" && -f "$private_key_path" ]]; then - local private_key_content=$(cat "$private_key_path") - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/private_key" -d "$private_key_content" > /dev/null - ((uploaded_count++)) - fi - - log_success "Oracle Cloud 配置已上传" - fi - - # 上传华为云配置 - if jq -e '.huawei_config' "$config_file" > /dev/null 2>&1; then - log_info "上传华为云配置..." 
- local base_path="config/${ENVIRONMENT}/huawei" - - local access_key=$(jq -r '.huawei_config.access_key // empty' "$config_file") - local secret_key=$(jq -r '.huawei_config.secret_key // empty' "$config_file") - local region=$(jq -r '.huawei_config.region // "cn-north-4"' "$config_file") - local project_id=$(jq -r '.huawei_config.project_id // empty' "$config_file") - - [[ -n "$access_key" && "$access_key" != "null" && "$access_key" != "" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/access_key" -d "$access_key" > /dev/null - ((uploaded_count++)) - } - [[ -n "$secret_key" && "$secret_key" != "null" && "$secret_key" != "" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/secret_key" -d "$secret_key" > /dev/null - ((uploaded_count++)) - } - [[ -n "$region" && "$region" != "null" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/region" -d "$region" > /dev/null - ((uploaded_count++)) - } - [[ -n "$project_id" && "$project_id" != "null" && "$project_id" != "" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/project_id" -d "$project_id" > /dev/null - ((uploaded_count++)) - } - - log_success "华为云配置已上传" - fi - - # 上传 AWS 配置 - if jq -e '.aws_config' "$config_file" > /dev/null 2>&1; then - log_info "上传 AWS 配置..." - local base_path="config/${ENVIRONMENT}/aws" - - local access_key=$(jq -r '.aws_config.access_key // empty' "$config_file") - local secret_key=$(jq -r '.aws_config.secret_key // empty' "$config_file") - local region=$(jq -r '.aws_config.region // "ap-northeast-2"' "$config_file") - - [[ -n "$access_key" && "$access_key" != "null" && "$access_key" != "" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/access_key" -d "$access_key" > /dev/null - ((uploaded_count++)) - } - [[ -n "$secret_key" && "$secret_key" != "null" && "$secret_key" != "" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/secret_key" -d "$secret_key" > /dev/null - ((uploaded_count++)) - } - [[ -n "$region" && "$region" != "null" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/region" -d "$region" > /dev/null - ((uploaded_count++)) - } - - log_success "AWS 配置已上传" - fi - - # 上传 DigitalOcean 配置 - if jq -e '.do_config' "$config_file" > /dev/null 2>&1; then - log_info "上传 DigitalOcean 配置..." - local base_path="config/${ENVIRONMENT}/digitalocean" - - local token=$(jq -r '.do_config.token // empty' "$config_file") - local region=$(jq -r '.do_config.region // "sgp1"' "$config_file") - - [[ -n "$token" && "$token" != "null" && "$token" != "" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/token" -d "$token" > /dev/null - ((uploaded_count++)) - } - [[ -n "$region" && "$region" != "null" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/region" -d "$region" > /dev/null - ((uploaded_count++)) - } - - log_success "DigitalOcean 配置已上传" - fi - - # 上传 Google Cloud 配置 - if jq -e '.gcp_config' "$config_file" > /dev/null 2>&1; then - log_info "上传 Google Cloud 配置..." 
- local base_path="config/${ENVIRONMENT}/gcp" - - local project_id=$(jq -r '.gcp_config.project_id // empty' "$config_file") - local region=$(jq -r '.gcp_config.region // "asia-northeast3"' "$config_file") - local zone=$(jq -r '.gcp_config.zone // "asia-northeast3-a"' "$config_file") - local credentials_file=$(jq -r '.gcp_config.credentials_file // empty' "$config_file") - - [[ -n "$project_id" && "$project_id" != "null" && "$project_id" != "" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/project_id" -d "$project_id" > /dev/null - ((uploaded_count++)) - } - [[ -n "$region" && "$region" != "null" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/region" -d "$region" > /dev/null - ((uploaded_count++)) - } - [[ -n "$zone" && "$zone" != "null" ]] && { - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/zone" -d "$zone" > /dev/null - ((uploaded_count++)) - } - - # 上传凭证文件内容 - if [[ -n "$credentials_file" && "$credentials_file" != "null" && -f "$credentials_file" ]]; then - local credentials_content=$(cat "$credentials_file") - curl -s -X PUT "${CONSUL_ADDR}/v1/kv/${base_path}/credentials" -d "$credentials_content" > /dev/null - ((uploaded_count++)) - fi - - log_success "Google Cloud 配置已上传" - fi - - log_success "总共上传了 $uploaded_count 个配置项到 Consul" -} - -# 列出 Consul 中的配置 -list_consul_configs() { - log_info "列出 Consul 中的配置..." - - local base_path="config/${ENVIRONMENT}" - - echo "=== Consul 中的配置 ===" - - # 获取所有配置键 - local keys=$(curl -s "${CONSUL_ADDR}/v1/kv/${base_path}/?keys" | jq -r '.[]' 2>/dev/null || echo "") - - if [[ -z "$keys" ]]; then - log_warning "Consul 中没有找到配置" - return - fi - - echo "$keys" | while read -r key; do - local value=$(curl -s "${CONSUL_ADDR}/v1/kv/${key}?raw" 2>/dev/null || echo "无法读取") - # 隐藏敏感信息 - if [[ "$key" =~ (secret|key|token|password) ]]; then - echo "$key: [已隐藏]" - else - echo "$key: $value" - fi - done -} - -# 清理 Consul 配置 -cleanup_consul_configs() { - log_warning "清理 Consul 配置..." 
- - read -p "确定要删除环境 '$ENVIRONMENT' 的所有配置吗?(y/N): " confirm - if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then - log_info "操作已取消" - return - fi - - local base_path="config/${ENVIRONMENT}" - curl -s -X DELETE "${CONSUL_ADDR}/v1/kv/${base_path}?recurse" > /dev/null - - log_success "环境 '$ENVIRONMENT' 的配置已清理" -} - -# 显示帮助信息 -show_help() { - cat << EOF -OpenTofu 密钥上传脚本 - -用法: $0 [选项] - -选项: - upload 上传 terraform.tfvars 中的配置到 Consul - list 列出 Consul 中的配置 - cleanup 清理 Consul 中的配置 - help 显示此帮助信息 - -环境变量: - CONSUL_ADDR Consul 地址 (默认: http://localhost:8500) - CONSUL_TOKEN Consul ACL Token (可选) - ENVIRONMENT 环境名称 (默认: dev) - TOFU_DIR OpenTofu 目录 (默认: tofu/environments/\${ENVIRONMENT}) - TFVARS_FILE 变量文件路径 (默认: \${TOFU_DIR}/terraform.tfvars) - -示例: - # 上传配置到 Consul - $0 upload - - # 列出 Consul 中的配置 - $0 list - - # 清理配置 - $0 cleanup - - # 指定不同环境 - ENVIRONMENT=production $0 upload -EOF -} - -# 主函数 -main() { - check_dependencies - - case "${1:-help}" in - "upload") - check_consul - check_tfvars_file - - log_info "解析配置文件: $TFVARS_FILE" - local config_json=$(manual_parse_tfvars "$TFVARS_FILE" "/tmp/parsed_config.json") - upload_config_to_consul "/tmp/parsed_config.json" - - # 清理临时文件 - rm -f /tmp/parsed_config.json /tmp/temp_config.tf - ;; - "list") - check_consul - list_consul_configs - ;; - "cleanup") - check_consul - cleanup_consul_configs - ;; - "help"|*) - show_help - ;; - esac -} - -main "$@" \ No newline at end of file diff --git a/scripts/utilities/verify-podman-migration.sh b/scripts/utilities/verify-podman-migration.sh new file mode 100755 index 0000000..391be9a --- /dev/null +++ b/scripts/utilities/verify-podman-migration.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +echo "=== Nomad Podman Migration Verification ===" +echo + +# Check Nomad service status +echo "1. Checking Nomad service status..." +ssh ben@100.84.197.26 "sudo systemctl status nomad --no-pager -l" +echo + +# Check Nomad configuration +echo "2. Checking Nomad configuration..." +ssh ben@100.84.197.26 "sudo cat /etc/nomad.d/nomad.hcl | grep -A 10 -B 2 podman" +echo + +# Check Podman socket +echo "3. Checking Podman socket..." +ssh ben@100.84.197.26 "ls -la /run/user/*/podman/podman.sock 2>/dev/null || echo 'Podman socket not found'" +echo + +# Check Nomad node status +echo "4. Checking Nomad node status..." +ssh ben@100.84.197.26 "sudo -u nomad /usr/local/bin/nomad node status -self | grep -A 10 'Driver Status'" 2>/dev/null || echo "Could not get node status" +echo + +# Test Podman functionality +echo "5. Testing Podman as nomad user..." 
+ssh ben@100.84.197.26 "sudo -u nomad podman version --format '{{.Version}}'" 2>/dev/null || echo "Podman test failed" +echo + +echo "=== Verification Complete ===" \ No newline at end of file diff --git a/swarm/configs/traefik-consul-setup.yml b/swarm/configs/traefik-consul-setup.yml deleted file mode 100644 index 866bf9e..0000000 --- a/swarm/configs/traefik-consul-setup.yml +++ /dev/null @@ -1,138 +0,0 @@ -version: '3.8' - -services: - # Traefik 负载均衡器 - traefik: - image: traefik:v3.0 - container_name: traefik - restart: unless-stopped - ports: - - "80:80" - - "443:443" - - "8080:8080" # Traefik Dashboard - volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro - - ./traefik.yml:/etc/traefik/traefik.yml:ro - - ./certs:/certs:ro - environment: - - CONSUL_ENDPOINTS=consul1:8500,consul2:8500,consul3:8500 - depends_on: - - consul1 - - consul2 - - consul3 - networks: - - traefik-net - labels: - - "traefik.enable=true" - - "traefik.http.routers.dashboard.rule=Host(`traefik.local`)" - - "traefik.http.routers.dashboard.service=api@internal" - - # Consul 集群节点 1 - consul1: - image: consul:1.16.1 - container_name: consul1 - hostname: consul1 - restart: unless-stopped - ports: - - "8500:8500" - volumes: - - consul1_data:/consul/data - command: > - consul agent -server -bootstrap-expect=3 - -datacenter=dc1 -data-dir=/consul/data - -node=consul1 -bind=0.0.0.0 -client=0.0.0.0 - -retry-join=consul2 -retry-join=consul3 - -ui-config='{"enabled": true}' - -log-level=INFO - networks: - - traefik-net - labels: - - "traefik.enable=true" - - "traefik.http.routers.consul.rule=Host(`consul.local`)" - - "traefik.http.services.consul.loadbalancer.server.port=8500" - - # Consul 集群节点 2 - consul2: - image: consul:1.16.1 - container_name: consul2 - hostname: consul2 - restart: unless-stopped - volumes: - - consul2_data:/consul/data - command: > - consul agent -server -bootstrap-expect=3 - -datacenter=dc1 -data-dir=/consul/data - -node=consul2 -bind=0.0.0.0 -client=0.0.0.0 - -retry-join=consul1 -retry-join=consul3 - -log-level=INFO - networks: - - traefik-net - - # Consul 集群节点 3 - consul3: - image: consul:1.16.1 - container_name: consul3 - hostname: consul3 - restart: unless-stopped - volumes: - - consul3_data:/consul/data - command: > - consul agent -server -bootstrap-expect=3 - -datacenter=dc1 -data-dir=/consul/data - -node=consul3 -bind=0.0.0.0 -client=0.0.0.0 - -retry-join=consul1 -retry-join=consul2 - -log-level=INFO - networks: - - traefik-net - - # 示例 Web 应用 - web-app: - image: nginx:alpine - container_name: web-app - restart: unless-stopped - volumes: - - ./web-content:/usr/share/nginx/html:ro - environment: - - CONSUL_URL=http://consul1:8500 - networks: - - traefik-net - labels: - - "traefik.enable=true" - - "traefik.http.routers.web.rule=Host(`app.local`)" - - "traefik.http.services.web.loadbalancer.server.port=80" - - "traefik.http.routers.web.middlewares=web-auth" - - "traefik.http.middlewares.web-auth.basicauth.users=admin:$$2y$$10$$..." 
- - # 示例 API 应用 - api-app: - image: node:18-alpine - container_name: api-app - restart: unless-stopped - working_dir: /app - volumes: - - ./api:/app - command: ["node", "server.js"] - environment: - - CONSUL_URL=http://consul1:8500 - - NODE_ENV=production - networks: - - traefik-net - labels: - - "traefik.enable=true" - - "traefik.http.routers.api.rule=Host(`api.local`) && PathPrefix(`/api`)" - - "traefik.http.services.api.loadbalancer.server.port=3000" - - "traefik.http.routers.api.middlewares=api-cors" - - "traefik.http.middlewares.api-cors.headers.accesscontrolallowmethods=GET,POST,PUT,DELETE" - - "traefik.http.middlewares.api-cors.headers.accesscontrolalloworigin=*" - -volumes: - consul1_data: - consul2_data: - consul3_data: - -networks: - traefik-net: - driver: bridge - ipam: - config: - - subnet: 172.20.0.0/16 \ No newline at end of file diff --git a/swarm/configs/traefik.yml b/swarm/configs/traefik.yml deleted file mode 100644 index f3a9f92..0000000 --- a/swarm/configs/traefik.yml +++ /dev/null @@ -1,60 +0,0 @@ -# Traefik 配置文件 -api: - dashboard: true - insecure: true # 仅开发环境,生产环境请使用 HTTPS - -# 入口点配置 -entryPoints: - web: - address: ":80" - websecure: - address: ":443" - -# 提供者配置 -providers: - # Docker 标签发现 - docker: - endpoint: "unix:///var/run/docker.sock" - exposedByDefault: false - watch: true - - # Consul 服务发现 - consul: - endpoints: - - "consul1:8500" - - "consul2:8500" - - "consul3:8500" - watch: true - - # 文件配置提供者 - file: - filename: /etc/traefik/dynamic.yml - watch: true - -# 证书解析器(Let's Encrypt) -certificatesResolvers: - letsencrypt: - acme: - email: admin@example.com - storage: /certs/acme.json - httpChallenge: - entryPoint: web - -# 日志配置 -log: - level: INFO - filePath: "/var/log/traefik.log" - -accessLog: - filePath: "/var/log/access.log" - -# 指标配置 -metrics: - prometheus: - addEntryPointsLabels: true - addServicesLabels: true - -# 全局配置 -global: - checkNewVersion: false - sendAnonymousUsage: false \ No newline at end of file diff --git a/swarm/scripts/swarm-manager.sh b/swarm/scripts/swarm-manager.sh deleted file mode 100755 index 573ba98..0000000 --- a/swarm/scripts/swarm-manager.sh +++ /dev/null @@ -1,184 +0,0 @@ -#!/bin/bash - -# Docker Swarm 管理脚本 -set -euo pipefail - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 检查是否在 Swarm 模式 -check_swarm_mode() { - if docker info --format '{{.Swarm.LocalNodeState}}' | grep -q "active"; then - log_success "Docker Swarm 模式已激活" - return 0 - else - log_error "Docker Swarm 模式未激活" - return 1 - fi -} - -# 初始化 Swarm -init_swarm() { - log_info "初始化 Docker Swarm..." - - if docker swarm init; then - log_success "Docker Swarm 初始化成功" - log_info "要添加工作节点,请在其他主机上运行:" - docker swarm join-token worker - else - log_error "Docker Swarm 初始化失败" - return 1 - fi -} - -# 部署堆栈 -deploy_stack() { - local stack_name="$1" - local compose_file="$2" - - log_info "部署堆栈: $stack_name" - - if [[ ! 
-f "$compose_file" ]]; then - log_error "Compose 文件不存在: $compose_file" - return 1 - fi - - if docker stack deploy -c "$compose_file" "$stack_name"; then - log_success "堆栈 $stack_name 部署成功" - else - log_error "堆栈 $stack_name 部署失败" - return 1 - fi -} - -# 列出堆栈 -list_stacks() { - log_info "当前部署的堆栈:" - docker stack ls -} - -# 查看堆栈服务 -show_stack_services() { - local stack_name="$1" - - log_info "堆栈 $stack_name 的服务:" - docker stack services "$stack_name" -} - -# 删除堆栈 -remove_stack() { - local stack_name="$1" - - log_info "删除堆栈: $stack_name" - - if docker stack rm "$stack_name"; then - log_success "堆栈 $stack_name 删除成功" - else - log_error "堆栈 $stack_name 删除失败" - return 1 - fi -} - -# 显示节点信息 -show_nodes() { - log_info "Swarm 节点信息:" - docker node ls -} - -# 显示帮助信息 -show_help() { - echo "Docker Swarm 管理脚本" - echo "" - echo "用法: $0 [命令] [参数]" - echo "" - echo "命令:" - echo " init - 初始化 Docker Swarm" - echo " deploy - 部署堆栈" - echo " list - 列出所有堆栈" - echo " services - 查看堆栈服务" - echo " remove - 删除堆栈" - echo " nodes - 显示节点信息" - echo " check - 检查 Swarm 状态" - echo " help - 显示此帮助信息" - echo "" - echo "示例:" - echo " $0 init # 初始化 Swarm" - echo " $0 deploy traefik stacks/traefik-swarm-stack.yml" - echo " $0 deploy demo stacks/demo-services-stack.yml" - echo " $0 list # 列出堆栈" - echo " $0 services traefik # 查看 traefik 堆栈服务" -} - -# 主函数 -main() { - case "${1:-help}" in - "init") - init_swarm - ;; - "deploy") - if [[ $# -lt 3 ]]; then - log_error "部署命令需要堆栈名称和 compose 文件" - echo "用法: $0 deploy " - exit 1 - fi - check_swarm_mode || exit 1 - deploy_stack "$2" "$3" - ;; - "list") - check_swarm_mode || exit 1 - list_stacks - ;; - "services") - if [[ $# -lt 2 ]]; then - log_error "需要指定堆栈名称" - echo "用法: $0 services " - exit 1 - fi - check_swarm_mode || exit 1 - show_stack_services "$2" - ;; - "remove") - if [[ $# -lt 2 ]]; then - log_error "需要指定堆栈名称" - echo "用法: $0 remove " - exit 1 - fi - check_swarm_mode || exit 1 - remove_stack "$2" - ;; - "nodes") - check_swarm_mode || exit 1 - show_nodes - ;; - "check") - check_swarm_mode - ;; - "help"|*) - show_help - ;; - esac -} - -# 运行主函数 -main "$@" \ No newline at end of file diff --git a/swarm/stacks/consul-ash3c-stack.yml b/swarm/stacks/consul-ash3c-stack.yml deleted file mode 100644 index 2ac88e3..0000000 --- a/swarm/stacks/consul-ash3c-stack.yml +++ /dev/null @@ -1,41 +0,0 @@ -version: '3.8' - -services: - consul: - image: consul:latest - hostname: consul-ash3c - command: > - sh -c " - IP=$$(hostname -i | awk '{print $$1}'); - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-ash3c -bind=$$IP -advertise=100.116.80.94 -client=0.0.0.0 - -retry-join=100.117.106.136 - -ui - -log-level=INFO - " - ports: - - "8500:8500" - - "8600:8600/udp" - volumes: - - consul_data:/consul/data - networks: - - consul-net - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == ash3c - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - -volumes: - consul_data: - -networks: - consul-net: - driver: overlay - attachable: true \ No newline at end of file diff --git a/swarm/stacks/consul-cluster-fixed.yml b/swarm/stacks/consul-cluster-fixed.yml deleted file mode 100644 index 0fea038..0000000 --- a/swarm/stacks/consul-cluster-fixed.yml +++ /dev/null @@ -1,76 +0,0 @@ -version: '3.8' - -services: - consul-master: - image: consul:latest - hostname: consul-master - command: > - sh -c " - IP=$$(hostname -i | awk '{print $$1}'); - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 
-data-dir=/consul/data - -node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0 - -ui - -log-level=INFO - " - ports: - - "8500:8500" - - "8600:8600/udp" - volumes: - - consul_master_data:/consul/data - networks: - consul-net: - aliases: - - consul-master - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == master - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - - consul-ash3c: - image: consul:latest - hostname: consul-ash3c - command: > - sh -c " - IP=$$(hostname -i | awk '{print $$1}'); - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-ash3c -bind=$$IP -advertise=$$IP -client=0.0.0.0 - -retry-join=consul-master - -ui - -log-level=INFO - " - ports: - - "8501:8500" - - "8601:8600/udp" - volumes: - - consul_ash3c_data:/consul/data - networks: - consul-net: - aliases: - - consul-ash3c - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == ash3c - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - -volumes: - consul_master_data: - consul_ash3c_data: - -networks: - consul-net: - driver: overlay - attachable: true \ No newline at end of file diff --git a/swarm/stacks/consul-cluster-host-network.yml b/swarm/stacks/consul-cluster-host-network.yml deleted file mode 100644 index 300bddd..0000000 --- a/swarm/stacks/consul-cluster-host-network.yml +++ /dev/null @@ -1,68 +0,0 @@ -version: '3.8' - -services: - consul-master: - image: consul:latest - hostname: consul-master - command: > - sh -c " - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-master -bind=100.117.106.136 -advertise=100.117.106.136 -client=0.0.0.0 - -ui - -log-level=INFO - " - ports: - - "8500:8500" - - "8600:8600/udp" - - "8301:8301" - - "8302:8302" - volumes: - - consul_master_data:/consul/data - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == master - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - - consul-ash3c: - image: consul:latest - hostname: consul-ash3c - command: > - sh -c " - ASH3C_IP=$$(getent hosts ash3c | awk '{print $$1}'); - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-ash3c -bind=$$ASH3C_IP -advertise=$$ASH3C_IP -client=0.0.0.0 - -retry-join=100.117.106.136 - -ui - -log-level=INFO - " - ports: - - "8501:8500" - - "8601:8600/udp" - - "8311:8301" - - "8312:8302" - volumes: - - consul_ash3c_data:/consul/data - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == ash3c - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - depends_on: - - consul-master - -volumes: - consul_master_data: - consul_ash3c_data: \ No newline at end of file diff --git a/swarm/stacks/consul-cluster-ip-based.yml b/swarm/stacks/consul-cluster-ip-based.yml deleted file mode 100644 index 56a86f0..0000000 --- a/swarm/stacks/consul-cluster-ip-based.yml +++ /dev/null @@ -1,78 +0,0 @@ -version: '3.8' - -services: - consul-master: - image: consul:latest - hostname: consul-master - command: > - sh -c " - IP=$$(hostname -i | awk '{print $$1}'); - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0 - -ui - -log-level=INFO - " - ports: - - "8500:8500" - - "8600:8600/udp" - volumes: - - consul_master_data:/consul/data - networks: - consul-net: - aliases: - - consul-master 
- deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == master - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - - consul-ash3c: - image: consul:latest - hostname: consul-ash3c - command: > - sh -c " - IP=$$(hostname -i | awk '{print $$1}'); - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-ash3c -bind=$$IP -advertise=$$IP -client=0.0.0.0 - -retry-join=10.0.5.5 - -ui - -log-level=INFO - " - ports: - - "8501:8500" - - "8601:8600/udp" - volumes: - - consul_ash3c_data:/consul/data - networks: - consul-net: - aliases: - - consul-ash3c - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == ash3c - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - depends_on: - - consul-master - -volumes: - consul_master_data: - consul_ash3c_data: - -networks: - consul-net: - driver: overlay - attachable: true \ No newline at end of file diff --git a/swarm/stacks/consul-cluster-macvlan.yml b/swarm/stacks/consul-cluster-macvlan.yml deleted file mode 100644 index aa8e2a4..0000000 --- a/swarm/stacks/consul-cluster-macvlan.yml +++ /dev/null @@ -1,78 +0,0 @@ -version: '3.8' - -services: - consul-master: - image: consul:latest - hostname: consul-master - command: > - sh -c " - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-master -bind=192.168.1.100 -advertise=192.168.1.100 -client=0.0.0.0 - -ui - -log-level=INFO - " - ports: - - "8500:8500" - - "8600:8600/udp" - volumes: - - consul_master_data:/consul/data - networks: - consul-macvlan: - ipv4_address: 192.168.1.100 - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == master - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - - consul-ash3c: - image: consul:latest - hostname: consul-ash3c - command: > - sh -c " - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-ash3c -bind=192.168.1.101 -advertise=192.168.1.101 -client=0.0.0.0 - -retry-join=192.168.1.100 - -ui - -log-level=INFO - " - ports: - - "8501:8500" - - "8601:8600/udp" - volumes: - - consul_ash3c_data:/consul/data - networks: - consul-macvlan: - ipv4_address: 192.168.1.101 - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == ash3c - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - -volumes: - consul_master_data: - consul_ash3c_data: - -networks: - consul-macvlan: - driver: macvlan - driver_opts: - parent: eth0 # 根据你的网络接口调整 - ipam: - config: - - subnet: 192.168.1.0/24 - gateway: 192.168.1.1 - ip_range: 192.168.1.100/30 # 只分配 .100-.103 的IP \ No newline at end of file diff --git a/swarm/stacks/consul-cluster-stack.yml b/swarm/stacks/consul-cluster-stack.yml deleted file mode 100644 index d34548f..0000000 --- a/swarm/stacks/consul-cluster-stack.yml +++ /dev/null @@ -1,76 +0,0 @@ -version: '3.8' - -services: - consul-master: - image: consul:latest - hostname: consul-master - command: > - sh -c " - IP=$$(hostname -i | awk '{print $$1}'); - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0 - -ui - -log-level=INFO - " - ports: - - "8500:8500" - - "8600:8600/udp" - volumes: - - consul_master_data:/consul/data - networks: - consul-net: - aliases: - - consul-master - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - 
node.hostname == master - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - - consul-ash3c: - image: consul:latest - hostname: consul-ash3c - command: > - sh -c " - IP=$$(hostname -i | awk '{print $$1}'); - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-ash3c -bind=$$IP -advertise=$$IP -client=0.0.0.0 - -retry-join=consul-cluster_consul-master - -ui - -log-level=INFO - " - ports: - - "8501:8500" - - "8601:8600/udp" - volumes: - - consul_ash3c_data:/consul/data - networks: - consul-net: - aliases: - - consul-ash3c - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == ash3c - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - -volumes: - consul_master_data: - consul_ash3c_data: - -networks: - consul-net: - driver: overlay - attachable: true \ No newline at end of file diff --git a/swarm/stacks/consul-master-stack.yml b/swarm/stacks/consul-master-stack.yml deleted file mode 100644 index eadfa92..0000000 --- a/swarm/stacks/consul-master-stack.yml +++ /dev/null @@ -1,40 +0,0 @@ -version: '3.8' - -services: - consul: - image: consul:latest - hostname: consul-master - command: > - sh -c " - IP=$$(hostname -i | awk '{print $$1}'); - consul agent -server -bootstrap-expect=2 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-master -bind=$$IP -advertise=100.117.106.136 -client=0.0.0.0 - -ui - -log-level=INFO - " - ports: - - "8500:8500" - - "8600:8600/udp" - volumes: - - consul_data:/consul/data - networks: - - consul-net - deploy: - mode: replicated - replicas: 1 - placement: - constraints: - - node.hostname == master - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - -volumes: - consul_data: - -networks: - consul-net: - driver: overlay - attachable: true \ No newline at end of file diff --git a/swarm/stacks/consul-simple-stack.yml b/swarm/stacks/consul-simple-stack.yml deleted file mode 100644 index cb09f5a..0000000 --- a/swarm/stacks/consul-simple-stack.yml +++ /dev/null @@ -1,39 +0,0 @@ -version: '3.8' - -services: - consul: - image: consul:latest - hostname: consul - command: > - consul agent -server -bootstrap-expect=1 - -datacenter=dc1 -data-dir=/consul/data - -node=consul -client=0.0.0.0 - -ui - -log-level=INFO - ports: - - "8500:8500" - - "8600:8600/udp" - volumes: - - consul_data:/consul/data - networks: - - consul-net - deploy: - mode: replicated - replicas: 1 - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - labels: - - "traefik.enable=true" - - "traefik.http.routers.consul.rule=Host(`consul.local`)" - - "traefik.http.services.consul.loadbalancer.server.port=8500" - - "traefik.docker.network=consul-net" - -volumes: - consul_data: - -networks: - consul-net: - driver: overlay - attachable: true \ No newline at end of file diff --git a/swarm/stacks/consul-single-node.yml b/swarm/stacks/consul-single-node.yml deleted file mode 100644 index 379d78d..0000000 --- a/swarm/stacks/consul-single-node.yml +++ /dev/null @@ -1,40 +0,0 @@ -version: '3.8' - -services: - consul: - image: consul:latest - hostname: consul-master - command: > - sh -c " - IP=$$(hostname -i | awk '{print $$1}'); - consul agent -server -bootstrap-expect=1 - -datacenter=dc1 -data-dir=/consul/data - -node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0 - -ui - -log-level=INFO - " - ports: - - "8500:8500" - - "8600:8600/udp" - volumes: - - consul_data:/consul/data - networks: - - consul-net - deploy: - mode: replicated - replicas: 1 - 
placement: - constraints: - - node.hostname == master - restart_policy: - condition: on-failure - delay: 5s - max_attempts: 3 - -volumes: - consul_data: - -networks: - consul-net: - driver: overlay - attachable: true \ No newline at end of file diff --git a/swarm/stacks/demo-services-stack.yml b/swarm/stacks/demo-services-stack.yml deleted file mode 100644 index d4af571..0000000 --- a/swarm/stacks/demo-services-stack.yml +++ /dev/null @@ -1,166 +0,0 @@ -version: '3.8' - -services: - # Web 应用示例 - webapp: - image: nginx:alpine - networks: - - traefik-public - configs: - - source: webapp-html - target: /usr/share/nginx/html/index.html - deploy: - replicas: 2 - labels: - - traefik.enable=true - - traefik.http.routers.webapp.rule=Host(`app.local`) - - traefik.http.routers.webapp.entrypoints=web - - traefik.http.services.webapp.loadbalancer.server.port=80 - update_config: - parallelism: 1 - delay: 10s - restart_policy: - condition: on-failure - - # API 服务示例 - api: - image: httpd:alpine - networks: - - traefik-public - configs: - - source: api-html - target: /usr/local/apache2/htdocs/index.html - deploy: - replicas: 2 - labels: - - traefik.enable=true - - traefik.http.routers.api.rule=Host(`api.local`) - - traefik.http.routers.api.entrypoints=web - - traefik.http.services.api.loadbalancer.server.port=80 - # 添加路径前缀 - - traefik.http.routers.api-path.rule=Host(`app.local`) && PathPrefix(`/api`) - - traefik.http.routers.api-path.entrypoints=web - - traefik.http.routers.api-path.service=api - update_config: - parallelism: 1 - delay: 10s - restart_policy: - condition: on-failure - - # 监控服务示例 - monitor: - image: nginx:alpine - networks: - - traefik-public - configs: - - source: monitor-html - target: /usr/share/nginx/html/index.html - deploy: - replicas: 1 - labels: - - traefik.enable=true - - traefik.http.routers.monitor.rule=Host(`monitor.local`) - - traefik.http.routers.monitor.entrypoints=web - - traefik.http.services.monitor.loadbalancer.server.port=80 - # 添加基本认证 (可选) - - traefik.http.routers.monitor.middlewares=auth - - traefik.http.middlewares.auth.basicauth.users=admin:$$2y$$10$$DLKjKQKQKQKQKQKQKQKQKe - restart_policy: - condition: on-failure - -networks: - traefik-public: - external: true - -configs: - webapp-html: - content: | - - - - Web App - Traefik Swarm Demo - - - -
-        🚀 Web Application
-        服务: webapp
-        访问地址: http://app.local
-        负载均衡: Traefik + Docker Swarm
-        时间:
-        这是通过 Traefik 路由的 Web 应用示例。
-
-  api-html:
-    content: |
-      API Service - Traefik Swarm Demo
-        🔌 API Service
-        服务: api
-        访问地址: http://api.local
-        路径路由: http://app.local/api
-        负载均衡: Traefik + Docker Swarm
-        时间:
-        这是通过 Traefik 路由的 API 服务示例。
-
-  monitor-html:
-    content: |
-      Monitor Service - Traefik Swarm Demo
-        📊 Monitor Service
-        服务: monitor
-        访问地址: http://monitor.local
-        认证: 基本认证保护
-        负载均衡: Traefik + Docker Swarm
-        时间:
-        这是通过 Traefik 路由的监控服务示例。
- - - \ No newline at end of file diff --git a/swarm/stacks/traefik-swarm-stack.yml b/swarm/stacks/traefik-swarm-stack.yml deleted file mode 100644 index e432508..0000000 --- a/swarm/stacks/traefik-swarm-stack.yml +++ /dev/null @@ -1,70 +0,0 @@ -version: '3.8' - -services: - traefik: - image: traefik:v3.0 - command: - # API 和 Dashboard - - --api.dashboard=true - - --api.insecure=true - - # 入口点 - - --entrypoints.web.address=:80 - - --entrypoints.websecure.address=:443 - - # Docker Swarm Provider - - --providers.swarm=true - - --providers.swarm.endpoint=unix:///var/run/docker.sock - - --providers.swarm.exposedByDefault=false - - --providers.swarm.network=traefik-public - - # 日志 - - --log.level=INFO - - --accesslog=true - - # 指标 - - --metrics.prometheus=true - - --metrics.prometheus.addEntryPointsLabels=true - - --metrics.prometheus.addServicesLabels=true - - # 证书解析器 (可选) - - --certificatesresolvers.letsencrypt.acme.httpchallenge=true - - --certificatesresolvers.letsencrypt.acme.httpchallenge.entrypoint=web - - --certificatesresolvers.letsencrypt.acme.email=admin@example.com - - --certificatesresolvers.letsencrypt.acme.storage=/certificates/acme.json - - ports: - - "80:80" - - "443:443" - - "8080:8080" # Dashboard - - volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro - - traefik-certificates:/certificates - - networks: - - traefik-public - - deploy: - mode: global - placement: - constraints: - - node.role == manager - labels: - # Traefik Dashboard 路由 - - traefik.enable=true - - traefik.http.routers.traefik-dashboard.rule=Host(`traefik.local`) - - traefik.http.routers.traefik-dashboard.service=api@internal - - traefik.http.services.traefik-dashboard.loadbalancer.server.port=8080 - update_config: - parallelism: 1 - delay: 10s - restart_policy: - condition: on-failure - -networks: - traefik-public: - external: true - -volumes: - traefik-certificates: \ No newline at end of file diff --git a/tofu/modules/nomad-cluster/templates/nomad-userdata.sh b/tofu/modules/nomad-cluster/templates/nomad-userdata.sh index f0519b3..0f6477f 100644 --- a/tofu/modules/nomad-cluster/templates/nomad-userdata.sh +++ b/tofu/modules/nomad-cluster/templates/nomad-userdata.sh @@ -23,17 +23,16 @@ apt-get install -y \ wget \ unzip \ jq \ - docker.io \ - docker-compose \ + podman \ htop \ net-tools \ vim -# 启动 Docker -log "启动 Docker 服务..." -systemctl enable docker -systemctl start docker -usermod -aG docker ubuntu +# 启动 Podman +log "启动 Podman 服务..." +systemctl enable podman +systemctl start podman +usermod -aG podman ubuntu # 安装 Nomad log "安装 Nomad ${nomad_version}..." @@ -85,8 +84,8 @@ server { client { enabled = true - host_volume "docker-sock" { - path = "/var/run/docker.sock" + host_volume "podman-sock" { + path = "/run/podman/podman.sock" read_only = false } } @@ -108,9 +107,8 @@ ports { serf = 4648 } -plugin "docker" { +plugin "podman" { config { - allow_privileged = true volumes { enabled = true }