diff --git a/configuration/docker-daemon.json b/configuration/docker-daemon.json
new file mode 100644
index 0000000..5564836
--- /dev/null
+++ b/configuration/docker-daemon.json
@@ -0,0 +1,14 @@
+{
+  "proxies": {
+    "http-proxy": "http://istoreos.tailnet-68f9.ts.net:7891",
+    "https-proxy": "http://istoreos.tailnet-68f9.ts.net:7891",
+    "no-proxy": "localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net"
+  },
+  "registry-mirrors": [],
+  "insecure-registries": [],
+  "debug": false,
+  "experimental": false,
+  "features": {
+    "buildkit": true
+  }
+}
diff --git a/configuration/inventories/production/consul-cluster.ini b/configuration/inventories/production/consul-cluster.ini
new file mode 100644
index 0000000..5e82382
--- /dev/null
+++ b/configuration/inventories/production/consul-cluster.ini
@@ -0,0 +1,10 @@
+[consul_cluster]
+master ansible_host=master ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131
+ash3c ansible_host=ash3c ansible_user=ben ansible_become=yes ansible_become_pass=3131
+
+[consul_cluster:vars]
+ansible_ssh_common_args='-o StrictHostKeyChecking=no'
+consul_version=1.21.4
+consul_datacenter=dc1
+# Generate the encryption key with: consul keygen
+vault_consul_encrypt_key=1EvGItLOB8nuHnSA0o+rO0zXzLeJl+U+Jfvuw0+H848=
\ No newline at end of file
diff --git a/configuration/inventories/production/nomad-cluster.ini b/configuration/inventories/production/nomad-cluster.ini
new file mode 100644
index 0000000..184ac27
--- /dev/null
+++ b/configuration/inventories/production/nomad-cluster.ini
@@ -0,0 +1,20 @@
+[nomad_servers]
+master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
+semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3
+ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
+
+[nomad_clients]
+# Add client nodes here if needed
+
+[nomad_cluster:children]
+nomad_servers
+nomad_clients
+
+[nomad_cluster:vars]
+ansible_ssh_private_key_file=~/.ssh/id_ed25519
+ansible_user=ben
+ansible_become=yes
+nomad_version=1.10.5
+nomad_datacenter=dc1
+nomad_region=global
+nomad_encrypt_key=NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=
\ No newline at end of file
diff --git a/configuration/inventories/production/nomad-local.ini b/configuration/inventories/production/nomad-local.ini
new file mode 100644
index 0000000..50ed83d
--- /dev/null
+++ b/configuration/inventories/production/nomad-local.ini
@@ -0,0 +1,15 @@
+[nomad_servers]
+localhost ansible_connection=local nomad_role=server nomad_bootstrap_expect=1
+
+[nomad_clients]
+# Add client nodes here if needed
+
+[nomad_cluster:children]
+nomad_servers
+nomad_clients
+
+[nomad_cluster:vars]
+ansible_user=root
+nomad_version=1.6.2
+nomad_datacenter=dc1
+nomad_region=global
\ No newline at end of file
diff --git a/configuration/playbooks/applications/cloud-providers-update.yml b/configuration/playbooks/applications/cloud-providers-update.yml
deleted file mode 100644
index 1892247..0000000
--- a/configuration/playbooks/applications/cloud-providers-update.yml
+++ /dev/null
@@ -1,72 +0,0 @@
----
-- name: Cloud Providers System Update Playbook
-  hosts: huawei,google,ditigalocean,aws
-  become: yes
-  gather_facts: yes
-
-  tasks:
-    # Ubuntu/Debian system updates (apt)
-    - name: Update apt cache (Ubuntu/Debian)
-      apt:
-        update_cache: yes
-        cache_valid_time: 3600
-      when: ansible_os_family == "Debian"
-
-    - name: Upgrade all packages (Ubuntu/Debian)
-      apt:
-        upgrade: yes
-        autoremove: yes
-        autoclean: yes
-      when: ansible_os_family == "Debian"
-      register: apt_upgrade_result
-
-    # AWS Linux system updates (dnf)
-    - name: Update dnf cache (AWS Linux/RHEL)
-      dnf:
-        update_cache: yes
-      when: ansible_os_family == "RedHat"
-
-    - name: Upgrade all packages (AWS Linux/RHEL)
-      dnf:
-        name: "*"
-        state: latest
-        skip_broken: yes
-      when: ansible_os_family == "RedHat"
-      register: dnf_upgrade_result
-
-    # Display upgrade results
-    - name: Display apt upgrade results
-      debug:
-        msg: "APT system upgrade completed. Changed: {{ apt_upgrade_result.changed }}"
-      when: ansible_os_family == "Debian" and apt_upgrade_result is defined
-
-    - name: Display dnf upgrade results
-      debug:
-        msg: "DNF system upgrade completed. Changed: {{ dnf_upgrade_result.changed }}"
-      when: ansible_os_family == "RedHat" and dnf_upgrade_result is defined
-
-    # Check whether a reboot is required (Ubuntu/Debian)
-    - name: Check if reboot is required (Ubuntu/Debian)
-      stat:
-        path: /var/run/reboot-required
-      register: debian_reboot_required
-      when: ansible_os_family == "Debian"
-
-    # Check whether a reboot is required (AWS Linux/RHEL)
-    - name: Check if reboot is required (AWS Linux/RHEL)
-      command: needs-restarting -r
-      register: rhel_reboot_required
-      failed_when: false
-      changed_when: false
-      when: ansible_os_family == "RedHat"
-
-    # Reboot notifications
-    - name: Notify if reboot is required (Ubuntu/Debian)
-      debug:
-        msg: "System reboot is required to complete the update."
-      when: ansible_os_family == "Debian" and debian_reboot_required.stat.exists is defined and debian_reboot_required.stat.exists
-
-    - name: Notify if reboot is required (AWS Linux/RHEL)
-      debug:
-        msg: "System reboot is required to complete the update."
-      when: ansible_os_family == "RedHat" and rhel_reboot_required.rc == 1
\ No newline at end of file
diff --git a/configuration/playbooks/applications/docker-management.yml b/configuration/playbooks/applications/docker-management.yml
deleted file mode 100644
index 5f359e9..0000000
--- a/configuration/playbooks/applications/docker-management.yml
+++ /dev/null
@@ -1,128 +0,0 @@
----
-- name: Docker Container Management
-  hosts: all
-  become: yes
-  gather_facts: yes
-
-  tasks:
-    # Check whether Docker is installed
-    - name: Check if Docker is installed
-      command: which docker
-      register: docker_installed
-      failed_when: false
-      changed_when: false
-
-    - name: Skip Docker tasks if not installed
-      debug:
-        msg: "Docker not installed on {{ inventory_hostname }}, skipping Docker tasks"
-      when: docker_installed.rc != 0
-
-    # Docker system information
-    - name: Get Docker system info
-      shell: docker system df
-      register: docker_system_info
-      when: docker_installed.rc == 0
-
-    - name: Display Docker system usage
-      debug:
-        msg: "🐳 Docker System Usage: {{ docker_system_info.stdout_lines }}"
-      when: docker_installed.rc == 0
-
-    # Check running containers
-    - name: List running containers
-      shell: docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
-      register: running_containers
-      when: docker_installed.rc == 0
-
-    - name: Display running containers
-      debug:
-        msg: "📦 Running Containers: {{ running_containers.stdout_lines }}"
-      when: docker_installed.rc == 0
-
-    # Check stopped containers
-    - name: List stopped containers
-      shell: docker ps -a --filter "status=exited" --format "table {{.Names}}\t{{.Status}}"
-      register: stopped_containers
-      when: docker_installed.rc == 0
-
-    - name: Display stopped containers
-      debug:
-        msg: "⏹️ Stopped Containers: {{ stopped_containers.stdout_lines }}"
-      when: docker_installed.rc == 0 and stopped_containers.stdout_lines | length > 1
-
-    # Check Docker images
-    - name: List Docker images
-      shell: docker images --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}"
-      register: docker_images
-      when: docker_installed.rc == 0
-
-    - name: Display Docker images
-      debug:
-        msg: "🖼️ Docker Images: {{ docker_images.stdout_lines }}"
-      when: docker_installed.rc == 0
-
-    # Check for dangling images
-    - name: Check for dangling images
-      shell: docker images -f "dangling=true" -q
-      register: dangling_images
-      when: docker_installed.rc == 0
-
-    - name: Report dangling images
-      debug:
-        msg: "🗑️ Found {{ dangling_images.stdout_lines | length }} dangling images"
-      when: docker_installed.rc == 0
-
-    # Check Docker volumes
-    - name: List Docker volumes
-      shell: docker volume ls
-      register: docker_volumes
-      when: docker_installed.rc == 0
-
-    - name: Display Docker volumes
-      debug:
-        msg: "💾 Docker Volumes: {{ docker_volumes.stdout_lines }}"
-      when: docker_installed.rc == 0
-
-    # Check Docker networks
-    - name: List Docker networks
-      shell: docker network ls
-      register: docker_networks
-      when: docker_installed.rc == 0
-
-    - name: Display Docker networks
-      debug:
-        msg: "🌐 Docker Networks: {{ docker_networks.stdout_lines }}"
-      when: docker_installed.rc == 0
-
-    # Check container resource usage
-    - name: Check container resource usage
-      shell: docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}"
-      register: container_stats
-      when: docker_installed.rc == 0
-
-    - name: Display container resource usage
-      debug:
-        msg: "📊 Container Stats: {{ container_stats.stdout_lines }}"
-      when: docker_installed.rc == 0
-
-    # Check Docker service status
-    - name: Check Docker service status
-      systemd:
-        name: docker
-      register: docker_service_status
-      when: docker_installed.rc == 0
-
-    - name: Display Docker service status
-      debug:
-        msg: "🔧 Docker Service: {{ docker_service_status.status.ActiveState }}"
-      when: docker_installed.rc == 0
-
-    # Cleanup suggestions
-    - name: Suggest cleanup if needed
-      debug:
-        msg: |
-          💡 Cleanup suggestions:
-          - Run 'docker system prune -f' to remove unused data
-          - Run 'docker image prune -f' to remove dangling images
-          - Run 'docker volume prune -f' to remove unused volumes
-      when: docker_installed.rc == 0 and (dangling_images.stdout_lines | length > 0 or stopped_containers.stdout_lines | length > 1)
\ No newline at end of file
diff --git a/configuration/playbooks/applications/docker-status-check.yml b/configuration/playbooks/applications/docker-status-check.yml
deleted file mode 100644
index d794f8e..0000000
--- a/configuration/playbooks/applications/docker-status-check.yml
+++ /dev/null
@@ -1,97 +0,0 @@
----
-- name: Docker Status Check for HCP Nodes
-  hosts: hcp
-  gather_facts: yes
-  become: yes
-
-  tasks:
-    - name: Check if Docker is installed
-      command: docker --version
-      register: docker_version
-      ignore_errors: yes
-
-    - name: Display Docker version
-      debug:
-        msg: "Docker version: {{ docker_version.stdout }}"
-      when: docker_version.rc == 0
-
-    - name: Check Docker service status
-      systemd:
-        name: docker
-      register: docker_service_status
-
-    - name: Display Docker service status
-      debug:
-        msg: "Docker service is {{ docker_service_status.status.ActiveState }}"
-
-    - name: Check Docker daemon info
-      command: docker info --format "{{ '{{' }}.ServerVersion{{ '}}' }}"
-      register: docker_info
-      ignore_errors: yes
-
-    - name: Display Docker daemon info
-      debug:
-        msg: "Docker daemon version: {{ docker_info.stdout }}"
-      when: docker_info.rc == 0
-
-    - name: Check Docker Swarm status
-      command: docker info --format "{{ '{{' }}.Swarm.LocalNodeState{{ '}}' }}"
-      register: swarm_status
-      ignore_errors: yes
-
-    - name: Display Swarm status
-      debug:
-        msg: "Swarm status: {{ swarm_status.stdout }}"
-      when: swarm_status.rc == 0
-
-    - name: Get Docker Swarm node info (if in swarm)
-      command: docker node ls
-      register: swarm_nodes
-      ignore_errors: yes
-      when: swarm_status.stdout == "active"
-
-    - name: Display Swarm nodes
-      debug:
-        msg: "{{ swarm_nodes.stdout_lines }}"
-      when: swarm_nodes is defined and swarm_nodes.rc == 0
-
-    - name: List running containers
-      command: docker ps --format "table {{ '{{' }}.Names{{ '}}' }}\t{{ '{{' }}.Status{{ '}}' }}\t{{ '{{' }}.Ports{{ '}}' }}"
-      register: running_containers
-      ignore_errors: yes
-
-    - name: Display running containers
-      debug:
-        msg: "{{ running_containers.stdout_lines }}"
-      when: running_containers.rc == 0
-
-    - name: Check Docker network list
-      command: docker network ls
-      register: docker_networks
-      ignore_errors: yes
-
-    - name: Display Docker networks
-      debug:
-        msg: "{{ docker_networks.stdout_lines }}"
-      when: docker_networks.rc == 0
-
-    - name: Get Docker system info
-      command: docker system df
-      register: docker_system_info
-      ignore_errors: yes
-
-    - name: Display Docker system usage
-      debug:
-        msg: "{{ docker_system_info.stdout_lines }}"
-      when: docker_system_info.rc == 0
-
-    - name: Check if node is Swarm manager
-      command: docker node inspect self --format "{{ '{{' }}.ManagerStatus.Leader{{ '}}' }}"
-      register: is_manager
-      ignore_errors: yes
-      when: swarm_status.stdout == "active"
-
-    - name: Display manager status
-      debug:
-        msg: "Is Swarm manager: {{ is_manager.stdout }}"
-      when: is_manager is defined and is_manager.rc == 0
\ No newline at end of file
diff --git a/configuration/playbooks/applications/docker-swarm-analysis-simple.yml b/configuration/playbooks/applications/docker-swarm-analysis-simple.yml
deleted file mode 100644
index 98eeb5c..0000000
--- a/configuration/playbooks/applications/docker-swarm-analysis-simple.yml
+++ /dev/null
@@ -1,210 +0,0 @@
----
-- name: Simple Docker Swarm Analysis for ash3c
-  hosts: ash3c
-  become: yes
-  gather_facts: yes
-
-  tasks:
-    # Basic checks
-    - name: Check if Docker is installed
-      command: which docker
-      register: docker_installed
-      failed_when: false
-      changed_when: false
-
-    - name: Fail if Docker not installed
-      fail:
-        msg: "Docker is not installed on {{ inventory_hostname }}"
-      when: docker_installed.rc != 0
-
-    # Check the current Swarm status
-    - name: Check Docker Swarm status
-      shell: docker info | grep "Swarm:" -A 1
-      register: swarm_status
-
-    - name: Display current Swarm status
-      debug:
-        msg: "🔍 Current Swarm Status: {{ swarm_status.stdout_lines }}"
-
-    # Get running containers
-    - name: Get running containers
-      shell: docker ps --format "table {{ '{{' }}.Names{{ '}}' }}\t{{ '{{' }}.Image{{ '}}' }}\t{{ '{{' }}.Status{{ '}}' }}\t{{ '{{' }}.Ports{{ '}}' }}"
-      register: running_containers
-
-    - name: Display running containers
-      debug:
-        msg: "🏃 Running Containers: {{ running_containers.stdout_lines }}"
-
-    # Get all containers (including stopped ones)
-    - name: Get all containers
-      shell: docker ps -a --format "table {{ '{{' }}.Names{{ '}}' }}\t{{ '{{' }}.Image{{ '}}' }}\t{{ '{{' }}.Status{{ '}}' }}"
-      register: all_containers
-
-    - name: Display all containers
-      debug:
-        msg: "📦 All Containers: {{ all_containers.stdout_lines }}"
-
-    # Inspect each container in detail
-    - name: Get container names only
-      shell: docker ps -a --format "{{ '{{' }}.Names{{ '}}' }}"
-      register: container_names
-
-    - name: Inspect each container
-      shell: |
-        echo "=== Container: {{ item }} ==="
-        echo "Image: $(docker inspect {{ item }} --format '{{ '{{' }}.Config.Image{{ '}}' }}')"
-        echo "Status: $(docker inspect {{ item }} --format '{{ '{{' }}.State.Status{{ '}}' }}')"
-        echo "Restart Policy: $(docker inspect {{ item }} --format '{{ '{{' }}.HostConfig.RestartPolicy.Name{{ '}}' }}')"
-        echo "Network Mode: $(docker inspect {{ item }} --format '{{ '{{' }}.HostConfig.NetworkMode{{ '}}' }}')"
-        echo "Published Ports: $(docker port {{ item }} 2>/dev/null || echo 'None')"
-        echo "Volumes/Mounts:"
-        docker inspect {{ item }} --format '{{ '{{' }}range .Mounts{{ '}}' }}  {{ '{{' }}.Source{{ '}}' }}:{{ '{{' }}.Destination{{ '}}' }} ({{ '{{' }}.Mode{{ '}}' }}){{ '{{' }}"\n"{{ '}}' }}{{ '{{' }}end{{ '}}' }}' || echo "  None"
-        echo "Environment Variables:"
-        docker inspect {{ item }} --format '{{ '{{' }}range .Config.Env{{ '}}' }}  {{ '{{' }}.{{ '}}' }}{{ '{{' }}"\n"{{ '}}' }}{{ '{{' }}end{{ '}}' }}' | head -10
-        echo "Labels:"
-        docker inspect {{ item }} --format '{{ '{{' }}range $key, $value := .Config.Labels{{ '}}' }}  {{ '{{' }}$key{{ '}}' }}={{ '{{' }}$value{{ '}}' }}{{ '{{' }}"\n"{{ '}}' }}{{ '{{' }}end{{ '}}' }}' | head -5
-        echo "---"
-      register: container_inspect
-      loop: "{{ container_names.stdout_lines }}"
-      when: container_names.stdout_lines | length > 0
-
-    - name: Display container inspection results
-      debug:
-        msg: "{{ item.stdout }}"
-      loop: "{{ container_inspect.results }}"
-      when: container_inspect is defined
-
-    # Check for Docker Compose files
-    - name: Find docker-compose files
-      find:
-        paths:
-          - /root
-          - /home
-          - /opt
-        patterns:
-          - "docker-compose.yml"
-          - "docker-compose.yaml"
-          - "compose.yml"
-          - "compose.yaml"
-        recurse: yes
-        depth: 3
-      register: compose_files
-
-    - name: Display found compose files
-      debug:
-        msg: "📄 Found compose files: {{ item.path }}"
-      loop: "{{ compose_files.files }}"
-      when: compose_files.files | length > 0
-
-    # Analyze network configuration
-    - name: Get Docker networks
-      shell: docker network ls
-      register: docker_networks
-
-    - name: Display Docker networks
-      debug:
-        msg: "🌐 Docker Networks: {{ docker_networks.stdout_lines }}"
-
-    # Check volume usage
-    - name: Get Docker volumes
-      shell: docker volume ls
-      register: docker_volumes
-
-    - name: Display Docker volumes
-      debug:
-        msg: "💾 Docker Volumes: {{ docker_volumes.stdout_lines }}"
-
-    # Check container resource usage
-    - name: Get container resource usage
-      shell: docker stats --no-stream
-      register: container_stats
-      when: container_names.stdout_lines | length > 0
-
-    - name: Display container stats
-      debug:
-        msg: "📊 Container Resource Usage: {{ container_stats.stdout_lines }}"
-      when: container_stats is defined
-
-    # Generate the Swarm suitability analysis
-    - name: Generate Swarm suitability analysis
-      debug:
-        msg: |
-          🔍 DOCKER SWARM MIGRATION ANALYSIS FOR {{ inventory_hostname }}
-          ================================================================
-
-          📋 SUMMARY:
-          - Current Swarm Status: {{ 'Active' if 'active' in swarm_status.stdout else 'Inactive' }}
-          - Total Containers: {{ container_names.stdout_lines | length }}
-          - Running Containers: {{ (running_containers.stdout_lines | length) - 1 }}
-          - Compose Files Found: {{ compose_files.files | length }}
-
-          💡 GENERAL RECOMMENDATIONS:
-
-          ✅ SUITABLE FOR SWARM (typically):
-          - Web applications (nginx, apache, etc.)
-          - API services
-          - Databases (with proper volume management)
-          - Monitoring tools (prometheus, grafana, etc.)
-          - Load balancers
-
-          ❌ NOT SUITABLE FOR SWARM:
-          - Containers using Docker socket (/var/run/docker.sock)
-          - Containers with --privileged flag
-          - Containers requiring specific host access
-          - Development/testing containers
-
-          ⚠️ NEEDS MODIFICATION:
-          - Containers using bind mounts (convert to volumes)
-          - Containers without restart policies
-          - Containers using host networking
-
-          🚀 NEXT STEPS:
-          1. Review each container's configuration above
-          2. Identify services that can benefit from scaling
-          3. Convert suitable containers to Docker services
-          4. Set up overlay networks
-          5. Configure secrets and configs management
-
-          📝 MIGRATION CHECKLIST:
-          □ Initialize Swarm (already done: {{ 'Yes' if 'active' in swarm_status.stdout else 'No' }})
-          □ Create overlay networks
-          □ Convert containers to services
-          □ Set up service discovery
-          □ Configure load balancing
-          □ Test service scaling
-          □ Set up monitoring
-      when: container_names is defined
-
-    # Save the analysis results
-    - name: Save analysis summary
-      copy:
-        content: |
-          Docker Swarm Analysis for {{ inventory_hostname }}
-          Generated: {{ ansible_date_time.iso8601 }}
-
-          Current Swarm Status: {{ swarm_status.stdout }}
-          Total Containers: {{ container_names.stdout_lines | length }}
-
-          Container List:
-          {{ container_names.stdout_lines | join('\n') }}
-
-          Networks:
-          {{ docker_networks.stdout }}
-
-          Volumes:
-          {{ docker_volumes.stdout }}
-
-          Compose Files Found:
-          {% for file in compose_files.files %}
-          - {{ file.path }}
-          {% endfor %}
-        dest: "/tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt"
-
-    - name: Analysis complete
-      debug:
-        msg: |
-          🎉 Analysis complete!
-          Results saved to: /tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt
-
-          Review the container details above to determine which services
-          are suitable for Swarm migration.
\ No newline at end of file
diff --git a/configuration/playbooks/applications/docker-swarm-analysis.yml b/configuration/playbooks/applications/docker-swarm-analysis.yml
deleted file mode 100644
index 12a1fd6..0000000
--- a/configuration/playbooks/applications/docker-swarm-analysis.yml
+++ /dev/null
@@ -1,246 +0,0 @@
----
-- name: Docker Swarm Migration Analysis for ash3c
-  hosts: ash3c
-  become: yes
-  gather_facts: yes
-
-  vars:
-    analysis_results: []
-
-  tasks:
-    # Basic checks
-    - name: Check if Docker is installed
-      command: which docker
-      register: docker_installed
-      failed_when: false
-      changed_when: false
-
-    - name: Fail if Docker not installed
-      fail:
-        msg: "Docker is not installed on {{ inventory_hostname }}"
-      when: docker_installed.rc != 0
-
-    # Check the current Swarm status
-    - name: Check Docker Swarm status
-      shell: docker info --format "{{ '{{' }}.Swarm.LocalNodeState{{ '}}' }}"
-      register: swarm_status
-
-    - name: Display current Swarm status
-      debug:
-        msg: "🔍 Current Swarm Status: {{ swarm_status.stdout }}"
-
-    # Get detailed information for all containers
-    - name: Get all containers (running and stopped)
-      shell: docker ps -a --format "{{ '{{' }}.Names{{ '}}' }}"
-      register: all_containers
-
-    - name: Get basic container information
-      shell: |
-        echo "=== Container: {{ item }} ==="
-        docker inspect {{ item }} | jq -r '
-          .[0] |
-          "Image: " + .Config.Image,
-          "Status: " + .State.Status,
-          "RestartPolicy: " + .HostConfig.RestartPolicy.Name,
-          "NetworkMode: " + .HostConfig.NetworkMode,
-          "Ports: " + (.NetworkSettings.Ports | keys | join(", ")),
-          "Volumes: " + ([.Mounts[]? | .Source + ":" + .Destination + ":" + .Mode] | join(" ")),
-          "Memory: " + (.HostConfig.Memory | tostring),
-          "CPUs: " + (.HostConfig.NanoCpus | tostring)
-        '
-        echo "---"
-      register: container_details
-      loop: "{{ all_containers.stdout_lines }}"
-      when: all_containers.stdout_lines | length > 0
-
-    - name: Display container details
-      debug:
-        msg: "{{ item.stdout }}"
-      loop: "{{ container_details.results }}"
-      when: container_details is defined
-
-    # Check for Docker Compose files
-    - name: Find docker-compose files
-      find:
-        paths:
-          - /root
-          - /home
-          - /opt
-        patterns:
-          - "docker-compose.yml"
-          - "docker-compose.yaml"
-          - "compose.yml"
-          - "compose.yaml"
-        recurse: yes
-      register: compose_files
-
-    - name: Display found compose files
-      debug:
-        msg: "📄 Found compose files: {{ item.path }}"
-      loop: "{{ compose_files.files }}"
-      when: compose_files.files | length > 0
-
-    # Analyze network configuration
-    - name: Get Docker networks
-      shell: docker network ls --format "{{ '{{' }}.Name{{ '}}' }}\t{{ '{{' }}.Driver{{ '}}' }}\t{{ '{{' }}.Scope{{ '}}' }}"
-      register: docker_networks
-
-    - name: Display Docker networks
-      debug:
-        msg: "🌐 Docker Networks: {{ docker_networks.stdout_lines }}"
-
-    # Check volume usage
-    - name: Get Docker volumes
-      shell: docker volume ls --format "{{ '{{' }}.Name{{ '}}' }}\t{{ '{{' }}.Driver{{ '}}' }}"
-      register: docker_volumes
-
-    - name: Display Docker volumes
-      debug:
-        msg: "💾 Docker Volumes: {{ docker_volumes.stdout_lines }}"
-
-    # Check container resource usage
-    - name: Get container resource usage
-      shell: docker stats --no-stream --format "{{ '{{' }}.Name{{ '}}' }}\t{{ '{{' }}.CPUPerc{{ '}}' }}\t{{ '{{' }}.MemUsage{{ '}}' }}\t{{ '{{' }}.NetIO{{ '}}' }}\t{{ '{{' }}.BlockIO{{ '}}' }}"
-      register: container_stats
-      when: all_containers.stdout_lines | length > 0
-
-    - name: Display container stats
-      debug:
-        msg: "📊 Container Resource Usage: {{ container_stats.stdout_lines }}"
-      when: container_stats is defined
-
-    # Analyze service types and Swarm suitability
-    - name: Analyze containers for Swarm suitability
-      set_fact:
-        swarm_analysis: |
-          🔍 SWARM MIGRATION ANALYSIS FOR {{ inventory_hostname }}
-          ================================================
-
-          Current Swarm Status: {{ swarm_status.stdout }}
-          Total Containers: {{ all_containers.stdout_lines | length }}
-
-          📋 CONTAINER ANALYSIS:
-          {% for container in container_details.results %}
-
-          Container: {{ container.item }}
-          {% set details = container.stdout.split('\n') %}
-          {% for line in details %}
-          {{ line }}
-          {% endfor %}
-
-          SWARM SUITABILITY ASSESSMENT:
-          {% if 'restart=always' in container.stdout or 'restart=unless-stopped' in container.stdout %}
-          ✅ Good restart policy for Swarm
-          {% else %}
-          ⚠️ Consider adding restart policy
-          {% endif %}
-
-          {% if 'NetworkMode: bridge' in container.stdout or 'NetworkMode: host' in container.stdout %}
-          ⚠️ May need network configuration for Swarm
-          {% else %}
-          ✅ Custom network - good for Swarm
-          {% endif %}
-
-          {% if '/var/run/docker.sock' in container.stdout %}
-          ❌ Uses Docker socket - NOT suitable for Swarm
-          {% elif 'bind' in container.stdout %}
-          ⚠️ Uses bind mounts - consider using volumes
-          {% else %}
-          ✅ Good volume configuration
-          {% endif %}
-
-          {% endfor %}
-
-          💡 RECOMMENDATIONS:
-
-          SUITABLE FOR SWARM:
-          {% for container in container_details.results %}
-          {% if '/var/run/docker.sock' not in container.stdout %}
-          - {{ container.item }}: Ready for Swarm migration
-          {% endif %}
-          {% endfor %}
-
-          NEEDS MODIFICATION:
-          {% for container in container_details.results %}
-          {% if '/var/run/docker.sock' in container.stdout %}
-          - {{ container.item }}: Uses Docker socket - keep as standalone
-          {% elif 'bind' in container.stdout %}
-          - {{ container.item }}: Convert bind mounts to volumes
-          {% endif %}
-          {% endfor %}
-
-          NEXT STEPS:
-          1. Initialize Swarm: docker swarm init
-          2. Create overlay networks for services
-          3. Convert suitable containers to services
-          4. Set up service discovery and load balancing
-          5. Configure secrets and configs management
-      when: container_details is defined
-
-    - name: Display Swarm analysis
-      debug:
-        msg: "{{ swarm_analysis }}"
-      when: swarm_analysis is defined
-
-    # Generate migration script suggestions
-    - name: Generate migration script suggestions
-      set_fact:
-        migration_script: |
-          #!/bin/bash
-          # Docker Swarm Migration Script for {{ inventory_hostname }}
-          # Generated on {{ ansible_date_time.iso8601 }}
-
-          echo "🚀 Starting Docker Swarm migration..."
-
-          # Initialize Swarm (if not already done)
-          if [ "{{ swarm_status.stdout }}" != "active" ]; then
-            echo "Initializing Docker Swarm..."
-            docker swarm init
-          fi
-
-          # Create overlay networks
-          echo "Creating overlay networks..."
-          docker network create -d overlay --attachable app-network
-
-          # Example service creation (modify as needed)
-          {% for container in container_details.results if container_details is defined %}
-          {% if '/var/run/docker.sock' not in container.stdout %}
-          echo "Converting {{ container.item }} to Swarm service..."
-          # docker service create --name {{ container.item }}-svc \
-          #   --network app-network \
-          #   --replicas 1 \
-          #   [ADD_YOUR_SPECIFIC_OPTIONS] \
-          #   [IMAGE_NAME]
-          {% endif %}
-          {% endfor %}
-
-          echo "✅ Migration script template generated!"
-          echo "Please review and customize before running."
-      when: container_details is defined
-
-    - name: Display migration script
-      debug:
-        msg: "{{ migration_script }}"
-      when: migration_script is defined
-
-    # Save the analysis results to a file
-    - name: Save analysis results to file
-      copy:
-        content: |
-          {{ swarm_analysis }}
-
-          MIGRATION SCRIPT:
-          {{ migration_script }}
-        dest: "/tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt"
-      when: swarm_analysis is defined and migration_script is defined
-
-    - name: Analysis complete
-      debug:
-        msg: |
-          🎉 Analysis complete!
-          Results saved to: /tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt
-
-          Summary:
-          - Total containers analyzed: {{ all_containers.stdout_lines | length }}
-          - Compose files found: {{ compose_files.files | length }}
-          - Current Swarm status: {{ swarm_status.stdout }}
\ No newline at end of file
diff --git a/configuration/playbooks/applications/docker-swarm-check.yml b/configuration/playbooks/applications/docker-swarm-check.yml
deleted file mode 100644
index 6f2303d..0000000
--- a/configuration/playbooks/applications/docker-swarm-check.yml
+++ /dev/null
@@ -1,236 +0,0 @@
----
-- name: Docker Swarm Check for ash3c
-  hosts: ash3c
-  become: yes
-  gather_facts: yes
-
-  tasks:
-    # Basic checks
-    - name: Check if Docker is installed
-      command: which docker
-      register: docker_installed
-      failed_when: false
-      changed_when: false
-
-    - name: Fail if Docker not installed
-      fail:
-        msg: "Docker is not installed on {{ inventory_hostname }}"
-      when: docker_installed.rc != 0
-
-    # Check the current Swarm status
-    - name: Check Docker Swarm status
-      shell: docker info | grep "Swarm:" -A 1
-      register: swarm_status
-
-    - name: Display current Swarm status
-      debug:
-        msg: "🔍 Current Swarm Status: {{ swarm_status.stdout_lines }}"
-
-    # Get running containers - simple format
-    - name: Get running containers
-      shell: docker ps
-      register: running_containers
-
-    - name: Display running containers
-      debug:
-        msg: "🏃 Running Containers:\n{{ running_containers.stdout }}"
-
-    # Get all containers (including stopped ones)
-    - name: Get all containers
-      shell: docker ps -a
-      register: all_containers
-
-    - name: Display all containers
-      debug:
-        msg: "📦 All Containers:\n{{ all_containers.stdout }}"
-
-    # Get the list of container names
-    - name: Get container names
-      shell: docker ps -a | awk 'NR>1 {print $NF}' | head -20
-      register: container_names
-
-    - name: Display container names
-      debug:
-        msg: "Container names: {{ container_names.stdout_lines }}"
-
-    # Check basic information for each container
-    - name: Get basic container info
-      shell: |
-        echo "=== Container: {{ item }} ==="
-        docker inspect {{ item }} | jq -r '.[0] | {
-          "Image": .Config.Image,
-          "Status": .State.Status,
-          "RestartPolicy": .HostConfig.RestartPolicy.Name,
-          "NetworkMode": .HostConfig.NetworkMode
-        }'
-        echo "Ports:"
-        docker port {{ item }} 2>/dev/null || echo "No published ports"
-        echo "Mounts:"
-        docker inspect {{ item }} | jq -r '.[0].Mounts[]? | "  \(.Source):\(.Destination) (\(.Mode))"'
-        echo "---"
-      register: container_info
-      loop: "{{ container_names.stdout_lines[:10] }}"  # Limit to the first 10 containers
-      when: container_names.stdout_lines | length > 0
-
-    - name: Display container info
-      debug:
-        msg: "{{ item.stdout }}"
-      loop: "{{ container_info.results }}"
-      when: container_info is defined
-
-    # Check for Docker Compose files
-    - name: Find docker-compose files in common locations
-      find:
-        paths:
-          - /root
-          - /home
-          - /opt
-          - /var/lib/docker
-        patterns:
-          - "docker-compose.yml"
-          - "docker-compose.yaml"
-          - "compose.yml"
-          - "compose.yaml"
-        recurse: yes
-        depth: 3
-      register: compose_files
-      ignore_errors: yes
-
-    - name: Display found compose files
-      debug:
-        msg: "📄 Found compose files: {{ compose_files.files | map(attribute='path') | list }}"
-      when: compose_files.files | length > 0
-
-    # Analyze network configuration
-    - name: Get Docker networks
-      shell: docker network ls
-      register: docker_networks
-
-    - name: Display Docker networks
-      debug:
-        msg: "🌐 Docker Networks:\n{{ docker_networks.stdout }}"
-
-    # Check volume usage
-    - name: Get Docker volumes
-      shell: docker volume ls
-      register: docker_volumes
-
-    - name: Display Docker volumes
-      debug:
-        msg: "💾 Docker Volumes:\n{{ docker_volumes.stdout }}"
-
-    # Check container resource usage
-    - name: Get container resource usage
-      shell: docker stats --no-stream
-      register: container_stats
-      when: container_names.stdout_lines | length > 0
-
-    - name: Display container stats
-      debug:
-        msg: "📊 Container Resource Usage:\n{{ container_stats.stdout }}"
-      when: container_stats is defined
-
-    # Check Docker images
-    - name: Get Docker images
-      shell: docker images
-      register: docker_images
-
-    - name: Display Docker images
-      debug:
-        msg: "🖼️ Docker Images:\n{{ docker_images.stdout }}"
-
-    # Generate the Swarm suitability analysis
-    - name: Generate Swarm suitability analysis
-      debug:
-        msg: |
-
-          🔍 DOCKER SWARM MIGRATION ANALYSIS FOR {{ inventory_hostname }}
-          ================================================================
-
-          📋 SUMMARY:
-          - Current Swarm Status: {{ 'Active' if 'active' in swarm_status.stdout else 'Inactive' }}
-          - Total Containers: {{ container_names.stdout_lines | length }}
-          - Running Containers: {{ running_containers.stdout_lines | length - 1 }}
-          - Compose Files Found: {{ compose_files.files | length if compose_files.files is defined else 0 }}
-
-          💡 SWARM MIGRATION RECOMMENDATIONS:
-
-          ✅ TYPICALLY SUITABLE FOR SWARM:
-          - Web servers (nginx, apache, caddy)
-          - API services and microservices
-          - Application servers
-          - Load balancers (traefik, haproxy)
-          - Monitoring tools (prometheus, grafana)
-          - Databases (with proper volume strategy)
-
-          ❌ NOT SUITABLE FOR SWARM:
-          - Containers using Docker socket (/var/run/docker.sock)
-          - Containers with --privileged flag
-          - Development/testing containers
-          - Containers requiring specific host hardware access
-
-          ⚠️ NEEDS MODIFICATION FOR SWARM:
-          - Containers using bind mounts → convert to volumes
-          - Containers without restart policies → add restart policies
-          - Containers using host networking → use overlay networks
-          - Containers with hardcoded IPs → use service discovery
-
-          🚀 MIGRATION STEPS:
-          1. ✅ Swarm is already initialized
-          2. Create overlay networks for service communication
-          3. Convert suitable containers to Docker services
-          4. Set up service discovery and load balancing
-          5. Configure secrets and configs management
-          6. Test service scaling and failover
-
-          📝 NEXT ACTIONS:
-          - Review each container above for Swarm suitability
-          - Identify services that would benefit from scaling
-          - Plan network topology for services
-          - Prepare volume migration strategy
-      when: container_names is defined
-
-    # Save the analysis results
-    - name: Save analysis summary to file
-      copy:
-        content: |
-          Docker Swarm Analysis for {{ inventory_hostname }}
-          Generated: {{ ansible_date_time.iso8601 }}
-
-          SWARM STATUS:
-          {{ swarm_status.stdout }}
-
-          CONTAINERS ({{ container_names.stdout_lines | length }} total):
-          {{ container_names.stdout_lines | join('\n') }}
-
-          NETWORKS:
-          {{ docker_networks.stdout }}
-
-          VOLUMES:
-          {{ docker_volumes.stdout }}
-
-          IMAGES:
-          {{ docker_images.stdout }}
-
-          {% if compose_files.files is defined and compose_files.files | length > 0 %}
-          COMPOSE FILES FOUND:
-          {% for file in compose_files.files %}
-          - {{ file.path }}
-          {% endfor %}
-          {% endif %}
-        dest: "/tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt"
-
-    - name: Analysis complete
-      debug:
-        msg: |
-
-          🎉 ANALYSIS COMPLETE!
-
-          📄 Results saved to: /tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt
-
-          🔍 Review the container details above to identify:
-          - Which services are suitable for Swarm
-          - Which containers need modification
-          - Migration priority and strategy
-
-          💡 TIP: Focus on stateless services first for easier migration!
\ No newline at end of file
diff --git a/configuration/playbooks/applications/gitea-runner-management.yml b/configuration/playbooks/applications/gitea-runner-management.yml
deleted file mode 100644
index 842692f..0000000
--- a/configuration/playbooks/applications/gitea-runner-management.yml
+++ /dev/null
@@ -1,95 +0,0 @@
----
-- name: Gitea Runner Management
-  hosts: hcp
-  become: yes
-  vars:
-    gitea_runner_user: "gitea-runner"
-    gitea_runner_data_dir: "/var/lib/gitea-runner"
-    gitea_runner_log_dir: "/var/log/gitea-runner"
-
-  tasks:
-    - name: Check gitea-runner service status
-      systemd:
-        name: gitea-runner
-      register: service_status
-
-    - name: Display service status
-      debug:
-        msg: |
-          Service: {{ service_status.status.ActiveState }}
-          Enabled: {{ service_status.status.UnitFileState }}
-          Main PID: {{ service_status.status.MainPID | default('N/A') }}
-
-    - name: Show recent logs
-      command: journalctl -u gitea-runner --no-pager -n 20
-      register: recent_logs
-      changed_when: false
-
-    - name: Display recent logs
-      debug:
-        var: recent_logs.stdout_lines
-
-    - name: Check runner registration
-      stat:
-        path: "{{ gitea_runner_data_dir }}/.runner"
-      register: runner_registered
-
-    - name: Display registration status
-      debug:
-        msg: "Runner registered: {{ runner_registered.stat.exists }}"
-
-    - name: Show runner configuration (if registered)
-      command: cat {{ gitea_runner_data_dir }}/.runner
-      register: runner_config
-      become_user: "{{ gitea_runner_user }}"
-      when: runner_registered.stat.exists
-      changed_when: false
-
-    - name: Display runner configuration
-      debug:
-        var: runner_config.stdout_lines
-      when: runner_registered.stat.exists
-
-    - name: Check Docker access for runner user
-      command: docker ps
-      become_user: "{{ gitea_runner_user }}"
-      register: docker_access
-      changed_when: false
-      failed_when: false
-
-    - name: Display Docker access status
-      debug:
-        msg: |
-          Docker access: {{ 'OK' if docker_access.rc == 0 else 'FAILED' }}
-          {% if docker_access.rc != 0 %}
-          Error: {{ docker_access.stderr }}
-          {% endif %}
-
-# Separate play for managing the service
-- name: Service Management Tasks
-  hosts: hcp
-  become: yes
-  tasks:
-    - name: Start gitea-runner service
-      systemd:
-        name: gitea-runner
-        state: started
-      when: ansible_run_tags is defined and 'start' in ansible_run_tags
-
-    - name: Stop gitea-runner service
-      systemd:
-        name: gitea-runner
-        state: stopped
-      when: ansible_run_tags is defined and 'stop' in ansible_run_tags
-
-    - name: Restart gitea-runner service
-      systemd:
-        name: gitea-runner
-        state: restarted
-      when: ansible_run_tags is defined and 'restart' in ansible_run_tags
-
-    - name: Reload gitea-runner service
-      systemd:
-        name: gitea-runner
-        state: reloaded
-      when: ansible_run_tags is defined and 'reload' in ansible_run_tags
\ No newline at end of file
diff --git a/configuration/playbooks/applications/gitea-runner-setup.yml b/configuration/playbooks/applications/gitea-runner-setup.yml
deleted file mode 100644
index fe8d2ac..0000000
--- a/configuration/playbooks/applications/gitea-runner-setup.yml
+++ /dev/null
@@ -1,157 +0,0 @@
----
-- name: Setup Gitea Runner on HCP nodes
-  hosts: hcp
-  become: yes
-  vars:
-    gitea_runner_token: "vOrrQda6Qiet9YOj4waZVU5QgLig2J3rKp2RfoN7"
-    gitea_server_url: "http://gitea:3000"
-    gitea_runner_user: "gitea-runner"
-    gitea_runner_home: "/home/{{ gitea_runner_user }}"
-    gitea_runner_config_dir: "/etc/gitea-runner"
-    gitea_runner_data_dir: "/var/lib/gitea-runner"
-    gitea_runner_log_dir: "/var/log/gitea-runner"
-    gitea_runner_binary: "/usr/bin/act_runner"
-
-  tasks:
-    - name: Check if gitea-runner binary exists
-      stat:
-        path: "{{ gitea_runner_binary }}"
-      register: runner_binary
-
-    - name: Fail if act_runner binary not found
-      fail:
-        msg: "Act runner binary not found at {{ gitea_runner_binary }}. Please install it first."
-      when: not runner_binary.stat.exists
-
-    - name: Create gitea-runner user
-      user:
-        name: "{{ gitea_runner_user }}"
-        system: yes
-        shell: /bin/bash
-        home: "{{ gitea_runner_home }}"
-        create_home: yes
-        comment: "Gitea Runner Service User"
-
-    - name: Create gitea-runner directories
-      file:
-        path: "{{ item }}"
-        state: directory
-        owner: "{{ gitea_runner_user }}"
-        group: "{{ gitea_runner_user }}"
-        mode: '0755'
-      loop:
-        - "{{ gitea_runner_config_dir }}"
-        - "{{ gitea_runner_data_dir }}"
-        - "{{ gitea_runner_log_dir }}"
-
-    - name: Create gitea-runner configuration file
-      template:
-        src: gitea-runner-config.yml.j2
-        dest: "{{ gitea_runner_config_dir }}/config.yml"
-        owner: "{{ gitea_runner_user }}"
-        group: "{{ gitea_runner_user }}"
-        mode: '0600'
-      notify: restart gitea-runner
-
-    - name: Create gitea-runner systemd service file
-      template:
-        src: gitea-runner.service.j2
-        dest: /etc/systemd/system/gitea-runner.service
-        owner: root
-        group: root
-        mode: '0644'
-      notify:
-        - reload systemd
-        - restart gitea-runner
-
-    - name: Create gitea-runner environment file
-      template:
-        src: gitea-runner.env.j2
-        dest: /etc/default/gitea-runner
-        owner: root
-        group: root
-        mode: '0600'
-      notify: restart gitea-runner
-
-    - name: Create runner registration script
-      template:
-        src: register-runner.sh.j2
-        dest: "{{ gitea_runner_home }}/register-runner.sh"
-        owner: "{{ gitea_runner_user }}"
-        group: "{{ gitea_runner_user }}"
-        mode: '0755'
-
-    - name: Check if runner is already registered
-      stat:
-        path: "{{ gitea_runner_data_dir }}/.runner"
-      register: runner_registered
-
-    - name: Register gitea runner
-      command: "{{ gitea_runner_home }}/register-runner.sh"
-      become_user: "{{ gitea_runner_user }}"
-      when: not runner_registered.stat.exists
-      register: registration_result
-
-    - name: Display registration result
-      debug:
-        var: registration_result.stdout_lines
-      when: registration_result is defined and registration_result.stdout_lines is defined
-
-    - name: Create runner startup script
-      template:
-        src: start-runner.sh.j2
-        dest: "{{ gitea_runner_home }}/start-runner.sh"
-        owner: "{{ gitea_runner_user }}"
-        group: "{{ gitea_runner_user }}"
-        mode: '0755'
-
-    - name: Create logrotate configuration for gitea-runner
-      template:
-        src: gitea-runner.logrotate.j2
-        dest: /etc/logrotate.d/gitea-runner
-        owner: root
-        group: root
-        mode: '0644'
-
-    - name: Install Docker (required for runner)
-      package:
-        name: docker.io
-        state: present
-
-    - name: Add gitea-runner user to docker group
-      user:
-        name: "{{ gitea_runner_user }}"
-        groups: docker
-        append: yes
-
-    - name: Start and enable Docker service
-      systemd:
-        name: docker
-        state: started
-        enabled: yes
-
-    - name: Start and enable gitea-runner service
-      systemd:
-        name: gitea-runner
-        state: started
-        enabled: yes
-        daemon_reload: yes
-
-    - name: Check gitea-runner service status
-      systemd:
-        name: gitea-runner
-      register: service_status
-
-    - name: Display service status
-      debug:
-        msg: "Gitea Runner service is {{ service_status.status.ActiveState }}"
-
-  handlers:
-    - name: reload systemd
-      systemd:
-        daemon_reload: yes
-
-    - name: restart gitea-runner
-      systemd:
-        name: gitea-runner
-        state: restarted
\ No newline at end of file
diff --git a/configuration/playbooks/applications/swarm-migration-plan.yml b/configuration/playbooks/applications/swarm-migration-plan.yml
deleted file mode 100644
index c9eff47..0000000
--- a/configuration/playbooks/applications/swarm-migration-plan.yml
+++ /dev/null
@@ -1,194 +0,0 @@
----
-- name: Docker Swarm Migration Plan for ash3c
-  hosts: ash3c
-  become: yes
-  gather_facts: yes
-
-  vars:
-    # Service migration plan definition
-    swarm_services:
-      high_priority:
-        - name: ghproxy
-          image: wjqserver/ghproxy:latest
-          ports: "8046:8080"
-          replicas: 2
-          networks: ["app-network"]
-
-        - name: redis
-          image: redis:latest
-          ports: "63789:6379"
-          replicas: 1
-          networks: ["app-network"]
-          volumes: ["redis-data:/data"]
-
-      medium_priority:
-        - name: consul
-          image: bitnami/consul:latest
-          ports:
-            - "8310:8300"
-            - "8311:8301"
-            - "8312:8302"
-            - "8501:8500"
-            - "8601:8600/udp"
-          replicas: 1
-          networks: ["consul-network"]
-
-        - name: discourse-app
-          image: bitnami/discourse:3.4.1
-          ports: "31080:3000"
-          replicas: 1
-          networks: ["app-network"]
-          depends_on: ["postgres", "redis"]
-
-        - name: discourse-sidekiq
-          image: bitnami/discourse:3.4.1
-          replicas: 1
-          networks: ["app-network"]
-          depends_on: ["postgres", "redis"]
-
-      low_priority:
-        - name: elasticsearch
-          image: bitnami/elasticsearch:8.17.2
-          ports: "59200:9200"
-          replicas: 1
-          networks: ["elastic-network"]
-          volumes: ["elastic-data:/bitnami/elasticsearch/data"]
-          constraints: ["node.role==manager"]
-
-        - name: postgres
-          image: postgres:17.2
-          ports: "54322:5432"
-          replicas: 1
-          networks: ["db-network"]
-          volumes: ["postgres-data:/var/lib/postgresql/data"]
-          constraints: ["node.role==manager"]
-          secrets: ["postgres_password"]
-
-  tasks:
-    - name: Display migration plan
-      debug:
-        msg: |
-          🚀 DOCKER SWARM MIGRATION PLAN FOR {{ inventory_hostname }}
-          =========================================================
-
-          📋 PHASE 1 - HIGH PRIORITY (Low Risk)
-          {% for service in swarm_services.high_priority %}
-          ✅ {{ service.name }}:
-             - Image: {{ service.image }}
-             - Replicas: {{ service.replicas }}
-             - Networks: {{ service.networks | join(', ') }}
-             - Migration: Safe, stateless service
-          {% endfor %}
-
-          📋 PHASE 2 - MEDIUM PRIORITY (Medium Risk)
-          {% for service in swarm_services.medium_priority %}
-          ⚠️ {{ service.name }}:
-             - Image: {{ service.image }}
-             - Replicas: {{ service.replicas }}
-             - Networks: {{ service.networks | join(', ') }}
-             - Migration: Requires coordination
-          {% endfor %}
-
-          📋 PHASE 3 - LOW PRIORITY (High Risk)
-          {% for service in swarm_services.low_priority %}
-          🔴 {{ service.name }}:
-             - Image: {{ service.image }}
-             - Replicas: {{ service.replicas }}
-             - Networks: {{ service.networks | join(', ') }}
-             - Migration: Requires careful planning
-          {% endfor %}
-
-    - name: Create migration script
-      copy:
-        content: |
-          #!/bin/bash
-          # Docker Swarm Migration Script for {{ inventory_hostname }}
-          # Generated: {{ ansible_date_time.iso8601 }}
-
-          set -e
-
-          echo "🚀 Starting Docker Swarm Migration..."
-
-          # Create networks
-          echo "📡 Creating overlay networks..."
-          docker network create -d overlay --attachable app-network || true
-          docker network create -d overlay --attachable db-network || true
-          docker network create -d overlay --attachable consul-network || true
-          docker network create -d overlay --attachable elastic-network || true
-
-          # Create volumes
-          echo "💾 Creating volumes..."
-          docker volume create redis-data || true
-          docker volume create postgres-data || true
-          docker volume create elastic-data || true
-
-          # Create secrets (example)
-          echo "🔐 Creating secrets..."
-          echo "your_postgres_password" | docker secret create postgres_password - || true
-
-          echo "✅ Infrastructure setup complete!"
-          echo ""
-          echo "🔄 PHASE 1 - Migrate high priority services:"
-          echo "docker service create --name ghproxy-svc --replicas 2 --network app-network -p 8046:8080 wjqserver/ghproxy:latest"
-          echo "docker service create --name redis-svc --replicas 1 --network app-network -p 63789:6379 --mount type=volume,source=redis-data,target=/data redis:latest"
-          echo ""
-          echo "🔄 PHASE 2 - Migrate medium priority services:"
-          echo "docker service create --name consul-svc --replicas 1 --network consul-network -p 8310:8300 -p 8311:8301 -p 8312:8302 -p 8501:8500 -p 8601:8600/udp bitnami/consul:latest"
-          echo "docker service create --name discourse-app-svc --replicas 1 --network app-network -p 31080:3000 bitnami/discourse:3.4.1"
-          echo "docker service create --name discourse-sidekiq-svc --replicas 1 --network app-network bitnami/discourse:3.4.1"
-          echo ""
-          echo "🔄 PHASE 3 - Migrate low priority services (CAREFUL!):"
-          echo "docker service create --name postgres-svc --replicas 1 --network db-network -p 54322:5432 --mount type=volume,source=postgres-data,target=/var/lib/postgresql/data --secret postgres_password --constraint 'node.role==manager' postgres:17.2"
-          echo "docker service create --name elasticsearch-svc --replicas 1 --network elastic-network -p 59200:9200 --mount type=volume,source=elastic-data,target=/bitnami/elasticsearch/data --constraint 'node.role==manager' bitnami/elasticsearch:8.17.2"
-          echo ""
-          echo "📊 Monitor services:"
-          echo "docker service ls"
-          echo "docker service ps "
-          echo ""
-          echo "⚠️ IMPORTANT NOTES:"
-          echo "1. Stop original containers before creating services"
-          echo "2. Backup data before migrating databases"
-          echo "3. Test each phase before proceeding"
-          echo "4. Monitor logs: docker service logs "
-        dest: "/tmp/swarm-migration-{{ inventory_hostname }}.sh"
-        mode: '0755'
-
-    - name: Create rollback script
-      copy:
-        content: |
-          #!/bin/bash
-          # Docker Swarm Rollback Script for {{ inventory_hostname }}
-
-          echo "🔄 Rolling back Swarm services..."
-
-          # Remove services
-          docker service rm ghproxy-svc redis-svc consul-svc discourse-app-svc discourse-sidekiq-svc postgres-svc elasticsearch-svc 2>/dev/null || true
-
-          # Remove networks (optional)
-          # docker network rm app-network db-network consul-network elastic-network 2>/dev/null || true
-
-          echo "✅ Rollback complete. Original containers should be restarted manually."
-        dest: "/tmp/swarm-rollback-{{ inventory_hostname }}.sh"
-        mode: '0755'
-
-    - name: Migration plan complete
-      debug:
-        msg: |
-          🎉 MIGRATION PLAN GENERATED!
-
-          📄 Files created:
-          - /tmp/swarm-migration-{{ inventory_hostname }}.sh (Migration script)
-          - /tmp/swarm-rollback-{{ inventory_hostname }}.sh (Rollback script)
-
-          🚀 RECOMMENDED APPROACH:
-          1. Backup all data first
-          2. Test migration in phases
-          3. Start with Phase 1 (low risk services)
-          4. Monitor each service before proceeding
-          5. Keep rollback script ready
-
-          💡 NEXT STEPS:
-          1. Review and customize the migration script
-          2. Plan maintenance window
-          3. Execute phase by phase
-          4. Monitor and validate each service
\ No newline at end of file
diff --git a/configuration/playbooks/applications/templates/gitea-runner-config.yml.j2 b/configuration/playbooks/applications/templates/gitea-runner-config.yml.j2
deleted file mode 100644
index 283fee7..0000000
--- a/configuration/playbooks/applications/templates/gitea-runner-config.yml.j2
+++ /dev/null
@@ -1,50 +0,0 @@
-# Gitea Runner Configuration
-log:
-  level: info
-  file: {{ gitea_runner_log_dir }}/runner.log
-
-runner:
-  # Runner name (will be auto-generated if not specified)
-  name: "{{ inventory_hostname }}-runner"
-
-  # Runner capacity (number of concurrent jobs)
-  capacity: 2
-
-  # Runner timeout
-  timeout: 3600
-
-  # Runner labels (for job targeting)
-  labels:
-    - "ubuntu-latest:docker://ubuntu:22.04"
-    - "ubuntu-20.04:docker://ubuntu:20.04"
-    - "ubuntu-18.04:docker://ubuntu:18.04"
-    - "node:docker://node:18"
-    - "python:docker://python:3.11"
-    - "ansible:docker://quay.io/ansible/ansible-runner:latest"
-    - "opentofu:docker://opentofu/opentofu:latest"
-
-cache:
-  enabled: true
-  dir: {{ gitea_runner_data_dir }}/cache
-  host: ""
-  port: 0
-
-container:
-  # Docker network for runner containers
-  network: "gitea-runner"
-
-  # Enable privileged containers (needed for Docker-in-Docker)
-  privileged: false
-
-  # Container options
-  options: "--rm --pull=always"
-
-  # Valid volumes
-  valid_volumes:
-    - "/tmp"
-    - "{{ gitea_runner_data_dir }}"
-
-  docker_host: "unix:///var/run/docker.sock"
-
-host:
-  workdir_parent: {{ gitea_runner_data_dir }}/work
\ No newline at end of file
diff --git a/configuration/playbooks/applications/templates/gitea-runner.env.j2 b/configuration/playbooks/applications/templates/gitea-runner.env.j2
deleted file mode 100644
index 65b4ce8..0000000
--- a/configuration/playbooks/applications/templates/gitea-runner.env.j2
+++ /dev/null
@@ -1,18 +0,0 @@
-# Gitea Runner Environment Variables
-
-# Gitea server configuration
-GITEA_INSTANCE_URL={{ gitea_server_url }}
-GITEA_RUNNER_REGISTRATION_TOKEN={{ gitea_runner_token }}
-
-# Runner configuration
-GITEA_RUNNER_NAME={{ inventory_hostname }}-runner
-GITEA_RUNNER_LABELS=ubuntu-latest,ubuntu-20.04,ubuntu-18.04,node,python,ansible,opentofu
-
-# Docker configuration
-DOCKER_HOST=unix:///var/run/docker.sock
-
-# Logging
-GITEA_RUNNER_LOG_LEVEL=info
-
-# Security
-GITEA_RUNNER_SECURITY_PRIVILEGED=false
\ No newline at end of file
diff --git a/configuration/playbooks/applications/templates/gitea-runner.logrotate.j2 b/configuration/playbooks/applications/templates/gitea-runner.logrotate.j2
deleted file mode 100644
index 2d8049b..0000000
--- a/configuration/playbooks/applications/templates/gitea-runner.logrotate.j2
+++ /dev/null
@@ -1,12 +0,0 @@
-{{ gitea_runner_log_dir }}/*.log {
-    daily
-    missingok
-    rotate 30
-    compress
-    delaycompress
-    notifempty
-    create 644 {{ gitea_runner_user }} {{ gitea_runner_user }}
-    postrotate
-        systemctl reload gitea-runner || true
-    endscript
-}
\ No newline at end of file
diff --git a/configuration/playbooks/applications/templates/gitea-runner.service.j2 b/configuration/playbooks/applications/templates/gitea-runner.service.j2
deleted file mode 100644
index 1f1661f..0000000
--- a/configuration/playbooks/applications/templates/gitea-runner.service.j2
+++ /dev/null
@@ -1,39 +0,0 @@
-[Unit]
-Description=Gitea Actions Runner
-Documentation=https://docs.gitea.io/en-us/actions/
-After=network.target docker.service
-Wants=docker.service
-
-[Service]
-Type=simple
-User={{ gitea_runner_user }}
-Group={{ gitea_runner_user }}
-WorkingDirectory={{ gitea_runner_data_dir }}
-ExecStart={{ gitea_runner_binary }} daemon --config {{ gitea_runner_config_dir }}/config.yml
-ExecReload=/bin/kill -HUP $MAINPID
-KillMode=mixed
-KillSignal=SIGINT
-TimeoutStopSec=5
-Restart=always
-RestartSec=10
-StartLimitInterval=0
-
-# Security settings
-NoNewPrivileges=yes
-PrivateTmp=yes
-ProtectSystem=strict
-ProtectHome=yes
-ReadWritePaths={{ gitea_runner_data_dir }} {{ gitea_runner_log_dir }} /var/run/docker.sock
-ProtectKernelTunables=yes
-ProtectKernelModules=yes
-ProtectControlGroups=yes
-
-# Environment
-EnvironmentFile=-/etc/default/gitea-runner
-
-# Logging
-StandardOutput=append:{{ gitea_runner_log_dir }}/gitea-runner.log
-StandardError=append:{{ gitea_runner_log_dir }}/gitea-runner-error.log
-
-[Install]
-WantedBy=multi-user.target
\ No newline at end of file
diff --git a/configuration/playbooks/applications/templates/register-runner.sh.j2 b/configuration/playbooks/applications/templates/register-runner.sh.j2
deleted file mode 100644
index 4944df0..0000000
--- a/configuration/playbooks/applications/templates/register-runner.sh.j2
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-# Gitea Runner Registration Script
-
-set -e
-
-echo "🚀 Registering Gitea Runner..."
-
-# Configuration variables
-GITEA_URL="{{ gitea_server_url }}"
-REGISTRATION_TOKEN="{{ gitea_runner_token }}"
-RUNNER_NAME="{{ inventory_hostname }}-runner"
-RUNNER_LABELS="ubuntu-latest,ubuntu-20.04,ubuntu-18.04,node,python,ansible,opentofu"
-
-# Change to the data directory
-cd {{ gitea_runner_data_dir }}
-
-# Check whether the runner is already registered
-if [ -f ".runner" ]; then
-    echo "✅ Runner is already registered"
-    exit 0
-fi
-
-echo "📝 Registering runner: $RUNNER_NAME"
-echo "🔗 Gitea URL: $GITEA_URL"
-echo "🏷️ Labels: $RUNNER_LABELS"
-
-# Register the runner
-{{ gitea_runner_binary }} register \
-    --instance "$GITEA_URL" \
-    --token "$REGISTRATION_TOKEN" \
-    --name "$RUNNER_NAME" \
-    --labels "$RUNNER_LABELS"
-
-if [ $? -eq 0 ]; then
-    echo "✅ Runner registered successfully!"
-
-    # Set file permissions
-    chown {{ gitea_runner_user }}:{{ gitea_runner_user }} .runner .credentials
-    chmod 600 .runner .credentials
-
-    echo "📋 Runner info:"
-    cat .runner
-else
-    echo "❌ Runner registration failed"
-    exit 1
-fi
\ No newline at end of file
diff --git a/configuration/playbooks/applications/templates/start-runner.sh.j2 b/configuration/playbooks/applications/templates/start-runner.sh.j2
deleted file mode 100644
index 85e94de..0000000
--- a/configuration/playbooks/applications/templates/start-runner.sh.j2
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-# Gitea Runner Startup Script
-
-set -e
-
-echo "🚀 Starting Gitea Runner..."
-
-# Change to the data directory
-cd {{ gitea_runner_data_dir }}
-
-# Check registration status
-if [ ! -f ".runner" ]; then
-    echo "❌ Runner is not registered; run the registration script first"
-    exit 1
-fi
-
-echo "✅ Runner is registered; starting the daemon..."
-
-# Start the runner
-exec {{ gitea_runner_binary }} daemon --config {{ gitea_runner_config_dir }}/config.yml
\ No newline at end of file
diff --git a/configuration/proxy.env b/configuration/proxy.env
new file mode 100644
index 0000000..73b1421
--- /dev/null
+++ b/configuration/proxy.env
@@ -0,0 +1,30 @@
+# Proxy Configuration for istoreos.tailnet-68f9.ts.net:1082
+# This file contains proxy environment variables for the management system
+
+# HTTP/HTTPS Proxy Settings
+export http_proxy=http://istoreos.tailnet-68f9.ts.net:1082
+export https_proxy=http://istoreos.tailnet-68f9.ts.net:1082
+export HTTP_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
+export HTTPS_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
+
+# No Proxy Settings (local networks and services)
+export no_proxy=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net
+export NO_PROXY=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net
+
+# Additional proxy settings for various tools
+export ALL_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
+export all_proxy=http://istoreos.tailnet-68f9.ts.net:1082
+
+# Docker proxy settings
+export DOCKER_BUILDKIT=1
+export BUILDKIT_PROGRESS=plain
+
+# Git proxy settings
+export GIT_HTTP_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
+export GIT_HTTPS_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
+
+# Curl proxy settings
+export CURL_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
+
+# Wget proxy settings
+export WGET_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
diff --git a/configuration/roles/gitea-runner/templates/gitea-runner-config.yml.j2 b/configuration/roles/gitea-runner/templates/gitea-runner-config.yml.j2
deleted file mode 100644
index 283fee7..0000000
--- a/configuration/roles/gitea-runner/templates/gitea-runner-config.yml.j2
+++ /dev/null
@@ -1,50 +0,0 @@
-# Gitea Runner Configuration
-log:
-  level: info
-  file: {{ gitea_runner_log_dir }}/runner.log
-
-runner:
-  # Runner name (will be auto-generated if not specified)
-  name: "{{ inventory_hostname }}-runner"
-
-  # Runner capacity (number of concurrent jobs)
-  capacity: 2
-
-  # Runner timeout
-  timeout: 3600
-
-  # Runner labels (for job targeting)
-  labels:
-    - "ubuntu-latest:docker://ubuntu:22.04"
-    - "ubuntu-20.04:docker://ubuntu:20.04"
-    - "ubuntu-18.04:docker://ubuntu:18.04"
-    - "node:docker://node:18"
-    - "python:docker://python:3.11"
-    - "ansible:docker://quay.io/ansible/ansible-runner:latest"
-    - "opentofu:docker://opentofu/opentofu:latest"
-
-cache:
-  enabled: true
-  dir: {{ gitea_runner_data_dir }}/cache
-  host: ""
-  port: 0
-
-container:
-  # Docker network for runner containers
-  network: "gitea-runner"
-
-  # Enable privileged containers (needed for Docker-in-Docker)
-  privileged: false
-
-  # Container options
-  options: "--rm --pull=always"
-
-  # Valid volumes
-  valid_volumes:
-    - "/tmp"
-    - "{{ gitea_runner_data_dir }}"
-
-  docker_host: "unix:///var/run/docker.sock"
-
-host:
-  workdir_parent: {{ gitea_runner_data_dir }}/work
\ No newline at end of file
diff --git a/configuration/roles/gitea-runner/templates/gitea-runner.env.j2 b/configuration/roles/gitea-runner/templates/gitea-runner.env.j2
deleted file mode 100644
index 65b4ce8..0000000
--- a/configuration/roles/gitea-runner/templates/gitea-runner.env.j2
+++ /dev/null
@@ -1,18 +0,0 @@
-# Gitea Runner Environment Variables
-
-# Gitea server configuration
-GITEA_INSTANCE_URL={{ gitea_server_url }}
-GITEA_RUNNER_REGISTRATION_TOKEN={{ gitea_runner_token }}
-
-# Runner configuration
-GITEA_RUNNER_NAME={{ inventory_hostname }}-runner
-GITEA_RUNNER_LABELS=ubuntu-latest,ubuntu-20.04,ubuntu-18.04,node,python,ansible,opentofu
-
-# Docker configuration
-DOCKER_HOST=unix:///var/run/docker.sock
-
-# Logging
-GITEA_RUNNER_LOG_LEVEL=info
-
-# Security
-GITEA_RUNNER_SECURITY_PRIVILEGED=false
\ No newline at end of file
diff --git a/configuration/roles/gitea-runner/templates/gitea-runner.logrotate.j2 b/configuration/roles/gitea-runner/templates/gitea-runner.logrotate.j2
deleted file mode 100644
index 2d8049b..0000000
--- a/configuration/roles/gitea-runner/templates/gitea-runner.logrotate.j2
+++ /dev/null
@@ -1,12 +0,0 @@
-{{ gitea_runner_log_dir }}/*.log {
-    daily
-    missingok
-    rotate 30
-    compress
-    delaycompress
-    notifempty
-    create 644 {{ gitea_runner_user }} {{ gitea_runner_user }}
-    postrotate
-        systemctl reload gitea-runner || true
-    endscript
-}
\ No newline at end of file
diff --git a/configuration/roles/gitea-runner/templates/gitea-runner.service.j2 b/configuration/roles/gitea-runner/templates/gitea-runner.service.j2
deleted file mode 100644
index 1f1661f..0000000
--- a/configuration/roles/gitea-runner/templates/gitea-runner.service.j2
+++ /dev/null
@@ -1,39 +0,0 @@
-[Unit]
-Description=Gitea Actions Runner
-Documentation=https://docs.gitea.io/en-us/actions/
-After=network.target docker.service
-Wants=docker.service
-
-[Service]
-Type=simple
-User={{ gitea_runner_user }}
-Group={{ gitea_runner_user }}
-WorkingDirectory={{ gitea_runner_data_dir }}
-ExecStart={{ gitea_runner_binary }} daemon --config {{ gitea_runner_config_dir }}/config.yml
-ExecReload=/bin/kill -HUP $MAINPID
-KillMode=mixed
-KillSignal=SIGINT
-TimeoutStopSec=5
-Restart=always
-RestartSec=10
-StartLimitInterval=0
-
-# Security settings
-NoNewPrivileges=yes
-PrivateTmp=yes
-ProtectSystem=strict
-ProtectHome=yes
-ReadWritePaths={{ gitea_runner_data_dir }} {{ gitea_runner_log_dir }} /var/run/docker.sock
-ProtectKernelTunables=yes
-ProtectKernelModules=yes
-ProtectControlGroups=yes
-
-# Environment
-EnvironmentFile=-/etc/default/gitea-runner
-
-# Logging
-StandardOutput=append:{{ gitea_runner_log_dir }}/gitea-runner.log
-StandardError=append:{{ gitea_runner_log_dir }}/gitea-runner-error.log
-
-[Install]
-WantedBy=multi-user.target
\ No newline at end of file
diff --git a/configuration/roles/gitea-runner/templates/register-runner.sh.j2 b/configuration/roles/gitea-runner/templates/register-runner.sh.j2
deleted file mode 100644
index 4944df0..0000000
--- a/configuration/roles/gitea-runner/templates/register-runner.sh.j2
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-# Gitea Runner Registration Script
-
-set -e
-
-echo "🚀 Registering Gitea Runner..."
- -# 配置变量 -GITEA_URL="{{ gitea_server_url }}" -REGISTRATION_TOKEN="{{ gitea_runner_token }}" -RUNNER_NAME="{{ inventory_hostname }}-runner" -RUNNER_LABELS="ubuntu-latest,ubuntu-20.04,ubuntu-18.04,node,python,ansible,opentofu" - -# 切换到数据目录 -cd {{ gitea_runner_data_dir }} - -# 检查是否已经注册 -if [ -f ".runner" ]; then - echo "✅ Runner 已经注册" - exit 0 -fi - -echo "📝 注册 Runner: $RUNNER_NAME" -echo "🔗 Gitea URL: $GITEA_URL" -echo "🏷️ Labels: $RUNNER_LABELS" - -# 注册 Runner -{{ gitea_runner_binary }} register \ - --instance "$GITEA_URL" \ - --token "$REGISTRATION_TOKEN" \ - --name "$RUNNER_NAME" \ - --labels "$RUNNER_LABELS" - -if [ $? -eq 0 ]; then - echo "✅ Runner 注册成功!" - - # 设置文件权限 - chown {{ gitea_runner_user }}:{{ gitea_runner_user }} .runner .credentials - chmod 600 .runner .credentials - - echo "📋 Runner 信息:" - cat .runner -else - echo "❌ Runner 注册失败" - exit 1 -fi \ No newline at end of file diff --git a/configuration/roles/gitea-runner/templates/start-runner.sh.j2 b/configuration/roles/gitea-runner/templates/start-runner.sh.j2 deleted file mode 100644 index 85e94de..0000000 --- a/configuration/roles/gitea-runner/templates/start-runner.sh.j2 +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# Gitea Runner Startup Script - -set -e - -echo "🚀 启动 Gitea Runner..." - -# 切换到数据目录 -cd {{ gitea_runner_data_dir }} - -# 检查注册状态 -if [ ! -f ".runner" ]; then - echo "❌ Runner 未注册,请先运行注册脚本" - exit 1 -fi - -echo "✅ Runner 已注册,启动守护进程..." - -# 启动 Runner -exec {{ gitea_runner_binary }} daemon --config {{ gitea_runner_config_dir }}/config.yml \ No newline at end of file diff --git a/configuration/zsh/README.md b/configuration/zsh/README.md deleted file mode 100644 index 84b4e36..0000000 --- a/configuration/zsh/README.md +++ /dev/null @@ -1,202 +0,0 @@ -# ZSH 配置同步方案 - -这个目录包含了完整的 oh-my-zsh 配置,可以在多个 VPS 之间同步使用。 - -## 文件结构 - -``` -configuration/zsh/ -├── README.md # 本文件 -├── install-zsh-config.sh # 完整安装脚本 -├── quick-install.sh # 快速安装脚本 -├── zshrc.template # ZSH 配置文件模板 -└── oh-my-zsh-custom/ # 自定义 oh-my-zsh 配置 - ├── aliases.zsh # 自定义别名 - └── plugins/ # 自定义插件 -``` - -## 使用方法 - -### 方法一:智能安装(推荐) - -询问用户是否使用代理,安装完成后可选择是否保持: - -```bash -# 智能安装(询问代理使用,安装后可选择是否保持) -curl -fsSL https://ben:8d7d70f324796be650b79415303c31f567bf459b@gitea.tailnet-68f9.ts.net/ben/mgmt/raw/branch/main/configuration/zsh/smart-install.sh | bash -``` - -**特点:** -- 安装前询问是否使用代理 -- 测试代理连接确保可用 -- 安装完成后询问是否保持代理 -- 用户完全控制代理使用 - -### 方法二:快速安装 - -在新 VPS 上运行: - -```bash -# 一键安装 -curl -fsSL https://ben:8d7d70f324796be650b79415303c31f567bf459b@gitea.tailnet-68f9.ts.net/ben/mgmt/raw/branch/main/configuration/zsh/quick-install.sh | bash -``` - -### 方法三:手动安装 - -1. 克隆仓库: -```bash -git clone https://ben:8d7d70f324796be650b79415303c31f567bf459b@gitea.tailnet-68f9.ts.net/ben/mgmt.git /root/mgmt -``` - -2. 运行安装脚本: -```bash -cd /root/mgmt -chmod +x configuration/zsh/install-zsh-config.sh -./configuration/zsh/install-zsh-config.sh -``` - -## 配置同步 - -安装完成后,可以使用以下命令同步最新配置: - -```bash -# 同步配置 -sync-zsh-config -``` - -这个命令会: -1. 从 Gitea 拉取最新配置 -2. 备份当前配置 -3. 
部署新配置 - -## 代理管理 - -如果网络环境需要代理,可以使用以下命令管理代理: - -```bash -# 代理管理命令 -proxy-on # 临时开启代理 -proxy-off # 临时关闭代理 -proxy-toggle # 切换代理状态 -proxy-enable # 永久开启代理 -proxy-disable # 永久关闭代理 -proxy-status # 查看代理状态 -proxy-test # 测试代理连接 -``` - -### 代理使用场景 - -- **临时使用**: `proxy-on` → 使用代理 → `proxy-off` -- **永久开启**: `proxy-enable` → 重启后仍然有效 -- **快速切换**: `proxy-toggle` → 一键切换状态 -- **状态检查**: `proxy-status` → 查看当前状态和IP - -## 包含的功能 - -### 插件 -- **git** - Git 集成 -- **docker** - Docker 支持 -- **ansible** - Ansible 支持 -- **terraform** - OpenTofu/Terraform 支持 -- **kubectl** - Kubernetes 支持 -- **zsh-autosuggestions** - 命令自动建议 -- **zsh-syntax-highlighting** - 语法高亮 -- **zsh-completions** - 增强补全 - -### 别名 -- **项目管理**: `mgmt-status`, `mgmt-deploy`, `mgmt-cleanup` -- **Ansible**: `ansible-check`, `ansible-deploy`, `ansible-ping` -- **OpenTofu**: `tofu-init`, `tofu-plan`, `tofu-apply` -- **Docker**: `dps`, `dex`, `dlog`, `dclean` -- **Kubernetes**: `k`, `kgp`, `kgs`, `kaf` -- **Git**: `gs`, `ga`, `gc`, `gp`, `gl` -- **系统**: `ll`, `la`, `ports`, `myip` - -### 主题 -- **agnoster** - 功能丰富的主题,显示 Git 状态 - -## 更新配置 - -当您在主 VPS 上更新配置后: - -1. 提交更改: -```bash -cd /root/mgmt -git add configuration/zsh/ -git commit -m "Update zsh configuration" -git push origin main -``` - -2. 在其他 VPS 上同步: -```bash -sync-zsh-config -``` - -## 自定义配置 - -如果您需要在特定 VPS 上添加自定义配置: - -1. 编辑 `~/.zshrc` 文件 -2. 在文件末尾添加您的自定义配置 -3. 这些配置不会被同步脚本覆盖 - -## 故障排除 - -### 如果自动建议插件不工作 -```bash -# 运行测试脚本检查插件状态 -chmod +x /root/mgmt/configuration/zsh/test-plugins.sh -/root/mgmt/configuration/zsh/test-plugins.sh - -# 手动安装缺失的插件 -cd ~/.oh-my-zsh/custom/plugins -git clone https://github.com/zsh-users/zsh-autosuggestions -git clone https://github.com/zsh-users/zsh-syntax-highlighting.git -git clone https://github.com/zsh-users/zsh-completions - -# 重新加载配置 -source ~/.zshrc -``` - -### 如果同步失败 -```bash -# 检查网络连接 -ping gitea.tailnet-68f9.ts.net - -# 手动拉取 -cd /root/mgmt -git pull origin main -``` - -### 如果别名不工作 -```bash -# 重新加载配置 -source ~/.zshrc - -# 检查别名 -alias | grep -``` - -### 如果插件不工作 -```bash -# 检查插件目录 -ls ~/.oh-my-zsh/plugins/ -ls ~/.oh-my-zsh/custom/plugins/ - -# 运行测试脚本 -/root/mgmt/configuration/zsh/test-plugins.sh -``` - -## 安全说明 - -- 此配置包含访问 Gitea 的凭据 -- 请确保只在可信的 VPS 上使用 -- 建议定期更新访问令牌 - -## 支持 - -如有问题,请检查: -1. 网络连接是否正常 -2. Git 凭据是否正确 -3. 依赖包是否已安装 -4. 权限是否正确 diff --git a/configuration/zsh/install-zsh-config.sh b/configuration/zsh/install-zsh-config.sh deleted file mode 100755 index 4694def..0000000 --- a/configuration/zsh/install-zsh-config.sh +++ /dev/null @@ -1,281 +0,0 @@ -#!/bin/bash - -# ZSH 配置安装脚本 -# 用于在其他 VPS 上安装和同步 oh-my-zsh 配置 - -set -euo pipefail - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -# 日志函数 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# 检查是否为 root 用户 -check_root() { - if [[ $EUID -ne 0 ]]; then - log_error "此脚本需要 root 权限运行" - exit 1 - fi -} - -# 设置代理(如果需要) -setup_proxy() { - log_info "检查代理设置..." - - # 检查是否已经有代理配置 - if [[ -f "/root/mgmt/configuration/proxy.env" ]]; then - log_info "发现代理配置文件,加载代理设置..." 
- source "/root/mgmt/configuration/proxy.env" - - # 测试代理连接 - if curl -s --connect-timeout 5 --proxy "$http_proxy" https://httpbin.org/ip >/dev/null 2>&1; then - log_success "代理连接正常,将使用代理下载" - else - log_warning "代理连接失败,将使用直连" - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - fi - else - log_info "未发现代理配置,将使用直连" - fi -} - -# 安装依赖 -install_dependencies() { - log_info "安装依赖包..." - - # 更新包列表 - apt update - - # 安装必要的包 - apt install -y \ - zsh \ - git \ - curl \ - wget \ - htop \ - tree \ - jq \ - tmux \ - fonts-powerline \ - fontconfig - - log_success "依赖包安装完成" -} - -# 安装 oh-my-zsh -install_oh_my_zsh() { - log_info "安装 oh-my-zsh..." - - if [[ -d "$HOME/.oh-my-zsh" ]]; then - log_warning "oh-my-zsh 已安装,跳过安装步骤" - return 0 - fi - - # 安装 oh-my-zsh - RUNZSH=no CHSH=no sh -c "$(curl -fsSL https://raw.github.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" - - log_success "oh-my-zsh 安装完成" -} - -# 安装自定义插件 -install_custom_plugins() { - log_info "安装自定义插件..." - - local custom_dir="$HOME/.oh-my-zsh/custom/plugins" - - # zsh-autosuggestions - if [[ ! -d "$custom_dir/zsh-autosuggestions" ]]; then - log_info "安装 zsh-autosuggestions..." - git clone https://github.com/zsh-users/zsh-autosuggestions "$custom_dir/zsh-autosuggestions" - fi - - # zsh-syntax-highlighting - if [[ ! -d "$custom_dir/zsh-syntax-highlighting" ]]; then - log_info "安装 zsh-syntax-highlighting..." - git clone https://github.com/zsh-users/zsh-syntax-highlighting.git "$custom_dir/zsh-syntax-highlighting" - fi - - # zsh-completions - if [[ ! -d "$custom_dir/zsh-completions" ]]; then - log_info "安装 zsh-completions..." - git clone https://github.com/zsh-users/zsh-completions "$custom_dir/zsh-completions" - fi - - log_success "自定义插件安装完成" -} - -# 部署配置文件 -deploy_configs() { - log_info "部署配置文件..." - - local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - - # 备份现有配置 - if [[ -f "$HOME/.zshrc" ]]; then - log_info "备份现有 .zshrc..." - cp "$HOME/.zshrc" "$HOME/.zshrc.backup.$(date +%Y%m%d_%H%M%S)" - fi - - # 部署 .zshrc - if [[ -f "$script_dir/zshrc.template" ]]; then - log_info "部署 .zshrc 配置..." - cp "$script_dir/zshrc.template" "$HOME/.zshrc" - else - log_error "找不到 zshrc.template 文件" - exit 1 - fi - - # 部署自定义配置 - if [[ -d "$script_dir/oh-my-zsh-custom" ]]; then - log_info "部署自定义 oh-my-zsh 配置..." - # 复制自定义别名文件 - if [[ -f "$script_dir/oh-my-zsh-custom/aliases.zsh" ]]; then - cp "$script_dir/oh-my-zsh-custom/aliases.zsh" "$HOME/.oh-my-zsh/custom/" - fi - fi - - log_success "配置文件部署完成" -} - -# 设置默认 shell -set_default_shell() { - log_info "设置 zsh 为默认 shell..." - - # 检查 zsh 是否在 /etc/shells 中 - if ! grep -q "$(which zsh)" /etc/shells; then - log_info "添加 zsh 到 /etc/shells..." - echo "$(which zsh)" >> /etc/shells - fi - - # 设置默认 shell - chsh -s "$(which zsh)" - - log_success "默认 shell 设置为 zsh" -} - -# 创建同步脚本 -create_sync_script() { - log_info "创建同步脚本..." - - cat > /usr/local/bin/sync-zsh-config << 'EOF' -#!/bin/bash - -# ZSH 配置同步脚本 -# 从 Gitea 仓库拉取最新配置 - -set -euo pipefail - -MGMT_DIR="/root/mgmt" -ZSH_CONFIG_DIR="$MGMT_DIR/configuration/zsh" - -log_info() { - echo -e "\033[0;34m[INFO]\033[0m $1" -} - -log_success() { - echo -e "\033[0;32m[SUCCESS]\033[0m $1" -} - -log_error() { - echo -e "\033[0;31m[ERROR]\033[0m $1" -} - -# 检查 mgmt 目录是否存在 -if [[ ! -d "$MGMT_DIR" ]]; then - log_error "mgmt 目录不存在: $MGMT_DIR" - exit 1 -fi - -# 进入 mgmt 目录 -cd "$MGMT_DIR" - -# 拉取最新配置 -log_info "拉取最新配置..." -git pull origin main - -# 检查 zsh 配置目录 -if [[ ! 
-d "$ZSH_CONFIG_DIR" ]]; then - log_error "zsh 配置目录不存在: $ZSH_CONFIG_DIR" - exit 1 -fi - -# 备份当前配置 -if [[ -f "$HOME/.zshrc" ]]; then - log_info "备份当前配置..." - cp "$HOME/.zshrc" "$HOME/.zshrc.backup.$(date +%Y%m%d_%H%M%S)" -fi - -# 部署新配置 -log_info "部署新配置..." -cp "$ZSH_CONFIG_DIR/zshrc.template" "$HOME/.zshrc" - -# 部署自定义别名 -if [[ -f "$ZSH_CONFIG_DIR/oh-my-zsh-custom/aliases.zsh" ]]; then - cp "$ZSH_CONFIG_DIR/oh-my-zsh-custom/aliases.zsh" "$HOME/.oh-my-zsh/custom/" -fi - -log_success "ZSH 配置同步完成!" -log_info "请运行 'source ~/.zshrc' 或重新登录以应用新配置" -EOF - - chmod +x /usr/local/bin/sync-zsh-config - - log_success "同步脚本创建完成: /usr/local/bin/sync-zsh-config" -} - -# 显示使用说明 -show_usage() { - log_success "ZSH 配置安装完成!" - echo "" - log_info "使用方法:" - echo " 1. 重新登录或运行: source ~/.zshrc" - echo " 2. 同步配置: sync-zsh-config" - echo " 3. 查看别名: alias" - echo "" - log_info "可用命令:" - echo " - mgmt-status, mgmt-deploy, mgmt-cleanup" - echo " - ansible-check, ansible-deploy, ansible-ping" - echo " - tofu-init, tofu-plan, tofu-apply" - echo " - dps, dex, dlog (Docker)" - echo " - k, kgp, kgs (Kubernetes)" - echo "" -} - -# 主函数 -main() { - log_info "开始安装 ZSH 配置..." - - check_root - setup_proxy - install_dependencies - install_oh_my_zsh - install_custom_plugins - deploy_configs - set_default_shell - create_sync_script - show_usage - - log_success "安装完成!" -} - -# 运行主函数 -main "$@" diff --git a/configuration/zsh/oh-my-zsh-custom/aliases.zsh b/configuration/zsh/oh-my-zsh-custom/aliases.zsh deleted file mode 100644 index 80097c4..0000000 --- a/configuration/zsh/oh-my-zsh-custom/aliases.zsh +++ /dev/null @@ -1,251 +0,0 @@ -# ============================================================================= -# CUSTOM ALIASES FOR MANAGEMENT SYSTEM -# ============================================================================= - -# Project Management -alias mgmt='cd /root/mgmt' -alias mgmt-status='cd /root/mgmt && ./mgmt.sh status' -alias mgmt-deploy='cd /root/mgmt && ./mgmt.sh deploy' -alias mgmt-cleanup='cd /root/mgmt && ./mgmt.sh cleanup' -alias mgmt-swarm='cd /root/mgmt && ./mgmt.sh swarm' -alias mgmt-tofu='cd /root/mgmt && ./mgmt.sh tofu' - -# Ansible Management -alias ansible-check='cd /root/mgmt/configuration && ansible-playbook --syntax-check' -alias ansible-deploy='cd /root/mgmt/configuration && ansible-playbook -i inventories/production/inventory.ini' -alias ansible-ping='cd /root/mgmt/configuration && ansible -i inventories/production/inventory.ini all -m ping' -alias ansible-vault='cd /root/mgmt/configuration && ansible-vault' -alias ansible-galaxy='cd /root/mgmt/configuration && ansible-galaxy' - -# OpenTofu/Terraform Management -alias tofu-init='cd /root/mgmt/tofu/environments/dev && tofu init' -alias tofu-plan='cd /root/mgmt/tofu/environments/dev && tofu plan -var-file="terraform.tfvars"' -alias tofu-apply='cd /root/mgmt/tofu/environments/dev && tofu apply -var-file="terraform.tfvars"' -alias tofu-destroy='cd /root/mgmt/tofu/environments/dev && tofu destroy -var-file="terraform.tfvars"' -alias tofu-output='cd /root/mgmt/tofu/environments/dev && tofu output' -alias tofu-validate='cd /root/mgmt/tofu/environments/dev && tofu validate' -alias tofu-fmt='cd /root/mgmt/tofu/environments/dev && tofu fmt -recursive' - -# Docker Management -alias d='docker' -alias dc='docker-compose' -alias dps='docker ps' -alias dpsa='docker ps -a' -alias di='docker images' -alias dex='docker exec -it' -alias dlog='docker logs -f' -alias dstop='docker stop' -alias dstart='docker start' -alias drm='docker rm' -alias drmi='docker rmi' -alias 
dclean='docker system prune -f' -alias dbuild='docker build' -alias drun='docker run' -alias dpull='docker pull' -alias dpush='docker push' - -# Docker Swarm Management -alias dswarm='docker swarm' -alias dstack='docker stack' -alias dservice='docker service' -alias dnode='docker node' -alias dnetwork='docker network' -alias dsecret='docker secret' -alias dconfig='docker config' -alias dstack-ls='docker stack ls' -alias dstack-rm='docker stack rm' -alias dstack-deploy='docker stack deploy' -alias dservice-ls='docker service ls' -alias dservice-ps='docker service ps' -alias dservice-logs='docker service logs' - -# Kubernetes Management -alias k='kubectl' -alias kgp='kubectl get pods' -alias kgs='kubectl get services' -alias kgd='kubectl get deployments' -alias kgn='kubectl get nodes' -alias kgi='kubectl get ingress' -alias kgc='kubectl get configmaps' -alias kgs='kubectl get secrets' -alias kdp='kubectl describe pod' -alias kds='kubectl describe service' -alias kdd='kubectl describe deployment' -alias kdn='kubectl describe node' -alias kdi='kubectl describe ingress' -alias kaf='kubectl apply -f' -alias kdf='kubectl delete -f' -alias kl='kubectl logs -f' -alias ke='kubectl edit' -alias kx='kubectl exec -it' -alias kctx='kubectl config current-context' -alias kuse='kubectl config use-context' - -# Git Management -alias gs='git status' -alias ga='git add' -alias gc='git commit' -alias gp='git push' -alias gl='git pull' -alias gd='git diff' -alias gb='git branch' -alias gco='git checkout' -alias gcom='git checkout main' -alias gcod='git checkout develop' -alias gst='git stash' -alias gstp='git stash pop' -alias gstl='git stash list' -alias gstc='git stash clear' -alias gcl='git clone' -alias gfe='git fetch' -alias gme='git merge' -alias gr='git rebase' -alias grc='git rebase --continue' -alias gra='git rebase --abort' -alias gres='git reset' -alias gresh='git reset --hard' -alias gress='git reset --soft' - -# System Management -alias ll='ls -alF' -alias la='ls -A' -alias l='ls -CF' -alias ..='cd ..' -alias ...='cd ../..' -alias ....='cd ../../..' -alias grep='grep --color=auto' -alias fgrep='fgrep --color=auto' -alias egrep='egrep --color=auto' -alias ports='netstat -tuln' -alias myip='curl -s https://httpbin.org/ip | jq -r .origin' -alias speedtest='curl -s https://raw.githubusercontent.com/sivel/speedtest-cli/master/speedtest.py | python3' -alias psg='ps aux | grep' -alias top='htop' -alias cp='cp -i' -alias mv='mv -i' -alias rm='rm -i' -alias mkdir='mkdir -pv' - -# Network Management -alias ping='ping -c 4' -alias traceroute='traceroute -n' -alias nmap='nmap -sS -O' -alias ss='ss -tuln' - -# File Operations -alias find='find . -name' -alias locate='locate -i' -alias which='which -a' -alias whereis='whereis -b' - -# Text Processing -alias cat='cat -n' -alias less='less -R' -alias more='more -R' -alias head='head -n 20' -alias tail='tail -n 20' -alias wc='wc -l' - -# Archive Operations -alias tar='tar -v' -alias zip='zip -r' -alias unzip='unzip -l' -alias gzip='gzip -v' -alias gunzip='gunzip -v' - -# Process Management -alias jobs='jobs -l' -alias bg='bg %' -alias fg='fg %' -alias kill='kill -9' -alias pkill='pkill -f' - -# Environment -alias env='env | sort' -alias set='set | sort' -alias unset='unset' -alias export='export' -alias source='source' - -# History -alias h='history' -alias hg='history | grep' -alias hc='history -c' - -# Directory Navigation -alias cd..='cd ..' -alias cd...='cd ../..' -alias cd....='cd ../../..' 
-alias cd-='cd -' -alias cd~='cd ~' -alias cd/='cd /' - -# Quick Access -alias vim='vim' -alias nano='nano' -alias emacs='emacs' -alias code='code' -alias subl='subl' - -# Monitoring -alias df='df -h' -alias du='du -h' -alias free='free -h' -alias meminfo='cat /proc/meminfo' -alias cpuinfo='cat /proc/cpuinfo' -alias uptime='uptime -p' - -# Security -alias chmod='chmod -v' -alias chown='chown -v' -alias chgrp='chgrp -v' -alias passwd='passwd' -alias su='su -' -alias sudo='sudo -E' - -# Development -alias make='make -j$(nproc)' -alias cmake='cmake -DCMAKE_BUILD_TYPE=Release' -alias gcc='gcc -Wall -Wextra' -alias g++='g++ -Wall -Wextra' -alias python='python3' -alias pip='pip3' -alias node='node' -alias npm='npm' -alias yarn='yarn' - -# Logs -alias journal='journalctl -f' -alias syslog='tail -f /var/log/syslog' -alias auth='tail -f /var/log/auth.log' -alias kern='tail -f /var/log/kern.log' -alias mail='tail -f /var/log/mail.log' - -# Backup -alias backup='tar -czf backup-$(date +%Y%m%d-%H%M%S).tar.gz' -alias restore='tar -xzf' - -# Cleanup -alias clean='rm -rf ~/.cache/* ~/.tmp/* /tmp/*' -alias clean-docker='docker system prune -af --volumes' -alias clean-k8s='kubectl delete pods --field-selector=status.phase=Succeeded' -alias clean-ansible='rm -rf ~/.ansible/tmp/*' - -# Information -alias info='uname -a' -alias whoami='whoami' -alias id='id' -alias groups='groups' -alias users='users' -alias w='w' -alias who='who' -alias last='last -n 10' - -# Proxy Management -alias proxy-on='/root/mgmt/scripts/utilities/proxy-toggle.sh on' -alias proxy-off='/root/mgmt/scripts/utilities/proxy-toggle.sh off' -alias proxy-toggle='/root/mgmt/scripts/utilities/proxy-toggle.sh toggle' -alias proxy-enable='/root/mgmt/scripts/utilities/proxy-toggle.sh enable' -alias proxy-disable='/root/mgmt/scripts/utilities/proxy-toggle.sh disable' -alias proxy-status='/root/mgmt/scripts/utilities/proxy-toggle.sh status' -alias proxy-test='/root/mgmt/scripts/utilities/proxy-toggle.sh test' diff --git a/configuration/zsh/oh-my-zsh-custom/example.zsh b/configuration/zsh/oh-my-zsh-custom/example.zsh deleted file mode 100644 index c194f49..0000000 --- a/configuration/zsh/oh-my-zsh-custom/example.zsh +++ /dev/null @@ -1,12 +0,0 @@ -# Put files in this folder to add your own custom functionality. -# See: https://github.com/ohmyzsh/ohmyzsh/wiki/Customization -# -# Files in the custom/ directory will be: -# - loaded automatically by the init script, in alphabetical order -# - loaded last, after all built-ins in the lib/ directory, to override them -# - ignored by git by default -# -# Example: add custom/shortcuts.zsh for shortcuts to your local projects -# -# brainstormr=~/Projects/development/planetargon/brainstormr -# cd $brainstormr diff --git a/configuration/zsh/oh-my-zsh-custom/themes/example.zsh-theme b/configuration/zsh/oh-my-zsh-custom/themes/example.zsh-theme deleted file mode 100644 index 5551207..0000000 --- a/configuration/zsh/oh-my-zsh-custom/themes/example.zsh-theme +++ /dev/null @@ -1,6 +0,0 @@ -# Put your custom themes in this folder. 
-# See: https://github.com/ohmyzsh/ohmyzsh/wiki/Customization#overriding-and-adding-themes -# -# Example: - -PROMPT="%{$fg[red]%}%n%{$reset_color%}@%{$fg[blue]%}%m %{$fg[yellow]%}%~ %{$reset_color%}%% " diff --git a/configuration/zsh/quick-install.sh b/configuration/zsh/quick-install.sh deleted file mode 100755 index 361fc47..0000000 --- a/configuration/zsh/quick-install.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/bin/bash - -# 快速安装脚本 - 从 Gitea 仓库直接安装 ZSH 配置 -# 用法: curl -fsSL https://your-gitea.com/ben/mgmt/raw/branch/main/configuration/zsh/quick-install.sh | bash - -set -euo pipefail - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# Gitea 仓库信息 -GITEA_URL="https://ben:8d7d70f324796be650b79415303c31f567bf459b@gitea.tailnet-68f9.ts.net/ben/mgmt.git" -MGMT_DIR="/root/mgmt" - -log_info "开始快速安装 ZSH 配置..." - -# 检查 root 权限 -if [[ $EUID -ne 0 ]]; then - log_error "此脚本需要 root 权限运行" - exit 1 -fi - -# 克隆或更新仓库 -if [[ -d "$MGMT_DIR" ]]; then - log_info "更新现有仓库..." - cd "$MGMT_DIR" - git pull origin main -else - log_info "克隆仓库..." - git clone "$GITEA_URL" "$MGMT_DIR" - cd "$MGMT_DIR" -fi - -# 询问用户是否使用代理 -echo "" -log_info "网络环境检测:" -echo " 检测到可能需要代理访问外网资源(如 GitHub)" -echo "" -log_info "是否使用代理进行安装?" -echo " Y - 使用代理安装(推荐,确保下载成功)" -echo " N - 直连安装(如果网络环境良好)" -echo "" - -while true; do - read -p "请选择 (Y/n): " choice - case $choice in - [Yy]|"") - log_info "选择使用代理安装" - PROXY_URL="http://istoreos.tailnet-68f9.ts.net:1082" - - # 测试代理连接 - if curl -s --connect-timeout 5 --proxy "$PROXY_URL" https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh >/dev/null 2>&1; then - log_success "代理连接正常,设置代理环境" - export http_proxy="$PROXY_URL" - export https_proxy="$PROXY_URL" - export HTTP_PROXY="$PROXY_URL" - export HTTPS_PROXY="$PROXY_URL" - - # 创建代理配置文件 - cat > "$MGMT_DIR/configuration/proxy.env" << EOF -# Proxy Configuration for istoreos.tailnet-68f9.ts.net:1082 -export http_proxy=${PROXY_URL} -export https_proxy=${PROXY_URL} -export HTTP_PROXY=${PROXY_URL} -export HTTPS_PROXY=${PROXY_URL} -export no_proxy=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net -export NO_PROXY=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net -export ALL_PROXY=${PROXY_URL} -export all_proxy=${PROXY_URL} -export GIT_HTTP_PROXY=${PROXY_URL} -export GIT_HTTPS_PROXY=${PROXY_URL} -export CURL_PROXY=${PROXY_URL} -export WGET_PROXY=${PROXY_URL} -EOF - else - log_error "代理连接失败,无法继续安装" - exit 1 - fi - break - ;; - [Nn]) - log_info "选择直连安装" - # 测试直连 - if curl -s --connect-timeout 5 https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh >/dev/null 2>&1; then - log_success "直连正常,开始安装" - else - log_error "直连失败,无法继续安装" - exit 1 - fi - break - ;; - *) - log_warning "无效选择,请输入 Y 或 N" - ;; - esac -done - -# 运行安装脚本 -log_info "运行 ZSH 配置安装脚本..." -chmod +x "$MGMT_DIR/configuration/zsh/install-zsh-config.sh" -"$MGMT_DIR/configuration/zsh/install-zsh-config.sh" - -log_success "快速安装完成!" - -# 如果创建了代理配置,询问是否保持 -if [[ -f "$MGMT_DIR/configuration/proxy.env" ]]; then - echo "" - log_info "安装完成!代理已临时开启用于安装。" - echo "" - log_info "是否保持代理开启?" 
- echo " Y - 保持代理开启(推荐,方便访问外网)" - echo " N - 关闭代理(如果不需要访问外网)" - echo "" - - while true; do - read -p "请选择 (Y/n): " choice - case $choice in - [Yy]|"") - log_success "代理保持开启" - log_info "使用 'proxy-status' 查看代理状态" - log_info "使用 'proxy-toggle' 切换代理状态" - break - ;; - [Nn]) - log_info "关闭代理..." - rm -f "$MGMT_DIR/configuration/proxy.env" - log_success "代理已关闭" - break - ;; - *) - log_warning "无效选择,请输入 Y 或 N" - ;; - esac - done -fi - -log_info "请重新登录或运行: source ~/.zshrc" diff --git a/configuration/zsh/smart-install.sh b/configuration/zsh/smart-install.sh deleted file mode 100755 index 36d497a..0000000 --- a/configuration/zsh/smart-install.sh +++ /dev/null @@ -1,215 +0,0 @@ -#!/bin/bash - -# 智能安装脚本 - 自动检测网络环境并设置代理 -# 用法: curl -fsSL https://your-gitea.com/ben/mgmt/raw/branch/main/configuration/zsh/smart-install.sh | bash - -set -euo pipefail - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } -log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; } -log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; } -log_error() { echo -e "${RED}[ERROR]${NC} $1"; } - -# Gitea 仓库信息 -GITEA_URL="https://ben:8d7d70f324796be650b79415303c31f567bf459b@gitea.tailnet-68f9.ts.net/ben/mgmt.git" -MGMT_DIR="/root/mgmt" -PROXY_HOST="istoreos.tailnet-68f9.ts.net" -PROXY_PORT="1082" -PROXY_URL="http://${PROXY_HOST}:${PROXY_PORT}" - -# 检查 root 权限 -if [[ $EUID -ne 0 ]]; then - log_error "此脚本需要 root 权限运行" - exit 1 -fi - -# 询问用户是否使用代理 -ask_proxy_usage() { - echo "" - log_info "网络环境检测:" - echo " 检测到可能需要代理访问外网资源(如 GitHub)" - echo "" - log_info "是否使用代理进行安装?" - echo " Y - 使用代理安装(推荐,确保下载成功)" - echo " N - 直连安装(如果网络环境良好)" - echo "" - - while true; do - read -p "请选择 (Y/n): " choice - case $choice in - [Yy]|"") - log_info "选择使用代理安装" - return 0 - ;; - [Nn]) - log_info "选择直连安装" - return 1 - ;; - *) - log_warning "无效选择,请输入 Y 或 N" - ;; - esac - done -} - -# 测试代理连接 -test_proxy_connection() { - log_info "测试代理连接..." - if curl -s --connect-timeout 5 --proxy "$PROXY_URL" https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh >/dev/null 2>&1; then - log_success "代理连接正常" - return 0 - else - log_error "代理连接失败" - return 1 - fi -} - -# 测试直连 -test_direct_connection() { - log_info "测试直连..." - if curl -s --connect-timeout 5 https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh >/dev/null 2>&1; then - log_success "直连正常" - return 0 - else - log_error "直连失败" - return 1 - fi -} - -# 设置代理环境 -setup_proxy_env() { - log_info "设置代理环境..." - - export http_proxy="$PROXY_URL" - export https_proxy="$PROXY_URL" - export HTTP_PROXY="$PROXY_URL" - export HTTPS_PROXY="$PROXY_URL" - export no_proxy="localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net" - export NO_PROXY="localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net" - - log_success "代理环境已设置" -} - -# 克隆或更新仓库 -clone_repository() { - log_info "获取配置仓库..." - - if [[ -d "$MGMT_DIR" ]]; then - log_info "更新现有仓库..." - cd "$MGMT_DIR" - git pull origin main - else - log_info "克隆仓库..." - git clone "$GITEA_URL" "$MGMT_DIR" - cd "$MGMT_DIR" - fi -} - -# 创建代理配置文件 -create_proxy_config() { - log_info "创建代理配置文件..." 
- - cat > "$MGMT_DIR/configuration/proxy.env" << EOF -# Proxy Configuration for ${PROXY_HOST}:${PROXY_PORT} -export http_proxy=${PROXY_URL} -export https_proxy=${PROXY_URL} -export HTTP_PROXY=${PROXY_URL} -export HTTPS_PROXY=${PROXY_URL} -export no_proxy=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net -export NO_PROXY=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net -export ALL_PROXY=${PROXY_URL} -export all_proxy=${PROXY_URL} -export GIT_HTTP_PROXY=${PROXY_URL} -export GIT_HTTPS_PROXY=${PROXY_URL} -export CURL_PROXY=${PROXY_URL} -export WGET_PROXY=${PROXY_URL} -EOF - - log_success "代理配置文件已创建" -} - -# 询问用户是否保持代理 -ask_proxy_keep() { - if [[ -f "$MGMT_DIR/configuration/proxy.env" ]]; then - echo "" - log_info "安装完成!代理已临时开启用于安装。" - echo "" - log_info "是否保持代理开启?" - echo " Y - 保持代理开启(推荐,方便访问外网)" - echo " N - 关闭代理(如果不需要访问外网)" - echo "" - - while true; do - read -p "请选择 (Y/n): " choice - case $choice in - [Yy]|"") - log_success "代理保持开启" - log_info "使用 'proxy-status' 查看代理状态" - log_info "使用 'proxy-toggle' 切换代理状态" - break - ;; - [Nn]) - log_info "关闭代理..." - if [[ -f "$MGMT_DIR/scripts/utilities/proxy-toggle.sh" ]]; then - "$MGMT_DIR/scripts/utilities/proxy-toggle.sh" disable - else - rm -f "$MGMT_DIR/configuration/proxy.env" - log_success "代理已关闭" - fi - break - ;; - *) - log_warning "无效选择,请输入 Y 或 N" - ;; - esac - done - fi -} - -# 主安装流程 -main() { - log_info "开始智能安装 ZSH 配置..." - - # 询问用户是否使用代理 - if ask_proxy_usage; then - # 用户选择使用代理 - if test_proxy_connection; then - setup_proxy_env - create_proxy_config - log_success "代理环境已设置,开始安装..." - else - log_error "代理连接失败,无法继续安装" - exit 1 - fi - else - # 用户选择直连 - if test_direct_connection; then - log_success "直连正常,开始安装..." - else - log_error "直连失败,无法继续安装" - exit 1 - fi - fi - - # 克隆仓库 - clone_repository - - # 运行安装脚本 - log_info "运行 ZSH 配置安装脚本..." - chmod +x "$MGMT_DIR/configuration/zsh/install-zsh-config.sh" - "$MGMT_DIR/configuration/zsh/install-zsh-config.sh" - - log_success "智能安装完成!" - - # 如果使用了代理,询问是否保持 - ask_proxy_keep -} - -main "$@" diff --git a/configuration/zsh/test-plugins.sh b/configuration/zsh/test-plugins.sh deleted file mode 100755 index 3616357..0000000 --- a/configuration/zsh/test-plugins.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/bash - -# 测试 ZSH 插件是否正确安装 - -set -euo pipefail - -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -test_plugin() { - local plugin_name="$1" - local plugin_path="$2" - - if [[ -d "$plugin_path" ]]; then - log_success "✓ $plugin_name 已安装" - return 0 - else - log_error "✗ $plugin_name 未安装: $plugin_path" - return 1 - fi -} - -test_alias() { - local alias_name="$1" - - if alias "$alias_name" &>/dev/null; then - log_success "✓ 别名 $alias_name 已加载" - return 0 - else - log_warning "✗ 别名 $alias_name 未加载" - return 1 - fi -} - -main() { - log_info "测试 ZSH 插件和配置..." - echo "" - - local failed=0 - - # 测试 oh-my-zsh 安装 - log_info "检查 oh-my-zsh 安装..." - if [[ -d "$HOME/.oh-my-zsh" ]]; then - log_success "✓ oh-my-zsh 已安装" - else - log_error "✗ oh-my-zsh 未安装" - ((failed++)) - fi - echo "" - - # 测试自定义插件 - log_info "检查自定义插件..." 
- test_plugin "zsh-autosuggestions" "$HOME/.oh-my-zsh/custom/plugins/zsh-autosuggestions" || ((failed++)) - test_plugin "zsh-syntax-highlighting" "$HOME/.oh-my-zsh/custom/plugins/zsh-syntax-highlighting" || ((failed++)) - test_plugin "zsh-completions" "$HOME/.oh-my-zsh/custom/plugins/zsh-completions" || ((failed++)) - echo "" - - # 测试内置插件 - log_info "检查内置插件..." - test_plugin "git" "$HOME/.oh-my-zsh/plugins/git" || ((failed++)) - test_plugin "docker" "$HOME/.oh-my-zsh/plugins/docker" || ((failed++)) - test_plugin "ansible" "$HOME/.oh-my-zsh/plugins/ansible" || ((failed++)) - test_plugin "terraform" "$HOME/.oh-my-zsh/plugins/terraform" || ((failed++)) - test_plugin "kubectl" "$HOME/.oh-my-zsh/plugins/kubectl" || ((failed++)) - echo "" - - # 测试自定义别名文件 - log_info "检查自定义别名..." - if [[ -f "$HOME/.oh-my-zsh/custom/aliases.zsh" ]]; then - log_success "✓ 自定义别名文件已安装" - else - log_warning "✗ 自定义别名文件未安装" - fi - echo "" - - # 测试一些关键别名 - log_info "检查关键别名..." - test_alias "mgmt" || ((failed++)) - test_alias "dps" || ((failed++)) - test_alias "k" || ((failed++)) - test_alias "gs" || ((failed++)) - echo "" - - # 测试 .zshrc 文件 - log_info "检查 .zshrc 配置..." - if [[ -f "$HOME/.zshrc" ]]; then - log_success "✓ .zshrc 文件存在" - - # 检查关键配置 - if grep -q "zsh-autosuggestions" "$HOME/.zshrc"; then - log_success "✓ zsh-autosuggestions 已配置" - else - log_warning "✗ zsh-autosuggestions 未配置" - fi - - if grep -q "zsh-syntax-highlighting" "$HOME/.zshrc"; then - log_success "✓ zsh-syntax-highlighting 已配置" - else - log_warning "✗ zsh-syntax-highlighting 未配置" - fi - - if grep -q "agnoster" "$HOME/.zshrc"; then - log_success "✓ agnoster 主题已配置" - else - log_warning "✗ agnoster 主题未配置" - fi - else - log_error "✗ .zshrc 文件不存在" - ((failed++)) - fi - echo "" - - # 总结 - if [[ $failed -eq 0 ]]; then - log_success "🎉 所有测试通过!ZSH 配置完整。" - echo "" - log_info "使用方法:" - echo " - 重新登录或运行: source ~/.zshrc" - echo " - 测试自动建议: 输入 'docker' 然后按 → 键" - echo " - 测试别名: 运行 'mgmt-status' 或 'dps'" - else - log_error "❌ 发现 $failed 个问题,请检查安装。" - echo "" - log_info "修复建议:" - echo " 1. 重新运行安装脚本" - echo " 2. 检查网络连接" - echo " 3. 手动安装缺失的插件" - fi -} - -main "$@" diff --git a/configuration/zsh/zshrc.template b/configuration/zsh/zshrc.template deleted file mode 100644 index 2b3307a..0000000 --- a/configuration/zsh/zshrc.template +++ /dev/null @@ -1,260 +0,0 @@ -# If you come from bash you might have to change your $PATH. -# export PATH=$HOME/bin:$HOME/.local/bin:/usr/local/bin:$PATH - -# Path to your Oh My Zsh installation. -export ZSH="$HOME/.oh-my-zsh" - -# Set name of the theme to load --- if set to "random", it will -# load a random theme each time Oh My Zsh is loaded, in which case, -# to know which specific one was loaded, run: echo $RANDOM_THEME -# See https://github.com/ohmyzsh/ohmyzsh/wiki/Themes -ZSH_THEME="agnoster" - -# Set list of themes to pick from when loading at random -# Setting this variable when ZSH_THEME=random will cause zsh to load -# a theme from this variable instead of looking in $ZSH/themes/ -# If set to an empty array, this variable will have no effect. -# ZSH_THEME_RANDOM_CANDIDATES=( "robbyrussell" "agnoster" ) - -# Uncomment the following line to use case-sensitive completion. -# CASE_SENSITIVE="true" - -# Uncomment the following line to use hyphen-insensitive completion. -# Case-sensitive completion must be off. _ and - will be interchangeable. 
-# HYPHEN_INSENSITIVE="true" - -# Uncomment one of the following lines to change the auto-update behavior -# zstyle ':omz:update' mode disabled # disable automatic updates -# zstyle ':omz:update' mode auto # update automatically without asking -zstyle ':omz:update' mode reminder # just remind me to update when it's time - -# Uncomment the following line to change how often to auto-update (in days). -# zstyle ':omz:update' frequency 13 - -# Uncomment the following line if pasting URLs and other text is messed up. -# DISABLE_MAGIC_FUNCTIONS="true" - -# Uncomment the following line to disable colors in ls. -# DISABLE_LS_COLORS="true" - -# Uncomment the following line to disable auto-setting terminal title. -# DISABLE_AUTO_TITLE="true" - -# Uncomment the following line to enable command auto-correction. -# ENABLE_CORRECTION="true" - -# Uncomment the following line to display red dots whilst waiting for completion. -# You can also set it to another string to have that shown instead of the default red dots. -# e.g. COMPLETION_WAITING_DOTS="%F{yellow}waiting...%f" -# Caution: this setting can cause issues with multiline prompts in zsh < 5.7.1 (see #5765) -# COMPLETION_WAITING_DOTS="true" - -# Uncomment the following line if you want to disable marking untracked files -# under VCS as dirty. This makes repository status check for large repositories -# much, much faster. -# DISABLE_UNTRACKED_FILES_DIRTY="true" - -# Uncomment the following line if you want to change the command execution time -# stamp shown in the history command output. -# You can set one of the optional three formats: -# "mm/dd/yyyy"|"dd.mm.yyyy"|"yyyy-mm-dd" -# or set a custom format using the strftime function format specifications, -# see 'man strftime' for details. -# HIST_STAMPS="mm/dd/yyyy" - -# Would you like to use another custom folder than $ZSH/custom? -# ZSH_CUSTOM=/path/to/new-custom-folder - -# Which plugins would you like to load? -# Standard plugins can be found in $ZSH/plugins/ -# Custom plugins may be added to $ZSH_CUSTOM/plugins/ -# Example format: plugins=(rails git textmate ruby lighthouse) -# Add wisely, as too many plugins slow down shell startup. 
-plugins=( - git - docker - docker-compose - ansible - terraform - kubectl - helm - aws - gcloud - zsh-autosuggestions - zsh-syntax-highlighting - zsh-completions - colored-man-pages - command-not-found - extract - history-substring-search - sudo - systemd - tmux - vscode - web-search - z -) - -source $ZSH/oh-my-zsh.sh - -# User configuration - -# export MANPATH="/usr/local/man:$MANPATH" - -# You may need to manually set your language environment -# export LANG=en_US.UTF-8 - -# Preferred editor for local and remote sessions -if [[ -n $SSH_CONNECTION ]]; then - export EDITOR='vim' -else - export EDITOR='vim' -fi - -# Compilation flags -# export ARCHFLAGS="-arch $(uname -m)" - -# ============================================================================= -# CUSTOM CONFIGURATION FOR MANAGEMENT SYSTEM -# ============================================================================= - -# Load proxy configuration if exists -if [[ -f /root/mgmt/configuration/proxy.env ]]; then - source /root/mgmt/configuration/proxy.env -fi - -# Project management aliases -alias mgmt='cd /root/mgmt' -alias mgmt-status='cd /root/mgmt && ./mgmt.sh status' -alias mgmt-deploy='cd /root/mgmt && ./mgmt.sh deploy' -alias mgmt-cleanup='cd /root/mgmt && ./mgmt.sh cleanup' - -# Ansible aliases -alias ansible-check='cd /root/mgmt/configuration && ansible-playbook --syntax-check' -alias ansible-deploy='cd /root/mgmt/configuration && ansible-playbook -i inventories/production/inventory.ini' -alias ansible-ping='cd /root/mgmt/configuration && ansible -i inventories/production/inventory.ini all -m ping' - -# OpenTofu/Terraform aliases -alias tofu-init='cd /root/mgmt/tofu/environments/dev && tofu init' -alias tofu-plan='cd /root/mgmt/tofu/environments/dev && tofu plan -var-file="terraform.tfvars"' -alias tofu-apply='cd /root/mgmt/tofu/environments/dev && tofu apply -var-file="terraform.tfvars"' -alias tofu-destroy='cd /root/mgmt/tofu/environments/dev && tofu destroy -var-file="terraform.tfvars"' -alias tofu-output='cd /root/mgmt/tofu/environments/dev && tofu output' - -# Docker aliases -alias d='docker' -alias dc='docker-compose' -alias dps='docker ps' -alias dpsa='docker ps -a' -alias di='docker images' -alias dex='docker exec -it' -alias dlog='docker logs -f' -alias dstop='docker stop' -alias dstart='docker start' -alias drm='docker rm' -alias drmi='docker rmi' -alias dclean='docker system prune -f' - -# Docker Swarm aliases -alias dswarm='docker swarm' -alias dstack='docker stack' -alias dservice='docker service' -alias dnode='docker node' -alias dnetwork='docker network' -alias dsecret='docker secret' -alias dconfig='docker config' - -# Kubernetes aliases -alias k='kubectl' -alias kgp='kubectl get pods' -alias kgs='kubectl get services' -alias kgd='kubectl get deployments' -alias kgn='kubectl get nodes' -alias kdp='kubectl describe pod' -alias kds='kubectl describe service' -alias kdd='kubectl describe deployment' -alias kaf='kubectl apply -f' -alias kdf='kubectl delete -f' -alias kl='kubectl logs -f' - -# Git aliases -alias gs='git status' -alias ga='git add' -alias gc='git commit' -alias gp='git push' -alias gl='git pull' -alias gd='git diff' -alias gb='git branch' -alias gco='git checkout' -alias gcom='git checkout main' -alias gcod='git checkout develop' -alias gst='git stash' -alias gstp='git stash pop' - -# System aliases -alias ll='ls -alF' -alias la='ls -A' -alias l='ls -CF' -alias ..='cd ..' -alias ...='cd ../..' -alias ....='cd ../../..' 
-alias grep='grep --color=auto' -alias fgrep='fgrep --color=auto' -alias egrep='egrep --color=auto' - -# Network aliases -alias ports='netstat -tuln' -alias myip='curl -s https://httpbin.org/ip | jq -r .origin' -alias speedtest='curl -s https://raw.githubusercontent.com/sivel/speedtest-cli/master/speedtest.py | python3' - -# Process aliases -alias psg='ps aux | grep' -alias top='htop' - -# File operations -alias cp='cp -i' -alias mv='mv -i' -alias rm='rm -i' -alias mkdir='mkdir -pv' - -# History configuration -HISTSIZE=10000 -SAVEHIST=10000 -HISTFILE=~/.zsh_history -setopt HIST_VERIFY -setopt SHARE_HISTORY -setopt APPEND_HISTORY -setopt INC_APPEND_HISTORY -setopt HIST_IGNORE_DUPS -setopt HIST_IGNORE_ALL_DUPS -setopt HIST_REDUCE_BLANKS -setopt HIST_IGNORE_SPACE - -# Auto-completion configuration -autoload -U compinit && compinit -zstyle ':completion:*' matcher-list 'm:{a-zA-Z}={A-Za-z}' -zstyle ':completion:*' list-colors "${(s.:.)LS_COLORS}" -zstyle ':completion:*' menu select - -# Key bindings -bindkey '^[[A' history-substring-search-up -bindkey '^[[B' history-substring-search-down -bindkey '^[[1;5C' forward-word -bindkey '^[[1;5D' backward-word - -# Auto-suggestions configuration -ZSH_AUTOSUGGEST_HIGHLIGHT_STYLE='fg=8' -ZSH_AUTOSUGGEST_STRATEGY=(history completion) - -# Syntax highlighting configuration -ZSH_HIGHLIGHT_HIGHLIGHTERS=(main brackets pattern cursor) - -# Welcome message -echo "🚀 Management System Shell Ready!" -echo "📁 Project: /root/mgmt" -echo "🔧 Available commands: mgmt-status, mgmt-deploy, mgmt-cleanup" -echo "🐳 Docker: d, dc, dps, dex, dlog" -echo "☸️ Kubernetes: k, kgp, kgs, kaf, kdf" -echo "🏗️ OpenTofu: tofu-init, tofu-plan, tofu-apply" -echo "⚙️ Ansible: ansible-check, ansible-deploy, ansible-ping" -echo "" \ No newline at end of file diff --git a/docs/consul-cluster-troubleshooting.md b/docs/consul-cluster-troubleshooting.md new file mode 100644 index 0000000..8ba809f --- /dev/null +++ b/docs/consul-cluster-troubleshooting.md @@ -0,0 +1,147 @@ +# Consul 集群故障排除指南 + +## 问题诊断 + +### 发现的问题 +1. **DNS 解析失败**:服务间无法通过服务名相互发现 +2. **网络连通性问题**:`ash3c` 节点网络配置异常(地址显示为 0.0.0.0) +3. **跨节点通信失败**:`no route to host` 错误 +4. **集群无法形成**:持续的 "No cluster leader" 错误 + +### 根本原因 +- Docker Swarm overlay 网络在跨节点环境中的服务发现机制存在问题 +- `ash3c` 节点的网络配置可能有问题 +- 防火墙或网络策略可能阻止了 Consul 集群通信端口 + +## 解决方案 + +### 方案 1:单节点 Consul(临时解决方案) +**文件**: `swarm/stacks/consul-single-node.yml` +**优点**: 简单、可靠、立即可用 +**缺点**: 没有高可用性 + +```bash +docker stack deploy -c swarm/stacks/consul-single-node.yml consul +``` + +### 方案 2:使用主机网络的集群配置 +**文件**: `swarm/stacks/consul-cluster-host-network.yml` +**优点**: 绕过 overlay 网络问题 +**缺点**: 需要手动配置 IP 地址 + +### 方案 3:修复后的 overlay 网络配置 +**文件**: `swarm/stacks/consul-cluster-fixed.yml` +**优点**: 使用 Docker 原生网络 +**缺点**: 需要解决底层网络问题 + +### 方案 4:macvlan 网络配置 +**文件**: `swarm/stacks/consul-cluster-macvlan.yml` +**优点**: 直接使用物理网络 +**缺点**: 需要网络管理员权限和配置 + +## 网络诊断步骤 + +### 1. 检查节点状态 +```bash +docker node ls +docker node inspect --format '{{.Status.Addr}}' +``` + +### 2. 检查网络连通性 +```bash +# 在 master 节点上测试到 ash3c 的连通性 +ping +telnet 8301 +``` + +### 3. 检查防火墙设置 +```bash +# 确保以下端口开放 +# 8300: Consul server RPC +# 8301: Consul Serf LAN +# 8302: Consul Serf WAN +# 8500: Consul HTTP API +# 8600: Consul DNS +``` + +### 4. 检查 Docker Swarm 网络 +```bash +docker network ls +docker network inspect +``` + +## 推荐的修复流程 + +### 立即解决方案(单节点) +1. 部署单节点 Consul 以恢复服务 +2. 验证基本功能正常 + +### 长期解决方案(集群) +1. 修复 `ash3c` 节点的网络配置 +2. 确保节点间网络连通性 +3. 配置防火墙规则 +4. 
重新部署集群配置 + +## 验证步骤 + +### 单节点验证 +```bash +# 检查服务状态 +docker service ls | grep consul + +# 检查日志 +docker service logs consul_consul + +# 访问 Web UI +curl http://localhost:8500/v1/status/leader +``` + +### 集群验证 +```bash +# 检查集群成员 +docker exec consul members + +# 检查领导者 +docker exec consul operator raft list-peers +``` + +## 常见问题 + +### Q: 为什么服务发现不工作? +A: Docker Swarm 的 overlay 网络在某些配置下可能存在 DNS 解析问题,特别是跨节点通信时。 + +### Q: 如何选择合适的网络方案? +A: +- 开发/测试环境:使用单节点或 overlay 网络 +- 生产环境:推荐使用 macvlan 或主机网络以获得更好的性能和可靠性 + +### Q: 集群恢复后数据会丢失吗? +A: 如果使用了持久化卷,数据不会丢失。但建议在修复前备份重要数据。 + +## 监控和维护 + +### 健康检查 +```bash +# 定期检查集群状态 +consul members +consul operator raft list-peers +``` + +### 日志监控 +```bash +# 监控关键错误 +docker service logs consul_consul | grep -E "(ERROR|WARN)" +``` + +### 性能监控 +- 监控 Consul 的 HTTP API 响应时间 +- 检查集群同步延迟 +- 监控网络连接数 + +## 联系支持 + +如果问题持续存在,请提供以下信息: +1. Docker 版本和 Swarm 配置 +2. 网络拓扑图 +3. 完整的服务日志 +4. 节点间网络测试结果 \ No newline at end of file diff --git a/docs/setup/zsh-configuration.md b/docs/setup/zsh-configuration.md new file mode 100644 index 0000000..73ec6df --- /dev/null +++ b/docs/setup/zsh-configuration.md @@ -0,0 +1,240 @@ +# ZSH 配置总结 + +## 已安装和配置的组件 + +### 1. 基础组件 +- ✅ **oh-my-zsh**: 已安装并配置 +- ✅ **zsh**: 版本 5.9 +- ✅ **Powerline 字体**: 已安装支持 +- ✅ **tmux**: 已安装 + +### 2. 核心插件 +- ✅ **git**: Git 集成和别名 +- ✅ **docker**: Docker 命令补全和别名 +- ✅ **docker-compose**: Docker Compose 支持 +- ✅ **ansible**: Ansible 命令补全 +- ✅ **terraform**: Terraform/OpenTofu 支持 +- ✅ **kubectl**: Kubernetes 命令补全 +- ✅ **helm**: Helm 包管理器支持 +- ✅ **aws**: AWS CLI 支持 +- ✅ **gcloud**: Google Cloud CLI 支持 + +### 3. 增强插件 +- ✅ **zsh-autosuggestions**: 命令自动建议 +- ✅ **zsh-syntax-highlighting**: 语法高亮 +- ✅ **zsh-completions**: 增强补全功能 +- ✅ **colored-man-pages**: 彩色手册页 +- ✅ **command-not-found**: 命令未找到提示 +- ✅ **extract**: 解压文件支持 +- ✅ **history-substring-search**: 历史搜索 +- ✅ **sudo**: sudo 支持 +- ✅ **systemd**: systemd 服务管理 +- ✅ **tmux**: tmux 集成 +- ✅ **vscode**: VS Code 集成 +- ✅ **web-search**: 网络搜索 +- ✅ **z**: 智能目录跳转 + +### 4. 主题 +- ✅ **agnoster**: 功能丰富的主题,支持 Git 状态显示 + +## 自定义别名 + +### 项目管理别名 +```bash +mgmt # 进入管理项目目录 +mgmt-status # 显示项目状态 +mgmt-deploy # 快速部署 +mgmt-cleanup # 清理环境 +mgmt-swarm # Swarm 管理 +mgmt-tofu # OpenTofu 管理 +``` + +### Ansible 别名 +```bash +ansible-check # 语法检查 +ansible-deploy # 部署 +ansible-ping # 连通性测试 +ansible-vault # 密码管理 +ansible-galaxy # 角色管理 +``` + +### OpenTofu/Terraform 别名 +```bash +tofu-init # 初始化 +tofu-plan # 计划 +tofu-apply # 应用 +tofu-destroy # 销毁 +tofu-output # 输出 +tofu-validate # 验证 +tofu-fmt # 格式化 +``` + +### Docker 别名 +```bash +d # docker +dc # docker-compose +dps # docker ps +dpsa # docker ps -a +di # docker images +dex # docker exec -it +dlog # docker logs -f +dclean # docker system prune -f +``` + +### Docker Swarm 别名 +```bash +dswarm # docker swarm +dstack # docker stack +dservice # docker service +dnode # docker node +dnetwork # docker network +dsecret # docker secret +dconfig # docker config +``` + +### Kubernetes 别名 +```bash +k # kubectl +kgp # kubectl get pods +kgs # kubectl get services +kgd # kubectl get deployments +kgn # kubectl get nodes +kaf # kubectl apply -f +kdf # kubectl delete -f +kl # kubectl logs -f +``` + +### Git 别名 +```bash +gs # git status +ga # git add +gc # git commit +gp # git push +gl # git pull +gd # git diff +gb # git branch +gco # git checkout +``` + +### 系统别名 +```bash +ll # ls -alF +la # ls -A +l # ls -CF +.. # cd .. +... # cd ../.. +.... # cd ../../.. 
+grep # grep --color=auto
+ports # netstat -tuln
+myip # 获取公网IP
+speedtest # 网速测试
+psg # ps aux | grep
+top # htop
+```
+
+## 配置文件位置
+
+- **主配置**: `~/.zshrc`
+- **自定义别名**: `~/.oh-my-zsh/custom/aliases.zsh`
+- **代理配置**: `/root/mgmt/configuration/proxy.env`
+
+## 使用方法
+
+### 启动 ZSH
+```bash
+zsh
+```
+
+### 重新加载配置
+```bash
+source ~/.zshrc
+```
+
+### 查看所有别名
+```bash
+alias
+```
+
+### 查看特定别名
+```bash
+alias | grep docker
+alias | grep mgmt
+```
+
+## 功能特性
+
+### 1. 自动建议
+- 输入命令时会显示历史命令建议
+- 使用 `→` 键接受建议
+
+### 2. 语法高亮
+- 命令输入时实时语法高亮
+- 错误命令显示为红色
+
+### 3. 智能补全
+- 支持所有已安装工具的补全
+- 支持文件路径补全
+- 支持命令参数补全
+
+### 4. 历史搜索
+- 使用 `↑` `↓` 键搜索历史命令
+- 支持部分匹配搜索
+
+### 5. 目录跳转
+- 使用 `z` 命令智能跳转到常用目录
+- 基于访问频率和最近访问时间
+
+### 6. 代理支持
+- 自动加载代理配置
+- 支持 HTTP/HTTPS 代理
+
+## 故障排除
+
+### 如果别名不工作
+```bash
+# 检查别名是否加载(将 <别名> 替换为要查找的别名)
+alias | grep <别名>
+
+# 重新加载配置
+source ~/.zshrc
+```
+
+### 如果插件不工作
+```bash
+# 检查插件是否安装(将 <插件名> 替换为要查找的插件)
+ls ~/.oh-my-zsh/plugins/ | grep <插件名>
+
+# 检查自定义插件
+ls ~/.oh-my-zsh/custom/plugins/
+```
+
+### 如果主题显示异常
+```bash
+# 检查字体是否安装
+fc-list | grep Powerline
+
+# 尝试其他主题
+# 编辑 ~/.zshrc 中的 ZSH_THEME
+```
+
+## 扩展建议
+
+### 可以添加的额外插件
+- **fzf**: 模糊查找
+- **bat**: 更好的 cat 命令
+- **exa**: 更好的 ls 命令
+- **ripgrep**: 更快的 grep
+- **fd**: 更快的 find
+
+### 可以添加的额外别名
+- 根据个人使用习惯添加更多别名
+- 为常用命令组合创建别名
+- 为项目特定命令创建别名
+
+## 性能优化
+
+- 已配置的插件数量适中,不会显著影响启动速度
+- 使用 `zsh-completions` 提供更好的补全性能
+- 历史记录配置优化,避免内存占用过大
+
+配置完成!现在您拥有了一个功能强大、高度定制的 ZSH 环境,专门为管理系统的需求进行了优化。
diff --git a/scripts/deployment/configure-nomad-cluster.sh b/scripts/deployment/configure-nomad-cluster.sh
new file mode 100755
index 0000000..76fbbd1
--- /dev/null
+++ b/scripts/deployment/configure-nomad-cluster.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+
+set -e
+
+# 颜色定义
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# 日志函数
+log_info() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+log_success() {
+    echo -e "${GREEN}[SUCCESS]${NC} $1"
+}
+
+log_warning() {
+    echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# 检查必要的文件
+check_prerequisites() {
+    log_info "检查前置条件..."
+
+    if [ ! -f "configuration/inventories/production/nomad-cluster.ini" ]; then
+        log_error "找不到 Nomad 集群配置文件"
+        exit 1
+    fi
+
+    if [ ! -f "configuration/playbooks/applications/configure-nomad-cluster.yml" ]; then
+        log_error "找不到 Nomad 配置 playbook"
+        exit 1
+    fi
+
+    log_success "前置条件检查完成"
+}
+
+# 生成加密密钥
+generate_encrypt_key() {
+    log_info "生成 Nomad 加密密钥..."
+
+    if command -v nomad >/dev/null 2>&1; then
+        ENCRYPT_KEY=$(nomad operator gossip keyring generate)
+        log_success "生成加密密钥: $ENCRYPT_KEY"
+
+        # 更新配置文件中的加密密钥
+        sed -i "s|YOUR_NOMAD_ENCRYPT_KEY_HERE|$ENCRYPT_KEY|g" configuration/inventories/production/nomad-cluster.ini
+        log_success "已更新配置文件中的加密密钥"
+    else
+        log_warning "本地未安装 Nomad,将在远程节点生成密钥"
+    fi
+}
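+# 附注(补充示例,非原始脚本内容):当本地没有安装 nomad 时,也可以用
+# openssl 生成等价的 gossip 加密密钥 —— 它本质上就是 32 字节随机数的
+# base64 编码,与清单文件中 nomad_encrypt_key 的格式一致。
+# 函数名 generate_encrypt_key_fallback 为示意用的假设名称,脚本并未调用它。
+generate_encrypt_key_fallback() {
+    # 生成 32 字节随机数并进行 base64 编码
+    openssl rand -base64 32
+}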
+# 测试连接
+test_connectivity() {
+    log_info "测试目标主机连接性..."
+
+    # 将 ansible 放进 if 条件中判断:在 set -e 下,先执行命令再检查 $?
+    # 的写法永远走不到失败分支(命令一失败脚本就已退出)
+    if ansible -i configuration/inventories/production/nomad-cluster.ini nomad_cluster -m ping; then
+        log_success "所有主机连接正常"
+    else
+        log_error "部分主机连接失败,请检查网络和SSH配置"
+        exit 1
+    fi
+}
+
+# 配置 Nomad 集群
+configure_cluster() {
+    log_info "开始配置 Nomad 集群..."
+
+    if ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \
+        configuration/playbooks/applications/configure-nomad-cluster.yml \
+        -v; then
+        log_success "Nomad 集群配置完成"
+    else
+        log_error "Nomad 集群配置失败"
+        exit 1
+    fi
+}
+
+# 验证集群状态
+verify_cluster() {
+    log_info "验证集群状态..."
+
+    # 等待服务启动
+    sleep 10
+
+    log_info "检查 Nomad 服务状态..."
+    ansible -i configuration/inventories/production/nomad-cluster.ini nomad_servers \
+        -m shell -a "systemctl status nomad --no-pager"
+
+    # --limit 接受的是主机名模式而不是数量,这里限定到清单中的第一台服务器 master
+    log_info "检查集群成员..."
+    ansible -i configuration/inventories/production/nomad-cluster.ini nomad_servers \
+        -m shell -a "nomad server members" --limit master
+
+    log_info "检查节点状态..."
+    ansible -i configuration/inventories/production/nomad-cluster.ini nomad_servers \
+        -m shell -a "nomad node status" --limit master
+}
+
+# 主函数
+main() {
+    echo "🚀 开始配置 Nomad 集群..."
+    echo "=================================="
+
+    check_prerequisites
+    generate_encrypt_key
+    test_connectivity
+    configure_cluster
+    verify_cluster
+
+    echo "=================================="
+    log_success "Nomad 集群配置完成!"
+    echo ""
+    echo "访问 Nomad UI:"
+    echo "- Master: http://100.117.106.136:4646"
+    echo "- Semaphore: http://100.116.158.95:4646"
+    echo ""
+    echo "常用命令:"
+    echo "- 查看集群状态: nomad server members"
+    echo "- 查看节点状态: nomad node status"
+    echo "- 运行作业: nomad job run <作业文件>"
+}
+
+# 运行主函数
+main "$@"
\ No newline at end of file
diff --git a/scripts/deployment/deploy-consul-cluster.sh b/scripts/deployment/deploy-consul-cluster.sh
new file mode 100755
index 0000000..e6c7d25
--- /dev/null
+++ b/scripts/deployment/deploy-consul-cluster.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+# Consul 集群部署脚本
+# 使用 Ansible 在物理机上部署 Consul 集群
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+INVENTORY_FILE="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini"
+PLAYBOOK_FILE="$PROJECT_ROOT/configuration/playbooks/applications/consul-cluster.yml"
+
+echo "=== Consul 集群部署脚本 ==="
+echo "项目根目录: $PROJECT_ROOT"
+echo "清单文件: $INVENTORY_FILE"
+echo "Playbook: $PLAYBOOK_FILE"
+echo
+
+# 检查必要文件
+if [[ ! -f "$INVENTORY_FILE" ]]; then
+    echo "错误: 清单文件不存在: $INVENTORY_FILE"
+    exit 1
+fi
+
+if [[ ! -f "$PLAYBOOK_FILE" ]]; then
+    echo "错误: Playbook 文件不存在: $PLAYBOOK_FILE"
+    exit 1
+fi
+
+# 生成 Consul 加密密钥(如果需要)
+echo "1. 检查 Consul 加密密钥..."
+if grep -q "YOUR_BASE64_ENCRYPT_KEY_HERE" "$INVENTORY_FILE"; then
+    echo "需要生成 Consul 加密密钥..."
+
+    # 尝试使用已安装的 consul 生成密钥
+    if command -v consul &> /dev/null; then
+        ENCRYPT_KEY=$(consul keygen)
+        echo "生成的加密密钥: $ENCRYPT_KEY"
+
+        # 替换清单文件中的占位符
+        sed -i "s/YOUR_BASE64_ENCRYPT_KEY_HERE/$ENCRYPT_KEY/" "$INVENTORY_FILE"
+        echo "已更新清单文件中的加密密钥"
+    else
+        echo "警告: 未找到 consul 命令,请手动生成加密密钥并更新清单文件"
+        echo "可以使用以下命令生成: consul keygen"
+        echo "或者使用在线工具生成 32 字节的 base64 编码密钥"
+    fi
+fi
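+# 附注(补充示例,非原始脚本内容):docs/consul-cluster-troubleshooting.md 中
+# 列出了集群通信所需的端口(8300/8301/8302/8500/8600)。如果节点间出现
+# "no route to host",可以参考下面的草图放行端口 —— 此处假设防火墙为 ufw;
+# 函数仅作定义、不会被本脚本调用,请按实际环境调整后手动执行。
+open_consul_ports_sketch() {
+    local port
+    for port in 8300 8301 8302 8500 8600; do
+        ufw allow "$port"   # RPC/Serf LAN/Serf WAN/HTTP/DNS,详见故障排除文档
+    done
+}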
访问 Web UI:" + echo " - Master: http://master:8500" + echo " - Ash3c: http://ash3c:8500" + echo + echo "4. 检查集群领导者:" + echo " curl http://master:8500/v1/status/leader" + echo +else + echo "部署失败,请检查错误信息" + exit 1 +fi \ No newline at end of file diff --git a/scripts/deployment/deploy-consul-simple.sh b/scripts/deployment/deploy-consul-simple.sh new file mode 100755 index 0000000..b140cd3 --- /dev/null +++ b/scripts/deployment/deploy-consul-simple.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# Consul Cluster Simple Deployment Script +# 简化版 Consul 集群部署脚本 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查依赖 +check_dependencies() { + log_info "检查依赖项..." + + if ! command -v ansible-playbook &> /dev/null; then + log_error "ansible-playbook 未找到,请安装 Ansible" + exit 1 + fi + + if ! command -v python3 &> /dev/null; then + log_error "python3 未找到" + exit 1 + fi + + log_success "依赖检查完成" +} + +# 检查网络连接 +check_connectivity() { + log_info "检查目标主机连接性..." + + local inventory_file="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini" + + if [[ ! -f "$inventory_file" ]]; then + log_error "清单文件不存在: $inventory_file" + exit 1 + fi + + # 测试连接 + if ansible consul_cluster -i "$inventory_file" -m ping --one-line; then + log_success "所有主机连接正常" + else + log_warning "部分主机连接失败,但继续部署..." + fi +} + +# 部署 Consul 集群 +deploy_consul() { + log_info "开始部署 Consul 集群..." + + local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/consul-cluster-simple.yml" + local inventory_file="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini" + + if [[ ! -f "$playbook_file" ]]; then + log_error "Playbook 文件不存在: $playbook_file" + exit 1 + fi + + # 运行 Ansible playbook + if ansible-playbook -i "$inventory_file" "$playbook_file" -v; then + log_success "Consul 集群部署完成" + else + log_error "Consul 集群部署失败" + exit 1 + fi +} + +# 验证集群状态 +verify_cluster() { + log_info "验证 Consul 集群状态..." + + local inventory_file="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini" + + # 检查服务状态 + log_info "检查 Consul 服务状态..." + ansible consul_cluster -i "$inventory_file" -m shell -a "systemctl status consul --no-pager" || true + + # 检查集群成员 + log_info "检查集群成员..." + ansible consul_cluster -i "$inventory_file" -m shell -a "/usr/local/bin/consul members" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true + + # 检查领导者 + log_info "检查集群领导者..." + ansible consul_cluster -i "$inventory_file" -m shell -a "/usr/local/bin/consul operator raft list-peers" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true +} + +# 主函数 +main() { + log_info "开始 Consul 集群简化部署..." + + check_dependencies + check_connectivity + deploy_consul + verify_cluster + + log_success "Consul 集群部署流程完成!" + + echo "" + log_info "后续步骤:" + echo "1. 检查集群状态: consul members" + echo "2. 访问 Web UI: http://:8500" + echo "3. 
检查日志: journalctl -u consul -f" +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file diff --git a/scripts/deployment/deploy-nomad-cluster.sh b/scripts/deployment/deploy-nomad-cluster.sh new file mode 100755 index 0000000..2d4e852 --- /dev/null +++ b/scripts/deployment/deploy-nomad-cluster.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +# Nomad Cluster Deployment Script +# Nomad 集群部署脚本 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查依赖 +check_dependencies() { + log_info "检查依赖项..." + + if ! command -v ansible-playbook &> /dev/null; then + log_error "ansible-playbook 未找到,请安装 Ansible" + exit 1 + fi + + log_success "依赖检查完成" +} + +# 检查网络连接 +check_connectivity() { + log_info "检查目标主机连接性..." + + local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" + + if [[ ! -f "$inventory_file" ]]; then + log_error "清单文件不存在: $inventory_file" + exit 1 + fi + + # 测试连接 + if ansible nomad_cluster -i "$inventory_file" -m ping --one-line; then + log_success "所有主机连接正常" + else + log_warning "部分主机连接失败,但继续部署..." + fi +} + +# 部署 Nomad 集群 +deploy_nomad() { + log_info "开始部署 Nomad 集群..." + + local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/nomad-cluster.yml" + local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" + + if [[ ! -f "$playbook_file" ]]; then + log_error "Playbook 文件不存在: $playbook_file" + exit 1 + fi + + # 运行 Ansible playbook + if ansible-playbook -i "$inventory_file" "$playbook_file" -v; then + log_success "Nomad 集群部署完成" + else + log_error "Nomad 集群部署失败" + exit 1 + fi +} + +# 验证集群状态 +verify_cluster() { + log_info "验证 Nomad 集群状态..." + + local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" + + # 检查服务状态 + log_info "检查 Nomad 服务状态..." + ansible nomad_cluster -i "$inventory_file" -m shell -a "systemctl status nomad --no-pager" || true + + # 检查集群成员 + log_info "检查集群服务器..." + ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad server members" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true + + # 检查节点状态 + log_info "检查节点状态..." + ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad node status" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true + + # 显示集群信息 + log_info "集群信息..." + ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad status" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true +} + +# 显示访问信息 +show_access_info() { + log_info "Nomad 集群访问信息:" + echo "" + echo "Web UI 访问地址:" + echo " - http://10.0.0.232:4646" + echo " - http://10.0.0.179:4646" + echo "" + echo "API 访问地址:" + echo " - http://10.0.0.232:4646/v1/" + echo " - http://10.0.0.179:4646/v1/" + echo "" + echo "常用命令:" + echo " - 查看集群状态: nomad status" + echo " - 查看节点: nomad node status" + echo " - 查看服务器: nomad server members" + echo " - 提交作业: nomad job run " + echo "" +} + +# 主函数 +main() { + log_info "开始 Nomad 集群部署..." 
+ + check_dependencies + check_connectivity + deploy_nomad + verify_cluster + show_access_info + + log_success "Nomad 集群部署流程完成!" +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file diff --git a/scripts/deployment/deploy-nomad-local.sh b/scripts/deployment/deploy-nomad-local.sh new file mode 100755 index 0000000..fcdbf2c --- /dev/null +++ b/scripts/deployment/deploy-nomad-local.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# Nomad Local Deployment Script +# Nomad 本地部署脚本 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查依赖 +check_dependencies() { + log_info "检查依赖项..." + + if ! command -v ansible-playbook &> /dev/null; then + log_error "ansible-playbook 未找到,请安装 Ansible" + exit 1 + fi + + if ! command -v docker &> /dev/null; then + log_error "docker 未找到,请安装 Docker" + exit 1 + fi + + log_success "依赖检查完成" +} + +# 部署 Nomad +deploy_nomad() { + log_info "开始部署 Nomad (本地单节点)..." + + local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/nomad-local.yml" + + if [[ ! -f "$playbook_file" ]]; then + log_error "Playbook 文件不存在: $playbook_file" + exit 1 + fi + + # 运行 Ansible playbook + if ansible-playbook "$playbook_file" -v; then + log_success "Nomad 本地部署完成" + else + log_error "Nomad 本地部署失败" + exit 1 + fi +} + +# 验证部署 +verify_deployment() { + log_info "验证 Nomad 部署..." + + # 等待服务启动 + sleep 5 + + # 检查服务状态 + log_info "检查 Nomad 服务状态..." + systemctl status nomad --no-pager || true + + # 检查 Nomad 版本 + log_info "检查 Nomad 版本..." + /usr/local/bin/nomad version || true + + # 检查节点状态 + log_info "检查节点状态..." + /usr/local/bin/nomad node status || true + + # 检查服务器状态 + log_info "检查服务器状态..." + /usr/local/bin/nomad server members || true +} + +# 显示访问信息 +show_access_info() { + local current_ip=$(hostname -I | awk '{print $1}') + + log_info "Nomad 访问信息:" + echo "" + echo "Web UI 访问地址:" + echo " - http://localhost:4646" + echo " - http://${current_ip}:4646" + echo "" + echo "API 访问地址:" + echo " - http://localhost:4646/v1/" + echo " - http://${current_ip}:4646/v1/" + echo "" + echo "常用命令:" + echo " - 查看集群状态: nomad status" + echo " - 查看节点: nomad node status" + echo " - 查看服务器: nomad server members" + echo " - 提交作业: nomad job run " + echo "" + echo "示例作业文件位置:" + echo " - $PROJECT_ROOT/examples/nomad-jobs/" + echo "" +} + +# 主函数 +main() { + log_info "开始 Nomad 本地部署..." + + check_dependencies + deploy_nomad + verify_deployment + show_access_info + + log_success "Nomad 本地部署流程完成!" +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file diff --git a/scripts/deployment/install-nomad-cluster.sh b/scripts/deployment/install-nomad-cluster.sh new file mode 100755 index 0000000..aa5aadc --- /dev/null +++ b/scripts/deployment/install-nomad-cluster.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# Install Nomad Cluster via APT +# 通过 APT 安装 Nomad 集群 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查依赖 +check_dependencies() { + log_info "检查依赖项..." + + if ! command -v ansible-playbook &> /dev/null; then + log_error "ansible-playbook 未找到,请安装 Ansible" + exit 1 + fi + + log_success "依赖检查完成" +} + +# 检查网络连接 +check_connectivity() { + log_info "检查目标主机连接性..." + + local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" + + if [[ ! -f "$inventory_file" ]]; then + log_error "清单文件不存在: $inventory_file" + exit 1 + fi + + # 测试连接 + if ansible nomad_servers -i "$inventory_file" -m ping --one-line; then + log_success "所有主机连接正常" + else + log_warning "部分主机连接失败,但继续安装..." + fi +} + +# 安装 Nomad +install_nomad() { + log_info "开始在远程主机安装 Nomad..." + + local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/install-nomad-apt.yml" + local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" + + if [[ ! -f "$playbook_file" ]]; then + log_error "Playbook 文件不存在: $playbook_file" + exit 1 + fi + + # 运行 Ansible playbook + if ansible-playbook -i "$inventory_file" "$playbook_file" -v; then + log_success "Nomad 集群安装完成" + else + log_error "Nomad 集群安装失败" + exit 1 + fi +} + +# 验证安装 +verify_installation() { + log_info "验证 Nomad 安装..." + + local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini" + + # 检查服务状态 + log_info "检查 Nomad 服务状态..." + ansible nomad_servers -i "$inventory_file" -m shell -a "systemctl status nomad --no-pager" || true + + # 检查 Nomad 版本 + log_info "检查 Nomad 版本..." + ansible nomad_servers -i "$inventory_file" -m shell -a "nomad version" || true + + # 检查集群成员 + log_info "检查集群服务器..." + ansible nomad_servers -i "$inventory_file" -m shell -a "nomad server members" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true + + # 检查节点状态 + log_info "检查节点状态..." + ansible nomad_servers -i "$inventory_file" -m shell -a "nomad node status" -l "$(head -n1 < <(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1))" || true +} + +# 显示访问信息 +show_access_info() { + log_info "Nomad 集群访问信息:" + echo "" + echo "Web UI 访问地址:" + echo " - http://100.117.106.136:4646 (master)" + echo " - http://100.116.158.95:4646 (semaphore)" + echo "" + echo "API 访问地址:" + echo " - http://100.117.106.136:4646/v1/ (master)" + echo " - http://100.116.158.95:4646/v1/ (semaphore)" + echo "" + echo "常用命令:" + echo " - 查看集群状态: nomad status" + echo " - 查看节点: nomad node status" + echo " - 查看服务器: nomad server members" + echo " - 提交作业: nomad job run " + echo "" + echo "示例作业文件位置:" + echo " - $PROJECT_ROOT/examples/nomad-jobs/" + echo "" +} + +# 主函数 +main() { + log_info "开始 Nomad 集群安装..." + + check_dependencies + check_connectivity + install_nomad + verify_installation + show_access_info + + log_success "Nomad 集群安装流程完成!" 
+} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file diff --git a/scripts/utilities/NUCLEAR-NOMAD-RESET.yml b/scripts/utilities/NUCLEAR-NOMAD-RESET.yml new file mode 100644 index 0000000..f080662 --- /dev/null +++ b/scripts/utilities/NUCLEAR-NOMAD-RESET.yml @@ -0,0 +1,375 @@ +--- +# ☢️ NUCLEAR NOMAD RESET ☢️ +# 这是比终极还要强的修复脚本 +# 警告:这将完全摧毁并重建 Nomad 集群 +- name: "☢️ NUCLEAR NOMAD RESET - 核弹级集群重置 ☢️" + hosts: nomad_cluster + become: yes + gather_facts: yes + serial: 1 # 一次处理一个节点,避免同时炸掉所有节点 + vars: + nomad_version: "1.10.5" + nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + tailscale_ips: + semaphore: "100.116.158.95" + master: "100.117.106.136" + ash3c: "100.116.80.94" + + tasks: + - name: "🚨 警告:即将进行核弹级重置" + debug: + msg: | + ☢️☢️☢️ 警告:即将对 {{ inventory_hostname }} 进行核弹级重置 ☢️☢️☢️ + 这将完全摧毁所有 Nomad 相关的数据、配置和进程! + 如果你不确定,请立即按 Ctrl+C 取消! + + - name: "⏰ 等待 10 秒,给你最后的机会取消..." + pause: + seconds: 10 + + # ========== 第一阶段:核弹级清理 ========== + - name: "💀 第一阶段:核弹级进程清理" + debug: + msg: "开始核弹级进程清理..." + + - name: "🔥 停止 Nomad 服务(如果存在)" + systemd: + name: nomad + state: stopped + enabled: no + daemon_reload: yes + ignore_errors: yes + + - name: "💣 强制杀死所有 Nomad 相关进程" + shell: | + # 杀死所有 nomad 进程 + pkill -9 -f nomad || true + # 杀死所有可能的子进程 + pkill -9 -f "nomad agent" || true + pkill -9 -f "nomad server" || true + pkill -9 -f "nomad client" || true + # 等待进程完全死亡 + sleep 5 + # 再次确认杀死 + ps aux | grep nomad | grep -v grep | awk '{print $2}' | xargs -r kill -9 || true + ignore_errors: yes + + - name: "🧹 清理所有 Nomad 相关文件和目录" + file: + path: "{{ item }}" + state: absent + loop: + - /opt/nomad + - /etc/nomad.d + - /var/log/nomad + - /etc/systemd/system/nomad.service + - /usr/local/bin/nomad + - /usr/bin/nomad + - /tmp/nomad* + - /var/lib/nomad + - /run/nomad + - /var/run/nomad.pid + ignore_errors: yes + + - name: "🔧 清理 systemd 缓存" + systemd: + daemon_reload: yes + + # ========== 第二阶段:重新安装 Nomad ========== + - name: "🚀 第二阶段:重新安装 Nomad" + debug: + msg: "开始重新安装 Nomad..." + + - name: "🔑 添加 HashiCorp GPG 密钥" + apt_key: + url: https://apt.releases.hashicorp.com/gpg + state: present + + - name: "📦 添加 HashiCorp APT 仓库" + apt_repository: + repo: "deb [arch={{ ansible_architecture }}] https://apt.releases.hashicorp.com {{ ansible_distribution_release }} main" + state: present + update_cache: yes + + - name: "🔧 安装 Nomad(自动检测架构)" + apt: + name: "nomad={{ nomad_version }}-1" + state: present + update_cache: yes + + - name: "👤 创建 nomad 用户和组" + group: + name: nomad + state: present + + - name: "👤 创建 nomad 用户" + user: + name: nomad + group: nomad + system: yes + shell: /bin/false + home: /opt/nomad + create_home: no + + - name: "📁 创建全新的目录结构" + file: + path: "{{ item.path }}" + state: directory + owner: "{{ item.owner | default('nomad') }}" + group: "{{ item.group | default('nomad') }}" + mode: "{{ item.mode | default('0755') }}" + loop: + - { path: "/etc/nomad.d", mode: "0755" } + - { path: "/opt/nomad", mode: "0755" } + - { path: "/opt/nomad/data", mode: "0755" } + - { path: "/opt/nomad/alloc_mounts", mode: "0755" } + - { path: "/var/log/nomad", mode: "0755" } + + # ========== 第三阶段:网络和防火墙检查 ========== + - name: "🌐 第三阶段:网络配置验证" + debug: + msg: "验证网络配置..." 
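The check that follows greps `ip addr` output for the expected address. On hosts where the `tailscale` CLI is installed, the interface IP can be read directly, which avoids false positives if the address happens to appear on another interface. A sketch of the equivalent manual check (the IP shown is ash3c's, taken from `tailscale_ips` above):

# Compare the node's actual Tailscale IPv4 address with the expected one.
expected_ip="100.116.80.94"            # e.g. ash3c, from tailscale_ips
actual_ip=$(tailscale ip -4 2>/dev/null | head -n1)
if [[ "$actual_ip" == "$expected_ip" ]]; then
  echo "Tailscale IP OK: $actual_ip"
else
  echo "Tailscale IP mismatch: expected $expected_ip, got ${actual_ip:-none}" >&2
fi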
+ + - name: "🔍 检查 Tailscale IP 是否正确绑定" + shell: | + ip addr show | grep "{{ tailscale_ips[inventory_hostname] }}" || echo "IP_NOT_FOUND" + register: ip_check + + - name: "⚠️ IP 地址检查结果" + debug: + msg: | + 节点: {{ inventory_hostname }} + 期望 IP: {{ tailscale_ips[inventory_hostname] }} + 检查结果: {{ ip_check.stdout }} + {% if 'IP_NOT_FOUND' in ip_check.stdout %} + ❌ 警告:IP 地址未正确绑定! + {% else %} + ✅ IP 地址检查通过 + {% endif %} + + - name: "🔥 确保防火墙端口开放" + shell: | + # 检查并开放 Nomad 端口 + if command -v ufw >/dev/null 2>&1; then + ufw allow 4646/tcp # HTTP API + ufw allow 4647/tcp # RPC + ufw allow 4648/tcp # Serf + elif command -v firewall-cmd >/dev/null 2>&1; then + firewall-cmd --permanent --add-port=4646/tcp + firewall-cmd --permanent --add-port=4647/tcp + firewall-cmd --permanent --add-port=4648/tcp + firewall-cmd --reload + fi + ignore_errors: yes + + # ========== 第四阶段:创建超强配置 ========== + - name: "⚙️ 第四阶段:创建超强配置文件" + debug: + msg: "创建超强配置文件..." + + - name: "📝 创建核弹级 Nomad 配置" + copy: + content: | + # ☢️ 核弹级 Nomad 配置 - {{ inventory_hostname }} + datacenter = "dc1" + region = "global" + data_dir = "/opt/nomad/data" + + # 使用正确的 Tailscale IP + bind_addr = "{{ tailscale_ips[inventory_hostname] }}" + + # 日志配置 + log_level = "INFO" + log_file = "/var/log/nomad/nomad.log" + log_rotate_duration = "24h" + log_rotate_max_files = 5 + + server { + enabled = true + bootstrap_expect = 3 + encrypt = "{{ nomad_encrypt_key }}" + + # 更激进的重试配置 + server_join { + retry_join = [ + "{{ tailscale_ips.semaphore }}:4647", + "{{ tailscale_ips.master }}:4647", + "{{ tailscale_ips.ash3c }}:4647" + ] + retry_max = 10 + retry_interval = "15s" + } + + # 更宽松的心跳配置 + heartbeat_grace = "30s" + min_heartbeat_ttl = "10s" + max_heartbeats_per_second = 50.0 + + # Raft 配置优化 + raft_protocol = 3 + raft_multiplier = 1 + } + + client { + enabled = true + + # 网络接口配置 + network_interface = "tailscale0" + + # 更宽松的心跳配置 + max_kill_timeout = "30s" + + # 主机卷配置 + host_volume "docker-sock" { + path = "/var/run/docker.sock" + read_only = false + } + } + + + + # 地址和端口配置 + addresses { + http = "0.0.0.0" + rpc = "{{ tailscale_ips[inventory_hostname] }}" + serf = "{{ tailscale_ips[inventory_hostname] }}" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + # Docker 插件配置 + plugin "docker" { + config { + allow_privileged = true + volumes { + enabled = true + } + + # 更宽松的资源限制 + gc { + image = true + image_delay = "10m" + container = true + dangling_containers { + enabled = true + dry_run = false + period = "5m" + creation_grace = "5m" + } + } + } + } + + # 遥测配置 + telemetry { + collection_interval = "10s" + disable_hostname = false + prometheus_metrics = true + publish_allocation_metrics = true + publish_node_metrics = true + } + dest: "/etc/nomad.d/nomad.hcl" + owner: nomad + group: nomad + mode: '0640' + + # ========== 第五阶段:创建超强 systemd 服务 ========== + - name: "🔧 创建超强 systemd 服务文件" + copy: + content: | + [Unit] + Description=Nomad - Nuclear Edition + Documentation=https://www.nomadproject.io/ + Wants=network-online.target + After=network-online.target + ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl + + [Service] + Type=notify + User=nomad + Group=nomad + ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl + ExecReload=/bin/kill -HUP $MAINPID + KillMode=process + Restart=always + RestartSec=10 + LimitNOFILE=65536 + + # 更强的重启策略 + StartLimitInterval=0 + StartLimitBurst=10 + + # 环境变量 + Environment=NOMAD_DISABLE_UPDATE_CHECK=1 + + [Install] + WantedBy=multi-user.target + dest: "/etc/systemd/system/nomad.service" + owner: root + group: root + mode: 
'0644' + + - name: "🔄 重新加载 systemd" + systemd: + daemon_reload: yes + + # ========== 第六阶段:启动和验证 ========== + - name: "🚀 第六阶段:启动服务" + debug: + msg: "启动 Nomad 服务..." + + - name: "🔥 启用并启动 Nomad 服务" + systemd: + name: nomad + enabled: yes + state: started + daemon_reload: yes + + - name: "⏰ 等待服务启动" + pause: + seconds: 15 + + - name: "🔍 验证服务状态" + systemd: + name: nomad + register: nomad_service_status + + - name: "📊 显示服务状态" + debug: + msg: | + ☢️ 核弹级重置完成! + 节点: {{ inventory_hostname }} + 服务状态: {{ nomad_service_status.status.ActiveState }} + IP 地址: {{ tailscale_ips[inventory_hostname] }} + + {% if nomad_service_status.status.ActiveState == 'active' %} + ✅ 服务启动成功! + {% else %} + ❌ 服务启动失败,请检查日志! + {% endif %} + + - name: "🧹 清理临时文件" + file: + path: "{{ item }}" + state: absent + loop: + - "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip" + - "/tmp/nomad" + ignore_errors: yes + + - name: "🎉 核弹级重置完成通知" + debug: + msg: | + ☢️☢️☢️ 核弹级重置完成!☢️☢️☢️ + + 节点 {{ inventory_hostname }} 已经被完全摧毁并重建! + + 下一步: + 1. 等待所有节点完成重置 + 2. 检查集群状态:nomad server members + 3. 检查节点状态:nomad node status + 4. 如果还有问题,那就真的没救了... 😅 \ No newline at end of file diff --git a/scripts/utilities/check-nomad-cluster.sh b/scripts/utilities/check-nomad-cluster.sh new file mode 100755 index 0000000..7286a83 --- /dev/null +++ b/scripts/utilities/check-nomad-cluster.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +echo "=== Nomad 集群状态检查 ===" + +# 检查所有节点的服务状态 +echo "1. 检查服务状态..." +ansible nomad_cluster -i /root/mgmt/configuration/inventories/production/nomad-cluster.ini -m shell -a "systemctl is-active nomad" 2>/dev/null + +echo -e "\n2. 检查网络连通性..." +# 检查网络连通性 +for ip in 100.116.158.95 100.117.106.136 100.116.80.94; do + echo "检查到 $ip 的连接..." + timeout 5 nc -zv $ip 4646 2>&1 | grep -E "(succeeded|open)" + timeout 5 nc -zv $ip 4647 2>&1 | grep -E "(succeeded|open)" + timeout 5 nc -zv $ip 4648 2>&1 | grep -E "(succeeded|open)" +done + +echo -e "\n3. 检查 Nomad 集群成员..." +# 尝试查询集群成员 +if nomad server members 2>/dev/null; then + echo "集群成员查询成功" +else + echo "无法查询集群成员 - 可能没有 leader" +fi + +echo -e "\n4. 检查节点状态..." +if nomad node status 2>/dev/null; then + echo "节点状态查询成功" +else + echo "无法查询节点状态" +fi + +echo -e "\n5. 检查最近的日志..." 
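The `nc -zv` probes above grep for "succeeded|open", but the exact wording differs between netcat variants, so matches can silently fail. A portable fallback that needs no netcat at all is bash's `/dev/tcp` pseudo-device — a sketch using the same node IPs and Nomad ports:

# Pure-bash TCP probe: succeeds if the connection opens within the timeout.
check_port() {
  local ip=$1 port=$2
  if timeout 3 bash -c "</dev/tcp/${ip}/${port}" 2>/dev/null; then
    echo "  ${ip}:${port} open"
  else
    echo "  ${ip}:${port} closed or filtered"
  fi
}

for ip in 100.116.158.95 100.117.106.136 100.116.80.94; do
  for port in 4646 4647 4648; do
    check_port "$ip" "$port"
  done
done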
+echo "=== Semaphore 节点日志 ===" +journalctl -u nomad -n 5 --no-pager 2>/dev/null | tail -5 + +echo -e "\n=== 检查完成 ===" \ No newline at end of file diff --git a/scripts/utilities/complete-nomad-cluster-fix.yml b/scripts/utilities/complete-nomad-cluster-fix.yml new file mode 100644 index 0000000..08274ab --- /dev/null +++ b/scripts/utilities/complete-nomad-cluster-fix.yml @@ -0,0 +1,189 @@ +--- +- name: Complete Nomad Cluster Fix with Ansible + hosts: nomad_cluster + become: yes + gather_facts: yes + vars: + nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + tailscale_ips: + semaphore: "100.116.158.95" + master: "100.117.106.136" + ash3c: "100.116.80.94" + + tasks: + - name: Stop nomad service completely + systemd: + name: nomad + state: stopped + enabled: yes + ignore_errors: yes + + - name: Kill any remaining nomad processes + shell: pkill -f nomad || true + ignore_errors: yes + + - name: Reset systemd failure state + shell: systemctl reset-failed nomad + ignore_errors: yes + + - name: Create nomad user if not exists + user: + name: nomad + system: yes + shell: /bin/false + home: /opt/nomad + create_home: no + + - name: Create all required directories with correct permissions + file: + path: "{{ item }}" + state: directory + owner: nomad + group: nomad + mode: '0755' + loop: + - /opt/nomad + - /opt/nomad/data + - /opt/nomad/alloc_mounts + - /var/log/nomad + - /etc/nomad.d + + - name: Completely clean nomad data directory + shell: rm -rf /opt/nomad/data/* /opt/nomad/data/.* + ignore_errors: yes + + - name: Create correct nomad configuration + copy: + content: | + datacenter = "dc1" + region = "global" + data_dir = "/opt/nomad/data" + + bind_addr = "{{ tailscale_ips[inventory_hostname] }}" + + server { + enabled = true + bootstrap_expect = 3 + encrypt = "{{ nomad_encrypt_key }}" + + server_join { + retry_join = [ + "{{ tailscale_ips.semaphore }}:4647", + "{{ tailscale_ips.master }}:4647", + "{{ tailscale_ips.ash3c }}:4647" + ] + retry_interval = "15s" + retry_max = 3 + } + } + + client { + enabled = true + alloc_dir = "/opt/nomad/alloc_mounts" + } + + ui { + enabled = true + } + + addresses { + http = "0.0.0.0" + rpc = "{{ tailscale_ips[inventory_hostname] }}" + serf = "{{ tailscale_ips[inventory_hostname] }}" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + plugin "docker" { + config { + allow_privileged = true + volumes { + enabled = true + } + } + } + + log_level = "INFO" + log_file = "/var/log/nomad/nomad.log" + log_rotate_duration = "24h" + log_rotate_max_files = 5 + dest: /etc/nomad.d/nomad.hcl + owner: nomad + group: nomad + mode: '0640' + + - name: Set correct ownership for all nomad files + file: + path: "{{ item }}" + owner: nomad + group: nomad + recurse: yes + loop: + - /opt/nomad + - /var/log/nomad + - /etc/nomad.d + + - name: Validate nomad configuration + shell: nomad config validate /etc/nomad.d/nomad.hcl + register: config_validation + ignore_errors: yes + + - name: Show config validation result + debug: + var: config_validation + + - name: Start nomad service on first node (semaphore) + systemd: + name: nomad + state: started + daemon_reload: yes + when: inventory_hostname == 'semaphore' + + - name: Wait for first node to start + pause: + seconds: 30 + when: inventory_hostname == 'semaphore' + + - name: Start nomad service on remaining nodes + systemd: + name: nomad + state: started + daemon_reload: yes + when: inventory_hostname != 'semaphore' + + - name: Wait for all services to start + pause: + seconds: 20 + + - name: Check nomad service 
status + shell: systemctl status nomad --no-pager -l + register: service_status + ignore_errors: yes + + - name: Show service status + debug: + var: service_status.stdout_lines + + - name: Check nomad logs for errors + shell: journalctl -u nomad -n 10 --no-pager + register: nomad_logs + ignore_errors: yes + + - name: Show recent nomad logs + debug: + var: nomad_logs.stdout_lines + + - name: Test nomad connectivity + shell: nomad server members + register: nomad_members + ignore_errors: yes + when: inventory_hostname == 'semaphore' + + - name: Show cluster members + debug: + var: nomad_members.stdout_lines + when: inventory_hostname == 'semaphore' \ No newline at end of file diff --git a/scripts/utilities/complete-nomad-reset.yml b/scripts/utilities/complete-nomad-reset.yml new file mode 100644 index 0000000..7b3633f --- /dev/null +++ b/scripts/utilities/complete-nomad-reset.yml @@ -0,0 +1,151 @@ +--- +- name: Complete Nomad Cluster Reset and Rebuild + hosts: nomad_cluster + become: yes + serial: 1 # 一次处理一个节点 + vars: + nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + tailscale_ips: + semaphore: "100.116.158.95" + master: "100.117.106.136" + ash3c: "100.116.80.94" + + tasks: + - name: Stop nomad service completely + systemd: + name: nomad + state: stopped + ignore_errors: yes + + - name: Kill any remaining nomad processes + shell: pkill -f nomad || true + ignore_errors: yes + + - name: Remove all nomad data and state + shell: | + rm -rf /opt/nomad/data/* + rm -rf /opt/nomad/data/.* + rm -rf /var/log/nomad/* + ignore_errors: yes + + - name: Create fresh nomad configuration with correct Tailscale IPs + copy: + content: | + datacenter = "dc1" + region = "global" + data_dir = "/opt/nomad/data" + + # 使用 Tailscale IP 地址 + bind_addr = "{{ tailscale_ips[inventory_hostname] }}" + + server { + enabled = true + bootstrap_expect = 3 + encrypt = "{{ nomad_encrypt_key }}" + + server_join { + retry_join = [ + "{{ tailscale_ips.semaphore }}", + "{{ tailscale_ips.master }}", + "{{ tailscale_ips.ash3c }}" + ] + } + } + + client { + enabled = true + network_interface = "tailscale0" + } + + ui_config { + enabled = true + } + + addresses { + http = "0.0.0.0" + rpc = "{{ tailscale_ips[inventory_hostname] }}" + serf = "{{ tailscale_ips[inventory_hostname] }}" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + plugin "docker" { + config { + allow_privileged = true + volumes { + enabled = true + } + } + } + + log_level = "INFO" + log_file = "/var/log/nomad/nomad.log" + dest: /etc/nomad.d/nomad.hcl + owner: nomad + group: nomad + mode: '0640' + + - name: Ensure log directory exists + file: + path: /var/log/nomad + state: directory + owner: nomad + group: nomad + mode: '0755' + + - name: Start nomad service + systemd: + name: nomad + state: started + enabled: yes + + - name: Wait for nomad to start + wait_for: + port: 4646 + host: "{{ tailscale_ips[inventory_hostname] }}" + delay: 5 + timeout: 30 + + - name: Check nomad service status + shell: systemctl status nomad --no-pager -l + register: nomad_status + ignore_errors: yes + + - name: Display nomad status + debug: + var: nomad_status.stdout_lines + +- name: Wait for cluster to form + hosts: localhost + gather_facts: no + tasks: + - name: Wait for cluster formation + pause: + seconds: 30 + prompt: "等待集群形成..." 
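With `bootstrap_expect = 3`, no leader is elected until three servers have joined, which is why these playbooks pause after starting the nodes. Counting raft peers is a quick way to confirm the quorum actually formed — a rough sketch against the local API (assumes it answers on 127.0.0.1:4646; `/v1/status/peers` returns a JSON array of server RPC addresses):

# Count raft peers; a healthy 3-server cluster should report 3.
peers=$(curl -fs http://127.0.0.1:4646/v1/status/peers | tr ',' '\n' | grep -c ':4647')
echo "raft peers: ${peers:-0} (expected 3)"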
+ +- name: Verify cluster status + hosts: semaphore + become: yes + tasks: + - name: Check cluster members + shell: nomad server members + register: cluster_members + ignore_errors: yes + + - name: Display cluster members + debug: + var: cluster_members.stdout_lines + + - name: Check node status + shell: nomad node status + register: node_status + ignore_errors: yes + + - name: Display node status + debug: + var: node_status.stdout_lines \ No newline at end of file diff --git a/scripts/utilities/consul-cluster-manager.sh b/scripts/utilities/consul-cluster-manager.sh new file mode 100755 index 0000000..5503ced --- /dev/null +++ b/scripts/utilities/consul-cluster-manager.sh @@ -0,0 +1,233 @@ +#!/bin/bash + +# Consul 集群管理脚本 +# 提供集群状态检查、重启、停止等功能 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +INVENTORY_FILE="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 打印带颜色的消息 +print_status() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +print_header() { + echo -e "${BLUE}=== $1 ===${NC}" +} + +# 检查必要文件 +check_prerequisites() { + if [[ ! -f "$INVENTORY_FILE" ]]; then + print_error "清单文件不存在: $INVENTORY_FILE" + exit 1 + fi + + if ! command -v ansible &> /dev/null; then + print_error "未找到 ansible 命令" + exit 1 + fi +} + +# 显示帮助信息 +show_help() { + echo "Consul 集群管理脚本" + echo + echo "用法: $0 [命令]" + echo + echo "命令:" + echo " status - 检查集群状态" + echo " members - 显示集群成员" + echo " leader - 显示集群领导者" + echo " restart - 重启 Consul 服务" + echo " stop - 停止 Consul 服务" + echo " start - 启动 Consul 服务" + echo " logs - 查看服务日志" + echo " health - 健康检查" + echo " cleanup - 清理 Consul 数据(危险操作)" + echo " help - 显示此帮助信息" + echo +} + +# 检查集群状态 +check_status() { + print_header "Consul 服务状态" + ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "systemctl is-active consul" -o + + echo + print_header "Consul 进程状态" + ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "ps aux | grep consul | grep -v grep" -o +} + +# 显示集群成员 +show_members() { + print_header "Consul 集群成员" + ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul members" -o +} + +# 显示集群领导者 +show_leader() { + print_header "Consul 集群领导者" + ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul operator raft list-peers" -o + + echo + print_header "通过 API 检查领导者" + ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "curl -s http://localhost:8500/v1/status/leader" -o +} + +# 重启服务 +restart_service() { + print_header "重启 Consul 服务" + print_warning "即将重启所有 Consul 节点..." + read -p "确认继续? (y/N): " confirm + if [[ $confirm != "y" && $confirm != "Y" ]]; then + print_status "操作已取消" + return + fi + + ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=restarted" -b + + print_status "等待服务启动..." + sleep 10 + check_status +} + +# 停止服务 +stop_service() { + print_header "停止 Consul 服务" + print_warning "即将停止所有 Consul 节点..." + read -p "确认继续? (y/N): " confirm + if [[ $confirm != "y" && $confirm != "Y" ]]; then + print_status "操作已取消" + return + fi + + ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=stopped" -b +} + +# 启动服务 +start_service() { + print_header "启动 Consul 服务" + ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=started" -b + + print_status "等待服务启动..." 
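Like `show_leader` above, readiness can be gated on the leader endpoint instead of a flat `sleep 10`. A small sketch (assuming the Consul HTTP API on localhost:8500 of any cluster node; the endpoint returns the leader's `ip:8300` as a quoted string, or `""` while there is no leader):

# One-shot leader probe; usable as: until consul_has_leader; do sleep 2; done
consul_has_leader() {
  local leader
  leader=$(curl -fs http://localhost:8500/v1/status/leader 2>/dev/null | tr -d '"')
  [[ -n "$leader" ]]
}

Calling `until consul_has_leader; do sleep 2; done` before `check_status` makes `start_service` report the state the cluster actually reached rather than whatever it looked like ten seconds in.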
+ sleep 10 + check_status +} + +# 查看日志 +show_logs() { + print_header "Consul 服务日志" + ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "journalctl -u consul --no-pager -n 20" -o +} + +# 健康检查 +health_check() { + print_header "Consul 健康检查" + + # 检查服务状态 + print_status "检查服务状态..." + ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "systemctl is-active consul" -o + + echo + # 检查端口监听 + print_status "检查端口监听..." + ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "ss -tlnp | grep :8500" -o + + echo + # 检查集群成员 + print_status "检查集群成员..." + ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul members | wc -l" -o + + echo + # 检查 API 响应 + print_status "检查 API 响应..." + ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "curl -s -o /dev/null -w '%{http_code}' http://localhost:8500/v1/status/leader" -o +} + +# 清理数据(危险操作) +cleanup_data() { + print_header "清理 Consul 数据" + print_error "警告: 此操作将删除所有 Consul 数据,包括服务注册、KV 存储等!" + print_error "此操作不可逆!" + echo + read -p "确认要清理所有数据? 请输入 'YES' 确认: " confirm + if [[ $confirm != "YES" ]]; then + print_status "操作已取消" + return + fi + + print_status "停止 Consul 服务..." + ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=stopped" -b + + print_status "清理数据目录..." + ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "rm -rf /opt/consul/data/*" -b + + print_status "启动 Consul 服务..." + ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=started" -b + + print_status "数据清理完成" +} + +# 主函数 +main() { + check_prerequisites + + case "${1:-help}" in + status) + check_status + ;; + members) + show_members + ;; + leader) + show_leader + ;; + restart) + restart_service + ;; + stop) + stop_service + ;; + start) + start_service + ;; + logs) + show_logs + ;; + health) + health_check + ;; + cleanup) + cleanup_data + ;; + help|--help|-h) + show_help + ;; + *) + print_error "未知命令: $1" + echo + show_help + exit 1 + ;; + esac +} + +main "$@" \ No newline at end of file diff --git a/scripts/utilities/correct-nomad-cluster.yml b/scripts/utilities/correct-nomad-cluster.yml new file mode 100644 index 0000000..cbe1717 --- /dev/null +++ b/scripts/utilities/correct-nomad-cluster.yml @@ -0,0 +1,115 @@ +--- +- name: Correct Nomad Cluster Configuration + hosts: nomad_cluster + become: yes + gather_facts: yes + vars: + nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + tailscale_ips: + semaphore: "100.116.158.95" + master: "100.117.106.136" + ash3c: "100.116.80.94" + + tasks: + - name: Stop nomad service + systemd: + name: nomad + state: stopped + ignore_errors: yes + + - name: Clean nomad data + file: + path: /opt/nomad/data + state: absent + + - name: Recreate nomad data directory + file: + path: /opt/nomad/data + state: directory + owner: nomad + group: nomad + mode: '0755' + + - name: Create correct nomad configuration + copy: + content: | + datacenter = "dc1" + region = "global" + data_dir = "/opt/nomad/data" + + bind_addr = "{{ tailscale_ips[inventory_hostname] }}" + + server { + enabled = true + bootstrap_expect = 3 + encrypt = "{{ nomad_encrypt_key }}" + + server_join { + retry_join = [ + "{{ tailscale_ips.semaphore }}:4647", + "{{ tailscale_ips.master }}:4647", + "{{ tailscale_ips.ash3c }}:4647" + ] + retry_interval = "15s" + retry_max = 3 + } + } + + client { + enabled = true + alloc_dir = "/opt/nomad/alloc_mounts" + } + + ui { + enabled = true + } + + addresses { + http = "0.0.0.0" + rpc = "{{ tailscale_ips[inventory_hostname] }}" + serf = "{{ tailscale_ips[inventory_hostname] }}" + } + + 
ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + plugin "docker" { + config { + allow_privileged = true + volumes { + enabled = true + } + } + } + + log_level = "INFO" + log_file = "/var/log/nomad/nomad.log" + dest: /etc/nomad.d/nomad.hcl + owner: nomad + group: nomad + mode: '0640' + +- name: Start nomad services in sequence + hosts: nomad_cluster + become: yes + serial: 1 + tasks: + - name: Start nomad service + systemd: + name: nomad + state: started + daemon_reload: yes + + - name: Wait for nomad to start + wait_for: + port: 4646 + host: "{{ tailscale_ips[inventory_hostname] }}" + delay: 10 + timeout: 60 + + - name: Wait between nodes + pause: + seconds: 30 \ No newline at end of file diff --git a/scripts/utilities/deploy-nomad-configs.yml b/scripts/utilities/deploy-nomad-configs.yml new file mode 100644 index 0000000..6336b9b --- /dev/null +++ b/scripts/utilities/deploy-nomad-configs.yml @@ -0,0 +1,113 @@ +--- +- name: Deploy Nomad Configurations + hosts: nomad_cluster + become: yes + vars: + nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + node_ips: + semaphore: "100.116.158.95" + master: "100.117.106.136" + ash3c: "100.116.80.94" + + tasks: + - name: Create nomad configuration for each node + copy: + content: | + datacenter = "dc1" + region = "global" + data_dir = "/opt/nomad/data" + + bind_addr = "{{ node_ips[inventory_hostname] }}" + + server { + enabled = true + bootstrap_expect = 3 + encrypt = "{{ nomad_encrypt_key }}" + + server_join { + retry_join = [ + "{{ node_ips.semaphore }}:4647", + "{{ node_ips.master }}:4647", + "{{ node_ips.ash3c }}:4647" + ] + retry_interval = "15s" + retry_max = 3 + } + } + + client { + enabled = true + alloc_dir = "/opt/nomad/alloc_mounts" + } + + ui { + enabled = true + } + + addresses { + http = "0.0.0.0" + rpc = "{{ node_ips[inventory_hostname] }}" + serf = "{{ node_ips[inventory_hostname] }}" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + plugin "docker" { + config { + allow_privileged = true + volumes { + enabled = true + } + } + } + + log_level = "INFO" + log_file = "/var/log/nomad/nomad.log" + dest: /etc/nomad.d/nomad.hcl + owner: nomad + group: nomad + mode: '0640' + + - name: Validate nomad configuration + shell: nomad config validate /etc/nomad.d/nomad.hcl + register: config_validation + + - name: Show validation result + debug: + var: config_validation.stdout_lines + + - name: Start nomad service on bootstrap node first + systemd: + name: nomad + state: started + daemon_reload: yes + when: inventory_hostname == 'semaphore' + + - name: Wait for bootstrap node + pause: + seconds: 15 + when: inventory_hostname == 'semaphore' + + - name: Start nomad service on other nodes + systemd: + name: nomad + state: started + daemon_reload: yes + when: inventory_hostname != 'semaphore' + + - name: Wait for services to start + pause: + seconds: 10 + + - name: Check service status + shell: systemctl status nomad --no-pager + register: service_status + ignore_errors: yes + + - name: Show service status + debug: + var: service_status.stdout_lines \ No newline at end of file diff --git a/scripts/utilities/final-nomad-cluster-fix.yml b/scripts/utilities/final-nomad-cluster-fix.yml new file mode 100644 index 0000000..46080a9 --- /dev/null +++ b/scripts/utilities/final-nomad-cluster-fix.yml @@ -0,0 +1,190 @@ +--- +- name: Final Complete Nomad Cluster Fix + hosts: nomad_cluster + become: yes + gather_facts: yes + vars: + nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + nomad_servers: + - 
"100.116.158.95:4647" # semaphore + - "100.117.106.136:4647" # master + - "100.116.80.94:4647" # ash3c + + tasks: + - name: Stop nomad service + systemd: + name: nomad + state: stopped + ignore_errors: yes + + - name: Reset failed nomad service + systemd: + name: nomad + daemon_reload: yes + ignore_errors: yes + + - name: Create nomad user if not exists + user: + name: nomad + system: yes + shell: /bin/false + home: /opt/nomad + create_home: no + + - name: Create nomad directories with correct permissions + file: + path: "{{ item }}" + state: directory + owner: nomad + group: nomad + mode: '0755' + loop: + - /etc/nomad.d + - /opt/nomad + - /opt/nomad/data + - /opt/nomad/alloc_mounts + - /var/log/nomad + + - name: Clean old nomad data + file: + path: /opt/nomad/data + state: absent + + - name: Recreate nomad data directory + file: + path: /opt/nomad/data + state: directory + owner: nomad + group: nomad + mode: '0755' + + - name: Get Tailscale IP address + shell: ip addr show tailscale0 | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1 + register: tailscale_ip + failed_when: false + + - name: Set bind address (fallback to default interface if tailscale not available) + set_fact: + bind_address: "{{ tailscale_ip.stdout if tailscale_ip.stdout != '' else ansible_default_ipv4.address }}" + + - name: Generate nomad configuration + template: + src: nomad-server.hcl.j2 + dest: /etc/nomad.d/nomad.hcl + owner: nomad + group: nomad + mode: '0640' + vars: + nomad_datacenter: "dc1" + nomad_region: "global" + nomad_data_dir: "/opt/nomad/data" + nomad_bind_addr: "{{ bind_address }}" + nomad_bootstrap_expect: 3 + nomad_encrypt: "{{ nomad_encrypt_key }}" + nomad_retry_join: "{{ nomad_servers }}" + nomad_alloc_dir: "/opt/nomad/alloc_mounts" + nomad_log_file: "/var/log/nomad/nomad.log" + + - name: Create nomad systemd service + copy: + content: | + [Unit] + Description=Nomad + Documentation=https://www.nomadproject.io/ + Requires=network-online.target + After=network-online.target + ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl + + [Service] + Type=notify + User=nomad + Group=nomad + ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl + ExecReload=/bin/kill -HUP $MAINPID + KillMode=process + Restart=on-failure + LimitNOFILE=65536 + + [Install] + WantedBy=multi-user.target + dest: /etc/systemd/system/nomad.service + mode: '0644' + + - name: Reload systemd daemon + systemd: + daemon_reload: yes + + - name: Start nomad service + systemd: + name: nomad + state: started + enabled: yes + + - name: Wait for nomad to start + wait_for: + port: 4646 + host: "{{ bind_address }}" + delay: 5 + timeout: 30 + ignore_errors: yes + +- name: Create nomad configuration template + hosts: localhost + gather_facts: no + tasks: + - name: Create nomad server template + copy: + content: | + datacenter = "{{ nomad_datacenter }}" + region = "{{ nomad_region }}" + data_dir = "{{ nomad_data_dir }}" + + bind_addr = "{{ nomad_bind_addr }}" + + server { + enabled = true + bootstrap_expect = {{ nomad_bootstrap_expect }} + encrypt = "{{ nomad_encrypt }}" + + server_join { + retry_join = {{ nomad_retry_join | to_json }} + retry_interval = "15s" + retry_max = 3 + } + } + + client { + enabled = true + alloc_dir = "{{ nomad_alloc_dir }}" + } + + ui { + enabled = true + } + + addresses { + http = "0.0.0.0" + rpc = "{{ nomad_bind_addr }}" + serf = "{{ nomad_bind_addr }}" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + plugin "docker" { + config { + allow_privileged = true + volumes { + enabled = true + } + } + } + + 
log_level = "INFO" + log_file = "{{ nomad_log_file }}" + dest: /tmp/nomad-server.hcl.j2 + delegate_to: localhost + run_once: true \ No newline at end of file diff --git a/scripts/utilities/final-nomad-fix.yml b/scripts/utilities/final-nomad-fix.yml new file mode 100644 index 0000000..ed51095 --- /dev/null +++ b/scripts/utilities/final-nomad-fix.yml @@ -0,0 +1,111 @@ +--- +- name: Final Nomad Cluster Fix + hosts: nomad_cluster + become: yes + vars: + nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + tailscale_ips: + semaphore: "100.116.158.95" + master: "100.117.106.136" + ash3c: "100.116.80.94" + + tasks: + - name: Stop nomad service + systemd: + name: nomad + state: stopped + ignore_errors: yes + + - name: Create required directories + file: + path: "{{ item }}" + state: directory + owner: nomad + group: nomad + mode: '0755' + loop: + - /opt/nomad/data + - /opt/nomad/alloc_mounts + - /var/log/nomad + + - name: Clean nomad data + shell: rm -rf /opt/nomad/data/* + ignore_errors: yes + + - name: Create working nomad configuration + copy: + content: | + datacenter = "dc1" + region = "global" + data_dir = "/opt/nomad/data" + + bind_addr = "{{ tailscale_ips[inventory_hostname] }}" + + server { + enabled = true + bootstrap_expect = 3 + encrypt = "{{ nomad_encrypt_key }}" + + server_join { + retry_join = [ + "{{ tailscale_ips.semaphore }}", + "{{ tailscale_ips.master }}", + "{{ tailscale_ips.ash3c }}" + ] + } + } + + client { + enabled = true + } + + ui { + enabled = true + } + + addresses { + http = "0.0.0.0" + rpc = "{{ tailscale_ips[inventory_hostname] }}" + serf = "{{ tailscale_ips[inventory_hostname] }}" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + plugin "docker" { + config { + allow_privileged = true + volumes { + enabled = true + } + } + } + + log_level = "INFO" + log_file = "/var/log/nomad/nomad.log" + dest: /etc/nomad.d/nomad.hcl + owner: nomad + group: nomad + mode: '0640' + + - name: Start nomad service + systemd: + name: nomad + state: started + enabled: yes + + - name: Wait for service to start + pause: + seconds: 10 + + - name: Check service status + shell: systemctl status nomad --no-pager -l + register: service_status + ignore_errors: yes + + - name: Show service status + debug: + var: service_status.stdout_lines \ No newline at end of file diff --git a/scripts/utilities/fix-ash3c-ip.sh b/scripts/utilities/fix-ash3c-ip.sh new file mode 100755 index 0000000..8428d89 --- /dev/null +++ b/scripts/utilities/fix-ash3c-ip.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# 🔧 ash3c IP 地址修复脚本 + +set -e + +echo "🔧 ash3c IP 地址问题修复脚本" +echo "" + +# 定义正确的 IP 地址 +CORRECT_IP="100.116.80.94" +ASH3C_HOST="100.116.80.94" + +echo "📡 检查 ash3c 节点的网络配置..." + +# 检查 ash3c 的实际 IP 配置 +echo "🔍 检查 ash3c 节点的 IP 地址绑定..." +ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S ip addr show" | grep -E "inet.*100\." || echo "❌ 未找到 Tailscale IP" + +echo "" +echo "🔍 检查 Tailscale 状态..." +ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S tailscale status" || echo "❌ Tailscale 状态检查失败" + +echo "" +echo "🔧 修复 ash3c 的 Nomad 配置..." 
+ +# 创建正确的配置文件 +cat > /tmp/ash3c-nomad.hcl << EOF +# 🔧 ash3c 修复后的 Nomad 配置 +datacenter = "dc1" +region = "global" +data_dir = "/opt/nomad/data" + +# 强制使用正确的 Tailscale IP +bind_addr = "${CORRECT_IP}" + +# 日志配置 +log_level = "INFO" +log_file = "/var/log/nomad/nomad.log" + +server { + enabled = true + bootstrap_expect = 3 + encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + + server_join { + retry_join = [ + "100.116.158.95:4647", + "100.117.106.136:4647", + "100.116.80.94:4647" + ] + retry_max = 10 + retry_interval = "15s" + } + + # 更宽松的心跳配置 + heartbeat_grace = "30s" + min_heartbeat_ttl = "10s" +} + +client { + enabled = true + network_interface = "tailscale0" +} + +ui_config { + enabled = true +} + +addresses { + http = "0.0.0.0" + rpc = "${CORRECT_IP}" + serf = "${CORRECT_IP}" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +plugin "docker" { + config { + allow_privileged = true + volumes { + enabled = true + } + } +} +EOF + +echo "📤 上传修复后的配置到 ash3c..." +scp -P 22 -i ~/.ssh/id_ed25519 /tmp/ash3c-nomad.hcl ben@${ASH3C_HOST}:/tmp/ + +echo "🔧 在 ash3c 上应用修复..." +ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} << 'REMOTE_SCRIPT' +echo '3131' | sudo -S systemctl stop nomad || true +echo '3131' | sudo -S pkill -f nomad || true +sleep 5 + +# 备份旧配置 +echo '3131' | sudo -S cp /etc/nomad.d/nomad.hcl /etc/nomad.d/nomad.hcl.backup.$(date +%Y%m%d_%H%M%S) || true + +# 应用新配置 +echo '3131' | sudo -S cp /tmp/ash3c-nomad.hcl /etc/nomad.d/nomad.hcl +echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl +echo '3131' | sudo -S chmod 640 /etc/nomad.d/nomad.hcl + +# 清理数据目录 +echo '3131' | sudo -S rm -rf /opt/nomad/data/* + +# 重启服务 +echo '3131' | sudo -S systemctl daemon-reload +echo '3131' | sudo -S systemctl enable nomad +echo '3131' | sudo -S systemctl start nomad + +echo "✅ ash3c 配置修复完成" +REMOTE_SCRIPT + +echo "" +echo "⏰ 等待 ash3c 服务启动..." +sleep 15 + +echo "" +echo "🔍 检查 ash3c 服务状态..." +ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S systemctl status nomad --no-pager" || echo "❌ 服务状态检查失败" + +echo "" +echo "🧹 清理临时文件..." +rm -f /tmp/ash3c-nomad.hcl + +echo "" +echo "✅ ash3c IP 修复完成!" +echo "" +echo "下一步:" +echo "1. 检查集群状态: nomad server members" +echo "2. 如果还有问题,运行核弹级重置: ./scripts/utilities/nuclear-reset.sh" \ No newline at end of file diff --git a/scripts/utilities/fix-consul-cluster.sh b/scripts/utilities/fix-consul-cluster.sh new file mode 100755 index 0000000..35c07a4 --- /dev/null +++ b/scripts/utilities/fix-consul-cluster.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +# Consul 集群修复脚本 +# 解决 "No cluster leader" 问题 + +set -e + +echo "=== Consul 集群修复脚本 ===" +echo "当前时间: $(date)" +echo + +# 检查当前 Consul 服务状态 +echo "1. 检查当前 Consul 服务状态..." +docker service ls | grep consul || echo "未找到 consul 服务" +echo + +# 显示当前问题 +echo "2. 检查 Consul 日志中的错误..." +echo "Master 节点日志:" +docker service logs consul-cluster_consul-master --tail 5 2>/dev/null || echo "无法获取 master 日志" +echo +echo "Ash3c 节点日志:" +docker service logs consul-cluster_consul-ash3c --tail 5 2>/dev/null || echo "无法获取 ash3c 日志" +echo + +# 提供修复选项 +echo "3. 修复选项:" +echo " a) 使用修复后的 overlay 网络配置 (推荐)" +echo " b) 使用 macvlan 网络配置" +echo " c) 仅重启现有服务" +echo + +read -p "请选择修复方案 (a/b/c): " choice + +case $choice in + a) + echo "使用修复后的 overlay 网络配置..." + + # 停止现有服务 + echo "停止现有 Consul 集群..." + docker stack rm consul-cluster 2>/dev/null || echo "consul-cluster stack 不存在" + + # 等待服务完全停止 + echo "等待服务完全停止..." + sleep 10 + + # 清理数据卷 (可选) + read -p "是否清理现有数据卷? 
(y/n): " clean_volumes + if [[ $clean_volumes == "y" ]]; then + docker volume rm consul-cluster_consul_master_data 2>/dev/null || true + docker volume rm consul-cluster_consul_ash3c_data 2>/dev/null || true + echo "数据卷已清理" + fi + + # 部署修复后的配置 + echo "部署修复后的 Consul 集群..." + docker stack deploy -c /root/mgmt/swarm/stacks/consul-cluster-fixed.yml consul-cluster + + echo "等待服务启动..." + sleep 15 + + # 检查服务状态 + echo "检查新服务状态..." + docker service ls | grep consul + ;; + + b) + echo "使用 macvlan 网络配置..." + echo "注意: 需要根据你的网络环境调整 IP 地址和网络接口" + + # 检查网络接口 + echo "当前网络接口:" + ip link show | grep -E "^[0-9]+:" | awk '{print $2}' | sed 's/://' + echo + + read -p "请输入要使用的网络接口 (如 eth0): " interface + read -p "请输入子网 (如 192.168.1.0/24): " subnet + read -p "请输入网关 (如 192.168.1.1): " gateway + + # 更新 macvlan 配置文件 + sed -i "s/parent: eth0/parent: $interface/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml + sed -i "s/192.168.1.0\/24/$subnet/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml + sed -i "s/192.168.1.1/$gateway/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml + + # 停止现有服务 + echo "停止现有 Consul 集群..." + docker stack rm consul-cluster 2>/dev/null || echo "consul-cluster stack 不存在" + + # 等待服务完全停止 + echo "等待服务完全停止..." + sleep 10 + + # 部署 macvlan 配置 + echo "部署 macvlan Consul 集群..." + docker stack deploy -c /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml consul-cluster + + echo "等待服务启动..." + sleep 15 + + # 检查服务状态 + echo "检查新服务状态..." + docker service ls | grep consul + ;; + + c) + echo "重启现有服务..." + + # 重启服务 + docker service update --force consul-cluster_consul-master + docker service update --force consul-cluster_consul-ash3c + + echo "等待服务重启..." + sleep 10 + + # 检查服务状态 + echo "检查服务状态..." + docker service ls | grep consul + ;; + + *) + echo "无效选择,退出" + exit 1 + ;; +esac + +echo +echo "4. 验证修复结果..." +sleep 5 + +# 检查服务状态 +echo "服务状态:" +docker service ls | grep consul + +echo +echo "等待 30 秒后检查集群状态..." +sleep 30 + +# 尝试检查集群成员 +echo "尝试检查集群成员状态..." +timeout 10 docker service logs consul-cluster_consul-master --tail 10 2>/dev/null || echo "无法获取日志" + +echo +echo "=== 修复完成 ===" +echo "请等待几分钟让集群完全启动,然后访问:" +echo "- Master UI: http://your-master-ip:8500" +echo "- Ash3c UI: http://your-ash3c-ip:8501" +echo +echo "如果问题仍然存在,请检查:" +echo "1. 节点间网络连通性" +echo "2. 防火墙设置" +echo "3. Docker Swarm 网络配置" \ No newline at end of file diff --git a/scripts/utilities/fix-master-binary.sh b/scripts/utilities/fix-master-binary.sh new file mode 100755 index 0000000..b774783 --- /dev/null +++ b/scripts/utilities/fix-master-binary.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +echo "🔧 使用 HashiCorp 官方脚本修复 master 节点二进制文件..." + +# 停止 nomad 服务 +echo '3131' | sudo -S systemctl stop nomad || true +echo '3131' | sudo -S pkill -9 -f nomad || true + +# 删除旧的二进制文件 +echo '3131' | sudo -S rm -f /usr/local/bin/nomad /usr/bin/nomad + +# 使用 HashiCorp 官方安装脚本(自动检测架构) +curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add - +echo '3131' | sudo -S apt-add-repository "deb [arch=$(dpkg --print-architecture)] https://apt.releases.hashicorp.com $(lsb_release -cs) main" +echo '3131' | sudo -S apt-get update +echo '3131' | sudo -S apt-get install -y nomad=1.10.5-1 + +# 验证安装 +nomad version + +# 重启服务 +echo '3131' | sudo -S systemctl daemon-reload +echo '3131' | sudo -S systemctl enable nomad +echo '3131' | sudo -S systemctl start nomad + +echo "✅ Master 节点二进制文件修复完成!" 
\ No newline at end of file diff --git a/scripts/utilities/fix-nomad-cluster.yml b/scripts/utilities/fix-nomad-cluster.yml new file mode 100644 index 0000000..335295c --- /dev/null +++ b/scripts/utilities/fix-nomad-cluster.yml @@ -0,0 +1,92 @@ +--- +- name: Fix Nomad Cluster Issues + hosts: nomad_cluster + become: yes + vars: + nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + + tasks: + - name: Stop nomad service + systemd: + name: nomad + state: stopped + ignore_errors: yes + + - name: Clean nomad data directory + shell: rm -rf /opt/nomad/data/* + ignore_errors: yes + + - name: Create correct nomad configuration + copy: + content: | + datacenter = "dc1" + region = "global" + data_dir = "/opt/nomad/data" + + bind_addr = "{{ ansible_host | default(hostvars[inventory_hostname]['ansible_default_ipv4']['address']) }}" + + server { + enabled = true + bootstrap_expect = 3 + encrypt = "{{ nomad_encrypt_key }}" + + server_join { + retry_join = ["100.116.158.95", "100.117.106.136", "100.116.80.94"] + } + } + + client { + enabled = true + network_interface = "{{ ansible_default_ipv4.interface | default('eth0') }}" + } + + ui { + enabled = true + } + + addresses { + http = "0.0.0.0" + rpc = "0.0.0.0" + serf = "0.0.0.0" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + plugin "docker" { + config { + allow_privileged = true + volumes { + enabled = true + } + } + } + dest: /etc/nomad.d/nomad.hcl + owner: nomad + group: nomad + mode: '0640' + + - name: Start nomad service + systemd: + name: nomad + state: started + enabled: yes + + - name: Wait for nomad to start + wait_for: + port: 4646 + host: "{{ ansible_host | default(hostvars[inventory_hostname]['ansible_default_ipv4']['address']) }}" + delay: 10 + timeout: 60 + + - name: Check nomad status + shell: systemctl status nomad --no-pager -l + register: nomad_status + ignore_errors: yes + + - name: Display nomad status + debug: + var: nomad_status.stdout_lines \ No newline at end of file diff --git a/scripts/utilities/nomad-diagnosis.sh b/scripts/utilities/nomad-diagnosis.sh new file mode 100755 index 0000000..f0caf8d --- /dev/null +++ b/scripts/utilities/nomad-diagnosis.sh @@ -0,0 +1,124 @@ +#!/bin/bash + +# 🔍 Nomad 集群快速诊断脚本 + +echo "🔍 Nomad 集群快速诊断" +echo "====================" +echo "" + +# 定义节点信息 +declare -A NODES=( + ["semaphore"]="local" + ["master"]="100.117.106.136:60022" + ["ash3c"]="100.116.80.94:22" +) + +declare -A TAILSCALE_IPS=( + ["semaphore"]="100.116.158.95" + ["master"]="100.117.106.136" + ["ash3c"]="100.116.80.94" +) + +echo "📊 1. 本地 Nomad 服务状态" +echo "------------------------" +systemctl status nomad --no-pager | head -10 || echo "❌ 本地 Nomad 服务异常" +echo "" + +echo "📊 2. 集群成员状态" +echo "----------------" +nomad server members 2>/dev/null || echo "❌ 无法获取集群成员状态" +echo "" + +echo "📊 3. 节点状态" +echo "------------" +nomad node status 2>/dev/null || echo "❌ 无法获取节点状态" +echo "" + +echo "🌐 4. 网络连通性测试" +echo "------------------" +for node in "${!NODES[@]}"; do + ip="${TAILSCALE_IPS[$node]}" + echo "测试 $node ($ip):" + + if [[ "$node" == "semaphore" ]]; then + echo " ✅ 本地节点" + else + # Ping 测试 + if ping -c 1 -W 3 "$ip" >/dev/null 2>&1; then + echo " ✅ Ping: 成功" + else + echo " ❌ Ping: 失败" + fi + + # 端口测试 + if timeout 5 bash -c "/dev/null; then + echo " ✅ RPC端口(4647): 开放" + else + echo " ❌ RPC端口(4647): 关闭" + fi + + if timeout 5 bash -c "/dev/null; then + echo " ✅ HTTP端口(4646): 开放" + else + echo " ❌ HTTP端口(4646): 关闭" + fi + fi + echo "" +done + +echo "🔧 5. 
远程节点服务状态" +echo "-------------------" +for node in "${!NODES[@]}"; do + if [[ "$node" == "semaphore" ]]; then + continue + fi + + connection="${NODES[$node]}" + ip=$(echo "$connection" | cut -d: -f1) + port=$(echo "$connection" | cut -d: -f2) + + echo "检查 $node ($ip:$port):" + + if ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S systemctl is-active nomad" 2>/dev/null; then + status=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S systemctl is-active nomad" 2>/dev/null) + echo " 服务状态: $status" + + # 检查配置文件中的 bind_addr + bind_addr=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S grep 'bind_addr' /etc/nomad.d/nomad.hcl 2>/dev/null" | head -1) + echo " 配置绑定地址: $bind_addr" + + # 检查实际监听端口 + listening=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S netstat -tlnp | grep :464" 2>/dev/null | head -3) + if [[ -n "$listening" ]]; then + echo " 监听端口:" + echo "$listening" | sed 's/^/ /' + else + echo " ❌ 未发现 Nomad 监听端口" + fi + else + echo " ❌ 无法连接或服务未运行" + fi + echo "" +done + +echo "📋 6. 问题总结和建议" +echo "==================" + +# 检查是否有 leader +if nomad server members 2>/dev/null | grep -q "leader"; then + echo "✅ 集群有 leader" +else + echo "❌ 集群没有 leader - 这是主要问题!" + echo "" + echo "🔧 建议的修复步骤:" + echo "1. 先尝试 ash3c IP 修复: ./scripts/utilities/fix-ash3c-ip.sh" + echo "2. 如果还不行,使用核弹级重置: ./scripts/utilities/nuclear-reset.sh" + echo "3. 检查 master 节点是否需要重启" +fi + +echo "" +echo "🔗 有用的链接:" +echo " Web UI: http://100.116.158.95:4646" +echo " 日志查看: journalctl -u nomad -f" +echo "" +echo "🔍 诊断完成!" \ No newline at end of file diff --git a/scripts/utilities/nuclear-reset.sh b/scripts/utilities/nuclear-reset.sh new file mode 100755 index 0000000..a1f8d8b --- /dev/null +++ b/scripts/utilities/nuclear-reset.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +# ☢️ 核弹级 Nomad 重置执行脚本 ☢️ + +set -e + +echo "☢️☢️☢️ 核弹级 Nomad 集群重置 ☢️☢️☢️" +echo "" +echo "这个脚本将:" +echo "1. 完全摧毁所有 Nomad 进程和数据" +echo "2. 重新下载并安装 Nomad 二进制文件" +echo "3. 创建全新的配置文件" +echo "4. 重新启动整个集群" +echo "" +echo "⚠️ 警告:这是不可逆的操作!⚠️" +echo "" + +# 检查是否在正确的目录 +if [[ ! -f "scripts/utilities/NUCLEAR-NOMAD-RESET.yml" ]]; then + echo "❌ 错误:请在 /root/mgmt 目录下运行此脚本" + exit 1 +fi + +# 确认操作 +read -p "你确定要进行核弹级重置吗?输入 'NUCLEAR' 确认: " confirm +if [[ "$confirm" != "NUCLEAR" ]]; then + echo "❌ 操作已取消" + exit 1 +fi + +echo "" +echo "🚀 开始核弹级重置..." +echo "" + +# 设置 Ansible 配置 +export ANSIBLE_HOST_KEY_CHECKING=False +export ANSIBLE_STDOUT_CALLBACK=yaml + +# 执行核弹级重置 +echo "📡 执行 Ansible playbook..." +cd /root/mgmt/configuration + +ansible-playbook \ + -i inventories/production/nomad-cluster.ini \ + ../scripts/utilities/NUCLEAR-NOMAD-RESET.yml \ + --extra-vars "ansible_ssh_common_args='-o StrictHostKeyChecking=no'" \ + -v + +echo "" +echo "⏰ 等待集群稳定..." +sleep 30 + +echo "" +echo "🔍 检查集群状态..." 
+ +# 检查集群成员 +echo "📊 集群成员状态:" +nomad server members || echo "❌ 无法获取集群成员状态" + +echo "" +echo "📊 节点状态:" +nomad node status || echo "❌ 无法获取节点状态" + +echo "" +echo "🎯 如果上面显示错误,请等待几分钟后再次检查" +echo "集群可能需要一些时间来完全启动和同步" + +echo "" +echo "🔧 有用的命令:" +echo " 检查集群成员: nomad server members" +echo " 检查节点状态: nomad node status" +echo " 查看日志: journalctl -u nomad -f" +echo " Web UI: http://100.116.158.95:4646" + +echo "" +echo "☢️ 核弹级重置完成!☢️" \ No newline at end of file diff --git a/scripts/utilities/simple-nomad-fix.sh b/scripts/utilities/simple-nomad-fix.sh new file mode 100755 index 0000000..f0feaa3 --- /dev/null +++ b/scripts/utilities/simple-nomad-fix.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +echo "=== 简单的 Nomad 集群修复脚本 ===" + +# 定义 Tailscale IP 地址 +SEMAPHORE_IP="100.116.158.95" +MASTER_IP="100.117.106.136" +ASH3C_IP="100.116.80.94" +ENCRYPT_KEY="NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + +# 创建配置文件函数 +create_config() { + local node_name=$1 + local bind_ip=$2 + + cat > /tmp/nomad-${node_name}.hcl << EOF +datacenter = "dc1" +region = "global" +data_dir = "/opt/nomad/data" + +bind_addr = "${bind_ip}" + +server { + enabled = true + bootstrap_expect = 3 + encrypt = "${ENCRYPT_KEY}" + + server_join { + retry_join = ["${SEMAPHORE_IP}", "${MASTER_IP}", "${ASH3C_IP}"] + } +} + +client { + enabled = true +} + +ui_config { + enabled = true +} + +addresses { + http = "0.0.0.0" + rpc = "${bind_ip}" + serf = "${bind_ip}" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +plugin "docker" { + config { + allow_privileged = true + volumes { + enabled = true + } + } +} + +log_level = "INFO" +log_file = "/var/log/nomad/nomad.log" +EOF +} + +echo "1. 停止所有 Nomad 服务..." +systemctl stop nomad +ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S systemctl stop nomad" +ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S systemctl stop nomad" + +echo "2. 清理数据目录..." +rm -rf /opt/nomad/data/* +ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S rm -rf /opt/nomad/data/*" +ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S rm -rf /opt/nomad/data/*" + +echo "3. 创建新配置文件..." +create_config "semaphore" "${SEMAPHORE_IP}" +create_config "master" "${MASTER_IP}" +create_config "ash3c" "${ASH3C_IP}" + +echo "4. 部署配置文件..." +cp /tmp/nomad-semaphore.hcl /etc/nomad.d/nomad.hcl +chown nomad:nomad /etc/nomad.d/nomad.hcl + +scp -P 60022 -i ~/.ssh/id_ed25519 /tmp/nomad-master.hcl ben@${MASTER_IP}:/tmp/ +ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S cp /tmp/nomad-master.hcl /etc/nomad.d/nomad.hcl && echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl" + +scp -P 22 -i ~/.ssh/id_ed25519 /tmp/nomad-ash3c.hcl ben@${ASH3C_IP}:/tmp/ +ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S cp /tmp/nomad-ash3c.hcl /etc/nomad.d/nomad.hcl && echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl" + +echo "5. 启动服务..." +systemctl start nomad +ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S systemctl start nomad" +ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S systemctl start nomad" + +echo "6. 等待集群形成..." +sleep 30 + +echo "7. 检查集群状态..." 
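This script drives each node over raw SSH with `echo '3131' | sudo -S`, which bakes the sudo password into the script and exposes it in process listings. Since the repository already has an inventory carrying `ansible_become_pass`, the same stop/clean/start sequence can be expressed as ad-hoc Ansible — a sketch using the existing nomad-cluster.ini (assumes it runs from the repo root, /root/mgmt):

# Equivalent stop/clean/start across all nodes via the existing inventory.
INV=configuration/inventories/production/nomad-cluster.ini
ansible nomad_cluster -i "$INV" -b -m systemd -a "name=nomad state=stopped"
ansible nomad_cluster -i "$INV" -b -m shell -a "rm -rf /opt/nomad/data/*"
ansible nomad_cluster -i "$INV" -b -m systemd -a "name=nomad state=started"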
+nomad server members +nomad node status + +echo "=== 修复完成 ===" \ No newline at end of file diff --git a/scripts/utilities/ultimate-nomad-fix.yml b/scripts/utilities/ultimate-nomad-fix.yml new file mode 100644 index 0000000..d051a57 --- /dev/null +++ b/scripts/utilities/ultimate-nomad-fix.yml @@ -0,0 +1,113 @@ +--- +- name: Ultimate Nomad Cluster Fix - Complete Reset + hosts: nomad_cluster + become: yes + gather_facts: yes + vars: + nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + + tasks: + - name: Stop and disable nomad service completely + systemd: + name: nomad + state: stopped + enabled: no + daemon_reload: yes + ignore_errors: yes + + - name: Kill any remaining nomad processes + shell: pkill -f nomad || true + ignore_errors: yes + + - name: Remove all nomad data and state + file: + path: "{{ item }}" + state: absent + loop: + - /opt/nomad/data + - /etc/nomad.d/nomad.hcl + - /var/log/nomad + + - name: Create clean nomad directories + file: + path: "{{ item }}" + state: directory + owner: nomad + group: nomad + mode: '0755' + loop: + - /etc/nomad.d + - /opt/nomad + - /opt/nomad/data + - /opt/nomad/alloc_mounts + - /var/log/nomad + + - name: Create minimal nomad configuration + copy: + content: | + datacenter = "dc1" + region = "global" + data_dir = "/opt/nomad/data" + + bind_addr = "{{ ansible_default_ipv4.address }}" + + server { + enabled = true + bootstrap_expect = 1 + encrypt = "{{ nomad_encrypt_key }}" + } + + client { + enabled = true + alloc_dir = "/opt/nomad/alloc_mounts" + } + + ui { + enabled = true + } + + addresses { + http = "0.0.0.0" + rpc = "{{ ansible_default_ipv4.address }}" + serf = "{{ ansible_default_ipv4.address }}" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + log_level = "INFO" + log_file = "/var/log/nomad/nomad.log" + dest: /etc/nomad.d/nomad.hcl + owner: nomad + group: nomad + mode: '0640' + + - name: Enable and start nomad service + systemd: + name: nomad + state: started + enabled: yes + daemon_reload: yes + + - name: Wait for nomad to start + wait_for: + port: 4646 + host: "{{ ansible_default_ipv4.address }}" + delay: 10 + timeout: 60 + + - name: Check nomad status + uri: + url: "http://{{ ansible_default_ipv4.address }}:4646/v1/status/leader" + method: GET + register: nomad_leader + retries: 5 + delay: 5 + ignore_errors: yes + + - name: Display nomad status + debug: + msg: "Nomad leader status: {{ nomad_leader.json if nomad_leader.json is defined else 'No leader elected yet' }}" \ No newline at end of file diff --git a/snippets/zsh/zshrc-minimal.sh b/snippets/zsh/zshrc-minimal.sh new file mode 100644 index 0000000..7fc136c --- /dev/null +++ b/snippets/zsh/zshrc-minimal.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# 最小化 ZSH 配置 - 适合快速部署 +# 用法: curl -fsSL https://your-gitea.com/ben/mgmt/raw/branch/main/snippets/zsh/zshrc-minimal.sh | bash + +set -euo pipefail + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } +log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +# 检查 root 权限 +if [[ $EUID -ne 0 ]]; then + log_error "需要 root 权限" + exit 1 +fi + +log_info "开始安装最小化 ZSH 配置..." + +# 安装依赖 +apt update && apt install -y zsh git curl fonts-powerline + +# 安装 oh-my-zsh +if [[ ! 
-d "$HOME/.oh-my-zsh" ]]; then + RUNZSH=no CHSH=no sh -c "$(curl -fsSL https://raw.github.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" +fi + +# 安装关键插件 +custom_dir="$HOME/.oh-my-zsh/custom/plugins" +mkdir -p "$custom_dir" + +[[ ! -d "$custom_dir/zsh-autosuggestions" ]] && git clone https://github.com/zsh-users/zsh-autosuggestions "$custom_dir/zsh-autosuggestions" +[[ ! -d "$custom_dir/zsh-syntax-highlighting" ]] && git clone https://github.com/zsh-users/zsh-syntax-highlighting.git "$custom_dir/zsh-syntax-highlighting" + +# 创建最小化配置 +cat > "$HOME/.zshrc" << 'EOF' +# Oh My Zsh 配置 +export ZSH="$HOME/.oh-my-zsh" +ZSH_THEME="agnoster" + +plugins=( + git + docker + ansible + terraform + kubectl + zsh-autosuggestions + zsh-syntax-highlighting +) + +source $ZSH/oh-my-zsh.sh + +# 基本别名 +alias ll='ls -alF' +alias la='ls -A' +alias l='ls -CF' +alias ..='cd ..' +alias ...='cd ../..' +alias grep='grep --color=auto' + +# Docker 别名 +alias d='docker' +alias dps='docker ps' +alias dpsa='docker ps -a' +alias dex='docker exec -it' +alias dlog='docker logs -f' + +# Kubernetes 别名 +alias k='kubectl' +alias kgp='kubectl get pods' +alias kgs='kubectl get services' +alias kgd='kubectl get deployments' + +# Git 别名 +alias gs='git status' +alias ga='git add' +alias gc='git commit' +alias gp='git push' +alias gl='git pull' + +# 历史配置 +HISTSIZE=10000 +SAVEHIST=10000 +HISTFILE=~/.zsh_history +setopt SHARE_HISTORY +setopt HIST_IGNORE_DUPS + +# 自动建议配置 +ZSH_AUTOSUGGEST_HIGHLIGHT_STYLE='fg=8' +ZSH_AUTOSUGGEST_STRATEGY=(history completion) + +echo "🚀 ZSH 配置完成!" +EOF + +# 设置默认 shell +chsh -s "$(which zsh)" + +log_success "最小化 ZSH 配置安装完成!" +log_info "请重新登录或运行: source ~/.zshrc" diff --git a/swarm/stacks/consul-cluster-fixed.yml b/swarm/stacks/consul-cluster-fixed.yml new file mode 100644 index 0000000..0fea038 --- /dev/null +++ b/swarm/stacks/consul-cluster-fixed.yml @@ -0,0 +1,76 @@ +version: '3.8' + +services: + consul-master: + image: consul:latest + hostname: consul-master + command: > + sh -c " + IP=$$(hostname -i | awk '{print $$1}'); + consul agent -server -bootstrap-expect=2 + -datacenter=dc1 -data-dir=/consul/data + -node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0 + -ui + -log-level=INFO + " + ports: + - "8500:8500" + - "8600:8600/udp" + volumes: + - consul_master_data:/consul/data + networks: + consul-net: + aliases: + - consul-master + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.hostname == master + restart_policy: + condition: on-failure + delay: 5s + max_attempts: 3 + + consul-ash3c: + image: consul:latest + hostname: consul-ash3c + command: > + sh -c " + IP=$$(hostname -i | awk '{print $$1}'); + consul agent -server -bootstrap-expect=2 + -datacenter=dc1 -data-dir=/consul/data + -node=consul-ash3c -bind=$$IP -advertise=$$IP -client=0.0.0.0 + -retry-join=consul-master + -ui + -log-level=INFO + " + ports: + - "8501:8500" + - "8601:8600/udp" + volumes: + - consul_ash3c_data:/consul/data + networks: + consul-net: + aliases: + - consul-ash3c + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.hostname == ash3c + restart_policy: + condition: on-failure + delay: 5s + max_attempts: 3 + +volumes: + consul_master_data: + consul_ash3c_data: + +networks: + consul-net: + driver: overlay + attachable: true \ No newline at end of file diff --git a/swarm/stacks/consul-cluster-host-network.yml b/swarm/stacks/consul-cluster-host-network.yml new file mode 100644 index 0000000..300bddd --- /dev/null +++ b/swarm/stacks/consul-cluster-host-network.yml @@ -0,0 +1,68 
\ No newline at end of file
diff --git a/swarm/stacks/consul-cluster-host-network.yml b/swarm/stacks/consul-cluster-host-network.yml
new file mode 100644
index 0000000..300bddd
--- /dev/null
+++ b/swarm/stacks/consul-cluster-host-network.yml
@@ -0,0 +1,68 @@
+version: '3.8'
+
+services:
+  consul-master:
+    image: consul:latest
+    hostname: consul-master
+    command: >
+      sh -c "
+      consul agent -server -bootstrap-expect=2
+      -datacenter=dc1 -data-dir=/consul/data
+      -node=consul-master -bind=100.117.106.136 -advertise=100.117.106.136 -client=0.0.0.0
+      -ui
+      -log-level=INFO
+      "
+    ports:
+      - "8500:8500"
+      - "8600:8600/udp"
+      - "8301:8301"
+      - "8302:8302"
+    volumes:
+      - consul_master_data:/consul/data
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == master
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+
+  consul-ash3c:
+    image: consul:latest
+    hostname: consul-ash3c
+    command: >
+      sh -c "
+      ASH3C_IP=$$(getent hosts ash3c | awk '{print $$1}');
+      consul agent -server -bootstrap-expect=2
+      -datacenter=dc1 -data-dir=/consul/data
+      -node=consul-ash3c -bind=$$ASH3C_IP -advertise=$$ASH3C_IP -client=0.0.0.0
+      -retry-join=100.117.106.136
+      -ui
+      -log-level=INFO
+      "
+    ports:
+      - "8501:8500"
+      - "8601:8600/udp"
+      - "8311:8301"
+      - "8312:8302"
+    volumes:
+      - consul_ash3c_data:/consul/data
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == ash3c
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+    depends_on:
+      - consul-master
+
+volumes:
+  consul_master_data:
+  consul_ash3c_data:
\ No newline at end of file
diff --git a/swarm/stacks/consul-cluster-ip-based.yml b/swarm/stacks/consul-cluster-ip-based.yml
new file mode 100644
index 0000000..56a86f0
--- /dev/null
+++ b/swarm/stacks/consul-cluster-ip-based.yml
@@ -0,0 +1,78 @@
+version: '3.8'
+
+services:
+  consul-master:
+    image: consul:latest
+    hostname: consul-master
+    command: >
+      sh -c "
+      IP=$$(hostname -i | awk '{print $$1}');
+      consul agent -server -bootstrap-expect=2
+      -datacenter=dc1 -data-dir=/consul/data
+      -node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0
+      -ui
+      -log-level=INFO
+      "
+    ports:
+      - "8500:8500"
+      - "8600:8600/udp"
+    volumes:
+      - consul_master_data:/consul/data
+    networks:
+      consul-net:
+        aliases:
+          - consul-master
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == master
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+
+  consul-ash3c:
+    image: consul:latest
+    hostname: consul-ash3c
+    command: >
+      sh -c "
+      IP=$$(hostname -i | awk '{print $$1}');
+      consul agent -server -bootstrap-expect=2
+      -datacenter=dc1 -data-dir=/consul/data
+      -node=consul-ash3c -bind=$$IP -advertise=$$IP -client=0.0.0.0
+      -retry-join=10.0.5.5
+      -ui
+      -log-level=INFO
+      "
+    ports:
+      - "8501:8500"
+      - "8601:8600/udp"
+    volumes:
+      - consul_ash3c_data:/consul/data
+    networks:
+      consul-net:
+        aliases:
+          - consul-ash3c
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == ash3c
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+    depends_on:
+      - consul-master
+
+volumes:
+  consul_master_data:
+  consul_ash3c_data:
+
+networks:
+  consul-net:
+    driver: overlay
+    attachable: true
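+
+# Note: -retry-join=10.0.5.5 assumes Docker happened to assign that overlay IP
+# to consul-master, which is not guaranteed across redeploys. A sketch using
+# the service's network alias instead (resolved by Docker's embedded DNS, as
+# consul-cluster-fixed.yml does):
+#   -retry-join=consul-master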
\ No newline at end of file
diff --git a/swarm/stacks/consul-cluster-macvlan.yml b/swarm/stacks/consul-cluster-macvlan.yml
new file mode 100644
index 0000000..aa8e2a4
--- /dev/null
+++ b/swarm/stacks/consul-cluster-macvlan.yml
@@ -0,0 +1,78 @@
+version: '3.8'
+
+services:
+  consul-master:
+    image: consul:latest
+    hostname: consul-master
+    command: >
+      sh -c "
+      consul agent -server -bootstrap-expect=2
+      -datacenter=dc1 -data-dir=/consul/data
+      -node=consul-master -bind=192.168.1.100 -advertise=192.168.1.100 -client=0.0.0.0
+      -ui
+      -log-level=INFO
+      "
+    ports:
+      - "8500:8500"
+      - "8600:8600/udp"
+    volumes:
+      - consul_master_data:/consul/data
+    networks:
+      consul-macvlan:
+        ipv4_address: 192.168.1.100
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == master
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+
+  consul-ash3c:
+    image: consul:latest
+    hostname: consul-ash3c
+    command: >
+      sh -c "
+      consul agent -server -bootstrap-expect=2
+      -datacenter=dc1 -data-dir=/consul/data
+      -node=consul-ash3c -bind=192.168.1.101 -advertise=192.168.1.101 -client=0.0.0.0
+      -retry-join=192.168.1.100
+      -ui
+      -log-level=INFO
+      "
+    ports:
+      - "8501:8500"
+      - "8601:8600/udp"
+    volumes:
+      - consul_ash3c_data:/consul/data
+    networks:
+      consul-macvlan:
+        ipv4_address: 192.168.1.101
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == ash3c
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+
+volumes:
+  consul_master_data:
+  consul_ash3c_data:
+
+networks:
+  consul-macvlan:
+    driver: macvlan
+    driver_opts:
+      parent: eth0  # adjust to match your host's network interface
+    ipam:
+      config:
+        - subnet: 192.168.1.0/24
+          gateway: 192.168.1.1
+          ip_range: 192.168.1.100/30  # allocate only the .100-.103 IPs
\ No newline at end of file
diff --git a/swarm/stacks/consul-single-node.yml b/swarm/stacks/consul-single-node.yml
new file mode 100644
index 0000000..379d78d
--- /dev/null
+++ b/swarm/stacks/consul-single-node.yml
@@ -0,0 +1,40 @@
+version: '3.8'
+
+services:
+  consul:
+    image: consul:latest
+    hostname: consul-master
+    command: >
+      sh -c "
+      IP=$$(hostname -i | awk '{print $$1}');
+      consul agent -server -bootstrap-expect=1
+      -datacenter=dc1 -data-dir=/consul/data
+      -node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0
+      -ui
+      -log-level=INFO
+      "
+    ports:
+      - "8500:8500"
+      - "8600:8600/udp"
+    volumes:
+      - consul_data:/consul/data
+    networks:
+      - consul-net
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == master
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+
+volumes:
+  consul_data:
+
+networks:
+  consul-net:
+    driver: overlay
+    attachable: true
\ No newline at end of file
diff --git a/tofu/environments/production/nomad-multi-dc.tf b/tofu/environments/production/nomad-multi-dc.tf
new file mode 100644
index 0000000..4bbaf94
--- /dev/null
+++ b/tofu/environments/production/nomad-multi-dc.tf
@@ -0,0 +1,169 @@
+# Nomad multi-datacenter production environment configuration
+# Deployment topology: CN(dc1) + KR(dc2) + US(dc3)
+
+terraform {
+  required_version = ">= 1.0"
+
+  required_providers {
+    oci = {
+      source  = "oracle/oci"
+      version = "~> 5.0"
+    }
+    huaweicloud = {
+      source  = "huaweicloud/huaweicloud"
+      version = "~> 1.60"
+    }
+  }
+}
+
+# Oracle Cloud provider (Korea)
+provider "oci" {
+  alias            = "korea"
+  tenancy_ocid     = var.oracle_tenancy_ocid
+  user_ocid        = var.oracle_user_ocid
+  fingerprint      = var.oracle_fingerprint
+  private_key_path = var.oracle_private_key_path
+  region           = "ap-seoul-1" # Seoul, Korea
+}
+
+# Huawei Cloud provider (US)
+provider "huaweicloud" {
+  alias      = "us"
+  access_key = var.huawei_access_key
+  secret_key = var.huawei_secret_key
+  region     = "us-east-1" # US East
+}
+
+# Local values
+locals {
+  project_name = "nomad-multi-dc"
+  environment  = "production"
+
+  common_tags = {
+    Project     = local.project_name
+    Environment = local.environment
+    ManagedBy   = "opentofu"
+    Owner       = "devops-team"
+  }
+}
+
+# Data source: read the SSH public key
+data "local_file" "ssh_public_key" {
+  filename = pathexpand("~/.ssh/id_rsa.pub")
+}
+
+# Oracle Cloud infrastructure (Korea - dc2)
+module "oracle_infrastructure" {
+  source = "../../providers/oracle-cloud"
+
+  providers = {
+    oci = oci.korea
+  }
+
+  project_name = local.project_name
+  environment  = local.environment
+  vpc_cidr     = "10.1.0.0/16"
+
+  oci_config = {
+    tenancy_ocid     = var.oracle_tenancy_ocid
+    user_ocid        = var.oracle_user_ocid
+    fingerprint      = var.oracle_fingerprint
+    private_key_path = var.oracle_private_key_path
+    region           = "ap-seoul-1"
+  }
+
+  common_tags = local.common_tags
+}
+
+# Huawei Cloud infrastructure (US - dc3)
+module "huawei_infrastructure" {
+  source = "../../providers/huawei-cloud"
+
+  providers = {
+    huaweicloud = huaweicloud.us
+  }
+
+  project_name       = local.project_name
+  environment        = local.environment
+  vpc_cidr           = "10.2.0.0/16"
+  availability_zones = ["us-east-1a", "us-east-1b"]
+
+  common_tags = local.common_tags
+}
+
+# Nomad multi-datacenter cluster
+module "nomad_cluster" {
+  source = "../../modules/nomad-cluster"
+
+  # Deployment toggles
+  deploy_korea_node = var.deploy_korea_node
+  deploy_us_node    = var.deploy_us_node
+
+  # Oracle Cloud configuration
+  oracle_config = {
+    tenancy_ocid     = var.oracle_tenancy_ocid
+    user_ocid        = var.oracle_user_ocid
+    fingerprint      = var.oracle_fingerprint
+    private_key_path = var.oracle_private_key_path
+    region           = "ap-seoul-1"
+  }
+
+  oracle_subnet_id         = module.oracle_infrastructure.public_subnet_ids[0]
+  oracle_security_group_id = module.oracle_infrastructure.security_group_id
+
+  # Huawei Cloud configuration
+  huawei_config = {
+    access_key = var.huawei_access_key
+    secret_key = var.huawei_secret_key
+    region     = "us-east-1"
+  }
+
+  huawei_subnet_id         = module.huawei_infrastructure.public_subnet_ids[0]
+  huawei_security_group_id = module.huawei_infrastructure.security_group_id
+
+  # Common configuration
+  ssh_public_key = data.local_file.ssh_public_key.content
+  common_tags    = local.common_tags
+
+  # Nomad configuration
+  nomad_version     = "1.10.5"
+  nomad_encrypt_key = var.nomad_encrypt_key
+}
+
+# Generate the Ansible inventory
+resource "local_file" "ansible_inventory" {
+  filename = "${path.module}/generated/nomad-cluster-inventory.yml"
+  content = yamlencode({
+    all = {
+      children = {
+        nomad_servers = {
+          hosts = module.nomad_cluster.ansible_inventory.all.children.nomad_servers.hosts
+        }
+      }
+      vars = {
+        ansible_user                 = "ubuntu"
+        ansible_ssh_private_key_file = "~/.ssh/id_rsa"
+        ansible_ssh_common_args      = "-o StrictHostKeyChecking=no"
+      }
+    }
+  })
+}
+
+# Generate the post-deploy configuration script
+resource "local_file" "post_deploy_script" {
+  filename = "${path.module}/generated/post-deploy.sh"
+  content = templatefile("${path.module}/templates/post-deploy.sh", {
+    cluster_overview = module.nomad_cluster.cluster_overview
+    endpoints        = module.nomad_cluster.cluster_endpoints
+  })
+
+  file_permission = "0755"
+}
+
+# Generate the cross-datacenter test job
+resource "local_file" "cross_dc_test_job" {
+  filename = "${path.module}/generated/cross-dc-test.nomad"
+  content = templatefile("${path.module}/templates/cross-dc-test.nomad", {
+    datacenters = ["dc1", "dc2", "dc3"]
+  })
+}
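+
+# For reference, a minimal job of the shape the template above might render
+# (hypothetical content - the real template lives in templates/cross-dc-test.nomad):
+#
+# job "cross-dc-test" {
+#   datacenters = ["dc1", "dc2", "dc3"]
+#   group "ping" {
+#     count = 3
+#     task "sleep" {
+#       driver = "docker"
+#       config {
+#         image   = "alpine:3.19"
+#         command = "sleep"
+#         args    = ["3600"]
+#       }
+#     }
+#   }
+# }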
"华为云美国节点信息" + value = module.nomad_cluster.huawei_us_node +} + +output "deployment_summary" { + description = "部署摘要" + value = { + total_nodes = module.nomad_cluster.cluster_overview.total_nodes + datacenters = keys(module.nomad_cluster.cluster_overview.datacenters) + + next_steps = [ + "1. 等待所有节点启动完成 (约 5-10 分钟)", + "2. 运行: ./generated/post-deploy.sh", + "3. 验证集群: nomad server members", + "4. 测试跨 DC 调度: nomad job run generated/cross-dc-test.nomad", + "5. 访问 Web UI 查看集群状态" + ] + + web_ui_urls = module.nomad_cluster.cluster_endpoints.nomad_ui_urls + + ssh_commands = module.nomad_cluster.cluster_endpoints.ssh_commands + } +} + +output "verification_commands" { + description = "验证命令" + value = module.nomad_cluster.verification_commands +} \ No newline at end of file diff --git a/tofu/environments/production/terraform.tfvars.example b/tofu/environments/production/terraform.tfvars.example new file mode 100644 index 0000000..4fc4c7c --- /dev/null +++ b/tofu/environments/production/terraform.tfvars.example @@ -0,0 +1,22 @@ +# Nomad 多数据中心生产环境配置示例 +# 复制此文件为 terraform.tfvars 并填入实际值 + +# 部署控制 +deploy_korea_node = true # 是否部署韩国节点 +deploy_us_node = true # 是否部署美国节点 + +# Oracle Cloud 配置 (韩国 - dc2) +# 获取方式: https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm +oracle_tenancy_ocid = "ocid1.tenancy.oc1..aaaaaaaa..." +oracle_user_ocid = "ocid1.user.oc1..aaaaaaaa..." +oracle_fingerprint = "aa:bb:cc:dd:ee:ff:..." +oracle_private_key_path = "~/.oci/oci_api_key.pem" + +# 华为云配置 (美国 - dc3) +# 获取方式: https://console.huaweicloud.com/iam/#/mine/accessKey +huawei_access_key = "YOUR_HUAWEI_ACCESS_KEY" +huawei_secret_key = "YOUR_HUAWEI_SECRET_KEY" + +# Nomad 集群加密密钥 (可选,已有默认值) +# 生成方式: nomad operator keygen +nomad_encrypt_key = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" \ No newline at end of file diff --git a/tofu/environments/production/variables.tf b/tofu/environments/production/variables.tf new file mode 100644 index 0000000..edbaf18 --- /dev/null +++ b/tofu/environments/production/variables.tf @@ -0,0 +1,60 @@ +# Nomad 多数据中心生产环境变量 + +# 部署控制 +variable "deploy_korea_node" { + description = "是否部署韩国节点 (Oracle Cloud)" + type = bool + default = true +} + +variable "deploy_us_node" { + description = "是否部署美国节点 (华为云)" + type = bool + default = true +} + +# Oracle Cloud 配置 +variable "oracle_tenancy_ocid" { + description = "Oracle Cloud 租户 OCID" + type = string + sensitive = true +} + +variable "oracle_user_ocid" { + description = "Oracle Cloud 用户 OCID" + type = string + sensitive = true +} + +variable "oracle_fingerprint" { + description = "Oracle Cloud API 密钥指纹" + type = string + sensitive = true +} + +variable "oracle_private_key_path" { + description = "Oracle Cloud 私钥文件路径" + type = string + sensitive = true +} + +# 华为云配置 +variable "huawei_access_key" { + description = "华为云访问密钥" + type = string + sensitive = true +} + +variable "huawei_secret_key" { + description = "华为云秘密密钥" + type = string + sensitive = true +} + +# Nomad 配置 +variable "nomad_encrypt_key" { + description = "Nomad 集群加密密钥" + type = string + sensitive = true + default = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" +} \ No newline at end of file diff --git a/tofu/modules/nomad-cluster/main.tf b/tofu/modules/nomad-cluster/main.tf new file mode 100644 index 0000000..d157397 --- /dev/null +++ b/tofu/modules/nomad-cluster/main.tf @@ -0,0 +1,159 @@ +# Nomad 多数据中心集群模块 +# 支持跨地域部署:CN(dc1) + KR(dc2) + US(dc3) + +terraform { + required_providers { + oci = { + source = "oracle/oci" + version = "~> 5.0" + } + huaweicloud = { + source = 
"huaweicloud/huaweicloud" + version = "~> 1.60" + } + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} + +# 本地变量 +locals { + nomad_version = "1.10.5" + + # 通用 Nomad 配置 + nomad_encrypt_key = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" + + # 数据中心配置 + datacenters = { + dc1 = { + name = "dc1" + region = "cn" + location = "China" + provider = "existing" # 现有的 semaphore 节点 + } + dc2 = { + name = "dc2" + region = "kr" + location = "Korea" + provider = "oracle" + } + dc3 = { + name = "dc3" + region = "us" + location = "US" + provider = "huawei" # 或 aws + } + } + + # 用户数据模板 + user_data_template = templatefile("${path.module}/templates/nomad-userdata.sh", { + nomad_version = local.nomad_version + nomad_encrypt_key = local.nomad_encrypt_key + }) +} + +# 数据源:获取现有的 semaphore 节点信息 +data "external" "semaphore_info" { + program = ["bash", "-c", <<-EOF + echo '{ + "ip": "100.116.158.95", + "datacenter": "dc1", + "status": "existing" + }' + EOF + ] +} + +# Oracle Cloud 韩国节点 (dc2) +module "oracle_korea_node" { + source = "../compute" + + count = var.deploy_korea_node ? 1 : 0 + + # Oracle Cloud 特定配置 + provider_type = "oracle" + + # 实例配置 + instance_config = { + name = "nomad-master-kr" + datacenter = "dc2" + instance_type = "VM.Standard.E2.1.Micro" # 免费层 + image_id = var.oracle_ubuntu_image_id + subnet_id = var.oracle_subnet_id + + # Nomad 配置 + nomad_role = "server" + bootstrap_expect = 1 + bind_addr = "auto" # 自动检测 + + # 网络配置 + security_groups = [var.oracle_security_group_id] + + # 标签 + tags = merge(var.common_tags, { + Name = "nomad-master-kr" + Datacenter = "dc2" + Role = "nomad-server" + Provider = "oracle" + }) + } + + # 用户数据 + user_data = templatefile("${path.module}/templates/nomad-userdata.sh", { + datacenter = "dc2" + nomad_version = local.nomad_version + nomad_encrypt_key = local.nomad_encrypt_key + bootstrap_expect = 1 + bind_addr = "auto" + server_enabled = true + client_enabled = true + }) +} + +# 华为云美国节点 (dc3) +module "huawei_us_node" { + source = "../compute" + + count = var.deploy_us_node ? 1 : 0 + + # 华为云特定配置 + provider_type = "huawei" + + # 实例配置 + instance_config = { + name = "nomad-ash3c-us" + datacenter = "dc3" + instance_type = "s6.small.1" # 1vCPU 1GB + image_id = var.huawei_ubuntu_image_id + subnet_id = var.huawei_subnet_id + + # Nomad 配置 + nomad_role = "server" + bootstrap_expect = 1 + bind_addr = "auto" + + # 网络配置 + security_groups = [var.huawei_security_group_id] + + # 标签 + tags = merge(var.common_tags, { + Name = "nomad-ash3c-us" + Datacenter = "dc3" + Role = "nomad-server" + Provider = "huawei" + }) + } + + # 用户数据 + user_data = templatefile("${path.module}/templates/nomad-userdata.sh", { + datacenter = "dc3" + nomad_version = local.nomad_version + nomad_encrypt_key = local.nomad_encrypt_key + bootstrap_expect = 1 + bind_addr = "auto" + server_enabled = true + client_enabled = true + }) +} \ No newline at end of file diff --git a/tofu/modules/nomad-cluster/outputs.tf b/tofu/modules/nomad-cluster/outputs.tf new file mode 100644 index 0000000..61148ef --- /dev/null +++ b/tofu/modules/nomad-cluster/outputs.tf @@ -0,0 +1,145 @@ +# Nomad 多数据中心集群输出 + +# 集群概览 +output "cluster_overview" { + description = "Nomad 多数据中心集群概览" + value = { + datacenters = { + dc1 = { + name = "dc1" + location = "China (CN)" + provider = "existing" + node = "semaphore" + ip = "100.116.158.95" + status = "existing" + } + dc2 = var.deploy_korea_node ? 
+      dc2 = var.deploy_korea_node ? {
+        name     = "dc2"
+        location = "Korea (KR)"
+        provider = "oracle"
+        node     = "master"
+        ip       = try(module.oracle_korea_node[0].public_ip, "pending")
+        status   = "deployed"
+      } : null
+      dc3 = var.deploy_us_node ? {
+        name     = "dc3"
+        location = "US"
+        provider = "huawei"
+        node     = "ash3c"
+        ip       = try(module.huawei_us_node[0].public_ip, "pending")
+        status   = "deployed"
+      } : null
+    }
+    total_nodes = 1 + (var.deploy_korea_node ? 1 : 0) + (var.deploy_us_node ? 1 : 0)
+  }
+}
+
+# Oracle Cloud Korea node outputs
+output "oracle_korea_node" {
+  description = "Oracle Cloud Korea node info"
+  value = var.deploy_korea_node ? {
+    instance_id = try(module.oracle_korea_node[0].instance_id, null)
+    public_ip   = try(module.oracle_korea_node[0].public_ip, null)
+    private_ip  = try(module.oracle_korea_node[0].private_ip, null)
+    datacenter  = "dc2"
+    provider    = "oracle"
+    region      = var.oracle_config.region
+
+    # Connection info
+    ssh_command = try("ssh ubuntu@${module.oracle_korea_node[0].public_ip}", null)
+    nomad_ui    = try("http://${module.oracle_korea_node[0].public_ip}:4646", null)
+  } : null
+}
+
+# Huawei Cloud US node outputs
+output "huawei_us_node" {
+  description = "Huawei Cloud US node info"
+  value = var.deploy_us_node ? {
+    instance_id = try(module.huawei_us_node[0].instance_id, null)
+    public_ip   = try(module.huawei_us_node[0].public_ip, null)
+    private_ip  = try(module.huawei_us_node[0].private_ip, null)
+    datacenter  = "dc3"
+    provider    = "huawei"
+    region      = var.huawei_config.region
+
+    # Connection info
+    ssh_command = try("ssh ubuntu@${module.huawei_us_node[0].public_ip}", null)
+    nomad_ui    = try("http://${module.huawei_us_node[0].public_ip}:4646", null)
+  } : null
+}
+
+# Cluster connection info
+output "cluster_endpoints" {
+  description = "Cluster connection endpoints"
+  value = {
+    nomad_ui_urls = compact([
+      "http://100.116.158.95:4646", # dc1 - semaphore
+      var.deploy_korea_node ? try("http://${module.oracle_korea_node[0].public_ip}:4646", null) : null, # dc2
+      var.deploy_us_node ? try("http://${module.huawei_us_node[0].public_ip}:4646", null) : null        # dc3
+    ])
+
+    ssh_commands = compact([
+      "ssh root@100.116.158.95", # dc1 - semaphore
+      var.deploy_korea_node ? try("ssh ubuntu@${module.oracle_korea_node[0].public_ip}", null) : null, # dc2
+      var.deploy_us_node ? try("ssh ubuntu@${module.huawei_us_node[0].public_ip}", null) : null        # dc3
+    ])
+  }
+}
+
+# Ansible inventory generation
+output "ansible_inventory" {
+  description = "Generated Ansible inventory"
+  value = {
+    all = {
+      children = {
+        nomad_servers = {
+          hosts = merge(
+            {
+              semaphore = {
+                ansible_host = "100.116.158.95"
+                datacenter   = "dc1"
+                provider     = "existing"
+              }
+            },
+            var.deploy_korea_node ? {
+              master = {
+                ansible_host = try(module.oracle_korea_node[0].public_ip, "pending")
+                datacenter   = "dc2"
+                provider     = "oracle"
+              }
+            } : {},
+            var.deploy_us_node ? {
+              ash3c = {
+                ansible_host = try(module.huawei_us_node[0].public_ip, "pending")
+                datacenter   = "dc3"
+                provider     = "huawei"
+              }
+            } : {}
+          )
+        }
+      }
+    }
+  }
+}
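+
+# Usage sketch: render the generated inventory to YAML after apply (assumes
+# the mikefarah yq binary is available on the operator's machine):
+#   tofu output -json ansible_inventory | yq -P > inventory.yml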
try("http://${module.huawei_us_node[0].public_ip}:4646", null) : null + ]) : "curl -s ${url}/v1/status/leader"]) + ] +} \ No newline at end of file diff --git a/tofu/modules/nomad-cluster/templates/nomad-userdata.sh b/tofu/modules/nomad-cluster/templates/nomad-userdata.sh new file mode 100644 index 0000000..f0519b3 --- /dev/null +++ b/tofu/modules/nomad-cluster/templates/nomad-userdata.sh @@ -0,0 +1,230 @@ +#!/bin/bash +# Nomad 多数据中心节点自动配置脚本 +# 数据中心: ${datacenter} + +set -e + +# 日志函数 +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a /var/log/nomad-setup.log +} + +log "开始配置 Nomad 节点 - 数据中心: ${datacenter}" + +# 更新系统 +log "更新系统包..." +apt-get update -y +apt-get upgrade -y + +# 安装必要的包 +log "安装必要的包..." +apt-get install -y \ + curl \ + wget \ + unzip \ + jq \ + docker.io \ + docker-compose \ + htop \ + net-tools \ + vim + +# 启动 Docker +log "启动 Docker 服务..." +systemctl enable docker +systemctl start docker +usermod -aG docker ubuntu + +# 安装 Nomad +log "安装 Nomad ${nomad_version}..." +cd /tmp +wget -q https://releases.hashicorp.com/nomad/${nomad_version}/nomad_${nomad_version}_linux_amd64.zip +unzip nomad_${nomad_version}_linux_amd64.zip +mv nomad /usr/local/bin/ +chmod +x /usr/local/bin/nomad + +# 创建 Nomad 用户和目录 +log "创建 Nomad 用户和目录..." +useradd --system --home /etc/nomad.d --shell /bin/false nomad +mkdir -p /opt/nomad/data +mkdir -p /etc/nomad.d +mkdir -p /var/log/nomad +chown -R nomad:nomad /opt/nomad /etc/nomad.d /var/log/nomad + +# 获取本机 IP 地址 +if [ "${bind_addr}" = "auto" ]; then + # 尝试多种方法获取 IP + BIND_ADDR=$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null || \ + curl -s http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/ip -H "Metadata-Flavor: Google" 2>/dev/null || \ + ip route get 8.8.8.8 | awk '{print $7; exit}' || \ + hostname -I | awk '{print $1}') +else + BIND_ADDR="${bind_addr}" +fi + +log "检测到 IP 地址: $BIND_ADDR" + +# 创建 Nomad 配置文件 +log "创建 Nomad 配置文件..." +cat > /etc/nomad.d/nomad.hcl << EOF +datacenter = "${datacenter}" +region = "global" +data_dir = "/opt/nomad/data" + +bind_addr = "$BIND_ADDR" + +%{ if server_enabled } +server { + enabled = true + bootstrap_expect = ${bootstrap_expect} + encrypt = "${nomad_encrypt_key}" +} +%{ endif } + +%{ if client_enabled } +client { + enabled = true + + host_volume "docker-sock" { + path = "/var/run/docker.sock" + read_only = false + } +} +%{ endif } + +ui { + enabled = true +} + +addresses { + http = "0.0.0.0" + rpc = "$BIND_ADDR" + serf = "$BIND_ADDR" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +plugin "docker" { + config { + allow_privileged = true + volumes { + enabled = true + } + } +} + +telemetry { + collection_interval = "10s" + disable_hostname = false + prometheus_metrics = true + publish_allocation_metrics = true + publish_node_metrics = true +} + +log_level = "INFO" +log_file = "/var/log/nomad/nomad.log" +EOF + +# 创建 systemd 服务文件 +log "创建 systemd 服务文件..." +cat > /etc/systemd/system/nomad.service << EOF +[Unit] +Description=Nomad +Documentation=https://www.nomadproject.io/ +Requires=network-online.target +After=network-online.target +ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl + +[Service] +Type=notify +User=nomad +Group=nomad +ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d/nomad.hcl +ExecReload=/bin/kill -HUP \$MAINPID +KillMode=process +Restart=on-failure +LimitNOFILE=65536 + +[Install] +WantedBy=multi-user.target +EOF + +# 启动 Nomad 服务 +log "启动 Nomad 服务..." 
+systemctl daemon-reload
+systemctl enable nomad
+systemctl start nomad
+
+# Give the service a moment to come up
+log "Waiting for the Nomad service to start..."
+sleep 10
+
+# Verify the installation
+log "Verifying the Nomad installation..."
+if systemctl is-active --quiet nomad; then
+    log "✅ Nomad service is running"
+    log "📊 Node info:"
+    /usr/local/bin/nomad node status -self || true
+else
+    log "❌ Nomad service failed to start"
+    systemctl status nomad --no-pager || true
+    journalctl -u nomad --no-pager -n 20 || true
+fi
+
+# Configure the firewall (if present)
+log "Configuring firewall rules..."
+if command -v ufw >/dev/null 2>&1; then
+    ufw allow 4646/tcp  # HTTP API
+    ufw allow 4647/tcp  # RPC
+    ufw allow 4648/tcp  # Serf
+    ufw allow 22/tcp    # SSH
+fi
+
+# Create helper aliases and scripts
+log "Creating management scripts..."
+cat > /usr/local/bin/nomad-status << 'EOF'
+#!/bin/bash
+echo "=== Nomad service status ==="
+systemctl status nomad --no-pager
+
+echo -e "\n=== Nomad cluster members ==="
+nomad server members 2>/dev/null || echo "Unable to reach the cluster"
+
+echo -e "\n=== Nomad node status ==="
+nomad node status 2>/dev/null || echo "Unable to get node status"
+
+echo -e "\n=== Recent logs ==="
+journalctl -u nomad --no-pager -n 5
+EOF
+
+chmod +x /usr/local/bin/nomad-status
+
+# Add aliases to the ubuntu user's bashrc
+echo 'alias ns="nomad-status"' >> /home/ubuntu/.bashrc
+echo 'alias nomad-logs="journalctl -u nomad -f"' >> /home/ubuntu/.bashrc
+
+log "🎉 Nomad node setup complete!"
+log "📍 Datacenter: ${datacenter}"
+log "🌐 IP address: $BIND_ADDR"
+log "🔗 Web UI: http://$BIND_ADDR:4646"
+log "📝 Run 'nomad-status' or 'ns' to check status"
+
+# Write key info to the motd
+cat > /etc/update-motd.d/99-nomad << EOF
+#!/bin/bash
+echo ""
+echo "🚀 Nomad node info:"
+echo "  Datacenter: ${datacenter}"
+echo "  IP address: $BIND_ADDR"
+echo "  Web UI:     http://$BIND_ADDR:4646"
+echo "  Status:     nomad-status"
+echo ""
+EOF
+
+chmod +x /etc/update-motd.d/99-nomad
+
+log "Node bootstrap script finished"
\ No newline at end of file
diff --git a/tofu/modules/nomad-cluster/variables.tf b/tofu/modules/nomad-cluster/variables.tf
new file mode 100644
index 0000000..6033fb8
--- /dev/null
+++ b/tofu/modules/nomad-cluster/variables.tf
@@ -0,0 +1,118 @@
+# Variable definitions for the Nomad multi-datacenter cluster module
+
+variable "deploy_korea_node" {
+  description = "Whether to deploy the Korea node (Oracle Cloud)"
+  type        = bool
+  default     = true
+}
+
+variable "deploy_us_node" {
+  description = "Whether to deploy the US node (Huawei Cloud)"
+  type        = bool
+  default     = true
+}
+
+# Oracle Cloud configuration
+variable "oracle_config" {
+  description = "Oracle Cloud configuration"
+  type = object({
+    tenancy_ocid     = string
+    user_ocid        = string
+    fingerprint      = string
+    private_key_path = string
+    region           = string
+  })
+  sensitive = true
+}
+
+variable "oracle_ubuntu_image_id" {
+  description = "Oracle Cloud Ubuntu image ID"
+  type        = string
+  default     = "" # resolved automatically via a data source
+}
+
+variable "oracle_subnet_id" {
+  description = "Oracle Cloud subnet ID"
+  type        = string
+}
+
+variable "oracle_security_group_id" {
+  description = "Oracle Cloud security group ID"
+  type        = string
+}
+
+# Huawei Cloud configuration
+variable "huawei_config" {
+  description = "Huawei Cloud configuration"
+  type = object({
+    access_key = string
+    secret_key = string
+    region     = string
+  })
+  sensitive = true
+}
+
+variable "huawei_ubuntu_image_id" {
+  description = "Huawei Cloud Ubuntu image ID"
+  type        = string
+  default     = "" # resolved automatically via a data source
+}
+
+variable "huawei_subnet_id" {
+  description = "Huawei Cloud subnet ID"
+  type        = string
+}
+
+variable "huawei_security_group_id" {
+  description = "Huawei Cloud security group ID"
+  type        = string
+}
+
+# Common configuration
+variable "common_tags" {
+  description = "Common tags"
+  type        = map(string)
+  default = {
+    Project     = "nomad-multi-dc"
+    Environment = "production"
+    ManagedBy   = "opentofu"
+  }
+}
+
+variable "ssh_public_key" {
+  description = "SSH public key"
+  type        = string
+}
+
+variable "allowed_cidr_blocks" {
+  description = "CIDR blocks allowed to connect"
+  type        = list(string)
+  default     = ["0.0.0.0/0"] # should be restricted in production
+}
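+
+# Hedged sketch: a validation block on allowed_cidr_blocks could enforce that
+# the wide-open default is overridden in production (illustrative only, not
+# applied here):
+#
+# variable "allowed_cidr_blocks" {
+#   description = "CIDR blocks allowed to connect"
+#   type        = list(string)
+#   validation {
+#     condition     = !contains(var.allowed_cidr_blocks, "0.0.0.0/0")
+#     error_message = "Restrict allowed_cidr_blocks in production."
+#   }
+# }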
+variable "nomad_version" { + description = "Nomad 版本" + type = string + default = "1.10.5" +} + +variable "nomad_encrypt_key" { + description = "Nomad 集群加密密钥" + type = string + sensitive = true + default = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=" +} + +# 网络配置 +variable "vpc_cidr" { + description = "VPC CIDR 块" + type = string + default = "10.0.0.0/16" +} + +variable "availability_zones" { + description = "可用区列表" + type = list(string) + default = ["a", "b"] +} \ No newline at end of file