Houzhong Xu 2025-09-23 13:37:17 +00:00
parent 2fe53b6504
commit 3f45ad8361
No known key found for this signature in database
GPG Key ID: B44BEB1438F1B46F
74 changed files with 5012 additions and 3336 deletions

View File

@ -0,0 +1,14 @@
{
"proxies": {
"http-proxy": "http://istoreos.tailnet-68f9.ts.net:7891",
"https-proxy": "http://istoreos.tailnet-68f9.ts.net:7891",
"no-proxy": "localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net"
},
"registry-mirrors": [],
"insecure-registries": [],
"debug": false,
"experimental": false,
"features": {
"buildkit": true
}
}
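For reference, a minimal sketch of how a daemon-level proxy configuration like this is typically applied and verified — assuming it lands at /etc/docker/daemon.json (the target path is not stated in this diff) and a Docker Engine recent enough (23.0+) to support the "proxies" block:

```bash
#!/usr/bin/env bash
# Sketch: install the proxy configuration and confirm the daemon picked it up.
# Assumes the JSON above is saved locally as daemon.json.
set -euo pipefail

jq . daemon.json > /dev/null                      # basic JSON validation
sudo install -m 0644 daemon.json /etc/docker/daemon.json
sudo systemctl restart docker                     # proxies take effect on restart

# The daemon should now report the configured proxy settings.
docker info --format 'HTTP: {{.HTTPProxy}} | HTTPS: {{.HTTPSProxy}} | NoProxy: {{.NoProxy}}'
```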

View File

@ -0,0 +1,10 @@
[consul_cluster]
master ansible_host=master ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131
ash3c ansible_host=ash3c ansible_user=ben ansible_become=yes ansible_become_pass=3131
[consul_cluster:vars]
ansible_ssh_common_args='-o StrictHostKeyChecking=no'
consul_version=1.21.4
consul_datacenter=dc1
# Generate the encryption key with: consul keygen
vault_consul_encrypt_key=1EvGItLOB8nuHnSA0o+rO0zXzLeJl+U+Jfvuw0+H848=
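As the comment above notes, the gossip key comes from `consul keygen`. A rough usage sketch for this inventory — the inventory and playbook file names below are illustrative, not taken from this commit:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Generate a fresh 32-byte, base64-encoded gossip key for
# vault_consul_encrypt_key (ideally stored via ansible-vault, not plaintext).
consul keygen

# Run the cluster play against this inventory (file names are illustrative).
ansible-playbook -i inventories/consul.ini consul-cluster.yml

# On any cluster node, all members should show up as alive after convergence.
consul members
```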

View File

@ -0,0 +1,20 @@
[nomad_servers]
master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
semaphore ansible_connection=local nomad_role=server nomad_bootstrap_expect=3
ash3c ansible_host=100.116.80.94 ansible_port=22 ansible_user=ben ansible_become=yes ansible_become_pass=3131 nomad_role=server nomad_bootstrap_expect=3
[nomad_clients]
# Add client nodes here if needed
[nomad_cluster:children]
nomad_servers
nomad_clients
[nomad_cluster:vars]
ansible_ssh_private_key_file=~/.ssh/id_ed25519
ansible_user=ben
ansible_become=yes
nomad_version=1.10.5
nomad_datacenter=dc1
nomad_region=global
nomad_encrypt_key=NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ=
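A hedged usage sketch for the three-server inventory above; the inventory and playbook names are assumptions:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Check syntax, then deploy (file names are illustrative).
ansible-playbook -i inventories/nomad.ini nomad-cluster.yml --syntax-check
ansible-playbook -i inventories/nomad.ini nomad-cluster.yml

# With nomad_bootstrap_expect=3 a leader is elected once all three servers
# have joined; verify from any server node:
nomad server members
nomad operator raft list-peers
```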

View File

@ -0,0 +1,15 @@
[nomad_servers]
localhost ansible_connection=local nomad_role=server nomad_bootstrap_expect=1
[nomad_clients]
# Add client nodes here if needed
[nomad_cluster:children]
nomad_servers
nomad_clients
[nomad_cluster:vars]
ansible_user=root
nomad_version=1.6.2
nomad_datacenter=dc1
nomad_region=global

View File

@ -1,72 +0,0 @@
---
- name: Cloud Providers System Update Playbook
hosts: huawei,google,digitalocean,aws
become: yes
gather_facts: yes
tasks:
# Ubuntu/Debian system updates (apt)
- name: Update apt cache (Ubuntu/Debian)
apt:
update_cache: yes
cache_valid_time: 3600
when: ansible_os_family == "Debian"
- name: Upgrade all packages (Ubuntu/Debian)
apt:
upgrade: yes
autoremove: yes
autoclean: yes
when: ansible_os_family == "Debian"
register: apt_upgrade_result
# AWS Linux system updates (dnf)
- name: Update dnf cache (AWS Linux/RHEL)
dnf:
update_cache: yes
when: ansible_os_family == "RedHat"
- name: Upgrade all packages (AWS Linux/RHEL)
dnf:
name: "*"
state: latest
skip_broken: yes
when: ansible_os_family == "RedHat"
register: dnf_upgrade_result
# Display upgrade results
- name: Display apt upgrade results
debug:
msg: "APT system upgrade completed. {{ apt_upgrade_result.changed }} packages were updated."
when: ansible_os_family == "Debian" and apt_upgrade_result is defined
- name: Display dnf upgrade results
debug:
msg: "DNF system upgrade completed. {{ dnf_upgrade_result.changed }} packages were updated."
when: ansible_os_family == "RedHat" and dnf_upgrade_result is defined
# Check whether a reboot is required (Ubuntu/Debian)
- name: Check if reboot is required (Ubuntu/Debian)
stat:
path: /var/run/reboot-required
register: debian_reboot_required
when: ansible_os_family == "Debian"
# Check whether a reboot is required (AWS Linux/RHEL)
- name: Check if reboot is required (AWS Linux/RHEL)
command: needs-restarting -r
register: rhel_reboot_required
failed_when: false
changed_when: false
when: ansible_os_family == "RedHat"
# Notify about required reboots
- name: Notify if reboot is required (Ubuntu/Debian)
debug:
msg: "System reboot is required to complete the update."
when: ansible_os_family == "Debian" and debian_reboot_required.stat.exists is defined and debian_reboot_required.stat.exists
- name: Notify if reboot is required (AWS Linux/RHEL)
debug:
msg: "System reboot is required to complete the update."
when: ansible_os_family == "RedHat" and rhel_reboot_required.rc == 1

View File

@ -1,128 +0,0 @@
---
- name: Docker Container Management
hosts: all
become: yes
gather_facts: yes
tasks:
# Check whether Docker is installed
- name: Check if Docker is installed
command: which docker
register: docker_installed
failed_when: false
changed_when: false
- name: Skip Docker tasks if not installed
debug:
msg: "Docker not installed on {{ inventory_hostname }}, skipping Docker tasks"
when: docker_installed.rc != 0
# Docker system information
- name: Get Docker system info
shell: docker system df
register: docker_system_info
when: docker_installed.rc == 0
- name: Display Docker system usage
debug:
msg: "🐳 Docker System Usage: {{ docker_system_info.stdout_lines }}"
when: docker_installed.rc == 0
# Check running containers
- name: List running containers
shell: docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
register: running_containers
when: docker_installed.rc == 0
- name: Display running containers
debug:
msg: "📦 Running Containers: {{ running_containers.stdout_lines }}"
when: docker_installed.rc == 0
# Check stopped containers
- name: List stopped containers
shell: docker ps -a --filter "status=exited" --format "table {{.Names}}\t{{.Status}}"
register: stopped_containers
when: docker_installed.rc == 0
- name: Display stopped containers
debug:
msg: "⏹️ Stopped Containers: {{ stopped_containers.stdout_lines }}"
when: docker_installed.rc == 0 and stopped_containers.stdout_lines | length > 1
# Check Docker images
- name: List Docker images
shell: docker images --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}"
register: docker_images
when: docker_installed.rc == 0
- name: Display Docker images
debug:
msg: "🖼️ Docker Images: {{ docker_images.stdout_lines }}"
when: docker_installed.rc == 0
# Check for dangling images
- name: Check for dangling images
shell: docker images -f "dangling=true" -q
register: dangling_images
when: docker_installed.rc == 0
- name: Report dangling images
debug:
msg: "🗑️ Found {{ dangling_images.stdout_lines | length }} dangling images"
when: docker_installed.rc == 0
# Check Docker volumes
- name: List Docker volumes
shell: docker volume ls
register: docker_volumes
when: docker_installed.rc == 0
- name: Display Docker volumes
debug:
msg: "💾 Docker Volumes: {{ docker_volumes.stdout_lines }}"
when: docker_installed.rc == 0
# Check Docker networks
- name: List Docker networks
shell: docker network ls
register: docker_networks
when: docker_installed.rc == 0
- name: Display Docker networks
debug:
msg: "🌐 Docker Networks: {{ docker_networks.stdout_lines }}"
when: docker_installed.rc == 0
# Check container resource usage
- name: Check container resource usage
shell: docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}"
register: container_stats
when: docker_installed.rc == 0
- name: Display container resource usage
debug:
msg: "📊 Container Stats: {{ container_stats.stdout_lines }}"
when: docker_installed.rc == 0
# Check the Docker service status
- name: Check Docker service status
systemd:
name: docker
register: docker_service_status
when: docker_installed.rc == 0
- name: Display Docker service status
debug:
msg: "🔧 Docker Service: {{ docker_service_status.status.ActiveState }}"
when: docker_installed.rc == 0
# Cleanup suggestions
- name: Suggest cleanup if needed
debug:
msg: |
💡 Cleanup suggestions:
- Run 'docker system prune -f' to remove unused data
- Run 'docker image prune -f' to remove dangling images
- Run 'docker volume prune -f' to remove unused volumes
when: docker_installed.rc == 0 and (dangling_images.stdout_lines | length > 0 or stopped_containers.stdout_lines | length > 1)

View File

@ -1,97 +0,0 @@
---
- name: Docker Status Check for HCP Nodes
hosts: hcp
gather_facts: yes
become: yes
tasks:
- name: Check if Docker is installed
command: docker --version
register: docker_version
ignore_errors: yes
- name: Display Docker version
debug:
msg: "Docker version: {{ docker_version.stdout }}"
when: docker_version.rc == 0
- name: Check Docker service status
systemd:
name: docker
register: docker_service_status
- name: Display Docker service status
debug:
msg: "Docker service is {{ docker_service_status.status.ActiveState }}"
- name: Check Docker daemon info
command: docker info --format "{{ '{{' }}.ServerVersion{{ '}}' }}"
register: docker_info
ignore_errors: yes
- name: Display Docker daemon info
debug:
msg: "Docker daemon version: {{ docker_info.stdout }}"
when: docker_info.rc == 0
- name: Check Docker Swarm status
command: docker info --format "{{ '{{' }}.Swarm.LocalNodeState{{ '}}' }}"
register: swarm_status
ignore_errors: yes
- name: Display Swarm status
debug:
msg: "Swarm status: {{ swarm_status.stdout }}"
when: swarm_status.rc == 0
- name: Get Docker Swarm node info (if in swarm)
command: docker node ls
register: swarm_nodes
ignore_errors: yes
when: swarm_status.stdout == "active"
- name: Display Swarm nodes
debug:
msg: "{{ swarm_nodes.stdout_lines }}"
when: swarm_nodes is defined and swarm_nodes.rc == 0
- name: List running containers
command: docker ps --format "table {{ '{{' }}.Names{{ '}}' }}\t{{ '{{' }}.Status{{ '}}' }}\t{{ '{{' }}.Ports{{ '}}' }}"
register: running_containers
ignore_errors: yes
- name: Display running containers
debug:
msg: "{{ running_containers.stdout_lines }}"
when: running_containers.rc == 0
- name: Check Docker network list
command: docker network ls
register: docker_networks
ignore_errors: yes
- name: Display Docker networks
debug:
msg: "{{ docker_networks.stdout_lines }}"
when: docker_networks.rc == 0
- name: Get Docker system info
command: docker system df
register: docker_system_info
ignore_errors: yes
- name: Display Docker system usage
debug:
msg: "{{ docker_system_info.stdout_lines }}"
when: docker_system_info.rc == 0
- name: Check if node is Swarm manager
command: docker node inspect self --format "{{ '{{' }}.ManagerStatus.Leader{{ '}}' }}"
register: is_manager
ignore_errors: yes
when: swarm_status.stdout == "active"
- name: Display manager status
debug:
msg: "Is Swarm manager: {{ is_manager.stdout }}"
when: is_manager is defined and is_manager.rc == 0
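The `{{ '{{' }}` / `{{ '}}' }}` pairs in these tasks are Jinja2 escapes: without them Ansible would try to evaluate Docker's Go-template braces as its own variables. After templating, the commands reach the shell in plain Go-template form, for example:

```bash
# What the escaped tasks actually run on the target host:
docker info --format "{{.ServerVersion}}"
docker info --format "{{.Swarm.LocalNodeState}}"
docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
```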

View File

@ -1,210 +0,0 @@
---
- name: Simple Docker Swarm Analysis for ash3c
hosts: ash3c
become: yes
gather_facts: yes
tasks:
# Basic checks
- name: Check if Docker is installed
command: which docker
register: docker_installed
failed_when: false
changed_when: false
- name: Fail if Docker not installed
fail:
msg: "Docker is not installed on {{ inventory_hostname }}"
when: docker_installed.rc != 0
# Check the current Swarm status
- name: Check Docker Swarm status
shell: docker info | grep "Swarm:" -A 1
register: swarm_status
- name: Display current Swarm status
debug:
msg: "🔍 Current Swarm Status: {{ swarm_status.stdout_lines }}"
# Get running containers
- name: Get running containers
shell: docker ps --format "table {{ '{{' }}.Names{{ '}}' }}\t{{ '{{' }}.Image{{ '}}' }}\t{{ '{{' }}.Status{{ '}}' }}\t{{ '{{' }}.Ports{{ '}}' }}"
register: running_containers
- name: Display running containers
debug:
msg: "🏃 Running Containers: {{ running_containers.stdout_lines }}"
# Get all containers (including stopped ones)
- name: Get all containers
shell: docker ps -a --format "table {{ '{{' }}.Names{{ '}}' }}\t{{ '{{' }}.Image{{ '}}' }}\t{{ '{{' }}.Status{{ '}}' }}"
register: all_containers
- name: Display all containers
debug:
msg: "📦 All Containers: {{ all_containers.stdout_lines }}"
# Inspect each container in detail
- name: Get container names only
shell: docker ps -a --format "{{ '{{' }}.Names{{ '}}' }}"
register: container_names
- name: Inspect each container
shell: |
echo "=== Container: {{ item }} ==="
echo "Image: $(docker inspect {{ item }} --format '{{ '{{' }}.Config.Image{{ '}}' }}')"
echo "Status: $(docker inspect {{ item }} --format '{{ '{{' }}.State.Status{{ '}}' }}')"
echo "Restart Policy: $(docker inspect {{ item }} --format '{{ '{{' }}.HostConfig.RestartPolicy.Name{{ '}}' }}')"
echo "Network Mode: $(docker inspect {{ item }} --format '{{ '{{' }}.HostConfig.NetworkMode{{ '}}' }}')"
echo "Published Ports: $(docker port {{ item }} 2>/dev/null || echo 'None')"
echo "Volumes/Mounts:"
docker inspect {{ item }} --format '{{ '{{' }}range .Mounts{{ '}}' }} {{ '{{' }}.Source{{ '}}' }}:{{ '{{' }}.Destination{{ '}}' }} ({{ '{{' }}.Mode{{ '}}' }}){{ '{{' }}"\n"{{ '}}' }}{{ '{{' }}end{{ '}}' }}' || echo " None"
echo "Environment Variables:"
docker inspect {{ item }} --format '{{ '{{' }}range .Config.Env{{ '}}' }} {{ '{{' }}.{{ '}}' }}{{ '{{' }}"\n"{{ '}}' }}{{ '{{' }}end{{ '}}' }}' | head -10
echo "Labels:"
docker inspect {{ item }} --format '{{ '{{' }}range $key, $value := .Config.Labels{{ '}}' }} {{ '{{' }}$key{{ '}}' }}={{ '{{' }}$value{{ '}}' }}{{ '{{' }}"\n"{{ '}}' }}{{ '{{' }}end{{ '}}' }}' | head -5
echo "---"
register: container_inspect
loop: "{{ container_names.stdout_lines }}"
when: container_names.stdout_lines | length > 0
- name: Display container inspection results
debug:
msg: "{{ item.stdout }}"
loop: "{{ container_inspect.results }}"
when: container_inspect is defined
# Look for Docker Compose files
- name: Find docker-compose files
find:
paths:
- /root
- /home
- /opt
patterns:
- "docker-compose.yml"
- "docker-compose.yaml"
- "compose.yml"
- "compose.yaml"
recurse: yes
depth: 3
register: compose_files
- name: Display found compose files
debug:
msg: "📄 Found compose files: {{ item.path }}"
loop: "{{ compose_files.files }}"
when: compose_files.files | length > 0
# Analyze the network configuration
- name: Get Docker networks
shell: docker network ls
register: docker_networks
- name: Display Docker networks
debug:
msg: "🌐 Docker Networks: {{ docker_networks.stdout_lines }}"
# Check volume usage
- name: Get Docker volumes
shell: docker volume ls
register: docker_volumes
- name: Display Docker volumes
debug:
msg: "💾 Docker Volumes: {{ docker_volumes.stdout_lines }}"
# Check container resource usage
- name: Get container resource usage
shell: docker stats --no-stream
register: container_stats
when: container_names.stdout_lines | length > 0
- name: Display container stats
debug:
msg: "📊 Container Resource Usage: {{ container_stats.stdout_lines }}"
when: container_stats is defined
# Generate the Swarm suitability analysis
- name: Generate Swarm suitability analysis
debug:
msg: |
🔍 DOCKER SWARM MIGRATION ANALYSIS FOR {{ inventory_hostname }}
================================================================
📋 SUMMARY:
- Current Swarm Status: {{ 'Active' if 'active' in swarm_status.stdout else 'Inactive' }}
- Total Containers: {{ container_names.stdout_lines | length }}
- Running Containers: {{ (running_containers.stdout_lines | length) - 1 }}
- Compose Files Found: {{ compose_files.files | length }}
💡 GENERAL RECOMMENDATIONS:
✅ SUITABLE FOR SWARM (typically):
- Web applications (nginx, apache, etc.)
- API services
- Databases (with proper volume management)
- Monitoring tools (prometheus, grafana, etc.)
- Load balancers
❌ NOT SUITABLE FOR SWARM:
- Containers using Docker socket (/var/run/docker.sock)
- Containers with --privileged flag
- Containers requiring specific host access
- Development/testing containers
⚠️ NEEDS MODIFICATION:
- Containers using bind mounts (convert to volumes)
- Containers without restart policies
- Containers using host networking
🚀 NEXT STEPS:
1. Review each container's configuration above
2. Identify services that can benefit from scaling
3. Convert suitable containers to Docker services
4. Set up overlay networks
5. Configure secrets and configs management
📝 MIGRATION CHECKLIST:
□ Initialize Swarm (already done: {{ 'Yes' if 'active' in swarm_status.stdout else 'No' }})
□ Create overlay networks
□ Convert containers to services
□ Set up service discovery
□ Configure load balancing
□ Test service scaling
□ Set up monitoring
when: container_names is defined
# Save the analysis results
- name: Save analysis summary
copy:
content: |
Docker Swarm Analysis for {{ inventory_hostname }}
Generated: {{ ansible_date_time.iso8601 }}
Current Swarm Status: {{ swarm_status.stdout }}
Total Containers: {{ container_names.stdout_lines | length }}
Container List:
{{ container_names.stdout_lines | join('\n') }}
Networks:
{{ docker_networks.stdout }}
Volumes:
{{ docker_volumes.stdout }}
Compose Files Found:
{% for file in compose_files.files %}
- {{ file.path }}
{% endfor %}
dest: "/tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt"
- name: Analysis complete
debug:
msg: |
🎉 Analysis complete!
Results saved to: /tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt
Review the container details above to determine which services
are suitable for Swarm migration.

View File

@ -1,246 +0,0 @@
---
- name: Docker Swarm Migration Analysis for ash3c
hosts: ash3c
become: yes
gather_facts: yes
vars:
analysis_results: []
tasks:
# Basic checks
- name: Check if Docker is installed
command: which docker
register: docker_installed
failed_when: false
changed_when: false
- name: Fail if Docker not installed
fail:
msg: "Docker is not installed on {{ inventory_hostname }}"
when: docker_installed.rc != 0
# Check the current Swarm status
- name: Check Docker Swarm status
shell: docker info --format "{{ '{{' }}.Swarm.LocalNodeState{{ '}}' }}"
register: swarm_status
- name: Display current Swarm status
debug:
msg: "🔍 Current Swarm Status: {{ swarm_status.stdout }}"
# Get detailed information for all containers
- name: Get all containers (running and stopped)
shell: docker ps -a --format "{{ '{{' }}.Names{{ '}}' }}"
register: all_containers
- name: Get basic container information
shell: |
echo "=== Container: {{ item }} ==="
docker inspect {{ item }} | jq -r '
.[0] |
"Image: " + .Config.Image,
"Status: " + .State.Status,
"RestartPolicy: " + .HostConfig.RestartPolicy.Name,
"NetworkMode: " + .HostConfig.NetworkMode,
"Ports: " + (.NetworkSettings.Ports | keys | join(", ")),
"Volumes: " + ([.Mounts[]? | .Source + ":" + .Destination + ":" + .Mode] | join(" ")),
"Memory: " + (.HostConfig.Memory | tostring),
"CPUs: " + (.HostConfig.NanoCpus | tostring)
'
echo "---"
register: container_details
loop: "{{ all_containers.stdout_lines }}"
when: all_containers.stdout_lines | length > 0
- name: Display container details
debug:
msg: "{{ item.stdout }}"
loop: "{{ container_details.results }}"
when: container_details is defined
# Look for Docker Compose files
- name: Find docker-compose files
find:
paths:
- /root
- /home
- /opt
patterns:
- "docker-compose.yml"
- "docker-compose.yaml"
- "compose.yml"
- "compose.yaml"
recurse: yes
register: compose_files
- name: Display found compose files
debug:
msg: "📄 Found compose files: {{ item.path }}"
loop: "{{ compose_files.files }}"
when: compose_files.files | length > 0
# Analyze the network configuration
- name: Get Docker networks
shell: docker network ls --format "{{ '{{' }}.Name{{ '}}' }}\t{{ '{{' }}.Driver{{ '}}' }}\t{{ '{{' }}.Scope{{ '}}' }}"
register: docker_networks
- name: Display Docker networks
debug:
msg: "🌐 Docker Networks: {{ docker_networks.stdout_lines }}"
# Check volume usage
- name: Get Docker volumes
shell: docker volume ls --format "{{ '{{' }}.Name{{ '}}' }}\t{{ '{{' }}.Driver{{ '}}' }}"
register: docker_volumes
- name: Display Docker volumes
debug:
msg: "💾 Docker Volumes: {{ docker_volumes.stdout_lines }}"
# Check container resource usage
- name: Get container resource usage
shell: docker stats --no-stream --format "{{ '{{' }}.Name{{ '}}' }}\t{{ '{{' }}.CPUPerc{{ '}}' }}\t{{ '{{' }}.MemUsage{{ '}}' }}\t{{ '{{' }}.NetIO{{ '}}' }}\t{{ '{{' }}.BlockIO{{ '}}' }}"
register: container_stats
when: all_containers.stdout_lines | length > 0
- name: Display container stats
debug:
msg: "📊 Container Resource Usage: {{ container_stats.stdout_lines }}"
when: container_stats is defined
# Analyze service types and Swarm suitability
- name: Analyze containers for Swarm suitability
set_fact:
swarm_analysis: |
🔍 SWARM MIGRATION ANALYSIS FOR {{ inventory_hostname }}
================================================
Current Swarm Status: {{ swarm_status.stdout }}
Total Containers: {{ all_containers.stdout_lines | length }}
📋 CONTAINER ANALYSIS:
{% for container in container_details.results %}
Container: {{ container.item }}
{% set details = container.stdout.split('\n') %}
{% for line in details %}
{{ line }}
{% endfor %}
SWARM SUITABILITY ASSESSMENT:
{% if 'restart=always' in container.stdout or 'restart=unless-stopped' in container.stdout %}
✅ Good restart policy for Swarm
{% else %}
⚠️ Consider adding restart policy
{% endif %}
{% if 'NetworkMode: bridge' in container.stdout or 'NetworkMode: host' in container.stdout %}
⚠️ May need network configuration for Swarm
{% else %}
✅ Custom network - good for Swarm
{% endif %}
{% if '/var/run/docker.sock' in container.stdout %}
❌ Uses Docker socket - NOT suitable for Swarm
{% elif 'bind' in container.stdout %}
⚠️ Uses bind mounts - consider using volumes
{% else %}
✅ Good volume configuration
{% endif %}
{% endfor %}
💡 RECOMMENDATIONS:
SUITABLE FOR SWARM:
{% for container in container_details.results %}
{% if '/var/run/docker.sock' not in container.stdout %}
- {{ container.item }}: Ready for Swarm migration
{% endif %}
{% endfor %}
NEEDS MODIFICATION:
{% for container in container_details.results %}
{% if '/var/run/docker.sock' in container.stdout %}
- {{ container.item }}: Uses Docker socket - keep as standalone
{% elif 'bind' in container.stdout %}
- {{ container.item }}: Convert bind mounts to volumes
{% endif %}
{% endfor %}
NEXT STEPS:
1. Initialize Swarm: docker swarm init
2. Create overlay networks for services
3. Convert suitable containers to services
4. Set up service discovery and load balancing
5. Configure secrets and configs management
when: container_details is defined
- name: Display Swarm analysis
debug:
msg: "{{ swarm_analysis }}"
when: swarm_analysis is defined
# Generate migration script suggestions
- name: Generate migration script suggestions
set_fact:
migration_script: |
#!/bin/bash
# Docker Swarm Migration Script for {{ inventory_hostname }}
# Generated on {{ ansible_date_time.iso8601 }}
echo "🚀 Starting Docker Swarm migration..."
# Initialize Swarm (if not already done)
if [ "{{ swarm_status.stdout }}" != "active" ]; then
echo "Initializing Docker Swarm..."
docker swarm init
fi
# Create overlay networks
echo "Creating overlay networks..."
docker network create -d overlay --attachable app-network
# Example service creation (modify as needed)
{% for container in container_details.results if container_details is defined %}
{% if '/var/run/docker.sock' not in container.stdout %}
echo "Converting {{ container.item }} to Swarm service..."
# docker service create --name {{ container.item }}-svc \
# --network app-network \
# --replicas 1 \
# [ADD_YOUR_SPECIFIC_OPTIONS] \
# [IMAGE_NAME]
{% endif %}
{% endfor %}
echo "✅ Migration script template generated!"
echo "Please review and customize before running."
when: container_details is defined
- name: Display migration script
debug:
msg: "{{ migration_script }}"
when: migration_script is defined
# Save the analysis results to a file
- name: Save analysis results to file
copy:
content: |
{{ swarm_analysis }}
MIGRATION SCRIPT:
{{ migration_script }}
dest: "/tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt"
when: swarm_analysis is defined and migration_script is defined
- name: Analysis complete
debug:
msg: |
🎉 Analysis complete!
Results saved to: /tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt
Summary:
- Total containers analyzed: {{ all_containers.stdout_lines | length }}
- Compose files found: {{ compose_files.files | length }}
- Current Swarm status: {{ swarm_status.stdout }}

View File

@ -1,236 +0,0 @@
---
- name: Docker Swarm Check for ash3c
hosts: ash3c
become: yes
gather_facts: yes
tasks:
# Basic checks
- name: Check if Docker is installed
command: which docker
register: docker_installed
failed_when: false
changed_when: false
- name: Fail if Docker not installed
fail:
msg: "Docker is not installed on {{ inventory_hostname }}"
when: docker_installed.rc != 0
# Check the current Swarm status
- name: Check Docker Swarm status
shell: docker info | grep "Swarm:" -A 1
register: swarm_status
- name: Display current Swarm status
debug:
msg: "🔍 Current Swarm Status: {{ swarm_status.stdout_lines }}"
# Get running containers - simple format
- name: Get running containers
shell: docker ps
register: running_containers
- name: Display running containers
debug:
msg: "🏃 Running Containers:\n{{ running_containers.stdout }}"
# Get all containers (including stopped ones)
- name: Get all containers
shell: docker ps -a
register: all_containers
- name: Display all containers
debug:
msg: "📦 All Containers:\n{{ all_containers.stdout }}"
# Get the list of container names
- name: Get container names
shell: docker ps -a | awk 'NR>1 {print $NF}' | head -20
register: container_names
- name: Display container names
debug:
msg: "Container names: {{ container_names.stdout_lines }}"
# Get basic information for each container
- name: Get basic container info
shell: |
echo "=== Container: {{ item }} ==="
docker inspect {{ item }} | jq -r '.[0] | {
"Image": .Config.Image,
"Status": .State.Status,
"RestartPolicy": .HostConfig.RestartPolicy.Name,
"NetworkMode": .HostConfig.NetworkMode
}'
echo "Ports:"
docker port {{ item }} 2>/dev/null || echo "No published ports"
echo "Mounts:"
docker inspect {{ item }} | jq -r '.[0].Mounts[]? | " \(.Source):\(.Destination) (\(.Mode))"'
echo "---"
register: container_info
loop: "{{ container_names.stdout_lines[:10] }}" # 限制前10个容器
when: container_names.stdout_lines | length > 0
- name: Display container info
debug:
msg: "{{ item.stdout }}"
loop: "{{ container_info.results }}"
when: container_info is defined
# Look for Docker Compose files
- name: Find docker-compose files in common locations
find:
paths:
- /root
- /home
- /opt
- /var/lib/docker
patterns:
- "docker-compose.yml"
- "docker-compose.yaml"
- "compose.yml"
- "compose.yaml"
recurse: yes
depth: 3
register: compose_files
ignore_errors: yes
- name: Display found compose files
debug:
msg: "📄 Found compose files: {{ compose_files.files | map(attribute='path') | list }}"
when: compose_files.files | length > 0
# Analyze the network configuration
- name: Get Docker networks
shell: docker network ls
register: docker_networks
- name: Display Docker networks
debug:
msg: "🌐 Docker Networks:\n{{ docker_networks.stdout }}"
# Check volume usage
- name: Get Docker volumes
shell: docker volume ls
register: docker_volumes
- name: Display Docker volumes
debug:
msg: "💾 Docker Volumes:\n{{ docker_volumes.stdout }}"
# Check container resource usage
- name: Get container resource usage
shell: docker stats --no-stream
register: container_stats
when: container_names.stdout_lines | length > 0
- name: Display container stats
debug:
msg: "📊 Container Resource Usage:\n{{ container_stats.stdout }}"
when: container_stats is defined
# Check Docker images
- name: Get Docker images
shell: docker images
register: docker_images
- name: Display Docker images
debug:
msg: "🖼️ Docker Images:\n{{ docker_images.stdout }}"
# Generate the Swarm suitability analysis
- name: Generate Swarm suitability analysis
debug:
msg: |
🔍 DOCKER SWARM MIGRATION ANALYSIS FOR {{ inventory_hostname }}
================================================================
📋 SUMMARY:
- Current Swarm Status: {{ 'Active' if 'active' in swarm_status.stdout else 'Inactive' }}
- Total Containers: {{ container_names.stdout_lines | length }}
- Running Containers: {{ running_containers.stdout_lines | length - 1 }}
- Compose Files Found: {{ compose_files.files | length if compose_files.files is defined else 0 }}
💡 SWARM MIGRATION RECOMMENDATIONS:
✅ TYPICALLY SUITABLE FOR SWARM:
- Web servers (nginx, apache, caddy)
- API services and microservices
- Application servers
- Load balancers (traefik, haproxy)
- Monitoring tools (prometheus, grafana)
- Databases (with proper volume strategy)
❌ NOT SUITABLE FOR SWARM:
- Containers using Docker socket (/var/run/docker.sock)
- Containers with --privileged flag
- Development/testing containers
- Containers requiring specific host hardware access
⚠️ NEEDS MODIFICATION FOR SWARM:
- Containers using bind mounts → convert to volumes
- Containers without restart policies → add restart policies
- Containers using host networking → use overlay networks
- Containers with hardcoded IPs → use service discovery
🚀 MIGRATION STEPS:
1. ✅ Swarm is already initialized
2. Create overlay networks for service communication
3. Convert suitable containers to Docker services
4. Set up service discovery and load balancing
5. Configure secrets and configs management
6. Test service scaling and failover
📝 NEXT ACTIONS:
- Review each container above for Swarm suitability
- Identify services that would benefit from scaling
- Plan network topology for services
- Prepare volume migration strategy
when: container_names is defined
# Save the analysis results
- name: Save analysis summary to file
copy:
content: |
Docker Swarm Analysis for {{ inventory_hostname }}
Generated: {{ ansible_date_time.iso8601 }}
SWARM STATUS:
{{ swarm_status.stdout }}
CONTAINERS ({{ container_names.stdout_lines | length }} total):
{{ container_names.stdout_lines | join('\n') }}
NETWORKS:
{{ docker_networks.stdout }}
VOLUMES:
{{ docker_volumes.stdout }}
IMAGES:
{{ docker_images.stdout }}
{% if compose_files.files is defined and compose_files.files | length > 0 %}
COMPOSE FILES FOUND:
{% for file in compose_files.files %}
- {{ file.path }}
{% endfor %}
{% endif %}
dest: "/tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt"
- name: Analysis complete
debug:
msg: |
🎉 ANALYSIS COMPLETE!
📄 Results saved to: /tmp/swarm-analysis-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.txt
🔍 Review the container details above to identify:
- Which services are suitable for Swarm
- Which containers need modification
- Migration priority and strategy
💡 TIP: Focus on stateless services first for easier migration!

View File

@ -1,95 +0,0 @@
---
- name: Gitea Runner Management
hosts: hcp
become: yes
vars:
gitea_runner_user: "gitea-runner"
gitea_runner_data_dir: "/var/lib/gitea-runner"
gitea_runner_log_dir: "/var/log/gitea-runner"
tasks:
- name: Check gitea-runner service status
systemd:
name: gitea-runner
register: service_status
- name: Display service status
debug:
msg: |
Service: {{ service_status.status.ActiveState }}
Enabled: {{ service_status.status.UnitFileState }}
Main PID: {{ service_status.status.MainPID | default('N/A') }}
- name: Show recent logs
command: journalctl -u gitea-runner --no-pager -n 20
register: recent_logs
changed_when: false
- name: Display recent logs
debug:
var: recent_logs.stdout_lines
- name: Check runner registration
stat:
path: "{{ gitea_runner_data_dir }}/.runner"
register: runner_registered
- name: Display registration status
debug:
msg: "Runner registered: {{ runner_registered.stat.exists }}"
- name: Show runner configuration (if registered)
command: cat {{ gitea_runner_data_dir }}/.runner
register: runner_config
become_user: "{{ gitea_runner_user }}"
when: runner_registered.stat.exists
changed_when: false
- name: Display runner configuration
debug:
var: runner_config.stdout_lines
when: runner_registered.stat.exists
- name: Check Docker access for runner user
command: docker ps
become_user: "{{ gitea_runner_user }}"
register: docker_access
changed_when: false
failed_when: false
- name: Display Docker access status
debug:
msg: |
Docker access: {{ 'OK' if docker_access.rc == 0 else 'FAILED' }}
{% if docker_access.rc != 0 %}
Error: {{ docker_access.stderr }}
{% endif %}
# Separate play for service management tasks
- name: Service Management Tasks
hosts: hcp
become: yes
tasks:
- name: Start gitea-runner service
systemd:
name: gitea-runner
state: started
when: ansible_run_tags is defined and 'start' in ansible_run_tags
- name: Stop gitea-runner service
systemd:
name: gitea-runner
state: stopped
when: ansible_run_tags is defined and 'stop' in ansible_run_tags
- name: Restart gitea-runner service
systemd:
name: gitea-runner
state: restarted
when: ansible_run_tags is defined and 'restart' in ansible_run_tags
- name: Reload gitea-runner service
systemd:
name: gitea-runner
state: reloaded
when: ansible_run_tags is defined and 'reload' in ansible_run_tags
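The second play gates its service actions on `ansible_run_tags`, i.e. the tags passed with `--tags`; note that the tasks themselves carry no `tags:` attribute, so tag filtering alone would normally skip them. The intended invocation looks roughly like this (inventory and playbook paths are illustrative):

```bash
# Intended tag-driven control of the runner service (paths are illustrative).
ansible-playbook -i inventories/production/inventory.ini gitea-runner-manage.yml --tags restart

# Equivalent ad-hoc control with the systemd module:
ansible -i inventories/production/inventory.ini hcp -b \
  -m ansible.builtin.systemd -a "name=gitea-runner state=restarted"
```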

View File

@ -1,157 +0,0 @@
---
- name: Setup Gitea Runner on HCP nodes
hosts: hcp
become: yes
vars:
gitea_runner_token: "vOrrQda6Qiet9YOj4waZVU5QgLig2J3rKp2RfoN7"
gitea_server_url: "http://gitea:3000"
gitea_runner_user: "gitea-runner"
gitea_runner_home: "/home/{{ gitea_runner_user }}"
gitea_runner_config_dir: "/etc/gitea-runner"
gitea_runner_data_dir: "/var/lib/gitea-runner"
gitea_runner_log_dir: "/var/log/gitea-runner"
gitea_runner_binary: "/usr/bin/act_runner"
tasks:
- name: Check if gitea-runner binary exists
stat:
path: "{{ gitea_runner_binary }}"
register: runner_binary
- name: Fail if act_runner binary not found
fail:
msg: "Act runner binary not found at {{ gitea_runner_binary }}. Please install it first."
when: not runner_binary.stat.exists
- name: Create gitea-runner user
user:
name: "{{ gitea_runner_user }}"
system: yes
shell: /bin/bash
home: "{{ gitea_runner_home }}"
create_home: yes
comment: "Gitea Runner Service User"
- name: Create gitea-runner directories
file:
path: "{{ item }}"
state: directory
owner: "{{ gitea_runner_user }}"
group: "{{ gitea_runner_user }}"
mode: '0755'
loop:
- "{{ gitea_runner_config_dir }}"
- "{{ gitea_runner_data_dir }}"
- "{{ gitea_runner_log_dir }}"
- name: Create gitea-runner configuration file
template:
src: gitea-runner-config.yml.j2
dest: "{{ gitea_runner_config_dir }}/config.yml"
owner: "{{ gitea_runner_user }}"
group: "{{ gitea_runner_user }}"
mode: '0600'
notify: restart gitea-runner
- name: Create gitea-runner systemd service file
template:
src: gitea-runner.service.j2
dest: /etc/systemd/system/gitea-runner.service
owner: root
group: root
mode: '0644'
notify:
- reload systemd
- restart gitea-runner
- name: Create gitea-runner environment file
template:
src: gitea-runner.env.j2
dest: /etc/default/gitea-runner
owner: root
group: root
mode: '0600'
notify: restart gitea-runner
- name: Create runner registration script
template:
src: register-runner.sh.j2
dest: "{{ gitea_runner_home }}/register-runner.sh"
owner: "{{ gitea_runner_user }}"
group: "{{ gitea_runner_user }}"
mode: '0755'
- name: Check if runner is already registered
stat:
path: "{{ gitea_runner_data_dir }}/.runner"
register: runner_registered
- name: Register gitea runner
command: "{{ gitea_runner_home }}/register-runner.sh"
become_user: "{{ gitea_runner_user }}"
when: not runner_registered.stat.exists
register: registration_result
- name: Display registration result
debug:
var: registration_result.stdout_lines
when: registration_result is defined and registration_result.stdout_lines is defined
- name: Create runner startup script
template:
src: start-runner.sh.j2
dest: "{{ gitea_runner_home }}/start-runner.sh"
owner: "{{ gitea_runner_user }}"
group: "{{ gitea_runner_user }}"
mode: '0755'
- name: Create logrotate configuration for gitea-runner
template:
src: gitea-runner.logrotate.j2
dest: /etc/logrotate.d/gitea-runner
owner: root
group: root
mode: '0644'
- name: Install Docker (required for runner)
package:
name: docker.io
state: present
- name: Add gitea-runner user to docker group
user:
name: "{{ gitea_runner_user }}"
groups: docker
append: yes
- name: Start and enable Docker service
systemd:
name: docker
state: started
enabled: yes
- name: Start and enable gitea-runner service
systemd:
name: gitea-runner
state: started
enabled: yes
daemon_reload: yes
- name: Check gitea-runner service status
systemd:
name: gitea-runner
register: service_status
- name: Display service status
debug:
msg: "Gitea Runner service is {{ service_status.status.ActiveState }}"
handlers:
- name: reload systemd
systemd:
daemon_reload: yes
- name: restart gitea-runner
systemd:
name: gitea-runner
state: restarted
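A quick post-install verification sketch for the service configured above:

```bash
#!/usr/bin/env bash
set -euo pipefail

# The unit should be active and enabled.
systemctl is-active gitea-runner
systemctl is-enabled gitea-runner

# Recent runner logs.
journalctl -u gitea-runner --no-pager -n 20

# The gitea-runner user needs Docker access to launch job containers.
sudo -u gitea-runner docker ps
```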

View File

@ -1,194 +0,0 @@
---
- name: Docker Swarm Migration Plan for ash3c
hosts: ash3c
become: yes
gather_facts: yes
vars:
# Define the service migration plan
swarm_services:
high_priority:
- name: ghproxy
image: wjqserver/ghproxy:latest
ports: "8046:8080"
replicas: 2
networks: ["app-network"]
- name: redis
image: redis:latest
ports: "63789:6379"
replicas: 1
networks: ["app-network"]
volumes: ["redis-data:/data"]
medium_priority:
- name: consul
image: bitnami/consul:latest
ports:
- "8310:8300"
- "8311:8301"
- "8312:8302"
- "8501:8500"
- "8601:8600/udp"
replicas: 1
networks: ["consul-network"]
- name: discourse-app
image: bitnami/discourse:3.4.1
ports: "31080:3000"
replicas: 1
networks: ["app-network"]
depends_on: ["postgres", "redis"]
- name: discourse-sidekiq
image: bitnami/discourse:3.4.1
replicas: 1
networks: ["app-network"]
depends_on: ["postgres", "redis"]
low_priority:
- name: elasticsearch
image: bitnami/elasticsearch:8.17.2
ports: "59200:9200"
replicas: 1
networks: ["elastic-network"]
volumes: ["elastic-data:/bitnami/elasticsearch/data"]
constraints: ["node.role==manager"]
- name: postgres
image: postgres:17.2
ports: "54322:5432"
replicas: 1
networks: ["db-network"]
volumes: ["postgres-data:/var/lib/postgresql/data"]
constraints: ["node.role==manager"]
secrets: ["postgres_password"]
tasks:
- name: Display migration plan
debug:
msg: |
🚀 DOCKER SWARM MIGRATION PLAN FOR {{ inventory_hostname }}
=========================================================
📋 PHASE 1 - HIGH PRIORITY (Low Risk)
{% for service in swarm_services.high_priority %}
✅ {{ service.name }}:
- Image: {{ service.image }}
- Replicas: {{ service.replicas }}
- Networks: {{ service.networks | join(', ') }}
- Migration: Safe, stateless service
{% endfor %}
📋 PHASE 2 - MEDIUM PRIORITY (Medium Risk)
{% for service in swarm_services.medium_priority %}
⚠️ {{ service.name }}:
- Image: {{ service.image }}
- Replicas: {{ service.replicas }}
- Networks: {{ service.networks | join(', ') }}
- Migration: Requires coordination
{% endfor %}
📋 PHASE 3 - LOW PRIORITY (High Risk)
{% for service in swarm_services.low_priority %}
🔴 {{ service.name }}:
- Image: {{ service.image }}
- Replicas: {{ service.replicas }}
- Networks: {{ service.networks | join(', ') }}
- Migration: Requires careful planning
{% endfor %}
- name: Create migration script
copy:
content: |
#!/bin/bash
# Docker Swarm Migration Script for {{ inventory_hostname }}
# Generated: {{ ansible_date_time.iso8601 }}
set -e
echo "🚀 Starting Docker Swarm Migration..."
# Create networks
echo "📡 Creating overlay networks..."
docker network create -d overlay --attachable app-network || true
docker network create -d overlay --attachable db-network || true
docker network create -d overlay --attachable consul-network || true
docker network create -d overlay --attachable elastic-network || true
# Create volumes
echo "💾 Creating volumes..."
docker volume create redis-data || true
docker volume create postgres-data || true
docker volume create elastic-data || true
# Create secrets (example)
echo "🔐 Creating secrets..."
echo "your_postgres_password" | docker secret create postgres_password - || true
echo "✅ Infrastructure setup complete!"
echo ""
echo "🔄 PHASE 1 - Migrate high priority services:"
echo "docker service create --name ghproxy-svc --replicas 2 --network app-network -p 8046:8080 wjqserver/ghproxy:latest"
echo "docker service create --name redis-svc --replicas 1 --network app-network -p 63789:6379 --mount type=volume,source=redis-data,target=/data redis:latest"
echo ""
echo "🔄 PHASE 2 - Migrate medium priority services:"
echo "docker service create --name consul-svc --replicas 1 --network consul-network -p 8310:8300 -p 8311:8301 -p 8312:8302 -p 8501:8500 -p 8601:8600/udp bitnami/consul:latest"
echo "docker service create --name discourse-app-svc --replicas 1 --network app-network -p 31080:3000 bitnami/discourse:3.4.1"
echo "docker service create --name discourse-sidekiq-svc --replicas 1 --network app-network bitnami/discourse:3.4.1"
echo ""
echo "🔄 PHASE 3 - Migrate low priority services (CAREFUL!):"
echo "docker service create --name postgres-svc --replicas 1 --network db-network -p 54322:5432 --mount type=volume,source=postgres-data,target=/var/lib/postgresql/data --secret postgres_password --constraint 'node.role==manager' postgres:17.2"
echo "docker service create --name elasticsearch-svc --replicas 1 --network elastic-network -p 59200:9200 --mount type=volume,source=elastic-data,target=/bitnami/elasticsearch/data --constraint 'node.role==manager' bitnami/elasticsearch:8.17.2"
echo ""
echo "📊 Monitor services:"
echo "docker service ls"
echo "docker service ps <service-name>"
echo ""
echo "⚠️ IMPORTANT NOTES:"
echo "1. Stop original containers before creating services"
echo "2. Backup data before migrating databases"
echo "3. Test each phase before proceeding"
echo "4. Monitor logs: docker service logs <service-name>"
dest: "/tmp/swarm-migration-{{ inventory_hostname }}.sh"
mode: '0755'
- name: Create rollback script
copy:
content: |
#!/bin/bash
# Docker Swarm Rollback Script for {{ inventory_hostname }}
echo "🔄 Rolling back Swarm services..."
# Remove services
docker service rm ghproxy-svc redis-svc consul-svc discourse-app-svc discourse-sidekiq-svc postgres-svc elasticsearch-svc 2>/dev/null || true
# Remove networks (optional)
# docker network rm app-network db-network consul-network elastic-network 2>/dev/null || true
echo "✅ Rollback complete. Original containers should be restarted manually."
dest: "/tmp/swarm-rollback-{{ inventory_hostname }}.sh"
mode: '0755'
- name: Migration plan complete
debug:
msg: |
🎉 MIGRATION PLAN GENERATED!
📄 Files created:
- /tmp/swarm-migration-{{ inventory_hostname }}.sh (Migration script)
- /tmp/swarm-rollback-{{ inventory_hostname }}.sh (Rollback script)
🚀 RECOMMENDED APPROACH:
1. Backup all data first
2. Test migration in phases
3. Start with Phase 1 (low risk services)
4. Monitor each service before proceeding
5. Keep rollback script ready
💡 NEXT STEPS:
1. Review and customize the migration script
2. Plan maintenance window
3. Execute phase by phase
4. Monitor and validate each service

View File

@ -1,50 +0,0 @@
# Gitea Runner Configuration
log:
level: info
file: {{ gitea_runner_log_dir }}/runner.log
runner:
# Runner name (will be auto-generated if not specified)
name: "{{ inventory_hostname }}-runner"
# Runner capacity (number of concurrent jobs)
capacity: 2
# Runner timeout
timeout: 3600
# Runner labels (for job targeting)
labels:
- "ubuntu-latest:docker://ubuntu:22.04"
- "ubuntu-20.04:docker://ubuntu:20.04"
- "ubuntu-18.04:docker://ubuntu:18.04"
- "node:docker://node:18"
- "python:docker://python:3.11"
- "ansible:docker://quay.io/ansible/ansible-runner:latest"
- "opentofu:docker://opentofu/opentofu:latest"
cache:
enabled: true
dir: {{ gitea_runner_data_dir }}/cache
host: ""
port: 0
container:
# Docker network for runner containers
network: "gitea-runner"
# Enable privileged containers (needed for Docker-in-Docker)
privileged: false
# Container options
options: "--rm --pull=always"
# Valid platforms
valid_volumes:
- "/tmp"
- "{{ gitea_runner_data_dir }}"
docker_host: "unix:///var/run/docker.sock"
host:
workdir_parent: {{ gitea_runner_data_dir }}/work

View File

@ -1,18 +0,0 @@
# Gitea Runner Environment Variables
# Gitea server configuration
GITEA_INSTANCE_URL={{ gitea_server_url }}
GITEA_RUNNER_REGISTRATION_TOKEN={{ gitea_runner_token }}
# Runner configuration
GITEA_RUNNER_NAME={{ inventory_hostname }}-runner
GITEA_RUNNER_LABELS=ubuntu-latest,ubuntu-20.04,ubuntu-18.04,node,python,ansible,opentofu
# Docker configuration
DOCKER_HOST=unix:///var/run/docker.sock
# Logging
GITEA_RUNNER_LOG_LEVEL=info
# Security
GITEA_RUNNER_SECURITY_PRIVILEGED=false

View File

@ -1,12 +0,0 @@
{{ gitea_runner_log_dir }}/*.log {
daily
missingok
rotate 30
compress
delaycompress
notifempty
create 644 {{ gitea_runner_user }} {{ gitea_runner_user }}
postrotate
systemctl reload gitea-runner || true
endscript
}

View File

@ -1,39 +0,0 @@
[Unit]
Description=Gitea Actions Runner
Documentation=https://docs.gitea.io/en-us/actions/
After=network.target docker.service
Wants=docker.service
[Service]
Type=simple
User={{ gitea_runner_user }}
Group={{ gitea_runner_user }}
WorkingDirectory={{ gitea_runner_data_dir }}
ExecStart={{ gitea_runner_binary }} daemon --config {{ gitea_runner_config_dir }}/config.yml
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT
TimeoutStopSec=5
Restart=always
RestartSec=10
StartLimitInterval=0
# Security settings
NoNewPrivileges=yes
PrivateTmp=yes
ProtectSystem=strict
ProtectHome=yes
ReadWritePaths={{ gitea_runner_data_dir }} {{ gitea_runner_log_dir }} /var/run/docker.sock
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
# Environment
EnvironmentFile=-/etc/default/gitea-runner
# Logging
StandardOutput=append:{{ gitea_runner_log_dir }}/gitea-runner.log
StandardError=append:{{ gitea_runner_log_dir }}/gitea-runner-error.log
[Install]
WantedBy=multi-user.target
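Because the unit runs with `ProtectSystem=strict`, everything the runner writes (data directory, log directory, the Docker socket) must be listed in `ReadWritePaths`, as above. A short sketch for loading and sanity-checking the unit:

```bash
#!/usr/bin/env bash
set -euo pipefail

sudo systemctl daemon-reload            # pick up the new unit file
sudo systemctl enable --now gitea-runner

# Review the effective sandboxing of the hardened unit.
systemd-analyze security gitea-runner
```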

View File

@ -1,46 +0,0 @@
#!/bin/bash
# Gitea Runner Registration Script
set -e
echo "🚀 注册 Gitea Runner..."
# 配置变量
GITEA_URL="{{ gitea_server_url }}"
REGISTRATION_TOKEN="{{ gitea_runner_token }}"
RUNNER_NAME="{{ inventory_hostname }}-runner"
RUNNER_LABELS="ubuntu-latest,ubuntu-20.04,ubuntu-18.04,node,python,ansible,opentofu"
# 切换到数据目录
cd {{ gitea_runner_data_dir }}
# 检查是否已经注册
if [ -f ".runner" ]; then
echo "✅ Runner 已经注册"
exit 0
fi
echo "📝 注册 Runner: $RUNNER_NAME"
echo "🔗 Gitea URL: $GITEA_URL"
echo "🏷️ Labels: $RUNNER_LABELS"
# 注册 Runner
{{ gitea_runner_binary }} register \
--instance "$GITEA_URL" \
--token "$REGISTRATION_TOKEN" \
--name "$RUNNER_NAME" \
--labels "$RUNNER_LABELS"
if [ $? -eq 0 ]; then
echo "✅ Runner 注册成功!"
# 设置文件权限
chown {{ gitea_runner_user }}:{{ gitea_runner_user }} .runner .credentials
chmod 600 .runner .credentials
echo "📋 Runner 信息:"
cat .runner
else
echo "❌ Runner 注册失败"
exit 1
fi

View File

@ -1,20 +0,0 @@
#!/bin/bash
# Gitea Runner Startup Script
set -e
echo "🚀 启动 Gitea Runner..."
# 切换到数据目录
cd {{ gitea_runner_data_dir }}
# 检查注册状态
if [ ! -f ".runner" ]; then
echo "❌ Runner 未注册,请先运行注册脚本"
exit 1
fi
echo "✅ Runner 已注册,启动守护进程..."
# 启动 Runner
exec {{ gitea_runner_binary }} daemon --config {{ gitea_runner_config_dir }}/config.yml

configuration/proxy.env (new file, +30 lines)
View File

@ -0,0 +1,30 @@
# Proxy Configuration for istoreos.tailnet-68f9.ts.net:1082
# This file contains proxy environment variables for the management system
# HTTP/HTTPS Proxy Settings
export http_proxy=http://istoreos.tailnet-68f9.ts.net:1082
export https_proxy=http://istoreos.tailnet-68f9.ts.net:1082
export HTTP_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
export HTTPS_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
# No Proxy Settings (local networks and services)
export no_proxy=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net
export NO_PROXY=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net
# Additional proxy settings for various tools
export ALL_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
export all_proxy=http://istoreos.tailnet-68f9.ts.net:1082
# Docker proxy settings
export DOCKER_BUILDKIT=1
export BUILDKIT_PROGRESS=plain
# Git proxy settings
export GIT_HTTP_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
export GIT_HTTPS_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
# Curl proxy settings
export CURL_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
# Wget proxy settings
export WGET_PROXY=http://istoreos.tailnet-68f9.ts.net:1082
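A minimal sketch for loading this file into a shell session and confirming traffic really goes through the proxy (httpbin.org is just a convenient IP-echo service). Note that curl, wget, and git read the standard `http_proxy`/`https_proxy`/`no_proxy` variables; the `CURL_PROXY`, `WGET_PROXY`, and `GIT_HTTP_PROXY` exports are extra conveniences rather than names those tools read natively:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Load the proxy variables into the current shell.
source configuration/proxy.env

# Compare the proxied and direct egress IPs.
curl -sS https://httpbin.org/ip                 # through the proxy
curl -sS --noproxy '*' https://httpbin.org/ip   # direct, for comparison

# Hosts listed in no_proxy bypass the proxy (tailnet and local traffic).
echo "$no_proxy"
```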

View File

@ -1,50 +0,0 @@
# Gitea Runner Configuration
log:
level: info
file: {{ gitea_runner_log_dir }}/runner.log
runner:
# Runner name (will be auto-generated if not specified)
name: "{{ inventory_hostname }}-runner"
# Runner capacity (number of concurrent jobs)
capacity: 2
# Runner timeout
timeout: 3600
# Runner labels (for job targeting)
labels:
- "ubuntu-latest:docker://ubuntu:22.04"
- "ubuntu-20.04:docker://ubuntu:20.04"
- "ubuntu-18.04:docker://ubuntu:18.04"
- "node:docker://node:18"
- "python:docker://python:3.11"
- "ansible:docker://quay.io/ansible/ansible-runner:latest"
- "opentofu:docker://opentofu/opentofu:latest"
cache:
enabled: true
dir: {{ gitea_runner_data_dir }}/cache
host: ""
port: 0
container:
# Docker network for runner containers
network: "gitea-runner"
# Enable privileged containers (needed for Docker-in-Docker)
privileged: false
# Container options
options: "--rm --pull=always"
# Valid platforms
valid_volumes:
- "/tmp"
- "{{ gitea_runner_data_dir }}"
docker_host: "unix:///var/run/docker.sock"
host:
workdir_parent: {{ gitea_runner_data_dir }}/work

View File

@ -1,18 +0,0 @@
# Gitea Runner Environment Variables
# Gitea server configuration
GITEA_INSTANCE_URL={{ gitea_server_url }}
GITEA_RUNNER_REGISTRATION_TOKEN={{ gitea_runner_token }}
# Runner configuration
GITEA_RUNNER_NAME={{ inventory_hostname }}-runner
GITEA_RUNNER_LABELS=ubuntu-latest,ubuntu-20.04,ubuntu-18.04,node,python,ansible,opentofu
# Docker configuration
DOCKER_HOST=unix:///var/run/docker.sock
# Logging
GITEA_RUNNER_LOG_LEVEL=info
# Security
GITEA_RUNNER_SECURITY_PRIVILEGED=false

View File

@ -1,12 +0,0 @@
{{ gitea_runner_log_dir }}/*.log {
daily
missingok
rotate 30
compress
delaycompress
notifempty
create 644 {{ gitea_runner_user }} {{ gitea_runner_user }}
postrotate
systemctl reload gitea-runner || true
endscript
}

View File

@ -1,39 +0,0 @@
[Unit]
Description=Gitea Actions Runner
Documentation=https://docs.gitea.io/en-us/actions/
After=network.target docker.service
Wants=docker.service
[Service]
Type=simple
User={{ gitea_runner_user }}
Group={{ gitea_runner_user }}
WorkingDirectory={{ gitea_runner_data_dir }}
ExecStart={{ gitea_runner_binary }} daemon --config {{ gitea_runner_config_dir }}/config.yml
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT
TimeoutStopSec=5
Restart=always
RestartSec=10
StartLimitInterval=0
# Security settings
NoNewPrivileges=yes
PrivateTmp=yes
ProtectSystem=strict
ProtectHome=yes
ReadWritePaths={{ gitea_runner_data_dir }} {{ gitea_runner_log_dir }} /var/run/docker.sock
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
# Environment
EnvironmentFile=-/etc/default/gitea-runner
# Logging
StandardOutput=append:{{ gitea_runner_log_dir }}/gitea-runner.log
StandardError=append:{{ gitea_runner_log_dir }}/gitea-runner-error.log
[Install]
WantedBy=multi-user.target

View File

@ -1,46 +0,0 @@
#!/bin/bash
# Gitea Runner Registration Script
set -e
echo "🚀 注册 Gitea Runner..."
# 配置变量
GITEA_URL="{{ gitea_server_url }}"
REGISTRATION_TOKEN="{{ gitea_runner_token }}"
RUNNER_NAME="{{ inventory_hostname }}-runner"
RUNNER_LABELS="ubuntu-latest,ubuntu-20.04,ubuntu-18.04,node,python,ansible,opentofu"
# 切换到数据目录
cd {{ gitea_runner_data_dir }}
# 检查是否已经注册
if [ -f ".runner" ]; then
echo "✅ Runner 已经注册"
exit 0
fi
echo "📝 注册 Runner: $RUNNER_NAME"
echo "🔗 Gitea URL: $GITEA_URL"
echo "🏷️ Labels: $RUNNER_LABELS"
# 注册 Runner
{{ gitea_runner_binary }} register \
--instance "$GITEA_URL" \
--token "$REGISTRATION_TOKEN" \
--name "$RUNNER_NAME" \
--labels "$RUNNER_LABELS"
if [ $? -eq 0 ]; then
echo "✅ Runner 注册成功!"
# 设置文件权限
chown {{ gitea_runner_user }}:{{ gitea_runner_user }} .runner .credentials
chmod 600 .runner .credentials
echo "📋 Runner 信息:"
cat .runner
else
echo "❌ Runner 注册失败"
exit 1
fi

View File

@ -1,20 +0,0 @@
#!/bin/bash
# Gitea Runner Startup Script
set -e
echo "🚀 启动 Gitea Runner..."
# 切换到数据目录
cd {{ gitea_runner_data_dir }}
# 检查注册状态
if [ ! -f ".runner" ]; then
echo "❌ Runner 未注册,请先运行注册脚本"
exit 1
fi
echo "✅ Runner 已注册,启动守护进程..."
# 启动 Runner
exec {{ gitea_runner_binary }} daemon --config {{ gitea_runner_config_dir }}/config.yml

View File

@ -1,202 +0,0 @@
# ZSH Configuration Sync
This directory contains a complete oh-my-zsh configuration that can be synced across multiple VPSes.
## File Structure
```
configuration/zsh/
├── README.md # This file
├── install-zsh-config.sh # Full installation script
├── quick-install.sh # Quick installation script
├── zshrc.template # ZSH configuration template
└── oh-my-zsh-custom/ # Custom oh-my-zsh configuration
├── aliases.zsh # Custom aliases
└── plugins/ # Custom plugins
```
## Usage
### Method 1: Smart Install (recommended)
Asks whether to use a proxy, and lets you choose whether to keep it after installation:
```bash
# Smart install (asks about proxy usage; keep or drop the proxy afterwards)
curl -fsSL https://ben:8d7d70f324796be650b79415303c31f567bf459b@gitea.tailnet-68f9.ts.net/ben/mgmt/raw/branch/main/configuration/zsh/smart-install.sh | bash
```
**Highlights:**
- Asks whether to use a proxy before installing
- Tests the proxy connection to make sure it works
- Asks whether to keep the proxy after installation
- The user stays in full control of proxy usage
### Method 2: Quick Install
Run on a new VPS:
```bash
# One-shot install
curl -fsSL https://ben:8d7d70f324796be650b79415303c31f567bf459b@gitea.tailnet-68f9.ts.net/ben/mgmt/raw/branch/main/configuration/zsh/quick-install.sh | bash
```
### Method 3: Manual Install
1. Clone the repository:
```bash
git clone https://ben:8d7d70f324796be650b79415303c31f567bf459b@gitea.tailnet-68f9.ts.net/ben/mgmt.git /root/mgmt
```
2. Run the installation script:
```bash
cd /root/mgmt
chmod +x configuration/zsh/install-zsh-config.sh
./configuration/zsh/install-zsh-config.sh
```
## Configuration Sync
After installation, sync the latest configuration with:
```bash
# Sync configuration
sync-zsh-config
```
This command will:
1. Pull the latest configuration from Gitea
2. Back up the current configuration
3. Deploy the new configuration
## Proxy Management
If your network requires a proxy, manage it with the following commands (a rough sketch of how such helpers can be implemented follows below):
```bash
# Proxy management commands
proxy-on # Enable the proxy temporarily
proxy-off # Disable the proxy temporarily
proxy-toggle # Toggle the proxy state
proxy-enable # Enable the proxy permanently
proxy-disable # Disable the proxy permanently
proxy-status # Show proxy status
proxy-test # Test the proxy connection
```
### Proxy Usage Scenarios
- **Temporary use**: `proxy-on` → work through the proxy → `proxy-off`
- **Permanently on**: `proxy-enable` → survives reboots
- **Quick switch**: `proxy-toggle` → flip the state with one command
- **Status check**: `proxy-status` → show current state and IP
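The actual implementations ship with the synced zsh configuration; as a rough, illustrative sketch (function bodies are assumptions, not copied from the repo), they amount to exporting or unsetting the proxy variables:
```bash
# Illustrative only -- the real helpers are defined in the synced zsh config.
proxy-on() {
  export http_proxy="http://istoreos.tailnet-68f9.ts.net:1082"
  export https_proxy="$http_proxy" HTTP_PROXY="$http_proxy" HTTPS_PROXY="$http_proxy"
  echo "proxy enabled"
}

proxy-off() {
  unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY ALL_PROXY all_proxy
  echo "proxy disabled"
}

proxy-status() {
  echo "http_proxy=${http_proxy:-<unset>}"
  curl -s --max-time 5 https://httpbin.org/ip || echo "no connectivity"
}
```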
## What's Included
### Plugins
- **git** - Git integration
- **docker** - Docker support
- **ansible** - Ansible support
- **terraform** - OpenTofu/Terraform support
- **kubectl** - Kubernetes support
- **zsh-autosuggestions** - Command auto-suggestions
- **zsh-syntax-highlighting** - Syntax highlighting
- **zsh-completions** - Extra completions
### Aliases
- **Project management**: `mgmt-status`, `mgmt-deploy`, `mgmt-cleanup`
- **Ansible**: `ansible-check`, `ansible-deploy`, `ansible-ping`
- **OpenTofu**: `tofu-init`, `tofu-plan`, `tofu-apply`
- **Docker**: `dps`, `dex`, `dlog`, `dclean`
- **Kubernetes**: `k`, `kgp`, `kgs`, `kaf`
- **Git**: `gs`, `ga`, `gc`, `gp`, `gl`
- **System**: `ll`, `la`, `ports`, `myip`
### Theme
- **agnoster** - Feature-rich theme that shows Git status
## Updating the Configuration
After updating the configuration on the primary VPS:
1. Commit the changes:
```bash
cd /root/mgmt
git add configuration/zsh/
git commit -m "Update zsh configuration"
git push origin main
```
2. Sync on the other VPSes:
```bash
sync-zsh-config
```
## Custom Configuration
If you need host-specific settings on a particular VPS:
1. Edit `~/.zshrc`
2. Add your custom settings at the end of the file
3. These settings will not be overwritten by the sync script
## Troubleshooting
### If the autosuggestions plugin does not work
```bash
# Run the test script to check plugin status
chmod +x /root/mgmt/configuration/zsh/test-plugins.sh
/root/mgmt/configuration/zsh/test-plugins.sh
# Manually install any missing plugins
cd ~/.oh-my-zsh/custom/plugins
git clone https://github.com/zsh-users/zsh-autosuggestions
git clone https://github.com/zsh-users/zsh-syntax-highlighting.git
git clone https://github.com/zsh-users/zsh-completions
# Reload the configuration
source ~/.zshrc
```
### If syncing fails
```bash
# Check network connectivity
ping gitea.tailnet-68f9.ts.net
# Pull manually
cd /root/mgmt
git pull origin main
```
### If aliases do not work
```bash
# Reload the configuration
source ~/.zshrc
# Check the alias
alias | grep <alias-name>
```
### If plugins do not work
```bash
# Check the plugin directories
ls ~/.oh-my-zsh/plugins/
ls ~/.oh-my-zsh/custom/plugins/
# Run the test script
/root/mgmt/configuration/zsh/test-plugins.sh
```
## Security Notes
- This configuration contains credentials for accessing Gitea
- Use it only on trusted VPSes
- Rotate the access token regularly
## Support
If you run into problems, check that:
1. Network connectivity is working
2. The Git credentials are correct
3. The dependency packages are installed
4. File permissions are correct

View File

@ -1,281 +0,0 @@
#!/bin/bash
# ZSH configuration installation script
# Installs and syncs the oh-my-zsh configuration on other VPSes
set -euo pipefail
# Color definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
# Logging helpers
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# Ensure we are running as root
check_root() {
if [[ $EUID -ne 0 ]]; then
log_error "This script must be run as root"
exit 1
fi
}
# Set up the proxy (if needed)
setup_proxy() {
log_info "Checking proxy settings..."
# Check whether a proxy configuration already exists
if [[ -f "/root/mgmt/configuration/proxy.env" ]]; then
log_info "Found proxy configuration file, loading proxy settings..."
source "/root/mgmt/configuration/proxy.env"
# Test the proxy connection
if curl -s --connect-timeout 5 --proxy "$http_proxy" https://httpbin.org/ip >/dev/null 2>&1; then
log_success "Proxy connection OK, downloads will go through the proxy"
else
log_warning "Proxy connection failed, falling back to a direct connection"
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
fi
else
log_info "No proxy configuration found, using a direct connection"
fi
}
# Install dependencies
install_dependencies() {
log_info "Installing dependency packages..."
# Refresh the package lists
apt update
# Install the required packages
apt install -y \
zsh \
git \
curl \
wget \
htop \
tree \
jq \
tmux \
fonts-powerline \
fontconfig
log_success "Dependency packages installed"
}
# Install oh-my-zsh
install_oh_my_zsh() {
log_info "Installing oh-my-zsh..."
if [[ -d "$HOME/.oh-my-zsh" ]]; then
log_warning "oh-my-zsh is already installed, skipping this step"
return 0
fi
# Install oh-my-zsh
RUNZSH=no CHSH=no sh -c "$(curl -fsSL https://raw.github.com/ohmyzsh/ohmyzsh/master/tools/install.sh)"
log_success "oh-my-zsh installed"
}
# Install custom plugins
install_custom_plugins() {
log_info "Installing custom plugins..."
local custom_dir="$HOME/.oh-my-zsh/custom/plugins"
# zsh-autosuggestions
if [[ ! -d "$custom_dir/zsh-autosuggestions" ]]; then
log_info "Installing zsh-autosuggestions..."
git clone https://github.com/zsh-users/zsh-autosuggestions "$custom_dir/zsh-autosuggestions"
fi
# zsh-syntax-highlighting
if [[ ! -d "$custom_dir/zsh-syntax-highlighting" ]]; then
log_info "Installing zsh-syntax-highlighting..."
git clone https://github.com/zsh-users/zsh-syntax-highlighting.git "$custom_dir/zsh-syntax-highlighting"
fi
# zsh-completions
if [[ ! -d "$custom_dir/zsh-completions" ]]; then
log_info "Installing zsh-completions..."
git clone https://github.com/zsh-users/zsh-completions "$custom_dir/zsh-completions"
fi
log_success "Custom plugins installed"
}
# Deploy configuration files
deploy_configs() {
log_info "Deploying configuration files..."
local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Back up the existing configuration
if [[ -f "$HOME/.zshrc" ]]; then
log_info "Backing up the existing .zshrc..."
cp "$HOME/.zshrc" "$HOME/.zshrc.backup.$(date +%Y%m%d_%H%M%S)"
fi
# Deploy .zshrc
if [[ -f "$script_dir/zshrc.template" ]]; then
log_info "Deploying the .zshrc configuration..."
cp "$script_dir/zshrc.template" "$HOME/.zshrc"
else
log_error "zshrc.template not found"
exit 1
fi
# Deploy the custom configuration
if [[ -d "$script_dir/oh-my-zsh-custom" ]]; then
log_info "Deploying the custom oh-my-zsh configuration..."
# Copy the custom aliases file
if [[ -f "$script_dir/oh-my-zsh-custom/aliases.zsh" ]]; then
cp "$script_dir/oh-my-zsh-custom/aliases.zsh" "$HOME/.oh-my-zsh/custom/"
fi
fi
log_success "Configuration files deployed"
}
# Set the default shell
set_default_shell() {
log_info "Setting zsh as the default shell..."
# Make sure zsh is listed in /etc/shells
if ! grep -q "$(which zsh)" /etc/shells; then
log_info "Adding zsh to /etc/shells..."
echo "$(which zsh)" >> /etc/shells
fi
# Change the default shell
chsh -s "$(which zsh)"
log_success "Default shell set to zsh"
}
# Create the sync script
create_sync_script() {
log_info "Creating the sync script..."
cat > /usr/local/bin/sync-zsh-config << 'EOF'
#!/bin/bash
# ZSH configuration sync script
# Pulls the latest configuration from the Gitea repository
set -euo pipefail
MGMT_DIR="/root/mgmt"
ZSH_CONFIG_DIR="$MGMT_DIR/configuration/zsh"
log_info() {
echo -e "\033[0;34m[INFO]\033[0m $1"
}
log_success() {
echo -e "\033[0;32m[SUCCESS]\033[0m $1"
}
log_error() {
echo -e "\033[0;31m[ERROR]\033[0m $1"
}
# Make sure the mgmt directory exists
if [[ ! -d "$MGMT_DIR" ]]; then
log_error "mgmt directory does not exist: $MGMT_DIR"
exit 1
fi
# Enter the mgmt directory
cd "$MGMT_DIR"
# Pull the latest configuration
log_info "Pulling the latest configuration..."
git pull origin main
# Check the zsh configuration directory
if [[ ! -d "$ZSH_CONFIG_DIR" ]]; then
log_error "zsh configuration directory does not exist: $ZSH_CONFIG_DIR"
exit 1
fi
# Back up the current configuration
if [[ -f "$HOME/.zshrc" ]]; then
log_info "Backing up the current configuration..."
cp "$HOME/.zshrc" "$HOME/.zshrc.backup.$(date +%Y%m%d_%H%M%S)"
fi
# Deploy the new configuration
log_info "Deploying the new configuration..."
cp "$ZSH_CONFIG_DIR/zshrc.template" "$HOME/.zshrc"
# Deploy the custom aliases
if [[ -f "$ZSH_CONFIG_DIR/oh-my-zsh-custom/aliases.zsh" ]]; then
cp "$ZSH_CONFIG_DIR/oh-my-zsh-custom/aliases.zsh" "$HOME/.oh-my-zsh/custom/"
fi
log_success "ZSH configuration synced!"
log_info "Run 'source ~/.zshrc' or log in again to apply the new configuration"
EOF
chmod +x /usr/local/bin/sync-zsh-config
log_success "Sync script created: /usr/local/bin/sync-zsh-config"
}
# Show usage information
show_usage() {
log_success "ZSH configuration installed!"
echo ""
log_info "Usage:"
echo " 1. Log in again or run: source ~/.zshrc"
echo " 2. Sync the configuration: sync-zsh-config"
echo " 3. List aliases: alias"
echo ""
log_info "Available commands:"
echo " - mgmt-status, mgmt-deploy, mgmt-cleanup"
echo " - ansible-check, ansible-deploy, ansible-ping"
echo " - tofu-init, tofu-plan, tofu-apply"
echo " - dps, dex, dlog (Docker)"
echo " - k, kgp, kgs (Kubernetes)"
echo ""
}
# Main entry point
main() {
log_info "Starting ZSH configuration installation..."
check_root
setup_proxy
install_dependencies
install_oh_my_zsh
install_custom_plugins
deploy_configs
set_default_shell
create_sync_script
show_usage
log_success "Installation complete!"
}
# Run main
main "$@"

View File

@@ -1,251 +0,0 @@
# =============================================================================
# CUSTOM ALIASES FOR MANAGEMENT SYSTEM
# =============================================================================
# Project Management
alias mgmt='cd /root/mgmt'
alias mgmt-status='cd /root/mgmt && ./mgmt.sh status'
alias mgmt-deploy='cd /root/mgmt && ./mgmt.sh deploy'
alias mgmt-cleanup='cd /root/mgmt && ./mgmt.sh cleanup'
alias mgmt-swarm='cd /root/mgmt && ./mgmt.sh swarm'
alias mgmt-tofu='cd /root/mgmt && ./mgmt.sh tofu'
# Ansible Management
alias ansible-check='cd /root/mgmt/configuration && ansible-playbook --syntax-check'
alias ansible-deploy='cd /root/mgmt/configuration && ansible-playbook -i inventories/production/inventory.ini'
alias ansible-ping='cd /root/mgmt/configuration && ansible -i inventories/production/inventory.ini all -m ping'
alias ansible-vault='cd /root/mgmt/configuration && ansible-vault'
alias ansible-galaxy='cd /root/mgmt/configuration && ansible-galaxy'
# OpenTofu/Terraform Management
alias tofu-init='cd /root/mgmt/tofu/environments/dev && tofu init'
alias tofu-plan='cd /root/mgmt/tofu/environments/dev && tofu plan -var-file="terraform.tfvars"'
alias tofu-apply='cd /root/mgmt/tofu/environments/dev && tofu apply -var-file="terraform.tfvars"'
alias tofu-destroy='cd /root/mgmt/tofu/environments/dev && tofu destroy -var-file="terraform.tfvars"'
alias tofu-output='cd /root/mgmt/tofu/environments/dev && tofu output'
alias tofu-validate='cd /root/mgmt/tofu/environments/dev && tofu validate'
alias tofu-fmt='cd /root/mgmt/tofu/environments/dev && tofu fmt -recursive'
# Docker Management
alias d='docker'
alias dc='docker-compose'
alias dps='docker ps'
alias dpsa='docker ps -a'
alias di='docker images'
alias dex='docker exec -it'
alias dlog='docker logs -f'
alias dstop='docker stop'
alias dstart='docker start'
alias drm='docker rm'
alias drmi='docker rmi'
alias dclean='docker system prune -f'
alias dbuild='docker build'
alias drun='docker run'
alias dpull='docker pull'
alias dpush='docker push'
# Docker Swarm Management
alias dswarm='docker swarm'
alias dstack='docker stack'
alias dservice='docker service'
alias dnode='docker node'
alias dnetwork='docker network'
alias dsecret='docker secret'
alias dconfig='docker config'
alias dstack-ls='docker stack ls'
alias dstack-rm='docker stack rm'
alias dstack-deploy='docker stack deploy'
alias dservice-ls='docker service ls'
alias dservice-ps='docker service ps'
alias dservice-logs='docker service logs'
# Kubernetes Management
alias k='kubectl'
alias kgp='kubectl get pods'
alias kgs='kubectl get services'
alias kgd='kubectl get deployments'
alias kgn='kubectl get nodes'
alias kgi='kubectl get ingress'
alias kgc='kubectl get configmaps'
alias kgsec='kubectl get secrets'  # renamed from kgs to avoid overriding 'kubectl get services' above
alias kdp='kubectl describe pod'
alias kds='kubectl describe service'
alias kdd='kubectl describe deployment'
alias kdn='kubectl describe node'
alias kdi='kubectl describe ingress'
alias kaf='kubectl apply -f'
alias kdf='kubectl delete -f'
alias kl='kubectl logs -f'
alias ke='kubectl edit'
alias kx='kubectl exec -it'
alias kctx='kubectl config current-context'
alias kuse='kubectl config use-context'
# Git Management
alias gs='git status'
alias ga='git add'
alias gc='git commit'
alias gp='git push'
alias gl='git pull'
alias gd='git diff'
alias gb='git branch'
alias gco='git checkout'
alias gcom='git checkout main'
alias gcod='git checkout develop'
alias gst='git stash'
alias gstp='git stash pop'
alias gstl='git stash list'
alias gstc='git stash clear'
alias gcl='git clone'
alias gfe='git fetch'
alias gme='git merge'
alias gr='git rebase'
alias grc='git rebase --continue'
alias gra='git rebase --abort'
alias gres='git reset'
alias gresh='git reset --hard'
alias gress='git reset --soft'
# System Management
alias ll='ls -alF'
alias la='ls -A'
alias l='ls -CF'
alias ..='cd ..'
alias ...='cd ../..'
alias ....='cd ../../..'
alias grep='grep --color=auto'
alias fgrep='fgrep --color=auto'
alias egrep='egrep --color=auto'
alias ports='netstat -tuln'
alias myip='curl -s https://httpbin.org/ip | jq -r .origin'
alias speedtest='curl -s https://raw.githubusercontent.com/sivel/speedtest-cli/master/speedtest.py | python3'
alias psg='ps aux | grep'
alias top='htop'
alias cp='cp -i'
alias mv='mv -i'
alias rm='rm -i'
alias mkdir='mkdir -pv'
# Network Management
alias ping='ping -c 4'
alias traceroute='traceroute -n'
alias nmap='nmap -sS -O'
alias ss='ss -tuln'
# File Operations
alias find='find . -name'
alias locate='locate -i'
alias which='which -a'
alias whereis='whereis -b'
# Text Processing
alias cat='cat -n'
alias less='less -R'
alias more='more -R'
alias head='head -n 20'
alias tail='tail -n 20'
alias wc='wc -l'
# Archive Operations
alias tar='tar -v'
alias zip='zip -r'
alias unzip='unzip -l'
alias gzip='gzip -v'
alias gunzip='gunzip -v'
# Process Management
alias jobs='jobs -l'
alias bg='bg %'
alias fg='fg %'
alias kill='kill -9'
alias pkill='pkill -f'
# Environment
alias env='env | sort'
alias set='set | sort'
alias unset='unset'
alias export='export'
alias source='source'
# History
alias h='history'
alias hg='history | grep'
alias hc='history -c'
# Directory Navigation
alias cd..='cd ..'
alias cd...='cd ../..'
alias cd....='cd ../../..'
alias cd-='cd -'
alias cd~='cd ~'
alias cd/='cd /'
# Quick Access
alias vim='vim'
alias nano='nano'
alias emacs='emacs'
alias code='code'
alias subl='subl'
# Monitoring
alias df='df -h'
alias du='du -h'
alias free='free -h'
alias meminfo='cat /proc/meminfo'
alias cpuinfo='cat /proc/cpuinfo'
alias uptime='uptime -p'
# Security
alias chmod='chmod -v'
alias chown='chown -v'
alias chgrp='chgrp -v'
alias passwd='passwd'
alias su='su -'
alias sudo='sudo -E'
# Development
alias make='make -j$(nproc)'
alias cmake='cmake -DCMAKE_BUILD_TYPE=Release'
alias gcc='gcc -Wall -Wextra'
alias g++='g++ -Wall -Wextra'
alias python='python3'
alias pip='pip3'
alias node='node'
alias npm='npm'
alias yarn='yarn'
# Logs
alias journal='journalctl -f'
alias syslog='tail -f /var/log/syslog'
alias auth='tail -f /var/log/auth.log'
alias kern='tail -f /var/log/kern.log'
alias mail='tail -f /var/log/mail.log'
# Backup
alias backup='tar -czf backup-$(date +%Y%m%d-%H%M%S).tar.gz'
alias restore='tar -xzf'
# Cleanup
alias clean='rm -rf ~/.cache/* ~/.tmp/* /tmp/*'
alias clean-docker='docker system prune -af --volumes'
alias clean-k8s='kubectl delete pods --field-selector=status.phase=Succeeded'
alias clean-ansible='rm -rf ~/.ansible/tmp/*'
# Information
alias info='uname -a'
alias whoami='whoami'
alias id='id'
alias groups='groups'
alias users='users'
alias w='w'
alias who='who'
alias last='last -n 10'
# Proxy Management
alias proxy-on='/root/mgmt/scripts/utilities/proxy-toggle.sh on'
alias proxy-off='/root/mgmt/scripts/utilities/proxy-toggle.sh off'
alias proxy-toggle='/root/mgmt/scripts/utilities/proxy-toggle.sh toggle'
alias proxy-enable='/root/mgmt/scripts/utilities/proxy-toggle.sh enable'
alias proxy-disable='/root/mgmt/scripts/utilities/proxy-toggle.sh disable'
alias proxy-status='/root/mgmt/scripts/utilities/proxy-toggle.sh status'
alias proxy-test='/root/mgmt/scripts/utilities/proxy-toggle.sh test'

View File

@@ -1,12 +0,0 @@
# Put files in this folder to add your own custom functionality.
# See: https://github.com/ohmyzsh/ohmyzsh/wiki/Customization
#
# Files in the custom/ directory will be:
# - loaded automatically by the init script, in alphabetical order
# - loaded last, after all built-ins in the lib/ directory, to override them
# - ignored by git by default
#
# Example: add custom/shortcuts.zsh for shortcuts to your local projects
#
# brainstormr=~/Projects/development/planetargon/brainstormr
# cd $brainstormr

View File

@@ -1,6 +0,0 @@
# Put your custom themes in this folder.
# See: https://github.com/ohmyzsh/ohmyzsh/wiki/Customization#overriding-and-adding-themes
#
# Example:
PROMPT="%{$fg[red]%}%n%{$reset_color%}@%{$fg[blue]%}%m %{$fg[yellow]%}%~ %{$reset_color%}%% "

View File

@@ -1,153 +0,0 @@
#!/bin/bash
# 快速安装脚本 - 从 Gitea 仓库直接安装 ZSH 配置
# 用法: curl -fsSL https://your-gitea.com/ben/mgmt/raw/branch/main/configuration/zsh/quick-install.sh | bash
set -euo pipefail
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
# Gitea 仓库信息
GITEA_URL="https://ben:8d7d70f324796be650b79415303c31f567bf459b@gitea.tailnet-68f9.ts.net/ben/mgmt.git"
MGMT_DIR="/root/mgmt"
log_info "开始快速安装 ZSH 配置..."
# 检查 root 权限
if [[ $EUID -ne 0 ]]; then
log_error "此脚本需要 root 权限运行"
exit 1
fi
# 克隆或更新仓库
if [[ -d "$MGMT_DIR" ]]; then
log_info "更新现有仓库..."
cd "$MGMT_DIR"
git pull origin main
else
log_info "克隆仓库..."
git clone "$GITEA_URL" "$MGMT_DIR"
cd "$MGMT_DIR"
fi
# 询问用户是否使用代理
echo ""
log_info "网络环境检测:"
echo " 检测到可能需要代理访问外网资源(如 GitHub"
echo ""
log_info "是否使用代理进行安装?"
echo " Y - 使用代理安装(推荐,确保下载成功)"
echo " N - 直连安装(如果网络环境良好)"
echo ""
while true; do
read -p "请选择 (Y/n): " choice
case $choice in
[Yy]|"")
log_info "选择使用代理安装"
PROXY_URL="http://istoreos.tailnet-68f9.ts.net:1082"
# 测试代理连接
if curl -s --connect-timeout 5 --proxy "$PROXY_URL" https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh >/dev/null 2>&1; then
log_success "代理连接正常,设置代理环境"
export http_proxy="$PROXY_URL"
export https_proxy="$PROXY_URL"
export HTTP_PROXY="$PROXY_URL"
export HTTPS_PROXY="$PROXY_URL"
# 创建代理配置文件
cat > "$MGMT_DIR/configuration/proxy.env" << EOF
# Proxy Configuration for istoreos.tailnet-68f9.ts.net:1082
export http_proxy=${PROXY_URL}
export https_proxy=${PROXY_URL}
export HTTP_PROXY=${PROXY_URL}
export HTTPS_PROXY=${PROXY_URL}
export no_proxy=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net
export NO_PROXY=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net
export ALL_PROXY=${PROXY_URL}
export all_proxy=${PROXY_URL}
export GIT_HTTP_PROXY=${PROXY_URL}
export GIT_HTTPS_PROXY=${PROXY_URL}
export CURL_PROXY=${PROXY_URL}
export WGET_PROXY=${PROXY_URL}
EOF
else
log_error "代理连接失败,无法继续安装"
exit 1
fi
break
;;
[Nn])
log_info "选择直连安装"
# 测试直连
if curl -s --connect-timeout 5 https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh >/dev/null 2>&1; then
log_success "直连正常,开始安装"
else
log_error "直连失败,无法继续安装"
exit 1
fi
break
;;
*)
log_warning "无效选择,请输入 Y 或 N"
;;
esac
done
# 运行安装脚本
log_info "运行 ZSH 配置安装脚本..."
chmod +x "$MGMT_DIR/configuration/zsh/install-zsh-config.sh"
"$MGMT_DIR/configuration/zsh/install-zsh-config.sh"
log_success "快速安装完成!"
# 如果创建了代理配置,询问是否保持
if [[ -f "$MGMT_DIR/configuration/proxy.env" ]]; then
echo ""
log_info "安装完成!代理已临时开启用于安装。"
echo ""
log_info "是否保持代理开启?"
echo " Y - 保持代理开启(推荐,方便访问外网)"
echo " N - 关闭代理(如果不需要访问外网)"
echo ""
while true; do
read -p "请选择 (Y/n): " choice
case $choice in
[Yy]|"")
log_success "代理保持开启"
log_info "使用 'proxy-status' 查看代理状态"
log_info "使用 'proxy-toggle' 切换代理状态"
break
;;
[Nn])
log_info "关闭代理..."
rm -f "$MGMT_DIR/configuration/proxy.env"
log_success "代理已关闭"
break
;;
*)
log_warning "无效选择,请输入 Y 或 N"
;;
esac
done
fi
log_info "请重新登录或运行: source ~/.zshrc"

View File

@@ -1,215 +0,0 @@
#!/bin/bash
# 智能安装脚本 - 自动检测网络环境并设置代理
# 用法: curl -fsSL https://your-gitea.com/ben/mgmt/raw/branch/main/configuration/zsh/smart-install.sh | bash
set -euo pipefail
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
# Gitea 仓库信息
GITEA_URL="https://ben:8d7d70f324796be650b79415303c31f567bf459b@gitea.tailnet-68f9.ts.net/ben/mgmt.git"
MGMT_DIR="/root/mgmt"
PROXY_HOST="istoreos.tailnet-68f9.ts.net"
PROXY_PORT="1082"
PROXY_URL="http://${PROXY_HOST}:${PROXY_PORT}"
# 检查 root 权限
if [[ $EUID -ne 0 ]]; then
log_error "此脚本需要 root 权限运行"
exit 1
fi
# 询问用户是否使用代理
ask_proxy_usage() {
echo ""
log_info "网络环境检测:"
echo " 检测到可能需要代理访问外网资源(如 GitHub"
echo ""
log_info "是否使用代理进行安装?"
echo " Y - 使用代理安装(推荐,确保下载成功)"
echo " N - 直连安装(如果网络环境良好)"
echo ""
while true; do
read -p "请选择 (Y/n): " choice
case $choice in
[Yy]|"")
log_info "选择使用代理安装"
return 0
;;
[Nn])
log_info "选择直连安装"
return 1
;;
*)
log_warning "无效选择,请输入 Y 或 N"
;;
esac
done
}
# 测试代理连接
test_proxy_connection() {
log_info "测试代理连接..."
if curl -s --connect-timeout 5 --proxy "$PROXY_URL" https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh >/dev/null 2>&1; then
log_success "代理连接正常"
return 0
else
log_error "代理连接失败"
return 1
fi
}
# 测试直连
test_direct_connection() {
log_info "测试直连..."
if curl -s --connect-timeout 5 https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh >/dev/null 2>&1; then
log_success "直连正常"
return 0
else
log_error "直连失败"
return 1
fi
}
# 设置代理环境
setup_proxy_env() {
log_info "设置代理环境..."
export http_proxy="$PROXY_URL"
export https_proxy="$PROXY_URL"
export HTTP_PROXY="$PROXY_URL"
export HTTPS_PROXY="$PROXY_URL"
export no_proxy="localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net"
export NO_PROXY="localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net"
log_success "代理环境已设置"
}
# 克隆或更新仓库
clone_repository() {
log_info "获取配置仓库..."
if [[ -d "$MGMT_DIR" ]]; then
log_info "更新现有仓库..."
cd "$MGMT_DIR"
git pull origin main
else
log_info "克隆仓库..."
git clone "$GITEA_URL" "$MGMT_DIR"
cd "$MGMT_DIR"
fi
}
# 创建代理配置文件
create_proxy_config() {
log_info "创建代理配置文件..."
cat > "$MGMT_DIR/configuration/proxy.env" << EOF
# Proxy Configuration for ${PROXY_HOST}:${PROXY_PORT}
export http_proxy=${PROXY_URL}
export https_proxy=${PROXY_URL}
export HTTP_PROXY=${PROXY_URL}
export HTTPS_PROXY=${PROXY_URL}
export no_proxy=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net
export NO_PROXY=localhost,127.0.0.1,::1,.local,.tailnet-68f9.ts.net
export ALL_PROXY=${PROXY_URL}
export all_proxy=${PROXY_URL}
export GIT_HTTP_PROXY=${PROXY_URL}
export GIT_HTTPS_PROXY=${PROXY_URL}
export CURL_PROXY=${PROXY_URL}
export WGET_PROXY=${PROXY_URL}
EOF
log_success "代理配置文件已创建"
}
# 询问用户是否保持代理
ask_proxy_keep() {
if [[ -f "$MGMT_DIR/configuration/proxy.env" ]]; then
echo ""
log_info "安装完成!代理已临时开启用于安装。"
echo ""
log_info "是否保持代理开启?"
echo " Y - 保持代理开启(推荐,方便访问外网)"
echo " N - 关闭代理(如果不需要访问外网)"
echo ""
while true; do
read -p "请选择 (Y/n): " choice
case $choice in
[Yy]|"")
log_success "代理保持开启"
log_info "使用 'proxy-status' 查看代理状态"
log_info "使用 'proxy-toggle' 切换代理状态"
break
;;
[Nn])
log_info "关闭代理..."
if [[ -f "$MGMT_DIR/scripts/utilities/proxy-toggle.sh" ]]; then
"$MGMT_DIR/scripts/utilities/proxy-toggle.sh" disable
else
rm -f "$MGMT_DIR/configuration/proxy.env"
log_success "代理已关闭"
fi
break
;;
*)
log_warning "无效选择,请输入 Y 或 N"
;;
esac
done
fi
}
# 主安装流程
main() {
log_info "开始智能安装 ZSH 配置..."
# 询问用户是否使用代理
if ask_proxy_usage; then
# 用户选择使用代理
if test_proxy_connection; then
setup_proxy_env
create_proxy_config
log_success "代理环境已设置,开始安装..."
else
log_error "代理连接失败,无法继续安装"
exit 1
fi
else
# 用户选择直连
if test_direct_connection; then
log_success "直连正常,开始安装..."
else
log_error "直连失败,无法继续安装"
exit 1
fi
fi
# 克隆仓库
clone_repository
# 运行安装脚本
log_info "运行 ZSH 配置安装脚本..."
chmod +x "$MGMT_DIR/configuration/zsh/install-zsh-config.sh"
"$MGMT_DIR/configuration/zsh/install-zsh-config.sh"
log_success "智能安装完成!"
# 如果使用了代理,询问是否保持
ask_proxy_keep
}
main "$@"

View File

@@ -1,151 +0,0 @@
#!/bin/bash
# 测试 ZSH 插件是否正确安装
set -euo pipefail
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
test_plugin() {
local plugin_name="$1"
local plugin_path="$2"
if [[ -d "$plugin_path" ]]; then
log_success "$plugin_name 已安装"
return 0
else
log_error "$plugin_name 未安装: $plugin_path"
return 1
fi
}
test_alias() {
local alias_name="$1"
if alias "$alias_name" &>/dev/null; then
log_success "✓ 别名 $alias_name 已加载"
return 0
else
log_warning "✗ 别名 $alias_name 未加载"
return 1
fi
}
main() {
log_info "测试 ZSH 插件和配置..."
echo ""
local failed=0
# 测试 oh-my-zsh 安装
log_info "检查 oh-my-zsh 安装..."
if [[ -d "$HOME/.oh-my-zsh" ]]; then
log_success "✓ oh-my-zsh 已安装"
else
log_error "✗ oh-my-zsh 未安装"
        failed=$((failed + 1))  # 避免 ((failed++)) 在值为 0 时返回非零、触发 set -e 退出
fi
echo ""
# 测试自定义插件
log_info "检查自定义插件..."
test_plugin "zsh-autosuggestions" "$HOME/.oh-my-zsh/custom/plugins/zsh-autosuggestions" || ((failed++))
test_plugin "zsh-syntax-highlighting" "$HOME/.oh-my-zsh/custom/plugins/zsh-syntax-highlighting" || ((failed++))
test_plugin "zsh-completions" "$HOME/.oh-my-zsh/custom/plugins/zsh-completions" || ((failed++))
echo ""
# 测试内置插件
log_info "检查内置插件..."
test_plugin "git" "$HOME/.oh-my-zsh/plugins/git" || ((failed++))
test_plugin "docker" "$HOME/.oh-my-zsh/plugins/docker" || ((failed++))
test_plugin "ansible" "$HOME/.oh-my-zsh/plugins/ansible" || ((failed++))
test_plugin "terraform" "$HOME/.oh-my-zsh/plugins/terraform" || ((failed++))
test_plugin "kubectl" "$HOME/.oh-my-zsh/plugins/kubectl" || ((failed++))
echo ""
# 测试自定义别名文件
log_info "检查自定义别名..."
if [[ -f "$HOME/.oh-my-zsh/custom/aliases.zsh" ]]; then
log_success "✓ 自定义别名文件已安装"
else
log_warning "✗ 自定义别名文件未安装"
fi
echo ""
# 测试一些关键别名
log_info "检查关键别名..."
test_alias "mgmt" || ((failed++))
test_alias "dps" || ((failed++))
test_alias "k" || ((failed++))
test_alias "gs" || ((failed++))
echo ""
# 测试 .zshrc 文件
log_info "检查 .zshrc 配置..."
if [[ -f "$HOME/.zshrc" ]]; then
log_success "✓ .zshrc 文件存在"
# 检查关键配置
if grep -q "zsh-autosuggestions" "$HOME/.zshrc"; then
log_success "✓ zsh-autosuggestions 已配置"
else
log_warning "✗ zsh-autosuggestions 未配置"
fi
if grep -q "zsh-syntax-highlighting" "$HOME/.zshrc"; then
log_success "✓ zsh-syntax-highlighting 已配置"
else
log_warning "✗ zsh-syntax-highlighting 未配置"
fi
if grep -q "agnoster" "$HOME/.zshrc"; then
log_success "✓ agnoster 主题已配置"
else
log_warning "✗ agnoster 主题未配置"
fi
else
log_error "✗ .zshrc 文件不存在"
        failed=$((failed + 1))
fi
echo ""
# 总结
if [[ $failed -eq 0 ]]; then
log_success "🎉 所有测试通过ZSH 配置完整。"
echo ""
log_info "使用方法:"
echo " - 重新登录或运行: source ~/.zshrc"
echo " - 测试自动建议: 输入 'docker' 然后按 → 键"
echo " - 测试别名: 运行 'mgmt-status' 或 'dps'"
else
log_error "❌ 发现 $failed 个问题,请检查安装。"
echo ""
log_info "修复建议:"
echo " 1. 重新运行安装脚本"
echo " 2. 检查网络连接"
echo " 3. 手动安装缺失的插件"
fi
}
main "$@"

View File

@@ -1,260 +0,0 @@
# If you come from bash you might have to change your $PATH.
# export PATH=$HOME/bin:$HOME/.local/bin:/usr/local/bin:$PATH
# Path to your Oh My Zsh installation.
export ZSH="$HOME/.oh-my-zsh"
# Set name of the theme to load --- if set to "random", it will
# load a random theme each time Oh My Zsh is loaded, in which case,
# to know which specific one was loaded, run: echo $RANDOM_THEME
# See https://github.com/ohmyzsh/ohmyzsh/wiki/Themes
ZSH_THEME="agnoster"
# Set list of themes to pick from when loading at random
# Setting this variable when ZSH_THEME=random will cause zsh to load
# a theme from this variable instead of looking in $ZSH/themes/
# If set to an empty array, this variable will have no effect.
# ZSH_THEME_RANDOM_CANDIDATES=( "robbyrussell" "agnoster" )
# Uncomment the following line to use case-sensitive completion.
# CASE_SENSITIVE="true"
# Uncomment the following line to use hyphen-insensitive completion.
# Case-sensitive completion must be off. _ and - will be interchangeable.
# HYPHEN_INSENSITIVE="true"
# Uncomment one of the following lines to change the auto-update behavior
# zstyle ':omz:update' mode disabled # disable automatic updates
# zstyle ':omz:update' mode auto # update automatically without asking
zstyle ':omz:update' mode reminder # just remind me to update when it's time
# Uncomment the following line to change how often to auto-update (in days).
# zstyle ':omz:update' frequency 13
# Uncomment the following line if pasting URLs and other text is messed up.
# DISABLE_MAGIC_FUNCTIONS="true"
# Uncomment the following line to disable colors in ls.
# DISABLE_LS_COLORS="true"
# Uncomment the following line to disable auto-setting terminal title.
# DISABLE_AUTO_TITLE="true"
# Uncomment the following line to enable command auto-correction.
# ENABLE_CORRECTION="true"
# Uncomment the following line to display red dots whilst waiting for completion.
# You can also set it to another string to have that shown instead of the default red dots.
# e.g. COMPLETION_WAITING_DOTS="%F{yellow}waiting...%f"
# Caution: this setting can cause issues with multiline prompts in zsh < 5.7.1 (see #5765)
# COMPLETION_WAITING_DOTS="true"
# Uncomment the following line if you want to disable marking untracked files
# under VCS as dirty. This makes repository status check for large repositories
# much, much faster.
# DISABLE_UNTRACKED_FILES_DIRTY="true"
# Uncomment the following line if you want to change the command execution time
# stamp shown in the history command output.
# You can set one of the optional three formats:
# "mm/dd/yyyy"|"dd.mm.yyyy"|"yyyy-mm-dd"
# or set a custom format using the strftime function format specifications,
# see 'man strftime' for details.
# HIST_STAMPS="mm/dd/yyyy"
# Would you like to use another custom folder than $ZSH/custom?
# ZSH_CUSTOM=/path/to/new-custom-folder
# Which plugins would you like to load?
# Standard plugins can be found in $ZSH/plugins/
# Custom plugins may be added to $ZSH_CUSTOM/plugins/
# Example format: plugins=(rails git textmate ruby lighthouse)
# Add wisely, as too many plugins slow down shell startup.
plugins=(
git
docker
docker-compose
ansible
terraform
kubectl
helm
aws
gcloud
zsh-autosuggestions
zsh-syntax-highlighting
zsh-completions
colored-man-pages
command-not-found
extract
history-substring-search
sudo
systemd
tmux
vscode
web-search
z
)
source $ZSH/oh-my-zsh.sh
# User configuration
# export MANPATH="/usr/local/man:$MANPATH"
# You may need to manually set your language environment
# export LANG=en_US.UTF-8
# Preferred editor for local and remote sessions
if [[ -n $SSH_CONNECTION ]]; then
export EDITOR='vim'
else
export EDITOR='vim'
fi
# Compilation flags
# export ARCHFLAGS="-arch $(uname -m)"
# =============================================================================
# CUSTOM CONFIGURATION FOR MANAGEMENT SYSTEM
# =============================================================================
# Load proxy configuration if exists
if [[ -f /root/mgmt/configuration/proxy.env ]]; then
source /root/mgmt/configuration/proxy.env
fi
# Project management aliases
alias mgmt='cd /root/mgmt'
alias mgmt-status='cd /root/mgmt && ./mgmt.sh status'
alias mgmt-deploy='cd /root/mgmt && ./mgmt.sh deploy'
alias mgmt-cleanup='cd /root/mgmt && ./mgmt.sh cleanup'
# Ansible aliases
alias ansible-check='cd /root/mgmt/configuration && ansible-playbook --syntax-check'
alias ansible-deploy='cd /root/mgmt/configuration && ansible-playbook -i inventories/production/inventory.ini'
alias ansible-ping='cd /root/mgmt/configuration && ansible -i inventories/production/inventory.ini all -m ping'
# OpenTofu/Terraform aliases
alias tofu-init='cd /root/mgmt/tofu/environments/dev && tofu init'
alias tofu-plan='cd /root/mgmt/tofu/environments/dev && tofu plan -var-file="terraform.tfvars"'
alias tofu-apply='cd /root/mgmt/tofu/environments/dev && tofu apply -var-file="terraform.tfvars"'
alias tofu-destroy='cd /root/mgmt/tofu/environments/dev && tofu destroy -var-file="terraform.tfvars"'
alias tofu-output='cd /root/mgmt/tofu/environments/dev && tofu output'
# Docker aliases
alias d='docker'
alias dc='docker-compose'
alias dps='docker ps'
alias dpsa='docker ps -a'
alias di='docker images'
alias dex='docker exec -it'
alias dlog='docker logs -f'
alias dstop='docker stop'
alias dstart='docker start'
alias drm='docker rm'
alias drmi='docker rmi'
alias dclean='docker system prune -f'
# Docker Swarm aliases
alias dswarm='docker swarm'
alias dstack='docker stack'
alias dservice='docker service'
alias dnode='docker node'
alias dnetwork='docker network'
alias dsecret='docker secret'
alias dconfig='docker config'
# Kubernetes aliases
alias k='kubectl'
alias kgp='kubectl get pods'
alias kgs='kubectl get services'
alias kgd='kubectl get deployments'
alias kgn='kubectl get nodes'
alias kdp='kubectl describe pod'
alias kds='kubectl describe service'
alias kdd='kubectl describe deployment'
alias kaf='kubectl apply -f'
alias kdf='kubectl delete -f'
alias kl='kubectl logs -f'
# Git aliases
alias gs='git status'
alias ga='git add'
alias gc='git commit'
alias gp='git push'
alias gl='git pull'
alias gd='git diff'
alias gb='git branch'
alias gco='git checkout'
alias gcom='git checkout main'
alias gcod='git checkout develop'
alias gst='git stash'
alias gstp='git stash pop'
# System aliases
alias ll='ls -alF'
alias la='ls -A'
alias l='ls -CF'
alias ..='cd ..'
alias ...='cd ../..'
alias ....='cd ../../..'
alias grep='grep --color=auto'
alias fgrep='fgrep --color=auto'
alias egrep='egrep --color=auto'
# Network aliases
alias ports='netstat -tuln'
alias myip='curl -s https://httpbin.org/ip | jq -r .origin'
alias speedtest='curl -s https://raw.githubusercontent.com/sivel/speedtest-cli/master/speedtest.py | python3'
# Process aliases
alias psg='ps aux | grep'
alias top='htop'
# File operations
alias cp='cp -i'
alias mv='mv -i'
alias rm='rm -i'
alias mkdir='mkdir -pv'
# History configuration
HISTSIZE=10000
SAVEHIST=10000
HISTFILE=~/.zsh_history
setopt HIST_VERIFY
setopt SHARE_HISTORY
setopt APPEND_HISTORY
setopt INC_APPEND_HISTORY
setopt HIST_IGNORE_DUPS
setopt HIST_IGNORE_ALL_DUPS
setopt HIST_REDUCE_BLANKS
setopt HIST_IGNORE_SPACE
# Auto-completion configuration
autoload -U compinit && compinit
zstyle ':completion:*' matcher-list 'm:{a-zA-Z}={A-Za-z}'
zstyle ':completion:*' list-colors "${(s.:.)LS_COLORS}"
zstyle ':completion:*' menu select
# Key bindings
bindkey '^[[A' history-substring-search-up
bindkey '^[[B' history-substring-search-down
bindkey '^[[1;5C' forward-word
bindkey '^[[1;5D' backward-word
# Auto-suggestions configuration
ZSH_AUTOSUGGEST_HIGHLIGHT_STYLE='fg=8'
ZSH_AUTOSUGGEST_STRATEGY=(history completion)
# Syntax highlighting configuration
ZSH_HIGHLIGHT_HIGHLIGHTERS=(main brackets pattern cursor)
# Welcome message
echo "🚀 Management System Shell Ready!"
echo "📁 Project: /root/mgmt"
echo "🔧 Available commands: mgmt-status, mgmt-deploy, mgmt-cleanup"
echo "🐳 Docker: d, dc, dps, dex, dlog"
echo "☸️ Kubernetes: k, kgp, kgs, kaf, kdf"
echo "🏗️ OpenTofu: tofu-init, tofu-plan, tofu-apply"
echo "⚙️ Ansible: ansible-check, ansible-deploy, ansible-ping"
echo ""

View File

@@ -0,0 +1,147 @@
# Consul 集群故障排除指南
## 问题诊断
### 发现的问题
1. **DNS 解析失败**:服务间无法通过服务名相互发现
2. **网络连通性问题**`ash3c` 节点网络配置异常(地址显示为 0.0.0.0
3. **跨节点通信失败**`no route to host` 错误
4. **集群无法形成**:持续的 "No cluster leader" 错误
### 根本原因
- Docker Swarm overlay 网络在跨节点环境中的服务发现机制存在问题
- `ash3c` 节点的网络配置可能有问题
- 防火墙或网络策略可能阻止了 Consul 集群通信端口
## 解决方案
### 方案 1单节点 Consul临时解决方案
**文件**: `swarm/stacks/consul-single-node.yml`
**优点**: 简单、可靠、立即可用
**缺点**: 没有高可用性
```bash
docker stack deploy -c swarm/stacks/consul-single-node.yml consul
```
### 方案 2使用主机网络的集群配置
**文件**: `swarm/stacks/consul-cluster-host-network.yml`
**优点**: 绕过 overlay 网络问题
**缺点**: 需要手动配置 IP 地址
### 方案 3修复后的 overlay 网络配置
**文件**: `swarm/stacks/consul-cluster-fixed.yml`
**优点**: 使用 Docker 原生网络
**缺点**: 需要解决底层网络问题
### 方案 4macvlan 网络配置
**文件**: `swarm/stacks/consul-cluster-macvlan.yml`
**优点**: 直接使用物理网络
**缺点**: 需要网络管理员权限和配置
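下面给出创建 macvlan 网络的一个示意(子网、网关与父接口 `eth0` 均为假设值,请按实际物理网络调整):
```bash
# 在每个节点上创建仅含配置的 macvlan 模板网络(参数为示例值)
docker network create --config-only \
  --subnet=10.0.0.0/24 --gateway=10.0.0.1 \
  -o parent=eth0 consul-macvlan-config
# 在 manager 节点上创建 Swarm 作用域的 macvlan 网络,供 stack 引用
docker network create -d macvlan --scope swarm \
  --config-from consul-macvlan-config consul-macvlan
```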
## 网络诊断步骤
### 1. 检查节点状态
```bash
docker node ls
docker node inspect <node-name> --format '{{.Status.Addr}}'
```
### 2. 检查网络连通性
```bash
# 在 master 节点上测试到 ash3c 的连通性
ping <ash3c-ip>
telnet <ash3c-ip> 8301
```
### 3. 检查防火墙设置
```bash
# 确保以下端口开放
# 8300: Consul server RPC
# 8301: Consul Serf LAN
# 8302: Consul Serf WAN
# 8500: Consul HTTP API
# 8600: Consul DNS
```
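以 ufw 为例,开放上述端口的示意命令如下(若使用 firewalld 或云安全组,请对应调整):
```bash
# 在每个 Consul 节点上开放集群通信端口(以 ufw 为例)
ufw allow 8300/tcp                         # Server RPC
ufw allow 8301/tcp && ufw allow 8301/udp   # Serf LAN
ufw allow 8302/tcp && ufw allow 8302/udp   # Serf WAN
ufw allow 8500/tcp                         # HTTP API
ufw allow 8600/tcp && ufw allow 8600/udp   # DNS
```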
### 4. 检查 Docker Swarm 网络
```bash
docker network ls
docker network inspect <network-name>
```
## 推荐的修复流程
### 立即解决方案(单节点)
1. 部署单节点 Consul 以恢复服务
2. 验证基本功能正常
### 长期解决方案(集群)
1. 修复 `ash3c` 节点的网络配置
2. 确保节点间网络连通性
3. 配置防火墙规则
4. 重新部署集群配置
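完成上述修复后,可按如下示意重新部署(堆栈名 `consul` 沿用前文):
```bash
# 移除旧堆栈并重新部署修复后的 overlay 配置
docker stack rm consul
sleep 15   # 等待旧服务与网络清理完毕
docker stack deploy -c swarm/stacks/consul-cluster-fixed.yml consul
```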
## 验证步骤
### 单节点验证
```bash
# 检查服务状态
docker service ls | grep consul
# 检查日志
docker service logs consul_consul
# 访问 Web UI
curl http://localhost:8500/v1/status/leader
```
### 集群验证
```bash
# 检查集群成员
docker exec <consul-container> consul members
# 检查领导者
docker exec <consul-container> consul operator raft list-peers
```
## 常见问题
### Q: 为什么服务发现不工作?
A: Docker Swarm 的 overlay 网络在某些配置下可能存在 DNS 解析问题,特别是跨节点通信时。
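可以用下面的示意命令在 overlay 网络内直接验证服务发现(网络名 `consul_default` 为假设值,且该网络需在定义中开启 `attachable: true` 才能被独立容器加入):
```bash
# 在堆栈的 overlay 网络中启动临时容器,测试对 consul_consul 服务的 DNS 解析
docker run --rm --network consul_default busybox nslookup consul_consul
# 各任务实例可通过 tasks.<服务名> 解析
docker run --rm --network consul_default busybox nslookup tasks.consul_consul
```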
### Q: 如何选择合适的网络方案?
A:
- 开发/测试环境:使用单节点或 overlay 网络
- 生产环境:推荐使用 macvlan 或主机网络以获得更好的性能和可靠性
### Q: 集群恢复后数据会丢失吗?
A: 如果使用了持久化卷,数据不会丢失。但建议在修复前备份重要数据。
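修复前可先做一次快照备份,示意如下(`<consul-container>` 为占位符):
```bash
# 在修复前备份 Consul 数据(<consul-container> 为占位符)
docker exec <consul-container> consul snapshot save /tmp/consul-backup.snap
docker cp <consul-container>:/tmp/consul-backup.snap ./consul-backup-$(date +%Y%m%d).snap
```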
## 监控和维护
### 健康检查
```bash
# 定期检查集群状态
consul members
consul operator raft list-peers
```
### 日志监控
```bash
# 监控关键错误
docker service logs consul_consul | grep -E "(ERROR|WARN)"
```
### 性能监控
- 监控 Consul 的 HTTP API 响应时间
- 检查集群同步延迟
- 监控网络连接数
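例如,可用 curl 粗略测量 HTTP API 的响应时间(地址为示例,按实际节点替换):
```bash
# 测量 leader 接口的总耗时
curl -s -o /dev/null -w "time_total: %{time_total}s\n" http://localhost:8500/v1/status/leader
```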
## 联系支持
如果问题持续存在,请提供以下信息:
1. Docker 版本和 Swarm 配置
2. 网络拓扑图
3. 完整的服务日志
4. 节点间网络测试结果

View File

@@ -0,0 +1,240 @@
# ZSH 配置总结
## 已安装和配置的组件
### 1. 基础组件
- ✅ **oh-my-zsh**: 已安装并配置
- ✅ **zsh**: 版本 5.9
- ✅ **Powerline 字体**: 已安装支持
- ✅ **tmux**: 已安装
### 2. 核心插件
- ✅ **git**: Git 集成和别名
- ✅ **docker**: Docker 命令补全和别名
- ✅ **docker-compose**: Docker Compose 支持
- ✅ **ansible**: Ansible 命令补全
- ✅ **terraform**: Terraform/OpenTofu 支持
- ✅ **kubectl**: Kubernetes 命令补全
- ✅ **helm**: Helm 包管理器支持
- ✅ **aws**: AWS CLI 支持
- ✅ **gcloud**: Google Cloud CLI 支持
### 3. 增强插件
- ✅ **zsh-autosuggestions**: 命令自动建议
- ✅ **zsh-syntax-highlighting**: 语法高亮
- ✅ **zsh-completions**: 增强补全功能
- ✅ **colored-man-pages**: 彩色手册页
- ✅ **command-not-found**: 命令未找到提示
- ✅ **extract**: 解压文件支持
- ✅ **history-substring-search**: 历史搜索
- ✅ **sudo**: sudo 支持
- ✅ **systemd**: systemd 服务管理
- ✅ **tmux**: tmux 集成
- ✅ **vscode**: VS Code 集成
- ✅ **web-search**: 网络搜索
- ✅ **z**: 智能目录跳转
### 4. 主题
- ✅ **agnoster**: 功能丰富的主题,支持 Git 状态显示
## 自定义别名
### 项目管理别名
```bash
mgmt # 进入管理项目目录
mgmt-status # 显示项目状态
mgmt-deploy # 快速部署
mgmt-cleanup # 清理环境
mgmt-swarm # Swarm 管理
mgmt-tofu # OpenTofu 管理
```
### Ansible 别名
```bash
ansible-check # 语法检查
ansible-deploy # 部署
ansible-ping # 连通性测试
ansible-vault # 密码管理
ansible-galaxy # 角色管理
```
### OpenTofu/Terraform 别名
```bash
tofu-init # 初始化
tofu-plan # 计划
tofu-apply # 应用
tofu-destroy # 销毁
tofu-output # 输出
tofu-validate # 验证
tofu-fmt # 格式化
```
### Docker 别名
```bash
d # docker
dc # docker-compose
dps # docker ps
dpsa # docker ps -a
di # docker images
dex # docker exec -it
dlog # docker logs -f
dclean # docker system prune -f
```
### Docker Swarm 别名
```bash
dswarm # docker swarm
dstack # docker stack
dservice # docker service
dnode # docker node
dnetwork # docker network
dsecret # docker secret
dconfig # docker config
```
### Kubernetes 别名
```bash
k # kubectl
kgp # kubectl get pods
kgs # kubectl get services
kgd # kubectl get deployments
kgn # kubectl get nodes
kaf # kubectl apply -f
kdf # kubectl delete -f
kl # kubectl logs -f
```
### Git 别名
```bash
gs # git status
ga # git add
gc # git commit
gp # git push
gl # git pull
gd # git diff
gb # git branch
gco # git checkout
```
### 系统别名
```bash
ll # ls -alF
la # ls -A
l # ls -CF
.. # cd ..
... # cd ../..
.... # cd ../../..
grep # grep --color=auto
ports # netstat -tuln
myip # 获取公网IP
speedtest # 网速测试
psg # ps aux | grep
top # htop
```
## 配置文件位置
- **主配置**: `~/.zshrc`
- **自定义别名**: `~/.oh-my-zsh/custom/aliases.zsh`
- **代理配置**: `/root/mgmt/configuration/proxy.env`
## 使用方法
### 启动 ZSH
```bash
zsh
```
### 重新加载配置
```bash
source ~/.zshrc
```
### 查看所有别名
```bash
alias
```
### 查看特定别名
```bash
alias | grep docker
alias | grep mgmt
```
## 功能特性
### 1. 自动建议
- 输入命令时会显示历史命令建议
- 使用 `→` 键接受建议
### 2. 语法高亮
- 命令输入时实时语法高亮
- 错误命令显示为红色
### 3. 智能补全
- 支持所有已安装工具的补全
- 支持文件路径补全
- 支持命令参数补全
### 4. 历史搜索
- 使用 `↑` `↓` 键搜索历史命令
- 支持部分匹配搜索
### 5. 目录跳转
- 使用 `z` 命令智能跳转到常用目录
- 基于访问频率和最近访问时间
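例如(目录名为示例,实际以本机访问历史为准):
```bash
z mgmt        # 跳转到最常访问的、名称匹配 mgmt 的目录
z conf prod   # 多个关键字可进一步缩小匹配范围
```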
### 6. 代理支持
- 自动加载代理配置
- 支持 HTTP/HTTPS 代理
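可用下面的命令确认代理配置是否已生效(路径沿用本文档约定):
```bash
# 手动加载并检查代理环境变量
[ -f /root/mgmt/configuration/proxy.env ] && source /root/mgmt/configuration/proxy.env
env | grep -i proxy
```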
## 故障排除
### 如果别名不工作
```bash
# 检查别名是否加载
alias | grep <alias-name>
# 重新加载配置
source ~/.zshrc
```
### 如果插件不工作
```bash
# 检查插件是否安装
ls ~/.oh-my-zsh/plugins/ | grep <plugin-name>
# 检查自定义插件
ls ~/.oh-my-zsh/custom/plugins/
```
### 如果主题显示异常
```bash
# 检查字体是否安装
fc-list | grep Powerline
# 尝试其他主题
# 编辑 ~/.zshrc 中的 ZSH_THEME
```
## 扩展建议
### 可以添加的额外插件
- **fzf**: 模糊查找
- **bat**: 更好的 cat 命令
- **exa**: 更好的 ls 命令
- **ripgrep**: 更快的 grep
- **fd**: 更快的 find
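以上工具在 Debian/Ubuntu 上的安装示意如下包名以发行版实际提供为准exa 在较新发行版中已由 eza 取代):
```bash
# 安装上面列出的增强工具Debian/Ubuntu 包名,供参考)
apt update
apt install -y fzf bat ripgrep fd-find
# 注意:部分发行版中可执行文件名为 batcat 与 fdfind
```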
### 可以添加的额外别名
- 根据个人使用习惯添加更多别名
- 为常用命令组合创建别名
- 为项目特定命令创建别名
## 性能优化
- 已配置的插件数量适中,不会显著影响启动速度
- 使用 `zsh-completions` 提供更好的补全性能
- 历史记录配置优化,避免内存占用过大
配置完成!现在您拥有了一个功能强大、高度定制的 ZSH 环境,专门为管理系统的需求进行了优化。

View File

@@ -0,0 +1,137 @@
#!/bin/bash
set -e
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# 日志函数
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# 检查必要的文件
check_prerequisites() {
log_info "检查前置条件..."
if [ ! -f "configuration/inventories/production/nomad-cluster.ini" ]; then
log_error "找不到 Nomad 集群配置文件"
exit 1
fi
if [ ! -f "configuration/playbooks/applications/configure-nomad-cluster.yml" ]; then
log_error "找不到 Nomad 配置 playbook"
exit 1
fi
log_success "前置条件检查完成"
}
# 生成加密密钥
generate_encrypt_key() {
log_info "生成 Nomad 加密密钥..."
if command -v nomad >/dev/null 2>&1; then
ENCRYPT_KEY=$(nomad operator gossip keyring generate)
log_success "生成加密密钥: $ENCRYPT_KEY"
# 更新配置文件中的加密密钥
sed -i "s|YOUR_NOMAD_ENCRYPT_KEY_HERE|$ENCRYPT_KEY|g" configuration/inventories/production/nomad-cluster.ini
log_success "已更新配置文件中的加密密钥"
else
log_warning "本地未安装 Nomad将在远程节点生成密钥"
fi
}
# 测试连接
test_connectivity() {
log_info "测试目标主机连接性..."
ansible -i configuration/inventories/production/nomad-cluster.ini nomad_cluster -m ping
if [ $? -eq 0 ]; then
log_success "所有主机连接正常"
else
log_error "部分主机连接失败请检查网络和SSH配置"
exit 1
fi
}
# 配置 Nomad 集群
configure_cluster() {
log_info "开始配置 Nomad 集群..."
ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \
configuration/playbooks/applications/configure-nomad-cluster.yml \
-v
if [ $? -eq 0 ]; then
log_success "Nomad 集群配置完成"
else
log_error "Nomad 集群配置失败"
exit 1
fi
}
# 验证集群状态
verify_cluster() {
log_info "验证集群状态..."
# 等待服务启动
sleep 10
log_info "检查 Nomad 服务状态..."
ansible -i configuration/inventories/production/nomad-cluster.ini nomad_servers \
-m shell -a "systemctl status nomad --no-pager"
log_info "检查集群成员..."
ansible -i configuration/inventories/production/nomad-cluster.ini nomad_servers \
-m shell -a "nomad server members" --limit 1
log_info "检查节点状态..."
ansible -i configuration/inventories/production/nomad-cluster.ini nomad_servers \
-m shell -a "nomad node status" --limit 1
}
# 主函数
main() {
echo "🚀 开始配置 Nomad 集群..."
echo "=================================="
check_prerequisites
generate_encrypt_key
test_connectivity
configure_cluster
verify_cluster
echo "=================================="
log_success "Nomad 集群配置完成!"
echo ""
echo "访问 Nomad UI:"
echo "- Master: http://100.117.106.136:4646"
echo "- Semaphore: http://100.116.158.95:4646"
echo ""
echo "常用命令:"
echo "- 查看集群状态: nomad server members"
echo "- 查看节点状态: nomad node status"
echo "- 运行作业: nomad job run <job-file>"
}
# 运行主函数
main "$@"

View File

@@ -0,0 +1,104 @@
#!/bin/bash
# Consul 集群部署脚本
# 使用 Ansible 在物理机上部署 Consul 集群
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
INVENTORY_FILE="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini"
PLAYBOOK_FILE="$PROJECT_ROOT/configuration/playbooks/applications/consul-cluster.yml"
echo "=== Consul 集群部署脚本 ==="
echo "项目根目录: $PROJECT_ROOT"
echo "清单文件: $INVENTORY_FILE"
echo "Playbook: $PLAYBOOK_FILE"
echo
# 检查必要文件
if [[ ! -f "$INVENTORY_FILE" ]]; then
echo "错误: 清单文件不存在: $INVENTORY_FILE"
exit 1
fi
if [[ ! -f "$PLAYBOOK_FILE" ]]; then
echo "错误: Playbook 文件不存在: $PLAYBOOK_FILE"
exit 1
fi
# 生成 Consul 加密密钥(如果需要)
echo "1. 检查 Consul 加密密钥..."
if grep -q "YOUR_BASE64_ENCRYPT_KEY_HERE" "$INVENTORY_FILE"; then
echo "需要生成 Consul 加密密钥..."
# 尝试使用已安装的 consul 生成密钥
if command -v consul &> /dev/null; then
ENCRYPT_KEY=$(consul keygen)
echo "生成的加密密钥: $ENCRYPT_KEY"
# 替换清单文件中的占位符
sed -i "s/YOUR_BASE64_ENCRYPT_KEY_HERE/$ENCRYPT_KEY/" "$INVENTORY_FILE"
echo "已更新清单文件中的加密密钥"
else
echo "警告: 未找到 consul 命令,请手动生成加密密钥并更新清单文件"
echo "可以使用以下命令生成: consul keygen"
echo "或者使用在线工具生成 32 字节的 base64 编码密钥"
fi
fi
# 测试连接
echo
echo "2. 测试目标主机连接..."
ansible -i "$INVENTORY_FILE" consul_cluster -m ping
if [[ $? -ne 0 ]]; then
echo "错误: 无法连接到目标主机,请检查清单文件中的连接信息"
exit 1
fi
# 显示部署信息
echo
echo "3. 部署信息:"
echo "目标主机:"
ansible -i "$INVENTORY_FILE" consul_cluster --list-hosts
echo
echo "Consul 版本: $(grep consul_version "$INVENTORY_FILE" | cut -d'=' -f2)"
echo "数据中心: $(grep consul_datacenter "$INVENTORY_FILE" | cut -d'=' -f2)"
# 确认部署
echo
read -p "确认部署 Consul 集群到上述主机? (y/N): " confirm
if [[ $confirm != "y" && $confirm != "Y" ]]; then
echo "部署已取消"
exit 0
fi
# 执行部署
echo
echo "4. 开始部署 Consul 集群..."
ansible-playbook -i "$INVENTORY_FILE" "$PLAYBOOK_FILE" -v
if [[ $? -eq 0 ]]; then
echo
echo "=== 部署完成 ==="
echo
echo "验证集群状态:"
echo "1. 检查服务状态:"
echo " ansible -i $INVENTORY_FILE consul_cluster -m shell -a 'systemctl status consul'"
echo
echo "2. 检查集群成员:"
echo " ansible -i $INVENTORY_FILE consul_cluster -m shell -a 'consul members'"
echo
echo "3. 访问 Web UI:"
echo " - Master: http://master:8500"
echo " - Ash3c: http://ash3c:8500"
echo
echo "4. 检查集群领导者:"
echo " curl http://master:8500/v1/status/leader"
echo
else
echo "部署失败,请检查错误信息"
exit 1
fi

View File

@@ -0,0 +1,132 @@
#!/bin/bash
# Consul Cluster Simple Deployment Script
# 简化版 Consul 集群部署脚本
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# 日志函数
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# 检查依赖
check_dependencies() {
log_info "检查依赖项..."
if ! command -v ansible-playbook &> /dev/null; then
log_error "ansible-playbook 未找到,请安装 Ansible"
exit 1
fi
if ! command -v python3 &> /dev/null; then
log_error "python3 未找到"
exit 1
fi
log_success "依赖检查完成"
}
# 检查网络连接
check_connectivity() {
log_info "检查目标主机连接性..."
local inventory_file="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini"
if [[ ! -f "$inventory_file" ]]; then
log_error "清单文件不存在: $inventory_file"
exit 1
fi
# 测试连接
if ansible consul_cluster -i "$inventory_file" -m ping --one-line; then
log_success "所有主机连接正常"
else
log_warning "部分主机连接失败,但继续部署..."
fi
}
# 部署 Consul 集群
deploy_consul() {
log_info "开始部署 Consul 集群..."
local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/consul-cluster-simple.yml"
local inventory_file="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini"
if [[ ! -f "$playbook_file" ]]; then
log_error "Playbook 文件不存在: $playbook_file"
exit 1
fi
# 运行 Ansible playbook
if ansible-playbook -i "$inventory_file" "$playbook_file" -v; then
log_success "Consul 集群部署完成"
else
log_error "Consul 集群部署失败"
exit 1
fi
}
# 验证集群状态
verify_cluster() {
log_info "验证 Consul 集群状态..."
local inventory_file="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini"
# 检查服务状态
log_info "检查 Consul 服务状态..."
ansible consul_cluster -i "$inventory_file" -m shell -a "systemctl status consul --no-pager" || true
# 检查集群成员
log_info "检查集群成员..."
    ansible consul_cluster -i "$inventory_file" -m shell -a "/usr/local/bin/consul members" -l "$(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1 | awk '{print $1}')" || true
# 检查领导者
log_info "检查集群领导者..."
    ansible consul_cluster -i "$inventory_file" -m shell -a "/usr/local/bin/consul operator raft list-peers" -l "$(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1 | awk '{print $1}')" || true
}
# 主函数
main() {
log_info "开始 Consul 集群简化部署..."
check_dependencies
check_connectivity
deploy_consul
verify_cluster
log_success "Consul 集群部署流程完成!"
echo ""
log_info "后续步骤:"
echo "1. 检查集群状态: consul members"
echo "2. 访问 Web UI: http://<node-ip>:8500"
echo "3. 检查日志: journalctl -u consul -f"
}
# 脚本入口
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@@ -0,0 +1,146 @@
#!/bin/bash
# Nomad Cluster Deployment Script
# Nomad 集群部署脚本
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# 日志函数
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# 检查依赖
check_dependencies() {
log_info "检查依赖项..."
if ! command -v ansible-playbook &> /dev/null; then
log_error "ansible-playbook 未找到,请安装 Ansible"
exit 1
fi
log_success "依赖检查完成"
}
# 检查网络连接
check_connectivity() {
log_info "检查目标主机连接性..."
local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
if [[ ! -f "$inventory_file" ]]; then
log_error "清单文件不存在: $inventory_file"
exit 1
fi
# 测试连接
if ansible nomad_cluster -i "$inventory_file" -m ping --one-line; then
log_success "所有主机连接正常"
else
log_warning "部分主机连接失败,但继续部署..."
fi
}
# 部署 Nomad 集群
deploy_nomad() {
log_info "开始部署 Nomad 集群..."
local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/nomad-cluster.yml"
local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
if [[ ! -f "$playbook_file" ]]; then
log_error "Playbook 文件不存在: $playbook_file"
exit 1
fi
# 运行 Ansible playbook
if ansible-playbook -i "$inventory_file" "$playbook_file" -v; then
log_success "Nomad 集群部署完成"
else
log_error "Nomad 集群部署失败"
exit 1
fi
}
# 验证集群状态
verify_cluster() {
log_info "验证 Nomad 集群状态..."
local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
# 检查服务状态
log_info "检查 Nomad 服务状态..."
ansible nomad_cluster -i "$inventory_file" -m shell -a "systemctl status nomad --no-pager" || true
# 检查集群成员
log_info "检查集群服务器..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad server members" -l "$(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1 | awk '{print $1}')" || true
# 检查节点状态
log_info "检查节点状态..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad node status" -l "$(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1 | awk '{print $1}')" || true
# 显示集群信息
log_info "集群信息..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "/usr/local/bin/nomad status" -l "$(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1 | awk '{print $1}')" || true
}
# 显示访问信息
show_access_info() {
log_info "Nomad 集群访问信息:"
echo ""
echo "Web UI 访问地址:"
echo " - http://10.0.0.232:4646"
echo " - http://10.0.0.179:4646"
echo ""
echo "API 访问地址:"
echo " - http://10.0.0.232:4646/v1/"
echo " - http://10.0.0.179:4646/v1/"
echo ""
echo "常用命令:"
echo " - 查看集群状态: nomad status"
echo " - 查看节点: nomad node status"
echo " - 查看服务器: nomad server members"
echo " - 提交作业: nomad job run <job-file>"
echo ""
}
# 主函数
main() {
log_info "开始 Nomad 集群部署..."
check_dependencies
check_connectivity
deploy_nomad
verify_cluster
show_access_info
log_success "Nomad 集群部署流程完成!"
}
# 脚本入口
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@@ -0,0 +1,136 @@
#!/bin/bash
# Nomad Local Deployment Script
# Nomad 本地部署脚本
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# 日志函数
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# 检查依赖
check_dependencies() {
log_info "检查依赖项..."
if ! command -v ansible-playbook &> /dev/null; then
log_error "ansible-playbook 未找到,请安装 Ansible"
exit 1
fi
if ! command -v docker &> /dev/null; then
log_error "docker 未找到,请安装 Docker"
exit 1
fi
log_success "依赖检查完成"
}
# 部署 Nomad
deploy_nomad() {
log_info "开始部署 Nomad (本地单节点)..."
local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/nomad-local.yml"
if [[ ! -f "$playbook_file" ]]; then
log_error "Playbook 文件不存在: $playbook_file"
exit 1
fi
# 运行 Ansible playbook
if ansible-playbook "$playbook_file" -v; then
log_success "Nomad 本地部署完成"
else
log_error "Nomad 本地部署失败"
exit 1
fi
}
# 验证部署
verify_deployment() {
log_info "验证 Nomad 部署..."
# 等待服务启动
sleep 5
# 检查服务状态
log_info "检查 Nomad 服务状态..."
systemctl status nomad --no-pager || true
# 检查 Nomad 版本
log_info "检查 Nomad 版本..."
/usr/local/bin/nomad version || true
# 检查节点状态
log_info "检查节点状态..."
/usr/local/bin/nomad node status || true
# 检查服务器状态
log_info "检查服务器状态..."
/usr/local/bin/nomad server members || true
}
# 显示访问信息
show_access_info() {
local current_ip=$(hostname -I | awk '{print $1}')
log_info "Nomad 访问信息:"
echo ""
echo "Web UI 访问地址:"
echo " - http://localhost:4646"
echo " - http://${current_ip}:4646"
echo ""
echo "API 访问地址:"
echo " - http://localhost:4646/v1/"
echo " - http://${current_ip}:4646/v1/"
echo ""
echo "常用命令:"
echo " - 查看集群状态: nomad status"
echo " - 查看节点: nomad node status"
echo " - 查看服务器: nomad server members"
echo " - 提交作业: nomad job run <job-file>"
echo ""
echo "示例作业文件位置:"
echo " - $PROJECT_ROOT/examples/nomad-jobs/"
echo ""
}
# 主函数
main() {
log_info "开始 Nomad 本地部署..."
check_dependencies
deploy_nomad
verify_deployment
show_access_info
log_success "Nomad 本地部署流程完成!"
}
# 脚本入口
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@@ -0,0 +1,149 @@
#!/bin/bash
# Install Nomad Cluster via APT
# 通过 APT 安装 Nomad 集群
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# 日志函数
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# 检查依赖
check_dependencies() {
log_info "检查依赖项..."
if ! command -v ansible-playbook &> /dev/null; then
log_error "ansible-playbook 未找到,请安装 Ansible"
exit 1
fi
log_success "依赖检查完成"
}
# 检查网络连接
check_connectivity() {
log_info "检查目标主机连接性..."
local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
if [[ ! -f "$inventory_file" ]]; then
log_error "清单文件不存在: $inventory_file"
exit 1
fi
# 测试连接
if ansible nomad_servers -i "$inventory_file" -m ping --one-line; then
log_success "所有主机连接正常"
else
log_warning "部分主机连接失败,但继续安装..."
fi
}
# 安装 Nomad
install_nomad() {
log_info "开始在远程主机安装 Nomad..."
local playbook_file="$PROJECT_ROOT/configuration/playbooks/applications/install-nomad-apt.yml"
local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
if [[ ! -f "$playbook_file" ]]; then
log_error "Playbook 文件不存在: $playbook_file"
exit 1
fi
# 运行 Ansible playbook
if ansible-playbook -i "$inventory_file" "$playbook_file" -v; then
log_success "Nomad 集群安装完成"
else
log_error "Nomad 集群安装失败"
exit 1
fi
}
# 验证安装
verify_installation() {
log_info "验证 Nomad 安装..."
local inventory_file="$PROJECT_ROOT/configuration/inventories/production/nomad-cluster.ini"
# 检查服务状态
log_info "检查 Nomad 服务状态..."
ansible nomad_servers -i "$inventory_file" -m shell -a "systemctl status nomad --no-pager" || true
# 检查 Nomad 版本
log_info "检查 Nomad 版本..."
ansible nomad_servers -i "$inventory_file" -m shell -a "nomad version" || true
# 检查集群成员
log_info "检查集群服务器..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "nomad server members" -l "$(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1 | awk '{print $1}')" || true
# 检查节点状态
log_info "检查节点状态..."
    ansible nomad_servers -i "$inventory_file" -m shell -a "nomad node status" -l "$(grep -v '^\[' "$inventory_file" | grep -v '^$' | head -n1 | awk '{print $1}')" || true
}
# 显示访问信息
show_access_info() {
log_info "Nomad 集群访问信息:"
echo ""
echo "Web UI 访问地址:"
echo " - http://100.117.106.136:4646 (master)"
echo " - http://100.116.158.95:4646 (semaphore)"
echo ""
echo "API 访问地址:"
echo " - http://100.117.106.136:4646/v1/ (master)"
echo " - http://100.116.158.95:4646/v1/ (semaphore)"
echo ""
echo "常用命令:"
echo " - 查看集群状态: nomad status"
echo " - 查看节点: nomad node status"
echo " - 查看服务器: nomad server members"
echo " - 提交作业: nomad job run <job-file>"
echo ""
echo "示例作业文件位置:"
echo " - $PROJECT_ROOT/examples/nomad-jobs/"
echo ""
}
# 主函数
main() {
log_info "开始 Nomad 集群安装..."
check_dependencies
check_connectivity
install_nomad
verify_installation
show_access_info
log_success "Nomad 集群安装流程完成!"
}
# 脚本入口
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@@ -0,0 +1,375 @@
---
# ☢️ NUCLEAR NOMAD RESET ☢️
# 这是比终极还要强的修复脚本
# 警告:这将完全摧毁并重建 Nomad 集群
- name: "☢️ NUCLEAR NOMAD RESET - 核弹级集群重置 ☢️"
hosts: nomad_cluster
become: yes
gather_facts: yes
serial: 1 # 一次处理一个节点,避免同时炸掉所有节点
vars:
nomad_version: "1.10.5"
nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
tailscale_ips:
semaphore: "100.116.158.95"
master: "100.117.106.136"
ash3c: "100.116.80.94"
tasks:
- name: "🚨 警告:即将进行核弹级重置"
debug:
msg: |
☢️☢️☢️ 警告:即将对 {{ inventory_hostname }} 进行核弹级重置 ☢️☢️☢️
这将完全摧毁所有 Nomad 相关的数据、配置和进程!
如果你不确定,请立即按 Ctrl+C 取消!
- name: "⏰ 等待 10 秒,给你最后的机会取消..."
pause:
seconds: 10
# ========== 第一阶段:核弹级清理 ==========
- name: "💀 第一阶段:核弹级进程清理"
debug:
msg: "开始核弹级进程清理..."
- name: "🔥 停止 Nomad 服务(如果存在)"
systemd:
name: nomad
state: stopped
enabled: no
daemon_reload: yes
ignore_errors: yes
- name: "💣 强制杀死所有 Nomad 相关进程"
shell: |
# 杀死所有 nomad 进程
pkill -9 -f nomad || true
# 杀死所有可能的子进程
pkill -9 -f "nomad agent" || true
pkill -9 -f "nomad server" || true
pkill -9 -f "nomad client" || true
# 等待进程完全死亡
sleep 5
# 再次确认杀死
ps aux | grep nomad | grep -v grep | awk '{print $2}' | xargs -r kill -9 || true
ignore_errors: yes
- name: "🧹 清理所有 Nomad 相关文件和目录"
file:
path: "{{ item }}"
state: absent
loop:
- /opt/nomad
- /etc/nomad.d
- /var/log/nomad
- /etc/systemd/system/nomad.service
- /usr/local/bin/nomad
- /usr/bin/nomad
- /tmp/nomad*
- /var/lib/nomad
- /run/nomad
- /var/run/nomad.pid
ignore_errors: yes
- name: "🔧 清理 systemd 缓存"
systemd:
daemon_reload: yes
# ========== 第二阶段:重新安装 Nomad ==========
- name: "🚀 第二阶段:重新安装 Nomad"
debug:
msg: "开始重新安装 Nomad..."
- name: "🔑 添加 HashiCorp GPG 密钥"
apt_key:
url: https://apt.releases.hashicorp.com/gpg
state: present
- name: "📦 添加 HashiCorp APT 仓库"
apt_repository:
repo: "deb [arch={{ ansible_architecture }}] https://apt.releases.hashicorp.com {{ ansible_distribution_release }} main"
state: present
update_cache: yes
- name: "🔧 安装 Nomad自动检测架构"
apt:
name: "nomad={{ nomad_version }}-1"
state: present
update_cache: yes
- name: "👤 创建 nomad 用户和组"
group:
name: nomad
state: present
- name: "👤 创建 nomad 用户"
user:
name: nomad
group: nomad
system: yes
shell: /bin/false
home: /opt/nomad
create_home: no
- name: "📁 创建全新的目录结构"
file:
path: "{{ item.path }}"
state: directory
owner: "{{ item.owner | default('nomad') }}"
group: "{{ item.group | default('nomad') }}"
mode: "{{ item.mode | default('0755') }}"
loop:
- { path: "/etc/nomad.d", mode: "0755" }
- { path: "/opt/nomad", mode: "0755" }
- { path: "/opt/nomad/data", mode: "0755" }
- { path: "/opt/nomad/alloc_mounts", mode: "0755" }
- { path: "/var/log/nomad", mode: "0755" }
# ========== 第三阶段:网络和防火墙检查 ==========
- name: "🌐 第三阶段:网络配置验证"
debug:
msg: "验证网络配置..."
- name: "🔍 检查 Tailscale IP 是否正确绑定"
shell: |
ip addr show | grep "{{ tailscale_ips[inventory_hostname] }}" || echo "IP_NOT_FOUND"
register: ip_check
- name: "⚠️ IP 地址检查结果"
debug:
msg: |
节点: {{ inventory_hostname }}
期望 IP: {{ tailscale_ips[inventory_hostname] }}
检查结果: {{ ip_check.stdout }}
{% if 'IP_NOT_FOUND' in ip_check.stdout %}
❌ 警告IP 地址未正确绑定!
{% else %}
✅ IP 地址检查通过
{% endif %}
- name: "🔥 确保防火墙端口开放"
shell: |
# 检查并开放 Nomad 端口
if command -v ufw >/dev/null 2>&1; then
ufw allow 4646/tcp # HTTP API
ufw allow 4647/tcp # RPC
ufw allow 4648/tcp # Serf
elif command -v firewall-cmd >/dev/null 2>&1; then
firewall-cmd --permanent --add-port=4646/tcp
firewall-cmd --permanent --add-port=4647/tcp
firewall-cmd --permanent --add-port=4648/tcp
firewall-cmd --reload
fi
ignore_errors: yes
# ========== 第四阶段:创建超强配置 ==========
- name: "⚙️ 第四阶段:创建超强配置文件"
debug:
msg: "创建超强配置文件..."
- name: "📝 创建核弹级 Nomad 配置"
copy:
content: |
# ☢️ 核弹级 Nomad 配置 - {{ inventory_hostname }}
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
# 使用正确的 Tailscale IP
bind_addr = "{{ tailscale_ips[inventory_hostname] }}"
# 日志配置
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
log_rotate_duration = "24h"
log_rotate_max_files = 5
server {
enabled = true
bootstrap_expect = 3
encrypt = "{{ nomad_encrypt_key }}"
# 更激进的重试配置
server_join {
retry_join = [
"{{ tailscale_ips.semaphore }}:4647",
"{{ tailscale_ips.master }}:4647",
"{{ tailscale_ips.ash3c }}:4647"
]
retry_max = 10
retry_interval = "15s"
}
# 更宽松的心跳配置
heartbeat_grace = "30s"
min_heartbeat_ttl = "10s"
max_heartbeats_per_second = 50.0
# Raft 配置优化
raft_protocol = 3
raft_multiplier = 1
}
client {
enabled = true
# 网络接口配置
network_interface = "tailscale0"
# 更宽松的心跳配置
max_kill_timeout = "30s"
# 主机卷配置
host_volume "docker-sock" {
path = "/var/run/docker.sock"
read_only = false
}
}
# 地址和端口配置
addresses {
http = "0.0.0.0"
rpc = "{{ tailscale_ips[inventory_hostname] }}"
serf = "{{ tailscale_ips[inventory_hostname] }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
# Docker 插件配置
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
# 更宽松的资源限制
gc {
image = true
image_delay = "10m"
container = true
dangling_containers {
enabled = true
dry_run = false
period = "5m"
creation_grace = "5m"
}
}
}
}
# 遥测配置
telemetry {
collection_interval = "10s"
disable_hostname = false
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}
dest: "/etc/nomad.d/nomad.hcl"
owner: nomad
group: nomad
mode: '0640'
# ========== 第五阶段:创建超强 systemd 服务 ==========
- name: "🔧 创建超强 systemd 服务文件"
copy:
content: |
[Unit]
Description=Nomad - Nuclear Edition
Documentation=https://www.nomadproject.io/
Wants=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
[Service]
Type=notify
User=nomad
Group=nomad
ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=always
RestartSec=10
LimitNOFILE=65536
# 更强的重启策略
StartLimitInterval=0
StartLimitBurst=10
# 环境变量
Environment=NOMAD_DISABLE_UPDATE_CHECK=1
[Install]
WantedBy=multi-user.target
dest: "/etc/systemd/system/nomad.service"
owner: root
group: root
mode: '0644'
- name: "🔄 重新加载 systemd"
systemd:
daemon_reload: yes
# ========== 第六阶段:启动和验证 ==========
- name: "🚀 第六阶段:启动服务"
debug:
msg: "启动 Nomad 服务..."
- name: "🔥 启用并启动 Nomad 服务"
systemd:
name: nomad
enabled: yes
state: started
daemon_reload: yes
- name: "⏰ 等待服务启动"
pause:
seconds: 15
- name: "🔍 验证服务状态"
systemd:
name: nomad
register: nomad_service_status
- name: "📊 显示服务状态"
debug:
msg: |
☢️ 核弹级重置完成!
节点: {{ inventory_hostname }}
服务状态: {{ nomad_service_status.status.ActiveState }}
IP 地址: {{ tailscale_ips[inventory_hostname] }}
{% if nomad_service_status.status.ActiveState == 'active' %}
✅ 服务启动成功!
{% else %}
❌ 服务启动失败,请检查日志!
{% endif %}
- name: "🧹 清理临时文件"
file:
path: "{{ item }}"
state: absent
loop:
- "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
- "/tmp/nomad"
ignore_errors: yes
- name: "🎉 核弹级重置完成通知"
debug:
msg: |
☢️☢️☢️ 核弹级重置完成!☢️☢️☢️
节点 {{ inventory_hostname }} 已经被完全摧毁并重建!
下一步:
1. 等待所有节点完成重置
2. 检查集群状态nomad server members
3. 检查节点状态nomad node status
4. 如果还有问题,那就真的没救了... 😅
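As a quick sanity check once this playbook has run on all three nodes, something like the following could be executed on any server node (a sketch, not part of the playbook; it assumes the nomad CLI is installed and the Tailscale IPs above):
# Confirm that all three servers joined and a leader was elected
nomad server members
nomad operator raft list-peers
# The HTTP API should return the leader's RPC address
curl -s http://100.116.158.95:4646/v1/status/leader
# Tail the local service logs if anything looks off
journalctl -u nomad -n 50 --no-pager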

View File

@ -0,0 +1,37 @@
#!/bin/bash
echo "=== Nomad 集群状态检查 ==="
# Check the service status on every node
echo "1. 检查服务状态..."
ansible nomad_cluster -i /root/mgmt/configuration/inventories/production/nomad-cluster.ini -m shell -a "systemctl is-active nomad" 2>/dev/null
echo -e "\n2. 检查网络连通性..."
# Check network connectivity
for ip in 100.116.158.95 100.117.106.136 100.116.80.94; do
echo "检查到 $ip 的连接..."
timeout 5 nc -zv $ip 4646 2>&1 | grep -E "(succeeded|open)"
timeout 5 nc -zv $ip 4647 2>&1 | grep -E "(succeeded|open)"
timeout 5 nc -zv $ip 4648 2>&1 | grep -E "(succeeded|open)"
done
echo -e "\n3. 检查 Nomad 集群成员..."
# Try to query the cluster members
if nomad server members 2>/dev/null; then
echo "集群成员查询成功"
else
echo "无法查询集群成员 - 可能没有 leader"
fi
echo -e "\n4. 检查节点状态..."
if nomad node status 2>/dev/null; then
echo "节点状态查询成功"
else
echo "无法查询节点状态"
fi
echo -e "\n5. 检查最近的日志..."
echo "=== Semaphore 节点日志 ==="
journalctl -u nomad -n 5 --no-pager 2>/dev/null | tail -5
echo -e "\n=== 检查完成 ==="

View File

@ -0,0 +1,189 @@
---
- name: Complete Nomad Cluster Fix with Ansible
hosts: nomad_cluster
become: yes
gather_facts: yes
vars:
nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
tailscale_ips:
semaphore: "100.116.158.95"
master: "100.117.106.136"
ash3c: "100.116.80.94"
tasks:
- name: Stop nomad service completely
systemd:
name: nomad
state: stopped
enabled: yes
ignore_errors: yes
- name: Kill any remaining nomad processes
shell: pkill -f nomad || true
ignore_errors: yes
- name: Reset systemd failure state
shell: systemctl reset-failed nomad
ignore_errors: yes
- name: Create nomad user if not exists
user:
name: nomad
system: yes
shell: /bin/false
home: /opt/nomad
create_home: no
- name: Create all required directories with correct permissions
file:
path: "{{ item }}"
state: directory
owner: nomad
group: nomad
mode: '0755'
loop:
- /opt/nomad
- /opt/nomad/data
- /opt/nomad/alloc_mounts
- /var/log/nomad
- /etc/nomad.d
- name: Completely clean nomad data directory
shell: rm -rf /opt/nomad/data/* /opt/nomad/data/.*
ignore_errors: yes
- name: Create correct nomad configuration
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "{{ tailscale_ips[inventory_hostname] }}"
server {
enabled = true
bootstrap_expect = 3
encrypt = "{{ nomad_encrypt_key }}"
server_join {
retry_join = [
"{{ tailscale_ips.semaphore }}:4647",
"{{ tailscale_ips.master }}:4647",
"{{ tailscale_ips.ash3c }}:4647"
]
retry_interval = "15s"
retry_max = 3
}
}
client {
enabled = true
alloc_dir = "/opt/nomad/alloc_mounts"
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ tailscale_ips[inventory_hostname] }}"
serf = "{{ tailscale_ips[inventory_hostname] }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
}
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
log_rotate_duration = "24h"
log_rotate_max_files = 5
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Set correct ownership for all nomad files
file:
path: "{{ item }}"
owner: nomad
group: nomad
recurse: yes
loop:
- /opt/nomad
- /var/log/nomad
- /etc/nomad.d
- name: Validate nomad configuration
shell: nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
ignore_errors: yes
- name: Show config validation result
debug:
var: config_validation
- name: Start nomad service on first node (semaphore)
systemd:
name: nomad
state: started
daemon_reload: yes
when: inventory_hostname == 'semaphore'
- name: Wait for first node to start
pause:
seconds: 30
when: inventory_hostname == 'semaphore'
- name: Start nomad service on remaining nodes
systemd:
name: nomad
state: started
daemon_reload: yes
when: inventory_hostname != 'semaphore'
- name: Wait for all services to start
pause:
seconds: 20
- name: Check nomad service status
shell: systemctl status nomad --no-pager -l
register: service_status
ignore_errors: yes
- name: Show service status
debug:
var: service_status.stdout_lines
- name: Check nomad logs for errors
shell: journalctl -u nomad -n 10 --no-pager
register: nomad_logs
ignore_errors: yes
- name: Show recent nomad logs
debug:
var: nomad_logs.stdout_lines
- name: Test nomad connectivity
shell: nomad server members
register: nomad_members
ignore_errors: yes
when: inventory_hostname == 'semaphore'
- name: Show cluster members
debug:
var: nomad_members.stdout_lines
when: inventory_hostname == 'semaphore'
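One way to run this play against the production inventory from this commit; the playbook file name below is illustrative, not confirmed by the repo:
cd /root/mgmt/configuration
ansible-playbook \
-i inventories/production/nomad-cluster.ini \
../scripts/utilities/complete-nomad-cluster-fix.yml # file name assumed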

View File

@ -0,0 +1,151 @@
---
- name: Complete Nomad Cluster Reset and Rebuild
hosts: nomad_cluster
become: yes
serial: 1 # process one node at a time
vars:
nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
tailscale_ips:
semaphore: "100.116.158.95"
master: "100.117.106.136"
ash3c: "100.116.80.94"
tasks:
- name: Stop nomad service completely
systemd:
name: nomad
state: stopped
ignore_errors: yes
- name: Kill any remaining nomad processes
shell: pkill -f nomad || true
ignore_errors: yes
- name: Remove all nomad data and state
shell: |
rm -rf /opt/nomad/data/*
rm -rf /opt/nomad/data/.*
rm -rf /var/log/nomad/*
ignore_errors: yes
- name: Create fresh nomad configuration with correct Tailscale IPs
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
# Use the Tailscale IP address
bind_addr = "{{ tailscale_ips[inventory_hostname] }}"
server {
enabled = true
bootstrap_expect = 3
encrypt = "{{ nomad_encrypt_key }}"
server_join {
retry_join = [
"{{ tailscale_ips.semaphore }}",
"{{ tailscale_ips.master }}",
"{{ tailscale_ips.ash3c }}"
]
}
}
client {
enabled = true
network_interface = "tailscale0"
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ tailscale_ips[inventory_hostname] }}"
serf = "{{ tailscale_ips[inventory_hostname] }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
}
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Ensure log directory exists
file:
path: /var/log/nomad
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Start nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for nomad to start
wait_for:
port: 4646
host: "{{ tailscale_ips[inventory_hostname] }}"
delay: 5
timeout: 30
- name: Check nomad service status
shell: systemctl status nomad --no-pager -l
register: nomad_status
ignore_errors: yes
- name: Display nomad status
debug:
var: nomad_status.stdout_lines
- name: Wait for cluster to form
hosts: localhost
gather_facts: no
tasks:
- name: Wait for cluster formation
pause:
seconds: 30
prompt: "等待集群形成..."
- name: Verify cluster status
hosts: semaphore
become: yes
tasks:
- name: Check cluster members
shell: nomad server members
register: cluster_members
ignore_errors: yes
- name: Display cluster members
debug:
var: cluster_members.stdout_lines
- name: Check node status
shell: nomad node status
register: node_status
ignore_errors: yes
- name: Display node status
debug:
var: node_status.stdout_lines

View File

@ -0,0 +1,233 @@
#!/bin/bash
# Consul cluster management script
# Provides cluster status checks, restart, stop, and related operations
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
INVENTORY_FILE="$PROJECT_ROOT/configuration/inventories/production/consul-cluster.ini"
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# 打印带颜色的消息
print_status() {
echo -e "${GREEN}[INFO]${NC} $1"
}
print_warning() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
print_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
print_header() {
echo -e "${BLUE}=== $1 ===${NC}"
}
# 检查必要文件
check_prerequisites() {
if [[ ! -f "$INVENTORY_FILE" ]]; then
print_error "清单文件不存在: $INVENTORY_FILE"
exit 1
fi
if ! command -v ansible &> /dev/null; then
print_error "未找到 ansible 命令"
exit 1
fi
}
# 显示帮助信息
show_help() {
echo "Consul 集群管理脚本"
echo
echo "用法: $0 [命令]"
echo
echo "命令:"
echo " status - 检查集群状态"
echo " members - 显示集群成员"
echo " leader - 显示集群领导者"
echo " restart - 重启 Consul 服务"
echo " stop - 停止 Consul 服务"
echo " start - 启动 Consul 服务"
echo " logs - 查看服务日志"
echo " health - 健康检查"
echo " cleanup - 清理 Consul 数据(危险操作)"
echo " help - 显示此帮助信息"
echo
}
# 检查集群状态
check_status() {
print_header "Consul 服务状态"
ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "systemctl is-active consul" -o
echo
print_header "Consul 进程状态"
ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "ps aux | grep consul | grep -v grep" -o
}
# 显示集群成员
show_members() {
print_header "Consul 集群成员"
ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul members" -o
}
# 显示集群领导者
show_leader() {
print_header "Consul 集群领导者"
ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul operator raft list-peers" -o
echo
print_header "通过 API 检查领导者"
ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "curl -s http://localhost:8500/v1/status/leader" -o
}
# 重启服务
restart_service() {
print_header "重启 Consul 服务"
print_warning "即将重启所有 Consul 节点..."
read -p "确认继续? (y/N): " confirm
if [[ $confirm != "y" && $confirm != "Y" ]]; then
print_status "操作已取消"
return
fi
ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=restarted" -b
print_status "等待服务启动..."
sleep 10
check_status
}
# 停止服务
stop_service() {
print_header "停止 Consul 服务"
print_warning "即将停止所有 Consul 节点..."
read -p "确认继续? (y/N): " confirm
if [[ $confirm != "y" && $confirm != "Y" ]]; then
print_status "操作已取消"
return
fi
ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=stopped" -b
}
# 启动服务
start_service() {
print_header "启动 Consul 服务"
ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=started" -b
print_status "等待服务启动..."
sleep 10
check_status
}
# 查看日志
show_logs() {
print_header "Consul 服务日志"
ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "journalctl -u consul --no-pager -n 20" -o
}
# 健康检查
health_check() {
print_header "Consul 健康检查"
# 检查服务状态
print_status "检查服务状态..."
ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "systemctl is-active consul" -o
echo
# 检查端口监听
print_status "检查端口监听..."
ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "ss -tlnp | grep :8500" -o
echo
# 检查集群成员
print_status "检查集群成员..."
ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "consul members | wc -l" -o
echo
# 检查 API 响应
print_status "检查 API 响应..."
ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "curl -s -o /dev/null -w '%{http_code}' http://localhost:8500/v1/status/leader" -o
}
# 清理数据(危险操作)
cleanup_data() {
print_header "清理 Consul 数据"
print_error "警告: 此操作将删除所有 Consul 数据包括服务注册、KV 存储等!"
print_error "此操作不可逆!"
echo
read -p "确认要清理所有数据? 请输入 'YES' 确认: " confirm
if [[ $confirm != "YES" ]]; then
print_status "操作已取消"
return
fi
print_status "停止 Consul 服务..."
ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=stopped" -b
print_status "清理数据目录..."
ansible -i "$INVENTORY_FILE" consul_cluster -m shell -a "rm -rf /opt/consul/data/*" -b
print_status "启动 Consul 服务..."
ansible -i "$INVENTORY_FILE" consul_cluster -m systemd -a "name=consul state=started" -b
print_status "数据清理完成"
}
# 主函数
main() {
check_prerequisites
case "${1:-help}" in
status)
check_status
;;
members)
show_members
;;
leader)
show_leader
;;
restart)
restart_service
;;
stop)
stop_service
;;
start)
start_service
;;
logs)
show_logs
;;
health)
health_check
;;
cleanup)
cleanup_data
;;
help|--help|-h)
show_help
;;
*)
print_error "未知命令: $1"
echo
show_help
exit 1
;;
esac
}
main "$@"

View File

@ -0,0 +1,115 @@
---
- name: Correct Nomad Cluster Configuration
hosts: nomad_cluster
become: yes
gather_facts: yes
vars:
nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
tailscale_ips:
semaphore: "100.116.158.95"
master: "100.117.106.136"
ash3c: "100.116.80.94"
tasks:
- name: Stop nomad service
systemd:
name: nomad
state: stopped
ignore_errors: yes
- name: Clean nomad data
file:
path: /opt/nomad/data
state: absent
- name: Recreate nomad data directory
file:
path: /opt/nomad/data
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Create correct nomad configuration
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "{{ tailscale_ips[inventory_hostname] }}"
server {
enabled = true
bootstrap_expect = 3
encrypt = "{{ nomad_encrypt_key }}"
server_join {
retry_join = [
"{{ tailscale_ips.semaphore }}:4647",
"{{ tailscale_ips.master }}:4647",
"{{ tailscale_ips.ash3c }}:4647"
]
retry_interval = "15s"
retry_max = 3
}
}
client {
enabled = true
alloc_dir = "/opt/nomad/alloc_mounts"
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ tailscale_ips[inventory_hostname] }}"
serf = "{{ tailscale_ips[inventory_hostname] }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
}
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Start nomad services in sequence
hosts: nomad_cluster
become: yes
serial: 1
vars:
# repeat the IP map: play-level vars from the previous play are not visible here
tailscale_ips:
semaphore: "100.116.158.95"
master: "100.117.106.136"
ash3c: "100.116.80.94"
tasks:
- name: Start nomad service
systemd:
name: nomad
state: started
daemon_reload: yes
- name: Wait for nomad to start
wait_for:
port: 4646
host: "{{ tailscale_ips[inventory_hostname] }}"
delay: 10
timeout: 60
- name: Wait between nodes
pause:
seconds: 30

View File

@ -0,0 +1,113 @@
---
- name: Deploy Nomad Configurations
hosts: nomad_cluster
become: yes
vars:
nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
node_ips:
semaphore: "100.116.158.95"
master: "100.117.106.136"
ash3c: "100.116.80.94"
tasks:
- name: Create nomad configuration for each node
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "{{ node_ips[inventory_hostname] }}"
server {
enabled = true
bootstrap_expect = 3
encrypt = "{{ nomad_encrypt_key }}"
server_join {
retry_join = [
"{{ node_ips.semaphore }}:4647",
"{{ node_ips.master }}:4647",
"{{ node_ips.ash3c }}:4647"
]
retry_interval = "15s"
retry_max = 3
}
}
client {
enabled = true
alloc_dir = "/opt/nomad/alloc_mounts"
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ node_ips[inventory_hostname] }}"
serf = "{{ node_ips[inventory_hostname] }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
}
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Validate nomad configuration
shell: nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
- name: Show validation result
debug:
var: config_validation.stdout_lines
- name: Start nomad service on bootstrap node first
systemd:
name: nomad
state: started
daemon_reload: yes
when: inventory_hostname == 'semaphore'
- name: Wait for bootstrap node
pause:
seconds: 15
when: inventory_hostname == 'semaphore'
- name: Start nomad service on other nodes
systemd:
name: nomad
state: started
daemon_reload: yes
when: inventory_hostname != 'semaphore'
- name: Wait for services to start
pause:
seconds: 10
- name: Check service status
shell: systemctl status nomad --no-pager
register: service_status
ignore_errors: yes
- name: Show service status
debug:
var: service_status.stdout_lines

View File

@ -0,0 +1,190 @@
---
- name: Final Complete Nomad Cluster Fix
hosts: nomad_cluster
become: yes
gather_facts: yes
vars:
nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
nomad_servers:
- "100.116.158.95:4647" # semaphore
- "100.117.106.136:4647" # master
- "100.116.80.94:4647" # ash3c
tasks:
- name: Stop nomad service
systemd:
name: nomad
state: stopped
ignore_errors: yes
- name: Reset failed nomad service
systemd:
name: nomad
daemon_reload: yes
ignore_errors: yes
- name: Create nomad user if not exists
user:
name: nomad
system: yes
shell: /bin/false
home: /opt/nomad
create_home: no
- name: Create nomad directories with correct permissions
file:
path: "{{ item }}"
state: directory
owner: nomad
group: nomad
mode: '0755'
loop:
- /etc/nomad.d
- /opt/nomad
- /opt/nomad/data
- /opt/nomad/alloc_mounts
- /var/log/nomad
- name: Clean old nomad data
file:
path: /opt/nomad/data
state: absent
- name: Recreate nomad data directory
file:
path: /opt/nomad/data
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Get Tailscale IP address
shell: ip addr show tailscale0 | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1
register: tailscale_ip
failed_when: false
- name: Set bind address (fallback to default interface if tailscale not available)
set_fact:
bind_address: "{{ tailscale_ip.stdout if tailscale_ip.stdout != '' else ansible_default_ipv4.address }}"
- name: Generate nomad configuration
template:
src: nomad-server.hcl.j2
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
vars:
nomad_datacenter: "dc1"
nomad_region: "global"
nomad_data_dir: "/opt/nomad/data"
nomad_bind_addr: "{{ bind_address }}"
nomad_bootstrap_expect: 3
nomad_encrypt: "{{ nomad_encrypt_key }}"
nomad_retry_join: "{{ nomad_servers }}"
nomad_alloc_dir: "/opt/nomad/alloc_mounts"
nomad_log_file: "/var/log/nomad/nomad.log"
- name: Create nomad systemd service
copy:
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
[Service]
Type=notify
User=nomad
Group=nomad
ExecStart=/usr/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/nomad.service
mode: '0644'
- name: Reload systemd daemon
systemd:
daemon_reload: yes
- name: Start nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for nomad to start
wait_for:
port: 4646
host: "{{ bind_address }}"
delay: 5
timeout: 30
ignore_errors: yes
- name: Create nomad configuration template
hosts: localhost
gather_facts: no
tasks:
- name: Create nomad server template
copy:
content: |
datacenter = "{{ nomad_datacenter }}"
region = "{{ nomad_region }}"
data_dir = "{{ nomad_data_dir }}"
bind_addr = "{{ nomad_bind_addr }}"
server {
enabled = true
bootstrap_expect = {{ nomad_bootstrap_expect }}
encrypt = "{{ nomad_encrypt }}"
server_join {
retry_join = {{ nomad_retry_join | to_json }}
retry_interval = "15s"
retry_max = 3
}
}
client {
enabled = true
alloc_dir = "{{ nomad_alloc_dir }}"
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ nomad_bind_addr }}"
serf = "{{ nomad_bind_addr }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
}
}
log_level = "INFO"
log_file = "{{ nomad_log_file }}"
dest: /tmp/nomad-server.hcl.j2
delegate_to: localhost
run_once: true

View File

@ -0,0 +1,111 @@
---
- name: Final Nomad Cluster Fix
hosts: nomad_cluster
become: yes
vars:
nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
tailscale_ips:
semaphore: "100.116.158.95"
master: "100.117.106.136"
ash3c: "100.116.80.94"
tasks:
- name: Stop nomad service
systemd:
name: nomad
state: stopped
ignore_errors: yes
- name: Create required directories
file:
path: "{{ item }}"
state: directory
owner: nomad
group: nomad
mode: '0755'
loop:
- /opt/nomad/data
- /opt/nomad/alloc_mounts
- /var/log/nomad
- name: Clean nomad data
shell: rm -rf /opt/nomad/data/*
ignore_errors: yes
- name: Create working nomad configuration
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "{{ tailscale_ips[inventory_hostname] }}"
server {
enabled = true
bootstrap_expect = 3
encrypt = "{{ nomad_encrypt_key }}"
server_join {
retry_join = [
"{{ tailscale_ips.semaphore }}",
"{{ tailscale_ips.master }}",
"{{ tailscale_ips.ash3c }}"
]
}
}
client {
enabled = true
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ tailscale_ips[inventory_hostname] }}"
serf = "{{ tailscale_ips[inventory_hostname] }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
}
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Start nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for service to start
pause:
seconds: 10
- name: Check service status
shell: systemctl status nomad --no-pager -l
register: service_status
ignore_errors: yes
- name: Show service status
debug:
var: service_status.stdout_lines

137
scripts/utilities/fix-ash3c-ip.sh Executable file
View File

@ -0,0 +1,137 @@
#!/bin/bash
# 🔧 ash3c IP address fix script
set -e
echo "🔧 ash3c IP 地址问题修复脚本"
echo ""
# 定义正确的 IP 地址
CORRECT_IP="100.116.80.94"
ASH3C_HOST="100.116.80.94"
echo "📡 检查 ash3c 节点的网络配置..."
# 检查 ash3c 的实际 IP 配置
echo "🔍 检查 ash3c 节点的 IP 地址绑定..."
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S ip addr show" | grep -E "inet.*100\." || echo "❌ 未找到 Tailscale IP"
echo ""
echo "🔍 检查 Tailscale 状态..."
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S tailscale status" || echo "❌ Tailscale 状态检查失败"
echo ""
echo "🔧 修复 ash3c 的 Nomad 配置..."
# 创建正确的配置文件
cat > /tmp/ash3c-nomad.hcl << EOF
# 🔧 ash3c 修复后的 Nomad 配置
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
# 强制使用正确的 Tailscale IP
bind_addr = "${CORRECT_IP}"
# 日志配置
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
server {
enabled = true
bootstrap_expect = 3
encrypt = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
server_join {
retry_join = [
"100.116.158.95:4647",
"100.117.106.136:4647",
"100.116.80.94:4647"
]
retry_max = 10
retry_interval = "15s"
}
# 更宽松的心跳配置
heartbeat_grace = "30s"
min_heartbeat_ttl = "10s"
}
client {
enabled = true
network_interface = "tailscale0"
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "${CORRECT_IP}"
serf = "${CORRECT_IP}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
}
}
EOF
echo "📤 上传修复后的配置到 ash3c..."
scp -P 22 -i ~/.ssh/id_ed25519 /tmp/ash3c-nomad.hcl ben@${ASH3C_HOST}:/tmp/
echo "🔧 在 ash3c 上应用修复..."
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} << 'REMOTE_SCRIPT'
echo '3131' | sudo -S systemctl stop nomad || true
echo '3131' | sudo -S pkill -f nomad || true
sleep 5
# 备份旧配置
echo '3131' | sudo -S cp /etc/nomad.d/nomad.hcl /etc/nomad.d/nomad.hcl.backup.$(date +%Y%m%d_%H%M%S) || true
# 应用新配置
echo '3131' | sudo -S cp /tmp/ash3c-nomad.hcl /etc/nomad.d/nomad.hcl
echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl
echo '3131' | sudo -S chmod 640 /etc/nomad.d/nomad.hcl
# 清理数据目录
echo '3131' | sudo -S rm -rf /opt/nomad/data/*
# 重启服务
echo '3131' | sudo -S systemctl daemon-reload
echo '3131' | sudo -S systemctl enable nomad
echo '3131' | sudo -S systemctl start nomad
echo "✅ ash3c 配置修复完成"
REMOTE_SCRIPT
echo ""
echo "⏰ 等待 ash3c 服务启动..."
sleep 15
echo ""
echo "🔍 检查 ash3c 服务状态..."
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_HOST} "echo '3131' | sudo -S systemctl status nomad --no-pager" || echo "❌ 服务状态检查失败"
echo ""
echo "🧹 清理临时文件..."
rm -f /tmp/ash3c-nomad.hcl
echo ""
echo "✅ ash3c IP 修复完成!"
echo ""
echo "下一步:"
echo "1. 检查集群状态: nomad server members"
echo "2. 如果还有问题,运行核弹级重置: ./scripts/utilities/nuclear-reset.sh"

View File

@ -0,0 +1,151 @@
#!/bin/bash
# Consul cluster repair script
# Fixes the "No cluster leader" problem
set -e
echo "=== Consul 集群修复脚本 ==="
echo "当前时间: $(date)"
echo
# 检查当前 Consul 服务状态
echo "1. 检查当前 Consul 服务状态..."
docker service ls | grep consul || echo "未找到 consul 服务"
echo
# 显示当前问题
echo "2. 检查 Consul 日志中的错误..."
echo "Master 节点日志:"
docker service logs consul-cluster_consul-master --tail 5 2>/dev/null || echo "无法获取 master 日志"
echo
echo "Ash3c 节点日志:"
docker service logs consul-cluster_consul-ash3c --tail 5 2>/dev/null || echo "无法获取 ash3c 日志"
echo
# 提供修复选项
echo "3. 修复选项:"
echo " a) 使用修复后的 overlay 网络配置 (推荐)"
echo " b) 使用 macvlan 网络配置"
echo " c) 仅重启现有服务"
echo
read -p "请选择修复方案 (a/b/c): " choice
case $choice in
a)
echo "使用修复后的 overlay 网络配置..."
# 停止现有服务
echo "停止现有 Consul 集群..."
docker stack rm consul-cluster 2>/dev/null || echo "consul-cluster stack 不存在"
# 等待服务完全停止
echo "等待服务完全停止..."
sleep 10
# 清理数据卷 (可选)
read -p "是否清理现有数据卷? (y/n): " clean_volumes
if [[ $clean_volumes == "y" ]]; then
docker volume rm consul-cluster_consul_master_data 2>/dev/null || true
docker volume rm consul-cluster_consul_ash3c_data 2>/dev/null || true
echo "数据卷已清理"
fi
# 部署修复后的配置
echo "部署修复后的 Consul 集群..."
docker stack deploy -c /root/mgmt/swarm/stacks/consul-cluster-fixed.yml consul-cluster
echo "等待服务启动..."
sleep 15
# 检查服务状态
echo "检查新服务状态..."
docker service ls | grep consul
;;
b)
echo "使用 macvlan 网络配置..."
echo "注意: 需要根据你的网络环境调整 IP 地址和网络接口"
# 检查网络接口
echo "当前网络接口:"
ip link show | grep -E "^[0-9]+:" | awk '{print $2}' | sed 's/://'
echo
read -p "请输入要使用的网络接口 (如 eth0): " interface
read -p "请输入子网 (如 192.168.1.0/24): " subnet
read -p "请输入网关 (如 192.168.1.1): " gateway
# 更新 macvlan 配置文件
sed -i "s/parent: eth0/parent: $interface/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml
sed -i "s/192.168.1.0\/24/$subnet/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml
sed -i "s/192.168.1.1/$gateway/" /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml
# 停止现有服务
echo "停止现有 Consul 集群..."
docker stack rm consul-cluster 2>/dev/null || echo "consul-cluster stack 不存在"
# 等待服务完全停止
echo "等待服务完全停止..."
sleep 10
# 部署 macvlan 配置
echo "部署 macvlan Consul 集群..."
docker stack deploy -c /root/mgmt/swarm/stacks/consul-cluster-macvlan.yml consul-cluster
echo "等待服务启动..."
sleep 15
# 检查服务状态
echo "检查新服务状态..."
docker service ls | grep consul
;;
c)
echo "重启现有服务..."
# 重启服务
docker service update --force consul-cluster_consul-master
docker service update --force consul-cluster_consul-ash3c
echo "等待服务重启..."
sleep 10
# 检查服务状态
echo "检查服务状态..."
docker service ls | grep consul
;;
*)
echo "无效选择,退出"
exit 1
;;
esac
echo
echo "4. 验证修复结果..."
sleep 5
# 检查服务状态
echo "服务状态:"
docker service ls | grep consul
echo
echo "等待 30 秒后检查集群状态..."
sleep 30
# 尝试检查集群成员
echo "尝试检查集群成员状态..."
timeout 10 docker service logs consul-cluster_consul-master --tail 10 2>/dev/null || echo "无法获取日志"
echo
echo "=== 修复完成 ==="
echo "请等待几分钟让集群完全启动,然后访问:"
echo "- Master UI: http://your-master-ip:8500"
echo "- Ash3c UI: http://your-ash3c-ip:8501"
echo
echo "如果问题仍然存在,请检查:"
echo "1. 节点间网络连通性"
echo "2. 防火墙设置"
echo "3. Docker Swarm 网络配置"

View File

@ -0,0 +1,26 @@
#!/bin/bash
echo "🔧 使用 HashiCorp 官方脚本修复 master 节点二进制文件..."
# 停止 nomad 服务
echo '3131' | sudo -S systemctl stop nomad || true
echo '3131' | sudo -S pkill -9 -f nomad || true
# 删除旧的二进制文件
echo '3131' | sudo -S rm -f /usr/local/bin/nomad /usr/bin/nomad
# Use the official HashiCorp apt repository (architecture auto-detected)
curl -fsSL https://apt.releases.hashicorp.com/gpg -o /tmp/hashicorp.gpg
echo '3131' | sudo -S apt-key add /tmp/hashicorp.gpg
echo '3131' | sudo -S apt-add-repository "deb [arch=$(dpkg --print-architecture)] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
echo '3131' | sudo -S apt-get update
echo '3131' | sudo -S apt-get install -y nomad=1.10.5-1
# 验证安装
nomad version
# 重启服务
echo '3131' | sudo -S systemctl daemon-reload
echo '3131' | sudo -S systemctl enable nomad
echo '3131' | sudo -S systemctl start nomad
echo "✅ Master 节点二进制文件修复完成!"

View File

@ -0,0 +1,92 @@
---
- name: Fix Nomad Cluster Issues
hosts: nomad_cluster
become: yes
vars:
nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
tasks:
- name: Stop nomad service
systemd:
name: nomad
state: stopped
ignore_errors: yes
- name: Clean nomad data directory
shell: rm -rf /opt/nomad/data/*
ignore_errors: yes
- name: Create correct nomad configuration
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "{{ ansible_host | default(hostvars[inventory_hostname]['ansible_default_ipv4']['address']) }}"
server {
enabled = true
bootstrap_expect = 3
encrypt = "{{ nomad_encrypt_key }}"
server_join {
retry_join = ["100.116.158.95", "100.117.106.136", "100.116.80.94"]
}
}
client {
enabled = true
network_interface = "{{ ansible_default_ipv4.interface | default('eth0') }}"
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "0.0.0.0"
serf = "0.0.0.0"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
}
}
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Start nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for nomad to start
wait_for:
port: 4646
host: "{{ ansible_host | default(hostvars[inventory_hostname]['ansible_default_ipv4']['address']) }}"
delay: 10
timeout: 60
- name: Check nomad status
shell: systemctl status nomad --no-pager -l
register: nomad_status
ignore_errors: yes
- name: Display nomad status
debug:
var: nomad_status.stdout_lines

View File

@ -0,0 +1,124 @@
#!/bin/bash
# 🔍 Quick Nomad cluster diagnostic script
echo "🔍 Nomad 集群快速诊断"
echo "===================="
echo ""
# 定义节点信息
declare -A NODES=(
["semaphore"]="local"
["master"]="100.117.106.136:60022"
["ash3c"]="100.116.80.94:22"
)
declare -A TAILSCALE_IPS=(
["semaphore"]="100.116.158.95"
["master"]="100.117.106.136"
["ash3c"]="100.116.80.94"
)
echo "📊 1. 本地 Nomad 服务状态"
echo "------------------------"
systemctl status nomad --no-pager | head -10 || echo "❌ 本地 Nomad 服务异常"
echo ""
echo "📊 2. 集群成员状态"
echo "----------------"
nomad server members 2>/dev/null || echo "❌ 无法获取集群成员状态"
echo ""
echo "📊 3. 节点状态"
echo "------------"
nomad node status 2>/dev/null || echo "❌ 无法获取节点状态"
echo ""
echo "🌐 4. 网络连通性测试"
echo "------------------"
for node in "${!NODES[@]}"; do
ip="${TAILSCALE_IPS[$node]}"
echo "测试 $node ($ip):"
if [[ "$node" == "semaphore" ]]; then
echo " ✅ 本地节点"
else
# Ping 测试
if ping -c 1 -W 3 "$ip" >/dev/null 2>&1; then
echo " ✅ Ping: 成功"
else
echo " ❌ Ping: 失败"
fi
# 端口测试
if timeout 5 bash -c "</dev/tcp/$ip/4647" 2>/dev/null; then
echo " ✅ RPC端口(4647): 开放"
else
echo " ❌ RPC端口(4647): 关闭"
fi
if timeout 5 bash -c "</dev/tcp/$ip/4646" 2>/dev/null; then
echo " ✅ HTTP端口(4646): 开放"
else
echo " ❌ HTTP端口(4646): 关闭"
fi
fi
echo ""
done
echo "🔧 5. 远程节点服务状态"
echo "-------------------"
for node in "${!NODES[@]}"; do
if [[ "$node" == "semaphore" ]]; then
continue
fi
connection="${NODES[$node]}"
ip=$(echo "$connection" | cut -d: -f1)
port=$(echo "$connection" | cut -d: -f2)
echo "检查 $node ($ip:$port):"
if ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S systemctl is-active nomad" 2>/dev/null; then
status=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S systemctl is-active nomad" 2>/dev/null)
echo " 服务状态: $status"
# 检查配置文件中的 bind_addr
bind_addr=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S grep 'bind_addr' /etc/nomad.d/nomad.hcl 2>/dev/null" | head -1)
echo " 配置绑定地址: $bind_addr"
# 检查实际监听端口
listening=$(ssh -p "$port" -i ~/.ssh/id_ed25519 -o ConnectTimeout=10 -o StrictHostKeyChecking=no ben@"$ip" "echo '3131' | sudo -S netstat -tlnp | grep :464" 2>/dev/null | head -3)
if [[ -n "$listening" ]]; then
echo " 监听端口:"
echo "$listening" | sed 's/^/ /'
else
echo " ❌ 未发现 Nomad 监听端口"
fi
else
echo " ❌ 无法连接或服务未运行"
fi
echo ""
done
echo "📋 6. 问题总结和建议"
echo "=================="
# 检查是否有 leader
if nomad server members 2>/dev/null | grep -q "leader"; then
echo "✅ 集群有 leader"
else
echo "❌ 集群没有 leader - 这是主要问题!"
echo ""
echo "🔧 建议的修复步骤:"
echo "1. 先尝试 ash3c IP 修复: ./scripts/utilities/fix-ash3c-ip.sh"
echo "2. 如果还不行,使用核弹级重置: ./scripts/utilities/nuclear-reset.sh"
echo "3. 检查 master 节点是否需要重启"
fi
echo ""
echo "🔗 有用的链接:"
echo " Web UI: http://100.116.158.95:4646"
echo " 日志查看: journalctl -u nomad -f"
echo ""
echo "🔍 诊断完成!"

View File

@ -0,0 +1,76 @@
#!/bin/bash
# ☢️ Nuclear-grade Nomad reset launcher ☢️
set -e
echo "☢️☢️☢️ 核弹级 Nomad 集群重置 ☢️☢️☢️"
echo ""
echo "这个脚本将:"
echo "1. 完全摧毁所有 Nomad 进程和数据"
echo "2. 重新下载并安装 Nomad 二进制文件"
echo "3. 创建全新的配置文件"
echo "4. 重新启动整个集群"
echo ""
echo "⚠️ 警告:这是不可逆的操作!⚠️"
echo ""
# 检查是否在正确的目录
if [[ ! -f "scripts/utilities/NUCLEAR-NOMAD-RESET.yml" ]]; then
echo "❌ 错误:请在 /root/mgmt 目录下运行此脚本"
exit 1
fi
# 确认操作
read -p "你确定要进行核弹级重置吗?输入 'NUCLEAR' 确认: " confirm
if [[ "$confirm" != "NUCLEAR" ]]; then
echo "❌ 操作已取消"
exit 1
fi
echo ""
echo "🚀 开始核弹级重置..."
echo ""
# 设置 Ansible 配置
export ANSIBLE_HOST_KEY_CHECKING=False
export ANSIBLE_STDOUT_CALLBACK=yaml
# 执行核弹级重置
echo "📡 执行 Ansible playbook..."
cd /root/mgmt/configuration
ansible-playbook \
-i inventories/production/nomad-cluster.ini \
../scripts/utilities/NUCLEAR-NOMAD-RESET.yml \
--extra-vars "ansible_ssh_common_args='-o StrictHostKeyChecking=no'" \
-v
echo ""
echo "⏰ 等待集群稳定..."
sleep 30
echo ""
echo "🔍 检查集群状态..."
# 检查集群成员
echo "📊 集群成员状态:"
nomad server members || echo "❌ 无法获取集群成员状态"
echo ""
echo "📊 节点状态:"
nomad node status || echo "❌ 无法获取节点状态"
echo ""
echo "🎯 如果上面显示错误,请等待几分钟后再次检查"
echo "集群可能需要一些时间来完全启动和同步"
echo ""
echo "🔧 有用的命令:"
echo " 检查集群成员: nomad server members"
echo " 检查节点状态: nomad node status"
echo " 查看日志: journalctl -u nomad -f"
echo " Web UI: http://100.116.158.95:4646"
echo ""
echo "☢️ 核弹级重置完成!☢️"

View File

@ -0,0 +1,104 @@
#!/bin/bash
echo "=== 简单的 Nomad 集群修复脚本 ==="
# Define the Tailscale IP addresses
SEMAPHORE_IP="100.116.158.95"
MASTER_IP="100.117.106.136"
ASH3C_IP="100.116.80.94"
ENCRYPT_KEY="NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
# Function to generate a node's config file
create_config() {
local node_name=$1
local bind_ip=$2
cat > /tmp/nomad-${node_name}.hcl << EOF
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "${bind_ip}"
server {
enabled = true
bootstrap_expect = 3
encrypt = "${ENCRYPT_KEY}"
server_join {
retry_join = ["${SEMAPHORE_IP}", "${MASTER_IP}", "${ASH3C_IP}"]
}
}
client {
enabled = true
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "${bind_ip}"
serf = "${bind_ip}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
}
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
EOF
}
echo "1. 停止所有 Nomad 服务..."
systemctl stop nomad
ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S systemctl stop nomad"
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S systemctl stop nomad"
echo "2. 清理数据目录..."
rm -rf /opt/nomad/data/*
ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S rm -rf /opt/nomad/data/*"
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S rm -rf /opt/nomad/data/*"
echo "3. 创建新配置文件..."
create_config "semaphore" "${SEMAPHORE_IP}"
create_config "master" "${MASTER_IP}"
create_config "ash3c" "${ASH3C_IP}"
echo "4. 部署配置文件..."
cp /tmp/nomad-semaphore.hcl /etc/nomad.d/nomad.hcl
chown nomad:nomad /etc/nomad.d/nomad.hcl
scp -P 60022 -i ~/.ssh/id_ed25519 /tmp/nomad-master.hcl ben@${MASTER_IP}:/tmp/
ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S cp /tmp/nomad-master.hcl /etc/nomad.d/nomad.hcl && echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl"
scp -P 22 -i ~/.ssh/id_ed25519 /tmp/nomad-ash3c.hcl ben@${ASH3C_IP}:/tmp/
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S cp /tmp/nomad-ash3c.hcl /etc/nomad.d/nomad.hcl && echo '3131' | sudo -S chown nomad:nomad /etc/nomad.d/nomad.hcl"
echo "5. 启动服务..."
systemctl start nomad
ssh -p 60022 -i ~/.ssh/id_ed25519 ben@${MASTER_IP} "echo '3131' | sudo -S systemctl start nomad"
ssh -p 22 -i ~/.ssh/id_ed25519 ben@${ASH3C_IP} "echo '3131' | sudo -S systemctl start nomad"
echo "6. 等待集群形成..."
sleep 30
echo "7. 检查集群状态..."
nomad server members
nomad node status
echo "=== 修复完成 ==="

View File

@ -0,0 +1,113 @@
---
- name: Ultimate Nomad Cluster Fix - Complete Reset
hosts: nomad_cluster
become: yes
gather_facts: yes
vars:
nomad_encrypt_key: "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
tasks:
- name: Stop and disable nomad service completely
systemd:
name: nomad
state: stopped
enabled: no
daemon_reload: yes
ignore_errors: yes
- name: Kill any remaining nomad processes
shell: pkill -f nomad || true
ignore_errors: yes
- name: Remove all nomad data and state
file:
path: "{{ item }}"
state: absent
loop:
- /opt/nomad/data
- /etc/nomad.d/nomad.hcl
- /var/log/nomad
- name: Create clean nomad directories
file:
path: "{{ item }}"
state: directory
owner: nomad
group: nomad
mode: '0755'
loop:
- /etc/nomad.d
- /opt/nomad
- /opt/nomad/data
- /opt/nomad/alloc_mounts
- /var/log/nomad
- name: Create minimal nomad configuration
copy:
content: |
datacenter = "dc1"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "{{ ansible_default_ipv4.address }}"
server {
enabled = true
bootstrap_expect = 1
encrypt = "{{ nomad_encrypt_key }}"
}
client {
enabled = true
alloc_dir = "/opt/nomad/alloc_mounts"
}
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "{{ ansible_default_ipv4.address }}"
serf = "{{ ansible_default_ipv4.address }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
- name: Enable and start nomad service
systemd:
name: nomad
state: started
enabled: yes
daemon_reload: yes
- name: Wait for nomad to start
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 10
timeout: 60
- name: Check nomad status
uri:
url: "http://{{ ansible_default_ipv4.address }}:4646/v1/status/leader"
method: GET
register: nomad_leader
retries: 5
delay: 5
ignore_errors: yes
- name: Display nomad status
debug:
msg: "Nomad leader status: {{ nomad_leader.json if nomad_leader.json is defined else 'No leader elected yet' }}"

View File

@ -0,0 +1,106 @@
#!/bin/bash
# Minimal ZSH configuration - suited for quick deployment
# Usage: curl -fsSL https://your-gitea.com/ben/mgmt/raw/branch/main/snippets/zsh/zshrc-minimal.sh | bash
set -euo pipefail
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
# 检查 root 权限
if [[ $EUID -ne 0 ]]; then
log_error "需要 root 权限"
exit 1
fi
log_info "开始安装最小化 ZSH 配置..."
# 安装依赖
apt update && apt install -y zsh git curl fonts-powerline
# 安装 oh-my-zsh
if [[ ! -d "$HOME/.oh-my-zsh" ]]; then
RUNZSH=no CHSH=no sh -c "$(curl -fsSL https://raw.github.com/ohmyzsh/ohmyzsh/master/tools/install.sh)"
fi
# 安装关键插件
custom_dir="$HOME/.oh-my-zsh/custom/plugins"
mkdir -p "$custom_dir"
[[ ! -d "$custom_dir/zsh-autosuggestions" ]] && git clone https://github.com/zsh-users/zsh-autosuggestions "$custom_dir/zsh-autosuggestions"
[[ ! -d "$custom_dir/zsh-syntax-highlighting" ]] && git clone https://github.com/zsh-users/zsh-syntax-highlighting.git "$custom_dir/zsh-syntax-highlighting"
# 创建最小化配置
cat > "$HOME/.zshrc" << 'EOF'
# Oh My Zsh 配置
export ZSH="$HOME/.oh-my-zsh"
ZSH_THEME="agnoster"
plugins=(
git
docker
ansible
terraform
kubectl
zsh-autosuggestions
zsh-syntax-highlighting
)
source $ZSH/oh-my-zsh.sh
# 基本别名
alias ll='ls -alF'
alias la='ls -A'
alias l='ls -CF'
alias ..='cd ..'
alias ...='cd ../..'
alias grep='grep --color=auto'
# Docker 别名
alias d='docker'
alias dps='docker ps'
alias dpsa='docker ps -a'
alias dex='docker exec -it'
alias dlog='docker logs -f'
# Kubernetes 别名
alias k='kubectl'
alias kgp='kubectl get pods'
alias kgs='kubectl get services'
alias kgd='kubectl get deployments'
# Git 别名
alias gs='git status'
alias ga='git add'
alias gc='git commit'
alias gp='git push'
alias gl='git pull'
# 历史配置
HISTSIZE=10000
SAVEHIST=10000
HISTFILE=~/.zsh_history
setopt SHARE_HISTORY
setopt HIST_IGNORE_DUPS
# 自动建议配置
ZSH_AUTOSUGGEST_HIGHLIGHT_STYLE='fg=8'
ZSH_AUTOSUGGEST_STRATEGY=(history completion)
echo "🚀 ZSH 配置完成!"
EOF
# 设置默认 shell
chsh -s "$(which zsh)"
log_success "最小化 ZSH 配置安装完成!"
log_info "请重新登录或运行: source ~/.zshrc"

View File

@ -0,0 +1,76 @@
version: '3.8'
services:
consul-master:
image: consul:latest
hostname: consul-master
command: >
sh -c "
IP=$$(hostname -i | awk '{print $$1}');
consul agent -server -bootstrap-expect=2
-datacenter=dc1 -data-dir=/consul/data
-node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0
-ui
-log-level=INFO
"
ports:
- "8500:8500"
- "8600:8600/udp"
volumes:
- consul_master_data:/consul/data
networks:
consul-net:
aliases:
- consul-master
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.hostname == master
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
consul-ash3c:
image: consul:latest
hostname: consul-ash3c
command: >
sh -c "
IP=$$(hostname -i | awk '{print $$1}');
consul agent -server -bootstrap-expect=2
-datacenter=dc1 -data-dir=/consul/data
-node=consul-ash3c -bind=$$IP -advertise=$$IP -client=0.0.0.0
-retry-join=consul-master
-ui
-log-level=INFO
"
ports:
- "8501:8500"
- "8601:8600/udp"
volumes:
- consul_ash3c_data:/consul/data
networks:
consul-net:
aliases:
- consul-ash3c
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.hostname == ash3c
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
volumes:
consul_master_data:
consul_ash3c_data:
networks:
consul-net:
driver: overlay
attachable: true
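A hedged deployment sketch for this stack; the compose file name is assumed, while the stack and service names match those referenced by the Consul fix script earlier in this commit:
docker stack deploy -c swarm/stacks/consul-cluster.yml consul-cluster
docker service ls | grep consul
docker service logs consul-cluster_consul-master --tail 20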

View File

@ -0,0 +1,68 @@
version: '3.8'
services:
consul-master:
image: consul:latest
hostname: consul-master
command: >
sh -c "
consul agent -server -bootstrap-expect=2
-datacenter=dc1 -data-dir=/consul/data
-node=consul-master -bind=100.117.106.136 -advertise=100.117.106.136 -client=0.0.0.0
-ui
-log-level=INFO
"
ports:
- "8500:8500"
- "8600:8600/udp"
- "8301:8301"
- "8302:8302"
volumes:
- consul_master_data:/consul/data
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.hostname == master
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
consul-ash3c:
image: consul:latest
hostname: consul-ash3c
command: >
sh -c "
ASH3C_IP=$$(getent hosts ash3c | awk '{print $$1}');
consul agent -server -bootstrap-expect=2
-datacenter=dc1 -data-dir=/consul/data
-node=consul-ash3c -bind=$$ASH3C_IP -advertise=$$ASH3C_IP -client=0.0.0.0
-retry-join=100.117.106.136
-ui
-log-level=INFO
"
ports:
- "8501:8500"
- "8601:8600/udp"
- "8311:8301"
- "8312:8302"
volumes:
- consul_ash3c_data:/consul/data
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.hostname == ash3c
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
depends_on:
- consul-master
volumes:
consul_master_data:
consul_ash3c_data:

View File

@ -0,0 +1,78 @@
version: '3.8'
services:
consul-master:
image: consul:latest
hostname: consul-master
command: >
sh -c "
IP=$$(hostname -i | awk '{print $$1}');
consul agent -server -bootstrap-expect=2
-datacenter=dc1 -data-dir=/consul/data
-node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0
-ui
-log-level=INFO
"
ports:
- "8500:8500"
- "8600:8600/udp"
volumes:
- consul_master_data:/consul/data
networks:
consul-net:
aliases:
- consul-master
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.hostname == master
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
consul-ash3c:
image: consul:latest
hostname: consul-ash3c
command: >
sh -c "
IP=$$(hostname -i | awk '{print $$1}');
consul agent -server -bootstrap-expect=2
-datacenter=dc1 -data-dir=/consul/data
-node=consul-ash3c -bind=$$IP -advertise=$$IP -client=0.0.0.0
-retry-join=10.0.5.5
-ui
-log-level=INFO
"
ports:
- "8501:8500"
- "8601:8600/udp"
volumes:
- consul_ash3c_data:/consul/data
networks:
consul-net:
aliases:
- consul-ash3c
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.hostname == ash3c
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
depends_on:
- consul-master
volumes:
consul_master_data:
consul_ash3c_data:
networks:
consul-net:
driver: overlay
attachable: true

View File

@ -0,0 +1,78 @@
version: '3.8'
services:
consul-master:
image: consul:latest
hostname: consul-master
command: >
sh -c "
consul agent -server -bootstrap-expect=2
-datacenter=dc1 -data-dir=/consul/data
-node=consul-master -bind=192.168.1.100 -advertise=192.168.1.100 -client=0.0.0.0
-ui
-log-level=INFO
"
ports:
- "8500:8500"
- "8600:8600/udp"
volumes:
- consul_master_data:/consul/data
networks:
consul-macvlan:
ipv4_address: 192.168.1.100
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.hostname == master
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
consul-ash3c:
image: consul:latest
hostname: consul-ash3c
command: >
sh -c "
consul agent -server -bootstrap-expect=2
-datacenter=dc1 -data-dir=/consul/data
-node=consul-ash3c -bind=192.168.1.101 -advertise=192.168.1.101 -client=0.0.0.0
-retry-join=192.168.1.100
-ui
-log-level=INFO
"
ports:
- "8501:8500"
- "8601:8600/udp"
volumes:
- consul_ash3c_data:/consul/data
networks:
consul-macvlan:
ipv4_address: 192.168.1.101
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.hostname == ash3c
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
volumes:
consul_master_data:
consul_ash3c_data:
networks:
consul-macvlan:
driver: macvlan
driver_opts:
parent: eth0 # adjust to match your network interface
ipam:
config:
- subnet: 192.168.1.0/24
gateway: 192.168.1.1
ip_range: 192.168.1.100/30 # only hand out IPs .100-.103

View File

@ -0,0 +1,40 @@
version: '3.8'
services:
consul:
image: consul:latest
hostname: consul-master
command: >
sh -c "
IP=$$(hostname -i | awk '{print $$1}');
consul agent -server -bootstrap-expect=1
-datacenter=dc1 -data-dir=/consul/data
-node=consul-master -bind=$$IP -advertise=$$IP -client=0.0.0.0
-ui
-log-level=INFO
"
ports:
- "8500:8500"
- "8600:8600/udp"
volumes:
- consul_data:/consul/data
networks:
- consul-net
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.hostname == master
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
volumes:
consul_data:
networks:
consul-net:
driver: overlay
attachable: true

View File

@ -0,0 +1,169 @@
# Nomad multi-datacenter production environment
# Topology: CN(dc1) + KR(dc2) + US(dc3)
terraform {
required_version = ">= 1.0"
required_providers {
oci = {
source = "oracle/oci"
version = "~> 5.0"
}
huaweicloud = {
source = "huaweicloud/huaweicloud"
version = "~> 1.60"
}
}
}
# Oracle Cloud provider (Korea)
provider "oci" {
alias = "korea"
tenancy_ocid = var.oracle_tenancy_ocid
user_ocid = var.oracle_user_ocid
fingerprint = var.oracle_fingerprint
private_key_path = var.oracle_private_key_path
region = "ap-seoul-1" # Seoul
}
# Huawei Cloud provider (US)
provider "huaweicloud" {
alias = "us"
access_key = var.huawei_access_key
secret_key = var.huawei_secret_key
region = "us-east-1" # US East
}
# Common local values
locals {
project_name = "nomad-multi-dc"
environment = "production"
common_tags = {
Project = local.project_name
Environment = local.environment
ManagedBy = "opentofu"
Owner = "devops-team"
}
}
# SSH public key used for all nodes
data "local_file" "ssh_public_key" {
filename = pathexpand("~/.ssh/id_rsa.pub")
}
# Oracle Cloud infrastructure (Korea - dc2)
module "oracle_infrastructure" {
source = "../../providers/oracle-cloud"
providers = {
oci = oci.korea
}
project_name = local.project_name
environment = local.environment
vpc_cidr = "10.1.0.0/16"
oci_config = {
tenancy_ocid = var.oracle_tenancy_ocid
user_ocid = var.oracle_user_ocid
fingerprint = var.oracle_fingerprint
private_key_path = var.oracle_private_key_path
region = "ap-seoul-1"
}
common_tags = local.common_tags
}
# Huawei Cloud infrastructure (US - dc3)
module "huawei_infrastructure" {
source = "../../providers/huawei-cloud"
providers = {
huaweicloud = huaweicloud.us
}
project_name = local.project_name
environment = local.environment
vpc_cidr = "10.2.0.0/16"
availability_zones = ["us-east-1a", "us-east-1b"]
common_tags = local.common_tags
}
# Nomad cluster spanning all datacenters
module "nomad_cluster" {
source = "../../modules/nomad-cluster"
# Node deployment toggles
deploy_korea_node = var.deploy_korea_node
deploy_us_node = var.deploy_us_node
# Oracle Cloud (Korea) settings
oracle_config = {
tenancy_ocid = var.oracle_tenancy_ocid
user_ocid = var.oracle_user_ocid
fingerprint = var.oracle_fingerprint
private_key_path = var.oracle_private_key_path
region = "ap-seoul-1"
}
oracle_subnet_id = module.oracle_infrastructure.public_subnet_ids[0]
oracle_security_group_id = module.oracle_infrastructure.security_group_id
# Huawei Cloud (US) settings
huawei_config = {
access_key = var.huawei_access_key
secret_key = var.huawei_secret_key
region = "us-east-1"
}
huawei_subnet_id = module.huawei_infrastructure.public_subnet_ids[0]
huawei_security_group_id = module.huawei_infrastructure.security_group_id
# SSH key and common tags
ssh_public_key = data.local_file.ssh_public_key.content
common_tags = local.common_tags
# Nomad settings
nomad_version = "1.10.5"
nomad_encrypt_key = var.nomad_encrypt_key
}
# Ansible inventory
resource "local_file" "ansible_inventory" {
filename = "${path.module}/generated/nomad-cluster-inventory.yml"
content = yamlencode({
all = {
children = {
nomad_servers = {
hosts = module.nomad_cluster.ansible_inventory.all.children.nomad_servers.hosts
}
}
vars = {
ansible_user = "ubuntu"
ansible_ssh_private_key_file = "~/.ssh/id_rsa"
ansible_ssh_common_args = "-o StrictHostKeyChecking=no"
}
}
})
}
# Post-deployment helper script
resource "local_file" "post_deploy_script" {
filename = "${path.module}/generated/post-deploy.sh"
content = templatefile("${path.module}/templates/post-deploy.sh", {
cluster_overview = module.nomad_cluster.cluster_overview
endpoints = module.nomad_cluster.cluster_endpoints
})
file_permission = "0755"
}
# Cross-datacenter test job definition
resource "local_file" "cross_dc_test_job" {
filename = "${path.module}/generated/cross-dc-test.nomad"
content = templatefile("${path.module}/templates/cross-dc-test.nomad", {
datacenters = ["dc1", "dc2", "dc3"]
})
}
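A minimal OpenTofu workflow for this environment (a sketch; the working directory is assumed, and terraform.tfvars is expected to be filled in from the example file further down):
cd environments/production # path assumed
tofu init
tofu plan -out=tfplan
tofu apply tfplan
./generated/post-deploy.sh # written by the local_file resource above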

View File

@ -0,0 +1,46 @@
# Nomad multi-datacenter environment outputs
output "cluster_overview" {
description = "Nomad multi-datacenter cluster overview"
value = module.nomad_cluster.cluster_overview
}
output "cluster_endpoints" {
description = "Cluster connection endpoints"
value = module.nomad_cluster.cluster_endpoints
}
output "oracle_korea_node" {
description = "Oracle Cloud Korea node details"
value = module.nomad_cluster.oracle_korea_node
}
output "huawei_us_node" {
description = "Huawei Cloud US node details"
value = module.nomad_cluster.huawei_us_node
}
output "deployment_summary" {
description = "Deployment summary"
value = {
total_nodes = module.nomad_cluster.cluster_overview.total_nodes
datacenters = keys(module.nomad_cluster.cluster_overview.datacenters)
next_steps = [
"1. Wait for all nodes to finish booting (about 5-10 minutes)",
"2. Run: ./generated/post-deploy.sh",
"3. Verify the cluster: nomad server members",
"4. Test cross-DC scheduling: nomad job run generated/cross-dc-test.nomad",
"5. Open the Web UI to inspect cluster state"
]
web_ui_urls = module.nomad_cluster.cluster_endpoints.nomad_ui_urls
ssh_commands = module.nomad_cluster.cluster_endpoints.ssh_commands
}
}
output "verification_commands" {
description = "Verification commands"
value = module.nomad_cluster.verification_commands
}

View File

@ -0,0 +1,22 @@
# Example production configuration for the Nomad multi-datacenter environment
# Copy this file to terraform.tfvars and fill in real values
# Deployment toggles
deploy_korea_node = true # whether to deploy the Korea node
deploy_us_node = true # whether to deploy the US node
# Oracle Cloud settings (Korea - dc2)
# How to obtain: https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm
oracle_tenancy_ocid = "ocid1.tenancy.oc1..aaaaaaaa..."
oracle_user_ocid = "ocid1.user.oc1..aaaaaaaa..."
oracle_fingerprint = "aa:bb:cc:dd:ee:ff:..."
oracle_private_key_path = "~/.oci/oci_api_key.pem"
# Huawei Cloud settings (US - dc3)
# How to obtain: https://console.huaweicloud.com/iam/#/mine/accessKey
huawei_access_key = "YOUR_HUAWEI_ACCESS_KEY"
huawei_secret_key = "YOUR_HUAWEI_SECRET_KEY"
# Nomad cluster gossip encryption key (optional, a default is provided)
# Generate with: nomad operator keygen
nomad_encrypt_key = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
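Rather than reusing the committed default, a fresh gossip key can be generated and pasted in; a one-liner sketch:
nomad operator keygen # prints a new base64 32-byte key suitable for nomad_encrypt_key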

View File

@ -0,0 +1,60 @@
# Nomad multi-datacenter variables
# Deployment toggles
variable "deploy_korea_node" {
description = "Whether to deploy the Korea node (Oracle Cloud)"
type = bool
default = true
}
variable "deploy_us_node" {
description = "Whether to deploy the US node (Huawei Cloud)"
type = bool
default = true
}
# Oracle Cloud credentials
variable "oracle_tenancy_ocid" {
description = "Oracle Cloud tenancy OCID"
type = string
sensitive = true
}
variable "oracle_user_ocid" {
description = "Oracle Cloud user OCID"
type = string
sensitive = true
}
variable "oracle_fingerprint" {
description = "Oracle Cloud API key fingerprint"
type = string
sensitive = true
}
variable "oracle_private_key_path" {
description = "Path to the Oracle Cloud private key file"
type = string
sensitive = true
}
# Huawei Cloud credentials
variable "huawei_access_key" {
description = "Huawei Cloud access key"
type = string
sensitive = true
}
variable "huawei_secret_key" {
description = "Huawei Cloud secret key"
type = string
sensitive = true
}
# Nomad
variable "nomad_encrypt_key" {
description = "Nomad cluster gossip encryption key"
type = string
sensitive = true
default = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
}

View File

@ -0,0 +1,159 @@
# Nomad multi-datacenter cluster module
# Datacenters: CN(dc1) + KR(dc2) + US(dc3)
terraform {
required_providers {
oci = {
source = "oracle/oci"
version = "~> 5.0"
}
huaweicloud = {
source = "huaweicloud/huaweicloud"
version = "~> 1.60"
}
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
}
}
#
locals {
nomad_version = "1.10.5"
# Nomad gossip encryption key
nomad_encrypt_key = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
# Datacenter definitions
datacenters = {
dc1 = {
name = "dc1"
region = "cn"
location = "China"
provider = "existing" # semaphore
}
dc2 = {
name = "dc2"
region = "kr"
location = "Korea"
provider = "oracle"
}
dc3 = {
name = "dc3"
region = "us"
location = "US"
provider = "huawei" # aws
}
}
#
user_data_template = templatefile("${path.module}/templates/nomad-userdata.sh", {
nomad_version = local.nomad_version
nomad_encrypt_key = local.nomad_encrypt_key
})
}
# semaphore
data "external" "semaphore_info" {
program = ["bash", "-c", <<-EOF
echo '{
"ip": "100.116.158.95",
"datacenter": "dc1",
"status": "existing"
}'
EOF
]
}
# Oracle Cloud (dc2)
module "oracle_korea_node" {
source = "../compute"
count = var.deploy_korea_node ? 1 : 0
# Oracle Cloud
provider_type = "oracle"
#
instance_config = {
name = "nomad-master-kr"
datacenter = "dc2"
instance_type = "VM.Standard.E2.1.Micro" #
image_id = var.oracle_ubuntu_image_id
subnet_id = var.oracle_subnet_id
# Nomad
nomad_role = "server"
bootstrap_expect = 1
bind_addr = "auto" #
#
security_groups = [var.oracle_security_group_id]
#
tags = merge(var.common_tags, {
Name = "nomad-master-kr"
Datacenter = "dc2"
Role = "nomad-server"
Provider = "oracle"
})
}
#
user_data = templatefile("${path.module}/templates/nomad-userdata.sh", {
datacenter = "dc2"
nomad_version = local.nomad_version
nomad_encrypt_key = local.nomad_encrypt_key
bootstrap_expect = 1
bind_addr = "auto"
server_enabled = true
client_enabled = true
})
}
# (dc3)
module "huawei_us_node" {
source = "../compute"
count = var.deploy_us_node ? 1 : 0
#
provider_type = "huawei"
#
instance_config = {
name = "nomad-ash3c-us"
datacenter = "dc3"
instance_type = "s6.small.1" # 1vCPU 1GB
image_id = var.huawei_ubuntu_image_id
subnet_id = var.huawei_subnet_id
# Nomad
nomad_role = "server"
bootstrap_expect = 1
bind_addr = "auto"
#
security_groups = [var.huawei_security_group_id]
#
tags = merge(var.common_tags, {
Name = "nomad-ash3c-us"
Datacenter = "dc3"
Role = "nomad-server"
Provider = "huawei"
})
}
#
user_data = templatefile("${path.module}/templates/nomad-userdata.sh", {
datacenter = "dc3"
nomad_version = local.nomad_version
nomad_encrypt_key = local.nomad_encrypt_key
bootstrap_expect = 1
bind_addr = "auto"
server_enabled = true
client_enabled = true
})
}

View File

@ -0,0 +1,145 @@
# Nomad multi-datacenter cluster module outputs
# Cluster overview
output "cluster_overview" {
description = "Nomad 多数据中心集群概览"
value = {
datacenters = {
dc1 = {
name = "dc1"
location = "China (CN)"
provider = "existing"
node = "semaphore"
ip = "100.116.158.95"
status = "existing"
}
dc2 = var.deploy_korea_node ? {
name = "dc2"
location = "Korea (KR)"
provider = "oracle"
node = "master"
ip = try(module.oracle_korea_node[0].public_ip, "pending")
status = "deployed"
} : null
dc3 = var.deploy_us_node ? {
name = "dc3"
location = "US"
provider = "huawei"
node = "ash3c"
ip = try(module.huawei_us_node[0].public_ip, "pending")
status = "deployed"
} : null
}
total_nodes = 1 + (var.deploy_korea_node ? 1 : 0) + (var.deploy_us_node ? 1 : 0)
}
}
# Oracle Cloud
output "oracle_korea_node" {
description = "Oracle Cloud 韩国节点信息"
value = var.deploy_korea_node ? {
instance_id = try(module.oracle_korea_node[0].instance_id, null)
public_ip = try(module.oracle_korea_node[0].public_ip, null)
private_ip = try(module.oracle_korea_node[0].private_ip, null)
datacenter = "dc2"
provider = "oracle"
region = var.oracle_config.region
#
ssh_command = try("ssh ubuntu@${module.oracle_korea_node[0].public_ip}", null)
nomad_ui = try("http://${module.oracle_korea_node[0].public_ip}:4646", null)
} : null
}
#
output "huawei_us_node" {
description = "华为云美国节点信息"
value = var.deploy_us_node ? {
instance_id = try(module.huawei_us_node[0].instance_id, null)
public_ip = try(module.huawei_us_node[0].public_ip, null)
private_ip = try(module.huawei_us_node[0].private_ip, null)
datacenter = "dc3"
provider = "huawei"
region = var.huawei_config.region
#
ssh_command = try("ssh ubuntu@${module.huawei_us_node[0].public_ip}", null)
nomad_ui = try("http://${module.huawei_us_node[0].public_ip}:4646", null)
} : null
}
# Cluster connection endpoints
output "cluster_endpoints" {
description = "Cluster connection endpoints"
value = {
nomad_ui_urls = compact([
"http://100.116.158.95:4646", # dc1 - semaphore
var.deploy_korea_node ? try("http://${module.oracle_korea_node[0].public_ip}:4646", null) : null, # dc2
var.deploy_us_node ? try("http://${module.huawei_us_node[0].public_ip}:4646", null) : null # dc3
])
ssh_commands = compact([
"ssh root@100.116.158.95", # dc1 - semaphore
var.deploy_korea_node ? try("ssh ubuntu@${module.oracle_korea_node[0].public_ip}", null) : null, # dc2
var.deploy_us_node ? try("ssh ubuntu@${module.huawei_us_node[0].public_ip}", null) : null # dc3
])
}
}
# Ansible inventory
output "ansible_inventory" {
description = "生成的 Ansible inventory"
value = {
all = {
children = {
nomad_servers = {
hosts = merge(
{
semaphore = {
ansible_host = "100.116.158.95"
datacenter = "dc1"
provider = "existing"
}
},
var.deploy_korea_node ? {
master = {
ansible_host = try(module.oracle_korea_node[0].public_ip, "pending")
datacenter = "dc2"
provider = "oracle"
}
} : {},
var.deploy_us_node ? {
ash3c = {
ansible_host = try(module.huawei_us_node[0].public_ip, "pending")
datacenter = "dc3"
provider = "huawei"
}
} : {}
)
}
}
}
}
}
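# One possible workflow (a sketch, not wired up in this module): dump the structure above
# to JSON and hand it to Ansible; since JSON is valid YAML, the static yaml inventory
# plugin can usually parse it directly. The playbook name below is hypothetical.
#   tofu output -json ansible_inventory > inventory.json
#   ansible-playbook -i inventory.json nomad-cluster.yml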
# Post-deployment verification commands
output "verification_commands" {
description = "Post-deployment verification commands"
value = [
"# Check cluster status",
"nomad server members",
"",
"# Check the nodes in each datacenter",
"nomad node status -verbose",
"",
"# Cross-datacenter job scheduling test",
"nomad job run examples/cross-dc-test.nomad",
"",
"# Access the UI / query the leader",
join("\n", [for url in compact([
"http://100.116.158.95:4646",
var.deploy_korea_node ? try("http://${module.oracle_korea_node[0].public_ip}:4646", null) : null,
var.deploy_us_node ? try("http://${module.huawei_us_node[0].public_ip}:4646", null) : null
]) : "curl -s ${url}/v1/status/leader"])
]
}
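# Reading the list after apply is a one-liner (sketch, assuming jq is installed locally):
#   tofu output -json verification_commands | jq -r '.[]'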

View File

@ -0,0 +1,230 @@
#!/bin/bash
# Nomad multi-datacenter node bootstrap script
# Datacenter: ${datacenter}
set -e
# Logging helper
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a /var/log/nomad-setup.log
}
log "Starting Nomad node setup - datacenter: ${datacenter}"
# Update the system
log "Updating system packages..."
apt-get update -y
apt-get upgrade -y
# Install required packages
log "Installing required packages..."
apt-get install -y \
curl \
wget \
unzip \
jq \
docker.io \
docker-compose \
htop \
net-tools \
vim
# Start Docker
log "Starting Docker service..."
systemctl enable docker
systemctl start docker
usermod -aG docker ubuntu
# Install Nomad
log "Installing Nomad ${nomad_version}..."
cd /tmp
wget -q https://releases.hashicorp.com/nomad/${nomad_version}/nomad_${nomad_version}_linux_amd64.zip
unzip nomad_${nomad_version}_linux_amd64.zip
mv nomad /usr/local/bin/
chmod +x /usr/local/bin/nomad
# Create the Nomad user and directories
log "Creating Nomad user and directories..."
useradd --system --home /etc/nomad.d --shell /bin/false nomad
mkdir -p /opt/nomad/data
mkdir -p /etc/nomad.d
mkdir -p /var/log/nomad
chown -R nomad:nomad /opt/nomad /etc/nomad.d /var/log/nomad
# Determine the local IP address
if [ "${bind_addr}" = "auto" ]; then
# Try several methods to detect the IP (curl -f lets HTTP errors fall through to the next method)
BIND_ADDR=$(curl -sf http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null || \
curl -sf http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/ip -H "Metadata-Flavor: Google" 2>/dev/null || \
ip route get 8.8.8.8 | awk '{print $7; exit}' || \
hostname -I | awk '{print $1}')
else
BIND_ADDR="${bind_addr}"
fi
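# Note on the detection order above: the first curl targets the EC2-compatible metadata
# endpoint (169.254.169.254), the second the GCE metadata server; if neither answers,
# the source address of the default route (or the first address from `hostname -I`)
# is used as a best-effort fallback.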
log "检测到 IP 地址: $BIND_ADDR"
# 创建 Nomad 配置文件
log "创建 Nomad 配置文件..."
cat > /etc/nomad.d/nomad.hcl << EOF
datacenter = "${datacenter}"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "$BIND_ADDR"
%{ if server_enabled }
server {
enabled = true
bootstrap_expect = ${bootstrap_expect}
encrypt = "${nomad_encrypt_key}"
}
%{ endif }
%{ if client_enabled }
client {
enabled = true
host_volume "docker-sock" {
path = "/var/run/docker.sock"
read_only = false
}
}
%{ endif }
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "$BIND_ADDR"
serf = "$BIND_ADDR"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
}
}
telemetry {
collection_interval = "10s"
disable_hostname = false
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
EOF
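# Optional sanity check before starting the service (a sketch, left commented out to keep
# the happy path unchanged):
#   /usr/local/bin/nomad config validate /etc/nomad.d/ || log "nomad.hcl failed validation"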
# Create the systemd unit
log "Creating systemd service file..."
cat > /etc/systemd/system/nomad.service << EOF
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
[Service]
Type=notify
User=nomad
Group=nomad
ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP \$MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
EOF
# Start the Nomad service
log "Starting Nomad service..."
systemctl daemon-reload
systemctl enable nomad
systemctl start nomad
# Wait for the service to come up
log "Waiting for Nomad service to start..."
sleep 10
# Verify the installation
log "Verifying Nomad installation..."
if systemctl is-active --quiet nomad; then
log "✅ Nomad 服务运行正常"
log "📊 节点信息:"
/usr/local/bin/nomad node status -self || true
else
log "❌ Nomad 服务启动失败"
systemctl status nomad --no-pager || true
journalctl -u nomad --no-pager -n 20 || true
fi
# Configure the firewall (if present)
log "Configuring firewall rules..."
if command -v ufw >/dev/null 2>&1; then
ufw allow 4646/tcp # HTTP API
ufw allow 4647/tcp # RPC
ufw allow 4648/tcp # Serf
ufw allow 22/tcp # SSH
fi
# Create helper aliases and scripts
log "Creating management script..."
cat > /usr/local/bin/nomad-status << 'EOF'
#!/bin/bash
echo "=== Nomad 服务状态 ==="
systemctl status nomad --no-pager
echo -e "\n=== Nomad 集群成员 ==="
nomad server members 2>/dev/null || echo "无法连接到集群"
echo -e "\n=== Nomad 节点状态 ==="
nomad node status 2>/dev/null || echo "无法获取节点状态"
echo -e "\n=== 最近日志 ==="
journalctl -u nomad --no-pager -n 5
EOF
chmod +x /usr/local/bin/nomad-status
# Add aliases to the ubuntu user's bashrc
echo 'alias ns="nomad-status"' >> /home/ubuntu/.bashrc
echo 'alias nomad-logs="journalctl -u nomad -f"' >> /home/ubuntu/.bashrc
log "🎉 Nomad 节点配置完成!"
log "📍 数据中心: ${datacenter}"
log "🌐 IP 地址: $BIND_ADDR"
log "🔗 Web UI: http://$BIND_ADDR:4646"
log "📝 使用 'nomad-status' 或 'ns' 命令查看状态"
# 输出重要信息到 motd
cat > /etc/update-motd.d/99-nomad << EOF
#!/bin/bash
echo ""
echo "🚀 Nomad 节点信息:"
echo " 数据中心: ${datacenter}"
echo " IP 地址: $BIND_ADDR"
echo " Web UI: http://$BIND_ADDR:4646"
echo " 状态检查: nomad-status"
echo ""
EOF
chmod +x /etc/update-motd.d/99-nomad
log "节点配置脚本执行完成"

View File

@ -0,0 +1,118 @@
# Nomad multi-datacenter deployment flags
variable "deploy_korea_node" {
description = "Whether to deploy the Korea node (Oracle Cloud)"
type = bool
default = true
}
variable "deploy_us_node" {
description = "Whether to deploy the US node (Huawei Cloud)"
type = bool
default = true
}
# Oracle Cloud
variable "oracle_config" {
description = "Oracle Cloud configuration"
type = object({
tenancy_ocid = string
user_ocid = string
fingerprint = string
private_key_path = string
region = string
})
sensitive = true
}
variable "oracle_ubuntu_image_id" {
description = "Oracle Cloud Ubuntu 镜像 ID"
type = string
default = "" #
}
variable "oracle_subnet_id" {
description = "Oracle Cloud 子网 ID"
type = string
}
variable "oracle_security_group_id" {
description = "Oracle Cloud 安全组 ID"
type = string
}
#
variable "huawei_config" {
description = "华为云配置"
type = object({
access_key = string
secret_key = string
region = string
})
sensitive = true
}
variable "huawei_ubuntu_image_id" {
description = "华为云 Ubuntu 镜像 ID"
type = string
default = "" #
}
variable "huawei_subnet_id" {
description = "华为云子网 ID"
type = string
}
variable "huawei_security_group_id" {
description = "华为云安全组 ID"
type = string
}
#
variable "common_tags" {
description = "通用标签"
type = map(string)
default = {
Project = "nomad-multi-dc"
Environment = "production"
ManagedBy = "opentofu"
}
}
variable "ssh_public_key" {
description = "SSH 公钥"
type = string
}
variable "allowed_cidr_blocks" {
description = "允许访问的 CIDR 块"
type = list(string)
default = ["0.0.0.0/0"] #
}
# Nomad
variable "nomad_version" {
description = "Nomad 版本"
type = string
default = "1.10.5"
}
variable "nomad_encrypt_key" {
description = "Nomad 集群加密密钥"
type = string
sensitive = true
default = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
}
#
variable "vpc_cidr" {
description = "VPC CIDR 块"
type = string
default = "10.0.0.0/16"
}
variable "availability_zones" {
description = "可用区列表"
type = list(string)
default = ["a", "b"]
}
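# Example terraform.tfvars sketch (every value below is a placeholder, not a real resource;
# oracle_config / huawei_config are also required and should be kept out of version control):
#   deploy_korea_node        = true
#   deploy_us_node           = true
#   ssh_public_key           = "ssh-ed25519 AAAA... ben@workstation"
#   oracle_subnet_id         = "ocid1.subnet.oc1..example"
#   oracle_security_group_id = "ocid1.networksecuritygroup.oc1..example"
#   huawei_subnet_id         = "example-subnet-id"
#   huawei_security_group_id = "example-sg-id"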