Clean repository: organized structure and GitOps setup

- Organized root directory structure
- Moved orphan files to proper locations
- Updated .gitignore to ignore temporary files
- Set up Gitea Runner for GitOps automation
- Fixed Tailscale access issues
- Added workflow for automated Nomad deployment
2025-10-09 06:13:45 +00:00
commit 89ee6f7967
306 changed files with 30781 additions and 0 deletions

deployment/Makefile Normal file
View File

@@ -0,0 +1,104 @@
# Project management Makefile
.PHONY: help setup init plan apply destroy clean test lint docs
# Default target
help: ## Show help information
@echo "Available commands:"
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
# Environment setup
setup: ## Set up the development environment
@echo "🚀 Setting up the development environment..."
@bash scripts/setup/environment/setup-environment.sh
# OpenTofu operations
init: ## Initialize OpenTofu
@echo "🏗️ Initializing OpenTofu..."
@cd infrastructure/environments/dev && tofu init
plan: ## Generate an execution plan
@echo "📋 Generating execution plan..."
@cd infrastructure/environments/dev && tofu plan -var-file="terraform.tfvars"
apply: ## Apply infrastructure changes
@echo "🚀 Applying infrastructure changes..."
@cd infrastructure/environments/dev && tofu apply -var-file="terraform.tfvars"
destroy: ## Destroy infrastructure
@echo "💥 Destroying infrastructure..."
@cd infrastructure/environments/dev && tofu destroy -var-file="terraform.tfvars"
# Ansible operations
ansible-check: ## Check the Ansible configuration
@echo "🔍 Checking the Ansible configuration..."
@cd configuration && ansible-playbook --syntax-check playbooks/bootstrap/main.yml
ansible-deploy: ## Deploy applications
@echo "📦 Deploying applications..."
@cd configuration && ansible-playbook -i inventories/production/inventory.ini playbooks/bootstrap/main.yml
# Podman operations
podman-build: ## Build Podman images
@echo "📦 Building Podman images..."
@podman-compose -f containers/compose/development/docker-compose.yml build
podman-up: ## Start the development environment
@echo "🚀 Starting the development environment..."
@podman-compose -f containers/compose/development/docker-compose.yml up -d
podman-down: ## Stop the development environment
@echo "🛑 Stopping the development environment..."
@podman-compose -f containers/compose/development/docker-compose.yml down
# Tests
test: ## Run tests
@echo "🧪 Running tests..."
@bash scripts/testing/test-runner.sh
test-mcp: ## Run the MCP server tests
@echo "🧪 Running the MCP server tests..."
@bash scripts/testing/mcp/test_local_mcp_servers.sh
test-kali: ## Run the Kali Linux quick health check
@echo "🧪 Running the Kali Linux quick health check..."
@cd configuration && ansible-playbook -i inventories/production/inventory.ini playbooks/test/kali-health-check.yml
test-kali-security: ## Run the Kali Linux security tools tests
@echo "🧪 Running the Kali Linux security tools tests..."
@cd configuration && ansible-playbook -i inventories/production/inventory.ini playbooks/test/kali-security-tools.yml
test-kali-full: ## Run the full Kali Linux test suite
@echo "🧪 Running the full Kali Linux test suite..."
@cd configuration && ansible-playbook playbooks/test/kali-full-test-suite.yml
lint: ## Lint the code
@echo "🔍 Linting..."
@bash scripts/ci-cd/quality/lint.sh
# Documentation
docs: ## Generate documentation
@echo "📚 Generating documentation..."
@bash scripts/ci-cd/build/generate-docs.sh
# Cleanup
clean: ## Clean up temporary files
@echo "🧹 Cleaning up temporary files..."
@find . -name "*.tfstate*" -delete
@find . -name ".terraform" -type d -exec rm -rf {} + 2>/dev/null || true
@podman system prune -f
# Backup
backup: ## Create a backup
@echo "💾 Creating a backup..."
@bash scripts/utilities/backup/backup-all.sh
# Monitoring
monitor: ## Start monitoring
@echo "📊 Starting monitoring..."
@podman-compose -f containers/compose/production/monitoring.yml up -d
# Security scan
security-scan: ## Run a security scan
@echo "🔒 Running a security scan..."
@bash scripts/ci-cd/quality/security-scan.sh

View File

@@ -0,0 +1,20 @@
[defaults]
inventory = inventory.ini
host_key_checking = False
forks = 8
timeout = 30
gathering = smart
fact_caching = memory
# Support the new playbooks directory layout
roles_path = playbooks/
collections_path = playbooks/
# Enable SSH key authentication
ansible_ssh_common_args = '-o PreferredAuthentications=publickey -o PubkeyAuthentication=yes'
[ssh_connection]
ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o StrictHostKeyChecking=no -o PreferredAuthentications=publickey -o PubkeyAuthentication=yes
pipelining = True
[inventory]
# Enable plugins to support dynamic inventories
enable_plugins = host_list, script, auto, yaml, ini, toml

View File

@@ -0,0 +1,57 @@
---
- name: Clean up Consul configuration from dedicated clients
hosts: hcp1,influxdb1,browser
become: yes
tasks:
- name: Stop Consul service
systemd:
name: consul
state: stopped
enabled: no
- name: Disable Consul service
systemd:
name: consul
enabled: no
- name: Kill any remaining Consul processes
shell: |
pkill -f consul || true
sleep 2
pkill -9 -f consul || true
ignore_errors: yes
- name: Remove Consul systemd service file
file:
path: /etc/systemd/system/consul.service
state: absent
- name: Remove Consul configuration directory
file:
path: /etc/consul.d
state: absent
- name: Remove Consul data directory
file:
path: /opt/consul
state: absent
- name: Reload systemd daemon
systemd:
daemon_reload: yes
- name: Verify Consul is stopped
shell: |
if pgrep -f consul; then
echo "Consul still running"
exit 1
else
echo "Consul stopped successfully"
fi
register: consul_status
failed_when: consul_status.rc != 0
- name: Display cleanup status
debug:
msg: "Consul cleanup completed on {{ inventory_hostname }}"

View File

@@ -0,0 +1,55 @@
---
- name: Configure Consul Auto-Discovery
hosts: all
become: yes
vars:
consul_servers:
- "warden.tailnet-68f9.ts.net:8301"
- "ch4.tailnet-68f9.ts.net:8301"
- "ash3c.tailnet-68f9.ts.net:8301"
tasks:
- name: Backup current nomad.hcl
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.backup.{{ ansible_date_time.epoch }}
remote_src: yes
backup: yes
- name: Update Consul configuration for auto-discovery
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} ANSIBLE MANAGED CONSUL CONFIG"
block: |
consul {
retry_join = [
"warden.tailnet-68f9.ts.net:8301",
"ch4.tailnet-68f9.ts.net:8301",
"ash3c.tailnet-68f9.ts.net:8301"
]
server_service_name = "nomad"
client_service_name = "nomad-client"
}
insertbefore: '^consul \{'
- name: Restart Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30
- name: Verify Consul connection
shell: |
NOMAD_ADDR=http://localhost:4646 nomad node status | grep -q "ready"
register: nomad_ready
failed_when: nomad_ready.rc != 0
retries: 3
delay: 10

View File

@@ -0,0 +1,75 @@
---
- name: Remove Consul configuration from Nomad servers
hosts: semaphore,ash1d,ash2e,ch2,ch3,onecloud1,de
become: yes
tasks:
- name: Remove entire Consul configuration block
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} ANSIBLE MANAGED CONSUL CONFIG"
state: absent
- name: Remove Consul configuration lines
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^consul \{'
state: absent
- name: Remove Consul configuration content
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ address ='
state: absent
- name: Remove Consul service names
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ server_service_name ='
state: absent
- name: Remove Consul client service name
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ client_service_name ='
state: absent
- name: Remove Consul auto-advertise
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ auto_advertise ='
state: absent
- name: Remove Consul server auto-join
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ server_auto_join ='
state: absent
- name: Remove Consul client auto-join
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ client_auto_join ='
state: absent
- name: Remove Consul closing brace
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^}'
state: absent
- name: Restart Nomad service
systemd:
name: nomad
state: restarted
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30
- name: Display completion message
debug:
msg: "Removed Consul configuration from {{ inventory_hostname }}"

View File

@@ -0,0 +1,32 @@
---
- name: Enable Nomad Client Mode on Servers
hosts: ch2,ch3,de
become: yes
tasks:
- name: Enable Nomad client mode
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^client \{'
line: 'client {'
state: present
- name: Enable client mode
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ enabled = false'
line: ' enabled = true'
state: present
- name: Restart Nomad service
systemd:
name: nomad
state: restarted
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30

View File

@@ -0,0 +1,38 @@
client {
enabled = true
# Addresses of the seven server nodes (the "seven sisters")
servers = [
"100.116.158.95:4647", # bj-semaphore
"100.81.26.3:4647", # ash1d
"100.103.147.94:4647", # ash2e
"100.90.159.68:4647", # ch2
"100.86.141.112:4647", # ch3
"100.98.209.50:4647", # bj-onecloud1
"100.120.225.29:4647" # de
]
host_volume "fnsync" {
path = "/mnt/fnsync"
read_only = false
}
# Disable the Docker driver; use Podman only
options {
"driver.raw_exec.enable" = "1"
"driver.exec.enable" = "1"
}
plugin_dir = "/opt/nomad/plugins"
}
# Configure the Podman driver
plugin "podman" {
config {
volumes {
enabled = true
}
logging {
type = "journald"
}
gc {
container = true
}
}
}

View File

@@ -0,0 +1,62 @@
---
- name: Fix all master references to ch4
hosts: localhost
gather_facts: no
vars:
files_to_fix:
- "scripts/diagnose-consul-sync.sh"
- "scripts/register-traefik-to-all-consul.sh"
- "deployment/ansible/playbooks/update-nomad-consul-config.yml"
- "deployment/ansible/templates/nomad-server.hcl.j2"
- "deployment/ansible/templates/nomad-client.hcl"
- "deployment/ansible/playbooks/fix-nomad-consul-roles.yml"
- "deployment/ansible/onecloud1_nomad.hcl"
- "ansible/templates/consul-client.hcl.j2"
- "ansible/consul-client-deployment.yml"
- "ansible/consul-client-simple.yml"
tasks:
- name: Replace master.tailnet-68f9.ts.net with ch4.tailnet-68f9.ts.net
replace:
path: "{{ item }}"
regexp: 'master\.tailnet-68f9\.ts\.net'
replace: 'ch4.tailnet-68f9.ts.net'
loop: "{{ files_to_fix }}"
when: item is file
- name: Replace master hostname references
replace:
path: "{{ item }}"
regexp: '\bmaster\b'
replace: 'ch4'
loop: "{{ files_to_fix }}"
when: item is file
- name: Replace master IP references in comments
replace:
path: "{{ item }}"
regexp: '# master'
replace: '# ch4'
loop: "{{ files_to_fix }}"
when: item is file
- name: Fix inventory files
replace:
path: "{{ item }}"
regexp: 'master ansible_host=master'
replace: 'ch4 ansible_host=ch4'
loop:
- "deployment/ansible/inventories/production/inventory.ini"
- "deployment/ansible/inventories/production/csol-consul-nodes.ini"
- "deployment/ansible/inventories/production/nomad-clients.ini"
- "deployment/ansible/inventories/production/master-ash3c.ini"
- "deployment/ansible/inventories/production/consul-nodes.ini"
- "deployment/ansible/inventories/production/vault.ini"
- name: Fix IP address references (100.117.106.136 comments)
replace:
path: "{{ item }}"
regexp: '100\.117\.106\.136.*# master'
replace: '100.117.106.136 # ch4'
loop: "{{ files_to_fix }}"
when: item is file

View File

@@ -0,0 +1,2 @@
ansible_ssh_pass: "3131"
ansible_become_pass: "3131"

View File

@@ -0,0 +1,108 @@
# CSOL Consul Static Node Configuration Guide
## Overview
This directory contains the static Consul node configuration for CSOL (Cloud Service Operations Layer). These files describe the server and client nodes of the Consul cluster so that team members can quickly understand and work with it.
## Configuration Files
### 1. csol-consul-nodes.ini
The primary Consul node configuration file, with details for every server and client node.
**File structure:**
- `[consul_servers]` - Consul server nodes (7 nodes)
- `[consul_clients]` - Consul client nodes (2 nodes)
- `[consul_cluster:children]` - group combining all cluster nodes
- `[consul_servers:vars]` - common settings for server nodes
- `[consul_clients:vars]` - common settings for client nodes
- `[consul_cluster:vars]` - common settings for the whole cluster
**Usage:**
```bash
# Run an Ansible playbook against this inventory
ansible-playbook -i csol-consul-nodes.ini your-playbook.yml
```
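
As a quick pre-flight check, an ad-hoc ping against the groups defined in this inventory should confirm that the nodes are reachable. This is a sketch only, using the group names listed above:

```bash
# Verify connectivity to every node in the cluster group
ansible -i csol-consul-nodes.ini consul_cluster -m ping

# Or limit the check to the server nodes
ansible -i csol-consul-nodes.ini consul_servers -m ping
```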
### 2. csol-consul-nodes.json
A JSON version of the Consul node configuration, convenient for programs to read and process.
**File structure:**
- `servers` - list of server nodes
- `clients` - list of client nodes
- `configuration` - cluster configuration
- `notes` - node statistics and remarks
**Usage:**
```bash
# Query the JSON file with jq
jq '.csol_consul_nodes.servers.nodes[].name' csol-consul-nodes.json
# Process the JSON file with Python
python3 -c "import json; data=json.load(open('csol-consul-nodes.json')); print(data['csol_consul_nodes']['servers']['nodes'])"
```
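
A couple of further jq queries that should work against the same structure (field names as described in the file structure above):

```bash
# List each server node as "name host"
jq -r '.csol_consul_nodes.servers.nodes[] | "\(.name) \(.host)"' csol-consul-nodes.json

# Show the cluster-wide configuration block
jq '.csol_consul_nodes.configuration' csol-consul-nodes.json
```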
### 3. consul-nodes.ini
The updated Consul node configuration file; it replaces the previous version.
### 4. consul-cluster.ini
Configuration for the Consul cluster server nodes, used mainly for cluster deployment and management.
## Node List
### Server nodes (7)
| Node | IP address | Region | Role |
|------|------------|--------|------|
| ch2 | 100.90.159.68 | Oracle Cloud KR | Server |
| ch3 | 100.86.141.112 | Oracle Cloud KR | Server |
| ash1d | 100.81.26.3 | Oracle Cloud US | Server |
| ash2e | 100.103.147.94 | Oracle Cloud US | Server |
| onecloud1 | 100.98.209.50 | Armbian | Server |
| de | 100.120.225.29 | Armbian | Server |
| bj-semaphore | 100.116.158.95 | Semaphore | Server |
### Client nodes (2)
| Node | IP address | Port | Region | Role |
|------|------------|------|--------|------|
| master | 100.117.106.136 | 60022 | Oracle Cloud A1 | Client |
| ash3c | 100.116.80.94 | - | Oracle Cloud A1 | Client |
## Configuration Parameters
### Common settings
- `consul_version`: 1.21.5
- `datacenter`: dc1
- `encrypt_key`: 1EvGItLOB8nuHnSA0o+rO0zXzLeJl+U+Jfvuw0+H848=
- `client_addr`: 0.0.0.0
- `data_dir`: /opt/consul/data
- `config_dir`: /etc/consul.d
- `log_level`: INFO
- `port`: 8500
### Server-specific settings
- `consul_server`: true
- `bootstrap_expect`: 7
- `ui_config`: true
### Client-specific settings
- `consul_server`: false
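
As a rough illustration only, the parameters above correspond to starting a Consul server agent roughly as follows; the actual deployment is driven by the Ansible playbooks and the files under /etc/consul.d, so treat this as a sketch:

```bash
# Example Consul server start-up reflecting the parameters listed above
consul agent -server -ui \
  -datacenter=dc1 \
  -bootstrap-expect=7 \
  -client=0.0.0.0 \
  -data-dir=/opt/consul/data \
  -config-dir=/etc/consul.d \
  -log-level=INFO \
  -encrypt="<gossip encryption key>"
```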
## Notes
1. **Retired node**: the hcs node was retired on 2025-09-27 and is no longer included in this configuration.
2. **Faulty node**: the syd node is faulty and has been isolated; it is not included in this configuration.
3. **Ports**: the master node uses SSH port 60022; all other nodes use the default SSH port.
4. **Credentials**: all nodes share the same credentials (user ben, password 3131).
5. **bootstrap_expect**: set to 7, meaning seven server nodes are expected to form the cluster.
## Changelog
- 2025-06-17: initial version with the complete CSOL Consul node configuration.
## Maintenance
1. When adding a new node, update all configuration files at the same time.
2. When a node is retired or fails, remove it from the configuration promptly and update these notes.
3. Periodically verify node reachability and configuration correctness.
4. After changing the configuration, update this README accordingly.

View File

@@ -0,0 +1,47 @@
# CSOL Consul cluster inventory - updated: 2025-06-17
# This file contains all CSOL Consul server node information
[consul_servers]
# Oracle Cloud Korea region (KR)
ch2 ansible_host=100.90.159.68 ansible_user=ben ansible_password=3131 ansible_become_password=3131
ch3 ansible_host=100.86.141.112 ansible_user=ben ansible_password=3131 ansible_become_password=3131
# Oracle Cloud US region (US)
ash1d ansible_host=100.81.26.3 ansible_user=ben ansible_password=3131 ansible_become_password=3131
ash2e ansible_host=100.103.147.94 ansible_user=ben ansible_password=3131 ansible_become_password=3131
# Armbian nodes
onecloud1 ansible_host=100.98.209.50 ansible_user=ben ansible_password=3131 ansible_become_password=3131
de ansible_host=100.120.225.29 ansible_user=ben ansible_password=3131 ansible_become_password=3131
# Semaphore node
bj-semaphore ansible_host=100.116.158.95 ansible_user=root
[consul_cluster:children]
consul_servers
[consul_servers:vars]
# Consul server settings
ansible_ssh_common_args='-o StrictHostKeyChecking=no'
consul_version=1.21.5
consul_datacenter=dc1
consul_encrypt_key=1EvGItLOB8nuHnSA0o+rO0zXzLeJl+U+Jfvuw0+H848=
consul_bootstrap_expect=7
consul_server=true
consul_ui_config=true
consul_client_addr=0.0.0.0
consul_bind_addr="{{ ansible_default_ipv4.address }}"
consul_data_dir=/opt/consul/data
consul_config_dir=/etc/consul.d
consul_log_level=INFO
consul_port=8500
# === Node notes ===
# Server nodes (7):
# - Oracle Cloud KR: ch2, ch3
# - Oracle Cloud US: ash1d, ash2e
# - Armbian: onecloud1, de
# - Semaphore: bj-semaphore
#
# Note: the hcs node was retired (2025-09-27)
# Note: the syd node is faulty and has been isolated

View File

@@ -0,0 +1,65 @@
# CSOL Consul static node configuration
# Updated: 2025-06-17 (based on the actual Consul cluster state)
# This file contains all CSOL server and client node information
[consul_servers]
# Primary server nodes (all in server mode)
master ansible_host=100.117.106.136 ansible_user=ben ansible_password=3131 ansible_become_password=3131 ansible_port=60022
ash3c ansible_host=100.116.80.94 ansible_user=ben ansible_password=3131 ansible_become_password=3131
warden ansible_host=100.122.197.112 ansible_user=ben ansible_password=3131 ansible_become_password=3131
[consul_clients]
# Client nodes
bj-warden ansible_host=100.122.197.112 ansible_user=ben ansible_password=3131 ansible_become_password=3131
bj-hcp2 ansible_host=100.116.112.45 ansible_user=root ansible_password=313131 ansible_become_password=313131
bj-influxdb ansible_host=100.100.7.4 ansible_user=root ansible_password=313131 ansible_become_password=313131
bj-hcp1 ansible_host=100.97.62.111 ansible_user=root ansible_password=313131 ansible_become_password=313131
[consul_cluster:children]
consul_servers
consul_clients
[consul_servers:vars]
# Consul server settings
consul_server=true
consul_bootstrap_expect=3
consul_datacenter=dc1
consul_encrypt_key=1EvGItLOB8nuHnSA0o+rO0zXzLeJl+U+Jfvuw0+H848=
consul_client_addr=0.0.0.0
consul_bind_addr="{{ ansible_default_ipv4.address }}"
consul_data_dir=/opt/consul/data
consul_config_dir=/etc/consul.d
consul_log_level=INFO
consul_port=8500
consul_ui_config=true
[consul_clients:vars]
# Consul client settings
consul_server=false
consul_datacenter=dc1
consul_encrypt_key=1EvGItLOB8nuHnSA0o+rO0zXzLeJl+U+Jfvuw0+H848=
consul_client_addr=0.0.0.0
consul_bind_addr="{{ ansible_default_ipv4.address }}"
consul_data_dir=/opt/consul/data
consul_config_dir=/etc/consul.d
consul_log_level=INFO
[consul_cluster:vars]
# Common settings
ansible_ssh_common_args='-o StrictHostKeyChecking=no'
ansible_ssh_private_key_file=~/.ssh/id_ed25519
consul_version=1.21.5
# === Node notes ===
# Server nodes (3):
# - bj-semaphore: 100.116.158.95 (primary server node)
# - kr-master: 100.117.106.136 (Korea master node)
# - us-ash3c: 100.116.80.94 (US server node)
#
# Client nodes (4):
# - bj-warden: 100.122.197.112 (Beijing client node)
# - bj-hcp2: 100.116.112.45 (Beijing HCP client node 2)
# - bj-influxdb: 100.100.7.4 (Beijing InfluxDB client node)
# - bj-hcp1: 100.97.62.111 (Beijing HCP client node 1)
#
# Note: this configuration reflects the actual Consul cluster and contains 3 server nodes

View File

@@ -0,0 +1,44 @@
# Consul static node configuration
# This file contains all CSOL server and client node information
# Updated: 2025-06-17 (based on the actual Consul cluster state)
# === CSOL server nodes ===
# These nodes run Consul in server mode and take part in cluster consensus and data storage
[consul_servers]
# Primary server nodes (all in server mode)
master ansible_host=100.117.106.136 ansible_user=ben ansible_password=3131 ansible_become_password=3131 ansible_port=60022
ash3c ansible_host=100.116.80.94 ansible_user=ben ansible_password=3131 ansible_become_password=3131
warden ansible_host=100.122.197.112 ansible_user=ben ansible_password=3131 ansible_become_password=3131
# === Node groups ===
[consul_cluster:children]
consul_servers
[consul_servers:vars]
# Consul server settings
consul_server=true
consul_bootstrap_expect=3
consul_datacenter=dc1
consul_encrypt_key=1EvGItLOB8nuHnSA0o+rO0zXzLeJl+U+Jfvuw0+H848=
consul_client_addr=0.0.0.0
consul_bind_addr="{{ ansible_default_ipv4.address }}"
consul_data_dir=/opt/consul/data
consul_config_dir=/etc/consul.d
consul_log_level=INFO
consul_port=8500
consul_ui_config=true
[consul_cluster:vars]
# Common settings
ansible_ssh_common_args='-o StrictHostKeyChecking=no'
consul_version=1.21.5
# === Node notes ===
# Server nodes (3):
# - master: 100.117.106.136 (Korea master node)
# - ash3c: 100.116.80.94 (US server node)
# - warden: 100.122.197.112 (Beijing server node, current cluster leader)
#
# Note: this configuration reflects the actual Consul cluster; all nodes run in server mode

View File

@@ -0,0 +1,126 @@
{
"csol_consul_nodes": {
"updated_at": "2025-06-17",
"description": "CSOL Consul static node configuration",
"servers": {
"description": "Consul server nodes; they take part in cluster consensus and data storage",
"nodes": [
{
"name": "ch2",
"host": "100.90.159.68",
"user": "ben",
"password": "3131",
"become_password": "3131",
"region": "Oracle Cloud KR",
"role": "server"
},
{
"name": "ch3",
"host": "100.86.141.112",
"user": "ben",
"password": "3131",
"become_password": "3131",
"region": "Oracle Cloud KR",
"role": "server"
},
{
"name": "ash1d",
"host": "100.81.26.3",
"user": "ben",
"password": "3131",
"become_password": "3131",
"region": "Oracle Cloud US",
"role": "server"
},
{
"name": "ash2e",
"host": "100.103.147.94",
"user": "ben",
"password": "3131",
"become_password": "3131",
"region": "Oracle Cloud US",
"role": "server"
},
{
"name": "onecloud1",
"host": "100.98.209.50",
"user": "ben",
"password": "3131",
"become_password": "3131",
"region": "Armbian",
"role": "server"
},
{
"name": "de",
"host": "100.120.225.29",
"user": "ben",
"password": "3131",
"become_password": "3131",
"region": "Armbian",
"role": "server"
},
{
"name": "bj-semaphore",
"host": "100.116.158.95",
"user": "root",
"region": "Semaphore",
"role": "server"
}
]
},
"clients": {
"description": "Consul client nodes; used for service discovery and health checks",
"nodes": [
{
"name": "ch4",
"host": "100.117.106.136",
"user": "ben",
"password": "3131",
"become_password": "3131",
"port": 60022,
"region": "Oracle Cloud A1",
"role": "client"
},
{
"name": "ash3c",
"host": "100.116.80.94",
"user": "ben",
"password": "3131",
"become_password": "3131",
"region": "Oracle Cloud A1",
"role": "client"
}
]
},
"configuration": {
"consul_version": "1.21.5",
"datacenter": "dc1",
"encrypt_key": "1EvGItLOB8nuHnSA0o+rO0zXzLeJl+U+Jfvuw0+H848=",
"client_addr": "0.0.0.0",
"data_dir": "/opt/consul/data",
"config_dir": "/etc/consul.d",
"log_level": "INFO",
"port": 8500,
"bootstrap_expect": 7,
"ui_config": true
},
"notes": {
"server_count": 7,
"client_count": 2,
"total_nodes": 9,
"retired_nodes": [
{
"name": "hcs",
"retired_date": "2025-09-27",
"reason": "node retired"
}
],
"isolated_nodes": [
{
"name": "syd",
"reason": "faulty node, isolated"
}
]
}
}
}

View File

@@ -0,0 +1,20 @@
# Nomad cluster global configuration
# InfluxDB 2.x + Grafana monitoring configuration
# InfluxDB 2.x connection settings
influxdb_url: "http://influxdb1.tailnet-68f9.ts.net:8086"
influxdb_token: "VU_dOCVZzqEHb9jSFsDe0bJlEBaVbiG4LqfoczlnmcbfrbmklSt904HJPL4idYGvVi0c2eHkYDi2zCTni7Ay4w=="
influxdb_org: "seekkey" # Organization name
influxdb_bucket: "VPS" # Bucket name
# Remote Telegraf configuration URL
telegraf_config_url: "http://influxdb1.tailnet-68f9.ts.net:8086/api/v2/telegrafs/0f8a73496790c000"
# Monitoring settings
disk_usage_warning: 80 # Disk usage warning threshold (%)
disk_usage_critical: 90 # Disk usage critical threshold (%)
collection_interval: 30 # Collection interval (seconds)
# Telegraf tuning
telegraf_log_level: "ERROR" # Log errors only
telegraf_disable_local_logs: true # Disable the local log file

View File

@@ -0,0 +1,37 @@
[nomad_servers]
# Server nodes (7 server nodes)
# ⚠️ Warning: with great power comes great responsibility; treat server node operations with extreme caution!
# ⚠️ Any operation on a server node can affect the stability of the entire cluster!
semaphore ansible_host=127.0.0.1 ansible_user=root ansible_password=3131 ansible_become_password=3131 ansible_ssh_common_args="-o PreferredAuthentications=password -o PubkeyAuthentication=no"
ash1d ansible_host=ash1d.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
ash2e ansible_host=ash2e.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
ch2 ansible_host=ch2.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
ch3 ansible_host=ch3.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
onecloud1 ansible_host=onecloud1.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
de ansible_host=de.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
hcp1 ansible_host=hcp1.tailnet-68f9.ts.net ansible_user=root ansible_password=3131 ansible_become_password=3131
[nomad_clients]
# Client nodes (5 client nodes)
ch4 ansible_host=ch4.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
ash3c ansible_host=ash3c.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
browser ansible_host=browser.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
influxdb1 ansible_host=influxdb1.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
warden ansible_host=warden.tailnet-68f9.ts.net ansible_user=ben ansible_password=3131 ansible_become_password=3131
[nomad_nodes:children]
nomad_servers
nomad_clients
[nomad_nodes:vars]
# NFS settings
nfs_server=snail
nfs_share=/fs/1000/nfs/Fnsync
mount_point=/mnt/fnsync
# Ansible settings
ansible_ssh_common_args='-o StrictHostKeyChecking=no'
gitea ansible_host=gitea ansible_user=ben ansible_password=3131 ansible_become_password=3131
[gitea]
gitea ansible_host=gitea ansible_user=ben ansible_password=3131 ansible_become_password=3131

View File

@@ -0,0 +1,98 @@
[dev]
dev1 ansible_host=dev1 ansible_user=ben ansible_become=yes ansible_become_pass=3131
dev2 ansible_host=dev2 ansible_user=ben ansible_become=yes ansible_become_pass=3131
[oci_kr]
#ch2 ansible_host=ch2 ansible_user=ben ansible_become=yes ansible_become_pass=3131 # Stale node, removed (2025-09-30)
#ch3 ansible_host=ch3 ansible_user=ben ansible_become=yes ansible_become_pass=3131 # Stale node, removed (2025-09-30)
[oci_us]
ash1d ansible_host=ash1d ansible_user=ben ansible_become=yes ansible_become_pass=3131
ash2e ansible_host=ash2e ansible_user=ben ansible_become=yes ansible_become_pass=3131
[oci_a1]
ch4 ansible_host=ch4 ansible_user=ben ansible_become=yes ansible_become_pass=3131
ash3c ansible_host=ash3c ansible_user=ben ansible_become=yes ansible_become_pass=3131
[huawei]
# hcs node retired (2025-09-27)
[google]
benwork ansible_host=benwork ansible_user=ben ansible_become=yes ansible_become_pass=3131
[ditigalocean]
# syd ansible_host=syd ansible_user=ben ansible_become=yes ansible_become_pass=3131 # Faulty node, isolated
[faulty_cloud_servers]
# Faulty cloud server nodes; to be resolved via OpenTofu and Consul
# hcs node retired (2025-09-27)
syd ansible_host=syd ansible_user=ben ansible_become=yes ansible_become_pass=3131
[aws]
#aws linux dnf
awsirish ansible_host=awsirish ansible_user=ben ansible_become=yes ansible_become_pass=3131
[proxmox]
pve ansible_host=pve ansible_user=root ansible_become=yes ansible_become_pass=Aa313131@ben
xgp ansible_host=xgp ansible_user=root ansible_become=yes ansible_become_pass=Aa313131@ben
nuc12 ansible_host=nuc12 ansible_user=root ansible_become=yes ansible_become_pass=Aa313131@ben
[lxc]
# Concentrated on three machines; do not upgrade them all at once or they will die, schedule sequentially (Debian/Ubuntu containers using apt)
gitea ansible_host=gitea.tailnet-68f9.ts.net ansible_user=ben ansible_ssh_private_key_file=/root/.ssh/gitea ansible_become=yes ansible_become_pass=3131
mysql ansible_host=mysql ansible_user=root ansible_become=yes ansible_become_pass=313131
postgresql ansible_host=postgresql ansible_user=root ansible_become=yes ansible_become_pass=313131
[nomadlxc]
influxdb ansible_host=influxdb1 ansible_user=root ansible_become=yes ansible_become_pass=313131
warden ansible_host=warden ansible_user=ben ansible_become=yes ansible_become_pass=3131
[semaphore]
#semaphoressh ansible_host=localhost ansible_user=root ansible_become=yes ansible_become_pass=313131 ansible_ssh_pass=313131 # Stale node, removed (2025-09-30)
[alpine]
#Alpine Linux containers using apk package manager
redis ansible_host=redis ansible_user=root ansible_become=yes ansible_become_pass=313131
authentik ansible_host=authentik ansible_user=root ansible_become=yes ansible_become_pass=313131
calibreweb ansible_host=calibreweb ansible_user=root ansible_become=yes ansible_become_pass=313131
qdrant ansible_host=qdrant ansible_user=root ansible_become=yes
[vm]
kali ansible_host=kali ansible_user=ben ansible_become=yes ansible_become_pass=3131
[hcp]
hcp1 ansible_host=hcp1 ansible_user=root ansible_become=yes ansible_become_pass=313131
hcp2 ansible_host=hcp2 ansible_user=root ansible_become=yes ansible_become_pass=313131
[feiniu]
snail ansible_host=snail ansible_user=houzhongxu ansible_ssh_pass=Aa313131@ben ansible_become=yes ansible_become_pass=Aa313131@ben
[armbian]
onecloud1 ansible_host=100.98.209.50 ansible_user=ben ansible_password=3131 ansible_become_password=3131
de ansible_host=100.120.225.29 ansible_user=ben ansible_password=3131 ansible_become_password=3131
[beijing:children]
nomadlxc
hcp
[all:vars]
ansible_ssh_common_args='-o StrictHostKeyChecking=no'
[nomad_clients:children]
nomadlxc
hcp
oci_a1
huawei
ditigalocean
[nomad_servers:children]
oci_us
oci_kr
semaphore
armbian
[nomad_cluster:children]
nomad_servers
nomad_clients
[beijing:children]
nomadlxc
hcp

View File

@@ -0,0 +1,7 @@
[target_nodes]
master ansible_host=100.117.106.136 ansible_port=60022 ansible_user=ben ansible_become=yes ansible_become_pass=3131
ash3c ansible_host=100.116.80.94 ansible_user=ben ansible_become=yes ansible_become_pass=3131
semaphore ansible_host=100.116.158.95 ansible_user=ben ansible_become=yes ansible_become_pass=3131
[target_nodes:vars]
ansible_ssh_common_args='-o StrictHostKeyChecking=no'

View File

@@ -0,0 +1,14 @@
# Nomad client node configuration
# This file lists the 6 nodes to be configured as Nomad clients
[nomad_clients]
bj-hcp1 ansible_host=bj-hcp1 ansible_user=root ansible_password=313131 ansible_become_password=313131
bj-influxdb ansible_host=bj-influxdb ansible_user=root ansible_password=313131 ansible_become_password=313131
bj-warden ansible_host=bj-warden ansible_user=ben ansible_password=3131 ansible_become_password=3131
bj-hcp2 ansible_host=bj-hcp2 ansible_user=root ansible_password=313131 ansible_become_password=313131
kr-master ansible_host=master ansible_port=60022 ansible_user=ben ansible_password=3131 ansible_become_password=3131
us-ash3c ansible_host=ash3c ansible_user=ben ansible_password=3131 ansible_become_password=3131
[nomad_clients:vars]
ansible_ssh_common_args='-o StrictHostKeyChecking=no'
client_ip="{{ ansible_host }}"

View File

@@ -0,0 +1,12 @@
[consul_servers:children]
nomad_servers
[consul_servers:vars]
consul_cert_dir=/etc/consul.d/certs
consul_ca_src=security/certificates/ca.pem
consul_cert_src=security/certificates/consul-server.pem
consul_key_src=security/certificates/consul-server-key.pem
[nomad_cluster:children]
nomad_servers
nomad_clients

View File

@@ -0,0 +1,7 @@
[vault_servers]
master ansible_host=100.117.106.136 ansible_user=ben ansible_password=3131 ansible_become_password=3131 ansible_port=60022
ash3c ansible_host=100.116.80.94 ansible_user=ben ansible_password=3131 ansible_become_password=3131
warden ansible_host=warden ansible_user=ben ansible_become=yes ansible_become_pass=3131
[vault_servers:vars]
ansible_ssh_common_args='-o StrictHostKeyChecking=no'

View File

@@ -0,0 +1,50 @@
datacenter = "dc1"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "onecloud1"
bind_addr = "100.98.209.50"
addresses {
http = "100.98.209.50"
rpc = "100.98.209.50"
serf = "100.98.209.50"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = true
bootstrap_expect = 3
retry_join = ["100.81.26.3", "100.103.147.94", "100.90.159.68", "100.86.141.112", "100.98.209.50", "100.120.225.29"]
}
client {
enabled = false
}
plugin "nomad-driver-podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden
}
vault {
enabled = true
address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden
token = "hvs.A5Fu4E1oHyezJapVllKPFsWg"
create_from_role = "nomad-cluster"
tls_skip_verify = true
}

View File

@@ -0,0 +1,202 @@
---
- name: Add Warden Server as Nomad Client to Cluster
hosts: warden
become: yes
gather_facts: yes
vars:
nomad_plugin_dir: "/opt/nomad/plugins"
nomad_datacenter: "dc1"
nomad_region: "global"
nomad_servers:
- "100.117.106.136:4647"
- "100.116.80.94:4647"
- "100.97.62.111:4647"
- "100.116.112.45:4647"
- "100.84.197.26:4647"
tasks:
- name: Show the node being processed
debug:
msg: "🔧 Adding the warden server as a Nomad client: {{ inventory_hostname }}"
- name: Check whether Nomad is installed
shell: which nomad || echo "not_found"
register: nomad_check
changed_when: false
- name: Download and install Nomad
block:
- name: Download Nomad 1.10.5
get_url:
url: "https://releases.hashicorp.com/nomad/1.10.5/nomad_1.10.5_linux_amd64.zip"
dest: "/tmp/nomad.zip"
mode: '0644'
- name: Unpack and install Nomad
unarchive:
src: "/tmp/nomad.zip"
dest: "/usr/local/bin/"
remote_src: yes
owner: root
group: root
mode: '0755'
- name: Clean up temporary files
file:
path: "/tmp/nomad.zip"
state: absent
when: nomad_check.stdout == "not_found"
- name: Verify the Nomad installation
shell: nomad version
register: nomad_version_output
- name: Create the Nomad configuration directory
file:
path: /etc/nomad.d
state: directory
owner: root
group: root
mode: '0755'
- name: Create the Nomad data directory
file:
path: /opt/nomad/data
state: directory
owner: nomad
group: nomad
mode: '0755'
ignore_errors: yes
- name: Create the Nomad plugin directory
file:
path: "{{ nomad_plugin_dir }}"
state: directory
owner: nomad
group: nomad
mode: '0755'
ignore_errors: yes
- name: Get the server IP address
shell: |
ip route get 1.1.1.1 | grep -oP 'src \K\S+'
register: server_ip_result
changed_when: false
- name: Set the server IP variable
set_fact:
server_ip: "{{ server_ip_result.stdout }}"
- name: Stop the Nomad service (if running)
systemd:
name: nomad
state: stopped
ignore_errors: yes
- name: Create the Nomad client configuration file
copy:
content: |
# Nomad Client Configuration for warden
datacenter = "{{ nomad_datacenter }}"
data_dir = "/opt/nomad/data"
log_level = "INFO"
bind_addr = "{{ server_ip }}"
server {
enabled = false
}
client {
enabled = true
servers = [
{% for server in nomad_servers %}"{{ server }}"{% if not loop.last %}, {% endif %}{% endfor %}
]
}
plugin_dir = "{{ nomad_plugin_dir }}"
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "127.0.0.1:8500"
}
dest: /etc/nomad.d/nomad.hcl
owner: root
group: root
mode: '0644'
- name: Validate the Nomad configuration
shell: nomad config validate /etc/nomad.d/nomad.hcl
register: nomad_validate
failed_when: nomad_validate.rc != 0
- name: Create the Nomad systemd unit file
copy:
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/docs/
Wants=network-online.target
After=network-online.target
[Service]
Type=notify
User=root
Group=root
ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
KillSignal=SIGINT
TimeoutStopSec=5
LimitNOFILE=65536
LimitNPROC=32768
Restart=on-failure
RestartSec=2
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/nomad.service
mode: '0644'
- name: Reload the systemd configuration
systemd:
daemon_reload: yes
- name: Start and enable the Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for the Nomad service to start
wait_for:
port: 4646
host: "{{ server_ip }}"
delay: 5
timeout: 60
- name: Check the Nomad client status
shell: nomad node status -self
register: nomad_node_status
retries: 5
delay: 5
until: nomad_node_status.rc == 0
ignore_errors: yes
- name: Show the Nomad client configuration result
debug:
msg: |
✅ The warden server has been configured as a Nomad client
📦 Nomad version: {{ nomad_version_output.stdout.split('\n')[0] }}
🌐 Server IP: {{ server_ip }}
🏗️ Datacenter: {{ nomad_datacenter }}
📊 Client status: {{ 'SUCCESS' if nomad_node_status.rc == 0 else 'PENDING' }}
🚀 warden is now part of the Nomad cluster

View File

@@ -0,0 +1,22 @@
---
- name: Thorough cleanup of Nomad configuration backup files
hosts: nomad_nodes
become: yes
tasks:
- name: Remove all backup files with various patterns
shell: |
find /etc/nomad.d/ -name "nomad.hcl.*" -not -name "nomad.hcl" -delete
find /etc/nomad.d/ -name "*.bak" -delete
find /etc/nomad.d/ -name "*.backup*" -delete
find /etc/nomad.d/ -name "*.~" -delete
find /etc/nomad.d/ -name "*.broken" -delete
ignore_errors: yes
- name: List remaining files in /etc/nomad.d/
command: ls -la /etc/nomad.d/
register: remaining_files
changed_when: false
- name: Display remaining files
debug:
var: remaining_files.stdout_lines

View File

@@ -0,0 +1,25 @@
---
- name: Cleanup Nomad configuration backup files
hosts: nomad_nodes
become: yes
tasks:
- name: Remove backup files from /etc/nomad.d/
file:
path: "{{ item }}"
state: absent
loop:
- "/etc/nomad.d/*.bak"
- "/etc/nomad.d/*.backup"
- "/etc/nomad.d/*.~"
- "/etc/nomad.d/*.broken"
- "/etc/nomad.d/nomad.hcl.*"
ignore_errors: yes
- name: List remaining files in /etc/nomad.d/
command: ls -la /etc/nomad.d/
register: remaining_files
changed_when: false
- name: Display remaining files
debug:
var: remaining_files.stdout_lines

View File

@@ -0,0 +1,39 @@
---
- name: Configure Nomad client nodes
hosts: nomad_clients
become: yes
vars:
nomad_config_dir: /etc/nomad.d
tasks:
- name: Create the Nomad configuration directory
file:
path: "{{ nomad_config_dir }}"
state: directory
owner: root
group: root
mode: '0755'
- name: Copy the Nomad client configuration template
template:
src: ../templates/nomad-client.hcl
dest: "{{ nomad_config_dir }}/nomad.hcl"
owner: root
group: root
mode: '0644'
- name: Start the Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
daemon_reload: yes
- name: Check the Nomad service status
command: systemctl status nomad
register: nomad_status
changed_when: false
- name: Show the Nomad service status
debug:
var: nomad_status.stdout_lines

View File

@@ -0,0 +1,44 @@
---
- name: Apply a unified configuration to all Nomad nodes
hosts: nomad_nodes
become: yes
tasks:
- name: Back up the current Nomad configuration
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.bak
remote_src: yes
ignore_errors: yes
- name: Generate the unified Nomad configuration
template:
src: ../templates/nomad-unified.hcl.j2
dest: /etc/nomad.d/nomad.hcl
owner: root
group: root
mode: '0644'
- name: Restart the Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
daemon_reload: yes
- name: Wait for the Nomad service to become ready
wait_for:
port: 4646
host: "{{ inventory_hostname }}.tailnet-68f9.ts.net"
delay: 10
timeout: 60
ignore_errors: yes
- name: Check the Nomad service status
command: systemctl status nomad
register: nomad_status
changed_when: false
- name: Show the Nomad service status
debug:
var: nomad_status.stdout_lines

View File

@@ -0,0 +1,62 @@
---
- name: Configure Nomad Dynamic Host Volumes for NFS
hosts: nomad_clients
become: yes
vars:
nfs_server: "snail"
nfs_share: "/fs/1000/nfs/Fnsync"
mount_point: "/mnt/fnsync"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Update Nomad configuration for dynamic host volumes
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} DYNAMIC HOST VOLUMES CONFIGURATION"
block: |
client {
# Enable dynamic host volumes
host_volume "fnsync" {
path = "{{ mount_point }}"
read_only = false
}
# Add NFS-related node metadata
meta {
nfs_server = "{{ nfs_server }}"
nfs_share = "{{ nfs_share }}"
nfs_mounted = "true"
}
}
insertafter: 'client {'
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to start
wait_for:
port: 4646
delay: 10
timeout: 60
- name: Check Nomad status
command: nomad node status
register: nomad_status
ignore_errors: yes
- name: Display Nomad status
debug:
var: nomad_status.stdout_lines

View File

@@ -0,0 +1,57 @@
---
- name: Configure Podman driver for all Nomad client nodes
hosts: target_nodes
become: yes
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Install Podman if not present
package:
name: podman
state: present
ignore_errors: yes
- name: Enable Podman socket
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes
- name: Update Nomad configuration to use Podman
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^plugin "docker"'
line: 'plugin "podman" {'
state: present
- name: Add Podman plugin configuration
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} PODMAN PLUGIN CONFIG"
block: |
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
insertafter: 'client {'
- name: Start Nomad service
systemd:
name: nomad
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 5
timeout: 30

View File

@@ -0,0 +1,22 @@
---
- name: Configure NOPASSWD sudo for nomad user
hosts: nomad_clients
become: yes
tasks:
- name: Ensure sudoers.d directory exists
file:
path: /etc/sudoers.d
state: directory
owner: root
group: root
mode: '0750'
- name: Allow nomad user passwordless sudo for required commands
copy:
dest: /etc/sudoers.d/nomad
content: |
nomad ALL=(ALL) NOPASSWD: /usr/bin/apt, /usr/bin/systemctl, /bin/mkdir, /bin/chown, /bin/chmod, /bin/mv, /bin/sed, /usr/bin/tee, /usr/sbin/usermod, /usr/bin/unzip, /usr/bin/wget
owner: root
group: root
mode: '0440'
validate: 'visudo -cf %s'

View File

@@ -0,0 +1,226 @@
---
- name: Configure the Nomad cluster to communicate over the Tailscale network
hosts: nomad_cluster
become: yes
gather_facts: no
vars:
nomad_config_dir: "/etc/nomad.d"
nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl"
tasks:
- name: Get the current node's Tailscale IP
shell: tailscale ip | head -1
register: current_tailscale_ip
changed_when: false
ignore_errors: yes
- name: Compute the address Nomad should use (prefer Tailscale, fall back to the inventory value or ansible_host)
set_fact:
node_addr: "{{ (current_tailscale_ip.stdout | default('')) is match('^100\\.') | ternary((current_tailscale_ip.stdout | trim), (hostvars[inventory_hostname].tailscale_ip | default(ansible_host))) }}"
- name: Ensure the Nomad configuration directory exists
file:
path: "{{ nomad_config_dir }}"
state: directory
owner: root
group: root
mode: '0755'
- name: Generate the Nomad server configuration (using Tailscale)
copy:
dest: "{{ nomad_config_file }}"
owner: root
group: root
mode: '0644'
content: |
datacenter = "{{ nomad_datacenter | default('dc1') }}"
data_dir = "/opt/nomad/data"
log_level = "INFO"
bind_addr = "{{ node_addr }}"
addresses {
http = "{{ node_addr }}"
rpc = "{{ node_addr }}"
serf = "{{ node_addr }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = true
bootstrap_expect = {{ nomad_bootstrap_expect | default(4) }}
retry_join = [
"100.116.158.95", # semaphore
"100.103.147.94", # ash2e
"100.81.26.3", # ash1d
"100.90.159.68" # ch2
]
encrypt = "{{ nomad_encrypt_key }}"
}
client {
enabled = false
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "{{ node_addr }}:8500"
}
when: nomad_role == "server"
notify: restart nomad
- name: Generate the Nomad client configuration (using Tailscale)
copy:
dest: "{{ nomad_config_file }}"
owner: root
group: root
mode: '0644'
content: |
datacenter = "{{ nomad_datacenter | default('dc1') }}"
data_dir = "/opt/nomad/data"
log_level = "INFO"
bind_addr = "{{ node_addr }}"
addresses {
http = "{{ node_addr }}"
rpc = "{{ node_addr }}"
serf = "{{ node_addr }}"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = false
}
client {
enabled = true
network_interface = "tailscale0"
cpu_total_compute = 0
servers = [
"100.116.158.95:4647", # semaphore
"100.103.147.94:4647", # ash2e
"100.81.26.3:4647", # ash1d
"100.90.159.68:4647" # ch2
]
}
plugin "podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "{{ node_addr }}:8500"
}
when: nomad_role == "client"
notify: restart nomad
- name: Locate the Nomad binary
shell: which nomad || find /usr -name nomad 2>/dev/null | head -1
register: nomad_binary_path
failed_when: nomad_binary_path.stdout == ""
- name: Create/update the Nomad systemd unit file
copy:
dest: "/etc/systemd/system/nomad.service"
owner: root
group: root
mode: '0644'
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
[Service]
Type=notify
User=root
Group=root
ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
notify: restart nomad
- name: Ensure the Nomad data directory exists
file:
path: "/opt/nomad/data"
state: directory
owner: root
group: root
mode: '0755'
- name: Reload the systemd daemon
systemd:
daemon_reload: yes
- name: Enable and start the Nomad service
systemd:
name: nomad
enabled: yes
state: started
- name: Wait for the Nomad service to start
wait_for:
port: 4646
host: "{{ node_addr }}"
delay: 5
timeout: 30
ignore_errors: yes
- name: Check the Nomad service status
shell: systemctl status nomad --no-pager -l
register: nomad_status
ignore_errors: yes
- name: Show the configuration result
debug:
msg: |
✅ Node {{ inventory_hostname }} configured
🌐 Address in use: {{ node_addr }}
🎯 Role: {{ nomad_role }}
🔧 Nomad binary: {{ nomad_binary_path.stdout }}
📊 Service status: {{ 'active' if nomad_status.rc == 0 else 'failed' }}
{% if nomad_status.rc != 0 %}
❌ Error output:
{{ nomad_status.stdout }}
{{ nomad_status.stderr }}
{% endif %}
handlers:
- name: restart nomad
systemd:
name: nomad
state: restarted
daemon_reload: yes

View File

@@ -0,0 +1,115 @@
---
- name: Configure Podman for Nomad Integration
hosts: all
become: yes
gather_facts: yes
tasks:
- name: Show the node being processed
debug:
msg: "🔧 Configuring Podman for Nomad on: {{ inventory_hostname }}"
- name: Ensure Podman is installed
package:
name: podman
state: present
- name: Enable and start the Podman socket service
systemd:
name: podman.socket
enabled: yes
state: started
- name: Create the Podman system configuration directory
file:
path: /etc/containers
state: directory
mode: '0755'
- name: Configure Podman to use the system socket
copy:
content: |
[engine]
# Use the system-level socket instead of the per-user socket
active_service = "system"
[engine.service_destinations]
[engine.service_destinations.system]
uri = "unix:///run/podman/podman.sock"
dest: /etc/containers/containers.conf
mode: '0644'
- name: Check whether the nomad user exists
getent:
database: passwd
key: nomad
register: nomad_user_check
ignore_errors: yes
- name: Create the configuration directory for the nomad user
file:
path: "/home/nomad/.config/containers"
state: directory
owner: nomad
group: nomad
mode: '0755'
when: nomad_user_check is succeeded
- name: Configure Podman for the nomad user
copy:
content: |
[engine]
active_service = "system"
[engine.service_destinations]
[engine.service_destinations.system]
uri = "unix:///run/podman/podman.sock"
dest: /home/nomad/.config/containers/containers.conf
owner: nomad
group: nomad
mode: '0644'
when: nomad_user_check is succeeded
- name: Add the nomad user to the podman group
user:
name: nomad
groups: podman
append: yes
when: nomad_user_check is succeeded
ignore_errors: yes
- name: Create the podman group (if it does not exist)
group:
name: podman
state: present
ignore_errors: yes
- name: Set permissions on the podman socket directory
file:
path: /run/podman
state: directory
mode: '0755'
group: podman
ignore_errors: yes
- name: Adjust the Podman socket permissions
file:
path: /run/podman/podman.sock
mode: '066'
when: nomad_user_check is succeeded
ignore_errors: yes
- name: Verify the Podman installation
shell: podman --version
register: podman_version
- name: Test Podman functionality
shell: podman info
register: podman_info
ignore_errors: yes
- name: Show the configuration result
debug:
msg: |
✅ Podman configuration for Nomad completed on {{ inventory_hostname }}
📦 Podman version: {{ podman_version.stdout }}
🐳 Podman status: {{ 'SUCCESS' if podman_info.rc == 0 else 'WARNING' }}
👤 Nomad user: {{ 'FOUND' if nomad_user_check is succeeded else 'NOT FOUND' }}

View File

@@ -0,0 +1,105 @@
---
- name: Deploy the Nomad configuration to the Korea nodes
hosts: ch2,ch3
become: yes
gather_facts: no
vars:
nomad_config_dir: "/etc/nomad.d"
nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl"
source_config_dir: "/root/mgmt/infrastructure/configs/server"
tasks:
- name: Get the short hostname (strip the suffix)
set_fact:
short_hostname: "{{ inventory_hostname | regex_replace('\\$', '') }}"
- name: Ensure the Nomad configuration directory exists
file:
path: "{{ nomad_config_dir }}"
state: directory
owner: root
group: root
mode: '0755'
- name: Deploy the Nomad configuration file to the Korea nodes
copy:
src: "{{ source_config_dir }}/nomad-{{ short_hostname }}.hcl"
dest: "{{ nomad_config_file }}"
owner: root
group: root
mode: '0644'
backup: yes
notify: restart nomad
- name: Locate the Nomad binary
shell: which nomad || find /usr -name nomad 2>/dev/null | head -1
register: nomad_binary_path
failed_when: nomad_binary_path.stdout == ""
- name: Create/update the Nomad systemd unit file
copy:
dest: "/etc/systemd/system/nomad.service"
owner: root
group: root
mode: '0644'
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
[Service]
Type=notify
User=root
Group=root
ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
notify: restart nomad
- name: Ensure the Nomad data directory exists
file:
path: "/opt/nomad/data"
state: directory
owner: root
group: root
mode: '0755'
- name: Reload the systemd daemon
systemd:
daemon_reload: yes
- name: Enable and start the Nomad service
systemd:
name: nomad
enabled: yes
state: started
- name: Wait for the Nomad service to start
wait_for:
port: 4646
host: "{{ ansible_host }}"
delay: 5
timeout: 30
ignore_errors: yes
- name: Check the Nomad service status
command: systemctl status nomad
register: nomad_status
changed_when: false
- name: Show the Nomad service status output
debug:
var: nomad_status.stdout_lines
handlers:
- name: restart nomad
systemd:
name: nomad
state: restarted

View File

@@ -0,0 +1,41 @@
---
- name: Deploy the Nomad server configuration template
hosts: nomad_servers
become: yes
tasks:
- name: Deploy the Nomad configuration file
template:
src: nomad-server.hcl.j2
dest: /etc/nomad.d/nomad.hcl
backup: yes
owner: root
group: root
mode: '0644'
- name: Restart the Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
- name: Wait for the Nomad service to start
wait_for:
port: 4646
host: "{{ ansible_host }}"
timeout: 30
- name: Query the Nomad service state
systemd:
name: nomad
register: nomad_status
- name: Show the service state
debug:
msg: "{{ inventory_hostname }} Nomad service state: {{ nomad_status.status.ActiveState }}"

View File

@@ -0,0 +1,168 @@
---
- name: Disk space analysis using ncdu
hosts: all
become: yes
vars:
ncdu_scan_paths:
- "/"
- "/var"
- "/opt"
- "/home"
output_dir: "/tmp/disk-analysis"
tasks:
- name: Install the ncdu tool
package:
name: ncdu
state: present
register: ncdu_install
- name: Create the output directory
file:
path: "{{ output_dir }}"
state: directory
mode: '0755'
- name: Check disk space usage
shell: df -h
register: disk_usage
- name: Show current disk usage
debug:
msg: |
=== Disk usage on {{ inventory_hostname }} ===
{{ disk_usage.stdout }}
- name: Scan the root filesystem with ncdu and generate a report
shell: |
ncdu -x -o {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json /
async: 300
poll: 0
register: ncdu_root_scan
- name: Scan /var with ncdu
shell: |
ncdu -x -o {{ output_dir }}/ncdu-var-{{ inventory_hostname }}.json /var
async: 180
poll: 0
register: ncdu_var_scan
when: ansible_mounts | selectattr('mount', 'equalto', '/var') | list | length > 0 or '/var' in ansible_mounts | map(attribute='mount') | list
- name: Scan /opt with ncdu
shell: |
ncdu -x -o {{ output_dir }}/ncdu-opt-{{ inventory_hostname }}.json /opt
async: 120
poll: 0
register: ncdu_opt_scan
when: ansible_mounts | selectattr('mount', 'equalto', '/opt') | list | length > 0 or '/opt' in ansible_mounts | map(attribute='mount') | list
- name: Wait for the root scan to finish
async_status:
jid: "{{ ncdu_root_scan.ansible_job_id }}"
register: ncdu_root_result
until: ncdu_root_result.finished
retries: 60
delay: 5
- name: Wait for the /var scan to finish
async_status:
jid: "{{ ncdu_var_scan.ansible_job_id }}"
register: ncdu_var_result
until: ncdu_var_result.finished
retries: 36
delay: 5
when: ncdu_var_scan is defined and ncdu_var_scan.ansible_job_id is defined
- name: Wait for the /opt scan to finish
async_status:
jid: "{{ ncdu_opt_scan.ansible_job_id }}"
register: ncdu_opt_result
until: ncdu_opt_result.finished
retries: 24
delay: 5
when: ncdu_opt_scan is defined and ncdu_opt_scan.ansible_job_id is defined
- name: Generate the disk usage analysis report
shell: |
echo "=== Disk analysis report for {{ inventory_hostname }} ===" > {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "Generated at: $(date)" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "=== Disk usage ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
df -h >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "=== Largest directories (top 10) ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
du -h --max-depth=2 / 2>/dev/null | sort -hr | head -10 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "=== Largest files under /var ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
find /var -type f -size +100M -exec ls -lh {} \; 2>/dev/null | head -10 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "=== /tmp usage ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
du -sh /tmp/* 2>/dev/null | sort -hr | head -5 >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
echo "=== Log file sizes ===" >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
find /var/log -name "*.log" -type f -size +50M -exec ls -lh {} \; 2>/dev/null >> {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
- name: Show the analysis report
shell: cat {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
register: disk_report
- name: Output the disk analysis result
debug:
msg: "{{ disk_report.stdout }}"
- name: Check for filesystems above 80% usage
shell: df -h | awk 'NR>1 {gsub(/%/, "", $5); if($5 > 80) print $0}'
register: high_usage_disks
- name: Warn about high disk usage
debug:
msg: |
⚠️ Warning: high disk usage detected on {{ inventory_hostname }}!
{{ high_usage_disks.stdout }}
when: high_usage_disks.stdout != ""
- name: Create cleanup suggestions
shell: |
echo "=== Cleanup suggestions for {{ inventory_hostname }} ===" > {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "1. Check log files:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
find /var/log -name "*.log" -type f -size +100M -exec echo "  Large log file: {}" \; 2>/dev/null >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "2. Check temporary files:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
find /tmp -type f -size +50M -exec echo "  Large temp file: {}" \; 2>/dev/null >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "3. Check package caches:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
if [ -d /var/cache/apt ]; then
echo "  APT cache size: $(du -sh /var/cache/apt 2>/dev/null | cut -f1)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
fi
if [ -d /var/cache/yum ]; then
echo "  YUM cache size: $(du -sh /var/cache/yum 2>/dev/null | cut -f1)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
fi
echo "" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "4. Check container resources:" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
if command -v podman >/dev/null 2>&1; then
echo "  Podman images: $(podman images -q 2>/dev/null | wc -l)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
echo "  Podman containers: $(podman ps -aq 2>/dev/null | wc -l)" >> {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
fi
- name: Show the cleanup suggestions
shell: cat {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt
register: cleanup_suggestions
- name: Output the cleanup suggestions
debug:
msg: "{{ cleanup_suggestions.stdout }}"
- name: Report where the ncdu output files were saved
debug:
msg: |
📁 ncdu scan files saved to:
- Root: {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json
- /var: {{ output_dir }}/ncdu-var-{{ inventory_hostname }}.json (if present)
- /opt: {{ output_dir }}/ncdu-opt-{{ inventory_hostname }}.json (if present)
💡 To browse a scan:
ncdu -f {{ output_dir }}/ncdu-root-{{ inventory_hostname }}.json
📊 Full report: {{ output_dir }}/disk-report-{{ inventory_hostname }}.txt
🧹 Cleanup suggestions: {{ output_dir }}/cleanup-suggestions-{{ inventory_hostname }}.txt

View File

@@ -0,0 +1,96 @@
---
- name: Disk cleanup tool
hosts: all
become: yes
vars:
cleanup_logs: true
cleanup_cache: true
cleanup_temp: true
cleanup_containers: false # use with caution
tasks:
- name: Check disk usage (before cleanup)
shell: df -h
register: disk_before
- name: Show disk usage before cleanup
debug:
msg: |
=== Disk usage on {{ inventory_hostname }} before cleanup ===
{{ disk_before.stdout }}
- name: Clean system logs (keep the last 7 days)
shell: |
journalctl --vacuum-time=7d
find /var/log -name "*.log" -type f -mtime +7 -exec truncate -s 0 {} \;
find /var/log -name "*.log.*" -type f -mtime +7 -delete
when: cleanup_logs | bool
register: log_cleanup
- name: Clean package manager caches
block:
- name: Clean the APT cache (Debian/Ubuntu)
shell: |
apt-get clean
apt-get autoclean
apt-get autoremove -y
when: ansible_os_family == "Debian"
- name: Clean the YUM/DNF cache (RedHat/CentOS)
shell: |
if command -v dnf >/dev/null 2>&1; then
dnf clean all
elif command -v yum >/dev/null 2>&1; then
yum clean all
fi
when: ansible_os_family == "RedHat"
when: cleanup_cache | bool
- name: Clean temporary files
shell: |
find /tmp -type f -atime +7 -delete 2>/dev/null || true
find /var/tmp -type f -atime +7 -delete 2>/dev/null || true
rm -rf /tmp/.* 2>/dev/null || true
when: cleanup_temp | bool
- name: Clean up Podman resources (use with caution)
block:
- name: Stop all containers
shell: podman stop --all
ignore_errors: yes
- name: Remove unused containers
shell: podman container prune -f
ignore_errors: yes
- name: Remove unused images
shell: podman image prune -f
ignore_errors: yes
- name: Remove unused volumes
shell: podman volume prune -f
ignore_errors: yes
when: cleanup_containers | bool
- name: Clean core dump files
shell: |
find /var/crash -name "core.*" -type f -delete 2>/dev/null || true
find / -name "core" -type f -size +10M -delete 2>/dev/null || true
ignore_errors: yes
- name: Check disk usage (after cleanup)
shell: df -h
register: disk_after
- name: Show the cleanup result
debug:
msg: |
=== Cleanup finished on {{ inventory_hostname }} ===
Before:
{{ disk_before.stdout }}
After:
{{ disk_after.stdout }}
🧹 Cleanup complete!

View File

@@ -0,0 +1,33 @@
---
- name: Distribute the SSH public key to Nomad client nodes
hosts: nomad_clients
become: yes
vars:
ssh_public_key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMSUUfma8FKEFvH8Nq65XM2PZ9kitfgv1q727cKV9y5Z houzhongxu@seekkey.tech"
tasks:
- name: Ensure the .ssh directory exists
file:
path: "/home/{{ ansible_user }}/.ssh"
state: directory
owner: "{{ ansible_user }}"
group: "{{ ansible_user }}"
mode: '0700'
- name: Add the SSH public key to authorized_keys
lineinfile:
path: "/home/{{ ansible_user }}/.ssh/authorized_keys"
line: "{{ ssh_public_key }}"
create: yes
owner: "{{ ansible_user }}"
group: "{{ ansible_user }}"
mode: '0600'
- name: Verify the SSH public key was added
command: cat "/home/{{ ansible_user }}/.ssh/authorized_keys"
register: ssh_key_check
changed_when: false
- name: Show the authorized_keys content
debug:
var: ssh_key_check.stdout_lines

View File

@@ -0,0 +1,32 @@
---
- name: Distribute the SSH public key to new nodes
hosts: browser,influxdb1,hcp1,warden
become: yes
vars:
ssh_public_key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMSUUfma8FKEFvH8Nq65XM2PZ9kitfgv1q727cKV9y5Z houzhongxu@seekkey.tech"
tasks:
- name: Ensure the .ssh directory exists
file:
path: "/root/.ssh"
state: directory
mode: '0700'
owner: root
group: root
- name: Add the SSH public key to authorized_keys
copy:
content: "{{ ssh_public_key }}"
dest: "/root/.ssh/authorized_keys"
mode: '0600'
owner: root
group: root
- name: Verify the SSH public key was added
command: cat /root/.ssh/authorized_keys
register: ssh_key_check
changed_when: false
- name: Show the authorized_keys content
debug:
var: ssh_key_check.stdout_lines

View File

@@ -0,0 +1,76 @@
---
- name: Distribute Nomad Podman Driver to all nodes
hosts: nomad_cluster
become: yes
vars:
nomad_user: nomad
nomad_data_dir: /opt/nomad/data
nomad_plugins_dir: "{{ nomad_data_dir }}/plugins"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Create plugins directory
file:
path: "{{ nomad_plugins_dir }}"
state: directory
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
- name: Copy Nomad Podman driver from local
copy:
src: /tmp/nomad-driver-podman
dest: "{{ nomad_plugins_dir }}/nomad-driver-podman"
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
- name: Update Nomad configuration for plugin directory
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^plugin_dir'
line: 'plugin_dir = "{{ nomad_plugins_dir }}"'
insertafter: 'data_dir = "/opt/nomad/data"'
- name: Ensure Podman is installed
package:
name: podman
state: present
- name: Enable Podman socket
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60
- name: Wait for plugins to load
pause:
seconds: 15
- name: Check driver status
shell: |
/usr/local/bin/nomad node status -self | grep -A 10 "Driver Status" || /usr/bin/nomad node status -self | grep -A 10 "Driver Status"
register: driver_status
failed_when: false
- name: Display driver status
debug:
var: driver_status.stdout_lines

View File

@@ -0,0 +1,12 @@
- name: Distribute new podman binary to specified nomad_clients
hosts: nomadlxc,hcp,huawei,ditigalocean
gather_facts: false
tasks:
- name: Copy new podman binary to /usr/local/bin
copy:
src: /root/mgmt/configuration/podman-remote-static-linux_amd64
dest: /usr/local/bin/podman
owner: root
group: root
mode: '0755'
become: yes

View File

@@ -0,0 +1,39 @@
---
- name: 紧急修复Nomad bootstrap_expect配置
hosts: nomad_servers
become: yes
tasks:
- name: 修复bootstrap_expect为3
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ bootstrap_expect = \d+'
line: ' bootstrap_expect = 3'
backup: yes
- name: 重启Nomad服务
systemd:
name: nomad
state: restarted
enabled: yes
- name: 等待Nomad服务启动
wait_for:
port: 4646
host: "{{ ansible_host }}"
timeout: 30
- name: 检查Nomad服务状态
systemd:
name: nomad
register: nomad_status
- name: 显示Nomad服务状态
debug:
msg: "{{ inventory_hostname }} Nomad服务状态: {{ nomad_status.status.ActiveState }}"

View File

@@ -0,0 +1,103 @@
---
- name: Fix ch4 Nomad configuration - convert from server to client
hosts: ch4
become: yes
vars:
ansible_host: 100.117.106.136
tasks:
- name: Backup current Nomad config
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.backup
remote_src: yes
backup: yes
- name: Update Nomad config to client mode
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} ANSIBLE MANAGED CLIENT CONFIG"
block: |
server {
enabled = false
}
client {
enabled = true
network_interface = "tailscale0"
servers = [
"semaphore.tailnet-68f9.ts.net:4647",
"ash1d.tailnet-68f9.ts.net:4647",
"ash2e.tailnet-68f9.ts.net:4647",
"ch2.tailnet-68f9.ts.net:4647",
"ch3.tailnet-68f9.ts.net:4647",
"onecloud1.tailnet-68f9.ts.net:4647",
"de.tailnet-68f9.ts.net:4647"
]
meta {
consul = "true"
consul_version = "1.21.5"
consul_server = "true"
}
}
insertbefore: '^server \{'
# NOTE: blockinfile has no 'replace' option; any pre-existing server/client
# blocks must be removed separately (e.g. with the ansible.builtin.replace module)
- name: Update client block
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} ANSIBLE MANAGED CLIENT BLOCK"
block: |
client {
enabled = true
network_interface = "tailscale0"
servers = [
"semaphore.tailnet-68f9.ts.net:4647",
"ash1d.tailnet-68f9.ts.net:4647",
"ash2e.tailnet-68f9.ts.net:4647",
"ch2.tailnet-68f9.ts.net:4647",
"ch3.tailnet-68f9.ts.net:4647",
"onecloud1.tailnet-68f9.ts.net:4647",
"de.tailnet-68f9.ts.net:4647"
]
meta {
consul = "true"
consul_version = "1.21.5"
consul_server = "true"
}
}
insertbefore: '^client \{'
- name: Restart Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30
- name: Verify Nomad client status
shell: |
NOMAD_ADDR=http://localhost:4646 nomad node status | grep -q "ready"
register: nomad_ready
failed_when: nomad_ready.rc != 0
retries: 3
delay: 10
- name: Display completion message
debug:
msg: |
✅ Successfully converted ch4 from Nomad server to client
✅ Nomad service restarted
✅ Configuration updated

View File

@@ -0,0 +1,82 @@
---
- name: Fix master node - rename to ch4 and restore SSH port 22
hosts: master
become: yes
vars:
new_hostname: ch4
old_hostname: master
tasks:
- name: Backup current hostname
copy:
content: "{{ old_hostname }}"
dest: /etc/hostname.backup
mode: '0644'
when: ansible_hostname == old_hostname
- name: Update hostname to ch4
hostname:
name: "{{ new_hostname }}"
when: ansible_hostname == old_hostname
- name: Update /etc/hostname file
copy:
content: "{{ new_hostname }}"
dest: /etc/hostname
mode: '0644'
when: ansible_hostname == old_hostname
- name: Update /etc/hosts file
lineinfile:
path: /etc/hosts
regexp: '^127\.0\.1\.1.*{{ old_hostname }}'
line: '127.0.1.1 {{ new_hostname }}'
state: present
when: ansible_hostname == old_hostname
- name: Update Tailscale hostname
shell: |
tailscale set --hostname={{ new_hostname }}
when: ansible_hostname == old_hostname
- name: Backup SSH config
copy:
src: /etc/ssh/sshd_config
dest: /etc/ssh/sshd_config.backup
remote_src: yes
backup: yes
- name: Restore SSH port to 22
lineinfile:
path: /etc/ssh/sshd_config
regexp: '^Port '
line: 'Port 22'
state: present
- name: Restart SSH service
systemd:
name: ssh
state: restarted
enabled: yes
- name: Wait for SSH to be ready on port 22
wait_for:
port: 22
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30
- name: Test SSH connection on port 22
ping:
delegate_to: "{{ inventory_hostname }}"
vars:
ansible_port: 22
- name: Display completion message
debug:
msg: |
✅ Successfully renamed {{ old_hostname }} to {{ new_hostname }}
✅ SSH port restored to 22
✅ Tailscale hostname updated
🔄 Please update your inventory file to use the new hostname and port

View File

@@ -0,0 +1,73 @@
---
- name: 修正Nomad节点的Consul角色配置
hosts: nomad_nodes
become: yes
vars:
consul_addresses: "master.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500"
tasks:
- name: 备份原始Nomad配置
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.bak_{{ ansible_date_time.iso8601 }}
remote_src: yes
- name: 检查节点角色
shell: grep -A 1 "server {" /etc/nomad.d/nomad.hcl | grep "enabled = true" | wc -l
register: is_server
changed_when: false
- name: 检查节点角色
shell: grep -A 1 "client {" /etc/nomad.d/nomad.hcl | grep "enabled = true" | wc -l
register: is_client
changed_when: false
- name: 修正服务器节点的Consul配置
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} ANSIBLE MANAGED BLOCK - CONSUL CONFIG"
block: |
consul {
address = "{{ consul_addresses }}"
server_service_name = "nomad"
client_service_name = "nomad-client"
auto_advertise = true
server_auto_join = true
client_auto_join = false
}
when: is_server.stdout == "1"
- name: 修正客户端节点的Consul配置
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} ANSIBLE MANAGED BLOCK - CONSUL CONFIG"
block: |
consul {
address = "{{ consul_addresses }}"
server_service_name = "nomad"
client_service_name = "nomad-client"
auto_advertise = true
server_auto_join = false
client_auto_join = true
}
when: is_client.stdout == "1"
- name: 重启Nomad服务
systemd:
name: nomad
state: restarted
enabled: yes
daemon_reload: yes
- name: 等待Nomad服务启动
wait_for:
port: 4646
host: "{{ ansible_host }}"
timeout: 30
- name: 显示节点角色和配置
debug:
msg: "节点 {{ inventory_hostname }} 是 {{ '服务器' if is_server.stdout == '1' else '客户端' }} 节点Consul配置已更新"

View File

@@ -0,0 +1,43 @@
---
- name: 修复 Nomad 服务器 region 配置
hosts: nomad_servers
become: yes
vars:
nomad_config_dir: /etc/nomad.d
tasks:
- name: 备份当前 Nomad 配置
copy:
src: "{{ nomad_config_dir }}/nomad.hcl"
dest: "{{ nomad_config_dir }}/nomad.hcl.backup.{{ ansible_date_time.epoch }}"
remote_src: yes
ignore_errors: yes
- name: 更新 Nomad 配置文件以添加 region 设置
blockinfile:
path: "{{ nomad_config_dir }}/nomad.hcl"
insertafter: '^datacenter = '
block: |
region = "dc1"
marker: "# {mark} Ansible managed region setting"
notify: restart nomad
- name: 更新节点名称以移除 .global 后缀(如果存在)
replace:
path: "{{ nomad_config_dir }}/nomad.hcl"
regexp: 'name = "(.*)\.global(.*)"'
replace: 'name = "\1\2"'
notify: restart nomad
- name: 确保 retry_join 使用正确的 IP 地址
replace:
path: "{{ nomad_config_dir }}/nomad.hcl"
regexp: 'retry_join = \[(.*)\]'
replace: 'retry_join = ["100.81.26.3", "100.103.147.94", "100.90.159.68", "100.116.158.95", "100.98.209.50", "100.120.225.29"]'
notify: restart nomad
handlers:
- name: restart nomad
systemd:
name: nomad
state: restarted

View File

@@ -0,0 +1,71 @@
---
- name: Install and configure Consul clients on all nodes
hosts: all
become: yes
vars:
consul_servers:
- "100.117.106.136" # ch4 (韩国)
- "100.122.197.112" # warden (北京)
- "100.116.80.94" # ash3c (美国)
tasks:
- name: Get Tailscale IP address
shell: ip addr show tailscale0 | grep 'inet ' | awk '{print $2}' | cut -d/ -f1
register: tailscale_ip_result
changed_when: false
- name: Set Tailscale IP fact
set_fact:
tailscale_ip: "{{ tailscale_ip_result.stdout }}"
- name: Install Consul
apt:
name: consul
state: present
update_cache: yes
- name: Create Consul data directory
file:
path: /opt/consul/data
state: directory
owner: consul
group: consul
mode: '0755'
- name: Create Consul log directory
file:
path: /var/log/consul
state: directory
owner: consul
group: consul
mode: '0755'
- name: Create Consul config directory
file:
path: /etc/consul.d
state: directory
owner: consul
group: consul
mode: '0755'
- name: Generate Consul client configuration
template:
src: consul-client.hcl.j2
dest: /etc/consul.d/consul.hcl
owner: consul
group: consul
mode: '0644'
notify: restart consul
- name: Enable and start Consul service
systemd:
name: consul
enabled: yes
state: started
daemon_reload: yes
handlers:
- name: restart consul
systemd:
name: consul
state: restarted

View File

@@ -0,0 +1,87 @@
---
- name: Configure Nomad Podman Driver
hosts: target_nodes
become: yes
tasks:
- name: Create backup directory
file:
path: /etc/nomad.d/backup
state: directory
mode: '0755'
- name: Backup current nomad.hcl
copy:
src: /etc/nomad.d/nomad.hcl
dest: "/etc/nomad.d/backup/nomad.hcl.bak.{{ ansible_date_time.iso8601 }}"
remote_src: yes
- name: Create plugin directory
file:
path: /opt/nomad/plugins
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Create symlink for podman driver
file:
src: /usr/bin/nomad-driver-podman
dest: /opt/nomad/plugins/nomad-driver-podman
state: link
- name: Copy podman driver configuration
copy:
src: ../../files/podman-driver.hcl
dest: /etc/nomad.d/podman-driver.hcl
owner: root
group: root
mode: '0644'
- name: Remove existing plugin_dir configuration
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^plugin_dir = "/opt/nomad/data/plugins"'
state: absent
- name: Configure Nomad to use Podman driver
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} ANSIBLE MANAGED BLOCK - PODMAN DRIVER"
block: |
plugin_dir = "/opt/nomad/plugins"
plugin "podman" {
config {
volumes {
enabled = true
}
logging {
type = "journald"
}
gc {
container = true
}
}
}
register: nomad_config_result
- name: Restart nomad service
systemd:
name: nomad
state: restarted
enabled: yes
- name: Wait for nomad to start
wait_for:
port: 4646
delay: 10
timeout: 60
- name: Check nomad status
command: nomad node status
register: nomad_status
changed_when: false
- name: Display nomad status
debug:
var: nomad_status.stdout_lines

View File

@@ -0,0 +1,161 @@
---
- name: Install and Configure Nomad Podman Driver on Client Nodes
hosts: nomad_clients
become: yes
vars:
nomad_plugin_dir: "/opt/nomad/plugins"
# nomad_datacenter (used in the generated nomad.hcl below) is expected to be
# supplied via inventory/group_vars
tasks:
- name: Create backup directory with timestamp
set_fact:
backup_dir: "/root/backup/{{ ansible_date_time.date }}_{{ ansible_date_time.hour }}{{ ansible_date_time.minute }}{{ ansible_date_time.second }}"
- name: Create backup directory
file:
path: "{{ backup_dir }}"
state: directory
mode: '0755'
- name: Backup current Nomad configuration
copy:
src: /etc/nomad.d/nomad.hcl
dest: "{{ backup_dir }}/nomad.hcl.backup"
remote_src: yes
ignore_errors: yes
- name: Backup current apt sources
shell: |
cp -r /etc/apt/sources.list* {{ backup_dir }}/
dpkg --get-selections > {{ backup_dir }}/installed_packages.txt
ignore_errors: yes
- name: Create temporary directory for apt
file:
path: /tmp/apt-temp
state: directory
mode: '1777'
- name: Download HashiCorp GPG key
get_url:
url: https://apt.releases.hashicorp.com/gpg
dest: /tmp/hashicorp.gpg
mode: '0644'
environment:
TMPDIR: /tmp/apt-temp
- name: Install HashiCorp GPG key
shell: |
gpg --dearmor < /tmp/hashicorp.gpg > /usr/share/keyrings/hashicorp-archive-keyring.gpg
environment:
TMPDIR: /tmp/apt-temp
- name: Add HashiCorp repository
lineinfile:
path: /etc/apt/sources.list.d/hashicorp.list
line: "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com {{ ansible_distribution_release }} main"
create: yes
mode: '0644'
- name: Update apt cache
apt:
update_cache: yes
environment:
TMPDIR: /tmp/apt-temp
ignore_errors: yes
- name: Install nomad-driver-podman
apt:
name: nomad-driver-podman
state: present
environment:
TMPDIR: /tmp/apt-temp
- name: Create Nomad plugin directory
file:
path: "{{ nomad_plugin_dir }}"
state: directory
owner: nomad
group: nomad
mode: '0755'
- name: Create symlink for nomad-driver-podman in plugin directory
file:
src: /usr/bin/nomad-driver-podman
dest: "{{ nomad_plugin_dir }}/nomad-driver-podman"
state: link
owner: nomad
group: nomad
- name: Get server IP address
shell: |
ip route get 1.1.1.1 | grep -oP 'src \K\S+'
register: server_ip_result
changed_when: false
- name: Set server IP fact
set_fact:
server_ip: "{{ server_ip_result.stdout }}"
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Create updated Nomad client configuration
copy:
content: |
datacenter = "{{ nomad_datacenter }}"
data_dir = "/opt/nomad/data"
log_level = "INFO"
bind_addr = "{{ server_ip }}"
server {
enabled = false
}
client {
enabled = true
servers = ["100.117.106.136:4647", "100.116.80.94:4647", "100.97.62.111:4647", "100.116.112.45:4647", "100.84.197.26:4647"]
}
plugin_dir = "{{ nomad_plugin_dir }}"
plugin "nomad-driver-podman" {
config {
volumes {
enabled = true
}
recover_stopped = true
}
}
consul {
address = "127.0.0.1:8500"
}
dest: /etc/nomad.d/nomad.hcl
owner: nomad
group: nomad
mode: '0640'
backup: yes
- name: Validate Nomad configuration
shell: nomad config validate /etc/nomad.d/nomad.hcl
register: nomad_validate
failed_when: nomad_validate.rc != 0
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ server_ip }}"
delay: 5
timeout: 60
- name: Display backup location
debug:
msg: "Backup created at: {{ backup_dir }}"

View File

@@ -0,0 +1,68 @@
---
- name: 在 master 和 ash3c 节点安装 Consul
hosts: master,ash3c
become: yes
vars:
consul_version: "1.21.5"
consul_arch: "arm64" # 因为这两个节点都是 aarch64
tasks:
- name: 检查节点架构
command: uname -m
register: node_arch
changed_when: false
- name: 显示节点架构
debug:
msg: "节点 {{ inventory_hostname }} 架构: {{ node_arch.stdout }}"
- name: 检查是否已安装 consul
command: which consul
register: consul_check
failed_when: false
changed_when: false
- name: 显示当前 consul 状态
debug:
msg: "Consul 状态: {{ 'already installed' if consul_check.rc == 0 else 'not installed' }}"
- name: 删除错误的 consul 二进制文件(如果存在)
file:
path: /usr/local/bin/consul
state: absent
when: consul_check.rc == 0
- name: 更新 APT 缓存
apt:
update_cache: yes
ignore_errors: yes
- name: 安装 consul 通过 APT
apt:
name: consul={{ consul_version }}-1
state: present
- name: 验证 consul 安装
command: consul version
register: consul_version_check
changed_when: false
- name: 显示安装的 consul 版本
debug:
msg: "安装的 Consul 版本: {{ consul_version_check.stdout_lines[0] }}"
- name: 确保 consul 用户存在
user:
name: consul
system: yes
shell: /bin/false
home: /opt/consul
create_home: no
- name: 创建 consul 数据目录
file:
path: /opt/consul
state: directory
owner: consul
group: consul
mode: '0755'

View File

@@ -0,0 +1,91 @@
---
- name: Install NFS CSI Plugin for Nomad
hosts: nomad_nodes
become: yes
vars:
nomad_user: nomad
nomad_plugins_dir: /opt/nomad/plugins
csi_driver_version: "v4.0.0"
csi_driver_url: "https://github.com/kubernetes-csi/csi-driver-nfs/releases/download/{{ csi_driver_version }}/csi-nfs-driver"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Create plugins directory
file:
path: "{{ nomad_plugins_dir }}"
state: directory
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
- name: Download NFS CSI driver
get_url:
url: "{{ csi_driver_url }}"
dest: "{{ nomad_plugins_dir }}/csi-nfs-driver"
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
- name: Install required packages for CSI
package:
name:
- nfs-common
- mount
state: present
- name: Create CSI mount directory
file:
path: /opt/nomad/csi
state: directory
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
- name: Update Nomad configuration for CSI plugin
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} CSI PLUGIN CONFIGURATION"
block: |
plugin_dir = "{{ nomad_plugins_dir }}"
plugin "csi-nfs" {
type = "csi"
config {
driver_name = "nfs.csi.k8s.io"
mount_dir = "/opt/nomad/csi"
health_timeout = "30s"
log_level = "INFO"
}
}
insertafter: 'data_dir = "/opt/nomad/data"'
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to start
wait_for:
port: 4646
delay: 10
timeout: 60
- name: Check Nomad status
command: nomad node status
register: nomad_status
ignore_errors: yes
- name: Display Nomad status
debug:
var: nomad_status.stdout_lines

View File

@@ -0,0 +1,131 @@
---
- name: Install Nomad by direct download from HashiCorp
hosts: all
become: yes
vars:
nomad_user: "nomad"
nomad_group: "nomad"
nomad_home: "/opt/nomad"
nomad_data_dir: "/opt/nomad/data"
nomad_config_dir: "/etc/nomad.d"
nomad_datacenter: "dc1"
nomad_region: "global"
nomad_server_addresses:
- "100.116.158.95:4647" # semaphore server address
# nomad_version and nomad_url are expected to be supplied via inventory/group_vars,
# e.g. nomad_version: "1.10.5" plus the matching releases.hashicorp.com download URL
tasks:
- name: Create nomad user
user:
name: "{{ nomad_user }}"
group: "{{ nomad_group }}"
system: yes
shell: /bin/false
home: "{{ nomad_home }}"
create_home: yes
- name: Create nomad directories
file:
path: "{{ item }}"
state: directory
owner: "{{ nomad_user }}"
group: "{{ nomad_group }}"
mode: '0755'
loop:
- "{{ nomad_home }}"
- "{{ nomad_data_dir }}"
- "{{ nomad_config_dir }}"
- /var/log/nomad
- name: Install unzip package
apt:
name: unzip
state: present
update_cache: yes
- name: Download Nomad binary
get_url:
url: "{{ nomad_url }}"
dest: "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
mode: '0644'
timeout: 300
- name: Extract Nomad binary
unarchive:
src: "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
dest: /tmp
remote_src: yes
- name: Copy Nomad binary to /usr/local/bin
copy:
src: /tmp/nomad
dest: /usr/local/bin/nomad
mode: '0755'
owner: root
group: root
remote_src: yes
- name: Create Nomad client configuration
template:
src: templates/nomad-client.hcl.j2
dest: "{{ nomad_config_dir }}/nomad.hcl"
owner: "{{ nomad_user }}"
group: "{{ nomad_group }}"
mode: '0640'
- name: Create Nomad systemd service
copy:
content: |
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty={{ nomad_config_dir }}/nomad.hcl
[Service]
Type=notify
User={{ nomad_user }}
Group={{ nomad_group }}
ExecStart=/usr/local/bin/nomad agent -config={{ nomad_config_dir }}
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/nomad.service
mode: '0644'
- name: Reload systemd daemon
systemd:
daemon_reload: yes
- name: Enable and start Nomad service
systemd:
name: nomad
enabled: yes
state: started
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 5
timeout: 60
- name: Verify Nomad installation
command: /usr/local/bin/nomad version
register: nomad_version_output
- name: Display Nomad version
debug:
msg: "{{ nomad_version_output.stdout }}"
- name: Clean up downloaded files
file:
path: "{{ item }}"
state: absent
loop:
- "/tmp/nomad_{{ nomad_version }}_linux_amd64.zip"
- /tmp/nomad

View File

@@ -0,0 +1,131 @@
---
- name: Install Nomad Podman Driver Plugin
hosts: target_nodes
become: yes
vars:
nomad_user: nomad
nomad_data_dir: /opt/nomad/data
nomad_plugins_dir: "{{ nomad_data_dir }}/plugins"
podman_driver_version: "0.6.1"
podman_driver_url: "https://releases.hashicorp.com/nomad-driver-podman/{{ podman_driver_version }}/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
tasks:
- name: Stop Nomad service
systemd:
name: nomad
state: stopped
- name: Create plugins directory
file:
path: "{{ nomad_plugins_dir }}"
state: directory
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
- name: Download Nomad Podman driver
get_url:
url: "{{ podman_driver_url }}"
dest: "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
mode: '0644'
- name: Extract Nomad Podman driver
unarchive:
src: "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
dest: "/tmp"
remote_src: yes
- name: Install Nomad Podman driver
copy:
src: "/tmp/nomad-driver-podman"
dest: "{{ nomad_plugins_dir }}/nomad-driver-podman"
owner: "{{ nomad_user }}"
group: "{{ nomad_user }}"
mode: '0755'
remote_src: yes
- name: Update Nomad configuration for plugin directory
blockinfile:
path: /etc/nomad.d/nomad.hcl
marker: "# {mark} PLUGIN DIRECTORY CONFIGURATION"
block: |
plugin_dir = "{{ nomad_plugins_dir }}"
insertafter: 'data_dir = "/opt/nomad/data"'
- name: Fix Podman socket permissions
file:
path: /run/user/1001/podman/podman.sock
mode: '0666'
ignore_errors: yes
- name: Ensure nomad user can access Podman socket
user:
name: "{{ nomad_user }}"
groups: ben
append: yes
- name: Start Nomad service
systemd:
name: nomad
state: started
enabled: yes
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: localhost
delay: 10
timeout: 60
- name: Verify Nomad is running
systemd:
name: nomad
register: nomad_service_status
- name: Display Nomad service status
debug:
msg: "Nomad service is {{ nomad_service_status.status.ActiveState }}"
- name: Wait for plugins to load
pause:
seconds: 15
- name: Check available drivers
shell: |
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -self | grep -A 20 "Driver Status"
register: driver_status
failed_when: false
- name: Display driver status
debug:
var: driver_status.stdout_lines
- name: Test Podman driver functionality
shell: |
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers | keys[]'
register: available_drivers
failed_when: false
- name: Display available drivers
debug:
msg: "Available drivers: {{ available_drivers.stdout_lines | join(', ') }}"
- name: Clean up downloaded files
file:
path: "{{ item }}"
state: absent
loop:
- "/tmp/nomad-driver-podman_{{ podman_driver_version }}_linux_amd64.zip"
- "/tmp/nomad-driver-podman"
- name: Final verification - Check if Podman driver is loaded
shell: |
sudo -u {{ nomad_user }} /usr/local/bin/nomad node status -json | jq -r '.Drivers.podman.Detected'
register: podman_driver_detected
failed_when: false
- name: Display final result
debug:
msg: |
Podman driver installation: {{ 'SUCCESS' if podman_driver_detected.stdout == 'true' else 'NEEDS VERIFICATION' }}
Driver detected: {{ podman_driver_detected.stdout | default('unknown') }}

View File

@@ -0,0 +1,61 @@
---
- name: Install Podman Compose on all Nomad cluster nodes
hosts: nomad_cluster
become: yes
tasks:
- name: Display target node
debug:
msg: "正在安装 Podman Compose 到节点: {{ inventory_hostname }}"
- name: Update package cache
apt:
update_cache: yes
ignore_errors: yes
- name: Install Podman and related tools
apt:
name:
- podman
- podman-compose
- buildah
- skopeo
state: present
ignore_errors: yes
- name: Install additional dependencies
apt:
name:
- python3-pip
- python3-setuptools
state: present
ignore_errors: yes
- name: Install podman-compose via pip if package manager failed
pip:
name: podman-compose
state: present
ignore_errors: yes
- name: Verify Podman installation
shell: podman --version
register: podman_version
- name: Verify Podman Compose installation
shell: podman-compose --version
register: podman_compose_version
ignore_errors: yes
- name: Display installation results
debug:
msg: |
✅ 节点 {{ inventory_hostname }} 安装结果:
📦 Podman: {{ podman_version.stdout }}
🐳 Podman Compose: {{ podman_compose_version.stdout if podman_compose_version.rc == 0 else '安装失败或不可用' }}
- name: Ensure Podman socket is enabled
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes

View File

@@ -0,0 +1,115 @@
---
- name: 在Kali Linux上安装和配置VNC服务器
hosts: kali
become: yes
vars:
vnc_password: "3131" # VNC连接密码
vnc_port: "5901" # VNC服务端口
vnc_geometry: "1280x1024" # VNC分辨率
vnc_depth: "24" # 颜色深度
tasks:
- name: 更新APT缓存
apt:
update_cache: yes
- name: 安装VNC服务器和客户端
apt:
name:
- tigervnc-standalone-server
- tigervnc-viewer
- xfce4
- xfce4-goodies
state: present
- name: 创建VNC配置目录
file:
path: /home/ben/.vnc
state: directory
owner: ben
group: ben
mode: '0700'
- name: 设置VNC密码
shell: |
echo "{{ vnc_password }}" | vncpasswd -f > /home/ben/.vnc/passwd
echo "{{ vnc_password }}" | vncpasswd -f > /home/ben/.vnc/passwd2
become_user: ben
- name: 设置VNC密码文件权限
file:
path: /home/ben/.vnc/passwd
owner: ben
group: ben
mode: '0600'
- name: 设置VNC密码文件2权限
file:
path: /home/ben/.vnc/passwd2
owner: ben
group: ben
mode: '0600'
- name: 创建VNC启动脚本
copy:
dest: /home/ben/.vnc/xstartup
content: |
#!/bin/bash
unset SESSION_MANAGER
unset DBUS_SESSION_BUS_ADDRESS
exec startxfce4
owner: ben
group: ben
mode: '0755'
- name: 创建VNC服务文件
copy:
dest: /etc/systemd/system/vncserver@.service
content: |
[Unit]
Description=Start TigerVNC server at startup
After=syslog.target network.target
[Service]
Type=forking
User=ben
Group=ben
WorkingDirectory=/home/ben
PIDFile=/home/ben/.vnc/%H:%i.pid
ExecStartPre=-/usr/bin/vncserver -kill :%i
ExecStart=/usr/bin/vncserver -depth {{ vnc_depth }} -geometry {{ vnc_geometry }} :%i
ExecStop=/usr/bin/vncserver -kill :%i
[Install]
WantedBy=multi-user.target
- name: 重新加载systemd配置
systemd:
daemon_reload: yes
- name: 启用并启动VNC服务
systemd:
name: vncserver@1.service
enabled: yes
state: started
- name: 检查VNC服务状态
command: systemctl status vncserver@1.service
register: vnc_status
ignore_errors: yes
- name: 显示VNC服务状态
debug:
msg: "{{ vnc_status.stdout_lines }}"
- name: 显示VNC连接信息
debug:
msg: |
VNC服务器已成功配置
连接信息:
- 地址: {{ ansible_host }}
- 端口: {{ vnc_port }}
- 密码: {{ vnc_password }}
- 连接命令: vnc://{{ ansible_host }}:{{ vnc_port }}
- 使用macOS屏幕共享应用连接到上述地址

View File

@@ -0,0 +1,36 @@
---
# install_vault.yml
- name: Install HashiCorp Vault
hosts: vault_servers
become: yes
tasks:
- name: Check if Vault is already installed
command: which vault
register: vault_check
ignore_errors: yes
changed_when: false
- name: Install Vault using apt
apt:
name: vault
state: present
update_cache: yes
when: vault_check.rc != 0
- name: Create Vault data directory
file:
path: "{{ vault_data_dir | default('/opt/nomad/data/vault/config') }}"
state: directory
owner: root
group: root
mode: '0755'
recurse: yes
- name: Verify Vault installation
command: vault --version
register: vault_version
changed_when: false
- name: Display Vault version
debug:
var: vault_version.stdout

View File

@@ -0,0 +1,42 @@
---
- name: 配置Nomad节点NFS挂载
hosts: nomad_nodes
become: yes
vars:
nfs_server: "snail"
nfs_share: "/fs/1000/nfs/Fnsync"
mount_point: "/mnt/fnsync"
tasks:
- name: 安装NFS客户端
package:
name: nfs-common
state: present
- name: 创建挂载目录
file:
path: "{{ mount_point }}"
state: directory
mode: '0755'
- name: 临时挂载NFS共享
mount:
path: "{{ mount_point }}"
src: "{{ nfs_server }}:{{ nfs_share }}"
fstype: nfs4
opts: "rw,relatime,vers=4.2"
state: mounted
- name: 配置开机自动挂载
lineinfile:
path: /etc/fstab
line: "{{ nfs_server }}:{{ nfs_share }} {{ mount_point }} nfs4 rw,relatime,vers=4.2 0 0"
state: present
- name: 验证挂载
command: df -h {{ mount_point }}
register: mount_check
- name: 显示挂载信息
debug:
var: mount_check.stdout_lines

View File

@@ -0,0 +1,86 @@
---
- name: 恢复客户端节点的/etc/hosts文件
hosts: nomad_clients
become: yes
tasks:
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.116\\.158\\.95\\s"
state: absent
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.81\\.26\\.3\\s"
state: absent
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.103\\.147\\.94\\s"
state: absent
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.90\\.159\\.68\\s"
state: absent
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.86\\.141\\.112\\s"
state: absent
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.98\\.209\\.50\\s"
state: absent
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.120\\.225\\.29\\s"
state: absent
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.117\\.106\\.136\\s"
state: absent
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.116\\.80\\.94\\s"
state: absent
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.116\\.112\\.45\\s"
state: absent
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.97\\.62\\.111\\s"
state: absent
- name: 删除添加的主机名解析条目
lineinfile:
path: /etc/hosts
regexp: "^100\\.122\\.197\\.112\\s"
state: absent
- name: 显示恢复后的/etc/hosts文件内容
command: cat /etc/hosts
register: hosts_content
changed_when: false
- name: 显示/etc/hosts文件内容
debug:
var: hosts_content.stdout_lines

View File

@@ -0,0 +1,81 @@
---
- name: Setup complete SSH key authentication for browser host
hosts: browser
become: yes
vars:
target_user: ben
ssh_key_comment: "ansible-generated-key-for-{{ inventory_hostname }}"
tasks:
- name: Copy existing Ed25519 SSH public key to target user
copy:
src: /root/.ssh/id_ed25519.pub
dest: /home/{{ target_user }}/.ssh/id_ed25519.pub
owner: "{{ target_user }}"
group: "{{ target_user }}"
mode: '0644'
- name: Copy existing Ed25519 SSH private key to target user
copy:
src: /root/.ssh/id_ed25519
dest: /home/{{ target_user }}/.ssh/id_ed25519
owner: "{{ target_user }}"
group: "{{ target_user }}"
mode: '0600'
- name: Get SSH public key content
command: cat /home/{{ target_user }}/.ssh/id_ed25519.pub
register: ssh_public_key
become_user: "{{ target_user }}"
changed_when: false
- name: Ensure .ssh directory exists for user
file:
path: /home/{{ target_user }}/.ssh
state: directory
owner: "{{ target_user }}"
group: "{{ target_user }}"
mode: '0700'
- name: Add public key to authorized_keys
authorized_key:
user: "{{ target_user }}"
state: present
key: "{{ ssh_public_key.stdout }}"
become_user: "{{ target_user }}"
- name: Keep password authentication enabled as a fallback
lineinfile:
path: /etc/ssh/sshd_config
regexp: '^PasswordAuthentication'
line: 'PasswordAuthentication yes'
backup: yes
notify: restart sshd
when: ansible_connection != 'local'
- name: Configure SSH to allow key authentication
lineinfile:
path: /etc/ssh/sshd_config
regexp: '^PubkeyAuthentication'
line: 'PubkeyAuthentication yes'
backup: yes
notify: restart sshd
when: ansible_connection != 'local'
- name: Configure SSH authorized keys file permissions
file:
path: /home/{{ target_user }}/.ssh/authorized_keys
owner: "{{ target_user }}"
group: "{{ target_user }}"
mode: '0600'
- name: Display success message
debug:
msg: "SSH key authentication has been configured for user {{ target_user }} on {{ inventory_hostname }}"
handlers:
- name: restart sshd
systemd:
name: sshd
state: restarted
when: ansible_connection != 'local'

View File

@@ -0,0 +1,62 @@
---
- name: Setup SSH key authentication for browser host
hosts: browser
become: yes
vars:
target_user: ben
ssh_key_comment: "ansible-generated-key"
tasks:
- name: Generate SSH key pair if it doesn't exist
user:
name: "{{ target_user }}"
generate_ssh_key: yes
ssh_key_bits: 4096
ssh_key_comment: "{{ ssh_key_comment }}"
become_user: "{{ target_user }}"
- name: Get SSH public key content
command: cat /home/{{ target_user }}/.ssh/id_rsa.pub
register: ssh_public_key
become_user: "{{ target_user }}"
changed_when: false
- name: Display SSH public key for manual configuration
debug:
msg: |
SSH Public Key for {{ inventory_hostname }}:
{{ ssh_public_key.stdout }}
To complete key-based authentication setup:
1. Copy the above public key to the target system's authorized_keys
2. Or use ssh-copy-id command from this system:
ssh-copy-id -i /home/{{ target_user }}/.ssh/id_rsa.pub {{ target_user }}@{{ inventory_hostname }}
- name: Ensure .ssh directory exists for user
file:
path: /home/{{ target_user }}/.ssh
state: directory
owner: "{{ target_user }}"
group: "{{ target_user }}"
mode: '0700'
- name: Keep password authentication enabled as a fallback
lineinfile:
path: /etc/ssh/sshd_config
regexp: '^PasswordAuthentication'
line: 'PasswordAuthentication yes'
backup: yes
notify: restart sshd
- name: Configure SSH to allow key authentication
lineinfile:
path: /etc/ssh/sshd_config
regexp: '^PubkeyAuthentication'
line: 'PubkeyAuthentication yes'
backup: yes
notify: restart sshd
handlers:
- name: restart sshd
systemd:
name: sshd
state: restarted

View File

@@ -0,0 +1,43 @@
---
- name: 设置Nomad节点NFS挂载
hosts: nomad_nodes
become: yes
vars:
nfs_server: "snail"
nfs_share: "/fs/1000/nfs/Fnsync"
mount_point: "/mnt/fnsync"
tasks:
- name: 安装NFS客户端
package:
name: nfs-common
state: present
- name: 创建挂载目录
file:
path: "{{ mount_point }}"
state: directory
mode: '0755'
- name: 临时挂载NFS共享
mount:
path: "{{ mount_point }}"
src: "{{ nfs_server }}:{{ nfs_share }}"
fstype: nfs4
opts: "rw,relatime,vers=4.2"
state: mounted
- name: 配置开机自动挂载
lineinfile:
path: /etc/fstab
line: "{{ nfs_server }}:{{ nfs_share }} {{ mount_point }} nfs4 rw,relatime,vers=4.2 0 0"
state: present
- name: 验证挂载
command: df -h {{ mount_point }}
register: mount_check
- name: 显示挂载信息
debug:
var: mount_check.stdout_lines

View File

@@ -0,0 +1,187 @@
---
- name: 部署 Telegraf 硬盘监控到 Nomad 集群
hosts: all
become: yes
vars:
# 连接现有的 InfluxDB 2.x + Grafana 监控栈
influxdb_url: "{{ influxdb_url | default('http://influxdb1.tailnet-68f9.ts.net:8086') }}"
influxdb_token: "{{ influxdb_token }}"
influxdb_org: "{{ influxdb_org | default('nomad') }}"
influxdb_bucket: "{{ influxdb_bucket | default('nomad_monitoring') }}"
# 远程 Telegraf 配置模式(优先)
use_remote_config: "{{ use_remote_config | default(true) }}"
telegraf_config_url: "{{ telegraf_config_url | default('') }}"
# 硬盘监控阈值
disk_usage_warning: 80 # 80% 使用率警告
disk_usage_critical: 90 # 90% 使用率严重告警
# 监控间隔(秒)
collection_interval: 30
tasks:
- name: 显示正在处理的节点
debug:
msg: "🔧 正在为节点 {{ inventory_hostname }} 安装硬盘监控"
- name: 添加 InfluxData 仓库密钥
apt_key:
url: https://repos.influxdata.com/influxdata-archive_compat.key
state: present
retries: 3
delay: 5
- name: 添加 InfluxData 仓库
apt_repository:
repo: "deb https://repos.influxdata.com/ubuntu {{ ansible_distribution_release }} stable"
state: present
update_cache: yes
retries: 3
delay: 5
- name: 安装 Telegraf
apt:
name: telegraf
state: present
update_cache: yes
retries: 3
delay: 10
- name: 创建 Telegraf 配置目录
file:
path: /etc/telegraf/telegraf.d
state: directory
owner: telegraf
group: telegraf
mode: '0755'
- name: 清理旧的 Telegraf 日志文件(节省硬盘空间)
file:
path: "{{ item }}"
state: absent
loop:
- /var/log/telegraf
- /var/log/telegraf.log
ignore_errors: yes
- name: 禁用 Telegraf 日志目录创建
file:
path: /var/log/telegraf
state: absent
ignore_errors: yes
- name: 创建 Telegraf 环境变量文件
template:
src: telegraf-env.j2
dest: /etc/default/telegraf
owner: root
group: root
mode: '0600'
backup: yes
notify: restart telegraf
- name: 创建 Telegraf systemd 服务文件(支持远程配置)
template:
src: telegraf.service.j2
dest: /etc/systemd/system/telegraf.service
owner: root
group: root
mode: '0644'
backup: yes
notify:
- reload systemd
- restart telegraf
when: telegraf_config_url is defined and telegraf_config_url != ''
- name: 生成 Telegraf 主配置文件(本地配置模式)
template:
src: telegraf.conf.j2
dest: /etc/telegraf/telegraf.conf
owner: telegraf
group: telegraf
mode: '0644'
backup: yes
notify: restart telegraf
when: telegraf_config_url is not defined or telegraf_config_url == ''
- name: 生成硬盘监控配置
template:
src: disk-monitoring.conf.j2
dest: /etc/telegraf/telegraf.d/disk-monitoring.conf
owner: telegraf
group: telegraf
mode: '0644'
backup: yes
notify: restart telegraf
- name: 生成系统监控配置
template:
src: system-monitoring.conf.j2
dest: /etc/telegraf/telegraf.d/system-monitoring.conf
owner: telegraf
group: telegraf
mode: '0644'
backup: yes
notify: restart telegraf
- name: 启用并启动 Telegraf 服务
systemd:
name: telegraf
state: started
enabled: yes
daemon_reload: yes
- name: 验证 Telegraf 状态
systemd:
name: telegraf
register: telegraf_status
- name: 检查 InfluxDB 连接
uri:
url: "{{ influxdb_url }}/ping"
method: GET
timeout: 5
register: influxdb_ping
ignore_errors: yes
delegate_to: localhost
run_once: true
- name: 显示 InfluxDB 连接状态
debug:
msg: "{{ '✅ InfluxDB 连接正常' if influxdb_ping.status == 204 else '❌ InfluxDB 连接失败,请检查配置' }}"
run_once: true
- name: 显示 Telegraf 状态
debug:
msg: "✅ Telegraf 状态: {{ telegraf_status.status.ActiveState }}"
- name: 检查硬盘使用情况
shell: |
df -h | grep -vE '^Filesystem|tmpfs|cdrom|udev' | awk '{print $5 " " $1 " " $6}' | while read output;
do
usage=$(echo $output | awk '{print $1}' | sed 's/%//g')
partition=$(echo $output | awk '{print $2}')
mount=$(echo $output | awk '{print $3}')
if [ $usage -ge {{ disk_usage_warning }} ]; then
echo "⚠️ 警告: $mount ($partition) 使用率 $usage%"
else
echo "✅ $mount ($partition) 使用率 $usage%"
fi
done
register: disk_check
changed_when: false
- name: 显示硬盘检查结果
debug:
msg: "{{ disk_check.stdout_lines }}"
handlers:
- name: reload systemd
systemd:
daemon_reload: yes
- name: restart telegraf
systemd:
name: telegraf
state: restarted

View File

@@ -0,0 +1,76 @@
---
- name: 安装并配置新的 Nomad Server 节点
hosts: influxdb1
become: yes
gather_facts: no
tasks:
- name: 更新包缓存
apt:
update_cache: yes
cache_valid_time: 3600
retries: 3
delay: 10
- name: 安装依赖包
apt:
name:
- wget
- curl
- unzip
- podman
- buildah
- skopeo
state: present
retries: 3
delay: 10
- name: 检查 Nomad 是否已安装
shell: which nomad || echo "not_found"
register: nomad_check
changed_when: false
- name: 下载并安装 Nomad
block:
- name: 下载 Nomad 1.10.5
get_url:
url: "https://releases.hashicorp.com/nomad/1.10.5/nomad_1.10.5_linux_amd64.zip"
dest: "/tmp/nomad.zip"
mode: '0644'
- name: 解压 Nomad
unarchive:
src: "/tmp/nomad.zip"
dest: "/usr/bin/"
remote_src: yes
owner: root
group: root
mode: '0755'
- name: 清理临时文件
file:
path: "/tmp/nomad.zip"
state: absent
when: nomad_check.stdout == "not_found"
- name: 验证 Nomad 安装
shell: nomad version
register: nomad_version_output
- name: 显示安装结果
debug:
msg: |
✅ 节点 {{ inventory_hostname }} 软件安装完成
📦 Podman: {{ ansible_facts.packages.podman[0].version if ansible_facts.packages.podman is defined else 'checking...' }}
🎯 Nomad: {{ nomad_version_output.stdout.split('\n')[0] }}
- name: 启用 Podman socket
systemd:
name: podman.socket
enabled: yes
state: started
ignore_errors: yes
- name: 继续完整配置
debug:
msg: "软件安装完成,现在将运行完整的 Nomad 配置..."

View File

@@ -0,0 +1,114 @@
---
- name: Setup Xfce desktop environment and Chrome Dev for browser automation
hosts: browser
become: yes
vars:
target_user: ben
tasks:
- name: Update package lists
apt:
update_cache: yes
cache_valid_time: 3600
- name: Install Xfce desktop environment
apt:
name:
- xfce4
- xfce4-goodies
- lightdm
- xorg
- dbus-x11
state: present
- name: Install additional useful packages for desktop environment
apt:
name:
- firefox-esr
- geany
- thunar-archive-plugin
- xfce4-terminal
- gvfs
- fonts-noto
- fonts-noto-cjk
state: present
- name: Download Google Chrome Dev .deb package
get_url:
url: https://dl.google.com/linux/direct/google-chrome-unstable_current_amd64.deb
dest: /tmp/google-chrome-unstable_current_amd64.deb
mode: '0644'
- name: Install Google Chrome Dev
apt:
deb: /tmp/google-chrome-unstable_current_amd64.deb
- name: Clean up downloaded .deb package
file:
path: /tmp/google-chrome-unstable_current_amd64.deb
state: absent
- name: Install Chrome automation dependencies
apt:
name:
- python3-pip
- python3-venv
- python3-dev
- build-essential
- libssl-dev
- libffi-dev
state: present
- name: Install Python packages for browser automation
pip:
name:
- selenium
- webdriver-manager
- pyvirtualdisplay
executable: pip3
- name: Set up Xfce as default desktop environment
copy:
dest: /etc/lightdm/lightdm.conf
content: |
[Seat:*]
autologin-user={{ target_user }}
autologin-user-timeout=0
autologin-session=xfce
user-session=xfce
- name: Ensure user is in necessary groups
user:
name: "{{ target_user }}"
groups:
- audio
- video
- input
- netdev
append: yes
- name: Create .xprofile for user
copy:
dest: /home/{{ target_user }}/.xprofile
content: |
# Start Xfce on login
startxfce4
owner: "{{ target_user }}"
group: "{{ target_user }}"
mode: '0644'
- name: Enable and start lightdm service
systemd:
name: lightdm
enabled: yes
state: started
- name: Display success message
debug:
msg: "Xfce desktop environment and Chrome Dev have been configured for user {{ target_user }} on {{ inventory_hostname }}"
handlers:
- name: restart lightdm
systemd:
name: lightdm
state: restarted

View File

@@ -0,0 +1,33 @@
---
- name: 启动所有Nomad服务器形成集群
hosts: nomad_servers
become: yes
tasks:
- name: 检查Nomad服务状态
systemd:
name: nomad
register: nomad_status
- name: 启动Nomad服务如果未运行
systemd:
name: nomad
state: started
enabled: yes
when: nomad_status.status.ActiveState != "active"
- name: 等待Nomad服务启动
wait_for:
port: 4646
host: "{{ ansible_host }}"
timeout: 30
- name: 显示Nomad服务状态
debug:
msg: "{{ inventory_hostname }} Nomad服务状态: {{ nomad_status.status.ActiveState }}"

View File

@@ -0,0 +1,61 @@
# Consul Client Configuration for {{ inventory_hostname }}
datacenter = "dc1"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "{{ inventory_hostname }}"
bind_addr = "{{ hostvars[inventory_hostname]['tailscale_ip'] }}"
# Client mode (not server)
server = false
# Connect to Consul servers (指向三节点集群)
retry_join = [
{% for server in consul_servers %}
"{{ server }}"{% if not loop.last %},{% endif %}
{% endfor %}
]
# Performance optimization
performance {
raft_multiplier = 5
}
# Ports configuration
ports {
grpc = 8502
http = 8500
dns = 8600
}
# Enable Connect for service mesh
connect {
enabled = true
}
# Cache configuration for performance
cache {
entry_fetch_max_burst = 42
entry_fetch_rate = 30
}
# Node metadata
node_meta = {
region = "unknown"
zone = "nomad-{{ 'server' if 'server' in group_names else 'client' }}"
}
# UI disabled for clients
ui_config {
enabled = false
}
# ACL configuration (if needed)
acl = {
enabled = false
default_policy = "allow"
}
# Logging
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7

View File

@@ -0,0 +1,106 @@
datacenter = "dc1"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "{{ ansible_hostname }}"
bind_addr = "0.0.0.0"
addresses {
http = "{{ ansible_host }}"
rpc = "{{ ansible_host }}"
serf = "{{ ansible_host }}"
}
advertise {
http = "{{ ansible_host }}:4646"
rpc = "{{ ansible_host }}:4647"
serf = "{{ ansible_host }}:4648"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = true
bootstrap_expect = 3
server_join {
retry_join = [
"semaphore.tailnet-68f9.ts.net:4648",
"ash1d.tailnet-68f9.ts.net:4648",
"ash2e.tailnet-68f9.ts.net:4648",
"ch2.tailnet-68f9.ts.net:4648",
"ch3.tailnet-68f9.ts.net:4648",
"onecloud1.tailnet-68f9.ts.net:4648",
"de.tailnet-68f9.ts.net:4648",
"hcp1.tailnet-68f9.ts.net:4648"
]
}
}
{% if ansible_hostname == 'hcp1' %}
client {
enabled = true
network_interface = "tailscale0"
servers = [
"semaphore.tailnet-68f9.ts.net:4647",
"ash1d.tailnet-68f9.ts.net:4647",
"ash2e.tailnet-68f9.ts.net:4647",
"ch2.tailnet-68f9.ts.net:4647",
"ch3.tailnet-68f9.ts.net:4647",
"onecloud1.tailnet-68f9.ts.net:4647",
"de.tailnet-68f9.ts.net:4647",
"hcp1.tailnet-68f9.ts.net:4647"
]
host_volume "traefik-certs" {
path = "/opt/traefik/certs"
read_only = false
}
host_volume "fnsync" {
path = "/mnt/fnsync"
read_only = false
}
meta {
consul = "true"
consul_version = "1.21.5"
consul_client = "true"
}
gc_interval = "5m"
gc_disk_usage_threshold = 80
gc_inode_usage_threshold = 70
}
plugin "nomad-driver-podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
{% endif %}
consul {
address = "ch4.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500"
server_service_name = "nomad"
client_service_name = "nomad-client"
auto_advertise = true
server_auto_join = false
client_auto_join = true
}
telemetry {
collection_interval = "1s"
disable_hostname = false
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}

View File

@@ -0,0 +1,110 @@
# Kali Linux Ansible Test Suite
This directory contains a collection of Ansible playbooks for testing Kali Linux systems.
## Test Playbooks
### 1. kali-health-check.yml
**Purpose**: Quick Kali Linux health check
**Description**: Runs basic system status checks, covering system information, update status, disk space, installation status of key tools, network connectivity, system load, and the SSH service.
**Usage**:
```bash
cd /root/mgmt/configuration
ansible-playbook -i inventories/production/inventory.ini playbooks/test/kali-health-check.yml
```
### 2. kali-security-tools.yml
**Purpose**: Kali Linux security tool tests
**Description**: Tests the installation and basic functionality of common Kali Linux security tools, including:
- Nmap
- Metasploit Framework
- Wireshark
- John the Ripper
- Hydra
- SQLMap
- Aircrack-ng
- Burp Suite
- Netcat
- Curl
**Usage**:
```bash
cd /root/mgmt/configuration
ansible-playbook -i inventories/production/inventory.ini playbooks/test/kali-security-tools.yml
```
### 3. test-kali.yml
**Purpose**: Full Kali Linux system test
**Description**: Runs a comprehensive system test, including:
- Basic system information collection
- Network connectivity tests
- Package manager tests
- Kali tool checks
- System security checks
- System performance tests
- Network tool tests
- Generation of a detailed test report
**Usage**:
```bash
cd /root/mgmt/configuration
ansible-playbook -i inventories/production/inventory.ini playbooks/test/test-kali.yml
```
### 4. kali-full-test-suite.yml
**Purpose**: Complete Kali Linux test suite
**Description**: Runs all of the above tests in sequence to provide full test coverage.
**Usage**:
```bash
cd /root/mgmt/configuration
ansible-playbook playbooks/test/kali-full-test-suite.yml
```
## Test Results
### Health check
- Results are printed directly in the terminal
- No additional files are generated
### Security tool tests
- A result summary is printed in the terminal
- A report file is written on the Kali system at `/tmp/kali_security_tools_report.md`
### Full system test
- Test progress is printed in the terminal
- A `/tmp/kali_test_results/` directory is created on the Kali system (see the fetch example below), containing:
  - `system_info.txt`: basic system information
  - `tool_check.txt`: Kali tool check results
  - `security_check.txt`: system security checks
  - `performance.txt`: system performance information
  - `network_tools.txt`: network tool tests
  - `kali_test.log`: full test log
  - `README.md`: test report summary
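If you want the generated reports on the control node instead of leaving them on the Kali host, an ad-hoc `fetch` along these lines should work (the inventory path matches the commands above; the local `./kali-test-results/` destination is only an illustrative choice):
```bash
cd /root/mgmt/configuration
# Pull the Markdown summary back from each kali host into ./kali-test-results/<hostname>/...
ansible kali -i inventories/production/inventory.ini -b \
  -m ansible.builtin.fetch \
  -a "src=/tmp/kali_test_results/README.md dest=./kali-test-results/ flat=no"
```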
## Prerequisites
1. The Kali system is correctly configured in the inventory
2. Ansible can connect to the Kali system
3. You have sufficient privileges to run the tests on the Kali system
## Notes
1. Some tests require network connectivity
2. The full system test can take a while to complete
3. Test result files are stored in a temporary directory on the Kali system
4. Clean up test result files regularly to save disk space (one way to do this is shown below)
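For item 4, a one-off cleanup can be done with ad-hoc `file` calls; the paths below are the ones created by the tests above:
```bash
cd /root/mgmt/configuration
# Remove the full-test results directory and the security tools report from the kali host(s)
ansible kali -i inventories/production/inventory.ini -b -m ansible.builtin.file \
  -a "path=/tmp/kali_test_results state=absent"
ansible kali -i inventories/production/inventory.ini -b -m ansible.builtin.file \
  -a "path=/tmp/kali_security_tools_report.md state=absent"
```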
## Troubleshooting
If a test fails, check the following (a quick connectivity check is sketched after this list):
1. Network connectivity
2. The Ansible inventory configuration
3. SSH connectivity
4. Whether the Kali system itself is running normally
5. Whether you have sufficient privileges to run the tests
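A minimal first check for items 1-3 is an ad-hoc ping against the same inventory:
```bash
cd /root/mgmt/configuration
# Succeeds only if the inventory entry, SSH access and Python on the target all work
ansible kali -i inventories/production/inventory.ini -m ansible.builtin.ping -o
```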
## Custom Tests
You can modify the tests in these playbooks or add new test tasks as needed. All playbooks use a modular design, which makes them easy to extend and maintain; a minimal example of an additional task is sketched below.
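As a sketch of what such an addition could look like, the task pair below follows the same pattern the existing playbooks use (a `which` lookup plus a debug message); the tool name `nikto` is only an example and not part of the current test set:
```yaml
    - name: Check whether nikto is installed
      command: "which nikto"
      register: nikto_check
      ignore_errors: yes
      changed_when: false

    - name: Show nikto check result
      debug:
        msg: "nikto: {{ 'installed' if nikto_check.rc == 0 else 'not installed' }}"
```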

View File

@@ -0,0 +1,50 @@
---
- name: Kali Linux 完整测试套件
hosts: localhost
gather_facts: no
tasks:
- name: 显示测试开始信息
debug:
msg: "开始执行 Kali Linux 完整测试套件"
- name: 执行Kali快速健康检查
command: "ansible-playbook -i ../inventories/production/inventory.ini kali-health-check.yml"
args:
chdir: "/root/mgmt/configuration/playbooks/test"
register: health_check_result
- name: 显示健康检查结果
debug:
msg: "健康检查完成,退出码: {{ health_check_result.rc }}"
- name: 执行Kali安全工具测试
command: "ansible-playbook -i ../inventories/production/inventory.ini kali-security-tools.yml"
args:
chdir: "/root/mgmt/configuration/playbooks/test"
register: security_tools_result
- name: 显示安全工具测试结果
debug:
msg: "安全工具测试完成,退出码: {{ security_tools_result.rc }}"
- name: 执行Kali完整系统测试
command: "ansible-playbook -i ../inventories/production/inventory.ini test-kali.yml"
args:
chdir: "/root/mgmt/configuration/playbooks/test"
register: full_test_result
- name: 显示完整测试结果
debug:
msg: "完整系统测试完成,退出码: {{ full_test_result.rc }}"
- name: 显示测试完成信息
debug:
msg: |
Kali Linux 完整测试套件执行完成!
测试结果摘要:
- 健康检查: {{ '成功' if health_check_result.rc == 0 else '失败' }}
- 安全工具测试: {{ '成功' if security_tools_result.rc == 0 else '失败' }}
- 完整系统测试: {{ '成功' if full_test_result.rc == 0 else '失败' }}
详细测试结果请查看各测试生成的报告文件。

View File

@@ -0,0 +1,86 @@
---
- name: Kali Linux 快速健康检查
hosts: kali
become: yes
gather_facts: yes
tasks:
- name: 显示系统基本信息
debug:
msg: |
=== Kali Linux 系统信息 ===
主机名: {{ ansible_hostname }}
操作系统: {{ ansible_distribution }} {{ ansible_distribution_version }}
内核版本: {{ ansible_kernel }}
架构: {{ ansible_architecture }}
CPU核心数: {{ ansible_processor_vcpus }}
内存总量: {{ ansible_memtotal_mb }} MB
- name: 修复损坏的依赖关系
command: apt --fix-broken install -y
when: ansible_os_family == "Debian"
ignore_errors: yes
- name: 检查系统更新状态
apt:
update_cache: yes
upgrade: dist
check_mode: yes
register: update_check
changed_when: false
ignore_errors: yes
- name: 显示系统更新状态
debug:
msg: "{% if update_check.changed %}系统有可用更新{% else %}系统已是最新{% endif %}"
- name: 检查磁盘空间
command: "df -h /"
register: disk_space
- name: 显示根分区磁盘空间
debug:
msg: "根分区使用情况: {{ disk_space.stdout_lines[1] }}"
- name: 检查关键Kali工具
command: "which {{ item }}"
loop:
- nmap
- msfconsole # the metasploit-framework package provides the msfconsole binary, which is what `which` can find
- wireshark
register: tool_check
ignore_errors: yes
changed_when: false
- name: 显示工具检查结果
debug:
msg: "{% for result in tool_check.results %}{{ result.item }}: {% if result.rc == 0 %}已安装{% else %}未安装{% endif %}{% endfor %}"
- name: 检查网络连接
uri:
url: https://httpbin.org/get
method: GET
timeout: 5
register: network_test
ignore_errors: yes
- name: 显示网络连接状态
debug:
msg: "{% if network_test.failed %}网络连接测试失败{% else %}网络连接正常{% endif %}"
- name: 检查系统负载
command: "uptime"
register: uptime
- name: 显示系统负载
debug:
msg: "系统负载: {{ uptime.stdout }}"
- name: 检查SSH服务状态
systemd:
name: ssh
register: ssh_service
- name: 显示SSH服务状态
debug:
msg: "SSH服务状态: {{ ssh_service.status.ActiveState }}"

View File

@@ -0,0 +1,228 @@
---
- name: Kali Linux 安全工具测试
hosts: kali
become: yes
gather_facts: yes
vars:
test_results: []
tasks:
- name: 初始化测试结果
set_fact:
test_results: []
- name: 测试Nmap
block:
- name: 检查Nmap是否安装
command: "which nmap"
register: nmap_check
ignore_errors: yes
changed_when: false
- name: 测试Nmap基本功能
command: "nmap -sn 127.0.0.1"
register: nmap_test
when: nmap_check.rc == 0
ignore_errors: yes
changed_when: false
- name: 记录Nmap测试结果
set_fact:
test_results: "{{ test_results + ['Nmap: ' + ('✓ 正常工作' if nmap_check.rc == 0 and nmap_test.rc == 0 else '✗ 未安装或异常')] }}"
- name: 测试Metasploit Framework
block:
- name: 检查Metasploit是否安装
command: "which msfconsole"
register: msf_check
ignore_errors: yes
changed_when: false
- name: 测试Metasploit版本
command: "msfconsole --version"
register: msf_version
when: msf_check.rc == 0
ignore_errors: yes
changed_when: false
- name: 记录Metasploit测试结果
set_fact:
test_results: "{{ test_results + ['Metasploit: ' + ('✓ 正常工作' if msf_check.rc == 0 else '✗ 未安装')] }}"
- name: 测试Wireshark
block:
- name: 检查Wireshark是否安装
command: "which wireshark"
register: wireshark_check
ignore_errors: yes
changed_when: false
- name: 检查tshark是否可用
command: "which tshark"
register: tshark_check
when: wireshark_check.rc == 0
ignore_errors: yes
changed_when: false
- name: 记录Wireshark测试结果
set_fact:
test_results: "{{ test_results + ['Wireshark: ' + ('✓ 正常工作' if wireshark_check.rc == 0 else '✗ 未安装')] }}"
- name: 测试John the Ripper
block:
- name: 检查John是否安装
command: "which john"
register: john_check
ignore_errors: yes
changed_when: false
- name: 测试John版本
command: "john --version"
register: john_version
when: john_check.rc == 0
ignore_errors: yes
changed_when: false
- name: 记录John测试结果
set_fact:
test_results: "{{ test_results + ['John the Ripper: ' + ('✓ 正常工作' if john_check.rc == 0 else '✗ 未安装')] }}"
- name: 测试Hydra
block:
- name: 检查Hydra是否安装
command: "which hydra"
register: hydra_check
ignore_errors: yes
changed_when: false
- name: 测试Hydra帮助
command: "hydra -h"
register: hydra_help
when: hydra_check.rc == 0
ignore_errors: yes
changed_when: false
- name: 记录Hydra测试结果
set_fact:
test_results: "{{ test_results + ['Hydra: ' + ('✓ 正常工作' if hydra_check.rc == 0 else '✗ 未安装')] }}"
- name: 测试SQLMap
block:
- name: 检查SQLMap是否安装
command: "which sqlmap"
register: sqlmap_check
ignore_errors: yes
changed_when: false
- name: 测试SQLMap版本
command: "sqlmap --version"
register: sqlmap_version
when: sqlmap_check.rc == 0
ignore_errors: yes
changed_when: false
- name: 记录SQLMap测试结果
set_fact:
test_results: "{{ test_results + ['SQLMap: ' + ('✓ 正常工作' if sqlmap_check.rc == 0 else '✗ 未安装')] }}"
- name: 测试Aircrack-ng
block:
- name: 检查Aircrack-ng是否安装
command: "which airmon-ng"
register: aircrack_check
ignore_errors: yes
changed_when: false
- name: 测试Aircrack-ng版本
command: "airmon-ng --version"
register: aircrack_version
when: aircrack_check.rc == 0
ignore_errors: yes
changed_when: false
- name: 记录Aircrack-ng测试结果
set_fact:
test_results: "{{ test_results + ['Aircrack-ng: ' + ('✓ 正常工作' if aircrack_check.rc == 0 else '✗ 未安装')] }}"
- name: 测试Burp Suite
block:
- name: 检查Burp Suite是否安装
command: "which burpsuite"
register: burp_check
ignore_errors: yes
changed_when: false
- name: 记录Burp Suite测试结果
set_fact:
test_results: "{{ test_results + ['Burp Suite: ' + ('✓ 正常工作' if burp_check.rc == 0 else '✗ 未安装')] }}"
- name: 测试Netcat
block:
- name: 检查Netcat是否安装
command: "which nc"
register: nc_check
ignore_errors: yes
changed_when: false
- name: 测试Netcat基本功能
command: "nc -z 127.0.0.1 22"
register: nc_test
when: nc_check.rc == 0
ignore_errors: yes
changed_when: false
- name: 记录Netcat测试结果
set_fact:
test_results: "{{ test_results + ['Netcat: ' + ('✓ 正常工作' if nc_check.rc == 0 else '✗ 未安装')] }}"
- name: 测试Curl
block:
- name: 检查Curl是否安装
command: "which curl"
register: curl_check
ignore_errors: yes
changed_when: false
- name: 测试Curl基本功能
command: "curl -s -o /dev/null -w '%{http_code}' https://httpbin.org/get"
register: curl_test
when: curl_check.rc == 0
ignore_errors: yes
changed_when: false
- name: 记录Curl测试结果
set_fact:
test_results: "{{ test_results + ['Curl: ' + ('✓ 正常工作' if curl_check.rc == 0 else '✗ 未安装')] }}"
- name: 显示所有测试结果
debug:
msg: |
=== Kali Linux 安全工具测试结果 ===
{% for result in test_results %}
{{ result }}
{% endfor %}
- name: 生成测试报告
copy:
content: |
# Kali Linux 安全工具测试报告
**测试时间**: {{ ansible_date_time.iso8601 }}
**测试主机**: {{ ansible_hostname }}
## 测试结果
{% for result in test_results %}
{{ result }}
{% endfor %}
## 建议
{% for result in test_results %}
{% if '✗' in result %}
- {{ result.split(':')[0] }} 未安装,可以使用以下命令安装: `sudo apt install {{ result.split(':')[0].lower().replace(' ', '-') }}`
{% endif %}
{% endfor %}
dest: "/tmp/kali_security_tools_report.md"

View File

@@ -0,0 +1,260 @@
---
- name: Kali Linux 系统测试
hosts: kali
become: yes
gather_facts: yes
vars:
test_results_dir: "/tmp/kali_test_results"
test_log_file: "{{ test_results_dir }}/kali_test.log"
tasks:
- name: 创建测试结果目录
file:
path: "{{ test_results_dir }}"
state: directory
mode: '0755'
- name: 初始化测试日志
copy:
content: "Kali Linux 系统测试日志 - {{ ansible_date_time.iso8601 }}\n\n"
dest: "{{ test_log_file }}"
- name: 记录系统基本信息
block:
- name: 获取系统信息
setup:
register: system_info
- name: 记录系统信息到日志
copy:
content: |
=== 系统基本信息 ===
主机名: {{ ansible_hostname }}
操作系统: {{ ansible_distribution }} {{ ansible_distribution_version }}
内核版本: {{ ansible_kernel }}
架构: {{ ansible_architecture }}
CPU核心数: {{ ansible_processor_vcpus }}
内存总量: {{ ansible_memtotal_mb }} MB
磁盘空间: {{ ansible_mounts | map(attribute='size_total') | sum | human_readable }}
dest: "{{ test_results_dir }}/system_info.txt"
- name: 记录到主日志
lineinfile:
path: "{{ test_log_file }}"
line: "[✓] 系统基本信息收集完成"
- name: 测试网络连接
block:
- name: 测试网络连通性
uri:
url: https://www.google.com
method: GET
timeout: 10
register: network_test
ignore_errors: yes
- name: 记录网络测试结果
lineinfile:
path: "{{ test_log_file }}"
line: "{% if network_test.failed %}[✗] 网络连接测试失败{% else %}[✓] 网络连接测试成功{% endif %}"
- name: 测试包管理器
block:
- name: 更新包列表
apt:
update_cache: yes
changed_when: false
- name: 记录包管理器测试结果
lineinfile:
path: "{{ test_log_file }}"
line: "[✓] APT包管理器工作正常"
- name: 检查Kali工具
block:
- name: 检查常见Kali工具是否安装
command: "which {{ item }}"
loop:
- nmap
- metasploit-framework
- wireshark
- john
- hydra
- sqlmap
- burpsuite
- aircrack-ng
register: tool_check
ignore_errors: yes
changed_when: false
- name: 记录工具检查结果
copy:
content: |
=== Kali工具检查结果 ===
{% for result in tool_check.results %}
{{ result.item }}: {% if result.rc == 0 %}已安装{% else %}未安装{% endif %}
{% endfor %}
dest: "{{ test_results_dir }}/tool_check.txt"
- name: 记录到主日志
lineinfile:
path: "{{ test_log_file }}"
line: "[✓] Kali工具检查完成"
- name: 测试系统安全性
block:
- name: 检查防火墙状态
command: "ufw status"
register: firewall_status
ignore_errors: yes
changed_when: false
- name: 检查SSH配置
command: "grep -E '^PermitRootLogin|^PasswordAuthentication' /etc/ssh/sshd_config"
register: ssh_config
ignore_errors: yes
changed_when: false
- name: 记录安全检查结果
copy:
content: |
=== 系统安全检查 ===
防火墙状态:
{{ firewall_status.stdout }}
SSH配置:
{{ ssh_config.stdout }}
dest: "{{ test_results_dir }}/security_check.txt"
- name: 记录到主日志
lineinfile:
path: "{{ test_log_file }}"
line: "[✓] 系统安全检查完成"
- name: Test system performance
block:
- name: Get CPU usage
shell: "top -bn1 | grep 'Cpu(s)'"
register: cpu_usage
changed_when: false
- name: Get memory usage
command: "free -h"
register: memory_usage
changed_when: false
- name: Get disk usage
command: "df -h"
register: disk_usage
changed_when: false
- name: Record performance results
copy:
content: |
=== System Performance ===
CPU usage:
{{ cpu_usage.stdout }}
Memory usage:
{{ memory_usage.stdout }}
Disk usage:
{{ disk_usage.stdout }}
dest: "{{ test_results_dir }}/performance.txt"
- name: Append to main log
lineinfile:
path: "{{ test_log_file }}"
line: "[✓] System performance test completed"
- name: Test network tools
block:
- name: Test the ping command
command: "ping -c 4 8.8.8.8"
register: ping_test
ignore_errors: yes
changed_when: false
- name: Test the nslookup command
command: "nslookup google.com"
register: nslookup_test
ignore_errors: yes
changed_when: false
- name: Record network tool results
copy:
content: |
=== Network Tool Tests ===
Ping output:
{{ ping_test.stdout }}
Nslookup output:
{{ nslookup_test.stdout }}
dest: "{{ test_results_dir }}/network_tools.txt"
- name: Append to main log
lineinfile:
path: "{{ test_log_file }}"
line: "[✓] Network tool test completed"
- name: Generate test report
block:
- name: Create test report
copy:
content: |
# Kali Linux System Test Report
**Test time**: {{ ansible_date_time.iso8601 }}
**Test host**: {{ ansible_hostname }}
## Result Summary
{% if network_test.failed %}- [✗] Network connectivity test failed{% else %}- [✓] Network connectivity test passed{% endif %}
- [✓] APT package manager is working
- [✓] Kali tool check completed
- [✓] System security check completed
- [✓] System performance test completed
- [✓] Network tool test completed
## Detailed Results
See the following files for details:
- system_info.txt: basic system information
- tool_check.txt: Kali tool check results
- security_check.txt: system security check
- performance.txt: system performance
- network_tools.txt: network tool tests
- kali_test.log: full test log
## Recommendations
{% for result in tool_check.results %}
{% if result.rc != 0 %}
- Consider installing {{ result.item }}: `sudo apt install {{ result.item }}`
{% endif %}
{% endfor %}
dest: "{{ test_results_dir }}/README.md"
- name: Append to main log
lineinfile:
path: "{{ test_log_file }}"
line: "[✓] Test report generated"
- name: Show where the results are stored
debug:
msg: "Kali Linux system test finished. Results are saved in {{ test_results_dir }}"
- name: Show the last lines of the test log
command: "tail -10 {{ test_log_file }}"
register: log_tail
- name: Output test log summary
debug:
msg: "{{ log_tail.stdout_lines }}"

View File

@@ -0,0 +1,50 @@
---
- name: Update /etc/hosts on client nodes
hosts: nomad_clients
become: yes
vars:
hosts_entries:
- ip: "100.116.158.95"
hostnames: ["semaphore", "bj-semaphore"]
- ip: "100.81.26.3"
hostnames: ["ash1d"]
- ip: "100.103.147.94"
hostnames: ["ash2e"]
- ip: "100.90.159.68"
hostnames: ["ch2"]
- ip: "100.86.141.112"
hostnames: ["ch3"]
- ip: "100.98.209.50"
hostnames: ["onecloud1", "bj-onecloud1"]
- ip: "100.120.225.29"
hostnames: ["de"]
- ip: "100.117.106.136"
hostnames: ["ch4"]
- ip: "100.116.80.94"
hostnames: ["ash3c", "influxdb1"]
- ip: "100.116.112.45"
hostnames: ["browser"]
- ip: "100.97.62.111"
hostnames: ["hcp1", "bj-hcp1"]
- ip: "100.122.197.112"
hostnames: ["warden"]
tasks:
- name: Add hostname entries to /etc/hosts
lineinfile:
path: /etc/hosts
line: "{{ item.ip }} {{ item.hostnames | join(' ') }}"
create: yes
owner: root
group: root
mode: '0644'
loop: "{{ hosts_entries }}"
- name: Read the updated /etc/hosts
command: cat /etc/hosts
register: hosts_content
changed_when: false
- name: Display /etc/hosts contents
debug:
var: hosts_content.stdout_lines
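A dry-run sketch (Python 3) that prints the /etc/hosts lines the lineinfile task would add, using the same ip/hostname pairs; only a few entries are repeated here for brevity:

#!/usr/bin/env python3
# Sketch: preview the /etc/hosts lines the playbook would add.
HOSTS_ENTRIES = [
    ("100.116.158.95", ["semaphore", "bj-semaphore"]),
    ("100.81.26.3", ["ash1d"]),
    ("100.97.62.111", ["hcp1", "bj-hcp1"]),
    # ... remaining entries as in the playbook vars
]

for ip, names in HOSTS_ENTRIES:
    print(f"{ip} {' '.join(names)}")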

View File

@@ -0,0 +1,43 @@
---
- name: Update the Consul configuration on all Nomad nodes
hosts: nomad_nodes
become: yes
vars:
consul_addresses: "master.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500"
tasks:
- name: Back up the original Nomad configuration
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.backup.{{ ansible_date_time.epoch }}
remote_src: yes
backup: yes
- name: Update the Nomad Consul configuration
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^\s*address\s*=\s*".*"'
line: ' address = "{{ consul_addresses }}"'
state: present
- name: Restart the Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
daemon_reload: yes
- name: Wait for Nomad to start
wait_for:
port: 4646
host: "{{ ansible_host }}"
timeout: 30
- name: Check Nomad service status
systemd:
name: nomad
register: nomad_status
- name: Display Nomad service status
debug:
msg: "Node {{ inventory_hostname }} Nomad service state: {{ nomad_status.status.ActiveState }}"

View File

@@ -0,0 +1,56 @@
---
- name: Update the Nomad server configuration to add hcp1 as a peer
hosts: nomad_servers
become: yes
vars:
hcp1_ip: "100.97.62.111"
bootstrap_expect: 8
tasks:
- name: Back up the original configuration file
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.bak
remote_src: yes
backup: yes
- name: Add hcp1 to the retry_join list
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ retry_join = \['
line: ' retry_join = ["{{ hcp1_ip }}",'
backup: yes
- name: Set bootstrap_expect to 8
lineinfile:
path: /etc/nomad.d/nomad.hcl
regexp: '^ bootstrap_expect = \d+'
line: ' bootstrap_expect = {{ bootstrap_expect }}'
backup: yes
- name: Restart the Nomad service
systemd:
name: nomad
state: restarted
enabled: yes
- name: Wait for Nomad to start
wait_for:
port: 4646
host: "{{ ansible_host }}"
timeout: 30
- name: Check Nomad service status
systemd:
name: nomad
register: nomad_status
- name: Display Nomad service status
debug:
msg: "Nomad service state: {{ nomad_status.status.ActiveState }}"

View File

@@ -0,0 +1,31 @@
---
- name: Update Nomad server configuration
hosts: nomad_servers
become: yes
tasks:
- name: Backup current Nomad configuration
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.bak
remote_src: yes
- name: Generate Nomad configuration for each server
template:
src: ../templates/nomad-server.hcl.j2
dest: /etc/nomad.d/nomad.hcl
vars:
server_name: "{{ inventory_hostname }}"
server_ip: "{{ ansible_host }}"
- name: Restart Nomad service
systemd:
name: nomad
state: restarted
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ ansible_host }}"
delay: 10
timeout: 60

View File

@@ -0,0 +1,72 @@
---
- name: Remove Consul configuration from all Nomad servers
hosts: semaphore,ash1d,ash2e,ch2,ch3,onecloud1,de
become: yes
tasks:
- name: Create clean Nomad server configuration
copy:
content: |
datacenter = "dc1"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "{{ inventory_hostname }}"
bind_addr = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
addresses {
http = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
rpc = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
serf = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
}
advertise {
http = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4646"
rpc = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4647"
serf = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4648"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = true
bootstrap_expect = 7
retry_join = ["ash1d.tailnet-68f9.ts.net","ash2e.tailnet-68f9.ts.net","ch2.tailnet-68f9.ts.net","ch3.tailnet-68f9.ts.net","onecloud1.tailnet-68f9.ts.net","de.tailnet-68f9.ts.net"]
}
client {
enabled = false
}
plugin "nomad-driver-podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
dest: /etc/nomad.d/nomad.hcl
mode: '0644'
- name: Restart Nomad service
systemd:
name: nomad
state: restarted
- name: Wait for Nomad to be ready
wait_for:
port: 4646
host: "{{ ansible_default_ipv4.address }}"
delay: 5
timeout: 30
- name: Display completion message
debug:
msg: "Removed Consul configuration from {{ inventory_hostname }}"

View File

@@ -0,0 +1,26 @@
---
- name: Emergency rollback - restore the direct Consul configuration
hosts: nomad_nodes
become: yes
tasks:
- name: 🚨 Emergency rollback of the Consul configuration
replace:
path: /etc/nomad.d/nomad.hcl
regexp: 'address = "hcp1.tailnet-68f9.ts.net:80"'
replace: 'address = "100.117.106.136:8500"'
notify: restart nomad
- name: ✅ Verify the rolled-back configuration
shell: grep "address.*=" /etc/nomad.d/nomad.hcl
register: rollback_config
- name: 📋 Display the configuration after rollback
debug:
msg: "Configuration after rollback: {{ rollback_config.stdout }}"
handlers:
- name: restart nomad
systemd:
name: nomad
state: restarted

View File

@@ -0,0 +1,62 @@
# Consul Client Configuration for {{ inventory_hostname }}
datacenter = "dc1"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "{{ inventory_hostname }}"
bind_addr = "{{ ansible_host }}"
# Client mode (not server)
server = false
# Connect to Consul servers (points at the three-node cluster)
retry_join = [
{% for server in consul_servers %}
"{{ server }}"{% if not loop.last %},{% endif %}
{% endfor %}
]
# Performance optimization
performance {
raft_multiplier = 5
}
# Ports configuration
ports {
grpc = 8502
http = 8500
dns = 8600
}
# Enable Connect for service mesh
connect {
enabled = true
}
# Cache configuration for performance
cache {
entry_fetch_max_burst = 42
entry_fetch_rate = 30
}
# Node metadata
node_meta = {
region = "unknown"
zone = "nomad-{{ 'server' if 'server' in group_names else 'client' }}"
}
# UI disabled for clients
ui_config {
enabled = false
}
# ACL configuration (if needed)
acl = {
enabled = false
default_policy = "allow"
}
# Logging
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
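To preview what the retry_join block renders to before pushing it, a local sketch using the jinja2 package (assumed installed); consul_servers mirrors the variable the calling play is expected to supply:

#!/usr/bin/env python3
# Sketch: render the client template's retry_join block locally for review.
from jinja2 import Template

TEMPLATE = """retry_join = [
{% for server in consul_servers %}
  "{{ server }}"{% if not loop.last %},{% endif %}
{% endfor %}
]"""

print(Template(TEMPLATE).render(
    consul_servers=["master.tailnet-68f9.ts.net",
                    "ash3c.tailnet-68f9.ts.net",
                    "warden.tailnet-68f9.ts.net"]))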

View File

@@ -0,0 +1,68 @@
# Disk monitoring configuration
# Monitors disk usage on all mount points
# Disk usage monitoring
[[inputs.disk]]
## File system types to ignore
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
## Mount points to monitor
mount_points = ["/", "/var", "/tmp", "/opt", "/home"]
## Tag configuration
[inputs.disk.tags]
service = "disk-monitoring"
# Disk I/O monitoring
[[inputs.diskio]]
## Devices to monitor
devices = ["sda", "sdb", "sdc", "sdd", "nvme0n1", "nvme1n1"]
## Skip serial number collection for better performance
skip_serial_number = true
[inputs.diskio.tags]
service = "disk-io-monitoring"
# File system inode monitoring
[[inputs.disk]]
## Monitor inode usage
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
## Collect inode information
[inputs.disk.tags]
service = "inode-monitoring"
# Process monitoring (optional; watches processes that may consume a lot of disk)
[[inputs.procstat]]
## Monitor the Docker process (if present)
pattern = "docker"
[inputs.procstat.tags]
service = "docker-process"
[[inputs.procstat]]
## Monitor the Podman process
pattern = "podman"
[inputs.procstat.tags]
service = "podman-process"
[[inputs.procstat]]
## Monitor the Nomad process
pattern = "nomad"
[inputs.procstat.tags]
service = "nomad-process"
# Log file size monitoring
[[inputs.filestat]]
files = [
"/var/log/nomad/*.log",
"/var/log/syslog",
"/var/log/kern.log",
"/var/log/auth.log"
]
[inputs.filestat.tags]
service = "log-monitoring"

View File

@@ -0,0 +1,108 @@
datacenter = "dc1"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "{{ inventory_hostname }}"
bind_addr = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
addresses {
http = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
rpc = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
serf = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
}
advertise {
http = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4646"
rpc = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4647"
serf = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4648"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = false
}
client {
enabled = true
network_interface = "tailscale0"
# Addresses of the seven Nomad servers, using full FQDNs
servers = [
"semaphore.tailnet-68f9.ts.net:4647",
"ash1d.tailnet-68f9.ts.net:4647",
"ash2e.tailnet-68f9.ts.net:4647",
"ch2.tailnet-68f9.ts.net:4647",
"ch3.tailnet-68f9.ts.net:4647",
"onecloud1.tailnet-68f9.ts.net:4647",
"de.tailnet-68f9.ts.net:4647"
]
# Host volumes
host_volume "fnsync" {
path = "/mnt/fnsync"
read_only = false
}
host_volume "vault-storage" {
path = "/opt/nomad/data/vault-storage"
read_only = false
}
# Disable the Docker driver; use Podman only
options {
"driver.raw_exec.enable" = "1"
"driver.exec.enable" = "1"
}
# Node metadata
meta {
consul = "true"
consul_version = "1.21.5"
consul_server = {% if inventory_hostname in ['master', 'ash3c', 'warden'] %}"true"{% else %}"false"{% endif %}
}
# Aggressive garbage collection policy
gc_interval = "5m"
gc_disk_usage_threshold = 80
gc_inode_usage_threshold = 70
}
plugin "nomad-driver-podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "master.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500"
server_service_name = "nomad"
client_service_name = "nomad-client"
auto_advertise = true
server_auto_join = true
client_auto_join = true
}
vault {
enabled = true
address = "http://master.tailnet-68f9.ts.net:8200,http://ash3c.tailnet-68f9.ts.net:8200,http://warden.tailnet-68f9.ts.net:8200"
token = "hvs.A5Fu4E1oHyezJapVllKPFsWg"
create_from_role = "nomad-cluster"
tls_skip_verify = true
}
telemetry {
collection_interval = "1s"
disable_hostname = false
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}

View File

@@ -0,0 +1,106 @@
datacenter = "dc1"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "{{ ansible_hostname }}"
bind_addr = "0.0.0.0"
addresses {
http = "{{ ansible_host }}"
rpc = "{{ ansible_host }}"
serf = "{{ ansible_host }}"
}
advertise {
http = "{{ ansible_host }}:4646"
rpc = "{{ ansible_host }}:4647"
serf = "{{ ansible_host }}:4648"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = true
bootstrap_expect = 3
server_join {
retry_join = [
"semaphore.tailnet-68f9.ts.net:4648",
"ash1d.tailnet-68f9.ts.net:4648",
"ash2e.tailnet-68f9.ts.net:4648",
"ch2.tailnet-68f9.ts.net:4648",
"ch3.tailnet-68f9.ts.net:4648",
"onecloud1.tailnet-68f9.ts.net:4648",
"de.tailnet-68f9.ts.net:4648",
"hcp1.tailnet-68f9.ts.net:4648"
]
}
}
{% if ansible_hostname == 'hcp1' %}
client {
enabled = true
network_interface = "tailscale0"
servers = [
"semaphore.tailnet-68f9.ts.net:4647",
"ash1d.tailnet-68f9.ts.net:4647",
"ash2e.tailnet-68f9.ts.net:4647",
"ch2.tailnet-68f9.ts.net:4647",
"ch3.tailnet-68f9.ts.net:4647",
"onecloud1.tailnet-68f9.ts.net:4647",
"de.tailnet-68f9.ts.net:4647",
"hcp1.tailnet-68f9.ts.net:4647"
]
host_volume "traefik-certs" {
path = "/opt/traefik/certs"
read_only = false
}
host_volume "fnsync" {
path = "/mnt/fnsync"
read_only = false
}
meta {
consul = "true"
consul_version = "1.21.5"
consul_client = "true"
}
gc_interval = "5m"
gc_disk_usage_threshold = 80
gc_inode_usage_threshold = 70
}
plugin "nomad-driver-podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
{% endif %}
consul {
address = "ch4.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500"
server_service_name = "nomad"
client_service_name = "nomad-client"
auto_advertise = true
server_auto_join = false
client_auto_join = true
}
telemetry {
collection_interval = "1s"
disable_hostname = false
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}

View File

@@ -0,0 +1,81 @@
datacenter = "dc1"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "{{ inventory_hostname }}"
bind_addr = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
addresses {
http = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
rpc = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
serf = "{{ inventory_hostname }}.tailnet-68f9.ts.net"
}
advertise {
http = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4646"
rpc = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4647"
serf = "{{ inventory_hostname }}.tailnet-68f9.ts.net:4648"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = {{ 'true' if inventory_hostname in groups['nomad_servers'] else 'false' }}
{% if inventory_hostname in groups['nomad_servers'] %}
bootstrap_expect = 3
retry_join = [
"semaphore.tailnet-68f9.ts.net",
"ash1d.tailnet-68f9.ts.net",
"ash2e.tailnet-68f9.ts.net",
"ch2.tailnet-68f9.ts.net",
"ch3.tailnet-68f9.ts.net",
"onecloud1.tailnet-68f9.ts.net",
"de.tailnet-68f9.ts.net"
]
{% endif %}
}
client {
enabled = true
meta {
consul = "true"
consul_version = "1.21.5"
}
# Aggressive garbage collection policy
gc_interval = "5m"
gc_disk_usage_threshold = 80
gc_inode_usage_threshold = 70
}
plugin "nomad-driver-podman" {
config {
socket_path = "unix:///run/podman/podman.sock"
volumes {
enabled = true
}
}
}
consul {
address = "ch4.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500"
server_service_name = "nomad"
client_service_name = "nomad-client"
auto_advertise = true
server_auto_join = true
client_auto_join = true
}
vault {
enabled = true
address = "http://ch4.tailnet-68f9.ts.net:8200,http://ash3c.tailnet-68f9.ts.net:8200,http://warden.tailnet-68f9.ts.net:8200"
token = "hvs.A5Fu4E1oHyezJapVllKPFsWg"
create_from_role = "nomad-cluster"
tls_skip_verify = true
}

View File

@@ -0,0 +1,68 @@
# System monitoring configuration
# CPU, memory, network and other system resources
# CPU monitoring
[[inputs.cpu]]
## Collect per-core CPU metrics
percpu = true
## Collect aggregate CPU metrics
totalcpu = true
## Fields to collect
collect_cpu_time = false
## Report active CPU time
report_active = false
[inputs.cpu.tags]
service = "cpu-monitoring"
# Memory monitoring
[[inputs.mem]]
[inputs.mem.tags]
service = "memory-monitoring"
# Network interface monitoring
[[inputs.net]]
## Interfaces to monitor
interfaces = ["eth*", "en*", "tailscale*"]
[inputs.net.tags]
service = "network-monitoring"
# System load monitoring
[[inputs.system]]
[inputs.system.tags]
service = "system-load"
# Kernel statistics
[[inputs.kernel]]
[inputs.kernel.tags]
service = "kernel-stats"
# Network statistics
[[inputs.netstat]]
[inputs.netstat.tags]
service = "network-stats"
# Swap monitoring
[[inputs.swap]]
[inputs.swap.tags]
service = "swap-monitoring"
# Service status monitoring
[[inputs.systemd_units]]
## Services to monitor
units = ["nomad.service", "docker.service", "podman.service", "telegraf.service", "tailscaled.service"]
[inputs.systemd_units.tags]
service = "service-monitoring"
# Disk health monitoring (if SMART is supported)
[[inputs.smart]]
## Path to smartctl
path_smartctl = "/usr/sbin/smartctl"
## Timeout
timeout = "30s"
[inputs.smart.tags]
service = "smart-monitoring"

View File

@@ -0,0 +1,7 @@
# Telegraf environment variables
# InfluxDB 2.x credentials
INFLUX_TOKEN={{ influxdb_token }}
INFLUX_ORG={{ influxdb_org }}
INFLUX_BUCKET={{ influxdb_bucket }}
INFLUX_URL={{ influxdb_url }}

View File

@@ -0,0 +1,53 @@
# Telegraf main configuration file
# Disk monitoring configuration for the Nomad cluster
# Global settings
[global_tags]
nomad_cluster = "production"
node_role = "{{ nomad_role | default('unknown') }}"
hostname = "{{ inventory_hostname }}"
# Agent configuration
[agent]
interval = "{{ collection_interval | default(30) }}s"
round_interval = true
metric_batch_size = 1000
metric_buffer_limit = 10000
collection_jitter = "2s"
flush_interval = "10s"
flush_jitter = "0s"
precision = ""
hostname = "{{ inventory_hostname }}"
omit_hostname = false
# Output - InfluxDB 2.x
[[outputs.influxdb_v2]]
urls = ["{{ influxdb_url }}"]
token = "{{ influxdb_token }}"
organization = "{{ influxdb_org | default('nomad') }}"
bucket = "{{ influxdb_bucket | default('nomad_monitoring') }}"
## Connection settings
timeout = "10s"
max_retries = 3
retry_timeout = "5s"
## Data precision
precision = "s"
## TLS settings (if needed)
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
# insecure_skip_verify = false
# Logging - local log files disabled to save disk space
[log]
## Send only error-level logs to syslog; do not write a local file
level = "ERROR"
## Local log file disabled
# file = "/var/log/telegraf/telegraf.log"
## Use syslog instead of a local file
logtarget = "syslog"
## Disable log rotation
logrotate = false
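Once Telegraf is shipping data, a quick sketch (Python 3, standard library) that posts a Flux query to the InfluxDB 2.x /api/v2/query endpoint to confirm disk measurements are arriving; the URL, org and token below are placeholders for the values configured above:

#!/usr/bin/env python3
# Sketch: check that "disk" measurements are landing in the nomad_monitoring bucket.
import urllib.request

INFLUX_URL = "http://influxdb1:8086"   # placeholder
TOKEN = "REPLACE_ME"                    # placeholder
ORG = "nomad"
FLUX = ('from(bucket: "nomad_monitoring") |> range(start: -15m) '
        '|> filter(fn: (r) => r._measurement == "disk") |> limit(n: 5)')

req = urllib.request.Request(
    f"{INFLUX_URL}/api/v2/query?org={ORG}",
    data=FLUX.encode(),
    headers={"Authorization": f"Token {TOKEN}",
             "Content-Type": "application/vnd.flux",
             "Accept": "application/csv"},
)
with urllib.request.urlopen(req, timeout=10) as resp:
    print(resp.read().decode()[:500])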

View File

@@ -0,0 +1,29 @@
[Unit]
Description=Telegraf - node monitoring service
Documentation=https://github.com/influxdata/telegraf
After=network.target
[Service]
Type=notify
User=telegraf
Group=telegraf
ExecStart=/usr/bin/telegraf --config {{ telegraf_config_url }}
ExecReload=/bin/kill -HUP $MAINPID
KillMode=control-group
Restart=on-failure
RestartSec=5
TimeoutStopSec=20
EnvironmentFile=/etc/default/telegraf
# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=/var/lib/telegraf
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,45 @@
# Vault Configuration for {{ inventory_hostname }}
# Storage backend - Consul
storage "consul" {
address = "127.0.0.1:8500"
path = "vault/"
# Consul datacenter
datacenter = "{{ vault_datacenter }}"
# Service registration
service = "vault"
service_tags = "vault-server"
# Session TTL
session_ttl = "15s"
lock_wait_time = "15s"
}
# Listener configuration
listener "tcp" {
address = "0.0.0.0:8200"
tls_disable = 1
}
# API address - use the Tailscale network address
api_addr = "http://{{ ansible_host }}:8200"
# Cluster address - use the Tailscale network address
cluster_addr = "http://{{ ansible_host }}:8201"
# UI
ui = true
# Cluster name
cluster_name = "{{ vault_cluster_name }}"
# Disable mlock for development (remove in production)
disable_mlock = true
# Log level
log_level = "INFO"
# Plugin directory
plugin_directory = "/opt/vault/plugins"

View File

@@ -0,0 +1,34 @@
[Unit]
Description=Vault
Documentation=https://www.vaultproject.io/docs/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/vault.d/vault.hcl
StartLimitIntervalSec=60
StartLimitBurst=3
[Service]
Type=notify
User=vault
Group=vault
ProtectSystem=full
ProtectHome=read-only
PrivateTmp=yes
PrivateDevices=yes
SecureBits=keep-caps
AmbientCapabilities=CAP_IPC_LOCK
CapabilityBoundingSet=CAP_SYSLOG CAP_IPC_LOCK
NoNewPrivileges=yes
ExecStart=/usr/bin/vault server -config=/etc/vault.d/vault.hcl
ExecReload=/bin/kill --signal HUP $MAINPID
KillMode=process
Restart=on-failure
RestartSec=5
TimeoutStopSec=30
StartLimitInterval=60
StartLimitBurst=3
LimitNOFILE=65536
LimitMEMLOCK=infinity
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,45 @@
---
- name: Implement a route-reflector architecture - all nodes reach Consul through Traefik
hosts: nomad_nodes
become: yes
vars:
traefik_endpoint: "hcp1.tailnet-68f9.ts.net:80"
tasks:
- name: 📊 Show architecture optimization details
debug:
msg: |
🎯 Implementing a BGP-style route-reflector pattern
📉 Connection count: full mesh (54 connections) → star topology (21 connections)
🌐 All nodes → Traefik → Consul leader
run_once: true
- name: 🔍 Inspect the current Consul configuration
shell: grep "address.*=" /etc/nomad.d/nomad.hcl
register: current_config
ignore_errors: yes
- name: 📋 Display the current configuration
debug:
msg: "Current configuration: {{ current_config.stdout }}"
- name: 🔧 Point the Consul address at the Traefik endpoint
replace:
path: /etc/nomad.d/nomad.hcl
regexp: 'address = "[^"]*"'
replace: 'address = "{{ traefik_endpoint }}"'
notify: restart nomad
- name: ✅ Verify the updated configuration
shell: grep "address.*=" /etc/nomad.d/nomad.hcl
register: new_config
- name: 📋 Display the new configuration
debug:
msg: "New configuration: {{ new_config.stdout }}"
handlers:
- name: restart nomad
systemd:
name: nomad
state: restarted
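A before/after check for the switch above: a sketch (Python 3) that asks Consul for its leader through the Traefik endpoint, assuming Traefik forwards requests on / to the Consul HTTP API:

#!/usr/bin/env python3
# Sketch: confirm Consul answers through the Traefik endpoint used above.
import urllib.request

ENDPOINT = "hcp1.tailnet-68f9.ts.net:80"   # traefik_endpoint from the play
with urllib.request.urlopen(f"http://{ENDPOINT}/v1/status/leader", timeout=5) as resp:
    print("leader:", resp.read().decode())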

View File

@@ -0,0 +1,66 @@
---
- name: Initialize Vault Cluster
hosts: ch4 # initialize on a single node only
become: yes
tasks:
- name: Check if Vault is already initialized
uri:
url: "http://{{ ansible_host }}:8200/v1/sys/health"
method: GET
status_code: [200, 429, 472, 473, 501, 503]
register: vault_health
- name: Initialize Vault (only if not initialized)
uri:
url: "http://{{ ansible_host }}:8200/v1/sys/init"
method: POST
body_format: json
body:
secret_shares: 5
secret_threshold: 3
status_code: 200
register: vault_init_result
when: not vault_health.json.initialized
- name: Save initialization results to local file
copy:
content: |
# Vault Cluster Initialization Results
Generated on: {{ ansible_date_time.iso8601 }}
Initialized by: {{ inventory_hostname }}
## Root Token
{{ vault_init_result.json.root_token }}
## Unseal Keys
{% for key in vault_init_result.json['keys'] %}
Key {{ loop.index }}: {{ key }}
{% endfor %}
## Base64 Unseal Keys
{% for key in vault_init_result.json.keys_base64 %}
Key {{ loop.index }} (base64): {{ key }}
{% endfor %}
## Important Notes
- Store these keys securely and separately
- You need 3 out of 5 keys to unseal Vault
- Root token provides full access to Vault
- Consider revoking root token after initial setup
dest: /tmp/vault-init-results.txt
delegate_to: localhost
when: vault_init_result is defined and vault_init_result.json is defined
- name: Display initialization results
debug:
msg: |
Vault initialized successfully!
Root Token: {{ vault_init_result.json.root_token }}
Unseal Keys: {{ vault_init_result.json['keys'] }}
when: vault_init_result is defined and vault_init_result.json is defined
- name: Display already initialized message
debug:
msg: "Vault is already initialized on {{ inventory_hostname }}"
when: vault_health.json.initialized
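After initialization the cluster still has to be unsealed with 3 of the 5 keys; a sketch of that step against the HTTP API (Python 3, standard library; the address and keys below are placeholders to be replaced with the values saved by the play):

#!/usr/bin/env python3
# Sketch: unseal Vault with three of the keys saved in /tmp/vault-init-results.txt.
import json
import urllib.request

VAULT_ADDR = "http://ch4.tailnet-68f9.ts.net:8200"   # assumed init host
UNSEAL_KEYS = ["key1...", "key2...", "key3..."]       # placeholders

for key in UNSEAL_KEYS:
    req = urllib.request.Request(f"{VAULT_ADDR}/v1/sys/unseal",
                                 data=json.dumps({"key": key}).encode(),
                                 headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req, timeout=10) as resp:
        status = json.load(resp)
    print("sealed:", status["sealed"], "progress:", status.get("progress"))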

View File

@@ -0,0 +1,85 @@
---
- name: Deploy Vault Cluster with Consul Integration
hosts: ch4,ash3c,warden
become: yes
vars:
vault_version: "1.15.2"
vault_datacenter: "dc1"
vault_cluster_name: "vault-cluster"
tasks:
- name: Update apt cache
apt:
update_cache: yes
cache_valid_time: 3600
- name: Add HashiCorp GPG key (if not exists)
shell: |
if [ ! -f /etc/apt/sources.list.d/hashicorp.list ]; then
curl -fsSL https://apt.releases.hashicorp.com/gpg | gpg --dearmor | sudo tee /usr/share/keyrings/hashicorp-archive-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
fi
args:
creates: /etc/apt/sources.list.d/hashicorp.list
- name: Install Vault
apt:
name: vault
state: present
update_cache: yes
allow_downgrade: yes
- name: Create vault user and directories
block:
- name: Create vault data directory
file:
path: /opt/vault/data
state: directory
owner: vault
group: vault
mode: '0755'
- name: Create vault config directory
file:
path: /etc/vault.d
state: directory
owner: vault
group: vault
mode: '0755'
- name: Generate Vault configuration
template:
src: vault.hcl.j2
dest: /etc/vault.d/vault.hcl
owner: vault
group: vault
mode: '0640'
notify: restart vault
- name: Create Vault systemd service
template:
src: vault.service.j2
dest: /etc/systemd/system/vault.service
owner: root
group: root
mode: '0644'
notify:
- reload systemd
- restart vault
- name: Enable and start Vault service
systemd:
name: vault
enabled: yes
state: started
daemon_reload: yes
handlers:
- name: reload systemd
systemd:
daemon_reload: yes
- name: restart vault
systemd:
name: vault
state: restarted

View File

@@ -0,0 +1,67 @@
---
- name: Verify Vault Cluster Status
hosts: ch4,ash3c,warden
become: yes
tasks:
- name: Check Vault service status
systemd:
name: vault
register: vault_service_status
- name: Display Vault service status
debug:
msg: "Vault service on {{ inventory_hostname }}: {{ vault_service_status.status.ActiveState }}"
- name: Check Vault process
shell: ps aux | grep vault | grep -v grep
register: vault_process
ignore_errors: yes
- name: Display Vault process
debug:
msg: "Vault process on {{ inventory_hostname }}: {{ vault_process.stdout_lines }}"
- name: Check Vault port 8200
wait_for:
port: 8200
host: "{{ ansible_default_ipv4.address }}"
timeout: 10
register: vault_port_check
ignore_errors: yes
- name: Display port check result
debug:
msg: "Vault port 8200 on {{ inventory_hostname }}: {{ 'OPEN' if vault_port_check.failed == false else 'CLOSED' }}"
- name: Get Vault status
uri:
url: "http://{{ ansible_default_ipv4.address }}:8200/v1/sys/health"
method: GET
status_code: [200, 429, 472, 473, 501, 503]
register: vault_health
ignore_errors: yes
- name: Display Vault health status
debug:
msg: "Vault health on {{ inventory_hostname }}: {{ vault_health.json if vault_health.json is defined else 'Connection failed' }}"
- name: Check Consul integration
uri:
url: "http://127.0.0.1:8500/v1/kv/vault/?recurse"
method: GET
register: consul_vault_kv
ignore_errors: yes
- name: Display Consul Vault KV
debug:
msg: "Consul Vault KV on {{ inventory_hostname }}: {{ 'Found vault keys' if consul_vault_kv.status == 200 else 'No vault keys found' }}"
- name: Check Vault logs for errors
shell: journalctl -u vault --no-pager -n 10 | grep -i error || echo "No errors found"
register: vault_logs
ignore_errors: yes
- name: Display Vault error logs
debug:
msg: "Vault errors on {{ inventory_hostname }}: {{ vault_logs.stdout_lines }}"

View File

@@ -0,0 +1,91 @@
# Oracle Cloud instance status
# Shows instance status for the US and Korea regions
# Korea region - uses the default provider
# US region - uses the "us" provider alias
# All instances in the Korea region
data "oci_core_instances" "korea_instances" {
compartment_id = data.consul_keys.oracle_config.var.tenancy_ocid
filter {
name = "lifecycle_state"
values = ["RUNNING", "STOPPED", "STOPPING", "STARTING"]
}
}
# All instances in the US region
data "oci_core_instances" "us_instances" {
provider = oci.us
compartment_id = data.consul_keys.oracle_config_us.var.tenancy_ocid
filter {
name = "lifecycle_state"
values = ["RUNNING", "STOPPED", "STOPPING", "STARTING"]
}
}
# Detailed information for Korea-region instances
data "oci_core_instance" "korea_instance_details" {
count = length(data.oci_core_instances.korea_instances.instances)
instance_id = data.oci_core_instances.korea_instances.instances[count.index].id
}
# Detailed information for US-region instances
data "oci_core_instance" "us_instance_details" {
provider = oci.us
count = length(data.oci_core_instances.us_instances.instances)
instance_id = data.oci_core_instances.us_instances.instances[count.index].id
}
# Korea-region instance information
output "korea_instances" {
description = "Korea region instance status"
value = {
count = length(data.oci_core_instances.korea_instances.instances)
instances = [
for instance in data.oci_core_instance.korea_instance_details : {
id = instance.id
name = instance.display_name
state = instance.state
shape = instance.shape
region = "ap-chuncheon-1"
ad = instance.availability_domain
public_ip = instance.public_ip
private_ip = instance.private_ip
time_created = instance.time_created
}
]
}
}
# US-region instance information
output "us_instances" {
description = "US region instance status"
value = {
count = length(data.oci_core_instances.us_instances.instances)
instances = [
for instance in data.oci_core_instance.us_instance_details : {
id = instance.id
name = instance.display_name
state = instance.state
shape = instance.shape
region = "us-ashburn-1"
ad = instance.availability_domain
public_ip = instance.public_ip
private_ip = instance.private_ip
time_created = instance.time_created
}
]
}
}
# Summary
output "summary" {
description = "Instance totals"
value = {
total_instances = length(data.oci_core_instances.korea_instances.instances) + length(data.oci_core_instances.us_instances.instances)
korea_count = length(data.oci_core_instances.korea_instances.instances)
us_count = length(data.oci_core_instances.us_instances.instances)
}
}
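After an apply, the summary output can be read programmatically; a sketch (Python 3) that shells out to tofu, assuming it is on PATH and is run from the environment directory that defines these outputs:

#!/usr/bin/env python3
# Sketch: pretty-print the instance summary output defined above.
import json
import subprocess

out = subprocess.run(["tofu", "output", "-json", "summary"],
                     capture_output=True, text=True, check=True)
summary = json.loads(out.stdout)
print(f"total: {summary['total_instances']} "
      f"(KR: {summary['korea_count']}, US: {summary['us_count']})")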

View File

@@ -0,0 +1,225 @@
# Development environment main configuration
# Pulls in the shared version configuration
terraform {
required_version = ">= 1.6"
required_providers {
# Oracle Cloud Infrastructure
oci = {
source = "oracle/oci"
version = "~> 7.20"
}
# Other common providers
random = {
source = "hashicorp/random"
version = "~> 3.1"
}
tls = {
source = "hashicorp/tls"
version = "~> 4.0"
}
local = {
source = "hashicorp/local"
version = "~> 2.1"
}
# Consul Provider
consul = {
source = "hashicorp/consul"
version = "~> 2.22.0"
}
# HashiCorp Vault Provider
vault = {
source = "hashicorp/vault"
version = "~> 4.0"
}
# Cloudflare Provider
cloudflare = {
source = "cloudflare/cloudflare"
version = "~> 3.0"
}
}
# Backend configuration
backend "local" {
path = "terraform.tfstate"
}
}
# Consul provider configuration - uses the Tailscale IP rather than localhost
provider "consul" {
address = "100.116.158.95:8500"
scheme = "http"
datacenter = "dc1"
}
# Cloudflare configuration fetched from Consul
data "consul_keys" "cloudflare_config" {
key {
name = "token"
path = "config/dev/cloudflare/token"
}
}
# Cloudflare provider configuration
provider "cloudflare" {
api_token = data.consul_keys.cloudflare_config.var.token
}
# Oracle Cloud configuration fetched from Consul
data "consul_keys" "oracle_config" {
key {
name = "tenancy_ocid"
path = "config/dev/oracle/kr/tenancy_ocid"
}
key {
name = "user_ocid"
path = "config/dev/oracle/kr/user_ocid"
}
key {
name = "fingerprint"
path = "config/dev/oracle/kr/fingerprint"
}
key {
name = "private_key"
path = "config/dev/oracle/kr/private_key"
}
}
# Oracle Cloud US-region configuration fetched from Consul
data "consul_keys" "oracle_config_us" {
key {
name = "tenancy_ocid"
path = "config/dev/oracle/us/tenancy_ocid"
}
key {
name = "user_ocid"
path = "config/dev/oracle/us/user_ocid"
}
key {
name = "fingerprint"
path = "config/dev/oracle/us/fingerprint"
}
key {
name = "private_key"
path = "config/dev/oracle/us/private_key"
}
}
# OCI provider using the configuration fetched from Consul
provider "oci" {
tenancy_ocid = data.consul_keys.oracle_config.var.tenancy_ocid
user_ocid = data.consul_keys.oracle_config.var.user_ocid
fingerprint = data.consul_keys.oracle_config.var.fingerprint
private_key = file(var.oci_config.private_key_path)
region = "ap-chuncheon-1"
}
# OCI provider for the US region
provider "oci" {
alias = "us"
tenancy_ocid = data.consul_keys.oracle_config_us.var.tenancy_ocid
user_ocid = data.consul_keys.oracle_config_us.var.user_ocid
fingerprint = data.consul_keys.oracle_config_us.var.fingerprint
private_key = file(var.oci_config.private_key_path)
region = "us-ashburn-1"
}
# Oracle Cloud infrastructure
module "oracle_cloud" {
source = "../../providers/oracle-cloud"
# Pass through variables
environment = var.environment
project_name = var.project_name
owner = var.owner
vpc_cidr = var.vpc_cidr
availability_zones = var.availability_zones
common_tags = var.common_tags
# Use the configuration fetched from Consul
oci_config = {
tenancy_ocid = data.consul_keys.oracle_config.var.tenancy_ocid
user_ocid = data.consul_keys.oracle_config.var.user_ocid
fingerprint = data.consul_keys.oracle_config.var.fingerprint
private_key_path = var.oci_config.private_key_path
region = "ap-chuncheon-1"
compartment_ocid = ""
}
# Development-environment-specific settings
instance_count = 1
instance_size = "VM.Standard.E2.1.Micro" # free tier
}
# Outputs
output "oracle_cloud_outputs" {
description = "Oracle Cloud infrastructure outputs"
value = module.oracle_cloud
}
# Nomad multi-datacenter cluster
module "nomad_cluster" {
source = "../../modules/nomad-cluster"
# Deployment control variables - disable creation of all compute resources
deploy_korea_node = false
deploy_us_node = false # US node temporarily disabled
# Oracle Cloud configuration
oracle_config = {
tenancy_ocid = data.consul_keys.oracle_config.var.tenancy_ocid
user_ocid = data.consul_keys.oracle_config.var.user_ocid
fingerprint = data.consul_keys.oracle_config.var.fingerprint
private_key_path = var.oci_config.private_key_path
region = "ap-chuncheon-1"
compartment_ocid = ""
}
# Common configuration
common_tags = var.common_tags
ssh_public_key = var.ssh_public_key
# Nomad-specific configuration
nomad_version = "1.7.7"
nomad_encrypt_key = var.nomad_encrypt_key
# Oracle-Cloud-specific configuration
oracle_availability_domain = "Uocm:AP-CHUNCHEON-1-AD-1"
oracle_subnet_id = module.oracle_cloud.subnet_ids[0] # use the first subnet
# Dependencies
depends_on = [module.oracle_cloud]
}
# Cloudflare connectivity test
data "cloudflare_zones" "available" {
filter {
status = "active"
}
}
data "cloudflare_accounts" "available" {}
# Cloudflare connectivity test results
output "cloudflare_connectivity_test" {
description = "Cloudflare API connectivity test results"
value = {
zones_count = length(data.cloudflare_zones.available.zones)
accounts_count = length(data.cloudflare_accounts.available.accounts)
zones = [for zone in data.cloudflare_zones.available.zones : {
name = zone.name
id = zone.id
}]
accounts = [for account in data.cloudflare_accounts.available.accounts : {
name = account.name
id = account.id
}]
}
}
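This configuration only reads the Consul KV paths; they have to be populated beforehand. A seeding sketch against the Consul KV HTTP API (Python 3, standard library; all values are placeholders and the address matches the provider block above):

#!/usr/bin/env python3
# Sketch: seed the Consul KV paths this configuration reads.
import urllib.request

CONSUL = "http://100.116.158.95:8500"
KEYS = {
    "config/dev/oracle/kr/tenancy_ocid": "ocid1.tenancy.oc1..example",
    "config/dev/oracle/kr/user_ocid": "ocid1.user.oc1..example",
    "config/dev/oracle/kr/fingerprint": "aa:bb:cc:example",
    "config/dev/cloudflare/token": "example-token",
}

for path, value in KEYS.items():
    req = urllib.request.Request(f"{CONSUL}/v1/kv/{path}",
                                 data=value.encode(), method="PUT")
    with urllib.request.urlopen(req, timeout=5) as resp:
        print(path, resp.read().decode())   # Consul returns "true" on success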

View File

@@ -0,0 +1,169 @@
# Development environment variable definitions
variable "environment" {
description = "Environment name"
type = string
default = "dev"
}
variable "project_name" {
description = "项目名称"
type = string
default = "mgmt"
}
variable "owner" {
description = "项目所有者"
type = string
default = "ben"
}
variable "cloud_providers" {
description = "要启用的云服务商列表"
type = list(string)
default = ["oracle"]
}
variable "vpc_cidr" {
description = "VPC CIDR 块"
type = string
default = "10.0.0.0/16"
}
variable "availability_zones" {
description = "可用区列表"
type = list(string)
default = ["a", "b"]
}
variable "common_tags" {
description = "通用标签"
type = map(string)
default = {
Environment = "dev"
Project = "mgmt"
ManagedBy = "terraform"
}
}
# Oracle Cloud configuration
variable "oci_config" {
description = "Oracle Cloud configuration"
type = object({
tenancy_ocid = string
user_ocid = string
fingerprint = string
private_key_path = string
region = string
compartment_ocid = optional(string)
})
default = {
tenancy_ocid = ""
user_ocid = ""
fingerprint = ""
private_key_path = ""
region = "ap-seoul-1"
compartment_ocid = ""
}
}
# Huawei Cloud configuration
variable "huawei_config" {
description = "Huawei Cloud configuration"
type = object({
access_key = string
secret_key = string
region = string
project_id = optional(string)
})
default = {
access_key = ""
secret_key = ""
region = "cn-north-4"
project_id = ""
}
sensitive = true
}
# Google Cloud configuration
variable "gcp_config" {
description = "Google Cloud configuration"
type = object({
project_id = string
region = string
zone = string
credentials_file = string
})
default = {
project_id = ""
region = "asia-northeast3"
zone = "asia-northeast3-a"
credentials_file = ""
}
}
# AWS configuration
variable "aws_config" {
description = "AWS configuration"
type = object({
region = string
access_key = string
secret_key = string
})
default = {
region = "ap-northeast-2"
access_key = ""
secret_key = ""
}
sensitive = true
}
# DigitalOcean configuration
variable "do_config" {
description = "DigitalOcean configuration"
type = object({
token = string
region = string
})
default = {
token = ""
region = "sgp1"
}
sensitive = true
}
# HashiCorp Vault configuration - uses the Tailscale IP rather than localhost
variable "vault_config" {
description = "HashiCorp Vault configuration"
type = object({
address = string
token = string
})
default = {
address = "http://100.116.158.95:8200"
token = ""
}
sensitive = true
}
variable "vault_token" {
description = "Vault 访问令牌"
type = string
default = ""
sensitive = true
}
# SSH public key
variable "ssh_public_key" {
description = "SSH public key used to access cloud instances"
type = string
default = ""
}
# Nomad configuration
variable "nomad_encrypt_key" {
description = "Nomad cluster gossip encryption key"
type = string
default = ""
sensitive = true
}

View File

@@ -0,0 +1,169 @@
# Nomad multi-datacenter production configuration
# Deployment architecture: CN (dc1) + KR (dc2) + US (dc3)
terraform {
required_version = ">= 1.0"
required_providers {
oci = {
source = "oracle/oci"
version = "~> 7.20"
}
huaweicloud = {
source = "huaweicloud/huaweicloud"
version = "~> 1.60"
}
}
}
# Oracle Cloud provider (Korea)
provider "oci" {
alias = "korea"
tenancy_ocid = var.oracle_tenancy_ocid
user_ocid = var.oracle_user_ocid
fingerprint = var.oracle_fingerprint
private_key_path = var.oracle_private_key_path
region = "ap-seoul-1" # 韩国首尔
}
# Huawei Cloud provider (US)
provider "huaweicloud" {
alias = "us"
access_key = var.huawei_access_key
secret_key = var.huawei_secret_key
region = "us-east-1" # 美国东部
}
# Local values
locals {
project_name = "nomad-multi-dc"
environment = "production"
common_tags = {
Project = local.project_name
Environment = local.environment
ManagedBy = "terraform"
Owner = "devops-team"
}
}
# Data source: SSH public key
data "local_file" "ssh_public_key" {
filename = pathexpand("~/.ssh/id_rsa.pub")
}
# Oracle Cloud infrastructure (Korea - dc2)
module "oracle_infrastructure" {
source = "../../providers/oracle-cloud"
providers = {
oci = oci.korea
}
project_name = local.project_name
environment = local.environment
vpc_cidr = "10.1.0.0/16"
oci_config = {
tenancy_ocid = var.oracle_tenancy_ocid
user_ocid = var.oracle_user_ocid
fingerprint = var.oracle_fingerprint
private_key_path = var.oracle_private_key_path
region = "ap-seoul-1"
}
common_tags = local.common_tags
}
# Huawei Cloud infrastructure (US - dc3)
module "huawei_infrastructure" {
source = "../../providers/huawei-cloud"
providers = {
huaweicloud = huaweicloud.us
}
project_name = local.project_name
environment = local.environment
vpc_cidr = "10.2.0.0/16"
availability_zones = ["us-east-1a", "us-east-1b"]
common_tags = local.common_tags
}
# Nomad multi-datacenter cluster
module "nomad_cluster" {
source = "../../modules/nomad-cluster"
# Deployment controls
deploy_korea_node = var.deploy_korea_node
deploy_us_node = var.deploy_us_node
# Oracle Cloud configuration
oracle_config = {
tenancy_ocid = var.oracle_tenancy_ocid
user_ocid = var.oracle_user_ocid
fingerprint = var.oracle_fingerprint
private_key_path = var.oracle_private_key_path
region = "ap-seoul-1"
}
oracle_subnet_id = module.oracle_infrastructure.public_subnet_ids[0]
oracle_security_group_id = module.oracle_infrastructure.security_group_id
# Huawei Cloud configuration
huawei_config = {
access_key = var.huawei_access_key
secret_key = var.huawei_secret_key
region = "us-east-1"
}
huawei_subnet_id = module.huawei_infrastructure.public_subnet_ids[0]
huawei_security_group_id = module.huawei_infrastructure.security_group_id
# Common configuration
ssh_public_key = data.local_file.ssh_public_key.content
common_tags = local.common_tags
# Nomad configuration
nomad_version = "1.10.5"
nomad_encrypt_key = var.nomad_encrypt_key
}
# Generate the Ansible inventory
resource "local_file" "ansible_inventory" {
filename = "${path.module}/generated/nomad-cluster-inventory.yml"
content = yamlencode({
all = {
children = {
nomad_servers = {
hosts = module.nomad_cluster.ansible_inventory.all.children.nomad_servers.hosts
}
}
vars = {
ansible_user = "ubuntu"
ansible_ssh_private_key_file = "~/.ssh/id_rsa"
ansible_ssh_common_args = "-o StrictHostKeyChecking=no"
}
}
})
}
# Generate the post-deployment script
resource "local_file" "post_deploy_script" {
filename = "${path.module}/generated/post-deploy.sh"
content = templatefile("${path.module}/templates/post-deploy.sh", {
cluster_overview = module.nomad_cluster.cluster_overview
endpoints = module.nomad_cluster.cluster_endpoints
})
file_permission = "0755"
}
# Generate the cross-datacenter test job
resource "local_file" "cross_dc_test_job" {
filename = "${path.module}/generated/cross-dc-test.nomad"
content = templatefile("${path.module}/templates/cross-dc-test.nomad", {
datacenters = ["dc1", "dc2", "dc3"]
})
}

View File

@@ -0,0 +1,46 @@
# Nomad multi-datacenter production outputs
output "cluster_overview" {
description = "Nomad multi-datacenter cluster overview"
value = module.nomad_cluster.cluster_overview
}
output "cluster_endpoints" {
description = "集群连接端点"
value = module.nomad_cluster.cluster_endpoints
}
output "oracle_korea_node" {
description = "Oracle Cloud 韩国节点信息"
value = module.nomad_cluster.oracle_korea_node
}
output "huawei_us_node" {
description = "华为云美国节点信息"
value = module.nomad_cluster.huawei_us_node
}
output "deployment_summary" {
description = "部署摘要"
value = {
total_nodes = module.nomad_cluster.cluster_overview.total_nodes
datacenters = keys(module.nomad_cluster.cluster_overview.datacenters)
next_steps = [
"1. 等待所有节点启动完成 (约 5-10 分钟)",
"2. 运行: ./generated/post-deploy.sh",
"3. 验证集群: nomad server members",
"4. 测试跨 DC 调度: nomad job run generated/cross-dc-test.nomad",
"5. 访问 Web UI 查看集群状态"
]
web_ui_urls = module.nomad_cluster.cluster_endpoints.nomad_ui_urls
ssh_commands = module.nomad_cluster.cluster_endpoints.ssh_commands
}
}
output "verification_commands" {
description = "验证命令"
value = module.nomad_cluster.verification_commands
}

View File

@@ -0,0 +1,22 @@
# Example configuration for the Nomad multi-datacenter production environment
# Copy this file to terraform.tfvars and fill in real values
# Deployment controls
deploy_korea_node = true # whether to deploy the Korea node
deploy_us_node = true # whether to deploy the US node
# Oracle Cloud configuration (Korea - dc2)
# How to obtain: https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm
oracle_tenancy_ocid = "ocid1.tenancy.oc1..aaaaaaaa..."
oracle_user_ocid = "ocid1.user.oc1..aaaaaaaa..."
oracle_fingerprint = "aa:bb:cc:dd:ee:ff:..."
oracle_private_key_path = "~/.oci/oci_api_key.pem"
# Huawei Cloud configuration (US - dc3)
# How to obtain: https://console.huaweicloud.com/iam/#/mine/accessKey
huawei_access_key = "YOUR_HUAWEI_ACCESS_KEY"
huawei_secret_key = "YOUR_HUAWEI_SECRET_KEY"
# Nomad cluster gossip encryption key (optional; a default is provided)
# Generate with: nomad operator keygen
nomad_encrypt_key = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="

View File

@@ -0,0 +1,81 @@
# Nomad multi-datacenter production variables
# Deployment controls
variable "deploy_korea_node" {
description = "Whether to deploy the Korea node (Oracle Cloud)"
type = bool
default = true
}
variable "deploy_us_node" {
description = "是否部署美国节点 (华为云)"
type = bool
default = true
}
# Oracle Cloud configuration
variable "oracle_tenancy_ocid" {
description = "Oracle Cloud tenancy OCID"
type = string
sensitive = true
}
variable "oracle_user_ocid" {
description = "Oracle Cloud 用户 OCID"
type = string
sensitive = true
}
variable "oracle_fingerprint" {
description = "Oracle Cloud API 密钥指纹"
type = string
sensitive = true
}
variable "oracle_private_key_path" {
description = "Oracle Cloud 私钥文件路径"
type = string
sensitive = true
}
# Huawei Cloud configuration
variable "huawei_access_key" {
description = "Huawei Cloud access key"
type = string
sensitive = true
}
variable "huawei_secret_key" {
description = "华为云秘密密钥"
type = string
sensitive = true
}
# Nomad configuration
variable "nomad_encrypt_key" {
description = "Nomad cluster gossip encryption key"
type = string
sensitive = true
default = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
}
# Vault configuration
variable "vault_config" {
description = "Vault configuration"
type = object({
address = string
token = string
})
default = {
address = "http://100.116.158.95:8200"
token = ""
}
sensitive = true
}
variable "vault_token" {
description = "Vault 访问令牌"
type = string
default = ""
sensitive = true
}

View File

@@ -0,0 +1,155 @@
# Staging environment main configuration
# Pulls in the shared version configuration
terraform {
required_version = ">= 1.6"
required_providers {
# Oracle Cloud Infrastructure
oci = {
source = "oracle/oci"
version = "~> 7.20"
}
# Other common providers
random = {
source = "hashicorp/random"
version = "~> 3.1"
}
tls = {
source = "hashicorp/tls"
version = "~> 4.0"
}
local = {
source = "hashicorp/local"
version = "~> 2.1"
}
# Consul Provider
consul = {
source = "hashicorp/consul"
version = "~> 2.22.0"
}
# HashiCorp Vault Provider
vault = {
source = "hashicorp/vault"
version = "~> 4.0"
}
}
# Backend configuration
backend "local" {
path = "terraform.tfstate"
}
}
# Consul provider configuration
provider "consul" {
address = "100.116.158.95:8500"
scheme = "http"
datacenter = "dc1"
}
# Vault provider configuration
provider "vault" {
address = var.vault_config.address
token = var.vault_token
}
# Oracle Cloud configuration fetched from Consul
data "consul_keys" "oracle_config" {
key {
name = "tenancy_ocid"
path = "config/staging/oracle/kr/tenancy_ocid"
}
key {
name = "user_ocid"
path = "config/staging/oracle/kr/user_ocid"
}
key {
name = "fingerprint"
path = "config/staging/oracle/kr/fingerprint"
}
key {
name = "private_key"
path = "config/staging/oracle/kr/private_key"
}
}
# Oracle Cloud US-region configuration fetched from Consul
data "consul_keys" "oracle_config_us" {
key {
name = "tenancy_ocid"
path = "config/staging/oracle/us/tenancy_ocid"
}
key {
name = "user_ocid"
path = "config/staging/oracle/us/user_ocid"
}
key {
name = "fingerprint"
path = "config/staging/oracle/us/fingerprint"
}
key {
name = "private_key"
path = "config/staging/oracle/us/private_key"
}
}
# OCI provider using the configuration fetched from Consul
provider "oci" {
tenancy_ocid = data.consul_keys.oracle_config.var.tenancy_ocid
user_ocid = data.consul_keys.oracle_config.var.user_ocid
fingerprint = data.consul_keys.oracle_config.var.fingerprint
private_key = data.consul_keys.oracle_config.var.private_key
region = "ap-chuncheon-1"
}
# OCI provider for the US region
provider "oci" {
alias = "us"
tenancy_ocid = data.consul_keys.oracle_config_us.var.tenancy_ocid
user_ocid = data.consul_keys.oracle_config_us.var.user_ocid
fingerprint = data.consul_keys.oracle_config_us.var.fingerprint
private_key = data.consul_keys.oracle_config_us.var.private_key
region = "us-ashburn-1"
}
# Oracle Cloud infrastructure
module "oracle_cloud" {
source = "../../providers/oracle-cloud"
# Pass through variables
environment = var.environment
project_name = var.project_name
owner = var.owner
vpc_cidr = var.vpc_cidr
availability_zones = var.availability_zones
common_tags = var.common_tags
# Use the configuration fetched from Consul
oci_config = {
tenancy_ocid = data.consul_keys.oracle_config.var.tenancy_ocid
user_ocid = data.consul_keys.oracle_config.var.user_ocid
fingerprint = data.consul_keys.oracle_config.var.fingerprint
private_key = data.consul_keys.oracle_config.var.private_key
region = "ap-chuncheon-1"
}
# Staging-environment-specific settings
instance_count = 2
instance_size = "VM.Standard.E2.1.Micro"
providers = {
oci = oci
}
}
# Outputs
output "oracle_cloud_outputs" {
description = "Oracle Cloud infrastructure outputs"
value = module.oracle_cloud
}

Some files were not shown because too many files have changed in this diff.