1 feat: 重构基础设施架构并完善Consul集群配置

2
     3	主要变更:
     4	- 重构Terraform/OpenTofu目录结构,统一迁移至infrastructure/opentofu
     5	- 添加"7天创造世界"文档,记录基础设施建设演进逻辑
     6	- 更新Consul集群配置管理经验,添加实际案例和解决方案
     7	- 修正README中的Sticky Note,反映Consul集群健康状态
     8	- 添加Ansible部署配置和inventory文件
     9	- 完善项目文档结构,添加各组件配置指南
    10
    11	技术架构演进:
    12	- 第1天: Tailscale网络连接基础 
    13	- 第2天: Ansible分布式控制 
    14	- 第3天: Nomad服务感知与任务调度 
    15	- 第4天: Consul配置集中管理 
    16	- 第5天: OpenTofu状态一致性 
    17	- 第6天: Vault密钥管理 
    18	- 第7天: Waypoint应用部署 
This commit is contained in:
2025-09-30 03:46:33 +00:00
parent c0064b2cad
commit e8bfc76038
119 changed files with 1772 additions and 631 deletions

View File

@@ -0,0 +1,159 @@
# Nomad 多数据中心集群模块
# 支持跨地域部署CN(dc1) + KR(dc2) + US(dc3)
terraform {
required_providers {
oci = {
source = "oracle/oci"
version = "~> 7.20"
}
huaweicloud = {
source = "huaweicloud/huaweicloud"
version = "~> 1.60"
}
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
}
}
# 本地变量
locals {
nomad_version = "1.10.5"
# 通用 Nomad 配置
nomad_encrypt_key = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
# 数据中心配置
datacenters = {
dc1 = {
name = "dc1"
region = "cn"
location = "China"
provider = "existing" # 现有的 semaphore 节点
}
dc2 = {
name = "dc2"
region = "kr"
location = "Korea"
provider = "oracle"
}
dc3 = {
name = "dc3"
region = "us"
location = "US"
provider = "huawei" # 或 aws
}
}
# 用户数据模板
user_data_template = templatefile("${path.module}/templates/nomad-userdata.sh", {
nomad_version = local.nomad_version
nomad_encrypt_key = local.nomad_encrypt_key
})
}
# 数据源:获取现有的 semaphore 节点信息
data "external" "semaphore_info" {
program = ["bash", "-c", <<-EOF
echo '{
"ip": "100.116.158.95",
"datacenter": "dc1",
"status": "existing"
}'
EOF
]
}
# Oracle Cloud 韩国节点 (dc2)
module "oracle_korea_node" {
source = "../compute"
count = var.deploy_korea_node ? 1 : 0
# Oracle Cloud 特定配置
provider_type = "oracle"
# 实例配置
instance_config = {
name = "nomad-master-kr"
datacenter = "dc2"
instance_type = "VM.Standard.E2.1.Micro" # 免费层
image_id = var.oracle_ubuntu_image_id
subnet_id = var.oracle_subnet_id
# Nomad 配置
nomad_role = "server"
bootstrap_expect = 1
bind_addr = "auto" # 自动检测
# 网络配置
security_groups = [var.oracle_security_group_id]
# 标签
tags = merge(var.common_tags, {
Name = "nomad-master-kr"
Datacenter = "dc2"
Role = "nomad-server"
Provider = "oracle"
})
}
# 用户数据
user_data = templatefile("${path.module}/templates/nomad-userdata.sh", {
datacenter = "dc2"
nomad_version = local.nomad_version
nomad_encrypt_key = local.nomad_encrypt_key
bootstrap_expect = 1
bind_addr = "auto"
server_enabled = true
client_enabled = true
})
}
# 华为云美国节点 (dc3)
module "huawei_us_node" {
source = "../compute"
count = var.deploy_us_node ? 1 : 0
# 华为云特定配置
provider_type = "huawei"
# 实例配置
instance_config = {
name = "nomad-ash3c-us"
datacenter = "dc3"
instance_type = "s6.small.1" # 1vCPU 1GB
image_id = var.huawei_ubuntu_image_id
subnet_id = var.huawei_subnet_id
# Nomad 配置
nomad_role = "server"
bootstrap_expect = 1
bind_addr = "auto"
# 网络配置
security_groups = [var.huawei_security_group_id]
# 标签
tags = merge(var.common_tags, {
Name = "nomad-ash3c-us"
Datacenter = "dc3"
Role = "nomad-server"
Provider = "huawei"
})
}
# 用户数据
user_data = templatefile("${path.module}/templates/nomad-userdata.sh", {
datacenter = "dc3"
nomad_version = local.nomad_version
nomad_encrypt_key = local.nomad_encrypt_key
bootstrap_expect = 1
bind_addr = "auto"
server_enabled = true
client_enabled = true
})
}

View File

@@ -0,0 +1,145 @@
# Nomad 多数据中心集群输出
# 集群概览
output "cluster_overview" {
description = "Nomad 多数据中心集群概览"
value = {
datacenters = {
dc1 = {
name = "dc1"
location = "China (CN)"
provider = "existing"
node = "semaphore"
ip = "100.116.158.95"
status = "existing"
}
dc2 = var.deploy_korea_node ? {
name = "dc2"
location = "Korea (KR)"
provider = "oracle"
node = "master"
ip = try(module.oracle_korea_node[0].public_ip, "pending")
status = "deployed"
} : null
dc3 = var.deploy_us_node ? {
name = "dc3"
location = "US"
provider = "huawei"
node = "ash3c"
ip = try(module.huawei_us_node[0].public_ip, "pending")
status = "deployed"
} : null
}
total_nodes = 1 + (var.deploy_korea_node ? 1 : 0) + (var.deploy_us_node ? 1 : 0)
}
}
# Oracle Cloud 韩国节点输出
output "oracle_korea_node" {
description = "Oracle Cloud 韩国节点信息"
value = var.deploy_korea_node ? {
instance_id = try(module.oracle_korea_node[0].instance_id, null)
public_ip = try(module.oracle_korea_node[0].public_ip, null)
private_ip = try(module.oracle_korea_node[0].private_ip, null)
datacenter = "dc2"
provider = "oracle"
region = var.oracle_config.region
# 连接信息
ssh_command = try("ssh ubuntu@${module.oracle_korea_node[0].public_ip}", null)
nomad_ui = try("http://${module.oracle_korea_node[0].public_ip}:4646", null)
} : null
}
# 华为云美国节点输出
output "huawei_us_node" {
description = "华为云美国节点信息"
value = var.deploy_us_node ? {
instance_id = try(module.huawei_us_node[0].instance_id, null)
public_ip = try(module.huawei_us_node[0].public_ip, null)
private_ip = try(module.huawei_us_node[0].private_ip, null)
datacenter = "dc3"
provider = "huawei"
region = var.huawei_config.region
# 连接信息
ssh_command = try("ssh ubuntu@${module.huawei_us_node[0].public_ip}", null)
nomad_ui = try("http://${module.huawei_us_node[0].public_ip}:4646", null)
} : null
}
# 集群连接信息
output "cluster_endpoints" {
description = "集群连接端点"
value = {
nomad_ui_urls = compact([
"http://100.116.158.95:4646", # dc1 - semaphore
var.deploy_korea_node ? try("http://${module.oracle_korea_node[0].public_ip}:4646", null) : null, # dc2
var.deploy_us_node ? try("http://${module.huawei_us_node[0].public_ip}:4646", null) : null # dc3
])
ssh_commands = compact([
"ssh root@100.116.158.95", # dc1 - semaphore
var.deploy_korea_node ? try("ssh ubuntu@${module.oracle_korea_node[0].public_ip}", null) : null, # dc2
var.deploy_us_node ? try("ssh ubuntu@${module.huawei_us_node[0].public_ip}", null) : null # dc3
])
}
}
# Ansible inventory 生成
output "ansible_inventory" {
description = "生成的 Ansible inventory"
value = {
all = {
children = {
nomad_servers = {
hosts = merge(
{
semaphore = {
ansible_host = "100.116.158.95"
datacenter = "dc1"
provider = "existing"
}
},
var.deploy_korea_node ? {
master = {
ansible_host = try(module.oracle_korea_node[0].public_ip, "pending")
datacenter = "dc2"
provider = "oracle"
}
} : {},
var.deploy_us_node ? {
ash3c = {
ansible_host = try(module.huawei_us_node[0].public_ip, "pending")
datacenter = "dc3"
provider = "huawei"
}
} : {}
)
}
}
}
}
}
# 部署后验证命令
output "verification_commands" {
description = "部署后验证命令"
value = [
"# 检查集群状态",
"nomad server members",
"",
"# 检查各数据中心节点",
"nomad node status -verbose",
"",
"# 跨数据中心任务调度测试",
"nomad job run examples/cross-dc-test.nomad",
"",
"# 访问 UI",
join("\n", [for url in compact([
"http://100.116.158.95:4646",
var.deploy_korea_node ? try("http://${module.oracle_korea_node[0].public_ip}:4646", null) : null,
var.deploy_us_node ? try("http://${module.huawei_us_node[0].public_ip}:4646", null) : null
]) : "curl -s ${url}/v1/status/leader"])
]
}

View File

@@ -0,0 +1,228 @@
#!/bin/bash
# Nomad 多数据中心节点自动配置脚本
# 数据中心: ${datacenter}
set -e
# 日志函数
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a /var/log/nomad-setup.log
}
log "开始配置 Nomad 节点 - 数据中心: ${datacenter}"
# 更新系统
log "更新系统包..."
apt-get update -y
apt-get upgrade -y
# 安装必要的包
log "安装必要的包..."
apt-get install -y \
curl \
wget \
unzip \
jq \
podman \
htop \
net-tools \
vim
# 启动 Podman
log "启动 Podman 服务..."
systemctl enable podman
systemctl start podman
usermod -aG podman ubuntu
# 安装 Nomad
log "安装 Nomad ${nomad_version}..."
cd /tmp
wget -q https://releases.hashicorp.com/nomad/${nomad_version}/nomad_${nomad_version}_linux_amd64.zip
unzip nomad_${nomad_version}_linux_amd64.zip
mv nomad /usr/local/bin/
chmod +x /usr/local/bin/nomad
# 创建 Nomad 用户和目录
log "创建 Nomad 用户和目录..."
useradd --system --home /etc/nomad.d --shell /bin/false nomad
mkdir -p /opt/nomad/data
mkdir -p /etc/nomad.d
mkdir -p /var/log/nomad
chown -R nomad:nomad /opt/nomad /etc/nomad.d /var/log/nomad
# 获取本机 IP 地址
if [ "${bind_addr}" = "auto" ]; then
# 尝试多种方法获取 IP
BIND_ADDR=$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null || \
curl -s http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/ip -H "Metadata-Flavor: Google" 2>/dev/null || \
ip route get 8.8.8.8 | awk '{print $7; exit}' || \
hostname -I | awk '{print $1}')
else
BIND_ADDR="${bind_addr}"
fi
log "检测到 IP 地址: $BIND_ADDR"
# 创建 Nomad 配置文件
log "创建 Nomad 配置文件..."
cat > /etc/nomad.d/nomad.hcl << EOF
datacenter = "${datacenter}"
region = "global"
data_dir = "/opt/nomad/data"
bind_addr = "$BIND_ADDR"
%{ if server_enabled }
server {
enabled = true
bootstrap_expect = ${bootstrap_expect}
encrypt = "${nomad_encrypt_key}"
}
%{ endif }
%{ if client_enabled }
client {
enabled = true
host_volume "podman-sock" {
path = "/run/podman/podman.sock"
read_only = false
}
}
%{ endif }
ui {
enabled = true
}
addresses {
http = "0.0.0.0"
rpc = "$BIND_ADDR"
serf = "$BIND_ADDR"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
plugin "podman" {
config {
volumes {
enabled = true
}
}
}
telemetry {
collection_interval = "10s"
disable_hostname = false
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}
log_level = "INFO"
log_file = "/var/log/nomad/nomad.log"
EOF
# 创建 systemd 服务文件
log "创建 systemd 服务文件..."
cat > /etc/systemd/system/nomad.service << EOF
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
[Service]
Type=notify
User=nomad
Group=nomad
ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
ExecReload=/bin/kill -HUP \$MAINPID
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
EOF
# 启动 Nomad 服务
log "启动 Nomad 服务..."
systemctl daemon-reload
systemctl enable nomad
systemctl start nomad
# 等待服务启动
log "等待 Nomad 服务启动..."
sleep 10
# 验证安装
log "验证 Nomad 安装..."
if systemctl is-active --quiet nomad; then
log "✅ Nomad 服务运行正常"
log "📊 节点信息:"
/usr/local/bin/nomad node status -self || true
else
log "❌ Nomad 服务启动失败"
systemctl status nomad --no-pager || true
journalctl -u nomad --no-pager -n 20 || true
fi
# 配置防火墙(如果需要)
log "配置防火墙规则..."
if command -v ufw >/dev/null 2>&1; then
ufw allow 4646/tcp # HTTP API
ufw allow 4647/tcp # RPC
ufw allow 4648/tcp # Serf
ufw allow 22/tcp # SSH
fi
# 创建有用的别名和脚本
log "创建管理脚本..."
cat > /usr/local/bin/nomad-status << 'EOF'
#!/bin/bash
echo "=== Nomad 服务状态 ==="
systemctl status nomad --no-pager
echo -e "\n=== Nomad 集群成员 ==="
nomad server members 2>/dev/null || echo "无法连接到集群"
echo -e "\n=== Nomad 节点状态 ==="
nomad node status 2>/dev/null || echo "无法获取节点状态"
echo -e "\n=== 最近日志 ==="
journalctl -u nomad --no-pager -n 5
EOF
chmod +x /usr/local/bin/nomad-status
# 添加到 ubuntu 用户的 bashrc
echo 'alias ns="nomad-status"' >> /home/ubuntu/.bashrc
echo 'alias nomad-logs="journalctl -u nomad -f"' >> /home/ubuntu/.bashrc
log "🎉 Nomad 节点配置完成!"
log "📍 数据中心: ${datacenter}"
log "🌐 IP 地址: $BIND_ADDR"
log "🔗 Web UI: http://$BIND_ADDR:4646"
log "📝 使用 'nomad-status' 或 'ns' 命令查看状态"
# 输出重要信息到 motd
cat > /etc/update-motd.d/99-nomad << EOF
#!/bin/bash
echo ""
echo "🚀 Nomad 节点信息:"
echo " 数据中心: ${datacenter}"
echo " IP 地址: $BIND_ADDR"
echo " Web UI: http://$BIND_ADDR:4646"
echo " 状态检查: nomad-status"
echo ""
EOF
chmod +x /etc/update-motd.d/99-nomad
log "节点配置脚本执行完成"

View File

@@ -0,0 +1,118 @@
# Nomad 多数据中心集群变量定义
variable "deploy_korea_node" {
description = "是否部署韩国节点 (Oracle Cloud)"
type = bool
default = true
}
variable "deploy_us_node" {
description = "是否部署美国节点 (华为云)"
type = bool
default = true
}
# Oracle Cloud 配置
variable "oracle_config" {
description = "Oracle Cloud 配置"
type = object({
tenancy_ocid = string
user_ocid = string
fingerprint = string
private_key_path = string
region = string
})
sensitive = true
}
variable "oracle_ubuntu_image_id" {
description = "Oracle Cloud Ubuntu 镜像 ID"
type = string
default = "" # 将通过数据源自动获取
}
variable "oracle_subnet_id" {
description = "Oracle Cloud 子网 ID"
type = string
}
variable "oracle_security_group_id" {
description = "Oracle Cloud 安全组 ID"
type = string
}
# 华为云配置
variable "huawei_config" {
description = "华为云配置"
type = object({
access_key = string
secret_key = string
region = string
})
sensitive = true
}
variable "huawei_ubuntu_image_id" {
description = "华为云 Ubuntu 镜像 ID"
type = string
default = "" # 将通过数据源自动获取
}
variable "huawei_subnet_id" {
description = "华为云子网 ID"
type = string
}
variable "huawei_security_group_id" {
description = "华为云安全组 ID"
type = string
}
# 通用配置
variable "common_tags" {
description = "通用标签"
type = map(string)
default = {
Project = "nomad-multi-dc"
Environment = "production"
ManagedBy = "opentofu"
}
}
variable "ssh_public_key" {
description = "SSH 公钥"
type = string
}
variable "allowed_cidr_blocks" {
description = "允许访问的 CIDR 块"
type = list(string)
default = ["0.0.0.0/0"] # 生产环境应该限制
}
# Nomad 特定配置
variable "nomad_version" {
description = "Nomad 版本"
type = string
default = "1.10.5"
}
variable "nomad_encrypt_key" {
description = "Nomad 集群加密密钥"
type = string
sensitive = true
default = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
}
# 网络配置
variable "vpc_cidr" {
description = "VPC CIDR 块"
type = string
default = "10.0.0.0/16"
}
variable "availability_zones" {
description = "可用区列表"
type = list(string)
default = ["a", "b"]
}