Clean repository: organized structure and GitOps setup

- Organized root directory structure
- Moved orphan files to proper locations
- Updated .gitignore to ignore temporary files
- Set up Gitea Runner for GitOps automation
- Fixed Tailscale access issues
- Added workflow for automated Nomad deployment
This commit is contained in:
2025-10-09 06:13:45 +00:00
commit 89ee6f7967
306 changed files with 30781 additions and 0 deletions

View File

@@ -0,0 +1,158 @@
# Nomad multi-datacenter cluster module.
# Supports cross-region deployment: CN (dc1) + KR (dc2) + US (dc3).
terraform {
  # Providers this module depends on, with their registry source and the
  # version constraint each one is pinned to.
  required_providers {
    aws = {
      version = "~> 5.0"
      source  = "hashicorp/aws"
    }

    oci = {
      version = "~> 7.20"
      source  = "oracle/oci"
    }
  }
}
# Local values shared by the resources in this module.
locals {
  # Nomad release to install. Taken from var.nomad_version (whose default is
  # the previously hard-coded "1.10.5") so the version lives in one place.
  nomad_version = var.nomad_version

  # Gossip encryption key shared by all Nomad agents. Taken from the sensitive
  # variable var.nomad_encrypt_key instead of repeating the literal here, so
  # callers can override it and Terraform redacts it in plan output.
  nomad_encrypt_key = var.nomad_encrypt_key

  # Datacenter layout.
  datacenters = {
    dc1 = {
      name     = "dc1"
      region   = "cn"
      location = "China"
      provider = "existing" # the pre-existing semaphore node
    }
    dc2 = {
      name     = "dc2"
      region   = "kr"
      location = "Korea"
      provider = "oracle"
    }
    dc3 = {
      name     = "dc3"
      region   = "us"
      location = "US"
      provider = "aws" # temporarily using AWS in place of Huawei Cloud
    }
  }

  # Rendered user-data script.
  #
  # templatefile() fails when the template references a variable that is not
  # present in this map. The user-data template also reads datacenter,
  # server_enabled, bootstrap_expect and consul_token, so they are supplied
  # here alongside the original keys.
  # NOTE(review): the values for the added keys mirror the settings used for
  # the dc2 instance, with datacenter set to "dc1" and an empty Consul token
  # - confirm they match the intended use of this local.
  user_data_template = templatefile("${path.module}/templates/nomad-userdata.sh", {
    datacenter        = "dc1"
    nomad_version     = local.nomad_version
    nomad_encrypt_key = local.nomad_encrypt_key
    bootstrap_expect  = 1
    server_enabled    = true
    client_enabled    = true
    consul_token      = ""
    VERSION_ID        = "20.04" # Ubuntu 20.04
    NOMAD_VERSION     = local.nomad_version
    NOMAD_ZIP         = "nomad_${local.nomad_version}_linux_amd64.zip"
    NOMAD_URL         = "https://releases.hashicorp.com/nomad/${local.nomad_version}/nomad_${local.nomad_version}_linux_amd64.zip"
    NOMAD_SHA256_URL  = "https://releases.hashicorp.com/nomad/${local.nomad_version}/nomad_${local.nomad_version}_SHA256SUMS"
    bind_addr         = "auto"
    nomad_servers     = "\"127.0.0.1\""
  })
}
# Data source: information about the pre-existing semaphore node.
data "external" "semaphore_info" {
  # bash prints a fixed JSON object; the external data source exposes its
  # keys (ip, datacenter, status) as the result map.
  program = [
    "bash",
    "-c",
    <<-EOF
      echo '{
        "ip": "100.116.158.95",
        "datacenter": "dc1",
        "status": "existing"
      }'
    EOF
  ]
}
# Oracle Cloud Korea node (dc2).
resource "oci_core_instance" "nomad_kr_node" {
  # Created only when the Korea node is enabled.
  count = var.deploy_korea_node ? 1 : 0

  # Basic configuration.
  # NOTE(review): var.oracle_availability_domain and var.oracle_ubuntu_image_id
  # default to "" and no data-source lookup is visible in this module, so the
  # caller presumably has to set them - confirm.
  compartment_id      = var.oracle_config.compartment_ocid
  display_name        = "nomad-master-kr"
  availability_domain = var.oracle_availability_domain
  shape               = "VM.Standard.E2.1.Micro" # free tier

  # Boot source.
  source_details {
    source_type = "image"
    source_id   = var.oracle_ubuntu_image_id
  }

  # Network configuration.
  create_vnic_details {
    subnet_id        = var.oracle_subnet_id
    display_name     = "nomad-kr-vnic"
    assign_public_ip = true
  }

  # Instance metadata: SSH key plus the rendered, base64-encoded user-data.
  metadata = {
    ssh_authorized_keys = var.ssh_public_key
    user_data = base64encode(templatefile("${path.module}/templates/nomad-userdata.sh", {
      datacenter        = "dc2"
      nomad_version     = local.nomad_version
      nomad_encrypt_key = local.nomad_encrypt_key
      bootstrap_expect  = 1
      bind_addr         = "auto"
      server_enabled    = true
      client_enabled    = true
      # The template references consul_token; templatefile() errors when a
      # referenced variable is missing from this map. An empty token keeps the
      # rendered consul block free of credentials.
      consul_token     = ""
      VERSION_ID       = "20.04" # Ubuntu 20.04
      NOMAD_VERSION    = local.nomad_version
      NOMAD_ZIP        = "nomad_${local.nomad_version}_linux_amd64.zip"
      NOMAD_URL        = "https://releases.hashicorp.com/nomad/${local.nomad_version}/nomad_${local.nomad_version}_linux_amd64.zip"
      NOMAD_SHA256_URL = "https://releases.hashicorp.com/nomad/${local.nomad_version}/nomad_${local.nomad_version}_SHA256SUMS"
      nomad_servers    = "\"127.0.0.1\""
    }))
  }

  # Tags. These are plain key/value labels, so they go in freeform_tags;
  # defined_tags only accepts keys of the form "<tag-namespace>.<key>" that
  # already exist in the tenancy.
  freeform_tags = merge(var.common_tags, {
    "Name"       = "nomad-master-kr"
    "Datacenter" = "dc2"
    "Role"       = "nomad-server"
    "Provider"   = "oracle"
  })
}
# 华为云美国节点 (dc3) - 暂时禁用
# resource "huaweicloud_compute_instance_v2" "nomad_us_node" {
# count = var.deploy_us_node ? 1 : 0
#
# name = "nomad-ash3c-us"
# image_id = var.huawei_ubuntu_image_id
# flavor_id = "s6.small.1" # 1vCPU 1GB
#
# # 网络配置
# network {
# uuid = var.huawei_subnet_id
# }
#
# # 元数据
# metadata = {
# ssh_authorized_keys = var.ssh_public_key
# user_data = base64encode(templatefile("${path.module}/templates/nomad-userdata.sh", {
# datacenter = "dc3"
# nomad_version = local.nomad_version
# nomad_encrypt_key = local.nomad_encrypt_key
# bootstrap_expect = 1
# bind_addr = "auto"
# server_enabled = true
# client_enabled = true
# }))
# }
#
# # 标签
# tags = merge(var.common_tags, {
# Name = "nomad-ash3c-us"
# Datacenter = "dc3"
# Role = "nomad-server"
# Provider = "huawei"
# })
# }

View File

@@ -0,0 +1,145 @@
# Outputs for the Nomad multi-datacenter cluster.
# Cluster overview.
output "cluster_overview" {
  description = "Nomad 多数据中心集群概览"

  value = {
    # One for the always-present dc1 node, plus one per enabled optional node.
    total_nodes = (var.deploy_korea_node ? 1 : 0) + (var.deploy_us_node ? 1 : 0) + 1

    # Per-datacenter summary; dc2 and dc3 are null when their flag is off.
    datacenters = {
      dc1 = {
        status   = "existing"
        ip       = "100.116.158.95"
        node     = "semaphore"
        provider = "existing"
        location = "China (CN)"
        name     = "dc1"
      }

      dc2 = var.deploy_korea_node ? {
        status   = "deployed"
        ip       = try(oci_core_instance.nomad_kr_node[0].public_ip, "pending")
        node     = "ch4"
        provider = "oracle"
        location = "Korea (KR)"
        name     = "dc2"
      } : null

      dc3 = var.deploy_us_node ? {
        status   = "disabled"
        ip       = "pending" # temporarily disabled
        node     = "ash3c"
        provider = "aws" # temporarily using AWS in place of Huawei Cloud
        location = "US"
        name     = "dc3"
      } : null
    }
  }
}
# Oracle Cloud Korea node output.
# Null when the Korea node is not deployed; otherwise an object describing the
# instance and how to reach it.
output "oracle_korea_node" {
  description = "Oracle Cloud 韩国节点信息"

  value = var.deploy_korea_node ? {
    instance_id = try(oci_core_instance.nomad_kr_node[0].id, null)
    public_ip   = try(oci_core_instance.nomad_kr_node[0].public_ip, null)
    private_ip  = try(oci_core_instance.nomad_kr_node[0].private_ip, null)
    datacenter  = "dc2"
    provider    = "oracle"

    # var.oracle_config is declared sensitive, so every attribute read from it
    # is sensitive too, and Terraform rejects an output that carries a
    # sensitive value without `sensitive = true`. The region name is not a
    # secret, so it is unwrapped here instead of hiding the whole output.
    region = nonsensitive(var.oracle_config.region)

    # Connection details.
    ssh_command = try("ssh ubuntu@${oci_core_instance.nomad_kr_node[0].public_ip}", null)
    nomad_ui    = try("http://${oci_core_instance.nomad_kr_node[0].public_ip}:4646", null)
  } : null
}
# 华为云美国节点输出 - 暂时禁用
# output "huawei_us_node" {
# description = "华为云美国节点信息"
# value = var.deploy_us_node ? {
# instance_id = try(huaweicloud_compute_instance_v2.nomad_us_node[0].id, null)
# public_ip = try(huaweicloud_compute_instance_v2.nomad_us_node[0].access_ip_v4, null)
# private_ip = try(huaweicloud_compute_instance_v2.nomad_us_node[0].network[0].fixed_ip_v4, null)
# datacenter = "dc3"
# provider = "huawei"
# region = var.huawei_config.region
#
# # 连接信息
# ssh_command = try("ssh ubuntu@${huaweicloud_compute_instance_v2.nomad_us_node[0].access_ip_v4}", null)
# nomad_ui = try("http://${huaweicloud_compute_instance_v2.nomad_us_node[0].access_ip_v4}:4646", null)
# } : null
# }
# Cluster connection information.
output "cluster_endpoints" {
  description = "集群连接端点"

  value = {
    # SSH commands, one per reachable node. compact() drops the entries that
    # evaluate to null.
    ssh_commands = compact([
      "ssh root@100.116.158.95", # dc1 - semaphore
      var.deploy_korea_node ? try("ssh ubuntu@${oci_core_instance.nomad_kr_node[0].public_ip}", null) : null, # dc2
      # dc3 - temporarily disabled:
      # var.deploy_us_node ? try("ssh ubuntu@${huaweicloud_compute_instance_v2.nomad_us_node[0].access_ip_v4}", null) : null
    ])

    # Nomad UI URLs, built the same way as the SSH commands.
    nomad_ui_urls = compact([
      "http://100.116.158.95:4646", # dc1 - semaphore
      var.deploy_korea_node ? try("http://${oci_core_instance.nomad_kr_node[0].public_ip}:4646", null) : null, # dc2
      # dc3 - temporarily disabled:
      # var.deploy_us_node ? try("http://${huaweicloud_compute_instance_v2.nomad_us_node[0].access_ip_v4}:4646", null) : null
    ])
  }
}
# Ansible inventory generation.
# Produces an inventory-shaped object with a single nomad_servers group.
output "ansible_inventory" {
  description = "生成的 Ansible inventory"

  value = {
    all = {
      children = {
        nomad_servers = {
          # The dc1 host is always present; the dc2 host is merged in only
          # when the Korea node is enabled.
          hosts = merge(
            {
              semaphore = {
                provider     = "existing"
                datacenter   = "dc1"
                ansible_host = "100.116.158.95"
              }
            },
            var.deploy_korea_node ? {
              master = {
                provider     = "oracle"
                datacenter   = "dc2"
                ansible_host = try(oci_core_instance.nomad_kr_node[0].public_ip, "pending")
              }
            } : {}
            # dc3 - temporarily disabled:
            # var.deploy_us_node ? {
            #   ash3c = {
            #     ansible_host = try(huaweicloud_compute_instance_v2.nomad_us_node[0].access_ip_v4, "pending")
            #     datacenter   = "dc3"
            #     provider     = "huawei"
            #   }
            # } : {}
          )
        }
      }
    }
  }
}
# Post-deployment verification commands.
# A list of shell lines (comment lines included) for an operator to run.
output "verification_commands" {
  description = "部署后验证命令"

  value = [
    "# 检查集群状态",
    "nomad server members",
    "",
    "# 检查各数据中心节点",
    "nomad node status -verbose",
    "",
    "# 跨数据中心任务调度测试",
    "nomad job run examples/cross-dc-test.nomad",
    "",
    "# 访问 UI",
    # One curl per known UI endpoint, joined into a single multi-line entry.
    join("\n", [
      for url in compact([
        "http://100.116.158.95:4646",
        var.deploy_korea_node ? try("http://${oci_core_instance.nomad_kr_node[0].public_ip}:4646", null) : null,
        # dc3 - temporarily disabled:
        # var.deploy_us_node ? try("http://${huaweicloud_compute_instance_v2.nomad_us_node[0].access_ip_v4}:4646", null) : null
      ]) : "curl -s ${url}/v1/status/leader"
    ])
  ]
}

View File

@@ -0,0 +1,276 @@
#!/bin/bash
# Nomad node user-data script.
# Automatically configures a Nomad node; supports server and client modes.
set -e
# Keep apt/dpkg from stopping at interactive prompts: this script is meant to
# run unattended, and with `set -e` a stalled or failed prompt would leave the
# node half-configured.
export DEBIAN_FRONTEND=noninteractive
# Logging helper: prints its first argument prefixed with a timestamp.
log() {
  echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
}
log "开始 Nomad 节点配置..."
# Update the system
log "更新系统包..."
apt-get update
apt-get upgrade -y
# Install required tools. jq is included because the check-nomad.sh script
# written later in this file parses `nomad node status -json` with it.
log "安装必要工具..."
apt-get install -y curl unzip wget gnupg software-properties-common jq
# Install Podman (used as the container runtime)
log "安装 Podman..."
# Load the OS release variables into the shell.
# NOTE(review): this file appears to be rendered by Terraform templatefile(),
# whose callers pass a VERSION_ID value; if so, the VERSION_ID placeholders in
# the two URLs below are filled in at render time rather than from
# /etc/os-release - confirm.
. /etc/os-release
# Register the Kubic libcontainers apt repository and trust its signing key.
echo "deb https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${VERSION_ID}/ /" | tee /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list
curl -L "https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${VERSION_ID}/Release.key" | apt-key add -
apt-get update
apt-get install -y podman
# Configure Podman: overwrite registries.conf so docker.io is the only search registry.
log "配置 Podman..."
mkdir -p /etc/containers
echo -e "[registries.search]\nregistries = ['docker.io']" > /etc/containers/registries.conf
# Download and install Nomad
log "安装 Nomad..."
# NOTE(review): the braced placeholders on the following lines share their
# names with values passed by the Terraform templatefile() callers
# (nomad_version, NOMAD_VERSION, NOMAD_ZIP, NOMAD_URL, NOMAD_SHA256_URL), so
# they are presumably substituted at render time, before the shell ever sees
# them - confirm.
NOMAD_VERSION=${nomad_version}
NOMAD_ZIP="nomad_${NOMAD_VERSION}_linux_amd64.zip"
NOMAD_URL="https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/${NOMAD_ZIP}"
NOMAD_SHA256_URL="https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_SHA256SUMS"
# Fetch the release archive and its checksum list into /tmp.
cd /tmp
wget -q ${NOMAD_URL}
wget -q ${NOMAD_SHA256_URL}
# Verify the archive; --ignore-missing skips checksum entries for files that
# were not downloaded. With `set -e` a mismatch aborts the script here.
sha256sum -c nomad_${NOMAD_VERSION}_SHA256SUMS --ignore-missing
# Unpack into /usr/local/bin, overwriting any existing files.
unzip -o ${NOMAD_ZIP} -d /usr/local/bin/
chmod +x /usr/local/bin/nomad
# Create the Nomad user and directories
log "创建 Nomad 用户和目录..."
# Only add the account when it does not exist yet: useradd exits non-zero for
# an existing user, which under `set -e` would abort a re-run of this script.
if ! id -u nomad >/dev/null 2>&1; then
  useradd --system --home /etc/nomad.d --shell /bin/false nomad
fi
mkdir -p /opt/nomad/data
# The generated nomad.hcl sets plugin_dir to this path, so make sure it exists.
mkdir -p /opt/nomad/plugins
mkdir -p /etc/nomad.d
mkdir -p /var/log/nomad
chown -R nomad:nomad /opt/nomad /etc/nomad.d /var/log/nomad
# Determine this machine's IP address.
# The bind_addr placeholder compared below is a template value; "auto" means
# the address is detected at boot, anything else is used verbatim.
if [ "${bind_addr}" = "auto" ]; then
  # Try several methods in order. The metadata probes use:
  #   -f            so an HTTP error (e.g. a 404 page from a metadata service
  #                 that does not serve this path) counts as a failure instead
  #                 of being captured as the "address";
  #   --max-time 2  so an unreachable endpoint cannot stall the boot script.
  BIND_ADDR=$(curl -sf --max-time 2 http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null || \
    curl -sf --max-time 2 http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/ip -H "Metadata-Flavor: Google" 2>/dev/null || \
    ip route get 8.8.8.8 | awk '{print $7; exit}' || \
    hostname -I | awk '{print $1}')
else
  BIND_ADDR="${bind_addr}"
fi
log "检测到 IP 地址: $BIND_ADDR"
# Create the Nomad configuration file.
#
# Two details of the generated file matter:
#   * The nomad_servers placeholder is emitted without surrounding quotes. The
#     Terraform callers already pass a quoted value ("127.0.0.1" including the
#     quote characters), so wrapping it in another pair of quotes produced an
#     invalid list element.
#   * The gossip key is written inside the server block, which is where the
#     Nomad agent configuration defines the encrypt parameter.
# NOTE(review): the top-level network block below is kept unchanged, but it is
# not a documented agent configuration block - confirm Nomad accepts it.
log "创建 Nomad 配置文件..."
cat > /etc/nomad.d/nomad.hcl << EOF
# Nomad 配置文件
datacenter = "${datacenter}"
data_dir = "/opt/nomad/data"
log_level = "INFO"
# 客户端配置
client {
  enabled = true
  servers = [${nomad_servers}]
  options {
    "driver.raw_exec.enable" = "1"
    "driver.podman.enabled" = "1"
  }
}
# 服务器配置
server {
  enabled = ${server_enabled}
  bootstrap_expect = ${bootstrap_expect}
  # 加密设置
  encrypt = "${nomad_encrypt_key}"
}
# Consul 集成
consul {
  address = "127.0.0.1:8500"
  token = "${consul_token}"
}
# 网络配置
network {
  mode = "bridge"
}
# UI 配置
ui {
  enabled = true
}
# 插件目录
plugin_dir = "/opt/nomad/plugins"
EOF
# Create the systemd service file.
# The heredoc delimiter is unquoted, so the shell expands variables inside it;
# the backslash before MAINPID keeps that reference literal so systemd, not
# this script, resolves it when the unit is reloaded.
# The unit starts the agent with every config file under /etc/nomad.d and
# restarts it on failure. It has no User= line, so the `nomad` account created
# earlier is not what the service runs as.
log "创建 systemd 服务文件..."
cat > /etc/systemd/system/nomad.service << EOF
[Unit]
Description=Nomad
Documentation=https://www.nomadproject.io/
Wants=network-online.target
After=network-online.target
[Service]
ExecReload=/bin/kill -HUP \$MAINPID
ExecStart=/usr/local/bin/nomad agent -config /etc/nomad.d
KillMode=process
KillSignal=SIGINT
LimitNOFILE=65536
LimitNPROC=infinity
Restart=on-failure
RestartSec=2
StartLimitBurst=3
StartLimitInterval=10
TasksMax=infinity
[Install]
WantedBy=multi-user.target
EOF
# Start the Nomad service: reload unit files, enable at boot, start now.
log "启动 Nomad 服务..."
systemctl daemon-reload
systemctl enable nomad
systemctl start nomad

# Give the agent a fixed ten seconds to come up before checking on it.
log "等待 Nomad 服务启动..."
sleep 10

# Verify the service state. On failure, dump the unit's journal and stop the
# script with a non-zero status.
if ! systemctl is-active --quiet nomad; then
  log "Nomad 服务启动失败"
  journalctl -u nomad --no-pager
  exit 1
fi
log "Nomad 服务启动成功"
# Create the Nomad status check script.
# The heredoc delimiter is quoted, so the shell writes the body verbatim.
#
# Two fixes inside the generated script:
#   * jq is checked up front, so a missing jq produces an explicit message
#     instead of a silent abort under `set -e`.
#   * The member count uses `|| true` instead of `|| echo "0"`. `grep -c`
#     already prints 0 when nothing matches (and exits 1); appending a second
#     "0" produced a two-line value that broke the numeric comparison.
log "创建状态检查脚本..."
cat > /usr/local/bin/check-nomad.sh << 'EOF'
#!/bin/bash
# Nomad 状态检查脚本
set -e
# 检查 Nomad 服务状态
if systemctl is-active --quiet nomad; then
  echo "Nomad 服务运行正常"
else
  echo "Nomad 服务未运行"
  exit 1
fi
# 检查 Nomad 节点状态
if ! command -v jq >/dev/null 2>&1; then
  echo "未安装 jq,无法检查 Nomad 节点状态"
  exit 1
fi
NODE_STATUS=$(nomad node status -self -json | jq -r '.Status')
if [ "$NODE_STATUS" = "ready" ]; then
  echo "Nomad 节点状态: $NODE_STATUS"
else
  echo "Nomad 节点状态异常: $NODE_STATUS"
  exit 1
fi
# 检查 Nomad 集群成员
SERVER_MEMBERS=$(nomad server members 2>/dev/null | grep -c "alive" || true)
if [ "$SERVER_MEMBERS" -gt 0 ]; then
  echo "Nomad 集群服务器成员: $SERVER_MEMBERS"
else
  echo "未找到 Nomad 集群服务器成员"
  exit 1
fi
echo "Nomad 状态检查完成"
EOF
chmod +x /usr/local/bin/check-nomad.sh
# Set up firewall rules
log "设置防火墙规则..."
if command -v ufw >/dev/null 2>&1; then
  # Allow SSH before turning the firewall on: `ufw --force enable` activates
  # ufw, and without an SSH rule the node would become unreachable for the
  # SSH access this deployment relies on.
  ufw allow 22/tcp   # SSH
  ufw allow 4646/tcp # Nomad HTTP
  ufw allow 4647/tcp # Nomad RPC
  ufw allow 4648/tcp # Nomad Serf
  ufw allow 4648/udp # Nomad Serf (gossip uses UDP as well as TCP)
  ufw --force enable
elif command -v firewall-cmd >/dev/null 2>&1; then
  firewall-cmd --permanent --add-port=4646/tcp
  firewall-cmd --permanent --add-port=4647/tcp
  firewall-cmd --permanent --add-port=4648/tcp
  firewall-cmd --permanent --add-port=4648/udp
  firewall-cmd --reload
fi
# Create a simple example Nomad job.
# The heredoc delimiter is quoted, so the job file is written verbatim.
#
# The port is declared in a group-level network block. The task's driver
# config maps ports by label (ports = ["redis"]), and those labels refer to
# ports declared on the group; the previous task-level resources.network block
# is the deprecated form and does not provide them.
log "创建示例任务..."
mkdir -p /opt/nomad/examples
cat > /opt/nomad/examples/redis.nomad << 'EOF'
job "redis" {
  datacenters = ["dc1", "dc2", "dc3"]
  type = "service"
  priority = 50
  update {
    stagger = "10s"
    max_parallel = 1
  }
  group "redis" {
    count = 1
    network {
      port "redis" {
        static = 6379
      }
    }
    restart {
      attempts = 3
      delay = "30s"
      interval = "5m"
      mode = "fail"
    }
    task "redis" {
      driver = "podman"
      config {
        image = "redis:alpine"
        ports = ["redis"]
      }
      resources {
        cpu = 200 # MHz
        memory = 128 # MB
      }
      service {
        name = "redis"
        port = "redis"
        check {
          type = "tcp"
          interval = "10s"
          timeout = "2s"
        }
      }
    }
  }
}
EOF
log "Nomad 节点配置完成"
# Look up the public address for the closing message. The metadata path is
# provider-specific, so the probe fails fast (-f turns an HTTP error into a
# failure, --max-time bounds the wait) and falls back to the address detected
# earlier in BIND_ADDR instead of printing an error page or hanging.
# Variables are written without braces on purpose: braced dollar expressions
# in this file are treated as template placeholders.
PUBLIC_IP=$(curl -sf --max-time 2 http://169.254.169.254/latest/meta-data/public-ipv4 2>/dev/null || echo "$BIND_ADDR")
log "Nomad UI 可通过 http://$PUBLIC_IP:4646 访问"

View File

@@ -0,0 +1,115 @@
# Variable definitions for the Nomad multi-datacenter cluster.

# Whether to deploy the Korea node (Oracle Cloud). Enabled by default.
variable "deploy_korea_node" {
  type        = bool
  default     = true
  description = "是否部署韩国节点 (Oracle Cloud)"
}

# Whether to deploy the US node (temporarily disabled). Off by default.
variable "deploy_us_node" {
  type        = bool
  default     = false
  description = "是否部署美国节点 (暂时禁用)"
}
# Oracle Cloud configuration.
# Credentials and placement for the OCI provider; marked sensitive, so values
# read from it are redacted in plan output.
variable "oracle_config" {
  sensitive   = true
  description = "Oracle Cloud 配置"

  type = object({
    tenancy_ocid     = string
    user_ocid        = string
    fingerprint      = string
    private_key_path = string
    region           = string
    compartment_ocid = string
  })
}

# Oracle Cloud availability domain.
variable "oracle_availability_domain" {
  type        = string
  description = "Oracle Cloud 可用域"
  default     = "" # to be fetched automatically via a data source
}

# Oracle Cloud Ubuntu image ID.
variable "oracle_ubuntu_image_id" {
  type        = string
  description = "Oracle Cloud Ubuntu 镜像 ID"
  default     = "" # to be fetched automatically via a data source
}

# Oracle Cloud subnet ID. Required: no default is provided.
variable "oracle_subnet_id" {
  type        = string
  description = "Oracle Cloud 子网 ID"
}
# 华为云配置 - 暂时禁用
# variable "huawei_config" {
# description = "华为云配置"
# type = object({
# access_key = string
# secret_key = string
# region = string
# })
# sensitive = true
# }
# variable "huawei_ubuntu_image_id" {
# description = "华为云 Ubuntu 镜像 ID"
# type = string
# default = "" # 将通过数据源自动获取
# }
# variable "huawei_subnet_id" {
# description = "华为云子网 ID"
# type = string
# }
# General configuration.

# Tags merged into the tag set of each resource.
variable "common_tags" {
  type        = map(string)
  description = "通用标签"

  default = {
    Project     = "nomad-multi-dc"
    Environment = "production"
    ManagedBy   = "terraform"
  }
}

# SSH public key installed on created instances. Required: no default.
variable "ssh_public_key" {
  type        = string
  description = "SSH 公钥"
}

# CIDR blocks allowed to access the nodes.
variable "allowed_cidr_blocks" {
  type        = list(string)
  description = "允许访问的 CIDR 块"
  default     = ["0.0.0.0/0"] # should be restricted in production
}
# Nomad-specific configuration.

# Nomad version.
variable "nomad_version" {
  type        = string
  default     = "1.10.5"
  description = "Nomad 版本"
}

# Nomad cluster encryption key.
# NOTE(review): the default is a literal key committed with the code; consider
# supplying the value from outside the repository and rotating this one.
variable "nomad_encrypt_key" {
  type        = string
  sensitive   = true
  default     = "NVOMDvXblgWfhtzFzOUIHnKEOrbXOkPrkIPbRGGf1YQ="
  description = "Nomad 集群加密密钥"
}
# Network configuration.

# VPC CIDR block.
variable "vpc_cidr" {
  type        = string
  default     = "10.0.0.0/16"
  description = "VPC CIDR 块"
}

# List of availability zones.
variable "availability_zones" {
  type        = list(string)
  default     = ["a", "b"]
  description = "可用区列表"
}