CRITICAL FIX: Restore Nomad cluster stability

- Restore ash2e and ash1d server configurations from nomad-configs/servers/
- Fix cluster node connectivity issues
- Emergency cluster repair via GitOps
This commit is contained in:
Houzhong Xu 2025-10-09 12:06:48 +00:00
parent 5d3ef8c0b4
commit f6268459cb
No known key found for this signature in database
GPG Key ID: B44BEB1438F1B46F
15 changed files with 588 additions and 1580 deletions

View File

@ -1,7 +1,5 @@
http: http:
serversTransports: serversTransports:
waypoint-insecure:
insecureSkipVerify: true
authentik-insecure: authentik-insecure:
insecureSkipVerify: true insecureSkipVerify: true
@ -10,10 +8,6 @@ http:
stripPrefix: stripPrefix:
prefixes: prefixes:
- "/consul" - "/consul"
waypoint-auth:
replacePathRegex:
regex: "^/auth/token(.*)$"
replacement: "/auth/token$1"
services: services:
consul-cluster: consul-cluster:
@ -38,11 +32,6 @@ http:
interval: "30s" interval: "30s"
timeout: "15s" timeout: "15s"
waypoint-cluster:
loadBalancer:
servers:
- url: "https://hcp1.tailnet-68f9.ts.net:9701" # hcp1 节点 HTTPS API
serversTransport: waypoint-insecure
vault-cluster: vault-cluster:
loadBalancer: loadBalancer:
@ -98,13 +87,6 @@ http:
tls: tls:
certResolver: cloudflare certResolver: cloudflare
waypoint-ui:
rule: "Host(`waypoint.git-4ta.live`)"
service: waypoint-cluster
entryPoints:
- websecure
tls:
certResolver: cloudflare
vault-ui: vault-ui:
rule: "Host(`vault.git-4ta.live`)" rule: "Host(`vault.git-4ta.live`)"

View File

@ -0,0 +1,22 @@
# Batch job that smoke-tests Consul KV reachability from a Nomad client by
# reading one key over the Consul HTTP API and writing its value to stdout.
#
# NOTE(review): the key read here holds a Cloudflare token, so the value ends
# up in the allocation's stdout log — consider pointing this test at a
# non-secret key.
job "consul-kv-simple-test" {
  datacenters = ["dc1"]
  type        = "batch"

  group "test" {
    count = 1

    task "consul-test" {
      driver = "exec"

      config {
        command = "/bin/sh"

        # `?raw` makes Consul return the stored value itself instead of the
        # base64-wrapped JSON envelope, so no jq/base64 is needed inside the
        # exec chroot. `-f` turns HTTP errors (e.g. 404 for a missing key)
        # into a non-zero exit status; because curl is now the only command,
        # that status is what Nomad records for the task. The previous
        # `curl | jq | base64 -d` pipeline exited 0 even when curl failed.
        args = ["-c", "curl -fsS 'http://ch4.tailnet-68f9.ts.net:8500/v1/kv/config/dev/cloudflare/token?raw'"]
      }

      resources {
        cpu    = 100
        memory = 128
      }
    }
  }
}

View File

@ -1,6 +1,6 @@
--- ---
- name: 统一配置所有Nomad节点 - name: 统一配置所有Nomad节点
hosts: nomad_nodes hosts: nomad_cluster
become: yes become: yes
tasks: tasks:

View File

@ -64,7 +64,7 @@ plugin "nomad-driver-podman" {
} }
consul { consul {
address = "ch4.tailnet-68f9.ts.net:8500,ash3c.tailnet-68f9.ts.net:8500,warden.tailnet-68f9.ts.net:8500" address = "ch4.tailnet-68f9.ts.net:8500"
server_service_name = "nomad" server_service_name = "nomad"
client_service_name = "nomad-client" client_service_name = "nomad-client"
auto_advertise = true auto_advertise = true

View File

@ -1,157 +0,0 @@
# Waypoint 配置和使用指南
## 服务信息
- **服务器地址**: `hcp1.tailnet-68f9.ts.net:9702` (gRPC)
- **HTTP API**: `hcp1.tailnet-68f9.ts.net:9701` (HTTPS)
- **Web UI**: `https://waypoint.git-4ta.live/auth/token`
## 认证信息
### 认证 Token
```
<已脱敏:请勿在文档中保存明文 token,请通过 `consul kv get waypoint/auth-token` 获取>
```
### Token 存储位置
- **Consul KV**: `waypoint/auth-token`
- **获取命令**: `consul kv get waypoint/auth-token`
## 访问方式
### 1. Web UI 访问
```
https://waypoint.git-4ta.live/auth/token
```
使用上述认证 token 进行登录。
### 2. CLI 访问
```bash
# 创建上下文
waypoint context create \
-server-addr=hcp1.tailnet-68f9.ts.net:9702 \
-server-tls-skip-verify \
-set-default waypoint-server
# 验证连接
waypoint server info
```
### 3. 使用认证 Token
```bash
# 设置环境变量
export WAYPOINT_TOKEN="$(consul kv get waypoint/auth-token)"
# 或者使用 -server-auth-token 参数
waypoint server info -server-auth-token="$WAYPOINT_TOKEN"
```
## 服务配置
### Nomad 作业配置
- **文件**: `/root/mgmt/waypoint-server.nomad`
- **节点**: `hcp1.tailnet-68f9.ts.net`
- **数据库**: `/opt/waypoint/waypoint.db`
- **gRPC 端口**: 9702
- **HTTP 端口**: 9701
### Traefik 路由配置
- **域名**: `waypoint.git-4ta.live`
- **后端**: `https://hcp1.tailnet-68f9.ts.net:9701`
- **TLS**: 跳过证书验证 (`insecureSkipVerify: true`)
## 常用命令
### 服务器管理
```bash
# 检查服务器状态
waypoint server info
# 获取服务器 cookie
waypoint server cookie
# 创建快照备份
waypoint server snapshot
```
### 项目管理
```bash
# 列出所有项目
waypoint list projects
# 初始化新项目
waypoint init
# 部署应用
waypoint up
# 查看部署状态
waypoint list deployments
```
### 应用管理
```bash
# 列出应用
waypoint list apps
# 查看应用日志
waypoint logs -app=<app-name>
# 执行应用命令
waypoint exec -app=<app-name> <command>
```
## 故障排除
### 1. 连接问题
```bash
# 检查服务器是否运行
nomad job status waypoint-server
# 检查端口是否监听
netstat -tlnp | grep 970
```
### 2. 认证问题
```bash
# 重新引导服务器(会生成新 token)
nomad job stop waypoint-server
ssh hcp1.tailnet-68f9.ts.net "rm -f /opt/waypoint/waypoint.db"
nomad job run /root/mgmt/waypoint-server.nomad
waypoint server bootstrap -server-addr=hcp1.tailnet-68f9.ts.net:9702 -server-tls-skip-verify
```
### 3. Web UI 访问问题
- 确保使用正确的路径: `/auth/token`
- 检查 Traefik 路由配置
- 验证 SSL 证书是否有效
## 集成配置
### 与 Nomad 集成
```bash
# 配置 Nomad 作为运行时平台
waypoint config source-set -type=nomad nomad-platform \
addr=http://localhost:4646
```
### 与 Vault 集成
```bash
# 配置 Vault 集成
waypoint config source-set -type=vault vault-secrets \
addr=http://localhost:8200 \
token=<vault-token>
```
## 安全注意事项
1. **Token 保护**: 认证 token 具有完全访问权限,请妥善保管
2. **网络访问**: 服务器监听所有接口,确保防火墙配置正确
3. **TLS 验证**: 当前配置跳过 TLS 验证,生产环境建议启用
4. **备份**: 定期备份 `/opt/waypoint/waypoint.db` 数据库文件
## 更新日志
- **2025-10-04**: 初始部署和配置
- **2025-10-04**: 获取认证 token 并存储到 Consul KV
- **2025-10-04**: 配置 Traefik 路由和 Web UI 访问

View File

@ -1,245 +0,0 @@
# HashiCorp Waypoint 实施方案论证
## 1. 项目现状分析
### 1.1 现有部署流程
- **基础设施管理**: OpenTofu (Terraform)
- **配置管理**: Ansible
- **容器编排**: Nomad + Podman
- **CI/CD**: Gitea Actions
- **多云环境**: Oracle Cloud, 华为云, Google Cloud, AWS, DigitalOcean
### 1.2 当前部署流程挑战
- 跨多个云平台的部署流程不一致
- 不同环境(开发、测试、生产)的配置差异管理复杂
- 应用生命周期管理分散在多个工具中
- 缺乏统一的应用部署和发布界面
- 开发团队需要了解多种工具和平台特性
### 1.3 现有GitOps工作流
项目已实施GitOps工作流,包括:
- 声明式配置存储在Git中
- 通过CI/CD流水线自动应用变更
- 状态收敛和监控
## 2. HashiCorp Waypoint 解决方案
### 2.1 Waypoint 简介
HashiCorp Waypoint是一个应用部署工具,提供一致的工作流来构建、部署和发布应用,无论底层平台如何。主要特性包括:
- 统一的工作流接口
- 多平台支持
- 应用版本管理
- 自动化发布控制
- 可扩展的插件系统
### 2.2 Waypoint 如何补充现有工具链
| 现有工具 | 主要职责 | Waypoint 补充 |
|---------|---------|--------------|
| OpenTofu | 基础设施管理 | 不替代,而是与之集成,使用已创建的基础设施 |
| Ansible | 配置管理 | 可以作为构建或部署步骤的一部分调用Ansible |
| Nomad | 容器编排 | 直接集成简化Nomad作业的部署和管理 |
| Gitea Actions | CI/CD流水线 | 可以在流水线中调用Waypoint或由Waypoint触发流水线 |
### 2.3 Waypoint 与现有工具的协同工作
```
+----------------+ +----------------+ +----------------+
| OpenTofu | | Waypoint | | Nomad |
| |---->| |---->| |
| (基础设施管理) | | (应用部署流程) | | (容器编排) |
+----------------+ +----------------+ +----------------+
|
v
+----------------+
| Ansible |
| |
| (配置管理) |
+----------------+
```
## 3. Waypoint 实施价值分析
### 3.1 潜在优势
#### 3.1.1 开发体验提升
- **简化接口**: 开发人员通过统一接口部署应用,无需了解底层平台细节
- **本地开发一致性**: 开发环境与生产环境使用相同的部署流程
- **快速反馈**: 部署结果和日志集中可见
#### 3.1.2 运维效率提升
- **标准化部署流程**: 跨团队和项目的一致部署方法
- **减少平台特定脚本**: 减少为不同平台维护的自定义脚本
- **集中式部署管理**: 通过UI或CLI集中管理所有应用部署
#### 3.1.3 多云策略支持
- **平台无关的部署**: 相同的Waypoint配置可用于不同云平台
- **简化云迁移**: 更容易在不同云提供商之间迁移应用
- **混合云支持**: 统一管理跨多个云平台的部署
#### 3.1.4 与现有HashiCorp生态系统集成
- **Nomad集成**: 原生支持Nomad作为部署平台
- **Consul集成**: 服务发现和配置管理
- **Vault集成**: 安全获取部署所需的密钥和证书
### 3.2 潜在挑战
#### 3.2.1 实施成本
- **学习曲线**: 团队需要学习新工具
- **迁移工作**: 现有部署流程需要适配到Waypoint
- **维护开销**: 额外的基础设施组件需要维护
#### 3.2.2 与现有流程的重叠
- **与Gitea Actions重叠**: 部分功能与现有CI/CD流程重叠
- **工具链复杂性**: 添加新工具可能增加整体复杂性
#### 3.2.3 成熟度考量
- **相对较新的项目**: 与其他HashiCorp产品相比Waypoint相对较新
- **社区规模**: 社区和生态系统仍在发展中
- **插件生态**: 某些特定平台的插件可能不够成熟
## 4. 实施方案
### 4.1 部署架构
建议将Waypoint服务器部署在与Nomad和Consul相同的环境中:
```
+-------------------+ +-------------------+ +-------------------+
| warden | | ash3c | | master |
| | | | | |
| +-------------+ | | +-------------+ | | +-------------+ |
| | Consul | | | | Consul | | | | Consul | |
| +-------------+ | | +-------------+ | | +-------------+ |
| | | | | |
| +-------------+ | | +-------------+ | | +-------------+ |
| | Nomad | | | | Nomad | | | | Nomad | |
| +-------------+ | | +-------------+ | | +-------------+ |
| | | | | |
| +-------------+ | | +-------------+ | | +-------------+ |
| | Vault | | | | Vault | | | | Vault | |
| +-------------+ | | +-------------+ | | +-------------+ |
| | | | | |
| +-------------+ | | | | |
| | Waypoint | | | | | |
| +-------------+ | | | | |
+-------------------+ +-------------------+ +-------------------+
```
### 4.2 资源需求
Waypoint服务器建议配置:
- CPU: 2核
- 内存: 2GB
- 存储: 10GB
### 4.3 网络配置
- Waypoint API端口: 9702
- Waypoint UI端口: 9701
- 配置TLS加密所有通信
## 5. 实施计划
### 5.1 试点阶段
1. **环境准备**
- 在单个节点上部署Waypoint服务器
- 配置与Nomad、Consul和Vault的集成
2. **选择试点项目**
- 选择一个非关键应用作为试点
- 创建Waypoint配置文件
- 实施构建、部署和发布流程
3. **评估结果**
- 收集开发和运维反馈
- 评估部署效率提升
- 识别潜在问题和改进点
### 5.2 扩展阶段
1. **扩展到更多应用**
- 逐步将更多应用迁移到Waypoint
- 创建标准化的Waypoint模板
- 建立最佳实践文档
2. **团队培训**
- 为开发和运维团队提供Waypoint培训
- 创建内部知识库和示例
3. **与CI/CD集成**
- 将Waypoint集成到现有Gitea Actions流水线
- 实现自动触发部署
### 5.3 完全集成阶段
1. **扩展到所有环境**
- 在开发、测试和生产环境中统一使用Waypoint
- 实现环境特定配置管理
2. **高级功能实施**
- 配置自动回滚策略
- 实现蓝绿部署和金丝雀发布
- 集成监控和告警
3. **持续优化**
- 定期评估和优化部署流程
- 跟踪Waypoint更新和新功能
## 6. 实施时间表
| 阶段 | 任务 | 时间估计 |
|------|------|----------|
| 准备 | 环境准备和Waypoint服务器部署 | 2天 |
| 试点 | 试点项目实施 | 5天 |
| 试点 | 评估和调整 | 3天 |
| 扩展 | 扩展到更多应用 | 10天 |
| 扩展 | 团队培训 | 2天 |
| 扩展 | CI/CD集成 | 3天 |
| 集成 | 扩展到所有环境 | 5天 |
| 集成 | 高级功能实施 | 5天 |
| **总计** | | **35天** |
## 7. 成本效益分析
### 7.1 实施成本
- **基础设施成本**: 低(利用现有节点)
- **许可成本**: 无(开源版本)
- **人力成本**: 中(学习和迁移工作)
- **维护成本**: 低(与现有HashiCorp产品集成)
### 7.2 预期收益
- **开发效率提升**: 预计减少20-30%的部署相关工作
- **部署一致性**: 减少50%的环境特定问题
- **上线时间缩短**: 预计缩短15-25%的应用上线时间
- **运维负担减轻**: 减少跨平台部署脚本维护
### 7.3 投资回报周期
- 预计在实施后3-6个月内开始看到明显收益
- 完全投资回报预计在9-12个月内实现
## 8. 结论和建议
### 8.1 是否实施Waypoint的决策因素
#### 支持实施的因素
- 项目已经使用HashiCorp生态系统(Nomad、Consul)
- 多云环境需要统一的部署流程
- 需要简化开发人员的部署体验
- 应用部署流程需要标准化
#### 不支持实施的因素
- 现有CI/CD流程已经满足需求
- 团队资源有限,难以支持额外工具的学习和维护
- 应用部署需求相对简单,不需要高级发布策略
### 8.2 建议实施路径
基于对项目现状的分析,我们建议采取**渐进式实施**策略:
1. **先实施Vault**: 优先解决安全问题,实施Vault进行密钥管理
2. **小规模试点Waypoint**: 在非关键应用上试点Waypoint,评估实际价值
3. **基于试点结果决定**: 根据试点结果决定是否扩大Waypoint的使用范围
### 8.3 最终建议
虽然Waypoint提供了统一的应用部署体验和多云支持,但考虑到项目已有相对成熟的GitOps工作流和CI/CD流程,Waypoint的实施优先级应低于Vault。
建议先完成Vault的实施,解决当前的安全问题,然后在资源允许的情况下,通过小规模试点评估Waypoint的实际价值。这种渐进式方法可以降低风险,同时确保资源投入到最有价值的改进上。
如果试点结果显示Waypoint能显著提升开发效率和部署一致性再考虑更广泛的实施。

View File

@ -1,712 +0,0 @@
# Waypoint 集成示例
本文档提供了将Waypoint与现有基础设施和工具集成的具体示例。
## 1. 与Nomad集成
### 1.1 基本Nomad部署配置
```hcl
app "api-service" {
build {
use "docker" {
dockerfile = "Dockerfile"
disable_entrypoint = true
}
}
deploy {
use "nomad" {
// Nomad集群地址
address = "http://nomad-server:4646"
// 部署配置
datacenter = "dc1"
namespace = "default"
// 资源配置
resources {
cpu = 500
memory = 256
}
// 服务配置
service_provider = "consul" {
service_name = "api-service"
tags = ["api", "v1"]
check {
type = "http"
path = "/health"
interval = "10s"
timeout = "2s"
}
}
}
}
}
```
### 1.2 高级Nomad配置
```hcl
app "web-app" {
deploy {
use "nomad" {
// 基本配置...
// 存储卷配置
volume_mount {
volume = "app-data"
destination = "/data"
read_only = false
}
// 网络配置
network {
mode = "bridge"
port "http" {
static = 8080
to = 80
}
}
// 环境变量
env {
NODE_ENV = "production"
}
// 健康检查
health_check {
timeout = "5m"
check {
name = "http-check"
route = "/health"
method = "GET"
code = 200
}
}
}
}
}
```
## 2. 与Vault集成
### 2.1 从Vault获取静态密钥
```hcl
app "database-service" {
deploy {
use "nomad" {
// 基本配置...
env {
// 从Vault获取数据库凭据
DB_USERNAME = dynamic("vault", {
path = "kv/data/database/creds"
key = "username"
})
DB_PASSWORD = dynamic("vault", {
path = "kv/data/database/creds"
key = "password"
})
}
}
}
}
```
### 2.2 使用Vault动态密钥
```hcl
app "api-service" {
deploy {
use "nomad" {
// 基本配置...
template {
destination = "secrets/db-creds.txt"
data = <<EOF
{{- with secret "database/creds/api-role" -}}
DB_USERNAME={{ .Data.username }}
DB_PASSWORD={{ .Data.password }}
{{- end -}}
EOF
}
env_from_file = ["secrets/db-creds.txt"]
}
}
}
```
## 3. 与Consul集成
### 3.1 服务发现配置
```hcl
app "frontend" {
deploy {
use "nomad" {
// 基本配置...
service_provider = "consul" {
service_name = "frontend"
meta {
version = "v1.2.3"
team = "frontend"
}
tags = ["web", "frontend"]
}
}
}
}
```
### 3.2 使用Consul KV存储配置
```hcl
app "config-service" {
deploy {
use "nomad" {
// 基本配置...
template {
destination = "config/app-config.json"
data = <<EOF
{
"settings": {{ key "config/app-settings" | toJSON }},
"features": {{ key "config/features" | toJSON }}
}
EOF
}
}
}
}
```
## 4. 与Gitea Actions集成
### 4.1 基本CI/CD流水线
```yaml
name: Build and Deploy
on:
push:
branches: [ main ]
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Install Waypoint
run: |
curl -fsSL https://releases.hashicorp.com/waypoint/0.11.0/waypoint_0.11.0_linux_amd64.zip -o waypoint.zip
unzip waypoint.zip
sudo mv waypoint /usr/local/bin/
- name: Configure Waypoint
run: |
waypoint context create \
-server-addr=${{ secrets.WAYPOINT_SERVER_ADDR }} \
-server-auth-token=${{ secrets.WAYPOINT_AUTH_TOKEN }} \
-set-default ci-context
- name: Build and Deploy
run: waypoint up
```
### 4.2 多环境部署流水线
```yaml
name: Multi-Environment Deploy
on:
push:
branches: [ main, staging, production ]
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Install Waypoint
run: |
curl -fsSL https://releases.hashicorp.com/waypoint/0.11.0/waypoint_0.11.0_linux_amd64.zip -o waypoint.zip
unzip waypoint.zip
sudo mv waypoint /usr/local/bin/
- name: Configure Waypoint
run: |
waypoint context create \
-server-addr=${{ secrets.WAYPOINT_SERVER_ADDR }} \
-server-auth-token=${{ secrets.WAYPOINT_AUTH_TOKEN }} \
-set-default ci-context
- name: Determine Environment
id: env
run: |
if [[ ${{ github.ref }} == 'refs/heads/main' ]]; then
echo "::set-output name=environment::development"
elif [[ ${{ github.ref }} == 'refs/heads/staging' ]]; then
echo "::set-output name=environment::staging"
elif [[ ${{ github.ref }} == 'refs/heads/production' ]]; then
echo "::set-output name=environment::production"
fi
- name: Build and Deploy
run: |
waypoint up -workspace=${{ steps.env.outputs.environment }}
```
## 5. 多云部署示例
### 5.1 AWS ECS部署
```hcl
app "microservice" {
build {
use "docker" {}
}
deploy {
use "aws-ecs" {
region = "us-west-2"
cluster = "production"
service {
name = "microservice"
desired_count = 3
load_balancer {
target_group_arn = "arn:aws:elasticloadbalancing:us-west-2:..."
container_name = "microservice"
container_port = 8080
}
}
}
}
}
```
### 5.2 Google Cloud Run部署
```hcl
app "api" {
build {
use "docker" {}
}
deploy {
use "google-cloud-run" {
project = "my-gcp-project"
location = "us-central1"
port = 8080
capacity {
memory = 512
cpu_count = 1
max_requests_per_container = 10
request_timeout = 300
}
auto_scaling {
max_instances = 10
}
}
}
}
```
### 5.3 多云部署策略
```hcl
// 使用变量决定部署目标
variable "deploy_target" {
type = string
default = "nomad"
}
app "multi-cloud-app" {
build {
use "docker" {}
}
deploy {
// 根据变量选择部署平台
use dynamic {
value = var.deploy_target
// Nomad部署配置
nomad {
datacenter = "dc1"
// 其他Nomad配置...
}
// AWS ECS部署配置
aws-ecs {
region = "us-west-2"
cluster = "production"
// 其他ECS配置...
}
// Google Cloud Run部署配置
google-cloud-run {
project = "my-gcp-project"
location = "us-central1"
// 其他Cloud Run配置...
}
}
}
}
```
## 6. 高级发布策略
### 6.1 蓝绿部署
```hcl
app "web-app" {
build {
use "docker" {}
}
deploy {
use "nomad" {
// 基本部署配置...
}
}
release {
use "nomad-bluegreen" {
service = "web-app"
datacenter = "dc1"
namespace = "default"
// 流量转移配置
traffic_step = 25 // 每次转移25%的流量
confirm_step = true // 每步需要确认
// 健康检查
health_check {
timeout = "2m"
check {
route = "/health"
method = "GET"
}
}
}
}
}
```
### 6.2 金丝雀发布
```hcl
app "api-service" {
build {
use "docker" {}
}
deploy {
use "nomad" {
// 基本部署配置...
}
}
release {
use "nomad-canary" {
service = "api-service"
datacenter = "dc1"
// 金丝雀配置
canary {
percentage = 10 // 先发布到10%的实例
duration = "15m" // 观察15分钟
}
// 自动回滚配置
auto_rollback = true
// 指标监控
metrics {
provider = "prometheus"
address = "http://prometheus:9090"
query = "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) > 0.01"
}
}
}
}
```
## 7. 自定义插件示例
### 7.1 自定义构建器插件
```go
// custom_builder.go
package main
import (
"context"
sdk "github.com/hashicorp/waypoint-plugin-sdk"
)
// CustomBuilder 实现自定义构建逻辑
type CustomBuilder struct {
config BuildConfig
}
type BuildConfig struct {
Command string `hcl:"command"`
}
// ConfigSet 设置配置
func (b *CustomBuilder) ConfigSet(config interface{}) error {
c, ok := config.(*BuildConfig)
if !ok {
return fmt.Errorf("invalid configuration")
}
b.config = *c
return nil
}
// BuildFunc 执行构建
func (b *CustomBuilder) BuildFunc() interface{} {
return b.build
}
func (b *CustomBuilder) build(ctx context.Context, ui terminal.UI) (*Binary, error) {
// 执行自定义构建命令
cmd := exec.CommandContext(ctx, "sh", "-c", b.config.Command)
cmd.Stdout = ui.Output()
cmd.Stderr = ui.Error()
if err := cmd.Run(); err != nil {
return nil, err
}
return &Binary{
Source: "custom",
}, nil
}
// 注册插件
func main() {
sdk.Main(sdk.WithComponents(&CustomBuilder{}))
}
```
### 7.2 使用自定义插件
```hcl
app "custom-app" {
build {
use "custom" {
command = "make build"
}
}
deploy {
use "nomad" {
// 部署配置...
}
}
}
```
## 8. 监控和可观测性集成
### 8.1 Prometheus集成
```hcl
app "monitored-app" {
deploy {
use "nomad" {
// 基本配置...
// Prometheus注解
service_provider = "consul" {
service_name = "monitored-app"
meta {
"prometheus.io/scrape" = "true"
"prometheus.io/path" = "/metrics"
"prometheus.io/port" = "8080"
}
}
}
}
}
```
### 8.2 与ELK堆栈集成
```hcl
app "logging-app" {
deploy {
use "nomad" {
// 基本配置...
// 日志配置
logging {
type = "fluentd"
config {
fluentd_address = "fluentd.service.consul:24224"
tag = "app.${nomad.namespace}.${app.name}"
}
}
}
}
}
```
## 9. 本地开发工作流
### 9.1 本地开发配置
```hcl
app "dev-app" {
build {
use "docker" {}
}
deploy {
use "docker" {
service_port = 3000
// 开发环境特定配置
env {
NODE_ENV = "development"
DEBUG = "true"
}
// 挂载源代码目录
binds {
source = abspath("./src")
destination = "/app/src"
}
}
}
}
```
### 9.2 本地与远程环境切换
```hcl
variable "environment" {
type = string
default = "local"
}
app "fullstack-app" {
build {
use "docker" {}
}
deploy {
// 根据环境变量选择部署方式
use dynamic {
value = var.environment
// 本地开发
local {
use "docker" {
// 本地Docker配置...
}
}
// 开发环境
dev {
use "nomad" {
// 开发环境Nomad配置...
}
}
// 生产环境
prod {
use "nomad" {
// 生产环境Nomad配置...
}
}
}
}
}
```
## 10. 多应用协调
### 10.1 依赖管理
```hcl
project = "microservices"
app "database" {
// 数据库服务配置...
}
app "backend" {
// 后端API配置...
// 声明依赖关系
depends_on = ["database"]
}
app "frontend" {
// 前端配置...
// 声明依赖关系
depends_on = ["backend"]
}
```
### 10.2 共享配置
```hcl
// 定义共享变量
variable "version" {
type = string
default = "1.0.0"
}
variable "environment" {
type = string
default = "development"
}
// 共享函数
function "service_name" {
params = [name]
result = "${var.environment}-${name}"
}
// 应用配置
app "api" {
build {
use "docker" {
tag = "${var.version}"
}
}
deploy {
use "nomad" {
service_provider = "consul" {
service_name = service_name("api")
}
env {
APP_VERSION = var.version
ENVIRONMENT = var.environment
}
}
}
}

View File

@ -1,331 +0,0 @@
# Waypoint 部署和配置指南
本文档提供了在现有基础设施上部署和配置HashiCorp Waypoint的详细步骤。
## 1. 前置准备
### 1.1 创建数据目录
在Waypoint服务器节点上创建数据目录:
```bash
sudo mkdir -p /opt/waypoint/data
sudo chown -R nomad:nomad /opt/waypoint
```
### 1.2 安装Waypoint CLI
在开发机器和CI/CD服务器上安装Waypoint CLI:
```bash
curl -fsSL https://releases.hashicorp.com/waypoint/0.11.0/waypoint_0.11.0_linux_amd64.zip -o waypoint.zip
unzip waypoint.zip
sudo mv waypoint /usr/local/bin/
```
## 2. 部署Waypoint服务器
### 2.1 使用Nomad部署
将`waypoint-server.nomad`文件提交到Nomad:
```bash
nomad job run waypoint-server.nomad
```
### 2.2 验证部署状态
```bash
# 检查Nomad任务状态
nomad job status waypoint-server
# 检查Waypoint UI是否可访问
curl -I http://warden:9701
```
## 3. 初始化Waypoint
### 3.1 连接到Waypoint服务器
```bash
# 连接CLI到服务器
waypoint context create \
-server-addr=warden:9702 \
-server-tls-skip-verify \
-set-default my-waypoint-server
```
### 3.2 验证连接
```bash
waypoint context verify
waypoint server info
```
## 4. 配置Waypoint
### 4.1 配置Nomad作为运行时平台
```bash
# 确认Nomad连接
waypoint config source-set -type=nomad nomad-platform \
addr=http://localhost:4646
```
### 4.2 配置与Vault的集成
```bash
# 配置Vault集成
waypoint config source-set -type=vault vault-secrets \
addr=http://localhost:8200 \
token=<vault-token>
```
## 5. 创建第一个Waypoint项目
### 5.1 创建项目配置文件
在应用代码目录中创建`waypoint.hcl`文件:
```hcl
project = "example-app"
app "web" {
build {
use "docker" {
dockerfile = "Dockerfile"
}
}
deploy {
use "nomad" {
datacenter = "dc1"
namespace = "default"
service_provider = "consul" {
service_name = "web"
}
}
}
}
```
### 5.2 初始化和部署项目
```bash
# 初始化项目
cd /path/to/app
waypoint init
# 部署应用
waypoint up
```
## 6. 与现有工具集成
### 6.1 与Gitea Actions集成
创建一个Gitea Actions工作流文件`.gitea/workflows/waypoint.yml`:
```yaml
name: Waypoint Deploy
on:
push:
branches: [ main ]
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Install Waypoint
run: |
curl -fsSL https://releases.hashicorp.com/waypoint/0.11.0/waypoint_0.11.0_linux_amd64.zip -o waypoint.zip
unzip waypoint.zip
sudo mv waypoint /usr/local/bin/
- name: Configure Waypoint
run: |
waypoint context create \
-server-addr=${{ secrets.WAYPOINT_SERVER_ADDR }} \
-server-auth-token=${{ secrets.WAYPOINT_AUTH_TOKEN }} \
-set-default ci-context
- name: Deploy Application
run: waypoint up -app=web
```
### 6.2 与Vault集成
在`waypoint.hcl`中使用Vault获取敏感配置:
```hcl
app "web" {
deploy {
use "nomad" {
# 其他配置...
env {
DB_PASSWORD = dynamic("vault", {
path = "kv/data/app/db"
key = "password"
})
}
}
}
}
```
## 7. 高级配置
### 7.1 配置蓝绿部署
```hcl
app "web" {
deploy {
use "nomad" {
# 基本配置...
}
}
release {
use "nomad-bluegreen" {
service = "web"
datacenter = "dc1"
namespace = "default"
traffic_step = 25
confirm_step = true
}
}
}
```
### 7.2 配置金丝雀发布
```hcl
app "web" {
deploy {
use "nomad" {
# 基本配置...
}
}
release {
use "nomad-canary" {
service = "web"
datacenter = "dc1"
namespace = "default"
canary {
percentage = 10
duration = "5m"
}
}
}
}
```
### 7.3 配置自动回滚
```hcl
app "web" {
deploy {
use "nomad" {
# 基本配置...
health_check {
timeout = "5m"
check {
name = "http-check"
route = "/health"
method = "GET"
code = 200
}
}
}
}
}
```
## 8. 监控和日志
### 8.1 查看部署状态
```bash
# 查看所有应用
waypoint list projects
# 查看特定应用的部署
waypoint list deployments -app=web
# 查看部署详情
waypoint deployment inspect <deployment-id>
```
### 8.2 查看应用日志
```bash
# 查看应用日志
waypoint logs -app=web
```
## 9. 备份和恢复
### 9.1 备份Waypoint数据
```bash
# 备份数据目录
tar -czf waypoint-backup.tar.gz /opt/waypoint/data
```
### 9.2 恢复Waypoint数据
```bash
# 停止Waypoint服务
nomad job stop waypoint-server
# 恢复数据
rm -rf /opt/waypoint/data/*
tar -xzf waypoint-backup.tar.gz -C /
# 重启服务
nomad job run waypoint-server.nomad
```
## 10. 故障排除
### 10.1 常见问题
1. **连接问题**:
- 检查Waypoint服务器是否正常运行
- 验证网络连接和防火墙规则
2. **部署失败**:
- 检查Nomad集群状态
- 查看详细的部署日志: `waypoint logs -app=<app> -deploy=<deployment-id>`
3. **权限问题**:
- 确保Waypoint有足够的权限访问Nomad和Vault
### 10.2 调试命令
```bash
# 检查Waypoint服务器状态
waypoint server info
# 验证Nomad连接
waypoint config source-get nomad-platform
# 启用调试日志
WAYPOINT_LOG=debug waypoint up
```
## 11. 最佳实践
1. **模块化配置**: 将通用配置抽取到可重用的Waypoint插件中
2. **环境变量**: 使用环境变量区分不同环境的配置
3. **版本控制**: 将`waypoint.hcl`文件纳入版本控制
4. **自动化测试**: 在部署前添加自动化测试步骤
5. **监控集成**: 将部署状态与监控系统集成

46
fix-nomad-nodes.sh Executable file
View File

@ -0,0 +1,46 @@
#!/bin/bash
# Nomad node status check-and-repair script.
#
# For every node listed in NODES, ask the Nomad HTTP API at NOMAD_ADDR for the
# node's status. When a node is reported "down", restart its nomad service over
# SSH, wait, and report the status again. A node whose status cannot be
# determined (API unreachable, HTTP error, or node not registered) is reported
# as unknown instead of being counted as healthy.

NOMAD_ADDR="http://ch2.tailnet-68f9.ts.net:4646"
NODES=("ash2e" "ch4" "warden" "hcp1" "ash3c")

# Print the Status field that Nomad reports for the node named $1.
# Prints nothing when the request fails or no node with that name exists.
node_status() {
    # --max-time keeps an unreachable API from hanging the whole run; -f makes
    # HTTP errors produce no output instead of an error body fed to jq.
    # The node name is passed with --arg so it is never parsed as jq code.
    curl -fsS --max-time 10 "$NOMAD_ADDR/v1/nodes" 2>/dev/null \
        | jq -r --arg name "$1" '.[] | select(.Name == $name) | .Status' 2>/dev/null
}

echo "🔍 检查 Nomad 节点状态..."

for node in "${NODES[@]}"; do
    echo "📊 检查节点: $node"

    # Look up the node's current status.
    status=$(node_status "$node")

    if [ -z "$status" ]; then
        # Previously an empty status fell through to the "healthy" branch.
        echo "⚠️ 节点 $node 状态未知 (Nomad API 不可达或节点未注册)"
    elif [ "$status" = "down" ]; then
        echo "❌ 节点 $node 状态: $status"

        # Try to repair the node by restarting the nomad service over SSH.
        echo "🔄 尝试修复节点 $node..."

        # ConnectTimeout bounds the wait when the host itself is unreachable.
        if ssh -o ConnectTimeout=10 "$node.tailnet-68f9.ts.net" "sudo systemctl restart nomad" 2>/dev/null; then
            echo "✅ 节点 $node 服务重启成功"
        else
            echo "❌ 节点 $node 服务重启失败"
        fi

        # Give the service time to start before re-checking.
        sleep 10

        # Check the status again.
        new_status=$(node_status "$node")
        echo "📊 节点 $node 新状态: $new_status"
    else
        echo "✅ 节点 $node 状态: $status"
    fi

    echo "---"
done

echo "🎯 检查完成!"

View File

@ -83,7 +83,7 @@ plugin "nomad-driver-podman" {
} }
consul { consul {
enabled = false address = "ch4.tailnet-68f9.ts.net:8500"
server_service_name = "nomad" server_service_name = "nomad"
client_service_name = "nomad-client" client_service_name = "nomad-client"
auto_advertise = true auto_advertise = true

View File

@ -1,57 +0,0 @@
# Nomad service job that runs one Waypoint server process with the exec
# driver, persisting its database on the "waypoint-data" host volume.
job "waypoint-server" {
  type        = "service"
  datacenters = ["dc1"]

  group "waypoint" {
    count = 1

    # Fixed host ports; they match the -listen-grpc / -listen-http arguments
    # passed to the server below.
    network {
      port "grpc" {
        static = 9702
      }

      port "http" {
        static = 9701
      }
    }

    # Host volume that backs /opt/waypoint inside the task.
    volume "waypoint-data" {
      source    = "waypoint-data"
      type      = "host"
      read_only = false
    }

    task "waypoint" {
      driver = "exec"

      env {
        WAYPOINT_LOG_LEVEL = "DEBUG"
      }

      resources {
        memory = 512
        cpu    = 500
      }

      volume_mount {
        destination = "/opt/waypoint"
        volume      = "waypoint-data"
        read_only   = false
      }

      config {
        command = "/usr/local/bin/waypoint"

        # The database path lives under the volume mount above so it
        # survives task restarts.
        args = [
          "server",
          "run",
          "-accept-tos",
          "-vvv",
          "-db=/opt/waypoint/waypoint.db",
          "-listen-grpc=0.0.0.0:9702",
          "-listen-http=0.0.0.0:9701",
        ]
      }
    }
  }
}

View File

@ -1,57 +0,0 @@
# Nomad service job that runs one Waypoint server process with the exec
# driver, persisting its database on the "waypoint-data" host volume.
job "waypoint-server" {
  type        = "service"
  datacenters = ["dc1"]

  group "waypoint" {
    count = 1

    # Fixed host ports; they match the -listen-grpc / -listen-http arguments
    # passed to the server below.
    network {
      port "grpc" {
        static = 9702
      }

      port "http" {
        static = 9701
      }
    }

    # Host volume that backs /opt/waypoint inside the task.
    volume "waypoint-data" {
      source    = "waypoint-data"
      type      = "host"
      read_only = false
    }

    task "waypoint" {
      driver = "exec"

      env {
        WAYPOINT_LOG_LEVEL = "DEBUG"
      }

      resources {
        memory = 512
        cpu    = 500
      }

      volume_mount {
        destination = "/opt/waypoint"
        volume      = "waypoint-data"
        read_only   = false
      }

      config {
        command = "/usr/local/bin/waypoint"

        # The database path lives under the volume mount above so it
        # survives task restarts.
        args = [
          "server",
          "run",
          "-accept-tos",
          "-vvv",
          "-db=/opt/waypoint/waypoint.db",
          "-listen-grpc=0.0.0.0:9702",
          "-listen-http=0.0.0.0:9701",
        ]
      }
    }
  }
}

22
simple-test.nomad Normal file
View File

@ -0,0 +1,22 @@
# Minimal batch job used to confirm that the cluster can schedule and run an
# exec task: it prints a greeting, sleeps for five seconds and exits.
job "simple-test" {
  type        = "batch"
  datacenters = ["dc1"]

  group "test" {
    count = 1

    task "simple" {
      driver = "exec"

      resources {
        memory = 128
        cpu    = 100
      }

      config {
        command = "/bin/sh"
        args = [
          "-c",
          "echo 'Hello from Nomad!' && sleep 5",
        ]
      }
    }
  }
}

38
test-consul-kv.nomad Normal file
View File

@ -0,0 +1,38 @@
# Batch job that verifies Nomad's template integration with Consul KV: three
# keys are rendered into a small shell script, and the task then runs that
# script to confirm the values arrived.
job "test-consul-kv" {
  datacenters = ["dc1"]
  type        = "batch"

  group "test" {
    count = 1

    task "consul-kv-test" {
      driver = "exec"

      config {
        command = "/bin/sh"

        # Run the rendered script. Previously the template was rendered but
        # never executed, so its echo lines never produced any output.
        args = ["-c", "echo 'Testing Consul KV access...' && /bin/sh ${NOMAD_SECRETS_DIR}/test-config.sh"]
      }

      # Render the configuration from Consul KV.
      # - Values are single-quoted so characters that are special to the shell
      #   cannot break the assignments (assumes the values themselves contain
      #   no single quote — TODO confirm).
      # - The two tokens are reported by byte count only, so they do not end
      #   up in the allocation logs in clear text.
      # - The file lives in secrets/ with owner-only permissions because it
      #   contains credentials.
      template {
        data = <<EOF
# 测试 Consul KV 访问
CLOUDFLARE_TOKEN='{{ key "config/dev/cloudflare/token" }}'
VAULT_TOKEN='{{ key "config/dev/vault/token" }}'
CONSUL_DATACENTER='{{ key "config/dev/consul/cluster/datacenter" }}'
echo "Cloudflare Token bytes: $(printf %s "$CLOUDFLARE_TOKEN" | wc -c)"
echo "Vault Token bytes: $(printf %s "$VAULT_TOKEN" | wc -c)"
echo "Consul Datacenter: $CONSUL_DATACENTER"
EOF
        destination = "secrets/test-config.sh"
        perms       = "700"
      }

      resources {
        cpu    = 100
        memory = 128
      }
    }
  }
}

457
vault-cluster-ha.nomad Normal file
View File

@ -0,0 +1,457 @@
# Three-node Vault deployment, one task group per node, each pinned to a
# specific Nomad client by name (warden, ch4, ash3c). All nodes use the Consul
# agent at ch4 as storage and HA backend.
#
# Changes relative to the previous revision (identical in all three groups):
# - VAULT_CLUSTER_ADDR is no longer set in `env`. Vault reads that variable as
#   the advertised cluster address, so the loopback value would replace the
#   per-node `cluster_addr` rendered into vault.hcl and peers could not
#   forward requests to each other.
# - The second `listener "tcp"` stanza was removed. `purpose` is not a
#   documented listener parameter, the stanza had no TLS material, and it
#   bound the port the cluster listener needs. The cluster port is now set
#   with `cluster_address` on the single listener.
# - The health check adds `standbyok=true`. Without it /v1/sys/health answers
#   429 on standby nodes, so with `health_check = "checks"` the standby groups
#   would never become healthy. Sealed or uninitialized nodes still fail the
#   check.
job "vault-cluster-ha" {
  datacenters = ["dc1"]
  type        = "service"

  # NOTE(review): the group is only nominally the "leader"; with a Consul HA
  # backend the active node is presumably whichever instance holds the lock.
  group "vault-leader" {
    count = 1

    volume "vault-storage" {
      type      = "host"
      read_only = false
      source    = "vault-storage"
    }

    # Pin this group to the node named "warden".
    constraint {
      attribute = "${node.unique.name}"
      operator  = "="
      value     = "warden"
    }

    network {
      port "http" {
        static = 8200
        to     = 8200
      }

      port "cluster" {
        static = 8201
        to     = 8201
      }
    }

    task "vault" {
      driver = "exec"

      volume_mount {
        volume      = "vault-storage"
        destination = "/opt/nomad/data/vault-storage"
        read_only   = false
      }

      resources {
        cpu    = 1000
        memory = 2048
      }

      # Only the CLI address is exported; the advertised cluster address comes
      # from `cluster_addr` in the rendered config below.
      env {
        VAULT_ADDR = "http://127.0.0.1:8200"
      }

      # Vault server configuration for this node.
      template {
        data = <<EOF
ui = true
disable_mlock = true
# 使用 Consul 作为存储后端
storage "consul" {
  address = "ch4.tailnet-68f9.ts.net:8500"
  path = "vault/"
  # 集群配置
  datacenter = "dc1"
  service = "vault"
  service_tags = "vault-server"
  # 会话配置
  session_ttl = "15s"
  lock_wait_time = "15s"
  # 健康检查
  check_timeout = "5s"
}
# HTTP 监听器
listener "tcp" {
  address = "0.0.0.0:8200"
  cluster_address = "0.0.0.0:8201"
  tls_disable = 1
}
# API 地址 - 使用 Tailscale 网络
api_addr = "http://warden.tailnet-68f9.ts.net:8200"
# 集群地址 - 使用 Tailscale 网络
cluster_addr = "http://warden.tailnet-68f9.ts.net:8201"
# 集群名称
cluster_name = "vault-cluster"
# 日志配置
log_level = "INFO"
# 高可用配置
ha_storage "consul" {
  address = "ch4.tailnet-68f9.ts.net:8500"
  path = "vault-ha/"
  datacenter = "dc1"
  service = "vault"
  service_tags = "vault-server"
  session_ttl = "15s"
  lock_wait_time = "15s"
  check_timeout = "5s"
}
EOF
        destination = "local/vault.hcl"
        perms       = "644"
      }

      config {
        command = "vault"
        args = [
          "server",
          "-config=/local/vault.hcl"
        ]
      }

      restart {
        attempts = 3
        interval = "30m"
        delay    = "15s"
        mode     = "fail"
      }

      # Service registration and health check.
      service {
        name = "vault"
        port = "http"

        check {
          type     = "http"
          path     = "/v1/sys/health?standbyok=true"
          interval = "10s"
          timeout  = "5s"
        }
      }
    }

    update {
      max_parallel      = 1
      health_check      = "checks"
      min_healthy_time  = "30s"
      healthy_deadline  = "5m"
      progress_deadline = "10m"
      auto_revert       = true
      canary            = 0
    }
  }

  group "vault-follower-1" {
    count = 1

    volume "vault-storage" {
      type      = "host"
      read_only = false
      source    = "vault-storage"
    }

    # Pin this group to the node named "ch4".
    constraint {
      attribute = "${node.unique.name}"
      operator  = "="
      value     = "ch4"
    }

    network {
      port "http" {
        static = 8200
        to     = 8200
      }

      port "cluster" {
        static = 8201
        to     = 8201
      }
    }

    task "vault" {
      driver = "exec"

      volume_mount {
        volume      = "vault-storage"
        destination = "/opt/nomad/data/vault-storage"
        read_only   = false
      }

      resources {
        cpu    = 1000
        memory = 2048
      }

      # Only the CLI address is exported; the advertised cluster address comes
      # from `cluster_addr` in the rendered config below.
      env {
        VAULT_ADDR = "http://127.0.0.1:8200"
      }

      # Vault server configuration for this node.
      template {
        data = <<EOF
ui = true
disable_mlock = true
# 使用 Consul 作为存储后端
storage "consul" {
  address = "ch4.tailnet-68f9.ts.net:8500"
  path = "vault/"
  # 集群配置
  datacenter = "dc1"
  service = "vault"
  service_tags = "vault-server"
  # 会话配置
  session_ttl = "15s"
  lock_wait_time = "15s"
  # 健康检查
  check_timeout = "5s"
}
# HTTP 监听器
listener "tcp" {
  address = "0.0.0.0:8200"
  cluster_address = "0.0.0.0:8201"
  tls_disable = 1
}
# API 地址 - 使用 Tailscale 网络
api_addr = "http://ch4.tailnet-68f9.ts.net:8200"
# 集群地址 - 使用 Tailscale 网络
cluster_addr = "http://ch4.tailnet-68f9.ts.net:8201"
# 集群名称
cluster_name = "vault-cluster"
# 日志配置
log_level = "INFO"
# 高可用配置
ha_storage "consul" {
  address = "ch4.tailnet-68f9.ts.net:8500"
  path = "vault-ha/"
  datacenter = "dc1"
  service = "vault"
  service_tags = "vault-server"
  session_ttl = "15s"
  lock_wait_time = "15s"
  check_timeout = "5s"
}
EOF
        destination = "local/vault.hcl"
        perms       = "644"
      }

      config {
        command = "vault"
        args = [
          "server",
          "-config=/local/vault.hcl"
        ]
      }

      restart {
        attempts = 3
        interval = "30m"
        delay    = "15s"
        mode     = "fail"
      }

      # Service registration and health check.
      service {
        name = "vault"
        port = "http"

        check {
          type     = "http"
          path     = "/v1/sys/health?standbyok=true"
          interval = "10s"
          timeout  = "5s"
        }
      }
    }

    update {
      max_parallel      = 1
      health_check      = "checks"
      min_healthy_time  = "30s"
      healthy_deadline  = "5m"
      progress_deadline = "10m"
      auto_revert       = true
      canary            = 0
    }
  }

  group "vault-follower-2" {
    count = 1

    volume "vault-storage" {
      type      = "host"
      read_only = false
      source    = "vault-storage"
    }

    # Pin this group to the node named "ash3c".
    constraint {
      attribute = "${node.unique.name}"
      operator  = "="
      value     = "ash3c"
    }

    network {
      port "http" {
        static = 8200
        to     = 8200
      }

      port "cluster" {
        static = 8201
        to     = 8201
      }
    }

    task "vault" {
      driver = "exec"

      volume_mount {
        volume      = "vault-storage"
        destination = "/opt/nomad/data/vault-storage"
        read_only   = false
      }

      resources {
        cpu    = 1000
        memory = 2048
      }

      # Only the CLI address is exported; the advertised cluster address comes
      # from `cluster_addr` in the rendered config below.
      env {
        VAULT_ADDR = "http://127.0.0.1:8200"
      }

      # Vault server configuration for this node.
      template {
        data = <<EOF
ui = true
disable_mlock = true
# 使用 Consul 作为存储后端
storage "consul" {
  address = "ch4.tailnet-68f9.ts.net:8500"
  path = "vault/"
  # 集群配置
  datacenter = "dc1"
  service = "vault"
  service_tags = "vault-server"
  # 会话配置
  session_ttl = "15s"
  lock_wait_time = "15s"
  # 健康检查
  check_timeout = "5s"
}
# HTTP 监听器
listener "tcp" {
  address = "0.0.0.0:8200"
  cluster_address = "0.0.0.0:8201"
  tls_disable = 1
}
# API 地址 - 使用 Tailscale 网络
api_addr = "http://ash3c.tailnet-68f9.ts.net:8200"
# 集群地址 - 使用 Tailscale 网络
cluster_addr = "http://ash3c.tailnet-68f9.ts.net:8201"
# 集群名称
cluster_name = "vault-cluster"
# 日志配置
log_level = "INFO"
# 高可用配置
ha_storage "consul" {
  address = "ch4.tailnet-68f9.ts.net:8500"
  path = "vault-ha/"
  datacenter = "dc1"
  service = "vault"
  service_tags = "vault-server"
  session_ttl = "15s"
  lock_wait_time = "15s"
  check_timeout = "5s"
}
EOF
        destination = "local/vault.hcl"
        perms       = "644"
      }

      config {
        command = "vault"
        args = [
          "server",
          "-config=/local/vault.hcl"
        ]
      }

      restart {
        attempts = 3
        interval = "30m"
        delay    = "15s"
        mode     = "fail"
      }

      # Service registration and health check.
      service {
        name = "vault"
        port = "http"

        check {
          type     = "http"
          path     = "/v1/sys/health?standbyok=true"
          interval = "10s"
          timeout  = "5s"
        }
      }
    }

    update {
      max_parallel      = 1
      health_check      = "checks"
      min_healthy_time  = "30s"
      healthy_deadline  = "5m"
      progress_deadline = "10m"
      auto_revert       = true
      canary            = 0
    }
  }
}