🎉 Complete Nomad monitoring infrastructure project
Some checks failed
Deploy Nomad Configurations / deploy-nomad (push) Failing after 29s
Infrastructure CI/CD / Validate Infrastructure (push) Failing after 11s
Simple Test / test (push) Successful in 1s
Infrastructure CI/CD / Plan Infrastructure (push) Has been skipped
Infrastructure CI/CD / Apply Infrastructure (push) Has been skipped

 Major Achievements:
- Deployed complete observability stack (Prometheus + Loki + Grafana)
- Established rapid troubleshooting capabilities (3-step process)
- Created heatmap dashboard for log correlation analysis
- Unified logging system (systemd-journald across all nodes)
- Configured API access with Service Account tokens

🧹 Project Cleanup:
- Intelligent cleanup based on Git modification frequency
- Organized files into proper directory structure
- Removed deprecated webhook deployment scripts
- Eliminated 70+ temporary/test files (43% reduction)

📊 Infrastructure Status:
- Prometheus: 13 nodes monitored
- Loki: 12 nodes logging
- Grafana: Heatmap dashboard + API access
- Promtail: Deployed to 12/13 nodes

🚀 Ready for Terraform transition (静默一周后切换)

Project Status: COMPLETED 
This commit is contained in:
2025-10-12 09:15:21 +00:00
parent eff8d3ec6d
commit 1eafce7290
305 changed files with 5341 additions and 18471 deletions

View File

@@ -0,0 +1,249 @@
job "traefik-cloudflare-v3" {
datacenters = ["dc1"]
type = "service"
group "traefik" {
count = 1
constraint {
attribute = "${node.unique.name}"
value = "hcp1"
}
volume "traefik-certs" {
type = "host"
read_only = false
source = "traefik-certs"
}
network {
mode = "host"
port "http" {
static = 80
}
port "https" {
static = 443
}
port "traefik" {
static = 8080
}
}
task "traefik" {
driver = "exec"
config {
command = "/usr/local/bin/traefik"
args = [
"--configfile=/local/traefik.yml"
]
}
env {
CLOUDFLARE_EMAIL = "locksmithknight@gmail.com"
CLOUDFLARE_DNS_API_TOKEN = "0aPWoLaQ59l0nyL1jIVzZaEx2e41Gjgcfhn3ztJr"
CLOUDFLARE_ZONE_API_TOKEN = "0aPWoLaQ59l0nyL1jIVzZaEx2e41Gjgcfhn3ztJr"
}
volume_mount {
volume = "traefik-certs"
destination = "/opt/traefik/certs"
read_only = false
}
template {
data = <<EOF
api:
dashboard: true
insecure: true
entryPoints:
web:
address: "0.0.0.0:80"
http:
redirections:
entrypoint:
to: websecure
scheme: https
permanent: true
websecure:
address: "0.0.0.0:443"
traefik:
address: "0.0.0.0:8080"
providers:
consulCatalog:
endpoint:
address: "warden.tailnet-68f9.ts.net:8500"
scheme: "http"
watch: true
exposedByDefault: false
prefix: "traefik"
defaultRule: "Host(`{{ .Name }}.git-4ta.live`)"
file:
filename: /local/dynamic.yml
watch: true
certificatesResolvers:
cloudflare:
acme:
email: {{ env "CLOUDFLARE_EMAIL" }}
storage: /opt/traefik/certs/acme.json
dnsChallenge:
provider: cloudflare
delayBeforeCheck: 30s
log:
level: DEBUG
EOF
destination = "local/traefik.yml"
}
template {
data = <<EOF
http:
serversTransports:
waypoint-insecure:
insecureSkipVerify: true
authentik-insecure:
insecureSkipVerify: true
middlewares:
consul-stripprefix:
stripPrefix:
prefixes:
- "/consul"
waypoint-auth:
replacePathRegex:
regex: "^/auth/token(.*)$"
replacement: "/auth/token$1"
services:
consul-cluster:
loadBalancer:
servers:
- url: "http://ch4.tailnet-68f9.ts.net:8500" # 韩国Leader
- url: "http://warden.tailnet-68f9.ts.net:8500" # 北京Follower
- url: "http://ash3c.tailnet-68f9.ts.net:8500" # 美国Follower
healthCheck:
path: "/v1/status/leader"
interval: "30s"
timeout: "15s"
nomad-cluster:
loadBalancer:
servers:
- url: "http://ch2.tailnet-68f9.ts.net:4646" # 韩国Leader
- url: "http://ash3c.tailnet-68f9.ts.net:4646" # 美国Follower
healthCheck:
path: "/v1/status/leader"
interval: "30s"
timeout: "15s"
waypoint-cluster:
loadBalancer:
servers:
- url: "https://hcp1.tailnet-68f9.ts.net:9701" # hcp1 节点 HTTPS API
serversTransport: waypoint-insecure
vault-cluster:
loadBalancer:
servers:
- url: "http://warden.tailnet-68f9.ts.net:8200" # 北京,单节点
healthCheck:
path: "/ui/"
interval: "30s"
timeout: "15s"
authentik-cluster:
loadBalancer:
servers:
- url: "https://authentik.tailnet-68f9.ts.net:9443" # Authentik容器HTTPS端口
serversTransport: authentik-insecure
healthCheck:
path: "/flows/-/default/authentication/"
interval: "30s"
timeout: "15s"
routers:
consul-api:
rule: "Host(`consul.git-4ta.live`)"
service: consul-cluster
middlewares:
- consul-stripprefix
entryPoints:
- websecure
tls:
certResolver: cloudflare
traefik-dashboard:
rule: "Host(`traefik.git-4ta.live`)"
service: dashboard@internal
middlewares:
- dashboard_redirect@internal
- dashboard_stripprefix@internal
entryPoints:
- websecure
tls:
certResolver: cloudflare
traefik-api:
rule: "Host(`traefik.git-4ta.live`) && PathPrefix(`/api`)"
service: api@internal
entryPoints:
- websecure
tls:
certResolver: cloudflare
nomad-ui:
rule: "Host(`nomad.git-4ta.live`)"
service: nomad-cluster
entryPoints:
- websecure
tls:
certResolver: cloudflare
waypoint-ui:
rule: "Host(`waypoint.git-4ta.live`)"
service: waypoint-cluster
entryPoints:
- websecure
tls:
certResolver: cloudflare
vault-ui:
rule: "Host(`vault.git-4ta.live`)"
service: vault-cluster
entryPoints:
- websecure
tls:
certResolver: cloudflare
authentik-ui:
rule: "Host(`authentik1.git-4ta.live`)"
service: authentik-cluster
entryPoints:
- websecure
tls:
certResolver: cloudflare
EOF
destination = "local/dynamic.yml"
}
template {
data = <<EOF
CLOUDFLARE_EMAIL=locksmithknight@gmail.com
CLOUDFLARE_DNS_API_TOKEN=0aPWoLaQ59l0nyL1jIVzZaEx2e41Gjgcfhn3ztJr
CLOUDFLARE_ZONE_API_TOKEN=0aPWoLaQ59l0nyL1jIVzZaEx2e41Gjgcfhn3ztJr
EOF
destination = "local/cloudflare.env"
env = true
}
resources {
cpu = 500
memory = 512
}
}
}
}