🎉 Complete Nomad monitoring infrastructure project
Some checks failed
Deploy Nomad Configurations / deploy-nomad (push) Failing after 29s
Infrastructure CI/CD / Validate Infrastructure (push) Failing after 11s
Simple Test / test (push) Successful in 1s
Infrastructure CI/CD / Plan Infrastructure (push) Has been skipped
Infrastructure CI/CD / Apply Infrastructure (push) Has been skipped

Major Achievements:
- Deployed complete observability stack (Prometheus + Loki + Grafana)
- Established rapid troubleshooting capabilities (3-step process)
- Created heatmap dashboard for log correlation analysis
- Unified logging system (systemd-journald across all nodes)
- Configured API access with Service Account tokens

🧹 Project Cleanup:
- Intelligent cleanup based on Git modification frequency
- Organized files into proper directory structure
- Removed deprecated webhook deployment scripts
- Eliminated 70+ temporary/test files (43% reduction)

📊 Infrastructure Status:
- Prometheus: 13 nodes monitored
- Loki: 12 nodes logging
- Grafana: Heatmap dashboard + API access
- Promtail: Deployed to 12/13 nodes

🚀 Ready for Terraform transition (switch over after a one-week quiet period)

Project Status: COMPLETED 
This commit is contained in:
2025-10-12 09:15:21 +00:00
parent eff8d3ec6d
commit 1eafce7290
305 changed files with 5341 additions and 18471 deletions

View File

@@ -0,0 +1,6 @@
{
"node_name": "ash1d",
"bind_addr": "100.81.26.3",
"node_zone": "client",
"ui_enabled": false
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "ash1d"
bind_addr = "100.81.26.3"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
retry_join = [
  "100.117.106.136", # ch4 (Korea)
  "100.122.197.112", # warden (Beijing)
  "100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
  raft_multiplier = 5
}
# Port settings
ports {
  grpc = 8502
  http = 8500
  dns = 8600
}
# Enable the Connect service mesh
connect {
  enabled = true
}
# Cache settings
cache {
  entry_fetch_max_burst = 42
  entry_fetch_rate = 30
}
# Node metadata
node_meta = {
  region = "pacific"
  zone = "client"
}
# UI settings (HCL boolean literals are lowercase true/false)
ui_config {
  enabled = false
}
# ACL settings
acl = {
  enabled = false
  default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
services {
  name = "ash1d-service"
  port = 8080
  tags = ["ash1d", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
  name = "ash1d-health"
  tcp = "100.81.26.3:8080"
  interval = "10s"
  timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "ash2e",
"bind_addr": "100.81.26.4",
"node_zone": "client",
"ui_enabled": false
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "ash2e"
bind_addr = "100.81.26.4"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
retry_join = [
  "100.117.106.136", # ch4 (Korea)
  "100.122.197.112", # warden (Beijing)
  "100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
  raft_multiplier = 5
}
# Port settings
ports {
  grpc = 8502
  http = 8500
  dns = 8600
}
# Enable the Connect service mesh
connect {
  enabled = true
}
# Cache settings
cache {
  entry_fetch_max_burst = 42
  entry_fetch_rate = 30
}
# Node metadata
node_meta = {
  region = "pacific"
  zone = "client"
}
# UI settings (HCL boolean literals are lowercase true/false)
ui_config {
  enabled = false
}
# ACL settings
acl = {
  enabled = false
  default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
services {
  name = "ash2e-service"
  port = 8080
  tags = ["ash2e", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
  name = "ash2e-health"
  tcp = "100.81.26.4:8080"
  interval = "10s"
  timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "ash3c",
"bind_addr": "100.116.80.94",
"node_zone": "server",
"ui_enabled": true
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "ash3c"
bind_addr = "100.116.80.94"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
# The last entry is this node's own bind_addr.
retry_join = [
"100.117.106.136", # ch4 (Korea)
"100.122.197.112", # warden (Beijing)
"100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
raft_multiplier = 5
}
# Port settings
ports {
grpc = 8502
http = 8500
dns = 8600
}
# Enable the Connect service mesh
connect {
enabled = true
}
# Cache settings
cache {
entry_fetch_max_burst = 42
entry_fetch_rate = 30
}
# Node metadata
node_meta = {
region = "pacific"
zone = "server"
}
# UI settings
ui_config {
enabled = true
}
# ACL settings
acl = {
enabled = false
default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
# NOTE(review): tagged "client" although node_meta zone is "server" — confirm this is intended.
services {
name = "ash3c-service"
port = 8080
tags = ["ash3c", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
name = "ash3c-health"
tcp = "100.116.80.94:8080"
interval = "10s"
timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "browser",
"bind_addr": "100.116.112.45",
"node_zone": "client",
"ui_enabled": false
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "browser"
bind_addr = "100.116.112.45"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
retry_join = [
  "100.117.106.136", # ch4 (Korea)
  "100.122.197.112", # warden (Beijing)
  "100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
  raft_multiplier = 5
}
# Port settings
ports {
  grpc = 8502
  http = 8500
  dns = 8600
}
# Enable the Connect service mesh
connect {
  enabled = true
}
# Cache settings
cache {
  entry_fetch_max_burst = 42
  entry_fetch_rate = 30
}
# Node metadata
node_meta = {
  region = "pacific"
  zone = "client"
}
# UI settings (HCL boolean literals are lowercase true/false)
ui_config {
  enabled = false
}
# ACL settings
acl = {
  enabled = false
  default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
services {
  name = "browser-service"
  port = 8080
  tags = ["browser", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
  name = "browser-health"
  tcp = "100.116.112.45:8080"
  interval = "10s"
  timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "ch2",
"bind_addr": "100.117.106.135",
"node_zone": "client",
"ui_enabled": false
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "ch2"
bind_addr = "100.117.106.135"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
retry_join = [
  "100.117.106.136", # ch4 (Korea)
  "100.122.197.112", # warden (Beijing)
  "100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
  raft_multiplier = 5
}
# Port settings
ports {
  grpc = 8502
  http = 8500
  dns = 8600
}
# Enable the Connect service mesh
connect {
  enabled = true
}
# Cache settings
cache {
  entry_fetch_max_burst = 42
  entry_fetch_rate = 30
}
# Node metadata
node_meta = {
  region = "pacific"
  zone = "client"
}
# UI settings (HCL boolean literals are lowercase true/false)
ui_config {
  enabled = false
}
# ACL settings
acl = {
  enabled = false
  default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
services {
  name = "ch2-service"
  port = 8080
  tags = ["ch2", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
  name = "ch2-health"
  tcp = "100.117.106.135:8080"
  interval = "10s"
  timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "ch3",
"bind_addr": "100.117.106.137",
"node_zone": "client",
"ui_enabled": false
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "ch3"
bind_addr = "100.117.106.137"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
retry_join = [
  "100.117.106.136", # ch4 (Korea)
  "100.122.197.112", # warden (Beijing)
  "100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
  raft_multiplier = 5
}
# Port settings
ports {
  grpc = 8502
  http = 8500
  dns = 8600
}
# Enable the Connect service mesh
connect {
  enabled = true
}
# Cache settings
cache {
  entry_fetch_max_burst = 42
  entry_fetch_rate = 30
}
# Node metadata
node_meta = {
  region = "pacific"
  zone = "client"
}
# UI settings (HCL boolean literals are lowercase true/false)
ui_config {
  enabled = false
}
# ACL settings
acl = {
  enabled = false
  default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
services {
  name = "ch3-service"
  port = 8080
  tags = ["ch3", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
  name = "ch3-health"
  tcp = "100.117.106.137:8080"
  interval = "10s"
  timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "ch4",
"bind_addr": "100.117.106.134",
"node_zone": "server",
"ui_enabled": true
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "ch4"
bind_addr = "100.117.106.134"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the first entry is labelled ch4 but is 100.117.106.136, while this
# node's bind_addr above is 100.117.106.134 — confirm which address is correct.
retry_join = [
"100.117.106.136", # ch4 (Korea)
"100.122.197.112", # warden (Beijing)
"100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
raft_multiplier = 5
}
# Port settings
ports {
grpc = 8502
http = 8500
dns = 8600
}
# Enable the Connect service mesh
connect {
enabled = true
}
# Cache settings
cache {
entry_fetch_max_burst = 42
entry_fetch_rate = 30
}
# Node metadata
node_meta = {
region = "pacific"
zone = "server"
}
# UI settings
ui_config {
enabled = true
}
# ACL settings
acl = {
enabled = false
default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
# NOTE(review): tagged "client" although node_meta zone is "server" — confirm this is intended.
services {
name = "ch4-service"
port = 8080
tags = ["ch4", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
name = "ch4-health"
tcp = "100.117.106.134:8080"
interval = "10s"
timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "de",
"bind_addr": "100.98.209.52",
"node_zone": "client",
"ui_enabled": false
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "de"
bind_addr = "100.98.209.52"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
retry_join = [
  "100.117.106.136", # ch4 (Korea)
  "100.122.197.112", # warden (Beijing)
  "100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
  raft_multiplier = 5
}
# Port settings
ports {
  grpc = 8502
  http = 8500
  dns = 8600
}
# Enable the Connect service mesh
connect {
  enabled = true
}
# Cache settings
cache {
  entry_fetch_max_burst = 42
  entry_fetch_rate = 30
}
# Node metadata
node_meta = {
  region = "pacific"
  zone = "client"
}
# UI settings (HCL boolean literals are lowercase true/false)
ui_config {
  enabled = false
}
# ACL settings
acl = {
  enabled = false
  default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
services {
  name = "de-service"
  port = 8080
  tags = ["de", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
  name = "de-health"
  tcp = "100.98.209.52:8080"
  interval = "10s"
  timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "hcp1",
"bind_addr": "100.116.112.46",
"node_zone": "client",
"ui_enabled": false
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "hcp1"
bind_addr = "100.116.112.46"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
retry_join = [
  "100.117.106.136", # ch4 (Korea)
  "100.122.197.112", # warden (Beijing)
  "100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
  raft_multiplier = 5
}
# Port settings
ports {
  grpc = 8502
  http = 8500
  dns = 8600
}
# Enable the Connect service mesh
connect {
  enabled = true
}
# Cache settings
cache {
  entry_fetch_max_burst = 42
  entry_fetch_rate = 30
}
# Node metadata
node_meta = {
  region = "pacific"
  zone = "client"
}
# UI settings (HCL boolean literals are lowercase true/false)
ui_config {
  enabled = false
}
# ACL settings
acl = {
  enabled = false
  default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
services {
  name = "hcp1-service"
  port = 8080
  tags = ["hcp1", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
  name = "hcp1-health"
  tcp = "100.116.112.46:8080"
  interval = "10s"
  timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "influxdb",
"bind_addr": "100.116.112.47",
"node_zone": "client",
"ui_enabled": false
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "influxdb"
bind_addr = "100.116.112.47"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
retry_join = [
  "100.117.106.136", # ch4 (Korea)
  "100.122.197.112", # warden (Beijing)
  "100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
  raft_multiplier = 5
}
# Port settings
ports {
  grpc = 8502
  http = 8500
  dns = 8600
}
# Enable the Connect service mesh
connect {
  enabled = true
}
# Cache settings
cache {
  entry_fetch_max_burst = 42
  entry_fetch_rate = 30
}
# Node metadata
node_meta = {
  region = "pacific"
  zone = "client"
}
# UI settings (HCL boolean literals are lowercase true/false)
ui_config {
  enabled = false
}
# ACL settings
acl = {
  enabled = false
  default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
services {
  name = "influxdb-service"
  port = 8080
  tags = ["influxdb", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
  name = "influxdb-health"
  tcp = "100.116.112.47:8080"
  interval = "10s"
  timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "onecloud1",
"bind_addr": "100.98.209.53",
"node_zone": "client",
"ui_enabled": false
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "onecloud1"
bind_addr = "100.98.209.53"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
retry_join = [
  "100.117.106.136", # ch4 (Korea)
  "100.122.197.112", # warden (Beijing)
  "100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
  raft_multiplier = 5
}
# Port settings
ports {
  grpc = 8502
  http = 8500
  dns = 8600
}
# Enable the Connect service mesh
connect {
  enabled = true
}
# Cache settings
cache {
  entry_fetch_max_burst = 42
  entry_fetch_rate = 30
}
# Node metadata
node_meta = {
  region = "pacific"
  zone = "client"
}
# UI settings (HCL boolean literals are lowercase true/false)
ui_config {
  enabled = false
}
# ACL settings
acl = {
  enabled = false
  default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
services {
  name = "onecloud1-service"
  port = 8080
  tags = ["onecloud1", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
  name = "onecloud1-health"
  tcp = "100.98.209.53:8080"
  interval = "10s"
  timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "semaphore",
"bind_addr": "100.98.209.54",
"node_zone": "client",
"ui_enabled": false
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "semaphore"
bind_addr = "100.98.209.54"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
retry_join = [
  "100.117.106.136", # ch4 (Korea)
  "100.122.197.112", # warden (Beijing)
  "100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
  raft_multiplier = 5
}
# Port settings
ports {
  grpc = 8502
  http = 8500
  dns = 8600
}
# Enable the Connect service mesh
connect {
  enabled = true
}
# Cache settings
cache {
  entry_fetch_max_burst = 42
  entry_fetch_rate = 30
}
# Node metadata
node_meta = {
  region = "pacific"
  zone = "client"
}
# UI settings (HCL boolean literals are lowercase true/false)
ui_config {
  enabled = false
}
# ACL settings
acl = {
  enabled = false
  default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
services {
  name = "semaphore-service"
  port = 8080
  tags = ["semaphore", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
  name = "semaphore-health"
  tcp = "100.98.209.54:8080"
  interval = "10s"
  timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)

View File

@@ -0,0 +1,6 @@
{
"node_name": "warden",
"bind_addr": "100.122.197.112",
"node_zone": "server",
"ui_enabled": true
}

View File

@@ -0,0 +1,81 @@
# Consul client configuration template
# Applies to all 13 nodes; servers are taken over by Nomad
# Basic settings
datacenter = "pacific"
data_dir = "/opt/consul/data"
log_level = "INFO"
node_name = "warden"
bind_addr = "100.122.197.112"
# Client mode; servers are taken over by Nomad
server = false
# Join the Consul server cluster
# NOTE(review): the ch4 node config in this commit sets bind_addr 100.117.106.134,
# not .136 as listed below — confirm which address is correct.
# The second entry is this node's own bind_addr.
retry_join = [
"100.117.106.136", # ch4 (Korea)
"100.122.197.112", # warden (Beijing)
"100.116.80.94" # ash3c (USA)
]
# Performance tuning
performance {
raft_multiplier = 5
}
# Port settings
ports {
grpc = 8502
http = 8500
dns = 8600
}
# Enable the Connect service mesh
connect {
enabled = true
}
# Cache settings
cache {
entry_fetch_max_burst = 42
entry_fetch_rate = 30
}
# Node metadata
node_meta = {
region = "pacific"
zone = "server"
}
# UI settings
ui_config {
enabled = true
}
# ACL settings
acl = {
enabled = false
default_policy = "allow"
}
# Logging settings
log_file = "/var/log/consul/consul.log"
log_rotate_duration = "24h"
log_rotate_max_files = 7
# Service discovery
# NOTE(review): tagged "client" although node_meta zone is "server" — confirm this is intended.
services {
name = "warden-service"
port = 8080
tags = ["warden", "client"]
}
# Health check: TCP probe against the service port on this node's bind address
checks {
name = "warden-health"
tcp = "100.122.197.112:8080"
interval = "10s"
timeout = "3s"
}
# Auto-encrypt (no settings are defined under this heading)