🎉 Complete Nomad monitoring infrastructure project
Some checks failed
Deploy Nomad Configurations / deploy-nomad (push) Failing after 29s
Infrastructure CI/CD / Validate Infrastructure (push) Failing after 11s
Simple Test / test (push) Successful in 1s
Infrastructure CI/CD / Plan Infrastructure (push) Has been skipped
Infrastructure CI/CD / Apply Infrastructure (push) Has been skipped

 Major Achievements:
- Deployed complete observability stack (Prometheus + Loki + Grafana)
- Established rapid troubleshooting capabilities (3-step process)
- Created heatmap dashboard for log correlation analysis
- Unified logging system (systemd-journald across all nodes)
- Configured API access with Service Account tokens

🧹 Project Cleanup:
- Intelligent cleanup based on Git modification frequency
- Organized files into proper directory structure
- Removed deprecated webhook deployment scripts
- Eliminated 70+ temporary/test files (43% reduction)

📊 Infrastructure Status:
- Prometheus: 13 nodes monitored
- Loki: 12 nodes logging
- Grafana: Heatmap dashboard + API access
- Promtail: Deployed to 12/13 nodes

🚀 Ready for Terraform transition (静默一周后切换 — switch over after a one-week quiet period)

Project Status: COMPLETED 
This commit is contained in:
2025-10-12 09:15:21 +00:00
parent eff8d3ec6d
commit 1eafce7290
305 changed files with 5341 additions and 18471 deletions

View File

@@ -0,0 +1,39 @@
# Loki configuration: one process, filesystem storage, in-memory ring.
auth_enabled: false

analytics:
  reporting_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  path_prefix: /var/lib/loki
  replication_factor: 1
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory
  storage:
    filesystem:
      chunks_directory: /var/lib/loki/chunks
      rules_directory: /var/lib/loki/rules

schema_config:
  configs:
    # A single schema period: boltdb-shipper index (24h tables) with chunks
    # on the local filesystem.
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v12
      index:
        prefix: index_
        period: 24h

limits_config:
  # NOTE(review): presumably disabled because the schema above is not
  # tsdb/v13 - confirm against the deployed Loki version.
  allow_structured_metadata: false

query_scheduler:
  max_outstanding_requests_per_tenant: 2048

ruler:
  alertmanager_url: http://localhost:9093

View File

@@ -0,0 +1,5 @@
# Node Exporter configuration file
# The default configuration is sufficient; the main parameters are passed on the command line
# If custom configuration is needed, it can be added here
# Currently using the default configuration + command-line arguments

View File

@@ -0,0 +1,61 @@
# Prometheus configuration - monitors the Nomad cluster
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'nomad-cluster'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Load rules once and periodically evaluate them according to the global
# 'evaluation_interval'. No rule files are enabled yet.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape jobs: Prometheus itself, node_exporter on client and server nodes,
# and the Nomad and Consul agents.
scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    scrape_interval: 5s
    scrape_timeout: 5s
    static_configs:
      - targets: ['localhost:9090']

  # Node Exporter - client nodes
  - job_name: 'node-clients'
    static_configs:
      - targets:
          - 'ch4.tailnet-68f9.ts.net:9100'
          - 'ash3c.tailnet-68f9.ts.net:9100'
          - 'warden.tailnet-68f9.ts.net:9100'
          - 'hcp1.tailnet-68f9.ts.net:9100'
          - 'browser.tailnet-68f9.ts.net:9100'

  # Node Exporter - server nodes
  - job_name: 'node-servers'
    static_configs:
      - targets:
          - 'ash2e.tailnet-68f9.ts.net:9100'
          - 'ch2.tailnet-68f9.ts.net:9100'
          - 'ch3.tailnet-68f9.ts.net:9100'
          - 'onecloud1.tailnet-68f9.ts.net:9100'

  # Nomad cluster monitoring.
  # Nomad does not serve the default /metrics path; Prometheus-format metrics
  # come from /v1/metrics?format=prometheus.
  # NOTE(review): requires `prometheus_metrics = true` in the agents'
  # telemetry block - confirm on the target nodes.
  - job_name: 'nomad'
    metrics_path: /v1/metrics
    params:
      format: ['prometheus']
    static_configs:
      - targets:
          - 'ash1.tailnet-68f9.ts.net:4646'
          - 'ash2.tailnet-68f9.ts.net:4646'
          - 'onecloud1.tailnet-68f9.ts.net:4646'

  # Consul cluster monitoring.
  # Consul does not serve the default /metrics path either; Prometheus-format
  # metrics come from /v1/agent/metrics?format=prometheus.
  # NOTE(review): requires a non-zero `prometheus_retention_time` in the
  # agents' telemetry block (and a token if ACLs are enforced) - confirm.
  - job_name: 'consul'
    metrics_path: /v1/agent/metrics
    params:
      format: ['prometheus']
    static_configs:
      - targets:
          - 'ash1.tailnet-68f9.ts.net:8500'
          - 'ash2.tailnet-68f9.ts.net:8500'
          - 'onecloud1.tailnet-68f9.ts.net:8500'

View File

@@ -0,0 +1,39 @@
# Promtail configuration: forwards the systemd journal and two plain log
# files to the Loki push endpoint listed under `clients`.
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /opt/promtail/data/positions.yaml

clients:
  - url: http://influxdb.tailnet-68f9.ts.net:3100/loki/api/v1/push

scrape_configs:
  # systemd journal: entries up to 12h old are read; the unit, priority
  # keyword and hostname journal fields are copied onto the stream as
  # `unit`, `level` and `hostname`.
  - job_name: journal
    journal:
      max_age: 12h
      labels:
        job: systemd-journal
    relabel_configs:
      - source_labels: [__journal__systemd_unit]
        target_label: unit
      - source_labels: [__journal_priority_keyword]
        target_label: level
      - source_labels: [__journal__hostname]
        target_label: hostname

  # Plain log files.
  # NOTE(review): these two jobs attach no per-host label, so streams from
  # different nodes share the same label set - confirm this is intended.
  - job_name: syslog
    static_configs:
      - targets: [localhost]
        labels:
          job: syslog
          __path__: /var/log/syslog

  - job_name: daemon
    static_configs:
      - targets: [localhost]
        labels:
          job: daemon
          __path__: /var/log/daemon.log

View File

@@ -0,0 +1,23 @@
# Promtail configuration (journal only): forwards the systemd journal to the
# Loki push endpoint listed under `clients`.
server:
  http_listen_port: 9082
  grpc_listen_port: 0

# NOTE(review): the positions file lives under /tmp; if /tmp is cleared
# (e.g. on reboot) the read offset is lost - confirm this is acceptable.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://influxdb.tailnet-68f9.ts.net:3100/loki/api/v1/push

scrape_configs:
  # Journal entries up to 12h old are read; the unit, priority keyword and
  # hostname journal fields become the `unit`, `level` and `hostname` labels.
  - job_name: journal
    journal:
      max_age: 12h
      labels:
        job: systemd-journal
    relabel_configs:
      - source_labels: [__journal__systemd_unit]
        target_label: unit
      - source_labels: [__journal_priority_keyword]
        target_label: level
      - source_labels: [__journal__hostname]
        target_label: hostname

View File

@@ -0,0 +1,392 @@
{
"dashboard": {
"id": null,
"title": "Loki 日志热点图 Demo",
"tags": ["loki", "heatmap", "demo"],
"style": "dark",
"timezone": "browser",
"panels": [
{
  "id": 1,
  "title": "日志级别热点图 (类似GitHub贡献图)",
  "type": "heatmap",
  "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "loki", "uid": "loki"},
      "expr": "sum by (level) (rate({job=\"systemd-journal\"}[5m]))",
      "legendFormat": "{{level}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "palette-classic"},
      "custom": {
        "hideFrom": {"legend": false, "tooltip": false, "vis": false},
        "scaleDistribution": {"type": "linear"}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null},
          {"color": "yellow", "value": 1},
          {"color": "red", "value": 10}
        ]
      }
    }
  },
  "options": {
    "calculate": false,
    "cellGap": 2,
    "cellValues": {"unit": "short"},
    "color": {
      "exponent": 0.5,
      "fill": "dark-orange",
      "mode": "spectrum",
      "reverse": false,
      "scale": "exponential",
      "scheme": "Spectral",
      "steps": 64
    },
    "exemplars": {"color": "rgba(255,0,255,0.7)"},
    "filterValues": {"le": 1e-9},
    "legend": {"show": true},
    "rowsFrame": {"layout": "auto"},
    "tooltip": {"show": true, "yHistogram": false},
    "yAxis": {"axisPlacement": "left", "reverse": false, "unit": "short"}
  }
},
{
  "id": 2,
  "title": "节点日志密度热点图",
  "type": "heatmap",
  "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "loki", "uid": "loki"},
      "expr": "sum by (hostname) (rate({job=\"systemd-journal\"}[5m]))",
      "legendFormat": "{{hostname}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "palette-classic"},
      "custom": {
        "hideFrom": {"legend": false, "tooltip": false, "vis": false}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null},
          {"color": "yellow", "value": 5},
          {"color": "red", "value": 20}
        ]
      }
    }
  },
  "options": {
    "calculate": false,
    "cellGap": 2,
    "cellValues": {"unit": "short"},
    "color": {
      "exponent": 0.5,
      "fill": "dark-orange",
      "mode": "spectrum",
      "reverse": false,
      "scale": "exponential",
      "scheme": "Spectral",
      "steps": 64
    },
    "exemplars": {"color": "rgba(255,0,255,0.7)"},
    "filterValues": {"le": 1e-9},
    "legend": {"show": true},
    "rowsFrame": {"layout": "auto"},
    "tooltip": {"show": true, "yHistogram": false},
    "yAxis": {"axisPlacement": "left", "reverse": false, "unit": "short"}
  }
},
{
  "id": 3,
  "title": "关键服务日志热点图 (Nomad/Consul/Traefik)",
  "type": "heatmap",
  "gridPos": {"h": 8, "w": 24, "x": 0, "y": 8},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "loki", "uid": "loki"},
      "expr": "sum by (unit) (rate({job=\"systemd-journal\", unit=~\"(nomad|consul|traefik)([.]service)?\"}[5m]))",
      "legendFormat": "{{unit}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "palette-classic"},
      "custom": {
        "hideFrom": {"legend": false, "tooltip": false, "vis": false}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null},
          {"color": "yellow", "value": 1},
          {"color": "red", "value": 5}
        ]
      }
    }
  },
  "options": {
    "calculate": false,
    "cellGap": 2,
    "cellValues": {"unit": "short"},
    "color": {
      "exponent": 0.5,
      "fill": "dark-orange",
      "mode": "spectrum",
      "reverse": false,
      "scale": "exponential",
      "scheme": "Spectral",
      "steps": 64
    },
    "exemplars": {"color": "rgba(255,0,255,0.7)"},
    "filterValues": {"le": 1e-9},
    "legend": {"show": true},
    "rowsFrame": {"layout": "auto"},
    "tooltip": {"show": true, "yHistogram": false},
    "yAxis": {"axisPlacement": "left", "reverse": false, "unit": "short"}
  }
},
{
  "id": 4,
  "title": "ERROR/CRIT 级别日志热点图 (黑匣子重点)",
  "type": "heatmap",
  "gridPos": {"h": 8, "w": 24, "x": 0, "y": 16},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "loki", "uid": "loki"},
      "expr": "sum by (hostname, level) (rate({job=\"systemd-journal\", level=~\"error|crit\"}[5m]))",
      "legendFormat": "{{hostname}} - {{level}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "palette-classic"},
      "custom": {
        "hideFrom": {"legend": false, "tooltip": false, "vis": false}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null},
          {"color": "orange", "value": 0.1},
          {"color": "red", "value": 1}
        ]
      }
    }
  },
  "options": {
    "calculate": false,
    "cellGap": 2,
    "cellValues": {"unit": "short"},
    "color": {
      "exponent": 0.5,
      "fill": "dark-orange",
      "mode": "spectrum",
      "reverse": false,
      "scale": "exponential",
      "scheme": "Spectral",
      "steps": 64
    },
    "exemplars": {"color": "rgba(255,0,255,0.7)"},
    "filterValues": {"le": 1e-9},
    "legend": {"show": true},
    "rowsFrame": {"layout": "auto"},
    "tooltip": {"show": true, "yHistogram": false},
    "yAxis": {"axisPlacement": "left", "reverse": false, "unit": "short"}
  }
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"templating": {
"list": []
},
"annotations": {
"list": []
},
"refresh": "30s",
"schemaVersion": 27,
"version": 1
}
}

View File

@@ -0,0 +1,59 @@
---
# Deploys Promtail to every inventory host: installs the package, prepares
# the service account and directories, renders the configuration and makes
# sure the systemd service is enabled and running.
- name: Deploy Promtail to all nodes
  hosts: all
  become: true
  vars:
    promtail_config_dir: /etc/promtail
    promtail_config_path: "{{ promtail_config_dir }}/promtail.yml"
    promtail_data_path: /opt/promtail/data

  tasks:
    # Install failures are deliberately tolerated (kept from the original).
    # NOTE(review): presumably some hosts get the binary by other means;
    # if not, drop `ignore_errors` so a failed install stops the play here.
    - name: Install promtail
      apt:
        name: promtail
        state: present
        update_cache: true
      ignore_errors: true

    # Account creation and `adm` membership (for syslog access) happen in one
    # step. A running Promtail only picks up new group membership after a
    # restart, hence the notify.
    - name: Create promtail user and group (member of adm for syslog access)
      user:
        name: promtail
        system: true
        shell: /bin/false
        home: /opt/promtail
        create_home: true
        groups: adm
        append: true
      notify: restart promtail

    # The template task below fails if the target directory does not exist,
    # so do not depend on the package having created it.
    - name: Create promtail config directory
      file:
        path: "{{ promtail_config_dir }}"
        state: directory
        mode: '0755'

    - name: Create promtail data directory
      file:
        path: "{{ promtail_data_path }}"
        state: directory
        owner: promtail
        group: promtail
        mode: '0755'

    # NOTE(review): confirm the systemd unit actually starts Promtail with
    # this config path; a packaged unit may point at a different file.
    - name: Copy promtail configuration
      template:
        src: promtail-config.yaml
        dest: "{{ promtail_config_path }}"
        owner: promtail
        group: promtail
        mode: '0644'
      notify: restart promtail

    - name: Enable and start promtail service
      systemd:
        name: promtail
        enabled: true
        state: started
        daemon_reload: true

  handlers:
    - name: restart promtail
      systemd:
        name: promtail
        state: restarted

View File

@@ -0,0 +1,258 @@
# Monitoring stack: Grafana, Prometheus and Loki run as three single-instance
# groups of one service job. Every group is constrained to the node named
# "influxdb" and mounts a host volume for its data.
job "monitoring-stack" {
  type        = "service"
  datacenters = ["dc1"]

  # Grafana service group
  group "grafana" {
    count = 1

    constraint {
      attribute = "${node.unique.name}"
      operator  = "="
      value     = "influxdb"
    }

    network {
      port "http" {
        static = 3000
        to     = 3000
      }
    }

    volume "grafana-data" {
      type      = "host"
      source    = "grafana-data"
      read_only = false
    }

    task "grafana" {
      driver = "exec"

      config {
        command = "/usr/sbin/grafana-server"
        args = [
          "--config", "/etc/grafana/grafana.ini",
          "--homepath", "/usr/share/grafana",
          "cfg:default.paths.data=/opt/grafana/data",
          "cfg:default.paths.logs=/var/log/grafana",
          "cfg:default.paths.plugins=/var/lib/grafana/plugins",
          "cfg:default.paths.provisioning=/etc/grafana/provisioning"
        ]
      }

      volume_mount {
        volume      = "grafana-data"
        destination = "/opt/grafana/data"
        read_only   = false
      }

      env {
        # NOTE(review): the admin password is committed in plain text; move it
        # to a secret store and rotate it.
        GF_SECURITY_ADMIN_PASSWORD = "admin123"
        # NOTE(review): GF_INSTALL_PLUGINS is documented for the official
        # Docker image; confirm it has any effect under the exec driver.
        GF_INSTALL_PLUGINS = "grafana-piechart-panel"
        GF_SERVER_DOMAIN   = "grafana.tailnet-68f9.ts.net"
        GF_SERVER_ROOT_URL = "http://grafana.tailnet-68f9.ts.net:3000"
      }

      resources {
        cpu    = 300
        memory = 512
      }

      service {
        name = "grafana"
        port = "http"
        tags = ["grafana", "monitoring", "dashboard"]

        check {
          type     = "http"
          path     = "/api/health"
          interval = "30s"
          timeout  = "5s"
        }
      }
    }
  }

  # Prometheus service group
  group "prometheus" {
    count = 1

    constraint {
      attribute = "${node.unique.name}"
      operator  = "="
      value     = "influxdb"
    }

    network {
      port "http" {
        static = 9090
        to     = 9090
      }
    }

    volume "prometheus-data" {
      type      = "host"
      source    = "prometheus-data"
      read_only = false
    }

    task "prometheus" {
      driver = "exec"

      config {
        command = "prometheus"
        args = [
          "--config.file=/etc/prometheus/prometheus.yml",
          "--storage.tsdb.path=/opt/prometheus/data",
          "--web.console.libraries=/usr/share/prometheus/console_libraries",
          "--web.console.templates=/usr/share/prometheus/consoles",
          "--storage.tsdb.retention.time=15d",
          "--web.enable-lifecycle"
        ]
      }

      volume_mount {
        volume      = "prometheus-data"
        destination = "/opt/prometheus/data"
        read_only   = false
      }

      resources {
        cpu    = 300
        memory = 512
      }

      service {
        name = "prometheus"
        port = "http"
        tags = ["prometheus", "monitoring", "metrics"]

        check {
          type     = "http"
          path     = "/-/healthy"
          interval = "30s"
          timeout  = "5s"
        }
      }
    }
  }

  # Loki service group
  group "loki" {
    count = 1

    constraint {
      attribute = "${node.unique.name}"
      operator  = "="
      value     = "influxdb"
    }

    network {
      port "http" {
        static = 3100
        to     = 3100
      }
    }

    volume "loki-data" {
      type      = "host"
      source    = "loki-data"
      read_only = false
    }

    task "loki" {
      driver = "exec"

      config {
        command = "loki"
        args = [
          "-config.file=local/config.yml"
        ]
      }

      volume_mount {
        volume      = "loki-data"
        destination = "/opt/loki/data"
        read_only   = false
      }

      # Loki configuration rendered into the task directory; all storage
      # paths point at the mounted volume (/opt/loki/data).
      template {
        destination = "local/config.yml"
        data        = <<EOF
auth_enabled: false
server:
  http_listen_port: 3100
  grpc_listen_port: 9096
common:
  path_prefix: /opt/loki/data
  storage:
    filesystem:
      chunks_directory: /opt/loki/data/chunks
      rules_directory: /opt/loki/data/rules
  replication_factor: 1
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory
query_scheduler:
  max_outstanding_requests_per_tenant: 2048
schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v12
      index:
        prefix: index_
        period: 24h
limits_config:
  allow_structured_metadata: false
ruler:
  alertmanager_url: http://localhost:9093
analytics:
  reporting_enabled: false
EOF
      }

      resources {
        cpu    = 300
        memory = 512
      }

      service {
        name = "loki"
        port = "http"
        tags = ["loki", "monitoring", "logs"]

        check {
          type     = "http"
          path     = "/ready"
          interval = "30s"
          timeout  = "5s"
        }
      }
    }
  }
}

View File

@@ -7,32 +7,63 @@ rule_files:
# - "second_rules.yml"
scrape_configs:
# Prometheus 自身监控
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'openfaas'
static_configs:
- targets: ['gateway:8080']
metrics_path: /metrics
scrape_interval: 15s
scrape_timeout: 10s
- job_name: 'nats'
static_configs:
- targets: ['nats:8222']
metrics_path: /metrics
scrape_interval: 15s
scrape_timeout: 10s
# Node Exporter 监控 - 所有节点
- job_name: 'node-exporter'
static_configs:
- targets: ['node-exporter:9100']
scrape_interval: 15s
scrape_timeout: 10s
- targets:
- 'semaphore.tailnet-68f9.ts.net:9100'
- 'ash1d.tailnet-68f9.ts.net:9100'
- 'ash2e.tailnet-68f9.ts.net:9100'
- 'ash3c.tailnet-68f9.ts.net:9100'
- 'ch2.tailnet-68f9.ts.net:9100'
- 'ch3.tailnet-68f9.ts.net:9100'
- 'ch4.tailnet-68f9.ts.net:9100'
- 'de.tailnet-68f9.ts.net:9100'
- 'hcp1.tailnet-68f9.ts.net:9100'
- 'influxdb.tailnet-68f9.ts.net:9100'
- 'onecloud1.tailnet-68f9.ts.net:9100'
- 'warden.tailnet-68f9.ts.net:9100'
- 'browser.tailnet-68f9.ts.net:9100'
- job_name: 'cadvisor'
# Consul 监控
- job_name: 'consul'
static_configs:
- targets: ['cadvisor:8080']
scrape_interval: 15s
scrape_timeout: 10s
- targets:
- 'ch4.tailnet-68f9.ts.net:8500'
- 'ash3c.tailnet-68f9.ts.net:8500'
- 'warden.tailnet-68f9.ts.net:8500'
# Nomad 监控
- job_name: 'nomad'
static_configs:
- targets:
- 'semaphore.tailnet-68f9.ts.net:4646'
- 'ash1d.tailnet-68f9.ts.net:4646'
- 'ash2e.tailnet-68f9.ts.net:4646'
- 'ch2.tailnet-68f9.ts.net:4646'
- 'ch3.tailnet-68f9.ts.net:4646'
- 'onecloud1.tailnet-68f9.ts.net:4646'
- 'de.tailnet-68f9.ts.net:4646'
# Vault 监控
- job_name: 'vault'
static_configs:
- targets:
- 'master.tailnet-68f9.ts.net:8200'
- 'ash3c.tailnet-68f9.ts.net:8200'
- 'hcp1.tailnet-68f9.ts.net:8200'
# InfluxDB 监控
- job_name: 'influxdb'
static_configs:
- targets: ['influxdb1.tailnet-68f9.ts.net:8086']
# Traefik 监控
- job_name: 'traefik'
static_configs:
- targets: ['hcp1.tailnet-68f9.ts.net:8080']