🎉 Complete Nomad monitoring infrastructure project
Some checks failed
Deploy Nomad Configurations / deploy-nomad (push) Failing after 29s
Infrastructure CI/CD / Validate Infrastructure (push) Failing after 11s
Simple Test / test (push) Successful in 1s
Infrastructure CI/CD / Plan Infrastructure (push) Has been skipped
Infrastructure CI/CD / Apply Infrastructure (push) Has been skipped
Some checks failed
Deploy Nomad Configurations / deploy-nomad (push) Failing after 29s
Infrastructure CI/CD / Validate Infrastructure (push) Failing after 11s
Simple Test / test (push) Successful in 1s
Infrastructure CI/CD / Plan Infrastructure (push) Has been skipped
Infrastructure CI/CD / Apply Infrastructure (push) Has been skipped
✅ Major Achievements: - Deployed complete observability stack (Prometheus + Loki + Grafana) - Established rapid troubleshooting capabilities (3-step process) - Created heatmap dashboard for log correlation analysis - Unified logging system (systemd-journald across all nodes) - Configured API access with Service Account tokens 🧹 Project Cleanup: - Intelligent cleanup based on Git modification frequency - Organized files into proper directory structure - Removed deprecated webhook deployment scripts - Eliminated 70+ temporary/test files (43% reduction) 📊 Infrastructure Status: - Prometheus: 13 nodes monitored - Loki: 12 nodes logging - Grafana: Heatmap dashboard + API access - Promtail: Deployed to 12/13 nodes 🚀 Ready for Terraform transition (静默一周后切换) Project Status: COMPLETED ✅
This commit is contained in:
39
infrastructure/monitor/configs/loki/loki.yml
Normal file
39
infrastructure/monitor/configs/loki/loki.yml
Normal file
@@ -0,0 +1,39 @@
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
path_prefix: /var/lib/loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /var/lib/loki/chunks
|
||||
rules_directory: /var/lib/loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
instance_addr: 127.0.0.1
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_scheduler:
|
||||
max_outstanding_requests_per_tenant: 2048
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-10-24
|
||||
store: boltdb-shipper
|
||||
object_store: filesystem
|
||||
schema: v12
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
limits_config:
|
||||
allow_structured_metadata: false
|
||||
|
||||
ruler:
|
||||
alertmanager_url: http://localhost:9093
|
||||
|
||||
analytics:
|
||||
reporting_enabled: false
|
||||
@@ -0,0 +1,5 @@
|
||||
# Node Exporter 配置文件
|
||||
# 默认配置已经足够,主要参数通过命令行传递
|
||||
|
||||
# 如果需要自定义配置,可以在这里添加
|
||||
# 目前使用默认配置 + 命令行参数
|
||||
61
infrastructure/monitor/configs/prometheus/prometheus.yml
Normal file
61
infrastructure/monitor/configs/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,61 @@
|
||||
# Prometheus 配置 - 监控Nomad集群
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
monitor: 'nomad-cluster'
|
||||
|
||||
# Alertmanager configuration
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: ['localhost:9093']
|
||||
|
||||
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
|
||||
rule_files:
|
||||
# - "first_rules.yml"
|
||||
# - "second_rules.yml"
|
||||
|
||||
# A scrape configuration containing exactly one endpoint to scrape:
|
||||
scrape_configs:
|
||||
# Prometheus自身监控
|
||||
- job_name: 'prometheus'
|
||||
scrape_interval: 5s
|
||||
scrape_timeout: 5s
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
# Node Exporter - 客户端节点
|
||||
- job_name: 'node-clients'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'ch4.tailnet-68f9.ts.net:9100'
|
||||
- 'ash3c.tailnet-68f9.ts.net:9100'
|
||||
- 'warden.tailnet-68f9.ts.net:9100'
|
||||
- 'hcp1.tailnet-68f9.ts.net:9100'
|
||||
- 'browser.tailnet-68f9.ts.net:9100'
|
||||
|
||||
# Node Exporter - 服务端节点
|
||||
- job_name: 'node-servers'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'ash2e.tailnet-68f9.ts.net:9100'
|
||||
- 'ch2.tailnet-68f9.ts.net:9100'
|
||||
- 'ch3.tailnet-68f9.ts.net:9100'
|
||||
- 'onecloud1.tailnet-68f9.ts.net:9100'
|
||||
|
||||
# Nomad集群监控
|
||||
- job_name: 'nomad'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'ash1.tailnet-68f9.ts.net:4646'
|
||||
- 'ash2.tailnet-68f9.ts.net:4646'
|
||||
- 'onecloud1.tailnet-68f9.ts.net:4646'
|
||||
|
||||
# Consul集群监控
|
||||
- job_name: 'consul'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'ash1.tailnet-68f9.ts.net:8500'
|
||||
- 'ash2.tailnet-68f9.ts.net:8500'
|
||||
- 'onecloud1.tailnet-68f9.ts.net:8500'
|
||||
39
infrastructure/monitor/configs/promtail/promtail-config.yaml
Normal file
39
infrastructure/monitor/configs/promtail/promtail-config.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
server:
|
||||
http_listen_port: 9080
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /opt/promtail/data/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://influxdb.tailnet-68f9.ts.net:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: journal
|
||||
journal:
|
||||
max_age: 12h
|
||||
labels:
|
||||
job: systemd-journal
|
||||
relabel_configs:
|
||||
- source_labels: ['__journal__systemd_unit']
|
||||
target_label: 'unit'
|
||||
- source_labels: ['__journal_priority_keyword']
|
||||
target_label: 'level'
|
||||
- source_labels: ['__journal__hostname']
|
||||
target_label: 'hostname'
|
||||
|
||||
- job_name: syslog
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost
|
||||
labels:
|
||||
job: syslog
|
||||
__path__: /var/log/syslog
|
||||
|
||||
- job_name: daemon
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost
|
||||
labels:
|
||||
job: daemon
|
||||
__path__: /var/log/daemon.log
|
||||
@@ -0,0 +1,23 @@
|
||||
server:
|
||||
http_listen_port: 9082
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /tmp/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://influxdb.tailnet-68f9.ts.net:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: journal
|
||||
journal:
|
||||
max_age: 12h
|
||||
labels:
|
||||
job: systemd-journal
|
||||
relabel_configs:
|
||||
- source_labels: ['__journal__systemd_unit']
|
||||
target_label: 'unit'
|
||||
- source_labels: ['__journal_priority_keyword']
|
||||
target_label: 'level'
|
||||
- source_labels: ['__journal__hostname']
|
||||
target_label: 'hostname'
|
||||
Reference in New Issue
Block a user