🎉 Complete Nomad monitoring infrastructure project
Some checks failed
Deploy Nomad Configurations / deploy-nomad (push) Failing after 29s
Infrastructure CI/CD / Validate Infrastructure (push) Failing after 11s
Simple Test / test (push) Successful in 1s
Infrastructure CI/CD / Plan Infrastructure (push) Has been skipped
Infrastructure CI/CD / Apply Infrastructure (push) Has been skipped
Some checks failed
Deploy Nomad Configurations / deploy-nomad (push) Failing after 29s
Infrastructure CI/CD / Validate Infrastructure (push) Failing after 11s
Simple Test / test (push) Successful in 1s
Infrastructure CI/CD / Plan Infrastructure (push) Has been skipped
Infrastructure CI/CD / Apply Infrastructure (push) Has been skipped
✅ Major Achievements: - Deployed complete observability stack (Prometheus + Loki + Grafana) - Established rapid troubleshooting capabilities (3-step process) - Created heatmap dashboard for log correlation analysis - Unified logging system (systemd-journald across all nodes) - Configured API access with Service Account tokens 🧹 Project Cleanup: - Intelligent cleanup based on Git modification frequency - Organized files into proper directory structure - Removed deprecated webhook deployment scripts - Eliminated 70+ temporary/test files (43% reduction) 📊 Infrastructure Status: - Prometheus: 13 nodes monitored - Loki: 12 nodes logging - Grafana: Heatmap dashboard + API access - Promtail: Deployed to 12/13 nodes 🚀 Ready for Terraform transition (静默一周后切换) Project Status: COMPLETED ✅
This commit is contained in:
39
infrastructure/monitor/configs/loki/loki.yml
Normal file
39
infrastructure/monitor/configs/loki/loki.yml
Normal file
@@ -0,0 +1,39 @@
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
path_prefix: /var/lib/loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /var/lib/loki/chunks
|
||||
rules_directory: /var/lib/loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
instance_addr: 127.0.0.1
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_scheduler:
|
||||
max_outstanding_requests_per_tenant: 2048
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-10-24
|
||||
store: boltdb-shipper
|
||||
object_store: filesystem
|
||||
schema: v12
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
limits_config:
|
||||
allow_structured_metadata: false
|
||||
|
||||
ruler:
|
||||
alertmanager_url: http://localhost:9093
|
||||
|
||||
analytics:
|
||||
reporting_enabled: false
|
||||
@@ -0,0 +1,5 @@
|
||||
# Node Exporter 配置文件
|
||||
# 默认配置已经足够,主要参数通过命令行传递
|
||||
|
||||
# 如果需要自定义配置,可以在这里添加
|
||||
# 目前使用默认配置 + 命令行参数
|
||||
61
infrastructure/monitor/configs/prometheus/prometheus.yml
Normal file
61
infrastructure/monitor/configs/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,61 @@
|
||||
# Prometheus 配置 - 监控Nomad集群
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
monitor: 'nomad-cluster'
|
||||
|
||||
# Alertmanager configuration
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: ['localhost:9093']
|
||||
|
||||
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
|
||||
rule_files:
|
||||
# - "first_rules.yml"
|
||||
# - "second_rules.yml"
|
||||
|
||||
# A scrape configuration containing exactly one endpoint to scrape:
|
||||
scrape_configs:
|
||||
# Prometheus自身监控
|
||||
- job_name: 'prometheus'
|
||||
scrape_interval: 5s
|
||||
scrape_timeout: 5s
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
# Node Exporter - 客户端节点
|
||||
- job_name: 'node-clients'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'ch4.tailnet-68f9.ts.net:9100'
|
||||
- 'ash3c.tailnet-68f9.ts.net:9100'
|
||||
- 'warden.tailnet-68f9.ts.net:9100'
|
||||
- 'hcp1.tailnet-68f9.ts.net:9100'
|
||||
- 'browser.tailnet-68f9.ts.net:9100'
|
||||
|
||||
# Node Exporter - 服务端节点
|
||||
- job_name: 'node-servers'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'ash2e.tailnet-68f9.ts.net:9100'
|
||||
- 'ch2.tailnet-68f9.ts.net:9100'
|
||||
- 'ch3.tailnet-68f9.ts.net:9100'
|
||||
- 'onecloud1.tailnet-68f9.ts.net:9100'
|
||||
|
||||
# Nomad集群监控
|
||||
- job_name: 'nomad'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'ash1.tailnet-68f9.ts.net:4646'
|
||||
- 'ash2.tailnet-68f9.ts.net:4646'
|
||||
- 'onecloud1.tailnet-68f9.ts.net:4646'
|
||||
|
||||
# Consul集群监控
|
||||
- job_name: 'consul'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'ash1.tailnet-68f9.ts.net:8500'
|
||||
- 'ash2.tailnet-68f9.ts.net:8500'
|
||||
- 'onecloud1.tailnet-68f9.ts.net:8500'
|
||||
39
infrastructure/monitor/configs/promtail/promtail-config.yaml
Normal file
39
infrastructure/monitor/configs/promtail/promtail-config.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
server:
|
||||
http_listen_port: 9080
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /opt/promtail/data/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://influxdb.tailnet-68f9.ts.net:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: journal
|
||||
journal:
|
||||
max_age: 12h
|
||||
labels:
|
||||
job: systemd-journal
|
||||
relabel_configs:
|
||||
- source_labels: ['__journal__systemd_unit']
|
||||
target_label: 'unit'
|
||||
- source_labels: ['__journal_priority_keyword']
|
||||
target_label: 'level'
|
||||
- source_labels: ['__journal__hostname']
|
||||
target_label: 'hostname'
|
||||
|
||||
- job_name: syslog
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost
|
||||
labels:
|
||||
job: syslog
|
||||
__path__: /var/log/syslog
|
||||
|
||||
- job_name: daemon
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost
|
||||
labels:
|
||||
job: daemon
|
||||
__path__: /var/log/daemon.log
|
||||
@@ -0,0 +1,23 @@
|
||||
server:
|
||||
http_listen_port: 9082
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /tmp/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://influxdb.tailnet-68f9.ts.net:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: journal
|
||||
journal:
|
||||
max_age: 12h
|
||||
labels:
|
||||
job: systemd-journal
|
||||
relabel_configs:
|
||||
- source_labels: ['__journal__systemd_unit']
|
||||
target_label: 'unit'
|
||||
- source_labels: ['__journal_priority_keyword']
|
||||
target_label: 'level'
|
||||
- source_labels: ['__journal__hostname']
|
||||
target_label: 'hostname'
|
||||
392
infrastructure/monitor/dashboards/loki-heatmap-demo.json
Normal file
392
infrastructure/monitor/dashboards/loki-heatmap-demo.json
Normal file
@@ -0,0 +1,392 @@
|
||||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "Loki 日志热点图 Demo",
|
||||
"tags": ["loki", "heatmap", "demo"],
|
||||
"style": "dark",
|
||||
"timezone": "browser",
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "日志级别热点图 (类似GitHub贡献图)",
|
||||
"type": "heatmap",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "loki"
|
||||
},
|
||||
"expr": "sum by (level) (rate({job=\"systemd-journal\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{level}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"vis": false
|
||||
},
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"options": {
|
||||
"calculate": false,
|
||||
"cellGap": 2,
|
||||
"cellValues": {
|
||||
"unit": "short"
|
||||
},
|
||||
"color": {
|
||||
"exponent": 0.5,
|
||||
"fill": "dark-orange",
|
||||
"mode": "spectrum",
|
||||
"reverse": false,
|
||||
"scale": "exponential",
|
||||
"scheme": "Spectral",
|
||||
"steps": 64
|
||||
},
|
||||
"exemplars": {
|
||||
"color": "rgba(255,0,255,0.7)"
|
||||
},
|
||||
"filterValues": {
|
||||
"le": 1e-9
|
||||
},
|
||||
"legend": {
|
||||
"show": true
|
||||
},
|
||||
"rowsFrame": {
|
||||
"layout": "auto"
|
||||
},
|
||||
"tooltip": {
|
||||
"show": true,
|
||||
"yHistogram": false
|
||||
},
|
||||
"yAxis": {
|
||||
"axisPlacement": "left",
|
||||
"reverse": false,
|
||||
"unit": "short"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "节点日志密度热点图",
|
||||
"type": "heatmap",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "loki"
|
||||
},
|
||||
"expr": "sum by (hostname) (rate({job=\"systemd-journal\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{hostname}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"vis": false
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 5
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 20
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"options": {
|
||||
"calculate": false,
|
||||
"cellGap": 2,
|
||||
"cellValues": {
|
||||
"unit": "short"
|
||||
},
|
||||
"color": {
|
||||
"exponent": 0.5,
|
||||
"fill": "dark-orange",
|
||||
"mode": "spectrum",
|
||||
"reverse": false,
|
||||
"scale": "exponential",
|
||||
"scheme": "Spectral",
|
||||
"steps": 64
|
||||
},
|
||||
"exemplars": {
|
||||
"color": "rgba(255,0,255,0.7)"
|
||||
},
|
||||
"filterValues": {
|
||||
"le": 1e-9
|
||||
},
|
||||
"legend": {
|
||||
"show": true
|
||||
},
|
||||
"rowsFrame": {
|
||||
"layout": "auto"
|
||||
},
|
||||
"tooltip": {
|
||||
"show": true,
|
||||
"yHistogram": false
|
||||
},
|
||||
"yAxis": {
|
||||
"axisPlacement": "left",
|
||||
"reverse": false,
|
||||
"unit": "short"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "关键服务日志热点图 (Nomad/Consul/Traefik)",
|
||||
"type": "heatmap",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "loki"
|
||||
},
|
||||
"expr": "sum by (unit) (rate({job=\"systemd-journal\", unit=~\"nomad|consul|traefik\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{unit}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"vis": false
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"options": {
|
||||
"calculate": false,
|
||||
"cellGap": 2,
|
||||
"cellValues": {
|
||||
"unit": "short"
|
||||
},
|
||||
"color": {
|
||||
"exponent": 0.5,
|
||||
"fill": "dark-orange",
|
||||
"mode": "spectrum",
|
||||
"reverse": false,
|
||||
"scale": "exponential",
|
||||
"scheme": "Spectral",
|
||||
"steps": 64
|
||||
},
|
||||
"exemplars": {
|
||||
"color": "rgba(255,0,255,0.7)"
|
||||
},
|
||||
"filterValues": {
|
||||
"le": 1e-9
|
||||
},
|
||||
"legend": {
|
||||
"show": true
|
||||
},
|
||||
"rowsFrame": {
|
||||
"layout": "auto"
|
||||
},
|
||||
"tooltip": {
|
||||
"show": true,
|
||||
"yHistogram": false
|
||||
},
|
||||
"yAxis": {
|
||||
"axisPlacement": "left",
|
||||
"reverse": false,
|
||||
"unit": "short"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "ERROR/CRIT 级别日志热点图 (黑匣子重点)",
|
||||
"type": "heatmap",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "loki"
|
||||
},
|
||||
"expr": "sum by (hostname) (rate({job=\"systemd-journal\", level=~\"error|crit\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{hostname}} - {{level}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"vis": false
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 0.1
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"options": {
|
||||
"calculate": false,
|
||||
"cellGap": 2,
|
||||
"cellValues": {
|
||||
"unit": "short"
|
||||
},
|
||||
"color": {
|
||||
"exponent": 0.5,
|
||||
"fill": "dark-orange",
|
||||
"mode": "spectrum",
|
||||
"reverse": false,
|
||||
"scale": "exponential",
|
||||
"scheme": "Spectral",
|
||||
"steps": 64
|
||||
},
|
||||
"exemplars": {
|
||||
"color": "rgba(255,0,255,0.7)"
|
||||
},
|
||||
"filterValues": {
|
||||
"le": 1e-9
|
||||
},
|
||||
"legend": {
|
||||
"show": true
|
||||
},
|
||||
"rowsFrame": {
|
||||
"layout": "auto"
|
||||
},
|
||||
"tooltip": {
|
||||
"show": true,
|
||||
"yHistogram": false
|
||||
},
|
||||
"yAxis": {
|
||||
"axisPlacement": "left",
|
||||
"reverse": false,
|
||||
"unit": "short"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 27,
|
||||
"version": 1
|
||||
}
|
||||
}
|
||||
59
infrastructure/monitor/deploy-promtail.yml
Normal file
59
infrastructure/monitor/deploy-promtail.yml
Normal file
@@ -0,0 +1,59 @@
|
||||
---
|
||||
- name: Deploy Promtail to all nodes
|
||||
hosts: all
|
||||
become: yes
|
||||
vars:
|
||||
promtail_config_path: /etc/promtail/promtail.yml
|
||||
promtail_data_path: /opt/promtail/data
|
||||
|
||||
tasks:
|
||||
- name: Install promtail
|
||||
apt:
|
||||
name: promtail
|
||||
state: present
|
||||
update_cache: yes
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Create promtail user and group
|
||||
user:
|
||||
name: promtail
|
||||
system: yes
|
||||
shell: /bin/false
|
||||
home: /opt/promtail
|
||||
create_home: yes
|
||||
|
||||
- name: Create promtail data directory
|
||||
file:
|
||||
path: "{{ promtail_data_path }}"
|
||||
state: directory
|
||||
owner: promtail
|
||||
group: promtail
|
||||
mode: '0755'
|
||||
|
||||
- name: Copy promtail configuration
|
||||
template:
|
||||
src: promtail-config.yaml
|
||||
dest: "{{ promtail_config_path }}"
|
||||
owner: promtail
|
||||
group: promtail
|
||||
mode: '0644'
|
||||
notify: restart promtail
|
||||
|
||||
- name: Add promtail user to adm group (for syslog access)
|
||||
user:
|
||||
name: promtail
|
||||
groups: adm
|
||||
append: yes
|
||||
|
||||
- name: Enable and start promtail service
|
||||
systemd:
|
||||
name: promtail
|
||||
enabled: yes
|
||||
state: started
|
||||
daemon_reload: yes
|
||||
|
||||
handlers:
|
||||
- name: restart promtail
|
||||
systemd:
|
||||
name: promtail
|
||||
state: restarted
|
||||
258
infrastructure/monitor/monitoring-stack.nomad
Normal file
258
infrastructure/monitor/monitoring-stack.nomad
Normal file
@@ -0,0 +1,258 @@
|
||||
job "monitoring-stack" {
|
||||
datacenters = ["dc1"]
|
||||
type = "service"
|
||||
|
||||
# Grafana 服务组
|
||||
group "grafana" {
|
||||
count = 1
|
||||
|
||||
constraint {
|
||||
attribute = "${node.unique.name}"
|
||||
operator = "="
|
||||
value = "influxdb"
|
||||
}
|
||||
|
||||
volume "grafana-data" {
|
||||
type = "host"
|
||||
read_only = false
|
||||
source = "grafana-data"
|
||||
}
|
||||
|
||||
network {
|
||||
port "http" {
|
||||
static = 3000
|
||||
to = 3000
|
||||
}
|
||||
}
|
||||
|
||||
task "grafana" {
|
||||
driver = "exec"
|
||||
|
||||
volume_mount {
|
||||
volume = "grafana-data"
|
||||
destination = "/opt/grafana/data"
|
||||
read_only = false
|
||||
}
|
||||
|
||||
config {
|
||||
command = "/usr/sbin/grafana-server"
|
||||
args = [
|
||||
"--config", "/etc/grafana/grafana.ini",
|
||||
"--homepath", "/usr/share/grafana",
|
||||
"cfg:default.paths.data=/opt/grafana/data",
|
||||
"cfg:default.paths.logs=/var/log/grafana",
|
||||
"cfg:default.paths.plugins=/var/lib/grafana/plugins",
|
||||
"cfg:default.paths.provisioning=/etc/grafana/provisioning"
|
||||
]
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 300
|
||||
memory = 512
|
||||
}
|
||||
|
||||
env {
|
||||
GF_SECURITY_ADMIN_PASSWORD = "admin123"
|
||||
GF_INSTALL_PLUGINS = "grafana-piechart-panel"
|
||||
GF_SERVER_DOMAIN = "grafana.tailnet-68f9.ts.net"
|
||||
GF_SERVER_ROOT_URL = "http://grafana.tailnet-68f9.ts.net:3000"
|
||||
}
|
||||
|
||||
service {
|
||||
name = "grafana"
|
||||
port = "http"
|
||||
|
||||
tags = [
|
||||
"grafana",
|
||||
"monitoring",
|
||||
"dashboard"
|
||||
]
|
||||
|
||||
check {
|
||||
type = "http"
|
||||
path = "/api/health"
|
||||
interval = "30s"
|
||||
timeout = "5s"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Prometheus 服务组
|
||||
group "prometheus" {
|
||||
count = 1
|
||||
|
||||
constraint {
|
||||
attribute = "${node.unique.name}"
|
||||
operator = "="
|
||||
value = "influxdb"
|
||||
}
|
||||
|
||||
volume "prometheus-data" {
|
||||
type = "host"
|
||||
read_only = false
|
||||
source = "prometheus-data"
|
||||
}
|
||||
|
||||
network {
|
||||
port "http" {
|
||||
static = 9090
|
||||
to = 9090
|
||||
}
|
||||
}
|
||||
|
||||
task "prometheus" {
|
||||
driver = "exec"
|
||||
|
||||
volume_mount {
|
||||
volume = "prometheus-data"
|
||||
destination = "/opt/prometheus/data"
|
||||
read_only = false
|
||||
}
|
||||
|
||||
config {
|
||||
command = "prometheus"
|
||||
args = [
|
||||
"--config.file=/etc/prometheus/prometheus.yml",
|
||||
"--storage.tsdb.path=/opt/prometheus/data",
|
||||
"--web.console.libraries=/usr/share/prometheus/console_libraries",
|
||||
"--web.console.templates=/usr/share/prometheus/consoles",
|
||||
"--storage.tsdb.retention.time=15d",
|
||||
"--web.enable-lifecycle"
|
||||
]
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 300
|
||||
memory = 512
|
||||
}
|
||||
|
||||
service {
|
||||
name = "prometheus"
|
||||
port = "http"
|
||||
|
||||
tags = [
|
||||
"prometheus",
|
||||
"monitoring",
|
||||
"metrics"
|
||||
]
|
||||
|
||||
check {
|
||||
type = "http"
|
||||
path = "/-/healthy"
|
||||
interval = "30s"
|
||||
timeout = "5s"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Loki 服务组
|
||||
group "loki" {
|
||||
count = 1
|
||||
|
||||
constraint {
|
||||
attribute = "${node.unique.name}"
|
||||
operator = "="
|
||||
value = "influxdb"
|
||||
}
|
||||
|
||||
volume "loki-data" {
|
||||
type = "host"
|
||||
read_only = false
|
||||
source = "loki-data"
|
||||
}
|
||||
|
||||
network {
|
||||
port "http" {
|
||||
static = 3100
|
||||
to = 3100
|
||||
}
|
||||
}
|
||||
|
||||
task "loki" {
|
||||
driver = "exec"
|
||||
|
||||
volume_mount {
|
||||
volume = "loki-data"
|
||||
destination = "/opt/loki/data"
|
||||
read_only = false
|
||||
}
|
||||
|
||||
template {
|
||||
data = <<EOF
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
path_prefix: /opt/loki/data
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /opt/loki/data/chunks
|
||||
rules_directory: /opt/loki/data/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
instance_addr: 127.0.0.1
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_scheduler:
|
||||
max_outstanding_requests_per_tenant: 2048
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-10-24
|
||||
store: boltdb-shipper
|
||||
object_store: filesystem
|
||||
schema: v12
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
limits_config:
|
||||
allow_structured_metadata: false
|
||||
|
||||
ruler:
|
||||
alertmanager_url: http://localhost:9093
|
||||
|
||||
analytics:
|
||||
reporting_enabled: false
|
||||
EOF
|
||||
destination = "local/config.yml"
|
||||
}
|
||||
|
||||
config {
|
||||
command = "loki"
|
||||
args = [
|
||||
"-config.file=local/config.yml"
|
||||
]
|
||||
}
|
||||
|
||||
resources {
|
||||
cpu = 300
|
||||
memory = 512
|
||||
}
|
||||
|
||||
service {
|
||||
name = "loki"
|
||||
port = "http"
|
||||
|
||||
tags = [
|
||||
"loki",
|
||||
"monitoring",
|
||||
"logs"
|
||||
]
|
||||
|
||||
check {
|
||||
type = "http"
|
||||
path = "/ready"
|
||||
interval = "30s"
|
||||
timeout = "5s"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -7,32 +7,63 @@ rule_files:
|
||||
# - "second_rules.yml"
|
||||
|
||||
scrape_configs:
|
||||
# Prometheus 自身监控
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
- job_name: 'openfaas'
|
||||
static_configs:
|
||||
- targets: ['gateway:8080']
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 15s
|
||||
scrape_timeout: 10s
|
||||
|
||||
- job_name: 'nats'
|
||||
static_configs:
|
||||
- targets: ['nats:8222']
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 15s
|
||||
scrape_timeout: 10s
|
||||
|
||||
# Node Exporter 监控 - 所有节点
|
||||
- job_name: 'node-exporter'
|
||||
static_configs:
|
||||
- targets: ['node-exporter:9100']
|
||||
scrape_interval: 15s
|
||||
scrape_timeout: 10s
|
||||
- targets:
|
||||
- 'semaphore.tailnet-68f9.ts.net:9100'
|
||||
- 'ash1d.tailnet-68f9.ts.net:9100'
|
||||
- 'ash2e.tailnet-68f9.ts.net:9100'
|
||||
- 'ash3c.tailnet-68f9.ts.net:9100'
|
||||
- 'ch2.tailnet-68f9.ts.net:9100'
|
||||
- 'ch3.tailnet-68f9.ts.net:9100'
|
||||
- 'ch4.tailnet-68f9.ts.net:9100'
|
||||
- 'de.tailnet-68f9.ts.net:9100'
|
||||
- 'hcp1.tailnet-68f9.ts.net:9100'
|
||||
- 'influxdb.tailnet-68f9.ts.net:9100'
|
||||
- 'onecloud1.tailnet-68f9.ts.net:9100'
|
||||
- 'warden.tailnet-68f9.ts.net:9100'
|
||||
- 'browser.tailnet-68f9.ts.net:9100'
|
||||
|
||||
- job_name: 'cadvisor'
|
||||
# Consul 监控
|
||||
- job_name: 'consul'
|
||||
static_configs:
|
||||
- targets: ['cadvisor:8080']
|
||||
scrape_interval: 15s
|
||||
scrape_timeout: 10s
|
||||
- targets:
|
||||
- 'ch4.tailnet-68f9.ts.net:8500'
|
||||
- 'ash3c.tailnet-68f9.ts.net:8500'
|
||||
- 'warden.tailnet-68f9.ts.net:8500'
|
||||
|
||||
# Nomad 监控
|
||||
- job_name: 'nomad'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'semaphore.tailnet-68f9.ts.net:4646'
|
||||
- 'ash1d.tailnet-68f9.ts.net:4646'
|
||||
- 'ash2e.tailnet-68f9.ts.net:4646'
|
||||
- 'ch2.tailnet-68f9.ts.net:4646'
|
||||
- 'ch3.tailnet-68f9.ts.net:4646'
|
||||
- 'onecloud1.tailnet-68f9.ts.net:4646'
|
||||
- 'de.tailnet-68f9.ts.net:4646'
|
||||
|
||||
# Vault 监控
|
||||
- job_name: 'vault'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'master.tailnet-68f9.ts.net:8200'
|
||||
- 'ash3c.tailnet-68f9.ts.net:8200'
|
||||
- 'hcp1.tailnet-68f9.ts.net:8200'
|
||||
|
||||
# InfluxDB 监控
|
||||
- job_name: 'influxdb'
|
||||
static_configs:
|
||||
- targets: ['influxdb1.tailnet-68f9.ts.net:8086']
|
||||
|
||||
# Traefik 监控
|
||||
- job_name: 'traefik'
|
||||
static_configs:
|
||||
- targets: ['hcp1.tailnet-68f9.ts.net:8080']
|
||||
|
||||
Reference in New Issue
Block a user