# mgmt/infrastructure/monitor/monitoring-stack.nomad
# (HCL; originally listed as 258 lines, 4.9 KiB)

# Monitoring stack: Grafana (dashboards), Prometheus (metrics) and Loki (logs).
#
# Each component runs as its own single-instance group using the exec driver,
# is pinned to the client node named "influxdb", persists its data on a host
# volume, listens on a static port and registers a service with an HTTP check.
job "monitoring-stack" {
  datacenters = ["dc1"]
  type        = "service"

  # ---------------------------------------------------------------------------
  # Grafana service group
  # ---------------------------------------------------------------------------
  group "grafana" {
    count = 1

    # Only place this group on the node whose unique name is "influxdb".
    constraint {
      attribute = "${node.unique.name}"
      operator  = "="
      value     = "influxdb"
    }

    # Host volume holding Grafana's data directory. The volume name must be
    # declared as a host_volume in the client configuration of the target node.
    volume "grafana-data" {
      type      = "host"
      read_only = false
      source    = "grafana-data"
    }

    network {
      # Grafana web UI / API on a fixed host port.
      # NOTE(review): `to` is documented for bridge networking; no network
      # mode is set on this group, so it is presumably redundant here.
      port "http" {
        static = 3000
        to     = 3000
      }
    }

    task "grafana" {
      driver = "exec"

      # Mount the host volume where paths.data (see args below) points.
      volume_mount {
        volume      = "grafana-data"
        destination = "/opt/grafana/data"
        read_only   = false
      }

      config {
        command = "/usr/sbin/grafana-server"
        # The cfg:default.* arguments override the path settings of
        # grafana.ini; data is redirected to the mounted volume.
        # NOTE(review): /etc/grafana, /usr/share/grafana, /var/log/grafana and
        # /var/lib/grafana must be reachable inside the exec driver's chroot -
        # confirm against the client's chroot_env.
        args = [
          "--config", "/etc/grafana/grafana.ini",
          "--homepath", "/usr/share/grafana",
          "cfg:default.paths.data=/opt/grafana/data",
          "cfg:default.paths.logs=/var/log/grafana",
          "cfg:default.paths.plugins=/var/lib/grafana/plugins",
          "cfg:default.paths.provisioning=/etc/grafana/provisioning"
        ]
      }

      resources {
        cpu    = 300
        memory = 512
      }

      env {
        # NOTE(review): the admin password is stored in plain text in the job
        # file; consider sourcing it from Vault or a Nomad variable through a
        # template block instead.
        GF_SECURITY_ADMIN_PASSWORD = "admin123"
        # NOTE(review): GF_INSTALL_PLUGINS is normally consumed by the Grafana
        # container entrypoint; confirm it has any effect when grafana-server
        # is started directly via exec.
        GF_INSTALL_PLUGINS = "grafana-piechart-panel"
        GF_SERVER_DOMAIN   = "grafana.tailnet-68f9.ts.net"
        GF_SERVER_ROOT_URL = "http://grafana.tailnet-68f9.ts.net:3000"
      }

      service {
        name = "grafana"
        port = "http"
        tags = [
          "grafana",
          "monitoring",
          "dashboard"
        ]

        # Health probe against Grafana's /api/health endpoint.
        check {
          type     = "http"
          path     = "/api/health"
          interval = "30s"
          timeout  = "5s"
        }
      }
    }
  }

  # ---------------------------------------------------------------------------
  # Prometheus service group
  # ---------------------------------------------------------------------------
  group "prometheus" {
    count = 1

    # Only place this group on the node whose unique name is "influxdb".
    constraint {
      attribute = "${node.unique.name}"
      operator  = "="
      value     = "influxdb"
    }

    # Host volume holding the Prometheus TSDB.
    volume "prometheus-data" {
      type      = "host"
      read_only = false
      source    = "prometheus-data"
    }

    network {
      # Prometheus web UI / API on a fixed host port.
      port "http" {
        static = 9090
        to     = 9090
      }
    }

    task "prometheus" {
      driver = "exec"

      # Mount the host volume where --storage.tsdb.path points.
      volume_mount {
        volume      = "prometheus-data"
        destination = "/opt/prometheus/data"
        read_only   = false
      }

      config {
        # NOTE(review): bare command name (Grafana above uses an absolute
        # path); this relies on the exec driver's binary lookup finding
        # `prometheus` - confirm where it is installed on the node.
        command = "prometheus"
        args = [
          "--config.file=/etc/prometheus/prometheus.yml",
          "--storage.tsdb.path=/opt/prometheus/data",
          "--web.console.libraries=/usr/share/prometheus/console_libraries",
          "--web.console.templates=/usr/share/prometheus/consoles",
          # Keep 15 days of samples.
          "--storage.tsdb.retention.time=15d",
          # Enables the HTTP reload/shutdown endpoints.
          # NOTE(review): these are reachable on the static port above;
          # confirm that access to port 9090 is restricted.
          "--web.enable-lifecycle"
        ]
      }

      resources {
        cpu    = 300
        memory = 512
      }

      service {
        name = "prometheus"
        port = "http"
        tags = [
          "prometheus",
          "monitoring",
          "metrics"
        ]

        # Health probe against Prometheus' /-/healthy endpoint.
        check {
          type     = "http"
          path     = "/-/healthy"
          interval = "30s"
          timeout  = "5s"
        }
      }
    }
  }

  # ---------------------------------------------------------------------------
  # Loki service group
  # ---------------------------------------------------------------------------
  group "loki" {
    count = 1

    # Only place this group on the node whose unique name is "influxdb".
    constraint {
      attribute = "${node.unique.name}"
      operator  = "="
      value     = "influxdb"
    }

    # Host volume holding Loki's chunks, rules and index data.
    volume "loki-data" {
      type      = "host"
      read_only = false
      source    = "loki-data"
    }

    network {
      # Loki HTTP API on a fixed host port (matches http_listen_port below).
      port "http" {
        static = 3100
        to     = 3100
      }
    }

    task "loki" {
      driver = "exec"

      # Mount the host volume where common.path_prefix points.
      volume_mount {
        volume      = "loki-data"
        destination = "/opt/loki/data"
        read_only   = false
      }

      # Loki configuration rendered into the task's local/ directory.
      # The heredoc body is YAML: its indentation is significant and is part
      # of the rendered file, so it is kept at column 0 with explicit nesting.
      template {
        data = <<EOF
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  path_prefix: /opt/loki/data
  storage:
    filesystem:
      chunks_directory: /opt/loki/data/chunks
      rules_directory: /opt/loki/data/rules
  replication_factor: 1
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory

query_scheduler:
  max_outstanding_requests_per_tenant: 2048

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v12
      index:
        prefix: index_
        period: 24h

limits_config:
  allow_structured_metadata: false

ruler:
  alertmanager_url: http://localhost:9093

analytics:
  reporting_enabled: false
EOF

        destination = "local/config.yml"
      }

      config {
        # NOTE(review): bare command name; relies on the exec driver's binary
        # lookup finding `loki` - confirm where it is installed on the node.
        command = "loki"
        # Path is relative and matches the template destination above.
        args = [
          "-config.file=local/config.yml"
        ]
      }

      resources {
        cpu    = 300
        memory = 512
      }

      service {
        name = "loki"
        port = "http"
        tags = [
          "loki",
          "monitoring",
          "logs"
        ]

        # Readiness probe against Loki's /ready endpoint.
        check {
          type     = "http"
          path     = "/ready"
          interval = "30s"
          timeout  = "5s"
        }
      }
    }
  }
}