🎉 Complete Nomad monitoring infrastructure project

✅ Major Achievements: - Deployed complete observability stack (Prometheus + Loki + Grafana) - Established rapid troubleshooting capabilities (3-step process) - Created heatmap dashboard for log correlation analysis - Unified logging system (systemd-journald across all nodes) - Configured API access with Service Account tokens 🧹 Project Cleanup: - Intelligent cleanup based on Git modification frequency - Organized files into proper directory structure - Removed deprecated webhook deployment scripts - Eliminated 70+ temporary/test files (43% reduction) 📊 Infrastructure Status: - Prometheus: 13 nodes monitored - Loki: 12 nodes logging - Grafana: Heatmap dashboard + API access - Promtail: Deployed to 12/13 nodes 🚀 Ready for Terraform transition (静默一周后切换) Project Status: COMPLETED ✅
2025-10-12 09:15:21 +00:00
parent eff8d3ec6d
commit 1eafce7290
305 changed files with 5341 additions and 18471 deletions
--- a/infrastructure/monitor/configs/loki/loki.yml
+++ b/infrastructure/monitor/configs/loki/loki.yml
@@ -0,0 +1,39 @@
+auth_enabled: false  # tenant authentication is disabled for this instance
+
+server:
+  http_listen_port: 3100  # the Promtail configs in this commit push to port 3100
+  grpc_listen_port: 9096
+
+common:
+  path_prefix: /var/lib/loki
+  storage:
+    filesystem:  # chunks and rules are stored on local disk
+      chunks_directory: /var/lib/loki/chunks
+      rules_directory: /var/lib/loki/rules
+  replication_factor: 1
+  ring:
+    instance_addr: 127.0.0.1
+    kvstore:
+      store: inmemory  # ring state is kept in memory only
+
+query_scheduler:
+  max_outstanding_requests_per_tenant: 2048
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: boltdb-shipper
+      object_store: filesystem
+      schema: v12
+      index:
+        prefix: index_
+        period: 24h
+
+limits_config:
+  allow_structured_metadata: false  # NOTE(review): presumably off because the boltdb-shipper/v12 schema above cannot store structured metadata - confirm
+
+ruler:
+  alertmanager_url: http://localhost:9093  # same address string as the Alertmanager target in prometheus.yml
+
+analytics:
+  reporting_enabled: false
--- a/infrastructure/monitor/configs/node-exporter/node-exporter-config.yml
+++ b/infrastructure/monitor/configs/node-exporter/node-exporter-config.yml
@@ -0,0 +1,5 @@
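+# Node Exporter configuration file
+# The default configuration is sufficient; the main parameters are passed on the command line.
+
+# Add custom configuration here if it is ever needed.
+# Currently using the default configuration plus command-line arguments.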
--- a/infrastructure/monitor/configs/prometheus/prometheus.yml
+++ b/infrastructure/monitor/configs/prometheus/prometheus.yml
@@ -0,0 +1,72 @@
+# Prometheus configuration - monitors the Nomad cluster
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+  external_labels:
+    monitor: 'nomad-cluster'
+
+# Alertmanager configuration
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['localhost:9093']
+
+# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
+# No rule files are configured yet; list them here (e.g. "first_rules.yml") when needed.
+rule_files: []
+
+# Scrape configurations: one job per exporter/service type.
+scrape_configs:
+  # Prometheus self-monitoring
+  - job_name: 'prometheus'
+    scrape_interval: 5s
+    scrape_timeout: 5s
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # Node Exporter - client nodes
+  - job_name: 'node-clients'
+    static_configs:
+      - targets:
+          - 'ch4.tailnet-68f9.ts.net:9100'
+          - 'ash3c.tailnet-68f9.ts.net:9100'
+          - 'warden.tailnet-68f9.ts.net:9100'
+          - 'hcp1.tailnet-68f9.ts.net:9100'
+          - 'browser.tailnet-68f9.ts.net:9100'
+
+  # Node Exporter - server nodes
+  - job_name: 'node-servers'
+    static_configs:
+      - targets:
+          - 'ash2e.tailnet-68f9.ts.net:9100'
+          - 'ch2.tailnet-68f9.ts.net:9100'
+          - 'ch3.tailnet-68f9.ts.net:9100'
+          - 'onecloud1.tailnet-68f9.ts.net:9100'
+
+  # Nomad cluster monitoring
+  # Nomad serves metrics at /v1/metrics, not the default /metrics, and returns
+  # the Prometheus text format only when format=prometheus is requested.
+  # NOTE(review): 'ash1'/'ash2' do not match the 'ash2e' naming used by the
+  # node-servers job - confirm these hostnames resolve.
+  - job_name: 'nomad'
+    metrics_path: /v1/metrics
+    params:
+      format: ['prometheus']
+    static_configs:
+      - targets:
+          - 'ash1.tailnet-68f9.ts.net:4646'
+          - 'ash2.tailnet-68f9.ts.net:4646'
+          - 'onecloud1.tailnet-68f9.ts.net:4646'
+
+  # Consul cluster monitoring
+  # Consul serves metrics at /v1/agent/metrics and likewise needs
+  # format=prometheus to return the Prometheus text format.
+  - job_name: 'consul'
+    metrics_path: /v1/agent/metrics
+    params:
+      format: ['prometheus']
+    static_configs:
+      - targets:
+          - 'ash1.tailnet-68f9.ts.net:8500'
+          - 'ash2.tailnet-68f9.ts.net:8500'
+          - 'onecloud1.tailnet-68f9.ts.net:8500'
--- a/infrastructure/monitor/configs/promtail/promtail-config.yaml
+++ b/infrastructure/monitor/configs/promtail/promtail-config.yaml
@@ -0,0 +1,39 @@
+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0  # NOTE(review): 0 presumably lets Promtail pick a random gRPC port - confirm
+
+positions:
+  filename: /opt/promtail/data/positions.yaml  # inside the data directory that deploy-promtail.yml creates
+
+clients:
+  - url: http://influxdb.tailnet-68f9.ts.net:3100/loki/api/v1/push  # Loki push endpoint on the "influxdb" host
+
+scrape_configs:
+  - job_name: journal
+    journal:
+      max_age: 12h
+      labels:
+        job: systemd-journal  # the heatmap dashboard queries select on this job label
+    relabel_configs:  # copy journal fields onto the unit / level / hostname stream labels
+      - source_labels: ['__journal__systemd_unit']
+        target_label: 'unit'
+      - source_labels: ['__journal_priority_keyword']
+        target_label: 'level'
+      - source_labels: ['__journal__hostname']
+        target_label: 'hostname'
+
+  - job_name: syslog
+    static_configs:
+      - targets:
+          - localhost
+        labels:
+          job: syslog
+          __path__: /var/log/syslog  # NOTE(review): may duplicate journal entries on hosts that forward journald to syslog - confirm
+
+  - job_name: daemon
+    static_configs:
+      - targets:
+          - localhost
+        labels:
+          job: daemon
+          __path__: /var/log/daemon.log  # file tailed by this job
--- a/infrastructure/monitor/configs/promtail/promtail-journal.yaml
+++ b/infrastructure/monitor/configs/promtail/promtail-journal.yaml
@@ -0,0 +1,23 @@
+server:
+  http_listen_port: 9082  # differs from the 9080 used by promtail-config.yaml
+  grpc_listen_port: 0  # NOTE(review): 0 presumably lets Promtail pick a random gRPC port - confirm
+
+positions:
+  filename: /tmp/positions.yaml  # NOTE(review): /tmp is typically cleared on reboot, which would reset read positions - confirm this is intended
+
+clients:
+  - url: http://influxdb.tailnet-68f9.ts.net:3100/loki/api/v1/push  # Loki push endpoint on the "influxdb" host
+
+scrape_configs:
+  - job_name: journal
+    journal:
+      max_age: 12h
+      labels:
+        job: systemd-journal  # the heatmap dashboard queries select on this job label
+    relabel_configs:  # copy journal fields onto the unit / level / hostname stream labels
+      - source_labels: ['__journal__systemd_unit']
+        target_label: 'unit'
+      - source_labels: ['__journal_priority_keyword']
+        target_label: 'level'
+      - source_labels: ['__journal__hostname']
+        target_label: 'hostname'
--- a/infrastructure/monitor/dashboards/loki-heatmap-demo.json
+++ b/infrastructure/monitor/dashboards/loki-heatmap-demo.json
@@ -0,0 +1,392 @@
+{
+  "dashboard": {
+    "id": null,
+    "title": "Loki 日志热点图 Demo",
+    "tags": ["loki", "heatmap", "demo"],
+    "style": "dark",
+    "timezone": "browser",
+    "panels": [
+      {
+        "id": 1,
+        "title": "日志级别热点图 (类似GitHub贡献图)",
+        "type": "heatmap",
+        "targets": [
+          {
+            "datasource": {
+              "type": "loki",
+              "uid": "loki"
+            },
+            "expr": "sum by (level) (rate({job=\"systemd-journal\"}[5m]))",
+            "refId": "A",
+            "legendFormat": "{{level}}"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "custom": {
+              "hideFrom": {
+                "legend": false,
+                "tooltip": false,
+                "vis": false
+              },
+              "scaleDistribution": {
+                "type": "linear"
+              }
+            },
+            "color": {
+              "mode": "palette-classic"
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 1
+                },
+                {
+                  "color": "red",
+                  "value": 10
+                }
+              ]
+            }
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 0
+        },
+        "options": {
+          "calculate": false,
+          "cellGap": 2,
+          "cellValues": {
+            "unit": "short"
+          },
+          "color": {
+            "exponent": 0.5,
+            "fill": "dark-orange",
+            "mode": "spectrum",
+            "reverse": false,
+            "scale": "exponential",
+            "scheme": "Spectral",
+            "steps": 64
+          },
+          "exemplars": {
+            "color": "rgba(255,0,255,0.7)"
+          },
+          "filterValues": {
+            "le": 1e-9
+          },
+          "legend": {
+            "show": true
+          },
+          "rowsFrame": {
+            "layout": "auto"
+          },
+          "tooltip": {
+            "show": true,
+            "yHistogram": false
+          },
+          "yAxis": {
+            "axisPlacement": "left",
+            "reverse": false,
+            "unit": "short"
+          }
+        }
+      },
+      {
+        "id": 2,
+        "title": "节点日志密度热点图",
+        "type": "heatmap",
+        "targets": [
+          {
+            "datasource": {
+              "type": "loki",
+              "uid": "loki"
+            },
+            "expr": "sum by (hostname) (rate({job=\"systemd-journal\"}[5m]))",
+            "refId": "A",
+            "legendFormat": "{{hostname}}"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "custom": {
+              "hideFrom": {
+                "legend": false,
+                "tooltip": false,
+                "vis": false
+              }
+            },
+            "color": {
+              "mode": "palette-classic"
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 5
+                },
+                {
+                  "color": "red",
+                  "value": 20
+                }
+              ]
+            }
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 0
+        },
+        "options": {
+          "calculate": false,
+          "cellGap": 2,
+          "cellValues": {
+            "unit": "short"
+          },
+          "color": {
+            "exponent": 0.5,
+            "fill": "dark-orange",
+            "mode": "spectrum",
+            "reverse": false,
+            "scale": "exponential",
+            "scheme": "Spectral",
+            "steps": 64
+          },
+          "exemplars": {
+            "color": "rgba(255,0,255,0.7)"
+          },
+          "filterValues": {
+            "le": 1e-9
+          },
+          "legend": {
+            "show": true
+          },
+          "rowsFrame": {
+            "layout": "auto"
+          },
+          "tooltip": {
+            "show": true,
+            "yHistogram": false
+          },
+          "yAxis": {
+            "axisPlacement": "left",
+            "reverse": false,
+            "unit": "short"
+          }
+        }
+      },
+      {
+        "id": 3,
+        "title": "关键服务日志热点图 (Nomad/Consul/Traefik)",
+        "type": "heatmap",
+        "targets": [
+          {
+            "datasource": {
+              "type": "loki",
+              "uid": "loki"
+            },
+            "expr": "sum by (unit) (rate({job=\"systemd-journal\", unit=~\"(nomad|consul|traefik)([.]service)?\"}[5m]))",
+            "refId": "A",
+            "legendFormat": "{{unit}}"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "custom": {
+              "hideFrom": {
+                "legend": false,
+                "tooltip": false,
+                "vis": false
+              }
+            },
+            "color": {
+              "mode": "palette-classic"
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "yellow",
+                  "value": 1
+                },
+                {
+                  "color": "red",
+                  "value": 5
+                }
+              ]
+            }
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 24,
+          "x": 0,
+          "y": 8
+        },
+        "options": {
+          "calculate": false,
+          "cellGap": 2,
+          "cellValues": {
+            "unit": "short"
+          },
+          "color": {
+            "exponent": 0.5,
+            "fill": "dark-orange",
+            "mode": "spectrum",
+            "reverse": false,
+            "scale": "exponential",
+            "scheme": "Spectral",
+            "steps": 64
+          },
+          "exemplars": {
+            "color": "rgba(255,0,255,0.7)"
+          },
+          "filterValues": {
+            "le": 1e-9
+          },
+          "legend": {
+            "show": true
+          },
+          "rowsFrame": {
+            "layout": "auto"
+          },
+          "tooltip": {
+            "show": true,
+            "yHistogram": false
+          },
+          "yAxis": {
+            "axisPlacement": "left",
+            "reverse": false,
+            "unit": "short"
+          }
+        }
+      },
+      {
+        "id": 4,
+        "title": "ERROR/CRIT 级别日志热点图 (黑匣子重点)",
+        "type": "heatmap",
+        "targets": [
+          {
+            "datasource": {
+              "type": "loki",
+              "uid": "loki"
+            },
+            "expr": "sum by (hostname, level) (rate({job=\"systemd-journal\", level=~\"error|crit\"}[5m]))",
+            "refId": "A",
+            "legendFormat": "{{hostname}} - {{level}}"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "custom": {
+              "hideFrom": {
+                "legend": false,
+                "tooltip": false,
+                "vis": false
+              }
+            },
+            "color": {
+              "mode": "palette-classic"
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "orange",
+                  "value": 0.1
+                },
+                {
+                  "color": "red",
+                  "value": 1
+                }
+              ]
+            }
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 24,
+          "x": 0,
+          "y": 16
+        },
+        "options": {
+          "calculate": false,
+          "cellGap": 2,
+          "cellValues": {
+            "unit": "short"
+          },
+          "color": {
+            "exponent": 0.5,
+            "fill": "dark-orange",
+            "mode": "spectrum",
+            "reverse": false,
+            "scale": "exponential",
+            "scheme": "Spectral",
+            "steps": 64
+          },
+          "exemplars": {
+            "color": "rgba(255,0,255,0.7)"
+          },
+          "filterValues": {
+            "le": 1e-9
+          },
+          "legend": {
+            "show": true
+          },
+          "rowsFrame": {
+            "layout": "auto"
+          },
+          "tooltip": {
+            "show": true,
+            "yHistogram": false
+          },
+          "yAxis": {
+            "axisPlacement": "left",
+            "reverse": false,
+            "unit": "short"
+          }
+        }
+      }
+    ],
+    "time": {
+      "from": "now-1h",
+      "to": "now"
+    },
+    "timepicker": {},
+    "templating": {
+      "list": []
+    },
+    "annotations": {
+      "list": []
+    },
+    "refresh": "30s",
+    "schemaVersion": 27,
+    "version": 1
+  }
+}
--- a/infrastructure/monitor/deploy-promtail.yml
+++ b/infrastructure/monitor/deploy-promtail.yml
@@ -0,0 +1,81 @@
+---
+# Deploys Promtail to every inventory host: installs the package, prepares the
+# service account and directories, renders the configuration and starts the unit.
+- name: Deploy Promtail to all nodes
+  hosts: all
+  become: true
+  vars:
+    # NOTE(review): confirm the promtail systemd unit reads this path; the
+    # packaged unit may default to a different file (e.g. /etc/promtail/config.yml).
+    promtail_config_path: /etc/promtail/promtail.yml
+    promtail_data_path: /opt/promtail/data
+
+  tasks:
+    # NOTE(review): install failures are ignored, so a host without the package
+    # only fails later at the service task - confirm this is intended.
+    - name: Install promtail
+      apt:
+        name: promtail
+        state: present
+        update_cache: true
+      ignore_errors: true
+
+    - name: Create promtail user and group
+      user:
+        name: promtail
+        system: true
+        shell: /bin/false
+        home: /opt/promtail
+        create_home: true
+
+    - name: Create promtail data directory
+      file:
+        path: "{{ promtail_data_path }}"
+        state: directory
+        owner: promtail
+        group: promtail
+        mode: '0755'
+
+    # The install step may have failed (errors are ignored above), so do not
+    # rely on the package having created the configuration directory.
+    - name: Create promtail configuration directory
+      file:
+        path: "{{ promtail_config_path | dirname }}"
+        state: directory
+        mode: '0755'
+
+    # The source lives under configs/promtail/ next to this playbook; a bare
+    # file name is only searched in templates/ and the playbook directory.
+    - name: Copy promtail configuration
+      template:
+        src: configs/promtail/promtail-config.yaml
+        dest: "{{ promtail_config_path }}"
+        owner: promtail
+        group: promtail
+        mode: '0644'
+      notify: restart promtail
+
+    # adm grants read access to /var/log/syslog and daemon.log; systemd-journal
+    # grants read access to the journal used by the 'journal' scrape job.
+    # Group changes only apply to new processes, hence the restart.
+    - name: Add promtail user to adm and systemd-journal groups (for log access)
+      user:
+        name: promtail
+        groups:
+          - adm
+          - systemd-journal
+        append: true
+      notify: restart promtail
+
+    - name: Enable and start promtail service
+      systemd:
+        name: promtail
+        enabled: true
+        state: started
+        daemon_reload: true
+
+  handlers:
+    - name: restart promtail
+      systemd:
+        name: promtail
+        state: restarted
--- a/infrastructure/monitor/monitoring-stack.nomad
+++ b/infrastructure/monitor/monitoring-stack.nomad
@@ -0,0 +1,258 @@
+job "monitoring-stack" {
+  datacenters = ["dc1"]
+  type        = "service"
+
+  # Grafana service group
+  group "grafana" {
+    count = 1
+
+    constraint {  # pin this group to the client node named "influxdb"
+      attribute = "${node.unique.name}"
+      operator  = "="
+      value     = "influxdb"
+    }
+
+    volume "grafana-data" {  # NOTE(review): host volume; presumably must be declared in that client's agent config - confirm
+      type      = "host"
+      read_only = false
+      source    = "grafana-data"
+    }
+
+    network {
+      port "http" {
+        static = 3000
+        to     = 3000
+      }
+    }
+
+    task "grafana" {
+      driver = "exec"
+
+      volume_mount {
+        volume      = "grafana-data"
+        destination = "/opt/grafana/data"  # same path as cfg:default.paths.data in args below
+        read_only   = false
+      }
+
+      config {
+        command = "/usr/sbin/grafana-server"
+        args = [
+          "--config", "/etc/grafana/grafana.ini",
+          "--homepath", "/usr/share/grafana",
+          "cfg:default.paths.data=/opt/grafana/data",
+          "cfg:default.paths.logs=/var/log/grafana",
+          "cfg:default.paths.plugins=/var/lib/grafana/plugins",
+          "cfg:default.paths.provisioning=/etc/grafana/provisioning"
+        ]
+      }
+
+      resources {
+        cpu    = 300
+        memory = 512
+      }
+
+      env {
+        GF_SECURITY_ADMIN_PASSWORD = "admin123"  # NOTE(review): plaintext admin password committed to VCS - move to a secret store and rotate
+        GF_INSTALL_PLUGINS = "grafana-piechart-panel"  # NOTE(review): this variable is documented for the Grafana Docker image; confirm it has any effect under the exec driver
+        GF_SERVER_DOMAIN = "grafana.tailnet-68f9.ts.net"
+        GF_SERVER_ROOT_URL = "http://grafana.tailnet-68f9.ts.net:3000"
+      }
+
+      service {
+        name = "grafana"
+        port = "http"
+        
+        tags = [
+          "grafana",
+          "monitoring",
+          "dashboard"
+        ]
+
+        check {  # HTTP health check against /api/health every 30s
+          type     = "http"
+          path     = "/api/health"
+          interval = "30s"
+          timeout  = "5s"
+        }
+      }
+    }
+  }
+
+  # Prometheus service group
+  group "prometheus" {
+    count = 1
+
+    constraint {  # pin this group to the client node named "influxdb"
+      attribute = "${node.unique.name}"
+      operator  = "="
+      value     = "influxdb"
+    }
+
+    volume "prometheus-data" {  # NOTE(review): host volume; presumably must be declared in that client's agent config - confirm
+      type      = "host"
+      read_only = false
+      source    = "prometheus-data"
+    }
+
+    network {
+      port "http" {
+        static = 9090
+        to     = 9090
+      }
+    }
+
+    task "prometheus" {
+      driver = "exec"
+
+      volume_mount {
+        volume      = "prometheus-data"
+        destination = "/opt/prometheus/data"  # same path as --storage.tsdb.path in args below
+        read_only   = false
+      }
+
+      config {
+        command = "prometheus"  # NOTE(review): relative command, unlike the absolute grafana-server path in the grafana group - confirm it resolves for the exec driver
+        args = [
+          "--config.file=/etc/prometheus/prometheus.yml",
+          "--storage.tsdb.path=/opt/prometheus/data",
+          "--web.console.libraries=/usr/share/prometheus/console_libraries",
+          "--web.console.templates=/usr/share/prometheus/consoles",
+          "--storage.tsdb.retention.time=15d",
+          "--web.enable-lifecycle"
+        ]
+      }
+
+      resources {
+        cpu    = 300
+        memory = 512
+      }
+
+      service {
+        name = "prometheus"
+        port = "http"
+        
+        tags = [
+          "prometheus",
+          "monitoring",
+          "metrics"
+        ]
+
+        check {  # HTTP health check against /-/healthy every 30s
+          type     = "http"
+          path     = "/-/healthy"
+          interval = "30s"
+          timeout  = "5s"
+        }
+      }
+    }
+  }
+
+  # Loki service group
+  group "loki" {
+    count = 1
+
+    constraint {  # pin this group to the client node named "influxdb"
+      attribute = "${node.unique.name}"
+      operator  = "="
+      value     = "influxdb"
+    }
+
+    volume "loki-data" {  # NOTE(review): host volume; presumably must be declared in that client's agent config - confirm
+      type      = "host"
+      read_only = false
+      source    = "loki-data"
+    }
+
+    network {
+      port "http" {
+        static = 3100
+        to     = 3100
+      }
+    }
+
+    task "loki" {
+      driver = "exec"
+
+      volume_mount {
+        volume      = "loki-data"
+        destination = "/opt/loki/data"  # the embedded config below keeps all Loki paths under this directory
+        read_only   = false
+      }
+
+      template {  # embedded Loki configuration, written to local/config.yml
+        data = <<EOF
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+
+common:
+  path_prefix: /opt/loki/data
+  storage:
+    filesystem:
+      chunks_directory: /opt/loki/data/chunks
+      rules_directory: /opt/loki/data/rules
+  replication_factor: 1
+  ring:
+    instance_addr: 127.0.0.1
+    kvstore:
+      store: inmemory
+
+query_scheduler:
+  max_outstanding_requests_per_tenant: 2048
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: boltdb-shipper
+      object_store: filesystem
+      schema: v12
+      index:
+        prefix: index_
+        period: 24h
+
+limits_config:
+  allow_structured_metadata: false
+
+ruler:
+  alertmanager_url: http://localhost:9093
+
+analytics:
+  reporting_enabled: false
+EOF
+        destination = "local/config.yml"
+      }
+
+      config {
+        command = "loki"  # NOTE(review): relative command - confirm it resolves for the exec driver
+        args = [
+          "-config.file=local/config.yml"
+        ]
+      }
+
+      resources {
+        cpu    = 300
+        memory = 512
+      }
+
+      service {
+        name = "loki"
+        port = "http"
+        
+        tags = [
+          "loki",
+          "monitoring",
+          "logs"
+        ]
+
+        check {  # HTTP health check against /ready every 30s
+          type     = "http"
+          path     = "/ready"
+          interval = "30s"
+          timeout  = "5s"
+        }
+      }
+    }
+  }
+}
--- a/infrastructure/monitor/prometheus.yml
+++ b/infrastructure/monitor/prometheus.yml
@@ -7,32 +7,81 @@ rule_files:
  # - "second_rules.yml"

 scrape_configs:
+  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

-  - job_name: 'openfaas'
-    static_configs:
-      - targets: ['gateway:8080']
-    metrics_path: /metrics
-    scrape_interval: 15s
-    scrape_timeout: 10s
-
-  - job_name: 'nats'
-    static_configs:
-      - targets: ['nats:8222']
-    metrics_path: /metrics
-    scrape_interval: 15s
-    scrape_timeout: 10s
-
+  # Node Exporter monitoring - all nodes
  - job_name: 'node-exporter'
    static_configs:
-      - targets: ['node-exporter:9100']
-    scrape_interval: 15s
-    scrape_timeout: 10s
+      - targets:
+        - 'semaphore.tailnet-68f9.ts.net:9100'
+        - 'ash1d.tailnet-68f9.ts.net:9100'
+        - 'ash2e.tailnet-68f9.ts.net:9100'
+        - 'ash3c.tailnet-68f9.ts.net:9100'
+        - 'ch2.tailnet-68f9.ts.net:9100'
+        - 'ch3.tailnet-68f9.ts.net:9100'
+        - 'ch4.tailnet-68f9.ts.net:9100'
+        - 'de.tailnet-68f9.ts.net:9100'
+        - 'hcp1.tailnet-68f9.ts.net:9100'
+        - 'influxdb.tailnet-68f9.ts.net:9100'
+        - 'onecloud1.tailnet-68f9.ts.net:9100'
+        - 'warden.tailnet-68f9.ts.net:9100'
+        - 'browser.tailnet-68f9.ts.net:9100'

-  - job_name: 'cadvisor'
+  # Consul monitoring
+  # Consul serves metrics at /v1/agent/metrics, not the default /metrics, and
+  # returns the Prometheus text format only when format=prometheus is requested.
+  - job_name: 'consul'
+    metrics_path: /v1/agent/metrics
+    params:
+      format: ['prometheus']
    static_configs:
-      - targets: ['cadvisor:8080']
-    scrape_interval: 15s
-    scrape_timeout: 10s
+      - targets:
+        - 'ch4.tailnet-68f9.ts.net:8500'
+        - 'ash3c.tailnet-68f9.ts.net:8500'
+        - 'warden.tailnet-68f9.ts.net:8500'
+
+  # Nomad monitoring
+  # Nomad serves metrics at /v1/metrics and likewise needs format=prometheus
+  # to return the Prometheus text format.
+  - job_name: 'nomad'
+    metrics_path: /v1/metrics
+    params:
+      format: ['prometheus']
+    static_configs:
+      - targets:
+        - 'semaphore.tailnet-68f9.ts.net:4646'
+        - 'ash1d.tailnet-68f9.ts.net:4646'
+        - 'ash2e.tailnet-68f9.ts.net:4646'
+        - 'ch2.tailnet-68f9.ts.net:4646'
+        - 'ch3.tailnet-68f9.ts.net:4646'
+        - 'onecloud1.tailnet-68f9.ts.net:4646'
+        - 'de.tailnet-68f9.ts.net:4646'
+
+  # Vault monitoring
+  # Vault serves metrics at /v1/sys/metrics and needs format=prometheus.
+  # NOTE(review): this endpoint normally requires a token unless the listener
+  # allows unauthenticated metrics access - confirm the Vault side is set up.
+  - job_name: 'vault'
+    metrics_path: /v1/sys/metrics
+    params:
+      format: ['prometheus']
+    static_configs:
+      - targets:
+        - 'master.tailnet-68f9.ts.net:8200'
+        - 'ash3c.tailnet-68f9.ts.net:8200'
+        - 'hcp1.tailnet-68f9.ts.net:8200'
+
+  # InfluxDB monitoring
+  # NOTE(review): 'influxdb1' differs from the 'influxdb' host used by the
+  # node-exporter job above - confirm this hostname is correct.
+  - job_name: 'influxdb'
+    static_configs:
+      - targets: ['influxdb1.tailnet-68f9.ts.net:8086']
+
+  # Traefik monitoring
+  - job_name: 'traefik'
+    static_configs:
+      - targets: ['hcp1.tailnet-68f9.ts.net:8080']