feat(监控): 添加Telegraf监控配置和磁盘监控脚本
refactor(容器): 从Docker迁移到Podman并更新Nomad配置 fix(配置): 修复代理和别名配置问题 docs(文档): 更新配置文件和脚本注释 chore(清理): 移除不再使用的Consul和Docker相关文件
This commit is contained in:
68
configuration/templates/disk-monitoring.conf.j2
Normal file
68
configuration/templates/disk-monitoring.conf.j2
Normal file
@@ -0,0 +1,68 @@
|
||||
# 硬盘监控配置
|
||||
# 监控所有挂载点的硬盘使用情况
|
||||
|
||||
# 硬盘使用率监控
|
||||
[[inputs.disk]]
|
||||
## 忽略的文件系统类型
|
||||
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
|
||||
|
||||
## Only the mount points listed below are reported; remove mount_points to monitor every mounted filesystem
|
||||
mount_points = ["/", "/var", "/tmp", "/opt", "/home"]
|
||||
|
||||
## 标签配置
|
||||
[inputs.disk.tags]
|
||||
service = "disk-monitoring"
|
||||
|
||||
# 硬盘 I/O 监控
|
||||
[[inputs.diskio]]
|
||||
## Only the devices listed below are reported; remove devices to collect I/O stats for every block device
|
||||
devices = ["sda", "sdb", "sdc", "sdd", "nvme0n1", "nvme1n1"]
|
||||
|
||||
## 跳过序列号收集以提高性能
|
||||
skip_serial_number = true
|
||||
|
||||
[inputs.diskio.tags]
|
||||
service = "disk-io-monitoring"
|
||||
|
||||
# 文件系统 inode 监控
|
||||
[[inputs.disk]]
|
||||
## 监控 inode 使用情况
|
||||
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
|
||||
|
||||
## No extra option is needed for inodes: the disk input always reports inodes_total, inodes_used and inodes_free
|
||||
[inputs.disk.tags]
|
||||
service = "inode-monitoring"
|
||||
|
||||
# 进程监控(可选,用于监控可能占用大量硬盘的进程)
|
||||
[[inputs.procstat]]
|
||||
## 监控 Docker 进程(如果存在)
|
||||
pattern = "docker"
|
||||
|
||||
[inputs.procstat.tags]
|
||||
service = "docker-process"
|
||||
|
||||
[[inputs.procstat]]
|
||||
## 监控 Podman 进程
|
||||
pattern = "podman"
|
||||
|
||||
[inputs.procstat.tags]
|
||||
service = "podman-process"
|
||||
|
||||
[[inputs.procstat]]
|
||||
## 监控 Nomad 进程
|
||||
pattern = "nomad"
|
||||
|
||||
[inputs.procstat.tags]
|
||||
service = "nomad-process"
|
||||
|
||||
# 日志文件大小监控
|
||||
[[inputs.filestat]]
|
||||
files = [
|
||||
"/var/log/nomad/*.log",
|
||||
"/var/log/syslog",
|
||||
"/var/log/kern.log",
|
||||
"/var/log/auth.log"
|
||||
]
|
||||
|
||||
[inputs.filestat.tags]
|
||||
service = "log-monitoring"
|
||||
68
configuration/templates/system-monitoring.conf.j2
Normal file
68
configuration/templates/system-monitoring.conf.j2
Normal file
@@ -0,0 +1,68 @@
|
||||
# 系统监控配置
|
||||
# CPU、内存、网络等系统资源监控
|
||||
|
||||
# CPU 监控
|
||||
[[inputs.cpu]]
|
||||
## 是否收集每个 CPU 核心的信息
|
||||
percpu = true
|
||||
## 是否收集总 CPU 信息
|
||||
totalcpu = true
|
||||
## 收集字段
|
||||
collect_cpu_time = false
|
||||
## 报告活跃的 CPU
|
||||
report_active = false
|
||||
|
||||
[inputs.cpu.tags]
|
||||
service = "cpu-monitoring"
|
||||
|
||||
# 内存监控
|
||||
[[inputs.mem]]
|
||||
[inputs.mem.tags]
|
||||
service = "memory-monitoring"
|
||||
|
||||
# 网络接口监控
|
||||
[[inputs.net]]
|
||||
## 接口配置
|
||||
interfaces = ["eth*", "en*", "tailscale*"]
|
||||
|
||||
[inputs.net.tags]
|
||||
service = "network-monitoring"
|
||||
|
||||
# 系统负载监控
|
||||
[[inputs.system]]
|
||||
[inputs.system.tags]
|
||||
service = "system-load"
|
||||
|
||||
# 内核统计
|
||||
[[inputs.kernel]]
|
||||
[inputs.kernel.tags]
|
||||
service = "kernel-stats"
|
||||
|
||||
# 网络统计
|
||||
[[inputs.netstat]]
|
||||
[inputs.netstat.tags]
|
||||
service = "network-stats"
|
||||
|
||||
# 交换分区监控
|
||||
[[inputs.swap]]
|
||||
[inputs.swap.tags]
|
||||
service = "swap-monitoring"
|
||||
|
||||
# 服务状态监控
|
||||
[[inputs.systemd_units]]
|
||||
## 监控的服务
|
||||
## systemd_units has no "units" option; units are selected with "pattern",
## a space-separated list of unit names or globs.
pattern = "nomad.service docker.service podman.service telegraf.service tailscaled.service"
|
||||
|
||||
[inputs.systemd_units.tags]
|
||||
service = "service-monitoring"
|
||||
|
||||
# 硬盘健康状态监控(如果支持 SMART)
|
||||
[[inputs.smart]]
|
||||
## SMART 监控路径
|
||||
path_smartctl = "/usr/sbin/smartctl"
|
||||
|
||||
## 超时设置
|
||||
timeout = "30s"
|
||||
|
||||
[inputs.smart.tags]
|
||||
service = "smart-monitoring"
|
||||
7
configuration/templates/telegraf-env.j2
Normal file
7
configuration/templates/telegraf-env.j2
Normal file
@@ -0,0 +1,7 @@
|
||||
# Telegraf 环境变量配置
|
||||
# InfluxDB 2.x 认证信息
|
||||
|
||||
INFLUX_TOKEN={{ influxdb_token }}
|
||||
# Same fallbacks as telegraf.conf.j2, so both templates render identical values
# (and this one does not fail) when the variables are not defined.
INFLUX_ORG={{ influxdb_org | default('nomad') }}
INFLUX_BUCKET={{ influxdb_bucket | default('nomad_monitoring') }}
|
||||
INFLUX_URL={{ influxdb_url }}
|
||||
53
configuration/templates/telegraf.conf.j2
Normal file
53
configuration/templates/telegraf.conf.j2
Normal file
@@ -0,0 +1,53 @@
|
||||
# Telegraf 主配置文件
|
||||
# Nomad 集群硬盘监控配置
|
||||
|
||||
# 全局设置
|
||||
[global_tags]
|
||||
nomad_cluster = "production"
|
||||
node_role = "{{ nomad_role | default('unknown') }}"
|
||||
hostname = "{{ inventory_hostname }}"
|
||||
|
||||
# Agent 配置
|
||||
[agent]
|
||||
interval = "{{ collection_interval | default(30) }}s"
|
||||
round_interval = true
|
||||
metric_batch_size = 1000
|
||||
metric_buffer_limit = 10000
|
||||
collection_jitter = "2s"
|
||||
flush_interval = "10s"
|
||||
flush_jitter = "0s"
|
||||
precision = ""
|
||||
hostname = "{{ inventory_hostname }}"
|
||||
omit_hostname = false
## Log only error-level messages. Logging is an [agent] concern in Telegraf;
## there is no separate logging table.
quiet = true
|
||||
|
||||
# 输出配置 - InfluxDB 2.x
|
||||
[[outputs.influxdb_v2]]
|
||||
urls = ["{{ influxdb_url }}"]
|
||||
## Resolved by Telegraf from the environment at startup so the secret is not
## written into this file. Assumes the INFLUX_TOKEN line from telegraf-env.j2 is
## deployed to the unit's EnvironmentFile -- confirm before rollout.
token = "${INFLUX_TOKEN}"
|
||||
organization = "{{ influxdb_org | default('nomad') }}"
|
||||
bucket = "{{ influxdb_bucket | default('nomad_monitoring') }}"
|
||||
|
||||
## Connection settings
timeout = "10s"

## max_retries, retry_timeout and precision are not influxdb_v2 options; Telegraf
## refuses to load a plugin section containing unknown fields. Failed writes stay
## in the agent buffer (metric_buffer_limit) and are retried on the next flush,
## and timestamp precision is controlled by the [agent] "precision" setting.
|
||||
|
||||
## TLS 配置(如果需要)
|
||||
# tls_ca = "/etc/telegraf/ca.pem"
|
||||
# tls_cert = "/etc/telegraf/cert.pem"
|
||||
# tls_key = "/etc/telegraf/key.pem"
|
||||
# insecure_skip_verify = false
|
||||
|
||||
# Logging
# Telegraf has no top-level [log] table: an unknown top-level table is treated as
# a legacy input name and the config fails to load. "level", "logrotate" and
# logtarget = "syslog" are not valid agent settings either.
# Logging is configured in [agent]: set quiet = true there for error-only output.
# With no logfile configured Telegraf writes to stderr, which systemd captures in
# the journal, so no local log file is created and nothing needs rotating.
|
||||
29
configuration/templates/telegraf.service.j2
Normal file
29
configuration/templates/telegraf.service.j2
Normal file
@@ -0,0 +1,29 @@
|
||||
[Unit]
|
||||
Description=Telegraf - 节点监控服务
|
||||
Documentation=https://github.com/influxdata/telegraf
|
||||
# network.target only orders shutdown; network-online.target waits for
# connectivity. ExecStart passes telegraf_config_url to --config, presumably a
# remote location that must be reachable at startup.
Wants=network-online.target
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=notify
|
||||
User=telegraf
|
||||
Group=telegraf
|
||||
ExecStart=/usr/bin/telegraf --config {{ telegraf_config_url }}
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=control-group
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
TimeoutStopSec=20
|
||||
EnvironmentFile=/etc/default/telegraf
|
||||
|
||||
# 安全配置
|
||||
NoNewPrivileges=true
|
||||
PrivateTmp=true
|
||||
ProtectSystem=strict
|
||||
ProtectHome=true
|
||||
ReadWritePaths=/var/lib/telegraf
|
||||
ProtectKernelTunables=true
|
||||
ProtectKernelModules=true
|
||||
ProtectControlGroups=true
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
Reference in New Issue
Block a user