feat(监控): 添加Telegraf监控配置和磁盘监控脚本

refactor(容器): 从Docker迁移到Podman并更新Nomad配置

fix(配置): 修复代理和别名配置问题

docs(文档): 更新配置文件和脚本注释

chore(清理): 移除不再使用的Consul和Docker相关文件
This commit is contained in:
2025-09-24 03:46:30 +00:00
parent 3f45ad8361
commit d0e7f64c1d
92 changed files with 3552 additions and 7737 deletions

View File

@@ -0,0 +1,68 @@
# 硬盘监控配置
# 监控所有挂载点的硬盘使用情况
# Disk usage monitoring.
[[inputs.disk]]
## Only the mount points listed here are reported.
mount_points = [
  "/",
  "/var",
  "/tmp",
  "/opt",
  "/home"
]
## Filesystem types that are excluded from collection.
ignore_fs = [
  "tmpfs",
  "devtmpfs",
  "devfs",
  "iso9660",
  "overlay",
  "aufs",
  "squashfs"
]

  ## Tag attached to the metrics of this plugin instance.
  [inputs.disk.tags]
  service = "disk-monitoring"
# Disk I/O monitoring.
[[inputs.diskio]]
## Skip serial-number collection to improve performance.
skip_serial_number = true
## Block devices to report on; the list is explicit, so devices not named here
## are not collected.
devices = [
  "sda",
  "sdb",
  "sdc",
  "sdd",
  "nvme0n1",
  "nvme1n1"
]

  [inputs.diskio.tags]
  service = "disk-io-monitoring"
# Filesystem inode monitoring.
# NOTE(review): this is a second inputs.disk instance. It has no mount_points
# filter and sets no inode-specific option; only its tag differs from the
# disk-usage instance. Confirm it is not simply collecting the same fields twice.
[[inputs.disk]]
## Filesystem types that are excluded from collection.
ignore_fs = [
  "tmpfs",
  "devtmpfs",
  "devfs",
  "iso9660",
  "overlay",
  "aufs",
  "squashfs"
]

  [inputs.disk.tags]
  service = "inode-monitoring"
# Process monitoring (optional): watches processes that may use a lot of disk.
# One procstat instance per process family, each selected by `pattern` and
# labelled with its own `service` tag.

## Docker processes (if present).
[[inputs.procstat]]
pattern = "docker"

  [inputs.procstat.tags]
  service = "docker-process"

## Podman processes.
[[inputs.procstat]]
pattern = "podman"

  [inputs.procstat.tags]
  service = "podman-process"

## Nomad processes.
[[inputs.procstat]]
pattern = "nomad"

  [inputs.procstat.tags]
  service = "nomad-process"
# Log file size monitoring.
[[inputs.filestat]]
## Files to stat: the Nomad entry is a glob, the others are fixed paths.
files = [
  "/var/log/nomad/*.log",
  "/var/log/syslog",
  "/var/log/kern.log",
  "/var/log/auth.log"
]

  [inputs.filestat.tags]
  service = "log-monitoring"

View File

@@ -0,0 +1,68 @@
# 系统监控配置
# CPU、内存、网络等系统资源监控
# CPU monitoring.
[[inputs.cpu]]
## Report the aggregate figure across all CPUs.
totalcpu = true
## Also report each CPU core individually.
percpu = true
## Active-CPU reporting is switched off.
report_active = false
## CPU time collection is switched off.
collect_cpu_time = false

  [inputs.cpu.tags]
  service = "cpu-monitoring"
# Memory monitoring (no plugin options set; only a tag is attached).
[[inputs.mem]]

  [inputs.mem.tags]
  service = "memory-monitoring"

# Network interface monitoring.
[[inputs.net]]
## Interface name patterns to collect.
interfaces = [
  "eth*",
  "en*",
  "tailscale*"
]

  [inputs.net.tags]
  service = "network-monitoring"
# The four plugins below set no options of their own; each instance only
# attaches a `service` tag to its metrics.

# System load monitoring.
[[inputs.system]]

  [inputs.system.tags]
  service = "system-load"

# Kernel statistics.
[[inputs.kernel]]

  [inputs.kernel.tags]
  service = "kernel-stats"

# Network statistics.
[[inputs.netstat]]

  [inputs.netstat.tags]
  service = "network-stats"

# Swap monitoring.
[[inputs.swap]]

  [inputs.swap.tags]
  service = "swap-monitoring"
# Service status monitoring.
[[inputs.systemd_units]]
## Units to report on. The systemd_units plugin has no `units` array option,
## and Telegraf refuses to load a plugin section containing a key it does not
## recognise, so the selection is given through `pattern`: one string holding
## a space-separated list of unit names (wildcards are allowed).
pattern = "nomad.service docker.service podman.service telegraf.service tailscaled.service"

  [inputs.systemd_units.tags]
  service = "service-monitoring"
# Disk health monitoring (if SMART is supported).
[[inputs.smart]]
## Timeout setting.
timeout = "30s"
## Path to the smartctl binary.
path_smartctl = "/usr/sbin/smartctl"

  [inputs.smart.tags]
  service = "smart-monitoring"

View File

@@ -0,0 +1,7 @@
# Telegraf environment variables.
# InfluxDB 2.x endpoint and credentials. Every value is a template placeholder
# that is substituted when this file is rendered.
INFLUX_URL={{ influxdb_url }}
INFLUX_ORG={{ influxdb_org }}
INFLUX_BUCKET={{ influxdb_bucket }}
INFLUX_TOKEN={{ influxdb_token }}

View File

@@ -0,0 +1,53 @@
# Telegraf main configuration file.
# Disk monitoring configuration for the Nomad cluster.

# Global settings: tags defined in this table.
[global_tags]
## Templated from the inventory host name.
hostname = "{{ inventory_hostname }}"
## Templated from nomad_role; falls back to 'unknown' when it is not defined.
node_role = "{{ nomad_role | default('unknown') }}"
nomad_cluster = "production"
# Agent configuration.
[agent]
## Collection interval in seconds; templated, falling back to 30 when
## collection_interval is not defined.
interval = "{{ collection_interval | default(30) }}s"
round_interval = true
metric_batch_size = 1000
metric_buffer_limit = 10000
collection_jitter = "2s"
flush_interval = "10s"
flush_jitter = "0s"
## Left empty so Telegraf picks its default timestamp precision.
precision = ""
hostname = "{{ inventory_hostname }}"
omit_hostname = false
## Logging. Telegraf has no top-level [log] table: an unrecognised top-level
## table is treated as an input plugin name and makes config loading fail, so
## the logging options belong here in [agent].
## Log error-level messages only.
quiet = true
## Empty means "write to stderr": no local log file is created, which saves
## disk space. When Telegraf runs as a systemd service, stderr is picked up by
## the journal. (Telegraf has no "syslog" log target and no `logrotate` key.)
logfile = ""

# Output: InfluxDB 2.x.
[[outputs.influxdb_v2]]
urls = ["{{ influxdb_url }}"]
token = "{{ influxdb_token }}"
organization = "{{ influxdb_org | default('nomad') }}"
bucket = "{{ influxdb_bucket | default('nomad_monitoring') }}"
## Connection timeout.
## `max_retries`, `retry_timeout` and `precision` are not options of the
## influxdb_v2 output, and Telegraf rejects plugin sections containing unknown
## keys, so they are not set here. Timestamp precision is controlled by
## `precision` in [agent].
timeout = "10s"
## TLS settings (if needed).
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
# insecure_skip_verify = false

View File

@@ -0,0 +1,29 @@
[Unit]
Description=Telegraf - 节点监控服务
Documentation=https://github.com/influxdata/telegraf
# ExecStart hands a templated telegraf_config_url to --config. Assuming that is
# a remote URL, the configuration has to be fetched at start-up, so order the
# service after the network is actually usable (network-online.target) rather
# than after network.target, which does not guarantee connectivity.
Wants=network-online.target
After=network-online.target

[Service]
Type=notify
User=telegraf
Group=telegraf
# Variables from this file are exported into the Telegraf process environment.
EnvironmentFile=/etc/default/telegraf
ExecStart=/usr/bin/telegraf --config {{ telegraf_config_url }}
# SIGHUP asks the running process to reload.
ExecReload=/bin/kill -HUP $MAINPID
KillMode=control-group
Restart=on-failure
RestartSec=5
TimeoutStopSec=20

# Security hardening.
# NOTE(review): NoNewPrivileges=true prevents privilege escalation through
# sudo; confirm no input plugin (e.g. SMART collection) depends on it.
NoNewPrivileges=true
PrivateTmp=true
# The file system is read-only for this service except for ReadWritePaths.
ProtectSystem=strict
ReadWritePaths=/var/lib/telegraf
# NOTE(review): ProtectHome=true hides /home from this service; confirm that
# is compatible with any disk monitoring of a /home mount point.
ProtectHome=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true

[Install]
WantedBy=multi-user.target