feat: 重构项目目录结构并添加多个功能

- 新增脚本和配置文件用于管理Nomad节点和NFS存储
- 添加多个Ansible playbook用于配置和调试Nomad集群
- 新增Nomad job文件用于测试Podman和NFS功能
- 重构playbooks目录结构,按功能分类
- 更新Nomad客户端和服务端配置模板
- 添加SSH密钥分发和配置脚本
- 新增多个调试和修复问题的playbook
This commit is contained in:
2025-09-27 13:05:30 +00:00
parent a06e5e1a00
commit 44b098bd20
98 changed files with 1141 additions and 2 deletions

View File

@@ -0,0 +1,187 @@
---
# Installs Telegraf on every inventory host, renders its configuration
# (disk + system monitoring) and reports to an existing InfluxDB 2.x server.
#
# Inputs:
#   influxdb_token       required, no default; supply via extra vars,
#                        inventory or vault.
#   influxdb_url/org/bucket, telegraf_config_url, use_remote_config
#                        optional; the values under `vars` are defaults that
#                        can be overridden with extra vars (`-e name=value`).
- name: 部署 Telegraf 硬盘监控到 Nomad 集群
  hosts: all
  become: true
  vars:
    # Existing InfluxDB 2.x + Grafana monitoring stack to connect to.
    # These are plain defaults on purpose: a play var that templates itself
    # ("{{ influxdb_url | default(...) }}") fails with a recursive-template
    # error whenever the value is not supplied through extra vars.
    influxdb_url: "http://influxdb1.tailnet-68f9.ts.net:8086"
    influxdb_org: "nomad"
    influxdb_bucket: "nomad_monitoring"
    # Remote Telegraf configuration mode (preferred).
    # NOTE(review): use_remote_config is not referenced by any task in this
    # play; presumably the templates read it - confirm.
    use_remote_config: true
    telegraf_config_url: ""
    # Disk usage thresholds, in percent.
    disk_usage_warning: 80   # warning level
    disk_usage_critical: 90  # critical level
    # Collection interval, in seconds.
    collection_interval: 30

  tasks:
    # influxdb_token deliberately has no default; fail early with a clear
    # message instead of failing later with an undefined-variable error.
    - name: 检查必需变量 influxdb_token
      assert:
        that:
          - influxdb_token | default('') | length > 0
        fail_msg: "缺少 influxdb_token,请通过 -e 或 inventory/vault 提供"
        quiet: true

    - name: 显示正在处理的节点
      debug:
        msg: "🔧 正在为节点 {{ inventory_hostname }} 安装硬盘监控"

    # `retries` only takes effect on every Ansible version when paired with
    # `until`, hence the register/until pairs on the network-bound tasks.
    - name: 添加 InfluxData 仓库密钥
      apt_key:
        url: https://repos.influxdata.com/influxdata-archive_compat.key
        state: present
      register: influxdata_key_result
      until: influxdata_key_result is succeeded
      retries: 3
      delay: 5

    # Uses the gathered distribution codename; the repo path assumes Ubuntu.
    - name: 添加 InfluxData 仓库
      apt_repository:
        repo: "deb https://repos.influxdata.com/ubuntu {{ ansible_distribution_release }} stable"
        state: present
        update_cache: true
      register: influxdata_repo_result
      until: influxdata_repo_result is succeeded
      retries: 3
      delay: 5

    - name: 安装 Telegraf
      apt:
        name: telegraf
        state: present
        update_cache: true
      register: telegraf_install_result
      until: telegraf_install_result is succeeded
      retries: 3
      delay: 10

    - name: 创建 Telegraf 配置目录
      file:
        path: /etc/telegraf/telegraf.d
        state: directory
        owner: telegraf
        group: telegraf
        mode: '0755'

    # Best-effort cleanup to save disk space. This single task also covers
    # the removal of /var/log/telegraf that used to be repeated in a second,
    # identical task.
    - name: 清理旧的 Telegraf 日志文件(节省硬盘空间)
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /var/log/telegraf
        - /var/log/telegraf.log
      ignore_errors: true

    # Root-only (0600) because the rendered file is expected to hold the
    # InfluxDB credentials - TODO confirm against telegraf-env.j2.
    - name: 创建 Telegraf 环境变量文件
      template:
        src: telegraf-env.j2
        dest: /etc/default/telegraf
        owner: root
        group: root
        mode: '0600'
        backup: true
      notify: restart telegraf

    # Remote-config mode: only rendered when a config URL is set.
    - name: 创建 Telegraf systemd 服务文件(支持远程配置)
      template:
        src: telegraf.service.j2
        dest: /etc/systemd/system/telegraf.service
        owner: root
        group: root
        mode: '0644'
        backup: true
      notify:
        - reload systemd
        - restart telegraf
      when: telegraf_config_url is defined and telegraf_config_url != ''

    # Local-config mode: the exact complement of the condition above.
    - name: 生成 Telegraf 主配置文件(本地配置模式)
      template:
        src: telegraf.conf.j2
        dest: /etc/telegraf/telegraf.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: true
      notify: restart telegraf
      when: telegraf_config_url is not defined or telegraf_config_url == ''

    - name: 生成硬盘监控配置
      template:
        src: disk-monitoring.conf.j2
        dest: /etc/telegraf/telegraf.d/disk-monitoring.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: true
      notify: restart telegraf

    - name: 生成系统监控配置
      template:
        src: system-monitoring.conf.j2
        dest: /etc/telegraf/telegraf.d/system-monitoring.conf
        owner: telegraf
        group: telegraf
        mode: '0644'
        backup: true
      notify: restart telegraf

    - name: 启用并启动 Telegraf 服务
      systemd:
        name: telegraf
        state: started
        enabled: true
        daemon_reload: true

    # Run the notified reload/restart handlers now; otherwise the status
    # reported below would describe the service before the new configuration
    # was applied.
    - name: 立即应用配置变更
      meta: flush_handlers

    # Calling systemd with only a name changes nothing and returns the unit
    # status.
    - name: 验证 Telegraf 状态
      systemd:
        name: telegraf
      register: telegraf_status

    # Runs once, from the control node. `become: false` keeps the play-level
    # privilege escalation from being attempted on the controller, and
    # `status_code: 204` accepts the healthy reply of the /ping endpoint
    # (the module default of 200 would flag it as a failure).
    - name: 检查 InfluxDB 连接
      uri:
        url: "{{ influxdb_url }}/ping"
        method: GET
        timeout: 5
        status_code: 204
      register: influxdb_ping
      ignore_errors: true
      delegate_to: localhost
      become: false
      run_once: true

    - name: 显示 InfluxDB 连接状态
      debug:
        msg: "{{ '✅ InfluxDB 连接正常' if influxdb_ping.status | default(-1) == 204 else '❌ InfluxDB 连接失败,请检查配置' }}"
      run_once: true

    - name: 显示 Telegraf 状态
      debug:
        msg: "✅ Telegraf 状态: {{ telegraf_status.status.ActiveState }}"

    # Read-only report of per-filesystem usage against disk_usage_warning.
    # `df -P` keeps one line per filesystem; rows whose usage column is not a
    # plain number (for example "-") are skipped so the numeric test cannot
    # error out.
    - name: 检查硬盘使用情况
      shell: |
        df -hP | grep -vE '^Filesystem|tmpfs|cdrom|udev' \
          | awk '{gsub(/%/, "", $5); print $5, $1, $6}' \
          | while read -r usage partition mount; do
              case "$usage" in
                ''|*[!0-9]*) continue ;;
              esac
              if [ "$usage" -ge {{ disk_usage_warning }} ]; then
                echo "⚠️ 警告: $mount ($partition) 使用率 $usage%"
              else
                echo "✅ $mount ($partition) 使用率 $usage%"
              fi
            done
      register: disk_check
      changed_when: false

    - name: 显示硬盘检查结果
      debug:
        msg: "{{ disk_check.stdout_lines }}"

  handlers:
    # Defined before the restart handler so a changed unit file is reloaded
    # before the service restarts (handlers run in definition order).
    - name: reload systemd
      systemd:
        daemon_reload: true

    - name: restart telegraf
      systemd:
        name: telegraf
        state: restarted

View File

@@ -0,0 +1,76 @@
---
# Installs the software a new Nomad server node needs on host `influxdb1`:
# Podman tooling from apt and the Nomad binary from releases.hashicorp.com.
# Nomad is only downloaded when no `nomad` executable is found on PATH.
#
# Optional overrides (extra vars): nomad_version, nomad_arch.
- name: 安装并配置新的 Nomad Server 节点
  hosts: influxdb1
  become: true
  gather_facts: false
  vars:
    # Release to install; quoted so the version is never parsed as a number.
    nomad_version: "1.10.5"
    # Fact gathering is disabled, so the architecture cannot be detected;
    # override for non-amd64 hosts.
    nomad_arch: "amd64"
    nomad_release_url: "https://releases.hashicorp.com/nomad/{{ nomad_version }}"

  tasks:
    # `retries` only takes effect on every Ansible version when paired with
    # `until`, hence the register/until pairs on the apt tasks.
    - name: 更新包缓存
      apt:
        update_cache: true
        cache_valid_time: 3600
      register: apt_cache_result
      until: apt_cache_result is succeeded
      retries: 3
      delay: 10

    - name: 安装依赖包
      apt:
        name:
          - wget
          - curl
          - unzip
          - podman
          - buildah
          - skopeo
        state: present
      register: apt_deps_result
      until: apt_deps_result is succeeded
      retries: 3
      delay: 10

    # `command -v` is a shell builtin, so this works on hosts without the
    # `which` utility. Prints the path, or the sentinel "not_found".
    - name: 检查 Nomad 是否已安装
      shell: command -v nomad || echo "not_found"
      register: nomad_check
      changed_when: false

    - name: 下载并安装 Nomad
      when: nomad_check.stdout == "not_found"
      block:
        # The download is verified against the SHA256SUMS file published
        # next to the release archive.
        - name: "下载 Nomad {{ nomad_version }}"
          get_url:
            url: "{{ nomad_release_url }}/nomad_{{ nomad_version }}_linux_{{ nomad_arch }}.zip"
            dest: "/tmp/nomad.zip"
            mode: '0644'
            checksum: "sha256:{{ nomad_release_url }}/nomad_{{ nomad_version }}_SHA256SUMS"

        - name: 解压 Nomad
          unarchive:
            src: "/tmp/nomad.zip"
            dest: "/usr/bin/"
            remote_src: true
            owner: root
            group: root
            mode: '0755'

        - name: 清理临时文件
          file:
            path: "/tmp/nomad.zip"
            state: absent

    # Read-only check; fails the play if the binary is missing or broken.
    - name: 验证 Nomad 安装
      command: nomad version
      register: nomad_version_output
      changed_when: false

    # Fact gathering is off, so the package list must be collected
    # explicitly; without it the Podman version below is never available.
    - name: 收集已安装的软件包信息
      package_facts:
        manager: auto

    - name: 显示安装结果
      debug:
        msg: |
          ✅ 节点 {{ inventory_hostname }} 软件安装完成
          📦 Podman: {{ ansible_facts.packages.podman[0].version if ansible_facts.packages.podman is defined else 'checking...' }}
          🎯 Nomad: {{ nomad_version_output.stdout_lines[0] }}

    # Best effort: a failure to start the socket does not abort the play.
    - name: 启用 Podman socket
      systemd:
        name: podman.socket
        enabled: true
        state: started
      ignore_errors: true

    - name: 继续完整配置
      debug:
        msg: "软件安装完成,现在将运行完整的 Nomad 配置..."