feat: 重构项目目录结构并添加多个功能

- 新增脚本和配置文件用于管理Nomad节点和NFS存储
- 添加多个Ansible playbook用于配置和调试Nomad集群
- 新增Nomad job文件用于测试Podman和NFS功能
- 重构playbooks目录结构,按功能分类
- 更新Nomad客户端和服务端配置模板
- 添加SSH密钥分发和配置脚本
- 新增多个调试和修复问题的playbook
This commit is contained in:
Houzhong Xu 2025-09-27 13:05:30 +00:00
parent a06e5e1a00
commit 44b098bd20
No known key found for this signature in database
GPG Key ID: B44BEB1438F1B46F
98 changed files with 1141 additions and 2 deletions

View File

@ -0,0 +1,20 @@
[nomad_nodes]
# Local node (NFS is already mounted through PVE)
semaphore ansible_host=100.116.158.95 ansible_user=root
# Cloud server nodes (the NFS mount still has to be configured on these)
# NOTE(review): the entries below carry ansible_password and
# ansible_become_password in plain text. Consider moving them to Ansible Vault
# or switching to key-based authentication before this file is shared.
ash1d.global ansible_host=100.81.26.3 ansible_user=ben ansible_password=3131 ansible_become_password=3131
ash2e.global ansible_host=100.103.147.94 ansible_user=ben ansible_password=3131 ansible_become_password=3131
ch2.global ansible_host=100.90.159.68 ansible_user=ben ansible_password=3131 ansible_become_password=3131
ch3.global ansible_host=100.86.141.112 ansible_user=ben ansible_password=3131 ansible_become_password=3131
# master is reached on a non-default SSH port (ansible_port=60022).
master ansible_host=100.117.106.136 ansible_user=ben ansible_password=3131 ansible_become_password=3131 ansible_port=60022
ash3c ansible_host=100.116.80.94 ansible_user=ben ansible_password=3131 ansible_become_password=3131
[nomad_nodes:vars]
# NFS settings shared by every host in the group
nfs_server=snail
nfs_share=/fs/1000/nfs/Fnsync
mount_point=/mnt/fnsync
# Ansible connection settings
# NOTE(review): StrictHostKeyChecking=no turns off SSH host key verification
# for every host in this group.
ansible_ssh_common_args='-o StrictHostKeyChecking=no'

View File

@ -0,0 +1,45 @@
# NFS CSI volume specification for the "nfs-fnsync" volume.
type        = "csi"
id          = "nfs-fnsync"
name        = "nfs-fnsync"
external_id = "nfs-fnsync"

# CSI plugin that serves this volume, plus the requested capacity range.
plugin_id    = "nfs"
capacity_min = "1GiB"
capacity_max = "100GiB"

# Access and attachment modes are declared inside a capability block; the
# top-level access_mode / attachment_mode attributes are the legacy form.
capability {
  access_mode     = "single-node-writer"
  attachment_mode = "file-system"
}

# Mount options handed to the plugin when the volume is staged.
mount_options {
  fs_type     = "nfs4"
  mount_flags = ["rw", "relatime", "vers=4.2"]
}

# Topology constraints: dc1 is mandatory, rack-1 is only a preference.
topology_request {
  preferred {
    topology {
      segments = {
        "rack" = "rack-1"
      }
    }
  }

  required {
    topology {
      segments = {
        "datacenter" = "dc1"
      }
    }
  }
}

# Plugin-specific parameters: NFS server and exported path.
parameters {
  server = "snail"
  share  = "/fs/1000/nfs/Fnsync"
}

View File

@ -0,0 +1,82 @@
---
# Mounts the shared NFS export on every host, choosing mount options by
# container type (lxc / pve) and location (local / overseas), then creates the
# per-host Nomad directories on the share.
- name: Setup NFS for different container types
  hosts: all
  become: true
  vars:
    nfs_server: snail
    nfs_export_path: /fs/1000/nfs/Fnsync
    nfs_mount_path: /mnt/fnsync
    nfs_options_local: "rw,sync,vers=4.2"
    nfs_options_overseas: "rw,sync,vers=3,timeo=600,retrans=2"
  tasks:
    - name: Detect container type and location
      set_fact:
        # groups.get() keeps the play working when the inventory defines no
        # [lxc] group; every host is then classified as 'pve'.
        container_type: "{{ 'lxc' if inventory_hostname in groups.get('lxc', []) else 'pve' }}"
        is_overseas: "{{ inventory_hostname in ['ash1d', 'ash2e', 'ash3c', 'ch2', 'ch3'] }}"

    - name: Install NFS client for all nodes
      package:
        name: nfs-common
        state: present

    - name: Create mount directory for all nodes
      file:
        path: "{{ nfs_mount_path }}"
        state: directory
        owner: root
        group: root
        mode: "0755"

    - name: Mount NFS for local LXC containers (direct mount)
      mount:
        path: "{{ nfs_mount_path }}"
        src: "{{ nfs_server }}:{{ nfs_export_path }}"
        fstype: nfs
        opts: "{{ nfs_options_local }}"
        state: mounted
      when: container_type == 'lxc' and not is_overseas

    - name: Mount NFS for overseas PVE containers (with retry options)
      mount:
        path: "{{ nfs_mount_path }}"
        src: "{{ nfs_server }}:{{ nfs_export_path }}"
        fstype: nfs
        opts: "{{ nfs_options_overseas }}"
        state: mounted
      when: container_type == 'pve' and is_overseas

    # Hosts that matched neither mount task above still get an fstab entry.
    - name: Ensure NFS mount persists after reboot
      mount:
        path: "{{ nfs_mount_path }}"
        src: "{{ nfs_server }}:{{ nfs_export_path }}"
        fstype: nfs
        opts: "{{ nfs_options_local if container_type == 'lxc' and not is_overseas else nfs_options_overseas }}"
        state: present

    # `df -h <dir>` succeeds for any existing directory, mounted or not, so
    # the check uses mountpoint(1), whose exit code reflects the mount state.
    - name: Verify NFS mount
      command: mountpoint -q "{{ nfs_mount_path }}"
      register: mount_result
      changed_when: false
      failed_when: false

    - name: Display mount status
      debug:
        msg: "{{ inventory_hostname }} - {{ container_type }} - {{ '海外' if is_overseas else '本地' }} - Mount: {{ '成功' if mount_result.rc == 0 else '失败' }}"

    # Directory tasks run only when the share is mounted, so nothing is
    # created on the local disk underneath the mount point.
    - name: Create Nomad directories for LXC containers
      file:
        path: "{{ nfs_mount_path }}/nomad/{{ inventory_hostname }}"
        state: directory
        owner: nomad
        group: nomad
        mode: "0755"
      when: container_type == 'lxc' and mount_result.rc == 0

    - name: Create shared volumes directory for PVE containers
      file:
        path: "{{ nfs_mount_path }}/nomad/volumes/{{ inventory_hostname }}"
        state: directory
        owner: nomad
        group: nomad
        mode: "0755"
      when: container_type == 'pve' and mount_result.rc == 0

View File

@ -4,8 +4,8 @@
gather_facts: false
vars:
nfs_server: snail
nfs_export_path: /fs/1000/nfs
nfs_mount_path: /opt/consul-shared
nfs_export_path: /fs/1000/nfs/Fnsync
nfs_mount_path: /mnt/fnsync
tasks:
- name: Install NFS client and mount on master

View File

@ -0,0 +1,50 @@
---
# Adds a podman volume setting and an "nfs-shared" host volume to the Nomad
# client configuration, restarting the agent only when the file changed.
- name: Configure Nomad client for NFS volumes
  hosts: nomad_clients
  become: true
  vars:
    nfs_mount_path: /mnt/fnsync
  tasks:
    - name: Create Nomad plugin directory for NFS
      file:
        path: /opt/nomad/plugins
        state: directory
        owner: nomad
        group: nomad
        mode: "0755"

    - name: Configure Nomad client to use NFS volumes
      blockinfile:
        path: /etc/nomad.d/nomad.hcl
        marker: "# {mark} NFS VOLUME CONFIGURATION"
        insertafter: 'data_dir = "/opt/nomad/data"'
        block: |
          plugin "nomad-driver-podman" {
            config {
              volumes {
                enabled = true
              }
            }
          }
          client {
            host_volume "nfs-shared" {
              path = "{{ nfs_mount_path }}/nomad/volumes"
              read_only = false
            }
          }
      register: nomad_nfs_config

    # Restarting on every run would interrupt the agent for no reason; only
    # restart when the managed block was actually added or modified.
    - name: Restart Nomad service to apply changes
      systemd:
        name: nomad
        state: restarted
      when: nomad_nfs_config is changed

    # Read-only probe: never reports "changed", and a failure is tolerated so
    # the status message below is still printed.
    - name: Verify Nomad client configuration
      command: nomad node status -self
      register: nomad_status
      changed_when: false
      ignore_errors: true

    - name: Display Nomad status
      debug:
        msg: "{{ inventory_hostname }} - Nomad status: {{ '运行中' if nomad_status.rc == 0 else '异常' }}"

View File

@ -0,0 +1,63 @@
---
# Mounts the shared NFS export on every Nomad cluster node and creates the
# Nomad directories on it.
- name: Setup NFS Storage for Nomad Cluster
  hosts: nomad_cluster
  become: true
  vars:
    nfs_server: snail
    nfs_export_path: /fs/1000/nfs/Fnsync
    nfs_mount_path: /mnt/fnsync
    nfs_options: "rw,sync,vers=4.2"
  tasks:
    - name: Install NFS client packages
      package:
        name: nfs-common
        state: present

    - name: Create NFS mount directory
      file:
        path: "{{ nfs_mount_path }}"
        state: directory
        owner: root
        group: root
        mode: "0755"

    # state: mounted both mounts the share now and records it in /etc/fstab,
    # so no separate "persist after reboot" task is needed.
    - name: Mount NFS share
      mount:
        path: "{{ nfs_mount_path }}"
        src: "{{ nfs_server }}:{{ nfs_export_path }}"
        fstype: nfs
        opts: "{{ nfs_options }}"
        state: mounted

    # Read-only command; mark it as never changing anything.
    - name: Verify NFS mount
      command: df -h "{{ nfs_mount_path }}"
      register: mount_result
      changed_when: false

    - name: Display mount result
      debug:
        var: mount_result.stdout

    - name: Create Nomad data directories on NFS
      file:
        path: "{{ nfs_mount_path }}/nomad/{{ inventory_hostname }}"
        state: directory
        owner: nomad
        group: nomad
        mode: "0755"

    - name: Create shared volumes directory
      file:
        path: "{{ nfs_mount_path }}/nomad/volumes"
        state: directory
        owner: nomad
        group: nomad
        mode: "0755"

View File

@ -0,0 +1,39 @@
# Nomad agent configuration template (client-only node).
# The {{ ansible_host }} placeholders are filled in by Ansible at render time.
datacenter = "dc1"
data_dir   = "/opt/nomad/data"
log_level  = "INFO"

# Use the Tailscale network, but bind to the local interfaces.
bind_addr = "0.0.0.0"

# Tailscale address used for communication.
addresses {
  http = "{{ ansible_host }}"
  rpc  = "{{ ansible_host }}"
  serf = "{{ ansible_host }}"
}

advertise {
  http = "{{ ansible_host }}:4646"
  rpc  = "{{ ansible_host }}:4647"
  serf = "{{ ansible_host }}:4648"
}

# This node does not act as a server.
server {
  enabled = false
}

# Client mode, with support for mixed storage.
client {
  enabled = true
  servers = ["100.116.158.95:4647"]

  # Host volume exposed to jobs.
  host_volume "fnsync" {
    path      = "/mnt/fnsync"
    read_only = false
  }
}

consul {
  address = "100.116.158.95:8500"
}

146
docs/nomad-nfs-setup.md Normal file
View File

@ -0,0 +1,146 @@
# Nomad集群NFS配置指南
## 概述
本文档介绍如何为Nomad集群配置NFS存储,以支持不同类型的容器和不同地理位置的节点。
## 容器类型分类
### 1. 本地LXC容器
- **位置**: 本地网络环境
- **节点示例**: influxdb, warden, hcp1, hcp2
- **特点**: 直接使用已映射的NFS目录
- **NFS参数**: `rw,sync,vers=4.2`
### 2. 海外PVE容器
- **位置**: 海外云服务器
- **节点示例**: ash1d, ash2e, ash3c, ch2, ch3
- **特点**: 需要网络优化参数
- **NFS参数**: `rw,sync,vers=3,timeo=600,retrans=2`
## NFS配置详情
### NFS服务器信息
- **服务器**: snail
- **导出路径**: `/fs/1000/nfs/Fnsync`
- **挂载点**: `/mnt/fnsync`
### 当前挂载状态
```bash
# 检查当前挂载
df -h | grep fnsync
# 输出: snail:/fs/1000/nfs/Fnsync 8.2T 2.2T 6.0T 27% /mnt/fnsync
```
## 部署步骤
### 1. 自动部署
```bash
chmod +x scripts/deploy-nfs-for-nomad.sh
./scripts/deploy-nfs-for-nomad.sh
```
### 2. 手动分步部署
```bash
# 步骤1: 配置NFS挂载
ansible-playbook -i configuration/inventories/production/inventory.ini \
playbooks/setup-nfs-by-container-type.yml
# 步骤2: 配置Nomad客户端
ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \
playbooks/setup-nomad-nfs-client.yml
```
## Nomad作业配置
### 使用NFS卷的Nomad作业示例
```hcl
job "nfs-example" {
  # volume and task blocks must live inside a group; they are not valid
  # directly under job.
  group "app" {
    # Claim the "nfs-shared" host volume for this group.
    volume "nfs-shared" {
      type      = "host"
      source    = "nfs-shared"
      read_only = false
    }

    task "app" {
      # Expose the claimed volume inside the task at /shared.
      volume_mount {
        volume      = "nfs-shared"
        destination = "/shared"
        read_only   = false
      }
    }
  }
}
```
### 针对不同容器类型的约束
```hcl
# 本地LXC容器约束
constraint {
attribute = "${attr.unique.hostname}"
operator = "regexp"
value = "(influxdb|warden|hcp1|hcp2)"
}
# 海外PVE容器约束
constraint {
attribute = "${attr.unique.hostname}"
operator = "regexp"
value = "(ash1d|ash2e|ash3c|ch2|ch3)"
}
```
## 验证和监控
### 验证命令
```bash
# 检查NFS挂载
ansible all -i configuration/inventories/production/inventory.ini \
-m shell -a "df -h /mnt/fnsync"
# 检查Nomad状态
nomad node status
# 检查NFS任务状态
nomad job status nfs-multi-type-example
```
### 监控指标
- NFS挂载状态
- 网络延迟(海外节点)
- 存储使用情况
- Nomad任务运行状态
## 故障排除
### 常见问题
1. **NFS挂载失败**
- 检查网络连通性: `ping snail`
- 验证NFS服务: `showmount -e snail`
- 检查防火墙设置
2. **海外节点连接慢**
- 使用NFSv3协议
- 增加超时参数
- 考虑使用缓存方案
3. **Nomad卷无法挂载**
- 检查Nomad客户端配置
- 验证目录权限
- 检查Nomad服务状态
## 最佳实践
1. **数据备份**: 定期备份NFS上的重要数据
2. **监控告警**: 设置NFS挂载状态监控
3. **容量规划**: 监控存储使用情况
4. **网络优化**: 为海外节点配置合适的网络参数
## 相关文件
- `playbooks/setup-nfs-by-container-type.yml` - NFS挂载配置
- `playbooks/setup-nomad-nfs-client.yml` - Nomad客户端配置
- `jobs/nomad-nfs-multi-type.nomad` - 示例Nomad作业
- `scripts/deploy-nfs-for-nomad.sh` - 部署脚本

View File

@ -0,0 +1,59 @@
#!/bin/bash
# Checks remote nodes for apt temp-file permission problems and attempts a fix.
# For every node: test SSH reachability, show /tmp permissions and disk usage,
# try to create a temp file (resetting /tmp to mode 1777 on failure), clean the
# apt cache and run a short `apt update`.

# Nodes to inspect.
NODES=('ash2e' 'ash1d' 'ch2')

for NODE in "${NODES[@]}"; do
    # printf is used wherever a newline escape is needed: plain `echo "\n"`
    # prints the two characters literally in bash.
    printf '\n===== 检查节点: %s =====\n' "$NODE"

    # Skip nodes that cannot be reached non-interactively.
    if ! ssh -q -o BatchMode=yes -o ConnectTimeout=5 "root@$NODE" "echo Connected > /dev/null"; then
        echo "错误: 无法连接到节点 $NODE"
        continue
    fi

    echo "成功连接到节点 $NODE"

    # 1. /tmp permissions
    echo -n "检查/tmp目录权限: "
    ssh "root@$NODE" "ls -ld /tmp"

    # 2. Disk space
    echo "检查磁盘空间:"
    ssh "root@$NODE" "df -h"

    # 3. Temp-file creation test. $$ expands locally, so the same file name is
    #    used for both touch and rm on the remote side.
    echo -n "测试创建临时文件: "
    if ssh "root@$NODE" "touch /tmp/test-apt-temp-$$ 2>/dev/null && echo 成功 && rm -f /tmp/test-apt-temp-$$"; then
        echo "临时文件创建成功"
    else
        echo "错误: 无法创建临时文件"
        # Restore the standard sticky, world-writable mode on /tmp.
        echo "尝试修复/tmp目录权限..."
        ssh "root@$NODE" "chmod 1777 /tmp"
    fi

    # 4. Clean the apt cache
    echo "清理apt缓存..."
    ssh "root@$NODE" "apt clean"

    # 5. Trial apt update, truncated to the first 10 lines
    echo "测试apt update (仅输出前10行)..."
    ssh "root@$NODE" "apt update 2>&1 | head -n 10"

    printf '\n节点 %s 检查完成\n\n' "$NODE"
done

# Additional repair suggestions.
printf '\n===== 修复建议 =====\n'
echo "1. 如果问题仍然存在,请检查以下内容:"
echo " - /etc/apt/apt.conf文件中的配置"
echo " - apt-key命令的完整性 (dpkg -l apt)"
echo " - 系统的临时文件清理服务状态"
echo "2. 可以尝试的命令:"
echo " - dpkg --configure -a"
echo " - apt-get install --reinstall apt apt-utils"
echo "3. 对于特定节点的持久问题,建议登录到该节点进行更详细的故障排除。"

65
jobs/hybrid-nfs-app.nomad Normal file
View File

@ -0,0 +1,65 @@
# Small web app pinned to the semaphore node. It serves a generated
# index.html from the task's local/fnsync directory on static port 8080.
job "hybrid-nfs-app" {
  datacenters = ["dc1"]
  type        = "service"

  # Constraint used to tell the storage types apart: run on semaphore only.
  constraint {
    attribute = "${attr.unique.hostname}"
    operator  = "regexp"
    value     = "semaphore"
  }

  group "app" {
    count = 1

    network {
      port "http" {
        static = 8080
      }
    }

    # Host volume for the local semaphore node.
    # NOTE(review): this volume is claimed but no task mounts it, and the
    # client must define a host_volume named "local-fnsync" - confirm.
    volume "local-storage" {
      type      = "host"
      read_only = false
      source    = "local-fnsync"
    }

    task "web-app" {
      driver = "exec"

      config {
        command = "python3"
        args    = ["-m", "http.server", "8080", "--directory", "local/fnsync"]
      }

      # Rendered page. The template language has no `now`/`date` functions,
      # so the current time comes from `timestamp` with a Go layout string.
      template {
        data = <<EOH
<h1>Hybrid NFS App - Running on {{ env "attr.unique.hostname" }}</h1>
<p>Storage Type: {{ with eq (env "attr.unique.hostname") "semaphore" }}PVE Mount{{ else }}NFS{{ end }}</p>
<p>Timestamp: {{ timestamp "2006-01-02 15:04:05" }}</p>
EOH
        destination = "local/fnsync/index.html"
      }

      resources {
        cpu    = 100
        memory = 128
      }

      service {
        name = "hybrid-nfs-app"
        port = "http"
        tags = ["hybrid", "nfs", "web"]

        check {
          type     = "http"
          path     = "/"
          interval = "10s"
          timeout  = "2s"
        }
      }
    }
  }
}

View File

@ -0,0 +1,51 @@
# nginx served from the NFS-backed host volume.
job "nfs-app-example" {
  datacenters = ["dc1"]
  type        = "service"

  group "app" {
    count = 1

    # The "http" label used by the task and service must be declared here;
    # it maps a dynamic host port to nginx's port 80 in the container.
    network {
      port "http" {
        to = 80
      }
    }

    # NFS-backed storage volume.
    # NOTE(review): the client configs in this commit define host volumes
    # "fnsync" and "nfs-shared"; confirm a host volume "nfs-fnsync" exists.
    volume "nfs-storage" {
      type      = "host"
      read_only = false
      source    = "nfs-fnsync"
    }

    task "web-app" {
      driver = "docker"

      config {
        image = "nginx:alpine"
        ports = ["http"]
      }

      # Mount the group volume into the container. A docker `mount` of type
      # "volume" would create an unrelated Docker named volume instead.
      volume_mount {
        volume      = "nfs-storage"
        destination = "/usr/share/nginx/html"
        read_only   = false
      }

      resources {
        cpu    = 100
        memory = 128
      }

      service {
        name = "nfs-web-app"
        port = "http"
        tags = ["nfs", "web"]

        check {
          type     = "http"
          path     = "/"
          interval = "10s"
          timeout  = "2s"
        }
      }
    }
  }
}

View File

@ -0,0 +1,34 @@
# Batch job that writes a marker file to the "nfs-fnsync" CSI volume and
# lists the volume's contents.
job "nfs-storage-test" {
  datacenters = ["dc1"]
  type        = "batch"

  group "test" {
    count = 1

    # CSI volume requests must state the access and attachment mode; the
    # values match the ones declared in the volume specification.
    volume "nfs-storage" {
      type            = "csi"
      read_only       = false
      source          = "nfs-fnsync"
      access_mode     = "single-node-writer"
      attachment_mode = "file-system"
    }

    task "storage-test" {
      driver = "exec"

      volume_mount {
        volume      = "nfs-storage"
        destination = "/mnt/nfs"
        read_only   = false
      }

      config {
        command = "/bin/sh"
        # The message is double-quoted for the shell so that $(hostname) and
        # $(date) are expanded; inside single quotes they were written out
        # literally.
        args = ["-c", "echo \"NFS Storage Test - $(hostname) - $(date)\" > /mnt/nfs/test-$(hostname).txt && ls -la /mnt/nfs/"]
      }

      resources {
        cpu    = 50
        memory = 64
      }
    }
  }
}

View File

@ -0,0 +1,84 @@
# Example job with one group per node class, both mounting the "nfs-shared"
# host volume.
job "nfs-multi-type-example" {
  datacenters = ["dc1"]
  type        = "service"

  # Task group for the local LXC containers.
  group "lxc-apps" {
    count = 2

    constraint {
      attribute = "${attr.unique.hostname}"
      operator  = "regexp"
      value     = "(influxdb|hcp)"
    }

    volume "lxc-nfs" {
      type      = "host"
      source    = "nfs-shared"
      read_only = false
    }

    task "lxc-app" {
      driver = "podman"

      config {
        image = "alpine:latest"
        args  = ["tail", "-f", "/dev/null"]
      }

      volume_mount {
        volume      = "lxc-nfs"
        destination = "/shared/lxc"
        read_only   = false
      }

      resources {
        cpu    = 100
        memory = 64
      }
    }
  }

  # Task group for the overseas PVE containers.
  group "pve-apps" {
    count = 3

    constraint {
      attribute = "${attr.unique.hostname}"
      operator  = "regexp"
      value     = "(ash1d|ash2e|ash3c|ch2|ch3)"
    }

    volume "pve-nfs" {
      type      = "host"
      source    = "nfs-shared"
      read_only = false
    }

    task "pve-app" {
      driver = "podman"

      config {
        image = "alpine:latest"
        args  = ["tail", "-f", "/dev/null"]

        # Network tuning for the overseas nodes.
        network_mode = "host"
      }

      volume_mount {
        volume      = "pve-nfs"
        destination = "/shared/pve"
        read_only   = false
      }

      # The former `network { mbits = 5 }` block is dropped: task-level
      # network resources and mbits are deprecated in Nomad.
      resources {
        cpu    = 100
        memory = 64
      }
    }
  }
}

View File

@ -0,0 +1,34 @@
# Minimal example: one podman task with the "nfs-shared" host volume mounted
# at /shared.
job "nfs-volume-example" {
  type        = "service"
  datacenters = ["dc1"]

  group "nfs-app" {
    count = 1

    # Claim the host volume for the group.
    volume "nfs-shared" {
      source    = "nfs-shared"
      type      = "host"
      read_only = false
    }

    task "app" {
      driver = "podman"

      # Make the claimed volume visible inside the container.
      volume_mount {
        destination = "/shared"
        volume      = "nfs-shared"
        read_only   = false
      }

      config {
        image = "alpine:latest"
        args  = ["tail", "-f", "/dev/null"]
      }

      resources {
        memory = 64
        cpu    = 100
      }
    }
  }
}

4
list_playbooks.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# Lists every playbook (*.yml) file, one per line.
# Usage: list_playbooks.sh [PLAYBOOK_DIR]
# PLAYBOOK_DIR defaults to /root/mgmt/configuration/playbooks.
PLAYBOOK_DIR="${1:-/root/mgmt/configuration/playbooks}"
ls -1 "$PLAYBOOK_DIR"/*.yml

View File

@ -0,0 +1,72 @@
---
# Writes the Nomad client configuration on every nomad_nodes host except
# semaphore, makes sure the agent is enabled and running, and prints its
# systemd status.
- name: 配置Nomad客户端节点
  hosts: 'nomad_nodes:!semaphore'
  become: true
  vars:
    nomad_config_dir: /etc/nomad.d
  tasks:
    - name: 创建Nomad配置目录
      file:
        path: "{{ nomad_config_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0755"

    - name: 复制Nomad客户端配置
      copy:
        content: |
          datacenter = "dc1"
          data_dir = "/opt/nomad/data"
          log_level = "INFO"
          bind_addr = "0.0.0.0"
          server {
            enabled = false
          }
          client {
            enabled = true
            servers = ["100.116.158.95:4647"]
            host_volume "fnsync" {
              path = "/mnt/fnsync"
              read_only = false
            }
          }
          addresses {
            http = "{{ ansible_host }}"
            rpc = "{{ ansible_host }}"
            serf = "{{ ansible_host }}"
          }
          advertise {
            http = "{{ ansible_host }}:4646"
            rpc = "{{ ansible_host }}:4647"
            serf = "{{ ansible_host }}:4648"
          }
          consul {
            address = "100.116.158.95:8500"
          }
        dest: "{{ nomad_config_dir }}/nomad.hcl"
        owner: root
        group: root
        mode: "0644"
      register: nomad_client_config

    # Restart only when the configuration file changed; otherwise just make
    # sure the service is started, so repeated runs do not bounce the agent.
    - name: 启动Nomad服务
      systemd:
        name: nomad
        state: "{{ 'restarted' if nomad_client_config is changed else 'started' }}"
        enabled: true
        daemon_reload: true

    - name: 检查Nomad服务状态
      command: systemctl status nomad
      register: nomad_status
      changed_when: false

    - name: 显示Nomad服务状态
      debug:
        var: nomad_status.stdout_lines

42
playbooks/nfs-mount.yml Normal file
View File

@ -0,0 +1,42 @@
---
# Installs the NFS client, mounts the shared export on every Nomad node and
# prints the resulting mount information.
- name: 配置Nomad节点NFS挂载
  hosts: nomad_nodes
  become: true
  vars:
    nfs_server: "snail"
    nfs_share: "/fs/1000/nfs/Fnsync"
    mount_point: "/mnt/fnsync"
  tasks:
    - name: 安装NFS客户端
      package:
        name: nfs-common
        state: present

    - name: 创建挂载目录
      file:
        path: "{{ mount_point }}"
        state: directory
        mode: "0755"

    # state: mounted mounts the share and also writes the /etc/fstab entry,
    # so a separate lineinfile edit of fstab is not needed.
    - name: 挂载NFS共享并配置开机自动挂载
      mount:
        path: "{{ mount_point }}"
        src: "{{ nfs_server }}:{{ nfs_share }}"
        fstype: nfs4
        opts: "rw,relatime,vers=4.2"
        state: mounted

    # Read-only check; the path is quoted and the task never reports changed.
    - name: 验证挂载
      command: df -h "{{ mount_point }}"
      register: mount_check
      changed_when: false

    - name: 显示挂载信息
      debug:
        var: mount_check.stdout_lines

View File

@ -0,0 +1,43 @@
---
# Installs the NFS client, mounts the shared export on every Nomad node and
# prints the resulting mount information.
- name: 设置Nomad节点NFS挂载
  hosts: nomad_nodes
  become: true
  vars:
    nfs_server: "snail"
    nfs_share: "/fs/1000/nfs/Fnsync"
    mount_point: "/mnt/fnsync"
  tasks:
    - name: 安装NFS客户端
      package:
        name: nfs-common
        state: present

    - name: 创建挂载目录
      file:
        path: "{{ mount_point }}"
        state: directory
        mode: "0755"

    # state: mounted mounts the share and also writes the /etc/fstab entry,
    # so a separate lineinfile edit of fstab is not needed.
    - name: 挂载NFS共享并配置开机自动挂载
      mount:
        path: "{{ mount_point }}"
        src: "{{ nfs_server }}:{{ nfs_share }}"
        fstype: nfs4
        opts: "rw,relatime,vers=4.2"
        state: mounted

    # Read-only check; the path is quoted and the task never reports changed.
    - name: 验证挂载
      command: df -h "{{ mount_point }}"
      register: mount_check
      changed_when: false

    - name: 显示挂载信息
      debug:
        var: mount_check.stdout_lines

69
scripts/deploy-nfs-for-nomad.sh Executable file
View File

@ -0,0 +1,69 @@
#!/bin/bash
# Deployment script for the Nomad cluster NFS configuration.
# Runs the NFS and Nomad client playbooks, verifies the result on all nodes
# and optionally submits the example NFS job.
# Must be started from the mgmt project root.
set -e

echo "🚀 开始部署Nomad集群NFS配置..."

# Colour definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Helpers that print a coloured, level-tagged message.
log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }

# Refuse to run outside the project root.
if [ ! -f "configuration/inventories/production/inventory.ini" ]; then
    log_error "请在mgmt项目根目录运行此脚本"
    exit 1
fi

# 1. Configure the NFS mount on all nodes
log_info "步骤1: 为所有节点配置NFS挂载 (根据容器类型和地理位置)"
ansible-playbook -i configuration/inventories/production/inventory.ini \
    playbooks/setup-nfs-by-container-type.yml

# 2. Configure NFS volume support on the Nomad clients
log_info "步骤2: 配置Nomad客户端支持NFS卷"
ansible-playbook -i configuration/inventories/production/nomad-cluster.ini \
    playbooks/setup-nomad-nfs-client.yml

# 3. Verify the NFS mount on all nodes.
#    `df -h <dir>` succeeds for any existing directory, so mountpoint(1)
#    decides whether the share is really mounted.
log_info "步骤3: 验证所有节点的NFS挂载状态"
ansible all -i configuration/inventories/production/inventory.ini \
    -m shell -a "if mountpoint -q /mnt/fnsync; then df -h /mnt/fnsync; else echo 'NFS未挂载'; fi" \
    --limit '!snail'

# 4. Verify the Nomad client configuration
log_info "步骤4: 验证Nomad客户端配置"
ansible nomad_clients -i configuration/inventories/production/nomad-cluster.ini \
    -m shell -a "nomad node status -self 2>/dev/null || echo 'Nomad未运行'"

# 5. Optionally deploy the example NFS job (-r keeps backslashes literal)
read -r -p "是否部署示例NFS任务(y/n): " deploy_example
if [ "$deploy_example" = "y" ] || [ "$deploy_example" = "Y" ]; then
    log_info "部署示例NFS任务..."
    nomad run jobs/nomad-nfs-multi-type.nomad
    echo "等待任务启动..."
    sleep 10
    nomad job status nfs-multi-type-example
fi

log_info "✅ NFS配置部署完成!"
echo ""
echo "📋 使用说明:"
echo "1. NFS挂载点: /mnt/fnsync"
echo "2. 本地LXC容器: 直接使用挂载目录"
echo "3. 海外PVE容器: 使用优化参数挂载"
echo "4. Nomad作业: 使用host volume 'nfs-shared'"
echo ""
echo "🔧 手动验证命令:"
echo " - 检查NFS挂载: df -h /mnt/fnsync"
echo " - 检查Nomad状态: nomad node status"
echo " - 运行NFS任务: nomad run jobs/nomad-nfs-multi-type.nomad"

View File

@ -0,0 +1,29 @@
#!/bin/bash
# Distributes the local SSH public key to root's authorized_keys on every
# Nomad node. Safe to re-run: host keys and the public key are only added
# when they are not present yet.

echo "分发SSH公钥到Nomad节点..."

# Node list, one "host[:port]" entry per node; the port defaults to 22.
NODES=(
    "100.81.26.3"           # ash1d.global
    "100.103.147.94"        # ash2e.global
    "100.90.159.68"         # ch2.global
    "100.86.141.112"        # ch3.global
    "100.117.106.136:60022" # master (the inventory sets ansible_port=60022)
    "100.116.80.94"         # ash3c
)

# Stop early when the key file is missing instead of distributing an empty key.
PUB_KEY_FILE=/home/ben/.ssh/id_ed25519.pub
if [ ! -r "$PUB_KEY_FILE" ]; then
    echo "错误: 无法读取公钥文件 $PUB_KEY_FILE" >&2
    exit 1
fi
PUB_KEY=$(cat "$PUB_KEY_FILE")

for ENTRY in "${NODES[@]}"; do
    # Split "host[:port]".
    NODE=${ENTRY%%:*}
    PORT=22
    if [[ "$ENTRY" == *:* ]]; then
        PORT=${ENTRY##*:}
    fi

    echo "正在配置节点: $NODE"

    # known_hosts stores non-default ports as "[host]:port".
    if [ "$PORT" = "22" ]; then
        KNOWN_NAME="$NODE"
    else
        KNOWN_NAME="[$NODE]:$PORT"
    fi

    # Record the host key only once.
    if ! ssh-keygen -F "$KNOWN_NAME" > /dev/null 2>&1; then
        ssh-keyscan -H -p "$PORT" "$NODE" >> ~/.ssh/known_hosts 2>/dev/null
    fi

    # Use the existing authentication method to add the key, skipping the
    # append when the exact key line is already present.
    ssh -p "$PORT" "root@$NODE" \
        "mkdir -p /root/.ssh && chmod 700 /root/.ssh && touch /root/.ssh/authorized_keys && { grep -qxF '$PUB_KEY' /root/.ssh/authorized_keys || echo '$PUB_KEY' >> /root/.ssh/authorized_keys; }" 2>/dev/null && \
        echo "$NODE 配置成功" || echo "$NODE 配置失败"
done

echo "密钥分发完成"

View File

@ -0,0 +1,22 @@
---
# Installs one SSH public key for root on every Nomad node and then checks
# that Ansible can still reach the host.
- name: 设置Nomad节点SSH密钥认证
  become: true
  hosts: nomad_nodes
  vars:
    # Folded scalar: resolves to the same single-line key string.
    ssh_public_key: >-
      ssh-ed25519
      AAAAC3NzaC1lZDI1NTE5AAAAIIIddJVPEvFRtzhWwYjr21lKTar+d7R5Kn/6bhd2s231
      ben@ch2
  tasks:
    - name: 确保.ssh目录存在
      file:
        state: directory
        path: /root/.ssh
        mode: "0700"

    - name: 添加SSH公钥到authorized_keys
      authorized_key:
        key: "{{ ssh_public_key }}"
        user: root
        state: present

    - name: 测试SSH连接
      ping:

86
scripts/verify-nfs-config.sh Executable file
View File

@ -0,0 +1,86 @@
#!/bin/bash
# Verification script for the NFS configuration.
# Checks the local NFS mount, the presence of the project files, the Ansible
# inventory, the Nomad agent and the reachability of the NFS server. Each
# problem is reported and the script carries on with the next check.
set -e

echo "🔍 验证NFS配置状态..."

# Colour definitions
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m'

# Helpers that print a coloured, level-tagged message.
log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }

# 1. Local NFS mount. mountpoint(1) tests the exact path; grepping df output
#    would also match other paths that merely contain "/mnt/fnsync".
log_info "1. 检查本地NFS挂载状态"
if mountpoint -q /mnt/fnsync; then
    log_info "✅ 本地NFS挂载正常"
    df -h /mnt/fnsync
else
    log_error "❌ 本地NFS未挂载"
fi

# 2. Required project files
log_info "2. 检查配置文件"
config_files=(
    "playbooks/setup-nfs-by-container-type.yml"
    "playbooks/setup-nomad-nfs-client.yml"
    "jobs/nomad-nfs-multi-type.nomad"
    "scripts/deploy-nfs-for-nomad.sh"
    "docs/nomad-nfs-setup.md"
)
for file in "${config_files[@]}"; do
    if [ -f "$file" ]; then
        log_info "$file 存在"
    else
        log_error "$file 不存在"
    fi
done

# 3. Ansible inventory
log_info "3. 检查Ansible配置"
if [ -f "configuration/inventories/production/inventory.ini" ]; then
    log_info "✅ inventory.ini 存在"
    echo "节点分类:"
    # `|| true`: under `set -e` a pipeline without matches must not abort.
    grep -E "\[.*\]" configuration/inventories/production/inventory.ini | head -10 || true
else
    log_error "❌ inventory.ini 不存在"
fi

# 4. Nomad agent
log_info "4. 检查Nomad服务"
if command -v nomad &> /dev/null; then
    if nomad node status &> /dev/null; then
        log_info "✅ Nomad服务运行正常"
        # `|| true`: a failing lookup or an empty grep must not end the script.
        nomad node status -self | grep -E "(Name|Status|Datacenter)" || true
    else
        log_warn "⚠️ Nomad服务未运行或无法连接"
    fi
else
    log_warn "⚠️ Nomad命令未安装"
fi

# 5. NFS server reachability
log_info "5. 检查NFS服务器连通性"
if ping -c 1 -W 3 snail &> /dev/null; then
    log_info "✅ NFS服务器 snail 可达"
    if command -v showmount &> /dev/null; then
        showmount -e snail 2>/dev/null || log_warn "⚠️ 无法获取NFS导出列表"
    fi
else
    log_error "❌ NFS服务器 snail 不可达"
fi

echo ""
echo "📊 验证完成!"
echo ""
echo "🚀 下一步操作:"
echo "1. 运行部署脚本: ./scripts/deploy-nfs-for-nomad.sh"
echo "2. 查看详细文档: cat docs/nomad-nfs-setup.md"
echo "3. 测试NFS功能: nomad run jobs/nomad-nfs-multi-type.nomad"