diff --git a/ansible/playbooks/consul-persistent-storage.yml b/ansible/playbooks/consul-persistent-storage.yml
new file mode 100644
index 0000000..26c5f57
--- /dev/null
+++ b/ansible/playbooks/consul-persistent-storage.yml
@@ -0,0 +1,149 @@
+---
+# Prepares the Nomad clients that run Consul for persistent storage: creates
+# the data directory and declares a "consul-data" host volume in the Nomad
+# client configuration, restarting Nomad only when that file is changed.
+- name: Configure Consul Persistent Storage
+  hosts: ch4,ash3c,warden
+  become: true
+  gather_facts: true
+
+  vars:
+    consul_data_dir: "/opt/consul/data"
+    nomad_config_file: "/etc/nomad.d/nomad.hcl"
+
+  tasks:
+    - name: Create consul data directory
+      file:
+        path: "{{ consul_data_dir }}"
+        state: directory
+        owner: nomad
+        group: nomad
+        mode: '0755'
+
+    # Read-only probe: check_mode stops lineinfile from deleting anything, and
+    # the registered `found` count says whether the volume is already declared.
+    - name: Check if consul-data host volume already configured
+      lineinfile:
+        path: "{{ nomad_config_file }}"
+        regexp: 'host_volume\s+"consul-data"'
+        state: absent
+      check_mode: true
+      register: consul_volume_check
+      changed_when: false
+
+    # Snapshot the config only when it is about to be modified.
+    - name: Backup existing nomad configuration
+      copy:
+        src: "{{ nomad_config_file }}"
+        dest: "{{ nomad_config_file }}.backup.{{ ansible_date_time.epoch }}"
+        remote_src: true
+      when: consul_volume_check.found | default(0) | int == 0
+
+    # NOTE(review): this appends a second top-level client block; confirm the
+    # installed Nomad version merges it with the existing one.
+    - name: Add consul-data host volume configuration
+      blockinfile:
+        path: "{{ nomad_config_file }}"
+        marker: "# {mark} CONSUL PERSISTENT STORAGE"
+        block: |
+
+          # Consul 持久化存储
+          client {
+            host_volume "consul-data" {
+              path      = "{{ consul_data_dir }}"
+              read_only = false
+            }
+          }
+        insertafter: EOF
+      when: consul_volume_check.found | default(0) | int == 0
+      notify: restart nomad
+
+    - name: Verify consul data directory permissions
+      file:
+        path: "{{ consul_data_dir }}"
+        owner: nomad
+        group: nomad
+        mode: '0755'
+        state: directory
+
+    - name: Display consul data directory info
+      stat:
+        path: "{{ consul_data_dir }}"
+      register: consul_dir_stat
+
+    - name: Show directory information
+      debug:
+        msg: |
+          Consul data directory: {{ consul_data_dir }}
+          Owner: {{ consul_dir_stat.stat.pw_name }}:{{ consul_dir_stat.stat.gr_name }}
+          Permissions: {{ consul_dir_stat.stat.mode }}
+
+  handlers:
+    - name: restart nomad
+      systemd:
+        name: nomad
+        state: restarted
+        daemon_reload: true
+
+    # Also fires on "restart nomad" (via listen) and runs after the restart
+    # handler because handlers execute in the order they are defined.
+    - name: wait for nomad
+      wait_for:
+        port: 4646
+        host: "{{ ansible_host }}"
+        delay: 5
+        timeout: 30
+      listen: restart nomad
+
+# Reports the Nomad unit state and probes the HTTP API on every node.
+- name: Verify Nomad client status
+  hosts: ch4,ash3c,warden
+  become: true
+  gather_facts: false
+
+  tasks:
+    - name: Check nomad service status
+      systemd:
+        name: nomad
+      register: nomad_status
+
+    - name: Display nomad status
+      debug:
+        msg: |
+          Node: {{ inventory_hostname }}
+          Nomad Status: {{ nomad_status.status.ActiveState }}
+
+    - name: Verify nomad client connectivity
+      uri:
+        url: "http://{{ ansible_host }}:4646/v1/status/leader"
+        method: GET
+        timeout: 10
+      register: nomad_api_check
+      ignore_errors: true
+
+    - name: Show connectivity result
+      debug:
+        msg: |
+          Node: {{ inventory_hostname }}
+          API Check: {{ 'SUCCESS' if nomad_api_check.status == 200 else 'FAILED' }}
+
+# Prints the follow-up steps once, on localhost.
+- name: Summary Report
+  hosts: localhost
+  gather_facts: false
+  run_once: true
+
+  tasks:
+    - name: Display completion summary
+      debug:
+        msg: |
+          ✅ Consul 持久化存储配置完成!
+
+          已配置节点: ch4, ash3c, warden
+          数据目录: /opt/consul/data
+          权限: nomad:nomad (755)
+
+          下一步:
+          1. 部署持久化 Consul job
+          2. 验证集群状态
+          3. 恢复 KV 数据
diff --git a/docs/CONSUL_PERSISTENCE_FIX.md b/docs/CONSUL_PERSISTENCE_FIX.md
new file mode 100644
index 0000000..fed0852
--- /dev/null
+++ b/docs/CONSUL_PERSISTENCE_FIX.md
@@ -0,0 +1,107 @@
+# Consul 持久化存储修复方案
+
+## 🚨 问题诊断
+
+**根本原因:你的 Consul 集群确实没有配置持久化存储!**
+
+### 当前问题:
+1. **数据目录** `/opt/nomad/data/consul` 只是容器内临时目录
+2. **没有 volume 挂载** - 重启后数据完全丢失
+3. **缺少 onecloud1 节点** - 配置与实际运行状态不一致
+
+### 影响:
+- ✅ **Consul 服务发现正常** - 这部分数据在内存中
+- ❌ **KV 存储数据丢失** - 所有配置、tokens、证书都没了
+- ❌ **ACL 配置丢失** - 权限设置重置
+- ❌ **服务配置丢失** - 注册的服务元数据丢失
+
+## 🔧 修复方案
+
+### 第一步:配置持久化存储
+
+**在每个 Consul 节点上运行:**
+```bash
+# 在 ch4, ash3c, warden 节点上分别执行
+./scripts/setup-consul-persistent-storage.sh
+```
+
+**这个脚本会:**
+1. 创建 `/opt/consul/data` 目录
+2. 设置正确的权限 (nomad:nomad)
+3. 在 Nomad 配置中添加 host volume
+4. 重启 Nomad 客户端
+
+### 第二步:部署持久化 Consul
+
+**停止当前 job:**
+```bash
+nomad job stop consul-cluster-nomad
+```
+
+**部署新配置:**
+```bash
+nomad job run infrastructure/nomad/nomad-jobs/consul-cluster/consul-cluster-persistent.nomad
+```
+
+### 第三步:恢复数据
+
+**如果有备份数据:**
+```bash
+# 从 Consul KV 备份恢复
+consul kv import @backup.json
+
+# 或从快照恢复
+consul snapshot restore backup.snap
+```
+
+**如果没有备份:**
+- 需要重新配置所有 KV 数据
+- 重新设置 Cloudflare tokens
+- 重新注册服务
+
+## 🎯 新配置的优势
+
+### 持久化存储:
+- **Host Volume** - 数据存储在宿主机 `/opt/consul/data`
+- **重启安全** - 重启 job 不会丢失数据
+- **跨 allocation** - 数据在 allocation 之间保持
+
+### 改进配置:
+- **统一 bootstrap-expect=3** - 所有节点都知道集群大小
+- **健康检查** - 自动监控服务状态
+- **日志级别** - 便于调试
+- **服务注册** - 自动注册到 Consul
+
+## 📋 执行清单
+
+### 准备阶段:
+- [ ] 备份当前 KV 数据 (如果还有)
+- [ ] 记录当前服务注册状态
+- [ ] 准备重新配置的数据
+
+### 执行阶段:
+- [ ] 在 ch4 节点运行存储配置脚本
+- [ ] 在 ash3c 节点运行存储配置脚本
+- [ ] 在 warden 节点运行存储配置脚本
+- [ ] 停止当前 Consul job
+- [ ] 部署持久化 Consul job
+- [ ] 验证集群状态
+
+### 验证阶段:
+- [ ] 检查 Consul 集群状态
+- [ ] 验证 leader 选举
+- [ ] 测试 KV 存储
+- [ ] 恢复关键配置数据
+
+## 🚨 重要提醒
+
+**这是一个严重的架构缺陷!**
+- 生产环境的 Consul 集群没有持久化存储是不可接受的
+- 这相当于把银行的金库建在沙滩上
+- 必须立即修复,否则随时可能再次丢失数据
+
+**修复后的好处:**
+- 真正的高可用 Consul 集群
+- 数据持久化保证
+- 符合生产环境标准
+- 可以安全地重启和维护
\ No newline at end of file
diff --git a/infrastructure/nomad/nomad-jobs/consul-cluster/consul-cluster-persistent-clean.nomad b/infrastructure/nomad/nomad-jobs/consul-cluster/consul-cluster-persistent-clean.nomad
new file mode 100644
index 0000000..caacb24
--- /dev/null
+++ b/infrastructure/nomad/nomad-jobs/consul-cluster/consul-cluster-persistent-clean.nomad
@@ -0,0 +1,199 @@
+job "consul-cluster-nomad" {
+  datacenters = ["dc1"]
+  type        = "service"
+
+  group "consul-ch4" {
+    constraint {
+      attribute = "${node.unique.name}"
+      value     = "ch4"
+    }
+
+    # Persistent storage configuration
+    volume "consul-data" {
+      type      = "host"
+      source    = "consul-data"
+      read_only = false
+    }
+
+    network {
+      port "http" {
+        static = 8500
+      }
+      port "server" {
+        static = 8300
+      }
+      port "serf-lan" {
+        static = 8301
+      }
+      port "serf-wan" {
+        static = 8302
+      }
+    }
+
+    task "consul" {
+      driver = "exec"
+
+      # Mount the persistent storage. NOTE(review): Nomad's exec driver defaults to user "nobody"; confirm it can write to the nomad:nomad 0755 host path (applies to every group).
+      volume_mount {
+        volume      = "consul-data"
+        destination = "/opt/consul/data"
+        read_only   = false
+      }
+
+      config {
+        command = "consul"
+        args    = [
+          "agent",
+          "-server",
+          "-bootstrap-expect=3",
+          "-data-dir=/opt/consul/data",
+          "-client=100.117.106.136",
+          "-bind=100.117.106.136",
+          "-advertise=100.117.106.136",
+          "-retry-join=ash3c.tailnet-68f9.ts.net:8301",
+          "-retry-join=warden.tailnet-68f9.ts.net:8301",
+          "-ui",
+          "-http-port=8500",
+          "-server-port=8300",
+          "-serf-lan-port=8301",
+          "-serf-wan-port=8302"
+        ]
+      }
+
+      resources {
+        cpu    = 300
+        memory = 512
+      }
+    }
+  }
+
+  group "consul-ash3c" {
+    constraint {
+      attribute = "${node.unique.name}"
+      value     = "ash3c"
+    }
+
+    # Persistent storage configuration
+    volume "consul-data" {
+      type      = "host"
+      source    = "consul-data"
+      read_only = false
+    }
+
+    network {
+      port "http" {
+        static = 8500
+      }
+      port "server" {
+        static = 8300
+      }
+      port "serf-lan" {
+        static = 8301
+      }
+      port "serf-wan" {
+        static = 8302
+      }
+    }
+
+    task "consul" {
+      driver = "exec"
+
+      # Mount the persistent storage
+      volume_mount {
+        volume      = "consul-data"
+        destination = "/opt/consul/data"
+        read_only   = false
+      }
+
+      config {
+        command = "consul"
+        args    = [
+          "agent",
+          "-server",
+          "-bootstrap-expect=3",
+          "-data-dir=/opt/consul/data",
+          "-client=100.116.80.94",
+          "-bind=100.116.80.94",
+          "-advertise=100.116.80.94",
+          "-retry-join=ch4.tailnet-68f9.ts.net:8301",
+          "-retry-join=warden.tailnet-68f9.ts.net:8301",
+          "-ui",
+          "-http-port=8500",
+          "-server-port=8300",
+          "-serf-lan-port=8301",
+          "-serf-wan-port=8302"
+        ]
+      }
+
+      resources {
+        cpu    = 300
+        memory = 512
+      }
+    }
+  }
+
+  group "consul-warden" {
+    constraint {
+      attribute = "${node.unique.name}"
+      value     = "warden"
+    }
+
+    # Persistent storage configuration
+    volume "consul-data" {
+      type      = "host"
+      source    = "consul-data"
+      read_only = false
+    }
+
+    network {
+      port "http" {
+        static = 8500
+      }
+      port "server" {
+        static = 8300
+      }
+      port "serf-lan" {
+        static = 8301
+      }
+      port "serf-wan" {
+        static = 8302
+      }
+    }
+
+    task "consul" {
+      driver = "exec"
+
+      # Mount the persistent storage
+      volume_mount {
+        volume      = "consul-data"
+        destination = "/opt/consul/data"
+        read_only   = false
+      }
+
+      config {
+        command = "consul"
+        args    = [
+          "agent",
+          "-server",
+          "-bootstrap-expect=3",
+          "-data-dir=/opt/consul/data",
+          "-client=100.122.197.112",
+          "-bind=100.122.197.112",
+          "-advertise=100.122.197.112",
+          "-retry-join=ch4.tailnet-68f9.ts.net:8301",
+          "-retry-join=ash3c.tailnet-68f9.ts.net:8301",
+          "-ui",
+          "-http-port=8500",
+          "-server-port=8300",
+          "-serf-lan-port=8301",
+          "-serf-wan-port=8302"
+        ]
+      }
+
+      resources {
+        cpu    = 300
+        memory = 512
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/infrastructure/nomad/nomad-jobs/consul-cluster/consul-cluster-persistent.nomad b/infrastructure/nomad/nomad-jobs/consul-cluster/consul-cluster-persistent.nomad
new file mode 100644
index 0000000..bb004c0
--- /dev/null
+++ b/infrastructure/nomad/nomad-jobs/consul-cluster/consul-cluster-persistent.nomad
@@ -0,0 +1,241 @@
+job "consul-cluster-nomad" {
+  datacenters = ["dc1"]
+  type        = "service"
+
+  group "consul-ch4" {
+    constraint {
+      attribute = "${node.unique.name}"
+      value     = "ch4"
+    }
+
+    # Persistent storage configuration
+    volume "consul-data" {
+      type      = "host"
+      source    = "consul-data"
+      read_only = false
+    }
+
+    network {
+      port "http" {
+        static = 8500
+      }
+      port "server" {
+        static = 8300
+      }
+      port "serf-lan" {
+        static = 8301
+      }
+      port "serf-wan" {
+        static = 8302
+      }
+    }
+
+    task "consul" {
+      driver = "exec"
+
+      # Mount the persistent storage. NOTE(review): Nomad's exec driver defaults to user "nobody"; confirm it can write to the nomad:nomad 0755 host path (applies to every group).
+      volume_mount {
+        volume      = "consul-data"
+        destination = "/opt/consul/data"
+        read_only   = false
+      }
+
+      config {
+        command = "consul"
+        args    = [
+          "agent",
+          "-server",
+          "-bootstrap-expect=3",
+          "-data-dir=/opt/consul/data",
+          "-client=100.117.106.136",
+          "-bind=100.117.106.136",
+          "-advertise=100.117.106.136",
+          "-retry-join=ash3c.tailnet-68f9.ts.net:8301",
+          "-retry-join=warden.tailnet-68f9.ts.net:8301",
+          "-ui",
+          "-http-port=8500",
+          "-server-port=8300",
+          "-serf-lan-port=8301",
+          "-serf-wan-port=8302",
+          "-log-level=INFO"
+        ]
+      }
+
+      resources {
+        cpu    = 300
+        memory = 512
+      }
+
+      # Health check
+      service {
+        name = "consul"
+        port = "http"
+
+        check {
+          type     = "http"
+          path     = "/v1/status/leader"
+          interval = "10s"
+          timeout  = "3s"
+        }
+      }
+    }
+  }
+
+  group "consul-ash3c" {
+    constraint {
+      attribute = "${node.unique.name}"
+      value     = "ash3c"
+    }
+
+    # Persistent storage configuration
+    volume "consul-data" {
+      type      = "host"
+      source    = "consul-data"
+      read_only = false
+    }
+
+    network {
+      port "http" {
+        static = 8500
+      }
+      port "server" {
+        static = 8300
+      }
+      port "serf-lan" {
+        static = 8301
+      }
+      port "serf-wan" {
+        static = 8302
+      }
+    }
+
+    task "consul" {
+      driver = "exec"
+
+      # Mount the persistent storage
+      volume_mount {
+        volume      = "consul-data"
+        destination = "/opt/consul/data"
+        read_only   = false
+      }
+
+      config {
+        command = "consul"
+        args    = [
+          "agent",
+          "-server",
+          "-bootstrap-expect=3",
+          "-data-dir=/opt/consul/data",
+          "-client=100.116.80.94",
+          "-bind=100.116.80.94",
+          "-advertise=100.116.80.94",
+          "-retry-join=ch4.tailnet-68f9.ts.net:8301",
+          "-retry-join=warden.tailnet-68f9.ts.net:8301",
+          "-ui",
+          "-http-port=8500",
+          "-server-port=8300",
+          "-serf-lan-port=8301",
+          "-serf-wan-port=8302",
+          "-log-level=INFO"
+        ]
+      }
+
+      resources {
+        cpu    = 300
+        memory = 512
+      }
+
+      # Health check
+      service {
+        name = "consul"
+        port = "http"
+
+        check {
+          type     = "http"
+          path     = "/v1/status/leader"
+          interval = "10s"
+          timeout  = "3s"
+        }
+      }
+    }
+  }
+
+  group "consul-warden" {
+    constraint {
+      attribute = "${node.unique.name}"
+      value     = "warden"
+    }
+
+    # Persistent storage configuration
+    volume "consul-data" {
+      type      = "host"
+      source    = "consul-data"
+      read_only = false
+    }
+
+    network {
+      port "http" {
+        static = 8500
+      }
+      port "server" {
+        static = 8300
+      }
+      port "serf-lan" {
+        static = 8301
+      }
+      port "serf-wan" {
+        static = 8302
+      }
+    }
+
+    task "consul" {
+      driver = "exec"
+
+      # Mount the persistent storage
+      volume_mount {
+        volume      = "consul-data"
+        destination = "/opt/consul/data"
+        read_only   = false
+      }
+
+      config {
+        command = "consul"
+        args    = [
+          "agent",
+          "-server",
+          "-bootstrap-expect=3",
+          "-data-dir=/opt/consul/data",
+          "-client=100.122.197.112",
+          "-bind=100.122.197.112",
+          "-advertise=100.122.197.112",
+          "-retry-join=ch4.tailnet-68f9.ts.net:8301",
+          "-retry-join=ash3c.tailnet-68f9.ts.net:8301",
+          "-ui",
+          "-http-port=8500",
+          "-server-port=8300",
+          "-serf-lan-port=8301",
+          "-serf-wan-port=8302",
+          "-log-level=INFO"
+        ]
+      }
+
+      resources {
+        cpu    = 300
+        memory = 512
+      }
+
+      # Health check
+      service {
+        name = "consul"
+        port = "http"
+
+        check {
+          type     = "http"
+          path     = "/v1/status/leader"
+          interval = "10s"
+          timeout  = "3s"
+        }
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/infrastructure/nomad/nomad-jobs/consul-cluster/consul-cluster-simple.nomad b/infrastructure/nomad/nomad-jobs/consul-cluster/consul-cluster-simple.nomad
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/setup-consul-persistent-storage.sh b/scripts/setup-consul-persistent-storage.sh
new file mode 100755
index 0000000..eb0a43a
--- /dev/null
+++ b/scripts/setup-consul-persistent-storage.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# Consul persistent storage setup script.
+# Must be run on every Nomad node.
+
+set -euo pipefail
+
+echo "=== Consul 持久化存储配置 ==="
+
+# 1. Create the data directory
+CONSUL_DATA_DIR="/opt/consul/data"
+echo "创建 Consul 数据目录: $CONSUL_DATA_DIR"
+sudo mkdir -p "$CONSUL_DATA_DIR"
+sudo chown -R nomad:nomad "$CONSUL_DATA_DIR"
+sudo chmod 755 "$CONSUL_DATA_DIR"
+
+# 2. Locate the Nomad client configuration
+NOMAD_CONFIG="/etc/nomad.d/nomad.hcl"
+echo "检查 Nomad 配置文件: $NOMAD_CONFIG"
+
+# 3. Add the host volume unless already declared (sudo: file may not be world-readable)
+if ! sudo grep -Eq 'host_volume[[:space:]]+"consul-data"' "$NOMAD_CONFIG"; then
+    echo "添加 consul-data host volume 配置到 Nomad"
+
+    # Back up the original configuration
+    sudo cp "$NOMAD_CONFIG" "$NOMAD_CONFIG.backup.$(date +%Y%m%d_%H%M%S)"
+
+    # Append the host volume block (unquoted EOF so $CONSUL_DATA_DIR expands)
+    sudo tee -a "$NOMAD_CONFIG" << EOF
+
+# Consul 持久化存储
+client {
+  host_volume "consul-data" {
+    path      = "$CONSUL_DATA_DIR"
+    read_only = false
+  }
+}
+EOF
+
+    echo "✅ 已添加 consul-data host volume 配置"
+else
+    echo "✅ consul-data host volume 配置已存在"
+fi
+
+# 4. Restart the Nomad client
+echo "重启 Nomad 客户端以应用配置..."
+sudo systemctl restart nomad
+sleep 5
+
+# 5. Verify the configuration
+echo "验证 Nomad 客户端状态..."
+sudo systemctl status nomad --no-pager -l
+
+echo ""
+echo "=== 配置完成 ==="
+echo "数据目录: $CONSUL_DATA_DIR"
+echo "权限: $(ls -ld "$CONSUL_DATA_DIR")"
+echo ""
+echo "下一步:"
+echo "1. 在所有节点 (ch4, ash3c, warden) 运行此脚本"
+echo "2. 停止当前 Consul job: nomad job stop consul-cluster-nomad"
+echo "3. 部署新的持久化配置: nomad job run consul-cluster-persistent.nomad"