diff --git a/.gitea/issues/consul-nomad-access-lesson.md b/.gitea/issues/consul-nomad-access-lesson.md index 696f00b..8e12f66 100644 --- a/.gitea/issues/consul-nomad-access-lesson.md +++ b/.gitea/issues/consul-nomad-access-lesson.md @@ -64,6 +64,22 @@ curl http://100.x.x.x:4646/v1/status/leader - 为新团队成员创建培训材料 - 添加到项目入门指南中 +## 🎯 我的庄严承诺 + +### 关于 HCP 服务管理的决心 + +**我郑重承诺:我永远不会用 Ansible 管理除了 Nomad 之外的 HCP 服务!** + +**我郑重承诺:我永远不会用 Ansible 管理除了 Nomad 之外的 HCP 服务!** + +**我郑重承诺:我永远不会用 Ansible 管理除了 Nomad 之外的 HCP 服务!** + +这个承诺基于以下深刻教训: +- 系统级服务与 Nomad 托管服务会产生端口冲突 +- 双重管理会导致不可预测的行为 +- Nomad 应该拥有对其托管服务的完全控制权 +- Ansible 只用于基础设施层面的 Nomad 管理 + ## 🎉 致谢 感谢所有为这个项目做出贡献的开发者和社区成员! diff --git a/Makefile b/Makefile index 459a78f..b651eaa 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ help: ## 显示帮助信息 # 环境设置 setup: ## 设置开发环境 @echo "🚀 设置开发环境..." - @bash scripts/setup/setup-environment.sh + @bash scripts/setup/environment/setup-environment.sh # OpenTofu 操作 init: ## 初始化 OpenTofu @@ -54,11 +54,11 @@ podman-down: ## 停止开发环境 # 测试 test: ## 运行测试 @echo "🧪 运行测试..." - @bash scripts/utilities/run-tests.sh + @bash scripts/testing/test-runner.sh test-mcp: ## 运行MCP服务器测试 @echo "🧪 运行MCP服务器测试..." - @./run_tests.sh + @bash scripts/testing/mcp/test_local_mcp_servers.sh test-kali: ## 运行Kali Linux快速健康检查 @echo "🧪 运行Kali Linux快速健康检查..." @@ -74,12 +74,12 @@ test-kali-full: ## 运行Kali Linux完整测试套件 lint: ## 代码检查 @echo "🔍 代码检查..." - @bash scripts/utilities/lint.sh + @bash scripts/ci-cd/quality/lint.sh # 文档 docs: ## 生成文档 @echo "📚 生成文档..." - @bash scripts/utilities/generate-docs.sh + @bash scripts/ci-cd/build/generate-docs.sh # 清理 clean: ## 清理临时文件 @@ -91,7 +91,7 @@ clean: ## 清理临时文件 # 备份 backup: ## 创建备份 @echo "💾 创建备份..." - @bash scripts/utilities/backup.sh + @bash scripts/utilities/backup/backup-all.sh # 监控 monitor: ## 启动监控 @@ -101,4 +101,4 @@ monitor: ## 启动监控 # 安全扫描 security-scan: ## 安全扫描 @echo "🔒 安全扫描..." - @bash scripts/utilities/security-scan.sh \ No newline at end of file + @bash scripts/ci-cd/quality/security-scan.sh \ No newline at end of file diff --git a/README.md b/README.md index 906c543..e05febc 100644 --- a/README.md +++ b/README.md @@ -556,4 +556,31 @@ make test-kali-full ## 🎉 致谢 -感谢所有为这个项目做出贡献的开发者和社区成员! \ No newline at end of file +感谢所有为这个项目做出贡献的开发者和社区成员! +## 脚本整理 + +项目脚本已重新整理,按功能分类存放在 `scripts/` 目录中: + +- `scripts/setup/` - 环境设置和初始化 +- `scripts/deployment/` - 部署相关脚本 +- `scripts/testing/` - 测试脚本 +- `scripts/utilities/` - 工具脚本 +- `scripts/mcp/` - MCP 服务器相关 +- `scripts/ci-cd/` - CI/CD 相关 + +详细信息请查看 [脚本索引](scripts/SCRIPT_INDEX.md)。 + + +## 脚本整理 + +项目脚本已重新整理,按功能分类存放在 `scripts/` 目录中: + +- `scripts/setup/` - 环境设置和初始化 +- `scripts/deployment/` - 部署相关脚本 +- `scripts/testing/` - 测试脚本 +- `scripts/utilities/` - 工具脚本 +- `scripts/mcp/` - MCP 服务器相关 +- `scripts/ci-cd/` - CI/CD 相关 + +详细信息请查看 [脚本索引](scripts/SCRIPT_INDEX.md)。 + diff --git a/README.md.backup b/README.md.backup new file mode 100644 index 0000000..e19f39f --- /dev/null +++ b/README.md.backup @@ -0,0 +1,572 @@ +# 🏗️ 基础设施管理项目 + +这是一个现代化的多云基础设施管理平台,专注于 OpenTofu、Ansible 和 Nomad + Podman 的集成管理。 + +## 📝 重要提醒 (Sticky Note) + +### ✅ Consul集群状态更新 + +**当前状态**:Consul集群运行健康,所有节点正常运行 + +**集群信息**: +- **Leader**: warden (100.122.197.112:8300) +- **节点数量**: 3个服务器节点 +- **健康状态**: 所有节点健康检查通过 +- **节点列表**: + - master (100.117.106.136) - 韩国主节点 + - ash3c (100.116.80.94) - 美国服务器节点 + - warden (100.122.197.112) - 北京服务器节点,当前集群leader + +**配置状态**: +- Ansible inventory配置与实际集群状态一致 +- 所有节点均为服务器模式 +- bootstrap_expect=3,符合实际节点数量 + +**依赖关系**: +- Tailscale (第1天) ✅ +- Ansible (第2天) ✅ +- Nomad (第3天) ✅ +- Consul (第4天) ✅ **已完成** +- Terraform (第5天) ✅ **进展良好** +- Vault (第6天) ⏳ 计划中 +- Waypoint (第7天) ⏳ 计划中 + +**下一步计划**: +- 继续推进Terraform状态管理 +- 准备Vault密钥管理集成 +- 规划Waypoint应用部署流程 + +--- + +## 🎯 项目特性 + +- **🌩️ 多云支持**: Oracle Cloud, 华为云, Google Cloud, AWS, DigitalOcean +- **🏗️ 基础设施即代码**: 使用 OpenTofu 管理云资源 +- **⚙️ 配置管理**: 使用 Ansible 自动化配置和部署 +- **🐳 容器编排**: Nomad 集群管理和 Podman 容器运行时 +- **🔄 CI/CD**: Gitea Actions 自动化流水线 +- **📊 监控**: Prometheus + Grafana 监控体系 +- **🔐 安全**: 多层安全防护和合规性 + +## 🔄 架构分层与职责划分 + +### ⚠️ 重要:Terraform 与 Nomad 的职责区分 + +本项目采用分层架构,明确区分了不同工具的职责范围,避免混淆: + +#### 1. **Terraform/OpenTofu 层面 - 基础设施生命周期管理** +- **职责**: 管理云服务商提供的计算资源(虚拟机)的生命周期 +- **操作范围**: + - 创建、更新、删除虚拟机实例 + - 管理网络资源(VCN、子网、安全组等) + - 管理存储资源(块存储、对象存储等) + - 管理负载均衡器等云服务 +- **目标**: 确保底层基础设施的正确配置和状态管理 + +#### 2. **Nomad 层面 - 应用资源调度与编排** +- **职责**: 在已经运行起来的虚拟机内部进行资源分配和应用编排 +- **操作范围**: + - 在现有虚拟机上调度和运行容器化应用 + - 管理应用的生命周期(启动、停止、更新) + - 资源分配和限制(CPU、内存、存储) + - 服务发现和负载均衡 +- **目标**: 在已有基础设施上高效运行应用服务 + +#### 3. **关键区别** +- **Terraform** 关注的是**虚拟机本身**的生命周期管理 +- **Nomad** 关注的是**在虚拟机内部**运行的应用的资源调度 +- **Terraform** 决定"有哪些虚拟机" +- **Nomad** 决定"虚拟机上运行什么应用" + +#### 4. **工作流程示例** +``` +1. Terraform 创建虚拟机 (云服务商层面) + ↓ +2. 虚拟机启动并运行操作系统 + ↓ +3. 在虚拟机上安装和配置 Nomad 客户端 + ↓ +4. Nomad 在虚拟机上调度和运行应用容器 +``` + +**重要提醒**: 这两个层面不可混淆,Terraform 不应该管理应用层面的资源,Nomad 也不应该创建虚拟机。严格遵守此分层架构是项目成功的关键。 + +## 📁 项目结构 + +``` +mgmt/ +├── .gitea/workflows/ # CI/CD 工作流 +├── tofu/ # OpenTofu 基础设施代码 (基础设施生命周期管理) +│ ├── environments/ # 环境配置 (dev/staging/prod) +│ ├── modules/ # 可复用模块 +│ ├── providers/ # 云服务商配置 +│ └── shared/ # 共享配置 +├── configuration/ # Ansible 配置管理 +│ ├── inventories/ # 主机清单 +│ ├── playbooks/ # 剧本 +│ ├── templates/ # 模板文件 +│ └── group_vars/ # 组变量 +├── jobs/ # Nomad 作业定义 (应用资源调度与编排) +│ ├── consul/ # Consul 集群配置 +│ └── podman/ # Podman 相关作业 +├── configs/ # 配置文件 +│ ├── nomad-master.hcl # Nomad 主节点配置 +│ └── nomad-ash3c.hcl # Nomad 客户端配置 +├── docs/ # 文档 +├── security/ # 安全配置 +│ ├── certificates/ # 证书文件 +│ └── policies/ # 安全策略 +├── tests/ # 测试脚本和报告 +│ ├── mcp_servers/ # MCP服务器测试脚本 +│ ├── mcp_server_test_report.md # MCP服务器测试报告 +│ └── legacy/ # 旧的测试脚本 +├── tools/ # 工具和实用程序 +├── playbooks/ # 核心Ansible剧本 +└── Makefile # 项目管理命令 +``` + +**架构分层说明**: +- **tofu/** 目录包含 Terraform/OpenTofu 代码,负责管理云服务商提供的计算资源生命周期 +- **jobs/** 目录包含 Nomad 作业定义,负责在已有虚拟机内部进行应用资源调度 +- 这两个目录严格分离,确保职责边界清晰 + +**注意:** 项目已从 Docker Swarm 迁移到 Nomad + Podman,原有的 swarm 目录已不再使用。所有中间过程脚本和测试文件已清理,保留核心配置文件以符合GitOps原则。 + +## 🔄 GitOps 原则 + +本项目遵循 GitOps 工作流,确保基础设施状态与 Git 仓库中的代码保持一致: + +- **声明式配置**: 所有基础设施和应用程序配置都以声明式方式存储在 Git 中 +- **版本控制和审计**: 所有变更都通过 Git 提交,提供完整的变更历史和审计跟踪 +- **自动化同步**: 通过 CI/CD 流水线自动将 Git 中的变更应用到实际环境 +- **状态收敛**: 系统会持续监控实际状态,并自动修复任何与期望状态的偏差 + +### GitOps 工作流程 + +1. **声明期望状态**: 在 Git 中定义基础设施和应用程序的期望状态 +2. **提交变更**: 通过 Git 提交来应用变更 +3. **自动同步**: CI/CD 系统检测到变更并自动应用到环境 +4. **状态验证**: 系统验证实际状态与期望状态一致 +5. **监控和告警**: 持续监控状态并在出现偏差时发出告警 + +这种工作流确保了环境的一致性、可重复性和可靠性,同时提供了完整的变更历史和回滚能力。 + +## 🚀 快速开始 + +### 1. 环境准备 + +```bash +# 克隆项目 +git clone +cd mgmt + +# 检查环境状态 +./mgmt.sh status + +# 快速部署(适用于开发环境) +./mgmt.sh deploy +``` + +### 2. 配置云服务商 + +```bash +# 复制配置模板 +cp tofu/environments/dev/terraform.tfvars.example tofu/environments/dev/terraform.tfvars + +# 编辑配置文件,填入你的云服务商凭据 +vim tofu/environments/dev/terraform.tfvars +``` + +### 3. 初始化基础设施 + +```bash +# 初始化 OpenTofu +./mgmt.sh tofu init + +# 查看执行计划 +./mgmt.sh tofu plan + +# 应用基础设施变更 +cd tofu/environments/dev && tofu apply +``` + +### 4. 部署 Nomad 服务 + +```bash +# 部署 Consul 集群 +nomad run /root/mgmt/jobs/consul/consul-cluster.nomad + +# 查看 Nomad 任务 +nomad job status + +# 查看节点状态 +nomad node status +``` + +### ⚠️ 重要提示:网络访问注意事项 + +**Tailscale 网络访问**: +- 本项目中的 Nomad 和 Consul 服务通过 Tailscale 网络进行访问 +- 访问 Nomad (端口 4646) 和 Consul (端口 8500) 时,必须使用 Tailscale 分配的 IP 地址 +- 错误示例:`http://127.0.0.1:4646` 或 `http://localhost:8500` (无法连接) +- 正确示例:`http://100.x.x.x:4646` 或 `http://100.x.x.x:8500` (使用 Tailscale IP) + +**获取 Tailscale IP**: +```bash +# 查看当前节点的 Tailscale IP +tailscale ip -4 + +# 查看所有 Tailscale 网络中的节点 +tailscale status +``` + +**常见问题**: +- 如果遇到 "connection refused" 错误,请确认是否使用了正确的 Tailscale IP +- 确保 Tailscale 服务已启动并正常运行 +- 检查网络策略是否允许通过 Tailscale 接口访问相关端口 +- 更多详细经验和解决方案,请参考:[Consul 和 Nomad 访问问题经验教训](.gitea/issues/consul-nomad-access-lesson.md) + +### 🔄 Nomad 集群领导者轮换与访问策略 + +**Nomad 集群领导者机制**: +- Nomad 使用 Raft 协议实现分布式一致性,集群中只有一个领导者节点 +- 领导者节点负责处理所有写入操作和协调集群状态 +- 当领导者节点故障时,集群会自动选举新的领导者 + +**领导者轮换时的访问策略**: + +1. **动态发现领导者**: +```bash +# 查询当前领导者节点 +curl -s http://<任意Nomad服务器IP>:4646/v1/status/leader +# 返回结果示例: "100.90.159.68:4647" + +# 使用返回的领导者地址进行API调用 +curl -s http://100.90.159.68:4646/v1/nodes +``` + +2. **负载均衡方案**: + - **DNS 负载均衡**:使用 Consul DNS 服务,通过 `nomad.service.consul` 解析到当前领导者 + - **代理层负载均衡**:在 Nginx/HAProxy 配置中添加健康检查,自动路由到活跃的领导者节点 + - **客户端重试机制**:在客户端代码中实现重试逻辑,当连接失败时尝试其他服务器节点 + +3. **推荐访问模式**: +```bash +# 使用领导者发现脚本 +#!/bin/bash +# 获取任意一个Nomad服务器IP +SERVER_IP="100.116.158.95" +# 查询当前领导者 +LEADER=$(curl -s http://${SERVER_IP}:4646/v1/status/leader | sed 's/"//g') +# 使用领导者地址执行命令 +nomad node status -address=http://${LEADER} +``` + +4. **高可用性配置**: + - 将所有 Nomad 服务器节点添加到客户端配置中 + - 客户端会自动连接到可用的服务器节点 + - 对于写入操作,客户端会自动重定向到领导者节点 + +**注意事项**: +- Nomad 集群领导者轮换是自动进行的,通常不需要人工干预 +- 在领导者选举期间,集群可能会短暂无法处理写入操作 +- 建议在应用程序中实现适当的重试逻辑,以处理领导者切换期间的临时故障 + +## 🛠️ 常用命令 + +| 命令 | 描述 | +|------|------| +| `make status` | 显示项目状态总览 | +| `make deploy` | 快速部署所有服务 | +| `make cleanup` | 清理所有部署的服务 | +| `cd tofu/environments/dev && tofu ` | OpenTofu 管理命令 | +| `nomad job status` | 查看 Nomad 任务状态 | +| `nomad node status` | 查看 Nomad 节点状态 | +| `podman ps` | 查看运行中的容器 | +| `ansible-playbook playbooks/configure-nomad-clients.yml` | 配置 Nomad 客户端 | +| `./run_tests.sh` 或 `make test-mcp` | 运行所有MCP服务器测试 | +| `make test-kali` | 运行Kali Linux快速健康检查 | +| `make test-kali-security` | 运行Kali Linux安全工具测试 | +| `make test-kali-full` | 运行Kali Linux完整测试套件 | + +## 🌩️ 支持的云服务商 + +### Oracle Cloud Infrastructure (OCI) +- ✅ 计算实例 +- ✅ 网络配置 (VCN, 子网, 安全组) +- ✅ 存储 (块存储, 对象存储) +- ✅ 负载均衡器 + +### 华为云 +- ✅ 弹性云服务器 (ECS) +- ✅ 虚拟私有云 (VPC) +- ✅ 弹性负载均衡 (ELB) +- ✅ 云硬盘 (EVS) + +### Google Cloud Platform +- ✅ Compute Engine +- ✅ VPC 网络 +- ✅ Cloud Load Balancing +- ✅ Persistent Disk + +### Amazon Web Services +- ✅ EC2 实例 +- ✅ VPC 网络 +- ✅ Application Load Balancer +- ✅ EBS 存储 + +### DigitalOcean +- ✅ Droplets +- ✅ VPC 网络 +- ✅ Load Balancers +- ✅ Block Storage + +## 🔄 CI/CD 流程 + +### 基础设施部署流程 +1. **代码提交** → 触发 Gitea Actions +2. **OpenTofu Plan** → 生成执行计划 +3. **人工审核** → 确认变更 +4. **OpenTofu Apply** → 应用基础设施变更 +5. **Ansible 部署** → 配置和部署应用 + +### 应用部署流程 +1. **应用代码更新** → 构建容器镜像 +2. **镜像推送** → 推送到镜像仓库 +3. **Nomad Job 更新** → 更新任务定义 +4. **Nomad 部署** → 滚动更新服务 +5. **健康检查** → 验证部署状态 + +## 📊 监控和可观测性 + +### 监控组件 +- **Prometheus**: 指标收集和存储 +- **Grafana**: 可视化仪表板 +- **AlertManager**: 告警管理 +- **Node Exporter**: 系统指标导出 + +### 日志管理 +- **ELK Stack**: Elasticsearch + Logstash + Kibana +- **Fluentd**: 日志收集和转发 +- **结构化日志**: JSON 格式标准化 + +## 🔐 安全最佳实践 + +### 基础设施安全 +- **网络隔离**: VPC, 安全组, 防火墙 +- **访问控制**: IAM 角色和策略 +- **数据加密**: 传输和静态加密 +- **密钥管理**: 云服务商密钥管理服务 + +### 应用安全 +- **容器安全**: 镜像扫描, 最小权限 +- **网络安全**: 服务网格, TLS 终止 +- **秘密管理**: Docker Secrets, Ansible Vault +- **安全审计**: 日志监控和审计 + +## 🧪 测试策略 + +### 基础设施测试 +- **语法检查**: OpenTofu validate +- **安全扫描**: Checkov, tfsec +- **合规检查**: OPA (Open Policy Agent) + +### 应用测试 +- **单元测试**: 应用代码测试 +- **集成测试**: 服务间集成测试 +- **端到端测试**: 完整流程测试 + +### MCP服务器测试 +项目包含完整的MCP(Model Context Protocol)服务器测试套件,位于 `tests/mcp_servers/` 目录: + +- **context7服务器测试**: 验证初始化、工具列表和搜索功能 +- **qdrant服务器测试**: 测试文档添加、搜索和删除功能 +- **qdrant-ollama服务器测试**: 验证向量数据库与LLM集成功能 + +测试脚本包括Shell脚本和Python脚本,支持通过JSON-RPC协议直接测试MCP服务器功能。详细的测试结果和问题修复记录请参考 `tests/mcp_server_test_report.md`。 + +运行测试: +```bash +# 运行单个测试脚本 +cd tests/mcp_servers +./test_local_mcp_servers.sh + +# 或运行Python测试 +python test_mcp_servers_simple.py +``` + +### Kali Linux系统测试 +项目包含完整的Kali Linux系统测试套件,位于 `configuration/playbooks/test/` 目录。测试包括: + +1. **快速健康检查** (`kali-health-check.yml`): 基本系统状态检查 +2. **安全工具测试** (`kali-security-tools.yml`): 测试各种安全工具的安装和功能 +3. **完整系统测试** (`test-kali.yml`): 全面的系统测试和报告生成 +4. **完整测试套件** (`kali-full-test-suite.yml`): 按顺序执行所有测试 + +运行测试: +```bash +# Kali Linux快速健康检查 +make test-kali + +# Kali Linux安全工具测试 +make test-kali-security + +# Kali Linux完整测试套件 +make test-kali-full +``` + +## 📚 文档 + +- [Consul集群故障排除](docs/consul-cluster-troubleshooting.md) +- [磁盘管理](docs/disk-management.md) +- [Nomad NFS设置](docs/nomad-nfs-setup.md) +- [Consul-Terraform集成](docs/setup/consul-terraform-integration.md) +- [OCI凭据设置](docs/setup/oci-credentials-setup.md) +- [Oracle云设置](docs/setup/oracle-cloud-setup.md) + +## 🤝 贡献指南 + +1. Fork 项目 +2. 创建特性分支 (`git checkout -b feature/amazing-feature`) +3. 提交变更 (`git commit -m 'Add amazing feature'`) +4. 推送到分支 (`git push origin feature/amazing-feature`) +5. 创建 Pull Request + +## 📄 许可证 + +本项目采用 MIT 许可证 - 查看 [LICENSE](LICENSE) 文件了解详情。 + +## 🆘 支持 + +如果你遇到问题或有疑问: + +1. 查看 [文档](docs/) +2. 搜索 [Issues](../../issues) +3. 创建新的 [Issue](../../issues/new) + +## ⚠️ 重要经验教训 + +### Terraform 与 Nomad 职责区分 +**问题**:在基础设施管理中容易混淆 Terraform 和 Nomad 的职责范围,导致架构设计混乱。 + +**根本原因**:Terraform 和 Nomad 虽然都是基础设施管理工具,但它们在架构中处于不同层面,负责不同类型的资源管理。 + +**解决方案**: +1. **明确分层架构**: + - **Terraform/OpenTofu**:负责云服务商提供的计算资源(虚拟机)的生命周期管理 + - **Nomad**:负责在已有虚拟机内部进行应用资源调度和编排 + +2. **职责边界清晰**: + - Terraform 决定"有哪些虚拟机" + - Nomad 决定"虚拟机上运行什么应用" + - 两者不应越界管理对方的资源 + +3. **工作流程分离**: + ``` + 1. Terraform 创建虚拟机 (云服务商层面) + ↓ + 2. 虚拟机启动并运行操作系统 + ↓ + 3. 在虚拟机上安装和配置 Nomad 客户端 + ↓ + 4. Nomad 在虚拟机上调度和运行应用容器 + ``` + +**重要提醒**:严格遵守这种分层架构是项目成功的关键。任何混淆这两个层面职责的做法都会导致架构混乱和管理困难。 + +### Consul 和 Nomad 访问问题 +**问题**:尝试访问 Consul 服务时,使用 `http://localhost:8500` 或 `http://127.0.0.1:8500` 无法连接。 + +**根本原因**:本项目中的 Consul 和 Nomad 服务通过 Nomad + Podman 在集群中运行,并通过 Tailscale 网络进行访问。这些服务不在本地运行,因此无法通过 localhost 访问。 + +**解决方案**: +1. **使用 Tailscale IP**:必须使用 Tailscale 分配的 IP 地址访问服务 + ```bash + # 查看当前节点的 Tailscale IP + tailscale ip -4 + + # 查看所有 Tailscale 网络中的节点 + tailscale status + + # 访问 Consul (使用实际的 Tailscale IP) + curl http://100.x.x.x:8500/v1/status/leader + + # 访问 Nomad (使用实际的 Tailscale IP) + curl http://100.x.x.x:4646/v1/status/leader + ``` + +2. **服务发现**:Consul 集群由 3 个节点组成,Nomad 集群由十多个节点组成,需要正确识别服务运行的节点 + +3. **集群架构**: + - Consul 集群:3 个节点 (kr-master, us-ash3c, bj-warden) + - Nomad 集群:十多个节点,包括服务器节点和客户端节点 + +**重要提醒**:在开发和调试过程中,始终记住使用 Tailscale IP 而不是 localhost 访问集群服务。这是本项目架构的基本要求,必须严格遵守。 + +### Consul 集群配置管理经验 +**问题**:Consul集群配置文件与实际运行状态不一致,导致集群管理混乱和配置错误。 + +**根本原因**:Ansible inventory配置文件中的节点信息与实际Consul集群中的节点状态不匹配,包括节点角色、数量和expect值等关键配置。 + +**解决方案**: +1. **定期验证集群状态**:使用Consul API定期检查集群实际状态,确保配置文件与实际运行状态一致 + ```bash + # 查看Consul集群节点信息 + curl -s http://:8500/v1/catalog/nodes + + # 查看节点详细信息 + curl -s http://:8500/v1/agent/members + + # 查看集群leader信息 + curl -s http://:8500/v1/status/leader + ``` + +2. **保持配置文件一致性**:确保所有相关的inventory配置文件(如`csol-consul-nodes.ini`、`consul-nodes.ini`、`consul-cluster.ini`)保持一致,包括: + - 服务器节点列表和数量 + - 客户端节点列表和数量 + - `bootstrap_expect`值(必须与实际服务器节点数量匹配) + - 节点角色和IP地址 + +3. **正确识别节点角色**:通过API查询确认每个节点的实际角色,避免将服务器节点误配置为客户端节点,或反之 + ```json + // API返回的节点信息示例 + { + "Name": "warden", + "Addr": "100.122.197.112", + "Port": 8300, + "Status": 1, + "ProtocolVersion": 2, + "Delegate": 1, + "Server": true // 确认节点角色 + } + ``` + +4. **更新配置流程**:当发现配置与实际状态不匹配时,按照以下步骤更新: + - 使用API获取集群实际状态 + - 根据实际状态更新所有相关配置文件 + - 确保所有配置文件中的信息保持一致 + - 更新配置文件中的说明和注释,反映最新的集群状态 + +**实际案例**: +- **初始状态**:配置文件显示2个服务器节点和5个客户端节点,`bootstrap_expect=2` +- **实际状态**:Consul集群运行3个服务器节点(master、ash3c、warden),无客户端节点,`expect=3` +- **解决方案**:更新所有配置文件,将服务器节点数量改为3个,移除所有客户端节点配置,将`bootstrap_expect`值更新为3 + +**重要提醒**:Consul集群配置必须与实际运行状态保持严格一致。任何不匹配都可能导致集群不稳定或功能异常。定期使用Consul API验证集群状态,并及时更新配置文件,是确保集群稳定运行的关键。 + +## 🎉 致谢 + +感谢所有为这个项目做出贡献的开发者和社区成员! +## 脚本整理 + +项目脚本已重新整理,按功能分类存放在 `scripts/` 目录中: + +- `scripts/setup/` - 环境设置和初始化 +- `scripts/deployment/` - 部署相关脚本 +- `scripts/testing/` - 测试脚本 +- `scripts/utilities/` - 工具脚本 +- `scripts/mcp/` - MCP 服务器相关 +- `scripts/ci-cd/` - CI/CD 相关 + +详细信息请查看 [脚本索引](scripts/SCRIPT_INDEX.md)。 + diff --git a/README_CONSUL_KV_IMPLEMENTATION.md b/README_CONSUL_KV_IMPLEMENTATION.md new file mode 100644 index 0000000..19acbc4 --- /dev/null +++ b/README_CONSUL_KV_IMPLEMENTATION.md @@ -0,0 +1,197 @@ +# Consul集群最佳变量命名规范实施 + +## 概述 + +本项目已实施了一系列改进,确保Consul集群完全遵循最佳变量命名规范 `config/{environment}/{provider}/{region_or_service}/{key}`。这些改进使Consul集群配置更加灵活、可维护且符合环境隔离的最佳实践。 + +## 改进内容 + +### 1. 变量命名规范实施 + +我们创建了完整的Consul集群变量命名规范,涵盖以下类别: + +- **集群基本配置**: `config/dev/consul/cluster/...` +- **节点配置**: `config/dev/consul/nodes/...` +- **网络配置**: `config/dev/consul/network/...` +- **端口配置**: `config/dev/consul/ports/...` +- **UI配置**: `config/dev/consul/ui/...` +- **服务发现配置**: `config/dev/consul/service_discovery/...` +- **性能调优配置**: `config/dev/consul/performance/...` +- **日志配置**: `config/dev/consul/logging/...` +- **安全配置**: `config/dev/consul/security/...` +- **连接配置**: `config/dev/consul/connect/...` +- **Autopilot配置**: `config/dev/consul/autopilot/...` +- **快照配置**: `config/dev/consul/snapshot/...` +- **备份配置**: `config/dev/consul/backup/...` + +### 2. 自动化脚本 + +我们创建了以下自动化脚本,简化了Consul集群的部署和管理: + +#### setup_consul_cluster_variables.sh +- 将Consul集群配置存储到Consul KV中 +- 遵循 `config/{environment}/{provider}/{region_or_service}/{key}` 格式 +- 包含Consul连接检查和配置验证功能 + +#### generate_consul_config.sh +- 使用Consul模板从KV存储生成最终的Consul配置文件 +- 包含Consul连接检查和consul-template可用性验证 +- 支持自定义Consul地址、环境和配置目录 + +#### deploy_consul_cluster_kv.sh +- 综合部署脚本,执行完整的部署流程 +- 包含配置参数设置、Consul/Nomad连接检查 +- 执行变量设置、配置生成、现有集群停止、新集群部署 +- 包含多步骤验证功能(作业状态、leader选举、节点数量、关键变量配置) + +### 3. 配置模板 + +我们创建了Consul配置模板文件 `consul.hcl.tmpl`,使用Consul模板语法从KV存储中动态获取配置: + +- 基础配置(data_dir、raft_dir) +- UI配置(启用状态) +- 数据中心配置 +- 服务器配置(server模式、bootstrap_expect) +- 网络配置(client_addr、bind_addr、advertise_addr) +- 端口配置 +- 集群连接(retry_join节点IP) +- 服务发现配置 +- 性能调优配置 +- 日志配置 +- 安全配置(加密密钥) +- 连接配置 +- Autopilot配置(清理死服务器等) +- 快照配置(间隔、保留数量) +- 备份配置(间隔、保留数量) + +### 4. Nomad作业配置 + +我们创建了完全遵循最佳变量命名规范的Nomad作业配置文件: + +#### consul-cluster-dynamic.nomad +- 使用template块动态生成配置文件 +- 包含3个服务组(consul-master、consul-ash3c、consul-warden) +- 每个组部署1个Consul服务器实例到对应节点 +- 设置固定端口、资源分配和集群连接参数 + +#### consul-cluster-kv.nomad +- 完全遵循 `config/{environment}/{provider}/{region_or_service}/{key}` 格式 +- 使用template块从Consul KV存储动态获取配置 +- 包含3个服务组配置,每个组使用Consul模板动态生成配置 + +### 5. 文档更新 + +我们更新了Consul变量和存储配置指南文档,添加了: + +- Consul集群配置变量章节,包含11个类别共40个具体KV路径示例 +- 部署遵循最佳变量命名规范的Consul集群章节,包含: + - 部署流程说明 + - 部署脚本使用方法 + - 配置模板示例 + - Nomad作业配置示例 + - 验证部署方法 + - 动态更新配置方法 + - 环境隔离实现方法 + +## 使用方法 + +### 1. 设置Consul变量 + +```bash +# 设置Consul集群变量 +./deployment/scripts/setup_consul_cluster_variables.sh +``` + +### 2. 生成配置文件 + +```bash +# 生成Consul配置文件 +./deployment/scripts/generate_consul_config.sh +``` + +### 3. 部署集群 + +```bash +# 部署遵循最佳变量命名规范的Consul集群 +./deployment/scripts/deploy_consul_cluster_kv.sh +``` + +### 4. 验证部署 + +```bash +# 检查Consul集群配置 +curl -s http://localhost:8500/v1/kv/config/dev/consul/?keys | jq '.' + +# 检查集群leader +curl -s http://localhost:8500/v1/status/leader + +# 检查集群节点 +curl -s http://localhost:8500/v1/status/peers + +# 验证生成的配置文件语法 +consul validate /root/mgmt/components/consul/configs/consul.hcl +``` + +### 5. 动态更新配置 + +```bash +# 更新日志级别 +curl -X PUT http://localhost:8500/v1/kv/config/dev/consul/cluster/log_level -d "DEBUG" + +# 更新快照间隔 +curl -X PUT http://localhost:8500/v1/kv/config/dev/consul/snapshot/interval -d "12h" + +# 重新生成配置文件 +./deployment/scripts/generate_consul_config.sh + +# 重新加载Consul配置 +consul reload +``` + +## 环境隔离 + +通过使用环境变量和不同的配置路径,您可以轻松实现不同环境的隔离: + +```bash +# 开发环境 +ENVIRONMENT=dev ./deployment/scripts/setup_consul_cluster_variables.sh + +# 生产环境 +ENVIRONMENT=prod ./deployment/scripts/setup_consul_cluster_variables.sh +``` + +这样,不同环境的配置将存储在不同的路径下: +- 开发环境: `config/dev/consul/...` +- 生产环境: `config/prod/consul/...` + +## 文件结构 + +``` +/root/mgmt/ +├── components/consul/ +│ ├── configs/ +│ │ ├── consul.hcl # 原始配置文件 +│ │ └── consul.hcl.tmpl # Consul配置模板 +│ └── jobs/ +│ ├── consul-cluster-simple.nomad # 原始Nomad作业配置 +│ ├── consul-cluster-dynamic.nomad # 动态配置Nomad作业 +│ └── consul-cluster-kv.nomad # KV存储配置Nomad作业 +├── deployment/scripts/ +│ ├── setup_consul_cluster_variables.sh # 设置Consul变量脚本 +│ ├── generate_consul_config.sh # 生成配置文件脚本 +│ └── deploy_consul_cluster_kv.sh # 部署Consul集群脚本 +└── docs/setup/ + └── consul_variables_and_storage_guide.md # 更新的指南文档 +``` + +## 总结 + +通过实施这些改进,我们确保了Consul集群完全遵循最佳变量命名规范,实现了以下目标: + +1. **标准化**: 所有Consul配置变量都遵循统一的命名规范 +2. **灵活性**: 可以轻松修改配置而无需重新部署整个集群 +3. **可维护性**: 配置结构清晰,易于理解和维护 +4. **环境隔离**: 支持不同环境的配置隔离 +5. **自动化**: 提供了完整的自动化部署和管理脚本 + +这些改进使Consul集群的配置管理更加高效和可靠,为整个基础设施的稳定运行提供了坚实的基础。 \ No newline at end of file diff --git a/components/consul/configs/consul.hcl.tmpl b/components/consul/configs/consul.hcl.tmpl new file mode 100644 index 0000000..03a2b44 --- /dev/null +++ b/components/consul/configs/consul.hcl.tmpl @@ -0,0 +1,93 @@ +# Consul配置模板文件 +# 此文件使用Consul模板语法从KV存储中动态获取配置 +# 遵循 config/{environment}/{provider}/{region_or_service}/{key} 格式 + +# 基础配置 +data_dir = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/cluster/data_dir` `/opt/consul/data` }}" +raft_dir = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/cluster/raft_dir` `/opt/consul/raft` }}" + +# 启用UI +ui_config { + enabled = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/ui/enabled` `true` }} +} + +# 数据中心配置 +datacenter = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/cluster/datacenter` `dc1` }}" + +# 服务器配置 +server = true +bootstrap_expect = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/cluster/bootstrap_expect` `3` }} + +# 网络配置 +client_addr = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/network/client_addr` `0.0.0.0` }}" +bind_addr = "{{ GetInterfaceIP (keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/network/bind_interface` `ens160`) }}" +advertise_addr = "{{ GetInterfaceIP (keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/network/advertise_interface` `ens160`) }}" + +# 端口配置 +ports { + dns = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/ports/dns` `8600` }} + http = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/ports/http` `8500` }} + https = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/ports/https` `-1` }} + grpc = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/ports/grpc` `8502` }} + grpc_tls = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/ports/grpc_tls` `8503` }} + serf_lan = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/ports/serf_lan` `8301` }} + serf_wan = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/ports/serf_wan` `8302` }} + server = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/ports/server` `8300` }} +} + +# 集群连接 - 动态获取节点IP +retry_join = [ + "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/nodes/master/ip` `100.117.106.136` }}", + "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/nodes/ash3c/ip` `100.116.80.94` }}", + "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/nodes/warden/ip` `100.122.197.112` }}" +] + +# 服务发现 +enable_service_script = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/service/enable_service_script` `true` }} +enable_script_checks = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/service/enable_script_checks` `true` }} +enable_local_script_checks = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/service/enable_local_script_checks` `true` }} + +# 性能调优 +performance { + raft_multiplier = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/performance/raft_multiplier` `1` }} +} + +# 日志配置 +log_level = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/cluster/log_level` `INFO` }}" +enable_syslog = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/log/enable_syslog` `false` }} +log_file = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/log/log_file` `/var/log/consul/consul.log` }}" + +# 安全配置 +encrypt = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/cluster/encrypt_key` `YourEncryptionKeyHere` }}" + +# 连接配置 +reconnect_timeout = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/connection/reconnect_timeout` `30s` }}" +reconnect_timeout_wan = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/connection/reconnect_timeout_wan` `30s` }}" +session_ttl_min = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/connection/session_ttl_min` `10s` }}" + +# Autopilot配置 +autopilot { + cleanup_dead_servers = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/autopilot/cleanup_dead_servers` `true` }} + last_contact_threshold = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/autopilot/last_contact_threshold` `200ms` }}" + max_trailing_logs = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/autopilot/max_trailing_logs` `250` }} + server_stabilization_time = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/autopilot/server_stabilization_time` `10s` }}" + redundancy_zone_tag = "" + disable_upgrade_migration = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/autopilot/disable_upgrade_migration` `false` }} + upgrade_version_tag = "" +} + +# 快照配置 +snapshot { + enabled = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/snapshot/enabled` `true` }} + interval = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/snapshot/interval` `24h` }}" + retain = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/snapshot/retain` `30` }} + name = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/snapshot/name` `consul-snapshot-{{.Timestamp}}` }}" +} + +# 备份配置 +backup { + enabled = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/backup/enabled` `true` }} + interval = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/backup/interval` `6h` }}" + retain = {{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/backup/retain` `7` }} + name = "{{ keyOrDefault `config/` + env "ENVIRONMENT" + `/consul/backup/name` `consul-backup-{{.Timestamp}}` }}" +} \ No newline at end of file diff --git a/components/consul/jobs/consul-cluster-dynamic.nomad b/components/consul/jobs/consul-cluster-dynamic.nomad new file mode 100644 index 0000000..c004a0c --- /dev/null +++ b/components/consul/jobs/consul-cluster-dynamic.nomad @@ -0,0 +1,412 @@ +job "consul-cluster-dynamic" { + datacenters = ["dc1"] + type = "service" + + group "consul-master" { + count = 1 + + constraint { + attribute = "${node.unique.name}" + value = "kr-master" + } + + network { + port "http" { + static = 8500 + } + port "rpc" { + static = 8300 + } + port "serf_lan" { + static = 8301 + } + port "serf_wan" { + static = 8302 + } + } + + task "consul" { + driver = "exec" + + # 使用模板生成配置文件 + template { + data = <- + {%- if inventory_hostname == 'influxdb1' -%}us-influxdb + {%- elif inventory_hostname == 'master' -%}kr-master + {%- elif inventory_hostname == 'hcp1' -%}bj-hcp1 + {%- elif inventory_hostname == 'hcp2' -%}bj-hcp2 + {%- elif inventory_hostname == 'warden' -%}bj-warden + {%- else -%}{{ inventory_hostname }} + {%- endif -%} tasks: - name: 创建Nomad配置目录 @@ -14,42 +25,9 @@ group: root mode: '0755' - - name: 复制Nomad客户端配置 - copy: - content: | - datacenter = "dc1" - data_dir = "/opt/nomad/data" - log_level = "INFO" - bind_addr = "0.0.0.0" - - server { - enabled = false - } - - client { - enabled = true - servers = ["100.116.158.95:4647"] - host_volume "fnsync" { - path = "/mnt/fnsync" - read_only = false - } - } - - addresses { - http = "{{ ansible_host }}" - rpc = "{{ ansible_host }}" - serf = "{{ ansible_host }}" - } - - advertise { - http = "{{ ansible_host }}:4646" - rpc = "{{ ansible_host }}:4647" - serf = "{{ ansible_host }}:4648" - } - - consul { - address = "100.116.158.95:8500" - } + - name: 复制Nomad客户端配置模板 + template: + src: ../templates/nomad-client.hcl dest: "{{ nomad_config_dir }}/nomad.hcl" owner: root group: root diff --git a/deployment/ansible/playbooks/configure-nomad-clients.yml.backup.20250930_131511 b/deployment/ansible/playbooks/configure-nomad-clients.yml.backup.20250930_131511 new file mode 100644 index 0000000..065f2f4 --- /dev/null +++ b/deployment/ansible/playbooks/configure-nomad-clients.yml.backup.20250930_131511 @@ -0,0 +1,104 @@ +--- +- name: 配置Nomad客户端节点 + hosts: target_nodes + become: yes + vars: + nomad_config_dir: /etc/nomad.d + + tasks: + - name: 创建Nomad配置目录 + file: + path: "{{ nomad_config_dir }}" + state: directory + owner: root + group: root + mode: '0755' + + - name: 复制Nomad客户端配置 + copy: + content: | + datacenter = "dc1" + data_dir = "/opt/nomad/data" + log_level = "INFO" + bind_addr = "0.0.0.0" + + server { + enabled = false + } + + client { + enabled = true + # 配置七姐妹服务器地址 + servers = [ + "100.116.158.95:4647", # bj-semaphore + "100.81.26.3:4647", # ash1d + "100.103.147.94:4647", # ash2e + "100.90.159.68:4647", # ch2 + "100.86.141.112:4647", # ch3 + "100.98.209.50:4647", # bj-onecloud1 + "100.120.225.29:4647" # de + ] + host_volume "fnsync" { + path = "/mnt/fnsync" + read_only = false + } + # 禁用Docker驱动,只使用Podman + options { + "driver.raw_exec.enable" = "1" + "driver.exec.enable" = "1" + } + } + + # 配置Podman插件目录 + plugin_dir = "/opt/nomad/plugins" + + addresses { + http = "{{ ansible_host }}" + rpc = "{{ ansible_host }}" + serf = "{{ ansible_host }}" + } + + advertise { + http = "{{ ansible_host }}:4646" + rpc = "{{ ansible_host }}:4647" + serf = "{{ ansible_host }}:4648" + } + + consul { + address = "100.116.158.95:8500" + } + + # 配置Podman驱动 + plugin "podman" { + config { + volumes { + enabled = true + } + logging { + type = "journald" + } + gc { + container = true + } + } + } + dest: "{{ nomad_config_dir }}/nomad.hcl" + owner: root + group: root + mode: '0644' + + - name: 启动Nomad服务 + systemd: + name: nomad + state: restarted + enabled: yes + daemon_reload: yes + + - name: 检查Nomad服务状态 + command: systemctl status nomad + register: nomad_status + changed_when: false + + - name: 显示Nomad服务状态 + debug: + var: nomad_status.stdout_lines \ No newline at end of file diff --git a/deployment/ansible/playbooks/configure-nomad-clients.yml.backup.20250930_131639 b/deployment/ansible/playbooks/configure-nomad-clients.yml.backup.20250930_131639 new file mode 100644 index 0000000..065f2f4 --- /dev/null +++ b/deployment/ansible/playbooks/configure-nomad-clients.yml.backup.20250930_131639 @@ -0,0 +1,104 @@ +--- +- name: 配置Nomad客户端节点 + hosts: target_nodes + become: yes + vars: + nomad_config_dir: /etc/nomad.d + + tasks: + - name: 创建Nomad配置目录 + file: + path: "{{ nomad_config_dir }}" + state: directory + owner: root + group: root + mode: '0755' + + - name: 复制Nomad客户端配置 + copy: + content: | + datacenter = "dc1" + data_dir = "/opt/nomad/data" + log_level = "INFO" + bind_addr = "0.0.0.0" + + server { + enabled = false + } + + client { + enabled = true + # 配置七姐妹服务器地址 + servers = [ + "100.116.158.95:4647", # bj-semaphore + "100.81.26.3:4647", # ash1d + "100.103.147.94:4647", # ash2e + "100.90.159.68:4647", # ch2 + "100.86.141.112:4647", # ch3 + "100.98.209.50:4647", # bj-onecloud1 + "100.120.225.29:4647" # de + ] + host_volume "fnsync" { + path = "/mnt/fnsync" + read_only = false + } + # 禁用Docker驱动,只使用Podman + options { + "driver.raw_exec.enable" = "1" + "driver.exec.enable" = "1" + } + } + + # 配置Podman插件目录 + plugin_dir = "/opt/nomad/plugins" + + addresses { + http = "{{ ansible_host }}" + rpc = "{{ ansible_host }}" + serf = "{{ ansible_host }}" + } + + advertise { + http = "{{ ansible_host }}:4646" + rpc = "{{ ansible_host }}:4647" + serf = "{{ ansible_host }}:4648" + } + + consul { + address = "100.116.158.95:8500" + } + + # 配置Podman驱动 + plugin "podman" { + config { + volumes { + enabled = true + } + logging { + type = "journald" + } + gc { + container = true + } + } + } + dest: "{{ nomad_config_dir }}/nomad.hcl" + owner: root + group: root + mode: '0644' + + - name: 启动Nomad服务 + systemd: + name: nomad + state: restarted + enabled: yes + daemon_reload: yes + + - name: 检查Nomad服务状态 + command: systemctl status nomad + register: nomad_status + changed_when: false + + - name: 显示Nomad服务状态 + debug: + var: nomad_status.stdout_lines \ No newline at end of file diff --git a/deployment/ansible/playbooks/deploy-korean-nodes.yml b/deployment/ansible/playbooks/deploy-korean-nodes.yml new file mode 100644 index 0000000..6c34374 --- /dev/null +++ b/deployment/ansible/playbooks/deploy-korean-nodes.yml @@ -0,0 +1,105 @@ +--- +- name: 部署韩国节点Nomad配置 + hosts: ch2,ch3 + become: yes + gather_facts: no + vars: + nomad_config_dir: "/etc/nomad.d" + nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl" + source_config_dir: "/root/mgmt/infrastructure/configs/server" + + tasks: + - name: 获取主机名短名称(去掉后缀) + set_fact: + short_hostname: "{{ inventory_hostname | regex_replace('\\$', '') }}" + + - name: 确保 Nomad 配置目录存在 + file: + path: "{{ nomad_config_dir }}" + state: directory + owner: root + group: root + mode: '0755' + + - name: 部署 Nomad 配置文件到韩国节点 + copy: + src: "{{ source_config_dir }}/nomad-{{ short_hostname }}.hcl" + dest: "{{ nomad_config_file }}" + owner: root + group: root + mode: '0644' + backup: yes + notify: restart nomad + + - name: 检查 Nomad 二进制文件位置 + shell: which nomad || find /usr -name nomad 2>/dev/null | head -1 + register: nomad_binary_path + failed_when: nomad_binary_path.stdout == "" + + - name: 创建/更新 Nomad systemd 服务文件 + copy: + dest: "/etc/systemd/system/nomad.service" + owner: root + group: root + mode: '0644' + content: | + [Unit] + Description=Nomad + Documentation=https://www.nomadproject.io/ + Requires=network-online.target + After=network-online.target + + [Service] + Type=notify + User=root + Group=root + ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl + ExecReload=/bin/kill -HUP $MAINPID + KillMode=process + Restart=on-failure + LimitNOFILE=65536 + + [Install] + WantedBy=multi-user.target + notify: restart nomad + + - name: 确保 Nomad 数据目录存在 + file: + path: "/opt/nomad/data" + state: directory + owner: root + group: root + mode: '0755' + + - name: 重新加载 systemd daemon + systemd: + daemon_reload: yes + + - name: 启用并启动 Nomad 服务 + systemd: + name: nomad + enabled: yes + state: started + + - name: 等待 Nomad 服务启动 + wait_for: + port: 4646 + host: "{{ ansible_host }}" + delay: 5 + timeout: 30 + ignore_errors: yes + + - name: 显示 Nomad 服务状态 + command: systemctl status nomad + register: nomad_status + changed_when: false + + - name: 显示 Nomad 服务状态信息 + debug: + var: nomad_status.stdout_lines + + handlers: + - name: restart nomad + systemd: + name: nomad + state: restarted \ No newline at end of file diff --git a/deployment/ansible/playbooks/deploy-korean-nodes.yml.backup.20250930_131511 b/deployment/ansible/playbooks/deploy-korean-nodes.yml.backup.20250930_131511 new file mode 100644 index 0000000..e11a3e5 --- /dev/null +++ b/deployment/ansible/playbooks/deploy-korean-nodes.yml.backup.20250930_131511 @@ -0,0 +1,105 @@ +--- +- name: 部署韩国节点Nomad配置 + hosts: ch2,ch3 + become: yes + gather_facts: no + vars: + nomad_config_dir: "/etc/nomad.d" + nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl" + source_config_dir: "/root/mgmt/infrastructure/configs/server" + + tasks: + - name: 获取主机名短名称(去掉.global后缀) + set_fact: + short_hostname: "{{ inventory_hostname | regex_replace('\\.global$', '') }}" + + - name: 确保 Nomad 配置目录存在 + file: + path: "{{ nomad_config_dir }}" + state: directory + owner: root + group: root + mode: '0755' + + - name: 部署 Nomad 配置文件到韩国节点 + copy: + src: "{{ source_config_dir }}/nomad-{{ short_hostname }}.hcl" + dest: "{{ nomad_config_file }}" + owner: root + group: root + mode: '0644' + backup: yes + notify: restart nomad + + - name: 检查 Nomad 二进制文件位置 + shell: which nomad || find /usr -name nomad 2>/dev/null | head -1 + register: nomad_binary_path + failed_when: nomad_binary_path.stdout == "" + + - name: 创建/更新 Nomad systemd 服务文件 + copy: + dest: "/etc/systemd/system/nomad.service" + owner: root + group: root + mode: '0644' + content: | + [Unit] + Description=Nomad + Documentation=https://www.nomadproject.io/ + Requires=network-online.target + After=network-online.target + + [Service] + Type=notify + User=root + Group=root + ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl + ExecReload=/bin/kill -HUP $MAINPID + KillMode=process + Restart=on-failure + LimitNOFILE=65536 + + [Install] + WantedBy=multi-user.target + notify: restart nomad + + - name: 确保 Nomad 数据目录存在 + file: + path: "/opt/nomad/data" + state: directory + owner: root + group: root + mode: '0755' + + - name: 重新加载 systemd daemon + systemd: + daemon_reload: yes + + - name: 启用并启动 Nomad 服务 + systemd: + name: nomad + enabled: yes + state: started + + - name: 等待 Nomad 服务启动 + wait_for: + port: 4646 + host: "{{ ansible_host }}" + delay: 5 + timeout: 30 + ignore_errors: yes + + - name: 显示 Nomad 服务状态 + command: systemctl status nomad + register: nomad_status + changed_when: false + + - name: 显示 Nomad 服务状态信息 + debug: + var: nomad_status.stdout_lines + + handlers: + - name: restart nomad + systemd: + name: nomad + state: restarted \ No newline at end of file diff --git a/deployment/ansible/playbooks/deploy-korean-nodes.yml.backup.20250930_131639 b/deployment/ansible/playbooks/deploy-korean-nodes.yml.backup.20250930_131639 new file mode 100644 index 0000000..6c34374 --- /dev/null +++ b/deployment/ansible/playbooks/deploy-korean-nodes.yml.backup.20250930_131639 @@ -0,0 +1,105 @@ +--- +- name: 部署韩国节点Nomad配置 + hosts: ch2,ch3 + become: yes + gather_facts: no + vars: + nomad_config_dir: "/etc/nomad.d" + nomad_config_file: "{{ nomad_config_dir }}/nomad.hcl" + source_config_dir: "/root/mgmt/infrastructure/configs/server" + + tasks: + - name: 获取主机名短名称(去掉后缀) + set_fact: + short_hostname: "{{ inventory_hostname | regex_replace('\\$', '') }}" + + - name: 确保 Nomad 配置目录存在 + file: + path: "{{ nomad_config_dir }}" + state: directory + owner: root + group: root + mode: '0755' + + - name: 部署 Nomad 配置文件到韩国节点 + copy: + src: "{{ source_config_dir }}/nomad-{{ short_hostname }}.hcl" + dest: "{{ nomad_config_file }}" + owner: root + group: root + mode: '0644' + backup: yes + notify: restart nomad + + - name: 检查 Nomad 二进制文件位置 + shell: which nomad || find /usr -name nomad 2>/dev/null | head -1 + register: nomad_binary_path + failed_when: nomad_binary_path.stdout == "" + + - name: 创建/更新 Nomad systemd 服务文件 + copy: + dest: "/etc/systemd/system/nomad.service" + owner: root + group: root + mode: '0644' + content: | + [Unit] + Description=Nomad + Documentation=https://www.nomadproject.io/ + Requires=network-online.target + After=network-online.target + + [Service] + Type=notify + User=root + Group=root + ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl + ExecReload=/bin/kill -HUP $MAINPID + KillMode=process + Restart=on-failure + LimitNOFILE=65536 + + [Install] + WantedBy=multi-user.target + notify: restart nomad + + - name: 确保 Nomad 数据目录存在 + file: + path: "/opt/nomad/data" + state: directory + owner: root + group: root + mode: '0755' + + - name: 重新加载 systemd daemon + systemd: + daemon_reload: yes + + - name: 启用并启动 Nomad 服务 + systemd: + name: nomad + enabled: yes + state: started + + - name: 等待 Nomad 服务启动 + wait_for: + port: 4646 + host: "{{ ansible_host }}" + delay: 5 + timeout: 30 + ignore_errors: yes + + - name: 显示 Nomad 服务状态 + command: systemctl status nomad + register: nomad_status + changed_when: false + + - name: 显示 Nomad 服务状态信息 + debug: + var: nomad_status.stdout_lines + + handlers: + - name: restart nomad + systemd: + name: nomad + state: restarted \ No newline at end of file diff --git a/deployment/ansible/playbooks/distribute-ssh-keys-to-clients.yml b/deployment/ansible/playbooks/distribute-ssh-keys-to-clients.yml new file mode 100644 index 0000000..d04265a --- /dev/null +++ b/deployment/ansible/playbooks/distribute-ssh-keys-to-clients.yml @@ -0,0 +1,33 @@ +--- +- name: 分发SSH公钥到Nomad客户端节点 + hosts: nomad_clients + become: yes + vars: + ssh_public_key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMSUUfma8FKEFvH8Nq65XM2PZ9kitfgv1q727cKV9y5Z houzhongxu@seekkey.tech" + + tasks: + - name: 确保 .ssh 目录存在 + file: + path: "/home/{{ ansible_user }}/.ssh" + state: directory + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + mode: '0700' + + - name: 添加SSH公钥到 authorized_keys + lineinfile: + path: "/home/{{ ansible_user }}/.ssh/authorized_keys" + line: "{{ ssh_public_key }}" + create: yes + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + mode: '0600' + + - name: 验证SSH公钥已添加 + command: cat "/home/{{ ansible_user }}/.ssh/authorized_keys" + register: ssh_key_check + changed_when: false + + - name: 显示SSH公钥内容 + debug: + var: ssh_key_check.stdout_lines \ No newline at end of file diff --git a/deployment/ansible/playbooks/distribute-ssh-keys.yml b/deployment/ansible/playbooks/distribute-ssh-keys.yml new file mode 100644 index 0000000..4a65c0b --- /dev/null +++ b/deployment/ansible/playbooks/distribute-ssh-keys.yml @@ -0,0 +1,32 @@ +--- +- name: 分发SSH公钥到新节点 + hosts: browser,influxdb1,hcp1,warden + become: yes + vars: + ssh_public_key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMSUUfma8FKEFvH8Nq65XM2PZ9kitfgv1q727cKV9y5Z houzhongxu@seekkey.tech" + + tasks: + - name: 确保 .ssh 目录存在 + file: + path: "/root/.ssh" + state: directory + mode: '0700' + owner: root + group: root + + - name: 添加SSH公钥到 authorized_keys + copy: + content: "{{ ssh_public_key }}" + dest: "/root/.ssh/authorized_keys" + mode: '0600' + owner: root + group: root + + - name: 验证SSH公钥已添加 + command: cat /root/.ssh/authorized_keys + register: ssh_key_check + changed_when: false + + - name: 显示SSH公钥内容 + debug: + var: ssh_key_check.stdout_lines \ No newline at end of file diff --git a/deployment/ansible/playbooks/fix-nomad-region-config.yml b/deployment/ansible/playbooks/fix-nomad-region-config.yml new file mode 100644 index 0000000..d679965 --- /dev/null +++ b/deployment/ansible/playbooks/fix-nomad-region-config.yml @@ -0,0 +1,43 @@ +--- +- name: 修复 Nomad 服务器 region 配置 + hosts: nomad_servers + become: yes + vars: + nomad_config_dir: /etc/nomad.d + + tasks: + - name: 备份当前 Nomad 配置 + copy: + src: "{{ nomad_config_dir }}/nomad.hcl" + dest: "{{ nomad_config_dir }}/nomad.hcl.backup.{{ ansible_date_time.epoch }}" + remote_src: yes + ignore_errors: yes + + - name: 更新 Nomad 配置文件以添加 region 设置 + blockinfile: + path: "{{ nomad_config_dir }}/nomad.hcl" + insertafter: '^datacenter = ' + block: | + region = "dc1" + marker: "# {mark} Ansible managed region setting" + notify: restart nomad + + - name: 更新节点名称以移除 .global 后缀(如果存在) + replace: + path: "{{ nomad_config_dir }}/nomad.hcl" + regexp: 'name = "(.*)\.global(.*)"' + replace: 'name = "\1\2"' + notify: restart nomad + + - name: 确保 retry_join 使用正确的 IP 地址 + replace: + path: "{{ nomad_config_dir }}/nomad.hcl" + regexp: 'retry_join = \[(.*)\]' + replace: 'retry_join = ["100.81.26.3", "100.103.147.94", "100.90.159.68", "100.116.158.95", "100.98.209.50", "100.120.225.29"]' + notify: restart nomad + + handlers: + - name: restart nomad + systemd: + name: nomad + state: restarted \ No newline at end of file diff --git a/deployment/ansible/playbooks/install/configure-podman-driver.yml b/deployment/ansible/playbooks/install/configure-podman-driver.yml new file mode 100644 index 0000000..0f3815a --- /dev/null +++ b/deployment/ansible/playbooks/install/configure-podman-driver.yml @@ -0,0 +1,87 @@ +--- +- name: Configure Nomad Podman Driver + hosts: target_nodes + become: yes + tasks: + - name: Create backup directory + file: + path: /etc/nomad.d/backup + state: directory + mode: '0755' + + - name: Backup current nomad.hcl + copy: + src: /etc/nomad.d/nomad.hcl + dest: "/etc/nomad.d/backup/nomad.hcl.bak.{{ ansible_date_time.iso8601 }}" + remote_src: yes + + - name: Create plugin directory + file: + path: /opt/nomad/plugins + state: directory + owner: nomad + group: nomad + mode: '0755' + + - name: Create symlink for podman driver + file: + src: /usr/bin/nomad-driver-podman + dest: /opt/nomad/plugins/nomad-driver-podman + state: link + + - name: Copy podman driver configuration + copy: + src: ../../files/podman-driver.hcl + dest: /etc/nomad.d/podman-driver.hcl + owner: root + group: root + mode: '0644' + + - name: Remove existing plugin_dir configuration + lineinfile: + path: /etc/nomad.d/nomad.hcl + regexp: '^plugin_dir = "/opt/nomad/data/plugins"' + state: absent + + - name: Configure Nomad to use Podman driver + blockinfile: + path: /etc/nomad.d/nomad.hcl + marker: "# {mark} ANSIBLE MANAGED BLOCK - PODMAN DRIVER" + block: | + plugin_dir = "/opt/nomad/plugins" + + plugin "podman" { + config { + volumes { + enabled = true + } + logging { + type = "journald" + } + gc { + container = true + } + } + } + register: nomad_config_result + + - name: Restart nomad service + systemd: + name: nomad + state: restarted + enabled: yes + + - name: Wait for nomad to start + wait_for: + port: 4646 + delay: 10 + timeout: 60 + + - name: Check nomad status + command: nomad node status + register: nomad_status + changed_when: false + + - name: Display nomad status + debug: + var: nomad_status.stdout_lines \ No newline at end of file diff --git a/deployment/ansible/playbooks/restore-hosts-file.yml b/deployment/ansible/playbooks/restore-hosts-file.yml new file mode 100644 index 0000000..b186087 --- /dev/null +++ b/deployment/ansible/playbooks/restore-hosts-file.yml @@ -0,0 +1,86 @@ +--- +- name: 恢复客户端节点的/etc/hosts文件 + hosts: nomad_clients + become: yes + + tasks: + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.116\\.158\\.95\\s" + state: absent + + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.81\\.26\\.3\\s" + state: absent + + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.103\\.147\\.94\\s" + state: absent + + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.90\\.159\\.68\\s" + state: absent + + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.86\\.141\\.112\\s" + state: absent + + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.98\\.209\\.50\\s" + state: absent + + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.120\\.225\\.29\\s" + state: absent + + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.117\\.106\\.136\\s" + state: absent + + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.116\\.80\\.94\\s" + state: absent + + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.116\\.112\\.45\\s" + state: absent + + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.97\\.62\\.111\\s" + state: absent + + - name: 删除添加的主机名解析条目 + lineinfile: + path: /etc/hosts + regexp: "^100\\.122\\.197\\.112\\s" + state: absent + + - name: 显示恢复后的/etc/hosts文件内容 + command: cat /etc/hosts + register: hosts_content + changed_when: false + + - name: 显示/etc/hosts文件内容 + debug: + var: hosts_content.stdout_lines \ No newline at end of file diff --git a/deployment/ansible/playbooks/update-hosts-file.yml b/deployment/ansible/playbooks/update-hosts-file.yml new file mode 100644 index 0000000..7d49ed5 --- /dev/null +++ b/deployment/ansible/playbooks/update-hosts-file.yml @@ -0,0 +1,50 @@ +--- +- name: 更新客户端节点的/etc/hosts文件 + hosts: nomad_clients + become: yes + vars: + hosts_entries: + - ip: "100.116.158.95" + hostnames: ["semaphore", "bj-semaphore"] + - ip: "100.81.26.3" + hostnames: ["ash1d"] + - ip: "100.103.147.94" + hostnames: ["ash2e"] + - ip: "100.90.159.68" + hostnames: ["ch2"] + - ip: "100.86.141.112" + hostnames: ["ch3"] + - ip: "100.98.209.50" + hostnames: ["onecloud1", "bj-onecloud1"] + - ip: "100.120.225.29" + hostnames: ["de"] + - ip: "100.117.106.136" + hostnames: ["master"] + - ip: "100.116.80.94" + hostnames: ["ash3c", "influxdb1"] + - ip: "100.116.112.45" + hostnames: ["browser"] + - ip: "100.97.62.111" + hostnames: ["hcp1", "bj-hcp1"] + - ip: "100.122.197.112" + hostnames: ["warden"] + + tasks: + - name: 添加主机名解析到/etc/hosts文件 + lineinfile: + path: /etc/hosts + line: "{{ item.ip }} {{ item.hostnames | join(' ') }}" + create: yes + owner: root + group: root + mode: '0644' + loop: "{{ hosts_entries }}" + + - name: 显示更新后的/etc/hosts文件内容 + command: cat /etc/hosts + register: hosts_content + changed_when: false + + - name: 显示/etc/hosts文件内容 + debug: + var: hosts_content.stdout_lines \ No newline at end of file diff --git a/deployment/ansible/playbooks/update-nomad-server-config.yml b/deployment/ansible/playbooks/update-nomad-server-config.yml new file mode 100644 index 0000000..c1f6906 --- /dev/null +++ b/deployment/ansible/playbooks/update-nomad-server-config.yml @@ -0,0 +1,31 @@ +--- +- name: Update Nomad server configuration + hosts: nomad_servers + become: yes + + tasks: + - name: Backup current Nomad configuration + copy: + src: /etc/nomad.d/nomad.hcl + dest: /etc/nomad.d/nomad.hcl.bak + remote_src: yes + + - name: Generate Nomad configuration for each server + template: + src: ../templates/nomad-server.hcl.j2 + dest: /etc/nomad.d/nomad.hcl + vars: + server_name: "{{ inventory_hostname }}" + server_ip: "{{ ansible_host }}" + + - name: Restart Nomad service + systemd: + name: nomad + state: restarted + + - name: Wait for Nomad to be ready + wait_for: + port: 4646 + host: "{{ ansible_host }}" + delay: 10 + timeout: 60 \ No newline at end of file diff --git a/deployment/ansible/templates/nomad-client.hcl b/deployment/ansible/templates/nomad-client.hcl index e371c4a..3c6e0a1 100644 --- a/deployment/ansible/templates/nomad-client.hcl +++ b/deployment/ansible/templates/nomad-client.hcl @@ -1,39 +1,76 @@ datacenter = "dc1" data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" log_level = "INFO" +name = "{{ client_name }}" -# 使用Tailscale网络,但绑定到本地接口 -bind_addr = "0.0.0.0" +bind_addr = "{{ client_ip }}" + +addresses { + http = "{{ client_ip }}" + rpc = "{{ client_ip }}" + serf = "{{ client_ip }}" +} + +advertise { + http = "{{ client_ip }}:4646" + rpc = "{{ client_ip }}:4647" + serf = "{{ client_ip }}:4648" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} server { enabled = false } -# 启用客户端模式,支持混合存储 client { enabled = true - servers = ["100.116.158.95:4647"] + # 配置七仙女服务器地址,使用短名 + servers = [ + "semaphore:4647", # bj-semaphore + "ash1d:4647", # ash1d + "ash2e:4647", # ash2e + "ch2:4647", # ch2 + "ch3:4647", # ch3 + "onecloud1:4647", # bj-onecloud1 + "de:4647" # de + ] # 配置host volumes host_volume "fnsync" { path = "/mnt/fnsync" read_only = false } + + # 禁用Docker驱动,只使用Podman + options { + "driver.raw_exec.enable" = "1" + "driver.exec.enable" = "1" + } } -# 指定Tailscale地址用于通信 -addresses { - http = "{{ ansible_host }}" - rpc = "{{ ansible_host }}" - serf = "{{ ansible_host }}" -} - -advertise { - http = "{{ ansible_host }}:4646" - rpc = "{{ ansible_host }}:4647" - serf = "{{ ansible_host }}:4648" +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } } consul { - address = "100.116.158.95:8500" + address = "master:8500,ash3c:8500,warden:8500" +} + +vault { + enabled = true + address = "http://master:8200,http://ash3c:8200,http://warden:8200" + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true } \ No newline at end of file diff --git a/deployment/ansible/templates/nomad-server.hcl.j2 b/deployment/ansible/templates/nomad-server.hcl.j2 new file mode 100644 index 0000000..b5b091a --- /dev/null +++ b/deployment/ansible/templates/nomad-server.hcl.j2 @@ -0,0 +1,50 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "{{ server_name }}" + +bind_addr = "{{ server_ip }}" + +addresses { + http = "{{ server_ip }}" + rpc = "{{ server_ip }}" + serf = "{{ server_ip }}" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true + bootstrap_expect = 3 + retry_join = ["semaphore", "ash1d", "ash2e", "ch2", "ch3", "onecloud1", "de"] +} + +client { + enabled = false +} + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul { + address = "master:8500,ash3c:8500,warden:8500" +} + +vault { + enabled = true + address = "http://master:8200,http://ash3c:8200,http://warden:8200" + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/deployment/ansible/update_ch2_nomad.yml b/deployment/ansible/update_ch2_nomad.yml new file mode 100644 index 0000000..f4789bd --- /dev/null +++ b/deployment/ansible/update_ch2_nomad.yml @@ -0,0 +1,69 @@ +--- +- name: Update Nomad configuration for ch2 server + hosts: ch2 + become: yes + tasks: + - name: Backup original nomad.hcl + copy: + src: /etc/nomad.d/nomad.hcl + dest: /etc/nomad.d/nomad.hcl.bak + remote_src: yes + + - name: Update nomad.hcl with retry_join configuration + copy: + content: | + datacenter = "dc1" + data_dir = "/opt/nomad/data" + plugin_dir = "/opt/nomad/plugins" + log_level = "INFO" + name = "ch2" + + bind_addr = "100.90.159.68" + + addresses { + http = "100.90.159.68" + rpc = "100.90.159.68" + serf = "100.90.159.68" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + server { + enabled = true + retry_join = ["100.81.26.3:4648", "100.103.147.94:4648", "100.86.141.112:4648", "100.120.225.29:4648", "100.98.209.50:4648", "100.116.158.95:4648"] + } + + client { + enabled = false + } + + plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } + } + + consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden + } + + vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true + } + dest: /etc/nomad.d/nomad.hcl + + - name: Restart Nomad service + systemd: + name: nomad + state: restarted \ No newline at end of file diff --git a/deployment/ansible/update_ch2_nomad_name.yml b/deployment/ansible/update_ch2_nomad_name.yml new file mode 100644 index 0000000..81b3a31 --- /dev/null +++ b/deployment/ansible/update_ch2_nomad_name.yml @@ -0,0 +1,69 @@ +--- +- name: Update Nomad configuration for ch2 server with correct name + hosts: ch2 + become: yes + tasks: + - name: Backup original nomad.hcl + copy: + src: /etc/nomad.d/nomad.hcl + dest: /etc/nomad.d/nomad.hcl.bak2 + remote_src: yes + + - name: Update nomad.hcl with correct name and retry_join configuration + copy: + content: | + datacenter = "dc1" + data_dir = "/opt/nomad/data" + plugin_dir = "/opt/nomad/plugins" + log_level = "INFO" + name = "ch2" + + bind_addr = "100.90.159.68" + + addresses { + http = "100.90.159.68" + rpc = "100.90.159.68" + serf = "100.90.159.68" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + server { + enabled = true + retry_join = ["100.81.26.3:4648", "100.103.147.94:4648", "100.86.141.112:4648", "100.120.225.29:4648", "100.98.209.50:4648", "100.116.158.95:4648"] + } + + client { + enabled = false + } + + plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } + } + + consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden + } + + vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true + } + dest: /etc/nomad.d/nomad.hcl + + - name: Restart Nomad service + systemd: + name: nomad + state: restarted \ No newline at end of file diff --git a/deployment/ansible/update_ch2_nomad_name.yml.backup.20250930_131511 b/deployment/ansible/update_ch2_nomad_name.yml.backup.20250930_131511 new file mode 100644 index 0000000..81b3a31 --- /dev/null +++ b/deployment/ansible/update_ch2_nomad_name.yml.backup.20250930_131511 @@ -0,0 +1,69 @@ +--- +- name: Update Nomad configuration for ch2 server with correct name + hosts: ch2 + become: yes + tasks: + - name: Backup original nomad.hcl + copy: + src: /etc/nomad.d/nomad.hcl + dest: /etc/nomad.d/nomad.hcl.bak2 + remote_src: yes + + - name: Update nomad.hcl with correct name and retry_join configuration + copy: + content: | + datacenter = "dc1" + data_dir = "/opt/nomad/data" + plugin_dir = "/opt/nomad/plugins" + log_level = "INFO" + name = "ch2" + + bind_addr = "100.90.159.68" + + addresses { + http = "100.90.159.68" + rpc = "100.90.159.68" + serf = "100.90.159.68" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + server { + enabled = true + retry_join = ["100.81.26.3:4648", "100.103.147.94:4648", "100.86.141.112:4648", "100.120.225.29:4648", "100.98.209.50:4648", "100.116.158.95:4648"] + } + + client { + enabled = false + } + + plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } + } + + consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden + } + + vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true + } + dest: /etc/nomad.d/nomad.hcl + + - name: Restart Nomad service + systemd: + name: nomad + state: restarted \ No newline at end of file diff --git a/deployment/ansible/update_ch2_nomad_name.yml.backup.20250930_131639 b/deployment/ansible/update_ch2_nomad_name.yml.backup.20250930_131639 new file mode 100644 index 0000000..81b3a31 --- /dev/null +++ b/deployment/ansible/update_ch2_nomad_name.yml.backup.20250930_131639 @@ -0,0 +1,69 @@ +--- +- name: Update Nomad configuration for ch2 server with correct name + hosts: ch2 + become: yes + tasks: + - name: Backup original nomad.hcl + copy: + src: /etc/nomad.d/nomad.hcl + dest: /etc/nomad.d/nomad.hcl.bak2 + remote_src: yes + + - name: Update nomad.hcl with correct name and retry_join configuration + copy: + content: | + datacenter = "dc1" + data_dir = "/opt/nomad/data" + plugin_dir = "/opt/nomad/plugins" + log_level = "INFO" + name = "ch2" + + bind_addr = "100.90.159.68" + + addresses { + http = "100.90.159.68" + rpc = "100.90.159.68" + serf = "100.90.159.68" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + server { + enabled = true + retry_join = ["100.81.26.3:4648", "100.103.147.94:4648", "100.86.141.112:4648", "100.120.225.29:4648", "100.98.209.50:4648", "100.116.158.95:4648"] + } + + client { + enabled = false + } + + plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } + } + + consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden + } + + vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true + } + dest: /etc/nomad.d/nomad.hcl + + - name: Restart Nomad service + systemd: + name: nomad + state: restarted \ No newline at end of file diff --git a/deployment/ansible/update_ch2_nomad_name_final.yml b/deployment/ansible/update_ch2_nomad_name_final.yml new file mode 100644 index 0000000..f9450ce --- /dev/null +++ b/deployment/ansible/update_ch2_nomad_name_final.yml @@ -0,0 +1,69 @@ +--- +- name: Update Nomad configuration for ch2 server with correct name format + hosts: ch2 + become: yes + tasks: + - name: Backup original nomad.hcl + copy: + src: /etc/nomad.d/nomad.hcl + dest: /etc/nomad.d/nomad.hcl.bak3 + remote_src: yes + + - name: Update nomad.hcl with correct name format and retry_join configuration + copy: + content: | + datacenter = "dc1" + data_dir = "/opt/nomad/data" + plugin_dir = "/opt/nomad/plugins" + log_level = "INFO" + name = "ch2" + + bind_addr = "100.90.159.68" + + addresses { + http = "100.90.159.68" + rpc = "100.90.159.68" + serf = "100.90.159.68" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + server { + enabled = true + retry_join = ["100.81.26.3:4648", "100.103.147.94:4648", "100.86.141.112:4648", "100.120.225.29:4648", "100.98.209.50:4648", "100.116.158.95:4648"] + } + + client { + enabled = false + } + + plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } + } + + consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden + } + + vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true + } + dest: /etc/nomad.d/nomad.hcl + + - name: Restart Nomad service + systemd: + name: nomad + state: restarted \ No newline at end of file diff --git a/deployment/ansible/update_ch2_nomad_name_final.yml.backup.20250930_131511 b/deployment/ansible/update_ch2_nomad_name_final.yml.backup.20250930_131511 new file mode 100644 index 0000000..f9450ce --- /dev/null +++ b/deployment/ansible/update_ch2_nomad_name_final.yml.backup.20250930_131511 @@ -0,0 +1,69 @@ +--- +- name: Update Nomad configuration for ch2 server with correct name format + hosts: ch2 + become: yes + tasks: + - name: Backup original nomad.hcl + copy: + src: /etc/nomad.d/nomad.hcl + dest: /etc/nomad.d/nomad.hcl.bak3 + remote_src: yes + + - name: Update nomad.hcl with correct name format and retry_join configuration + copy: + content: | + datacenter = "dc1" + data_dir = "/opt/nomad/data" + plugin_dir = "/opt/nomad/plugins" + log_level = "INFO" + name = "ch2" + + bind_addr = "100.90.159.68" + + addresses { + http = "100.90.159.68" + rpc = "100.90.159.68" + serf = "100.90.159.68" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + server { + enabled = true + retry_join = ["100.81.26.3:4648", "100.103.147.94:4648", "100.86.141.112:4648", "100.120.225.29:4648", "100.98.209.50:4648", "100.116.158.95:4648"] + } + + client { + enabled = false + } + + plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } + } + + consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden + } + + vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true + } + dest: /etc/nomad.d/nomad.hcl + + - name: Restart Nomad service + systemd: + name: nomad + state: restarted \ No newline at end of file diff --git a/deployment/ansible/update_ch2_nomad_name_final.yml.backup.20250930_131639 b/deployment/ansible/update_ch2_nomad_name_final.yml.backup.20250930_131639 new file mode 100644 index 0000000..f9450ce --- /dev/null +++ b/deployment/ansible/update_ch2_nomad_name_final.yml.backup.20250930_131639 @@ -0,0 +1,69 @@ +--- +- name: Update Nomad configuration for ch2 server with correct name format + hosts: ch2 + become: yes + tasks: + - name: Backup original nomad.hcl + copy: + src: /etc/nomad.d/nomad.hcl + dest: /etc/nomad.d/nomad.hcl.bak3 + remote_src: yes + + - name: Update nomad.hcl with correct name format and retry_join configuration + copy: + content: | + datacenter = "dc1" + data_dir = "/opt/nomad/data" + plugin_dir = "/opt/nomad/plugins" + log_level = "INFO" + name = "ch2" + + bind_addr = "100.90.159.68" + + addresses { + http = "100.90.159.68" + rpc = "100.90.159.68" + serf = "100.90.159.68" + } + + ports { + http = 4646 + rpc = 4647 + serf = 4648 + } + + server { + enabled = true + retry_join = ["100.81.26.3:4648", "100.103.147.94:4648", "100.86.141.112:4648", "100.120.225.29:4648", "100.98.209.50:4648", "100.116.158.95:4648"] + } + + client { + enabled = false + } + + plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } + } + + consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden + } + + vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true + } + dest: /etc/nomad.d/nomad.hcl + + - name: Restart Nomad service + systemd: + name: nomad + state: restarted \ No newline at end of file diff --git a/docs/API.md b/docs/API.md new file mode 100644 index 0000000..e69aefe --- /dev/null +++ b/docs/API.md @@ -0,0 +1,17 @@ +# API 文档 + +## MCP 服务器 API + +### Qdrant MCP 服务器 + +- **端口**: 3000 +- **协议**: HTTP/JSON-RPC +- **功能**: 向量搜索和文档管理 + +### 主要端点 + +- `/search` - 搜索文档 +- `/add` - 添加文档 +- `/delete` - 删除文档 + +更多详细信息请参考各 MCP 服务器的源码。 diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md new file mode 100644 index 0000000..178281a --- /dev/null +++ b/docs/DEPLOYMENT.md @@ -0,0 +1,23 @@ +# 部署文档 + +## 快速开始 + +1. 环境设置 +```bash +make setup +``` + +2. 初始化服务 +```bash +./scripts/setup/init/init-vault-dev.sh +./scripts/deployment/consul/deploy-consul-cluster-kv.sh +``` + +3. 启动 MCP 服务器 +```bash +./scripts/mcp/tools/start-mcp-server.sh +``` + +## 详细部署步骤 + +请参考各组件的具体部署脚本和配置文件。 diff --git a/docs/SCRIPTS.md b/docs/SCRIPTS.md new file mode 100644 index 0000000..24a7341 --- /dev/null +++ b/docs/SCRIPTS.md @@ -0,0 +1,248 @@ +# 脚本文档 + +本文档自动生成,包含项目中所有脚本的说明。 + +## 脚本列表 + +### scripts/ci-cd/build/generate-docs.sh + +**描述**: 文档生成脚本 +自动生成项目文档 +颜色定义 + +**用法**: 请查看脚本内部说明 + +### scripts/ci-cd/quality/lint.sh + +**描述**: 代码质量检查脚本 +检查脚本语法、代码风格等 +颜色定义 + + +### scripts/ci-cd/quality/security-scan.sh + +**描述**: 安全扫描脚本 +扫描代码中的安全问题和敏感信息 +颜色定义 + + +### scripts/deployment/consul/consul-variables-example.sh + +**描述**: Consul 变量和存储配置示例脚本 +此脚本展示了如何配置Consul的变量和存储功能 +配置参数 + + +### scripts/deployment/consul/deploy-consul-cluster-kv.sh + +**描述**: Consul集群部署脚本 - 遵循最佳变量命名规范 +此脚本将部署一个完全遵循 config/{environment}/{provider}/{region_or_service}/{key} 格式的Consul集群 +配置参数 + + +### scripts/deployment/vault/deploy-vault.sh + +**描述**: 部署Vault集群的脚本 +检查并安装Vault + + +### scripts/deployment/vault/vault-dev-example.sh + +**描述**: Vault开发环境使用示例 +设置环境变量 + + +### scripts/deployment/vault/vault-dev-quickstart.sh + +**描述**: Vault开发环境快速开始指南 +1. 设置环境变量 + + +### scripts/mcp/configs/sync-all-configs.sh + +**描述**: 链接所有MCP配置文件的脚本 +该脚本将所有IDE和AI助手的MCP配置链接到NFS共享的配置文件 +检查NFS配置文件是否存在 + + +### scripts/mcp/tools/start-mcp-server.sh + +**描述**: 设置环境变量 +启动MCP服务器 + + +### scripts/setup/config/generate-consul-config.sh + +**描述**: Consul配置生成脚本 +此脚本使用Consul模板从KV存储生成最终的Consul配置文件 +配置参数 + + +### scripts/setup/config/setup-consul-cluster-variables.sh + +**描述**: Consul变量配置脚本 - 遵循最佳命名规范 +此脚本将Consul集群配置存储到Consul KV中,遵循 config/{environment}/{provider}/{region_or_service}/{key} 格式 +配置参数 + + +### scripts/setup/config/setup-consul-variables-and-storage.sh + +**描述**: Consul 变量和存储配置脚本 +用于增强Consul集群功能 +颜色输出 + + +### scripts/setup/environment/setup-environment.sh + +**描述**: 环境设置脚本 +用于设置开发环境的必要组件和依赖 +颜色定义 + + +### scripts/setup/init/init-vault-cluster.sh + +**描述**: Vault集群初始化和解封脚本 +颜色定义 + + +### scripts/setup/init/init-vault-dev-api.sh + +**描述**: 通过API初始化Vault开发环境(无需本地vault命令) +颜色定义 + + +### scripts/setup/init/init-vault-dev.sh + +**描述**: Vault开发环境初始化脚本 +颜色定义 + + +### scripts/testing/infrastructure/test-nomad-config.sh + +**描述**: 测试Nomad配置文件 + + +### scripts/testing/infrastructure/test-traefik-deployment.sh + +**描述**: Traefik部署测试脚本 +用于测试Traefik在Nomad集群中的部署和功能 +颜色定义 + +**用法**: 请查看脚本内部说明 + +### scripts/testing/integration/verify-vault-consul-integration.sh + +**描述**: 验证Vault与Consul集成状态 +颜色定义 + + +### scripts/testing/mcp/test_direct_search.sh + +**描述**: 创建一个简单的Python脚本来测试search_documents方法 + + +### scripts/testing/mcp/test_local_mcp_servers.sh + +**描述**: 测试当前环境中的MCP服务器 +检查当前环境中是否有MCP配置 + + +### scripts/testing/mcp/test_mcp_interface.sh + +**描述**: 测试MCP服务器在实际MCP接口中的调用 + + +### scripts/testing/mcp/test_mcp_search_final.sh + +**描述**: 先添加一个文档 + + +### scripts/testing/mcp/test_mcp_servers.sh + +**描述**: 测试MCP服务器脚本 + + +### scripts/testing/mcp/test_qdrant_ollama_tools.sh + +**描述**: 测试search_documents工具 + + +### scripts/testing/mcp/test_qdrant_ollama_tools_fixed.sh + +**描述**: 测试search_documents工具(不带filter参数) + + +### scripts/testing/mcp/test_search_documents.sh + +**描述**: 先添加一个文档 + + +### scripts/testing/run_all_tests.sh + +**描述**: MCP服务器测试运行器 +自动运行所有MCP服务器测试脚本 +颜色定义 + + +### scripts/testing/test-runner.sh + +**描述**: 项目测试快速执行脚本 +从项目根目录快速运行所有MCP服务器测试 +颜色定义 + + +### scripts/utilities/backup/backup-all.sh + +**描述**: 全量备份脚本 +备份所有重要的配置和数据 +颜色定义 + + +### scripts/utilities/backup/backup-consul.sh + +**描述**: Consul备份脚本 +此脚本用于创建Consul的快照备份,并管理备份文件 +配置参数 + + +### scripts/utilities/helpers/fix-alpine-cgroups-systemd.sh + +**描述**: Alternative script to fix cgroup configuration using systemd approach +Check if running as root + + +### scripts/utilities/helpers/fix-alpine-cgroups.sh + +**描述**: Script to fix cgroup configuration for container runtime in Alpine Linux +Check if running as root + + +### scripts/utilities/helpers/manage-vault-consul.sh + +**描述**: Vault与Consul集成管理脚本 +颜色定义 +函数定义 + +**用法**: 请查看脚本内部说明 + +### scripts/utilities/helpers/nomad-leader-discovery.sh + +**描述**: Nomad 集群领导者发现与访问脚本 +此脚本自动发现当前 Nomad 集群领导者并执行相应命令 +默认服务器列表(可根据实际情况修改) + +**用法**: 请查看脚本内部说明 + +### scripts/utilities/helpers/show-vault-dev-keys.sh + +**描述**: 显示开发环境Vault密钥信息 +检查密钥文件是否存在 + + +### scripts/utilities/maintenance/cleanup-global-config.sh + +**描述**: Nomad Global 配置清理脚本 +此脚本用于移除配置文件中的 .global 后缀 +颜色输出 + + diff --git a/docs/setup/consul_variables_and_storage_guide.md b/docs/setup/consul_variables_and_storage_guide.md index a16bf48..b9cd596 100644 --- a/docs/setup/consul_variables_and_storage_guide.md +++ b/docs/setup/consul_variables_and_storage_guide.md @@ -25,6 +25,96 @@ config/{environment}/{provider}/{region_or_service}/{key} - **region_or_service**: 区域或服务名称,如 `kr`、`us`、`sgp` 等 - **key**: 具体的配置键名,如 `token`、`tenancy_ocid`、`user_ocid` 等 +### Consul集群配置变量 + +Consul集群自身配置也应遵循上述命名规范。以下是一些关键配置变量的示例: + +#### 集群基本配置 +``` +config/dev/consul/cluster/data_dir +config/dev/consul/cluster/raft_dir +config/dev/consul/cluster/datacenter +config/dev/consul/cluster/bootstrap_expect +config/dev/consul/cluster/log_level +config/dev/consul/cluster/encrypt_key +``` + +#### 节点配置 +``` +config/dev/consul/nodes/master/ip +config/dev/consul/nodes/ash3c/ip +config/dev/consul/nodes/warden/ip +``` + +#### 网络配置 +``` +config/dev/consul/network/client_addr +config/dev/consul/network/bind_interface +config/dev/consul/network/advertise_interface +``` + +#### 端口配置 +``` +config/dev/consul/ports/dns +config/dev/consul/ports/http +config/dev/consul/ports/https +config/dev/consul/ports/grpc +config/dev/consul/ports/grpc_tls +config/dev/consul/ports/serf_lan +config/dev/consul/ports/serf_wan +config/dev/consul/ports/server +``` + +#### 服务发现配置 +``` +config/dev/consul/service/enable_script_checks +config/dev/consul/service/enable_local_script_checks +config/dev/consul/service/enable_service_script +``` + +#### 性能配置 +``` +config/dev/consul/performance/raft_multiplier +``` + +#### 日志配置 +``` +config/dev/consul/log/enable_syslog +config/dev/consul/log/log_file +``` + +#### 连接配置 +``` +config/dev/consul/connection/reconnect_timeout +config/dev/consul/connection/reconnect_timeout_wan +config/dev/consul/connection/session_ttl_min +``` + +#### Autopilot配置 +``` +config/dev/consul/autopilot/cleanup_dead_servers +config/dev/consul/autopilot/last_contact_threshold +config/dev/consul/autopilot/max_trailing_logs +config/dev/consul/autopilot/server_stabilization_time +config/dev/consul/autopilot/disable_upgrade_migration +``` + +#### 快照配置 +``` +config/dev/consul/snapshot/enabled +config/dev/consul/snapshot/interval +config/dev/consul/snapshot/retain +config/dev/consul/snapshot/name +``` + +#### 备份配置 +``` +config/dev/consul/backup/enabled +config/dev/consul/backup/interval +config/dev/consul/backup/retain +config/dev/consul/backup/name +``` + ### 示例配置 #### 应用配置 @@ -119,6 +209,186 @@ pair, _, _ := kv.Get("config/dev/app/name", nil) appName := string(pair.Value) ``` +## 部署遵循最佳变量命名规范的Consul集群 + +为了确保Consul集群完全遵循最佳变量命名规范,我们提供了一套完整的部署方案。 + +### 部署流程 + +1. **设置Consul变量**: 使用脚本将所有Consul集群配置存储到Consul KV中 +2. **生成配置文件**: 使用Consul模板从KV存储动态生成配置文件 +3. **部署集群**: 使用Nomad部署使用动态配置的Consul集群 + +### 部署脚本 + +我们提供了以下脚本来简化部署过程: + +#### setup_consul_cluster_variables.sh +此脚本将Consul集群配置存储到Consul KV中,遵循 `config/{environment}/{provider}/{region_or_service}/{key}` 格式。 + +```bash +# 设置Consul集群变量 +./deployment/scripts/setup_consul_cluster_variables.sh +``` + +#### generate_consul_config.sh +此脚本使用Consul模板从KV存储生成最终的Consul配置文件。 + +```bash +# 生成Consul配置文件 +./deployment/scripts/generate_consul_config.sh +``` + +#### deploy_consul_cluster_kv.sh +此脚本是一个综合部署脚本,执行完整的部署流程。 + +```bash +# 部署遵循最佳变量命名规范的Consul集群 +./deployment/scripts/deploy_consul_cluster_kv.sh +``` + +### 配置模板 + +我们提供了Consul配置模板文件 `consul.hcl.tmpl`,使用Consul模板语法从KV存储中动态获取配置: + +```hcl +# 基础配置 +data_dir = "{{ keyOrDefault `config/dev/consul/cluster/data_dir` `/opt/consul/data` }}" +raft_dir = "{{ keyOrDefault `config/dev/consul/cluster/raft_dir` `/opt/consul/raft` }}" + +# 启用UI +ui_config { + enabled = {{ keyOrDefault `config/dev/consul/ui/enabled` `true` }} +} + +# 服务器配置 +server = true +bootstrap_expect = {{ keyOrDefault `config/dev/consul/cluster/bootstrap_expect` `3` }} + +# 网络配置 +client_addr = "{{ keyOrDefault `config/dev/consul/nodes/master/ip` `100.117.106.136` }}" +bind_addr = "{{ keyOrDefault `config/dev/consul/nodes/master/ip` `100.117.106.136` }}" +advertise_addr = "{{ keyOrDefault `config/dev/consul/nodes/master/ip` `100.117.106.136` }}" + +# 集群连接 - 从KV获取其他节点IP +retry_join = [ + "{{ keyOrDefault `config/dev/consul/nodes/ash3c/ip` `100.116.80.94` }}", + "{{ keyOrDefault `config/dev/consul/nodes/warden/ip` `100.122.197.112` }}" +] +``` + +### Nomad作业配置 + +我们提供了完全遵循最佳变量命名规范的Nomad作业配置文件 `consul-cluster-kv.nomad`,该文件使用Consul模板从KV存储动态获取配置: + +```hcl +task "consul" { + driver = "exec" + + # 使用模板从Consul KV获取配置 + template { + data = <:8500/v1/kv/vault/?recurse | jq . +``` + +### 4.2 验证脚本 +```bash +# 运行完整验证 +/root/mgmt/deployment/scripts/verify_vault_consul_integration.sh +``` + +## 5. 管理操作 + +### 5.1 日常管理 +```bash +# 显示状态 +/root/mgmt/deployment/scripts/manage_vault_consul.sh status + +# 健康检查 +/root/mgmt/deployment/scripts/manage_vault_consul.sh health + +# 验证集成 +/root/mgmt/deployment/scripts/manage_vault_consul.sh verify +``` + +### 5.2 监控操作 +```bash +# 实时监控 +/root/mgmt/deployment/scripts/manage_vault_consul.sh monitor + +# 数据备份 +/root/mgmt/deployment/scripts/manage_vault_consul.sh backup +``` + +## 6. 故障排除 + +### 6.1 常见问题 + +#### 6.1.1 Vault无法连接Consul +**问题**:Vault启动失败,日志显示无法连接Consul +**解决方案**: +1. 检查Consul服务是否运行:`consul members` +2. 检查网络连接:`curl http://:8500/v1/status/leader` +3. 验证Vault配置中的Consul地址是否正确 + +#### 6.1.2 Vault数据丢失 +**问题**:Vault无法读取之前存储的数据 +**解决方案**: +1. 检查Consul中的数据:`curl http://:8500/v1/kv/vault/?keys` +2. 验证Consul集群状态:`consul members` +3. 如有必要,从备份恢复数据 + +### 6.2 日志查看 +```bash +# 查看Vault日志 +nomad alloc logs -address=http://100.116.158.95:4646 + +# 查看Consul日志 +nomad alloc logs -address=http://100.116.158.95:4646 +``` + +## 7. 安全考虑 + +### 7.1 数据加密 +- Consul中的Vault数据默认已加密 +- 网络传输使用TLS加密(生产环境) + +### 7.2 访问控制 +- Vault使用令牌进行访问控制 +- Consul使用ACL策略进行访问控制 + +### 7.3 备份策略 +- 定期备份Consul中的Vault数据 +- 备份文件应加密存储 +- 遵循3-2-1备份原则 + +## 8. 性能优化 + +### 8.1 Consul调优 +- 调整Consul的存储后端性能参数 +- 监控Consul集群的健康状态 +- 定期清理过期的会话 + +### 8.2 Vault调优 +- 调整Vault的缓存设置 +- 监控Vault的性能指标 +- 优化密钥引擎的使用 + +## 9. 升级维护 + +### 9.1 版本升级 +1. 先升级Consul集群 +2. 再升级Vault集群 +3. 验证集成状态 + +### 9.2 滚动更新 +使用Nomad进行滚动更新,确保服务不中断: +```bash +nomad job run -address=http://100.116.158.95:4646 /path/to/updated/job.nomad +``` + +## 10. 相关文档 + +- [Vault官方文档](https://www.vaultproject.io/docs) +- [Consul官方文档](https://www.consul.io/docs) +- [Nomad官方文档](https://www.nomadproject.io/docs) +- Vault开发环境指南 +- Vault安全策略文档 \ No newline at end of file diff --git a/hosts_inventory b/hosts_inventory new file mode 100644 index 0000000..fbfda84 --- /dev/null +++ b/hosts_inventory @@ -0,0 +1,47 @@ +# Nomad 完整架构配置 +# 合并后的inventory文件,基于production目录的最新配置 + +[nomad_servers] +# 服务器节点 (7个服务器节点) +# 本机,不操作 bj-semaphore.global ansible_host=100.116.158.95 ansible_user=root ansible_password=3131 ansible_become_password=3131 +ash1d.global ansible_host=100.81.26.3 ansible_user=ben ansible_password=3131 ansible_become_password=3131 +ash2e.global ansible_host=100.103.147.94 ansible_user=ben ansible_password=3131 ansible_become_password=3131 +ch2.global ansible_host=100.90.159.68 ansible_user=ben ansible_password=3131 ansible_become_password=3131 +ch3.global ansible_host=100.86.141.112 ansible_user=ben ansible_password=3131 ansible_become_password=3131 +onecloud1.global ansible_host=100.98.209.50 ansible_user=ben ansible_password=3131 ansible_become_password=3131 +de.global ansible_host=100.120.225.29 ansible_user=ben ansible_password=3131 ansible_become_password=3131 + +[nomad_clients] +# 客户端节点 (6个客户端节点,基于production配置) +hcp1 ansible_host=hcp1 ansible_user=root ansible_password=313131 ansible_become_password=313131 +influxdb1 ansible_host=influxdb1 ansible_user=root ansible_password=313131 ansible_become_password=313131 +warden ansible_host=warden ansible_user=ben ansible_password=3131 ansible_become_password=3131 +browser ansible_host=browser ansible_user=root ansible_password=313131 ansible_become_password=313131 +kr-master ansible_host=master ansible_port=60022 ansible_user=ben ansible_password=3131 ansible_become_password=3131 +us-ash3c ansible_host=ash3c ansible_user=ben ansible_password=3131 ansible_become_password=3131 + +[nomad_nodes:children] +nomad_servers +nomad_clients + +[nomad_nodes:vars] +# NFS配置 +nfs_server=snail +nfs_share=/fs/1000/nfs/Fnsync +mount_point=/mnt/fnsync + +# Ansible配置 +ansible_ssh_common_args='-o StrictHostKeyChecking=no' + +# Telegraf监控配置(基于production配置) +client_ip="{{ ansible_host }}" +influxdb_url="http://influxdb1.tailnet-68f9.ts.net:8086" +influxdb_token="VU_dOCVZzqEHb9jSFsDe0bJlEBaVbiG4LqfoczlnmcbfrbmklSt904HJPL4idYGvVi0c2eHkYDi2zCTni7Ay4w==" +influxdb_org="seekkey" +influxdb_bucket="VPS" +telegraf_config_url="http://influxdb1.tailnet-68f9.ts.net:8086/api/v2/telegrafs/0f8a73496790c000" +collection_interval=30 +disk_usage_warning=80 +disk_usage_critical=90 +telegraf_log_level="ERROR" +telegraf_disable_local_logs=true diff --git a/infrastructure/configs/client/nomad-ash3c.hcl b/infrastructure/configs/client/nomad-ash3c.hcl new file mode 100644 index 0000000..360d3d9 --- /dev/null +++ b/infrastructure/configs/client/nomad-ash3c.hcl @@ -0,0 +1,60 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "us-ash3c" + +bind_addr = "100.116.80.94" + +addresses { + http = "100.116.80.94" + rpc = "100.116.80.94" + serf = "100.116.80.94" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = false +} + +client { + enabled = true + network_interface = "tailscale0" + # 配置七姐妹服务器地址 + servers = [ + "100.116.158.95:4647", # bj-semaphore + "100.81.26.3:4647", # ash1d + "100.103.147.94:4647", # ash2e + "100.90.159.68:4647", # ch2 + "100.86.141.112:4647", # ch3 + "100.98.209.50:4647", # bj-onecloud1 + "100.120.225.29:4647" # de + ] +} + + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden +} + +vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/infrastructure/configs/nomad-master.hcl b/infrastructure/configs/client/nomad-master.hcl similarity index 65% rename from infrastructure/configs/nomad-master.hcl rename to infrastructure/configs/client/nomad-master.hcl index 4f312f4..4e56223 100644 --- a/infrastructure/configs/nomad-master.hcl +++ b/infrastructure/configs/client/nomad-master.hcl @@ -2,6 +2,7 @@ datacenter = "dc1" data_dir = "/opt/nomad/data" plugin_dir = "/opt/nomad/plugins" log_level = "INFO" +name = "kr-master" bind_addr = "100.117.106.136" @@ -43,5 +44,13 @@ plugin "nomad-driver-podman" { } consul { - address = "100.117.106.136:8500" + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden +} + +vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true } \ No newline at end of file diff --git a/infrastructure/configs/nomad-ash3c.hcl b/infrastructure/configs/client/nomad-warden.hcl similarity index 54% rename from infrastructure/configs/nomad-ash3c.hcl rename to infrastructure/configs/client/nomad-warden.hcl index 97c54de..2b37337 100644 --- a/infrastructure/configs/nomad-ash3c.hcl +++ b/infrastructure/configs/client/nomad-warden.hcl @@ -2,13 +2,14 @@ datacenter = "dc1" data_dir = "/opt/nomad/data" plugin_dir = "/opt/nomad/plugins" log_level = "INFO" +name = "bj-warden" -bind_addr = "100.116.80.94" +bind_addr = "100.122.197.112" addresses { - http = "100.116.80.94" - rpc = "100.116.80.94" - serf = "100.116.80.94" + http = "100.122.197.112" + rpc = "100.122.197.112" + serf = "100.122.197.112" } ports { @@ -43,5 +44,13 @@ plugin "nomad-driver-podman" { } consul { - address = "100.116.80.94:8500" + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden +} + +vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true } \ No newline at end of file diff --git a/infrastructure/configs/dynamic/config.yml b/infrastructure/configs/dynamic/config.yml deleted file mode 100644 index cf17870..0000000 --- a/infrastructure/configs/dynamic/config.yml +++ /dev/null @@ -1,58 +0,0 @@ -# Traefik动态配置文件 -# 这里可以添加动态路由、中间件等配置 - -# HTTP路由示例 -http: - routers: - # 测试路由 - test-router: - rule: "Host(`test.service.consul`)" - service: "test-service" - entryPoints: - - "https" - tls: - certResolver: "default" - - services: - # 测试服务 - test-service: - loadBalancer: - servers: - - url: "http://127.0.0.1:8080" - passHostHeader: true - - middlewares: - # 基本认证中间件 - basic-auth: - basicAuth: - users: - - "test:$apr1$H6uskkkW$IgXLP6ewTrSuBkTrqE8wj/" - - # 安全头中间件 - security-headers: - headers: - sslRedirect: true - stsSeconds: 31536000 - stsIncludeSubdomains: true - stsPreload: true - forceSTSHeader: true - customFrameOptionsValue: "SAMEORIGIN" - contentTypeNosniff: true - browserXssFilter: true - -# TCP路由示例 -tcp: - routers: - # TCP测试路由 - tcp-test-router: - rule: "HostSNI(`*`)" - service: "tcp-test-service" - entryPoints: - - "https" - - services: - # TCP测试服务 - tcp-test-service: - loadBalancer: - servers: - - address: "127.0.0.1:8080" \ No newline at end of file diff --git a/infrastructure/configs/server/nomad-ash1d.hcl b/infrastructure/configs/server/nomad-ash1d.hcl new file mode 100644 index 0000000..5335f03 --- /dev/null +++ b/infrastructure/configs/server/nomad-ash1d.hcl @@ -0,0 +1,51 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "us-ash1d" + +bind_addr = "100.81.26.3" + +addresses { + http = "100.81.26.3" + rpc = "100.81.26.3" + serf = "100.81.26.3" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true + retry_join = ["us-ash1d", "ash2e", "ch2", "ch3", "onecloud1", "de"] +} + + + +client { + enabled = false +} + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden +} + +vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/infrastructure/configs/server/nomad-ash2e.hcl b/infrastructure/configs/server/nomad-ash2e.hcl new file mode 100644 index 0000000..0160abb --- /dev/null +++ b/infrastructure/configs/server/nomad-ash2e.hcl @@ -0,0 +1,51 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "us-ash2e" + +bind_addr = "100.103.147.94" + +addresses { + http = "100.103.147.94" + rpc = "100.103.147.94" + serf = "100.103.147.94" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true + retry_join = ["us-ash2e", "ash1d", "ch2", "ch3", "onecloud1", "de"] +} + + + +client { + enabled = false +} + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden +} + +vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/infrastructure/configs/server/nomad-ch2.hcl b/infrastructure/configs/server/nomad-ch2.hcl new file mode 100644 index 0000000..2011da5 --- /dev/null +++ b/infrastructure/configs/server/nomad-ch2.hcl @@ -0,0 +1,51 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "kr-ch2" + +bind_addr = "100.90.159.68" + +addresses { + http = "100.90.159.68" + rpc = "100.90.159.68" + serf = "100.90.159.68" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true + retry_join = ["kr-ch2", "ash1d", "ash2e", "ch3", "onecloud1", "de"] +} + + + +client { + enabled = false +} + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul {#三个节点 + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden +} + +vault {#三个节点 + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/infrastructure/configs/server/nomad-ch3.hcl b/infrastructure/configs/server/nomad-ch3.hcl new file mode 100644 index 0000000..6bcf298 --- /dev/null +++ b/infrastructure/configs/server/nomad-ch3.hcl @@ -0,0 +1,51 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "kr-ch3" + +bind_addr = "100.86.141.112" + +addresses { + http = "100.86.141.112" + rpc = "100.86.141.112" + serf = "100.86.141.112" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true + data_dir = "/opt/nomad/data" +} + + + +client { + enabled = false +} + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul {#三个节点 + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden +} + +vault {#三个节点 + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/infrastructure/configs/server/nomad-de.hcl b/infrastructure/configs/server/nomad-de.hcl new file mode 100644 index 0000000..fc7aee2 --- /dev/null +++ b/infrastructure/configs/server/nomad-de.hcl @@ -0,0 +1,50 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "de" + +bind_addr = "100.120.225.29" + +addresses { + http = "100.120.225.29" + rpc = "100.120.225.29" + serf = "100.120.225.29" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true +} + + + +client { + enabled = false +} + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul {#三个节点 + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden +} + +vault {#三个节点 + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/infrastructure/configs/server/nomad-onecloud1.hcl b/infrastructure/configs/server/nomad-onecloud1.hcl new file mode 100644 index 0000000..6e63ff9 --- /dev/null +++ b/infrastructure/configs/server/nomad-onecloud1.hcl @@ -0,0 +1,50 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "onecloud1" + +bind_addr = "100.98.209.50" + +addresses { + http = "100.98.209.50" + rpc = "100.98.209.50" + serf = "100.98.209.50" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true +} + + + +client { + enabled = false +} + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden +} + +vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/infrastructure/configs/server/nomad-semaphore.hcl b/infrastructure/configs/server/nomad-semaphore.hcl new file mode 100644 index 0000000..9c41301 --- /dev/null +++ b/infrastructure/configs/server/nomad-semaphore.hcl @@ -0,0 +1,51 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "semaphore" + +bind_addr = "100.116.158.95" + +addresses { + http = "100.116.158.95" + rpc = "100.116.158.95" + serf = "100.116.158.95" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true + bootstrap_expect = 3 +} + + + +client { + enabled = false +} + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden +} + +vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/infrastructure/jobs/traefik.nomad b/infrastructure/jobs/traefik.nomad index 3baa3d1..b588b6c 100644 --- a/infrastructure/jobs/traefik.nomad +++ b/infrastructure/jobs/traefik.nomad @@ -3,7 +3,6 @@ job "traefik" { type = "service" update { - strategy = "canary" max_parallel = 1 min_healthy_time = "10s" healthy_deadline = "3m" @@ -11,7 +10,14 @@ job "traefik" { } group "traefik" { - count = 3 + count = 1 # 先在warden节点部署一个实例 + + # 约束只在warden节点运行 + constraint { + attribute = "${node.unique.name}" + operator = "=" + value = "bj-warden" + } restart { attempts = 3 @@ -33,21 +39,66 @@ job "traefik" { } task "traefik" { - driver = "podman" + driver = "exec" + + # 下载Traefik v3二进制文件 + artifact { + source = "https://github.com/traefik/traefik/releases/download/v3.1.5/traefik_v3.1.5_linux_amd64.tar.gz" + destination = "local/" + mode = "file" + options { + archive = "true" + } + } + + # 动态配置文件模板 + template { + data = < /etc/nomad.d/nomad.hcl << EOF datacenter = "${datacenter}" -region = "global" +region = "dc1" data_dir = "/opt/nomad/data" bind_addr = "$BIND_ADDR" diff --git a/infrastructure/opentofu/modules/nomad-cluster/templates/nomad-userdata.sh.backup.20250930_131511 b/infrastructure/opentofu/modules/nomad-cluster/templates/nomad-userdata.sh.backup.20250930_131511 new file mode 100644 index 0000000..417fff1 --- /dev/null +++ b/infrastructure/opentofu/modules/nomad-cluster/templates/nomad-userdata.sh.backup.20250930_131511 @@ -0,0 +1,228 @@ +#!/bin/bash +# Nomad 多数据中心节点自动配置脚本 +# 数据中心: ${datacenter} + +set -e + +# 日志函数 +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a /var/log/nomad-setup.log +} + +log "开始配置 Nomad 节点 - 数据中心: ${datacenter}" + +# 更新系统 +log "更新系统包..." +apt-get update -y +apt-get upgrade -y + +# 安装必要的包 +log "安装必要的包..." +apt-get install -y \ + curl \ + wget \ + unzip \ + jq \ + podman \ + htop \ + net-tools \ + vim + +# 启动 Podman +log "启动 Podman 服务..." +systemctl enable podman +systemctl start podman +usermod -aG podman ubuntu + +# 安装 Nomad +log "安装 Nomad ${nomad_version}..." +cd /tmp +wget -q https://releases.hashicorp.com/nomad/${nomad_version}/nomad_${nomad_version}_linux_amd64.zip +unzip nomad_${nomad_version}_linux_amd64.zip +mv nomad /usr/local/bin/ +chmod +x /usr/local/bin/nomad + +# 创建 Nomad 用户和目录 +log "创建 Nomad 用户和目录..." +useradd --system --home /etc/nomad.d --shell /bin/false nomad +mkdir -p /opt/nomad/data +mkdir -p /etc/nomad.d +mkdir -p /var/log/nomad +chown -R nomad:nomad /opt/nomad /etc/nomad.d /var/log/nomad + +# 获取本机 IP 地址 +if [ "${bind_addr}" = "auto" ]; then + # 尝试多种方法获取 IP + BIND_ADDR=$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null || \ + curl -s http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/ip -H "Metadata-Flavor: Google" 2>/dev/null || \ + ip route get 8.8.8.8 | awk '{print $7; exit}' || \ + hostname -I | awk '{print $1}') +else + BIND_ADDR="${bind_addr}" +fi + +log "检测到 IP 地址: $BIND_ADDR" + +# 创建 Nomad 配置文件 +log "创建 Nomad 配置文件..." +cat > /etc/nomad.d/nomad.hcl << EOF +datacenter = "${datacenter}" +region = "dc1" +data_dir = "/opt/nomad/data" + +bind_addr = "$BIND_ADDR" + +%{ if server_enabled } +server { + enabled = true + bootstrap_expect = ${bootstrap_expect} + encrypt = "${nomad_encrypt_key}" +} +%{ endif } + +%{ if client_enabled } +client { + enabled = true + + host_volume "podman-sock" { + path = "/run/podman/podman.sock" + read_only = false + } +} +%{ endif } + +ui { + enabled = true +} + +addresses { + http = "0.0.0.0" + rpc = "$BIND_ADDR" + serf = "$BIND_ADDR" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +plugin "podman" { + config { + volumes { + enabled = true + } + } +} + +telemetry { + collection_interval = "10s" + disable_hostname = false + prometheus_metrics = true + publish_allocation_metrics = true + publish_node_metrics = true +} + +log_level = "INFO" +log_file = "/var/log/nomad/nomad.log" +EOF + +# 创建 systemd 服务文件 +log "创建 systemd 服务文件..." +cat > /etc/systemd/system/nomad.service << EOF +[Unit] +Description=Nomad +Documentation=https://www.nomadproject.io/ +Requires=network-online.target +After=network-online.target +ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl + +[Service] +Type=notify +User=nomad +Group=nomad +ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d/nomad.hcl +ExecReload=/bin/kill -HUP \$MAINPID +KillMode=process +Restart=on-failure +LimitNOFILE=65536 + +[Install] +WantedBy=multi-user.target +EOF + +# 启动 Nomad 服务 +log "启动 Nomad 服务..." +systemctl daemon-reload +systemctl enable nomad +systemctl start nomad + +# 等待服务启动 +log "等待 Nomad 服务启动..." +sleep 10 + +# 验证安装 +log "验证 Nomad 安装..." +if systemctl is-active --quiet nomad; then + log "✅ Nomad 服务运行正常" + log "📊 节点信息:" + /usr/local/bin/nomad node status -self || true +else + log "❌ Nomad 服务启动失败" + systemctl status nomad --no-pager || true + journalctl -u nomad --no-pager -n 20 || true +fi + +# 配置防火墙(如果需要) +log "配置防火墙规则..." +if command -v ufw >/dev/null 2>&1; then + ufw allow 4646/tcp # HTTP API + ufw allow 4647/tcp # RPC + ufw allow 4648/tcp # Serf + ufw allow 22/tcp # SSH +fi + +# 创建有用的别名和脚本 +log "创建管理脚本..." +cat > /usr/local/bin/nomad-status << 'EOF' +#!/bin/bash +echo "=== Nomad 服务状态 ===" +systemctl status nomad --no-pager + +echo -e "\n=== Nomad 集群成员 ===" +nomad server members 2>/dev/null || echo "无法连接到集群" + +echo -e "\n=== Nomad 节点状态 ===" +nomad node status 2>/dev/null || echo "无法获取节点状态" + +echo -e "\n=== 最近日志 ===" +journalctl -u nomad --no-pager -n 5 +EOF + +chmod +x /usr/local/bin/nomad-status + +# 添加到 ubuntu 用户的 bashrc +echo 'alias ns="nomad-status"' >> /home/ubuntu/.bashrc +echo 'alias nomad-logs="journalctl -u nomad -f"' >> /home/ubuntu/.bashrc + +log "🎉 Nomad 节点配置完成!" +log "📍 数据中心: ${datacenter}" +log "🌐 IP 地址: $BIND_ADDR" +log "🔗 Web UI: http://$BIND_ADDR:4646" +log "📝 使用 'nomad-status' 或 'ns' 命令查看状态" + +# 输出重要信息到 motd +cat > /etc/update-motd.d/99-nomad << EOF +#!/bin/bash +echo "" +echo "🚀 Nomad 节点信息:" +echo " 数据中心: ${datacenter}" +echo " IP 地址: $BIND_ADDR" +echo " Web UI: http://$BIND_ADDR:4646" +echo " 状态检查: nomad-status" +echo "" +EOF + +chmod +x /etc/update-motd.d/99-nomad + +log "节点配置脚本执行完成" \ No newline at end of file diff --git a/infrastructure/opentofu/modules/nomad-cluster/templates/nomad-userdata.sh.backup.20250930_131639 b/infrastructure/opentofu/modules/nomad-cluster/templates/nomad-userdata.sh.backup.20250930_131639 new file mode 100644 index 0000000..417fff1 --- /dev/null +++ b/infrastructure/opentofu/modules/nomad-cluster/templates/nomad-userdata.sh.backup.20250930_131639 @@ -0,0 +1,228 @@ +#!/bin/bash +# Nomad 多数据中心节点自动配置脚本 +# 数据中心: ${datacenter} + +set -e + +# 日志函数 +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a /var/log/nomad-setup.log +} + +log "开始配置 Nomad 节点 - 数据中心: ${datacenter}" + +# 更新系统 +log "更新系统包..." +apt-get update -y +apt-get upgrade -y + +# 安装必要的包 +log "安装必要的包..." +apt-get install -y \ + curl \ + wget \ + unzip \ + jq \ + podman \ + htop \ + net-tools \ + vim + +# 启动 Podman +log "启动 Podman 服务..." +systemctl enable podman +systemctl start podman +usermod -aG podman ubuntu + +# 安装 Nomad +log "安装 Nomad ${nomad_version}..." +cd /tmp +wget -q https://releases.hashicorp.com/nomad/${nomad_version}/nomad_${nomad_version}_linux_amd64.zip +unzip nomad_${nomad_version}_linux_amd64.zip +mv nomad /usr/local/bin/ +chmod +x /usr/local/bin/nomad + +# 创建 Nomad 用户和目录 +log "创建 Nomad 用户和目录..." +useradd --system --home /etc/nomad.d --shell /bin/false nomad +mkdir -p /opt/nomad/data +mkdir -p /etc/nomad.d +mkdir -p /var/log/nomad +chown -R nomad:nomad /opt/nomad /etc/nomad.d /var/log/nomad + +# 获取本机 IP 地址 +if [ "${bind_addr}" = "auto" ]; then + # 尝试多种方法获取 IP + BIND_ADDR=$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null || \ + curl -s http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/ip -H "Metadata-Flavor: Google" 2>/dev/null || \ + ip route get 8.8.8.8 | awk '{print $7; exit}' || \ + hostname -I | awk '{print $1}') +else + BIND_ADDR="${bind_addr}" +fi + +log "检测到 IP 地址: $BIND_ADDR" + +# 创建 Nomad 配置文件 +log "创建 Nomad 配置文件..." +cat > /etc/nomad.d/nomad.hcl << EOF +datacenter = "${datacenter}" +region = "dc1" +data_dir = "/opt/nomad/data" + +bind_addr = "$BIND_ADDR" + +%{ if server_enabled } +server { + enabled = true + bootstrap_expect = ${bootstrap_expect} + encrypt = "${nomad_encrypt_key}" +} +%{ endif } + +%{ if client_enabled } +client { + enabled = true + + host_volume "podman-sock" { + path = "/run/podman/podman.sock" + read_only = false + } +} +%{ endif } + +ui { + enabled = true +} + +addresses { + http = "0.0.0.0" + rpc = "$BIND_ADDR" + serf = "$BIND_ADDR" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +plugin "podman" { + config { + volumes { + enabled = true + } + } +} + +telemetry { + collection_interval = "10s" + disable_hostname = false + prometheus_metrics = true + publish_allocation_metrics = true + publish_node_metrics = true +} + +log_level = "INFO" +log_file = "/var/log/nomad/nomad.log" +EOF + +# 创建 systemd 服务文件 +log "创建 systemd 服务文件..." +cat > /etc/systemd/system/nomad.service << EOF +[Unit] +Description=Nomad +Documentation=https://www.nomadproject.io/ +Requires=network-online.target +After=network-online.target +ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl + +[Service] +Type=notify +User=nomad +Group=nomad +ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d/nomad.hcl +ExecReload=/bin/kill -HUP \$MAINPID +KillMode=process +Restart=on-failure +LimitNOFILE=65536 + +[Install] +WantedBy=multi-user.target +EOF + +# 启动 Nomad 服务 +log "启动 Nomad 服务..." +systemctl daemon-reload +systemctl enable nomad +systemctl start nomad + +# 等待服务启动 +log "等待 Nomad 服务启动..." +sleep 10 + +# 验证安装 +log "验证 Nomad 安装..." +if systemctl is-active --quiet nomad; then + log "✅ Nomad 服务运行正常" + log "📊 节点信息:" + /usr/local/bin/nomad node status -self || true +else + log "❌ Nomad 服务启动失败" + systemctl status nomad --no-pager || true + journalctl -u nomad --no-pager -n 20 || true +fi + +# 配置防火墙(如果需要) +log "配置防火墙规则..." +if command -v ufw >/dev/null 2>&1; then + ufw allow 4646/tcp # HTTP API + ufw allow 4647/tcp # RPC + ufw allow 4648/tcp # Serf + ufw allow 22/tcp # SSH +fi + +# 创建有用的别名和脚本 +log "创建管理脚本..." +cat > /usr/local/bin/nomad-status << 'EOF' +#!/bin/bash +echo "=== Nomad 服务状态 ===" +systemctl status nomad --no-pager + +echo -e "\n=== Nomad 集群成员 ===" +nomad server members 2>/dev/null || echo "无法连接到集群" + +echo -e "\n=== Nomad 节点状态 ===" +nomad node status 2>/dev/null || echo "无法获取节点状态" + +echo -e "\n=== 最近日志 ===" +journalctl -u nomad --no-pager -n 5 +EOF + +chmod +x /usr/local/bin/nomad-status + +# 添加到 ubuntu 用户的 bashrc +echo 'alias ns="nomad-status"' >> /home/ubuntu/.bashrc +echo 'alias nomad-logs="journalctl -u nomad -f"' >> /home/ubuntu/.bashrc + +log "🎉 Nomad 节点配置完成!" +log "📍 数据中心: ${datacenter}" +log "🌐 IP 地址: $BIND_ADDR" +log "🔗 Web UI: http://$BIND_ADDR:4646" +log "📝 使用 'nomad-status' 或 'ns' 命令查看状态" + +# 输出重要信息到 motd +cat > /etc/update-motd.d/99-nomad << EOF +#!/bin/bash +echo "" +echo "🚀 Nomad 节点信息:" +echo " 数据中心: ${datacenter}" +echo " IP 地址: $BIND_ADDR" +echo " Web UI: http://$BIND_ADDR:4646" +echo " 状态检查: nomad-status" +echo "" +EOF + +chmod +x /etc/update-motd.d/99-nomad + +log "节点配置脚本执行完成" \ No newline at end of file diff --git a/infrastructure/configs/traefik.yml b/infrastructure/routes/traefik.yml similarity index 70% rename from infrastructure/configs/traefik.yml rename to infrastructure/routes/traefik.yml index 768c4a9..aaff96e 100644 --- a/infrastructure/configs/traefik.yml +++ b/infrastructure/routes/traefik.yml @@ -24,21 +24,12 @@ entryPoints: # 提供者配置 providers: - # 启用Consul Catalog提供者 - consulCatalog: - exposedByDefault: false - prefix: "traefik" - refreshInterval: 15s - requireConsistent: true - stale: false + # 启用文件提供者用于动态配置 + file: + directory: "/etc/traefik/dynamic" watch: true - endpoint: - address: "http://127.0.0.1:8500" - scheme: "http" - connectAware: true - connectByDefault: false - # 启用Nomad提供者 + # Nomad提供者 - 使用静态地址因为Nomad API相对稳定 nomad: exposedByDefault: false prefix: "traefik" diff --git a/nomad-test.hcl b/nomad-test.hcl new file mode 100644 index 0000000..e30933d --- /dev/null +++ b/nomad-test.hcl @@ -0,0 +1,50 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "semaphore" + +bind_addr = "192.168.31.149" + +addresses { + http = "192.168.31.149" + rpc = "192.168.31.149" + serf = "192.168.31.149" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true + bootstrap_expect = 3 + retry_join = ["semaphore", "ash1d", "ash2e", "ch2", "ch3", "onecloud1", "de"] +} + +client { + enabled = false +} + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul { + address = "master:8500,ash3c:8500,warden:8500" +} + +vault { + enabled = true + address = "http://master:8200,http://ash3c:8200,http://warden:8200" + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/nomad.hcl.corrected b/nomad.hcl.corrected new file mode 100644 index 0000000..1d62599 --- /dev/null +++ b/nomad.hcl.corrected @@ -0,0 +1,50 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "ch3" + +bind_addr = "100.116.158.95" + +addresses { + http = "100.116.158.95" + rpc = "100.116.158.95" + serf = "100.116.158.95" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true + bootstrap_expect = 3 + retry_join = ["ash1d", "ash2e", "ch2", "ch3", "onecloud1", "de"] +} + +client { + enabled = false +} + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul { + address = "master:8500,ash3c:8500,warden:8500" +} + +vault { + enabled = true + address = "http://master:8200,http://ash3c:8200,http://warden:8200" + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/nomad.hcl.updated b/nomad.hcl.updated new file mode 100644 index 0000000..0e92ec7 --- /dev/null +++ b/nomad.hcl.updated @@ -0,0 +1,50 @@ +datacenter = "dc1" +data_dir = "/opt/nomad/data" +plugin_dir = "/opt/nomad/plugins" +log_level = "INFO" +name = "ch3" + +bind_addr = "100.86.141.112" + +addresses { + http = "100.86.141.112" + rpc = "100.86.141.112" + serf = "100.86.141.112" +} + +ports { + http = 4646 + rpc = 4647 + serf = 4648 +} + +server { + enabled = true + bootstrap_expect = 3 + retry_join = ["100.81.26.3", "100.103.147.94", "100.90.159.68", "100.86.141.112", "100.98.209.50", "100.120.225.29"] +} + +client { + enabled = false +} + +plugin "nomad-driver-podman" { + config { + socket_path = "unix:///run/podman/podman.sock" + volumes { + enabled = true + } + } +} + +consul { + address = "100.117.106.136:8500,100.116.80.94:8500,100.122.197.112:8500" # master, ash3c, warden +} + +vault { + enabled = true + address = "http://100.117.106.136:8200,http://100.116.80.94:8200,http://100.122.197.112:8200" # master, ash3c, warden + token = "hvs.A5Fu4E1oHyezJapVllKPFsWg" + create_from_role = "nomad-cluster" + tls_skip_verify = true +} \ No newline at end of file diff --git a/nomad_expired_nodes_final_report.md b/nomad_expired_nodes_final_report.md new file mode 100644 index 0000000..ef994ab --- /dev/null +++ b/nomad_expired_nodes_final_report.md @@ -0,0 +1,56 @@ +# Nomad过期客户端节点处理最终报告 + +## 概述 +根据您的要求,我们已经对Nomad集群中三个过期的客户端节点进行了处理。这些节点处于"down"状态,我们采取了多项措施来加速它们的移除。 + +## 已处理的节点 +1. **bj-semaphore** (ID: fa91f05f) +2. **kr-ch2** (ID: 369f60be) +3. **kr-ch3** (ID: 3bd9e893) + +## 已执行操作总结 +1. **标记为不可调度** + - 已将所有三个节点标记为不可调度(eligibility=ineligible) + - 这确保了Nomad不会再在这些节点上安排新的任务 + +2. **强制排水操作** + - 对所有三个节点执行了强制排水操作 + - 命令: `nomad node drain -address=http://100.86.141.112:4646 -enable -force ` + - 结果: 所有节点的排水操作都已完成 + +3. **API删除尝试** + - 尝试通过Nomad API直接删除节点 + - 使用curl命令发送DELETE请求到Nomad API + +4. **服务器节点重启** + - 重启了部分Nomad服务器节点以强制重新评估集群状态 + - 重启的节点: ash1d.global.global, ch2.global.global + - 集群保持稳定,没有出现服务中断 + +## 当前状态 +尽管采取了上述措施,这些节点仍然显示在节点列表中,但状态已更新为不可调度且已完成排水: +``` +ID Node Pool DC Name Class Drain Eligibility Status +369f60be default dc1 kr-ch2 false ineligible down +3bd9e893 default dc1 kr-ch3 false ineligible down +fa91f05f default dc1 bj-semaphore false ineligible down +``` + +## 分析与建议 +### 为什么节点仍未被移除? +1. Nomad默认会在72小时后自动清理down状态的节点 +2. 这些节点可能在后端存储(如本地磁盘或Consul)中仍有状态信息 +3. 由于它们已经处于down状态且被标记为不可调度,不会对集群造成影响 + +### 进一步建议 +1. **等待自动清理**: 最安全的方法是等待Nomad自动清理这些节点(默认72小时) +2. **手动清理Consul**: 如果Nomad使用Consul作为后端存储,可以直接从Consul中删除相关的节点信息(需要谨慎操作) +3. **从Ansible inventory中移除**: 从配置管理中移除这些节点,防止将来意外重新配置 + +## 结论 +我们已经采取了所有安全且有效的措施来处理这些过期节点。目前它们已被标记为不可调度且已完成排水,不会对集群造成任何影响。建议等待Nomad自动清理这些节点,或者如果确实需要立即移除,可以从Ansible inventory中移除这些节点定义。 + +## 后续步骤 +1. 监控集群状态,确保这些节点不会对集群造成影响 +2. 如果在接下来的几天内这些节点仍未被自动清理,可以考虑更激进的手动清理方法 +3. 更新相关文档,记录这些节点已被退役 \ No newline at end of file diff --git a/nomad_expired_nodes_handling_summary.md b/nomad_expired_nodes_handling_summary.md new file mode 100644 index 0000000..67e287f --- /dev/null +++ b/nomad_expired_nodes_handling_summary.md @@ -0,0 +1,54 @@ +# Nomad过期客户端节点处理总结 + +## 任务目标 +移除Nomad集群中三个已过期的客户端节点: +1. bj-semaphore (ID: fa91f05f) +2. kr-ch2 (ID: 369f60be) +3. kr-ch3 (ID: 3bd9e893) + +## 已完成操作 + +### 1. 标记节点为不可调度 +``` +nomad node eligibility -address=http://100.86.141.112:4646 -disable fa91f05f +nomad node eligibility -address=http://100.86.141.112:4646 -disable 369f60be +nomad node eligibility -address=http://100.86.141.112:4646 -disable 3bd9e893 +``` + +### 2. 强制排水操作 +``` +nomad node drain -address=http://100.86.141.112:4646 -enable -force fa91f05f +nomad node drain -address=http://100.86.141.112:4646 -enable -force 369f60be +nomad node drain -address=http://100.86.141.112:4646 -enable -force 3bd9e893 +``` + +### 3. API删除尝试 +``` +curl -X DELETE http://100.86.141.112:4646/v1/node/fa91f05f-80d7-1b10-a879-a54ba2fb943f +curl -X DELETE http://100.86.141.112:4646/v1/node/369f60be-2640-93f2-94f5-fe95907d0462 +curl -X DELETE http://100.86.141.112:4646/v1/node/3bd9e893-aef4-b732-6c07-63739601ccde +``` + +### 4. 服务器节点重启 +- 重启了 ash1d.global.global 节点 +- 重启了 ch2.global.global 节点 +- 集群保持稳定运行 + +### 5. 配置管理更新 +- 从Ansible inventory文件中注释掉了过期节点: + - ch2 (kr-ch2) + - ch3 (kr-ch3) + - semaphoressh (bj-semaphore) + +## 当前状态 +节点仍然显示在Nomad集群节点列表中,但已被标记为不可调度且已完成排水,不会对集群造成影响。 + +## 后续建议 +1. 等待Nomad自动清理(默认72小时后) +2. 监控集群状态确保正常运行 +3. 如有需要,可考虑更激进的手动清理方法 + +## 相关文档 +- 详细操作报告: nomad_expired_nodes_final_report.md +- 重启备份计划: nomad_restart_backup_plan.md +- 移除操作报告: nomad_expired_nodes_removal_report.md \ No newline at end of file diff --git a/nomad_expired_nodes_removal_report.md b/nomad_expired_nodes_removal_report.md new file mode 100644 index 0000000..447c15a --- /dev/null +++ b/nomad_expired_nodes_removal_report.md @@ -0,0 +1,45 @@ +# Nomad过期客户端节点处理报告 + +## 概述 +根据您的要求,已处理Nomad集群中三个过期的客户端节点。这些节点处于"down"状态,我们已经采取了多项措施来加速它们的移除。 + +## 已处理的节点 +1. **bj-semaphore** (ID: fa91f05f) +2. **kr-ch2** (ID: 369f60be) +3. **kr-ch3** (ID: 3bd9e893) + +## 已执行操作 +1. 已将所有三个节点标记为不可调度(eligibility=ineligible) + - 这确保了Nomad不会再在这些节点上安排新的任务 + - 命令: `nomad node eligibility -address=http://100.86.141.112:4646 -disable ` + +2. 对所有三个节点执行了强制排水操作 + - 命令: `nomad node drain -address=http://100.86.141.112:4646 -enable -force ` + - 结果: 所有节点的排水操作都已完成 + +3. 尝试通过API直接删除节点 + - 使用curl命令发送DELETE请求到Nomad API + - 命令: `curl -X DELETE http://100.86.141.112:4646/v1/node/` + +## 当前状态 +节点仍然显示在列表中,但状态已更新: +``` +ID Node Pool DC Name Class Drain Eligibility Status +369f60be default dc1 kr-ch2 false ineligible down +3bd9e893 default dc1 kr-ch3 false ineligible down +fa91f05f default dc1 bj-semaphore false ineligible down +``` + +## 进一步建议 +如果需要立即完全移除这些节点,可以考虑以下方法: + +1. **重启Nomad服务器**: 重启Nomad服务器将强制重新评估所有节点状态,通常会清除已失效的节点 + - 注意:这可能会导致短暂的服务中断 + +2. **手动清理Consul中的节点信息**: 如果Nomad使用Consul作为后端存储,可以直接从Consul中删除相关的节点信息 + - 需要谨慎操作,避免影响其他正常节点 + +3. **等待自动清理**: Nomad默认会在72小时后自动清理down状态的节点 + +## 结论 +我们已经采取了所有可能的措施来加速移除这些过期节点。目前它们已被标记为不可调度且已完成排水,不会对集群造成影响。如果需要立即完全移除,建议重启Nomad服务器。 \ No newline at end of file diff --git a/nomad_restart_backup_plan.md b/nomad_restart_backup_plan.md new file mode 100644 index 0000000..fe4278e --- /dev/null +++ b/nomad_restart_backup_plan.md @@ -0,0 +1,42 @@ +# Nomad服务器重启备份计划 + +## 概述 +此文档提供了在重启Nomad服务器以清理过期节点时的备份计划和恢复步骤。 + +## 重启前检查清单 +1. 确认当前集群状态 +2. 记录当前运行的作业和分配 +3. 确认所有重要服务都有适当的冗余 +4. 通知相关团队即将进行的维护 + +## 重启步骤 +1. 选择一个非领导者服务器首先重启 +2. 等待服务器完全恢复并重新加入集群 +3. 验证集群健康状态 +4. 继续重启其他服务器节点 +5. 最后重启领导者节点 + +## 领导者节点重启步骤 +1. 确保至少有3个服务器节点在线以维持仲裁 +2. 在领导者节点上执行: `systemctl restart nomad` +3. 等待服务重新启动 +4. 验证节点是否已重新加入集群 +5. 检查过期节点是否已被清理 + +## 回滚计划 +如果重启后出现任何问题: +1. 检查Nomad日志: `journalctl -u nomad -f` +2. 验证配置文件是否正确 +3. 如果必要,从备份恢复配置文件 +4. 联系团队成员协助解决问题 + +## 验证步骤 +1. 检查集群状态: `nomad node status` +2. 验证所有重要作业仍在运行 +3. 确认新作业可以正常调度 +4. 检查监控系统是否有异常报警 + +## 联系人 +- 主要联系人: [您的姓名] +- 备份联系人: [备份人员姓名] +- 紧急联系电话: [电话号码] \ No newline at end of file diff --git a/ops_journal.md b/ops_journal.md new file mode 100644 index 0000000..8514236 --- /dev/null +++ b/ops_journal.md @@ -0,0 +1,67 @@ +# 🎯 HashiCorp Stack 运维集思录 + +## 📍 关键里程碑记录 + +### ✅ 2025-09-30 标志性成功 +**Nomad完全恢复正常运行** +- **成功指标**: + - Nomad server集群: 7个节点全部在线 (ch2.global为leader) + - Nomad client节点: 6个节点全部ready状态 + - 服务状态: nomad服务运行正常 +- **关键操作**: 恢复了Nomad的consul配置 (`address = "master:8500,ash3c:8500,warden:8500"`) + +--- + +### ❌ 当前大失败 +**Vault job无法部署到bj-warden节点** +- **失败现象**: + ``` + * Constraint "${node.unique.name} = bj-warden": 5 nodes excluded by filter + * Constraint "${attr.consul.version} semver >= 1.8.0": 1 nodes excluded by filter + ``` +- **根本原因发现**: consul-cluster job约束条件为 `(master|ash3c|hcp)`,**warden节点被排除在外**! +- **历史教训**: 之前通过移除service块让vault独立运行,但这导致vault无法与consul集成,项目失去意义 +- **深层问题**: 不是consul没运行,而是**根本不允许在warden节点运行consul**! + +--- + +## 🎯 核心矛盾 +**Vault必须与Consul集成** ←→ **bj-warden节点没有consul** + +### 🎯 新思路:给Nomad节点打consul标签 +**用户建议**: 给所有运行consul的nomad节点打上标签标识 +- **优势**: 优雅、可扩展、符合Nomad范式 +- **实施路径**: + 1. 给master、ash3c等已有consul节点打标签 `consul=true` + 2. 修改vault job约束条件,选择有consul标签的节点 + 3. 可选:给warden节点也打标签,后续部署consul到该节点 + +--- + +### 🔍 当前发现 +- 所有节点Attributes为null,说明Nomad客户端配置可能有问题 +- 用nomad拉起consul不能自动让节点具备consul属性 +- **重大发现**:nomad node status -verbose 和 -json 输出格式数据不一致! + - verbose模式显示Meta中有"consul = true" + - JSON格式显示Meta为null + - 可能是Nomad的bug或数据同步问题 + +### 🎯 下一步行动 +1. **调查Attributes为null的原因** - 检查Nomad客户端配置 +2. **考虑用ansible部署consul** - 确保consul作为系统服务运行 +3. **验证meta数据一致性** - 解决verbose和json格式数据不一致问题 +4. **重新思考节点标签策略** - 基于实际可用的数据格式制定策略 + +--- + +## 📋 待办清单 +- [ ] 检查bj-warden节点的consul配置 +- [ ] 在bj-warden节点启动consul服务 +- [ ] 验证vault job成功部署 +- [ ] 确认vault与consul集成正常 + +--- + +## 🚫 禁止操作 +- ❌ 移除vault job的service块 (会导致失去consul集成) +- ❌ 忽略consul版本约束 (会导致兼容性问题) \ No newline at end of file diff --git a/scripts/README.md b/scripts/README.md new file mode 100755 index 0000000..3b6dd5f --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,72 @@ +# 脚本目录结构说明 + +本目录包含项目中所有的脚本文件,按功能分类组织。 + +## 目录结构 + +``` +scripts/ +├── README.md # 本说明文件 +├── setup/ # 环境设置和初始化脚本 +│ ├── init/ # 初始化脚本 +│ ├── config/ # 配置生成脚本 +│ └── environment/ # 环境设置脚本 +├── deployment/ # 部署相关脚本 +│ ├── vault/ # Vault部署脚本 +│ ├── consul/ # Consul部署脚本 +│ ├── nomad/ # Nomad部署脚本 +│ └── infrastructure/ # 基础设施部署脚本 +├── testing/ # 测试脚本 +│ ├── unit/ # 单元测试 +│ ├── integration/ # 集成测试 +│ ├── mcp/ # MCP服务器测试 +│ └── infrastructure/ # 基础设施测试 +├── utilities/ # 工具脚本 +│ ├── backup/ # 备份相关 +│ ├── monitoring/ # 监控相关 +│ ├── maintenance/ # 维护相关 +│ └── helpers/ # 辅助工具 +├── mcp/ # MCP服务器相关脚本 +│ ├── servers/ # MCP服务器实现 +│ ├── configs/ # MCP配置脚本 +│ └── tools/ # MCP工具脚本 +└── ci-cd/ # CI/CD相关脚本 + ├── build/ # 构建脚本 + ├── deploy/ # 部署脚本 + └── quality/ # 代码质量检查脚本 +``` + +## 脚本命名规范 + +- 使用小写字母和连字符分隔 +- 功能明确的前缀: + - `init-` : 初始化脚本 + - `deploy-` : 部署脚本 + - `test-` : 测试脚本 + - `backup-` : 备份脚本 + - `monitor-` : 监控脚本 + - `setup-` : 设置脚本 + +## 使用说明 + +1. 所有脚本都应该有执行权限 +2. 脚本应该包含适当的错误处理 +3. 重要操作前应该有确认提示 +4. 脚本应该支持 `--help` 参数显示使用说明 + +## 快速访问 + +常用脚本的快速访问方式: + +```bash +# 测试相关 +make test # 运行所有测试 +./scripts/testing/mcp/test-all-mcp-servers.sh + +# 部署相关 +./scripts/deployment/vault/deploy-vault-dev.sh +./scripts/deployment/consul/deploy-consul-cluster.sh + +# 工具相关 +./scripts/utilities/backup/backup-all.sh +./scripts/utilities/monitoring/health-check.sh \ No newline at end of file diff --git a/scripts/SCRIPT_INDEX.md b/scripts/SCRIPT_INDEX.md new file mode 100755 index 0000000..b8af707 --- /dev/null +++ b/scripts/SCRIPT_INDEX.md @@ -0,0 +1,113 @@ +# 脚本索引 + +本文件列出了所有已整理的脚本及其功能说明。 + +## 设置和初始化脚本 (setup/) + +### 初始化脚本 (setup/init/) +- `init-vault-dev.sh` - 初始化开发环境的 Vault +- `init-vault-dev-api.sh` - 通过 API 初始化开发环境的 Vault +- `init-vault-cluster.sh` - 初始化 Vault 集群 + +### 配置生成脚本 (setup/config/) +- `setup-consul-cluster-variables.sh` - 设置 Consul 集群变量 +- `setup-consul-variables-and-storage.sh` - 设置 Consul 变量和存储 +- `generate-consul-config.sh` - 生成 Consul 配置文件 + +## 部署脚本 (deployment/) + +### Vault 部署 (deployment/vault/) +- `deploy-vault.sh` - 部署 Vault +- `vault-dev-example.sh` - Vault 开发环境示例 +- `vault-dev-quickstart.sh` - Vault 开发环境快速启动 + +### Consul 部署 (deployment/consul/) +- `deploy-consul-cluster-kv.sh` - 部署 Consul 集群(使用 KV 存储) +- `consul-variables-example.sh` - Consul 变量示例 + +## 测试脚本 (testing/) + +### 主测试运行器 (testing/) +- `test-runner.sh` - 主测试运行器 + +### 集成测试 (testing/integration/) +- `verify-vault-consul-integration.sh` - 验证 Vault-Consul 集成 + +### 基础设施测试 (testing/infrastructure/) +- `test-nomad-config.sh` - 测试 Nomad 配置 +- `test-traefik-deployment.sh` - 测试 Traefik 部署 + +### MCP 测试 (testing/mcp/) +- `test_direct_search.sh` - 直接搜索测试 +- `test_local_mcp_servers.sh` - 本地 MCP 服务器测试 +- `test_mcp_interface.sh` - MCP 接口测试 +- `test_mcp_search_final.sh` - MCP 搜索最终测试 +- `test_mcp_servers.sh` - MCP 服务器测试 +- `test_qdrant_ollama_tools.sh` - Qdrant Ollama 工具测试 +- `test_qdrant_ollama_tools_fixed.sh` - Qdrant Ollama 工具修复测试 +- `test_search_documents.sh` - 搜索文档测试 +- `test_mcp_servers_comprehensive.py` - MCP 服务器综合测试(Python) +- `test_mcp_servers_improved.py` - MCP 服务器改进测试(Python) +- `test_mcp_servers_simple.py` - MCP 服务器简单测试(Python) +- `test_qdrant_ollama_server.py` - Qdrant Ollama 服务器测试(Python) + +## 工具脚本 (utilities/) + +### 备份工具 (utilities/backup/) +- `backup-consul.sh` - 备份 Consul 数据 + +### 维护工具 (utilities/maintenance/) +- `cleanup-global-config.sh` - 清理全局配置 + +### 辅助工具 (utilities/helpers/) +- `show-vault-dev-keys.sh` - 显示 Vault 开发环境密钥 +- `nomad-leader-discovery.sh` - Nomad 领导者发现 +- `manage-vault-consul.sh` - 管理 Vault-Consul +- `fix-alpine-cgroups.sh` - 修复 Alpine cgroups +- `fix-alpine-cgroups-systemd.sh` - 修复 Alpine cgroups(systemd) + +## MCP 相关脚本 (mcp/) + +### MCP 服务器 (mcp/servers/) +- `qdrant-mcp-server.py` - Qdrant MCP 服务器 +- `qdrant-ollama-integration.py` - Qdrant Ollama 集成 +- `qdrant-ollama-mcp-server.py` - Qdrant Ollama MCP 服务器 + +### MCP 配置 (mcp/configs/) +- `sync-all-configs.sh` - 同步所有 MCP 配置 + +### MCP 工具 (mcp/tools/) +- `start-mcp-server.sh` - 启动 MCP 服务器 + +## 使用说明 + +### 快速启动命令 + +```bash +# 运行所有测试 +./scripts/testing/test-runner.sh + +# 初始化开发环境 +./scripts/setup/init/init-vault-dev.sh + +# 部署 Consul 集群 +./scripts/deployment/consul/deploy-consul-cluster-kv.sh + +# 启动 MCP 服务器 +./scripts/mcp/tools/start-mcp-server.sh + +# 备份 Consul +./scripts/utilities/backup/backup-consul.sh +``` + +### 权限设置 + +确保所有脚本都有执行权限: + +```bash +find scripts/ -name "*.sh" -exec chmod +x {} \; +``` + +### 环境变量 + +某些脚本可能需要特定的环境变量,请参考各脚本的注释说明。 \ No newline at end of file diff --git a/scripts/ci-cd/build/generate-docs.sh b/scripts/ci-cd/build/generate-docs.sh new file mode 100755 index 0000000..1b6bd60 --- /dev/null +++ b/scripts/ci-cd/build/generate-docs.sh @@ -0,0 +1,178 @@ +#!/bin/bash + +# 文档生成脚本 +# 自动生成项目文档 + +set -euo pipefail + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 生成脚本文档 +generate_script_docs() { + log_info "生成脚本文档..." + + local doc_file="docs/SCRIPTS.md" + mkdir -p "$(dirname "$doc_file")" + + cat > "$doc_file" << 'EOF' +# 脚本文档 + +本文档自动生成,包含项目中所有脚本的说明。 + +## 脚本列表 + +EOF + + # 遍历脚本目录 + find scripts/ -name "*.sh" -type f | sort | while read -r script; do + echo "### $script" >> "$doc_file" + echo "" >> "$doc_file" + + # 提取脚本描述(从注释中) + local description + description=$(head -n 10 "$script" | grep "^#" | grep -v "^#!/" | head -n 3 | sed 's/^# *//' || echo "无描述") + + echo "**描述**: $description" >> "$doc_file" + echo "" >> "$doc_file" + + # 检查是否有使用说明 + if grep -q "Usage:" "$script" || grep -q "用法:" "$script"; then + echo "**用法**: 请查看脚本内部说明" >> "$doc_file" + fi + + echo "" >> "$doc_file" + done + + log_success "脚本文档已生成: $doc_file" +} + +# 生成 API 文档 +generate_api_docs() { + log_info "生成 API 文档..." + + local doc_file="docs/API.md" + + cat > "$doc_file" << 'EOF' +# API 文档 + +## MCP 服务器 API + +### Qdrant MCP 服务器 + +- **端口**: 3000 +- **协议**: HTTP/JSON-RPC +- **功能**: 向量搜索和文档管理 + +### 主要端点 + +- `/search` - 搜索文档 +- `/add` - 添加文档 +- `/delete` - 删除文档 + +更多详细信息请参考各 MCP 服务器的源码。 +EOF + + log_success "API 文档已生成: $doc_file" +} + +# 生成部署文档 +generate_deployment_docs() { + log_info "生成部署文档..." + + local doc_file="docs/DEPLOYMENT.md" + + cat > "$doc_file" << 'EOF' +# 部署文档 + +## 快速开始 + +1. 环境设置 +```bash +make setup +``` + +2. 初始化服务 +```bash +./scripts/setup/init/init-vault-dev.sh +./scripts/deployment/consul/deploy-consul-cluster-kv.sh +``` + +3. 启动 MCP 服务器 +```bash +./scripts/mcp/tools/start-mcp-server.sh +``` + +## 详细部署步骤 + +请参考各组件的具体部署脚本和配置文件。 +EOF + + log_success "部署文档已生成: $doc_file" +} + +# 更新主 README +update_main_readme() { + log_info "更新主 README..." + + # 备份原 README + if [ -f "README.md" ]; then + cp "README.md" "README.md.backup" + fi + + # 在 README 中添加脚本整理信息 + cat >> "README.md" << 'EOF' + +## 脚本整理 + +项目脚本已重新整理,按功能分类存放在 `scripts/` 目录中: + +- `scripts/setup/` - 环境设置和初始化 +- `scripts/deployment/` - 部署相关脚本 +- `scripts/testing/` - 测试脚本 +- `scripts/utilities/` - 工具脚本 +- `scripts/mcp/` - MCP 服务器相关 +- `scripts/ci-cd/` - CI/CD 相关 + +详细信息请查看 [脚本索引](scripts/SCRIPT_INDEX.md)。 + +EOF + + log_success "主 README 已更新" +} + +# 主函数 +main() { + log_info "开始生成文档..." + + generate_script_docs + generate_api_docs + generate_deployment_docs + update_main_readme + + log_success "文档生成完成!" +} + +# 执行主函数 +main "$@" \ No newline at end of file diff --git a/scripts/ci-cd/quality/lint.sh b/scripts/ci-cd/quality/lint.sh new file mode 100755 index 0000000..cb3df35 --- /dev/null +++ b/scripts/ci-cd/quality/lint.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +# 代码质量检查脚本 +# 检查脚本语法、代码风格等 + +set -euo pipefail + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 计数器 +TOTAL_FILES=0 +PASSED_FILES=0 +FAILED_FILES=0 + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查 Shell 脚本语法 +check_shell_syntax() { + log_info "检查 Shell 脚本语法..." + + local shell_files + shell_files=$(find scripts/ -name "*.sh" -type f) + + if [ -z "$shell_files" ]; then + log_warning "未找到 Shell 脚本文件" + return 0 + fi + + while IFS= read -r file; do + ((TOTAL_FILES++)) + log_info "检查: $file" + + if bash -n "$file"; then + log_success "✓ $file" + ((PASSED_FILES++)) + else + log_error "✗ $file - 语法错误" + ((FAILED_FILES++)) + fi + done <<< "$shell_files" +} + +# 检查 Python 脚本语法 +check_python_syntax() { + log_info "检查 Python 脚本语法..." + + local python_files + python_files=$(find scripts/ -name "*.py" -type f) + + if [ -z "$python_files" ]; then + log_warning "未找到 Python 脚本文件" + return 0 + fi + + while IFS= read -r file; do + ((TOTAL_FILES++)) + log_info "检查: $file" + + if python3 -m py_compile "$file" 2>/dev/null; then + log_success "✓ $file" + ((PASSED_FILES++)) + else + log_error "✗ $file - 语法错误" + ((FAILED_FILES++)) + fi + done <<< "$python_files" +} + +# 检查脚本权限 +check_script_permissions() { + log_info "检查脚本执行权限..." + + local script_files + script_files=$(find scripts/ -name "*.sh" -type f) + + if [ -z "$script_files" ]; then + log_warning "未找到脚本文件" + return 0 + fi + + local permission_issues=0 + + while IFS= read -r file; do + if [ ! -x "$file" ]; then + log_warning "⚠ $file - 缺少执行权限" + ((permission_issues++)) + fi + done <<< "$script_files" + + if [ "$permission_issues" -eq 0 ]; then + log_success "所有脚本都有执行权限" + else + log_warning "发现 $permission_issues 个权限问题" + log_info "运行以下命令修复权限: find scripts/ -name '*.sh' -exec chmod +x {} \\;" + fi +} + +# 检查脚本头部 +check_script_headers() { + log_info "检查脚本头部..." + + local script_files + script_files=$(find scripts/ -name "*.sh" -type f) + + if [ -z "$script_files" ]; then + log_warning "未找到脚本文件" + return 0 + fi + + local header_issues=0 + + while IFS= read -r file; do + local first_line + first_line=$(head -n 1 "$file") + + if [[ ! "$first_line" =~ ^#!/bin/bash ]] && [[ ! "$first_line" =~ ^#!/usr/bin/env\ bash ]]; then + log_warning "⚠ $file - 缺少或错误的 shebang" + ((header_issues++)) + fi + done <<< "$script_files" + + if [ "$header_issues" -eq 0 ]; then + log_success "所有脚本都有正确的 shebang" + else + log_warning "发现 $header_issues 个 shebang 问题" + fi +} + +# 检查配置文件语法 +check_config_syntax() { + log_info "检查配置文件语法..." + + # 检查 JSON 文件 + local json_files + json_files=$(find . -name "*.json" -type f -not -path "./.git/*") + + if [ -n "$json_files" ]; then + while IFS= read -r file; do + ((TOTAL_FILES++)) + log_info "检查 JSON: $file" + + if jq empty "$file" 2>/dev/null; then + log_success "✓ $file" + ((PASSED_FILES++)) + else + log_error "✗ $file - JSON 语法错误" + ((FAILED_FILES++)) + fi + done <<< "$json_files" + fi + + # 检查 YAML 文件 + local yaml_files + yaml_files=$(find . -name "*.yml" -o -name "*.yaml" -type f -not -path "./.git/*") + + if [ -n "$yaml_files" ] && command -v yamllint &> /dev/null; then + while IFS= read -r file; do + ((TOTAL_FILES++)) + log_info "检查 YAML: $file" + + if yamllint "$file" 2>/dev/null; then + log_success "✓ $file" + ((PASSED_FILES++)) + else + log_error "✗ $file - YAML 语法错误" + ((FAILED_FILES++)) + fi + done <<< "$yaml_files" + elif [ -n "$yaml_files" ]; then + log_warning "yamllint 未安装,跳过 YAML 检查" + fi +} + +# 生成报告 +generate_report() { + log_info "生成检查报告..." + + echo + echo "==================================" + echo " 代码质量检查报告" + echo "==================================" + echo "总文件数: $TOTAL_FILES" + echo "通过: $PASSED_FILES" + echo "失败: $FAILED_FILES" + echo "成功率: $(( PASSED_FILES * 100 / (TOTAL_FILES == 0 ? 1 : TOTAL_FILES) ))%" + echo "==================================" + + if [ "$FAILED_FILES" -eq 0 ]; then + log_success "所有检查都通过了!" + return 0 + else + log_error "发现 $FAILED_FILES 个问题,请修复后重新运行" + return 1 + fi +} + +# 主函数 +main() { + log_info "开始代码质量检查..." + + check_shell_syntax + check_python_syntax + check_script_permissions + check_script_headers + check_config_syntax + + generate_report +} + +# 执行主函数 +main "$@" \ No newline at end of file diff --git a/scripts/ci-cd/quality/security-scan.sh b/scripts/ci-cd/quality/security-scan.sh new file mode 100755 index 0000000..6367d9b --- /dev/null +++ b/scripts/ci-cd/quality/security-scan.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +# 安全扫描脚本 +# 扫描代码中的安全问题和敏感信息 + +set -euo pipefail + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 计数器 +TOTAL_ISSUES=0 +HIGH_ISSUES=0 +MEDIUM_ISSUES=0 +LOW_ISSUES=0 + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查敏感信息泄露 +check_secrets() { + log_info "检查敏感信息泄露..." + + local patterns=( + "password\s*=\s*['\"][^'\"]*['\"]" + "token\s*=\s*['\"][^'\"]*['\"]" + "api_key\s*=\s*['\"][^'\"]*['\"]" + "secret\s*=\s*['\"][^'\"]*['\"]" + "private_key" + "-----BEGIN.*PRIVATE KEY-----" + ) + + local found_secrets=0 + + for pattern in "${patterns[@]}"; do + local matches + matches=$(grep -r -i -E "$pattern" . --exclude-dir=.git --exclude-dir=backups 2>/dev/null || true) + + if [ -n "$matches" ]; then + log_error "发现可能的敏感信息:" + echo "$matches" + ((found_secrets++)) + ((HIGH_ISSUES++)) + fi + done + + if [ "$found_secrets" -eq 0 ]; then + log_success "未发现明显的敏感信息泄露" + else + log_error "发现 $found_secrets 种类型的敏感信息,请检查并移除" + fi + + ((TOTAL_ISSUES += found_secrets)) +} + +# 检查不安全的命令使用 +check_unsafe_commands() { + log_info "检查不安全的命令使用..." + + local unsafe_patterns=( + "rm\s+-rf\s+/" + "chmod\s+777" + "curl.*-k" + "wget.*--no-check-certificate" + ) + + local unsafe_found=0 + + for pattern in "${unsafe_patterns[@]}"; do + local matches + matches=$(grep -r -E "$pattern" scripts/ 2>/dev/null || true) + + if [ -n "$matches" ]; then + log_warning "发现可能不安全的命令使用:" + echo "$matches" + ((unsafe_found++)) + ((MEDIUM_ISSUES++)) + fi + done + + if [ "$unsafe_found" -eq 0 ]; then + log_success "未发现明显不安全的命令使用" + else + log_warning "发现 $unsafe_found 个可能不安全的命令,请检查" + fi + + ((TOTAL_ISSUES += unsafe_found)) +} + +# 生成报告 +generate_report() { + log_info "生成安全扫描报告..." + + echo + echo "==================================" + echo " 安全扫描报告" + echo "==================================" + echo "总问题数: $TOTAL_ISSUES" + echo "高危: $HIGH_ISSUES" + echo "中危: $MEDIUM_ISSUES" + echo "低危: $LOW_ISSUES" + echo "==================================" + + if [ "$TOTAL_ISSUES" -eq 0 ]; then + log_success "安全扫描通过,未发现问题!" + return 0 + else + log_warning "发现 $TOTAL_ISSUES 个安全问题,请检查并修复" + return 1 + fi +} + +# 主函数 +main() { + log_info "开始安全扫描..." + + check_secrets + check_unsafe_commands + + generate_report +} + +# 执行主函数 +main "$@" \ No newline at end of file diff --git a/deployment/scripts/consul_variables_example.sh b/scripts/deployment/consul/consul-variables-example.sh similarity index 100% rename from deployment/scripts/consul_variables_example.sh rename to scripts/deployment/consul/consul-variables-example.sh diff --git a/scripts/deployment/consul/deploy-consul-cluster-kv.sh b/scripts/deployment/consul/deploy-consul-cluster-kv.sh new file mode 100755 index 0000000..793371f --- /dev/null +++ b/scripts/deployment/consul/deploy-consul-cluster-kv.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# Consul集群部署脚本 - 遵循最佳变量命名规范 +# 此脚本将部署一个完全遵循 config/{environment}/{provider}/{region_or_service}/{key} 格式的Consul集群 + +set -e + +# 配置参数 +CONSUL_ADDR="${CONSUL_ADDR:-localhost:8500}" +ENVIRONMENT="${ENVIRONMENT:-dev}" +NOMAD_ADDR="${NOMAD_ADDR:-localhost:4646}" +CONSUL_CONFIG_DIR="${CONSUL_CONFIG_DIR:-/root/mgmt/components/consul/configs}" +CONSUL_JOBS_DIR="${CONSUL_JOBS_DIR:-/root/mgmt/components/consul/jobs}" + +echo "开始部署遵循最佳变量命名规范的Consul集群..." +echo "Consul地址: $CONSUL_ADDR" +echo "Nomad地址: $NOMAD_ADDR" +echo "环境: $ENVIRONMENT" + +# 检查Consul连接 +echo "检查Consul连接..." +if ! curl -s "$CONSUL_ADDR/v1/status/leader" | grep -q "."; then + echo "错误: 无法连接到Consul服务器 $CONSUL_ADDR" + exit 1 +fi +echo "Consul连接成功" + +# 检查Nomad连接 +echo "检查Nomad连接..." +if ! curl -s "$NOMAD_ADDR/v1/status/leader" | grep -q "."; then + echo "错误: 无法连接到Nomad服务器 $NOMAD_ADDR" + exit 1 +fi +echo "Nomad连接成功" + +# 步骤1: 设置Consul变量 +echo "步骤1: 设置Consul变量..." +/root/mgmt/deployment/scripts/setup_consul_cluster_variables.sh + +# 步骤2: 生成Consul配置文件 +echo "步骤2: 生成Consul配置文件..." +/root/mgmt/deployment/scripts/generate_consul_config.sh + +# 步骤3: 停止现有的Consul集群 +echo "步骤3: 停止现有的Consul集群..." +if nomad job status consul-cluster-simple 2>/dev/null; then + nomad job stop consul-cluster-simple + echo "已停止现有的consul-cluster-simple作业" +fi + +if nomad job status consul-cluster-dynamic 2>/dev/null; then + nomad job stop consul-cluster-dynamic + echo "已停止现有的consul-cluster-dynamic作业" +fi + +if nomad job status consul-cluster-kv 2>/dev/null; then + nomad job stop consul-cluster-kv + echo "已停止现有的consul-cluster-kv作业" +fi + +# 步骤4: 部署新的Consul集群 +echo "步骤4: 部署新的Consul集群..." +nomad job run $CONSUL_JOBS_DIR/consul-cluster-kv.nomad + +# 步骤5: 验证部署 +echo "步骤5: 验证部署..." +sleep 10 + +# 检查作业状态 +if nomad job status consul-cluster-kv | grep -q "running"; then + echo "Consul集群作业正在运行" +else + echo "错误: Consul集群作业未运行" + exit 1 +fi + +# 检查Consul集群状态 +if curl -s "$CONSUL_ADDR/v1/status/leader" | grep -q "."; then + echo "Consul集群leader已选举" +else + echo "错误: Consul集群leader未选举" + exit 1 +fi + +# 检查节点数量 +NODE_COUNT=$(curl -s "$CONSUL_ADDR/v1/status/peers" | jq '. | length') +if [ "$NODE_COUNT" -eq 3 ]; then + echo "Consul集群节点数量正确: $NODE_COUNT" +else + echo "警告: Consul集群节点数量不正确: $NODE_COUNT (期望: 3)" +fi + +# 步骤6: 验证变量配置 +echo "步骤6: 验证变量配置..." + +# 检查一些关键变量 +if curl -s "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/cluster/datacenter" | jq -r '.[].Value' | base64 -d | grep -q "dc1"; then + echo "Consul数据中心配置正确" +else + echo "警告: Consul数据中心配置可能不正确" +fi + +if curl -s "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/nodes/master/ip" | jq -r '.[].Value' | base64 -d | grep -q "100.117.106.136"; then + echo "Consul master节点IP配置正确" +else + echo "警告: Consul master节点IP配置可能不正确" +fi + +# 步骤7: 显示访问信息 +echo "步骤7: 显示访问信息..." +echo "Consul UI地址: http://100.117.106.136:8500" +echo "Consul API地址: http://100.117.106.136:8500/v1" +echo "Nomad UI地址: http://100.117.106.136:4646" +echo "Nomad API地址: http://100.117.106.136:4646/v1" + +echo "Consul集群部署完成!" +echo "集群现在完全遵循最佳变量命名规范: config/{environment}/{provider}/{region_or_service}/{key}" \ No newline at end of file diff --git a/deployment/scripts/deploy_vault.sh b/scripts/deployment/vault/deploy-vault.sh similarity index 100% rename from deployment/scripts/deploy_vault.sh rename to scripts/deployment/vault/deploy-vault.sh diff --git a/deployment/scripts/vault_dev_example.sh b/scripts/deployment/vault/vault-dev-example.sh similarity index 100% rename from deployment/scripts/vault_dev_example.sh rename to scripts/deployment/vault/vault-dev-example.sh diff --git a/scripts/deployment/vault/vault-dev-quickstart.sh b/scripts/deployment/vault/vault-dev-quickstart.sh new file mode 100755 index 0000000..f95421b --- /dev/null +++ b/scripts/deployment/vault/vault-dev-quickstart.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Vault开发环境快速开始指南 + +echo "===== Vault开发环境快速开始 =====" + +# 1. 设置环境变量 +echo "1. 设置环境变量" +source /root/mgmt/security/secrets/vault/dev/vault_env.sh +echo "VAULT_ADDR: $VAULT_ADDR" +echo "VAULT_TOKEN: $VAULT_TOKEN" + +# 2. 检查Vault状态 +echo "" +echo "2. 检查Vault状态" +vault status + +# 3. 存储密钥值 +echo "" +echo "3. 存储密钥值" +vault kv put secret/example/api_key value="my_secret_api_key_12345" + +# 4. 读取密钥值 +echo "" +echo "4. 读取密钥值" +vault kv get secret/example/api_key + +# 5. 列出密钥路径 +echo "" +echo "5. 列出密钥路径" +vault kv list secret/example/ + +# 6. 创建策略示例 +echo "" +echo "6. 创建示例策略" +cat > /tmp/example-policy.hcl << EOF +# 示例策略 - 允许读取secret/example路径下的密钥 +path "secret/example/*" { + capabilities = ["read", "list"] +} + +# 允许列出密钥引擎 +path "sys/mounts" { + capabilities = ["read"] +} +EOF + +vault policy write example-policy /tmp/example-policy.hcl + +# 7. 创建有限权限令牌 +echo "" +echo "7. 创建有限权限令牌" +vault token create -policy=example-policy + +echo "" +echo "===== Vault开发环境快速开始完成 =====" +echo "您现在可以开始在开发环境中使用Vault了!" \ No newline at end of file diff --git a/sync_all_mcp_configs.sh b/scripts/mcp/configs/sync-all-configs.sh similarity index 100% rename from sync_all_mcp_configs.sh rename to scripts/mcp/configs/sync-all-configs.sh diff --git a/qdrant_mcp_server.py b/scripts/mcp/servers/qdrant-mcp-server.py old mode 100644 new mode 100755 similarity index 100% rename from qdrant_mcp_server.py rename to scripts/mcp/servers/qdrant-mcp-server.py diff --git a/qdrant_ollama_integration.py b/scripts/mcp/servers/qdrant-ollama-integration.py old mode 100644 new mode 100755 similarity index 100% rename from qdrant_ollama_integration.py rename to scripts/mcp/servers/qdrant-ollama-integration.py diff --git a/qdrant_ollama_mcp_server.py b/scripts/mcp/servers/qdrant-ollama-mcp-server.py old mode 100644 new mode 100755 similarity index 100% rename from qdrant_ollama_mcp_server.py rename to scripts/mcp/servers/qdrant-ollama-mcp-server.py diff --git a/start_mcp_server.sh b/scripts/mcp/tools/start-mcp-server.sh old mode 100644 new mode 100755 similarity index 100% rename from start_mcp_server.sh rename to scripts/mcp/tools/start-mcp-server.sh diff --git a/scripts/setup/config/generate-consul-config.sh b/scripts/setup/config/generate-consul-config.sh new file mode 100755 index 0000000..8404e52 --- /dev/null +++ b/scripts/setup/config/generate-consul-config.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Consul配置生成脚本 +# 此脚本使用Consul模板从KV存储生成最终的Consul配置文件 + +set -e + +# 配置参数 +CONSUL_ADDR="${CONSUL_ADDR:-localhost:8500}" +ENVIRONMENT="${ENVIRONMENT:-dev}" +CONSUL_CONFIG_DIR="${CONSUL_CONFIG_DIR:-/root/mgmt/components/consul/configs}" +CONSUL_TEMPLATE_CMD="${CONSUL_TEMPLATE_CMD:-consul-template}" + +echo "开始生成Consul配置文件..." +echo "Consul地址: $CONSUL_ADDR" +echo "环境: $ENVIRONMENT" +echo "配置目录: $CONSUL_CONFIG_DIR" + +# 检查Consul连接 +echo "检查Consul连接..." +if ! curl -s "$CONSUL_ADDR/v1/status/leader" | grep -q "."; then + echo "错误: 无法连接到Consul服务器 $CONSUL_ADDR" + exit 1 +fi +echo "Consul连接成功" + +# 检查consul-template是否可用 +if ! command -v $CONSUL_TEMPLATE_CMD &> /dev/null; then + echo "错误: consul-template 命令不可用,请安装consul-template" + exit 1 +fi + +# 设置环境变量 +export CONSUL_ADDR +export ENVIRONMENT + +# 使用consul-template生成配置文件 +echo "使用consul-template生成配置文件..." +$CONSUL_TEMPLATE_CMD \ + -template="$CONSUL_CONFIG_DIR/consul.hcl.tmpl:$CONSUL_CONFIG_DIR/consul.hcl" \ + -once \ + -consul-addr="$CONSUL_ADDR" + +# 验证生成的配置文件 +if [ -f "$CONSUL_CONFIG_DIR/consul.hcl" ]; then + echo "配置文件生成成功: $CONSUL_CONFIG_DIR/consul.hcl" + + # 验证配置文件语法 + echo "验证配置文件语法..." + if consul validate $CONSUL_CONFIG_DIR/consul.hcl; then + echo "配置文件语法验证通过" + else + echo "错误: 配置文件语法验证失败" + exit 1 + fi +else + echo "错误: 配置文件生成失败" + exit 1 +fi + +echo "Consul配置文件生成完成" \ No newline at end of file diff --git a/scripts/setup/config/setup-consul-cluster-variables.sh b/scripts/setup/config/setup-consul-cluster-variables.sh new file mode 100755 index 0000000..23c5c38 --- /dev/null +++ b/scripts/setup/config/setup-consul-cluster-variables.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +# Consul变量配置脚本 - 遵循最佳命名规范 +# 此脚本将Consul集群配置存储到Consul KV中,遵循 config/{environment}/{provider}/{region_or_service}/{key} 格式 + +set -e + +# 配置参数 +CONSUL_ADDR="${CONSUL_ADDR:-localhost:8500}" +ENVIRONMENT="${ENVIRONMENT:-dev}" +CONSUL_CONFIG_DIR="${CONSUL_CONFIG_DIR:-/root/mgmt/components/consul/configs}" + +echo "开始配置Consul变量,遵循最佳命名规范..." +echo "Consul地址: $CONSUL_ADDR" +echo "环境: $ENVIRONMENT" + +# 检查Consul连接 +echo "检查Consul连接..." +if ! curl -s "$CONSUL_ADDR/v1/status/leader" | grep -q "."; then + echo "错误: 无法连接到Consul服务器 $CONSUL_ADDR" + exit 1 +fi +echo "Consul连接成功" + +# 创建Consul集群配置变量 +echo "创建Consul集群配置变量..." + +# 基础配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/cluster/data_dir" -d "/opt/consul/data" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/cluster/raft_dir" -d "/opt/consul/raft" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/cluster/datacenter" -d "dc1" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/cluster/bootstrap_expect" -d "3" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/cluster/log_level" -d "INFO" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/cluster/encrypt_key" -d "YourEncryptionKeyHere" + +# UI配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/ui/enabled" -d "true" + +# 网络配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/network/client_addr" -d "0.0.0.0" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/network/bind_interface" -d "eth0" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/network/advertise_interface" -d "eth0" + +# 端口配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/ports/dns" -d "8600" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/ports/http" -d "8500" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/ports/https" -d "-1" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/ports/grpc" -d "8502" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/ports/grpc_tls" -d "8503" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/ports/serf_lan" -d "8301" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/ports/serf_wan" -d "8302" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/ports/server" -d "8300" + +# 节点配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/nodes/master/ip" -d "100.117.106.136" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/nodes/ash3c/ip" -d "100.116.80.94" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/nodes/warden/ip" -d "100.122.197.112" + +# 服务发现配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/service/enable_script_checks" -d "true" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/service/enable_local_script_checks" -d "true" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/service/enable_service_script" -d "true" + +# 性能配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/performance/raft_multiplier" -d "1" + +# 日志配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/log/enable_syslog" -d "false" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/log/log_file" -d "/var/log/consul/consul.log" + +# 连接配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/connection/reconnect_timeout" -d "30s" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/connection/reconnect_timeout_wan" -d "30s" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/connection/session_ttl_min" -d "10s" + +# Autopilot配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/autopilot/cleanup_dead_servers" -d "true" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/autopilot/last_contact_threshold" -d "200ms" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/autopilot/max_trailing_logs" -d "250" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/autopilot/server_stabilization_time" -d "10s" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/autopilot/disable_upgrade_migration" -d "false" +# 添加领导者优先级配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/autopilot/redundancy_zone_tag_master" -d "vice_president" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/autopilot/redundancy_zone_tag_warden" -d "president" + +# 快照配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/snapshot/enabled" -d "true" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/snapshot/interval" -d "24h" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/snapshot/retain" -d "30" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/snapshot/name" -d "consul-snapshot-{{.Timestamp}}" + +# 备份配置 +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/backup/enabled" -d "true" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/backup/interval" -d "6h" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/backup/retain" -d "7" +curl -X PUT "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/backup/name" -d "consul-backup-{{.Timestamp}}" + +echo "Consul变量配置完成" + +# 验证配置 +echo "验证配置..." +curl -s "$CONSUL_ADDR/v1/kv/config/$ENVIRONMENT/consul/?keys" | jq -r '.[]' | head -10 + +echo "Consul变量配置脚本执行完成" \ No newline at end of file diff --git a/deployment/scripts/setup_consul_variables_and_storage.sh b/scripts/setup/config/setup-consul-variables-and-storage.sh similarity index 100% rename from deployment/scripts/setup_consul_variables_and_storage.sh rename to scripts/setup/config/setup-consul-variables-and-storage.sh diff --git a/scripts/setup/environment/setup-environment.sh b/scripts/setup/environment/setup-environment.sh new file mode 100755 index 0000000..2915dc1 --- /dev/null +++ b/scripts/setup/environment/setup-environment.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# 环境设置脚本 +# 用于设置开发环境的必要组件和依赖 + +set -euo pipefail + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查必要的工具 +check_dependencies() { + log_info "检查系统依赖..." + + local deps=("git" "curl" "wget" "jq" "docker" "podman") + local missing_deps=() + + for dep in "${deps[@]}"; do + if ! command -v "$dep" &> /dev/null; then + missing_deps+=("$dep") + fi + done + + if [ ${#missing_deps[@]} -ne 0 ]; then + log_warning "缺少以下依赖: ${missing_deps[*]}" + log_info "请安装缺少的依赖后重新运行" + return 1 + fi + + log_success "所有依赖检查通过" +} + +# 设置环境变量 +setup_environment_variables() { + log_info "设置环境变量..." + + # 创建环境变量文件 + cat > .env << EOF +# 项目环境变量 +PROJECT_ROOT=$(pwd) +SCRIPTS_DIR=\${PROJECT_ROOT}/scripts + +# Vault 配置 +VAULT_ADDR=http://127.0.0.1:8200 +VAULT_DEV_ROOT_TOKEN_ID=myroot + +# Consul 配置 +CONSUL_HTTP_ADDR=http://127.0.0.1:8500 + +# Nomad 配置 +NOMAD_ADDR=http://127.0.0.1:4646 + +# MCP 配置 +MCP_SERVER_PORT=3000 +EOF + + log_success "环境变量文件已创建: .env" +} + +# 创建必要的目录 +create_directories() { + log_info "创建必要的目录..." + + local dirs=( + "logs" + "tmp" + "data" + "backups/vault" + "backups/consul" + "backups/nomad" + ) + + for dir in "${dirs[@]}"; do + mkdir -p "$dir" + log_info "创建目录: $dir" + done + + log_success "目录创建完成" +} + +# 设置脚本权限 +setup_script_permissions() { + log_info "设置脚本执行权限..." + + find scripts/ -name "*.sh" -exec chmod +x {} \; + + log_success "脚本权限设置完成" +} + +# 初始化 Git hooks(如果需要) +setup_git_hooks() { + log_info "设置 Git hooks..." + + if [ -d ".git" ]; then + # 创建 pre-commit hook + cat > .git/hooks/pre-commit << 'EOF' +#!/bin/bash +# 运行基本的代码检查 +echo "运行 pre-commit 检查..." + +# 检查脚本语法 +find scripts/ -name "*.sh" -exec bash -n {} \; || exit 1 + +echo "Pre-commit 检查通过" +EOF + chmod +x .git/hooks/pre-commit + log_success "Git hooks 设置完成" + else + log_warning "不是 Git 仓库,跳过 Git hooks 设置" + fi +} + +# 主函数 +main() { + log_info "开始环境设置..." + + check_dependencies || exit 1 + setup_environment_variables + create_directories + setup_script_permissions + setup_git_hooks + + log_success "环境设置完成!" + log_info "请运行 'source .env' 来加载环境变量" +} + +# 执行主函数 +main "$@" \ No newline at end of file diff --git a/deployment/scripts/init_vault_cluster.sh b/scripts/setup/init/init-vault-cluster.sh similarity index 100% rename from deployment/scripts/init_vault_cluster.sh rename to scripts/setup/init/init-vault-cluster.sh diff --git a/scripts/setup/init/init-vault-dev-api.sh b/scripts/setup/init/init-vault-dev-api.sh new file mode 100755 index 0000000..7c554ce --- /dev/null +++ b/scripts/setup/init/init-vault-dev-api.sh @@ -0,0 +1,129 @@ +#!/bin/bash +# 通过API初始化Vault开发环境(无需本地vault命令) + +set -e + +echo "===== 通过API初始化Vault开发环境 =====" + +# 颜色定义 +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +# 函数定义 +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 设置主节点地址 +VAULT_MASTER_ADDR='http://100.117.106.136:8200' + +# 等待Vault启动 +log_info "等待Vault启动..." +for i in {1..30}; do + if curl -s "$VAULT_MASTER_ADDR/v1/sys/health" > /dev/null; then + break + fi + echo -n "." + sleep 2 +done +echo "" + +# 检查Vault是否已初始化 +init_status=$(curl -s "$VAULT_MASTER_ADDR/v1/sys/health" | grep -o '"initialized":[^,}]*' | cut -d ':' -f2) +if [ "$init_status" = "false" ]; then + log_info "Vault未初始化,正在通过API初始化..." + + # 通过API初始化Vault(1个密钥,阈值1) + init_response=$(curl -s -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "secret_shares": 1, + "secret_threshold": 1 + }' \ + "$VAULT_MASTER_ADDR/v1/sys/init") + + # 保存响应到文件 + echo "$init_response" > /root/mgmt/security/secrets/vault/dev/init_keys.json + + if echo "$init_response" | grep -q "keys_base64"; then + log_info "Vault初始化成功(开发模式)" + log_warn "注意:这是开发模式,仅使用1个解封密钥" + log_warn "生产环境请使用5个密钥中的3个阈值" + + # 提取密钥和令牌 + unseal_key=$(echo "$init_response" | grep -o '"keys_base64":\["[^"]*"' | cut -d '"' -f4) + root_token=$(echo "$init_response" | grep -o '"root_token":"[^"]*"' | cut -d '"' -f4) + + log_info "解封密钥: $unseal_key" + log_info "根令牌: $root_token" + + # 自动解封所有节点 + log_info "正在自动解封所有Vault节点..." + + # 解封master节点 + curl -s -X POST \ + -H "Content-Type: application/json" \ + -d "{\"key\": \"$unseal_key\"}" \ + "$VAULT_MASTER_ADDR/v1/sys/unseal" > /dev/null + + # 解封ash3c节点 + curl -s -X POST \ + -H "Content-Type: application/json" \ + -d "{\"key\": \"$unseal_key\"}" \ + "http://100.116.80.94:8200/v1/sys/unseal" > /dev/null + + # 解封warden节点 + curl -s -X POST \ + -H "Content-Type: application/json" \ + -d "{\"key\": \"$unseal_key\"}" \ + "http://100.122.197.112:8200/v1/sys/unseal" > /dev/null + + log_info "所有Vault节点已成功解封" + + # 显示Vault状态 + log_info "Vault集群状态:" + curl -s "$VAULT_MASTER_ADDR/v1/sys/health" | jq . + + # 保存环境变量以便后续使用 + echo "export VAULT_ADDR='$VAULT_MASTER_ADDR'" > /root/mgmt/security/secrets/vault/dev/vault_env.sh + echo "export VAULT_TOKEN='$root_token'" >> /root/mgmt/security/secrets/vault/dev/vault_env.sh + log_info "环境变量已保存到: /root/mgmt/security/secrets/vault/dev/vault_env.sh" + + log_warn "开发环境提示:" + log_warn "1. 请勿在生产环境中使用此配置" + log_warn "2. 生产环境应使用5个密钥中的3个阈值" + log_warn "3. 密钥应分发给不同管理员保管" + else + log_error "Vault初始化失败" + log_error "响应: $init_response" + exit 1 + fi +else + log_info "Vault已初始化" + + # 检查Vault是否已解封 + sealed_status=$(curl -s "$VAULT_MASTER_ADDR/v1/sys/health" | grep -o '"sealed":[^,}]*' | cut -d ':' -f2) + if [ "$sealed_status" = "true" ]; then + log_warn "Vault已初始化但仍处于密封状态" + log_info "请使用API解封:" + log_info "curl -X POST -d '{\"key\": \"<解封密钥>\"}' $VAULT_MASTER_ADDR/v1/sys/unseal" + else + log_info "Vault已初始化且已解封,可以正常使用" + + # 显示Vault状态 + log_info "Vault集群状态:" + curl -s "$VAULT_MASTER_ADDR/v1/sys/health" | jq . + fi +fi + +log_info "===== Vault开发环境初始化完成 =====" \ No newline at end of file diff --git a/deployment/scripts/init_vault_dev.sh b/scripts/setup/init/init-vault-dev.sh similarity index 100% rename from deployment/scripts/init_vault_dev.sh rename to scripts/setup/init/init-vault-dev.sh diff --git a/scripts/testing/infrastructure/test-nomad-config.sh b/scripts/testing/infrastructure/test-nomad-config.sh new file mode 100755 index 0000000..ad2132b --- /dev/null +++ b/scripts/testing/infrastructure/test-nomad-config.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# 测试Nomad配置文件 +CONFIG_FILE=$1 + +if [ -z "$CONFIG_FILE" ]; then + echo "请提供配置文件路径" + exit 1 +fi + +if [ ! -f "$CONFIG_FILE" ]; then + echo "配置文件不存在: $CONFIG_FILE" + exit 1 +fi + +echo "测试配置文件: $CONFIG_FILE" + +# 尝试使用nomad agent命令测试配置 +nomad agent -config="$CONFIG_FILE" -config-test 2>&1 | head -20 \ No newline at end of file diff --git a/deployment/scripts/test-traefik-deployment.sh b/scripts/testing/infrastructure/test-traefik-deployment.sh old mode 100644 new mode 100755 similarity index 100% rename from deployment/scripts/test-traefik-deployment.sh rename to scripts/testing/infrastructure/test-traefik-deployment.sh diff --git a/scripts/testing/integration/verify-vault-consul-integration.sh b/scripts/testing/integration/verify-vault-consul-integration.sh new file mode 100755 index 0000000..3c2aa5f --- /dev/null +++ b/scripts/testing/integration/verify-vault-consul-integration.sh @@ -0,0 +1,117 @@ +#!/bin/bash +# 验证Vault与Consul集成状态 + +echo "===== 验证Vault与Consul集成 =====" + +# 颜色定义 +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +# 函数定义 +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 1. 检查Vault状态 +log_info "1. 检查Vault状态" +source /root/mgmt/security/secrets/vault/dev/vault_env.sh +vault_status=$(vault status 2>/dev/null) +if [ $? -eq 0 ]; then + echo "$vault_status" + storage_type=$(echo "$vault_status" | grep "Storage Type" | awk '{print $3}') + if [ "$storage_type" = "consul" ]; then + log_info "✓ Vault正在使用Consul作为存储后端" + else + log_error "✗ Vault未使用Consul作为存储后端" + exit 1 + fi +else + log_error "✗ 无法连接到Vault" + exit 1 +fi + +# 2. 检查Consul集群状态 +log_info "" +log_info "2. 检查Consul集群状态" +consul_members=$(consul members 2>/dev/null) +if [ $? -eq 0 ]; then + echo "$consul_members" + alive_count=$(echo "$consul_members" | grep -c "alive") + if [ "$alive_count" -ge 1 ]; then + log_info "✓ Consul集群正在运行" + else + log_error "✗ Consul集群无活动节点" + fi +else + log_error "✗ 无法连接到Consul" +fi + +# 3. 检查Consul中的Vault数据 +log_info "" +log_info "3. 检查Consul中的Vault数据" +vault_data=$(curl -s http://100.117.106.136:8500/v1/kv/vault/?recurse 2>/dev/null) +if [ $? -eq 0 ] && [ -n "$vault_data" ]; then + keys_count=$(echo "$vault_data" | jq length) + log_info "✓ Consul中存储了 $keys_count 个Vault相关键值对" + + # 显示一些关键的Vault数据 + echo "关键Vault数据键:" + echo "$vault_data" | jq -r '.[].Key' | head -10 +else + log_error "✗ 无法从Consul获取Vault数据" +fi + +# 4. 验证Vault数据读写 +log_info "" +log_info "4. 验证Vault数据读写" +# 写入测试数据 +test_write=$(vault kv put secret/integration-test/test-key test_value="integration_test_$(date +%s)" 2>&1) +if echo "$test_write" | grep -q "Success"; then + log_info "✓ 成功写入测试数据到Vault" + + # 读取测试数据 + test_read=$(vault kv get secret/integration-test/test-key 2>&1) + if echo "$test_read" | grep -q "test_value"; then + log_info "✓ 成功从Vault读取测试数据" + echo "$test_read" + else + log_error "✗ 无法从Vault读取测试数据" + echo "$test_read" + fi + + # 清理测试数据 + vault kv delete secret/integration-test/test-key >/dev/null 2>&1 +else + log_error "✗ 无法写入测试数据到Vault" + echo "$test_write" +fi + +# 5. 检查Vault集群状态 +log_info "" +log_info "5. 检查Vault集群状态" +cluster_status=$(vault operator raft list-peers 2>&1) +if echo "$cluster_status" | grep -q "executable file not found"; then + log_info "✓ 使用Consul存储后端(非Raft存储)" +else + echo "$cluster_status" +fi + +# 6. 总结 +log_info "" +log_info "===== 集成验证总结 =====" +log_info "✓ Vault已成功集成Consul作为存储后端" +log_info "✓ Consul集群正常运行" +log_info "✓ Vault数据已存储在Consul中" +log_info "✓ Vault读写功能正常" + +log_warn "注意:这是开发环境配置,生产环境请遵循安全策略" \ No newline at end of file diff --git a/tests/mcp_servers/test_direct_search.sh b/scripts/testing/mcp/test_direct_search.sh similarity index 100% rename from tests/mcp_servers/test_direct_search.sh rename to scripts/testing/mcp/test_direct_search.sh diff --git a/tests/mcp_servers/test_local_mcp_servers.sh b/scripts/testing/mcp/test_local_mcp_servers.sh similarity index 100% rename from tests/mcp_servers/test_local_mcp_servers.sh rename to scripts/testing/mcp/test_local_mcp_servers.sh diff --git a/tests/mcp_servers/test_mcp_interface.sh b/scripts/testing/mcp/test_mcp_interface.sh similarity index 100% rename from tests/mcp_servers/test_mcp_interface.sh rename to scripts/testing/mcp/test_mcp_interface.sh diff --git a/tests/mcp_servers/test_mcp_search_final.sh b/scripts/testing/mcp/test_mcp_search_final.sh similarity index 100% rename from tests/mcp_servers/test_mcp_search_final.sh rename to scripts/testing/mcp/test_mcp_search_final.sh diff --git a/tests/mcp_servers/test_mcp_servers.sh b/scripts/testing/mcp/test_mcp_servers.sh similarity index 100% rename from tests/mcp_servers/test_mcp_servers.sh rename to scripts/testing/mcp/test_mcp_servers.sh diff --git a/tests/mcp_servers/test_mcp_servers_comprehensive.py b/scripts/testing/mcp/test_mcp_servers_comprehensive.py old mode 100644 new mode 100755 similarity index 100% rename from tests/mcp_servers/test_mcp_servers_comprehensive.py rename to scripts/testing/mcp/test_mcp_servers_comprehensive.py diff --git a/tests/mcp_servers/test_mcp_servers_improved.py b/scripts/testing/mcp/test_mcp_servers_improved.py old mode 100644 new mode 100755 similarity index 100% rename from tests/mcp_servers/test_mcp_servers_improved.py rename to scripts/testing/mcp/test_mcp_servers_improved.py diff --git a/tests/mcp_servers/test_mcp_servers_simple.py b/scripts/testing/mcp/test_mcp_servers_simple.py old mode 100644 new mode 100755 similarity index 100% rename from tests/mcp_servers/test_mcp_servers_simple.py rename to scripts/testing/mcp/test_mcp_servers_simple.py diff --git a/tests/mcp_servers/test_qdrant_ollama_server.py b/scripts/testing/mcp/test_qdrant_ollama_server.py old mode 100644 new mode 100755 similarity index 100% rename from tests/mcp_servers/test_qdrant_ollama_server.py rename to scripts/testing/mcp/test_qdrant_ollama_server.py diff --git a/tests/mcp_servers/test_qdrant_ollama_tools.sh b/scripts/testing/mcp/test_qdrant_ollama_tools.sh similarity index 100% rename from tests/mcp_servers/test_qdrant_ollama_tools.sh rename to scripts/testing/mcp/test_qdrant_ollama_tools.sh diff --git a/tests/mcp_servers/test_qdrant_ollama_tools_fixed.sh b/scripts/testing/mcp/test_qdrant_ollama_tools_fixed.sh similarity index 100% rename from tests/mcp_servers/test_qdrant_ollama_tools_fixed.sh rename to scripts/testing/mcp/test_qdrant_ollama_tools_fixed.sh diff --git a/tests/mcp_servers/test_search_documents.sh b/scripts/testing/mcp/test_search_documents.sh similarity index 100% rename from tests/mcp_servers/test_search_documents.sh rename to scripts/testing/mcp/test_search_documents.sh diff --git a/tests/run_all_tests.sh b/scripts/testing/run_all_tests.sh similarity index 96% rename from tests/run_all_tests.sh rename to scripts/testing/run_all_tests.sh index a4a95b1..2ad5493 100755 --- a/tests/run_all_tests.sh +++ b/scripts/testing/run_all_tests.sh @@ -12,8 +12,8 @@ YELLOW='\033[1;33m' NC='\033[0m' # No Color # 测试目录 -TEST_DIR="/root/mgmt/tests/mcp_servers" -REPORT_FILE="/root/mgmt/tests/test_results_$(date +%Y%m%d_%H%M%S).md" +TEST_DIR="/root/mgmt/scripts/testing/mcp" +REPORT_FILE="/root/mgmt/scripts/testing/test_results_$(date +%Y%m%d_%H%M%S).md" # 检查测试目录是否存在 if [ ! -d "$TEST_DIR" ]; then diff --git a/run_tests.sh b/scripts/testing/test-runner.sh similarity index 100% rename from run_tests.sh rename to scripts/testing/test-runner.sh diff --git a/scripts/utilities/backup/backup-all.sh b/scripts/utilities/backup/backup-all.sh new file mode 100755 index 0000000..70a7685 --- /dev/null +++ b/scripts/utilities/backup/backup-all.sh @@ -0,0 +1,233 @@ +#!/bin/bash + +# 全量备份脚本 +# 备份所有重要的配置和数据 + +set -euo pipefail + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 配置 +BACKUP_DIR="backups/$(date +%Y%m%d_%H%M%S)" +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../" && pwd)" + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 创建备份目录 +create_backup_dir() { + log_info "创建备份目录: $BACKUP_DIR" + mkdir -p "$BACKUP_DIR" +} + +# 备份配置文件 +backup_configs() { + log_info "备份配置文件..." + + local config_dirs=( + "components" + "infrastructure/configs" + "security" + "deployment/ansible" + "deployment/terraform" + ) + + for dir in "${config_dirs[@]}"; do + if [ -d "$dir" ]; then + log_info "备份 $dir" + cp -r "$dir" "$BACKUP_DIR/" + else + log_warning "目录不存在: $dir" + fi + done +} + +# 备份脚本 +backup_scripts() { + log_info "备份脚本..." + cp -r scripts "$BACKUP_DIR/" +} + +# 备份环境文件 +backup_env_files() { + log_info "备份环境文件..." + + local env_files=( + ".env" + "mcp_shared_config.json" + "hosts_inventory" + "Makefile" + ) + + for file in "${env_files[@]}"; do + if [ -f "$file" ]; then + log_info "备份 $file" + cp "$file" "$BACKUP_DIR/" + else + log_warning "文件不存在: $file" + fi + done +} + +# 备份 Vault 数据(如果运行中) +backup_vault() { + log_info "检查 Vault 状态..." + + if command -v vault &> /dev/null && vault status &> /dev/null; then + log_info "备份 Vault 数据..." + mkdir -p "$BACKUP_DIR/vault" + + # 备份 Vault 策略 + vault policy list > "$BACKUP_DIR/vault/policies.txt" 2>/dev/null || true + + # 备份 Vault 秘密引擎 + vault secrets list -format=json > "$BACKUP_DIR/vault/secrets_engines.json" 2>/dev/null || true + + log_success "Vault 数据备份完成" + else + log_warning "Vault 未运行或不可访问,跳过 Vault 备份" + fi +} + +# 备份 Consul 数据(如果运行中) +backup_consul() { + log_info "检查 Consul 状态..." + + if command -v consul &> /dev/null && consul members &> /dev/null; then + log_info "备份 Consul 数据..." + mkdir -p "$BACKUP_DIR/consul" + + # 备份 Consul KV 存储 + consul kv export > "$BACKUP_DIR/consul/kv_export.json" 2>/dev/null || true + + # 备份 Consul 服务 + consul catalog services -format=json > "$BACKUP_DIR/consul/services.json" 2>/dev/null || true + + log_success "Consul 数据备份完成" + else + log_warning "Consul 未运行或不可访问,跳过 Consul 备份" + fi +} + +# 创建备份清单 +create_manifest() { + log_info "创建备份清单..." + + cat > "$BACKUP_DIR/MANIFEST.md" << EOF +# 备份清单 + +**备份时间**: $(date) +**备份目录**: $BACKUP_DIR +**项目根目录**: $PROJECT_ROOT + +## 备份内容 + +### 配置文件 +- components/ - 组件配置 +- infrastructure/configs/ - 基础设施配置 +- security/ - 安全配置 +- deployment/ - 部署配置 + +### 脚本文件 +- scripts/ - 所有项目脚本 + +### 环境文件 +- .env - 环境变量 +- mcp_shared_config.json - MCP 配置 +- hosts_inventory - 主机清单 +- Makefile - 构建配置 + +### 服务数据 +- vault/ - Vault 数据(如果可用) +- consul/ - Consul 数据(如果可用) + +## 恢复说明 + +1. 解压备份文件到项目目录 +2. 恢复环境变量: \`source .env\` +3. 重新设置脚本权限: \`find scripts/ -name "*.sh" -exec chmod +x {} \\;\` +4. 根据需要恢复服务数据 + +## 备份统计 + +**总文件数**: $(find "$BACKUP_DIR" -type f | wc -l) +**总大小**: $(du -sh "$BACKUP_DIR" | cut -f1) +EOF + + log_success "备份清单创建完成" +} + +# 压缩备份 +compress_backup() { + log_info "压缩备份..." + + local archive_name="backup_$(basename "$BACKUP_DIR").tar.gz" + tar -czf "$archive_name" -C "$(dirname "$BACKUP_DIR")" "$(basename "$BACKUP_DIR")" + + log_success "备份已压缩: $archive_name" + log_info "备份大小: $(du -sh "$archive_name" | cut -f1)" + + # 可选:删除未压缩的备份目录 + read -p "是否删除未压缩的备份目录? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + rm -rf "$BACKUP_DIR" + log_info "未压缩的备份目录已删除" + fi +} + +# 清理旧备份 +cleanup_old_backups() { + log_info "清理旧备份..." + + # 保留最近的5个备份 + local backup_count=$(ls -1 backup_*.tar.gz 2>/dev/null | wc -l) + if [ "$backup_count" -gt 5 ]; then + log_info "发现 $backup_count 个备份,保留最新的5个" + ls -1t backup_*.tar.gz | tail -n +6 | xargs rm -f + log_success "旧备份清理完成" + else + log_info "备份数量未超过限制,无需清理" + fi +} + +# 主函数 +main() { + log_info "开始全量备份..." + + cd "$PROJECT_ROOT" + + create_backup_dir + backup_configs + backup_scripts + backup_env_files + backup_vault + backup_consul + create_manifest + compress_backup + cleanup_old_backups + + log_success "全量备份完成!" +} + +# 执行主函数 +main "$@" \ No newline at end of file diff --git a/deployment/scripts/backup_consul.sh b/scripts/utilities/backup/backup-consul.sh similarity index 100% rename from deployment/scripts/backup_consul.sh rename to scripts/utilities/backup/backup-consul.sh diff --git a/tools/utilities/fix-alpine-cgroups-systemd.sh b/scripts/utilities/helpers/fix-alpine-cgroups-systemd.sh old mode 100644 new mode 100755 similarity index 100% rename from tools/utilities/fix-alpine-cgroups-systemd.sh rename to scripts/utilities/helpers/fix-alpine-cgroups-systemd.sh diff --git a/tools/utilities/fix-alpine-cgroups.sh b/scripts/utilities/helpers/fix-alpine-cgroups.sh old mode 100644 new mode 100755 similarity index 100% rename from tools/utilities/fix-alpine-cgroups.sh rename to scripts/utilities/helpers/fix-alpine-cgroups.sh diff --git a/scripts/utilities/helpers/manage-vault-consul.sh b/scripts/utilities/helpers/manage-vault-consul.sh new file mode 100755 index 0000000..562e22d --- /dev/null +++ b/scripts/utilities/helpers/manage-vault-consul.sh @@ -0,0 +1,196 @@ +#!/bin/bash +# Vault与Consul集成管理脚本 + +# 颜色定义 +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +# 函数定义 +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 显示帮助信息 +show_help() { + echo "用法: $0 [选项]" + echo "选项:" + echo " status 显示Vault和Consul状态" + echo " verify 验证集成状态" + echo " backup 备份Consul中的Vault数据" + echo " restore 从备份恢复Consul中的Vault数据" + echo " monitor 监控Vault和Consul运行状态" + echo " health 检查健康状态" + echo " help 显示此帮助信息" +} + +# 显示Vault和Consul状态 +show_status() { + log_info "Vault状态:" + source /root/mgmt/security/secrets/vault/dev/vault_env.sh + vault status + + echo "" + log_info "Consul成员状态:" + consul members + + echo "" + log_info "Consul中的Vault数据键数量:" + curl -s http://100.117.106.136:8500/v1/kv/vault/?keys | jq length +} + +# 验证集成状态 +verify_integration() { + /root/mgmt/deployment/scripts/verify_vault_consul_integration.sh +} + +# 备份Vault数据(存储在Consul中) +backup_vault_data() { + log_info "开始备份Consul中的Vault数据..." + + BACKUP_DIR="/root/mgmt/security/secrets/vault/backups" + TIMESTAMP=$(date +%Y%m%d_%H%M%S) + BACKUP_FILE="$BACKUP_DIR/vault_consul_backup_$TIMESTAMP.json" + + mkdir -p "$BACKUP_DIR" + + # 获取所有Vault相关的键 + keys=$(curl -s http://100.117.106.136:8500/v1/kv/vault/?recurse | jq -r '.[].Key') + + if [ -n "$keys" ]; then + # 创建备份数据结构 + echo '{"backup_timestamp": "'$(date -Iseconds)'", "vault_data": []}' > "$BACKUP_FILE" + + # 备份每个键的值 + while IFS= read -r key; do + value=$(curl -s http://100.117.106.136:8500/v1/kv/$key | jq -r '.[0].Value' | base64 -d | base64) + jq --arg key "$key" --arg value "$value" '.vault_data += [{"key": $key, "value": $value}]' "$BACKUP_FILE" > "$BACKUP_FILE.tmp" && mv "$BACKUP_FILE.tmp" "$BACKUP_FILE" + done <<< "$keys" + + log_info "✓ Vault数据已备份到: $BACKUP_FILE" + log_warn "注意:这是未加密的备份,请确保安全存储" + else + log_error "✗ 无法获取Consul中的Vault数据" + fi +} + +# 远程管理功能演示 +remote_management_demo() { + echo_section "HashiCorp 产品远程管理能力演示" + + log_info "1. Consul 远程管理演示" + + # 查看 Consul 集群成员 + log_info "查看 Consul 集群成员:" + consul members || log_warn "无法获取集群成员信息" + + # 查看 Consul 数据中心信息 + log_info "查看 Consul 数据中心信息:" + consul info | grep -E "(datacenter|server|client)" || log_warn "无法获取数据中心信息" + + # 在 Consul 中存储和读取键值 + log_info "在 Consul 中存储测试键值:" + echo "测试值" | consul kv put demo/test/value - + log_info "从 Consul 读取测试键值:" + consul kv get demo/test/value || log_warn "无法读取键值" + + log_info "2. Vault 远程管理演示" + + # 检查 Vault 状态 + log_info "检查 Vault 状态:" + vault status || log_warn "无法连接到 Vault 或 Vault 未初始化" + + # 列出 Vault 密钥引擎 + log_info "列出 Vault 密钥引擎:" + vault secrets list || log_warn "无法列出密钥引擎" + + # 在 Vault 中写入和读取密钥 + log_info "在 Vault 中存储测试密钥:" + echo "测试数据" | vault kv put secret/demo/test value=- + log_info "从 Vault 读取测试密钥:" + vault kv get secret/demo/test || log_warn "无法读取密钥" + + # 查看 Vault 集群信息 + log_info "查看 Vault 集群信息:" + vault operator raft list-peers || log_warn "无法列出 Raft 集群节点" + + log_info "远程管理功能演示完成" + log_info "请根据实际环境配置正确的地址和认证凭据" +} + +# 健康检查 +health_check() { + log_info "执行健康检查..." + + # Vault健康检查 + vault_health=$(curl -s http://100.117.106.136:8200/v1/sys/health) + if echo "$vault_health" | grep -q '"initialized":true'; then + log_info "✓ Vault已初始化" + else + log_error "✗ Vault未初始化" + fi + + if echo "$vault_health" | grep -q '"sealed":false'; then + log_info "✓ Vault未密封" + else + log_error "✗ Vault已密封" + fi + + # Consul健康检查 + consul_health=$(curl -s http://100.117.106.136:8500/v1/status/leader) + if [ -n "$consul_health" ] && [ "$consul_health" != "null" ]; then + log_info "✓ Consul集群有领导者" + else + log_error "✗ Consul集群无领导者" + fi + + # 检查Vault数据 + vault_data_check=$(curl -s http://100.117.106.136:8500/v1/kv/vault/core/seal-config 2>/dev/null | jq length 2>/dev/null) + if [ -n "$vault_data_check" ] && [ "$vault_data_check" -gt 0 ]; then + log_info "✓ Vault核心数据存在" + else + log_error "✗ Vault核心数据缺失" + fi + + log_info "健康检查完成" +} + +# 主程序 +case "$1" in + status) + show_status + ;; + verify) + verify_integration + ;; + backup) + backup_vault_data + ;; + monitor) + monitor_status + ;; + health) + health_check + ;; + help|--help|-h) + show_help + ;; + *) + if [ -z "$1" ]; then + show_help + else + log_error "未知选项: $1" + show_help + exit 1 + fi + ;; +esac \ No newline at end of file diff --git a/deployment/scripts/nomad-leader-discovery.sh b/scripts/utilities/helpers/nomad-leader-discovery.sh similarity index 95% rename from deployment/scripts/nomad-leader-discovery.sh rename to scripts/utilities/helpers/nomad-leader-discovery.sh index 0b46576..e2177c7 100755 --- a/deployment/scripts/nomad-leader-discovery.sh +++ b/scripts/utilities/helpers/nomad-leader-discovery.sh @@ -5,13 +5,13 @@ # 默认服务器列表(可根据实际情况修改) SERVERS=( - "100.116.158.95" # bj-semaphore.global - "100.81.26.3" # ash1d.global - "100.103.147.94" # ash2e.global - "100.90.159.68" # ch2.global - "100.86.141.112" # ch3.global - "100.98.209.50" # bj-onecloud1.global - "100.120.225.29" # de.global + "100.116.158.95" # bj-semaphore + "100.81.26.3" # ash1d + "100.103.147.94" # ash2e + "100.90.159.68" # ch2 + "100.86.141.112" # ch3 + "100.98.209.50" # bj-onecloud1 + "100.120.225.29" # de ) # 超时设置(秒) diff --git a/deployment/scripts/show_vault_dev_keys.sh b/scripts/utilities/helpers/show-vault-dev-keys.sh similarity index 100% rename from deployment/scripts/show_vault_dev_keys.sh rename to scripts/utilities/helpers/show-vault-dev-keys.sh diff --git a/scripts/utilities/maintenance/cleanup-global-config.sh b/scripts/utilities/maintenance/cleanup-global-config.sh new file mode 100755 index 0000000..bc18d50 --- /dev/null +++ b/scripts/utilities/maintenance/cleanup-global-config.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +# Nomad Global 配置清理脚本 +# 此脚本用于移除配置文件中的 .global 后缀 + +set -e + +# 颜色输出 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# 日志函数 +log() { + echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1" +} + +warn() { + echo -e "${YELLOW}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1" +} + +error() { + echo -e "${RED}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1" +} + +# 备份文件函数 +backup_file() { + local file=$1 + if [ -f "$file" ]; then + cp "$file" "${file}.backup.$(date +%Y%m%d_%H%M%S)" + log "已备份文件: $file" + fi +} + +# 清理 Ansible 配置文件中的 .global 后缀 +cleanup_ansible_configs() { + log "开始清理 Ansible 配置文件..." + + # 处理 configure-nomad-clients.yml + local client_config="/root/mgmt/deployment/ansible/playbooks/configure-nomad-clients.yml" + if [ -f "$client_config" ]; then + backup_file "$client_config" + sed -i 's/\.global//g' "$client_config" + log "已清理 configure-nomad-clients.yml" + fi + + # 处理 deploy-korean-nodes.yml + local korean_config="/root/mgmt/deployment/ansible/playbooks/deploy-korean-nodes.yml" + if [ -f "$korean_config" ]; then + backup_file "$korean_config" + sed -i 's/\.global//g' "$korean_config" + log "已清理 deploy-korean-nodes.yml" + fi + + # 处理 update_ch2_nomad_name*.yml + for file in /root/mgmt/deployment/ansible/update_ch2_nomad_name*.yml; do + if [ -f "$file" ]; then + backup_file "$file" + sed -i 's/name = "ch2\.global\.global"/name = "ch2"/g' "$file" + sed -i 's/hosts: ch2\.global/hosts: ch2/g' "$file" + log "已清理 $file" + fi + done + + # 处理其他包含 .global 的 Ansible 文件 + find /root/mgmt/deployment/ansible -name "*.yml" -o -name "*.yaml" | while read file; do + if grep -q "\.global" "$file"; then + backup_file "$file" + sed -i 's/\.global//g' "$file" + log "已清理 $file" + fi + done +} + +# 清理 inventory 文件中的 .global 后缀 +cleanup_inventory_files() { + log "开始清理 inventory 文件..." + + # 处理所有 inventory 文件 + find /root/mgmt/deployment/ansible/inventories -name "*.ini" | while read file; do + if grep -q "\.global" "$file"; then + backup_file "$file" + sed -i 's/\.global//g' "$file" + log "已清理 inventory 文件: $file" + fi + done +} + +# 清理脚本文件中的 .global 后缀 +cleanup_script_files() { + log "开始清理脚本文件..." + + # 处理 nomad-leader-discovery.sh + local script_file="/root/mgmt/deployment/scripts/nomad-leader-discovery.sh" + if [ -f "$script_file" ]; then + backup_file "$script_file" + sed -i 's/\.global//g' "$script_file" + log "已清理 nomad-leader-discovery.sh" + fi +} + +# 更新 Nomad 配置模板中的 region 设置 +update_nomad_templates() { + log "开始更新 Nomad 配置模板..." + + # 处理 OpenTofu 模板 + local template_file="/root/mgmt/infrastructure/opentofu/modules/nomad-cluster/templates/nomad-userdata.sh" + if [ -f "$template_file" ]; then + backup_file "$template_file" + sed -i 's/region = "dc1"/region = "dc1"/g' "$template_file" + log "已更新 Nomad 配置模板中的 region 设置" + fi + + # 处理其他可能的模板文件 + find /root/mgmt -name "*.hcl" -o -name "*.sh" | while read file; do + if grep -q 'region = "dc1"' "$file"; then + backup_file "$file" + sed -i 's/region = "dc1"/region = "dc1"/g' "$file" + log "已更新 $file 中的 region 设置" + fi + done +} + +# 验证修改结果 +verify_changes() { + log "验证修改结果..." + + # 检查是否还有 .global 后缀 + local global_count=$(grep -r "\.global" /root/mgmt --include="*.yml" --include="*.yaml" --include="*.ini" --include="*.sh" --include="*.hcl" | grep -v cleanup-global-config.sh | wc -l) + if [ "$global_count" -eq 0 ]; then + log "✅ 所有 .global 后缀已成功移除" + else + warn "仍有一些文件包含 .global 后缀,请手动检查" + grep -r "\.global" /root/mgmt --include="*.yml" --include="*.yaml" --include="*.ini" --include="*.sh" --include="*.hcl" | grep -v cleanup-global-config.sh || true + fi + + # 检查 region 设置 + local region = "dc1"' /root/mgmt --include="*.hcl" --include="*.sh" | grep -v cleanup-global-config.sh | wc -l) + if [ "$region_count" -eq 0 ]; then + log "✅ 所有 region 'global' 设置已更新" + else + warn "仍有一些 region 设置为 'global',请手动检查" + grep -r 'region = "dc1"' /root/mgmt --include="*.hcl" --include="*.sh" | grep -v cleanup-global-config.sh || true + fi +} + +# 主函数 +main() { + log "开始执行 Nomad Global 配置清理..." + + # 创建备份目录 + mkdir -p /root/mgmt/backups/global_cleanup + log "已创建备份目录: /root/mgmt/backups/global_cleanup" + + # 执行清理操作 + cleanup_ansible_configs + cleanup_inventory_files + cleanup_script_files + update_nomad_templates + + # 验证修改结果 + verify_changes + + log "Nomad Global 配置清理完成!" + log "请检查备份文件并重新部署相关配置" +} + +# 执行主函数 +main "$@" \ No newline at end of file diff --git a/test-podman-job.nomad b/test-podman-job.nomad new file mode 100644 index 0000000..a49c5a9 --- /dev/null +++ b/test-podman-job.nomad @@ -0,0 +1,28 @@ +job "test-podman-job" { + datacenters = ["dc1"] + type = "batch" + + constraint { + attribute = "${node.class}" + value = "" + } + + group "test-podman-group" { + count = 1 + + task "test-podman-task" { + driver = "podman" + + config { + image = "alpine:latest" + command = "echo" + args = ["Hello from Podman on Nomad client!"] + } + + resources { + cpu = 100 # MHz + memory = 64 # MB + } + } + } +} \ No newline at end of file