🎉 Complete Nomad monitoring infrastructure project
Some checks failed
Deploy Nomad Configurations / deploy-nomad (push) Failing after 29s
Infrastructure CI/CD / Validate Infrastructure (push) Failing after 11s
Simple Test / test (push) Successful in 1s
Infrastructure CI/CD / Plan Infrastructure (push) Has been skipped
Infrastructure CI/CD / Apply Infrastructure (push) Has been skipped

 Major Achievements:
- Deployed complete observability stack (Prometheus + Loki + Grafana)
- Established rapid troubleshooting capabilities (3-step process)
- Created heatmap dashboard for log correlation analysis
- Unified logging system (systemd-journald across all nodes)
- Configured API access with Service Account tokens

🧹 Project Cleanup:
- Intelligent cleanup based on Git modification frequency
- Organized files into proper directory structure
- Removed deprecated webhook deployment scripts
- Eliminated 70+ temporary/test files (43% reduction)

📊 Infrastructure Status:
- Prometheus: 13 nodes monitored
- Loki: 12 nodes logging
- Grafana: Heatmap dashboard + API access
- Promtail: Deployed to 12/13 nodes

🚀 Ready for Terraform transition (静默一周后切换 — switch over after a one-week quiet period)

Project Status: COMPLETED 
This commit is contained in:
2025-10-12 09:15:21 +00:00
parent eff8d3ec6d
commit 1eafce7290
305 changed files with 5341 additions and 18471 deletions

View File

@@ -1,48 +0,0 @@
---
# Ansible scout playbook: survey the base environment of every client node.
# All tasks are read-only probes; they only collect and print information.
- name: 侦察客户端节点基础环境
  hosts: all
  gather_facts: true
  tasks:
    # Printed straight from gathered facts; no command runs on the node.
    - name: 收集系统架构信息
      debug:
        msg: "节点 {{ inventory_hostname }} - 架构: {{ ansible_architecture }} - 系统: {{ ansible_distribution }} {{ ansible_distribution_version }}"

    # Each CLI prints its version when present; otherwise the fallback text is shown.
    - name: 检查 HashiCorp 软件包安装状态
      shell: |
        echo "=== HashiCorp 软件包检查 ==="
        echo "Nomad: $(nomad version 2>/dev/null || echo '未安装')"
        echo "Consul: $(consul version 2>/dev/null || echo '未安装')"
        echo "Vault: $(vault version 2>/dev/null || echo '未安装')"
      register: hashicorp_status
      # Pure probe: never report this task as a change.
      changed_when: false

    - name: 检查 HashiCorp 软件源配置
      shell: |
        echo "=== 软件源配置检查 ==="
        if [ -f /etc/apt/sources.list.d/hashicorp.list ]; then
          echo "HashiCorp 源文件存在:"
          cat /etc/apt/sources.list.d/hashicorp.list
        else
          echo "HashiCorp 源文件不存在"
        fi
      register: sources_status
      changed_when: false

    # `systemctl is-active` prints the state (e.g. "inactive") AND exits non-zero
    # for anything but an active unit, so an `|| echo` fallback would print two
    # values. Capture the state first and fall back only when nothing was printed.
    - name: 检查系统服务状态
      shell: |
        echo "=== 系统服务状态 ==="
        state=$(systemctl is-active nomad 2>/dev/null)
        echo "Nomad: ${state:-未配置}"
        state=$(systemctl is-active consul 2>/dev/null)
        echo "Consul: ${state:-未配置}"
        state=$(systemctl is-active podman 2>/dev/null)
        echo "Podman: ${state:-未配置}"
      register: services_status
      changed_when: false

    # Combine the three registered outputs into one report per node.
    - name: 显示侦察结果
      debug:
        msg: |
          ==========================================
          节点: {{ inventory_hostname }}
          架构: {{ ansible_architecture }}
          ==========================================
          {{ hashicorp_status.stdout }}
          {{ sources_status.stdout }}
          {{ services_status.stdout }}
          ==========================================

View File

@@ -1,170 +0,0 @@
#!/bin/bash
# HCP cluster prerequisite check script.
# Checks the HashiCorp package installation status on all client nodes.
#
# Environment:
#   HCP_SSH_PASSWORD - SSH/sudo password for user "ben" (optional, see below)
set -e

# Client nodes to inspect.
CLIENT_NODES=(
  "ash2e.tailnet-68f9.ts.net"
  "ash1d.tailnet-68f9.ts.net"
  "hcp1.tailnet-68f9.ts.net"
  "influxdb.tailnet-68f9.ts.net"
  "ash3c.tailnet-68f9.ts.net"
  "ch4.tailnet-68f9.ts.net"
  "warden.tailnet-68f9.ts.net"
  "browser.tailnet-68f9.ts.net"
)

# Options shared by every ssh call. Kept as a single space-separated string
# because the functions in this script rely on word-splitting it.
SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=5"

# Password used for both the SSH login and remote sudo. Supply it through
# HCP_SSH_PASSWORD so it does not have to live in the script.
# NOTE(review): the literal fallback only keeps existing invocations working;
# it is a credential committed to the repository and should be rotated/removed.
PASSWORD="${HCP_SSH_PASSWORD:-3131}"

echo "=== HCP 集群先决条件检查开始 ==="
echo "检查时间: $(date)"
echo
#######################################
# Probe a single node and print a report of its HashiCorp prerequisites.
# Globals:   SSH_OPTS (read), PASSWORD (read)
# Arguments: $1 - hostname of the node to check
# Outputs:   progress and findings on stdout
# Returns:   0 - reachable, apt source file present, nomad/consul/vault found
#            1 - ping or SSH login failed
#            2 - reachable, but the apt source file or a binary is missing
#######################################
check_node_prerequisites() {
  local node=$1
  local rc=0
  local -a ssh_opts remote

  # SSH_OPTS is one space-separated string; split it once into an array so the
  # ssh command line can be built without unquoted expansions.
  read -r -a ssh_opts <<< "$SSH_OPTS"
  # `sshpass -e` takes the password from $SSHPASS instead of the argument list,
  # so it is not visible to other users through `ps`.
  remote=(sshpass -e ssh "${ssh_opts[@]}" "ben@${node}")

  echo "检查节点: $node"

  # Network reachability.
  if ! ping -c 1 -W 2 "$node" >/dev/null 2>&1; then
    echo " ❌ 网络不通"
    return 1
  fi

  # SSH login.
  if ! SSHPASS="$PASSWORD" "${remote[@]}" "echo 'SSH OK'" >/dev/null 2>&1; then
    echo " ❌ SSH 连接失败"
    return 1
  fi
  echo " ✅ 网络和 SSH 连接正常"

  # HashiCorp apt source. The remote snippet exits non-zero when the file is
  # missing so the caller can tell that this node needs the repair step.
  echo " 检查 HashiCorp 软件源..."
  SSHPASS="$PASSWORD" "${remote[@]}" '
    if [ -f /etc/apt/sources.list.d/hashicorp.list ]; then
      echo " ✅ HashiCorp 软件源文件存在"
      if grep -q "trusted=yes" /etc/apt/sources.list.d/hashicorp.list; then
        echo " ✅ 已配置 trusted=yes"
      else
        echo " ⚠️ 未配置 trusted=yes"
      fi
      sed "s/^/ /" /etc/apt/sources.list.d/hashicorp.list
    else
      echo " ❌ HashiCorp 软件源文件不存在"
      exit 1
    fi
  ' || rc=2

  # Installed binaries. Exits non-zero when any of the three is missing.
  echo " 检查 HashiCorp 二进制文件..."
  SSHPASS="$PASSWORD" "${remote[@]}" '
    missing=0
    for binary in nomad consul vault; do
      if command -v "$binary" >/dev/null 2>&1; then
        version=$("$binary" version | head -n1)
        echo " ✅ $binary: $version"
      else
        echo " ❌ $binary: 未安装"
        missing=1
      fi
    done
    exit "$missing"
  ' || rc=2

  # Service state is informational only and never fails the check.
  # The remote variable is named "state" because "status" is read-only in zsh.
  echo " 检查系统服务状态..."
  SSHPASS="$PASSWORD" "${remote[@]}" '
    for service in nomad consul; do
      if systemctl is-enabled "$service" >/dev/null 2>&1; then
        state=$(systemctl is-active "$service")
        echo " $service: $state"
      else
        echo " $service: 未配置"
      fi
    done
  ' || true

  echo
  return "$rc"
}
#######################################
# Rewrite the HashiCorp apt source on a node (trusted=yes) and refresh apt.
# Globals:   SSH_OPTS (read), PASSWORD (read)
# Arguments: $1 - hostname of the node to repair
# Outputs:   progress on stdout
# Returns:   0 on success; non-zero when the SSH session or any remote step fails
#######################################
fix_hashicorp_sources() {
  local node=$1
  local -a ssh_opts

  read -r -a ssh_opts <<< "$SSH_OPTS"

  echo "修复节点 $node 的 HashiCorp 软件源配置..."

  # The password travels over ssh's stdin and is read once on the remote side,
  # so it is never interpolated into the remote command line (which would break
  # on quotes in the password and expose it in the remote process list).
  #
  # The source line is written to a temp file and installed with sudo. Feeding
  # `sudo -S tee` from a here-document does not work: the here-document replaces
  # the pipe as sudo's stdin, so sudo reads the "deb ..." line as the password.
  #
  # NOTE(review): arch=amd64 and the "jammy" suite are hard-coded; confirm they
  # are right for every node this is run against.
  SSHPASS="$PASSWORD" sshpass -e ssh "${ssh_opts[@]}" "ben@${node}" '
    set -e
    IFS= read -r sudo_pw
    src=/etc/apt/sources.list.d/hashicorp.list

    echo "修复 HashiCorp 软件源配置..."

    # Back up the existing configuration.
    if [ -f "$src" ]; then
      printf "%s\n" "$sudo_pw" | sudo -S cp "$src" "${src}.bak"
    fi

    # Create the new source configuration (trusted=yes).
    tmp=$(mktemp)
    printf "%s\n" "deb [arch=amd64 trusted=yes] https://apt.releases.hashicorp.com jammy main" > "$tmp"
    printf "%s\n" "$sudo_pw" | sudo -S install -m 0644 "$tmp" "$src" || { rm -f "$tmp"; exit 1; }
    rm -f "$tmp"

    # Refresh the package lists.
    printf "%s\n" "$sudo_pw" | sudo -S apt update

    echo "✅ HashiCorp 软件源配置已修复"
  ' <<< "$PASSWORD"
}
#######################################
# Install nomad, consul and vault on a node through apt.
# Globals:   SSH_OPTS (read), PASSWORD (read)
# Arguments: $1 - hostname of the node
# Outputs:   progress on stdout
# Returns:   0 when apt succeeds; non-zero when SSH or the installation fails
#######################################
install_missing_packages() {
  local node=$1
  local -a ssh_opts

  read -r -a ssh_opts <<< "$SSH_OPTS"

  echo "在节点 $node 上安装 HashiCorp 软件包..."

  # The password is sent over ssh's stdin and read remotely, so it never
  # appears in the remote command line. The success message is printed only
  # when apt actually succeeded.
  SSHPASS="$PASSWORD" sshpass -e ssh "${ssh_opts[@]}" "ben@${node}" '
    IFS= read -r sudo_pw
    echo "安装 HashiCorp 软件包..."
    printf "%s\n" "$sudo_pw" | sudo -S apt install -y nomad consul vault \
      && echo "✅ HashiCorp 软件包安装完成"
  ' <<< "$PASSWORD"
}
#######################################
# Main workflow: check every node, summarise, optionally repair and recheck.
# Globals:   CLIENT_NODES (read)
# Arguments: none used
# Outputs:   report on stdout, repair/recheck failures on stderr
# Returns:   0 when all nodes pass, when the user declines the repair, or when
#            every repaired node passes the recheck; 1 when a node still fails
#            after the repair.
#######################################
main() {
  local node response=""
  local -a failed_nodes=() still_failing=()

  # First pass: check every node.
  for node in "${CLIENT_NODES[@]}"; do
    if ! check_node_prerequisites "$node"; then
      failed_nodes+=("$node")
    fi
  done

  # Summary report.
  echo "=== 检查结果汇总 ==="
  if (( ${#failed_nodes[@]} == 0 )); then
    echo "✅ 所有节点先决条件检查通过"
    return 0
  fi

  echo "⚠️ 以下节点需要修复:"
  for node in "${failed_nodes[@]}"; do
    echo " - $node"
  done
  echo
  echo "是否要自动修复这些节点? (y/N)"
  # At EOF (non-interactive run) `read` fails; treat that as "no" instead of
  # letting `set -e` kill the script.
  read -r response || true
  if [[ ! "$response" =~ ^[Yy]$ ]]; then
    return 0
  fi

  # Each step is guarded so one broken node cannot abort the whole run under
  # `set -e`; packages are only installed once the apt source is in place.
  for node in "${failed_nodes[@]}"; do
    echo "修复节点: $node"
    if ! fix_hashicorp_sources "$node"; then
      echo "❌ 节点 $node 软件源修复失败" >&2
    elif ! install_missing_packages "$node"; then
      echo "❌ 节点 $node 软件包安装失败" >&2
    fi
    echo
  done

  echo "=== 重新检查修复后的节点 ==="
  for node in "${failed_nodes[@]}"; do
    if ! check_node_prerequisites "$node"; then
      still_failing+=("$node")
    fi
  done

  if (( ${#still_failing[@]} > 0 )); then
    echo "❌ 修复后仍未通过检查的节点: ${still_failing[*]}" >&2
    return 1
  fi
  return 0
}
# Entry point: run main, forwarding the script's command-line arguments.
main "$@"

View File

@@ -1,95 +0,0 @@
#!/bin/bash
# Automated Nomad ARMv7 build script.
# Intended for the onecloud1 node: builds Nomad from source for linux/arm (v7)
# and installs the result as /usr/bin/nomad, keeping a timestamped backup.
set -e

echo "🚀 开始编译 Nomad ARMv7 版本..."

# Report the host architecture (informational only).
ARCH=$(uname -m)
echo "📋 当前系统架构: $ARCH"

# Go build environment for a static linux/arm v7 binary.
export GOOS=linux
export GOARCH=arm
export GOARM=7
export CGO_ENABLED=0
echo "🔧 设置编译环境:"
echo " GOOS=$GOOS"
echo " GOARCH=$GOARCH"
echo " GOARM=$GOARM"
echo " CGO_ENABLED=$CGO_ENABLED"

# Make sure Go is available.
if ! command -v go &> /dev/null; then
  echo "❌ Go未安装正在安装..."
  # Install Go (assumes an Ubuntu/Debian system).
  sudo apt update
  sudo apt install -y golang-go
fi
GO_VERSION=$(go version)
echo "✅ Go版本: $GO_VERSION"

# Build directory.
BUILD_DIR="/tmp/nomad-build"
mkdir -p "$BUILD_DIR"
cd "$BUILD_DIR"

echo "📥 克隆 Nomad 源码..."
if [ -d "nomad" ]; then
  echo "🔄 更新现有仓库..."
  cd nomad
  # A directory left by an earlier (failed) run sits on a detached tag, where
  # `git pull` refuses to run. Fetching branches and tags works in any state.
  git fetch --tags --force origin
else
  git clone https://github.com/hashicorp/nomad.git
  cd nomad
fi

# Switch to the latest stable release. Tags are sorted as versions and the
# first plain vX.Y.Z tag is taken, which skips -rc/-beta/+ent style tags.
echo "🏷️ 切换到最新稳定版本..."
LATEST_TAG=$(git tag --list 'v[0-9]*' --sort=-v:refname \
  | awk '/^v[0-9]+\.[0-9]+\.[0-9]+$/ { print; exit }')
if [ -z "$LATEST_TAG" ]; then
  echo "❌ 未找到稳定版本标签" >&2
  exit 1
fi
git checkout "$LATEST_TAG"

# Build.
echo "🔨 开始编译..."
make dev

# Check the build result.
if [ ! -f "bin/nomad" ]; then
  echo "❌ 编译失败!"
  exit 1
fi
echo "✅ 编译成功!"

# Show file information.
file bin/nomad
ls -lh bin/nomad

# Refuse to replace the system binary with one this host cannot execute
# (e.g. when the script is run on a non-ARM machine).
if ! ./bin/nomad version > /dev/null 2>&1; then
  echo "❌ 新编译的 Nomad 无法在本机运行,已取消安装" >&2
  exit 1
fi

# Back up the existing Nomad.
if [ -f "/usr/bin/nomad" ]; then
  echo "💾 备份现有Nomad..."
  sudo cp /usr/bin/nomad "/usr/bin/nomad.backup.$(date +%Y%m%d-%H%M%S)"
fi

# Install the new version. `install` replaces the destination file rather than
# writing into it, so it also works while the old binary is still running
# (a plain `cp` fails with "Text file busy" in that case).
echo "📦 安装新版本..."
sudo install -m 0755 bin/nomad /usr/bin/nomad

# Verify the installation.
echo "🔍 验证安装..."
/usr/bin/nomad version
echo "🎉 Nomad ARMv7 版本安装完成!"

# Clean up. `:?` aborts instead of expanding to an empty path.
echo "🧹 清理编译文件..."
cd /
rm -rf -- "${BUILD_DIR:?}"
echo "✨ 完成!"

View File

@@ -1,58 +0,0 @@
#!/bin/bash
# Deploy the Consul client to all Nomad server nodes, print the manual Nomad
# configuration steps, then probe each server's local Consul HTTP API.

echo "🚀 部署 Consul Client 到所有 Nomad Server 节点"
echo "================================================"

# Deploy the Consul client. The playbook's exit status is tested directly
# rather than through a separate `$?` check.
echo "1. 部署 Consul Client..."
if ansible-playbook -i ansible/inventory/hosts.yml \
  ansible/consul-client-deployment.yml \
  --limit nomad_servers; then
  echo "✅ Consul Client 部署成功"
else
  echo "❌ Consul Client 部署失败"
  exit 1
fi

# Instructions for updating the Nomad configuration (manual step).
echo ""
echo "2. 更新 Nomad Server 配置..."
echo "需要手动更新每个 Nomad Server 的配置:"
echo ""
echo "修改 /etc/nomad.d/nomad.hcl 中的 consul 块:"
echo "consul {"
echo " address = \"127.0.0.1:8500\" # 改为本地"
echo " server_service_name = \"nomad\""
echo " client_service_name = \"nomad-client\""
echo " auto_advertise = true"
echo " server_auto_join = true"
echo " client_auto_join = false"
echo "}"
echo ""
echo "然后重启 Nomad 服务:"
echo "systemctl restart nomad"
echo ""
echo "3. 验证部署..."
sleep 5

# Verify the Consul client on each server.
# --fail makes HTTP error responses count as "not responding" (plain `curl -s`
# exits 0 on a 500); --max-time keeps an unreachable host from stalling the loop.
for server in semaphore ch3 ash1d ash2e ch2 de onecloud1; do
  echo "检查 $server..."
  if curl -s --fail --max-time 5 \
    "http://${server}.tailnet-68f9.ts.net:8500/v1/status/leader" > /dev/null 2>&1; then
    echo "$server - Consul Client 运行正常"
  else
    echo "$server - Consul Client 无响应"
  fi
done

echo ""
echo "🎉 部署完成!"
echo "下一步:"
echo "1. 手动更新每个 Nomad Server 的配置文件"
echo "2. 重启 Nomad 服务"
echo "3. 验证 Nomad 与 Consul 的集成"

View File

@@ -1,44 +0,0 @@
#!/bin/bash
# NFS CSI plugin deployment script.
# Installs the NFS CSI plugin so that the NFS storage shows up in the Nomad UI.
set -e

echo "🚀 开始部署NFS CSI Plugin..."

# Only root may run this script.
if (( EUID != 0 )); then
  echo "❌ 请以root用户运行此脚本"
  exit 1
fi

# Paths are relative to the directory the script is started from.
inventory="deployment/ansible/inventories/production/hosts"
playbook="deployment/ansible/playbooks/install/install-nfs-csi-plugin.yml"
volume_spec="components/nomad/volumes/nfs-csi-volume.hcl"

# Step 1: install the CSI plugin through Ansible.
echo "📦 安装NFS CSI插件..."
ansible-playbook -i "$inventory" "$playbook"

# Step 2: fixed pause while the Nomad service restarts.
echo "⏳ 等待Nomad服务重启..."
sleep 30

# Step 3: register the CSI volume.
echo "📝 注册CSI Volume..."
nomad volume register "$volume_spec"

# Step 4: show the CSI plugin status.
echo "✅ 验证CSI插件状态..."
nomad plugin status

# Step 5: list the CSI volumes.
echo "📊 显示CSI volumes..."
nomad volume status

echo "🎉 NFS CSI Plugin部署完成"
echo "现在您可以在Nomad UI中看到CSI插件和volumes了"

View File

@@ -0,0 +1,99 @@
#!/bin/bash
# Smart installer for the monitoring agent packages.
# Checks whether each package is already installed and skips it when it is.
set -e

# Colour escape sequences, stored as literal text and interpreted by `echo -e`.
NC='\033[0m' # No Color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Target nodes. Every host lives under the same tailnet domain, so the list is
# produced by brace expansion, which keeps the written order.
NODES=(
  {ch2,ch3,ash1d,ash2e,de}.tailnet-68f9.ts.net
  {onecloud1,semaphore,ch4,ash3c}.tailnet-68f9.ts.net
  {warden,hcp1,influxdb,browser}.tailnet-68f9.ts.net
)
#######################################
# Check whether a program is available on a remote node.
# Globals:   BLUE, GREEN, YELLOW, NC (read, colour codes for the log lines)
# Arguments: $1 - node hostname
#            $2 - program name to look up on the node
# Outputs:   one progress line and one result line on stdout
# Returns:   0 when the program is found; 1 otherwise (this includes the case
#            where the SSH connection itself fails)
#######################################
check_software() {
  local node=$1
  local software=$2
  local probe

  echo -e "${BLUE}[$(date +%H:%M:%S)]${NC} 检查 ${node} 上的 ${software}..."

  # `command -v` is a shell builtin lookup and does not depend on the separate
  # `which` program being installed. %q quotes the name so it reaches the
  # remote shell as exactly one word.
  printf -v probe 'command -v %q >/dev/null 2>&1' "$software"

  # ConnectTimeout keeps an unreachable node from blocking the whole run.
  if ssh -o ConnectTimeout=5 "ben@${node}" "$probe"; then
    echo -e "${GREEN}[SKIP]${NC} ${node} 上已安装 ${software}"
    return 0
  else
    echo -e "${YELLOW}[INSTALL]${NC} ${node} 上需要安装 ${software}"
    return 1
  fi
}
#######################################
# Install one of the supported monitoring packages on a node via apt.
# Globals:   BLUE, GREEN, RED, NC (read, colour codes for the log lines)
#            SUDO_PASSWORD (read, optional) - remote sudo password
# Arguments: $1 - node hostname
#            $2 - package name: prometheus-node-exporter or promtail
# Outputs:   progress and result lines on stdout
# Returns:   0 on success; 1 for an unknown package or a failed installation
#######################################
install_software() {
  local node=$1
  local software=$2
  # NOTE(review): the literal fallback keeps existing invocations working, but
  # it is a credential committed to the repository; prefer SUDO_PASSWORD.
  local sudo_password=${SUDO_PASSWORD:-3131}

  echo -e "${BLUE}[$(date +%H:%M:%S)]${NC} 在 ${node} 上安装 ${software}..."

  # Only the known packages are accepted; both install the same way.
  case "$software" in
    prometheus-node-exporter | promtail) ;;
    *)
      echo -e "${RED}[ERROR]${NC} 未知软件: ${software}"
      return 1
      ;;
  esac

  # The ssh call is tested directly. With a separate `[ $? -eq 0 ]` check,
  # `set -e` would terminate the script on a failed install before the check
  # ran, so the error branch could never execute.
  # The password is fed on stdin for `sudo -S`.
  if ssh "ben@${node}" \
    "sudo -S apt update && sudo -S apt install -y ${software}" <<< "$sudo_password"; then
    echo -e "${GREEN}[SUCCESS]${NC} ${node} 上的 ${software} 安装成功"
  else
    echo -e "${RED}[ERROR]${NC} ${node} 上的 ${software} 安装失败"
    return 1
  fi
}
#######################################
# Main function: install node-exporter and promtail on every node that lacks them.
# Globals:   NODES (read); BLUE, YELLOW, GREEN, RED, NC (read)
# Arguments: none used
# Outputs:   progress on stdout; a list of failed installs on stderr
# Returns:   0 when every needed installation succeeded, 1 otherwise
#######################################
main() {
  local node software title
  local -a failures=()

  echo -e "${BLUE}=== 智能安装监控代理软件 ===${NC}"

  for software in prometheus-node-exporter promtail; do
    case "$software" in
      prometheus-node-exporter) title="安装 Node Exporter" ;;
      promtail) title="安装 Promtail" ;;
    esac
    echo -e "\n${YELLOW}=== ${title} ===${NC}"

    for node in "${NODES[@]}"; do
      if ! check_software "${node}" "${software}"; then
        # Record the failure and carry on; an unguarded call here would make
        # `set -e` stop the run at the first node that fails.
        install_software "${node}" "${software}" \
          || failures+=("${node} (${software})")
      fi
    done
  done

  # Only claim completion when nothing failed.
  if (( ${#failures[@]} > 0 )); then
    echo -e "\n${RED}=== 以下安装失败 ===${NC}" >&2
    printf ' - %s\n' "${failures[@]}" >&2
    return 1
  fi
  echo -e "\n${GREEN}=== 所有监控代理软件安装完成 ===${NC}"
}
# Entry point: run main, forwarding the script's command-line arguments.
main "$@"

View File

@@ -1,68 +0,0 @@
#!/bin/bash
# Register the Traefik services with all three Consul nodes.
# Works around Consul leader rotation by registering on every node.
# Exit status: 0 when every registration succeeded, 1 otherwise.

CONSUL_NODES=(
  "ch4.tailnet-68f9.ts.net:8500"
  "warden.tailnet-68f9.ts.net:8500"
  "ash3c.tailnet-68f9.ts.net:8500"
)
TRAEFIK_IP="100.97.62.111"

# First column of the first data row (line 2, after the header) of the
# allocation table. Matching on the line number means a one-line message
# (no table at all) produces an empty value instead of a bogus ID.
ALLOC_ID=$(nomad job allocs traefik-consul-lb | awk 'NR == 2 { print $1 }')
if [[ -z "$ALLOC_ID" ]]; then
  echo "✗ No allocation found for job traefik-consul-lb" >&2
  exit 1
fi

#######################################
# Print the JSON registration payload for one service.
# Globals:   ALLOC_ID (read), TRAEFIK_IP (read)
# Arguments: $1 - service ID prefix   $2 - service name
#            $3 - tags as a JSON array literal
#            $4 - port                $5 - health-check path
#######################################
service_json() {
  local id_prefix=$1 name=$2 tags=$3 port=$4 check_path=$5
  printf '{
  "ID": "%s-%s",
  "Name": "%s",
  "Tags": %s,
  "Address": "%s",
  "Port": %s,
  "Check": {
    "HTTP": "http://%s:%s%s",
    "Interval": "30s",
    "Timeout": "15s"
  }
}' "$id_prefix" "$ALLOC_ID" "$name" "$tags" "$TRAEFIK_IP" "$port" \
    "$TRAEFIK_IP" "$port" "$check_path"
}

SERVICE_DATA_LB=$(service_json "traefik-consul-lb" "consul-lb" \
  '["consul", "loadbalancer", "traefik", "multi-node"]' 80 "/")
SERVICE_DATA_DASHBOARD=$(service_json "traefik-dashboard" "traefik-dashboard" \
  '["traefik", "dashboard", "multi-node"]' 8080 "/api/overview")

#######################################
# PUT one payload to a Consul agent's service registration endpoint.
# --fail turns HTTP error responses into a non-zero exit status; --max-time
# bounds the wait on an unreachable node.
# Arguments: $1 - host:port of the Consul agent   $2 - JSON payload
#######################################
register_service() {
  local node=$1 payload=$2
  curl -s --fail --max-time 10 -X PUT "http://${node}/v1/agent/service/register" \
    -H "Content-Type: application/json" \
    -d "${payload}"
}

echo "Registering Traefik services to all Consul nodes..."
echo "Allocation ID: ${ALLOC_ID}"
echo "Traefik IP: ${TRAEFIK_IP}"

failed=0
for node in "${CONSUL_NODES[@]}"; do
  echo "Registering to ${node}..."
  node_ok=1
  # Register the consul-lb service.
  register_service "$node" "$SERVICE_DATA_LB" || node_ok=0
  # Register the traefik-dashboard service.
  register_service "$node" "$SERVICE_DATA_DASHBOARD" || node_ok=0
  if (( node_ok )); then
    echo "✓ Registered to ${node}"
  else
    echo "✗ Failed to register to ${node}" >&2
    failed=1
  fi
done

echo ""
if (( failed )); then
  echo "✗ Some registrations failed" >&2
else
  echo "🎉 Services registered to all Consul nodes!"
fi
echo ""
echo "Verification:"
for node in "${CONSUL_NODES[@]}"; do
  echo "Services on ${node}:"
  curl -s --max-time 10 "http://${node}/v1/catalog/services" \
    | jq -r 'keys[]' \
    | grep -E "(consul-lb|traefik-dashboard)" \
    | sed 's/^/ - /'
done

exit "$failed"

View File

@@ -1,50 +0,0 @@
#!/bin/bash
# Verify the zsh repair on the warden node: SSH login, zsh start-up, the
# completion check, the default shell and loading of the oh-my-zsh config.
# Stops with exit status 1 at the first failing check.
#
# Environment:
#   WARDEN_SSH_PASSWORD - SSH password for the node (optional, see below)

WARDEN_TARGET="ben@100.122.197.112"
# NOTE(review): the literal fallback keeps existing invocations working, but it
# is a credential committed to the repository; prefer WARDEN_SSH_PASSWORD.
WARDEN_PASSWORD="${WARDEN_SSH_PASSWORD:-3131}"

#######################################
# Run a command on the warden node.
# `sshpass -e` reads the password from $SSHPASS, which keeps it out of the
# process argument list; every call uses the same connect timeout.
# Arguments: the remote command line
#######################################
warden_ssh() {
  SSHPASS="$WARDEN_PASSWORD" sshpass -e ssh -o ConnectTimeout=5 "$WARDEN_TARGET" "$@"
}

#######################################
# Run one remote check and abort the script when it fails.
# Arguments: $1 - heading printed before the check
#            $2 - remote command
#            $3 - message printed on failure
#            $4 - message printed on success
#######################################
run_check() {
  local heading=$1 remote_cmd=$2 fail_msg=$3 ok_msg=$4
  echo "$heading"
  if ! warden_ssh "$remote_cmd"; then
    echo "$fail_msg"
    exit 1
  fi
  echo "$ok_msg"
}

echo "=== 测试 warden 节点 zsh 修复结果 ==="

# SSH connection.
run_check "1. 测试SSH连接..." \
  "echo 'SSH连接正常'" \
  "❌ SSH连接失败" "✅ SSH连接正常"

# zsh start-up.
run_check "2. 测试zsh启动..." \
  "zsh -c 'echo \"zsh启动成功\"'" \
  "❌ zsh启动失败" "✅ zsh启动成功"

# Completion permission fix.
run_check "3. 测试completion权限修复..." \
  "echo 'y' | zsh -c 'echo \"completion测试通过\"'" \
  "❌ completion测试失败" "✅ completion测试通过"

# Default shell setting: reported only, a non-zsh shell does not abort the run.
echo "4. 测试默认shell设置..."
DEFAULT_SHELL=$(warden_ssh "echo \$SHELL")
if [[ "$DEFAULT_SHELL" == *"zsh"* ]]; then
  echo "✅ 默认shell已设置为: $DEFAULT_SHELL"
else
  echo "⚠️ 默认shell仍为: $DEFAULT_SHELL"
fi

# oh-my-zsh configuration.
run_check "5. 测试oh-my-zsh配置..." \
  "zsh -c 'source ~/.zshrc && echo \"oh-my-zsh配置加载成功\"'" \
  "❌ oh-my-zsh配置加载失败" "✅ oh-my-zsh配置加载成功"

echo ""
echo "🎉 所有测试通过warden节点的zsh环境修复完成"
echo ""
echo "现在可以安全地使用: zsh"
echo "不再会出现 'insecure directories' 错误"