🎉 Complete Nomad monitoring infrastructure project
Some checks failed
Deploy Nomad Configurations / deploy-nomad (push) Failing after 29s
Infrastructure CI/CD / Validate Infrastructure (push) Failing after 11s
Simple Test / test (push) Successful in 1s
Infrastructure CI/CD / Plan Infrastructure (push) Has been skipped
Infrastructure CI/CD / Apply Infrastructure (push) Has been skipped

 Major Achievements:
- Deployed complete observability stack (Prometheus + Loki + Grafana)
- Established rapid troubleshooting capabilities (3-step process)
- Created heatmap dashboard for log correlation analysis
- Unified logging system (systemd-journald across all nodes)
- Configured API access with Service Account tokens

🧹 Project Cleanup:
- Intelligent cleanup based on Git modification frequency
- Organized files into proper directory structure
- Removed deprecated webhook deployment scripts
- Eliminated 70+ temporary/test files (43% reduction)

📊 Infrastructure Status:
- Prometheus: 13 nodes monitored
- Loki: 12 nodes logging
- Grafana: Heatmap dashboard + API access
- Promtail: Deployed to 12/13 nodes

🚀 Ready for Terraform transition (cut over after one quiet week)

Project Status: COMPLETED 
2025-10-12 09:15:21 +00:00
parent eff8d3ec6d
commit 1eafce7290
305 changed files with 5341 additions and 18471 deletions

View File

@@ -1,106 +1,80 @@
---
# Ansible playbook: deploy the Consul configuration to all Nomad cluster nodes
- name: Deploy Consul configuration to all nodes
  hosts: nomad_cluster  # every node in the Nomad cluster
  become: yes
  vars:
    consul_version: "1.21.5"
    consul_datacenter: "dc1"
    consul_servers:
      - "100.117.106.136:8300"  # master (Korea)
      - "100.122.197.112:8300"  # warden (Beijing)
      - "100.116.80.94:8300"    # ash3c (US)
    consul_server_ips:
      - "100.117.106.136"  # ch4
      - "100.122.197.112"  # warden
      - "100.116.80.94"    # ash3c
  tasks:
    - name: Create the Consul data directory
      file:
        path: /opt/consul
        state: directory
        owner: consul
        group: consul
        mode: '0755'

    - name: Create the Consul data subdirectory
      file:
        path: /opt/consul/data
        state: directory
        owner: consul
        group: consul
        mode: '0755'

    - name: Create the Consul configuration directory
      file:
        path: /etc/consul.d
        state: directory
        owner: consul
        group: consul
        mode: '0755'

    - name: Determine node type
      set_fact:
        node_type: "{{ 'server' if inventory_hostname in ['ch4', 'ash3c', 'warden'] else 'client' }}"
        ui_enabled: "{{ true if inventory_hostname in ['ch4', 'ash3c', 'warden'] else false }}"
        bind_addr: "{{ hostvars[inventory_hostname]['tailscale_ip'] }}"  # Tailscale IP defined in the inventory

    - name: Render the Consul configuration file
      template:
        src: ../infrastructure/consul/templates/consul.j2
        dest: /etc/consul.d/consul.hcl
        owner: root
        group: root
        mode: '0644'
      vars:
        node_name: "{{ inventory_hostname }}"
        bind_addr: "{{ hostvars[inventory_hostname]['tailscale_ip'] }}"
        node_zone: "{{ node_type }}"
        ui_enabled: "{{ ui_enabled }}"
        consul_servers: "{{ consul_server_ips }}"

    - name: Validate the Consul configuration file
      command: consul validate /etc/consul.d/consul.hcl
      register: consul_validate_result
      failed_when: consul_validate_result.rc != 0

    - name: Restart the Consul service
      systemd:
        name: consul
        state: restarted
        enabled: yes

    - name: Wait for the Consul service to come up
      wait_for:
        port: 8500
        host: "{{ hostvars[inventory_hostname]['tailscale_ip'] }}"
        timeout: 60

    - name: Check the Consul service status
      systemd:
        name: consul
        state: started
      register: consul_status

    - name: Show the service status
      debug:
        msg: "{{ inventory_hostname }} ({{ node_type }}) Consul service state: {{ consul_status.status.ActiveState }}"

View File

@@ -0,0 +1,63 @@
---
- name: Deploy monitoring agent configuration files
hosts: nomad_cluster
become: yes
vars:
ansible_python_interpreter: /usr/bin/python3
tasks:
    - name: Create the promtail configuration directory
file:
path: /etc/promtail
state: directory
mode: '0755'
tags:
- promtail-config
    - name: Create the node-exporter configuration directory
file:
path: /etc/prometheus
state: directory
mode: '0755'
tags:
- node-exporter-config
    - name: Deploy the promtail configuration
copy:
src: /root/mgmt/infrastructure/monitor/configs/promtail/promtail-config.yaml
dest: /etc/promtail/config.yaml
owner: root
group: root
mode: '0644'
backup: yes
tags:
- promtail-config
    - name: Deploy the node-exporter configuration
copy:
src: /root/mgmt/infrastructure/monitor/configs/node-exporter/node-exporter-config.yml
dest: /etc/prometheus/node-exporter-config.yml
owner: prometheus
group: prometheus
mode: '0644'
backup: yes
tags:
- node-exporter-config
    - name: Gather service facts (needed by the restart guards below)
      service_facts:
      tags:
        - promtail-restart
        - node-exporter-restart
    - name: Restart the promtail service
      systemd:
        name: promtail
        state: restarted
        enabled: yes
      when: "'promtail.service' in ansible_facts.services"
      tags:
        - promtail-restart
    - name: Restart the node-exporter service
      systemd:
        name: prometheus-node-exporter
        state: restarted
        enabled: yes
      when: "'prometheus-node-exporter.service' in ansible_facts.services"
      tags:
        - node-exporter-restart
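The promtail config file copied above is not part of this diff. A minimal sketch of what it plausibly contains, assuming logs are read from systemd-journald (as the commit message states) and that Loki is reachable at loki.service.consul:3100 — both the endpoint and the labels here are illustrative assumptions:

server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /opt/promtail/data/positions.yaml   # matches the data directory created by the install playbook

clients:
  - url: http://loki.service.consul:3100/loki/api/v1/push   # assumed Loki push endpoint

scrape_configs:
  - job_name: journal
    journal:
      max_age: 12h
      labels:
        job: systemd-journal
    relabel_configs:
      - source_labels: ['__journal__systemd_unit']
        target_label: unit
      - source_labels: ['__journal__hostname']
        target_label: host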

View File

@@ -0,0 +1,45 @@
---
- name: Deploy the full monitoring stack
hosts: localhost
become: no
vars:
ansible_python_interpreter: /usr/bin/python3
tasks:
    - name: Stop and purge the existing monitoring-stack job
command: nomad job stop -purge monitoring-stack
register: stop_result
failed_when: false
changed_when: stop_result.rc == 0
    - name: Wait for the job to stop completely
pause:
seconds: 5
    - name: Deploy the full monitoring-stack job (Grafana + Prometheus + Loki)
command: nomad job run /root/mgmt/infrastructure/monitor/monitoring-stack.nomad
register: deploy_result
    - name: Show the deployment result
debug:
msg: "{{ deploy_result.stdout_lines }}"
    - name: Wait for the services to start
pause:
seconds: 30
    - name: Check the monitoring-stack job status
command: nomad job status monitoring-stack
register: status_result
    - name: Show the job status
debug:
msg: "{{ status_result.stdout_lines }}"
    - name: Check the monitoring services registered in Consul
command: consul catalog services
register: consul_services
    - name: Show the Consul services
debug:
msg: "{{ consul_services.stdout_lines }}"

View File

@@ -0,0 +1,35 @@
---
- name: Deploy the Prometheus configuration
hosts: influxdb
become: yes
vars:
ansible_python_interpreter: /usr/bin/python3
tasks:
    - name: Back up the existing Prometheus configuration
copy:
src: /etc/prometheus/prometheus.yml
dest: /etc/prometheus/prometheus.yml.backup
remote_src: yes
backup: yes
tags:
- backup-config
    - name: Deploy the new Prometheus configuration
copy:
src: /root/mgmt/infrastructure/monitor/configs/prometheus/prometheus.yml
dest: /etc/prometheus/prometheus.yml
owner: prometheus
group: prometheus
mode: '0644'
backup: yes
tags:
- deploy-config
    - name: Restart the Prometheus service
systemd:
name: prometheus
state: restarted
enabled: yes
tags:
- restart-service
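The prometheus.yml being deployed is likewise not included in the diff. A minimal sketch consistent with the commit summary — node-exporter scraped on every cluster node over Tailscale hostnames — with an abridged, illustrative target list:

global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 'ch2.tailnet-68f9.ts.net:9100'
          - 'ch4.tailnet-68f9.ts.net:9100'
          - 'warden.tailnet-68f9.ts.net:9100'
          # ... remaining nomad_cluster nodes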

View File

@@ -0,0 +1,80 @@
---
# Fix insecure configuration on the Ashburn (US) server nodes
- name: Fix insecure configuration on Ashburn server nodes
hosts: ash1d,ash2e
become: yes
  serial: 1  # one node at a time, to keep the cluster safe
tasks:
    - name: Show the server node currently being processed
      debug:
        msg: "⚠️ Processing critical server node: {{ inventory_hostname }}"
    - name: Check cluster status - make sure enough servers are online
uri:
url: "http://semaphore.tailnet-68f9.ts.net:4646/v1/status/leader"
method: GET
register: leader_check
delegate_to: localhost
    - name: Confirm the cluster has a leader
      fail:
        msg: "Cluster has no leader, aborting"
      when: leader_check.status != 200
    - name: Back up the current configuration
copy:
src: /etc/nomad.d/nomad.hcl
dest: /etc/nomad.d/nomad.hcl.backup.{{ ansible_date_time.epoch }}
backup: yes
    - name: Create the hardened server configuration
template:
src: ../nomad-configs-tofu/server-template-secure.hcl
dest: /etc/nomad.d/nomad.hcl
backup: yes
notify: restart nomad
    - name: Validate the configuration file syntax
command: nomad config validate /etc/nomad.d/nomad.hcl
register: config_validation
    - name: Show the validation result
      debug:
        msg: "{{ inventory_hostname }} config validation: {{ config_validation.stdout }}"
    - name: Restart the Nomad service
systemd:
name: nomad
state: restarted
daemon_reload: yes
    - name: Wait for the service to come up
wait_for:
port: 4646
host: "{{ inventory_hostname }}.tailnet-68f9.ts.net"
delay: 10
timeout: 60
delegate_to: localhost
handlers:
- name: restart nomad
systemd:
name: nomad
state: restarted
daemon_reload: yes
post_tasks:
    - name: Wait for the node to rejoin the cluster
pause:
seconds: 20
    - name: Verify the server rejoined the cluster
uri:
url: "http://semaphore.tailnet-68f9.ts.net:4646/v1/status/peers"
method: GET
register: cluster_peers
delegate_to: localhost
    - name: Show the cluster status
      debug:
        msg: "Cluster peers: {{ cluster_peers.json }}"

View File

@@ -0,0 +1,69 @@
---
- name: Install monitoring agent software in bulk
hosts: nomad_cluster
become: yes
vars:
ansible_python_interpreter: /usr/bin/python3
tasks:
    - name: Add the Grafana APT repository
apt_repository:
repo: "deb [trusted=yes] https://packages.grafana.com/oss/deb stable main"
state: present
filename: grafana
when: ansible_distribution == "Debian" or ansible_distribution == "Ubuntu"
tags:
- grafana-repo
    - name: Update the APT cache
apt:
update_cache: yes
tags:
- update-cache
    - name: Check whether node-exporter is already installed
command: which prometheus-node-exporter
register: node_exporter_check
failed_when: false
changed_when: false
    - name: Install prometheus-node-exporter
apt:
name: prometheus-node-exporter
state: present
update_cache: yes
when: node_exporter_check.rc != 0
register: node_exporter_install
    - name: Show the node-exporter installation result
      debug:
        msg: "{{ inventory_hostname }}: {{ 'already installed' if node_exporter_check.rc == 0 else 'installed' if node_exporter_install.changed else 'installation failed' }}"
    - name: Check whether promtail is already installed
command: which promtail
register: promtail_check
failed_when: false
changed_when: false
    - name: Install promtail
apt:
name: promtail
state: present
update_cache: yes
when: promtail_check.rc != 0
register: promtail_install
    - name: Show the promtail installation result
      debug:
        msg: "{{ inventory_hostname }}: {{ 'already installed' if promtail_check.rc == 0 else 'installed' if promtail_install.changed else 'installation failed' }}"
    - name: Create the promtail data directory
file:
path: /opt/promtail/data
state: directory
owner: promtail
group: nogroup
mode: '0755'
when: promtail_check.rc != 0 or promtail_install.changed
tags:
- promtail-dirs
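A quick follow-up check that node-exporter is actually listening (default port 9100, assuming the stock package configuration) can be appended to the play; promtail is deliberately left out here because it is only restarted once its config is deployed by the separate config playbook:

    - name: Wait for node-exporter to listen on its default port
      wait_for:
        port: 9100
        host: 127.0.0.1
        timeout: 30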

View File

@@ -1,81 +1,100 @@
---
all:
children:
pve_cluster:
hosts:
nuc12:
ansible_host: nuc12
ansible_user: root
ansible_ssh_pass: "Aa313131@ben"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
xgp:
ansible_host: xgp
ansible_user: root
ansible_ssh_pass: "Aa313131@ben"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
pve:
ansible_host: pve
ansible_user: root
ansible_ssh_pass: "Aa313131@ben"
ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
vars:
ansible_python_interpreter: /usr/bin/python3
nomad_cluster:
hosts:
        # Server nodes (7)
        ch2:
          ansible_host: ch2.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
        ch3:
          ansible_host: ch3.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.86.141.112"
        ash1d:
          ansible_host: ash1d.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.81.26.3"
        ash2e:
          ansible_host: ash2e.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.125.147.1"
        de:
          ansible_host: de.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.120.225.29"
        onecloud1:
          ansible_host: onecloud1.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.98.209.50"
        semaphore:
          ansible_host: semaphore.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.116.158.95"
        # Client nodes (6)
        ch4:
          ansible_host: ch4.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.117.106.136"
        ash3c:
          ansible_host: ash3c.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.116.80.94"
        warden:
          ansible_host: warden.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.122.197.112"
        hcp1:
          ansible_host: hcp1.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.97.62.111"
        influxdb:
          ansible_host: influxdb.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.100.7.4"
        browser:
          ansible_host: browser.tailnet-68f9.ts.net
          ansible_user: ben
          ansible_ssh_pass: "3131"
          ansible_become_pass: "3131"
          ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
          tailscale_ip: "100.116.112.45"
      vars:
        ansible_python_interpreter: /usr/bin/python3
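Every nomad_cluster host repeats the same connection settings. A functionally equivalent layout hoists them into group vars once — same values as above, abridged host list, shown only as a sketch:

    nomad_cluster:
      vars:
        ansible_user: ben
        ansible_ssh_pass: "3131"
        ansible_become_pass: "3131"
        ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
        ansible_python_interpreter: /usr/bin/python3
      hosts:
        ch3:
          ansible_host: ch3.tailnet-68f9.ts.net
          tailscale_ip: "100.86.141.112"
        # ... remaining hosts keep only ansible_host and tailscale_ip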

View File

@@ -0,0 +1,71 @@
# Nomad server security configuration - onecloud1 node
datacenter = "dc1"
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
log_level = "INFO"
name = "onecloud1"
# Secure binding - bind only to the Tailscale interface
bind_addr = "onecloud1.tailnet-68f9.ts.net"
addresses {
http = "onecloud1.tailnet-68f9.ts.net"
rpc = "onecloud1.tailnet-68f9.ts.net"
serf = "onecloud1.tailnet-68f9.ts.net"
}
advertise {
http = "onecloud1.tailnet-68f9.ts.net:4646"
rpc = "onecloud1.tailnet-68f9.ts.net:4647"
serf = "onecloud1.tailnet-68f9.ts.net:4648"
}
ports {
http = 4646
rpc = 4647
serf = 4648
}
server {
enabled = true
bootstrap_expect = 7
  # Server discovery configuration
server_join {
retry_join = [
"semaphore.tailnet-68f9.ts.net:4647",
"ash1d.tailnet-68f9.ts.net:4647",
"ash2e.tailnet-68f9.ts.net:4647",
"ch2.tailnet-68f9.ts.net:4647",
"ch3.tailnet-68f9.ts.net:4647",
"onecloud1.tailnet-68f9.ts.net:4647",
"de.tailnet-68f9.ts.net:4647"
]
retry_interval = "15s"
retry_max = 3
}
}
# Secure Consul configuration
consul {
address = "127.0.0.1:8500"
server_service_name = "nomad"
client_service_name = "nomad-client"
auto_advertise = true
server_auto_join = true
client_auto_join = true
}
# Vault configuration (disabled for now)
vault {
enabled = false
}
# Telemetry configuration
telemetry {
collection_interval = "1s"
disable_hostname = false
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}