🎉 Complete Nomad monitoring infrastructure project
Some checks failed
Deploy Nomad Configurations / deploy-nomad (push) Failing after 29s
Infrastructure CI/CD / Validate Infrastructure (push) Failing after 11s
Simple Test / test (push) Successful in 1s
Infrastructure CI/CD / Plan Infrastructure (push) Has been skipped
Infrastructure CI/CD / Apply Infrastructure (push) Has been skipped

Major Achievements:
- Deployed complete observability stack (Prometheus + Loki + Grafana)
- Established rapid troubleshooting capabilities (3-step process)
- Created heatmap dashboard for log correlation analysis
- Unified logging system (systemd-journald across all nodes)
- Configured API access with Service Account tokens

🧹 Project Cleanup:
- Intelligent cleanup based on Git modification frequency
- Organized files into proper directory structure
- Removed deprecated webhook deployment scripts
- Eliminated 70+ temporary/test files (43% reduction)

📊 Infrastructure Status:
- Prometheus: 13 nodes monitored
- Loki: 12 nodes logging
- Grafana: Heatmap dashboard + API access
- Promtail: Deployed to 12/13 nodes

🚀 Ready for Terraform transition (静默一周后切换 — switch over after a one-week quiet period)

Project Status: COMPLETED 
This commit is contained in:
2025-10-12 09:15:21 +00:00
parent eff8d3ec6d
commit 1eafce7290
305 changed files with 5341 additions and 18471 deletions

View File

@@ -0,0 +1,43 @@
# ash1d health check and reboot configuration.
#
# Look up the details of the ash1d compute instance by its OCID, using the
# aliased provider configuration "oci.us_check".
data "oci_core_instance" "ash1d_detail" {
  provider    = oci.us_check
  instance_id = "ocid1.instance.oc1.iad.anuwcljtkbqyulqcr3ekof6jr5mnmja2gl7vfmwf6s4nnsch6t5osfhwhhfq"
}
# List the VNIC attachments of the ash1d instance.
#
# Both arguments are taken from data.oci_core_instance.ash1d_detail instead
# of being repeated as literals:
#   - instance_id reuses the looked-up instance's OCID, so the OCID is
#     maintained in a single place;
#   - compartment_id is the compartment the instance itself reports. VNIC
#     attachments are listed per compartment, so querying the instance's own
#     compartment stays correct even if the instance is not in the tenancy
#     root compartment.
data "oci_core_vnic_attachments" "ash1d_vnics" {
  provider       = oci.us_check
  compartment_id = data.oci_core_instance.ash1d_detail.compartment_id
  instance_id    = data.oci_core_instance.ash1d_detail.id
}
# Publish a health summary for ash1d: identity, lifecycle state, creation
# time, placement, launch mode, boot volume and the number of VNIC
# attachments returned for the instance.
output "ash1d_health_status" {
  description = "ash1d 实例详细健康状态 - 检查是否需要重启"

  value = {
    instance_id    = data.oci_core_instance.ash1d_detail.id
    display_name   = data.oci_core_instance.ash1d_detail.display_name
    state          = data.oci_core_instance.ash1d_detail.state
    time_created   = data.oci_core_instance.ash1d_detail.time_created
    fault_domain   = data.oci_core_instance.ash1d_detail.fault_domain
    launch_mode    = data.oci_core_instance.ash1d_detail.launch_mode
    boot_volume_id = data.oci_core_instance.ash1d_detail.boot_volume_id

    # Network information
    vnics_count = length(data.oci_core_vnic_attachments.ash1d_vnics.vnic_attachments)
  }
}
# Boolean switch intended to control whether ash1d is rebooted; off by default.
# NOTE(review): no block in the visible configuration reads
# var.reboot_ash1d, so setting it currently appears to have no effect.
# Confirm whether it is consumed elsewhere.
variable "reboot_ash1d" {
  type        = bool
  default     = false
  description = "设置为 true 来重启 ash1d"
}
# Print the OCI CLI command that soft-resets ash1d. Terraform does not run
# it; an operator copies the command and executes it manually.
#
# The instance OCID is interpolated from data.oci_core_instance.ash1d_detail
# rather than duplicated as a literal, so the command always targets the
# same instance that the health output describes.
output "ash1d_reboot_command" {
  description = "手动执行此命令来重启 ash1d软重启不会丢失数据"
  value       = "oci compute instance action --instance-id ${data.oci_core_instance.ash1d_detail.id} --action SOFTRESET"
}

81
terraform-oci-us/main.tf Normal file
View File

@@ -0,0 +1,81 @@
# Inspect Oracle Cloud resources in the US region.
#
# Provider requirements: the OCI provider published as "oracle/oci",
# constrained to the 7.x release line.
terraform {
  required_providers {
    oci = {
      version = "~> 7.0"
      source  = "oracle/oci"
    }
  }
}
# US-region provider, configured directly in this file (no Consul
# dependency). It is registered under the alias "us_check", so data sources
# must select it explicitly with `provider = oci.us_check`.
provider "oci" {
  alias  = "us_check"
  region = "us-ashburn-1"

  # API-key credentials: identifiers come from input variables, while the
  # private key is read from a file path relative to the working directory.
  tenancy_ocid     = var.tenancy_ocid
  user_ocid        = var.user_ocid
  fingerprint      = var.fingerprint
  private_key_path = "./oci_api_key.pem"
}
# List the compute instances in the US region. The tenancy OCID is passed
# as the compartment, i.e. the query targets the tenancy's root compartment.
data "oci_core_instances" "us_instances" {
  provider       = oci.us_check
  compartment_id = var.tenancy_ocid
}
# List the block (data) volumes in the US region. The tenancy OCID is passed
# as the compartment, i.e. the query targets the tenancy's root compartment.
data "oci_core_volumes" "us_volumes" {
  provider       = oci.us_check
  compartment_id = var.tenancy_ocid
}
# List the boot volumes in the US region.
# NOTE(review): the query names a single availability domain, so boot
# volumes in any other availability domain of the region would presumably
# not be returned. Confirm this matches the intent of "all boot volumes".
data "oci_core_boot_volumes" "us_boot_volumes" {
  provider            = oci.us_check
  compartment_id      = var.tenancy_ocid
  availability_domain = "TZXJ:US-ASHBURN-AD-1"
}
# Report every listed instance with its id, state, shape, availability
# domain and creation time, keyed by display name.
#
# Display names are not guaranteed to be unique, and an object `for`
# expression fails with "Duplicate object key" when two items produce the
# same key. To avoid that, instances are first grouped by display name
# (the `...` grouping mode) and then flattened again:
#   - a name used by exactly one instance keeps the plain display name as
#     its key, so the output is unchanged in the common case;
#   - a name shared by several instances is disambiguated as
#     "<display_name> (<ocid>)".
output "us_instances_status" {
  description = "美国区域所有实例状态"

  value = merge([
    for name, group in {
      for inst in data.oci_core_instances.us_instances.instances :
      inst.display_name => inst...
    } : {
      for inst in group :
      (length(group) == 1 ? name : "${name} (${inst.id})") => {
        id                  = inst.id
        state               = inst.state
        shape               = inst.shape
        availability_domain = inst.availability_domain
        time_created        = inst.time_created
      }
    }
  ]...)
}
# Report every listed data volume with its id, state, size and creation
# time, keyed by display name. This is key information for the health check.
#
# Display names are not guaranteed to be unique, and an object `for`
# expression fails with "Duplicate object key" when two items produce the
# same key. To avoid that, volumes are first grouped by display name (the
# `...` grouping mode) and then flattened again:
#   - a name used by exactly one volume keeps the plain display name as its
#     key, so the output is unchanged in the common case;
#   - a name shared by several volumes is disambiguated as
#     "<display_name> (<ocid>)".
output "us_volumes_status" {
  description = "美国区域所有数据磁盘状态"

  value = merge([
    for name, group in {
      for vol in data.oci_core_volumes.us_volumes.volumes :
      vol.display_name => vol...
    } : {
      for vol in group :
      (length(group) == 1 ? name : "${name} (${vol.id})") => {
        id           = vol.id
        state        = vol.state
        size_in_gbs  = vol.size_in_gbs
        time_created = vol.time_created
      }
    }
  ]...)
}
# Report every listed boot volume with its id, state, size and creation
# time, keyed by display name. This is the most important output of the
# health check.
#
# Display names are not guaranteed to be unique, and an object `for`
# expression fails with "Duplicate object key" when two items produce the
# same key. To avoid that, boot volumes are first grouped by display name
# (the `...` grouping mode) and then flattened again:
#   - a name used by exactly one boot volume keeps the plain display name
#     as its key, so the output is unchanged in the common case;
#   - a name shared by several boot volumes is disambiguated as
#     "<display_name> (<ocid>)".
output "us_boot_volumes_status" {
  description = "美国区域所有启动磁盘状态"

  value = merge([
    for name, group in {
      for bv in data.oci_core_boot_volumes.us_boot_volumes.boot_volumes :
      bv.display_name => bv...
    } : {
      for bv in group :
      (length(group) == 1 ? name : "${name} (${bv.id})") => {
        id           = bv.id
        state        = bv.state
        size_in_gbs  = bv.size_in_gbs
        time_created = bv.time_created
      }
    }
  ]...)
}

View File

@@ -0,0 +1,6 @@
[DEFAULT]
user=ocid1.user.oc1..aaaaaaaappc7zxue4dlrsjljg4fwl6wcc5smetreuvpqn72heiyvjeeqanqq
fingerprint=73:80:50:35:b6:1d:e3:fc:68:f8:e3:e8:0b:df:79:e3
tenancy=ocid1.tenancy.oc1..aaaaaaaayyhuf6swf2ho4s5acdpee6zssst6j7nkiri4kyfdusxzn3e7p32q
region=us-ashburn-1
key_file=./oci_api_key.pem

View File

@@ -0,0 +1,14 @@
# OCID of the Oracle Cloud tenancy. Required: no default is provided.
variable "tenancy_ocid" {
  type        = string
  description = "Oracle Cloud 租户 OCID"
}
# OCID of the Oracle Cloud user. Required: no default is provided.
variable "user_ocid" {
  type        = string
  description = "Oracle Cloud 用户 OCID"
}
# Fingerprint of the API signing key. Required: no default is provided.
variable "fingerprint" {
  type        = string
  description = "API 密钥指纹"
}