Review notes (free text ahead of the first "diff --git" header; git apply and
patch(1) skip leading text like this).

This patch was rebuilt from a whitespace-collapsed copy, so indentation and
blank-line whitespace are reconstructed. Check the context/removed lines
against the working tree before applying.

Changes made on top of the original patch:

* configuration/playbooks/fix-nomad-server-config.yml
  The Nomad gossip encryption key was committed in plain text. It is now read
  from the variable nomad_gossip_key (Ansible Vault or --extra-vars), the play
  fails early when the variable is missing, and the task that renders the key
  runs with no_log. The previously committed key is exposed in history and
  has to be rotated.

* consul-cluster.nomad
  - "-bind" used {{ env `NOMAD_IP_server` }}, which is only rendered inside
    template blocks; task args need ${NOMAD_IP_server}.
  - Ports were declared in the deprecated task-level resources.network block
    while the podman driver's "ports" list refers to group-level port labels.
    They now live in a group-level network block in host mode.
  - resources keeps the existing cpu/memory reservation instead of becoming
    an empty block that falls back to scheduler defaults.
  - The image is fully qualified and pinned again (docker.io/...:1.17, the
    tag the job used before) instead of an unqualified ":latest".

* configuration/playbooks/get-tailscale-ips.yml,
  configuration/playbooks/manual-run-nomad-germany.yml
  Read-only probe tasks now set "changed_when: false", matching the other
  playbooks in this patch.

* "index" header lines were dropped for the four files edited above because
  their post-image blob ids are no longer the ones recorded originally.

diff --git a/1 b/1
deleted file mode 100644
index 7295ccf..0000000
--- a/1
+++ /dev/null
@@ -1,17 +0,0 @@
-===> 连接到 Nomad Leader: http://100.81.26.3:4646
-\n--- 当前节点列表 (Before) ---
-ID Node Pool DC Name Class Drain Eligibility Status
-ec4bf738 default dc1 pdns false eligible ready
-583f1b77 default dc1 semaphore false eligible down
-cd121e59 default dc1 influxdb false eligible ready
-3edfa5bc default dc1 ash3c false eligible ready
-300c11e7 default dc1 hcp1 false eligible ready
-5e218d15 default dc1 master false eligible ready
-06bb8a3a default dc1 hcs false eligible ready
-baea7bb6 default dc1 hcp2 false eligible ready
-d2e4ceee default dc1 ch3 false ineligible down
-3521e4a1 default dc1 ch2 false eligible down
-e6c0cdbf default dc1 ash1d false eligible down
-645fbd8b default dc1 ash2e false eligible down
-84913d2f default dc1 semaphore false eligible down
-a3d0b0e3 default dc1 Syd false eligible ready
diff --git a/configuration/inventories/production/nomad-cluster.ini b/configuration/inventories/production/nomad-cluster.ini
new file mode 100644
index 0000000..567aeb7
--- /dev/null
+++ b/configuration/inventories/production/nomad-cluster.ini
@@ -0,0 +1,12 @@
+[consul_servers:children]
+nomad_servers
+
+[consul_servers:vars]
+consul_cert_dir=/etc/consul.d/certs
+consul_ca_src=security/certificates/ca.pem
+consul_cert_src=security/certificates/consul-server.pem
+consul_key_src=security/certificates/consul-server-key.pem
+
+[nomad_cluster:children]
+nomad_servers
+nomad_clients
\ No newline at end of file
diff --git a/configuration/playbooks/check-security-logs.yml b/configuration/playbooks/check-security-logs.yml
new file mode 100644
index 0000000..d746b2d
--- /dev/null
+++ b/configuration/playbooks/check-security-logs.yml
@@ -0,0 +1,14 @@
+---
+- name: Check for AppArmor or SELinux denials
+  hosts: germany
+  become: yes
+  tasks:
+    - name: Search journalctl for AppArmor/SELinux messages
+      shell: 'journalctl -k | grep -i -e apparmor -e selinux -e "avc: denied"'
+      register: security_logs
+      changed_when: false
+      failed_when: false
+
+    - name: Display security logs
+      debug:
+        var: security_logs.stdout_lines
\ No newline at end of file
diff --git a/configuration/playbooks/configure-nomad-tailscale.yml b/configuration/playbooks/configure-nomad-tailscale.yml
index 3e010f1..624765e 100644
--- a/configuration/playbooks/configure-nomad-tailscale.yml
+++ b/configuration/playbooks/configure-nomad-tailscale.yml
@@ -116,6 +116,7 @@
           client {
             enabled = true
             network_interface = "tailscale0"
+            cpu_total_compute = 0
 
             servers = [
               "100.116.158.95:4647", # semaphore
@@ -162,7 +163,7 @@
           Type=notify
           User=root
           Group=root
-          ExecStart=/snap/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
+          ExecStart={{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl
           ExecReload=/bin/kill -HUP $MAINPID
           KillMode=process
           Restart=on-failure
diff --git a/configuration/playbooks/debug-cgroup-permissions.yml b/configuration/playbooks/debug-cgroup-permissions.yml
new file mode 100644
index 0000000..df51dbf
--- /dev/null
+++ b/configuration/playbooks/debug-cgroup-permissions.yml
@@ -0,0 +1,33 @@
+---
+- name: Debug cgroup permissions
+  hosts: germany
+  become: yes
+  tasks:
+    - name: Check permissions of /sys/fs/cgroup/cpuset/
+      stat:
+        path: /sys/fs/cgroup/cpuset/
+      register: cpuset_dir
+
+    - name: Display cpuset dir stats
+      debug:
+        var: cpuset_dir.stat
+
+    - name: Check for nomad subdir in cpuset
+      stat:
+        path: /sys/fs/cgroup/cpuset/nomad
+      register: nomad_cpuset_dir
+      ignore_errors: true
+
+    - name: Display nomad cpuset dir stats
+      debug:
+        var: nomad_cpuset_dir.stat
+      when: nomad_cpuset_dir.stat.exists is defined and nomad_cpuset_dir.stat.exists
+
+    - name: List contents of /sys/fs/cgroup/cpuset/
+      command: ls -la /sys/fs/cgroup/cpuset/
+      register: ls_cpuset
+      changed_when: false
+
+    - name: Display contents of /sys/fs/cgroup/cpuset/
+      debug:
+        var: ls_cpuset.stdout_lines
\ No newline at end of file
diff --git a/configuration/playbooks/debug-nomad-cgroup.yml b/configuration/playbooks/debug-nomad-cgroup.yml
new file mode 100644
index 0000000..4524ca8
--- /dev/null
+++ b/configuration/playbooks/debug-nomad-cgroup.yml
@@ -0,0 +1,14 @@
+---
+- name: Debug Nomad cgroup subdirectory
+  hosts: germany
+  become: yes
+  tasks:
+    - name: List contents of /sys/fs/cgroup/cpuset/nomad/
+      command: ls -la /sys/fs/cgroup/cpuset/nomad/
+      register: ls_nomad_cpuset
+      changed_when: false
+      failed_when: false
+
+    - name: Display contents of /sys/fs/cgroup/cpuset/nomad/
+      debug:
+        var: ls_nomad_cpuset.stdout_lines
\ No newline at end of file
diff --git a/configuration/playbooks/debug-nomad-nodes.yml b/configuration/playbooks/debug-nomad-nodes.yml
new file mode 100644
index 0000000..abd0b0f
--- /dev/null
+++ b/configuration/playbooks/debug-nomad-nodes.yml
@@ -0,0 +1,30 @@
+---
+- name: Gather Nomad debug information from multiple nodes
+  hosts: all
+  become: yes
+  tasks:
+    - name: Get Nomad service status
+      shell: systemctl status nomad --no-pager -l
+      register: nomad_status
+      changed_when: false
+      failed_when: false
+
+    - name: Get last 50 lines of Nomad journal logs
+      shell: journalctl -u nomad -n 50 --no-pager
+      register: nomad_journal
+      changed_when: false
+      failed_when: false
+
+    - name: Display Nomad Status
+      debug:
+        msg: |
+          --- Nomad Status for {{ inventory_hostname }} ---
+          {{ nomad_status.stdout }}
+          {{ nomad_status.stderr }}
+
+    - name: Display Nomad Journal
+      debug:
+        msg: |
+          --- Nomad Journal for {{ inventory_hostname }} ---
+          {{ nomad_journal.stdout }}
+          {{ nomad_journal.stderr }}
\ No newline at end of file
diff --git a/configuration/playbooks/find-nomad-service.yml b/configuration/playbooks/find-nomad-service.yml
new file mode 100644
index 0000000..4cebaed
--- /dev/null
+++ b/configuration/playbooks/find-nomad-service.yml
@@ -0,0 +1,14 @@
+---
+- name: Find Nomad service
+  hosts: germany
+  become: yes
+  tasks:
+    - name: List systemd services and filter for nomad
+      shell: systemctl list-unit-files --type=service | grep -i nomad
+      register: nomad_services
+      changed_when: false
+      failed_when: false
+
+    - name: Display found services
+      debug:
+        var: nomad_services.stdout_lines
\ No newline at end of file
diff --git a/configuration/playbooks/fix-cgroup-permissions.yml b/configuration/playbooks/fix-cgroup-permissions.yml
new file mode 100644
index 0000000..717e133
--- /dev/null
+++ b/configuration/playbooks/fix-cgroup-permissions.yml
@@ -0,0 +1,19 @@
+---
+- name: Fix cgroup permissions for Nomad
+  hosts: germany
+  become: yes
+  tasks:
+    - name: Recursively change ownership of nomad cgroup directory
+      file:
+        path: /sys/fs/cgroup/cpuset/nomad
+        state: directory
+        owner: root
+        group: root
+        recurse: yes
+
+    - name: Change ownership of the parent cpuset directory
+      file:
+        path: /sys/fs/cgroup/cpuset/
+        state: directory
+        owner: root
+        group: root
\ No newline at end of file
diff --git a/configuration/playbooks/fix-nomad-server-config.yml b/configuration/playbooks/fix-nomad-server-config.yml
new file mode 100644
--- /dev/null
+++ b/configuration/playbooks/fix-nomad-server-config.yml
@@ -0,0 +1,57 @@
+---
+- name: Fix Nomad server configuration
+  hosts: localhost
+  gather_facts: no
+  become: yes
+  tasks:
+    # The gossip key is a cluster secret. Supply nomad_gossip_key through
+    # Ansible Vault or --extra-vars; a key that was ever committed to the
+    # repository is exposed in history and has to be rotated.
+    - name: Require the Nomad gossip encryption key
+      assert:
+        that:
+          - nomad_gossip_key is defined
+          - nomad_gossip_key | length > 0
+        fail_msg: "nomad_gossip_key is not set (use Ansible Vault or --extra-vars)"
+
+    # no_log keeps the rendered key out of task output and --diff.
+    - name: Create corrected nomad.hcl
+      no_log: true
+      copy:
+        dest: /etc/nomad.d/nomad.hcl
+        content: |
+          datacenter = "dc1"
+          data_dir = "/opt/nomad/data"
+          log_level = "INFO"
+
+          bind_addr = "100.116.158.95"
+
+          server {
+            enabled = true
+            bootstrap_expect = 5
+            encrypt = "{{ nomad_gossip_key }}"
+            retry_join = [
+              "100.116.158.95", # semaphore
+              "100.81.26.3", # ash1d
+              "100.103.147.94", # ash2e
+              "100.90.159.68", # ch2
+              "100.86.141.112" # ch3
+            ]
+          }
+
+          client {
+            enabled = false
+          }
+
+          plugin "podman" {
+            config {
+              socket_path = "unix:///run/podman/podman.sock"
+              volumes {
+                enabled = true
+              }
+            }
+          }
+
+          consul {
+            address = "100.116.158.95:8500"
+          }
\ No newline at end of file
diff --git a/configuration/playbooks/get-tailscale-ips.yml b/configuration/playbooks/get-tailscale-ips.yml
new file mode 100644
--- /dev/null
+++ b/configuration/playbooks/get-tailscale-ips.yml
@@ -0,0 +1,13 @@
+---
+- name: Get Tailscale IP for specified nodes
+  hosts: all
+  gather_facts: no
+  tasks:
+    - name: Get tailscale IP
+      shell: "tailscale ip -4"
+      register: tailscale_ip
+      changed_when: false
+
+    - name: Display Tailscale IP
+      debug:
+        msg: "Node {{ inventory_hostname }} has IP: {{ tailscale_ip.stdout }}"
\ No newline at end of file
diff --git a/configuration/playbooks/install-nomad-direct-download.yml b/configuration/playbooks/install-nomad-direct-download.yml
index 50b2783..9158098 100644
--- a/configuration/playbooks/install-nomad-direct-download.yml
+++ b/configuration/playbooks/install-nomad-direct-download.yml
@@ -1,10 +1,8 @@
 ---
 - name: Install Nomad by direct download from HashiCorp
-  hosts: hcs
+  hosts: all
   become: yes
   vars:
-    nomad_version: "1.10.5"
-    nomad_url: "https://releases.hashicorp.com/nomad/{{ nomad_version }}/nomad_{{ nomad_version }}_linux_amd64.zip"
     nomad_user: "nomad"
     nomad_group: "nomad"
     nomad_home: "/opt/nomad"
diff --git a/configuration/playbooks/manual-run-nomad-germany.yml b/configuration/playbooks/manual-run-nomad-germany.yml
--- a/configuration/playbooks/manual-run-nomad-germany.yml
+++ b/configuration/playbooks/manual-run-nomad-germany.yml
@@ -1,17 +1,23 @@
-- name: Manually run Nomad agent to capture output
+---
+- name: Manually run Nomad agent for debugging
   hosts: germany
-  gather_facts: false
+  become: yes
   tasks:
-    - name: Run nomad agent directly
-      command: /snap/bin/nomad agent -config=/etc/nomad.d/nomad.hcl
-      register: nomad_agent_output
-      ignore_errors: true
+    - name: Find Nomad binary path
+      shell: which nomad || find /usr -name nomad 2>/dev/null | head -1
+      register: nomad_binary_path
+      changed_when: false
+      failed_when: nomad_binary_path.stdout == ""
 
-    - name: Display agent output
+    - name: Run nomad agent directly
+      command: "{{ nomad_binary_path.stdout }} agent -config=/etc/nomad.d/nomad.hcl"
+      register: nomad_run
+      failed_when: false
+
+    - name: Display Nomad output
       debug:
-        msg: |
-          --- Nomad Agent STDOUT ---
-          {{ nomad_agent_output.stdout }}
-
-          --- Nomad Agent STDERR ---
-          {{ nomad_agent_output.stderr }}
\ No newline at end of file
+        var: nomad_run.stdout
+
+    - name: Display Nomad error output
+      debug:
+        var: nomad_run.stderr
\ No newline at end of file
diff --git a/configuration/playbooks/ping-nodes.yml b/configuration/playbooks/ping-nodes.yml
new file mode 100644
index 0000000..8efc80e
--- /dev/null
+++ b/configuration/playbooks/ping-nodes.yml
@@ -0,0 +1,7 @@
+---
+- name: Ping nodes to check connectivity
+  hosts: all
+  gather_facts: no
+  tasks:
+    - name: Ping the host
+      ping:
\ No newline at end of file
diff --git a/configuration/playbooks/read-nomad-config.yml b/configuration/playbooks/read-nomad-config.yml
new file mode 100644
index 0000000..18d15cf
--- /dev/null
+++ b/configuration/playbooks/read-nomad-config.yml
@@ -0,0 +1,13 @@
+---
+- name: Read Nomad config file
+  hosts: localhost
+  gather_facts: no
+  tasks:
+    - name: Read nomad.hcl
+      slurp:
+        src: /etc/nomad.d/nomad.hcl
+      register: nomad_config
+
+    - name: Display Nomad config
+      debug:
+        msg: "{{ nomad_config['content'] | b64decode }}"
\ No newline at end of file
diff --git a/configuration/playbooks/update-nomad-config.yml b/configuration/playbooks/update-nomad-config.yml
new file mode 100644
index 0000000..6d3437a
--- /dev/null
+++ b/configuration/playbooks/update-nomad-config.yml
@@ -0,0 +1,37 @@
+---
+- name: Update Nomad config to run as a client
+  hosts: localhost
+  gather_facts: no
+  become: yes
+  tasks:
+    - name: Create new nomad.hcl
+      copy:
+        dest: /etc/nomad.d/nomad.hcl
+        content: |
+          datacenter = "dc1"
+          data_dir = "/opt/nomad/data"
+          log_level = "INFO"
+
+          bind_addr = "100.116.158.95"
+
+          server {
+            enabled = false
+          }
+
+          client {
+            enabled = true
+            servers = ["100.81.26.3:4647", "100.103.147.94:4647", "100.90.159.68:4647"]
+          }
+
+          plugin "podman" {
+            config {
+              socket_path = "unix:///run/podman/podman.sock"
+              volumes {
+                enabled = true
+              }
+            }
+          }
+
+          consul {
+            address = "100.116.158.95:8500"
+          }
\ No newline at end of file
diff --git a/consul-cluster.nomad b/consul-cluster.nomad
--- a/consul-cluster.nomad
+++ b/consul-cluster.nomad
@@ -1,118 +1,65 @@
 job "consul-cluster" {
   datacenters = ["dc1"]
-  type = "service"
-
-  # 确保在指定的节点上运行
-  constraint {
-    attribute = "${node.unique.name}"
-    operator = "regexp"
-    value = "(hcs|master|ash3c)"
-  }
+  type        = "service"
 
   group "consul-servers" {
     count = 3
-
-    # 每个节点只运行一个 Consul 实例
+
     constraint {
-      operator = "distinct_hosts"
-      value = "true"
+      attribute = "${node.unique.name}"
+      operator  = "regexp"
+      value     = "(master|ash3c|hcp)"
     }
-
-    # 网络配置
+
+    # Ports are declared on the group: the driver's `ports` list below refers
+    # to group-level port labels, and resources.network is deprecated.
     network {
       mode = "host"
-      port "http" {
-        static = 8500
-      }
-      port "rpc" {
-        static = 8300
-      }
-      port "serf_lan" {
-        static = 8301
-      }
-      port "serf_wan" {
-        static = 8302
-      }
-      port "grpc" {
-        static = 8502
-      }
+      port "server" { static = 8300 }
+      port "serf_lan" { static = 8301 }
+      port "serf_wan" { static = 8302 }
+      port "ui" { static = 8500 }
     }
-
-    # 持久化存储
-    volume "consul-data" {
-      type = "host"
-      read_only = false
-      source = "consul-data"
-    }
 
     task "consul" {
       driver = "podman"
 
+      config {
+        # Fully qualified and pinned so every server runs the same Consul build.
+        image = "docker.io/hashicorp/consul:1.17"
+        ports = ["server", "serf_lan", "serf_wan", "ui"]
+        # Task args take ${...} runtime interpolation; the {{ env ... }} form
+        # is only rendered inside template blocks.
+        args = [
+          "agent",
+          "-server",
+          "-bootstrap-expect=3",
+          "-data-dir=/consul/data",
+          "-ui",
+          "-client=0.0.0.0",
+          "-bind=${NOMAD_IP_server}",
+          "-retry-join=100.117.106.136",
+          "-retry-join=100.116.80.94",
+          "-retry-join=100.76.13.187"
+        ]
+      }
+
       volume_mount {
         volume = "consul-data"
         destination = "/consul/data"
         read_only = false
       }
 
-      config {
-        image = "docker.io/hashicorp/consul:1.17"
-        ports = ["http", "rpc", "serf_lan", "serf_wan", "grpc"]
-
-        args = [
-          "agent",
-          "-server",
-          "-bootstrap-expect=3",
-          "-datacenter=dc1",
-          "-data-dir=/consul/data",
-          "-log-level=INFO",
-          "-node=${node.unique.name}",
-          "-bind=${NOMAD_IP_serf_lan}",
-          "-client=0.0.0.0",
-          "-retry-join=100.84.197.26",
-          "-retry-join=100.117.106.136",
-          "-retry-join=100.116.80.94",
-          "-ui-config-enabled=true",
-          "-connect-enabled=true"
-        ]
-      }
-
-      # 环境变量
-      env {
-        CONSUL_ALLOW_PRIVILEGED_PORTS = "true"
-      }
-
-      # 资源配置
       resources {
         cpu = 500
         memory = 512
       }
-
-      # 健康检查
-      service {
-        name = "consul"
-        port = "http"
-
-        tags = [
-          "consul",
-          "server",
-          "${node.unique.name}"
-        ]
-
-        check {
-          type = "http"
-          path = "/v1/status/leader"
-          interval = "10s"
-          timeout = "3s"
-        }
-      }
-
-      # 重启策略
-      restart {
-        attempts = 3
-        interval = "30m"
-        delay = "15s"
-        mode = "fail"
-      }
     }
+
+    volume "consul-data" {
+      type = "host"
+      read_only = false
+      source = "consul-data"
+    }
   }
 }
\ No newline at end of file