From f892d43b47fe4a9c58e890af2e3bf8e25e912ec8 Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Thu, 27 Jul 2023 14:24:21 +0200 Subject: [PATCH 1/9] Kill slurmd remaining processes on upgrade --- m/common/slurm.nix | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/m/common/slurm.nix b/m/common/slurm.nix index 08de3fd..b02a914 100644 --- a/m/common/slurm.nix +++ b/m/common/slurm.nix @@ -1,6 +1,14 @@ -{ ... }: +{ lib, ... }: { + systemd.services.slurmd.serviceConfig = { + # Kill all processes in the control group on stop/restart. This will kill + # all the jobs running, so ensure that we only upgrade when the nodes are + # not in use. See: + # https://github.com/NixOS/nixpkgs/commit/ae93ed0f0d4e7be0a286d1fca86446318c0c6ffb + # https://bugs.schedmd.com/show_bug.cgi?id=2095#c24 + KillMode = lib.mkForce "control-group"; + }; services.slurm = { client.enable = true; controlMachine = "hut"; -- GitLab From b9001cdf7ddbc3eb0ffbbddb1bf919110593ecf1 Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Thu, 27 Jul 2023 17:19:17 +0200 Subject: [PATCH 2/9] Upgrade flake: nixpkgs, bscpkgs and agenix --- flake.lock | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/flake.lock b/flake.lock index 62f3468..6c0a419 100644 --- a/flake.lock +++ b/flake.lock @@ -3,16 +3,17 @@ "agenix": { "inputs": { "darwin": "darwin", + "home-manager": "home-manager", "nixpkgs": [ "nixpkgs" ] }, "locked": { - "lastModified": 1682101079, - "narHash": "sha256-MdAhtjrLKnk2uiqun1FWABbKpLH090oeqCSiWemtuck=", + "lastModified": 1690228878, + "narHash": "sha256-9Xe7JV0krp4RJC9W9W9WutZVlw6BlHTFMiUP/k48LQY=", "owner": "ryantm", "repo": "agenix", - "rev": "2994d002dcff5353ca1ac48ec584c7f6589fe447", + "rev": "d8c973fd228949736dedf61b7f8cc1ece3236792", "type": "github" }, "original": { @@ -23,11 +24,11 @@ }, "bscpkgs": { "locked": { - "lastModified": 1686927936, - "narHash": 
"sha256-y9/R5OqDRFeq5kKRAsv9gge7vkeF/g1ImlbivpjYP/4=", + "lastModified": 1690380002, + "narHash": "sha256-7T1a46WMG/AfWP7zPVrrnjyqyfuUnjNZCdeeX0KM8WA=", "ref": "refs/heads/master", - "rev": "cbe9af5d042e9d5585fe2acef65a1347c68b2fbd", - "revCount": 834, + "rev": "976cdd5a4d98a4b772d35d9cdcc758bbd4eef1c6", + "revCount": 840, "type": "git", "url": "https://pm.bsc.es/gitlab/rarias/bscpkgs.git" }, @@ -58,13 +59,34 @@ "type": "github" } }, + "home-manager": { + "inputs": { + "nixpkgs": [ + "agenix", + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1682203081, + "narHash": "sha256-kRL4ejWDhi0zph/FpebFYhzqlOBrk0Pl3dzGEKSAlEw=", + "owner": "nix-community", + "repo": "home-manager", + "rev": "32d3e39c491e2f91152c84f8ad8b003420eab0a1", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "home-manager", + "type": "github" + } + }, "nixpkgs": { "locked": { - "lastModified": 1682526928, - "narHash": "sha256-2cKh4O6t1rQ8Ok+v16URynmb0rV7oZPEbXkU0owNLQs=", + "lastModified": 1690272529, + "narHash": "sha256-MakzcKXEdv/I4qJUtq/k/eG+rVmyOZLnYNC2w1mB59Y=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "d6b863fd9b7bb962e6f9fdf292419a775e772891", + "rev": "ef99fa5c5ed624460217c31ac4271cfb5cb2502c", "type": "github" }, "original": { -- GitLab From 14b173f67e5e35f51f9e78fb190dcf0e166e109b Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Thu, 27 Jul 2023 17:22:20 +0200 Subject: [PATCH 3/9] GRUB version no longer needed --- m/common/boot.nix | 1 - 1 file changed, 1 deletion(-) diff --git a/m/common/boot.nix b/m/common/boot.nix index ba917da..8b71901 100644 --- a/m/common/boot.nix +++ b/m/common/boot.nix @@ -3,7 +3,6 @@ { # Use the GRUB 2 boot loader. 
boot.loader.grub.enable = lib.mkForce true; - boot.loader.grub.version = 2; # Enable GRUB2 serial console boot.loader.grub.extraConfig = '' -- GitLab From 55d6c177763c51955d84ae9f144baf9434015d38 Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Fri, 28 Jul 2023 13:48:30 +0200 Subject: [PATCH 4/9] Allow access to devices for node_exporter --- m/hut/monitoring.nix | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/m/hut/monitoring.nix b/m/hut/monitoring.nix index c55d366..7690724 100644 --- a/m/hut/monitoring.nix +++ b/m/hut/monitoring.nix @@ -23,6 +23,10 @@ systemd.services.prometheus-ipmi-exporter.serviceConfig.DynamicUser = lib.mkForce false; systemd.services.prometheus-ipmi-exporter.serviceConfig.PrivateDevices = lib.mkForce false; + # We need access to the devices to monitor the disk space + systemd.services.prometheus-node-exporter.serviceConfig.PrivateDevices = lib.mkForce false; + systemd.services.prometheus-node-exporter.serviceConfig.ProtectHome = lib.mkForce "read-only"; + virtualisation.docker.daemon.settings = { metrics-addr = "127.0.0.1:9323"; }; -- GitLab From c242b65e47f3f56a95cc08e72b70ad77f084f8c3 Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Fri, 28 Jul 2023 14:24:51 +0200 Subject: [PATCH 5/9] Update nixpkgs to fix docker problem --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 6c0a419..956eefe 100644 --- a/flake.lock +++ b/flake.lock @@ -82,11 +82,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1690272529, - "narHash": "sha256-MakzcKXEdv/I4qJUtq/k/eG+rVmyOZLnYNC2w1mB59Y=", + "lastModified": 1690367991, + "narHash": "sha256-2VwOn1l8y6+cu7zjNE8MgeGJNNz1eat1HwHrINeogFA=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "ef99fa5c5ed624460217c31ac4271cfb5cb2502c", + "rev": "c9cf0708f00fbe553319258e48ca89ff9a413703", "type": "github" }, "original": { -- GitLab From bf692e6e4edf9e2628de286ac08c77b0daae739d Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: 
Thu, 17 Aug 2023 12:37:58 +0200 Subject: [PATCH 6/9] Don't set all_proxy --- m/common/net.nix | 3 +++ 1 file changed, 3 insertions(+) diff --git a/m/common/net.nix b/m/common/net.nix index b2c09ca..9d6a28a 100644 --- a/m/common/net.nix +++ b/m/common/net.nix @@ -13,6 +13,9 @@ proxy = { default = "http://localhost:23080/"; noProxy = "127.0.0.1,localhost,internal.domain"; + # Don't set all_proxy as go complains and breaks the gitlab runner, see: + # https://github.com/golang/go/issues/16715 + allProxy = null; }; firewall = { -- GitLab From acf9b71f04a98270374d7730082cb881def2165e Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Fri, 28 Jul 2023 16:19:59 +0200 Subject: [PATCH 7/9] Increase prometheus retention time to one year --- m/hut/monitoring.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/m/hut/monitoring.nix b/m/hut/monitoring.nix index 7690724..d68fe3c 100644 --- a/m/hut/monitoring.nix +++ b/m/hut/monitoring.nix @@ -18,6 +18,7 @@ services.prometheus = { enable = true; port = 9001; + retentionTime = "1y"; }; systemd.services.prometheus-ipmi-exporter.serviceConfig.DynamicUser = lib.mkForce false; -- GitLab From f8fb5fa4ff2d5ca25ca124657a1847f029ced31b Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Thu, 17 Aug 2023 18:55:40 +0200 Subject: [PATCH 8/9] Monitor power from other nodes via LAN --- m/hut/ipmi.yml | 13 ++++++++++ m/hut/monitoring.nix | 62 +++++++++++++++++++++++++++++++++++++++++--- m/hut/targets.yml | 11 ++++++++ 3 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 m/hut/ipmi.yml create mode 100644 m/hut/targets.yml diff --git a/m/hut/ipmi.yml b/m/hut/ipmi.yml new file mode 100644 index 0000000..0d68a53 --- /dev/null +++ b/m/hut/ipmi.yml @@ -0,0 +1,13 @@ +modules: + default: + collectors: + - bmc + - ipmi + - chassis + + lan: + collectors: + - ipmi + - chassis + user: "" + pass: "" diff --git a/m/hut/monitoring.nix b/m/hut/monitoring.nix index d68fe3c..a00fb9b 100644 --- a/m/hut/monitoring.nix +++ 
b/m/hut/monitoring.nix @@ -42,9 +42,13 @@ services.prometheus = { exporters = { - ipmi.enable = true; - ipmi.group = "root"; - ipmi.user = "root"; + ipmi = { + enable = true; + group = "root"; + user = "root"; + configFile = ./ipmi.yml; + #extraFlags = [ "--log.level=debug" ]; + }; node = { enable = true; enabledCollectors = [ "systemd" ]; @@ -66,6 +70,58 @@ ]; }]; } + { + # Scrape the IPMI info of the hosts remotely via LAN + job_name = "ipmi-lan"; + scrape_interval = "1m"; + scrape_timeout = "30s"; + metrics_path = "/ipmi"; + scheme = "http"; + relabel_configs = [ + { + # Takes the address, drops an optional ":80" suffix (lazy first group, as the regex is anchored and a greedy one would swallow the suffix), and sets it in the "target=" URL parameter + source_labels = [ "__address__" ]; + separator = ";"; + regex = "(.*?)(:80)?"; + target_label = "__param_target"; + replacement = "\${1}"; + action = "replace"; + } + { + # Sets the "instance" label with the remote host we are querying + source_labels = [ "__param_target" ]; + separator = ";"; + regex = "(.*)"; + target_label = "instance"; + replacement = "\${1}"; + action = "replace"; + } + { + # Sets the fixed "module=lan" URL param + separator = ";"; + regex = "(.*)"; + target_label = "__param_module"; + replacement = "lan"; + action = "replace"; + } + { + # Sets the target to query as the localhost IPMI exporter + separator = ";"; + regex = ".*"; + target_label = "__address__"; + replacement = "127.0.0.1:9290"; + action = "replace"; + } + ]; + + # Load the list of targets from another file + file_sd_configs = [ + { + files = [ "${./targets.yml}" ]; + refresh_interval = "30s"; + } + ]; + } ]; }; } diff --git a/m/hut/targets.yml b/m/hut/targets.yml new file mode 100644 index 0000000..2cecd66 --- /dev/null +++ b/m/hut/targets.yml @@ -0,0 +1,11 @@ +- targets: + - 10.0.40.101 + - 10.0.40.102 + - 10.0.40.103 + - 10.0.40.104 + - 10.0.40.105 + - 10.0.40.106 + - 10.0.40.107 + - 10.0.40.108 + labels: + job: ipmi-lan -- GitLab From 480c97e95246b153ff4c17a4e594a04de96d8984 Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Tue, 22 Aug
2023 10:28:26 +0200 Subject: [PATCH 9/9] Update flake --- flake.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/flake.lock b/flake.lock index 956eefe..1a3da19 100644 --- a/flake.lock +++ b/flake.lock @@ -24,11 +24,11 @@ }, "bscpkgs": { "locked": { - "lastModified": 1690380002, - "narHash": "sha256-7T1a46WMG/AfWP7zPVrrnjyqyfuUnjNZCdeeX0KM8WA=", + "lastModified": 1690560045, + "narHash": "sha256-39ZP+FIzlWoN3c43hReBYpStg4RLYw/z7TdxCQmOvTM=", "ref": "refs/heads/master", - "rev": "976cdd5a4d98a4b772d35d9cdcc758bbd4eef1c6", - "revCount": 840, + "rev": "b4a20d7c3af854b39682484adfd1c7979319f439", + "revCount": 841, "type": "git", "url": "https://pm.bsc.es/gitlab/rarias/bscpkgs.git" }, @@ -82,11 +82,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1690367991, - "narHash": "sha256-2VwOn1l8y6+cu7zjNE8MgeGJNNz1eat1HwHrINeogFA=", + "lastModified": 1692447944, + "narHash": "sha256-fkJGNjEmTPvqBs215EQU4r9ivecV5Qge5cF/QDLVn3U=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "c9cf0708f00fbe553319258e48ca89ff9a413703", + "rev": "d680ded26da5cf104dd2735a51e88d2d8f487b4d", "type": "github" }, "original": { -- GitLab