Skip to content

Commit

Permalink
[SARC-388] récupérer automatiquement les fichiers slurm.conf depuis les clusters (#142)
Browse files Browse the repository at this point in the history

* [SARC-388] récupérer automatiquement les fichiers slurm.conf depuis les clusters

* Fix tests

This reverts commit 6ccd487.

---------

Co-authored-by: Bruno Carrez <bruno.carrez@mila.quebec>
  • Loading branch information
notoraptor and nurbal authored Jan 17, 2025
1 parent 521efb7 commit 25649c7
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 27 deletions.
70 changes: 53 additions & 17 deletions sarc/cli/acquire/slurmconfig.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
from __future__ import annotations

import io
import logging
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List

from hostlist import expand_hostlist
from simple_parsing import field

from sarc.cache import CachePolicy, with_cache
from sarc.client.gpumetrics import _gpu_billing_collection
from sarc.config import config
from sarc.config import ClusterConfig, config
from sarc.jobs.node_gpu_mapping import _node_gpu_mapping_collection

logger = logging.getLogger(__name__)
Expand All @@ -18,52 +20,86 @@
@dataclass
class AcquireSlurmConfig:
    """CLI command: acquire a cluster's slurm.conf and persist the GPU
    billing and node-to-GPU mappings extracted from it.

    NOTE(review): reconstructed from diff residue — the scraped text contained
    both pre- and post-change lines; only the post-change version is kept.
    """

    # Name of the cluster to acquire the slurm.conf from (e.g. "raisin").
    cluster_name: str = field(alias=["-c"])
    day: str = field(
        alias=["-d"],
        required=False,
        help=(
            "Cluster config file date (format YYYY-MM-DD). "
            "Used for file versioning. Should represent a day when config file has been updated "
            "(e.g. for new GPU billings, node GPUs, etc.). "
            "If not specified, uses current day and downloads config file from cluster."
        ),
    )

    def execute(self) -> int:
        """Parse the cluster's slurm.conf and save GPU billing and
        node-to-GPU mappings. Returns 0 on success, -1 if unsupported."""
        if self.cluster_name == "mila":
            # The `mila` cluster is not handled by this acquisition path yet.
            logger.error("Cluster `mila` not yet supported.")
            return -1

        cluster_config = config().clusters[self.cluster_name]
        parser = SlurmConfigParser(cluster_config, self.day)
        slurm_conf = parser.get_slurm_config()
        # Use parser.day, not self.day: when no -d was given, the parser
        # resolved the day to the current date.
        _gpu_billing_collection().save_gpu_billing(
            self.cluster_name, parser.day, slurm_conf.gpu_to_billing
        )
        _node_gpu_mapping_collection().save_node_gpu_mapping(
            self.cluster_name, parser.day, slurm_conf.node_to_gpu
        )
        return 0


class FileContent:
    """Cache formatter for slurm conf files.

    Treats the cached file as one opaque text blob: no parsing or
    serialization, just whole-content read and write.
    """

    @classmethod
    def load(cls, file) -> str:
        """Return the entire text content of *file*."""
        content = file.read()
        return content

    @classmethod
    def dump(cls, value: str, output_file):
        """Write *value* verbatim to *output_file*."""
        output_file.write(value)


class SlurmConfigParser:
    """Fetch (from the cluster or the local cache) and parse a cluster's
    slurm.conf file.

    NOTE(review): reconstructed from diff residue — the old signature lines
    (`cluster_name: str`) were interleaved with the new ones; only the
    post-change version is kept.
    """

    def __init__(self, cluster: ClusterConfig, day: str | None = None):
        """Initialize the parser.

        cluster: cluster configuration (provides name, SSH access and
            `slurm_conf_host_path`).
        day: optional date string (YYYY-MM-DD) used for cache versioning.
            If omitted, the current date is used and the file may be
            downloaded from the cluster.
        """
        if day is None:
            # No day given: use the current day, and let the cache layer
            # download the slurm conf file from the cluster if missing.
            day = datetime.now().strftime("%Y-%m-%d")
            cache_policy = CachePolicy.use
            logger.info(f"Looking for config file at current date: {day}")
        else:
            # Explicit day: the slurm conf file must already be in cache.
            cache_policy = CachePolicy.always
        self.cluster = cluster
        self.day = day
        self.cache_policy = cache_policy

def get_slurm_config(self) -> SlurmConfig:
return with_cache(
content = with_cache(
self._get_slurm_conf,
subdirectory="slurm_conf",
key=self._cache_key,
formatter=self,
)(cache_policy=CachePolicy.always)
formatter=FileContent,
)(cache_policy=self.cache_policy)
return self.load(io.StringIO(content))

def _get_slurm_conf(self):
raise RuntimeError(
f"Please add cluster slurm.conf file into cache, at location: "
f"{config().cache}/slurm_conf/{self._cache_key()}"
)
def _get_slurm_conf(self) -> str:
cmd = f"cat {self.cluster.slurm_conf_host_path}"
result = self.cluster.ssh.run(cmd, hide=True)
return result.stdout

def _cache_key(self):
return f"slurm.{self.cluster_name}.{self.day}.conf"
return f"slurm.{self.cluster.name}.{self.day}.conf"

def load(self, file) -> SlurmConfig:
"""
Parse cached slurm conf file and return a SlurmConfig object
containing node_to_gpu and gpu_to_billing.
"""

partitions: List[Partition] = []
node_to_gpu = {}

Expand Down
1 change: 1 addition & 0 deletions sarc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ class ClusterConfig(BaseModel):
start_date: str = "2022-04-01"
rgu_start_date: str = None
gpu_to_rgu_billing: Path = None
slurm_conf_host_path: str = "/etc/slurm/slurm.conf"

@validator("timezone")
def _timezone(cls, value):
Expand Down
53 changes: 45 additions & 8 deletions tests/functional/cli/acquire/test_acquire_slurmconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
from hostlist import expand_hostlist

from sarc.cache import CacheException
from sarc.cli.acquire.slurmconfig import SlurmConfigParser
from sarc.cli.acquire.slurmconfig import InconsistentGPUBillingError, SlurmConfigParser
from sarc.client.gpumetrics import GPUBilling, get_cluster_gpu_billings
from sarc.config import MTL, config
from sarc.jobs.node_gpu_mapping import NodeGPUMapping, get_node_to_gpu
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

SLURM_CONF_RAISIN_2020_01_01 = """
NodeName=mynode[1,2,5-20,30,40-43] UselessParam=UselessValue Gres=gpu1
Expand Down Expand Up @@ -79,8 +80,9 @@ def test_acquire_slurmconfig(cli_main, caplog):

_save_slurm_conf("raisin", "2020-01-01", SLURM_CONF_RAISIN_2020_01_01)

with pytest.raises(CacheException):
with pytest.raises(KeyError) as exc_info:
cli_main(["acquire", "slurmconfig", "-c", "unknown_raisin", "-d", "2020-01-01"])
assert str(exc_info.value) == "unknown_raisin"

with pytest.raises(CacheException):
cli_main(["acquire", "slurmconfig", "-c", "raisin", "-d", "1999-01-01"])
Expand Down Expand Up @@ -205,11 +207,10 @@ def test_acuire_slurmconfig_inconsistent_billing(cli_main, caplog):
""",
)

with pytest.raises(CacheException):
with pytest.raises(InconsistentGPUBillingError) as exc_info:
cli_main(["acquire", "slurmconfig", "-c", "raisin", "-d", "2020-01-01"])

assert (
"""InconsistentGPUBillingError:
assert """
GPU billing differs.
GPU name: gpu1
Previous value: 5000.0
Expand All @@ -219,8 +220,8 @@ def test_acuire_slurmconfig_inconsistent_billing(cli_main, caplog):
New value: 6000.0
From line: 5
PartitionName=partition2 Nodes=mynode[2,8-11,42] TRESBillingWeights=x=1,GRES/gpu:gpu1=6000,y=2
"""
in caplog.text
""" == str(
exc_info.value
)


Expand All @@ -239,7 +240,7 @@ def assert_same_node_gpu_mapping(


def _save_slurm_conf(cluster_name: str, day: str, content: str):
scp = SlurmConfigParser(cluster_name, day)
scp = SlurmConfigParser(config().clusters[cluster_name], day)
folder = "slurm_conf"
filename = scp._cache_key()
cache_dir = config().cache
Expand All @@ -248,3 +249,39 @@ def _save_slurm_conf(cluster_name: str, day: str, content: str):
file_path = file_dir / filename
with file_path.open("w") as file:
file.write(content)


@pytest.mark.freeze_time(MOCK_TIME)
def test_download_cluster_config(test_config, remote):
    """Test slurm conf file downloading over the mocked SSH remote.

    Checks default and custom `slurm_conf_host_path` config values, then
    verifies that `get_slurm_config()` downloads the remote file and stores
    it in the local cache under the parser's cache key.
    """

    clusters = test_config.clusters
    # Check default value for "slurm_conf_host_path" (with cluster raisin)
    assert clusters["raisin"].slurm_conf_host_path == "/etc/slurm/slurm.conf"
    # Check custom value for "slurm_conf_host_path" (with cluster patate)
    assert clusters["patate"].slurm_conf_host_path == "/the/path/to/slurm.conf"

    # Use cluster patate for download test. No `day` given, so the parser
    # resolves to the (frozen) current date and may download from the cluster.
    cluster = clusters["patate"]
    scp = SlurmConfigParser(cluster)

    file_dir = test_config.cache / "slurm_conf"
    file_dir.mkdir(parents=True, exist_ok=True)
    file_path = file_dir / scp._cache_key()

    # Slurm conf file should not yet exist
    assert not file_path.exists()

    # Get conf file: mock the remote `cat` command to return known content.
    # (The returned channel object is not needed, so it is not bound.)
    expected_content = SLURM_CONF_RAISIN_2020_01_01
    remote.expect(
        host=cluster.host,
        cmd=f"cat {cluster.slurm_conf_host_path}",
        out=expected_content.encode(),
    )
    scp.get_slurm_config()

    # Now, slurm file should exist in cache with the downloaded content.
    assert file_path.is_file()
    with file_path.open() as file:
        assert file.read() == expected_content
2 changes: 1 addition & 1 deletion tests/functional/jobs/test_func_sacct.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ def test_get_gpu_type_without_prometheus(
def _save_slurm_conf(cluster_name: str, day: str, content: str):
from sarc.cli.acquire.slurmconfig import SlurmConfigParser

scp = SlurmConfigParser(cluster_name, day)
scp = SlurmConfigParser(config().clusters[cluster_name], day)
folder = "slurm_conf"
filename = scp._cache_key()
cache_dir = config().cache
Expand Down
3 changes: 2 additions & 1 deletion tests/sarc-test.json
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://fromage-thanos.calcul.ca",
"prometheus_headers_file": "tests/not-so-secrets/patate_prometheus/headers.json",
"gpu_to_rgu_billing": "tests/not-so-secrets/patate_prometheus/gpu_to_rgu_billing.json"
"gpu_to_rgu_billing": "tests/not-so-secrets/patate_prometheus/gpu_to_rgu_billing.json",
"slurm_conf_host_path": "/the/path/to/slurm.conf"
},
"gerudo": {
"host": "gerudo",
Expand Down

0 comments on commit 25649c7

Please sign in to comment.