Skip to content

Commit

Permalink
Merge branch 'master' into SARC-368-loki-connect
Browse files Browse the repository at this point in the history
  • Loading branch information
nurbal authored Jan 27, 2025
2 parents 3ab2c68 + 27bf4c2 commit 0a6c9a7
Show file tree
Hide file tree
Showing 20 changed files with 577 additions and 466 deletions.
2 changes: 1 addition & 1 deletion sarc/alerts/usage_alerts/cluster_scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

import pandas

from sarc.client.series import compute_time_frames, load_job_series
from sarc.config import MTL
from sarc.jobs.series import compute_time_frames, load_job_series

logger = logging.getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion sarc/alerts/usage_alerts/gpu_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from datetime import datetime, timedelta
from typing import Optional, Sequence

from sarc.client.series import load_job_series
from sarc.config import MTL
from sarc.jobs.series import load_job_series

logger = logging.getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion sarc/alerts/usage_alerts/gpu_util_per_user.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from datetime import datetime, timedelta
from typing import Optional

from sarc.client.series import compute_cost_and_waste, load_job_series
from sarc.config import MTL
from sarc.jobs.series import compute_cost_and_waste, load_job_series

logger = logging.getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion sarc/alerts/usage_alerts/prometheus_stats_occurrences.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Sequence, Union

from sarc.client.series import compute_time_frames, load_job_series
from sarc.config import MTL
from sarc.jobs.series import compute_time_frames, load_job_series

logger = logging.getLogger(__name__)

Expand Down
70 changes: 53 additions & 17 deletions sarc/cli/acquire/slurmconfig.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
from __future__ import annotations

import io
import logging
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List

from hostlist import expand_hostlist
from simple_parsing import field

from sarc.cache import CachePolicy, with_cache
from sarc.client.gpumetrics import _gpu_billing_collection
from sarc.config import config
from sarc.config import ClusterConfig, config
from sarc.jobs.node_gpu_mapping import _node_gpu_mapping_collection

logger = logging.getLogger(__name__)
Expand All @@ -18,52 +20,86 @@
@dataclass
class AcquireSlurmConfig:
cluster_name: str = field(alias=["-c"])
day: str = field(alias=["-d"])
day: str = field(
alias=["-d"],
required=False,
help=(
"Cluster config file date (format YYYY-MM-DD). "
            "Used for file versioning. Should represent a day when config file has been updated "
"(e.g. for new GPU billings, node GPUs, etc.). "
"If not specified, uses current day and downloads config file from cluster."
),
)

def execute(self) -> int:
if self.cluster_name == "mila":
logger.error("Cluster `mila` not yet supported.")
return -1

parser = SlurmConfigParser(self.cluster_name, self.day)
cluster_config = config().clusters[self.cluster_name]
parser = SlurmConfigParser(cluster_config, self.day)
slurm_conf = parser.get_slurm_config()
_gpu_billing_collection().save_gpu_billing(
self.cluster_name, self.day, slurm_conf.gpu_to_billing
self.cluster_name, parser.day, slurm_conf.gpu_to_billing
)
_node_gpu_mapping_collection().save_node_gpu_mapping(
self.cluster_name, self.day, slurm_conf.node_to_gpu
self.cluster_name, parser.day, slurm_conf.node_to_gpu
)
return 0


class FileContent:
    """
    Cache formatter for the slurm conf file.

    Treats the cached file as a single opaque text blob: ``load`` returns
    the entire file content as one string, and ``dump`` writes a string
    out verbatim. No parsing or transformation happens here.
    """

    @classmethod
    def load(cls, file) -> str:
        # Slurp the whole cached slurm.conf in one go.
        content = file.read()
        return content

    @classmethod
    def dump(cls, value: str, output_file):
        # Write the text back verbatim; caller owns open/close of the file.
        output_file.write(value)


class SlurmConfigParser:
def __init__(self, cluster_name: str, day: str):
self.cluster_name = cluster_name
def __init__(self, cluster: ClusterConfig, day: str | None = None):
if day is None:
# No day given, get current day
day = datetime.now().strftime("%Y-%m-%d")
# Cache must download slurm conf file and save it locally.
cache_policy = CachePolicy.use
logger.info(f"Looking for config file at current date: {day}")
else:
# Day given. Slurm conf file must be retrieved from cache only.
cache_policy = CachePolicy.always
self.cluster = cluster
self.day = day
self.cache_policy = cache_policy

def get_slurm_config(self) -> SlurmConfig:
return with_cache(
content = with_cache(
self._get_slurm_conf,
subdirectory="slurm_conf",
key=self._cache_key,
formatter=self,
)(cache_policy=CachePolicy.always)
formatter=FileContent,
)(cache_policy=self.cache_policy)
return self.load(io.StringIO(content))

def _get_slurm_conf(self):
raise RuntimeError(
f"Please add cluster slurm.conf file into cache, at location: "
f"{config().cache}/slurm_conf/{self._cache_key()}"
)
def _get_slurm_conf(self) -> str:
cmd = f"cat {self.cluster.slurm_conf_host_path}"
result = self.cluster.ssh.run(cmd, hide=True)
return result.stdout

def _cache_key(self):
return f"slurm.{self.cluster_name}.{self.day}.conf"
return f"slurm.{self.cluster.name}.{self.day}.conf"

def load(self, file) -> SlurmConfig:
"""
Parse cached slurm conf file and return a SlurmConfig object
containing node_to_gpu and gpu_to_billing.
"""

partitions: List[Partition] = []
node_to_gpu = {}

Expand Down
16 changes: 15 additions & 1 deletion sarc/cli/health/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import gifnoc

from sarc.alerts.common import config
from sarc.alerts.common import CheckStatus, config
from sarc.alerts.runner import CheckRunner

logger = logging.getLogger(__name__)
Expand All @@ -14,18 +14,32 @@
@dataclass
class HealthCheckCommand:
config: Path = None
once: bool = False

name: str = None

def execute(self) -> int:
with gifnoc.use(self.config):
if self.name:
# only run one check, once (no CheckRunner)
check = config.checks[self.name]
results = check(write=False)
pprint(results)
for k, status in results.statuses.items():
print(f"{status.name} -- {k}")
print(f"{results.status.name}")
elif self.once:
# run all checks, once (no CheckRunner)
for check in [c for c in config.checks.values() if c.active]:
results = check(write=False)
if results.status == CheckStatus.OK:
print(f"Check '{check.name}' succeeded.")
else:
print(f"Check '{check.name}' failed.")
pprint(results)
for k, status in results.statuses.items():
print(f"{status.name} -- {k}")
print(f"{results.status.name}")
else:
try:
runner = CheckRunner(
Expand Down
4 changes: 4 additions & 0 deletions sarc/client/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .gpumetrics import get_cluster_gpu_billings
from .job import count_jobs, get_available_clusters, get_job, get_jobs
from .series import compute_cost_and_waste, compute_time_frames, load_job_series
from .users.api import get_user, get_users

__all__ = [
Expand All @@ -10,4 +11,7 @@
"get_user",
"get_users",
"get_cluster_gpu_billings",
"load_job_series",
"compute_time_frames",
"compute_cost_and_waste",
]
Loading

0 comments on commit 0a6c9a7

Please sign in to comment.