fix: adding URL sanitization to resolve CodeQL security finding; ruff …

…format (#31) fix: adding URL sanitization to resolve CodeQL security finding
aws · Sep 12, 2024 · 66a1934 · 66a1934
1 parent b0f3555
commit 66a1934
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 19 deletions.
diff --git a/examples/basic-job-example-config.yaml b/examples/basic-job-example-config.yaml
@@ -21,26 +21,29 @@ hydra:
   output_subdir: null
 
 training_cfg:
-  # entry_script: Required. Path to the entry script of training/fine-tuning, this path should be inside container
+  # entry_script: Required. Path to the entry script of training/fine-tuning, this path should be inside container.
+  # Mapping to '--entry-script' argument in 'start-job' command.
   entry_script: /opt/pytorch-mnist/mnist.py
-  # script_args: Optional. List of script arguments. Example of usage:
+  # script_args: Optional. List of script arguments. Mapping to '--script-args' argument in 'start-job' command.
+  # Example of usage:
   # script_args:
   #  - --max_context_width: 4096
   #  - --num_layers: 32
   script_args: []
   run:
-    # name: Required. Current Training Job name
+    # name: Required. Current Training Job name. Mapping to '--job-name' argument in 'start-job' command.
     name: hyperpod-cli-test
-    # nodes: Required. Number of nodes to use for current training
+    # nodes: Required. Number of nodes to use for current training. Mapping to '--node-count' argument in 'start-job' command.
     nodes: 2
     # ntasks_per_node: Optional. Number of devices to use per node.
     # For CPU instances, default value will be 8; For GPU or TRN instances, default value
-    # will be the accelerator cores number on the instance
+    # will be the accelerator cores number on the instance. Mapping to '--tasks-per-node' argument in 'start-job' command.
     ntasks_per_node: 1
 cluster:
-  # cluster_type: Required. Currently, only support k8s cluster type
+  # cluster_type: Required. Currently, only support k8s cluster type.
   cluster_type: k8s
   # instance_type: Required. SageMaker Hyperpod supported instance type only.
+  # Mapping to '--instance-type' argument in 'start-job' command.
   instance_type: ml.c5.2xlarge
   # cluster_config: Required. Fields related to cluster configuration for each Training job run.
   cluster_config:
@@ -51,27 +54,32 @@ cluster:
     #    sagemaker.amazonaws.com/job-max-retry-count: 1
     annotations: null
     # service_account_name: Optional. The name of service account associated with the namespace.
+    # Mapping to '--service-account-name' argument in 'start-job' command.
     service_account_name: null
-    # persistent_volume_claims: Optional. The persistent volume claims, usually used to mount FSx
+    # persistent_volume_claims: Optional. The persistent volume claims, usually used to mount FSx.
+    # Mapping to '--persistent-volume-claims' argument in 'start-job' command.
     persistent_volume_claims: null
     # namespace: Optional. The namespace to submit the job to. If not specified, the Training job will be
     # submitted to the current namespace from the Kubernetes context.
+    # Mapping to '--namespace' argument in 'start-job' command.
     namespace: kubeflow
     # custom_labels: Optional. Used to specify the name of the queue, which is created by the cluster admin users.
     # custom_labels:
     #   kueue.x-k8s.io/queue-name: low-priority-queue2
     custom_labels: null
     # priority_class_name: Optional. The priority for the job, which is created by the cluster admin users.
+    # Mapping to '--priority' argument in 'start-job' command.
     priority_class_name: null
-    # volumes: Optional. Used to mount temp path to container. Example of usage:
+    # volumes: Optional. Used to mount temp path to container. Mapping to '--volumes' argument in 'start-job' command.
+    # Example of usage:
     # volumes:
     #  - volumeName: v1
     #    hostPath: /data
     #    mountPath: /data
     volumes: null
     # label_selector: Optional. Defines Kubernetes node affinity to select nodes with labels. Following
     # config will choose SageMaker HyperPod health labels and prefer nodes with SageMaker Hyperpod burn-in
-    # test passed label.
+    # test passed label. Mapping to '--label-selector' argument in 'start-job' command.
     label_selector:
       required:
         sagemaker.amazonaws.com/node-health-status:
@@ -81,17 +89,22 @@ cluster:
           - Passed
       weights:
         - 100
-    # pullPolicy: Required. Kubernetes PyTorchJob pull policy to pull container, can be Always, IfNotPresent and Never
+    # pullPolicy: Required. Kubernetes PyTorchJob pull policy to pull container, can be Always, IfNotPresent and Never.
+    # Mapping to '--pull-policy' argument in 'start-job' command.
     pullPolicy: IfNotPresent
     # restartPolicy: Required. Kubernetes PyTorchJob restart policy. Can be OnFailure, Always or Never.
     # To use SageMaker Hyperpod AutoResume functionality, please set it to OnFailure.
+    # Mapping to '--restart-policy' argument in 'start-job' command.
     restartPolicy: OnFailure
 
 # base_results_dir: Optional. Location to store the results, checkpoints and logs.
+# Mapping to '--results-dir' argument in 'start-job' command.
 base_results_dir: ./result
 # container: Required. Docker image to be used for Training Job
+# Mapping to '--image' argument in 'start-job' command.
 container: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-bc09cfd
 
 # env_vars: Optional. Environment variables passed to the training job.
+# Mapping to '--environment' argument in 'start-job' command.
 env_vars:
   NCCL_DEBUG: INFO # Logging level for NCCL. Set to "INFO" for debug information
diff --git a/src/hyperpod_cli/custom_launcher/launcher/nemo/stages.py b/src/hyperpod_cli/custom_launcher/launcher/nemo/stages.py
@@ -4,6 +4,7 @@
 from typing import Dict, List
 
 import omegaconf
+from urllib.parse import urlparse
 from nemo_launcher.core.stages import Training
 from nemo_launcher.utils.job_utils import JobPaths
 from omegaconf import OmegaConf
@@ -280,7 +281,9 @@ def insert_git_token(self, repo_url: str, token: str) -> str:
         """
         Insert git token to git repo url. Currently only support github repo
         """
-        if "github.com" in repo_url:
+
+        host_name = urlparse(repo_url).hostname
+        if "github.com" == host_name:
             splitted_url = repo_url.split("github.com", 1)
             repo_url = splitted_url[0] + self.cfg.git.token + "@github.com" + splitted_url[1]
         return repo_url

diff --git a/test/integration_tests/abstract_integration_tests.py b/test/integration_tests/abstract_integration_tests.py
@@ -224,12 +224,7 @@ def describe_vpc_stack_and_set_values(self, cfn_client):
                     )
 
     def apply_helm_charts(self):
-        command = [
-            "helm",
-            "dependencies",
-            "update",
-            "helm_chart/HyperPodHelmChart"
-        ]
+        command = ["helm", "dependencies", "update", "helm_chart/HyperPodHelmChart"]
 
         try:
             # Execute the command to update helm charts
@@ -250,7 +245,7 @@ def apply_helm_charts(self):
             "dependencies",
             "helm_chart/HyperPodHelmChart",
             "--namespace",
-            "kube-system"
+            "kube-system",
         ]
 
         try:
@@ -266,7 +261,6 @@ def apply_helm_charts(self):
         except subprocess.CalledProcessError as e:
             raise RuntimeError(f"Failed to apply helm charts: {e}")
 
-
     def setup(self):
         self.new_session = self._create_session()
         self.create_test_resorces(self.new_session)