Skip to content

Commit

Permalink
Refactor DSS provider tests (New) (#1601)
Browse files Browse the repository at this point in the history
The tests for the DSS provider in contrib have been refactored so that they use individual scripts, now available in the relevant bin folder, instead of scripts written inline in the jobs' command. The tests end up checking the same things, but are now better implemented.

The inline shell snippets were extracted into multiple scripts that roughly group related tests: those generic to DSS, those for setting up Intel GPUs, those for using ITEX, and those for using IPEX. Furthermore, larger Python scripts that were previously written inline have been factored out into individual files. Please note that these Python scripts are not meant to be run by the jobs directly; instead they are submitted via the job to DSS to be run in the appropriate Kubernetes Pods, hence the Python dependencies of these scripts are not relevant to the provider itself.
  • Loading branch information
motjuste authored Nov 18, 2024
1 parent 1cd8e52 commit cdb82d0
Show file tree
Hide file tree
Showing 8 changed files with 417 additions and 252 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/testflinger-contrib-dss-regression.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
-e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \
-e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \
-e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \
${GITHUB_WORKSPACE}/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/job.yaml
- name: Build job file from template with oemscript provisioning
if: ${{ matrix.queue == 'dell-precision-5680-c31665' }}
Expand All @@ -51,7 +51,7 @@ jobs:
-e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \
-e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \
-e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \
${GITHUB_WORKSPACE}/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/job.yaml
- name: Submit testflinger job
uses: canonical/testflinger/.github/actions/submit@main
Expand Down
110 changes: 110 additions & 0 deletions contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env bash
# Generic DSS test cases for the checkbox provider; each test case is
# exposed as a subcommand (see help_function / main at the bottom).

set -euxo pipefail

# IMPORTANT: for any test using the dss command:
#
# - Clear PYTHON shell vars to prevent conflicts between dss
#   and checkbox python environments
# - Run from ${HOME} as dss writes logs to its working directory,
#   and as a snap does not have permissions to write to the default
#   working directory for checkbox tests
# `export -n` drops the export attribute so child processes (dss's own
# python) do not inherit checkbox's interpreter configuration.
export -n PYTHONHOME PYTHONPATH PYTHONUSERBASE

check_dss_can_be_initialized() {
    # TODO: we actually seem to initialize dss here; maybe split it out
    # Run from ${HOME}: the dss snap writes logs to its working directory.
    cd "${HOME}"
    local kubeconfig
    kubeconfig="$(sudo microk8s config)"
    dss initialize --kubeconfig="${kubeconfig}"
    echo "Test success: dss initialized."
}

check_dss_namespace_is_deployed() {
    # Pass iff a namespace whose name contains 'dss' exists in the cluster.
    if ! microk8s.kubectl get ns | grep -q dss; then
        >&2 echo "Test failure: no namespace named 'dss' deployed."
        exit 1
    fi
    echo "Test success: 'dss' namespace is deployed!"
}

check_mlflow_status_is_ready() {
    cd "${HOME}"
    # Capture the status once; streaming it straight into grep can raise
    # broken-pipe errors from the dss snap.
    local status_output
    status_output=$(dss status)
    case "${status_output}" in
        *"MLflow deployment: Ready"*)
            echo "Test success: 'dss status' shows ready status for mlflow."
            ;;
        *)
            >&2 echo "Test failure: 'dss status' does not show ready status for mlflow."
            exit 1
            ;;
    esac
}

check_mlflow_is_deployed_as_first_service() {
    # TODO: enable mlflow to be a service in any position
    local first_service
    first_service=$(microk8s.kubectl get service -n dss -o jsonpath='{.items[0].metadata.name}')
    if [ "${first_service}" != "mlflow" ]; then
        >&2 echo "Test failure: expected service name 'mlflow' but got ${first_service}"
        exit 1
    fi
    echo "Test success: 'mlflow' service is deployed!"
}

check_dss_has_intel_gpu_acceleration_enabled() {
    cd "${HOME}"
    # Capture the status once to avoid broken-pipe errors from the snap.
    local status_output
    status_output=$(dss status)
    if ! grep -q "Intel GPU acceleration: Enabled" <<<"${status_output}"; then
        >&2 echo "Test failure: 'dss status' does not report that Intel GPU acceleration is enabled."
        exit 1
    fi
    echo "Test success: 'dss status' correctly reports Intel GPU status."
}

check_dss_can_create_itex_215_notebook() {
    cd "${HOME}"
    local notebook_image="intel/intel-extension-for-tensorflow:2.15.0-xpu-idp-jupyter"
    if ! dss create itex-215-notebook --image="${notebook_image}"; then
        >&2 echo "Test failure: failed to create an ITEX 2.15 notebook."
        exit 1
    fi
    echo "Test success: successfully created an ITEX 2.15 notebook."
}

check_dss_can_create_ipex_2120_notebook() {
    cd "${HOME}"
    local notebook_image="intel/intel-extension-for-pytorch:2.1.20-xpu-idp-jupyter"
    if ! dss create ipex-2120-notebook --image="${notebook_image}"; then
        >&2 echo "Test failure: failed to create an IPEX 2.1.20 notebook."
        exit 1
    fi
    echo "Test success: successfully created an IPEX 2.1.20 notebook."
}

help_function() {
    # Print usage plus the mapping of test-case names to check functions.
    echo "This script is used for generic tests related to DSS"
    echo "Usage: check_dss.sh <test_case>"
    echo
    echo "Test cases currently implemented:"
    local -a test_cases=(
        "<dss_can_be_initialized>: check_dss_can_be_initialized"
        "<dss_namespace_is_deployed>: check_dss_namespace_is_deployed"
        "<mlflow_status_is_ready>: check_mlflow_status_is_ready"
        "<mlflow_is_deployed_as_first_service>: check_mlflow_is_deployed_as_first_service"
        "<intel_gpu_acceleration_is_enabled>: check_dss_has_intel_gpu_acceleration_enabled"
        "<can_create_itex_215_notebook>: check_dss_can_create_itex_215_notebook"
        "<can_create_ipex_2120_notebook>: check_dss_can_create_ipex_2120_notebook"
    )
    local entry
    for entry in "${test_cases[@]}"; do
        printf '\t%s\n' "${entry}"
    done
}

main() {
    # Dispatch the requested test case. Use ${1:-} so that running the
    # script without arguments prints the help text instead of aborting
    # with an "unbound variable" error under `set -u`.
    case "${1:-}" in
    dss_can_be_initialized) check_dss_can_be_initialized ;;
    dss_namespace_is_deployed) check_dss_namespace_is_deployed ;;
    mlflow_status_is_ready) check_mlflow_status_is_ready ;;
    mlflow_is_deployed_as_first_service) check_mlflow_is_deployed_as_first_service ;;
    intel_gpu_acceleration_is_enabled) check_dss_has_intel_gpu_acceleration_enabled ;;
    can_create_itex_215_notebook) check_dss_can_create_itex_215_notebook ;;
    can_create_ipex_2120_notebook) check_dss_can_create_ipex_2120_notebook ;;
    *) help_function ;;
    esac
}

main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/usr/bin/env bash

set -euxo pipefail

check_host_has_intel_gpus() {
    # An Intel GPU is present when intel_gpu_top lists a PCI device with
    # Intel's vendor id (8086).
    local result
    result=$(intel_gpu_top -L)
    if [[ ${result} == *"pci:vendor=8086"* ]]; then
        echo "Test success: Intel GPU available on host: ${result}"
    else
        # Fixed quoting: the inner quotes were previously unescaped, so the
        # shell consumed them and the tool name was printed unquoted.
        >&2 echo "Test failure: \"intel_gpu_top -L\" reports no Intel GPUs: ${result}"
        exit 1
    fi
}

check_intel_gpu_plugin_can_be_installed() {
    # Deploys the Intel device plugin stack into the cluster: node feature
    # discovery (NFD), its feature rules, and the GPU plugin itself, then
    # waits for both daemonset rollouts to complete.
    # Using kubectl directly due to this bug: https://github.com/canonical/microk8s/issues/4453

    # TODO: make version a param
    VERSION=v0.30.0
    # hack as redirecting stdout anywhere but /dev/null throws a permission denied error
    # see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4
    kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd?ref=${VERSION} | tee /tmp/node_feature_discovery.yaml >/dev/null
    kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd/overlays/node-feature-rules?ref=${VERSION} | tee /tmp/node_feature_rules.yaml >/dev/null
    kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/gpu_plugin/overlays/nfd_labeled_nodes?ref=${VERSION} | tee /tmp/gpu_plugin.yaml >/dev/null
    # Inject -shared-dev-num=10 so each physical GPU exposes 10 schedulable
    # slots; the capacity/allocatable slot checks below assume this number.
    sed -i 's/enable-monitoring/enable-monitoring\n - -shared-dev-num=10/' /tmp/gpu_plugin.yaml
    kubectl apply -f /tmp/node_feature_discovery.yaml
    kubectl apply -f /tmp/node_feature_rules.yaml
    kubectl apply -f /tmp/gpu_plugin.yaml
    # Give the daemonsets time to be created before querying rollout status.
    SLEEP_SECS=15
    echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking rollout status."
    sleep ${SLEEP_SECS}
    kubectl -n node-feature-discovery rollout status ds/nfd-worker
    kubectl -n default rollout status ds/intel-gpu-plugin
    echo "[INFO]: sleeping for ${SLEEP_SECS} seconds to allow pod status to update for subsequent tests."
    sleep ${SLEEP_SECS}
    echo "Test success: Intel K8s GPU Device Plugin deployed."
}

check_intel_gpu_plugin_daemonset_is_deployed() {
    # The first daemonset is expected to be the Intel GPU plugin installed
    # by check_intel_gpu_plugin_can_be_installed.
    local ds_name
    ds_name=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].metadata.name}')
    if [ "${ds_name}" != "intel-gpu-plugin" ]; then
        >&2 echo "Test failure: expected daemonset name 'intel-gpu-plugin' but got ${ds_name}"
        exit 1
    fi
    echo "Test success: 'intel-gpu-plugin' daemonset is deployed!"
}

check_one_intel_gpu_plugin_daemonset_is_available() {
    # Exactly one pod of the plugin daemonset should be available.
    local available
    available=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].status.numberAvailable}')
    if [ "${available}" != "1" ]; then
        >&2 echo "Test failure: expected numberAvailable to be 1 but got ${available}"
        exit 1
    fi
    echo "Test success: 1 daemonset in numberAvailable status."
}

check_one_intel_gpu_plugin_daemonset_is_ready() {
    # Exactly one pod of the plugin daemonset should be ready.
    local ready
    ready=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].status.numberReady}')
    if [ "${ready}" != "1" ]; then
        >&2 echo "Test failure: expected numberReady to be 1 but got ${ready}"
        exit 1
    fi
    echo "Test success: 1 daemonset in numberReady status."
}

check_intel_gpu_node_label_is_attached() {
    # NFD should have labelled the node with the Intel GPU feature label.
    local label_value
    label_value=$(microk8s.kubectl get node -o jsonpath='{.items[0].metadata.labels.intel\.feature\.node\.kubernetes\.io/gpu}')
    if [ "${label_value}" != "true" ]; then
        >&2 echo "Test failure: expected 'true' but got ${label_value}"
        exit 1
    fi
    echo "Test success: found expected label: 'intel.feature.node.kubernetes.io/gpu': 'true'"
}

check_at_least_one_intel_gpu_is_available() {
    # Sum the per-device-id GPU counts from the node labels.
    local num_gpus
    num_gpus=$(
        microk8s.kubectl get node -o json \
            | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
            | awk '{cnt+=$1} END{print cnt}'
    )
    if [ "${num_gpus}" -ge 1 ]; then
        echo "Test success: Found ${num_gpus} GPUs on system."
    else
        >&2 echo "Test failure: expected at least 1 GPU but got ${num_gpus}"
        exit 1
    fi
}

check_capacity_slots_for_intel_gpus_match() {
    # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
    local -r SLOTS_PER_GPU=10
    local num_gpus capacity expected_slots
    num_gpus=$(
        microk8s.kubectl get node -o json \
            | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
            | awk '{cnt+=$1} END{print cnt}'
    )
    capacity=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.capacity.gpu\.intel\.com/i915}')
    expected_slots=$((num_gpus * SLOTS_PER_GPU))
    if [ "${expected_slots}" -eq "${capacity}" ]; then
        echo "Test success: Found ${capacity} GPU capacity slots on k8s node."
    else
        >&2 echo "Test failure: expected ${expected_slots} GPU capacity slots but got ${capacity}"
        exit 1
    fi
}

check_allocatable_slots_for_intel_gpus_match() {
    # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
    local -r SLOTS_PER_GPU=10
    local num_gpus allocatable expected_slots
    num_gpus=$(
        microk8s.kubectl get node -o json \
            | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
            | awk '{cnt+=$1} END{print cnt}'
    )
    allocatable=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.allocatable.gpu\.intel\.com/i915}')
    expected_slots=$((num_gpus * SLOTS_PER_GPU))
    if [ "${expected_slots}" -eq "${allocatable}" ]; then
        echo "Test success: Found ${allocatable} GPU allocatable slots on k8s node."
    else
        >&2 echo "Test failure: expected ${expected_slots} GPU allocatable slots but got ${allocatable}"
        exit 1
    fi
}

help_function() {
    # Print usage plus the mapping of test-case names to check functions.
    echo "This script is used for tests related to Intel GPUs"
    # Report the actual invocation name; the usage line previously said
    # "check.sh", which matches no script in this provider.
    echo "Usage: ${0##*/} <test_case>"
    echo
    echo "Test cases currently implemented:"
    echo -e "\t<host_has_intel_gpus>: check_host_has_intel_gpus"
    echo -e "\t<gpu_plugin_can_be_installed>: check_intel_gpu_plugin_can_be_installed"
    echo -e "\t<gpu_plugin_daemonset_is_deployed>: check_intel_gpu_plugin_daemonset_is_deployed"
    echo -e "\t<one_daemonset_is_available>: check_one_intel_gpu_plugin_daemonset_is_available"
    echo -e "\t<one_daemonset_is_ready>: check_one_intel_gpu_plugin_daemonset_is_ready"
    echo -e "\t<gpu_node_label_is_attached>: check_intel_gpu_node_label_is_attached"
    echo -e "\t<at_least_one_gpu_is_available>: check_at_least_one_intel_gpu_is_available"
    echo -e "\t<capacity_slots_for_gpus_match>: check_capacity_slots_for_intel_gpus_match"
    echo -e "\t<allocatable_slots_for_gpus_match>: check_allocatable_slots_for_intel_gpus_match"
}

main() {
    # Dispatch the requested test case. Use ${1:-} so that running the
    # script without arguments prints the help text instead of aborting
    # with an "unbound variable" error under `set -u`.
    case "${1:-}" in
    host_has_intel_gpus) check_host_has_intel_gpus ;;
    gpu_plugin_can_be_installed) check_intel_gpu_plugin_can_be_installed ;;
    gpu_plugin_daemonset_is_deployed) check_intel_gpu_plugin_daemonset_is_deployed ;;
    one_daemonset_is_available) check_one_intel_gpu_plugin_daemonset_is_available ;;
    one_daemonset_is_ready) check_one_intel_gpu_plugin_daemonset_is_ready ;;
    gpu_node_label_is_attached) check_intel_gpu_node_label_is_attached ;;
    at_least_one_gpu_is_available) check_at_least_one_intel_gpu_is_available ;;
    capacity_slots_for_gpus_match) check_capacity_slots_for_intel_gpus_match ;;
    allocatable_slots_for_gpus_match) check_allocatable_slots_for_intel_gpus_match ;;
    *) help_function ;;
    esac
}

main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env bash
# IPEX test cases for the checkbox provider; each test case is exposed as
# a subcommand (see help_function / main at the bottom).

set -euxo pipefail

# Directory holding this script; the sibling Python probe scripts
# (submitted to the notebook pod via kubectl exec) live next to it.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

check_ipex_can_be_imported() {
    # $1 - name of the running ipex notebook pod in the 'dss' namespace.
    echo "Starting ipex import test"
    local probe="import intel_extension_for_pytorch as ipex; import torch; import jupyter"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "$probe"; then
        >&2 echo "FAIL: Did not find IPEX python module"
        exit 1
    fi
    echo "PASS: Found module"
    exit 0
}

check_pytorch_can_use_xpu() {
    # $1 - name of the running ipex notebook pod in the 'dss' namespace.
    # Passes when the pytorch_can_use_xpu.py probe reports a gpu device.
    echo "Starting ipex GPU check test"
    local script
    script="$(cat "$SCRIPT_DIR/pytorch_can_use_xpu.py")"
    # Capture the probe's stderr too, and tolerate a non-matching grep:
    # under `set -e -o pipefail` a failed grep would otherwise abort the
    # script before the "No GPU found" branch below could ever run.
    local gpu_grep_out
    gpu_grep_out=$(microk8s.kubectl -n dss exec "$1" -- python3 -c "$script" 2>&1 | grep "dev_type=.gpu" || true)
    if [[ -z ${gpu_grep_out} ]]; then
        >&2 echo "FAIL: No GPU found"
        exit 1
    else
        echo "PASS: GPU found"
        exit 0
    fi
}

help_function() {
    # Print usage plus the mapping of test-case names to check functions.
    echo "This script is used for tests related to IPEX"
    # Report the actual invocation name; it previously claimed "check_dss.sh".
    echo "Usage: ${0##*/} <test_case>"
    echo
    echo "Test cases currently implemented:"
    # Fixed copy-paste from the ITEX script: this entry previously named
    # check_itex_can_be_imported, which does not exist here.
    echo -e "\t<can_be_imported>: check_ipex_can_be_imported"
    echo -e "\t<pytorch_can_use_xpu>: check_pytorch_can_use_xpu"
}

main() {
    # Use ${1:-} so a missing argument prints help under `set -u`, and
    # resolve it before the pod lookup so help does not require a pod.
    local test_case="${1:-}"
    case "${test_case}" in
    can_be_imported | pytorch_can_use_xpu) ;;
    *)
        help_function
        return 0
        ;;
    esac
    # Fail with a clear message when no matching pod is running; under
    # `set -o pipefail` the bare lookup would abort without explanation.
    local pod
    pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'ipex-2120-notebook\S*') \
        || { >&2 echo "Test failure: no running ipex-2120-notebook pod found."; exit 1; }
    echo "Found PyTorch pod: ${pod}"
    case "${test_case}" in
    can_be_imported) check_ipex_can_be_imported "$pod" ;;
    pytorch_can_use_xpu) check_pytorch_can_use_xpu "$pod" ;;
    esac
}

main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# ITEX test cases for the checkbox provider; each test case is exposed as
# a subcommand (see help_function / main at the bottom).

set -euxo pipefail

# Directory holding this script; the sibling Python probe scripts
# (submitted to the notebook pod via kubectl exec) live next to it.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

check_itex_can_be_imported() {
    # $1 - name of the running itex notebook pod in the 'dss' namespace.
    echo "Starting itex import test"
    local probe="import intel_extension_for_tensorflow as itex; import tensorflow; import jupyter"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "$probe"; then
        >&2 echo "FAIL: Did not find ITEX python module"
        exit 1
    fi
    echo "PASS: Found module"
    exit 0
}

check_tensorflow_can_use_xpu() {
    # $1 - name of the running itex notebook pod in the 'dss' namespace.
    # The Python probe itself exits non-zero when no XPU is visible.
    echo "Starting itex GPU check test"
    local probe
    probe="$(cat "$SCRIPT_DIR/tensorflow_can_use_xpu.py")"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "$probe"; then
        >&2 echo "FAIL: No XPU found"
        exit 1
    fi
    echo "PASS: XPU found"
    exit 0
}

help_function() {
    # Print usage plus the mapping of test-case names to check functions.
    echo "This script is used for tests related to ITEX"
    # Report the actual invocation name; it previously claimed "check_dss.sh".
    echo "Usage: ${0##*/} <test_case>"
    echo
    echo "Test cases currently implemented:"
    echo -e "\t<can_be_imported>: check_itex_can_be_imported"
    echo -e "\t<tensorflow_can_use_xpu>: check_tensorflow_can_use_xpu"
}

main() {
    # Use ${1:-} so a missing argument prints help under `set -u`, and
    # resolve it before the pod lookup so help does not require a pod.
    local test_case="${1:-}"
    case "${test_case}" in
    can_be_imported | tensorflow_can_use_xpu) ;;
    *)
        help_function
        return 0
        ;;
    esac
    # Fail with a clear message when no matching pod is running; under
    # `set -o pipefail` the bare lookup would abort without explanation.
    local pod
    pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'itex-215-notebook\S*') \
        || { >&2 echo "Test failure: no running itex-215-notebook pod found."; exit 1; }
    echo "Found Tensorflow pod: ${pod}"
    case "${test_case}" in
    can_be_imported) check_itex_can_be_imported "$pod" ;;
    tensorflow_can_use_xpu) check_tensorflow_can_use_xpu "$pod" ;;
    esac
}

main "$@"
Loading

0 comments on commit cdb82d0

Please sign in to comment.