Skip to content

Commit

Permalink
Refactor DSS provider tests (New) (#1601)
Browse files Browse the repository at this point in the history
The tests for the DSS provider in contrib have been refactored so that they use individual scripts, now available in the relevant bin folder, instead of scripts written inline in the jobs' command. The tests end up checking the same things, but are now better implemented.

The inline shell snippets were extracted into multiple scripts that roughly group related tests: those generic to DSS, those for setting up Intel GPUs, those for using ITEX, and those for using IPEX. Furthermore, larger Python scripts that were previously written inline have been factored out into individual files. Please note that these Python scripts are not meant to be run by the jobs directly; instead they are submitted via the job to DSS to be run in the appropriate Kubernetes Pods, hence the Python dependencies of these scripts are not relevant to the provider itself.
  • Loading branch information
motjuste authored Nov 18, 2024
1 parent 1cd8e52 commit cdb82d0
Show file tree
Hide file tree
Showing 8 changed files with 417 additions and 252 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/testflinger-contrib-dss-regression.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
-e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \
-e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \
-e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \
${GITHUB_WORKSPACE}/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/job.yaml
- name: Build job file from template with oemscript provisioning
if: ${{ matrix.queue == 'dell-precision-5680-c31665' }}
Expand All @@ -51,7 +51,7 @@ jobs:
-e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \
-e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \
-e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \
${GITHUB_WORKSPACE}/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/job.yaml
- name: Submit testflinger job
uses: canonical/testflinger/.github/actions/submit@main
Expand Down
110 changes: 110 additions & 0 deletions contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env bash
# Generic DSS test cases for the checkbox provider; each test case is
# exposed as a subcommand (see help_function / main at the bottom).

set -euxo pipefail

# IMPORTANT: for any test using the dss command:
#
# - Clear PYTHON shell vars to prevent conflicts between dss
#   and checkbox python environments
# - Run from ${HOME} as dss writes logs to its working directory,
#   and as a snap does not have permissions to write to the default
#   working directory for checkbox tests
# `export -n` drops the export attribute so child processes (dss's own
# python) do not inherit checkbox's interpreter configuration.
export -n PYTHONHOME PYTHONPATH PYTHONUSERBASE

check_dss_can_be_initialized() {
    # TODO: we actually seem to initialize dss here; maybe split it out
    # Run from ${HOME}: the dss snap writes logs to its working directory.
    cd "${HOME}"
    local kubeconfig
    kubeconfig="$(sudo microk8s config)"
    dss initialize --kubeconfig="${kubeconfig}"
    echo "Test success: dss initialized."
}

check_dss_namespace_is_deployed() {
    # Pass iff a namespace whose name contains 'dss' exists in the cluster.
    if ! microk8s.kubectl get ns | grep -q dss; then
        >&2 echo "Test failure: no namespace named 'dss' deployed."
        exit 1
    fi
    echo "Test success: 'dss' namespace is deployed!"
}

check_mlflow_status_is_ready() {
    cd "${HOME}"
    # Capture the status once; streaming it straight into grep can raise
    # broken-pipe errors from the dss snap.
    local status_output
    status_output=$(dss status)
    case "${status_output}" in
        *"MLflow deployment: Ready"*)
            echo "Test success: 'dss status' shows ready status for mlflow."
            ;;
        *)
            >&2 echo "Test failure: 'dss status' does not show ready status for mlflow."
            exit 1
            ;;
    esac
}

check_mlflow_is_deployed_as_first_service() {
    # TODO: enable mlflow to be a service in any position
    local first_service
    first_service=$(microk8s.kubectl get service -n dss -o jsonpath='{.items[0].metadata.name}')
    if [ "${first_service}" != "mlflow" ]; then
        >&2 echo "Test failure: expected service name 'mlflow' but got ${first_service}"
        exit 1
    fi
    echo "Test success: 'mlflow' service is deployed!"
}

check_dss_has_intel_gpu_acceleration_enabled() {
    cd "${HOME}"
    # Capture the status once to avoid broken-pipe errors from the snap.
    local status_output
    status_output=$(dss status)
    if ! grep -q "Intel GPU acceleration: Enabled" <<<"${status_output}"; then
        >&2 echo "Test failure: 'dss status' does not report that Intel GPU acceleration is enabled."
        exit 1
    fi
    echo "Test success: 'dss status' correctly reports Intel GPU status."
}

check_dss_can_create_itex_215_notebook() {
    cd "${HOME}"
    local notebook_image="intel/intel-extension-for-tensorflow:2.15.0-xpu-idp-jupyter"
    if ! dss create itex-215-notebook --image="${notebook_image}"; then
        >&2 echo "Test failure: failed to create an ITEX 2.15 notebook."
        exit 1
    fi
    echo "Test success: successfully created an ITEX 2.15 notebook."
}

check_dss_can_create_ipex_2120_notebook() {
    cd "${HOME}"
    local notebook_image="intel/intel-extension-for-pytorch:2.1.20-xpu-idp-jupyter"
    if ! dss create ipex-2120-notebook --image="${notebook_image}"; then
        >&2 echo "Test failure: failed to create an IPEX 2.1.20 notebook."
        exit 1
    fi
    echo "Test success: successfully created an IPEX 2.1.20 notebook."
}

help_function() {
    # Print usage plus the mapping of test-case names to check functions.
    echo "This script is used for generic tests related to DSS"
    echo "Usage: check_dss.sh <test_case>"
    echo
    echo "Test cases currently implemented:"
    local -a test_cases=(
        "<dss_can_be_initialized>: check_dss_can_be_initialized"
        "<dss_namespace_is_deployed>: check_dss_namespace_is_deployed"
        "<mlflow_status_is_ready>: check_mlflow_status_is_ready"
        "<mlflow_is_deployed_as_first_service>: check_mlflow_is_deployed_as_first_service"
        "<intel_gpu_acceleration_is_enabled>: check_dss_has_intel_gpu_acceleration_enabled"
        "<can_create_itex_215_notebook>: check_dss_can_create_itex_215_notebook"
        "<can_create_ipex_2120_notebook>: check_dss_can_create_ipex_2120_notebook"
    )
    local entry
    for entry in "${test_cases[@]}"; do
        printf '\t%s\n' "${entry}"
    done
}

main() {
    # Dispatch the requested test case. Use ${1:-} so that running the
    # script without arguments prints the help text instead of aborting
    # with an "unbound variable" error under `set -u`.
    case "${1:-}" in
    dss_can_be_initialized) check_dss_can_be_initialized ;;
    dss_namespace_is_deployed) check_dss_namespace_is_deployed ;;
    mlflow_status_is_ready) check_mlflow_status_is_ready ;;
    mlflow_is_deployed_as_first_service) check_mlflow_is_deployed_as_first_service ;;
    intel_gpu_acceleration_is_enabled) check_dss_has_intel_gpu_acceleration_enabled ;;
    can_create_itex_215_notebook) check_dss_can_create_itex_215_notebook ;;
    can_create_ipex_2120_notebook) check_dss_can_create_ipex_2120_notebook ;;
    *) help_function ;;
    esac
}

main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/usr/bin/env bash

set -euxo pipefail

check_host_has_intel_gpus() {
    # An Intel GPU is present when intel_gpu_top lists a PCI device with
    # Intel's vendor id (8086).
    local result
    result=$(intel_gpu_top -L)
    if [[ ${result} == *"pci:vendor=8086"* ]]; then
        echo "Test success: Intel GPU available on host: ${result}"
    else
        # Fixed quoting: the inner quotes were previously unescaped, so the
        # shell consumed them and the tool name was printed unquoted.
        >&2 echo "Test failure: \"intel_gpu_top -L\" reports no Intel GPUs: ${result}"
        exit 1
    fi
}

check_intel_gpu_plugin_can_be_installed() {
    # Deploys the Intel device plugin stack into the cluster: node feature
    # discovery (NFD), its feature rules, and the GPU plugin itself, then
    # waits for both daemonset rollouts to complete.
    # Using kubectl directly due to this bug: https://github.com/canonical/microk8s/issues/4453

    # TODO: make version a param
    VERSION=v0.30.0
    # hack as redirecting stdout anywhere but /dev/null throws a permission denied error
    # see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4
    kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd?ref=${VERSION} | tee /tmp/node_feature_discovery.yaml >/dev/null
    kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd/overlays/node-feature-rules?ref=${VERSION} | tee /tmp/node_feature_rules.yaml >/dev/null
    kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/gpu_plugin/overlays/nfd_labeled_nodes?ref=${VERSION} | tee /tmp/gpu_plugin.yaml >/dev/null
    # Inject -shared-dev-num=10 so each physical GPU exposes 10 schedulable
    # slots; the capacity/allocatable slot checks below assume this number.
    sed -i 's/enable-monitoring/enable-monitoring\n - -shared-dev-num=10/' /tmp/gpu_plugin.yaml
    kubectl apply -f /tmp/node_feature_discovery.yaml
    kubectl apply -f /tmp/node_feature_rules.yaml
    kubectl apply -f /tmp/gpu_plugin.yaml
    # Give the daemonsets time to be created before querying rollout status.
    SLEEP_SECS=15
    echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking rollout status."
    sleep ${SLEEP_SECS}
    kubectl -n node-feature-discovery rollout status ds/nfd-worker
    kubectl -n default rollout status ds/intel-gpu-plugin
    echo "[INFO]: sleeping for ${SLEEP_SECS} seconds to allow pod status to update for subsequent tests."
    sleep ${SLEEP_SECS}
    echo "Test success: Intel K8s GPU Device Plugin deployed."
}

check_intel_gpu_plugin_daemonset_is_deployed() {
    # The first daemonset is expected to be the Intel GPU plugin installed
    # by check_intel_gpu_plugin_can_be_installed.
    local ds_name
    ds_name=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].metadata.name}')
    if [ "${ds_name}" != "intel-gpu-plugin" ]; then
        >&2 echo "Test failure: expected daemonset name 'intel-gpu-plugin' but got ${ds_name}"
        exit 1
    fi
    echo "Test success: 'intel-gpu-plugin' daemonset is deployed!"
}

check_one_intel_gpu_plugin_daemonset_is_available() {
    # Exactly one pod of the plugin daemonset should be available.
    local available
    available=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].status.numberAvailable}')
    if [ "${available}" != "1" ]; then
        >&2 echo "Test failure: expected numberAvailable to be 1 but got ${available}"
        exit 1
    fi
    echo "Test success: 1 daemonset in numberAvailable status."
}

check_one_intel_gpu_plugin_daemonset_is_ready() {
    # Exactly one pod of the plugin daemonset should be ready.
    local ready
    ready=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].status.numberReady}')
    if [ "${ready}" != "1" ]; then
        >&2 echo "Test failure: expected numberReady to be 1 but got ${ready}"
        exit 1
    fi
    echo "Test success: 1 daemonset in numberReady status."
}

check_intel_gpu_node_label_is_attached() {
    # NFD should have labelled the node with the Intel GPU feature label.
    local label_value
    label_value=$(microk8s.kubectl get node -o jsonpath='{.items[0].metadata.labels.intel\.feature\.node\.kubernetes\.io/gpu}')
    if [ "${label_value}" != "true" ]; then
        >&2 echo "Test failure: expected 'true' but got ${label_value}"
        exit 1
    fi
    echo "Test success: found expected label: 'intel.feature.node.kubernetes.io/gpu': 'true'"
}

check_at_least_one_intel_gpu_is_available() {
    # Sum the per-device-id GPU counts from the node labels.
    local num_gpus
    num_gpus=$(
        microk8s.kubectl get node -o json \
            | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
            | awk '{cnt+=$1} END{print cnt}'
    )
    if [ "${num_gpus}" -ge 1 ]; then
        echo "Test success: Found ${num_gpus} GPUs on system."
    else
        >&2 echo "Test failure: expected at least 1 GPU but got ${num_gpus}"
        exit 1
    fi
}

check_capacity_slots_for_intel_gpus_match() {
    # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
    local -r SLOTS_PER_GPU=10
    local num_gpus capacity expected_slots
    num_gpus=$(
        microk8s.kubectl get node -o json \
            | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
            | awk '{cnt+=$1} END{print cnt}'
    )
    capacity=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.capacity.gpu\.intel\.com/i915}')
    expected_slots=$((num_gpus * SLOTS_PER_GPU))
    if [ "${expected_slots}" -eq "${capacity}" ]; then
        echo "Test success: Found ${capacity} GPU capacity slots on k8s node."
    else
        >&2 echo "Test failure: expected ${expected_slots} GPU capacity slots but got ${capacity}"
        exit 1
    fi
}

check_allocatable_slots_for_intel_gpus_match() {
    # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
    local -r SLOTS_PER_GPU=10
    local num_gpus allocatable expected_slots
    num_gpus=$(
        microk8s.kubectl get node -o json \
            | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
            | awk '{cnt+=$1} END{print cnt}'
    )
    allocatable=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.allocatable.gpu\.intel\.com/i915}')
    expected_slots=$((num_gpus * SLOTS_PER_GPU))
    if [ "${expected_slots}" -eq "${allocatable}" ]; then
        echo "Test success: Found ${allocatable} GPU allocatable slots on k8s node."
    else
        >&2 echo "Test failure: expected ${expected_slots} GPU allocatable slots but got ${allocatable}"
        exit 1
    fi
}

help_function() {
    # Print usage plus the mapping of test-case names to check functions.
    echo "This script is used for tests related to Intel GPUs"
    # Report the actual invocation name; the usage line previously said
    # "check.sh", which matches no script in this provider.
    echo "Usage: ${0##*/} <test_case>"
    echo
    echo "Test cases currently implemented:"
    echo -e "\t<host_has_intel_gpus>: check_host_has_intel_gpus"
    echo -e "\t<gpu_plugin_can_be_installed>: check_intel_gpu_plugin_can_be_installed"
    echo -e "\t<gpu_plugin_daemonset_is_deployed>: check_intel_gpu_plugin_daemonset_is_deployed"
    echo -e "\t<one_daemonset_is_available>: check_one_intel_gpu_plugin_daemonset_is_available"
    echo -e "\t<one_daemonset_is_ready>: check_one_intel_gpu_plugin_daemonset_is_ready"
    echo -e "\t<gpu_node_label_is_attached>: check_intel_gpu_node_label_is_attached"
    echo -e "\t<at_least_one_gpu_is_available>: check_at_least_one_intel_gpu_is_available"
    echo -e "\t<capacity_slots_for_gpus_match>: check_capacity_slots_for_intel_gpus_match"
    echo -e "\t<allocatable_slots_for_gpus_match>: check_allocatable_slots_for_intel_gpus_match"
}

main() {
    # Dispatch the requested test case. Use ${1:-} so that running the
    # script without arguments prints the help text instead of aborting
    # with an "unbound variable" error under `set -u`.
    case "${1:-}" in
    host_has_intel_gpus) check_host_has_intel_gpus ;;
    gpu_plugin_can_be_installed) check_intel_gpu_plugin_can_be_installed ;;
    gpu_plugin_daemonset_is_deployed) check_intel_gpu_plugin_daemonset_is_deployed ;;
    one_daemonset_is_available) check_one_intel_gpu_plugin_daemonset_is_available ;;
    one_daemonset_is_ready) check_one_intel_gpu_plugin_daemonset_is_ready ;;
    gpu_node_label_is_attached) check_intel_gpu_node_label_is_attached ;;
    at_least_one_gpu_is_available) check_at_least_one_intel_gpu_is_available ;;
    capacity_slots_for_gpus_match) check_capacity_slots_for_intel_gpus_match ;;
    allocatable_slots_for_gpus_match) check_allocatable_slots_for_intel_gpus_match ;;
    *) help_function ;;
    esac
}

main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env bash
# IPEX test cases for the checkbox provider; each test case is exposed as
# a subcommand (see help_function / main at the bottom).

set -euxo pipefail

# Directory holding this script; the sibling Python probe scripts
# (submitted to the notebook pod via kubectl exec) live next to it.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

check_ipex_can_be_imported() {
    # $1 - name of the running ipex notebook pod in the 'dss' namespace.
    echo "Starting ipex import test"
    local probe="import intel_extension_for_pytorch as ipex; import torch; import jupyter"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "$probe"; then
        >&2 echo "FAIL: Did not find IPEX python module"
        exit 1
    fi
    echo "PASS: Found module"
    exit 0
}

check_pytorch_can_use_xpu() {
    # $1 - name of the running ipex notebook pod in the 'dss' namespace.
    # Passes when the pytorch_can_use_xpu.py probe reports a gpu device.
    echo "Starting ipex GPU check test"
    local script
    script="$(cat "$SCRIPT_DIR/pytorch_can_use_xpu.py")"
    # Capture the probe's stderr too, and tolerate a non-matching grep:
    # under `set -e -o pipefail` a failed grep would otherwise abort the
    # script before the "No GPU found" branch below could ever run.
    local gpu_grep_out
    gpu_grep_out=$(microk8s.kubectl -n dss exec "$1" -- python3 -c "$script" 2>&1 | grep "dev_type=.gpu" || true)
    if [[ -z ${gpu_grep_out} ]]; then
        >&2 echo "FAIL: No GPU found"
        exit 1
    else
        echo "PASS: GPU found"
        exit 0
    fi
}

help_function() {
    # Print usage plus the mapping of test-case names to check functions.
    echo "This script is used for tests related to IPEX"
    # Report the actual invocation name; it previously claimed "check_dss.sh".
    echo "Usage: ${0##*/} <test_case>"
    echo
    echo "Test cases currently implemented:"
    # Fixed copy-paste from the ITEX script: this entry previously named
    # check_itex_can_be_imported, which does not exist here.
    echo -e "\t<can_be_imported>: check_ipex_can_be_imported"
    echo -e "\t<pytorch_can_use_xpu>: check_pytorch_can_use_xpu"
}

main() {
    # Use ${1:-} so a missing argument prints help under `set -u`, and
    # resolve it before the pod lookup so help does not require a pod.
    local test_case="${1:-}"
    case "${test_case}" in
    can_be_imported | pytorch_can_use_xpu) ;;
    *)
        help_function
        return 0
        ;;
    esac
    # Fail with a clear message when no matching pod is running; under
    # `set -o pipefail` the bare lookup would abort without explanation.
    local pod
    pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'ipex-2120-notebook\S*') \
        || { >&2 echo "Test failure: no running ipex-2120-notebook pod found."; exit 1; }
    echo "Found PyTorch pod: ${pod}"
    case "${test_case}" in
    can_be_imported) check_ipex_can_be_imported "$pod" ;;
    pytorch_can_use_xpu) check_pytorch_can_use_xpu "$pod" ;;
    esac
}

main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# ITEX test cases for the checkbox provider; each test case is exposed as
# a subcommand (see help_function / main at the bottom).

set -euxo pipefail

# Directory holding this script; the sibling Python probe scripts
# (submitted to the notebook pod via kubectl exec) live next to it.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

check_itex_can_be_imported() {
    # $1 - name of the running itex notebook pod in the 'dss' namespace.
    echo "Starting itex import test"
    local probe="import intel_extension_for_tensorflow as itex; import tensorflow; import jupyter"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "$probe"; then
        >&2 echo "FAIL: Did not find ITEX python module"
        exit 1
    fi
    echo "PASS: Found module"
    exit 0
}

check_tensorflow_can_use_xpu() {
    # $1 - name of the running itex notebook pod in the 'dss' namespace.
    # The Python probe itself exits non-zero when no XPU is visible.
    echo "Starting itex GPU check test"
    local probe
    probe="$(cat "$SCRIPT_DIR/tensorflow_can_use_xpu.py")"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "$probe"; then
        >&2 echo "FAIL: No XPU found"
        exit 1
    fi
    echo "PASS: XPU found"
    exit 0
}

help_function() {
    # Print usage plus the mapping of test-case names to check functions.
    echo "This script is used for tests related to ITEX"
    # Report the actual invocation name; it previously claimed "check_dss.sh".
    echo "Usage: ${0##*/} <test_case>"
    echo
    echo "Test cases currently implemented:"
    echo -e "\t<can_be_imported>: check_itex_can_be_imported"
    echo -e "\t<tensorflow_can_use_xpu>: check_tensorflow_can_use_xpu"
}

main() {
    # Use ${1:-} so a missing argument prints help under `set -u`, and
    # resolve it before the pod lookup so help does not require a pod.
    local test_case="${1:-}"
    case "${test_case}" in
    can_be_imported | tensorflow_can_use_xpu) ;;
    *)
        help_function
        return 0
        ;;
    esac
    # Fail with a clear message when no matching pod is running; under
    # `set -o pipefail` the bare lookup would abort without explanation.
    local pod
    pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'itex-215-notebook\S*') \
        || { >&2 echo "Test failure: no running itex-215-notebook pod found."; exit 1; }
    echo "Found Tensorflow pod: ${pod}"
    case "${test_case}" in
    can_be_imported) check_itex_can_be_imported "$pod" ;;
    tensorflow_can_use_xpu) check_tensorflow_can_use_xpu "$pod" ;;
    esac
}

main "$@"
Loading

0 comments on commit cdb82d0

Please sign in to comment.