-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor DSS provider tests (New) (#1601)
The tests for the DSS provider in contrib have been refactored to use individual scripts, now available in the provider's bin folder, instead of scripts written inline in the jobs' commands. The tests check the same things as before, but are now better implemented. The inline shell code was extracted into several scripts that roughly group related tests: those generic to DSS, those for setting up Intel GPUs, those for using ITEX, and those for using IPEX. Furthermore, the larger Python scripts that were previously written inline have been factored out into individual files. Please note that these Python scripts are not meant to be run by the jobs directly; instead, the jobs submit them to DSS to be run in the appropriate Kubernetes Pods, so the Python dependencies of these scripts are not relevant to the provider itself.
- Loading branch information
Showing
8 changed files
with
417 additions
and
252 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
110 changes: 110 additions & 0 deletions
110
contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
#!/usr/bin/env bash
#
# Generic checks related to DSS; run with a test case name as the only argument.

set -euxo pipefail

# IMPORTANT: for any test using the dss command:
#
# - Stop exporting the PYTHON shell vars (they stay set in this shell but are
#   no longer passed to child processes) to prevent conflicts between dss
#   and checkbox python environments
# - Run from ${HOME} as dss writes logs to its working directory,
#   and as a snap does not have permissions to write to the default
#   working directory for checkbox tests
export -n PYTHONHOME PYTHONPATH PYTHONUSERBASE
|
||
# Initialize DSS with the kubeconfig reported by `sudo microk8s config`.
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if the kubeconfig cannot be read or `dss initialize` fails.
check_dss_can_be_initialized() {
    # TODO: we actually seem to initialize dss here; maybe split it out
    local kubeconfig
    local trace_was_on=0

    cd "${HOME}" || exit 1

    # The kubeconfig is expected to hold cluster credentials, so keep it out
    # of the `set -x` trace while it is read and handed to dss.
    if [[ $- == *x* ]]; then
        trace_was_on=1
        set +x
    fi

    # Read the kubeconfig on its own: a failure inside an argument's command
    # substitution would otherwise go unnoticed and dss would be called with
    # an empty kubeconfig.
    if ! kubeconfig="$(sudo microk8s config)" || [[ -z "${kubeconfig}" ]]; then
        >&2 echo "Test failure: could not read the kubeconfig from microk8s."
        exit 1
    fi
    if ! dss initialize --kubeconfig="${kubeconfig}"; then
        >&2 echo "Test failure: 'dss initialize' failed."
        exit 1
    fi

    if ((trace_was_on)); then
        set -x
    fi
    echo "Test success: dss initialized."
}
|
||
# Check that a namespace named exactly 'dss' exists in the cluster.
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if the namespaces cannot be listed or 'dss' is not among them.
check_dss_namespace_is_deployed() {
    local namespaces

    # Save the listing to a variable first: piping kubectl straight into
    # `grep -q` can end in a broken pipe, which pipefail reports as a failure.
    if ! namespaces="$(microk8s.kubectl get ns -o name)"; then
        >&2 echo "Test failure: could not list the namespaces."
        exit 1
    fi

    # Whole-line fixed-string match, so namespaces that merely contain "dss"
    # in their name do not count.
    if grep -Fxq "namespace/dss" <<<"${namespaces}"; then
        echo "Test success: 'dss' namespace is deployed!"
    else
        >&2 echo "Test failure: no namespace named 'dss' deployed."
        exit 1
    fi
}
|
||
# Check that `dss status` reports "MLflow deployment: Ready".
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if `dss status` fails or does not report the ready status.
check_mlflow_status_is_ready() {
    local status

    cd "${HOME}" || exit 1
    # save result to a shell var to avoid a broken pipe error
    if ! status="$(dss status)"; then
        >&2 echo "Test failure: 'dss status' exited with an error."
        exit 1
    fi
    if [[ "${status}" == *"MLflow deployment: Ready"* ]]; then
        echo "Test success: 'dss status' shows ready status for mlflow."
    else
        >&2 echo "Test failure: 'dss status' does not show ready status for mlflow."
        exit 1
    fi
}
|
||
# Check that a service named 'mlflow' exists in the 'dss' namespace.
# The service may be listed in any position, not only first.
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if the services cannot be listed or 'mlflow' is not among them.
check_mlflow_is_deployed_as_first_service() {
    local names

    # `items[*]` yields all service names separated by spaces, and an empty
    # string (rather than a jsonpath error) when there are no services.
    if ! names="$(microk8s.kubectl get service -n dss -o jsonpath='{.items[*].metadata.name}')"; then
        >&2 echo "Test failure: could not list the services in the 'dss' namespace."
        exit 1
    fi

    # Pad with spaces so only a whole name can match.
    if [[ " ${names} " == *" mlflow "* ]]; then
        echo "Test success: 'mlflow' service is deployed!"
    else
        >&2 echo "Test failure: expected service name 'mlflow' but got ${names}"
        exit 1
    fi
}
|
||
# Check that `dss status` reports "Intel GPU acceleration: Enabled".
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if `dss status` fails or does not report the enabled status.
check_dss_has_intel_gpu_acceleration_enabled() {
    local status

    cd "${HOME}" || exit 1
    # save result to a shell var to avoid a broken pipe error
    if ! status="$(dss status)"; then
        >&2 echo "Test failure: 'dss status' exited with an error."
        exit 1
    fi
    if [[ "${status}" == *"Intel GPU acceleration: Enabled"* ]]; then
        echo "Test success: 'dss status' correctly reports Intel GPU status."
    else
        >&2 echo "Test failure: 'dss status' does not report that Intel GPU acceleration is enabled."
        exit 1
    fi
}
|
||
# Ask dss to create the 'itex-215-notebook' notebook from the ITEX 2.15 image.
# Outputs: success message on stdout, failure message on stderr.
# Exits:   1 if `dss create` fails.
check_dss_can_create_itex_215_notebook() {
    local -r notebook="itex-215-notebook"
    local -r image="intel/intel-extension-for-tensorflow:2.15.0-xpu-idp-jupyter"

    cd "${HOME}" || exit 1
    if dss create "${notebook}" --image="${image}"; then
        echo "Test success: successfully created an ITEX 2.15 notebook."
    else
        >&2 echo "Test failure: failed to create an ITEX 2.15 notebook."
        exit 1
    fi
}
|
||
# Ask dss to create the 'ipex-2120-notebook' notebook from the IPEX 2.1.20 image.
# Outputs: success message on stdout, failure message on stderr.
# Exits:   1 if `dss create` fails.
check_dss_can_create_ipex_2120_notebook() {
    local -r notebook="ipex-2120-notebook"
    local -r image="intel/intel-extension-for-pytorch:2.1.20-xpu-idp-jupyter"

    cd "${HOME}" || exit 1
    if dss create "${notebook}" --image="${image}"; then
        echo "Test success: successfully created an IPEX 2.1.20 notebook."
    else
        >&2 echo "Test failure: failed to create an IPEX 2.1.20 notebook."
        exit 1
    fi
}
|
||
# Print the usage text and the list of test cases to stdout.
# Each entry reads "<test_case>: function it runs".
help_function() {
    printf '%s\n' \
        "This script is used for generic tests related to DSS" \
        "Usage: check_dss.sh <test_case>" \
        "" \
        "Test cases currently implemented:"
    printf '\t%s\n' \
        "<dss_can_be_initialized>: check_dss_can_be_initialized" \
        "<dss_namespace_is_deployed>: check_dss_namespace_is_deployed" \
        "<mlflow_status_is_ready>: check_mlflow_status_is_ready" \
        "<mlflow_is_deployed_as_first_service>: check_mlflow_is_deployed_as_first_service" \
        "<intel_gpu_acceleration_is_enabled>: check_dss_has_intel_gpu_acceleration_enabled" \
        "<can_create_itex_215_notebook>: check_dss_can_create_itex_215_notebook" \
        "<can_create_ipex_2120_notebook>: check_dss_can_create_ipex_2120_notebook"
}
|
||
# Dispatch the test case named by the first argument.
# Arguments: $1 - test case name (see help_function), or -h/--help
# Exits:     1 for a missing or unknown test case, so that a mistyped job
#            command cannot pass by merely printing the help text.
main() {
    # Default to empty so a missing argument reaches the help branch instead
    # of tripping `set -u`.
    case "${1:-}" in
        dss_can_be_initialized) check_dss_can_be_initialized ;;
        dss_namespace_is_deployed) check_dss_namespace_is_deployed ;;
        mlflow_status_is_ready) check_mlflow_status_is_ready ;;
        mlflow_is_deployed_as_first_service) check_mlflow_is_deployed_as_first_service ;;
        intel_gpu_acceleration_is_enabled) check_dss_has_intel_gpu_acceleration_enabled ;;
        can_create_itex_215_notebook) check_dss_can_create_itex_215_notebook ;;
        can_create_ipex_2120_notebook) check_dss_can_create_ipex_2120_notebook ;;
        -h | --help) help_function ;;
        *)
            >&2 echo "Unknown or missing test case: '${1:-}'"
            >&2 help_function
            exit 1
            ;;
    esac
}
|
||
main "$@"
148 changes: 148 additions & 0 deletions
148
contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
#!/usr/bin/env bash
#
# Checks related to Intel GPUs; run with a test case name as the only argument.

set -euxo pipefail
|
||
# Check that `intel_gpu_top -L` lists a device with PCI vendor 8086.
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if the tool fails or lists no such device.
check_host_has_intel_gpus() {
    local result

    if ! result="$(intel_gpu_top -L)"; then
        >&2 echo "Test failure: 'intel_gpu_top -L' exited with an error: ${result}"
        exit 1
    fi
    if [[ "${result}" == *"pci:vendor=8086"* ]]; then
        echo "Test success: Intel GPU available on host: ${result}"
    else
        # Single quotes around the command: nested double quotes would end
        # the string early and be dropped from the message.
        >&2 echo "Test failure: 'intel_gpu_top -L' reports no Intel GPUs: ${result}"
        exit 1
    fi
}
|
||
# Render one remote kustomization into a local manifest file.
# Arguments: $1 - kustomization URL, $2 - destination file
# Returns:   non-zero if rendering or writing fails.
_render_kustomization() {
    local url="$1" dest="$2" manifest

    # Capture kubectl's output through a pipe and let the shell write the
    # file, as redirecting kubectl's stdout anywhere but /dev/null throws a
    # permission denied error
    # see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4
    manifest="$(kubectl kustomize "${url}")" || return 1
    printf '%s\n' "${manifest}" >"${dest}"
}

# Deploy Node Feature Discovery, its rules and the Intel GPU device plugin,
# then wait for the daemonsets to roll out.
# Arguments: $1 - optional version (git ref) of the Intel device plugins,
#                 defaults to v0.30.0
# Outputs:   progress and success messages on stdout, failures on stderr.
# Exits:     1 on any failed step.
check_intel_gpu_plugin_can_be_installed() {
    # Using kubectl directly due to this bug: https://github.com/canonical/microk8s/issues/4453
    local -r version="${1:-v0.30.0}"
    # NOTE: the slot checks in this script expect this many slots per GPU
    local -r shared_dev_num=10
    local -r sleep_secs=15
    local -r deployments="https://github.com/intel/intel-device-plugins-for-kubernetes/deployments"
    # "<manifest file>=<path below deployments/>", in the order they are applied
    local -r -a manifests=(
        "node_feature_discovery.yaml=nfd"
        "node_feature_rules.yaml=nfd/overlays/node-feature-rules"
        "gpu_plugin.yaml=gpu_plugin/overlays/nfd_labeled_nodes"
    )
    local workdir entry

    # Private temporary directory instead of predictable names in /tmp
    if ! workdir="$(mktemp -d)"; then
        >&2 echo "Test failure: could not create a temporary directory."
        exit 1
    fi
    # Expand ${workdir} now: the local no longer exists when the trap runs.
    # shellcheck disable=SC2064
    trap "rm -rf -- '${workdir}'" EXIT

    for entry in "${manifests[@]}"; do
        # The URL is quoted as it contains '?', which is a glob character
        if ! _render_kustomization "${deployments}/${entry#*=}?ref=${version}" "${workdir}/${entry%%=*}"; then
            >&2 echo "Test failure: could not render the kustomization for ${entry#*=}"
            exit 1
        fi
    done

    # Add the shared-dev-num argument right after the enable-monitoring one,
    # reusing that line's own indentation and list marker so the YAML stays
    # valid however the manifest is indented.
    sed -i -E \
        "s/^([[:space:]]*-[[:space:]]+).*enable-monitoring.*\$/&\n\1-shared-dev-num=${shared_dev_num}/" \
        "${workdir}/gpu_plugin.yaml"
    if ! grep -q -- "- -shared-dev-num=${shared_dev_num}" "${workdir}/gpu_plugin.yaml"; then
        >&2 echo "Test failure: could not add -shared-dev-num=${shared_dev_num} to the GPU plugin manifest."
        exit 1
    fi

    for entry in "${manifests[@]}"; do
        if ! kubectl apply -f "${workdir}/${entry%%=*}"; then
            >&2 echo "Test failure: could not apply ${entry%%=*}"
            exit 1
        fi
    done

    echo "[INFO]: sleeping for ${sleep_secs} seconds before checking rollout status."
    sleep "${sleep_secs}"
    if ! kubectl -n node-feature-discovery rollout status ds/nfd-worker ||
        ! kubectl -n default rollout status ds/intel-gpu-plugin; then
        >&2 echo "Test failure: daemonset rollout did not complete."
        exit 1
    fi
    echo "[INFO]: sleeping for ${sleep_secs} seconds to allow pod status to update for subsequent tests."
    sleep "${sleep_secs}"
    echo "Test success: Intel K8s GPU Device Plugin deployed."
}
|
||
# Check that a daemonset named 'intel-gpu-plugin' is listed by kubectl.
# The daemonset may be listed in any position, not only first.
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if the daemonsets cannot be listed or the plugin is not among them.
check_intel_gpu_plugin_daemonset_is_deployed() {
    local names

    # `items[*]` yields all names separated by spaces, and an empty string
    # (rather than a jsonpath error) when there are no daemonsets.
    if ! names="$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[*].metadata.name}')"; then
        >&2 echo "Test failure: could not list the daemonsets."
        exit 1
    fi

    # Pad with spaces so only a whole name can match.
    if [[ " ${names} " == *" intel-gpu-plugin "* ]]; then
        echo "Test success: 'intel-gpu-plugin' daemonset is deployed!"
    else
        >&2 echo "Test failure: expected daemonset name 'intel-gpu-plugin' but got ${names}"
        exit 1
    fi
}
|
||
# Check that the 'intel-gpu-plugin' daemonset reports numberAvailable == 1.
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if the query fails or the value is not 1.
check_one_intel_gpu_plugin_daemonset_is_available() {
    # Select the daemonset by name instead of relying on it being listed first
    local -r query='{.items[?(@.metadata.name=="intel-gpu-plugin")].status.numberAvailable}'
    local result

    if ! result="$(microk8s.kubectl get daemonset.apps -o jsonpath="${query}")"; then
        >&2 echo "Test failure: could not query the daemonsets."
        exit 1
    fi
    if [[ "${result}" == "1" ]]; then
        echo "Test success: 1 daemonset in numberAvailable status."
    else
        >&2 echo "Test failure: expected numberAvailable to be 1 but got ${result}"
        exit 1
    fi
}
|
||
# Check that the 'intel-gpu-plugin' daemonset reports numberReady == 1.
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if the query fails or the value is not 1.
check_one_intel_gpu_plugin_daemonset_is_ready() {
    # Select the daemonset by name instead of relying on it being listed first
    local -r query='{.items[?(@.metadata.name=="intel-gpu-plugin")].status.numberReady}'
    local result

    if ! result="$(microk8s.kubectl get daemonset.apps -o jsonpath="${query}")"; then
        >&2 echo "Test failure: could not query the daemonsets."
        exit 1
    fi
    if [[ "${result}" == "1" ]]; then
        echo "Test success: 1 daemonset in numberReady status."
    else
        >&2 echo "Test failure: expected numberReady to be 1 but got ${result}"
        exit 1
    fi
}
|
||
# Check that the first listed node carries the label
# 'intel.feature.node.kubernetes.io/gpu' with the value 'true'.
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if the query fails or the label value is not 'true'.
check_intel_gpu_node_label_is_attached() {
    local result

    # Dots inside the label key are escaped so jsonpath reads it as one key
    if ! result="$(microk8s.kubectl get node -o jsonpath='{.items[0].metadata.labels.intel\.feature\.node\.kubernetes\.io/gpu}')"; then
        >&2 echo "Test failure: could not query the node labels."
        exit 1
    fi
    if [[ "${result}" == "true" ]]; then
        echo "Test success: found expected label: 'intel.feature.node.kubernetes.io/gpu': 'true'"
    else
        >&2 echo "Test failure: expected 'true' but got ${result}"
        exit 1
    fi
}
|
||
# Sum the 'gpu.intel.com/device-id.*.count' labels of the first listed node
# and check that the total is at least 1.
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if the labels cannot be read or the total is below 1.
check_at_least_one_intel_gpu_is_available() {
    local result

    # `cnt + 0` makes awk print 0, not an empty line, when no label matches
    if ! result="$(microk8s.kubectl get node -o json |
        jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' |
        awk '{cnt+=$1} END{print cnt + 0}')"; then
        >&2 echo "Test failure: could not read the GPU count labels of the k8s node."
        exit 1
    fi

    if [[ "${result}" =~ ^[0-9]+$ ]] && ((result >= 1)); then
        echo "Test success: Found ${result} GPUs on system."
    else
        >&2 echo "Test failure: expected at least 1 GPU but got ${result}"
        exit 1
    fi
}
|
||
# Check that the first listed node's 'gpu.intel.com/i915' capacity equals the
# number of labelled Intel GPUs multiplied by the slots per GPU.
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if the values cannot be read, no GPU is labelled, or the
#          capacity does not match.
check_capacity_slots_for_intel_gpus_match() {
    # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
    local -r slots_per_gpu=10
    local num_gpus result total_slots

    # `cnt + 0` makes awk print 0, not an empty line, when no label matches
    if ! num_gpus="$(microk8s.kubectl get node -o json |
        jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' |
        awk '{cnt+=$1} END{print cnt + 0}')"; then
        >&2 echo "Test failure: could not read the GPU count labels of the k8s node."
        exit 1
    fi
    if ! [[ "${num_gpus}" =~ ^[0-9]+$ ]] || ((num_gpus < 1)); then
        >&2 echo "Test failure: no Intel GPUs are labelled on the k8s node."
        exit 1
    fi
    if ! result="$(microk8s.kubectl get node -o jsonpath='{.items[0].status.capacity.gpu\.intel\.com/i915}')"; then
        >&2 echo "Test failure: could not read the GPU capacity of the k8s node."
        exit 1
    fi

    total_slots=$((num_gpus * slots_per_gpu))
    # Validate ${result} first: an empty or non-numeric value is a mismatch
    if [[ "${result}" =~ ^[0-9]+$ ]] && ((total_slots == result)); then
        echo "Test success: Found ${result} GPU capacity slots on k8s node."
    else
        >&2 echo "Test failure: expected ${total_slots} GPU capacity slots but got ${result}"
        exit 1
    fi
}
|
||
# Check that the first listed node's allocatable 'gpu.intel.com/i915' equals
# the number of labelled Intel GPUs multiplied by the slots per GPU.
# Outputs: success message on stdout, failure messages on stderr.
# Exits:   1 if the values cannot be read, no GPU is labelled, or the
#          allocatable count does not match.
check_allocatable_slots_for_intel_gpus_match() {
    # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
    local -r slots_per_gpu=10
    local num_gpus result total_slots

    # `cnt + 0` makes awk print 0, not an empty line, when no label matches
    if ! num_gpus="$(microk8s.kubectl get node -o json |
        jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' |
        awk '{cnt+=$1} END{print cnt + 0}')"; then
        >&2 echo "Test failure: could not read the GPU count labels of the k8s node."
        exit 1
    fi
    if ! [[ "${num_gpus}" =~ ^[0-9]+$ ]] || ((num_gpus < 1)); then
        >&2 echo "Test failure: no Intel GPUs are labelled on the k8s node."
        exit 1
    fi
    if ! result="$(microk8s.kubectl get node -o jsonpath='{.items[0].status.allocatable.gpu\.intel\.com/i915}')"; then
        >&2 echo "Test failure: could not read the allocatable GPU slots of the k8s node."
        exit 1
    fi

    total_slots=$((num_gpus * slots_per_gpu))
    # Validate ${result} first: an empty or non-numeric value is a mismatch
    if [[ "${result}" =~ ^[0-9]+$ ]] && ((total_slots == result)); then
        echo "Test success: Found ${result} GPU allocatable slots on k8s node."
    else
        >&2 echo "Test failure: expected ${total_slots} GPU allocatable slots but got ${result}"
        exit 1
    fi
}
|
||
# Print the usage text and the list of test cases to stdout.
# Each entry reads "<test_case>: function it runs".
help_function() {
    printf '%s\n' \
        "This script is used for tests related to Intel GPUs" \
        "Usage: check_intel.sh <test_case>" \
        "" \
        "Test cases currently implemented:"
    printf '\t%s\n' \
        "<host_has_intel_gpus>: check_host_has_intel_gpus" \
        "<gpu_plugin_can_be_installed>: check_intel_gpu_plugin_can_be_installed" \
        "<gpu_plugin_daemonset_is_deployed>: check_intel_gpu_plugin_daemonset_is_deployed" \
        "<one_daemonset_is_available>: check_one_intel_gpu_plugin_daemonset_is_available" \
        "<one_daemonset_is_ready>: check_one_intel_gpu_plugin_daemonset_is_ready" \
        "<gpu_node_label_is_attached>: check_intel_gpu_node_label_is_attached" \
        "<at_least_one_gpu_is_available>: check_at_least_one_intel_gpu_is_available" \
        "<capacity_slots_for_gpus_match>: check_capacity_slots_for_intel_gpus_match" \
        "<allocatable_slots_for_gpus_match>: check_allocatable_slots_for_intel_gpus_match"
}
|
||
# Dispatch the test case named by the first argument.
# Arguments: $1 - test case name (see help_function), or -h/--help
# Exits:     1 for a missing or unknown test case, so that a mistyped job
#            command cannot pass by merely printing the help text.
main() {
    # Default to empty so a missing argument reaches the help branch instead
    # of tripping `set -u`.
    case "${1:-}" in
        host_has_intel_gpus) check_host_has_intel_gpus ;;
        gpu_plugin_can_be_installed) check_intel_gpu_plugin_can_be_installed ;;
        gpu_plugin_daemonset_is_deployed) check_intel_gpu_plugin_daemonset_is_deployed ;;
        one_daemonset_is_available) check_one_intel_gpu_plugin_daemonset_is_available ;;
        one_daemonset_is_ready) check_one_intel_gpu_plugin_daemonset_is_ready ;;
        gpu_node_label_is_attached) check_intel_gpu_node_label_is_attached ;;
        at_least_one_gpu_is_available) check_at_least_one_intel_gpu_is_available ;;
        capacity_slots_for_gpus_match) check_capacity_slots_for_intel_gpus_match ;;
        allocatable_slots_for_gpus_match) check_allocatable_slots_for_intel_gpus_match ;;
        -h | --help) help_function ;;
        *)
            >&2 echo "Unknown or missing test case: '${1:-}'"
            >&2 help_function
            exit 1
            ;;
    esac
}
|
||
main "$@"
51 changes: 51 additions & 0 deletions
51
contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_ipex.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/usr/bin/env bash
#
# Checks related to IPEX; run with a test case name as the only argument.

set -euxo pipefail

# Absolute path of the directory holding this script; the Python scripts
# sent to the notebook pod are read from here.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
|
||
# Check that intel_extension_for_pytorch, torch and jupyter can be imported
# by python3 inside the given pod of the 'dss' namespace.
# Arguments: $1 - name of the pod to run the import in
# Outputs:   PASS message on stdout, FAIL message on stderr.
# Exits:     0 on success, 1 on failure (never returns to the caller).
check_ipex_can_be_imported() {
    local -r pod="${1:-}"
    local -r script="import intel_extension_for_pytorch as ipex; import torch; import jupyter"

    if [[ -z "${pod}" ]]; then
        >&2 echo "FAIL: No pod name given"
        exit 1
    fi
    echo "Starting ipex import test"
    if microk8s.kubectl -n dss exec "${pod}" -- python3 -c "${script}"; then
        echo "PASS: Found module"
        exit 0
    else
        >&2 echo "FAIL: Did not find IPEX python module"
        exit 1
    fi
}
|
||
# Run pytorch_can_use_xpu.py with python3 inside the given pod and check that
# its output mentions a device of type gpu.
# Globals:   SCRIPT_DIR (read) - directory holding pytorch_can_use_xpu.py
# Arguments: $1 - name of the pod to run the script in
# Outputs:   PASS message on stdout, FAIL messages on stderr.
# Exits:     0 on success, 1 on failure (never returns to the caller).
check_pytorch_can_use_xpu() {
    local -r pod="${1:-}"
    local script output

    echo "Starting ipex GPU check test"
    if ! script="$(cat -- "${SCRIPT_DIR}/pytorch_can_use_xpu.py")"; then
        >&2 echo "FAIL: Could not read ${SCRIPT_DIR}/pytorch_can_use_xpu.py"
        exit 1
    fi
    if ! output="$(microk8s.kubectl -n dss exec "${pod}" -- python3 -c "${script}")"; then
        >&2 echo "FAIL: Could not run the XPU check in pod '${pod}'"
        exit 1
    fi
    # grep runs inside the condition: as part of the assignment above, a
    # missing match would abort the script under `set -e -o pipefail` before
    # the FAIL message could be printed.
    if grep -q "dev_type=.gpu" <<<"${output}"; then
        echo "PASS: GPU found"
        exit 0
    else
        >&2 echo "FAIL: No GPU found"
        exit 1
    fi
}
|
||
# Print the usage text and the list of test cases to stdout.
# Each entry reads "<test_case>: function it runs".
help_function() {
    printf '%s\n' \
        "This script is used for tests related to IPEX" \
        "Usage: check_ipex.sh <test_case>" \
        "" \
        "Test cases currently implemented:"
    printf '\t%s\n' \
        "<can_be_imported>: check_ipex_can_be_imported" \
        "<pytorch_can_use_xpu>: check_pytorch_can_use_xpu"
}
|
||
# Find the running IPEX notebook pod and run the requested test case in it.
# Arguments: $1 - test case name (see help_function), or -h/--help
# Outputs:   the name of the pod found, followed by the test case's output.
# Exits:     1 for a missing or unknown test case, or when no running
#            'ipex-2120-notebook' pod is found.
main() {
    local -r test_case="${1:-}"
    local names name pod=""

    # Validate the test case before querying the cluster, so that asking for
    # help does not need a running pod.
    case "${test_case}" in
        can_be_imported | pytorch_can_use_xpu) ;;
        -h | --help)
            help_function
            return 0
            ;;
        *)
            >&2 echo "Unknown or missing test case: '${test_case}'"
            >&2 help_function
            exit 1
            ;;
    esac

    if ! names="$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}')"; then
        >&2 echo "FAIL: Could not list the running pods in the 'dss' namespace"
        exit 1
    fi
    # Use the first pod whose name starts with the notebook's name. The names
    # are split on whitespace, which pod names are not expected to contain.
    # shellcheck disable=SC2086
    for name in ${names}; do
        if [[ "${name}" == ipex-2120-notebook* ]]; then
            pod="${name}"
            break
        fi
    done
    if [[ -z "${pod}" ]]; then
        >&2 echo "FAIL: No running pod found for 'ipex-2120-notebook'"
        exit 1
    fi
    echo "Found PyTorch pod: ${pod}"

    case "${test_case}" in
        can_be_imported) check_ipex_can_be_imported "${pod}" ;;
        pytorch_can_use_xpu) check_pytorch_can_use_xpu "${pod}" ;;
    esac
}
|
||
main "$@"
50 changes: 50 additions & 0 deletions
50
contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_itex.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#!/usr/bin/env bash
#
# Checks related to ITEX; run with a test case name as the only argument.

set -euxo pipefail

# Absolute path of the directory holding this script; the Python scripts
# sent to the notebook pod are read from here.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
|
||
# Check that intel_extension_for_tensorflow, tensorflow and jupyter can be
# imported by python3 inside the given pod of the 'dss' namespace.
# Arguments: $1 - name of the pod to run the import in
# Outputs:   PASS message on stdout, FAIL message on stderr.
# Exits:     0 on success, 1 on failure (never returns to the caller).
check_itex_can_be_imported() {
    local -r pod="${1:-}"
    local -r script="import intel_extension_for_tensorflow as itex; import tensorflow; import jupyter"

    if [[ -z "${pod}" ]]; then
        >&2 echo "FAIL: No pod name given"
        exit 1
    fi
    echo "Starting itex import test"
    if microk8s.kubectl -n dss exec "${pod}" -- python3 -c "${script}"; then
        echo "PASS: Found module"
        exit 0
    else
        >&2 echo "FAIL: Did not find ITEX python module"
        exit 1
    fi
}
|
||
# Run tensorflow_can_use_xpu.py with python3 inside the given pod; the check
# passes when that script exits successfully.
# Globals:   SCRIPT_DIR (read) - directory holding tensorflow_can_use_xpu.py
# Arguments: $1 - name of the pod to run the script in
# Outputs:   PASS message on stdout, FAIL messages on stderr.
# Exits:     0 on success, 1 on failure (never returns to the caller).
check_tensorflow_can_use_xpu() {
    local -r pod="${1:-}"
    local script

    if [[ -z "${pod}" ]]; then
        >&2 echo "FAIL: No pod name given"
        exit 1
    fi
    echo "Starting itex GPU check test"
    if ! script="$(cat -- "${SCRIPT_DIR}/tensorflow_can_use_xpu.py")"; then
        >&2 echo "FAIL: Could not read ${SCRIPT_DIR}/tensorflow_can_use_xpu.py"
        exit 1
    fi
    if microk8s.kubectl -n dss exec "${pod}" -- python3 -c "${script}"; then
        echo "PASS: XPU found"
        exit 0
    else
        >&2 echo "FAIL: No XPU found"
        exit 1
    fi
}
|
||
# Print the usage text and the list of test cases to stdout.
# Each entry reads "<test_case>: function it runs".
help_function() {
    printf '%s\n' \
        "This script is used for tests related to ITEX" \
        "Usage: check_itex.sh <test_case>" \
        "" \
        "Test cases currently implemented:"
    printf '\t%s\n' \
        "<can_be_imported>: check_itex_can_be_imported" \
        "<tensorflow_can_use_xpu>: check_tensorflow_can_use_xpu"
}
|
||
# Find the running ITEX notebook pod and run the requested test case in it.
# Arguments: $1 - test case name (see help_function), or -h/--help
# Outputs:   the name of the pod found, followed by the test case's output.
# Exits:     1 for a missing or unknown test case, or when no running
#            'itex-215-notebook' pod is found.
main() {
    local -r test_case="${1:-}"
    local names name pod=""

    # Validate the test case before querying the cluster, so that asking for
    # help does not need a running pod.
    case "${test_case}" in
        can_be_imported | tensorflow_can_use_xpu) ;;
        -h | --help)
            help_function
            return 0
            ;;
        *)
            >&2 echo "Unknown or missing test case: '${test_case}'"
            >&2 help_function
            exit 1
            ;;
    esac

    if ! names="$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}')"; then
        >&2 echo "FAIL: Could not list the running pods in the 'dss' namespace"
        exit 1
    fi
    # Use the first pod whose name starts with the notebook's name. The names
    # are split on whitespace, which pod names are not expected to contain.
    # shellcheck disable=SC2086
    for name in ${names}; do
        if [[ "${name}" == itex-215-notebook* ]]; then
            pod="${name}"
            break
        fi
    done
    if [[ -z "${pod}" ]]; then
        >&2 echo "FAIL: No running pod found for 'itex-215-notebook'"
        exit 1
    fi
    echo "Found Tensorflow pod: ${pod}"

    case "${test_case}" in
        can_be_imported) check_itex_can_be_imported "${pod}" ;;
        tensorflow_can_use_xpu) check_tensorflow_can_use_xpu "${pod}" ;;
    esac
}
|
||
main "$@"
Oops, something went wrong.