# hpu-gaudi2.yml
name: hpu-gaudi2

# Triggers: manual dispatch, a daily cron at 00:00, and pull requests that
# touch this workflow file or one of the listed accelerator/runtime paths.
on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  pull_request:
    paths:
      - ".github/workflows/hpu-gaudi2.yml"
      - "accelerator/hpu_accelerator.py"
      - "op_builder/hpu/**"
      - "deepspeed/runtime/engine.py"
      - "deepspeed/runtime/bf16_optimizer.py"
      - "deepspeed/runtime/zero/stage_1_and_2.py"
      - "deepspeed/runtime/zero/stage3.py"
      - "deepspeed/runtime/zero/partition_parameters.py"
      - "deepspeed/runtime/zero/partitioned_param_coordinator.py"
      - "deepspeed/runtime/zero/parameter_offload.py"
      - "deepspeed/runtime/pipe/engine.py"
      - "deepspeed/runtime/utils.py"
      - "deepspeed/inference/engine.py"
      - "deepspeed/module_inject/auto_tp.py"
      - "deepspeed/module_inject/replace_module.py"
      - "deepspeed/module_inject/load_checkpoint.py"
      - "deepspeed/module_inject/inject.py"
      - "deepspeed/ops/transformer/**"
      - "deepspeed/ops/adam/**"
# One concurrency group per workflow + ref; a newly queued run in the same
# group cancels the run that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
# Token scopes granted to this workflow: read-only repository contents and
# write access to issues.
# NOTE(review): no step in this workflow visibly uses the issues scope;
# confirm it is still required before keeping write access.
permissions:
  contents: read
  issues: write
jobs:
  # Runs the selected DeepSpeed unit tests inside the Habana Gaudi PyTorch
  # container on a self-hosted Gaudi2 runner.
  unit-tests:
    # The type of runner that the job will run on
    runs-on: [self-hosted, intel, gaudi2]
    container:
      image: vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest
      ports:
        - 80
      options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

    # Job-wide environment. TEST_LIST holds one pytest `-k` term per line; the
    # "Unit tests" step joins the non-empty lines with " or ". Parenthesised
    # entries narrow a test file to (or away from) specific keywords.
    # Do not add comments inside the block scalar: they would become part of
    # the value.
    env:
      PT_HPU_LAZY_MODE: 0
      TORCHINDUCTOR_COMPILE_THREADS: 1
      TEST_LIST: |
        test_accelerator.py
        test_autotuning.py
        test_latest_checkpoint.py
        test_moe_checkpoint.py
        test_other_optimizer.py
        test_pipeline.py
        test_reshape_checkpoint.py
        test_shared_weights.py
        test_sparse.py
        test_tag_validation.py
        test_universal_checkpoint.py
        (test_zero_optimizer.py and (TestZeRONonDistributed or TestZeROCheckpointFrozenWeights or TestZeROElasticCheckpoint or TestSaveTensorClone or TestZeROSaveLoadEdgeCase))
        test_dist.py
        test_compression.py
        test_elastic.py
        test_run.py
        test_multinode_runner.py
        test_ds_arguments.py
        test_moe_tp.py
        test_moe.py
        test_monitor.py
        test_adamw.py
        test_pipe_module.py
        test_flops_profiler.py
        test_partition_balanced.py
        test_groups.py
        test_get_optim_files.py
        test_init_on_device.py
        test_activation_checkpointing_non_reentrant.py
        test_activation_checkpointing.py
        test_coalesced_collectives.py
        test_bf16.py
        test_dynamic_loss_scale.py
        (test_fp16.py and (TestZeroEmptyPartition or TestFP16AdamTypes or TestZeroEmptyGrad or TestFP16OptimizerForMoE or TestZeroSupportedClientOptimizer or TestZeroAllowUntestedOptimizer))
        test_csr.py
        test_data.py
        test_data_efficiency.py
        test_ds_config_dict.py
        test_ds_config_model.py
        test_ds_initialize.py
        test_lr_schedulers.py
        test_multi_output_model.py
        test_mup_optimizers.py
        (test_pld.py and test_pld_schedule)
        test_runtime_utils.py
        test_partition.py
        test_ignore_unused_parameters.py
        (test_zero.py and (not TestZero3RepeatForwardLoop))
        test_zero_config.py
        test_zero_context.py
        test_zero_context_ancestry.py
        test_zero_context_return.py
        test_zero_dynamic_class.py
        test_zero_leaf_module.py
        test_zero_tiled.py
        (test_compile_zero.py and (not nvme and not cpu-3-dtype1))
        (test_zero_tensor_fragment.py and (not cpu-3))
        test_zeropp.py

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v4

      # Print toolchain/device diagnostics: libc version, hl-smi output, the
      # torch version, and what torch.cuda.is_available() reports.
      - name: Check container state
        run: |
          ldd --version
          hl-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # Install transformers from the default branch of its repository and
      # log the commit that was installed.
      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          pip install .

      # Install DeepSpeed from the current directory with the dev and
      # autotuning extras, then print the ds_report summary.
      - name: Install deepspeed
        run: |
          pip install .[dev,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      # Collapse TEST_LIST (one term per line) into a single
      # "a or b or c" expression and pass it to `pytest -k` so only the
      # selected unit tests run.
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
          export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS}
          TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
          echo "TEST_LIST ${TEST_LIST}"
          pytest --verbose unit/ -k "${TEST_LIST}"