Retry tests (#3229)
* retry tests

* retry with pytest last failed logic

Greatly speeds up reruns of tests, as only previously failed tests are rerun.

Define a pytest cache_dir for each pytest invocation to prevent interaction
between different selections of tests.

Protect against an exit code of 5, which occurs when a previous pytest
invocation had no failed tests and therefore all tests are deselected.

Use eval to avoid quoting issues with the -k and -m expressions.

* tidy test scripts

* set correct root dir

* add option to treat unrun tests as failures

* interpret sigterm as sigint

* adjust timeouts

* respond to comments and add comments

---------

Co-authored-by: leej3 <[email protected]>
Co-authored-by: vfdev <[email protected]>
3 people committed May 23, 2024
1 parent 9d31a9c commit 8db318b
Showing 12 changed files with 316 additions and 73 deletions.
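
The retry mechanism the commit message describes comes down to a small amount of shell logic. The sketch below is illustrative only (cache_dir and match_expression are placeholder names); the complete implementation is the new tests/common-test-functionality.sh further down.

```bash
# Condensed sketch of the retry flow; names are illustrative.
cache_dir=".cpu-not-distrib"          # one pytest cache per test selection
echo "[pytest]" > pytest.ini
echo "cache_dir=${cache_dir}" >> pytest.ini

pytest_args="-vvv tests/ignite -m 'not distributed' -k '${match_expression:-}'"
if [ "${USE_LAST_FAILED:-0}" -eq 1 ] && [ -d "${cache_dir}" ]; then
    # On a retry, run only the tests that failed in the previous invocation.
    pytest_args="--last-failed --last-failed-no-failures none ${pytest_args}"
fi

# eval preserves the quoting of the -m/-k expressions. pytest exits with code 5
# when everything is deselected (the previous run had no failures), which is
# treated as success here instead of an error.
eval "pytest ${pytest_args}" || {
    exit_code=$?
    if [ "${exit_code}" -eq 5 ]; then echo "All tests deselected"; else exit "${exit_code}"; fi
}
```

A dedicated cache_dir per test selection keeps the --last-failed history of, say, the distributed and non-distributed invocations from interfering with each other.
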
8 changes: 4 additions & 4 deletions .github/workflows/gpu-hvd-tests.yml
@@ -22,7 +22,7 @@ jobs:
gpu-hvd-tests:
strategy:
matrix:
- pytorch-channel: [pytorch, ]
+ pytorch-channel: [pytorch]
fail-fast: false
env:
DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1"
@@ -128,8 +128,8 @@ jobs:
# Can't build Horovod with recent pytorch due to pytorch required C++17 standard
# and horovod is still using C++14
# HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
# Using a similar hack as described here:
# https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345
git clone --recursive https://github.com/horovod/horovod.git /horovod
cd /horovod
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt
@@ -152,7 +152,7 @@ jobs:
set -xe
bash tests/run_gpu_tests.sh 2 hvd
- CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
+ CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ignite -m distributed -k hvd
EOF
)
21 changes: 8 additions & 13 deletions .github/workflows/gpu-tests.yml
@@ -29,7 +29,7 @@ jobs:
REPOSITORY: ${{ github.repository }}
PR_NUMBER: ${{ github.event.pull_request.number }}
runs-on: linux.8xlarge.nvidia.gpu
- timeout-minutes: 45
+ timeout-minutes: 85

steps:
- name: Clean workspace
@@ -121,18 +121,13 @@ jobs:
- name: Run GPU Unit Tests
continue-on-error: false
- run: |
-   script=$(cat << EOF
-   set -xe
-   bash tests/run_gpu_tests.sh 2
-   EOF
-   )
-   docker exec -t pthd /bin/bash -c "${script}"
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 25
+   shell: bash
+   command: docker exec -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'
+   new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
10 changes: 7 additions & 3 deletions .github/workflows/hvd-tests.yml
@@ -75,9 +75,13 @@ jobs:
target_dir: /tmp

- name: Run Tests
- shell: bash -l {0}
- run: |
-   bash tests/run_cpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: bash tests/run_cpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
16 changes: 10 additions & 6 deletions .github/workflows/pytorch-version-tests.yml
@@ -10,15 +10,15 @@ on:
jobs:
build:
runs-on: ubuntu-latest
- timeout-minutes: 45
+ timeout-minutes: 85
strategy:
max-parallel: 5
fail-fast: false
matrix:
python-version: [3.8, 3.9, "3.10"]
pytorch-version:
[2.1.2, 2.0.1, 1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.5.1]
exclude:
- pytorch-version: 1.5.1
python-version: 3.9
- pytorch-version: 1.5.1
@@ -78,7 +78,7 @@ jobs:
pip install -r requirements-dev.txt
python setup.py install
# pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern
# which raises the error: AttributeError: module 'distutils' has no attribute 'version' for setuptools>59
bad_pth_version=$(python -c "import torch; print('.'.join(torch.__version__.split('.')[:2]) in ['1.9', '1.10'])")
if [ "${bad_pth_version}" == "True" ]; then
@@ -92,9 +92,13 @@ jobs:
target_dir: /tmp

- name: Run Tests
- shell: bash -l {0}
- run: |
-   bash tests/run_cpu_tests.sh "not test_time_profilers"
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: bash tests/run_cpu_tests.sh "not test_time_profilers"
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers"

# create-issue:
# runs-on: ubuntu-latest
20 changes: 13 additions & 7 deletions .github/workflows/tpu-tests.yml
@@ -89,13 +89,19 @@ jobs:
target_dir: /tmp

- name: Run Tests
- run: |
-   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${Python_ROOT_DIR}/lib
-   export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
-   export XRT_WORKERS="localservice:0;grpc://localhost:40934"
-   python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
-   bash tests/run_tpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 25
+   shell: bash
+   command: |
+     python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
+     bash tests/run_tpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
+ env:
+   LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
+   XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
+   XRT_WORKERS: "localservice:0;grpc://localhost:40934"

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
15 changes: 10 additions & 5 deletions .github/workflows/unit-tests.yml
@@ -31,7 +31,7 @@ concurrency:
jobs:
cpu-tests:
runs-on: ${{ matrix.os }}
- timeout-minutes: 45
+ timeout-minutes: 85
defaults:
run:
shell: bash
@@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: ["3.8", "3.9", "3.10", "3.11","3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
pytorch-channel: [pytorch, pytorch-nightly]
include:
# includes a single build on windows
@@ -102,7 +102,7 @@ jobs:
- name: Run Mypy
# https://github.com/pytorch/ignite/pull/2780
#
if: ${{ matrix.os == 'ubuntu-latest' && matrix.pytorch-channel == 'pytorch-nightly'}}
run: |
bash ./tests/run_code_style.sh mypy
@@ -120,8 +120,13 @@ jobs:
cp -R /tmp/MNIST .
- name: Run Tests
- run: |
-   SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
102 changes: 102 additions & 0 deletions tests/common-test-functionality.sh
@@ -0,0 +1,102 @@
#!/bin/bash

# Will catch exit code 5 when tests are deselected from a previous passing run
# (relevant for --last-failed-no-failures none)
last_failed_no_failures_code=5

# functions shared across test files
run_tests() {
# Set defaults
local core_args="-vvv tests/ignite"
local cache_dir=".unknown-cache"
local skip_distrib_tests=1
local match_tests_expression=""
local trap_deselected_exit_code=1
local use_last_failed=0
local use_coverage=0
local world_size=0
# Always clean up pytest.ini
trap 'rm -f pytest.ini' RETURN
# Parse arguments
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--core_args)
core_args="$2"
shift
shift
;;
--cache_dir)
cache_dir="$2"
shift
shift
;;
--skip_distrib_tests)
skip_distrib_tests="$2"
shift
shift
;;
--match_tests_expression)
match_tests_expression="$2"
shift
shift
;;
--trap_deselected_exit_code)
trap_deselected_exit_code="$2"
shift
shift
;;
--use_last_failed)
use_last_failed="$2"
shift
shift
;;
--use_coverage)
use_coverage="$2"
shift
shift
;;
--world_size)
world_size="$2"
shift
shift
;;
*)
echo "Error: Unknown argument $key"
exit 1
shift
;;
esac
done

if [ "${skip_distrib_tests}" -eq "1" ]; then
# can be overwritten by core_args
skip_distrib_opt="-m 'not distributed and not tpu and not multinode_distributed'"
else
skip_distrib_opt=""
fi


echo [pytest] > pytest.ini ; echo "cache_dir=${cache_dir}" >> pytest.ini

# Assemble options for the pytest command
pytest_args="${skip_distrib_opt} ${core_args} --treat-unrun-as-failed -k '${match_tests_expression}'"
if [ "${use_last_failed:-0}" -eq "1" ] && [ -d "${cache_dir}" ]; then
pytest_args="--last-failed --last-failed-no-failures none ${pytest_args}"
fi
if [ "${use_coverage}" -eq "1" ]; then
pytest_args="--cov ignite --cov-append --cov-report term-missing --cov-report xml ${pytest_args}"
fi
if [ ! "${world_size}" -eq "0" ]; then
export WORLD_SIZE="${world_size}"
pytest_args="--dist=each --tx ${WORLD_SIZE}*popen//python=python ${pytest_args}"
fi

# Run the command
if [ "$trap_deselected_exit_code" -eq "1" ]; then
CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}" || { exit_code=$?; if [ "$exit_code" -eq ${last_failed_no_failures_code} ]; then echo "All tests deselected"; else exit $exit_code; fi; }
else
CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}"
fi
}
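
The runner scripts that the workflows invoke (for example tests/run_cpu_tests.sh) are expected to source this file and call run_tests. Those scripts are not shown in this excerpt, so the caller below is only a hypothetical sketch under that assumption.

```bash
#!/bin/bash
# Hypothetical caller modelled on how a runner script such as
# tests/run_cpu_tests.sh might use the shared function; the real runner
# scripts are not part of this excerpt.
set -xeu

source "$(dirname "$0")/common-test-functionality.sh"

skip_distrib_tests=${SKIP_DISTRIB_TESTS:-1}
use_last_failed=${USE_LAST_FAILED:-0}
match_tests_expression=${1:-""}

run_tests \
    --core_args "-vvv tests/ignite" \
    --cache_dir ".cpu-not-distrib" \
    --skip_distrib_tests "${skip_distrib_tests}" \
    --use_coverage 1 \
    --use_last_failed "${use_last_failed}" \
    --match_tests_expression "${match_tests_expression}"
```

The workflows above then trigger retries by re-running the same script with USE_LAST_FAILED=1, as in their new_command_on_retry entries.
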
