Retry tests (#3229)
* retry tests

* retry with pytest last failed logic

Greatly speeds up reruns of tests, as only previously failed tests are rerun.

Define a pytest cache_dir for each pytest invocation to prevent interaction
between different selections of tests.

Protect against an exit code of 5, which occurs when a previous pytest
invocation had no failed tests and therefore all tests are deselected.

Use eval to avoid quoting issues with the -k and -m expressions.

* tidy test scripts

* set correct root dir

* add option to treat unrun tests as failures

* interpret sigterm as sigint

* adjust timeouts

* respond to comments and add comments

---------

Co-authored-by: leej3 <[email protected]>
Co-authored-by: vfdev <[email protected]>
3 people committed May 23, 2024
1 parent 9d31a9c commit 8db318b
Showing 12 changed files with 316 additions and 73 deletions.
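
The retry mechanism the commit message describes comes down to a small amount of shell logic. The sketch below is illustrative only (cache_dir and match_expression are placeholder names); the complete implementation is the new tests/common-test-functionality.sh further down.

```bash
# Condensed sketch of the retry flow; names are illustrative.
cache_dir=".cpu-not-distrib"          # one pytest cache per test selection
echo "[pytest]" > pytest.ini
echo "cache_dir=${cache_dir}" >> pytest.ini

pytest_args="-vvv tests/ignite -m 'not distributed' -k '${match_expression:-}'"
if [ "${USE_LAST_FAILED:-0}" -eq 1 ] && [ -d "${cache_dir}" ]; then
    # On a retry, run only the tests that failed in the previous invocation.
    pytest_args="--last-failed --last-failed-no-failures none ${pytest_args}"
fi

# eval preserves the quoting of the -m/-k expressions. pytest exits with code 5
# when everything is deselected (the previous run had no failures), which is
# treated as success here instead of an error.
eval "pytest ${pytest_args}" || {
    exit_code=$?
    if [ "${exit_code}" -eq 5 ]; then echo "All tests deselected"; else exit "${exit_code}"; fi
}
```

A dedicated cache_dir per test selection keeps the --last-failed history of, say, the distributed and non-distributed invocations from interfering with each other.
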
8 changes: 4 additions & 4 deletions .github/workflows/gpu-hvd-tests.yml
@@ -22,7 +22,7 @@ jobs:
gpu-hvd-tests:
strategy:
matrix:
- pytorch-channel: [pytorch, ]
+ pytorch-channel: [pytorch]
fail-fast: false
env:
DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1"
@@ -128,8 +128,8 @@ jobs:
# Can't build Horovod with recent pytorch due to pytorch required C++17 standard
# and horovod is still using C++14
# HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
# Using a similar hack as described here:
# https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345
git clone --recursive https://github.com/horovod/horovod.git /horovod
cd /horovod
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt
@@ -152,7 +152,7 @@ jobs:
set -xe
bash tests/run_gpu_tests.sh 2 hvd
- CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
+ CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ignite -m distributed -k hvd
EOF
)
21 changes: 8 additions & 13 deletions .github/workflows/gpu-tests.yml
@@ -29,7 +29,7 @@ jobs:
REPOSITORY: ${{ github.repository }}
PR_NUMBER: ${{ github.event.pull_request.number }}
runs-on: linux.8xlarge.nvidia.gpu
- timeout-minutes: 45
+ timeout-minutes: 85

steps:
- name: Clean workspace
@@ -121,18 +121,13 @@ jobs:
- name: Run GPU Unit Tests
continue-on-error: false
- run: |
-   script=$(cat << EOF
-   set -xe
-   bash tests/run_gpu_tests.sh 2
-   EOF
-   )
-   docker exec -t pthd /bin/bash -c "${script}"
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 25
+   shell: bash
+   command: docker exec -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'
+   new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
10 changes: 7 additions & 3 deletions .github/workflows/hvd-tests.yml
@@ -75,9 +75,13 @@ jobs:
target_dir: /tmp

- name: Run Tests
- shell: bash -l {0}
- run: |
-   bash tests/run_cpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: bash tests/run_cpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
16 changes: 10 additions & 6 deletions .github/workflows/pytorch-version-tests.yml
@@ -10,15 +10,15 @@ on:
jobs:
build:
runs-on: ubuntu-latest
- timeout-minutes: 45
+ timeout-minutes: 85
strategy:
max-parallel: 5
fail-fast: false
matrix:
python-version: [3.8, 3.9, "3.10"]
pytorch-version:
[2.1.2, 2.0.1, 1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.5.1]
exclude:
- pytorch-version: 1.5.1
python-version: 3.9
- pytorch-version: 1.5.1
@@ -78,7 +78,7 @@ jobs:
pip install -r requirements-dev.txt
python setup.py install
# pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern
# which raises the error: AttributeError: module 'distutils' has no attribute 'version' for setuptools>59
bad_pth_version=$(python -c "import torch; print('.'.join(torch.__version__.split('.')[:2]) in ['1.9', '1.10'])")
if [ "${bad_pth_version}" == "True" ]; then
@@ -92,9 +92,13 @@ jobs:
target_dir: /tmp

- name: Run Tests
- shell: bash -l {0}
- run: |
-   bash tests/run_cpu_tests.sh "not test_time_profilers"
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: bash tests/run_cpu_tests.sh "not test_time_profilers"
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers"

# create-issue:
# runs-on: ubuntu-latest
20 changes: 13 additions & 7 deletions .github/workflows/tpu-tests.yml
@@ -89,13 +89,19 @@ jobs:
target_dir: /tmp

- name: Run Tests
- run: |
-   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${Python_ROOT_DIR}/lib
-   export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
-   export XRT_WORKERS="localservice:0;grpc://localhost:40934"
-   python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
-   bash tests/run_tpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 25
+   shell: bash
+   command: |
+     python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
+     bash tests/run_tpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
+ env:
+   LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
+   XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
+   XRT_WORKERS: "localservice:0;grpc://localhost:40934"

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
15 changes: 10 additions & 5 deletions .github/workflows/unit-tests.yml
@@ -31,7 +31,7 @@ concurrency:
jobs:
cpu-tests:
runs-on: ${{ matrix.os }}
- timeout-minutes: 45
+ timeout-minutes: 85
defaults:
run:
shell: bash
@@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: ["3.8", "3.9", "3.10", "3.11","3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
pytorch-channel: [pytorch, pytorch-nightly]
include:
# includes a single build on windows
@@ -102,7 +102,7 @@ jobs:
- name: Run Mypy
# https://github.com/pytorch/ignite/pull/2780
#
if: ${{ matrix.os == 'ubuntu-latest' && matrix.pytorch-channel == 'pytorch-nightly'}}
run: |
bash ./tests/run_code_style.sh mypy
@@ -120,8 +120,13 @@ jobs:
cp -R /tmp/MNIST .
- name: Run Tests
- run: |
-   SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
102 changes: 102 additions & 0 deletions tests/common-test-functionality.sh
@@ -0,0 +1,102 @@
#!/bin/bash

# Will catch exit code 5 when tests are deselected from a previous passing run
# (relevant for --last-failed-no-failures none)
last_failed_no_failures_code=5

# functions shared across test files
run_tests() {
# Set defaults
local core_args="-vvv tests/ignite"
local cache_dir=".unknown-cache"
local skip_distrib_tests=1
local match_tests_expression=""
local trap_deselected_exit_code=1
local use_last_failed=0
local use_coverage=0
local world_size=0
# Always clean up pytest.ini
trap 'rm -f pytest.ini' RETURN
# Parse arguments
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--core_args)
core_args="$2"
shift
shift
;;
--cache_dir)
cache_dir="$2"
shift
shift
;;
--skip_distrib_tests)
skip_distrib_tests="$2"
shift
shift
;;
--match_tests_expression)
match_tests_expression="$2"
shift
shift
;;
--trap_deselected_exit_code)
trap_deselected_exit_code="$2"
shift
shift
;;
--use_last_failed)
use_last_failed="$2"
shift
shift
;;
--use_coverage)
use_coverage="$2"
shift
shift
;;
--world_size)
world_size="$2"
shift
shift
;;
*)
echo "Error: Unknown argument $key"
exit 1
shift
;;
esac
done

if [ "${skip_distrib_tests}" -eq "1" ]; then
# can be overwritten by core_args
skip_distrib_opt="-m 'not distributed and not tpu and not multinode_distributed'"
else
skip_distrib_opt=""
fi


echo [pytest] > pytest.ini ; echo "cache_dir=${cache_dir}" >> pytest.ini

# Assemble options for the pytest command
pytest_args="${skip_distrib_opt} ${core_args} --treat-unrun-as-failed -k '${match_tests_expression}'"
if [ "${use_last_failed:-0}" -eq "1" ] && [ -d "${cache_dir}" ]; then
pytest_args="--last-failed --last-failed-no-failures none ${pytest_args}"
fi
if [ "${use_coverage}" -eq "1" ]; then
pytest_args="--cov ignite --cov-append --cov-report term-missing --cov-report xml ${pytest_args}"
fi
if [ ! "${world_size}" -eq "0" ]; then
export WORLD_SIZE="${world_size}"
pytest_args="--dist=each --tx ${WORLD_SIZE}*popen//python=python ${pytest_args}"
fi

# Run the command
if [ "$trap_deselected_exit_code" -eq "1" ]; then
CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}" || { exit_code=$?; if [ "$exit_code" -eq ${last_failed_no_failures_code} ]; then echo "All tests deselected"; else exit $exit_code; fi; }
else
CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}"
fi
}
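
The runner scripts that the workflows invoke (for example tests/run_cpu_tests.sh) are expected to source this file and call run_tests. Those scripts are not shown in this excerpt, so the caller below is only a hypothetical sketch under that assumption.

```bash
#!/bin/bash
# Hypothetical caller modelled on how a runner script such as
# tests/run_cpu_tests.sh might use the shared function; the real runner
# scripts are not part of this excerpt.
set -xeu

source "$(dirname "$0")/common-test-functionality.sh"

skip_distrib_tests=${SKIP_DISTRIB_TESTS:-1}
use_last_failed=${USE_LAST_FAILED:-0}
match_tests_expression=${1:-""}

run_tests \
    --core_args "-vvv tests/ignite" \
    --cache_dir ".cpu-not-distrib" \
    --skip_distrib_tests "${skip_distrib_tests}" \
    --use_coverage 1 \
    --use_last_failed "${use_last_failed}" \
    --match_tests_expression "${match_tests_expression}"
```

The workflows above then trigger retries by re-running the same script with USE_LAST_FAILED=1, as in their new_command_on_retry entries.
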
