Retry tests #3229

Merged: 13 commits, May 23, 2024
8 changes: 4 additions & 4 deletions .github/workflows/gpu-hvd-tests.yml
@@ -22,7 +22,7 @@ jobs:
gpu-hvd-tests:
strategy:
matrix:
pytorch-channel: [pytorch, ]
pytorch-channel: [pytorch]
fail-fast: false
env:
DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1"
@@ -128,8 +128,8 @@ jobs:
# Can't build Horovod with recent pytorch because pytorch requires the C++17 standard
# while horovod is still using C++14
# HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
# Using a similar hack as described here:
# https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345
# Using a similar hack as described here:
# https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345
git clone --recursive https://github.com/horovod/horovod.git /horovod
cd /horovod
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt
@@ -152,7 +152,7 @@ jobs:
set -xe

bash tests/run_gpu_tests.sh 2 hvd
CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ignite -m distributed -k hvd

EOF
)
21 changes: 8 additions & 13 deletions .github/workflows/gpu-tests.yml
@@ -29,7 +29,7 @@ jobs:
REPOSITORY: ${{ github.repository }}
PR_NUMBER: ${{ github.event.pull_request.number }}
runs-on: linux.8xlarge.nvidia.gpu
timeout-minutes: 45
timeout-minutes: 85

steps:
- name: Clean workspace
@@ -121,18 +121,13 @@ jobs:

- name: Run GPU Unit Tests
continue-on-error: false
run: |

script=$(cat << EOF

set -xe

bash tests/run_gpu_tests.sh 2

EOF
)

docker exec -t pthd /bin/bash -c "${script}"
uses: nick-fields/retry@v3
with:
max_attempts: 5
timeout_minutes: 25
shell: bash
command: docker exec -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'
new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
10 changes: 7 additions & 3 deletions .github/workflows/hvd-tests.yml
@@ -75,9 +75,13 @@ jobs:
target_dir: /tmp

- name: Run Tests
shell: bash -l {0}
run: |
bash tests/run_cpu_tests.sh
uses: nick-fields/retry@v3
with:
max_attempts: 5
timeout_minutes: 15
shell: bash
command: bash tests/run_cpu_tests.sh
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
16 changes: 10 additions & 6 deletions .github/workflows/pytorch-version-tests.yml
@@ -10,15 +10,15 @@ on:
jobs:
build:
runs-on: ubuntu-latest
timeout-minutes: 45
timeout-minutes: 85
strategy:
max-parallel: 5
fail-fast: false
matrix:
python-version: [3.8, 3.9, "3.10"]
pytorch-version:
[2.1.2, 2.0.1, 1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.5.1]
exclude:
exclude:
- pytorch-version: 1.5.1
python-version: 3.9
- pytorch-version: 1.5.1
@@ -78,7 +78,7 @@ jobs:
pip install -r requirements-dev.txt
python setup.py install

# pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern
# pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern
# which raises the error: AttributeError: module 'distutils' has no attribute 'version' for setuptools>59
bad_pth_version=$(python -c "import torch; print('.'.join(torch.__version__.split('.')[:2]) in ['1.9', '1.10'])")
if [ "${bad_pth_version}" == "True" ]; then
@@ -92,9 +92,13 @@ jobs:
target_dir: /tmp

- name: Run Tests
shell: bash -l {0}
run: |
bash tests/run_cpu_tests.sh "not test_time_profilers"
uses: nick-fields/retry@v3
with:
max_attempts: 5
timeout_minutes: 15
shell: bash
command: bash tests/run_cpu_tests.sh "not test_time_profilers"
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers"

# create-issue:
# runs-on: ubuntu-latest
20 changes: 13 additions & 7 deletions .github/workflows/tpu-tests.yml
@@ -89,13 +89,19 @@ jobs:
target_dir: /tmp

- name: Run Tests
run: |
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${Python_ROOT_DIR}/lib
export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
export XRT_WORKERS="localservice:0;grpc://localhost:40934"

python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
bash tests/run_tpu_tests.sh
uses: nick-fields/retry@v3
with:
max_attempts: 5
timeout_minutes: 25
shell: bash
command: |
python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
bash tests/run_tpu_tests.sh
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
env:
LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
XRT_WORKERS: "localservice:0;grpc://localhost:40934"

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
15 changes: 10 additions & 5 deletions .github/workflows/unit-tests.yml
@@ -31,7 +31,7 @@ concurrency:
jobs:
cpu-tests:
runs-on: ${{ matrix.os }}
timeout-minutes: 45
timeout-minutes: 85
defaults:
run:
shell: bash
@@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: ["3.8", "3.9", "3.10", "3.11","3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
pytorch-channel: [pytorch, pytorch-nightly]
include:
# includes a single build on windows
@@ -102,7 +102,7 @@

- name: Run Mypy
# https://github.com/pytorch/ignite/pull/2780
#
#
if: ${{ matrix.os == 'ubuntu-latest' && matrix.pytorch-channel == 'pytorch-nightly'}}
run: |
bash ./tests/run_code_style.sh mypy
@@ -120,8 +120,13 @@ jobs:
cp -R /tmp/MNIST .

- name: Run Tests
run: |
SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
uses: nick-fields/retry@v3
with:
max_attempts: 5
timeout_minutes: 15
shell: bash
command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
102 changes: 102 additions & 0 deletions tests/common-test-functionality.sh
@@ -0,0 +1,102 @@
#!/bin/bash
Collaborator:
I wonder whether it would be simpler to write this script in Python for later maintenance instead of a bash script, and how much effort that would take?
If you think this is feasible we can do that in a follow-up PR.

Collaborator (Author):
I think it would be easy enough: largely just providing a simple CLI, setting environment variables, and assembling commands to run as a subprocess. A few hours probably, perhaps a few more to iron out issues and add some tests for it.

Collaborator:
OK, sounds good, let's make a Python script instead of this bash script in a follow-up PR.
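For illustration, a minimal sketch of the kind of Python replacement discussed above. The module layout, flag names, and defaults are hypothetical (they loosely mirror the bash options below), the pytest-xdist --dist/--tx handling is omitted, and none of this code is part of this PR.

```python
# Hypothetical sketch of a Python replacement for common-test-functionality.sh;
# not part of this PR. Flag names mirror the bash script's options below.
import argparse
import os
import subprocess
import sys


def build_pytest_command(args: argparse.Namespace) -> list:
    # Assemble the pytest invocation as a list so empty values (e.g. -k "")
    # survive intact, with no eval-style string re-parsing needed.
    cmd = ["pytest", "-o", f"cache_dir={args.cache_dir}"]
    cmd += args.core_args.split()
    cmd += ["-k", args.match_tests_expression]
    if args.skip_distrib_tests:
        cmd += ["-m", "not distributed and not tpu and not multinode_distributed"]
    if args.use_coverage:
        cmd += ["--cov", "ignite", "--cov-append",
                "--cov-report", "term-missing", "--cov-report", "xml"]
    if args.use_last_failed and os.path.isdir(args.cache_dir):
        cmd += ["--last-failed", "--last-failed-no-failures", "none"]
    return cmd


def main() -> int:
    parser = argparse.ArgumentParser(description="Run the ignite test suite")
    parser.add_argument("--core_args", default="-vvv tests/ignite")
    parser.add_argument("--cache_dir", default=".unknown-cache")
    parser.add_argument("--match_tests_expression", default="")
    parser.add_argument("--skip_distrib_tests", type=int, default=1)
    parser.add_argument("--use_last_failed", type=int, default=0)
    parser.add_argument("--use_coverage", type=int, default=0)
    parser.add_argument("--world_size", type=int, default=0)
    args = parser.parse_args()

    env = dict(os.environ, CUDA_VISIBLE_DEVICES="")
    if args.world_size:
        env["WORLD_SIZE"] = str(args.world_size)

    result = subprocess.run(build_pytest_command(args), env=env)
    # Exit code 5 means all tests were deselected (relevant for
    # --last-failed-no-failures none); treat it as success like the bash script.
    return 0 if result.returncode == 5 else result.returncode


if __name__ == "__main__":
    sys.exit(main())
```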


# Will catch exit code 5 when tests are deselected from a previous passing run
# (relevant for --last-failed-no-failures none)
last_failed_no_failures_code=5

# functions shared across test files
run_tests() {
# Set defaults
local core_args="-vvv tests/ignite"
local cache_dir=".unknown-cache"
local skip_distrib_tests=1
local match_tests_expression=""
local trap_deselected_exit_code=1
local use_last_failed=0
local use_coverage=0
local world_size=0
# Always clean up pytest.ini
trap 'rm -f pytest.ini' RETURN
# Parse arguments
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--core_args)
core_args="$2"
shift
shift
;;
--cache_dir)
cache_dir="$2"
shift
shift
;;
--skip_distrib_tests)
skip_distrib_tests="$2"
shift
shift
;;
--match_tests_expression)
match_tests_expression="$2"
shift
shift
;;
--trap_deselected_exit_code)
trap_deselected_exit_code="$2"
shift
shift
;;
--use_last_failed)
use_last_failed="$2"
shift
shift
;;
--use_coverage)
use_coverage="$2"
shift
shift
;;
--world_size)
world_size="$2"
shift
shift
;;
*)
echo "Error: Unknown argument $key"
exit 1
shift
;;
esac
done

if [ "${skip_distrib_tests}" -eq "1" ]; then
# can be overwritten by core_args
skip_distrib_opt="-m 'not distributed and not tpu and not multinode_distributed'"
else
skip_distrib_opt=""
fi


echo [pytest] > pytest.ini ; echo "cache_dir=${cache_dir}" >> pytest.ini

# Assemble options for the pytest command
pytest_args="${skip_distrib_opt} ${core_args} --treat-unrun-as-failed -k '${match_tests_expression}'"
if [ "${use_last_failed:-0}" -eq "1" ] && [ -d "${cache_dir}" ]; then
pytest_args="--last-failed --last-failed-no-failures none ${pytest_args}"
fi
if [ "${use_coverage}" -eq "1" ]; then
pytest_args="--cov ignite --cov-append --cov-report term-missing --cov-report xml ${pytest_args}"
fi
if [ ! "${world_size}" -eq "0" ]; then
export WORLD_SIZE="${world_size}"
pytest_args="--dist=each --tx ${WORLD_SIZE}*popen//python=python ${pytest_args}"
fi

# Run the command
if [ "$trap_deselected_exit_code" -eq "1" ]; then
CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}" || { exit_code=$?; if [ "$exit_code" -eq ${last_failed_no_failures_code} ]; then echo "All tests deselected"; else exit $exit_code; fi; }
Collaborator:
Why do we need the eval call here? Can't we make the call without eval?

Collaborator (Author), @leej3, May 23, 2024:
I added an eval here because of some bugs I was running into where things like "-k ''" end up as "-k" in the final command. The horrors of using eval are somewhat mitigated by the "-x" bash flag, so bugs in the command can be spotted more quickly. I think consistently using arrays for assembling commands in bash is a better alternative, but using Python is the best long-term solution (see the sketch after the script).

else
CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}"
fi
}
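To illustrate the quoting problem behind the eval discussion above: flattening the command to a single string loses an empty -k argument, while keeping it as a list (or a bash array) preserves it, so no eval is needed. This is a hypothetical Python comparison, not part of this PR.

```python
# Hypothetical illustration of the quoting issue discussed above; not part of this PR.
import subprocess

match_expr = ""  # an intentionally empty -k expression

# Flattening the command to one string and re-splitting it on whitespace drops
# the empty argument, leaving a bare "-k" -- the bug described in the comment.
flat = " ".join(["pytest", "-vvv", "tests/ignite", "-k", match_expr])
print(flat.split())  # ['pytest', '-vvv', 'tests/ignite', '-k']

# Keeping the command as a list (bash arrays are the analogue) preserves the
# empty argument, so each element reaches pytest verbatim without eval.
cmd = ["pytest", "-vvv", "tests/ignite", "-k", match_expr]
subprocess.run(cmd)
```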