diff --git a/.github/gen-workflow-ci.py b/.github/gen-workflow-ci.py index dca5cfb245..9fc9f7515a 100644 --- a/.github/gen-workflow-ci.py +++ b/.github/gen-workflow-ci.py @@ -191,9 +191,8 @@ def jobs(*jobs: str) -> str: ' HOROVOD_WITHOUT_MPI: 1\n' \ ' run: |\n' \ ' python -m pip install --upgrade pip\n' \ - ' python -m pip install setuptools wheel\n' \ ' python setup.py sdist\n' \ - ' pip -v install dist/horovod-*.tar.gz\n' \ + ' pip -v install --use-pep517 dist/horovod-*.tar.gz\n' \ '\n' + \ '\n'.join(jobs) @@ -480,7 +479,7 @@ def build_and_test_macos(id: str, name: str, needs: List[str], attempts: int = 3 f' if [[ ${{TENSORFLOW}} == 1.* ]] || [[ ${{TENSORFLOW}} == 2.[012345].* ]]; then pip install "h5py<3" "protobuf~=3.20"; fi\n' f' pip install torch==${{PYTORCH}} pytorch_lightning==${{PYTORCH_LIGHTNING}} torchvision==${{TORCHVISION}}\n' f' pip install mxnet==${{MXNET}}\n' - f' HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir .[test]\n' + f' HOROVOD_WITH_TENSORFLOW=${{TENSORFLOW}} HOROVOD_WITH_PYTORCH=${{PYTORCH}} HOROVOD_WITH_MXNET=${{MXNET}} pip install --no-cache-dir --use-pep517 .[test]\n' f' horovodrun --check-build\n' f'\n' + '\n'.join([f' - name: Test [attempt {attempt} of {attempts}]\n' diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9336ca5b76..26828ee4ef 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -69,9 +69,8 @@ jobs: HOROVOD_WITHOUT_MPI: 1 run: | python -m pip install --upgrade pip - python -m pip install setuptools wheel python setup.py sdist - pip -v install dist/horovod-*.tar.gz + pip -v install --use-pep517 dist/horovod-*.tar.gz init-workflow: name: "Init Workflow" @@ -4499,7 +4498,7 @@ jobs: if [[ ${TENSORFLOW} == 1.* ]] || [[ ${TENSORFLOW} == 2.[012345].* ]]; then pip install "h5py<3" "protobuf~=3.20"; fi pip install torch==${PYTORCH} pytorch_lightning==${PYTORCH_LIGHTNING} torchvision==${TORCHVISION} pip install mxnet==${MXNET} - 
HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir .[test] + HOROVOD_WITH_TENSORFLOW=${TENSORFLOW} HOROVOD_WITH_PYTORCH=${PYTORCH} HOROVOD_WITH_MXNET=${MXNET} pip install --no-cache-dir --use-pep517 .[test] horovodrun --check-build - name: Test [attempt 1 of 3] diff --git a/CHANGELOG.md b/CHANGELOG.md index e0db85a91c..95300e9cf3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed +## [0.29.0] - 2022-10-05 + +### Changed +- Installation environment variables to enable a PEP517 compliant build process. ([#3991](https://github.com/horovod/horovod/pull/3991)) ## [v0.28.1] - 2023-06-12 diff --git a/Dockerfile.test.cpu b/Dockerfile.test.cpu index 97d96cd673..cc08a7d71d 100644 --- a/Dockerfile.test.cpu +++ b/Dockerfile.test.cpu @@ -236,7 +236,7 @@ RUN if [[ ${MPI_KIND} == "ONECCL" ]]; then \ fi; \ cd /horovod && \ python setup.py sdist && \ - bash -c "${HOROVOD_BUILD_FLAGS} HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]" + bash -c "${HOROVOD_BUILD_FLAGS} HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir --use-pep517 --no-build-isolation -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]" # Show the effective python package version to easily spot version differences RUN pip freeze | sort diff --git a/Dockerfile.test.gpu b/Dockerfile.test.gpu index 7506b198dd..20ed82a366 100644 --- a/Dockerfile.test.gpu +++ b/Dockerfile.test.gpu @@ -214,7 +214,7 @@ RUN if [[ ${MXNET_PACKAGE} == "mxnet-nightly-cu"* ]]; then \ RUN cd /horovod && \ python setup.py sdist && \ ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \ - bash -c "${HOROVOD_BUILD_FLAGS} HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir -v $(ls 
/horovod/dist/horovod-*.tar.gz)[spark,ray]" && \ + bash -c "${HOROVOD_BUILD_FLAGS} HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir --use-pep517 --no-build-isolation -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]" && \ ldconfig # Show the effective python package version to easily spot version differences diff --git a/Jenkinsfile.ppc64le b/Jenkinsfile.ppc64le index 473ada0534..a0f79348bd 100644 --- a/Jenkinsfile.ppc64le +++ b/Jenkinsfile.ppc64le @@ -26,7 +26,7 @@ pipeline { . ${CONDA_INIT} conda activate ${CONDA_ENV} set -xe - HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITHOUT_GLOO=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_TENSORFLOW=1 \ + HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITHOUT_GLOO=1 HOROVOD_WITH_PYTORCH=1.9.1 HOROVOD_WITH_TENSORFLOW=2.6.0 \ HOROVOD_CUDA_HOME="/usr/local/cuda" HOROVOD_GPU_OPERATIONS=NCCL \ pip install -v . --no-cache-dir --no-deps ''' diff --git a/MANIFEST.in b/MANIFEST.in index 12677135d6..c6830d6a8b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,6 +2,8 @@ recursive-include * *.h *.hpp *.cc *.cu *.md *.cmake CMakeLists.txt include LICENSE horovod.lds horovod.exp CMakeLists.txt include cmake/build_utils.py +include _custom_build/backend.py + prune .eggs # prune eigen LGPL2 diff --git a/_custom_build/backend.py b/_custom_build/backend.py new file mode 100644 index 0000000000..4abf53c623 --- /dev/null +++ b/_custom_build/backend.py @@ -0,0 +1,43 @@ +import os +import sys +import sysconfig +from packaging import version +from importlib import metadata +from setuptools import build_meta as _orig + +prepare_metadata_for_build_wheel = _orig.__legacy__.prepare_metadata_for_build_wheel +build_wheel = _orig.__legacy__.build_wheel +build_sdist = _orig.__legacy__.build_sdist +get_requires_for_build_sdist = _orig.__legacy__.get_requires_for_build_sdist + + +def get_requires_for_build_wheel(config_settings=None): + """ + Custom backend to enable PEP517, utilises env variables to define which extra build + packages 
we should be installing into the isolated build env. + These should match the user's expected versions installed outside the isolated environment or it will + cause library mismatch failures. + """ + new_pkgs = [] + MXNET = "mxnet" + key_pkg_map = {'HOROVOD_WITH_MXNET': MXNET, + 'HOROVOD_WITH_PYTORCH': 'torch', + 'HOROVOD_WITH_TENSORFLOW': 'tensorflow'} + for key in key_pkg_map.keys(): + try: + version_string = os.environ[key] + try: + version.Version(version_string) + new_pkgs.append(f"{key_pkg_map[key]}=={version_string}") + except version.InvalidVersion: + new_pkgs.append(f"{version_string}") + if key_pkg_map[key] == MXNET: + # MxNet has np.bool everywhere which is removed in newer + # versions... + new_pkgs.append("numpy==1.20.3") + except KeyError: + # Variable not set, nothing to pin. Pass for now, elsewhere will alert the user has built this wrong. + ... + + return _orig.__legacy__.get_requires_for_build_wheel( + config_settings) + new_pkgs diff --git a/docker/horovod-cpu/Dockerfile b/docker/horovod-cpu/Dockerfile index 2abe187747..b19778b656 100644 --- a/docker/horovod-cpu/Dockerfile +++ b/docker/horovod-cpu/Dockerfile @@ -82,7 +82,7 @@ RUN pip install --no-cache-dir ${PYSPARK_PACKAGE} WORKDIR /horovod COPY . . 
RUN python setup.py sdist && \ - bash -c "HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]" && \ + bash -c "HOROVOD_WITH_TENSORFLOW=${TENSORFLOW_VERSION} HOROVOD_WITH_PYTORCH=${PYTORCH_VERSION} HOROVOD_WITH_MXNET=${MXNET_VERSION} pip install --no-cache-dir --use-pep517 -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]" && \ horovodrun --check-build # Check all frameworks are working correctly diff --git a/docker/horovod-nvtabular/Dockerfile b/docker/horovod-nvtabular/Dockerfile index 5568d849ff..340662239b 100644 --- a/docker/horovod-nvtabular/Dockerfile +++ b/docker/horovod-nvtabular/Dockerfile @@ -199,7 +199,7 @@ RUN if [[ ${MXNET_PACKAGE} == "mxnet-nightly-cu"* ]]; then \ RUN cd /horovod && \ python setup.py sdist && \ ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \ - bash -c "${HOROVOD_BUILD_FLAGS} HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]" && \ + bash -c "${HOROVOD_BUILD_FLAGS} HOROVOD_WITH_TENSORFLOW=${TENSORFLOW_VERSION} HOROVOD_WITH_PYTORCH=${PYTORCH_VERSION} HOROVOD_WITH_MXNET=${MXNET_VERSION} pip install --no-cache-dir --use-pep517 -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]" && \ ldconfig # Show the effective python package version to easily spot version differences diff --git a/docker/horovod-ray/Dockerfile b/docker/horovod-ray/Dockerfile index bb44d1e920..cad521a48d 100644 --- a/docker/horovod-ray/Dockerfile +++ b/docker/horovod-ray/Dockerfile @@ -56,7 +56,7 @@ WORKDIR /horovod COPY --chown=ray:users . . 
RUN python setup.py sdist && \ sudo ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \ - HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir -v $(ls /horovod/dist/horovod-*.tar.gz)[ray] && \ + HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_TENSORFLOW=${TENSORFLOW_VERSION} HOROVOD_WITH_PYTORCH=${PYTORCH_VERSION} pip install --no-cache-dir --use-pep517 -v $(ls /horovod/dist/horovod-*.tar.gz)[ray] && \ horovodrun --check-build && \ sudo ldconfig diff --git a/docker/horovod/Dockerfile b/docker/horovod/Dockerfile index d148727e70..046e994801 100644 --- a/docker/horovod/Dockerfile +++ b/docker/horovod/Dockerfile @@ -102,7 +102,7 @@ WORKDIR /horovod COPY . . RUN python setup.py sdist && \ ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \ - bash -c "HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]" && \ + bash -c "HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_TENSORFLOW=${TENSORFLOW_VERSION} HOROVOD_WITH_PYTORCH=${PYTORCH_VERSION} HOROVOD_WITH_MXNET=${MXNET_VERSION} pip install --no-cache-dir --use-pep517 -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]" && \ horovodrun --check-build && \ ldconfig diff --git a/docs/contributors.rst b/docs/contributors.rst index 3bc8b5177c..746b638da1 100644 --- a/docs/contributors.rst +++ b/docs/contributors.rst @@ -41,12 +41,12 @@ From *inside* the Horovod root directory, install Horovod in develop/editable mo .. code-block:: bash - $ HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_TENSORFLOW=1 pip install -v -e . + $ HOROVOD_WITH_PYTORCH={YOUR_PYTORCH_VERSION} HOROVOD_WITH_TENSORFLOW={YOUR_TF_VERSION} pip install -v -e . Set ``HOROVOD_WITHOUT_[FRAMEWORK]=1`` to disable building Horovod plugins for that framework. This is useful when you’re testing a feature of one framework in particular and wish to save time. 
-Set ``HOROVOD_WITH_[FRAMEWORK]=1`` to generate an error if the Horovod plugin for that framework failed to build. +Set ``HOROVOD_WITH_[FRAMEWORK]={FRAMEWORK_VERSION}`` to generate an error if the Horovod plugin for that framework failed to build. Set ``HOROVOD_DEBUG=1`` for a debug build with checked assertions, disabled compiler optimizations etc. diff --git a/docs/install.rst b/docs/install.rst index da1309382e..da98d4dbd1 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -54,7 +54,7 @@ To ensure that Horovod is built with TensorFlow support enabled: .. code-block:: bash - $ HOROVOD_WITH_TENSORFLOW=1 pip install horovod[tensorflow] + $ HOROVOD_WITH_TENSORFLOW={YOUR_TF_VERSION} pip install horovod[tensorflow] To skip TensorFlow, set ``HOROVOD_WITHOUT_TENSORFLOW=1`` in your environment. @@ -65,7 +65,7 @@ To ensure that Horovod is built with PyTorch support enabled: .. code-block:: bash - $ HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch] + $ HOROVOD_WITH_PYTORCH={YOUR_PyTorch_VERSION} pip install horovod[pytorch] To skip PyTorch, set ``HOROVOD_WITHOUT_PYTORCH=1`` in your environment. @@ -76,7 +76,7 @@ To ensure that Horovod is built with MXNet CPU support enabled: .. code-block:: bash - $ HOROVOD_WITH_MXNET=1 pip install horovod[mxnet] + $ HOROVOD_WITH_MXNET={YOUR_MXNet_VERSION} pip install horovod[mxnet] Some MXNet versions do not work with Horovod: @@ -95,7 +95,7 @@ To ensure that Horovod is built with Keras support available: .. code-block:: bash - $ HOROVOD_WITH_TENSORFLOW=1 pip install horovod[tensorflow,keras] + $ HOROVOD_WITH_TENSORFLOW={YOUR_TF_VERSION} pip install horovod[tensorflow,keras] There are no plugins built for Keras, but the TensorFlow plugin must be enabled in order to use Horovod with Keras. @@ -227,6 +227,10 @@ Environment Variables Optional environment variables that can be set to configure the installation process for Horovod. 
+Due to `PEP-517 <https://peps.python.org/pep-0517/>`_ we can't rely on any DL library being installed into +the build env; therefore we need to tell the build env which specific DL library versions we require. +This isn't the prettiest solution, however it is the most pragmatic. + Possible values are given in curly brackets: {}. * ``HOROVOD_DEBUG`` - {1}. Install a debug build of Horovod with checked assertions, disabled compiler optimizations etc. @@ -252,11 +256,11 @@ Possible values are given in curly brackets: {}. * ``HOROVOD_ALLOW_MIXED_GPU_IMPL`` - {1}. Allow Horovod to install with NCCL allreduce and MPI GPU allgather / broadcast / alltoall / reducescatter. Not recommended due to a possible deadlock. * ``HOROVOD_CPU_OPERATIONS`` - {MPI, GLOO, CCL}. Framework to use for CPU tensor allreduce, allgather, and broadcast. * ``HOROVOD_CMAKE`` - path to the CMake binary used to build Horovod. -* ``HOROVOD_WITH_TENSORFLOW`` - {1}. Require Horovod to install with TensorFlow support enabled. +* ``HOROVOD_WITH_TENSORFLOW`` - {TF PyPI version}. If set, require Horovod to install with TensorFlow support enabled for that specific version. * ``HOROVOD_WITHOUT_TENSORFLOW`` - {1}. Skip installing TensorFlow support. -* ``HOROVOD_WITH_PYTORCH`` - {1}. Require Horovod to install with PyTorch support enabled. +* ``HOROVOD_WITH_PYTORCH`` - {PyTorch PyPI version}. If set, require Horovod to install with PyTorch support enabled for that specific version. * ``HOROVOD_WITHOUT_PYTORCH`` - {1}. Skip installing PyTorch support. -* ``HOROVOD_WITH_MXNET`` - {1}. Require Horovod to install with MXNet support enabled. +* ``HOROVOD_WITH_MXNET`` - {MXNet PyPI version}. If set, require Horovod to install with MXNet support enabled for that specific version. * ``HOROVOD_WITHOUT_MXNET`` - {1}. Skip installing MXNet support. .. 
inclusion-marker-end-do-not-remove diff --git a/horovod/__init__.py b/horovod/__init__.py index 9d15c965e2..39b5e34516 100644 --- a/horovod/__init__.py +++ b/horovod/__init__.py @@ -1,3 +1,3 @@ from horovod.runner import run -__version__ = '0.28.1' +__version__ = '0.29.0' diff --git a/horovod/common/exceptions.py b/horovod/common/exceptions.py index 4d0ff2e684..7ff65c83e1 100644 --- a/horovod/common/exceptions.py +++ b/horovod/common/exceptions.py @@ -14,7 +14,6 @@ # limitations under the License. # ============================================================================== - class HorovodInternalError(RuntimeError): """Internal error raised when a Horovod collective operation (e.g., allreduce) fails. @@ -28,22 +27,30 @@ class HostsUpdatedInterrupt(RuntimeError): In elastic mode, this will result in a reset event without a restore to committed state. """ + def __init__(self, skip_sync): self.skip_sync = skip_sync -def get_version_mismatch_message(name, version, installed_version): +def get_version_mismatch_message(name, version, installed_version, build_flag): return f'Framework {name} installed with version {installed_version} but found version {version}.\n\ This can result in unexpected behavior including runtime errors.\n\ - Reinstall Horovod using `pip install --no-cache-dir` to build with the new version.' + Reinstall Horovod using `{build_flag} pip install --no-cache-dir` to build with the new version.' class HorovodVersionMismatchError(ImportError): """Internal error raised when the runtime version of a framework mismatches its version at Horovod installation time. 
""" - def __init__(self, name, version, installed_version): - super().__init__(get_version_mismatch_message(name, version, installed_version)) + + def __init__(self, name, version, installed_version, build_flag): + super().__init__( + get_version_mismatch_message( + name, + version, + installed_version, + build_flag)) self.name = name self.version = version self.installed_version = installed_version + self.build_flag = build_flag diff --git a/horovod/common/util.py b/horovod/common/util.py index 3c5057f244..0410db2d0b 100644 --- a/horovod/common/util.py +++ b/horovod/common/util.py @@ -23,11 +23,15 @@ import warnings from contextlib import contextmanager +from importlib import metadata from horovod.common.exceptions import get_version_mismatch_message, HorovodVersionMismatchError -EXTENSIONS = ['tensorflow', 'torch', 'mxnet'] +EXTENSIONS = { + 'tensorflow': 'HOROVOD_WITH_TENSORFLOW', + 'torch': 'HOROVOD_WITH_PYTORCH', + 'mxnet': 'HOROVOD_WITH_MXNET'} def get_ext_suffix(): @@ -55,8 +59,8 @@ def check_extension(ext_name, ext_env_var, pkg_path, *args): if not os.path.exists(full_path): raise ImportError( 'Extension {} has not been built: {} not found\n' - 'If this is not expected, reinstall Horovod with {}=1 to debug the build error.'.format( - ext_name, full_path, ext_env_var + 'If this is not expected, reinstall Horovod with {}={} to debug the build error.'.format( + ext_name, full_path, ext_env_var, metadata.version(ext_name.split('.')[-1]) ) ) @@ -72,8 +76,10 @@ def _target_fn(ext_base_name, fn, fn_desc, queue, verbose): import traceback if verbose: - print('Checking whether extension {ext_base_name} was {fn_desc}.'.format( - ext_base_name=ext_base_name, fn_desc=fn_desc)) + print( + 'Checking whether extension {ext_base_name} was {fn_desc}.'.format( + ext_base_name=ext_base_name, + fn_desc=fn_desc)) else: # Suppress output sys.stdout = open(os.devnull, 'w') @@ -82,14 +88,17 @@ def _target_fn(ext_base_name, fn, fn_desc, queue, verbose): try: ext = 
importlib.import_module('.' + ext_base_name, 'horovod') result = fn(ext) - except: + except BaseException: traceback.print_exc() result = None if verbose: - print('Extension {ext_base_name} {flag} {fn_desc}.'.format( - ext_base_name=ext_base_name, flag=('was' if result else 'was NOT'), - fn_desc=fn_desc)) + print( + 'Extension {ext_base_name} {flag} {fn_desc}.'.format( + ext_base_name=ext_base_name, + flag=( + 'was' if result else 'was NOT'), + fn_desc=fn_desc)) queue.put(result) @@ -105,7 +114,7 @@ def _target_fn(ext_base_name, fn, fn_desc, queue, verbose): def extension_available(ext_base_name, verbose=False): - available_fn = lambda ext: ext is not None + def available_fn(ext): return ext is not None return _check_extension_lambda( ext_base_name, available_fn, 'built', verbose) or False @@ -128,7 +137,7 @@ def wrapper(*args, **kwargs): @_cache def gpu_available(ext_base_name, verbose=False): - available_fn = lambda ext: ext._check_has_gpu() + def available_fn(ext): return ext._check_has_gpu() return _check_extension_lambda( ext_base_name, available_fn, 'running with GPU', verbose) or False @@ -136,7 +145,7 @@ def gpu_available(ext_base_name, verbose=False): @_cache def mpi_built(verbose=False): for ext_base_name in EXTENSIONS: - built_fn = lambda ext: ext.mpi_built() + def built_fn(ext): return ext.mpi_built() result = _check_extension_lambda( ext_base_name, built_fn, 'built with MPI', verbose) if result is not None: @@ -147,43 +156,47 @@ def mpi_built(verbose=False): @_cache def gloo_built(verbose=False): for ext_base_name in EXTENSIONS: - built_fn = lambda ext: ext.gloo_built() + def built_fn(ext): return ext.gloo_built() result = _check_extension_lambda( ext_base_name, built_fn, 'built with Gloo', verbose) if result is not None: return result return None + @_cache def nccl_built(verbose=False): for ext_base_name in EXTENSIONS: - built_fn = lambda ext: ext.nccl_built() + def built_fn(ext): return ext.nccl_built() result = _check_extension_lambda( 
ext_base_name, built_fn, 'built with NCCL', verbose) if result is not None: return result return None + @_cache def ddl_built(verbose=False): for ext_base_name in EXTENSIONS: - built_fn = lambda ext: ext.ddl_built() + def built_fn(ext): return ext.ddl_built() result = _check_extension_lambda( ext_base_name, built_fn, 'built with DDL', verbose) if result is not None: return result return None + @_cache def ccl_built(verbose=False): for ext_base_name in EXTENSIONS: - built_fn = lambda ext: ext.ccl_built() + def built_fn(ext): return ext.ccl_built() result = _check_extension_lambda( ext_base_name, built_fn, 'built with CCL', verbose) if result is not None: return result return None + @contextmanager def env(**kwargs): # ignore args with None values @@ -221,11 +234,13 @@ def get_average_backwards_compatibility_fun(reduce_ops): def impl(op, average): if op is not None: if average is not None: - raise ValueError('The op parameter supersedes average. Please provide only one of them.') + raise ValueError( + 'The op parameter supersedes average. Please provide only one of them.') return op elif average is not None: - warnings.warn('Parameter `average` has been replaced with `op` and will be removed in v1.0', - DeprecationWarning) + warnings.warn( + 'Parameter `average` has been replaced with `op` and will be removed in v1.0', + DeprecationWarning) return reduce_ops.Average if average else reduce_ops.Sum else: return reduce_ops.Average @@ -238,7 +253,8 @@ def num_rank_is_power_2(num_rank): for Adasum allreduce. TODO support non-power of 2 ranks. 
""" - return num_rank != 0 and ((num_rank & (num_rank -1)) == 0) + return num_rank != 0 and ((num_rank & (num_rank - 1)) == 0) + def split_list(l, n): """ @@ -249,15 +265,30 @@ def split_list(l, n): def check_installed_version(name, version, exception=None): - file_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)),\ - os.pardir, "metadata.json")) + file_path = os.path.abspath( + os.path.join( + os.path.dirname( + os.path.abspath(__file__)), + os.pardir, + "metadata.json")) with open(file_path) as f: installed_version = json.load(f).get(name) + extension_flag = EXTENSIONS['torch'] if name == 'pytorch' else EXTENSIONS[name] + # Anything after a + is useless for pip, i.e. torch==2.0.0+cu117 finds + # nothing. + build_flag = f"{extension_flag}=={version.split('+')[0]}" if installed_version != version: if exception is None: - warnings.warn(get_version_mismatch_message(name, version, installed_version)) + warnings.warn( + get_version_mismatch_message( + name, + version, + installed_version, + build_flag)) else: - raise HorovodVersionMismatchError(name, version, installed_version) from exception + raise HorovodVersionMismatchError( + name, version, installed_version, build_flag) from exception + def is_iterable(x): try: @@ -271,15 +302,14 @@ def is_iterable(x): def is_version_greater_equal_than(ver, target): from packaging import version if any([not isinstance(_str, str) for _str in (ver, target)]): - raise ValueError("This function only accepts string arguments. \n" - "Received:\n" - "\t- ver (type {type_ver}: {val_ver})" - "\t- target (type {type_target}: {val_target})".format( - type_ver=(type(ver)), - val_ver=ver, - type_target=(type(target)), - val_target=target, - )) + raise ValueError( + "This function only accepts string arguments. 
\n" + "Received:\n" + "\t- ver (type {type_ver}: {val_ver})" + "\t- target (type {type_target}: {val_target})".format( + type_ver=( + type(ver)), val_ver=ver, type_target=( + type(target)), val_target=target, )) if len(target.split(".")) != 3: raise ValueError("We only accepts target version values in the form " diff --git a/horovod/mxnet/CMakeLists.txt b/horovod/mxnet/CMakeLists.txt index 4afe9246a9..a492afa3cf 100644 --- a/horovod/mxnet/CMakeLists.txt +++ b/horovod/mxnet/CMakeLists.txt @@ -6,7 +6,7 @@ set(Mxnet_TARGET_LIB "mxnet") # Find MXNet set(Mxnet_REQUIRED "") -if ("$ENV{HOROVOD_WITH_MXNET}" STREQUAL "1") +if ("$ENV{HOROVOD_WITH_MXNET}" MATCHES "^1\.") set(Mxnet_REQUIRED "REQUIRED") endif () find_package(Mxnet "1.4.1" ${Mxnet_REQUIRED}) diff --git a/horovod/tensorflow/CMakeLists.txt b/horovod/tensorflow/CMakeLists.txt index 38aa606f99..b0644944f0 100644 --- a/horovod/tensorflow/CMakeLists.txt +++ b/horovod/tensorflow/CMakeLists.txt @@ -6,7 +6,7 @@ set(TF_TARGET_LIB "tensorflow") # Find TF set(TF_REQUIRED "") -if ("$ENV{HOROVOD_WITH_TENSORFLOW}" STREQUAL "1") +if ("$ENV{HOROVOD_WITH_TENSORFLOW}" MATCHES "^[1-2]\.") set(TF_REQUIRED "REQUIRED") endif () find_package(Tensorflow "1.15.0" ${TF_REQUIRED}) diff --git a/horovod/torch/CMakeLists.txt b/horovod/torch/CMakeLists.txt index 7234d741ef..05a506083b 100644 --- a/horovod/torch/CMakeLists.txt +++ b/horovod/torch/CMakeLists.txt @@ -6,7 +6,7 @@ set(PYTORCH_TARGET_LIB "pytorch") # Find PyTorch set(PYTORCH_REQUIRED "") -if ("$ENV{HOROVOD_WITH_PYTORCH}" STREQUAL "1") +if ("$ENV{HOROVOD_WITH_PYTORCH}" MATCHES "^[1-2]\.") set(PYTORCH_REQUIRED "REQUIRED") endif () find_package(Pytorch "1.5.0" ${PYTORCH_REQUIRED}) @@ -63,7 +63,9 @@ list(APPEND PYTORCH_LINKER_LIBS ${Pytorch_LIBRARIES}) parse_version(${Pytorch_VERSION} VERSION_DEC) add_definitions(-DPYTORCH_VERSION=${VERSION_DEC} -DTORCH_API_INCLUDE_EXTENSION_H=1) set(Pytorch_CXX11 ${Pytorch_CXX11} PARENT_SCOPE) -if(NOT Pytorch_VERSION VERSION_LESS "1.5.0") +if 
(Pytorch_VERSION VERSION_GREATER_EQUAL "2.0.0") + set(CMAKE_CXX_STANDARD 17) +elseif(Pytorch_VERSION VERSION_GREATER_EQUAL "1.5.0") set(CMAKE_CXX_STANDARD 14) endif() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..8fbc333688 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[build-system] +requires = ["setuptools", "wheel", "packaging", "cmake"] +build-backend = "backend" +backend-path = ["_custom_build"] diff --git a/setup.py b/setup.py index b890286674..36016a238f 100644 --- a/setup.py +++ b/setup.py @@ -26,12 +26,14 @@ import textwrap from setuptools import setup, Extension, find_packages +from setuptools.command.build_py import build_py as _build_py from setuptools.command.build_ext import build_ext from horovod import __version__ _FRAMEWORK_METADATA_FILE = 'horovod/metadata.json' + class CMakeExtension(Extension): def __init__(self, name, cmake_lists_dir='.', sources=None, **kwa): if sources is None: @@ -47,6 +49,7 @@ def __init__(self, name, cmake_lists_dir='.', sources=None, **kwa): mxnet_mpi_lib = CMakeExtension('horovod.mxnet.mpi_lib', cmake_lists_dir='.', sources=[]) + def is_build_action(): if len(sys.argv) <= 1: return False @@ -63,6 +66,7 @@ def is_build_action(): if sys.argv[1].startswith('develop'): return True + def get_cmake_bin(): from packaging import version @@ -75,18 +79,24 @@ def get_cmake_bin(): except OSError: cmake_installed_version = version.parse("0.0") else: - cmake_installed_version = version.parse(re.search(r'version\s*([\d.]+)', out.decode()).group(1)) + cmake_installed_version = version.parse( + re.search(r'version\s*([\d.]+)', out.decode()).group(1)) if cmake_installed_version < version.parse("3.13.0"): - print("Could not find a recent CMake to build Horovod. " - "Attempting to install CMake 3.13 to a temporary location via pip.", flush=True) - cmake_temp_dir = tempfile.TemporaryDirectory(prefix="horovod-cmake-tmp") + print( + "Could not find a recent CMake to build Horovod. 
" + "Attempting to install CMake 3.13 to a temporary location via pip.", + flush=True) + cmake_temp_dir = tempfile.TemporaryDirectory( + prefix="horovod-cmake-tmp") atexit.register(cmake_temp_dir.cleanup) try: - _ = subprocess.check_output(["pip", "install", "--target", cmake_temp_dir.name, "cmake~=3.13.0"]) + _ = subprocess.check_output( + ["pip", "install", "--target", cmake_temp_dir.name, "cmake~=3.13.0"]) except Exception: - raise RuntimeError("Failed to install temporary CMake. " - "Please update your CMake to 3.13+ or set HOROVOD_CMAKE appropriately.") + raise RuntimeError( + "Failed to install temporary CMake. " + "Please update your CMake to 3.13+ or set HOROVOD_CMAKE appropriately.") cmake_bin = os.path.join(cmake_temp_dir.name, "bin", "run_cmake") with io.open(cmake_bin, "w") as f_run_cmake: f_run_cmake.write( @@ -97,6 +107,7 @@ def get_cmake_bin(): class custom_build_ext(build_ext): + def build_extensions(self): if os.getenv('HOROVOD_SKIP_COMPILE') == '1': # Skip building extensions using CMake @@ -105,14 +116,17 @@ def build_extensions(self): cmake_bin = get_cmake_bin() - config = 'Debug' if self.debug or os.environ.get('HOROVOD_DEBUG') == "1" else 'RelWithDebInfo' + config = 'Debug' if self.debug or os.environ.get( + 'HOROVOD_DEBUG') == "1" else 'RelWithDebInfo' ext_name = self.extensions[0].name - build_dir = self.get_ext_fullpath(ext_name).replace(self.get_ext_filename(ext_name), '') + build_dir = self.get_ext_fullpath(ext_name).replace( + self.get_ext_filename(ext_name), '') build_dir = os.path.abspath(build_dir) cmake_args = ['-DCMAKE_BUILD_TYPE=' + config, - '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(config.upper(), build_dir), + '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(config.upper(), + build_dir), '-DPYTHON_EXECUTABLE:FILEPATH=' + sys.executable] make_args = ['-j8'] if not os.environ.get('MAKEFLAGS') else [] @@ -148,14 +162,28 @@ def build_extensions(self): if sys.argv[1].startswith('develop'): # Copy over metadata.json file from 
build directory - shutil.copyfile(os.path.join(build_dir, _FRAMEWORK_METADATA_FILE), - os.path.join(self.extensions[0].cmake_lists_dir, _FRAMEWORK_METADATA_FILE)) - # Remove unfound frameworks, otherwise develop mode will fail the install - self.extensions = [x for x in self.extensions if os.path.exists(self.get_ext_fullpath(x.name))] + shutil.copyfile( + os.path.join( + build_dir, + _FRAMEWORK_METADATA_FILE), + os.path.join( + self.extensions[0].cmake_lists_dir, + _FRAMEWORK_METADATA_FILE)) + # Remove unfound frameworks, otherwise develop mode will fail the + # install + self.extensions = [ + x for x in self.extensions if os.path.exists( + self.get_ext_fullpath( + x.name))] # python packages required to use horovod in general -require_list = ['cloudpickle', 'psutil', 'pyyaml', 'dataclasses;python_version<"3.7"', 'packaging'] +require_list = [ + 'cloudpickle', + 'psutil', + 'pyyaml', + 'dataclasses;python_version<"3.7"', + 'packaging'] # framework dependencies tensorflow_require_list = ['tensorflow'] @@ -167,28 +195,33 @@ def build_extensions(self): mxnet_require_list = ['mxnet>=1.4.1'] pyspark_require_list = ['pyspark>=2.3.2;python_version<"3.8"', 'pyspark>=3.0.0;python_version>="3.8"'] -spark_require_list = ['numpy', 'petastorm>=0.12.0', 'pyarrow>=0.15.0,<11.0', 'fsspec>=2021.07.0'] +spark_require_list = [ + 'numpy', + 'petastorm>=0.12.0', + 'pyarrow>=0.15.0,<11.0', + 'fsspec>=2021.07.0'] # https://github.com/ray-project/ray/pull/17465 # google-api-core>=2.9.0 depends on protobuf<5.0.0dev,>=3.20.1, which conflicts with # tensorflow protobuf~=3.20 and pytorch-lightning protobuf<3.20,>=3.9.2 ray_require_list = ['ray', 'aioredis<2', 'google-api-core<2.9.0'] pytorch_spark_require_list = pytorch_require_list + \ - spark_require_list + \ - pyspark_require_list + \ - ['pytorch_lightning>=1.3.8,<1.5.10'] + spark_require_list + \ + pyspark_require_list + \ + ['pytorch_lightning>=1.3.8,<1.5.10'] # all frameworks' dependencies all_frameworks_require_list = 
tensorflow_require_list + \ - keras_require_list + \ - pytorch_require_list + \ - mxnet_require_list + \ - spark_require_list + \ - pyspark_require_list + keras_require_list + \ + pytorch_require_list + \ + mxnet_require_list + \ + spark_require_list + \ + pyspark_require_list # python packages required / recommended to develop horovod # these are the earliest versions to work with Python 3.8 # keep in sync with Dockerfile.test.cpu -# NOTE: do not use versions with +cpu or +gpu here as users would need to add --find-links to pip +# NOTE: do not use versions with +cpu or +gpu here as users would need to +# add --find-links to pip dev_require_list = ['tensorflow-cpu==2.2.0', 'keras==2.3.1', 'torch==1.4.0', @@ -199,7 +232,12 @@ def build_extensions(self): # torchvision 0.5.0 depends on torch==1.4.0 # python packages required only to run tests -test_require_list = ['mock', 'pytest', 'pytest-forked', 'pytest-subtests', 'parameterized'] +test_require_list = [ + 'mock', + 'pytest', + 'pytest-forked', + 'pytest-subtests', + 'parameterized'] # Skip cffi if pytorch extension explicitly disabled if not os.environ.get('HOROVOD_WITHOUT_PYTORCH'): @@ -207,7 +245,8 @@ def build_extensions(self): def get_package_version(): - return __version__ + "+" + os.environ['HOROVOD_LOCAL_VERSION'] if 'HOROVOD_LOCAL_VERSION' in os.environ else __version__ + return __version__ + "+" + \ + os.environ['HOROVOD_LOCAL_VERSION'] if 'HOROVOD_LOCAL_VERSION' in os.environ else __version__ setup(name='horovod', @@ -220,7 +259,14 @@ def get_package_version(): Horovod is a distributed training framework for TensorFlow, Keras, PyTorch, and Apache MXNet. 
The goal of Horovod is to make distributed Deep Learning fast and easy to use.'''), url='https://github.com/horovod/horovod', - keywords=['deep learning', 'tensorflow', 'keras', 'pytorch', 'mxnet', 'spark', 'AI'], + keywords=[ + 'deep learning', + 'tensorflow', + 'keras', + 'pytorch', + 'mxnet', + 'spark', + 'AI'], classifiers=[ 'License :: OSI Approved :: Apache Software License', 'Development Status :: 4 - Beta',