merging
christinafan committed Mar 2, 2024
2 parents 4aa3691 + a966395 commit 8bd2d09
Showing 418 changed files with 16,101 additions and 11,886 deletions.
2 changes: 2 additions & 0 deletions .github/actions/run-core-tests/group_2/action.yml
@@ -20,3 +20,5 @@ runs:
modin/pandas/test/dataframe/test_pickle.py
echo "::endgroup::"
shell: bash -l {0}
- run: MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
shell: bash -l {0}
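The step added here re-runs the merge tests with range-partitioning enabled. A rough sketch of reproducing it locally, assuming a development checkout with Modin's test dependencies installed:

    # default merge implementation
    python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
    # same tests, routed through the range-partitioning merge code path
    MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"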
2 changes: 1 addition & 1 deletion .github/actions/run-core-tests/group_3/action.yml
@@ -19,6 +19,6 @@ runs:
shell: bash -l {0}
- run: |
echo "::group::Running experimental groupby tests (group 3)..."
MODIN_EXPERIMENTAL_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
MODIN_RANGE_PARTITIONING_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
echo "::endgroup::"
shell: bash -l {0}
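This hunk only renames the switch for the range-partitioning groupby implementation (formerly called "experimental groupby"). A local sketch, assuming the renamed variable is honored exactly as the old one was:

    # before this commit
    MODIN_EXPERIMENTAL_GROUPBY=1 python -m pytest modin/pandas/test/test_groupby.py
    # after this commit
    MODIN_RANGE_PARTITIONING_GROUPBY=1 python -m pytest modin/pandas/test/test_groupby.py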
24 changes: 15 additions & 9 deletions .github/workflows/ci-notebooks.yml
@@ -8,6 +8,7 @@ on:
- setup.cfg
- setup.py
- requirements/env_hdk.yml
- requirements/env_unidist_linux.yml
concurrency:
# Cancel other jobs in the same branch. We don't care whether CI passes
# on old commits.
@@ -28,12 +29,17 @@ jobs:
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/python-only
if: matrix.execution != 'hdk_on_native'
if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
- uses: ./.github/actions/mamba-env
with:
environment-file: requirements/env_hdk.yml
activate-environment: modin_on_hdk
if: matrix.execution == 'hdk_on_native'
- uses: ./.github/actions/mamba-env
with:
environment-file: requirements/env_unidist_linux.yml
activate-environment: modin_on_unidist
if: matrix.execution == 'pandas_on_unidist'
- name: Cache datasets
uses: actions/cache@v2
with:
@@ -43,29 +49,29 @@
# replace modin with . in the tutorial requirements file for `pandas_on_ray` and
# `pandas_on_dask` since we need Modin built from sources
- run: sed -i 's/modin/./g' examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt
if: matrix.execution != 'hdk_on_native'
if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
# install dependencies required for notebooks execution for `pandas_on_ray` and `pandas_on_dask`
# Override modin-spreadsheet install for now
- run: |
pip install -r examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt
pip install git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
if: matrix.execution != 'hdk_on_native'
# Build Modin from sources for `hdk_on_native`
if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
# Build Modin from sources for `hdk_on_native` and `pandas_on_unidist`
- run: pip install -e .
if: matrix.execution == 'hdk_on_native'
if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist'
# install test dependencies
# NOTE: If you are changing the set of packages installed here, make sure that
# the dev requirements match them.
- run: pip install pytest pytest-cov black flake8 flake8-print flake8-no-implicit-concat
if: matrix.execution != 'hdk_on_native'
if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
- run: pip install flake8-print jupyter nbformat nbconvert
if: matrix.execution == 'hdk_on_native'
if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist'
- run: pip list
if: matrix.execution != 'hdk_on_native'
if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
- run: |
conda info
conda list
if: matrix.execution == 'hdk_on_native'
if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist'
# setup kernel configuration for `pandas_on_unidist` execution with mpi backend
- run: python examples/tutorial/jupyter/execution/${{ matrix.execution }}/setup_kernel.py
if: matrix.execution == 'pandas_on_unidist'
15 changes: 12 additions & 3 deletions .github/workflows/ci-required.yml
@@ -66,8 +66,6 @@ jobs:
asv_bench/benchmarks/__init__.py asv_bench/benchmarks/io/__init__.py \
asv_bench/benchmarks/scalability/__init__.py \
modin/core/io \
modin/experimental/core/execution/ray/implementations/pandas_on_ray \
modin/experimental/core/execution/ray/implementations/pyarrow_on_ray \
modin/pandas/series.py \
modin/core/execution/python \
modin/pandas/dataframe.py \
@@ -91,7 +89,6 @@
python scripts/doc_checker.py modin/experimental/pandas/io.py \
modin/experimental/pandas/__init__.py
- run: python scripts/doc_checker.py modin/core/storage_formats/base
- run: python scripts/doc_checker.py modin/experimental/core/storage_formats/pyarrow
- run: python scripts/doc_checker.py modin/core/storage_formats/pandas
- run: |
python scripts/doc_checker.py \
@@ -108,3 +105,15 @@
- run: python scripts/doc_checker.py modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol
- run: python scripts/doc_checker.py modin/experimental/batch/pipeline.py
- run: python scripts/doc_checker.py modin/logging

lint-black-isort:
name: lint (black and isort)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/python-only
- run: pip install black>=24.1.0 isort>=5.12
# NOTE: keep the black command here in sync with the pre-commit hook in
# /contributing/pre-commit
- run: black --check --diff modin/ asv_bench/benchmarks scripts/doc_checker.py
- run: isort . --check-only
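The job added above replaces the old lint-black job in ci.yml (removed further down) and adds an isort check. A sketch of running the same checks locally, with the version pins taken from the job itself:

    pip install "black>=24.1.0" "isort>=5.12"
    # keep in sync with the pre-commit hook, per the comment in the job
    black --check --diff modin/ asv_bench/benchmarks scripts/doc_checker.py
    isort . --check-only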
103 changes: 42 additions & 61 deletions .github/workflows/ci.yml
@@ -26,17 +26,6 @@ env:
MODIN_GITHUB_CI: true

jobs:
lint-black:
name: lint (black)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/python-only
- run: pip install black
# NOTE: keep the black command here in sync with the pre-commit hook in
# /contributing/pre-commit
- run: black --check --diff modin/ asv_bench/benchmarks scripts/doc_checker.py

lint-mypy:
name: lint (mypy)
runs-on: ubuntu-latest
@@ -77,7 +66,7 @@ jobs:
- uses: ./.github/actions/upload-coverage

test-clean-install:
needs: [lint-flake8, lint-black]
needs: [lint-flake8]
strategy:
matrix:
os:
@@ -92,14 +81,20 @@
- uses: actions/checkout@v3
- uses: ./.github/actions/python-only
- run: python -m pip install -e ".[all]"
- name: Ensure all engines start up
- name: Ensure Ray and Dask engines start up
run: |
MODIN_ENGINE=dask python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
MODIN_ENGINE=ray python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
- name: Ensure MPI engine start up
# Install a working MPI implementation beforehand so mpi4py can link to it
run: |
sudo apt install libmpich-dev
python -m pip install -e ".[mpi]"
MODIN_ENGINE=unidist UNIDIST_BACKEND=mpi mpiexec -n 1 python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
if: matrix.os == 'ubuntu'
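The new step checks that the unidist MPI engine can start. Unlike Ray and Dask, it needs a system MPI implementation for mpi4py to link against and has to be launched through mpiexec. A hedged sketch of the same check on a local Ubuntu machine (the modin.config introspection is an extra assumption, not part of the CI step):

    sudo apt install libmpich-dev        # MPICH headers/libs so mpi4py can link to a real MPI
    python -m pip install -e ".[mpi]"    # installs the unidist/MPI extra
    MODIN_ENGINE=unidist UNIDIST_BACKEND=mpi mpiexec -n 1 python -c \
      "import modin.config as cfg; import modin.pandas as pd; print(cfg.Engine.get()); print(pd.DataFrame([1, 2, 3]))"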

test-internals:
needs: [lint-flake8, lint-black]
needs: [lint-flake8]
runs-on: ubuntu-latest
defaults:
run:
@@ -124,7 +119,7 @@
- uses: ./.github/actions/upload-coverage

test-defaults:
needs: [lint-flake8, lint-black]
needs: [lint-flake8]
runs-on: ubuntu-latest
defaults:
run:
@@ -155,7 +150,7 @@
- uses: ./.github/actions/upload-coverage

test-hdk:
needs: [lint-flake8, lint-black]
needs: [lint-flake8]
runs-on: ubuntu-latest
defaults:
run:
@@ -193,6 +188,7 @@
- run: python -m pytest modin/pandas/test/dataframe/test_binary.py
- run: python -m pytest modin/pandas/test/dataframe/test_reduce.py
- run: python -m pytest modin/pandas/test/dataframe/test_join_sort.py
- run: MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
- run: python -m pytest modin/pandas/test/test_general.py
- run: python -m pytest modin/pandas/test/dataframe/test_indexing.py
- run: python -m pytest modin/pandas/test/test_series.py
@@ -212,7 +208,7 @@

test-asv-benchmarks:
if: github.event_name == 'pull_request'
needs: [lint-flake8, lint-black]
needs: [lint-flake8]
runs-on: ubuntu-latest
defaults:
run:
@@ -256,11 +252,6 @@
MODIN_ASV_USE_IMPL=pandas asv run --quick --strict --show-stderr --launch-method=spawn \
-b ^benchmarks -b ^io | tee benchmarks.log
# HDK: ERR_OUT_OF_CPU_MEM: Not enough host memory to execute the query (MODIN#4270)
# just disable test for testing - it works well in a machine with more memory
sed -i 's/def time_groupby_agg_nunique(self, \*args, \*\*kwargs):/# def time_groupby_agg_nunique(self, *args, **kwargs):/g' benchmarks/hdk/benchmarks.py
sed -i 's/execute(self.df.groupby(by=self.groupby_columns).agg("nunique"))/# execute(self.df.groupby(by=self.groupby_columns).agg("nunique"))/g' benchmarks/hdk/benchmarks.py
# Otherwise, ASV considers that the environment has already been created, although ASV command is run for another config,
# which requires the creation of a completely new environment. This step will be required after removing the manual environment setup step.
rm -f -R .asv/env/
@@ -322,7 +313,7 @@
"${{ steps.filter.outputs.ray }}" "${{ steps.filter.outputs.dask }}" >> $GITHUB_OUTPUT
test-all-unidist:
needs: [lint-flake8, lint-black, execution-filter]
needs: [lint-flake8, execution-filter]
if: github.event_name == 'push' || needs.execution-filter.outputs.unidist == 'true'
runs-on: ubuntu-latest
defaults:
@@ -353,7 +344,7 @@
- uses: actions/checkout@v3
- uses: ./.github/actions/mamba-env
with:
environment-file: requirements/env_unidist.yml
environment-file: requirements/env_unidist_linux.yml
activate-environment: modin_on_unidist
python-version: ${{matrix.python-version}}
- name: Install HDF5
@@ -376,8 +367,18 @@
- run: ./.github/workflows/sql_server/set_up_sql_server.sh
# need an extra argument "genv" to set environment variables for mpiexec. We need
# these variables to test writing to the mock s3 filesystem.
- run: mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest modin/pandas/test/test_io.py --verbose
- run: mpiexec -n 1 python -m pytest modin/experimental/pandas/test/test_io_exp.py
- uses: nick-fields/retry@v2
# to avoid issues with non-stable `to_csv` tests for unidist on MPI backend.
# for details see: https://github.com/modin-project/modin/pull/6776
with:
timeout_minutes: 15
max_attempts: 3
command: |
conda run --no-capture-output -n modin_on_unidist mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key \
-genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest modin/pandas/test/test_io.py --verbose
- run: |
mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret \
python -m pytest modin/experimental/pandas/test/test_io_exp.py
- run: mpiexec -n 1 python -m pytest modin/experimental/sql/test/test_sql.py
- run: mpiexec -n 1 python -m pytest modin/test/interchange/dataframe_protocol/test_general.py
- run: mpiexec -n 1 python -m pytest modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
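Per the comment in the hunk above, the mock-S3 credentials are handed to mpiexec itself via -genv so that they are defined in the environment of the MPI processes running the tests; -genv NAME VALUE is an MPICH/Hydra mpiexec option that sets a variable for all launched ranks. The values are placeholders for the moto mock server, not real credentials:

    # -genv NAME VALUE: make NAME visible to the spawned MPI processes
    mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret \
      python -m pytest modin/pandas/test/test_io.py --verbose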
@@ -387,7 +388,7 @@
- uses: ./.github/actions/upload-coverage

test-all:
needs: [lint-flake8, lint-black, execution-filter]
needs: [lint-flake8, execution-filter]
strategy:
matrix:
os:
@@ -521,7 +522,7 @@
if: matrix.os == 'windows'

test-sanity:
needs: [lint-flake8, lint-black, execution-filter]
needs: [lint-flake8, execution-filter]
if: github.event_name == 'pull_request'
strategy:
matrix:
@@ -560,7 +561,7 @@
- uses: actions/checkout@v3
- uses: ./.github/actions/mamba-env
with:
environment-file: ${{ matrix.execution.name == 'unidist' && 'requirements/env_unidist.yml' || 'environment-dev.yml' }}
environment-file: ${{ matrix.os == 'ubuntu' && matrix.execution.name == 'unidist' && 'requirements/env_unidist_linux.yml' || matrix.os == 'windows' && matrix.execution.name == 'unidist' && 'requirements/env_unidist_win.yml' || 'environment-dev.yml' }}
activate-environment: ${{ matrix.execution.name == 'unidist' && 'modin_on_unidist' || 'modin' }}
python-version: ${{matrix.python-version}}
- name: Install HDF5
@@ -584,6 +585,7 @@
- run: MODIN_BENCHMARK_MODE=True ${{ matrix.execution.shell-ex }} modin/pandas/test/internals/test_benchmark_mode.py
- run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/pandas/test/internals/test_repartition.py
- run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/test_partition_api.py
- run: ${{ matrix.execution.shell-ex }} modin/pandas/api/extensions/test
- name: xgboost tests
run: |
# TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost
@@ -630,6 +632,15 @@
if: matrix.os != 'windows'
- run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/numpy/test
- run: ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/test_io.py --verbose
if: matrix.execution.name != 'unidist'
- uses: nick-fields/retry@v2
# to avoid issues with non-stable `to_csv` tests for unidist on MPI backend.
# for details see: https://github.com/modin-project/modin/pull/6776
with:
timeout_minutes: 15
max_attempts: 3
command: conda run --no-capture-output -n modin_on_unidist ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/test_io.py --verbose
if: matrix.execution.name == 'unidist'
- run: ${{ matrix.execution.shell-ex }} modin/experimental/pandas/test/test_io_exp.py
- run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/interchange/dataframe_protocol/test_general.py
- run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
@@ -644,7 +655,7 @@
- uses: ./.github/actions/upload-coverage

test-experimental:
needs: [lint-flake8, lint-black]
needs: [lint-flake8]
runs-on: ubuntu-latest
defaults:
run:
@@ -672,38 +683,8 @@
- run: python -m pytest modin/pandas/test/test_io.py --verbose
- uses: ./.github/actions/upload-coverage

test-pyarrow:
needs: [lint-flake8, lint-black]
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
strategy:
matrix:
python-version: ["3.9"]
env:
MODIN_STORAGE_FORMAT: pyarrow
MODIN_EXPERIMENTAL: "True"
name: test (pyarrow, python ${{matrix.python-version}})
services:
moto:
image: motoserver/moto
ports:
- 5000:5000
env:
AWS_ACCESS_KEY_ID: foobar_key
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/mamba-env
with:
environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
- run: sudo apt update && sudo apt install -y libhdf5-dev
- run: python -m pytest modin/pandas/test/test_io.py::TestCsv --verbose

test-spreadsheet:
needs: [lint-flake8, lint-black]
needs: [lint-flake8]
runs-on: ubuntu-latest
defaults:
run:
