Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#7203: Make sure modin works correctly with pandas, which uses pyarrow as a backend #7204

Merged
merged 59 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
8e46e4e
TEST-#7049: Add some sanity tests with pyarrow-backed pandas dataframes
anmyachev Mar 11, 2024
6814c6e
fixes
anmyachev Apr 2, 2024
e1dbc69
fix
anmyachev Apr 19, 2024
0241d7f
Merge branch 'main' of https://github.com/modin-project/modin into is…
anmyachev Apr 19, 2024
7b925a5
cleanup
anmyachev Apr 19, 2024
23003c5
fix comment
anmyachev Apr 19, 2024
cc2a5ab
skip some cases for HDK
anmyachev Apr 19, 2024
b710865
FEAT-#7203: Make sure modin works correctly with pandas, which uses p…
anmyachev Apr 19, 2024
310f12a
don't use numpy types directly
anmyachev Apr 19, 2024
cb90479
try another dtype_backend
anmyachev Apr 19, 2024
f9b2560
fix
anmyachev Apr 19, 2024
907fc9a
Merge branch 'main' of https://github.com/modin-project/modin into is…
anmyachev Apr 29, 2024
13e0d0e
Merge branch 'main' of https://github.com/modin-project/modin into is…
anmyachev May 2, 2024
ddcda4f
fixes
anmyachev May 2, 2024
afae62f
fix
anmyachev May 2, 2024
316cddb
fix
anmyachev May 2, 2024
639c2ed
fix pivot_table
anmyachev May 2, 2024
05f32e5
fix
anmyachev May 2, 2024
194cc68
find potential problem areas at the query_compiler level
anmyachev May 3, 2024
91f2607
some more places
anmyachev May 3, 2024
e24201f
add construct_dtype
anmyachev May 3, 2024
4dba613
fix
anmyachev May 6, 2024
5f56c4a
Merge branch 'main' of https://github.com/modin-project/modin into is…
anmyachev May 6, 2024
ea05389
fix
anmyachev May 6, 2024
005f480
fix
anmyachev May 6, 2024
0d34bea
fix
anmyachev May 6, 2024
aac7097
fix
anmyachev May 6, 2024
258c3b9
fix
anmyachev May 6, 2024
068f67d
fix
anmyachev May 6, 2024
45c1d1f
fix
anmyachev May 7, 2024
b114314
fix
anmyachev May 7, 2024
c597f7f
cleanup
anmyachev May 7, 2024
9562144
updates
anmyachev May 10, 2024
46df4ea
Merge branch 'main' of https://github.com/modin-project/modin into is…
anmyachev May 12, 2024
8b93500
fixes after merge
anmyachev May 12, 2024
ae861e3
new approach
anmyachev May 12, 2024
5b18cfd
cleanup
anmyachev May 12, 2024
9c6ce78
cleanup
anmyachev May 12, 2024
a04b0a2
cleanup
anmyachev May 13, 2024
60101b5
fix
anmyachev May 13, 2024
07f9927
Merge branch 'main' of https://github.com/modin-project/modin into is…
anmyachev May 13, 2024
c25a419
cleanup
anmyachev May 13, 2024
6e0c37e
fixes
anmyachev May 13, 2024
778be02
cleanup
anmyachev May 13, 2024
9d6d839
cleanup
anmyachev May 13, 2024
22f2db6
cleanup
anmyachev May 13, 2024
acc20b3
cleanup
anmyachev May 13, 2024
7a91fc4
cleanup
anmyachev May 13, 2024
d31e93f
revert changes in metadata/dtypes.py
anmyachev May 13, 2024
a4c5f91
Merge branch 'main' of https://github.com/modin-project/modin into is…
anmyachev May 13, 2024
b3179fc
fix tests
anmyachev May 13, 2024
18eec16
Merge branch 'main' of https://github.com/modin-project/modin into is…
anmyachev May 14, 2024
b3471ff
cleanup
anmyachev May 14, 2024
e538fb4
Merge branch 'main' of https://github.com/modin-project/modin into is…
anmyachev May 14, 2024
14b4dd3
fix
anmyachev May 14, 2024
7abfc42
Apply suggestions from code review
anmyachev May 14, 2024
30d4749
Apply suggestions from code review
anmyachev May 14, 2024
3213194
address review comments
anmyachev May 14, 2024
45acef9
expand comments
anmyachev May 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
43 changes: 33 additions & 10 deletions modin/core/dataframe/algebra/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@

"""Module houses builder class for Binary operator."""

from __future__ import annotations

import warnings
from typing import Optional
from typing import TYPE_CHECKING, Optional

import numpy as np
import pandas
Expand All @@ -24,9 +26,12 @@

from .operator import Operator

if TYPE_CHECKING:
from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler


def maybe_compute_dtypes_common_cast(
first,
first: PandasQueryCompiler,
second,
trigger_computations=False,
axis=0,
Expand Down Expand Up @@ -80,6 +85,7 @@ def maybe_compute_dtypes_common_cast(
# belong to the intersection, these will be NaN columns in the result
mismatch_columns = columns_first ^ columns_second
elif isinstance(second, dict):
# TODO: pyarrow backend
dtypes_second = {
key: pandas.api.types.pandas_dtype(type(value))
for key, value in second.items()
Expand All @@ -92,6 +98,7 @@ def maybe_compute_dtypes_common_cast(
mismatch_columns = columns_first.difference(columns_second)
else:
if isinstance(second, (list, tuple)):
# TODO: pyarrow backend
second_dtypes_list = (
[pandas.api.types.pandas_dtype(type(value)) for value in second]
if axis == 1
Expand All @@ -100,6 +107,7 @@ def maybe_compute_dtypes_common_cast(
else [np.array(second).dtype] * len(dtypes_first)
)
elif is_scalar(second) or isinstance(second, np.ndarray):
# TODO: pyarrow backend
try:
dtype = getattr(second, "dtype", None) or pandas.api.types.pandas_dtype(
type(second)
Expand All @@ -125,6 +133,7 @@ def maybe_compute_dtypes_common_cast(
mismatch_columns = []

# If at least one column doesn't match, the result for the non-matching column would be NaN.
# TODO: pyarrow backend
nan_dtype = pandas.api.types.pandas_dtype(type(np.nan))
dtypes = None
if func is not None:
Expand Down Expand Up @@ -168,7 +177,7 @@ def maybe_compute_dtypes_common_cast(


def maybe_build_dtypes_series(
first, second, dtype, trigger_computations=False
first: PandasQueryCompiler, second, dtype, trigger_computations=False
) -> Optional[pandas.Series]:
"""
Build a ``pandas.Series`` describing dtypes of the result of a binary operation.
Expand All @@ -179,7 +188,7 @@ def maybe_build_dtypes_series(
First operand for which the binary operation would be performed later.
second : PandasQueryCompiler, list-like or scalar
Second operand for which the binary operation would be performed later.
dtype : np.dtype
dtype : pandas supported dtype
Dtype of the result.
trigger_computations : bool, default: False
Whether to trigger computation of the lazy metadata for `first` and `second`.
Expand Down Expand Up @@ -217,8 +226,13 @@ def maybe_build_dtypes_series(


def try_compute_new_dtypes(
first, second, infer_dtypes=None, result_dtype=None, axis=0, func=None
):
first: PandasQueryCompiler,
second,
infer_dtypes=None,
result_dtype=None,
axis=0,
func=None,
) -> Optional[pandas.Series]:
"""
Precompute resulting dtypes of the binary operation if possible.

Expand All @@ -235,7 +249,7 @@ def try_compute_new_dtypes(
infer_dtypes : {"common_cast", "try_sample", "bool", None}, default: None
How dtypes should be inferred (see ``Binary.register`` doc for more info).
result_dtype : np.dtype, optional
NumPy dtype of the result. If not specified it will be inferred from the `infer_dtypes` parameter.
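NumPy dtype of the result. If not specified it will be inferred from the `infer_dtypes` parameter. TODO: confirm whether only NumPy dtypes are accepted here or whether other pandas-supported (e.g. pyarrow-backed) dtypes are valid too.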
axis : int, default: 0
Axis to perform the binary operation along.
func : callable(pandas.DataFrame, pandas.DataFrame) -> pandas.DataFrame, optional
Expand All @@ -250,10 +264,19 @@ def try_compute_new_dtypes(

try:
if infer_dtypes == "bool" or is_bool_dtype(result_dtype):
# FIXME: https://github.com/modin-project/modin/issues/7203
# can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data
# dataframe can contain types of different backends at the same time, for example:
# (Pdb) (pandas.DataFrame([[1,2,3], [4,5,6]]).astype({0: "int64[pyarrow]"}) > 4).dtypes
# 0 bool[pyarrow]
# 1 bool
# 2 bool
# dtype: object
backend = ""
if any("pyarrow" in str(x) for x in first.dtypes) or any(
"pyarrow" in str(x) for x in second.dtypes
):
backend = "[pyarrow]"
dtypes = maybe_build_dtypes_series(
first, second, dtype=pandas.api.types.pandas_dtype(bool)
first, second, dtype=pandas.api.types.pandas_dtype(f"bool{backend}")
)
elif infer_dtypes == "common_cast":
dtypes = maybe_compute_dtypes_common_cast(
Expand Down
9 changes: 8 additions & 1 deletion modin/core/dataframe/algebra/map.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,15 @@

"""Module houses builder class for Map operator."""

from __future__ import annotations

from typing import TYPE_CHECKING

from .operator import Operator

if TYPE_CHECKING:
from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler


class Map(Operator):
"""Builder class for Map operator."""
Expand All @@ -41,7 +48,7 @@ def register(cls, function, *call_args, **call_kwds):
Function that takes query compiler and executes map function.
"""

def caller(query_compiler, *args, **kwargs):
def caller(query_compiler: PandasQueryCompiler, *args, **kwargs):
"""Execute Map function against passed query compiler."""
shape_hint = call_kwds.pop("shape_hint", None) or query_compiler._shape_hint
return query_compiler.__constructor__(
Expand Down
2 changes: 1 addition & 1 deletion modin/core/dataframe/algebra/tree_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def register(
axis : int, optional
Specifies axis to apply function along.
compute_dtypes : callable(pandas.Series, *func_args, **func_kwargs) -> np.dtype, optional
Callable for computing dtypes.
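Callable for computing dtypes. TODO: confirm whether the callable may return only NumPy dtypes or any pandas-supported (e.g. pyarrow-backed) dtype.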

Returns
-------
Expand Down
3 changes: 1 addition & 2 deletions modin/core/dataframe/base/dataframe/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from enum import Enum
from typing import Dict, List, Sequence, Tuple, cast

import numpy as np
import pandas
from pandas._typing import IndexLabel
from pandas.api.types import is_scalar
Expand Down Expand Up @@ -170,7 +169,7 @@ def is_trivial_index(index: pandas.Index) -> bool:
return True
if isinstance(index, pandas.RangeIndex):
return index.start == 0 and index.step == 1
if not (isinstance(index, pandas.Index) and index.dtype == np.int64):
if not (isinstance(index, pandas.Index) and index.dtype == "int64"):
return False
return (
index.is_monotonic_increasing
Expand Down