extend MatchVariables functionality to match dtypes as well (#669)

* #645 implemented match_dtypes functionality * #645 created and updated tests * fixed long line * fixed style * refactored and added verbosity * removed string coercion * adding specific tests * fixed style * updated user guide * used fixture * renamed functions * added space * updated assert to test specific dict values * retrigger checks * retrigger checks * #645 implemented match_dtypes functionality * #645 created and updated tests * fixed long line * fixed style * refactored and added verbosity * removed string coercion * adding specific tests * fixed style * updated user guide * used fixture * renamed functions * added space * updated assert to test specific dict values * reorganize code outputs in user guide * minor cosmetic update to docstrings * add change to changelog * add assert for dataframe checks --------- Co-authored-by: Soledad Galli <[email protected]>
feature-engine · Sep 13, 2023 · 77ea405 · 77ea405
1 parent bd4c8fc
commit 77ea405
Show file tree

Hide file tree

Showing 4 changed files with 231 additions and 11 deletions.
diff --git a/docs/user_guide/preprocessing/MatchVariables.rst b/docs/user_guide/preprocessing/MatchVariables.rst
@@ -145,8 +145,54 @@ And now, we transform the data with :class:`MatchVariables()`:
 Now, the transformer simultaneously added the missing columns with NA as values and
 removed the additional columns from the resulting dataset.
 
+
+However, if we look closely, the dtype of `sex` in the transformed test set does not match
+the train set. This could cause issues if later transformations rely on the correct dtypes.
+
+.. code:: python
+
+ train.sex.dtype
+
+.. code:: python
+
+ dtype('O')
+
+.. code:: python
+
+ test_tt.sex.dtype
+
+.. code:: python
+
+ dtype('float64')
+
+Set the `match_dtypes` parameter to `True` in order to align the dtypes as well.
+
+.. code:: python
+
+ match_cols_and_dtypes = MatchVariables(missing_values="ignore", match_dtypes=True)
+ match_cols_and_dtypes.fit(train)
+
+ test_ttt = match_cols_and_dtypes.transform(test_t)
+
+.. code:: python
+
+ The following variables are added to the DataFrame: ['sex', 'age']
+ The following variables are dropped from the DataFrame: ['var_b', 'var_a']
+ The sex dtype is changing from float64 to object
+
+Now the dtype matches.
+
+.. code:: python
+
+ test_ttt.sex.dtype
+
+.. code:: python
+
+ dtype('O')
+
 By default, :class:`MatchVariables()` will print out messages indicating which variables
-were added or removed. We can switch off the messages through the parameter `verbose`.
+were added or removed, and which dtypes were changed. We can switch off the messages through the parameter `verbose`.
+
 
 When to use the transformer
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/docs/whats_new/v_160.rst b/docs/whats_new/v_160.rst
@@ -9,11 +9,13 @@ Deployed: xx September 2023
 Contributors
 ~~~~~~~~~~~~
 
+- `Kyle Gilde <https://github.com/kylegilde>`_
 - `Soledad Galli <https://github.com/solegalli>`_
 
 New functionality
 ~~~~~~~~~~~~~~~~~
 
+- `MatchVariables()` can now also match the **dtypes** of the variables (`Kyle Gilde <https://github.com/kylegilde>`_)
 - `DatetimeFeatures()` and `DatetimeSubtraction()` can now specify the format of the datetime variables (`Soledad Galli <https://github.com/solegalli>`_)
 
 Bug fixes

diff --git a/feature_engine/preprocessing/match_columns.py b/feature_engine/preprocessing/match_columns.py
@@ -1,4 +1,4 @@
-from typing import List, Union
+from typing import List, Dict, Union
 
 import numpy as np
 import pandas as pd
@@ -69,6 +69,10 @@ class MatchVariables(BaseEstimator, TransformerMixin, GetFeatureNamesOutMixin):
  contain missing values. If 'ignore', missing data will be ignored when learning
  parameters or performing the transformation.
 
+ match_dtypes: bool, default=False
+ If True, the variables in the data to transform will be cast to the dtypes
+ observed in the train set.
+
  verbose: bool, default=True
  If True, the transformer will print out the names of the variables that are
  added and / or removed from the dataset.
@@ -81,6 +85,10 @@ class MatchVariables(BaseEstimator, TransformerMixin, GetFeatureNamesOutMixin):
  n_features_in_:
  The number of features in the train set used in fit.
 
+ dtype_dict_:
+ Dictionary with the dtype of each variable in the train set, as learned during
+ fit. This attribute only exists if `match_dtypes` is set to `True`.
+
  Methods
  -------
  fit:
@@ -150,15 +158,21 @@ def __init__(
  self,
  fill_value: Union[str, int, float] = np.nan,
  missing_values: str = "raise",
+ match_dtypes: bool = False,
  verbose: bool = True,
  ):
-
  if missing_values not in ["raise", "ignore"]:
  raise ValueError(
  "missing_values takes only values 'raise' or 'ignore'."
  f"Got '{missing_values} instead."
  )
 
+ if not isinstance(match_dtypes, bool):
+ raise ValueError(
+ "match_dtypes takes only booleans True and False. "
+ f"Got '{match_dtypes} instead."
+ )
+
  if not isinstance(verbose, bool):
  raise ValueError(
  "verbose takes only booleans True and False." f"Got '{verbose} instead."
@@ -173,6 +187,7 @@ def __init__(
 
  self.fill_value = fill_value
  self.missing_values = missing_values
+ self.match_dtypes = match_dtypes
  self.verbose = verbose
 
  def fit(self, X: pd.DataFrame, y: pd.Series = None):
@@ -198,6 +213,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):
 
  self.n_features_in_ = X.shape[1]
 
+ if self.match_dtypes:
+ self.dtype_dict_: Dict = X.dtypes.to_dict()
+
  return self
 
  def transform(self, X: pd.DataFrame) -> pd.DataFrame:
@@ -243,6 +261,23 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
 
  X = X.reindex(columns=self.feature_names_in_, fill_value=self.fill_value)
 
+ if self.match_dtypes:
+ _current_dtypes = X.dtypes.to_dict()
+ _columns_to_update = {
+ column: new_dtype
+ for column, new_dtype in self.dtype_dict_.items()
+ if new_dtype != _current_dtypes[column]
+ }
+
+ if self.verbose:
+ for column, new_dtype in _columns_to_update.items():
+ print(
+ f"The {column} dtype is changing from ",
+ f"{_current_dtypes[column]} to {new_dtype}",
+ )
+
+ X = X.astype(_columns_to_update)
+
  return X
 
  # for the check_estimator tests

diff --git a/tests/test_preprocessing/test_match_columns.py b/tests/test_preprocessing/test_match_columns.py
@@ -13,9 +13,10 @@
 ]
 
 _params_allowed = [
- ([0, 1], "ignore", True),
- ("nan", "hola", True),
- ("nan", "ignore", "hallo"),
+ ([0, 1], "ignore", True, True),
+ ("nan", "hola", True, True),
+ ("nan", "ignore", True, "hallo"),
+ ("nan", "ignore", "hallo", True),
 ]
 
 
@@ -59,6 +60,7 @@ def test_drop_and_add_columns(
  assert match_columns.fill_value == fill_value
  assert match_columns.verbose is True
  assert match_columns.missing_values == "ignore"
+ assert match_columns.match_dtypes is False
  # test fit attrs
  assert list(match_columns.feature_names_in_) == list(train.columns)
  assert match_columns.n_features_in_ == 6
@@ -72,7 +74,6 @@ def test_drop_and_add_columns(
 def test_columns_addition_when_more_columns_in_train_than_test(
  fill_value, expected_studies, expected_age, df_vartypes, df_na
 ):
-
  train = df_na.copy()
  test = df_vartypes.copy()
  test = test.drop("Age", axis=1) # to add more than one column
@@ -103,6 +104,7 @@ def test_columns_addition_when_more_columns_in_train_than_test(
  assert match_columns.fill_value == fill_value
  assert match_columns.verbose is True
  assert match_columns.missing_values == "ignore"
+ assert match_columns.match_dtypes is False
  # test fit attrs
  assert list(match_columns.feature_names_in_) == list(train.columns)
  assert match_columns.n_features_in_ == 6
@@ -126,23 +128,158 @@ def test_drop_columns_when_more_columns_in_test_than_train(df_vartypes, df_na):
  assert match_columns.fill_value is np.nan
  assert match_columns.verbose is True
  assert match_columns.missing_values == "ignore"
+ assert match_columns.match_dtypes is False
  # test fit attrs
  assert list(match_columns.feature_names_in_) == list(train.columns)
  assert match_columns.n_features_in_ == 4
  # test transform output
  pd.testing.assert_frame_equal(expected_result, transformed_df)
 
 
-@pytest.mark.parametrize("fill_value, missing_values, verbose", _params_allowed)
-def test_error_if_param_values_not_allowed(fill_value, missing_values, verbose):
+def test_match_dtypes_string_to_numbers(df_vartypes):
+ train = df_vartypes.copy().select_dtypes("number")
+ test = train.copy().astype("string")
+
+ match_columns = MatchVariables(match_dtypes=True)
+ match_columns.fit(train)
+
+ transformed_df = match_columns.transform(test)
+
+ # test init params
+ assert match_columns.match_dtypes is True
+ # test fit attrs
+ assert match_columns.dtype_dict_ == {
+ "Age": np.dtype("int64"),
+ "Marks": np.dtype("float64"),
+ }
+
+ # test transform output
+ pd.testing.assert_series_equal(train.dtypes, transformed_df.dtypes)
+ pd.testing.assert_frame_equal(transformed_df, train)
+
+
+def test_match_dtypes_numbers_to_string(df_vartypes):
+ train = df_vartypes.copy().select_dtypes("number").astype("string")
+ test = df_vartypes.copy().select_dtypes("number")
+
+ match_columns = MatchVariables(match_dtypes=True)
+ match_columns.fit(train)
+
+ transformed_df = match_columns.transform(test)
+
+ # test init params
+ assert match_columns.match_dtypes is True
+ # test fit attrs
+ assert isinstance(match_columns.dtype_dict_, dict)
+ # test transform output
+ pd.testing.assert_series_equal(train.dtypes, transformed_df.dtypes)
+ pd.testing.assert_frame_equal(transformed_df, train)
+
+
+def test_match_dtypes_string_to_datetime(df_vartypes):
+ train = df_vartypes.copy().loc[:, ["dob"]]
+ test = train.copy().astype("string")
+
+ match_columns = MatchVariables(match_dtypes=True, verbose=False)
+ match_columns.fit(train)
+
+ transformed_df = match_columns.transform(test)
+
+ # test init params
+ assert match_columns.match_dtypes is True
+ assert match_columns.verbose is False
+ # test fit attrs
+ assert match_columns.dtype_dict_ == {"dob": np.dtype("<M8[ns]")}
+ # test transform output
+ pd.testing.assert_series_equal(train.dtypes, transformed_df.dtypes)
+ pd.testing.assert_frame_equal(transformed_df, train)
+
+
+def test_match_dtypes_datetime_to_string(df_vartypes):
+ train = df_vartypes.copy().loc[:, ["dob"]].astype("string")
+ test = df_vartypes.copy().loc[:, ["dob"]]
+
+ match_columns = MatchVariables(match_dtypes=True, verbose=False)
+ match_columns.fit(train)
+
+ transformed_df = match_columns.transform(test)
+
+ # test init params
+ assert match_columns.match_dtypes is True
+ assert match_columns.verbose is False
+ # test fit attrs
+ assert isinstance(match_columns.dtype_dict_, dict)
+ # test transform output
+ pd.testing.assert_series_equal(train.dtypes, transformed_df.dtypes)
+ pd.testing.assert_frame_equal(transformed_df, train)
+
+
+def test_match_dtypes_missing_category(df_vartypes):
+ train = df_vartypes.copy().loc[:, ["Name", "City"]].astype("category")
+ test = df_vartypes.copy().loc[:, ["Name", "City"]].iloc[:-1].astype("category")
+
+ match_columns = MatchVariables(match_dtypes=True, verbose=True)
+ match_columns.fit(train)
+
+ transformed_df = match_columns.transform(test)
+
+ # test init params
+ assert match_columns.match_dtypes is True
+ assert match_columns.verbose is True
+ # test fit attrs
+ assert match_columns.dtype_dict_ == {
+ "Name": pd.CategoricalDtype(
+ categories=["jack", "krish", "nick", "tom"], ordered=False
+ ),
+ "City": pd.CategoricalDtype(
+ categories=["Bristol", "Liverpool", "London", "Manchester"], ordered=False
+ ),
+ }
+ # test transform output
+ pd.testing.assert_series_equal(train.dtypes, transformed_df.dtypes)
+ pd.testing.assert_frame_equal(transformed_df, train.iloc[:-1])
+
+
+def test_match_dtypes_extra_category(df_vartypes):
+ train = df_vartypes.copy().loc[:, ["Name", "City"]].iloc[:-1].astype("category")
+ test = df_vartypes.copy().loc[:, ["Name", "City"]].astype("category")
+
+ match_columns = MatchVariables(match_dtypes=True, verbose=True)
+ match_columns.fit(train)
+
+ transformed_df = match_columns.transform(test)
+
+ # test init params
+ assert match_columns.match_dtypes is True
+ assert match_columns.verbose is True
+ # test fit attrs
+ assert match_columns.dtype_dict_ == {
+ "Name": pd.CategoricalDtype(categories=["krish", "nick", "tom"], ordered=False),
+ "City": pd.CategoricalDtype(
+ categories=["Liverpool", "London", "Manchester"], ordered=False
+ ),
+ }
+
+ # test transform output
+ pd.testing.assert_series_equal(train.dtypes, transformed_df.dtypes)
+
+
+@pytest.mark.parametrize(
+ "fill_value, missing_values, match_dtypes, verbose", _params_allowed
+)
+def test_error_if_param_values_not_allowed(
+ fill_value, missing_values, match_dtypes, verbose
+):
  with pytest.raises(ValueError):
  MatchVariables(
- fill_value=fill_value, missing_values=missing_values, verbose=verbose
+ fill_value=fill_value,
+ missing_values=missing_values,
+ match_dtypes=match_dtypes,
+ verbose=verbose,
  )
 
 
 def test_verbose_print_out(capfd, df_vartypes, df_na):
-
  match_columns = MatchVariables(missing_values="ignore", verbose=True)
 
  train = df_na.copy()