Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH add possibility to have a callable for verbose_feature_names_out of ColumnTransformer #28934

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
44 changes: 39 additions & 5 deletions sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import warnings
from collections import Counter, UserList
from itertools import chain
from functools import partial
from numbers import Integral, Real

import numpy as np
Expand Down Expand Up @@ -133,16 +134,29 @@ class ColumnTransformer(TransformerMixin, _BaseComposition):
If True, the time elapsed while fitting each transformer will be
printed as it is completed.

verbose_feature_names_out : bool, default=True
verbose_feature_names_out : bool | str | Callable[[str, str], str], default=True
If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix
all feature names with the name of the transformer that generated that
feature.
feature. It is equivalent to setting
`verbose_feature_names_out="{transformer_name}__{feature_name}"`.
If False, :meth:`ColumnTransformer.get_feature_names_out` will not
prefix any feature names and will error if feature names are not
unique.
If Callable[[str, str], str], :meth:`ColumnTransformer.get_feature_names_out`
will call it once per output feature to build the new feature name. The
first argument of the callable is the transformer name and the
second argument is the feature name. The returned string will be the
new feature name.
If str, it must be a format string. It will be formatted with
:meth:`str.format` using two named fields, `{transformer_name}` and
`{feature_name}`, e.g. `"{feature_name}-{transformer_name}"`.

.. versionadded:: 1.0

.. versionchanged:: 1.X
`verbose_feature_names_out` can be a callable or a string to be formatted.


force_int_remainder_cols : bool, default=True
Force the columns of the last entry of `transformers_`, which
corresponds to the "remainder" transformer, to always be stored as
Expand Down Expand Up @@ -283,7 +297,7 @@ class ColumnTransformer(TransformerMixin, _BaseComposition):
"n_jobs": [Integral, None],
"transformer_weights": [dict, None],
"verbose": ["verbose"],
"verbose_feature_names_out": ["boolean"],
"verbose_feature_names_out": ["boolean", str, callable],
"force_int_remainder_cols": ["boolean"],
}

Expand Down Expand Up @@ -655,11 +669,21 @@ def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out):
feature_names_out : ndarray of shape (n_features,), dtype=str
Transformed feature names.
"""
if self.verbose_feature_names_out:
feature_names_out_callable = None
if callable(self.verbose_feature_names_out):
feature_names_out_callable = self.verbose_feature_names_out
elif isinstance(self.verbose_feature_names_out, str):
feature_names_out_callable = partial(
_feature_names_out, str_format=self.verbose_feature_names_out
)
elif self.verbose_feature_names_out is True:
feature_names_out_callable = _feature_names_out

if feature_names_out_callable is not None:
# Prefix the feature names out with the transformers name
names = list(
chain.from_iterable(
(f"{name}__{i}" for i in feature_names_out)
(feature_names_out_callable(name, i) for i in feature_names_out)
MarcBresson marked this conversation as resolved.
Show resolved Hide resolved
for name, feature_names_out in transformer_with_feature_names_out
)
)
Expand Down Expand Up @@ -1652,3 +1676,13 @@ def _with_dtype_warning_enabled_set_to(warning_enabled, transformers):
)
result.append((name, trans, columns))
return result


def _feature_names_out(
transformer_name: str, feature_name: str, str_format: str | None = None
) -> str:
if str_format is None:
str_format = "{transformer_name}__{feature_name}"
return str_format.format(
transformer_name=transformer_name, feature_name=feature_name
)
66 changes: 66 additions & 0 deletions sklearn/compose/tests/test_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1865,6 +1865,72 @@ def test_verbose_feature_names_out_true(transformers, remainder, expected_names)
assert_array_equal(names, expected_names)


def _feature_names_out_callable_name_clash(trans_name: str, feat_name: str):
return f"{trans_name[:2]}++{feat_name}"


def _feature_names_out_callable_upper(trans_name: str, feat_name: str):
return f"{trans_name.upper()}={feat_name.upper()}"


@pytest.mark.parametrize(
    "transformers, remainder, verbose_feature_names_out, expected_names",
    [
        # Callable that truncates the transformer name: the two transformers
        # share the "by" prefix, so the output contains "by++d" twice.
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "passthrough",
            _feature_names_out_callable_name_clash,
            ["by++d", "by++c", "by++d", "re++a", "re++b"],
        ),
        # Format string that puts the feature name before the transformer name.
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", ["d"]),
            ],
            "drop",
            "{feature_name}-{transformer_name}",
            ["d-bycol1", "c-bycol1", "d-bycol2"],
        ),
        # Callable that rewrites both parts, including the remainder columns.
        (
            [
                ("bycol1", TransWithNames(), ["d", "c"]),
                ("bycol2", "passthrough", slice("c", "d")),
            ],
            "passthrough",
            _feature_names_out_callable_upper,
            [
                "BYCOL1=D",
                "BYCOL1=C",
                "BYCOL2=C",
                "BYCOL2=D",
                "REMAINDER=A",
                "REMAINDER=B",
            ],
        ),
    ],
)
def test_verbose_feature_names_out_callable_or_str(
    transformers, remainder, verbose_feature_names_out, expected_names
):
    """Check feature_names_out when verbose_feature_names_out is a callable or str."""
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"])
    ct = ColumnTransformer(
        transformers,
        remainder=remainder,
        verbose_feature_names_out=verbose_feature_names_out,
    )
    ct.fit(df)

    names = ct.get_feature_names_out()
    assert isinstance(names, np.ndarray)
    assert names.dtype == object
    assert_array_equal(names, expected_names)


@pytest.mark.parametrize(
"transformers, remainder, expected_names",
[
Expand Down