Allow to use prefitted SelectFromModel in ColumnTransformer #28928

Open
NTSER opened this issue May 1, 2024 · 6 comments
@NTSER

NTSER commented May 1, 2024

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel

iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = iris.target

feature_selection_cols = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)']
clf = LogisticRegression(max_iter=1000)
clf.fit(X[feature_selection_cols], y)
ct = ColumnTransformer(
    [(
        'SelectFromModel',
        SelectFromModel(clf, prefit=True, max_features=2),
        feature_selection_cols,
    )],
    remainder='passthrough',
)
ct.fit(X, y)

yields:

NotFittedError                            Traceback (most recent call last)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_selection\_from_model.py:349, in SelectFromModel.fit(self, X, y, **fit_params)
    348 try:
--> 349     check_is_fitted(self.estimator)
    350 except NotFittedError as exc:

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:1461, in check_is_fitted(estimator, attributes, msg, all_or_any)
   1460 if not _is_fitted(estimator, attributes, all_or_any):
-> 1461     raise NotFittedError(msg % {"name": type(estimator).__name__})

NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

The above exception was the direct cause of the following exception:

NotFittedError                            Traceback (most recent call last)
Cell In[1], line 22
     13 clf.fit(X[feature_selection_cols], y)
     14 ct = ColumnTransformer(
     15     [(
     16         'SelectFromModel',
   (...)
     20     remainder='passthrough',
     21 )
---> 22 ct.fit(X, y)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\compose\_column_transformer.py:717, in ColumnTransformer.fit(self, X, y)
    699 """Fit all transformers using X.
    700 
    701 Parameters
   (...)
    713     This estimator.
    714 """
    715 # we use fit_transform to make sure to set sparse_output_ (for which we
    716 # need the transformed data) to have consistent output type in predict
--> 717 self.fit_transform(X, y=y)
    718 return self

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_set_output.py:157, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
    155 @wraps(f)
    156 def wrapped(self, X, *args, **kwargs):
--> 157     data_to_wrap = f(self, X, *args, **kwargs)
    158     if isinstance(data_to_wrap, tuple):
    159         # only wrap the first output for cross decomposition
    160         return_tuple = (
    161             _wrap_data_with_container(method, data_to_wrap[0], X, self),
    162             *data_to_wrap[1:],
    163         )

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:1152, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1145     estimator._validate_params()
   1147 with config_context(
   1148     skip_parameter_validation=(
   1149         prefer_skip_nested_validation or global_skip_validation
   1150     )
   1151 ):
-> 1152     return fit_method(estimator, *args, **kwargs)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\compose\_column_transformer.py:754, in ColumnTransformer.fit_transform(self, X, y)
    751 self._validate_column_callables(X)
    752 self._validate_remainder(X)
--> 754 result = self._fit_transform(X, y, _fit_transform_one)
    756 if not result:
    757     self._update_fitted_transformers([])

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\compose\_column_transformer.py:681, in ColumnTransformer._fit_transform(self, X, y, func, fitted, column_as_strings)
    675 transformers = list(
    676     self._iter(
    677         fitted=fitted, replace_strings=True, column_as_strings=column_as_strings
    678     )
    679 )
    680 try:
--> 681     return Parallel(n_jobs=self.n_jobs)(
    682         delayed(func)(
    683             transformer=clone(trans) if not fitted else trans,
    684             X=_safe_indexing(X, column, axis=1),
    685             y=y,
    686             weight=weight,
    687             message_clsname="ColumnTransformer",
    688             message=self._log_message(name, idx, len(transformers)),
    689         )
    690         for idx, (name, trans, column, weight) in enumerate(transformers, 1)
    691     )
    692 except ValueError as e:
    693     if "Expected 2D array, got 1D array instead" in str(e):

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\parallel.py:65, in Parallel.__call__(self, iterable)
     60 config = get_config()
     61 iterable_with_config = (
     62     (_with_config(delayed_func, config), args, kwargs)
     63     for delayed_func, args, kwargs in iterable
     64 )
---> 65 return super().__call__(iterable_with_config)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\parallel.py:1863, in Parallel.__call__(self, iterable)
   1861     output = self._get_sequential_output(iterable)
   1862     next(output)
-> 1863     return output if self.return_generator else list(output)
   1865 # Let's create an ID that uniquely identifies the current call. If the
   1866 # call is interrupted early and that the same instance is immediately
   1867 # re-used, this id will be used to prevent workers that were
   1868 # concurrently finalizing a task from the previous call to run the
   1869 # callback.
   1870 with self._lock:

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\parallel.py:1792, in Parallel._get_sequential_output(self, iterable)
   1790 self.n_dispatched_batches += 1
   1791 self.n_dispatched_tasks += 1
-> 1792 res = func(*args, **kwargs)
   1793 self.n_completed_tasks += 1
   1794 self.print_progress()

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\parallel.py:127, in _FuncWrapper.__call__(self, *args, **kwargs)
    125     config = {}
    126 with config_context(**config):
--> 127     return self.function(*args, **kwargs)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py:957, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    955 with _print_elapsed_time(message_clsname, message):
    956     if hasattr(transformer, "fit_transform"):
--> 957         res = transformer.fit_transform(X, y, **fit_params)
    958     else:
    959         res = transformer.fit(X, y, **fit_params).transform(X)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_set_output.py:157, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
    155 @wraps(f)
    156 def wrapped(self, X, *args, **kwargs):
--> 157     data_to_wrap = f(self, X, *args, **kwargs)
    158     if isinstance(data_to_wrap, tuple):
    159         # only wrap the first output for cross decomposition
    160         return_tuple = (
    161             _wrap_data_with_container(method, data_to_wrap[0], X, self),
    162             *data_to_wrap[1:],
    163         )

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:919, in TransformerMixin.fit_transform(self, X, y, **fit_params)
    916     return self.fit(X, **fit_params).transform(X)
    917 else:
    918     # fit method of arity 2 (supervised transformation)
--> 919     return self.fit(X, y, **fit_params).transform(X)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:1152, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1145     estimator._validate_params()
   1147 with config_context(
   1148     skip_parameter_validation=(
   1149         prefer_skip_nested_validation or global_skip_validation
   1150     )
   1151 ):
-> 1152     return fit_method(estimator, *args, **kwargs)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_selection\_from_model.py:351, in SelectFromModel.fit(self, X, y, **fit_params)
    349         check_is_fitted(self.estimator)
    350     except NotFittedError as exc:
--> 351         raise NotFittedError(
    352             "When `prefit=True`, `estimator` is expected to be a fitted "
    353             "estimator."
    354         ) from exc
    355     self.estimator_ = deepcopy(self.estimator)
    356 else:

NotFittedError: When `prefit=True`, `estimator` is expected to be a fitted estimator.
@NTSER NTSER added the Bug and Needs Triage labels May 1, 2024
@glemaitre
Member

glemaitre commented May 1, 2024

ColumnTransformer clones its transformers before fitting, so the selector, and thus the fitted model inside it, gets cloned.
This is essentially a duplicate of #8370.
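This cloning behavior can be observed directly with `sklearn.base.clone`. A minimal sketch, reusing the iris setup from the report above:

```python
# Why the prefit selector fails inside ColumnTransformer:
# sklearn.base.clone rebuilds an estimator from its constructor parameters,
# so the nested LogisticRegression loses its fitted state (coef_, classes_, ...).
from sklearn.base import clone
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000).fit(X, y)

selector = SelectFromModel(clf, prefit=True, max_features=2)
cloned = clone(selector)  # roughly what ColumnTransformer does before fitting

print(hasattr(selector.estimator, "coef_"))  # True: the original keeps the fit
print(hasattr(cloned.estimator, "coef_"))    # False: the clone dropped it
```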

@ogrisel
Member

ogrisel commented May 3, 2024

I checked and indeed this is a discrepancy with the behavior of pipelines, which do not attempt to clone the passed estimators prior to fitting:

>>> import pandas as pd
... from sklearn.datasets import load_iris
... from sklearn.linear_model import LogisticRegression
... from sklearn.pipeline import make_pipeline
... from sklearn.feature_selection import SelectFromModel
... 
... iris = load_iris()
... X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
... y = iris.target
... 
... clf = LogisticRegression(max_iter=1000)
... clf.fit(X, y)
... 
... pipe = make_pipeline(SelectFromModel(clf, prefit=True, max_features=2)).fit(X, y)

@ogrisel
Member

ogrisel commented May 3, 2024

One option that would not imply modifying the cloning behavior of ColumnTransformer and other meta-estimators would be to define a custom SelectFromModel.__sklearn_clone__ method that skips cloning the underlying base estimator when prefit=True is passed to SelectFromModel.

We could probably do a similar thing for any meta-estimator that can be configured to operate on fitted base estimators.

@ogrisel ogrisel added the API label and removed the Needs Triage label May 3, 2024
@ogrisel
Member

ogrisel commented May 3, 2024

@NTSER as a short term workaround for this problem, feel free to use your own custom wrapper for freezing the base estimator as explained in:

https://scikit-learn.org/stable/developers/develop.html#cloning

@NTSER NTSER closed this as completed May 4, 2024
@NTSER
Author

NTSER commented May 4, 2024

Thanks. Using

class SelectFromModelWrapper(SelectFromModel):
    def __sklearn_clone__(self):
        return self

instead of SelectFromModel solves the problem.

I accidentally closed the issue. Not sure if I have to close or keep it open.

@NTSER NTSER reopened this May 4, 2024
@glemaitre glemaitre added the Enhancement label and removed the Bug label May 18, 2024
@glemaitre glemaitre changed the title Using prefitted SelectFromModel in ColumnTransformer Allow to use prefitted SelectFromModel in ColumnTransformer May 18, 2024
@glemaitre
Member

You can keep the issue open. I just changed the labels because I would not consider it a bug but rather a limitation in our API that we should improve.
