Skip to content

Automatic optimization of machine learning pipelines for tabular data

Notifications You must be signed in to change notification settings

Sara-Iftikhar/AutoTab

Repository files navigation

Documentation Status PyPI version DOI

autotab

Optimizes the pipeline for any machine learning model using a hierarchical optimization method for tabular datasets.

Installation

This package can be installed from PyPI with pip using the following command

pip install autotab

or from the GitHub repository for the latest code

python -m pip install git+https://github.com/Sara-Iftikhar/autotab.git

or using the setup file; go to the folder where this repository was downloaded

python setup.py install

Example

Click here to view the badge, or click here to open the example in Colab

from ai4water.datasets import busan_beach
from skopt.plots import plot_objective
from autotab import OptimizePipeline

# Load the example dataset; the last column is treated as the output/target
# and the remaining columns as inputs (see the splits below).
data = busan_beach()
input_features = data.columns.tolist()[0:-1]
output_features = data.columns.tolist()[-1:]

# Candidate feature transformations the optimizer may apply to inputs/outputs.
transformations = ['minmax', 'zscore', 'log', 'log10', 'sqrt', 'robust', 'quantile', 'none', 'scale']

# Configure hierarchical (parent/child) pipeline optimization: the parent
# loop searches transformations and model choices with Bayesian optimization,
# and each child loop tunes the chosen model's hyperparameters randomly.
# Reuses the input_features/output_features lists built above instead of
# recomputing data.columns.tolist() three times.
pl = OptimizePipeline(
    inputs_to_transform=input_features,
    parent_iterations=400,
    child_iterations=20,
    parent_algorithm='bayes',
    child_algorithm="random",
    cv_parent_hpo=True,            # cross-validate during parent HPO
    eval_metric='mse',             # objective the optimizer minimizes
    monitor=['r2', 'nse'],         # additional metrics tracked per iteration
    input_transformations=transformations,
    output_transformations=transformations,
    # Candidate regressors the parent optimizer may select between.
    models=[
        "LinearRegression",
        "LassoLars",
        "Lasso",
        "RandomForestRegressor",
        "HistGradientBoostingRegressor",
        "CatBoostRegressor",
        "XGBRegressor",
        "LGBMRegressor",
        "GradientBoostingRegressor",
        "ExtraTreeRegressor",
        "ExtraTreesRegressor",
    ],
    input_features=input_features,
    output_features=output_features,
    cross_validator={"KFold": {"n_splits": 5}},
    split_random=True,
)

get version information

pl._version_info()  # report version information (note: private API, may change)

perform optimization

# Run the optimization; process_results=False presumably skips per-model
# result post-processing — confirm against OptimizePipeline.fit docs.
results = pl.fit(data=data, process_results=False)

print optimization report

print(pl.report())  # textual summary of the optimization run

show convergence plot

# Diagnostic plots for the completed optimization run.
pl.optimizer_._plot_convergence(save=False)
pl.optimizer_._plot_parallel_coords(figsize=(16, 8), save=False)
_ = pl.optimizer_._plot_distributions(save=False)
pl.optimizer_.plot_importance(save=False)
pl.optimizer_.plot_importance(save=False, plot_type="bar")
_ = plot_objective(results)
# NOTE(review): the two calls below previously used ``pl.optimizer`` while
# every other call uses ``pl.optimizer_``; unified on ``optimizer_`` for
# consistency — confirm the attribute name against the AutoTab API.
pl.optimizer_._plot_evaluations(save=False)
pl.optimizer_._plot_edf(save=False)
# Per-model comparison plots.
pl.dumbbell_plot(data=data)
pl.dumbbell_plot(data=data, metric_name='r2')
pl.taylor_plot(data=data, save=False, figsize=(6,6))
pl.compare_models()
pl.compare_models(plot_type="bar_chart")
pl.compare_models("r2", plot_type="bar_chart")

get best pipeline with respect to evaluation metric

pl.get_best_pipeline_by_metric('r2')  # best pipeline ranked by the R^2 metric

build, fit, and evaluate the best pipeline

# Rebuild the overall best pipeline from scratch, then score it with the
# default metric followed by NSE and R^2.
best_model = pl.bfe_best_model_from_scratch(data=data)
pl.evaluate_model(best_model, data=data)
for metric in ('nse', 'r2'):
    pl.evaluate_model(best_model, data=data, metric_name=metric)

get best pipeline with respect to $R^2$

# Select, rebuild, and evaluate the pipeline that is best according to R^2.
pl.get_best_pipeline_by_metric('r2')
model = pl.bfe_best_model_from_scratch(data=data, metric_name='r2')
pl.evaluate_model(model, data=data, metric_name='r2')
# Fixed message typo: "are save in" -> "are saved in".
print(f"all results are saved in {pl.path} folder")