"""Classes for running machine learning pipelines."""
from collections import defaultdict
from typing import Any, Callable, Dict, Optional, Tuple, Union
import numpy as np
import pandas as pd
from pydantic import BaseModel
from sklearn.metrics import accuracy_score # type: ignore
class PipelineRunner(BaseModel, arbitrary_types_allowed=True):
    """
    Class for optimizing and comparing machine learning pipelines.

    Attributes
    ----------
    data : Tuple[
        Union[pd.DataFrame, np.ndarray],
        Union[pd.DataFrame, np.ndarray],
        Union[pd.Series, np.ndarray],
        Union[pd.Series, np.ndarray]
    ]
        The data to use for training and validation. Expected to be a tuple of
        (X_train, X_valid, y_train, y_valid).
    pipelines : Dict[str, Any]
        The pipelines to run, keyed by pipeline name. Each value is indexed
        with the keys ``"model"`` and ``"params"`` by ``run_pipelines``.
    """

    data: Tuple[
        Union[pd.DataFrame, np.ndarray],
        Union[pd.DataFrame, np.ndarray],
        Union[pd.Series, np.ndarray],
        Union[pd.Series, np.ndarray],
    ]
    pipelines: Dict[str, Any]

    # Internal state, filled in by set_optimizer() and run_pipelines().
    # NOTE(review): assigning to these underscore-prefixed names from methods
    # relies on pydantic treating them as private attributes (believed to be
    # the pydantic v2 behaviour) - confirm against the pinned pydantic version.
    _optimizer: Optional[Callable] = None
    _optimizer_kwargs: Dict[str, Any] = {}
    _best_params: Optional[Dict[str, Any]] = None
    _best_pipeline: Optional[Dict[str, Any]] = None
    _best_criterion: Optional[Dict[str, Any]] = None

    def set_optimizer(self, optimizer: Callable, kwargs: Optional[Dict[str, Any]] = None) -> None:
        """
        Set the optimizer and optimizer related kwargs to use for hyperparameter tuning.

        Parameters
        ----------
        optimizer : Callable
            A scikit-learn optimizer to use for hyperparameter tuning.
            ``run_pipelines`` calls it as
            ``optimizer(model, params, **kwargs)`` and then uses ``fit``,
            ``predict``, ``best_params_`` and ``best_estimator_`` on the result.
        kwargs : Optional[Dict[str, Any]]
            Extra keyword arguments forwarded to ``optimizer``. ``None``
            (the default) means no extra keyword arguments.
        """
        self._optimizer = optimizer
        # Store a fresh dict: a missing value becomes an empty mapping (so it
        # can always be unpacked with **), and later changes to the caller's
        # dict do not leak into the runner.
        self._optimizer_kwargs = {} if kwargs is None else dict(kwargs)

    @property
    def optimizer(self) -> Tuple[Optional[Callable], Dict[str, Any]]:
        """Return the optimizer and optimizer related kwargs."""
        return self._optimizer, self._optimizer_kwargs

    @property
    def best_params(self) -> Optional[Dict[str, Any]]:
        """Return the best parameters for each pipeline."""
        return self._best_params

    @property
    def best_pipeline(self) -> Optional[Dict[str, Any]]:
        """Return the model trained with the best set of hyperparameters for each pipeline."""
        return self._best_pipeline

    @property
    def best_criterion(self) -> Optional[Dict[str, Any]]:
        """Return the best value for optimization criterion for each pipeline."""
        return self._best_criterion

    def run_pipelines(self) -> None:
        """
        Optimize and run all pipelines.

        For every entry in ``pipelines`` the optimizer is built from the
        entry's ``"model"`` and ``"params"``, fitted on the training split, and
        scored with ``accuracy_score`` on both the training and validation
        splits. Results are stored per pipeline name and exposed through
        ``best_criterion`` (keys ``"training"`` and ``"validation"``),
        ``best_params`` and ``best_pipeline``. Results of a previous run are
        discarded.

        Raises
        ------
        ValueError
            If no optimizer has been set with ``set_optimizer``.
        """
        if self._optimizer is None:
            raise ValueError("Optimizer not set. Use set_optimizer() to set an optimizer.")

        # Name the four splits instead of indexing self.data by position.
        X_train, X_valid, y_train, y_valid = self.data

        self._best_criterion = defaultdict(dict)
        self._best_params = {}
        self._best_pipeline = {}

        for name, pipeline in self.pipelines.items():
            search = self._optimizer(pipeline["model"], pipeline["params"], **self._optimizer_kwargs)
            search.fit(X_train, y_train)

            y_train_pred = search.predict(X_train)
            self._best_criterion[name]["training"] = accuracy_score(y_train, y_train_pred)

            y_valid_pred = search.predict(X_valid)
            self._best_criterion[name]["validation"] = accuracy_score(y_valid, y_valid_pred)

            self._best_params[name] = search.best_params_
            self._best_pipeline[name] = search.best_estimator_