Source code for doeren.ml.pipeline

"""Classes for running machine learning pipelines."""

from collections import defaultdict
from typing import Any, Callable, Dict, Optional, Tuple, Union

import numpy as np
import pandas as pd
from pydantic import BaseModel
from sklearn.metrics import accuracy_score  # type: ignore


class PipelineRunner(BaseModel, arbitrary_types_allowed=True):
    """
    Class for optimizing and comparing machine learning pipelines.

    Attributes
    ----------
    data : Tuple[
        Union[pd.DataFrame, np.ndarray],
        Union[pd.DataFrame, np.ndarray],
        Union[pd.Series, np.ndarray],
        Union[pd.Series, np.ndarray]
    ]
        The data to use for training and validation. Expected to be a tuple
        of (X_train, X_valid, y_train, y_valid).
    pipelines : Dict[str, Any]
        The pipelines to run, keyed by pipeline name. Each value is indexed
        with ``"model"`` and ``"params"`` when the pipelines are run.
    """

    data: Tuple[
        Union[pd.DataFrame, np.ndarray],
        Union[pd.DataFrame, np.ndarray],
        Union[pd.Series, np.ndarray],
        Union[pd.Series, np.ndarray],
    ]
    pipelines: Dict[str, Any]

    # NOTE(review): these underscore-prefixed attributes are assigned on the
    # instance later on; this relies on pydantic treating them as private
    # attributes rather than validated fields -- confirm against the pinned
    # pydantic version.
    _optimizer: Optional[Callable] = None
    _optimizer_kwargs: Dict[str, Any] = {}
    _best_params: Optional[Dict[str, Any]] = None
    _best_pipeline: Optional[Dict[str, Any]] = None
    _best_criterion: Optional[Dict[str, Any]] = None

    def set_optimizer(
        self, optimizer: Callable, kwargs: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Set the optimizer and optimizer related kwargs to use for hyperparameter tuning.

        Parameters
        ----------
        optimizer : Callable
            Factory for the hyperparameter search. It is called as
            ``optimizer(model, params, **kwargs)`` and the returned object
            must provide ``fit``, ``predict``, ``best_params_`` and
            ``best_estimator_`` (the interface of a scikit-learn search
            class).
        kwargs : Optional[Dict[str, Any]]
            Extra keyword arguments forwarded to ``optimizer``. Omitting the
            argument (or passing ``None``) means no extra keyword arguments.
        """
        self._optimizer = optimizer
        # The parameter used to be declared as ``kwargs=Dict[str, Any]``,
        # which made the typing construct itself the default value and broke
        # the ``**`` expansion in run_pipelines(). Always store a real dict;
        # copying also shields the runner from later caller-side mutation.
        self._optimizer_kwargs = {} if kwargs is None else dict(kwargs)

    @property
    def optimizer(self) -> Tuple[Optional[Callable], Dict[str, Any]]:
        """Return the optimizer and optimizer related kwargs."""
        return self._optimizer, self._optimizer_kwargs

    @property
    def best_params(self) -> Optional[Dict[str, Any]]:
        """Return the best parameters for each pipeline (None until run_pipelines() is called)."""
        return self._best_params

    @property
    def best_pipeline(self) -> Optional[Dict[str, Any]]:
        """Return the model trained with the best set of hyperparameters for each pipeline."""
        return self._best_pipeline

    @property
    def best_criterion(self) -> Optional[Dict[str, Any]]:
        """
        Return the best value for optimization criterion for each pipeline.

        After run_pipelines() this maps each pipeline name to a dict with the
        keys ``"training"`` and ``"validation"``, holding the accuracy score
        on the respective split.
        """
        return self._best_criterion

    def run_pipelines(self) -> None:
        """
        Optimize and run all pipelines.

        For every entry in ``pipelines`` the optimizer is built from the
        entry's ``"model"`` and ``"params"``, fitted on the training split,
        and scored with accuracy on both the training and validation splits.
        Results are exposed through ``best_criterion``, ``best_params`` and
        ``best_pipeline``; results of a previous run are discarded.

        Raises
        ------
        ValueError
            If no optimizer has been set with set_optimizer().
        """
        if self._optimizer is None:
            raise ValueError("Optimizer not set. Use set_optimizer() to set an optimizer.")

        x_train, x_valid, y_train, y_valid = self.data

        # Reset all result containers so a rerun never mixes in stale entries.
        self._best_criterion = defaultdict(dict)
        self._best_params = {}
        self._best_pipeline = {}

        for name, pipeline in self.pipelines.items():
            optimizer = self._optimizer(
                pipeline["model"], pipeline["params"], **self._optimizer_kwargs
            )
            optimizer.fit(x_train, y_train)

            y_train_pred = optimizer.predict(x_train)
            self._best_criterion[name]["training"] = accuracy_score(y_train, y_train_pred)

            y_valid_pred = optimizer.predict(x_valid)
            self._best_criterion[name]["validation"] = accuracy_score(y_valid, y_valid_pred)

            self._best_params[name] = optimizer.best_params_
            self._best_pipeline[name] = optimizer.best_estimator_