diff --git a/requirements_dev.txt b/requirements_dev.txt
index f70e35ff..78e41dff 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -1,4 +1,6 @@
 pytest==5.3.5
 black==19.3b0
+click==8.0.2
+ipython
 pytest-cov
 coveralls
\ No newline at end of file
diff --git a/supervised/automl.py b/supervised/automl.py
index 50143ad2..7c62feb1 100644
--- a/supervised/automl.py
+++ b/supervised/automl.py
@@ -71,6 +71,7 @@ def __init__(
         n_jobs: int = -1,
         verbose: int = 1,
         random_state: int = 1234,
+        chosen_fold: Optional[int] = None,
     ):
         """
         Initialize `AutoML` object.
@@ -342,6 +343,7 @@ def __init__(
         self.optuna_verbose = optuna_verbose
         self.n_jobs = n_jobs
         self.random_state = random_state
+        self.chosen_fold = chosen_fold
 
     def fit(
         self,
diff --git a/supervised/base_automl.py b/supervised/base_automl.py
index a892c485..b45a6731 100644
--- a/supervised/base_automl.py
+++ b/supervised/base_automl.py
@@ -39,6 +39,7 @@
 from supervised.utils.metric import Metric
 from supervised.utils.metric import UserDefinedEvalMetric
 from supervised.utils.automl_plots import AutoMLPlots
+
 # disable EDA
 # from supervised.preprocessing.eda import EDA
 from supervised.preprocessing.preprocessing_utils import PreprocessingUtils
@@ -116,6 +117,7 @@ def __init__(self):
         self._optuna_init_params = {}
         self._optuna_verbose = True
         self._n_jobs = -1
+        self._chosen_fold = None
 
     def _get_tuner_params(
         self, start_random_models, hill_climbing_steps, top_models_to_improve
@@ -181,6 +183,7 @@ def load(self, path):
             )
             self._n_jobs = params.get("n_jobs", self._n_jobs)
             self._random_state = params.get("random_state", self._random_state)
+            self._chosen_fold = params.get("chosen_fold", self._chosen_fold)
             stacked_models = params.get("stacked")
 
             best_model_name = params.get("best_model")
@@ -368,10 +371,7 @@ def train_model(self, params):
         )
 
         # create model framework
-        mf = ModelFramework(
-            params,
-            callbacks=[early_stop, total_time_constraint],
-        )
+        mf = ModelFramework(params, callbacks=[early_stop, total_time_constraint])
 
         # start training
         logger.info(
@@ -930,6 +930,7 @@ def _fit(self, X, y, sample_weight=None, cv=None):
             self._optuna_verbose = self._get_optuna_verbose()
             self._n_jobs = self._get_n_jobs()
             self._random_state = self._get_random_state()
+            self._chosen_fold = self._get_chosen_fold()
 
             self._adjust_validation = False
             self._apply_constraints()
@@ -1159,10 +1160,7 @@ def select_and_save_best(self, show_warnings=False):
             if m.is_valid() and m.is_fast_enough(self._max_single_prediction_time)
         ]
         if model_list:
-            self._best_model = min(
-                model_list,
-                key=lambda x: x.get_final_loss(),
-            )
+            self._best_model = min(model_list, key=lambda x: x.get_final_loss())
         # if none selected please select again and warn the user
         if (
             len(self._models)
@@ -1211,6 +1209,7 @@ def select_and_save_best(self, show_warnings=False):
                 "random_state": self._random_state,
                 "saved": self._model_subpaths,
                 "fit_level": self._fit_level,
+                "chosen_fold": self._chosen_fold,
             }
             if self._best_model is not None:
                 params["best_model"] = self._best_model.get_name()
@@ -1327,11 +1326,11 @@ def _base_predict(self, X, model=None):
 
             if model.get_type() == "Ensemble":
                 # Ensemble is using both original and stacked data
-                predictions = model.predict(X, X_stacked)
+                predictions = model.predict(X, X_stacked, chosen_fold=self._chosen_fold)
             else:
-                predictions = model.predict(X_stacked)
+                predictions = model.predict(X_stacked, chosen_fold=self._chosen_fold)
         else:
-            predictions = model.predict(X)
+            predictions = model.predict(X, chosen_fold=self._chosen_fold)
 
         if self._ml_task == BINARY_CLASSIFICATION:
             # need to predict the label based on predictions and threshold
@@ -1790,6 +1789,11 @@ def _get_random_state(self):
         """Gets the current random_state"""
         self._validate_random_state()
         return deepcopy(self.random_state)
+
+    def _get_chosen_fold(self):
+        """Gets the current chosen_fold after validating it"""
+        self._validate_chosen_fold()
+        return deepcopy(self.chosen_fold)
 
     def _validate_mode(self):
         """Validates mode parameter"""
@@ -2031,6 +2035,12 @@ def _validate_n_jobs(self):
     def _validate_random_state(self):
         """Validates random_state parameter"""
         check_positive_integer(self.random_state, "random_state")
+
+    def _validate_chosen_fold(self):
+        """Validates chosen_fold parameter (None or an integer)"""
+        if self.chosen_fold is None:
+            return
+        check_integer(self.chosen_fold, "chosen_fold")
 
     def to_json(self):
         if self._best_model is None:
diff --git a/supervised/ensemble.py b/supervised/ensemble.py
index 8cadceb3..59059a8a 100644
--- a/supervised/ensemble.py
+++ b/supervised/ensemble.py
@@ -290,7 +290,7 @@ def fit(self, oofs, y, sample_weight=None):
 
         self.train_time = time.time() - start_time
 
-    def predict(self, X, X_stacked=None):
+    def predict(self, X, X_stacked=None, chosen_fold=None):
         logger.debug(
             "Ensemble.predict with {} models".format(len(self.selected_models))
         )
@@ -303,9 +303,11 @@ def predict(self, X, X_stacked=None):
             total_repeat += repeat
 
             if model._is_stacked:
-                y_predicted_from_model = model.predict(X_stacked)
+                y_predicted_from_model = model.predict(
+                    X_stacked, chosen_fold=chosen_fold
+                )
             else:
-                y_predicted_from_model = model.predict(X)
+                y_predicted_from_model = model.predict(X, chosen_fold=chosen_fold)
 
             prediction_cols = []
             if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
diff --git a/supervised/model_framework.py b/supervised/model_framework.py
index 10b54ea9..9831abea 100644
--- a/supervised/model_framework.py
+++ b/supervised/model_framework.py
@@ -416,26 +416,40 @@ def is_fast_enough(self, max_single_prediction_time):
 
         return self._single_prediction_time < max_single_prediction_time
 
-    def predict(self, X):
+    def predict(self, X, chosen_fold=None):
         logger.debug("ModelFramework.predict")
 
         if self.learners is None or len(self.learners) == 0:
             raise Exception("Learnes are not initialized")
         # run predict on all learners and return the average
         y_predicted = None  # np.zeros((X.shape[0],))
-        for ind, learner in enumerate(self.learners):
+
+        # If no specific fold is chosen, return the average prediction across all folds
+        if chosen_fold is None:
+            for ind, learner in enumerate(self.learners):
+                # preprocessing goes here
+                X_data, _, _ = self.preprocessings[ind].transform(X.copy(), None)
+                y_p = learner.predict(X_data)
+                y_p = self.preprocessings[ind].inverse_scale_target(y_p)
+
+                y_predicted = y_p if y_predicted is None else y_predicted + y_p
+
+            y_predicted_average = y_predicted / float(len(self.learners))
+
+            y_predicted_final = self.preprocessings[0].prepare_target_labels(
+                y_predicted_average
+            )
+        else:
+            ind = chosen_fold
+            learner = self.learners[ind]
             # preprocessing goes here
             X_data, _, _ = self.preprocessings[ind].transform(X.copy(), None)
             y_p = learner.predict(X_data)
-            y_p = self.preprocessings[ind].inverse_scale_target(y_p)
-
-            y_predicted = y_p if y_predicted is None else y_predicted + y_p
-
-        y_predicted_average = y_predicted / float(len(self.learners))
+            y_predicted = self.preprocessings[ind].inverse_scale_target(y_p)
 
-        y_predicted_final = self.preprocessings[0].prepare_target_labels(
-            y_predicted_average
-        )
+            y_predicted_final = self.preprocessings[0].prepare_target_labels(
+                y_predicted
+            )
 
         return y_predicted_final
 
diff --git a/tests/tests_automl/test_chosen_fold.py b/tests/tests_automl/test_chosen_fold.py
new file mode 100644
index 00000000..f15fae89
--- /dev/null
+++ b/tests/tests_automl/test_chosen_fold.py
@@ -0,0 +1,56 @@
+import os
+import unittest
+import tempfile
+import json
+import numpy as np
+import pandas as pd
+import shutil
+
+from supervised import AutoML
+from supervised.exceptions import AutoMLException
+
+
+class AutoMLChosenFoldTest(unittest.TestCase):
+
+    automl_dir = "automl_testing"
+
+    def tearDown(self):
+        shutil.rmtree(self.automl_dir, ignore_errors=True)
+
+    def test_chosen_fold(self):
+
+        X = np.random.uniform(size=(60, 2))
+        y = np.random.randint(0, 2, size=(60,))
+
+        automl = AutoML(
+            results_path=self.automl_dir,
+            model_time_limit=10,
+            algorithms=["Xgboost"],
+            mode="Compete",
+            explain_level=0,
+            validation_strategy={
+                "validation_type": "kfold",
+                "k_folds": 5,
+                "shuffle": True,
+                "random_seed": 123,
+            },
+            start_random_models=1,
+            hill_climbing_steps=0,
+            top_models_to_improve=0,
+            kmeans_features=False,
+            golden_features=False,
+            features_selection=False,
+            boost_on_errors=False,
+            chosen_fold=-1,
+        )
+        automl.fit(X, y)
+        probabilities = automl.predict_proba(X)
+        labels = automl.predict(X)
+
+        # Predicting from a single chosen fold must still cover every input row.
+        self.assertEqual(len(probabilities), len(X))
+        self.assertEqual(len(labels), len(X))
+        # Only Xgboost was requested, so no other algorithm directory may appear.
+        self.assertFalse(
+            os.path.exists(os.path.join(self.automl_dir, "1_DecisionTree"))
+        )