[Bug] Fix random halt problems on traditional pipelines (#147)

franchuterivera · web-flow · commit 68fc77f6f1c6 · 2021-03-29T20:05:52.000+02:00
* [feat] Fix random halt problems on traditional pipelines

* Documentation update

* Fix flake

* Flake due to kernel pca errors
diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
@@ -54,6 +54,23 @@
 
 
 class MyTraditionalTabularClassificationPipeline(BaseEstimator):
+    """
+    A wrapper class that holds a pipeline for traditional classification.
+    Estimators like CatBoost, and Random Forest are considered traditional machine
+    learning models and are fitted before neural architecture search.
+
+    This class is an interface to fit a pipeline containing a traditional machine
+    learning model, and is the final object that is stored for inference.
+
+    Attributes:
+        dataset_properties (Dict[str, Any]):
+            A dictionary containing dataset specific information
+        random_state (Optional[Union[int, np.random.RandomState]]):
+            Object that contains a seed and allows for reproducible results
+        init_params  (Optional[Dict]):
+            An optional dictionary that is passed to the pipeline's steps. It complies
+            a similar function as the kwargs
+    """
     def __init__(self, config: str,
                  dataset_properties: Dict[str, Any],
                  random_state: Optional[Union[int, np.random.RandomState]] = None,
@@ -98,6 +115,21 @@ def get_default_pipeline_options() -> Dict[str, Any]:
 
 
 class DummyClassificationPipeline(DummyClassifier):
+    """
+    A wrapper class that holds a pipeline for dummy classification.
+
+    A wrapper over DummyClassifier of scikit learn. This estimator is considered the
+    worst performing model. In case of failure, at least this model will be fitted.
+
+    Attributes:
+        dataset_properties (Dict[str, Any]):
+            A dictionary containing dataset specific information
+        random_state (Optional[Union[int, np.random.RandomState]]):
+            Object that contains a seed and allows for reproducible results
+        init_params  (Optional[Dict]):
+            An optional dictionary that is passed to the pipeline's steps. It complies
+            a similar function as the kwargs
+    """
     def __init__(self, config: Configuration,
                  random_state: Optional[Union[int, np.random.RandomState]] = None,
                  init_params: Optional[Dict] = None
@@ -148,6 +180,21 @@ def get_default_pipeline_options() -> Dict[str, Any]:
 
 
 class DummyRegressionPipeline(DummyRegressor):
+    """
+    A wrapper class that holds a pipeline for dummy regression.
+
+    A wrapper over DummyRegressor of scikit learn. This estimator is considered the
+    worst performing model. In case of failure, at least this model will be fitted.
+
+    Attributes:
+        dataset_properties (Dict[str, Any]):
+            A dictionary containing dataset specific information
+        random_state (Optional[Union[int, np.random.RandomState]]):
+            Object that contains a seed and allows for reproducible results
+        init_params  (Optional[Dict]):
+            An optional dictionary that is passed to the pipeline's steps. It complies
+            a similar function as the kwargs
+    """
     def __init__(self, config: Configuration,
                  random_state: Optional[Union[int, np.random.RandomState]] = None,
                  init_params: Optional[Dict] = None) -> None:
@@ -351,7 +398,7 @@ def _get_pipeline(self) -> BaseEstimator:
         if isinstance(self.configuration, int):
             pipeline = self.pipeline_class(config=self.configuration,
                                            random_state=np.random.RandomState(self.seed),
-                                           init_params=self.fit_dictionary)
+                                           init_params=self._init_params)
         elif isinstance(self.configuration, Configuration):
             pipeline = self.pipeline_class(config=self.configuration,
                                            dataset_properties=self.dataset_properties,
@@ -364,7 +411,7 @@ def _get_pipeline(self) -> BaseEstimator:
             pipeline = self.pipeline_class(config=self.configuration,
                                            dataset_properties=self.dataset_properties,
                                            random_state=np.random.RandomState(self.seed),
-                                           init_params=self.fit_dictionary)
+                                           init_params=self._init_params)
         else:
             raise ValueError("Invalid configuration entered")
         return pipeline
diff --git a/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py b/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py
@@ -1,3 +1,5 @@
+import flaky
+
 import numpy as np
 
 import pytest
@@ -51,6 +53,7 @@ def test_feature_preprocessor(self, fit_dictionary_tabular, preprocessor):
         transformed = column_transformer.transform(X['X_train'])
         assert isinstance(transformed, np.ndarray)
 
+    @flaky.flaky(max_runs=3)
     def test_pipeline_fit_include(self, fit_dictionary_tabular, preprocessor):
         """
         This test ensures that a tabular classification