Skip to content

Commit

Permalink
add presets
Browse files Browse the repository at this point in the history
  • Loading branch information
ardunn committed Dec 27, 2018
1 parent 79388f3 commit 9e809c8
Show file tree
Hide file tree
Showing 9 changed files with 108 additions and 111 deletions.
3 changes: 1 addition & 2 deletions automatminer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from automatminer.preprocessing import DataCleaner, FeatureReducer
from automatminer.automl import TPOTAdaptor
from automatminer.pipeline import MatPipe
from automatminer.configs import debug_config, default_config, \
production_config, fast_config
from automatminer.presets import get_preset_config

__author__ = 'Alex Dunn, Qi Wang, Alex Ganose, Daniel Dopp, Anubhav Jain'
__author_email__ = 'ardunn@lbl.gov'
Expand Down
22 changes: 9 additions & 13 deletions automatminer/automl/tests/test_tpot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
from sklearn.metrics import r2_score, f1_score

from automatminer.configs import debug_config
from automatminer.presets import get_preset_config
from automatminer.automl.adaptors import TPOTAdaptor
from automatminer.utils.package_tools import AutomatminerError

Expand All @@ -18,47 +18,43 @@ def setUp(self):
df = pd.read_csv(basedir + "/mini_automl_df.csv", index_col=0)
self.train_df = df.copy(deep=True).iloc[:450]
self.test_df = df.copy(deep=True).iloc[451:]
self.tpot = get_preset_config("debug")["learner"]

def test_regression(self):
target_key = "K_VRH"
tpot = TPOTAdaptor(**debug_config)
tpot.fit(self.train_df, target_key)
test_w_predictions = tpot.predict(self.test_df, target_key)
self.tpot.fit(self.train_df, target_key)
test_w_predictions = self.tpot.predict(self.test_df, target_key)
y_true = test_w_predictions[target_key]
y_test = test_w_predictions[target_key + " predicted"]
self.assertTrue(r2_score(y_true, y_test) > 0.75)

def test_classification(self):
tpot = TPOTAdaptor(**debug_config)
max_kvrh = 50
classifier_key = "K_VRH > {}?".format(max_kvrh)
train_df = self.train_df.rename(columns={"K_VRH": classifier_key})
test_df = self.test_df.rename(columns={"K_VRH": classifier_key})
train_df[classifier_key] = train_df[classifier_key] > max_kvrh
test_df[classifier_key] = test_df[classifier_key] > max_kvrh
tpot.fit(train_df, classifier_key)
print(tpot.mode)
test_w_predictions = tpot.predict(test_df, classifier_key)
self.tpot.fit(train_df, classifier_key)
test_w_predictions = self.tpot.predict(test_df, classifier_key)
y_true = test_w_predictions[classifier_key]
y_test = test_w_predictions[classifier_key + " predicted"]
self.assertTrue(f1_score(y_true, y_test) > 0.75)

def test_training_only(self):
tpot = TPOTAdaptor(**debug_config)
target_key = "K_VRH"
train_w_predictions = tpot.fit_transform(self.train_df, target_key)
train_w_predictions = self.tpot.fit_transform(self.train_df, target_key)
y_true = train_w_predictions[target_key]
y_test = train_w_predictions[target_key + " predicted"]
self.assertTrue(r2_score(y_true, y_test) > 0.85)

def test_feature_mismatching(self):
tpot = TPOTAdaptor(**debug_config)
target_key = "K_VRH"
df1 = self.train_df
df2 = self.test_df.rename(columns={'mean X': "some other feature"})
tpot.fit(df1, target_key)
self.tpot.fit(df1, target_key)
with self.assertRaises(AutomatminerError):
tpot.predict(df2, target_key)
self.tpot.predict(df2, target_key)


if __name__ == '__main__':
Expand Down
36 changes: 0 additions & 36 deletions automatminer/configs.py

This file was deleted.

34 changes: 7 additions & 27 deletions automatminer/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import numpy as np

from automatminer.configs import default_config
from automatminer.base import LoggableMixin, DataframeTransformer
from automatminer.utils.ml_tools import regression_or_classification
from automatminer.utils.package_tools import check_fitted, set_fitted, \
Expand Down Expand Up @@ -76,15 +75,14 @@ class MatPipe(DataframeTransformer, LoggableMixin):
fit before being used to predict data.
"""

def __init__(self, logger=True, log_level=None, autofeaturizer=None,
cleaner=None, reducer=None, learner=None):
def __init__(self, autofeaturizer, cleaner, reducer, learner, logger=True,
log_level=None):

self._logger = self.get_logger(logger, level=log_level)
self.autofeaturizer = autofeaturizer if autofeaturizer else \
default_config['autofeaturizer']
self.cleaner = cleaner if cleaner else default_config["cleaner"]
self.reducer = reducer if reducer else default_config["reducer"]
self.learner = learner if learner else default_config["learner"]
self.autofeaturizer = autofeaturizer
self.cleaner = cleaner
self.reducer = reducer
self.learner = learner

self.autofeaturizer._logger = self.get_logger(logger)
self.cleaner._logger = self.get_logger(logger)
Expand Down Expand Up @@ -313,22 +311,4 @@ def load(cls, filename, logger=True):
pipe.logger.warning("Only use this model to make predictions (do not "
"retrain!). Backend was serialzed as only the top "
"model, not the full automl backend. ")
return pipe


if __name__ == "__main__":
pass

# from sklearn.metrics import mean_squared_error
# from matminer.datasets.dataset_retrieval import load_dataset
#
# hugedf = load_dataset("elastic_tensor_2015").rename(
# columns={"formula": "composition"})[["composition", "K_VRH"]]
#
# validation_ix = [1, 2, 3, 4, 5, 7, 12]
# df = hugedf.iloc[:100]
# df2 = hugedf.iloc[101:150]
# target = "K_VRH"
#
# mp = MatPipe(**debug_config)
# df = mp.benchmark(df, target, test_spec=0.25)
return pipe
11 changes: 0 additions & 11 deletions automatminer/preprocessing/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,14 +523,3 @@ def rm_correlated(self, df, target, r_max=0.95):
self.logger.debug("Features removed by cross-correlation were: {}"
"".format(rm_feats))
return df


if __name__ == "__main__":
from matminer.datasets.dataset_retrieval import load_dataset
from automatminer.pipeline import MatPipe, debug_config
target = "eij_max"
df = load_dataset("piezoelectric_tensor").rename(columns={"formula": "composition"})[[target, "composition", "structure"]]

mp = MatPipe(**debug_config)
df2 = mp.benchmark(df, target, test_spec=0.2)
print(df2)
50 changes: 50 additions & 0 deletions automatminer/presets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
Various configurations for MatPipe.
Use them like so:
config = get_preset_config()
pipe = MatPipe(**config)
"""

# Module maintainers; second entry was missing the closing ">" on the email.
__author__ = ["Alex Dunn <ardunn@lbl.gov>",
              "Abhinav Ashar <AbhinavAshar@lbl.gov>"]

from automatminer.featurization import AutoFeaturizer
from automatminer.preprocessing import FeatureReducer, DataCleaner
from automatminer.automl import TPOTAdaptor


def get_preset_config(preset='default'):
    """Get a preset configuration of components for MatPipe.

    Only the requested configuration is constructed (the original built all
    four eagerly, instantiating several unused TPOT/featurizer objects per
    call).

    Args:
        preset (str): The preset name. One of "default", "fast", "debug",
            or "production".

    Returns:
        (dict): Keyword arguments for constructing a MatPipe, containing the
            keys "learner", "reducer", "autofeaturizer", and "cleaner".
            Use like: MatPipe(**get_preset_config("fast")).

    Raises:
        ValueError: If the preset name is not recognized (the original
            implementation silently returned None in this case).
    """
    if preset == "default":
        return {"learner": TPOTAdaptor(max_time_mins=120),
                "reducer": FeatureReducer(),
                "autofeaturizer": AutoFeaturizer(preset="best"),
                "cleaner": DataCleaner()}
    elif preset == "fast":
        return {"learner": TPOTAdaptor(max_time_mins=30, population_size=50),
                "reducer": FeatureReducer(reducers=('corr', 'tree')),
                "autofeaturizer": AutoFeaturizer(preset="fast"),
                "cleaner": DataCleaner()}
    elif preset == "debug":
        # Minimal search budget; intended for quick tests only.
        return {"learner": TPOTAdaptor(max_time_mins=2,
                                       max_eval_time_mins=1,
                                       population_size=10),
                "reducer": FeatureReducer(reducers=('corr',)),
                "autofeaturizer": AutoFeaturizer(preset="fast"),
                "cleaner": DataCleaner()}
    elif preset == "production":
        # Long-running, thorough search for production-quality models.
        return {"learner": TPOTAdaptor(generations=500,
                                       population_size=500,
                                       max_time_mins=720,
                                       max_eval_time_mins=60),
                "reducer": FeatureReducer(),
                "autofeaturizer": AutoFeaturizer(preset="best"),
                "cleaner": DataCleaner()}
    else:
        raise ValueError("Unknown preset configuration: {}. Use one of "
                         "'default', 'fast', 'debug', or 'production'."
                         "".format(preset))
36 changes: 17 additions & 19 deletions automatminer/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from sklearn.exceptions import NotFittedError

from automatminer.pipeline import MatPipe
from automatminer.configs import debug_config
from automatminer.presets import get_preset_config

test_dir = os.path.dirname(__file__)

Expand All @@ -23,13 +23,14 @@ def setUp(self):
self.df = df[["composition", "K_VRH"]]
self.extra_features = df["G_VRH"]
self.target = "K_VRH"
self.config = get_preset_config("debug")
self.pipe = MatPipe(**self.config)

def test_transferability(self):
df_train = self.df.iloc[:200]
df_test = self.df.iloc[201:250]
pipe = MatPipe(**debug_config)
pipe.fit(df_train, self.target)
df_test = pipe.predict(df_test, self.target)
self.pipe.fit(df_train, self.target)
df_test = self.pipe.predict(df_test, self.target)
true = df_test[self.target]
test = df_test[self.target + " predicted"]
self.assertTrue("composition" not in df_test.columns)
Expand All @@ -38,56 +39,53 @@ def test_transferability(self):
# Use the same pipe object by refitting and reusing
df_train2 = self.df.iloc[250:450]
df_test2 = self.df.iloc[451:500]
pipe.fit(df_train2, self.target)
df_test2 = pipe.predict(df_test2, self.target)
self.pipe.fit(df_train2, self.target)
df_test2 = self.pipe.predict(df_test2, self.target)
true2 = df_test2[self.target]
test2 = df_test2[self.target + " predicted"]
self.assertTrue("composition" not in df_test2.columns)
self.assertTrue(r2_score(true2, test2) > 0.5)

def test_user_features(self):
pipe = MatPipe(**debug_config)
df = self.df
df["G_VRH"] = self.extra_features
self.assertTrue("G_VRH" in df.columns)
self.assertTrue("K_VRH" in df.columns)
df_train = df.iloc[:200]
df_test = df.iloc[201:250]
pipe.fit(df_train, self.target)
self.pipe.fit(df_train, self.target)

# If shear modulus is included as a feature it should probably show up
# in the final pipeline
self.assertTrue("G_VRH" in pipe.learner.features)
df_test = pipe.predict(df_test, self.target)
self.assertTrue("G_VRH" in self.pipe.learner.features)
df_test = self.pipe.predict(df_test, self.target)
true = df_test[self.target]
test = df_test[self.target + " predicted"]
self.assertTrue(r2_score(true, test) > 0.75)

def test_benchmarking(self):
pipe = MatPipe(**debug_config)
df = self.df.iloc[500:700]
df_test = pipe.benchmark(df, self.target, test_spec=0.25)
df_test = self.pipe.benchmark(df, self.target, test_spec=0.25)
self.assertEqual(df_test.shape[0], 50)
true = df_test[self.target]
test = df_test[self.target + " predicted"]
self.assertTrue(r2_score(true, test) > 0.5)

def test_persistence_and_digest(self):
pipe = MatPipe(**debug_config)
with self.assertRaises(NotFittedError):
pipe.save()
self.pipe.save()
df = self.df[-200:]
pipe.fit(df, self.target)
self.pipe.fit(df, self.target)

filename = os.path.join(test_dir, "test_pipe.p")
pipe.save(filename=filename)
pipe = MatPipe.load(filename, logger=False)
df_test = pipe.predict(self.df[-220:-201], self.target)
self.pipe.save(filename=filename)
self.pipe = MatPipe.load(filename, logger=False)
df_test = self.pipe.predict(self.df[-220:-201], self.target)
self.assertTrue(self.target in df_test.columns)
self.assertTrue(self.target + " predicted" in df_test.columns)

digest_file = os.path.join(test_dir, "matdigest.txt")
digest = pipe.digest(filename=digest_file)
digest = self.pipe.digest(filename=digest_file)
self.assertTrue(os.path.isfile(digest_file))
self.assertTrue(isinstance(digest, str))

Expand Down
24 changes: 24 additions & 0 deletions automatminer/tests/test_presets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
Testing the preset configurations for MatPipe.
Mainly ensuring all args can be passed to matpipe constituent parts correctly.
"""
import unittest

from automatminer.presets import get_preset_config


class TestMatPipe(unittest.TestCase):
    """Ensure each preset config supplies a complete set of MatPipe parts.

    The original tests only called get_preset_config and asserted nothing;
    these verify each preset provides exactly the kwargs MatPipe needs.
    """

    # Every preset must supply exactly these MatPipe constructor kwargs.
    _required_keys = frozenset(
        {"learner", "reducer", "autofeaturizer", "cleaner"})

    def _check_config(self, config):
        # Each required component must be present and instantiated.
        self.assertEqual(set(config), set(self._required_keys))
        for key in self._required_keys:
            self.assertIsNotNone(config[key])

    def test_production(self):
        self._check_config(get_preset_config("production"))

    def test_debug(self):
        self._check_config(get_preset_config("debug"))

    def test_fast(self):
        self._check_config(get_preset_config("fast"))

    def test_default(self):
        self._check_config(get_preset_config("default"))


3 changes: 0 additions & 3 deletions examples/example.py

This file was deleted.

0 comments on commit 9e809c8

Please sign in to comment.