Skip to content

Commit

Permalink
add presets
Browse files Browse the repository at this point in the history
  • Loading branch information
ardunn committed Dec 27, 2018
1 parent 79388f3 commit 9e809c8
Show file tree
Hide file tree
Showing 9 changed files with 108 additions and 111 deletions.
3 changes: 1 addition & 2 deletions automatminer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from automatminer.preprocessing import DataCleaner, FeatureReducer
from automatminer.automl import TPOTAdaptor
from automatminer.pipeline import MatPipe
from automatminer.configs import debug_config, default_config, \
production_config, fast_config
from automatminer.presets import get_preset_config

__author__ = 'Alex Dunn, Qi Wang, Alex Ganose, Daniel Dopp, Anubhav Jain'
__author_email__ = 'ardunn@lbl.gov'
Expand Down
22 changes: 9 additions & 13 deletions automatminer/automl/tests/test_tpot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
from sklearn.metrics import r2_score, f1_score

from automatminer.configs import debug_config
from automatminer.presets import get_preset_config
from automatminer.automl.adaptors import TPOTAdaptor
from automatminer.utils.package_tools import AutomatminerError

Expand All @@ -18,47 +18,43 @@ def setUp(self):
df = pd.read_csv(basedir + "/mini_automl_df.csv", index_col=0)
self.train_df = df.copy(deep=True).iloc[:450]
self.test_df = df.copy(deep=True).iloc[451:]
self.tpot = get_preset_config("debug")["learner"]

def test_regression(self):
target_key = "K_VRH"
tpot = TPOTAdaptor(**debug_config)
tpot.fit(self.train_df, target_key)
test_w_predictions = tpot.predict(self.test_df, target_key)
self.tpot.fit(self.train_df, target_key)
test_w_predictions = self.tpot.predict(self.test_df, target_key)
y_true = test_w_predictions[target_key]
y_test = test_w_predictions[target_key + " predicted"]
self.assertTrue(r2_score(y_true, y_test) > 0.75)

def test_classification(self):
tpot = TPOTAdaptor(**debug_config)
max_kvrh = 50
classifier_key = "K_VRH > {}?".format(max_kvrh)
train_df = self.train_df.rename(columns={"K_VRH": classifier_key})
test_df = self.test_df.rename(columns={"K_VRH": classifier_key})
train_df[classifier_key] = train_df[classifier_key] > max_kvrh
test_df[classifier_key] = test_df[classifier_key] > max_kvrh
tpot.fit(train_df, classifier_key)
print(tpot.mode)
test_w_predictions = tpot.predict(test_df, classifier_key)
self.tpot.fit(train_df, classifier_key)
test_w_predictions = self.tpot.predict(test_df, classifier_key)
y_true = test_w_predictions[classifier_key]
y_test = test_w_predictions[classifier_key + " predicted"]
self.assertTrue(f1_score(y_true, y_test) > 0.75)

def test_training_only(self):
tpot = TPOTAdaptor(**debug_config)
target_key = "K_VRH"
train_w_predictions = tpot.fit_transform(self.train_df, target_key)
train_w_predictions = self.tpot.fit_transform(self.train_df, target_key)
y_true = train_w_predictions[target_key]
y_test = train_w_predictions[target_key + " predicted"]
self.assertTrue(r2_score(y_true, y_test) > 0.85)

def test_feature_mismatching(self):
tpot = TPOTAdaptor(**debug_config)
target_key = "K_VRH"
df1 = self.train_df
df2 = self.test_df.rename(columns={'mean X': "some other feature"})
tpot.fit(df1, target_key)
self.tpot.fit(df1, target_key)
with self.assertRaises(AutomatminerError):
tpot.predict(df2, target_key)
self.tpot.predict(df2, target_key)


if __name__ == '__main__':
Expand Down
36 changes: 0 additions & 36 deletions automatminer/configs.py

This file was deleted.

34 changes: 7 additions & 27 deletions automatminer/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import numpy as np

from automatminer.configs import default_config
from automatminer.base import LoggableMixin, DataframeTransformer
from automatminer.utils.ml_tools import regression_or_classification
from automatminer.utils.package_tools import check_fitted, set_fitted, \
Expand Down Expand Up @@ -76,15 +75,14 @@ class MatPipe(DataframeTransformer, LoggableMixin):
fit before being used to predict data.
"""

def __init__(self, logger=True, log_level=None, autofeaturizer=None,
cleaner=None, reducer=None, learner=None):
def __init__(self, autofeaturizer, cleaner, reducer, learner, logger=True,
log_level=None):

self._logger = self.get_logger(logger, level=log_level)
self.autofeaturizer = autofeaturizer if autofeaturizer else \
default_config['autofeaturizer']
self.cleaner = cleaner if cleaner else default_config["cleaner"]
self.reducer = reducer if reducer else default_config["reducer"]
self.learner = learner if learner else default_config["learner"]
self.autofeaturizer = autofeaturizer
self.cleaner = cleaner
self.reducer = reducer
self.learner = learner

self.autofeaturizer._logger = self.get_logger(logger)
self.cleaner._logger = self.get_logger(logger)
Expand Down Expand Up @@ -313,22 +311,4 @@ def load(cls, filename, logger=True):
pipe.logger.warning("Only use this model to make predictions (do not "
"retrain!). Backend was serialzed as only the top "
"model, not the full automl backend. ")
return pipe


if __name__ == "__main__":
pass

# from sklearn.metrics import mean_squared_error
# from matminer.datasets.dataset_retrieval import load_dataset
#
# hugedf = load_dataset("elastic_tensor_2015").rename(
# columns={"formula": "composition"})[["composition", "K_VRH"]]
#
# validation_ix = [1, 2, 3, 4, 5, 7, 12]
# df = hugedf.iloc[:100]
# df2 = hugedf.iloc[101:150]
# target = "K_VRH"
#
# mp = MatPipe(**debug_config)
# df = mp.benchmark(df, target, test_spec=0.25)
return pipe
11 changes: 0 additions & 11 deletions automatminer/preprocessing/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,14 +523,3 @@ def rm_correlated(self, df, target, r_max=0.95):
self.logger.debug("Features removed by cross-correlation were: {}"
"".format(rm_feats))
return df


if __name__ == "__main__":
from matminer.datasets.dataset_retrieval import load_dataset
from automatminer.pipeline import MatPipe, debug_config
target = "eij_max"
df = load_dataset("piezoelectric_tensor").rename(columns={"formula": "composition"})[[target, "composition", "structure"]]

mp = MatPipe(**debug_config)
df2 = mp.benchmark(df, target, test_spec=0.2)
print(df2)
50 changes: 50 additions & 0 deletions automatminer/presets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
Various configurations for MatPipe.
Use them like so:
config = get_preset_config()
pipe = MatPipe(**config)
"""

# Module maintainers; second entry was missing the closing ">" on the email.
__author__ = ["Alex Dunn <ardunn@lbl.gov>",
              "Abhinav Ashar <AbhinavAshar@lbl.gov>"]

from automatminer.featurization import AutoFeaturizer
from automatminer.preprocessing import FeatureReducer, DataCleaner
from automatminer.automl import TPOTAdaptor


def get_preset_config(preset='default'):
    """Get a preset configuration of components for MatPipe.

    Only the requested configuration is constructed (the original built all
    four eagerly, instantiating several unused TPOT/featurizer objects per
    call).

    Args:
        preset (str): The preset name. One of "default", "fast", "debug",
            or "production".

    Returns:
        (dict): Keyword arguments for constructing a MatPipe, containing the
            keys "learner", "reducer", "autofeaturizer", and "cleaner".
            Use like: MatPipe(**get_preset_config("fast")).

    Raises:
        ValueError: If the preset name is not recognized (the original
            implementation silently returned None in this case).
    """
    if preset == "default":
        return {"learner": TPOTAdaptor(max_time_mins=120),
                "reducer": FeatureReducer(),
                "autofeaturizer": AutoFeaturizer(preset="best"),
                "cleaner": DataCleaner()}
    elif preset == "fast":
        return {"learner": TPOTAdaptor(max_time_mins=30, population_size=50),
                "reducer": FeatureReducer(reducers=('corr', 'tree')),
                "autofeaturizer": AutoFeaturizer(preset="fast"),
                "cleaner": DataCleaner()}
    elif preset == "debug":
        # Minimal search budget; intended for quick tests only.
        return {"learner": TPOTAdaptor(max_time_mins=2,
                                       max_eval_time_mins=1,
                                       population_size=10),
                "reducer": FeatureReducer(reducers=('corr',)),
                "autofeaturizer": AutoFeaturizer(preset="fast"),
                "cleaner": DataCleaner()}
    elif preset == "production":
        # Long-running, thorough search for production-quality models.
        return {"learner": TPOTAdaptor(generations=500,
                                       population_size=500,
                                       max_time_mins=720,
                                       max_eval_time_mins=60),
                "reducer": FeatureReducer(),
                "autofeaturizer": AutoFeaturizer(preset="best"),
                "cleaner": DataCleaner()}
    else:
        raise ValueError("Unknown preset configuration: {}. Use one of "
                         "'default', 'fast', 'debug', or 'production'."
                         "".format(preset))
36 changes: 17 additions & 19 deletions automatminer/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from sklearn.exceptions import NotFittedError

from automatminer.pipeline import MatPipe
from automatminer.configs import debug_config
from automatminer.presets import get_preset_config

test_dir = os.path.dirname(__file__)

Expand All @@ -23,13 +23,14 @@ def setUp(self):
self.df = df[["composition", "K_VRH"]]
self.extra_features = df["G_VRH"]
self.target = "K_VRH"
self.config = get_preset_config("debug")
self.pipe = MatPipe(**self.config)

def test_transferability(self):
df_train = self.df.iloc[:200]
df_test = self.df.iloc[201:250]
pipe = MatPipe(**debug_config)
pipe.fit(df_train, self.target)
df_test = pipe.predict(df_test, self.target)
self.pipe.fit(df_train, self.target)
df_test = self.pipe.predict(df_test, self.target)
true = df_test[self.target]
test = df_test[self.target + " predicted"]
self.assertTrue("composition" not in df_test.columns)
Expand All @@ -38,56 +39,53 @@ def test_transferability(self):
# Use the same pipe object by refitting and reusing
df_train2 = self.df.iloc[250:450]
df_test2 = self.df.iloc[451:500]
pipe.fit(df_train2, self.target)
df_test2 = pipe.predict(df_test2, self.target)
self.pipe.fit(df_train2, self.target)
df_test2 = self.pipe.predict(df_test2, self.target)
true2 = df_test2[self.target]
test2 = df_test2[self.target + " predicted"]
self.assertTrue("composition" not in df_test2.columns)
self.assertTrue(r2_score(true2, test2) > 0.5)

def test_user_features(self):
pipe = MatPipe(**debug_config)
df = self.df
df["G_VRH"] = self.extra_features
self.assertTrue("G_VRH" in df.columns)
self.assertTrue("K_VRH" in df.columns)
df_train = df.iloc[:200]
df_test = df.iloc[201:250]
pipe.fit(df_train, self.target)
self.pipe.fit(df_train, self.target)

# If shear modulus is included as a feature it should probably show up
# in the final pipeline
self.assertTrue("G_VRH" in pipe.learner.features)
df_test = pipe.predict(df_test, self.target)
self.assertTrue("G_VRH" in self.pipe.learner.features)
df_test = self.pipe.predict(df_test, self.target)
true = df_test[self.target]
test = df_test[self.target + " predicted"]
self.assertTrue(r2_score(true, test) > 0.75)

def test_benchmarking(self):
pipe = MatPipe(**debug_config)
df = self.df.iloc[500:700]
df_test = pipe.benchmark(df, self.target, test_spec=0.25)
df_test = self.pipe.benchmark(df, self.target, test_spec=0.25)
self.assertEqual(df_test.shape[0], 50)
true = df_test[self.target]
test = df_test[self.target + " predicted"]
self.assertTrue(r2_score(true, test) > 0.5)

def test_persistence_and_digest(self):
pipe = MatPipe(**debug_config)
with self.assertRaises(NotFittedError):
pipe.save()
self.pipe.save()
df = self.df[-200:]
pipe.fit(df, self.target)
self.pipe.fit(df, self.target)

filename = os.path.join(test_dir, "test_pipe.p")
pipe.save(filename=filename)
pipe = MatPipe.load(filename, logger=False)
df_test = pipe.predict(self.df[-220:-201], self.target)
self.pipe.save(filename=filename)
self.pipe = MatPipe.load(filename, logger=False)
df_test = self.pipe.predict(self.df[-220:-201], self.target)
self.assertTrue(self.target in df_test.columns)
self.assertTrue(self.target + " predicted" in df_test.columns)

digest_file = os.path.join(test_dir, "matdigest.txt")
digest = pipe.digest(filename=digest_file)
digest = self.pipe.digest(filename=digest_file)
self.assertTrue(os.path.isfile(digest_file))
self.assertTrue(isinstance(digest, str))

Expand Down
24 changes: 24 additions & 0 deletions automatminer/tests/test_presets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
Testing the preset configurations for MatPipe.
Mainly ensuring all args can be passed to matpipe constituent parts correctly.
"""
import unittest

from automatminer.presets import get_preset_config


class TestMatPipe(unittest.TestCase):
    """Ensure each preset config supplies a complete set of MatPipe parts.

    The original tests only called get_preset_config and asserted nothing;
    these verify each preset provides exactly the kwargs MatPipe needs.
    """

    # Every preset must supply exactly these MatPipe constructor kwargs.
    _required_keys = frozenset(
        {"learner", "reducer", "autofeaturizer", "cleaner"})

    def _check_config(self, config):
        # Each required component must be present and instantiated.
        self.assertEqual(set(config), set(self._required_keys))
        for key in self._required_keys:
            self.assertIsNotNone(config[key])

    def test_production(self):
        self._check_config(get_preset_config("production"))

    def test_debug(self):
        self._check_config(get_preset_config("debug"))

    def test_fast(self):
        self._check_config(get_preset_config("fast"))

    def test_default(self):
        self._check_config(get_preset_config("default"))


3 changes: 0 additions & 3 deletions examples/example.py

This file was deleted.

0 comments on commit 9e809c8

Please sign in to comment.