From eb3b6b66ec5b80a2112b7f79d211fd17dc23c9b4 Mon Sep 17 00:00:00 2001 From: Will Dean <57733339+wd60622@users.noreply.github.com> Date: Tue, 4 Feb 2025 15:18:03 +0100 Subject: [PATCH] load bug with y name different than `output_var` (#1468) * change the name of the series / array * modifications to tests to use prior_predictive mocking * link up y in test * add the prior to the idata * add different name in xarray test --- pymc_marketing/model_builder.py | 3 +- tests/conftest.py | 2 +- tests/customer_choice/test_mv_its.py | 52 ++-------- tests/mmm/test_base.py | 7 +- tests/mmm/test_mmm.py | 137 ++++++++++++++------------- tests/test_model_builder.py | 17 ++-- 6 files changed, 94 insertions(+), 124 deletions(-) diff --git a/pymc_marketing/model_builder.py b/pymc_marketing/model_builder.py index 87e835d49..aeec493d5 100644 --- a/pymc_marketing/model_builder.py +++ b/pymc_marketing/model_builder.py @@ -632,8 +632,7 @@ def create_fit_data( if isinstance(y, np.ndarray): y = pd.Series(y, index=X.index, name=self.output_var) - if y.name is None: - y.name = self.output_var + y.name = self.output_var if isinstance(X, pd.DataFrame): X = X.to_xarray() diff --git a/tests/conftest.py b/tests/conftest.py index 342785c55..04927acc6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -147,7 +147,7 @@ def mock_sample(*args, **kwargs): return idata -@pytest.fixture +@pytest.fixture(scope="module") def mock_pymc_sample(): original_sample = pm.sample pm.sample = mock_sample diff --git a/tests/customer_choice/test_mv_its.py b/tests/customer_choice/test_mv_its.py index 70de798ac..a366a3c03 100644 --- a/tests/customer_choice/test_mv_its.py +++ b/tests/customer_choice/test_mv_its.py @@ -13,7 +13,6 @@ # limitations under the License. import re -import warnings import numpy as np import pandas as pd @@ -84,38 +83,10 @@ def test_plot_data(saturated_data): plt.close() -def mock_fit(self, X, y, **kwargs): - self.idata.add_groups( - { - "posterior": self.idata.prior, - } - ) - - combined_data = pd.concat([X, y.rename(self.output_var)], axis=1) - - if "fit_data" in self.idata: - del self.idata.fit_data - - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - category=UserWarning, - message="The group fit_data is not defined in the InferenceData scheme", - ) - self.idata.add_groups(fit_data=combined_data.to_xarray()) # type: ignore - - return self - - @pytest.fixture(scope="module") -def fit_model(module_mocker, saturated_data): +def fit_model(saturated_data, mock_pymc_sample): model = MVITS(existing_sales=["competitor", "own"], saturated_market=True) - module_mocker.patch( - "pymc_marketing.customer_choice.mv_its.MVITS.fit", - mock_fit, - ) - model.sample( saturated_data.loc[:, ["competitor", "own"]], saturated_data["new"], @@ -151,14 +122,9 @@ def unsaturated_data_good(): @pytest.fixture(scope="module") -def unsaturated_model_bad(module_mocker, unsaturated_data_bad): +def unsaturated_model_bad(unsaturated_data_bad, mock_pymc_sample): model = MVITS(existing_sales=["competitor", "own"], saturated_market=False) - module_mocker.patch( - "pymc_marketing.customer_choice.mv_its.MVITS.fit", - mock_fit, - ) - model.sample( unsaturated_data_bad.loc[:, ["competitor", "own"]], unsaturated_data_bad["new"], @@ -169,14 +135,9 @@ def unsaturated_model_bad(module_mocker, unsaturated_data_bad): @pytest.fixture(scope="module") -def unsaturated_model_good(module_mocker, unsaturated_data_good): +def unsaturated_model_good(unsaturated_data_good, mock_pymc_sample): model = MVITS(existing_sales=["competitor", "own"], saturated_market=False) - module_mocker.patch( - "pymc_marketing.customer_choice.mv_its.MVITS.fit", - mock_fit, - ) - model.sample( unsaturated_data_good.loc[:, ["competitor", "own"]], unsaturated_data_good["new"], @@ -220,8 +181,11 @@ def test_save_load(fit_model, saturated_data) -> None: assert loaded.saturated_market == fit_model.saturated_market assert loaded.X.columns.name is None pd.testing.assert_frame_equal(loaded.X, fit_model.X, check_names=False) - assert loaded.y.name == fit_model.output_var - pd.testing.assert_series_equal(loaded.y.rename("new"), saturated_data["new"]) + assert loaded.y.name == fit_model.y.name + pd.testing.assert_series_equal( + loaded.y, + saturated_data["new"].rename(fit_model.output_var), + ) @pytest.mark.parametrize("variable", ["y", "mu"]) diff --git a/tests/mmm/test_base.py b/tests/mmm/test_base.py index 93625813b..a418e141b 100644 --- a/tests/mmm/test_base.py +++ b/tests/mmm/test_base.py @@ -314,7 +314,12 @@ def test_calling_prior_predictive_before_fit_raises_error(test_mmm, toy_X, toy_y test_mmm.prior_predictive -def test_calling_fit_result_before_fit_raises_error(test_mmm, toy_X, toy_y): +def test_calling_fit_result_before_fit_raises_error( + test_mmm, + toy_X, + toy_y, + mock_pymc_sample, +): # Arrange test_mmm.idata = None with pytest.raises( diff --git a/tests/mmm/test_mmm.py b/tests/mmm/test_mmm.py index fb56e480e..789cdb706 100644 --- a/tests/mmm/test_mmm.py +++ b/tests/mmm/test_mmm.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -import warnings import arviz as az import numpy as np @@ -37,35 +36,6 @@ rng: np.random.Generator = np.random.default_rng(seed=seed) -def mock_fit(model, X: pd.DataFrame, y: np.ndarray, **kwargs): - model.build_model(X=X, y=y) - - with model.model: - idata = pm.sample_prior_predictive(random_seed=rng, **kwargs) - - model.preprocess("X", X) - model.preprocess("y", y) - - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - category=UserWarning, - message="The group fit_data is not defined in the InferenceData scheme", - ) - idata.add_groups( - { - "posterior": idata.prior, - "fit_data": pd.concat( - [X, pd.Series(y, index=X.index, name="y")], axis=1 - ).to_xarray(), - } - ) - model.idata = idata - model.set_idata_attrs(idata=idata) - - return model - - @pytest.fixture(scope="module") def generate_data(): def _generate_data(date_data: pd.DatetimeIndex) -> pd.DataFrame: @@ -159,8 +129,14 @@ def mmm_with_fourier_features() -> MMM: @pytest.fixture(scope="module") -def mmm_fitted(mmm: MMM, toy_X: pd.DataFrame, toy_y: pd.Series) -> MMM: - return mock_fit(mmm, toy_X, toy_y) +def mmm_fitted( + mmm: MMM, + toy_X: pd.DataFrame, + toy_y: pd.Series, + mock_pymc_sample, +) -> MMM: + mmm.fit(X=toy_X, y=toy_y) + return mmm @pytest.fixture(scope="module") @@ -172,13 +148,24 @@ def mmm_fitted_with_posterior_predictive( return mmm_fitted +@pytest.fixture(scope="module") +def mmm_fitted_with_prior_and_posterior_predictive( + mmm_fitted_with_posterior_predictive, + toy_X, +): + _ = mmm_fitted_with_posterior_predictive.sample_prior_predictive(toy_X) + return mmm_fitted_with_posterior_predictive + + @pytest.fixture(scope="module") def mmm_fitted_with_fourier_features( mmm_with_fourier_features: MMM, toy_X: pd.DataFrame, toy_y: pd.Series, + mock_pymc_sample, ) -> MMM: - return mock_fit(mmm_with_fourier_features, toy_X, toy_y) + mmm_with_fourier_features.fit(X=toy_X, y=toy_y) + return mmm_with_fourier_features @pytest.mark.parametrize("media_transform", ["adstock", "saturation"]) @@ -195,7 +182,11 @@ def test_plotting_media_transform_workflow(mmm_fitted, media_transform) -> None: class TestMMM: def test_save_load_with_not_serializable_model_config( - self, model_config_requiring_serialization, toy_X, toy_y + self, + model_config_requiring_serialization, + toy_X, + toy_y, + mock_pymc_sample, ): def deep_equal(dict1, dict2): for key, value in dict1.items(): @@ -222,7 +213,7 @@ def deep_equal(dict1, dict2): adstock=adstock, saturation=saturation, ) - model = mock_fit(model, toy_X, toy_y) + model.fit(toy_X, toy_y) model.save("test_save_load") model2 = MMM.load("test_save_load") assert model.date_column == model2.date_column @@ -371,10 +362,7 @@ def test_init( samples, ) - def test_fit(self, toy_X: pd.DataFrame, toy_y: pd.Series) -> None: - draws: int = 500 - chains: int = 1 - + def test_fit(self, toy_X: pd.DataFrame, toy_y: pd.Series, mock_pymc_sample) -> None: mmm = BaseMMM( date_column="date", channel_columns=["channel_1", "channel_2"], @@ -389,25 +377,27 @@ def test_fit(self, toy_X: pd.DataFrame, toy_y: pd.Series) -> None: assert mmm.model_config is not None n_channel: int = len(mmm.channel_columns) n_control: int = len(mmm.control_columns) - mmm = mock_fit(mmm, toy_X, toy_y) - idata: az.InferenceData = mmm.fit_result + mmm.fit(X=toy_X, y=toy_y) + posterior: az.InferenceData = mmm.fit_result + chains = posterior.sizes["chain"] + draws = posterior.sizes["draw"] assert ( - az.extract(data=idata, var_names=["intercept"], combined=True) + az.extract(data=posterior, var_names=["intercept"], combined=True) .to_numpy() .size == draws * chains ) assert az.extract( - data=idata, var_names=["saturation_beta"], combined=True + data=posterior, var_names=["saturation_beta"], combined=True ).to_numpy().shape == (n_channel, draws * chains) assert az.extract( - data=idata, var_names=["adstock_alpha"], combined=True + data=posterior, var_names=["adstock_alpha"], combined=True ).to_numpy().shape == (n_channel, draws * chains) assert az.extract( - data=idata, var_names=["saturation_lam"], combined=True + data=posterior, var_names=["saturation_lam"], combined=True ).to_numpy().shape == (n_channel, draws * chains) assert az.extract( - data=idata, var_names=["gamma_control"], combined=True + data=posterior, var_names=["gamma_control"], combined=True ).to_numpy().shape == ( n_channel, draws * chains, @@ -439,7 +429,10 @@ def test_fit(self, toy_X: pd.DataFrame, toy_y: pd.Series) -> None: ] def test_mmm_serializes_and_deserializes_dag_and_nodes( - self, toy_X: pd.DataFrame, toy_y: pd.Series + self, + toy_X: pd.DataFrame, + toy_y: pd.Series, + mock_pymc_sample, ) -> None: dag = """ digraph { @@ -462,7 +455,7 @@ def test_mmm_serializes_and_deserializes_dag_and_nodes( outcome_node=outcome_node, ) - mmm = mock_fit(mmm, toy_X, toy_y) + mmm.fit(X=toy_X, y=toy_y) # Save and reload the model mmm.save("test_model") @@ -595,9 +588,11 @@ def test_get_ts_contribution_posterior( var_contribution=var_contribution, original_scale=original_scale ) ) + chains = ts_posterior.sizes["chain"] + draws = ts_posterior.sizes["draw"] assert ts_posterior.dims == ("chain", "draw", "date") - assert ts_posterior.chain.size == 1 - assert ts_posterior.draw.size == 500 + assert ts_posterior.chain.size == chains + assert ts_posterior.draw.size == draws @pytest.mark.parametrize( argnames="original_scale", @@ -612,8 +607,8 @@ def test_get_errors( errors = mmm_fitted_with_posterior_predictive.get_errors( original_scale=original_scale ) - n_chains = 1 - n_draws = 500 + n_chains = errors.sizes["chain"] + n_draws = errors.sizes["draw"] assert isinstance(errors, xr.DataArray) assert errors.name == "errors" assert errors.shape == ( @@ -699,12 +694,12 @@ def test_get_channel_contributions_forward_pass_grid_shapes( ) -> None: n_channels = len(mmm_fitted.channel_columns) data_range = mmm_fitted.X.shape[0] - draws = 500 - chains = 1 grid_size = 2 contributions = mmm_fitted.get_channel_contributions_forward_pass_grid( start=0, stop=1.5, num=grid_size ) + draws = contributions.sizes["draw"] + chains = contributions.sizes["chain"] assert contributions.shape == ( grid_size, chains, @@ -750,27 +745,28 @@ def test_plot_channel_contributions_grid( ) def test_get_group_predictive_data( self, - mmm_fitted_with_posterior_predictive: MMM, + mmm_fitted_with_prior_and_posterior_predictive: MMM, group: str, original_scale: bool, ): - dataset = mmm_fitted_with_posterior_predictive._get_group_predictive_data( - group=group, original_scale=original_scale + dataset = ( + mmm_fitted_with_prior_and_posterior_predictive._get_group_predictive_data( + group=group, + original_scale=original_scale, + ) ) assert isinstance(dataset, xr.Dataset) - assert dataset.dims["chain"] == 1 - assert dataset.dims["draw"] == 500 assert dataset.dims["date"] == 135 assert dataset["y"].dims == ("chain", "draw", "date") - def test_data_setter(self, toy_X, toy_y): + def test_data_setter(self, toy_X, toy_y, mock_pymc_sample): base_delayed_saturated_mmm = BaseMMM( date_column="date", channel_columns=["channel_1", "channel_2"], adstock=GeometricAdstock(l_max=4), saturation=LogisticSaturation(), ) - base_delayed_saturated_mmm = mock_fit(base_delayed_saturated_mmm, toy_X, toy_y) + base_delayed_saturated_mmm.fit(X=toy_X, y=toy_y) X_correct_ndarray = np.random.randint(low=0, high=100, size=(135, 2)) y_correct_ndarray = np.random.randint(low=0, high=100, size=135) @@ -829,7 +825,7 @@ def mock_property(self): ) # Check that the property returns the new value - DSMMM = mock_fit(DSMMM, toy_X, toy_y) + DSMMM.fit(toy_X, toy_y) DSMMM.save("test_model") # Apply the monkeypatch for the property monkeypatch.setattr(MMM, "id", property(mock_property)) @@ -1374,7 +1370,7 @@ def test_save_load_with_tvp( time_varying_intercept=time_varying_intercept, time_varying_media=time_varying_media, ) - mmm = mock_fit(mmm, toy_X, toy_y) + mmm.fit(toy_X, toy_y) file = "tmp-model" mmm.save(file) @@ -1416,7 +1412,8 @@ def mmm_with_media_config_fitted( toy_X: pd.DataFrame, toy_y: pd.Series, ) -> MMM: - return mock_fit(mmm_with_media_config, toy_X, toy_y) + mmm_with_media_config.fit(toy_X, toy_y) + return mmm_with_media_config def test_save_load_with_media_transformation(mmm_with_media_config_fitted) -> None: @@ -1443,7 +1440,7 @@ def test_save_load_with_media_transformation(mmm_with_media_config_fitted) -> No os.remove(file) -def test_missing_attrs_to_defaults(toy_X, toy_y) -> None: +def test_missing_attrs_to_defaults(toy_X, toy_y, mock_pymc_sample) -> None: mmm = MMM( date_column="date", channel_columns=["channel_1", "channel_2"], @@ -1454,7 +1451,7 @@ def test_missing_attrs_to_defaults(toy_X, toy_y) -> None: time_varying_intercept=False, time_varying_media=False, ) - mmm = mock_fit(mmm, toy_X, toy_y) + mmm.fit(toy_X, toy_y) mmm.idata.attrs.pop("adstock_first") mmm.idata.attrs.pop("time_varying_intercept") mmm.idata.attrs.pop("time_varying_media") @@ -1481,7 +1478,11 @@ def test_missing_attrs_to_defaults(toy_X, toy_y) -> None: os.remove(file) -def test_channel_contributions_forward_pass_time_varying_media(toy_X, toy_y) -> None: +def test_channel_contributions_forward_pass_time_varying_media( + toy_X, + toy_y, + mock_pymc_sample, +) -> None: mmm = MMM( date_column="date", channel_columns=["channel_1", "channel_2"], @@ -1490,7 +1491,7 @@ def test_channel_contributions_forward_pass_time_varying_media(toy_X, toy_y) -> saturation=LogisticSaturation(), time_varying_media=True, ) - mmm = mock_fit(mmm, toy_X, toy_y) + mmm.fit(toy_X, toy_y) posterior = mmm.fit_result diff --git a/tests/test_model_builder.py b/tests/test_model_builder.py index 494aa9c45..0eb2ea091 100644 --- a/tests/test_model_builder.py +++ b/tests/test_model_builder.py @@ -38,12 +38,12 @@ def toy_y(toy_X): rng = np.random.default_rng(42) y = 5 * toy_X["input"] + 3 y = y + rng.normal(0, 1, size=len(toy_X)) - y = pd.Series(y, name="output") + y = pd.Series(y, name="different name than output") return y @pytest.fixture(scope="module") -def fitted_model_instance(toy_X): +def fitted_model_instance(toy_X, toy_y, mock_pymc_sample): sampler_config = { "draws": 100, "tune": 100, @@ -62,6 +62,7 @@ def fitted_model_instance(toy_X): ) model.fit( toy_X, + toy_y, chains=1, draws=100, tune=100, @@ -237,7 +238,7 @@ def test_fit(fitted_model_instance): ) -def test_fit_no_t(toy_X): +def test_fit_no_t(toy_X, mock_pymc_sample): model_builder = ModelBuilderTest() model_builder.idata = model_builder.fit(X=toy_X, chains=1, draws=100, tune=100) assert model_builder.model is not None @@ -298,8 +299,8 @@ def test_sample_posterior_predictive(fitted_model_instance, combined): pred = fitted_model_instance.sample_posterior_predictive( prediction_data, combined=combined, extend_idata=True ) - chains = fitted_model_instance.idata.sample_stats.sizes["chain"] - draws = fitted_model_instance.idata.sample_stats.sizes["draw"] + chains = fitted_model_instance.idata.posterior.sizes["chain"] + draws = fitted_model_instance.idata.posterior.sizes["draw"] expected_shape = (n_pred, chains * draws) if combined else (chains, draws, n_pred) assert pred[fitted_model_instance.output_var].shape == expected_shape assert np.issubdtype(pred[fitted_model_instance.output_var].dtype, np.floating) @@ -370,7 +371,7 @@ def test_prediction_kwarg(fitted_model_instance, toy_X): assert isinstance(result, xr.Dataset) -def test_fit_after_prior_keeps_prior(toy_X, toy_y): +def test_fit_after_prior_keeps_prior(toy_X, toy_y, mock_pymc_sample): model = ModelBuilderTest() model.sample_prior_predictive(toy_X) assert "prior" in model.idata @@ -381,7 +382,7 @@ def test_fit_after_prior_keeps_prior(toy_X, toy_y): assert "prior_predictive" in model.idata -def test_second_fit(toy_X, toy_y): +def test_second_fit(toy_X, toy_y, mock_pymc_sample): model = ModelBuilderTest() model.fit(X=toy_X, y=toy_y, chains=1, draws=100, tune=100) @@ -711,7 +712,7 @@ def xarray_y(xarray_X) -> xr.DataArray: ) beta = xr.DataArray([1, 2], dims=["country"], coords={"country": ["A", "B"]}) - return (alpha + beta * xarray_X["x"]).rename("output") + return (alpha + beta * xarray_X["x"]).rename("name other than output") @pytest.mark.parametrize("X_is_array", [False, True], ids=["DataArray", "Dataset"])