From b06ce8e4c9a0469977d427228e343050eee04a35 Mon Sep 17 00:00:00 2001 From: Kacper Solarski Date: Mon, 21 Apr 2025 17:36:27 +0300 Subject: [PATCH 1/3] DID switch --- causalpy/experiments/diff_in_diff.py | 34 ++++++++++++++++------------ pyproject.toml | 1 + 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/causalpy/experiments/diff_in_diff.py b/causalpy/experiments/diff_in_diff.py index 37204052..ec92511b 100644 --- a/causalpy/experiments/diff_in_diff.py +++ b/causalpy/experiments/diff_in_diff.py @@ -19,8 +19,8 @@ import numpy as np import pandas as pd import seaborn as sns +from formulae import design_matrices from matplotlib import pyplot as plt -from patsy import build_design_matrices, dmatrices from sklearn.base import RegressorMixin from causalpy.custom_exceptions import ( @@ -91,16 +91,18 @@ def __init__( self.data = data self.expt_type = "Difference in Differences" self.formula = formula + self.rhs_formula = formula.split("~", 1)[1].strip() self.time_variable_name = time_variable_name self.group_variable_name = group_variable_name self.input_validation() - y, X = dmatrices(formula, self.data) - self._y_design_info = y.design_info - self._x_design_info = X.design_info - self.labels = X.design_info.column_names - self.y, self.X = np.asarray(y), np.asarray(X) - self.outcome_variable_name = y.design_info.column_names[0] + dm = design_matrices(self.formula, self.data) + self.labels = list(dm.common.terms.keys()) + self.y, self.X = ( + np.asarray(dm.response.design_matrix).reshape(-1, 1), + np.asarray(dm.common.design_matrix), + ) + self.outcome_variable_name = dm.response.name # fit model if isinstance(self.model, PyMCModel): @@ -125,8 +127,8 @@ def __init__( ) if self.x_pred_control.empty: raise ValueError("x_pred_control is empty") - (new_x,) = build_design_matrices([self._x_design_info], self.x_pred_control) - self.y_pred_control = self.model.predict(np.asarray(new_x)) + new_x = np.array(design_matrices(self.rhs_formula, self.x_pred_control).common) + self.y_pred_control = self.model.predict(new_x) # predicted outcome for treatment group self.x_pred_treatment = ( @@ -142,8 +144,10 @@ def __init__( ) if self.x_pred_treatment.empty: raise ValueError("x_pred_treatment is empty") - (new_x,) = build_design_matrices([self._x_design_info], self.x_pred_treatment) - self.y_pred_treatment = self.model.predict(np.asarray(new_x)) + new_x = np.array( + design_matrices(self.rhs_formula, self.x_pred_treatment).common + ) + self.y_pred_treatment = self.model.predict(new_x) # predicted outcome for counterfactual. This is given by removing the influence # of the interaction term between the group and the post_treatment variable @@ -162,15 +166,15 @@ def __init__( ) if self.x_pred_counterfactual.empty: raise ValueError("x_pred_counterfactual is empty") - (new_x,) = build_design_matrices( - [self._x_design_info], self.x_pred_counterfactual, return_type="dataframe" + new_x = np.array( + design_matrices(self.rhs_formula, self.x_pred_counterfactual).common ) # INTERVENTION: set the interaction term between the group and the # post_treatment variable to zero. This is the counterfactual. for i, label in enumerate(self.labels): if "post_treatment" in label and self.group_variable_name in label: - new_x.iloc[:, i] = 0 - self.y_pred_counterfactual = self.model.predict(np.asarray(new_x)) + new_x[:, i] = 0 + self.y_pred_counterfactual = self.model.predict(new_x) # calculate causal impact if isinstance(self.model, PyMCModel): diff --git a/pyproject.toml b/pyproject.toml index 99c4a651..fba5f6a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "numpy", "pandas", "patsy", + "formulae", "pymc>=5.15.1", "scikit-learn>=1", "scipy", From 0734c8b7a25b0e5b09563f8298368a78f268c25b Mon Sep 17 00:00:00 2001 From: Kacper Solarski Date: Wed, 28 May 2025 21:43:09 +0300 Subject: [PATCH 2/3] Switch to formulaic --- causalpy/experiments/diff_in_diff.py | 31 +++++++++---------- causalpy/experiments/instrumental_variable.py | 28 ++++++++--------- .../experiments/interrupted_time_series.py | 21 ++++++------- .../inverse_propensity_weighting.py | 12 +++---- causalpy/experiments/prepostnegd.py | 27 ++++++++-------- .../experiments/regression_discontinuity.py | 23 +++++++------- causalpy/experiments/regression_kink.py | 23 +++++++------- causalpy/experiments/synthetic_control.py | 21 ++++++------- pyproject.toml | 2 +- 9 files changed, 89 insertions(+), 99 deletions(-) diff --git a/causalpy/experiments/diff_in_diff.py b/causalpy/experiments/diff_in_diff.py index ec92511b..a6f704a3 100644 --- a/causalpy/experiments/diff_in_diff.py +++ b/causalpy/experiments/diff_in_diff.py @@ -19,7 +19,7 @@ import numpy as np import pandas as pd import seaborn as sns -from formulae import design_matrices +from formulaic import model_matrix from matplotlib import pyplot as plt from sklearn.base import RegressorMixin @@ -91,18 +91,15 @@ def __init__( self.data = data self.expt_type = "Difference in Differences" self.formula = formula - self.rhs_formula = formula.split("~", 1)[1].strip() self.time_variable_name = time_variable_name self.group_variable_name = group_variable_name self.input_validation() - dm = design_matrices(self.formula, self.data) - self.labels = list(dm.common.terms.keys()) - self.y, self.X = ( - np.asarray(dm.response.design_matrix).reshape(-1, 1), - np.asarray(dm.common.design_matrix), - ) - self.outcome_variable_name = dm.response.name + dm = model_matrix(self.formula, self.data) + self.labels = list(dm.rhs.columns) + self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy()) + self.rhs_matrix_spec = dm.rhs.model_spec + self.outcome_variable_name = dm.lhs.columns[0] # fit model if isinstance(self.model, PyMCModel): @@ -127,7 +124,9 @@ def __init__( ) if self.x_pred_control.empty: raise ValueError("x_pred_control is empty") - new_x = np.array(design_matrices(self.rhs_formula, self.x_pred_control).common) + new_x = model_matrix( + spec=self.rhs_matrix_spec, data=self.x_pred_control + ).to_numpy() self.y_pred_control = self.model.predict(new_x) # predicted outcome for treatment group @@ -144,9 +143,9 @@ def __init__( ) if self.x_pred_treatment.empty: raise ValueError("x_pred_treatment is empty") - new_x = np.array( - design_matrices(self.rhs_formula, self.x_pred_treatment).common - ) + new_x = model_matrix( + spec=self.rhs_matrix_spec, data=self.x_pred_treatment + ).to_numpy() self.y_pred_treatment = self.model.predict(new_x) # predicted outcome for counterfactual. This is given by removing the influence @@ -166,9 +165,9 @@ def __init__( ) if self.x_pred_counterfactual.empty: raise ValueError("x_pred_counterfactual is empty") - new_x = np.array( - design_matrices(self.rhs_formula, self.x_pred_counterfactual).common - ) + new_x = model_matrix( + spec=self.rhs_matrix_spec, data=self.x_pred_counterfactual + ).to_numpy() # INTERVENTION: set the interaction term between the group and the # post_treatment variable to zero. This is the counterfactual. for i, label in enumerate(self.labels): diff --git a/causalpy/experiments/instrumental_variable.py b/causalpy/experiments/instrumental_variable.py index 001ce9af..7fab3b1c 100644 --- a/causalpy/experiments/instrumental_variable.py +++ b/causalpy/experiments/instrumental_variable.py @@ -19,7 +19,7 @@ import numpy as np import pandas as pd -from patsy import dmatrices +from formulaic import model_matrix from sklearn.linear_model import LinearRegression as sk_lin_reg from causalpy.custom_exceptions import DataException @@ -110,19 +110,17 @@ def __init__( self.model = model self.input_validation() - y, X = dmatrices(formula, self.data) - self._y_design_info = y.design_info - self._x_design_info = X.design_info - self.labels = X.design_info.column_names - self.y, self.X = np.asarray(y), np.asarray(X) - self.outcome_variable_name = y.design_info.column_names[0] + dm = model_matrix(self.formula, self.data) + self.labels = list(dm.rhs.columns) + self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy()) + self.rhs_matrix_spec = dm.rhs.model_spec + self.outcome_variable_name = dm.lhs.columns[0] - t, Z = dmatrices(instruments_formula, self.instruments_data) - self._t_design_info = t.design_info - self._z_design_info = Z.design_info - self.labels_instruments = Z.design_info.column_names - self.t, self.Z = np.asarray(t), np.asarray(Z) - self.instrument_variable_name = t.design_info.column_names[0] + dm = model_matrix(self.instruments_formula, self.instruments_data) + self.labels_instruments = list(dm.rhs.columns) + self.t, self.Z = (dm.lhs.to_numpy(), dm.rhs.to_numpy()) + self.instrument_rhs_matrix_spec = dm.rhs.model_spec + self.instrument_variable_name = dm.lhs.columns[0] self.get_naive_OLS_fit() self.get_2SLS_fit() @@ -176,7 +174,7 @@ def get_2SLS_fit(self): fitted_Z_values = first_stage_reg.predict(self.Z) X2 = self.data.copy(deep=True) X2[self.instrument_variable_name] = fitted_Z_values - _, X2 = dmatrices(self.formula, X2) + X2 = model_matrix(self.formula, X2).rhs.to_numpy() second_stage_reg = sk_lin_reg().fit(X=X2, y=self.y) betas_first = list(first_stage_reg.coef_[0][1:]) betas_first.insert(0, first_stage_reg.intercept_[0]) @@ -196,7 +194,7 @@ def get_naive_OLS_fit(self): ols_reg = sk_lin_reg().fit(self.X, self.y) beta_params = list(ols_reg.coef_[0][1:]) beta_params.insert(0, ols_reg.intercept_[0]) - self.ols_beta_params = dict(zip(self._x_design_info.column_names, beta_params)) + self.ols_beta_params = dict(zip(self.labels, beta_params)) self.ols_reg = ols_reg def plot(self, round_to=None): diff --git a/causalpy/experiments/interrupted_time_series.py b/causalpy/experiments/interrupted_time_series.py index 14ec3749..54410ffe 100644 --- a/causalpy/experiments/interrupted_time_series.py +++ b/causalpy/experiments/interrupted_time_series.py @@ -20,8 +20,8 @@ import arviz as az import numpy as np import pandas as pd +from formulaic import model_matrix from matplotlib import pyplot as plt -from patsy import build_design_matrices, dmatrices from sklearn.base import RegressorMixin from causalpy.custom_exceptions import BadIndexException @@ -95,18 +95,15 @@ def __init__( self.formula = formula # set things up with pre-intervention data - y, X = dmatrices(formula, self.datapre) - self.outcome_variable_name = y.design_info.column_names[0] - self._y_design_info = y.design_info - self._x_design_info = X.design_info - self.labels = X.design_info.column_names - self.pre_y, self.pre_X = np.asarray(y), np.asarray(X) + dm = model_matrix(self.formula, self.datapre) + self.labels = list(dm.rhs.columns) + self.matrix_spec = dm.model_spec + self.outcome_variable_name = dm.lhs.columns[0] + self.pre_y, self.pre_X = (dm.lhs.to_numpy(), dm.rhs.to_numpy()) # process post-intervention data - (new_y, new_x) = build_design_matrices( - [self._y_design_info, self._x_design_info], self.datapost - ) - self.post_X = np.asarray(new_x) - self.post_y = np.asarray(new_y) + new_dm = model_matrix(spec=self.matrix_spec, data=self.datapost) + self.post_X = new_dm.rhs.to_numpy() + self.post_y = new_dm.lhs.to_numpy() # fit the model to the observed (pre-intervention) data if isinstance(self.model, PyMCModel): diff --git a/causalpy/experiments/inverse_propensity_weighting.py b/causalpy/experiments/inverse_propensity_weighting.py index 3e2915f7..fdfe780a 100644 --- a/causalpy/experiments/inverse_propensity_weighting.py +++ b/causalpy/experiments/inverse_propensity_weighting.py @@ -21,8 +21,8 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +from formulaic import model_matrix from matplotlib.lines import Line2D -from patsy import dmatrices from sklearn.linear_model import LinearRegression as sk_lin_reg from causalpy.custom_exceptions import DataException @@ -89,11 +89,11 @@ def __init__( self.weighting_scheme = weighting_scheme self.input_validation() - t, X = dmatrices(formula, self.data) - self._t_design_info = t.design_info - self._t_design_info = X.design_info - self.labels = X.design_info.column_names - self.t, self.X = np.asarray(t), np.asarray(X) + dm = model_matrix(self.formula, self.data) + self.labels = list(dm.rhs.columns) + self.t, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy()) + self.rhs_matrix_spec = dm.rhs.model_spec + self.outcome_variable_name = dm.lhs.columns[0] self.y = self.data[self.outcome_variable] COORDS = {"obs_ind": list(range(self.X.shape[0])), "coeffs": self.labels} diff --git a/causalpy/experiments/prepostnegd.py b/causalpy/experiments/prepostnegd.py index c33d89dc..204caf19 100644 --- a/causalpy/experiments/prepostnegd.py +++ b/causalpy/experiments/prepostnegd.py @@ -21,8 +21,8 @@ import numpy as np import pandas as pd import seaborn as sns +from formulaic import model_matrix from matplotlib import pyplot as plt -from patsy import build_design_matrices, dmatrices from sklearn.base import RegressorMixin from causalpy.custom_exceptions import ( @@ -104,12 +104,11 @@ def __init__( self.pretreatment_variable_name = pretreatment_variable_name self.input_validation() - y, X = dmatrices(formula, self.data) - self._y_design_info = y.design_info - self._x_design_info = X.design_info - self.labels = X.design_info.column_names - self.y, self.X = np.asarray(y), np.asarray(X) - self.outcome_variable_name = y.design_info.column_names[0] + dm = model_matrix(self.formula, self.data) + self.labels = list(dm.rhs.columns) + self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy()) + self.rhs_matrix_spec = dm.rhs.model_spec + self.outcome_variable_name = dm.lhs.columns[0] # fit the model to the observed (pre-intervention) data if isinstance(self.model, PyMCModel): @@ -135,10 +134,10 @@ def __init__( self.group_variable_name: np.zeros(self.pred_xi.shape), } ) - (new_x_untreated,) = build_design_matrices( - [self._x_design_info], x_pred_untreated - ) - self.pred_untreated = self.model.predict(X=np.asarray(new_x_untreated)) + new_x_untreated = model_matrix( + spec=self.rhs_matrix_spec, data=x_pred_untreated + ).to_numpy() + self.pred_untreated = self.model.predict(X=new_x_untreated) # treated x_pred_treated = pd.DataFrame( { @@ -146,8 +145,10 @@ def __init__( self.group_variable_name: np.ones(self.pred_xi.shape), } ) - (new_x_treated,) = build_design_matrices([self._x_design_info], x_pred_treated) - self.pred_treated = self.model.predict(X=np.asarray(new_x_treated)) + new_x_treated = model_matrix( + spec=self.rhs_matrix_spec, data=x_pred_treated + ).to_numpy() + self.pred_treated = self.model.predict(X=new_x_treated) # Evaluate causal impact as equal to the trestment effect self.causal_impact = self.model.idata.posterior["beta"].sel( diff --git a/causalpy/experiments/regression_discontinuity.py b/causalpy/experiments/regression_discontinuity.py index da4f98aa..a9e2d885 100644 --- a/causalpy/experiments/regression_discontinuity.py +++ b/causalpy/experiments/regression_discontinuity.py @@ -21,7 +21,7 @@ import pandas as pd import seaborn as sns from matplotlib import pyplot as plt -from patsy import build_design_matrices, dmatrices +from formulaic import model_matrix from sklearn.base import RegressorMixin from causalpy.custom_exceptions import ( @@ -111,15 +111,14 @@ def __init__( f"Choice of bandwidth parameter has lead to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.", # noqa: E501 UserWarning, ) - y, X = dmatrices(formula, filtered_data) + dm = model_matrix(formula, filtered_data) else: - y, X = dmatrices(formula, self.data) + dm = model_matrix(formula, self.data) - self._y_design_info = y.design_info - self._x_design_info = X.design_info - self.labels = X.design_info.column_names - self.y, self.X = np.asarray(y), np.asarray(X) - self.outcome_variable_name = y.design_info.column_names[0] + self.labels = list(dm.rhs.columns) + self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy()) + self.rhs_matrix_spec = dm.rhs.model_spec + self.outcome_variable_name = dm.lhs.columns[0] # fit model if isinstance(self.model, PyMCModel): @@ -146,8 +145,8 @@ def __init__( self.x_pred = pd.DataFrame( {self.running_variable_name: xi, "treated": self._is_treated(xi)} ) - (new_x,) = build_design_matrices([self._x_design_info], self.x_pred) - self.pred = self.model.predict(X=np.asarray(new_x)) + new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_pred).to_numpy() + self.pred = self.model.predict(X=new_x) # calculate discontinuity by evaluating the difference in model expectation on # either side of the discontinuity @@ -164,8 +163,8 @@ def __init__( "treated": np.array([0, 1]), } ) - (new_x,) = build_design_matrices([self._x_design_info], self.x_discon) - self.pred_discon = self.model.predict(X=np.asarray(new_x)) + new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_discon).to_numpy() + self.pred_discon = self.model.predict(X=new_x) # ******** THIS IS SUBOPTIMAL AT THE MOMENT ************************************ if isinstance(self.model, PyMCModel): diff --git a/causalpy/experiments/regression_kink.py b/causalpy/experiments/regression_kink.py index 95ad3fcc..19545384 100644 --- a/causalpy/experiments/regression_kink.py +++ b/causalpy/experiments/regression_kink.py @@ -22,7 +22,7 @@ import numpy as np import pandas as pd import seaborn as sns -from patsy import build_design_matrices, dmatrices +from formulaic import model_matrix from causalpy.plot_utils import plot_xY @@ -74,15 +74,14 @@ def __init__( f"Choice of bandwidth parameter has lead to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.", # noqa: E501 UserWarning, ) - y, X = dmatrices(formula, filtered_data) + dm = model_matrix(formula, filtered_data) else: - y, X = dmatrices(formula, self.data) + dm = model_matrix(formula, self.data) - self._y_design_info = y.design_info - self._x_design_info = X.design_info - self.labels = X.design_info.column_names - self.y, self.X = np.asarray(y), np.asarray(X) - self.outcome_variable_name = y.design_info.column_names[0] + self.labels = list(dm.rhs.columns) + self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy()) + self.rhs_matrix_spec = dm.rhs.model_spec + self.outcome_variable_name = dm.lhs.columns[0] COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])} self.model.fit(X=self.X, y=self.y, coords=COORDS) @@ -102,8 +101,8 @@ def __init__( self.x_pred = pd.DataFrame( {self.running_variable_name: xi, "treated": self._is_treated(xi)} ) - (new_x,) = build_design_matrices([self._x_design_info], self.x_pred) - self.pred = self.model.predict(X=np.asarray(new_x)) + new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_pred).to_numpy() + self.pred = self.model.predict(X=new_x) # evaluate gradient change around kink point mu_kink_left, mu_kink, mu_kink_right = self._probe_kink_point() @@ -158,8 +157,8 @@ def _probe_kink_point(self): "treated": np.array([0, 1, 1]), } ) - (new_x,) = build_design_matrices([self._x_design_info], x_predict) - predicted = self.model.predict(X=np.asarray(new_x)) + new_x = model_matrix(spec=self.rhs_matrix_spec, data=x_predict).to_numpy() + predicted = self.model.predict(X=new_x) # extract predicted mu values mu_kink_left = predicted["posterior_predictive"].sel(obs_ind=0)["mu"] mu_kink = predicted["posterior_predictive"].sel(obs_ind=1)["mu"] diff --git a/causalpy/experiments/synthetic_control.py b/causalpy/experiments/synthetic_control.py index a1bdecbb..2b427423 100644 --- a/causalpy/experiments/synthetic_control.py +++ b/causalpy/experiments/synthetic_control.py @@ -20,8 +20,8 @@ import arviz as az import numpy as np import pandas as pd +from formulaic import model_matrix from matplotlib import pyplot as plt -from patsy import build_design_matrices, dmatrices from sklearn.base import RegressorMixin from causalpy.custom_exceptions import BadIndexException @@ -90,18 +90,15 @@ def __init__( self.formula = formula # set things up with pre-intervention data - y, X = dmatrices(formula, self.datapre) - self.outcome_variable_name = y.design_info.column_names[0] - self._y_design_info = y.design_info - self._x_design_info = X.design_info - self.labels = X.design_info.column_names - self.pre_y, self.pre_X = np.asarray(y), np.asarray(X) + dm = model_matrix(self.formula, self.datapre) + self.labels = list(dm.rhs.columns) + self.pre_y, self.pre_X = (dm.lhs.to_numpy(), dm.rhs.to_numpy()) + self.matrix_spec = dm.model_spec + self.outcome_variable_name = dm.lhs.columns[0] # process post-intervention data - (new_y, new_x) = build_design_matrices( - [self._y_design_info, self._x_design_info], self.datapost - ) - self.post_X = np.asarray(new_x) - self.post_y = np.asarray(new_y) + new_dm = model_matrix(spec=self.matrix_spec, data=self.datapost) + self.post_X = new_dm.rhs.to_numpy() + self.post_y = new_dm.lhs.to_numpy() # fit the model to the observed (pre-intervention) data if isinstance(self.model, PyMCModel): diff --git a/pyproject.toml b/pyproject.toml index fba5f6a2..20db8db3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "numpy", "pandas", "patsy", - "formulae", + "formulaic", "pymc>=5.15.1", "scikit-learn>=1", "scipy", From 699c380583c676b69dcf507d857d037dde787f07 Mon Sep 17 00:00:00 2001 From: Kacper Solarski Date: Wed, 28 May 2025 22:21:37 +0300 Subject: [PATCH 3/3] Remove patsy from pyproject --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 20db8db3..6f2c1b25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,6 @@ dependencies = [ "matplotlib>=3.5.3", "numpy", "pandas", - "patsy", "formulaic", "pymc>=5.15.1", "scikit-learn>=1",