Merge branch 'develop' into 1.1.x

# Conflicts:
#   src/sklearndf/transformation/_transformation.py
BCG-X-Official · Mar 4, 2021 · 5acf503 · 5acf503
2 parents 9063269 + 8a7dc6c
commit 5acf503
Show file tree

Hide file tree

Showing 8 changed files with 139 additions and 44 deletions.
diff --git a/.idea/sklearndf.iml b/.idea/sklearndf.iml
diff --git a/README.rst b/README.rst
@@ -24,35 +24,43 @@ To this end, *sklearndf* enhances scikit-learn's estimators as follows:
 
 .. End-Badges
 
+
 Installation
----------------------
+------------
 
 *sklearndf* supports both PyPI and Anaconda.
 
+
 Anaconda
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~
 
 .. code-block:: sh
 
     conda install sklearndf -c bcg_gamma -c conda-forge
 
+
 Pip
-~~~~~~~~~~~~~~~~~~~~~
+~~~
 
 .. code-block:: sh
 
     pip install sklearndf
 
 
 Quickstart
-----------------------
+----------
 
 The following quickstart guide provides a minimal example workflow to get up and running
 with *sklearndf*.
+For additional tutorials and the API reference,
+see the `sklearndf documentation <https://bcg-gamma.github.io/sklearndf/>`__.
+
+Changes and additions to new versions are summarized in the
+`release notes <https://bcg-gamma.github.io/sklearndf/release_notes.html>`__.
 
 
 Creating a DataFrame friendly scikit-learn preprocessing pipeline
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The titanic data set includes categorical features such as class and sex, and also has
 missing values for numeric features (i.e., age) and categorical features (i.e., embarked).
@@ -162,7 +170,7 @@ such as in this case for embarked.
 
 
 Completing the pipeline with a classifier
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Scikit-learn regressors and classifiers have a *sklearndf* sibling obtained by appending
 DF to the class name; the API remains the same.
@@ -201,8 +209,9 @@ on a test set.
 
 model score: 0.79
 
+
 Contributing
----------------------------
+------------
 
 *sklearndf* is stable and is being supported long-term.
 
@@ -217,22 +226,24 @@ at our team email: FacetTeam@bcg.com.
 For further information on contributing please see our
 `contribution guide <https://bcg-gamma.github.io/sklearndf/contribution_guide.html>`__.
 
+
 License
----------------------------
+-------
 
 *sklearndf* is licensed under Apache 2.0 as described in the
 `LICENSE <https://github.com/BCG-Gamma/sklearndf/blob/develop/LICENSE>`_ file.
 
 
 Acknowledgements
----------------------------
+----------------
 
 Learners and pipelining from the popular Machine Learning package
 `scikit-learn <https://github.com/scikit-learn/scikit-learn>`__ support
 the corresponding *sklearndf* implementations.
 
+
 BCG GAMMA
----------------------------
+---------
 
 We are always on the lookout for passionate and talented data scientists to join the
 BCG GAMMA team. If you would like to know more you can find out about

diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst
@@ -0,0 +1,26 @@
+Release Notes
+=============
+
+*sklearndf* 1.0
+---------------
+
+1.0.2
+~~~~~
+
+This is a maintenance release focusing on enhancements to the CI/CD pipeline and bug
+fixes.
+
+- FIX: correctly mirror ``__init__`` signatures of native estimators to their
+  corresponding DF estimators
+- FIX: do not mirror native estimator class attributes and protected members to
+  DF estimators
+- FIX: support ``"passthrough"`` transformer in :class:`.ColumnTransformerDF`
+- FIX: support ``drop`` parameter in :class:`.OneHotEncoderDF`
+- BUILD: add support for numpy 1.20
+- BUILD: updates and changes to the CI/CD pipeline
+
+
+1.0.1
+~~~~~
+
+Initial release.
diff --git a/sphinx/source/.gitignore b/sphinx/source/.gitignore
@@ -1,2 +1,3 @@
 /apidoc/
-/getting_started/
+/getting_started/
+release_notes.rst
diff --git a/sphinx/source/api_landing.rst b/sphinx/source/api_landing.rst
@@ -72,3 +72,5 @@ estimators, most notably :class:`.BorutaDF`,
 :class:`.LGBMClassifierDF`.
 
 All `sklearndf` estimators are fully type hinted.
+
+Please see the :ref:`release notes<release-notes>` for recent API updates and bug fixes.
diff --git a/sphinx/source/index.rst b/sphinx/source/index.rst
@@ -14,3 +14,4 @@ Table of contents
    tutorials
    contribution_guide
    faqs
+   release_notes
diff --git a/test/test/sklearndf/transformation/test_imputers.py b/test/test/sklearndf/transformation/test_imputers.py
@@ -14,45 +14,39 @@
 logger.setLevel(logging.DEBUG)
 
 IMPUTERS_TO_TEST = list_classes(
-    from_modules=sklearndf.transformation, matching=r".*Imputer*DF", excluding=[]
+    from_modules=sklearndf.transformation, matching=r".*Imputer.*DF", excluding=[]
 )
 
 
-@pytest.fixture
-def test_data_x() -> pd.DataFrame:
-    return pd.DataFrame(
-        data=[[7, 2, 3], [4, np.nan, 6], [10, 5, 9]], columns=["a", "b", "c"]
-    )
-
-
-@pytest.fixture
-def test_data_x_with_all_nan() -> pd.DataFrame:
-    return pd.DataFrame(
-        data=[[7, np.nan, 3], [4, np.nan, 6], [np.nan, np.nan, np.nan]],
-        columns=["a", "b", "c"],
-    )
-
-
-@pytest.fixture
-def test_data_y() -> pd.DataFrame:
-    return pd.DataFrame(
-        data=[[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]], columns=["a", "b", "c"]
-    )
-
-
 @pytest.mark.parametrize(
     argnames=["imputer_cls", "add_indicator"],
-    argvalues=itertools.product(IMPUTERS_TO_TEST, (True, False)),
+    argvalues=itertools.product(IMPUTERS_TO_TEST, [True, False]),
 )
 def test_imputer(
     imputer_cls: Type[TransformerDF],
     add_indicator: bool,
-    test_data_x: pd.DataFrame,
-    test_data_y: pd.DataFrame,
-    test_data_x_with_all_nan: pd.DataFrame,
 ) -> None:
-    imputerdf = imputer_cls(add_indicator=add_indicator)
-    imputer_cls_orig = type(imputerdf.native_estimator)
+    """
+    Test imputer classes using the combinations of arguments from
+    ``@pytest.mark.parametrize``
+
+    :param imputer_cls: the imputer class to test
+    :param add_indicator: whether to add an indicator column
+    :return:
+    """
+    imputer_df = imputer_cls(add_indicator=add_indicator)
+    imputer_cls_orig = type(imputer_df.native_estimator)
+
+    test_data_x = pd.DataFrame(
+        data=[[7, 2, 3], [4, np.nan, 6], [10, 5, 9]], columns=["a", "b", "c"]
+    )
+    test_data_x_with_all_nan = pd.DataFrame(
+        data=[[7, np.nan, 3], [4, np.nan, 6], [np.nan, np.nan, np.nan]],
+        columns=["a", "b", "c"],
+    )
+    test_data_y = pd.DataFrame(
+        data=[[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]], columns=["a", "b", "c"]
+    )
 
     # noinspection PyArgumentList
     imputer_orig = imputer_cls_orig(add_indicator=add_indicator)
@@ -61,8 +55,8 @@ def test_imputer(
     # noinspection PyUnresolvedReferences
     y_transformed = imputer_orig.transform(test_data_y)
 
-    imputerdf.fit(test_data_x)
-    y_transformed_df = imputerdf.transform(test_data_y)
+    imputer_df.fit(test_data_x)
+    y_transformed_df = imputer_df.transform(test_data_y)
 
     assert np.array_equal(
         np.round(y_transformed, 4), np.round(y_transformed_df.values, 4)
@@ -79,8 +73,8 @@ def test_imputer(
     # noinspection PyUnresolvedReferences
     y_transformed = imputer_orig.transform(test_data_y)
 
-    imputerdf.fit(test_data_x_with_all_nan)
-    y_transformed_df = imputerdf.transform(test_data_y)
+    imputer_df.fit(test_data_x_with_all_nan)
+    y_transformed_df = imputer_df.transform(test_data_y)
 
     assert np.array_equal(
         np.round(y_transformed, 4), np.round(y_transformed_df.values, 4)

diff --git a/test/test/sklearndf/transformation/test_transformation.py b/test/test/sklearndf/transformation/test_transformation.py
@@ -162,3 +162,62 @@ def test_outlier_remover(df_outlier: pd.DataFrame) -> None:
         }
     )
     assert_frame_equal(df_transformed, df_transformed_expected)
+
+
+def test_one_hot_encoding() -> None:
+    test_data_categorical = pd.DataFrame(
+        data=[
+            ["yes", "red", "child"],
+            ["yes", "blue", "father"],
+            ["no", "green", "mother"],
+        ],
+        columns=["a", "b", "c"],
+    )
+
+    assert_frame_equal(
+        OneHotEncoderDF(drop=None, sparse=False).fit_transform(test_data_categorical),
+        pd.DataFrame(
+            {
+                "a_no": [0.0, 0.0, 1.0],
+                "a_yes": [1.0, 1.0, 0.0],
+                "b_blue": [0.0, 1.0, 0.0],
+                "b_green": [0.0, 0.0, 1.0],
+                "b_red": [1.0, 0.0, 0.0],
+                "c_child": [1.0, 0.0, 0.0],
+                "c_father": [0.0, 1.0, 0.0],
+                "c_mother": [0.0, 0.0, 1.0],
+            }
+        ).rename_axis(columns="feature_out"),
+    )
+
+    assert_frame_equal(
+        OneHotEncoderDF(drop="if_binary", sparse=False).fit_transform(
+            test_data_categorical
+        ),
+        pd.DataFrame(
+            {
+                "a_yes": [1.0, 1.0, 0.0],
+                "b_blue": [0.0, 1.0, 0.0],
+                "b_green": [0.0, 0.0, 1.0],
+                "b_red": [1.0, 0.0, 0.0],
+                "c_child": [1.0, 0.0, 0.0],
+                "c_father": [0.0, 1.0, 0.0],
+                "c_mother": [0.0, 0.0, 1.0],
+            }
+        ).rename_axis(columns="feature_out"),
+    )
+
+    assert_frame_equal(
+        OneHotEncoderDF(drop="first", sparse=False).fit_transform(
+            test_data_categorical
+        ),
+        pd.DataFrame(
+            {
+                "a_yes": [1.0, 1.0, 0.0],
+                "b_green": [0.0, 0.0, 1.0],
+                "b_red": [1.0, 0.0, 0.0],
+                "c_father": [0.0, 1.0, 0.0],
+                "c_mother": [0.0, 0.0, 1.0],
+            }
+        ).rename_axis(columns="feature_out"),
+    )
Original file line number	Diff line number	Diff line change
Expand Up		@@ -72,3 +72,5 @@ estimators, most notably :class:`.BorutaDF`,
		:class:`.LGBMClassifierDF`.

		All `sklearndf` estimators are fully type hinted.

		Please see the :ref:`release notes<release-notes>` for recent API updates and bug fixes.