From 090c8e073e30bb723d5a2f5f1a3d9783fb92ab6b Mon Sep 17 00:00:00 2001 From: itholic <haejoon309@naver.com> Date: Tue, 22 Sep 2020 21:57:39 +0900 Subject: [PATCH 1/9] Implemented DataFrame.lookup --- databricks/koalas/frame.py | 48 +++++++++++++++++++++++ databricks/koalas/missing/frame.py | 1 - databricks/koalas/tests/test_dataframe.py | 31 +++++++++++++++ docs/source/reference/frame.rst | 1 + 4 files changed, 80 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 71bd5035ff..1cf57d67ca 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -10224,6 +10224,54 @@ def from_dict(data, orient="columns", dtype=None, columns=None) -> "DataFrame": """ return DataFrame(pd.DataFrame.from_dict(data, orient=orient, dtype=dtype, columns=columns)) + def lookup(self, row_labels, col_labels) -> np.ndarray: + """ + Label-based "fancy indexing" function for DataFrame. + + Given equal-length arrays of row and column labels, return an + array of the values corresponding to each (row, col) pair. + + .. note:: This method should only be used when the length of `row_labels` is small enough, + as all the result is loaded into the driver's memory. + + Parameters + ---------- + row_labels : sequence + The row labels to use for lookup. + col_labels : sequence + The column labels to use for lookup. + + Returns + ------- + numpy.ndarray + The found values. + + Examples + -------- + >>> kdf = ks.DataFrame({'A': [3, 4, 5, 6, 7], + ... 'B': [10.0, 20.0, 30.0, 40.0, 50.0], + ... 
'C': ['a', 'b', 'c', 'd', 'e']}) + >>> kdf + A B C D + 0 3 10.0 a NaN + 1 4 20.0 b NaN + 2 5 30.0 c NaN + 3 6 40.0 d NaN + 4 7 50.0 e NaN + + >>> kdf.lookup([0], ["C"]) + array(['a'], dtype=object) + + >>> kdf.lookup([2, 3], ["A", "D"]) + array([ 5., nan]) + """ + if len(row_labels) != len(col_labels): + raise ValueError("Row labels must have same size as column labels") + lookups = [ + self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels) + ] + return pd.Series(lookups).to_numpy() + def _to_internal_pandas(self): """ Return a pandas DataFrame directly from _internal to avoid overhead of copy. diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py index 254805a6eb..35c02e7704 100644 --- a/databricks/koalas/missing/frame.py +++ b/databricks/koalas/missing/frame.py @@ -53,7 +53,6 @@ class _MissingPandasLikeDataFrame(object): interpolate = _unsupported_function("interpolate") itertuples = _unsupported_function("itertuples") last = _unsupported_function("last") - lookup = _unsupported_function("lookup") mode = _unsupported_function("mode") reindex_like = _unsupported_function("reindex_like") rename_axis = _unsupported_function("rename_axis") diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index 042f53ee4e..840ef6476b 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -4019,3 +4019,34 @@ def test_from_dict(self): pdf = pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"]) kdf = ks.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"]) self.assert_eq(pdf, kdf) + + def test_lookup(self): + pdf = pd.DataFrame( + { + "A": [3, 4, 5, 6, 7], + "B": [10.0, 20.0, 30.0, 40.0, 50.0], + "C": ["a", "b", "c", "d", "e"], + } + ) + kdf = ks.from_pandas(pdf) + + self.assert_eq(pdf.lookup([0], ["C"]), kdf.lookup([0], ["C"])) + self.assert_list_eq( + pdf.lookup([0, 3, 4], ["A", "C", 
"A"]), kdf.lookup([0, 3, 4], ["A", "C", "A"]) + ) + + # MultiIndex + pdf.index = pd.MultiIndex.from_tuples( + [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")] + ) + kdf = ks.from_pandas(pdf) + + self.assert_eq(pdf.lookup([("a", "v")], ["C"]), kdf.lookup([("a", "v")], ["C"])) + self.assert_list_eq( + pdf.lookup([("a", "v"), ("d", "y"), ("e", "z")], ["A", "C", "A"]), + kdf.lookup([("a", "v"), ("d", "y"), ("e", "z")], ["A", "C", "A"]), + ) + + err_msg = "Row labels must have same size as column labels" + with self.assertRaisesRegex(ValueError, err_msg): + kdf.lookup([0, 3, 4], ["A", "C"]) diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst index 457fb8d235..719684b978 100644 --- a/docs/source/reference/frame.rst +++ b/docs/source/reference/frame.rst @@ -61,6 +61,7 @@ Indexing, iteration DataFrame.items DataFrame.iteritems DataFrame.iterrows + DataFrame.lookup DataFrame.keys DataFrame.pop DataFrame.tail From dfdb918693e1431f0063e6438e06163c1941fc74 Mon Sep 17 00:00:00 2001 From: itholic <haejoon309@naver.com> Date: Tue, 22 Sep 2020 22:01:15 +0900 Subject: [PATCH 2/9] fix doctest --- databricks/koalas/frame.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 1cf57d67ca..215b568be5 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -10252,18 +10252,18 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: ... 'B': [10.0, 20.0, 30.0, 40.0, 50.0], ... 
'C': ['a', 'b', 'c', 'd', 'e']}) >>> kdf - A B C D - 0 3 10.0 a NaN - 1 4 20.0 b NaN - 2 5 30.0 c NaN - 3 6 40.0 d NaN - 4 7 50.0 e NaN + A B C + 0 3 10.0 a + 1 4 20.0 b + 2 5 30.0 c + 3 6 40.0 d + 4 7 50.0 e >>> kdf.lookup([0], ["C"]) array(['a'], dtype=object) - >>> kdf.lookup([2, 3], ["A", "D"]) - array([ 5., nan]) + >>> kdf.lookup([2, 3], ["A", "B"]) + array([ 5., 40.]) """ if len(row_labels) != len(col_labels): raise ValueError("Row labels must have same size as column labels") From 9500b50843d2c5448d6c5f0baf32d4835b251114 Mon Sep 17 00:00:00 2001 From: itholic <haejoon309@naver.com> Date: Tue, 22 Sep 2020 23:16:55 +0900 Subject: [PATCH 3/9] Added tests and fixed related codes --- databricks/koalas/frame.py | 9 ++++++- databricks/koalas/tests/test_dataframe.py | 33 +++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 215b568be5..ffa7a86544 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -10232,7 +10232,7 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: array of the values corresponding to each (row, col) pair. .. note:: This method should only be used when the length of `row_labels` is small enough, - as all the result is loaded into the driver's memory. + as all the data belongs to the `row_labels` is loaded into the driver's memory. 
Parameters ---------- @@ -10265,8 +10265,15 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: >>> kdf.lookup([2, 3], ["A", "B"]) array([ 5., 40.]) """ + from databricks.koalas.series import Series + from databricks.koalas.indexes import Index + if len(row_labels) != len(col_labels): raise ValueError("Row labels must have same size as column labels") + if isinstance(row_labels, (Series, Index)): + row_labels = row_labels.to_numpy().tolist() + if isinstance(col_labels, (Series, Index)): + col_labels = col_labels.to_numpy().tolist() lookups = [ self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels) ] diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index 840ef6476b..dd10a84047 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -4030,11 +4030,44 @@ def test_lookup(self): ) kdf = ks.from_pandas(pdf) + # list self.assert_eq(pdf.lookup([0], ["C"]), kdf.lookup([0], ["C"])) self.assert_list_eq( pdf.lookup([0, 3, 4], ["A", "C", "A"]), kdf.lookup([0, 3, 4], ["A", "C", "A"]) ) + # tuple + self.assert_eq(pdf.lookup((0,), ("C",)), kdf.lookup((0,), ("C",))) + self.assert_list_eq( + pdf.lookup((0, 3, 4), ("A", "C", "A")), kdf.lookup((0, 3, 4), ("A", "C", "A")) + ) + + # dict + self.assert_eq(pdf.lookup({0: None}, {"C": None}), kdf.lookup({0: None}, {"C": None})) + self.assert_list_eq( + pdf.lookup({0: None, 3: None, 4: None}, {"A": None, "C": None, "B": None}), + kdf.lookup({0: None, 3: None, 4: None}, {"A": None, "C": None, "B": None}), + ) + + # Index + self.assert_eq( + pdf.lookup(pd.Index([0]), pd.Index(["C"])), kdf.lookup(ks.Index([0]), ks.Index(["C"])) + ) + self.assert_list_eq( + pdf.lookup(pd.Index([0, 3, 4]), pd.Index(["A", "C", "A"])), + kdf.lookup(ks.Index([0, 3, 4]), ks.Index(["A", "C", "A"])), + ) + + # Series + self.assert_eq( + pdf.lookup(pd.Series([0]), pd.Series(["C"])), + kdf.lookup(ks.Series([0]), ks.Series(["C"])), + 
) + self.assert_list_eq( + pdf.lookup(pd.Series([0, 3, 4]), pd.Series(["A", "C", "A"])), + kdf.lookup(ks.Series([0, 3, 4]), ks.Series(["A", "C", "A"])), + ) + # MultiIndex pdf.index = pd.MultiIndex.from_tuples( [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")] From ad7e4b2eb790be2267c00079c7d0b76aebc93e91 Mon Sep 17 00:00:00 2001 From: itholic <haejoon309@naver.com> Date: Wed, 23 Sep 2020 00:27:59 +0900 Subject: [PATCH 4/9] fix pd.Series -> ks.Series --- databricks/koalas/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index ffa7a86544..07ffff18eb 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -10277,7 +10277,7 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: lookups = [ self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels) ] - return pd.Series(lookups).to_numpy() + return Series(lookups).to_numpy() def _to_internal_pandas(self): """ From 4ee022820537b0890c826f6e8a8acb33ea47a7b1 Mon Sep 17 00:00:00 2001 From: itholic <haejoon309@naver.com> Date: Wed, 23 Sep 2020 13:10:35 +0900 Subject: [PATCH 5/9] use np.asarray instead of to_numpy() --- databricks/koalas/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 07ffff18eb..0187956a35 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -10277,7 +10277,7 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: lookups = [ self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels) ] - return Series(lookups).to_numpy() + return np.asarray(pd.Series(lookups)) def _to_internal_pandas(self): """ From 6e8642a90f49b21db59d5761b968186c99c2ee71 Mon Sep 17 00:00:00 2001 From: itholic <haejoon309@naver.com> Date: Thu, 24 Sep 2020 22:07:00 +0900 Subject: [PATCH 6/9] dont support for Index and Series --- databricks/koalas/frame.py | 16 
+++++++----- databricks/koalas/tests/test_dataframe.py | 31 +++++++++-------------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 0187956a35..611f827c2f 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -10231,8 +10231,8 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: Given equal-length arrays of row and column labels, return an array of the values corresponding to each (row, col) pair. - .. note:: This method should only be used when the length of `row_labels` is small enough, - as all the data belongs to the `row_labels` is loaded into the driver's memory. + `row_labels` and `col_labels` do not support the types `Series` and `Index` + to prevent performance degradation. Parameters ---------- @@ -10268,12 +10268,16 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: from databricks.koalas.series import Series from databricks.koalas.indexes import Index - if len(row_labels) != len(col_labels): - raise ValueError("Row labels must have same size as column labels") if isinstance(row_labels, (Series, Index)): - row_labels = row_labels.to_numpy().tolist() + raise TypeError( + "'row_labels' doesn't support type '{}'.".format(type(row_labels).__name__) + ) if isinstance(col_labels, (Series, Index)): - col_labels = col_labels.to_numpy().tolist() + raise TypeError( + "'col_labels' doesn't support type '{}'.".format(type(col_labels).__name__) + ) + if len(row_labels) != len(col_labels): + raise ValueError("Row labels must have same size as column labels") lookups = [ self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels) ] diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index dd10a84047..830e46faf1 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -4049,25 +4049,6 @@ def test_lookup(self): kdf.lookup({0: None, 3: 
None, 4: None}, {"A": None, "C": None, "B": None}), ) - # Index - self.assert_eq( - pdf.lookup(pd.Index([0]), pd.Index(["C"])), kdf.lookup(ks.Index([0]), ks.Index(["C"])) - ) - self.assert_list_eq( - pdf.lookup(pd.Index([0, 3, 4]), pd.Index(["A", "C", "A"])), - kdf.lookup(ks.Index([0, 3, 4]), ks.Index(["A", "C", "A"])), - ) - - # Series - self.assert_eq( - pdf.lookup(pd.Series([0]), pd.Series(["C"])), - kdf.lookup(ks.Series([0]), ks.Series(["C"])), - ) - self.assert_list_eq( - pdf.lookup(pd.Series([0, 3, 4]), pd.Series(["A", "C", "A"])), - kdf.lookup(ks.Series([0, 3, 4]), ks.Series(["A", "C", "A"])), - ) - # MultiIndex pdf.index = pd.MultiIndex.from_tuples( [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")] @@ -4083,3 +4064,15 @@ def test_lookup(self): err_msg = "Row labels must have same size as column labels" with self.assertRaisesRegex(ValueError, err_msg): kdf.lookup([0, 3, 4], ["A", "C"]) + err_msg = "'row_labels' doesn't support type 'Index'." + with self.assertRaisesRegex(TypeError, err_msg): + kdf.lookup(ks.Index([0]), ["C"]) + err_msg = "'row_labels' doesn't support type 'Series'." + with self.assertRaisesRegex(TypeError, err_msg): + kdf.lookup(ks.Series([0]), ["C"]) + err_msg = "'col_labels' doesn't support type 'Index'." + with self.assertRaisesRegex(TypeError, err_msg): + kdf.lookup([0], ks.Index(["C"])) + err_msg = "'col_labels' doesn't support type 'Series'." 
+ with self.assertRaisesRegex(TypeError, err_msg): + kdf.lookup([0], ks.Series(["C"])) From 612043ba4b62f43d84d4a5c0392f8c46087a14af Mon Sep 17 00:00:00 2001 From: itholic <haejoon309@naver.com> Date: Thu, 24 Sep 2020 23:30:09 +0900 Subject: [PATCH 7/9] Empty commit for rebuilding From 98007527fe3e72f248ead472b2da98fb2da24d1b Mon Sep 17 00:00:00 2001 From: itholic <haejoon309@naver.com> Date: Sun, 27 Sep 2020 03:35:32 +0900 Subject: [PATCH 8/9] Addressed comments --- databricks/koalas/frame.py | 23 ++++++++++++++++------- databricks/koalas/tests/test_dataframe.py | 15 +++++++++++++++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index bdf1defb8d..21cf16158a 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -10288,7 +10288,7 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: array([ 5., 40.]) """ from databricks.koalas.series import Series - from databricks.koalas.indexes import Index + from databricks.koalas.indexes import Index, MultiIndex if isinstance(row_labels, (Series, Index)): raise TypeError( @@ -10298,12 +10298,21 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: raise TypeError( "'col_labels' doesn't support type '{}'.".format(type(col_labels).__name__) ) - if len(row_labels) != len(col_labels): - raise ValueError("Row labels must have same size as column labels") - lookups = [ - self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels) - ] - return np.asarray(pd.Series(lookups)) + + if not isinstance(self.index, MultiIndex): + return ( + self.loc[list(set(row_labels)), list(set(col_labels))] + .to_pandas() + .lookup(row_labels, col_labels) + ) + else: + if len(row_labels) != len(col_labels): + raise ValueError("Row labels must have same size as column labels") + lookups = [ + self.loc[row_label, col_label] + for row_label, col_label in zip(row_labels, col_labels) + ] + return np.asarray(pd.Series(lookups)) def 
_to_internal_pandas(self): """ diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index dc54f25661..b4654d52b6 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -4154,6 +4154,21 @@ def test_lookup(self): err_msg = "'col_labels' doesn't support type 'Series'." with self.assertRaisesRegex(TypeError, err_msg): kdf.lookup([0], ks.Series(["C"])) + err_msg = "'D'" + with self.assertRaisesRegex(KeyError, err_msg): + kdf.lookup([2, 3], ["A", "D"]) + err_msg = "DataFrame.lookup requires unique index and columns" + with self.assertRaisesRegex(ValueError, err_msg): + pdf = pd.DataFrame( + { + "A": [3, 4, 5, 6, 7], + "B": [10.0, 20.0, 30.0, 40.0, 50.0], + "C": ["a", "b", "c", "d", "e"], + }, + index=pd.Index([1, 1, 2, 2, 3]), + ) + kdf = ks.from_pandas(pdf) + kdf.lookup([1], ["A"]) def test_pad(self): pdf = pd.DataFrame( From 2303f48e75d225eee17f961968f62f0f6819dc10 Mon Sep 17 00:00:00 2001 From: itholic <haejoon309@naver.com> Date: Sun, 27 Sep 2020 04:12:02 +0900 Subject: [PATCH 9/9] Remove tests from pandas directly --- databricks/koalas/tests/test_dataframe.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index b4654d52b6..dc54f25661 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -4154,21 +4154,6 @@ def test_lookup(self): err_msg = "'col_labels' doesn't support type 'Series'." 
with self.assertRaisesRegex(TypeError, err_msg): kdf.lookup([0], ks.Series(["C"])) - err_msg = "'D'" - with self.assertRaisesRegex(KeyError, err_msg): - kdf.lookup([2, 3], ["A", "D"]) - err_msg = "DataFrame.lookup requires unique index and columns" - with self.assertRaisesRegex(ValueError, err_msg): - pdf = pd.DataFrame( - { - "A": [3, 4, 5, 6, 7], - "B": [10.0, 20.0, 30.0, 40.0, 50.0], - "C": ["a", "b", "c", "d", "e"], - }, - index=pd.Index([1, 1, 2, 2, 3]), - ) - kdf = ks.from_pandas(pdf) - kdf.lookup([1], ["A"]) def test_pad(self): pdf = pd.DataFrame(