From 090c8e073e30bb723d5a2f5f1a3d9783fb92ab6b Mon Sep 17 00:00:00 2001
From: itholic <haejoon309@naver.com>
Date: Tue, 22 Sep 2020 21:57:39 +0900
Subject: [PATCH 1/9] Implemented DataFrame.lookup

---
 databricks/koalas/frame.py                | 48 +++++++++++++++++++++++
 databricks/koalas/missing/frame.py        |  1 -
 databricks/koalas/tests/test_dataframe.py | 31 +++++++++++++++
 docs/source/reference/frame.rst           |  1 +
 4 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
index 71bd5035ff..1cf57d67ca 100644
--- a/databricks/koalas/frame.py
+++ b/databricks/koalas/frame.py
@@ -10224,6 +10224,54 @@ def from_dict(data, orient="columns", dtype=None, columns=None) -> "DataFrame":
         """
         return DataFrame(pd.DataFrame.from_dict(data, orient=orient, dtype=dtype, columns=columns))
 
+    def lookup(self, row_labels, col_labels) -> np.ndarray:
+        """
+        Label-based "fancy indexing" function for DataFrame.
+
+        Given equal-length arrays of row and column labels, return an
+        array of the values corresponding to each (row, col) pair.
+
+        .. note:: This method should only be used when the length of `row_labels` is small enough,
+                  as all the result is loaded into the driver's memory.
+
+        Parameters
+        ----------
+        row_labels : sequence
+            The row labels to use for lookup.
+        col_labels : sequence
+            The column labels to use for lookup.
+
+        Returns
+        -------
+        numpy.ndarray
+            The found values.
+
+        Examples
+        --------
+        >>> kdf = ks.DataFrame({'A': [3, 4, 5, 6, 7],
+        ...                     'B': [10.0, 20.0, 30.0, 40.0, 50.0],
+        ...                     'C': ['a', 'b', 'c', 'd', 'e']})
+        >>> kdf
+           A     B  C   D
+        0  3  10.0  a NaN
+        1  4  20.0  b NaN
+        2  5  30.0  c NaN
+        3  6  40.0  d NaN
+        4  7  50.0  e NaN
+
+        >>> kdf.lookup([0], ["C"])
+        array(['a'], dtype=object)
+
+        >>> kdf.lookup([2, 3], ["A", "D"])
+        array([ 5., nan])
+        """
+        if len(row_labels) != len(col_labels):
+            raise ValueError("Row labels must have same size as column labels")
+        lookups = [
+            self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels)
+        ]
+        return pd.Series(lookups).to_numpy()
+
     def _to_internal_pandas(self):
         """
         Return a pandas DataFrame directly from _internal to avoid overhead of copy.
diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
index 254805a6eb..35c02e7704 100644
--- a/databricks/koalas/missing/frame.py
+++ b/databricks/koalas/missing/frame.py
@@ -53,7 +53,6 @@ class _MissingPandasLikeDataFrame(object):
     interpolate = _unsupported_function("interpolate")
     itertuples = _unsupported_function("itertuples")
     last = _unsupported_function("last")
-    lookup = _unsupported_function("lookup")
     mode = _unsupported_function("mode")
     reindex_like = _unsupported_function("reindex_like")
     rename_axis = _unsupported_function("rename_axis")
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
index 042f53ee4e..840ef6476b 100644
--- a/databricks/koalas/tests/test_dataframe.py
+++ b/databricks/koalas/tests/test_dataframe.py
@@ -4019,3 +4019,34 @@ def test_from_dict(self):
         pdf = pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"])
         kdf = ks.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"])
         self.assert_eq(pdf, kdf)
+
+    def test_lookup(self):
+        pdf = pd.DataFrame(
+            {
+                "A": [3, 4, 5, 6, 7],
+                "B": [10.0, 20.0, 30.0, 40.0, 50.0],
+                "C": ["a", "b", "c", "d", "e"],
+            }
+        )
+        kdf = ks.from_pandas(pdf)
+
+        self.assert_eq(pdf.lookup([0], ["C"]), kdf.lookup([0], ["C"]))
+        self.assert_list_eq(
+            pdf.lookup([0, 3, 4], ["A", "C", "A"]), kdf.lookup([0, 3, 4], ["A", "C", "A"])
+        )
+
+        # MultiIndex
+        pdf.index = pd.MultiIndex.from_tuples(
+            [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")]
+        )
+        kdf = ks.from_pandas(pdf)
+
+        self.assert_eq(pdf.lookup([("a", "v")], ["C"]), kdf.lookup([("a", "v")], ["C"]))
+        self.assert_list_eq(
+            pdf.lookup([("a", "v"), ("d", "y"), ("e", "z")], ["A", "C", "A"]),
+            kdf.lookup([("a", "v"), ("d", "y"), ("e", "z")], ["A", "C", "A"]),
+        )
+
+        err_msg = "Row labels must have same size as column labels"
+        with self.assertRaisesRegex(ValueError, err_msg):
+            kdf.lookup([0, 3, 4], ["A", "C"])
diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
index 457fb8d235..719684b978 100644
--- a/docs/source/reference/frame.rst
+++ b/docs/source/reference/frame.rst
@@ -61,6 +61,7 @@ Indexing, iteration
    DataFrame.items
    DataFrame.iteritems
    DataFrame.iterrows
+   DataFrame.lookup
    DataFrame.keys
    DataFrame.pop
    DataFrame.tail

From dfdb918693e1431f0063e6438e06163c1941fc74 Mon Sep 17 00:00:00 2001
From: itholic <haejoon309@naver.com>
Date: Tue, 22 Sep 2020 22:01:15 +0900
Subject: [PATCH 2/9] fix doctest

---
 databricks/koalas/frame.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
index 1cf57d67ca..215b568be5 100644
--- a/databricks/koalas/frame.py
+++ b/databricks/koalas/frame.py
@@ -10252,18 +10252,18 @@ def lookup(self, row_labels, col_labels) -> np.ndarray:
         ...                     'B': [10.0, 20.0, 30.0, 40.0, 50.0],
         ...                     'C': ['a', 'b', 'c', 'd', 'e']})
         >>> kdf
-           A     B  C   D
-        0  3  10.0  a NaN
-        1  4  20.0  b NaN
-        2  5  30.0  c NaN
-        3  6  40.0  d NaN
-        4  7  50.0  e NaN
+           A     B  C
+        0  3  10.0  a
+        1  4  20.0  b
+        2  5  30.0  c
+        3  6  40.0  d
+        4  7  50.0  e
 
         >>> kdf.lookup([0], ["C"])
         array(['a'], dtype=object)
 
-        >>> kdf.lookup([2, 3], ["A", "D"])
-        array([ 5., nan])
+        >>> kdf.lookup([2, 3], ["A", "B"])
+        array([ 5., 40.])
         """
         if len(row_labels) != len(col_labels):
             raise ValueError("Row labels must have same size as column labels")

From 9500b50843d2c5448d6c5f0baf32d4835b251114 Mon Sep 17 00:00:00 2001
From: itholic <haejoon309@naver.com>
Date: Tue, 22 Sep 2020 23:16:55 +0900
Subject: [PATCH 3/9] Added tests and fixed related code

---
 databricks/koalas/frame.py                |  9 ++++++-
 databricks/koalas/tests/test_dataframe.py | 33 +++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
index 215b568be5..ffa7a86544 100644
--- a/databricks/koalas/frame.py
+++ b/databricks/koalas/frame.py
@@ -10232,7 +10232,7 @@ def lookup(self, row_labels, col_labels) -> np.ndarray:
         array of the values corresponding to each (row, col) pair.
 
         .. note:: This method should only be used when the length of `row_labels` is small enough,
-                  as all the result is loaded into the driver's memory.
+                  as all the data belongs to the `row_labels` is loaded into the driver's memory.
 
         Parameters
         ----------
@@ -10265,8 +10265,15 @@ def lookup(self, row_labels, col_labels) -> np.ndarray:
         >>> kdf.lookup([2, 3], ["A", "B"])
         array([ 5., 40.])
         """
+        from databricks.koalas.series import Series
+        from databricks.koalas.indexes import Index
+
         if len(row_labels) != len(col_labels):
             raise ValueError("Row labels must have same size as column labels")
+        if isinstance(row_labels, (Series, Index)):
+            row_labels = row_labels.to_numpy().tolist()
+        if isinstance(col_labels, (Series, Index)):
+            col_labels = col_labels.to_numpy().tolist()
         lookups = [
             self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels)
         ]
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
index 840ef6476b..dd10a84047 100644
--- a/databricks/koalas/tests/test_dataframe.py
+++ b/databricks/koalas/tests/test_dataframe.py
@@ -4030,11 +4030,44 @@ def test_lookup(self):
         )
         kdf = ks.from_pandas(pdf)
 
+        # list
         self.assert_eq(pdf.lookup([0], ["C"]), kdf.lookup([0], ["C"]))
         self.assert_list_eq(
             pdf.lookup([0, 3, 4], ["A", "C", "A"]), kdf.lookup([0, 3, 4], ["A", "C", "A"])
         )
 
+        # tuple
+        self.assert_eq(pdf.lookup((0,), ("C",)), kdf.lookup((0,), ("C",)))
+        self.assert_list_eq(
+            pdf.lookup((0, 3, 4), ("A", "C", "A")), kdf.lookup((0, 3, 4), ("A", "C", "A"))
+        )
+
+        # dict
+        self.assert_eq(pdf.lookup({0: None}, {"C": None}), kdf.lookup({0: None}, {"C": None}))
+        self.assert_list_eq(
+            pdf.lookup({0: None, 3: None, 4: None}, {"A": None, "C": None, "B": None}),
+            kdf.lookup({0: None, 3: None, 4: None}, {"A": None, "C": None, "B": None}),
+        )
+
+        # Index
+        self.assert_eq(
+            pdf.lookup(pd.Index([0]), pd.Index(["C"])), kdf.lookup(ks.Index([0]), ks.Index(["C"]))
+        )
+        self.assert_list_eq(
+            pdf.lookup(pd.Index([0, 3, 4]), pd.Index(["A", "C", "A"])),
+            kdf.lookup(ks.Index([0, 3, 4]), ks.Index(["A", "C", "A"])),
+        )
+
+        # Series
+        self.assert_eq(
+            pdf.lookup(pd.Series([0]), pd.Series(["C"])),
+            kdf.lookup(ks.Series([0]), ks.Series(["C"])),
+        )
+        self.assert_list_eq(
+            pdf.lookup(pd.Series([0, 3, 4]), pd.Series(["A", "C", "A"])),
+            kdf.lookup(ks.Series([0, 3, 4]), ks.Series(["A", "C", "A"])),
+        )
+
         # MultiIndex
         pdf.index = pd.MultiIndex.from_tuples(
             [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")]

From ad7e4b2eb790be2267c00079c7d0b76aebc93e91 Mon Sep 17 00:00:00 2001
From: itholic <haejoon309@naver.com>
Date: Wed, 23 Sep 2020 00:27:59 +0900
Subject: [PATCH 4/9] fix pd.Series -> ks.Series

---
 databricks/koalas/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
index ffa7a86544..07ffff18eb 100644
--- a/databricks/koalas/frame.py
+++ b/databricks/koalas/frame.py
@@ -10277,7 +10277,7 @@ def lookup(self, row_labels, col_labels) -> np.ndarray:
         lookups = [
             self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels)
         ]
-        return pd.Series(lookups).to_numpy()
+        return Series(lookups).to_numpy()
 
     def _to_internal_pandas(self):
         """

From 4ee022820537b0890c826f6e8a8acb33ea47a7b1 Mon Sep 17 00:00:00 2001
From: itholic <haejoon309@naver.com>
Date: Wed, 23 Sep 2020 13:10:35 +0900
Subject: [PATCH 5/9] use np.asarray instead of to_numpy()

---
 databricks/koalas/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
index 07ffff18eb..0187956a35 100644
--- a/databricks/koalas/frame.py
+++ b/databricks/koalas/frame.py
@@ -10277,7 +10277,7 @@ def lookup(self, row_labels, col_labels) -> np.ndarray:
         lookups = [
             self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels)
         ]
-        return Series(lookups).to_numpy()
+        return np.asarray(pd.Series(lookups))
 
     def _to_internal_pandas(self):
         """

From 6e8642a90f49b21db59d5761b968186c99c2ee71 Mon Sep 17 00:00:00 2001
From: itholic <haejoon309@naver.com>
Date: Thu, 24 Sep 2020 22:07:00 +0900
Subject: [PATCH 6/9] Don't support Index and Series

---
 databricks/koalas/frame.py                | 16 +++++++-----
 databricks/koalas/tests/test_dataframe.py | 31 +++++++++--------------
 2 files changed, 22 insertions(+), 25 deletions(-)

diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
index 0187956a35..611f827c2f 100644
--- a/databricks/koalas/frame.py
+++ b/databricks/koalas/frame.py
@@ -10231,8 +10231,8 @@ def lookup(self, row_labels, col_labels) -> np.ndarray:
         Given equal-length arrays of row and column labels, return an
         array of the values corresponding to each (row, col) pair.
 
-        .. note:: This method should only be used when the length of `row_labels` is small enough,
-                  as all the data belongs to the `row_labels` is loaded into the driver's memory.
+        `row_labels` and `col_labels` do not support the types `Series` and `Index`,
+        to prevent performance degradation.
 
         Parameters
         ----------
@@ -10268,12 +10268,16 @@ def lookup(self, row_labels, col_labels) -> np.ndarray:
         from databricks.koalas.series import Series
         from databricks.koalas.indexes import Index
 
-        if len(row_labels) != len(col_labels):
-            raise ValueError("Row labels must have same size as column labels")
         if isinstance(row_labels, (Series, Index)):
-            row_labels = row_labels.to_numpy().tolist()
+            raise TypeError(
+                "'row_labels' doesn't support type '{}'.".format(type(row_labels).__name__)
+            )
         if isinstance(col_labels, (Series, Index)):
-            col_labels = col_labels.to_numpy().tolist()
+            raise TypeError(
+                "'col_labels' doesn't support type '{}'.".format(type(col_labels).__name__)
+            )
+        if len(row_labels) != len(col_labels):
+            raise ValueError("Row labels must have same size as column labels")
         lookups = [
             self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels)
         ]
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
index dd10a84047..830e46faf1 100644
--- a/databricks/koalas/tests/test_dataframe.py
+++ b/databricks/koalas/tests/test_dataframe.py
@@ -4049,25 +4049,6 @@ def test_lookup(self):
             kdf.lookup({0: None, 3: None, 4: None}, {"A": None, "C": None, "B": None}),
         )
 
-        # Index
-        self.assert_eq(
-            pdf.lookup(pd.Index([0]), pd.Index(["C"])), kdf.lookup(ks.Index([0]), ks.Index(["C"]))
-        )
-        self.assert_list_eq(
-            pdf.lookup(pd.Index([0, 3, 4]), pd.Index(["A", "C", "A"])),
-            kdf.lookup(ks.Index([0, 3, 4]), ks.Index(["A", "C", "A"])),
-        )
-
-        # Series
-        self.assert_eq(
-            pdf.lookup(pd.Series([0]), pd.Series(["C"])),
-            kdf.lookup(ks.Series([0]), ks.Series(["C"])),
-        )
-        self.assert_list_eq(
-            pdf.lookup(pd.Series([0, 3, 4]), pd.Series(["A", "C", "A"])),
-            kdf.lookup(ks.Series([0, 3, 4]), ks.Series(["A", "C", "A"])),
-        )
-
         # MultiIndex
         pdf.index = pd.MultiIndex.from_tuples(
             [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")]
@@ -4083,3 +4064,15 @@ def test_lookup(self):
         err_msg = "Row labels must have same size as column labels"
         with self.assertRaisesRegex(ValueError, err_msg):
             kdf.lookup([0, 3, 4], ["A", "C"])
+        err_msg = "'row_labels' doesn't support type 'Index'."
+        with self.assertRaisesRegex(TypeError, err_msg):
+            kdf.lookup(ks.Index([0]), ["C"])
+        err_msg = "'row_labels' doesn't support type 'Series'."
+        with self.assertRaisesRegex(TypeError, err_msg):
+            kdf.lookup(ks.Series([0]), ["C"])
+        err_msg = "'col_labels' doesn't support type 'Index'."
+        with self.assertRaisesRegex(TypeError, err_msg):
+            kdf.lookup([0], ks.Index(["C"]))
+        err_msg = "'col_labels' doesn't support type 'Series'."
+        with self.assertRaisesRegex(TypeError, err_msg):
+            kdf.lookup([0], ks.Series(["C"]))

From 612043ba4b62f43d84d4a5c0392f8c46087a14af Mon Sep 17 00:00:00 2001
From: itholic <haejoon309@naver.com>
Date: Thu, 24 Sep 2020 23:30:09 +0900
Subject: [PATCH 7/9] Empty commit for rebuilding


From 98007527fe3e72f248ead472b2da98fb2da24d1b Mon Sep 17 00:00:00 2001
From: itholic <haejoon309@naver.com>
Date: Sun, 27 Sep 2020 03:35:32 +0900
Subject: [PATCH 8/9] Addressed comments

---
 databricks/koalas/frame.py                | 23 ++++++++++++++++-------
 databricks/koalas/tests/test_dataframe.py | 15 +++++++++++++++
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
index bdf1defb8d..21cf16158a 100644
--- a/databricks/koalas/frame.py
+++ b/databricks/koalas/frame.py
@@ -10288,7 +10288,7 @@ def lookup(self, row_labels, col_labels) -> np.ndarray:
         array([ 5., 40.])
         """
         from databricks.koalas.series import Series
-        from databricks.koalas.indexes import Index
+        from databricks.koalas.indexes import Index, MultiIndex
 
         if isinstance(row_labels, (Series, Index)):
             raise TypeError(
@@ -10298,12 +10298,21 @@ def lookup(self, row_labels, col_labels) -> np.ndarray:
             raise TypeError(
                 "'col_labels' doesn't support type '{}'.".format(type(col_labels).__name__)
             )
-        if len(row_labels) != len(col_labels):
-            raise ValueError("Row labels must have same size as column labels")
-        lookups = [
-            self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels)
-        ]
-        return np.asarray(pd.Series(lookups))
+
+        if not isinstance(self.index, MultiIndex):
+            return (
+                self.loc[list(set(row_labels)), list(set(col_labels))]
+                .to_pandas()
+                .lookup(row_labels, col_labels)
+            )
+        else:
+            if len(row_labels) != len(col_labels):
+                raise ValueError("Row labels must have same size as column labels")
+            lookups = [
+                self.loc[row_label, col_label]
+                for row_label, col_label in zip(row_labels, col_labels)
+            ]
+            return np.asarray(pd.Series(lookups))
 
     def _to_internal_pandas(self):
         """
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
index dc54f25661..b4654d52b6 100644
--- a/databricks/koalas/tests/test_dataframe.py
+++ b/databricks/koalas/tests/test_dataframe.py
@@ -4154,6 +4154,21 @@ def test_lookup(self):
         err_msg = "'col_labels' doesn't support type 'Series'."
         with self.assertRaisesRegex(TypeError, err_msg):
             kdf.lookup([0], ks.Series(["C"]))
+        err_msg = "'D'"
+        with self.assertRaisesRegex(KeyError, err_msg):
+            kdf.lookup([2, 3], ["A", "D"])
+        err_msg = "DataFrame.lookup requires unique index and columns"
+        with self.assertRaisesRegex(ValueError, err_msg):
+            pdf = pd.DataFrame(
+                {
+                    "A": [3, 4, 5, 6, 7],
+                    "B": [10.0, 20.0, 30.0, 40.0, 50.0],
+                    "C": ["a", "b", "c", "d", "e"],
+                },
+                index=pd.Index([1, 1, 2, 2, 3]),
+            )
+            kdf = ks.from_pandas(pdf)
+            kdf.lookup([1], ["A"])
 
     def test_pad(self):
         pdf = pd.DataFrame(

From 2303f48e75d225eee17f961968f62f0f6819dc10 Mon Sep 17 00:00:00 2001
From: itholic <haejoon309@naver.com>
Date: Sun, 27 Sep 2020 04:12:02 +0900
Subject: [PATCH 9/9] Remove tests from pandas directly

---
 databricks/koalas/tests/test_dataframe.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
index b4654d52b6..dc54f25661 100644
--- a/databricks/koalas/tests/test_dataframe.py
+++ b/databricks/koalas/tests/test_dataframe.py
@@ -4154,21 +4154,6 @@ def test_lookup(self):
         err_msg = "'col_labels' doesn't support type 'Series'."
         with self.assertRaisesRegex(TypeError, err_msg):
             kdf.lookup([0], ks.Series(["C"]))
-        err_msg = "'D'"
-        with self.assertRaisesRegex(KeyError, err_msg):
-            kdf.lookup([2, 3], ["A", "D"])
-        err_msg = "DataFrame.lookup requires unique index and columns"
-        with self.assertRaisesRegex(ValueError, err_msg):
-            pdf = pd.DataFrame(
-                {
-                    "A": [3, 4, 5, 6, 7],
-                    "B": [10.0, 20.0, 30.0, 40.0, 50.0],
-                    "C": ["a", "b", "c", "d", "e"],
-                },
-                index=pd.Index([1, 1, 2, 2, 3]),
-            )
-            kdf = ks.from_pandas(pdf)
-            kdf.lookup([1], ["A"])
 
     def test_pad(self):
         pdf = pd.DataFrame(