From 36255096b1068c2ba2e73bd2f7d025ed7cb70f31 Mon Sep 17 00:00:00 2001
From: Denis <denis.smirnov@intel.com>
Date: Thu, 21 May 2020 19:11:48 +0300
Subject: [PATCH] Re-implement df.at/iat/loc/iloc based on new structure

---
 .../hpat_pandas_dataframe_functions.py        |  61 +++++---
 sdc/hiframes/pd_dataframe_ext.py              |   4 +-
 sdc/tests/test_dataframe.py                   | 139 ++++++++++++++++--
 3 files changed, 165 insertions(+), 39 deletions(-)

diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
index b75fd29ba..767efe7fc 100644
--- a/sdc/datatypes/hpat_pandas_dataframe_functions.py
+++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -1896,7 +1896,7 @@ def df_getitem_tuple_at_codegen(self, row, col):
     Example of generated implementation:
         def _df_getitem_tuple_at_impl(self, idx):
             row, _ = idx
-            data = self._dataframe._data[1]
+            data = self._dataframe._data[1][0]
             res_data = pandas.Series(data, index=self._dataframe.index)
             return res_data.at[row]
     """
@@ -1905,9 +1905,11 @@ def _df_getitem_tuple_at_impl(self, idx):
     check = False
     for i in range(len(self.columns)):
         if self.columns[i] == col:
+            col_loc = self.column_loc[col]
+            type_id, col_id = col_loc.type_id, col_loc.col_id
             check = True
             func_lines += [
-                f'  data = self._dataframe._data[{i}]',
+                f'  data = self._dataframe._data[{type_id}][{col_id}]',
                 f'  res_data = pandas.Series(data, index=self._dataframe.index)',
                 '  return res_data.at[row]',
             ]
@@ -1925,9 +1927,9 @@ def df_getitem_single_label_loc_codegen(self, idx):
     Example of generated implementation:
         def _df_getitem_single_label_loc_impl(self, idx):
             idx_list = find_idx(self._dataframe._index, idx)
-            data_0 = _sdc_take(self._dataframe._data[0], idx_list)
+            data_0 = _sdc_take(self._dataframe._data[0][0], idx_list)
             res_data_0 = pandas.Series(data_0)
-            data_1 = _sdc_take(self._dataframe._data[1], idx_list)
+            data_1 = _sdc_take(self._dataframe._data[1][0], idx_list)
             res_data_1 = pandas.Series(data_1)
             if len(idx_list) < 1:
                 raise KeyError('Index is not in the DataFrame')
@@ -1948,10 +1950,11 @@ def _df_getitem_single_label_loc_impl(self, idx):
                   f'{fill_list_text}']
     results = []
     for i, c in enumerate(self.columns):
+        col_loc = self.column_loc[c]
+        type_id, col_id = col_loc.type_id, col_loc.col_id
         data = f'data_{i}'
-        index_in_list = f'index_in_list_{i}'
         res_data = f'res_data_{i}'
-        func_lines += [f'  {data} = _sdc_take(self._dataframe._data[{i}], idx_list)',
+        func_lines += [f'  {data} = _sdc_take(self._dataframe._data[{type_id}][{col_id}], idx_list)',
                        f'  {res_data} = pandas.Series({data})']
         results.append((c, res_data))
 
@@ -1976,13 +1979,13 @@ def df_getitem_int_iloc_codegen(self, idx):
     """
     Example of generated implementation:
         def _df_getitem_int_iloc_impl(self, idx):
-        if -1 < idx < len(self._dataframe.index):
-            data_0 = pandas.Series(self._dataframe._data[0])
-            result_0 = data_0.iat[idx]
-            data_1 = pandas.Series(self._dataframe._data[1])
-            result_1 = data_1.iat[idx]
-            return pandas.Series(data=[result_0, result_1], index=['A', 'B'], name=str(idx))
-        raise IndexingError('Index is out of bounds for axis')
+            if -1 < idx < len(self._dataframe.index):
+                data_0 = pandas.Series(self._dataframe._data[0][0])
+                result_0 = data_0.iat[idx]
+                data_1 = pandas.Series(self._dataframe._data[0][1])
+                result_1 = data_1.iat[idx]
+                return pandas.Series(data=[result_0, result_1], index=['A', 'B'], name=str(idx))
+            raise IndexingError('Index is out of bounds for axis')
     """
     func_lines = ['def _df_getitem_int_iloc_impl(self, idx):',
                   '  if -1 < idx < len(self._dataframe.index):']
@@ -1992,8 +1995,10 @@ def _df_getitem_int_iloc_impl(self, idx):
     if isinstance(self.index, types.NoneType):
         name = 'idx'
     for i, c in enumerate(self.columns):
+        col_loc = self.column_loc[c]
+        type_id, col_id = col_loc.type_id, col_loc.col_id
         result_c = f"result_{i}"
-        func_lines += [f"    data_{i} = pandas.Series(self._dataframe._data[{i}])",
+        func_lines += [f"    data_{i} = pandas.Series(self._dataframe._data[{type_id}][{col_id}])",
                        f"    {result_c} = data_{i}.iat[idx]"]
         results.append(result_c)
         index.append(c)
@@ -2011,17 +2016,19 @@ def df_getitem_slice_iloc_codegen(self, idx):
     """
     Example of generated implementation:
         def _df_getitem_slice_iloc_impl(self, idx):
-            data_0 = pandas.Series(self._dataframe._data[0])
+            data_0 = pandas.Series(self._dataframe._data[0][0])
             result_0 = data_0.iloc[idx]
-            data_1 = pandas.Series(self._dataframe._data[1])
+            data_1 = pandas.Series(self._dataframe._data[1][0])
             result_1 = data_1.iloc[idx]
             return pandas.DataFrame(data={"A": result_0, "B": result_1}, index=self._dataframe.index[idx])
     """
     func_lines = ['def _df_getitem_slice_iloc_impl(self, idx):']
     results = []
     for i, c in enumerate(self.columns):
+        col_loc = self.column_loc[c]
+        type_id, col_id = col_loc.type_id, col_loc.col_id
         result_c = f"result_{i}"
-        func_lines += [f"  data_{i} = pandas.Series(self._dataframe._data[{i}])",
+        func_lines += [f"  data_{i} = pandas.Series(self._dataframe._data[{type_id}][{col_id}])",
                        f"  {result_c} = data_{i}.iloc[idx]"]
         results.append((c, result_c))
     data = ', '.join(f'"{col}": {data}' for col, data in results)
@@ -2042,9 +2049,9 @@ def _df_getitem_list_iloc_impl(self, idx):
                 if -1 < i < len(self._dataframe.index):
                     check_idx = True
             if check_idx == True:
-                data_0 = pandas.Series(self._dataframe._data[0])
+                data_0 = pandas.Series(self._dataframe._data[0][0])
                 result_0 = data_0.iloc[numpy.array(idx)]
-                data_1 = pandas.Series(self._dataframe._data[1])
+                data_1 = pandas.Series(self._dataframe._data[1][0])
                 result_1 = data_1.iloc[numpy.array(idx)]
                 return pandas.DataFrame(data={"A": result_0, "B": result_1}, index=idx)
             raise IndexingError('Index is out of bounds for axis')
@@ -2060,8 +2067,10 @@ def _df_getitem_list_iloc_impl(self, idx):
     if isinstance(self.index, types.NoneType):
         index = 'idx'
     for i, c in enumerate(self.columns):
+        col_loc = self.column_loc[c]
+        type_id, col_id = col_loc.type_id, col_loc.col_id
         result_c = f"result_{i}"
-        func_lines += [f"    data_{i} = pandas.Series(self._dataframe._data[{i}])",
+        func_lines += [f"    data_{i} = pandas.Series(self._dataframe._data[{type_id}][{col_id}])",
                        f"    {result_c} = data_{i}.iloc[numpy.array(idx)]"]
         results.append((c, result_c))
     data = ', '.join(f'"{col}": {data}' for col, data in results)
@@ -2079,9 +2088,9 @@ def df_getitem_list_bool_iloc_codegen(self, idx):
     Example of generated implementation:
         def _df_getitem_list_bool_iloc_impl(self, idx):
             if len(self._dataframe.index) == len(idx):
-                data_0 = self._dataframe._data[0]
+                data_0 = self._dataframe._data[0][0]
                 result_0 = pandas.Series(data_0[numpy.array(idx)])
-                data_1 = self._dataframe._data[1]
+                data_1 = self._dataframe._data[1][0]
                 result_1 = pandas.Series(data_1[numpy.array(idx)])
                 return pandas.DataFrame(data={"A": result_0, "B": result_1},
                     index=self._dataframe.index[numpy.array(idx)])
@@ -2092,8 +2101,10 @@ def _df_getitem_list_bool_iloc_impl(self, idx):
     index = 'self._dataframe.index[numpy.array(idx)]'
     func_lines += ['  if len(self._dataframe.index) == len(idx):']
     for i, c in enumerate(self.columns):
+        col_loc = self.column_loc[c]
+        type_id, col_id = col_loc.type_id, col_loc.col_id
         result_c = f"result_{i}"
-        func_lines += [f"    data_{i} = self._dataframe._data[{i}]",
+        func_lines += [f"    data_{i} = self._dataframe._data[{type_id}][{col_id}]",
                        f"    {result_c} = pandas.Series(data_{i}[numpy.array(idx)])"]
         results.append((c, result_c))
     data = ', '.join(f'"{col}": {data}' for col, data in results)
@@ -2164,11 +2175,13 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx):
         if isinstance(idx, types.Tuple) and isinstance(idx[1], types.Literal):
             col = idx[1].literal_value
             if -1 < col < len(self.dataframe.columns):
+                col_loc = self.dataframe.column_loc[self.dataframe.columns[col]]
+                type_id, col_id = col_loc.type_id, col_loc.col_id
 
                 def df_getitem_iat_tuple_impl(self, idx):
                     row, _ = idx
                     if -1 < row < len(self._dataframe.index):
-                        data = self._dataframe._data[col]
+                        data = self._dataframe._data[type_id][col_id]
                         res_data = pandas.Series(data)
                         return res_data.iat[row]
 
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
index 64b0ca6e6..bf4d1cfb7 100644
--- a/sdc/hiframes/pd_dataframe_ext.py
+++ b/sdc/hiframes/pd_dataframe_ext.py
@@ -90,9 +90,9 @@ def init_dataframe(typingctx, *args):
             type_id += 1
         else:
             # Get index of column in list of types
-            type_id, col_indices = data_typs_map[col_typ]
+            existing_type_id, col_indices = data_typs_map[col_typ]
             col_id = len(col_indices)
-            column_loc[col_name] = ColumnLoc(type_id, col_id)
+            column_loc[col_name] = ColumnLoc(existing_type_id, col_id)
             col_indices.append(i)
 
     def codegen(context, builder, signature, args):
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
index c9cc19b1c..645c2cd93 100644
--- a/sdc/tests/test_dataframe.py
+++ b/sdc/tests/test_dataframe.py
@@ -1101,7 +1101,7 @@ def test_impl(df, n):
                 with self.subTest(n=n, index=idx):
                     pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n))
 
-    @dfRefactoringNotImplemented
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_iloc_slice(self):
         def test_impl(df, n, k):
             return df.iloc[n:k]
@@ -1116,7 +1116,23 @@ def test_impl(df, n, k):
                 with self.subTest(index=idx, n=n, k=k):
                     pd.testing.assert_frame_equal(sdc_func(df, n, k), test_impl(df, n, k))
 
-    @dfRefactoringNotImplemented
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame boxing
+    def test_df_iloc_slice_no_unboxing(self):
+        def test_impl(n, k):
+            df = pd.DataFrame({
+                'A': [3.2, 4.4, 7.0, 3.3, 1.0],
+                'B': [5.5, np.nan, 3, 0, 7.7],
+                'C': [3, 4, 1, 0, 222],
+            }, index=[3, 4, 2, 6, 1])
+            return df.iloc[n:k]
+
+        sdc_func = sdc.jit(test_impl)
+        cases_n = [-10, 0, 8, None]
+        for n, k in product(cases_n, cases_n[::-1]):
+            with self.subTest(n=n, k=k):
+                pd.testing.assert_frame_equal(sdc_func(n, k), test_impl(n, k))
+
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_iloc_values(self):
         def test_impl(df, n):
             return df.iloc[n, 1]
@@ -1132,7 +1148,22 @@ def test_impl(df, n):
                     if not (np.isnan(sdc_func(df, n)) and np.isnan(test_impl(df, n))):
                         self.assertEqual(sdc_func(df, n), test_impl(df, n))
 
-    @dfRefactoringNotImplemented
+    def test_df_iloc_values_no_unboxing(self):
+        def test_impl(n):
+            df = pd.DataFrame({
+                'A': [3.2, 4.4, 7.0, 3.3, 1.0],
+                'B': [5.5, np.nan, 3, 0, 7.7],
+                'C': [3, 4, 1, 0, 222],
+            }, index=[3, 4, 2, 6, 1])
+            return df.iloc[n, 1]
+
+        sdc_func = sdc.jit(test_impl)
+        for n in [1, 0, 2]:
+            with self.subTest(n=n):
+                if not (np.isnan(sdc_func(n)) and np.isnan(test_impl(n))):
+                    self.assertEqual(sdc_func(n), test_impl(n))
+
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_iloc_value_error(self):
         def int_impl(df):
             return df.iloc[11]
@@ -1157,7 +1188,7 @@ def list_bool_impl(df):
                     func(df)
                 self.assertIn(msg, str(raises.exception))
 
-    @dfRefactoringNotImplemented
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_iloc_int(self):
         def test_impl(df, n):
             return df.iloc[n]
@@ -1172,7 +1203,21 @@ def test_impl(df, n):
                 with self.subTest(index=idx, n=n):
                     pd.testing.assert_series_equal(sdc_func(df, n), test_impl(df, n), check_names=False)
 
-    @dfRefactoringNotImplemented
+    def test_df_iloc_int_no_unboxing(self):
+        def test_impl(n):
+            df = pd.DataFrame({
+                'A': [3.2, 4.4, 7.0, 3.3, 1.0],
+                'B': [5.5, np.nan, 3, 0, 7.7],
+                'C': [3, 4, 1, 0, 222],
+            }, index=[3, 4, 2, 6, 1])
+            return df.iloc[n]
+
+        sdc_func = sdc.jit(test_impl)
+        for n in [0, 1, 2]:
+            with self.subTest(n=n):
+                pd.testing.assert_series_equal(sdc_func(n), test_impl(n), check_names=False)
+
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_iloc_list(self):
         def test_impl(df, n):
             return df.iloc[n]
@@ -1187,7 +1232,22 @@ def test_impl(df, n):
                 with self.subTest(index=idx, n=n):
                     pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n))
 
-    @dfRefactoringNotImplemented
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame boxing
+    def test_df_iloc_list_no_unboxing(self):
+        def test_impl(n):
+            df = pd.DataFrame({
+                'A': [3.2, 4.4, 7.0, 3.3, 1.0],
+                'B': [5.5, np.nan, 3, 0, 7.7],
+                'C': [3, 4, 1, 0, 222]
+            }, index=[3, 4, 2, 6, 1])
+            return df.iloc[n]
+
+        sdc_func = sdc.jit(test_impl)
+        for n in [[0, 1], [2, 0]]:
+            with self.subTest(n=n):
+                pd.testing.assert_frame_equal(sdc_func(n), test_impl(n))
+
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_iloc_list_bool(self):
         def test_impl(df, n):
             return df.iloc[n]
@@ -1202,7 +1262,22 @@ def test_impl(df, n):
                 with self.subTest(index=idx, n=n):
                     pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n))
 
-    @dfRefactoringNotImplemented
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame boxing
+    def test_df_iloc_list_bool_no_unboxing(self):
+        def test_impl(n):
+            df = pd.DataFrame({
+                'A': [3.2, 4.4, 7.0, 3.3, 1.0],
+                'B': [5.5, np.nan, 3, 0, 7.7],
+                'C': [3, 4, 1, 0, 222]
+            }, index=[3, 4, 2, 6, 1])
+            return df.iloc[n]
+
+        sdc_func = sdc.jit(test_impl)
+        for n in [[True, False, True, False, True]]:
+            with self.subTest(n=n):
+                pd.testing.assert_frame_equal(sdc_func(n), test_impl(n))
+
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_iat(self):
         def test_impl(df):
             return df.iat[0, 1]
@@ -1213,6 +1288,18 @@ def test_impl(df):
                            "C": ['a', 'dd', 'c', '12', 'ddf']}, index=idx)
         self.assertEqual(sdc_func(df), test_impl(df))
 
+    def test_df_iat_no_unboxing(self):
+        def test_impl():
+            df = pd.DataFrame({
+                'A': [3.2, 4.4, 7.0, 3.3, 1.0],
+                'B': [3, 4, 1, 0, 222],
+                'C': ['a', 'dd', 'c', '12', 'ddf']
+            }, index=[3, 4, 2, 6, 1])
+            return df.iat[0, 1]
+
+        sdc_func = sdc.jit(test_impl)
+        self.assertEqual(sdc_func(), test_impl())
+
     def test_df_iat_value_error(self):
         def test_impl(df):
             return df.iat[1, 22]
@@ -1226,7 +1313,7 @@ def test_impl(df):
         msg = 'Index is out of bounds for axis'
         self.assertIn(msg, str(raises.exception))
 
-    @dfRefactoringNotImplemented
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_at(self):
         def test_impl(df, n):
             return df.at[n, 'C']
@@ -1240,7 +1327,20 @@ def test_impl(df, n):
         for n in n_cases:
             np.testing.assert_array_equal(sdc_func(df, n), test_impl(df, n))
 
-    @dfRefactoringNotImplemented
+    def test_df_at_no_unboxing(self):
+        def test_impl(n):
+            df = pd.DataFrame({
+                'A': [3.2, 4.4, 7.0, 3.3, 1.0],
+                'B': [3, 4, 1, 0, 222],
+                'C': ['a', 'dd', 'c', '12', 'ddf']
+            }, index=[3, 0, 1, 2, 0])
+            return df.at[n, 'C']
+
+        sdc_func = sdc.jit(test_impl)
+        for n in [0, 2]:
+            np.testing.assert_array_equal(sdc_func(n), test_impl(n))
+
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_at_type(self):
         def test_impl(df, n, k):
             return df.at[n, "B"]
@@ -1254,7 +1354,7 @@ def test_impl(df, n, k):
         for n in n_cases:
             self.assertEqual(sdc_func(df, n, "B"), test_impl(df, n, "B"))
 
-    @dfRefactoringNotImplemented
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_at_value_error(self):
         def test_impl(df):
             return df.at[5, 'C']
@@ -1269,7 +1369,7 @@ def test_impl(df):
         msg = 'Index is not in the Series'
         self.assertIn(msg, str(raises.exception))
 
-    @dfRefactoringNotImplemented
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_loc(self):
         def test_impl(df):
             return df.loc[4]
@@ -1281,7 +1381,20 @@ def test_impl(df):
                            "C": [3.1, 8.4, 7.1, 3.2, 1]}, index=idx)
         pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
 
-    @dfRefactoringNotImplemented
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame boxing
+    def test_df_loc_no_unboxing(self):
+        def test_impl():
+            df = pd.DataFrame({
+                'A': [3.2, 4.4, 7.0, 3.3, 1.0],
+                'B': [3, 4, 1, 0, 222],
+                'C': [3.1, 8.4, 7.1, 3.2, 1]
+            }, index=[3, 4, 1, 4, 0])
+            return df.loc[4]
+
+        sdc_func = sdc.jit(test_impl)
+        pd.testing.assert_frame_equal(sdc_func(), test_impl())
+
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_loc_str(self):
         def test_impl(df):
             return df.loc['c']
@@ -1293,7 +1406,7 @@ def test_impl(df):
                            "C": ['3.1', '8.4', '7.1', '3.2', '1']}, index=idx)
         pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
 
-    @dfRefactoringNotImplemented
+    @dfRefactoringNotImplemented  # requires re-implementing DataFrame unboxing
     def test_df_loc_no_idx(self):
         def test_impl(df):
             return df.loc[2]