From 36255096b1068c2ba2e73bd2f7d025ed7cb70f31 Mon Sep 17 00:00:00 2001 From: Denis Date: Thu, 21 May 2020 19:11:48 +0300 Subject: [PATCH] Re-implement df.at/iat/loc/iloc based on new structure --- .../hpat_pandas_dataframe_functions.py | 61 +++++--- sdc/hiframes/pd_dataframe_ext.py | 4 +- sdc/tests/test_dataframe.py | 139 ++++++++++++++++-- 3 files changed, 165 insertions(+), 39 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index b75fd29ba..767efe7fc 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -1896,7 +1896,7 @@ def df_getitem_tuple_at_codegen(self, row, col): Example of generated implementation: def _df_getitem_tuple_at_impl(self, idx): row, _ = idx - data = self._dataframe._data[1] + data = self._dataframe._data[1][0] res_data = pandas.Series(data, index=self._dataframe.index) return res_data.at[row] """ @@ -1905,9 +1905,11 @@ def _df_getitem_tuple_at_impl(self, idx): check = False for i in range(len(self.columns)): if self.columns[i] == col: + col_loc = self.column_loc[col] + type_id, col_id = col_loc.type_id, col_loc.col_id check = True func_lines += [ - f' data = self._dataframe._data[{i}]', + f' data = self._dataframe._data[{type_id}][{col_id}]', f' res_data = pandas.Series(data, index=self._dataframe.index)', ' return res_data.at[row]', ] @@ -1925,9 +1927,9 @@ def df_getitem_single_label_loc_codegen(self, idx): Example of generated implementation: def _df_getitem_single_label_loc_impl(self, idx): idx_list = find_idx(self._dataframe._index, idx) - data_0 = _sdc_take(self._dataframe._data[0], idx_list) + data_0 = _sdc_take(self._dataframe._data[0][0], idx_list) res_data_0 = pandas.Series(data_0) - data_1 = _sdc_take(self._dataframe._data[1], idx_list) + data_1 = _sdc_take(self._dataframe._data[1][0], idx_list) res_data_1 = pandas.Series(data_1) if len(idx_list) < 1: raise KeyError('Index is not in the DataFrame') @@ -1948,10 +1950,11 @@ def _df_getitem_single_label_loc_impl(self, idx): f'{fill_list_text}'] results = [] for i, c in enumerate(self.columns): + col_loc = self.column_loc[c] + type_id, col_id = col_loc.type_id, col_loc.col_id data = f'data_{i}' - index_in_list = f'index_in_list_{i}' res_data = f'res_data_{i}' - func_lines += [f' {data} = _sdc_take(self._dataframe._data[{i}], idx_list)', + func_lines += [f' {data} = _sdc_take(self._dataframe._data[{type_id}][{col_id}], idx_list)', f' {res_data} = pandas.Series({data})'] results.append((c, res_data)) @@ -1976,13 +1979,13 @@ def df_getitem_int_iloc_codegen(self, idx): """ Example of generated implementation: def _df_getitem_int_iloc_impl(self, idx): - if -1 < idx < len(self._dataframe.index): - data_0 = pandas.Series(self._dataframe._data[0]) - result_0 = data_0.iat[idx] - data_1 = pandas.Series(self._dataframe._data[1]) - result_1 = data_1.iat[idx] - return pandas.Series(data=[result_0, result_1], index=['A', 'B'], name=str(idx)) - raise IndexingError('Index is out of bounds for axis') + if -1 < idx < len(self._dataframe.index): + data_0 = pandas.Series(self._dataframe._data[0][0]) + result_0 = data_0.iat[idx] + data_1 = pandas.Series(self._dataframe._data[0][1]) + result_1 = data_1.iat[idx] + return pandas.Series(data=[result_0, result_1], index=['A', 'B'], name=str(idx)) + raise IndexingError('Index is out of bounds for axis') """ func_lines = ['def _df_getitem_int_iloc_impl(self, idx):', ' if -1 < idx < len(self._dataframe.index):'] @@ -1992,8 +1995,10 @@ def _df_getitem_int_iloc_impl(self, idx): if isinstance(self.index, types.NoneType): name = 'idx' for i, c in enumerate(self.columns): + col_loc = self.column_loc[c] + type_id, col_id = col_loc.type_id, col_loc.col_id result_c = f"result_{i}" - func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{i}])", + func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{type_id}][{col_id}])", f" {result_c} = data_{i}.iat[idx]"] results.append(result_c) index.append(c) @@ -2011,17 +2016,19 @@ def df_getitem_slice_iloc_codegen(self, idx): """ Example of generated implementation: def _df_getitem_slice_iloc_impl(self, idx): - data_0 = pandas.Series(self._dataframe._data[0]) + data_0 = pandas.Series(self._dataframe._data[0][0]) result_0 = data_0.iloc[idx] - data_1 = pandas.Series(self._dataframe._data[1]) + data_1 = pandas.Series(self._dataframe._data[1][0]) result_1 = data_1.iloc[idx] return pandas.DataFrame(data={"A": result_0, "B": result_1}, index=self._dataframe.index[idx]) """ func_lines = ['def _df_getitem_slice_iloc_impl(self, idx):'] results = [] for i, c in enumerate(self.columns): + col_loc = self.column_loc[c] + type_id, col_id = col_loc.type_id, col_loc.col_id result_c = f"result_{i}" - func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{i}])", + func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{type_id}][{col_id}])", f" {result_c} = data_{i}.iloc[idx]"] results.append((c, result_c)) data = ', '.join(f'"{col}": {data}' for col, data in results) @@ -2042,9 +2049,9 @@ def _df_getitem_list_iloc_impl(self, idx): if -1 < i < len(self._dataframe.index): check_idx = True if check_idx == True: - data_0 = pandas.Series(self._dataframe._data[0]) + data_0 = pandas.Series(self._dataframe._data[0][0]) result_0 = data_0.iloc[numpy.array(idx)] - data_1 = pandas.Series(self._dataframe._data[1]) + data_1 = pandas.Series(self._dataframe._data[1][0]) result_1 = data_1.iloc[numpy.array(idx)] return pandas.DataFrame(data={"A": result_0, "B": result_1}, index=idx) raise IndexingError('Index is out of bounds for axis') @@ -2060,8 +2067,10 @@ def _df_getitem_list_iloc_impl(self, idx): if isinstance(self.index, types.NoneType): index = 'idx' for i, c in enumerate(self.columns): + col_loc = self.column_loc[c] + type_id, col_id = col_loc.type_id, col_loc.col_id result_c = f"result_{i}" - func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{i}])", + func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{type_id}][{col_id}])", f" {result_c} = data_{i}.iloc[numpy.array(idx)]"] results.append((c, result_c)) data = ', '.join(f'"{col}": {data}' for col, data in results) @@ -2079,9 +2088,9 @@ def df_getitem_list_bool_iloc_codegen(self, idx): Example of generated implementation: def _df_getitem_list_bool_iloc_impl(self, idx): if len(self._dataframe.index) == len(idx): - data_0 = self._dataframe._data[0] + data_0 = self._dataframe._data[0][0] result_0 = pandas.Series(data_0[numpy.array(idx)]) - data_1 = self._dataframe._data[1] + data_1 = self._dataframe._data[1][0] result_1 = pandas.Series(data_1[numpy.array(idx)]) return pandas.DataFrame(data={"A": result_0, "B": result_1}, index=self._dataframe.index[numpy.array(idx)]) @@ -2092,8 +2101,10 @@ def _df_getitem_list_bool_iloc_impl(self, idx): index = 'self._dataframe.index[numpy.array(idx)]' func_lines += [' if len(self._dataframe.index) == len(idx):'] for i, c in enumerate(self.columns): + col_loc = self.column_loc[c] + type_id, col_id = col_loc.type_id, col_loc.col_id result_c = f"result_{i}" - func_lines += [f" data_{i} = self._dataframe._data[{i}]", + func_lines += [f" data_{i} = self._dataframe._data[{type_id}][{col_id}]", f" {result_c} = pandas.Series(data_{i}[numpy.array(idx)])"] results.append((c, result_c)) data = ', '.join(f'"{col}": {data}' for col, data in results) @@ -2164,11 +2175,13 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx): if isinstance(idx, types.Tuple) and isinstance(idx[1], types.Literal): col = idx[1].literal_value if -1 < col < len(self.dataframe.columns): + col_loc = self.dataframe.column_loc[self.dataframe.columns[col]] + type_id, col_id = col_loc.type_id, col_loc.col_id def df_getitem_iat_tuple_impl(self, idx): row, _ = idx if -1 < row < len(self._dataframe.index): - data = self._dataframe._data[col] + data = self._dataframe._data[type_id][col_id] res_data = pandas.Series(data) return res_data.iat[row] diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py index 64b0ca6e6..bf4d1cfb7 100644 --- a/sdc/hiframes/pd_dataframe_ext.py +++ b/sdc/hiframes/pd_dataframe_ext.py @@ -90,9 +90,9 @@ def init_dataframe(typingctx, *args): type_id += 1 else: # Get index of column in list of types - type_id, col_indices = data_typs_map[col_typ] + existing_type_id, col_indices = data_typs_map[col_typ] col_id = len(col_indices) - column_loc[col_name] = ColumnLoc(type_id, col_id) + column_loc[col_name] = ColumnLoc(existing_type_id, col_id) col_indices.append(i) def codegen(context, builder, signature, args): diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index c9cc19b1c..645c2cd93 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -1101,7 +1101,7 @@ def test_impl(df, n): with self.subTest(n=n, index=idx): pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n)) - @dfRefactoringNotImplemented + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_iloc_slice(self): def test_impl(df, n, k): return df.iloc[n:k] @@ -1116,7 +1116,23 @@ def test_impl(df, n, k): with self.subTest(index=idx, n=n, k=k): pd.testing.assert_frame_equal(sdc_func(df, n, k), test_impl(df, n, k)) - @dfRefactoringNotImplemented + @dfRefactoringNotImplemented # required re-implementing DataFrame boxing + def test_df_iloc_slice_no_unboxing(self): + def test_impl(n, k): + df = pd.DataFrame({ + 'A': [3.2, 4.4, 7.0, 3.3, 1.0], + 'B': [5.5, np.nan, 3, 0, 7.7], + 'C': [3, 4, 1, 0, 222], + }, index=[3, 4, 2, 6, 1]) + return df.iloc[n:k] + + sdc_func = sdc.jit(test_impl) + cases_n = [-10, 0, 8, None] + for n, k in product(cases_n, cases_n[::-1]): + with self.subTest(n=n, k=k): + pd.testing.assert_frame_equal(sdc_func(n, k), test_impl(n, k)) + + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_iloc_values(self): def test_impl(df, n): return df.iloc[n, 1] @@ -1132,7 +1148,22 @@ def test_impl(df, n): if not (np.isnan(sdc_func(df, n)) and np.isnan(test_impl(df, n))): self.assertEqual(sdc_func(df, n), test_impl(df, n)) - @dfRefactoringNotImplemented + def test_df_iloc_values_no_unboxing(self): + def test_impl(n): + df = pd.DataFrame({ + 'A': [3.2, 4.4, 7.0, 3.3, 1.0], + 'B': [5.5, np.nan, 3, 0, 7.7], + 'C': [3, 4, 1, 0, 222], + }, index=[3, 4, 2, 6, 1]) + return df.iloc[n, 1] + + sdc_func = sdc.jit(test_impl) + for n in [1, 0, 2]: + with self.subTest(n=n): + if not (np.isnan(sdc_func(n)) and np.isnan(test_impl(n))): + self.assertEqual(sdc_func(n), test_impl(n)) + + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_iloc_value_error(self): def int_impl(df): return df.iloc[11] @@ -1157,7 +1188,7 @@ def list_bool_impl(df): func(df) self.assertIn(msg, str(raises.exception)) - @dfRefactoringNotImplemented + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_iloc_int(self): def test_impl(df, n): return df.iloc[n] @@ -1172,7 +1203,21 @@ def test_impl(df, n): with self.subTest(index=idx, n=n): pd.testing.assert_series_equal(sdc_func(df, n), test_impl(df, n), check_names=False) - @dfRefactoringNotImplemented + def test_df_iloc_int_no_unboxing(self): + def test_impl(n): + df = pd.DataFrame({ + 'A': [3.2, 4.4, 7.0, 3.3, 1.0], + 'B': [5.5, np.nan, 3, 0, 7.7], + 'C': [3, 4, 1, 0, 222], + }, index=[3, 4, 2, 6, 1]) + return df.iloc[n] + + sdc_func = sdc.jit(test_impl) + for n in [0, 1, 2]: + with self.subTest(n=n): + pd.testing.assert_series_equal(sdc_func(n), test_impl(n), check_names=False) + + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_iloc_list(self): def test_impl(df, n): return df.iloc[n] @@ -1187,7 +1232,22 @@ def test_impl(df, n): with self.subTest(index=idx, n=n): pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n)) - @dfRefactoringNotImplemented + @dfRefactoringNotImplemented # required re-implementing DataFrame boxing + def test_df_iloc_list_no_unboxing(self): + def test_impl(n): + df = pd.DataFrame({ + 'A': [3.2, 4.4, 7.0, 3.3, 1.0], + 'B': [5.5, np.nan, 3, 0, 7.7], + 'C': [3, 4, 1, 0, 222] + }, index=[3, 4, 2, 6, 1]) + return df.iloc[n] + + sdc_func = sdc.jit(test_impl) + for n in [[0, 1], [2, 0]]: + with self.subTest(n=n): + pd.testing.assert_frame_equal(sdc_func(n), test_impl(n)) + + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_iloc_list_bool(self): def test_impl(df, n): return df.iloc[n] @@ -1202,7 +1262,22 @@ def test_impl(df, n): with self.subTest(index=idx, n=n): pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n)) - @dfRefactoringNotImplemented + @dfRefactoringNotImplemented # required re-implementing DataFrame boxing + def test_df_iloc_list_bool_no_unboxing(self): + def test_impl(n): + df = pd.DataFrame({ + 'A': [3.2, 4.4, 7.0, 3.3, 1.0], + 'B': [5.5, np.nan, 3, 0, 7.7], + 'C': [3, 4, 1, 0, 222] + }, index=[3, 4, 2, 6, 1]) + return df.iloc[n] + + sdc_func = sdc.jit(test_impl) + for n in [[True, False, True, False, True]]: + with self.subTest(n=n): + pd.testing.assert_frame_equal(sdc_func(n), test_impl(n)) + + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_iat(self): def test_impl(df): return df.iat[0, 1] @@ -1213,6 +1288,18 @@ def test_impl(df): "C": ['a', 'dd', 'c', '12', 'ddf']}, index=idx) self.assertEqual(sdc_func(df), test_impl(df)) + def test_df_iat_no_unboxing(self): + def test_impl(): + df = pd.DataFrame({ + 'A': [3.2, 4.4, 7.0, 3.3, 1.0], + 'B': [3, 4, 1, 0, 222], + 'C': ['a', 'dd', 'c', '12', 'ddf'] + }, index=[3, 4, 2, 6, 1]) + return df.iat[0, 1] + + sdc_func = sdc.jit(test_impl) + self.assertEqual(sdc_func(), test_impl()) + def test_df_iat_value_error(self): def test_impl(df): return df.iat[1, 22] @@ -1226,7 +1313,7 @@ def test_impl(df): msg = 'Index is out of bounds for axis' self.assertIn(msg, str(raises.exception)) - @dfRefactoringNotImplemented + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_at(self): def test_impl(df, n): return df.at[n, 'C'] @@ -1240,7 +1327,20 @@ def test_impl(df, n): for n in n_cases: np.testing.assert_array_equal(sdc_func(df, n), test_impl(df, n)) - @dfRefactoringNotImplemented + def test_df_at_no_unboxing(self): + def test_impl(n): + df = pd.DataFrame({ + 'A': [3.2, 4.4, 7.0, 3.3, 1.0], + 'B': [3, 4, 1, 0, 222], + 'C': ['a', 'dd', 'c', '12', 'ddf'] + }, index=[3, 0, 1, 2, 0]) + return df.at[n, 'C'] + + sdc_func = sdc.jit(test_impl) + for n in [0, 2]: + np.testing.assert_array_equal(sdc_func(n), test_impl(n)) + + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_at_type(self): def test_impl(df, n, k): return df.at[n, "B"] @@ -1254,7 +1354,7 @@ def test_impl(df, n, k): for n in n_cases: self.assertEqual(sdc_func(df, n, "B"), test_impl(df, n, "B")) - @dfRefactoringNotImplemented + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_at_value_error(self): def test_impl(df): return df.at[5, 'C'] @@ -1269,7 +1369,7 @@ def test_impl(df): msg = 'Index is not in the Series' self.assertIn(msg, str(raises.exception)) - @dfRefactoringNotImplemented + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_loc(self): def test_impl(df): return df.loc[4] @@ -1281,7 +1381,20 @@ def test_impl(df): "C": [3.1, 8.4, 7.1, 3.2, 1]}, index=idx) pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) - @dfRefactoringNotImplemented + @dfRefactoringNotImplemented # required re-implementing DataFrame boxing + def test_df_loc_no_unboxing(self): + def test_impl(): + df = pd.DataFrame({ + 'A': [3.2, 4.4, 7.0, 3.3, 1.0], + 'B': [3, 4, 1, 0, 222], + 'C': [3.1, 8.4, 7.1, 3.2, 1] + }, index=[3, 4, 1, 4, 0]) + return df.loc[4] + + sdc_func = sdc.jit(test_impl) + pd.testing.assert_frame_equal(sdc_func(), test_impl()) + + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_loc_str(self): def test_impl(df): return df.loc['c'] @@ -1293,7 +1406,7 @@ def test_impl(df): "C": ['3.1', '8.4', '7.1', '3.2', '1']}, index=idx) pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) - @dfRefactoringNotImplemented + @dfRefactoringNotImplemented # required re-implementing DataFrame unboxing def test_df_loc_no_idx(self): def test_impl(df): return df.loc[2]