Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Re-implement df.at/iat/loc/iloc based on new structure #858

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 37 additions & 24 deletions sdc/datatypes/hpat_pandas_dataframe_functions.py
Original file line number Diff line number Diff line change
@@ -1896,7 +1896,7 @@ def df_getitem_tuple_at_codegen(self, row, col):
Example of generated implementation:
def _df_getitem_tuple_at_impl(self, idx):
row, _ = idx
data = self._dataframe._data[1]
data = self._dataframe._data[1][0]
res_data = pandas.Series(data, index=self._dataframe.index)
return res_data.at[row]
"""
@@ -1905,9 +1905,11 @@ def _df_getitem_tuple_at_impl(self, idx):
check = False
for i in range(len(self.columns)):
if self.columns[i] == col:
col_loc = self.column_loc[col]
type_id, col_id = col_loc.type_id, col_loc.col_id
check = True
func_lines += [
f' data = self._dataframe._data[{i}]',
f' data = self._dataframe._data[{type_id}][{col_id}]',
f' res_data = pandas.Series(data, index=self._dataframe.index)',
' return res_data.at[row]',
]
@@ -1925,9 +1927,9 @@ def df_getitem_single_label_loc_codegen(self, idx):
Example of generated implementation:
def _df_getitem_single_label_loc_impl(self, idx):
idx_list = find_idx(self._dataframe._index, idx)
data_0 = _sdc_take(self._dataframe._data[0], idx_list)
data_0 = _sdc_take(self._dataframe._data[0][0], idx_list)
res_data_0 = pandas.Series(data_0)
data_1 = _sdc_take(self._dataframe._data[1], idx_list)
data_1 = _sdc_take(self._dataframe._data[1][0], idx_list)
res_data_1 = pandas.Series(data_1)
if len(idx_list) < 1:
raise KeyError('Index is not in the DataFrame')
@@ -1948,10 +1950,11 @@ def _df_getitem_single_label_loc_impl(self, idx):
f'{fill_list_text}']
results = []
for i, c in enumerate(self.columns):
col_loc = self.column_loc[c]
type_id, col_id = col_loc.type_id, col_loc.col_id
data = f'data_{i}'
index_in_list = f'index_in_list_{i}'
res_data = f'res_data_{i}'
func_lines += [f' {data} = _sdc_take(self._dataframe._data[{i}], idx_list)',
func_lines += [f' {data} = _sdc_take(self._dataframe._data[{type_id}][{col_id}], idx_list)',
f' {res_data} = pandas.Series({data})']
results.append((c, res_data))

@@ -1976,13 +1979,13 @@ def df_getitem_int_iloc_codegen(self, idx):
"""
Example of generated implementation:
def _df_getitem_int_iloc_impl(self, idx):
if -1 < idx < len(self._dataframe.index):
data_0 = pandas.Series(self._dataframe._data[0])
result_0 = data_0.iat[idx]
data_1 = pandas.Series(self._dataframe._data[1])
result_1 = data_1.iat[idx]
return pandas.Series(data=[result_0, result_1], index=['A', 'B'], name=str(idx))
raise IndexingError('Index is out of bounds for axis')
if -1 < idx < len(self._dataframe.index):
data_0 = pandas.Series(self._dataframe._data[0][0])
result_0 = data_0.iat[idx]
data_1 = pandas.Series(self._dataframe._data[0][1])
result_1 = data_1.iat[idx]
return pandas.Series(data=[result_0, result_1], index=['A', 'B'], name=str(idx))
raise IndexingError('Index is out of bounds for axis')
"""
func_lines = ['def _df_getitem_int_iloc_impl(self, idx):',
' if -1 < idx < len(self._dataframe.index):']
@@ -1992,8 +1995,10 @@ def _df_getitem_int_iloc_impl(self, idx):
if isinstance(self.index, types.NoneType):
name = 'idx'
for i, c in enumerate(self.columns):
col_loc = self.column_loc[c]
type_id, col_id = col_loc.type_id, col_loc.col_id
result_c = f"result_{i}"
func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{i}])",
func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{type_id}][{col_id}])",
f" {result_c} = data_{i}.iat[idx]"]
results.append(result_c)
index.append(c)
@@ -2011,17 +2016,19 @@ def df_getitem_slice_iloc_codegen(self, idx):
"""
Example of generated implementation:
def _df_getitem_slice_iloc_impl(self, idx):
data_0 = pandas.Series(self._dataframe._data[0])
data_0 = pandas.Series(self._dataframe._data[0][0])
result_0 = data_0.iloc[idx]
data_1 = pandas.Series(self._dataframe._data[1])
data_1 = pandas.Series(self._dataframe._data[1][0])
result_1 = data_1.iloc[idx]
return pandas.DataFrame(data={"A": result_0, "B": result_1}, index=self._dataframe.index[idx])
"""
func_lines = ['def _df_getitem_slice_iloc_impl(self, idx):']
results = []
for i, c in enumerate(self.columns):
col_loc = self.column_loc[c]
type_id, col_id = col_loc.type_id, col_loc.col_id
result_c = f"result_{i}"
func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{i}])",
func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{type_id}][{col_id}])",
f" {result_c} = data_{i}.iloc[idx]"]
results.append((c, result_c))
data = ', '.join(f'"{col}": {data}' for col, data in results)
@@ -2042,9 +2049,9 @@ def _df_getitem_list_iloc_impl(self, idx):
if -1 < i < len(self._dataframe.index):
check_idx = True
if check_idx == True:
data_0 = pandas.Series(self._dataframe._data[0])
data_0 = pandas.Series(self._dataframe._data[0][0])
result_0 = data_0.iloc[numpy.array(idx)]
data_1 = pandas.Series(self._dataframe._data[1])
data_1 = pandas.Series(self._dataframe._data[1][0])
result_1 = data_1.iloc[numpy.array(idx)]
return pandas.DataFrame(data={"A": result_0, "B": result_1}, index=idx)
raise IndexingError('Index is out of bounds for axis')
@@ -2060,8 +2067,10 @@ def _df_getitem_list_iloc_impl(self, idx):
if isinstance(self.index, types.NoneType):
index = 'idx'
for i, c in enumerate(self.columns):
col_loc = self.column_loc[c]
type_id, col_id = col_loc.type_id, col_loc.col_id
result_c = f"result_{i}"
func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{i}])",
func_lines += [f" data_{i} = pandas.Series(self._dataframe._data[{type_id}][{col_id}])",
f" {result_c} = data_{i}.iloc[numpy.array(idx)]"]
results.append((c, result_c))
data = ', '.join(f'"{col}": {data}' for col, data in results)
@@ -2079,9 +2088,9 @@ def df_getitem_list_bool_iloc_codegen(self, idx):
Example of generated implementation:
def _df_getitem_list_bool_iloc_impl(self, idx):
if len(self._dataframe.index) == len(idx):
data_0 = self._dataframe._data[0]
data_0 = self._dataframe._data[0][0]
result_0 = pandas.Series(data_0[numpy.array(idx)])
data_1 = self._dataframe._data[1]
data_1 = self._dataframe._data[1][0]
result_1 = pandas.Series(data_1[numpy.array(idx)])
return pandas.DataFrame(data={"A": result_0, "B": result_1},
index=self._dataframe.index[numpy.array(idx)])
@@ -2092,8 +2101,10 @@ def _df_getitem_list_bool_iloc_impl(self, idx):
index = 'self._dataframe.index[numpy.array(idx)]'
func_lines += [' if len(self._dataframe.index) == len(idx):']
for i, c in enumerate(self.columns):
col_loc = self.column_loc[c]
type_id, col_id = col_loc.type_id, col_loc.col_id
result_c = f"result_{i}"
func_lines += [f" data_{i} = self._dataframe._data[{i}]",
func_lines += [f" data_{i} = self._dataframe._data[{type_id}][{col_id}]",
f" {result_c} = pandas.Series(data_{i}[numpy.array(idx)])"]
results.append((c, result_c))
data = ', '.join(f'"{col}": {data}' for col, data in results)
@@ -2164,11 +2175,13 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx):
if isinstance(idx, types.Tuple) and isinstance(idx[1], types.Literal):
col = idx[1].literal_value
if -1 < col < len(self.dataframe.columns):
col_loc = self.dataframe.column_loc[self.dataframe.columns[col]]
type_id, col_id = col_loc.type_id, col_loc.col_id

def df_getitem_iat_tuple_impl(self, idx):
row, _ = idx
if -1 < row < len(self._dataframe.index):
data = self._dataframe._data[col]
data = self._dataframe._data[type_id][col_id]
res_data = pandas.Series(data)
return res_data.iat[row]

Expand Down
4 changes: 2 additions & 2 deletions sdc/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
@@ -90,9 +90,9 @@ def init_dataframe(typingctx, *args):
type_id += 1
else:
# Get index of column in list of types
type_id, col_indices = data_typs_map[col_typ]
existing_type_id, col_indices = data_typs_map[col_typ]
col_id = len(col_indices)
column_loc[col_name] = ColumnLoc(type_id, col_id)
column_loc[col_name] = ColumnLoc(existing_type_id, col_id)
col_indices.append(i)

def codegen(context, builder, signature, args):
Expand Down
Loading