Skip to content

Commit

Permalink
feat(data_preprocessing.py): add argument to function
Browse files: browse the repository at this point in the history
  • Loading branch information
riddhibattu committed Apr 12, 2024
1 parent 5020ebd commit 506183e
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 7 deletions.
8 changes: 5 additions & 3 deletions src/pynyairbnb/data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def add_price_category(data):
data['price_category'] = categories
return data

def data_preprocessing(input_path, out_dir):
def data_preprocessing(input_path, out_dir, raw_dir="./data/raw"):
"""Main function orchestrating the data cleaning and preprocessing.
Reads data from a specified input path, performs cleaning operations including filling missing values and converting data types, splits the data into training and testing datasets, adds a 'price_category' column based on predefined price ranges, and saves the processed datasets to the specified output directory.
Expand All @@ -211,19 +211,21 @@ def data_preprocessing(input_path, out_dir):
Path to input data file.
out_dir : str
Path to directory where processed files will be saved.
raw_dir : str
Path to directory where raw data file will be saved.
Returns
-------
None
Examples
--------
>>> data_preprocessing('data/raw/airbnb_data_2023.csv', 'data/processed')
>>> data_preprocessing('data/raw/airbnb_data_2023.csv', 'data/processed', 'data/raw')
# Reads the raw data, processes it, and saves the processed data into the 'data/processed' directory.
"""
create_dir_if_not_exists(out_dir)

data = read_data(input_path, out_dir)
data = read_data(input_path, raw_dir)
data = convert_missing_values(data)
train_df, test_df = split_data(data)
train_df = add_price_category(train_df)
Expand Down
7 changes: 3 additions & 4 deletions tests/test_data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,12 +294,11 @@ def setup_mocks(mocker, mock_data):

def test_data_preprocessing(setup_mocks):
"""Tests the orchestration of the data preprocessing pipeline."""
from pynyairbnb.data_preprocessing import data_preprocessing
# Execute the function
data_preprocessing('dummy/path/to/data.csv', 'dummy/path/to/output')
data_preprocessing('dummy/path/to/data.csv', 'dummy/path/to/output', 'dummy/path/to/raw')
# Verify all steps are called correctly
setup_mocks['mock_create_dir'].assert_called_once_with('dummy/path/to/output')
setup_mocks['mock_read_data'].assert_called_once_with('dummy/path/to/data.csv', 'dummy/path/to/output')
setup_mocks['mock_read_data'].assert_called_once_with('dummy/path/to/data.csv', 'dummy/path/to/raw')
setup_mocks['mock_convert_missing'].assert_called_once()
setup_mocks['mock_split_data'].assert_called_once()
setup_mocks['mock_save_dataframes'].assert_called_once()
setup_mocks['mock_save_dataframes'].assert_called_once()

0 comments on commit 506183e

Please sign in to comment.