From 506183e7364e17afc75d152eeca2c20df1915555 Mon Sep 17 00:00:00 2001 From: Riddhi Battu Date: Thu, 11 Apr 2024 22:15:51 -0700 Subject: [PATCH] feat(data_preprocessing.py): add argument to function --- src/pynyairbnb/data_preprocessing.py | 8 +++++--- tests/test_data_preprocessing.py | 7 +++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/pynyairbnb/data_preprocessing.py b/src/pynyairbnb/data_preprocessing.py index 61246a0..0d407eb 100644 --- a/src/pynyairbnb/data_preprocessing.py +++ b/src/pynyairbnb/data_preprocessing.py @@ -200,7 +200,7 @@ def add_price_category(data): data['price_category'] = categories return data -def data_preprocessing(input_path, out_dir): +def data_preprocessing(input_path, out_dir, raw_dir="./data/raw"): """Main function orchestrating the data cleaning and preprocessing. Reads data from a specified input path, performs cleaning operations including filling missing values and converting data types, splits the data into training and testing datasets, adds a 'price_category' column based on predefined price ranges, and saves the processed datasets to the specified output directory. @@ -211,6 +211,8 @@ def data_preprocessing(input_path, out_dir): Path to input data file. out_dir : str Path to directory where processed files will be saved. + raw_dir : str + Path to directory where raw data file will be saved. Returns ------- @@ -218,12 +220,12 @@ def data_preprocessing(input_path, out_dir): Examples -------- - >>> data_preprocessing('data/raw/airbnb_data_2023.csv', 'data/processed') + >>> data_preprocessing('data/raw/airbnb_data_2023.csv', 'data/processed', 'data/raw') # Reads the raw data, processes it, and saves the processed data into the 'data/processed' directory. """ create_dir_if_not_exists(out_dir) - data = read_data(input_path, out_dir) + data = read_data(input_path, raw_dir) data = convert_missing_values(data) train_df, test_df = split_data(data) train_df = add_price_category(train_df) diff --git a/tests/test_data_preprocessing.py b/tests/test_data_preprocessing.py index 73d7600..0001ad3 100644 --- a/tests/test_data_preprocessing.py +++ b/tests/test_data_preprocessing.py @@ -294,12 +294,11 @@ def setup_mocks(mocker, mock_data): def test_data_preprocessing(setup_mocks): """Tests the orchestration of the data preprocessing pipeline.""" - from pynyairbnb.data_preprocessing import data_preprocessing # Execute the function - data_preprocessing('dummy/path/to/data.csv', 'dummy/path/to/output') + data_preprocessing('dummy/path/to/data.csv', 'dummy/path/to/output', 'dummy/path/to/raw') # Verify all steps are called correctly setup_mocks['mock_create_dir'].assert_called_once_with('dummy/path/to/output') - setup_mocks['mock_read_data'].assert_called_once_with('dummy/path/to/data.csv', 'dummy/path/to/output') + setup_mocks['mock_read_data'].assert_called_once_with('dummy/path/to/data.csv', 'dummy/path/to/raw') setup_mocks['mock_convert_missing'].assert_called_once() setup_mocks['mock_split_data'].assert_called_once() - setup_mocks['mock_save_dataframes'].assert_called_once() + setup_mocks['mock_save_dataframes'].assert_called_once() \ No newline at end of file