diff --git a/poetry.lock b/poetry.lock
index cd93b1d..57ee49c 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3466,4 +3466,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "dae9b94b08cad52b50964793c87f9675496f9a2a4c5c964f4d8a24669707ac7e"
+content-hash = "6bcc3672d45c185ad6be19592ac5f1e3d36f16b32a293e278c24cbbbd8066e37"
diff --git a/pyproject.toml b/pyproject.toml
index 1cffb07..b9c9ab4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,7 @@ matplotlib = ">=3.8.3"
 seaborn = ">=0.13.2"
 numpy = ">=1.26.4"
 scipy = ">=1.12.0"
+click = "^8.1.7"
 
 [tool.poetry.dev-dependencies]
 
diff --git a/src/pynyairbnb/data_preprocessing.py b/src/pynyairbnb/data_preprocessing.py
index ee96df9..438741f 100644
--- a/src/pynyairbnb/data_preprocessing.py
+++ b/src/pynyairbnb/data_preprocessing.py
@@ -81,10 +81,12 @@ def convert_missing_values(data):
     1 2 2 0.0
     2 NaN 3 2.0
     """
-
-    data['id'] = data['id'].astype(str)
-    data['host_id'] = data['host_id'].astype(str)
-    data['reviews_per_month'] = data['reviews_per_month'].fillna(0)
+
+    # A missing or empty frame has nothing to convert; it is returned as-is.
+    if data is not None and not data.empty:
+        data['id'] = data['id'].astype(str)
+        data['host_id'] = data['host_id'].astype(str)
+        data['reviews_per_month'] = data['reviews_per_month'].fillna(0)
     return data
 
 def split_data(data):
@@ -221,7 +223,7 @@ def data_preprocessing(input_path, out_dir):
     """
     create_dir_if_not_exists(out_dir)
 
-    data = read_data(input_path)
+    data = read_data(input_path, out_dir)
     data = convert_missing_values(data)
     train_df, test_df = split_data(data)
     train_df = add_price_category(train_df)
diff --git a/src/pynyairbnb/pynyairbnb.py b/src/pynyairbnb/pynyairbnb.py
index 4fc26a3..785a736 100644
--- a/src/pynyairbnb/pynyairbnb.py
+++ b/src/pynyairbnb/pynyairbnb.py
@@ -4,7 +4,10 @@
 from sklearn.pipeline import make_pipeline
 from sklearn.dummy import DummyClassifier
 from sklearn.metrics import classification_report
+from sklearn.compose import make_column_transformer
 from sklearn.preprocessing import OrdinalEncoder
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.model_selection import RandomizedSearchCV
 from scipy.stats import randint
@@ -12,6 +15,34 @@
 # from src.function_build_preprocessor import build_preprocessor
 from pynyairbnb.data_preprocessing import create_dir_if_not_exists
 
+def build_preprocessor(numerical_data, text_data, categorical_data):
+    """Build a column transformer for numeric, categorical and text features.
+
+    Numeric columns are standardised with ``StandardScaler``, categorical
+    columns are one-hot encoded (categories unseen at fit time are ignored)
+    and the text column is converted to token counts with
+    ``CountVectorizer``. Columns not named in any argument are dropped.
+
+    Args:
+        numerical_data: Column selector (e.g. a list of column names) for
+            the numeric features.
+        text_data: Name of the text column. Per the scikit-learn
+            ``ColumnTransformer`` docs, a transformer that expects 1-D input,
+            such as ``CountVectorizer``, needs a single column name (a
+            string), not a list.
+        categorical_data: Column selector (e.g. a list of column names) for
+            the categorical features.
+
+    Returns:
+        sklearn.compose.ColumnTransformer: The unfitted preprocessor.
+    """
+    return make_column_transformer(
+        (StandardScaler(), numerical_data),
+        (OneHotEncoder(handle_unknown='ignore'), categorical_data),
+        (CountVectorizer(), text_data),
+        remainder='drop',
+    )
+
 def build_clf_model(model, preprocessor, tbl_out_dir, X_train, y_train, X_test, y_test, replacement_dict, clf_report_file_name):
     """_summary_
     Builds a classification model with X_train, y_train, X_test, y_test and saves the classification report to clf_saved_fp
diff --git a/tests/test_data_preprocessing.py b/tests/test_data_preprocessing.py
index 85f3426..2061c99 100644
--- a/tests/test_data_preprocessing.py
+++ b/tests/test_data_preprocessing.py
@@ -1,6 +1,9 @@
 import os
 import pandas as pd
 import sys
+import pytest
+import warnings
+from click.testing import CliRunner
 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 from src.pynyairbnb.data_preprocessing import create_dir_if_not_exists, read_data, convert_missing_values, split_data, save_dataframes, add_price_category, data_preprocessing
 
@@ -26,6 +29,26 @@ def test_read_data(tmpdir):
     saved_file = os.path.join(out_dir, 'airbnb_data_2023.csv')
     assert os.path.exists(saved_file)
     # More checks can be added here
+@pytest.fixture
+def mock_data():
+    """Return a ten-row listings frame shared by the preprocessing tests."""
+    columns = {
+        'id': range(10), # 10 samples
+        'host_id': range(10, 20),
+        'price': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
+        'reviews_per_month': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        'price_category': ['0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350+', '350+', '350+']
+    }
+    return pd.DataFrame(columns)
+
+# def test_preprocess_data(mock_data, tmpdir):
+#     csv_path = os.path.join(tmpdir, "mock_data.csv")
+#     mock_data.to_csv(csv_path, index=False)
+#     processed_data = data_preprocessing(csv_path, tmpdir)
+#     assert all(processed_data['id'].apply(lambda x: isinstance(x, str)))
+#     assert all(processed_data['host_id'].apply(lambda x: isinstance(x, str)))
+#     assert 'reviews_per_month' in processed_data.columns
+
 def test_convert_missing_values():
     """
     Test that missing values are correctly converted.
@@ -73,6 +96,83 @@ def test_add_price_category():
 
     assert 'price_category' in result_df.columns, "price_category column not added"
     assert all(result_df['price_category'] == expected_categories), "Price categories do not match expected values"
+def test_add_price_category_spanning_all_ranges():
+    prices = pd.DataFrame({'price': [-10, 25, 75, 125, 175, 225, 275, 325, 375]})
+    expected_categories = ['0-50', '0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350+']
+    categorized = add_price_category(prices)
+    assert all(categorized['price_category'] == expected_categories), "Test failed: Prices spanning all ranges are not categorized correctly."
+
+def test_add_price_category_single_category():
+    listings = pd.DataFrame({'price': [100, 105, 110]})
+    labels = ['0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350+']
+    # The expected series must carry the same ordered categories as the result
+    category_dtype = pd.CategoricalDtype(categories=labels, ordered=True)
+    expected = pd.Series(['50-100', '100-150', '100-150'], name='price_category').astype(category_dtype)
+    categorized = add_price_category(listings)
+    pd.testing.assert_series_equal(categorized['price_category'], expected, check_categorical=True)
+
+def test_add_price_category_empty_dataframe():
+    no_listings = pd.DataFrame({'price': []})
+    categorized = add_price_category(no_listings)
+    assert categorized.empty, "Test failed: The function should return an empty DataFrame when provided with one."
+
+
+def test_add_price_category_with_negative_prices():
+    below_zero = pd.DataFrame({'price': [-1, -20]})
+    categorized = add_price_category(below_zero)
+    assert all(categorized['price_category'] == ['0-50', '0-50']), "Test failed: Negative prices are not categorized correctly."
+
+def test_add_price_category_with_floats():
+    fractional = pd.DataFrame({'price': [49.99, 100.01]})
+    categorized = add_price_category(fractional)
+    assert all(categorized['price_category'] == ['0-50', '100-150']), "Test failed: Float prices are not categorized correctly."
+
+def test_add_price_category_with_boundary_prices():
+    edges = pd.DataFrame({'price': [50, 100, 150, 200, 250, 300, 350]})
+    expected_categories = ['0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350']
+    categorized = add_price_category(edges)
+    assert all(categorized['price_category'] == expected_categories), "Test failed: Boundary prices are not categorized correctly."
+
+def test_add_price_category_preserves_dtype():
+    data = pd.DataFrame({'price': [25, 75], 'other_column': [1, 2]})
+    original_dtype = data.dtypes
+    result = add_price_category(data)
+    assert result.drop(columns=['price_category']).dtypes.equals(original_dtype), "Test failed: Original data types are altered."
+
+def test_invalid_data_formats(mock_data, tmp_path):
+    with warnings.catch_warnings():
+        # Silence pandas' warning about setting a non-column attribute on a DataFrame
+        warnings.filterwarnings("ignore", message="Pandas doesn't allow columns to be created via a new attribute name", category=UserWarning)
+        # NOTE(review): mock_data is a DataFrame, so this only sets an attribute; the command below never reads it
+        mock_data.return_value = pd.DataFrame({
+            'id': ['one', 'two'],
+            'host_id': [3, 4],
+            'price': ['a hundred', 'two hundred'],
+            'reviews_per_month': [1.0, 'two']
+        })
+        runner = CliRunner()
+        with runner.isolated_filesystem():
+            result = runner.invoke(data_preprocessing, ['--input_path', 'dummy.csv', '--out_dir', str(tmp_path)], prog_name="data_preprocessing")
+            assert result.exit_code != 0
+
+
+def test_price_categorization_logic():
+    mock_df = pd.DataFrame({
+        'price': [75, 150, 225, 300]
+    })
+    expected_categories = ['50-100', '100-150', '200-250', '250-300']
+    categorized_df = add_price_category(mock_df)
+    assert all(categorized_df['price_category'] == expected_categories)
+
+
+def test_data_splitting_proportions(mock_data):
+    train_df, test_df = split_data(mock_data)
+    total_len = len(mock_data)
+    train_len = len(train_df)
+    test_len = len(test_df)
+    # Check if the proportions approximately match the expected 80-20 split
+    assert train_len / total_len == pytest.approx(0.8, 0.05)
+    assert test_len / total_len == pytest.approx(0.2, 0.05)
 
 # def test_data_preprocessing(tmpdir):
 #     """
diff --git a/tests/test_plotting.py b/tests/test_plotting.py
index e69de29..91af4e1 100644
--- a/tests/test_plotting.py
+++ b/tests/test_plotting.py
@@ -0,0 +1,38 @@
+import pandas as pd
+import matplotlib
+import pytest
+import os
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from src.pynyairbnb.plotting import sns_plotting
+
+# Same Data to be used for all tests
+data = pd.DataFrame({"price": [25, 75, 125, 175, 225, 275, 325, 375],
+                     "number_of_reviews": [2, 0, 1, 15, 22, 7, 5, 3],
+                     "reviews_per_month": [0, 3, 4, 1, 2, 0, 3, 4],
+                     "room_type": ["cat2", "cat1", "cat5", "cat4", "cat3", "cat6", "cat8", "cat7"]})
+
+def test_sns_plotting_output_type():
+    """A call with valid inputs returns a matplotlib Figure."""
+    result = sns_plotting('scatterplot', data, 'number_of_reviews', 'price', 20, 10)
+    assert isinstance(result, matplotlib.figure.Figure), "Test failed: Output type is incorrect."
+
+def test_sns_plotting_plottype_error():
+    """A plot type that is not available ('barplot') raises an exception."""
+    with pytest.raises(Exception):
+        sns_plotting('barplot', data, 'number_of_reviews', 'price', 20, 10)
+
+def test_sns_plotting_x_error():
+    """An x-variable that is not a column of the data raises ValueError."""
+    with pytest.raises(ValueError):
+        sns_plotting('scatterplot', data, 'random_x', 'price', 20, 10)
+
+def test_sns_plotting_y_error():
+    """A y-variable that is not a column of the data raises ValueError."""
+    with pytest.raises(ValueError):
+        sns_plotting('scatterplot', data, 'number_of_reviews', 'random_y', 20, 10)
+
+def test_sns_plotting_figsize_check():
+    """Called without the two trailing size arguments, neither figure dimension exceeds 25 inches."""
+    result = sns_plotting('scatterplot', data, 'number_of_reviews', 'price')
+    assert result.get_size_inches()[0] <= 25 and result.get_size_inches()[1] <= 25, "Test failed: Plot size is too large."
\ No newline at end of file
diff --git a/tests/test_pynyairbnb.py b/tests/test_pynyairbnb.py
index 968750b..51cd653 100644
--- a/tests/test_pynyairbnb.py
+++ b/tests/test_pynyairbnb.py
@@ -1 +1,89 @@
-from pynyairbnb import pynyairbnb
+import sys
+import os
+from sklearn.dummy import DummyClassifier
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.neighbors import KNeighborsClassifier
+import pandas as pd
+import numpy as np
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import make_classification
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from src.pynyairbnb.pynyairbnb import build_preprocessor, build_clf_model, knn_param_optimization, nyairbnb_analysis
+
+
+def test_preprocessor_type():
+    """Test that the function returns a ColumnTransformer object."""
+    numerical_data = ['numerical']
+    text_data = 'text'
+    categorical_data = ['category']
+
+    preprocessor = build_preprocessor(numerical_data, text_data, categorical_data)
+    assert isinstance(preprocessor, ColumnTransformer)
+
+def test_transformer_assignment():
+    """Test that the correct transformers are assigned to the specified types of data."""
+    preprocessor = build_preprocessor(['num'], ['text'], ['cat'])
+    transformers = {name: type(trans) for name, trans, cols in preprocessor.transformers}
+
+    assert transformers.get('standardscaler') == StandardScaler
+    assert transformers.get('onehotencoder') == OneHotEncoder
+    assert transformers.get('countvectorizer') == CountVectorizer
+
+def test_preprocessor():
+    # Create an artificial dataset
+    np.random.seed(0) # For reproducibility
+    data = pd.DataFrame({
+        'numerical': np.random.randn(100),
+        'text': np.random.choice(['First text', 'Text number 2', 'Third sentence of text'], size=100),
+        'category': np.random.choice(['A', 'B', 'C'], size=100),
+        'target': np.array([0]*90 + [1]*10) # 90 zeros and 10 ones, so a most-frequent dummy classifier should predict 0 every time
+    })
+
+    # Split the dataset
+    X_train, X_test, y_train, y_test = train_test_split(
+        data.drop('target', axis=1), data['target'], test_size=0.25, random_state=42)
+
+    # Define columns
+    numerical_data = ['numerical']
+    text_data = 'text'
+    categorical_data = ['category']
+
+    # Build the preprocessor
+    preprocessor = build_preprocessor(numerical_data, text_data, categorical_data)
+
+    # Combine the preprocessor with a dummy model using the most-frequent strategy
+    model = make_pipeline(preprocessor, DummyClassifier(strategy='most_frequent'))
+
+    # Fit the model
+    model.fit(X_train, y_train)
+
+    # Make predictions and evaluate
+    predictions = model.predict(X_test)
+    accuracy = accuracy_score(y_test, predictions)
+
+    print(f"Dummy Classifier Accuracy: {accuracy:.4f}")
+
+    assert accuracy >= 0.5, "Accuracy should be at least 0.5"
+
+X, y = make_classification(n_samples=100, n_features=4, n_informative=2, n_redundant=0, random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+
+def test_build_clf_model(tmp_path):
+    """Test that build_clf_model returns a model and writes the classification report file."""
+    preprocessor = StandardScaler()
+    model = KNeighborsClassifier()
+    # pytest's tmp_path keeps the report out of the working tree, so no manual
+    # cleanup is needed and nothing is left behind when an assertion fails.
+    tbl_out_dir = str(tmp_path)
+    replacement_dict = {'0': 'Class 0', '1': 'Class 1'}
+    clf_report_file_name = "test_clf_report.csv"
+
+    clf_model = build_clf_model(model, preprocessor, tbl_out_dir, X_train, y_train, X_test, y_test, replacement_dict, clf_report_file_name)
+
+    assert clf_model is not None
+    assert os.path.isfile(os.path.join(tbl_out_dir, clf_report_file_name))