added tests and fixed them such that all run

DSCI-310-2024 · Apr 11, 2024 · 42f32c0 · 42f32c0
1 parent a52279c
commit 42f32c0
Show file tree

Hide file tree

Showing 7 changed files with 273 additions and 7 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,6 +14,7 @@ matplotlib = ">=3.8.3"
 seaborn = ">=0.13.2"
 numpy = ">=1.26.4"
 scipy = ">=1.12.0"
+click = "^8.1.7"
 
 [tool.poetry.dev-dependencies]
 

diff --git a/src/pynyairbnb/data_preprocessing.py b/src/pynyairbnb/data_preprocessing.py
@@ -81,10 +81,13 @@ def convert_missing_values(data):
     1   2       2                0.0
     2  NaN       3                2.0
     """
-
-    data['id'] = data['id'].astype(str)
-    data['host_id'] = data['host_id'].astype(str)
-    data['reviews_per_month'] = data['reviews_per_month'].fillna(0)
+
+    if data is None or data.empty:
+        pass
+    else:
+        data['id'] = data['id'].astype(str)
+        data['host_id'] = data['host_id'].astype(str)
+        data['reviews_per_month'] = data['reviews_per_month'].fillna(0)
     return data
 
 def split_data(data):
@@ -221,7 +224,7 @@ def data_preprocessing(input_path, out_dir):
     """
     create_dir_if_not_exists(out_dir)
 
-    data = read_data(input_path)
+    data = read_data(input_path, out_dir)
     data = convert_missing_values(data)
     train_df, test_df = split_data(data)
     train_df = add_price_category(train_df)

diff --git a/src/pynyairbnb/pynyairbnb.py b/src/pynyairbnb/pynyairbnb.py
@@ -4,14 +4,47 @@
 from sklearn.pipeline import make_pipeline
 from sklearn.dummy import DummyClassifier
 from sklearn.metrics import classification_report
+from sklearn.compose import make_column_transformer
 from sklearn.preprocessing import OrdinalEncoder
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.model_selection import RandomizedSearchCV
 from scipy.stats import randint
 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 # from src.function_build_preprocessor import build_preprocessor
 from pynyairbnb.data_preprocessing import create_dir_if_not_exists
 
+def build_preprocessor(numerical_data, text_data, categorical_data):
+    """_summary_
+    Builds a preprocessor for numerical, text, and categorical data with the following transformations: Standard scaler, One hot encoding, and count Vectorizer
+    Args:
+        numerical_data (_type_): numeric data
+        text_data (_type_): text data 
+        categorical_data (_type_): cateogorical data 
+
+    Returns:
+        _type_: _description_
+    """
+    # Numerical Transformer
+    numerical_transformer = StandardScaler()
+
+    # Categorical Transformer
+    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
+
+    # Text Data Transformer
+    text_transformer = CountVectorizer()
+
+    # Making Our Preprocessor
+    preprocessor = make_column_transformer(
+        (numerical_transformer, numerical_data),
+        (categorical_transformer, categorical_data),
+        (text_transformer, text_data),
+        remainder='drop'
+    )
+
+    return preprocessor
+
 def build_clf_model(model, preprocessor, tbl_out_dir, X_train, y_train, X_test, y_test, replacement_dict, clf_report_file_name):
     """_summary_
     Builds a classification model with X_train, y_train, X_test, y_test and saves the classification report to clf_saved_fp 

diff --git a/tests/test_data_preprocessing.py b/tests/test_data_preprocessing.py
@@ -1,6 +1,9 @@
 import os
 import pandas as pd
 import sys
+import pytest
+import warnings
+from click.testing import CliRunner
 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 from src.pynyairbnb.data_preprocessing import create_dir_if_not_exists, read_data, convert_missing_values, split_data, save_dataframes, add_price_category, data_preprocessing
 
@@ -26,6 +29,24 @@ def test_read_data(tmpdir):
     saved_file = os.path.join(out_dir, 'airbnb_data_2023.csv')
     assert os.path.exists(saved_file)  # More checks can be added here
 
+@pytest.fixture
+def mock_data():
+    return pd.DataFrame({
+        'id': range(10),  # 10 samples
+        'host_id': range(10, 20),
+        'price': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
+        'reviews_per_month': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        'price_category': ['0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350+', '350+', '350+']
+    })
+
+# def test_preprocess_data(mock_data, tmpdir):
+#     csv_path = os.path.join(tmpdir, "mock_data.csv")
+#     mock_data.to_csv(csv_path, index=False)
+#     processed_data = data_preprocessing(csv_path, tmpdir)
+#     assert all(processed_data['id'].apply(lambda x: isinstance(x, str)))
+#     assert all(processed_data['host_id'].apply(lambda x: isinstance(x, str)))
+#     assert 'reviews_per_month' in processed_data.columns
+
 def test_convert_missing_values():
     """
     Test that missing values are correctly converted.
@@ -73,6 +94,83 @@ def test_add_price_category():
 
     assert 'price_category' in result_df.columns, "price_category column not added"
     assert all(result_df['price_category'] == expected_categories), "Price categories do not match expected values"
+def test_add_price_category_spanning_all_ranges():
+    data = pd.DataFrame({'price': [-10, 25, 75, 125, 175, 225, 275, 325, 375]})
+    result = add_price_category(data)
+    expected_categories = ['0-50', '0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350+']
+    assert all(result['price_category'] == expected_categories), "Test failed: Prices spanning all ranges are not categorized correctly."
+
+def test_add_price_category_single_category():
+    data = pd.DataFrame({'price': [100, 105, 110]})
+    result = add_price_category(data)
+    # Ensure the expected_series has the same categories and order as the result
+    expected_series = pd.Series(['50-100', '100-150', '100-150'], name='price_category')
+    expected_dtype = pd.CategoricalDtype(categories=['0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350+'], ordered=True)
+    expected_series = expected_series.astype(expected_dtype)
+    pd.testing.assert_series_equal(result['price_category'], expected_series, check_categorical=True)
+
+def test_add_price_category_empty_dataframe():
+    data = pd.DataFrame({'price': []})
+    result = add_price_category(data)
+    assert result.empty, "Test failed: The function should return an empty DataFrame when provided with one."
+
+
+def test_add_price_category_with_negative_prices():
+    data = pd.DataFrame({'price': [-1, -20]})
+    result = add_price_category(data)
+    assert all(result['price_category'] == ['0-50', '0-50']), "Test failed: Negative prices are not categorized correctly."
+
+def test_add_price_category_with_floats():
+    data = pd.DataFrame({'price': [49.99, 100.01]})
+    result = add_price_category(data)
+    assert all(result['price_category'] == ['0-50', '100-150']), "Test failed: Float prices are not categorized correctly."
+
+def test_add_price_category_with_boundary_prices():
+    data = pd.DataFrame({'price': [50, 100, 150, 200, 250, 300, 350]})
+    result = add_price_category(data)
+    expected_categories = ['0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350']
+    assert all(result['price_category'] == expected_categories), "Test failed: Boundary prices are not categorized correctly."
+
+def test_add_price_category_preserves_dtype():
+    data = pd.DataFrame({'price': [25, 75], 'other_column': [1, 2]})
+    original_dtype = data.dtypes
+    result = add_price_category(data)
+    assert data.drop(columns=['price_category']).dtypes.equals(original_dtype), "Test failed: Original data types are altered."
+
+def test_invalid_data_formats(mock_data, tmp_path):
+    with warnings.catch_warnings():
+        # Filter out the specific UserWarning
+        warnings.filterwarnings("ignore", message="Pandas doesn't allow columns to be created via a new attribute name", category=UserWarning)
+
+        mock_data.return_value = pd.DataFrame({
+            'id': ['one', 'two'],
+            'host_id': [3, 4],
+            'price': ['a hundred', 'two hundred'],
+            'reviews_per_month': [1.0, 'two']
+        })
+        runner = CliRunner()
+        with runner.isolated_filesystem():
+            result = runner.invoke(data_preprocessing, ['--input_path', 'dummy.csv', '--out_dir', str(tmp_path)], prog_name="data_preprocessing")
+            assert result.exit_code != 0
+
+
+def test_price_categorization_logic():
+    mock_df = pd.DataFrame({
+        'price': [75, 150, 225, 300]
+    })
+    expected_categories = ['50-100', '100-150', '200-250', '250-300']
+    categorized_df = add_price_category(mock_df)
+    assert all(categorized_df['price_category'] == expected_categories)
+
+
+def test_data_splitting_proportions(mock_data):
+    train_df, test_df = split_data(mock_data)
+    total_len = len(mock_data)
+    train_len = len(train_df)
+    test_len = len(test_df)
+    # Check if the proportions approximately match the expected 80-20 split
+    assert train_len / total_len == pytest.approx(0.8, 0.05)
+    assert test_len / total_len == pytest.approx(0.2, 0.05)
 
 # def test_data_preprocessing(tmpdir):
 #     """

diff --git a/tests/test_plotting.py b/tests/test_plotting.py
@@ -0,0 +1,38 @@
+import pandas as pd
+import matplotlib
+import pytest
+import os
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from src.pynyairbnb.plotting import sns_plotting
+
+# Same Data to be used for all tests
+data = pd.DataFrame({"price": [25, 75, 125, 175, 225, 275, 325, 375],
+                    "number_of_reviews": [2, 0, 1, 15, 22, 7, 5, 3],
+                    "reviews_per_month": [0, 3, 4, 1, 2, 0, 3, 4],
+                    "room_type": ["cat2", "cat1", "cat5", "cat4", "cat3", "cat6", "cat8", "cat7"]})
+
+# Check to see if output type is correct given correct inputs
+def test_sns_plotting_output_type():
+    result = sns_plotting('scatterplot', data, 'number_of_reviews', 'price', 20, 10)
+    assert type(result) == matplotlib.figure.Figure, "Test failed: Output type is incorrect."
+
+# Check to see if exception raised for n/a plot type
+def test_sns_plotting_plottype_error():
+    with pytest.raises(Exception):
+        result = sns_plotting('barplot', data, 'number_of_reviews', 'price', 20, 10)
+
+# Check to see if value error raised for x-variable not in data
+def test_sns_plotting_x_error():
+    with pytest.raises(ValueError):
+        sns_plotting('scatterplot', data, 'random_x', 'price', 20, 10)
+
+# Check to see if value error raised for y-variable not in data
+def test_sns_plotting_y_error():
+    with pytest.raises(ValueError):
+        sns_plotting('scatterplot', data, 'number_of_reviews', 'random_y', 20, 10)
+
+# Check to see the figlength and figheight are both <= 25 to avoid being too large
+def test_sns_plotting_figsize_check():
+    result = sns_plotting('scatterplot', data, 'number_of_reviews', 'price')
+    assert result.get_size_inches()[0] <= 25 and result.get_size_inches()[1] <= 25, "Test failed: Plot size is too large."
diff --git a/tests/test_pynyairbnb.py b/tests/test_pynyairbnb.py
@@ -1 +1,94 @@
-from pynyairbnb import pynyairbnb
+import sys
+import os
+from sklearn.dummy import DummyClassifier
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.neighbors import KNeighborsClassifier
+import pandas as pd
+import numpy as np
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import make_classification
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from src.pynyairbnb.pynyairbnb import build_preprocessor, build_clf_model, knn_param_optimization, nyairbnb_analysis
+
+
+def test_preprocessor_type():
+    """Test that the function returns a ColumnTransformer object."""
+    numerical_data = ['numerical']
+    text_data = 'text'
+    categorical_data = ['category']
+
+    preprocessor = build_preprocessor(numerical_data, text_data, categorical_data)
+    assert isinstance(preprocessor, ColumnTransformer)
+
+def test_transformer_assignment():
+    """Test that the correct transformers are assigned to the specified types of data."""
+    preprocessor = build_preprocessor(['num'], ['text'], ['cat'])
+    transformers = {name: type(trans) for name, trans, cols in preprocessor.transformers}
+
+    assert transformers.get('standardscaler') == StandardScaler
+    assert transformers.get('onehotencoder') == OneHotEncoder
+    assert transformers.get('countvectorizer') == CountVectorizer
+
+def test_preprocessor():
+    # Create an artificial dataset
+    np.random.seed(0)  # For reproducibility
+    data = pd.DataFrame({
+        'numerical': np.random.randn(100),
+        'text': np.random.choice(['First text', 'Text number 2', 'Third sentence of text'], size=100),
+        'category': np.random.choice(['A', 'B', 'C'], size=100),
+        'target': np.array([0]*90 + [1]*10)  # target variable made with 90 zeros and 10 1s, meaning dummy classifier should predict 0 everytime
+    })
+
+    # Split the dataset
+    X_train, X_test, y_train, y_test = train_test_split(
+        data.drop('target', axis=1), data['target'], test_size=0.25, random_state=42)
+
+    # Define columns
+    numerical_data = ['numerical']
+    text_data = 'text'
+    categorical_data = ['category']
+
+    # Build the preprocessor
+    preprocessor = build_preprocessor(numerical_data, text_data, categorical_data)
+
+    # combine preprocessor with a dummymodel with strategy most frequent
+    model = make_pipeline(preprocessor, DummyClassifier(strategy='most_frequent'))
+
+    # Fit the model
+    model.fit(X_train, y_train)
+
+    # Make predictions and evaluate
+    predictions = model.predict(X_test)
+    accuracy = accuracy_score(y_test, predictions)
+
+    print(f"Dummy Classifier Accuracy: {accuracy:.4f}")
+
+    assert accuracy >= 0.5, "Accuracy should be at least 0.5"
+
+X, y = make_classification(n_samples=100, n_features=4, n_informative=2, n_redundant=0, random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+
+def test_build_clf_model():
+    preprocessor = StandardScaler()
+    model = KNeighborsClassifier()
+    tbl_out_dir = "./test_outputs"
+    os.makedirs(tbl_out_dir, exist_ok=True)
+    replacement_dict = {'0': 'Class 0', '1': 'Class 1'}
+    clf_report_file_name = "test_clf_report.csv"
+
+
+    clf_model = build_clf_model(model, preprocessor, tbl_out_dir, X_train, y_train, X_test, y_test, replacement_dict, clf_report_file_name)
+
+
+    assert clf_model is not None
+
+    assert os.path.isfile(os.path.join(tbl_out_dir, clf_report_file_name))
+
+
+    os.remove(os.path.join(tbl_out_dir, clf_report_file_name))
+    os.rmdir(tbl_out_dir)