Skip to content

Commit

Permalink
added tests and fixed them such that all run
Browse files Browse the repository at this point in the history
  • Loading branch information
rashiselarka committed Apr 11, 2024
1 parent a52279c commit 42f32c0
Show file tree
Hide file tree
Showing 7 changed files with 273 additions and 7 deletions.
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ matplotlib = ">=3.8.3"
seaborn = ">=0.13.2"
numpy = ">=1.26.4"
scipy = ">=1.12.0"
click = "^8.1.7"

[tool.poetry.dev-dependencies]

Expand Down
13 changes: 8 additions & 5 deletions src/pynyairbnb/data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,13 @@ def convert_missing_values(data):
1 2 2 0.0
2 NaN 3 2.0
"""

data['id'] = data['id'].astype(str)
data['host_id'] = data['host_id'].astype(str)
data['reviews_per_month'] = data['reviews_per_month'].fillna(0)

if data is None or data.empty:
pass
else:
data['id'] = data['id'].astype(str)
data['host_id'] = data['host_id'].astype(str)
data['reviews_per_month'] = data['reviews_per_month'].fillna(0)
return data

def split_data(data):
Expand Down Expand Up @@ -221,7 +224,7 @@ def data_preprocessing(input_path, out_dir):
"""
create_dir_if_not_exists(out_dir)

data = read_data(input_path)
data = read_data(input_path, out_dir)
data = convert_missing_values(data)
train_df, test_df = split_data(data)
train_df = add_price_category(train_df)
Expand Down
33 changes: 33 additions & 0 deletions src/pynyairbnb/pynyairbnb.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,47 @@
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
# from src.function_build_preprocessor import build_preprocessor
from pynyairbnb.data_preprocessing import create_dir_if_not_exists

def build_preprocessor(numerical_data, text_data, categorical_data):
    """Assemble a column transformer for numeric, categorical and text features.

    Each group of columns is routed to its own scikit-learn transformer:
    ``StandardScaler`` for the numeric columns, ``OneHotEncoder`` (created with
    ``handle_unknown='ignore'``) for the categorical columns, and
    ``CountVectorizer`` for the text column. Columns that belong to none of
    the three groups are dropped.

    Args:
        numerical_data: Column selector for the features to standardise.
        text_data: Column selector for the text feature to vectorise.
        categorical_data: Column selector for the features to one-hot encode.

    Returns:
        The unfitted transformer produced by ``make_column_transformer``.
    """
    # (transformer, columns) pairs, in the order they appear in the output.
    column_steps = [
        (StandardScaler(), numerical_data),
        (OneHotEncoder(handle_unknown='ignore'), categorical_data),
        (CountVectorizer(), text_data),
    ]
    return make_column_transformer(*column_steps, remainder='drop')

def build_clf_model(model, preprocessor, tbl_out_dir, X_train, y_train, X_test, y_test, replacement_dict, clf_report_file_name):
"""_summary_
Builds a classification model with X_train, y_train, X_test, y_test and saves the classification report to clf_saved_fp
Expand Down
98 changes: 98 additions & 0 deletions tests/test_data_preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os
import pandas as pd
import sys
import pytest
import warnings
from click.testing import CliRunner
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.pynyairbnb.data_preprocessing import create_dir_if_not_exists, read_data, convert_missing_values, split_data, save_dataframes, add_price_category, data_preprocessing

Expand All @@ -26,6 +29,24 @@ def test_read_data(tmpdir):
saved_file = os.path.join(out_dir, 'airbnb_data_2023.csv')
assert os.path.exists(saved_file) # More checks can be added here

@pytest.fixture
def mock_data():
    """Provide a small ten-row listings frame for the preprocessing tests."""
    n_rows = 10
    columns = {
        'id': range(n_rows),
        'host_id': range(10, 20),
        # 100, 200, ..., 1000
        'price': list(range(100, 1001, 100)),
        # 1.0, 2.0, ..., 10.0
        'reviews_per_month': [float(k) for k in range(1, n_rows + 1)],
        'price_category': [
            '0-50', '50-100', '100-150', '150-200', '200-250',
            '250-300', '300-350', '350+', '350+', '350+',
        ],
    }
    return pd.DataFrame(columns)

# def test_preprocess_data(mock_data, tmpdir):
# csv_path = os.path.join(tmpdir, "mock_data.csv")
# mock_data.to_csv(csv_path, index=False)
# processed_data = data_preprocessing(csv_path, tmpdir)
# assert all(processed_data['id'].apply(lambda x: isinstance(x, str)))
# assert all(processed_data['host_id'].apply(lambda x: isinstance(x, str)))
# assert 'reviews_per_month' in processed_data.columns

def test_convert_missing_values():
"""
Test that missing values are correctly converted.
Expand Down Expand Up @@ -73,6 +94,83 @@ def test_add_price_category():

assert 'price_category' in result_df.columns, "price_category column not added"
assert all(result_df['price_category'] == expected_categories), "Price categories do not match expected values"
def test_add_price_category_spanning_all_ranges():
    """Prices spread over every bin (plus one negative) get the expected labels."""
    prices = [-10, 25, 75, 125, 175, 225, 275, 325, 375]
    expected_categories = ['0-50', '0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350+']

    labelled = add_price_category(pd.DataFrame({'price': prices}))

    matches = labelled['price_category'] == expected_categories
    assert matches.all(), "Test failed: Prices spanning all ranges are not categorized correctly."

def test_add_price_category_single_category():
    """Prices at and just above 100 yield an ordered categorical column.

    The expected series is cast to the full ordered categorical dtype so the
    comparison also covers the category set and its ordering.
    """
    all_labels = ['0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350+']
    expected_dtype = pd.CategoricalDtype(categories=all_labels, ordered=True)
    expected_series = pd.Series(
        ['50-100', '100-150', '100-150'], name='price_category'
    ).astype(expected_dtype)

    result = add_price_category(pd.DataFrame({'price': [100, 105, 110]}))

    pd.testing.assert_series_equal(result['price_category'], expected_series, check_categorical=True)

def test_add_price_category_empty_dataframe():
    """An empty ``price`` column produces an empty result frame."""
    empty_prices = pd.DataFrame({'price': []})
    categorised = add_price_category(empty_prices)
    assert categorised.empty, "Test failed: The function should return an empty DataFrame when provided with one."


def test_add_price_category_with_negative_prices():
    """Negative prices are expected to land in the lowest ('0-50') bin."""
    negative_prices = pd.DataFrame({'price': [-1, -20]})
    labelled = add_price_category(negative_prices)
    matches = labelled['price_category'] == ['0-50', '0-50']
    assert matches.all(), "Test failed: Negative prices are not categorized correctly."

def test_add_price_category_with_floats():
    """Float prices just either side of a bin edge fall into the expected bins."""
    float_prices = pd.DataFrame({'price': [49.99, 100.01]})
    labelled = add_price_category(float_prices)
    matches = labelled['price_category'] == ['0-50', '100-150']
    assert matches.all(), "Test failed: Float prices are not categorized correctly."

def test_add_price_category_with_boundary_prices():
    """Prices sitting exactly on a bin edge map to the labels listed below."""
    boundary_prices = [50, 100, 150, 200, 250, 300, 350]
    expected_categories = ['0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350']

    labelled = add_price_category(pd.DataFrame({'price': boundary_prices}))

    matches = labelled['price_category'] == expected_categories
    assert matches.all(), "Test failed: Boundary prices are not categorized correctly."

def test_add_price_category_preserves_dtype():
    """Adding ``price_category`` must not change the dtypes of existing columns.

    The check is made on the frame returned by ``add_price_category`` so the
    test holds whether the function mutates its argument in place or returns
    a new frame.
    """
    data = pd.DataFrame({'price': [25, 75], 'other_column': [1, 2]})
    original_dtype = data.dtypes
    result = add_price_category(data)
    remaining_dtypes = result.drop(columns=['price_category']).dtypes
    assert remaining_dtypes.equals(original_dtype), "Test failed: Original data types are altered."

def test_invalid_data_formats(mock_data, tmp_path):
    """Expect a non-zero exit code when preprocessing is invoked on bad input.

    NOTE(review): as written this test may pass for reasons unrelated to the
    malformed data -- see the inline notes below.
    """
    with warnings.catch_warnings():
        # Silence the UserWarning pandas emits when an unknown attribute is
        # assigned on a DataFrame (triggered by the ``return_value`` line).
        warnings.filterwarnings("ignore", message="Pandas doesn't allow columns to be created via a new attribute name", category=UserWarning)

        # NOTE(review): ``mock_data`` is the DataFrame returned by the fixture,
        # not a mock object, so this only attaches an attribute to it; the
        # malformed frame is never written to disk or passed to the pipeline.
        mock_data.return_value = pd.DataFrame({
            'id': ['one', 'two'],
            'host_id': [3, 4],
            'price': ['a hundred', 'two hundred'],
            'reviews_per_month': [1.0, 'two']
        })
        runner = CliRunner()
        with runner.isolated_filesystem():
            # NOTE(review): nothing in this test creates 'dummy.csv', and
            # ``data_preprocessing`` appears to be a plain function rather
            # than a click command -- TODO confirm what actually produces the
            # non-zero exit code.
            result = runner.invoke(data_preprocessing, ['--input_path', 'dummy.csv', '--out_dir', str(tmp_path)], prog_name="data_preprocessing")
            assert result.exit_code != 0


def test_price_categorization_logic():
    """Four representative prices map to their expected category labels."""
    expected_categories = ['50-100', '100-150', '200-250', '250-300']
    mock_df = pd.DataFrame({'price': [75, 150, 225, 300]})

    labels = add_price_category(mock_df)['price_category']

    assert (labels == expected_categories).all()


def test_data_splitting_proportions(mock_data):
    """``split_data`` returns train/test parts close to an 80/20 split."""
    train_df, test_df = split_data(mock_data)

    n_total = len(mock_data)
    train_share = len(train_df) / n_total
    test_share = len(test_df) / n_total

    # Shares only need to be within 5% (relative) of the nominal split.
    assert train_share == pytest.approx(0.8, rel=0.05)
    assert test_share == pytest.approx(0.2, rel=0.05)

# def test_data_preprocessing(tmpdir):
# """
Expand Down
38 changes: 38 additions & 0 deletions tests/test_plotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pandas as pd
import matplotlib
import pytest
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.pynyairbnb.plotting import sns_plotting

# Shared eight-row frame used as input by every plotting test in this module.
data = pd.DataFrame(
    {
        "price": [25, 75, 125, 175, 225, 275, 325, 375],
        "number_of_reviews": [2, 0, 1, 15, 22, 7, 5, 3],
        "reviews_per_month": [0, 3, 4, 1, 2, 0, 3, 4],
        "room_type": ["cat2", "cat1", "cat5", "cat4", "cat3", "cat6", "cat8", "cat7"],
    }
)

def test_sns_plotting_output_type():
    """A valid scatterplot request returns a matplotlib ``Figure``."""
    # Imported explicitly so the test does not rely on some other module
    # having loaded the ``matplotlib.figure`` submodule as a side effect.
    from matplotlib.figure import Figure

    result = sns_plotting('scatterplot', data, 'number_of_reviews', 'price', 20, 10)
    assert isinstance(result, Figure), "Test failed: Output type is incorrect."

def test_sns_plotting_plottype_error():
    """An unsupported plot type ('barplot') makes ``sns_plotting`` raise."""
    # NOTE(review): ``Exception`` matches any error, including unrelated ones;
    # narrow this once the exception type raised by sns_plotting is confirmed.
    with pytest.raises(Exception):
        sns_plotting('barplot', data, 'number_of_reviews', 'price', 20, 10)

def test_sns_plotting_x_error():
    """An x-variable that is not a column of the data raises ``ValueError``."""
    missing_x = 'random_x'
    with pytest.raises(ValueError):
        sns_plotting('scatterplot', data, missing_x, 'price', 20, 10)

def test_sns_plotting_y_error():
    """A y-variable that is not a column of the data raises ``ValueError``."""
    missing_y = 'random_y'
    with pytest.raises(ValueError):
        sns_plotting('scatterplot', data, 'number_of_reviews', missing_y, 20, 10)

def test_sns_plotting_figsize_check():
    """With default sizing, neither figure dimension exceeds 25 inches."""
    figure = sns_plotting('scatterplot', data, 'number_of_reviews', 'price')
    size_inches = figure.get_size_inches()
    width, height = size_inches[0], size_inches[1]
    assert width <= 25 and height <= 25, "Test failed: Plot size is too large."
95 changes: 94 additions & 1 deletion tests/test_pynyairbnb.py
Original file line number Diff line number Diff line change
@@ -1 +1,94 @@
from pynyairbnb import pynyairbnb
import sys
import os
from sklearn.dummy import DummyClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.pynyairbnb.pynyairbnb import build_preprocessor, build_clf_model, knn_param_optimization, nyairbnb_analysis


def test_preprocessor_type():
    """``build_preprocessor`` returns a ``ColumnTransformer`` instance."""
    # Arguments are (numerical_data, text_data, categorical_data).
    built = build_preprocessor(['numerical'], 'text', ['category'])
    assert isinstance(built, ColumnTransformer)

def test_transformer_assignment():
    """Each data group is wired to the intended transformer class.

    The text column is given as a plain string, matching the other tests in
    this module: ``CountVectorizer`` consumes a single column of documents,
    so a list selector would not be usable once the preprocessor is fitted.
    """
    preprocessor = build_preprocessor(['num'], 'text', ['cat'])
    # ``transformers`` holds (name, transformer, columns) triples; the names
    # are the ones the test looks up below.
    transformers = {name: type(trans) for name, trans, _ in preprocessor.transformers}

    assert transformers.get('standardscaler') is StandardScaler
    assert transformers.get('onehotencoder') is OneHotEncoder
    assert transformers.get('countvectorizer') is CountVectorizer

def test_preprocessor():
    """Fit the preprocessor in a pipeline and sanity-check the predictions.

    A ``DummyClassifier`` using the most-frequent strategy is trained on an
    artificial, imbalanced dataset; the test asserts its accuracy on the
    held-out rows is at least 0.5.
    """
    np.random.seed(0)  # For reproducibility

    # Draw the random columns in a fixed order so the seeded data is stable.
    numeric_values = np.random.randn(100)
    text_values = np.random.choice(['First text', 'Text number 2', 'Third sentence of text'], size=100)
    category_values = np.random.choice(['A', 'B', 'C'], size=100)
    # 90 zeros followed by 10 ones: the majority class is 0.
    target_values = np.array([0]*90 + [1]*10)

    data = pd.DataFrame({
        'numerical': numeric_values,
        'text': text_values,
        'category': category_values,
        'target': target_values,
    })

    features = data.drop(columns='target')
    labels = data['target']
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.25, random_state=42)

    # Arguments are (numerical_data, text_data, categorical_data).
    preprocessor = build_preprocessor(['numerical'], 'text', ['category'])

    model = make_pipeline(preprocessor, DummyClassifier(strategy='most_frequent'))
    model.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, model.predict(X_test))

    print(f"Dummy Classifier Accuracy: {accuracy:.4f}")

    assert accuracy >= 0.5, "Accuracy should be at least 0.5"

# Module-level synthetic classification data shared with test_build_clf_model.
X, y = make_classification(
    n_samples=100,
    n_features=4,
    n_informative=2,
    n_redundant=0,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def test_build_clf_model():
    """``build_clf_model`` returns a model and writes the report file.

    Uses the module-level ``X_train``/``X_test``/``y_train``/``y_test`` split.
    Cleanup runs in a ``finally`` block so a failing assertion does not leave
    the output directory behind for later test runs.
    """
    preprocessor = StandardScaler()
    model = KNeighborsClassifier()
    tbl_out_dir = "./test_outputs"
    clf_report_file_name = "test_clf_report.csv"
    report_path = os.path.join(tbl_out_dir, clf_report_file_name)
    replacement_dict = {'0': 'Class 0', '1': 'Class 1'}

    os.makedirs(tbl_out_dir, exist_ok=True)
    try:
        clf_model = build_clf_model(model, preprocessor, tbl_out_dir, X_train, y_train, X_test, y_test, replacement_dict, clf_report_file_name)

        assert clf_model is not None
        assert os.path.isfile(report_path)
    finally:
        # Only remove what this test is known to have created; guard each
        # step so cleanup never masks the original assertion failure.
        if os.path.isfile(report_path):
            os.remove(report_path)
        if os.path.isdir(tbl_out_dir) and not os.listdir(tbl_out_dir):
            os.rmdir(tbl_out_dir)

0 comments on commit 42f32c0

Please sign in to comment.