-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpredict.py
164 lines (127 loc) · 5.86 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import argparse
import os
from typing import Tuple
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from utils import load_model_and_encoders, load_test_data, CONFIG
# Instantiate the argument parser. The individual arguments are registered in
# the __main__ block below; validate_args() also reports errors through it.
# BUG FIX: the description read "used for make bulk predictions ... in
# inference time" — corrected grammar in the user-facing help text.
parser = argparse.ArgumentParser(
    description='This script is used to make bulk predictions for fake-user detection at inference time.',
)
def label_encode_features(
    df: pd.DataFrame, m_enc: dict
) -> pd.DataFrame:
    """
    Label-encode the categorical 'Event' and 'Category' columns of a dataframe.

    The dataframe is modified in place and also returned for convenience.

    :param df: Test dataframe
    :param m_enc: Dictionary of label encoders; must contain 'event_le' and 'cat_le'
    :return: The same dataframe with Event and Category replaced by encoded values
    """
    columns = CONFIG["COLUMNS"]
    # Pair each configured column with the encoder that was fit for it.
    for column_key, encoder_key in (("Event", "event_le"), ("Category", "cat_le")):
        column = columns[column_key]
        df[column] = m_enc[encoder_key].transform(df[column])
    return df
def predict_by_user(
    test_df: pd.DataFrame, model, ohe: OneHotEncoder
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compute fake-user probabilities for every distinct user in a test set.

    A user's score is the mean of the fake-class probabilities over all of
    that user's transactions.

    :param test_df: Test dataframe; gains a per-transaction prediction column in place
    :param model: The classifier object (must support predict_proba)
    :param ohe: One Hot Encoder, previously fit in the training phase
    :return: Tuple of (per-user dataframe with mean fake probability,
             the input dataframe with per-transaction fake probabilities)
    """
    user_col = CONFIG["COLUMNS"]["UserId"]
    pred_col = CONFIG["COLUMNS"]["Pred"]
    feature_cols = [CONFIG["COLUMNS"]["Event"], CONFIG["COLUMNS"]["Category"]]

    # One representative row per user; its prediction cell is filled below.
    per_user_pred = test_df.groupby(user_col).first()
    per_user_pred[pred_col] = None
    test_df[pred_col] = None

    # BUG FIX: the original grouped on the hard-coded literal "UserId" here,
    # bypassing the configured column name used everywhere else in the file.
    for user, group in test_df.groupby(user_col):
        preds = model.predict_proba(ohe.transform(group[feature_cols]))
        # Column 1 holds the positive ("fake") class probability.
        fake_preds = preds[:, 1]
        per_user_pred.loc[user, pred_col] = sum(fake_preds) / preds.shape[0]
        # Vectorized equivalent of the original per-row zip loop: group.index
        # are the row labels of this user's transactions in test_df.
        test_df.loc[group.index, pred_col] = fake_preds
    # NOTE: the original chained .rename(columns={"index": "UserId"}) after
    # reset_index(); that was a no-op because reset_index names the new column
    # after the group key, never "index".
    return per_user_pred.reset_index(), test_df
def extract_and_save_results(
    pred_df: pd.DataFrame, m_enc: dict, result_folder: str, by_user: bool = False, threshold: float = 0.5
) -> Tuple[str, str]:
    """
    Calculates the classification report, saves it and returns paths of the saved files.

    :param pred_df: Dataframe of the transactions (or per-user rows) and their predictions
    :param m_enc: Dictionary that holds the model directory and the label encoders
    :param result_folder: Folder (under the model directory) the results are saved to
    :param by_user: Flag, to choose between the per-user and per-transaction output formats
    :param threshold: Classification threshold applied to the fake probability
    :return: Paths of the saved output and the classification report files
    """
    clf_report = classification_report(
        pred_df[CONFIG["COLUMNS"]["Fake"]],
        pred_df[CONFIG["COLUMNS"]["Pred"]].apply(lambda x: 1 if x > threshold else 0),
        output_dict=True,
        digits=4,
    )
    base_path = os.path.join(m_enc["model_base_dir"], result_folder)
    # BUG FIX: was `os.mkdir(base_path) if not os.path.isdir(base_path) else
    # None` — a conditional expression abused as a statement, racy between the
    # check and the create, and unable to create intermediate directories.
    os.makedirs(base_path, exist_ok=True)
    prefix = "per_user_" if by_user else ""
    clf_report_path = os.path.join(base_path, f"{prefix}clf_report.csv")
    output_path = os.path.join(base_path, f"{prefix}output.csv")
    pd.DataFrame(clf_report).transpose().to_csv(clf_report_path, index=True)
    if by_user:
        # Only the desired columns for the per-user output. .copy() avoids
        # pandas SettingWithCopy warnings downstream on the slice.
        output_df = pred_df[
            [CONFIG["COLUMNS"]["UserId"], CONFIG["COLUMNS"]["Fake"], CONFIG["COLUMNS"]["Pred"]]
        ].copy()
    else:
        output_df = pred_df
        # BUG FIX: decoding Event/Category must only happen for the
        # per-transaction output. The original ran inverse_transform
        # unconditionally, which raised a KeyError when by_user=True because
        # those columns were dropped by the slice above.
        output_df[CONFIG["COLUMNS"]["Event"]] = m_enc["event_le"].inverse_transform(
            output_df[CONFIG["COLUMNS"]["Event"]]
        )
        output_df[CONFIG["COLUMNS"]["Category"]] = m_enc["cat_le"].inverse_transform(
            output_df[CONFIG["COLUMNS"]["Category"]]
        )
    output_df.to_csv(output_path, index=False, float_format='%.4f')
    return output_path, clf_report_path
def start_prediction(params) -> None:
    """
    Run the full prediction procedure: load artifacts, encode features,
    predict per user, and persist both output formats.

    :param params: Parsed command-line arguments of the script
    """
    m_enc = load_model_and_encoders(params.model_path)
    test_df = label_encode_features(load_test_data(params.test_file), m_enc)
    per_user_pred, test_df = predict_by_user(test_df, m_enc["model"], m_enc["ohe"])
    # Save the per-transaction results first, then the aggregated per-user ones.
    extract_and_save_results(test_df, m_enc, params.result_folder)
    output_path, _ = extract_and_save_results(
        per_user_pred, m_enc, params.result_folder, by_user=True
    )
    print(f"Final per-user output path: {os.path.abspath(output_path)}")
def validate_args(input_args) -> object:
    """
    Validate the arguments passed to the script.

    Missing required values abort the script through parser.error().

    :param input_args: Parsed argparse namespace
    :return: The same namespace, once validated
    """
    required = (
        ("test_file", "test_file must be provided to make predictions."),
        ("result_folder", "result_folder must be provided to output classification report."),
    )
    for attr, message in required:
        if not getattr(input_args, attr):
            parser.error(message)
    return input_args
if __name__ == '__main__':
    parser.add_argument(
        '--test_file', type=str,
        # BUG FIX: the implicitly concatenated help string was missing a
        # space between the fragments and rendered as "test datawith ...".
        help='Required argument, should be a path to a CSV file that contains test data '
             'with the following columns: [UserId, Event, Category]',
    )
    parser.add_argument(
        '--result_folder', type=str, default=CONFIG["TEST_RESULTS_PATH"],
        help='The folder path under which the script will output classification reports.',
    )
    parser.add_argument(
        '--model_path', type=str, default=None,
        help='The path of the model to be used in the prediction.',
    )
    args = validate_args(parser.parse_args())
    start_prediction(args)