-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathdata_profiler.py
85 lines (70 loc) · 4.56 KB
/
data_profiler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from IPython.display import display
from IPython.display import JSON
from IPython.core.display import HTML
import pandas as pd
# --- no changes; just run this code block ---
def summary_stats(df):
    """Profile a pandas DataFrame of events and derive a training data schema.

    Builds one row of statistics per column (counts, nulls, cardinality,
    an inferred feature type and a warning), renders the results with
    IPython ``display`` and returns them.

    Args:
        df (DataFrame): event data to profile. Must contain an
            ``EVENT_LABEL`` column. The caller's frame is not modified;
            a copy is profiled.

    Returns:
        tuple: ``(df_stats, trainingDataSchema, event_variables, event_labels)``

        * ``df_stats`` -- DataFrame with columns ``feature_name``, ``dtype``,
          ``count``, ``nunique``, ``null``, ``not_null``, ``null_pct``,
          ``nunique_pct``, ``feature_type`` and ``feature_warning``.
        * ``trainingDataSchema`` -- dict with ``modelVariables`` and a
          ``labelSchema.labelMapper`` that maps the least frequent label to
          ``FRAUD`` and the most frequent label to ``LEGIT``.
        * ``event_variables`` -- every column name except ``EVENT_LABEL``
          and ``EVENT_TIMESTAMP``.
        * ``event_labels`` -- the distinct non-null labels, as strings.

    Raises:
        KeyError: if ``df`` has no ``EVENT_LABEL`` column.
        ValueError: if ``EVENT_LABEL`` contains no non-null values.
    """
    if 'EVENT_LABEL' not in df.columns:
        raise KeyError("summary_stats requires an 'EVENT_LABEL' column")

    df = df.copy()
    rowcnt = len(df)

    # Stringify the labels but keep missing values missing. A plain
    # astype(str) turns NaN/None into the literal "nan"/"None", which would
    # then be counted as a real (and usually rare) label.
    raw_labels = df['EVENT_LABEL']
    df['EVENT_LABEL'] = raw_labels.astype('str', errors='ignore').where(raw_labels.notna())

    # value_counts() ignores missing values, so an empty result means there
    # is nothing to build the label mapper from.
    label_counts = df['EVENT_LABEL'].value_counts()
    if label_counts.empty:
        raise ValueError("EVENT_LABEL has no non-null values; cannot build the label schema")

    # -- per-column counts --
    df_s1 = df.agg(['count', 'nunique']).transpose().reset_index().rename(columns={"index":"feature_name"})
    df_s1["null"] = (rowcnt - df_s1["count"]).astype('int64')
    df_s1["not_null"] = rowcnt - df_s1["null"]
    df_s1["null_pct"] = df_s1["null"] / rowcnt
    df_s1["nunique_pct"] = df_s1['nunique'] / rowcnt

    dt = pd.DataFrame(df.dtypes).reset_index().rename(columns={"index":"feature_name", 0:"dtype"})
    df_stats = pd.merge(dt, df_s1, on='feature_name', how='inner').round(4)
    df_stats['nunique'] = df_stats['nunique'].astype('int64')
    df_stats['count'] = df_stats['count'].astype('int64')

    # -- variable type mapper --
    # Assignments run in order, so a later rule overrides an earlier one.
    # NOTE(review): "UNKOWN" is misspelled but is part of the returned data;
    # left as-is in case callers match on it.
    df_stats['feature_type'] = "UNKOWN"
    df_stats.loc[df_stats["dtype"] == object, 'feature_type'] = "CATEGORY"
    df_stats.loc[(df_stats["dtype"] == "int64") | (df_stats["dtype"] == "float64"), 'feature_type'] = "NUMERIC"
    df_stats.loc[df_stats["feature_name"].str.contains("ipaddress|ip_address|ipaddr"), 'feature_type'] = "IP_ADDRESS"
    df_stats.loc[df_stats["feature_name"].str.contains("email|email_address|emailaddr"), 'feature_type'] = "EMAIL_ADDRESS"
    df_stats.loc[df_stats["feature_name"] == "EVENT_LABEL", 'feature_type'] = "TARGET"
    df_stats.loc[df_stats["feature_name"] == "EVENT_TIMESTAMP", 'feature_type'] = "EVENT_TIMESTAMP"

    # -- variable warnings --
    # As above, the last matching rule wins.
    df_stats['feature_warning'] = "NO WARNING"
    df_stats.loc[(df_stats["nunique"] != 2) & (df_stats["feature_name"] == "EVENT_LABEL"),'feature_warning' ] = "LABEL WARNING, NON-BINARY EVENT LABEL"
    df_stats.loc[(df_stats["nunique_pct"] > 0.9) & (df_stats['feature_type'] == "CATEGORY") ,'feature_warning' ] = "EXCLUDE, GT 90% UNIQUE"
    df_stats.loc[(df_stats["null_pct"] > 0.2) & (df_stats["null_pct"] <= 0.5), 'feature_warning' ] = "NULL WARNING, GT 20% MISSING"
    df_stats.loc[df_stats["null_pct"] > 0.5,'feature_warning' ] = "EXCLUDE, GT 50% MISSING"
    # NOTE(review): 'nunique' is an integer count, so `< 0.2` only matches
    # columns with zero distinct values. The threshold looks like it was
    # meant for a ratio or a larger count -- confirm the intended rule before
    # changing it; behaviour is kept as-is here.
    df_stats.loc[((df_stats['dtype'] == "int64" ) | (df_stats['dtype'] == "float64" ) ) & (df_stats['nunique'] < 0.2), 'feature_warning' ] = "LIKELY CATEGORICAL, NUMERIC w. LOW CARDINALITY"

    # -- event variables and labels --
    event_variables = df_stats.loc[(~df_stats['feature_name'].isin(['EVENT_LABEL', 'EVENT_TIMESTAMP']))]['feature_name'].to_list()
    event_labels = df["EVENT_LABEL"].dropna().unique().tolist()

    # -- training data schema --
    trainingDataSchema = {
        'modelVariables' : df_stats.loc[(df_stats['feature_type'].isin(['IP_ADDRESS', 'EMAIL_ADDRESS', 'CATEGORY', 'NUMERIC' ]))]['feature_name'].to_list(),
        'labelSchema' : {
            'labelMapper' : {
                # The rarest label is treated as fraud, the most common as legit.
                'FRAUD' : [label_counts.idxmin()],
                'LEGIT' : [label_counts.idxmax()]
            }
        }
    }

    # -- notebook output --
    display(HTML("<h4>Summary Stats </h4>"))
    display(df_stats)
    display(HTML("<h4>Event Variables </h4>"))
    display(HTML("<p>These are the available features in the data set for the AFD model training</p>"))
    display(JSON(event_variables))
    display(HTML("<h4>Event Labels </h4>"))
    display(HTML("<p>We have two types of events - Fraud events and legitimate events </p>"))
    display(JSON(event_labels))
    display(HTML("<h4>Training Data Schema </h4>"))
    display(HTML("<p>Training data schema is required for creating and training the model. Refer to <a href='https://docs.aws.amazon.com/frauddetector/latest/api/API_CreateModelVersion.html#FraudDetector-CreateModelVersion-request-trainingDataSchema'>documentation</a> </p>"))
    display(JSON(trainingDataSchema))
    return df_stats, trainingDataSchema, event_variables, event_labels