-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBaselineFeatureTransformation.py
139 lines (91 loc) · 5.37 KB
/
BaselineFeatureTransformation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from SharedFunctions import *
import autopep8
# Input location and the date range handed to read_from_files below.
DIR_INPUT = './data/'
BEGIN_DATE = "2018-04-01"
END_DATE = "2018-08-31"

print("Load files")
# read_from_files is not defined in this file; presumably it comes from the
# SharedFunctions star import -- confirm.
transactions_df = read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE)
print(
    "{0} transactions loaded, containing {1} fraudulent transactions".format(
        len(transactions_df),
        transactions_df["TX_FRAUD"].sum()
    )
)

# Widen pandas' console output so wide frames stay readable when printed.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 25)
# print(transactions_df.head())

# NOTE(review): the return value of this call is discarded and the path is
# relative to the current working directory; this looks like a leftover
# formatting aid -- confirm whether it is still needed.
autopep8.fix_file("BaselineFeatureTransformation.py")

# Optional data-inspection snippets (uncomment when exploring the data):
# print(transactions_df.shape)
# print(transactions_df.dtypes)
# print(transactions_df.isna().sum())
# print(transactions_df.duplicated().sum())
# print(transactions_df.info())
def is_weekend(tx_datetime):
    """Return 1 when ``tx_datetime`` falls on a Saturday or Sunday, else 0.

    ``tx_datetime`` must provide a ``weekday()`` method that numbers the days
    from 0 (Monday) to 6 (Sunday).
    """
    # Saturday is 5 and Sunday is 6, so anything from 5 upwards is a weekend day.
    return 1 if tx_datetime.weekday() >= 5 else 0
# Store the 0/1 result of is_weekend for every transaction timestamp.
transactions_df['TX_DURING_WEEKEND'] = transactions_df['TX_DATETIME'].apply(is_weekend)
def is_night(tx_datetime):
    """Return 1 when the transaction hour is 6 or earlier, else 0.

    The bound is inclusive: every time from 00:00 up to and including 06:59
    is flagged as night. ``tx_datetime`` must expose an ``hour`` attribute.
    """
    return 1 if tx_datetime.hour <= 6 else 0
# Store the 0/1 result of is_night for every transaction timestamp.
transactions_df['TX_DURING_NIGHT'] = transactions_df['TX_DATETIME'].apply(is_night)
# print(transactions_df[transactions_df.TX_TIME_DAYS >= 30])
def get_customer_spending_behaviour_features(customer_transactions, windows_size_in_days=(1, 7, 30)):
    """Add rolling spending features to one customer's transactions.

    For every window size ``w`` (in days) two columns are added:

    * ``CUSTOMER_ID_NB_TX_<w>DAY_WINDOW`` -- number of transactions in the
      ``w``-day time window ending at the current transaction.
    * ``CUSTOMER_ID_AVG_AMOUNT_<w>DAY_WINDOW`` -- sum of ``TX_AMOUNT`` over
      that window divided by the transaction count.

    Args:
        customer_transactions: DataFrame holding the columns ``TX_DATETIME``,
            ``TX_AMOUNT`` and ``TRANSACTION_ID``. It is sorted into a new
            frame, so the caller's object is not modified.
        windows_size_in_days: iterable of window lengths, in days.

    Returns:
        The transactions sorted by ``TX_DATETIME``, indexed by
        ``TRANSACTION_ID``, with the feature columns added.
    """
    # Order transactions chronologically; time-based rolling needs a sorted
    # datetime index.
    customer_transactions = customer_transactions.sort_values('TX_DATETIME')
    customer_transactions.index = customer_transactions.TX_DATETIME

    for window_size in windows_size_in_days:
        # Build the rolling window once and reuse it for both aggregates.
        rolling_amounts = customer_transactions['TX_AMOUNT'].rolling(str(window_size) + 'd')
        nb_tx_window = rolling_amounts.count()
        # The count is > 0 because the current transaction is part of its own window.
        avg_amount_tx_window = rolling_amounts.sum() / nb_tx_window

        # Assign raw values (not Series) so no index alignment is attempted on
        # a datetime index that may contain duplicate timestamps.
        customer_transactions['CUSTOMER_ID_NB_TX_' + str(window_size) + 'DAY_WINDOW'] = nb_tx_window.values
        customer_transactions['CUSTOMER_ID_AVG_AMOUNT_' + str(window_size) + 'DAY_WINDOW'] = avg_amount_tx_window.values

    # Reindex according to transaction IDs before handing the frame back.
    customer_transactions.index = customer_transactions.TRANSACTION_ID
    return customer_transactions
# Compute the spending features for customer 0 only, as a spot check.
spending_behaviour_customer_0 = get_customer_spending_behaviour_features(
    transactions_df[transactions_df['CUSTOMER_ID'] == 0])
# Bare expression: has no effect when run as a script (presumably left over
# from an interactive session).
spending_behaviour_customer_0
# Compute the features customer by customer, then flatten the result back into
# a single frame ordered by transaction time with a fresh positional index.
transactions_df = (
    transactions_df
    .groupby('CUSTOMER_ID')
    .apply(get_customer_spending_behaviour_features, windows_size_in_days=[1, 7, 30])
    .sort_values('TX_DATETIME')
    .reset_index(drop=True)
)
# print(transactions_df)
def get_count_risk_rolling_window(terminal_transactions, delay_period=7, windows_size_in_days=(1, 7, 30),
                                  feature="TERMINAL_ID"):
    """Add delayed rolling transaction-count and fraud-risk features.

    For every window size ``w`` (in days) the counts over the most recent
    ``delay_period`` days are subtracted from the counts over the most recent
    ``delay_period + w`` days, which leaves the ``w`` days that precede the
    delay period. Two columns are added per window:

    * ``<feature>_NB_TX_<w>DAY_WINDOW`` -- number of transactions in that span.
    * ``<feature>_RISK_<w>DAY_WINDOW`` -- sum of ``TX_FRAUD`` in that span
      divided by the number of transactions; 0 when the span is empty.

    Args:
        terminal_transactions: DataFrame holding the columns ``TX_DATETIME``,
            ``TX_FRAUD`` and ``TRANSACTION_ID``. It is sorted into a new
            frame, so the caller's object is not modified.
        delay_period: number of most recent days excluded from each window.
        windows_size_in_days: iterable of window lengths, in days.
        feature: prefix used for the generated column names.

    Returns:
        The transactions sorted by ``TX_DATETIME``, indexed by
        ``TRANSACTION_ID``, with the feature columns added.
    """
    # Time-based rolling needs a sorted datetime index.
    terminal_transactions = terminal_transactions.sort_values('TX_DATETIME')
    terminal_transactions.index = terminal_transactions.TX_DATETIME

    fraud_flags = terminal_transactions['TX_FRAUD']

    # Totals over the delay period alone; subtracted from each longer window.
    delay_rolling = fraud_flags.rolling(str(delay_period) + 'd')
    nb_fraud_delay = delay_rolling.sum()
    nb_tx_delay = delay_rolling.count()

    for window_size in windows_size_in_days:
        delay_window_rolling = fraud_flags.rolling(str(delay_period + window_size) + 'd')
        nb_fraud_window = delay_window_rolling.sum() - nb_fraud_delay
        nb_tx_window = delay_window_rolling.count() - nb_tx_delay
        # 0 / 0 gives NaN when no transaction falls in the window.
        risk_window = nb_fraud_window / nb_tx_window

        # Replace NA with 0 only in the columns created here, so missing
        # values in unrelated columns of the frame are left untouched.
        terminal_transactions[feature + '_NB_TX_' + str(window_size) + 'DAY_WINDOW'] = nb_tx_window.fillna(0).values
        terminal_transactions[feature + '_RISK_' + str(window_size) + 'DAY_WINDOW'] = risk_window.fillna(0).values

    # Reindex according to transaction IDs before handing the frame back.
    terminal_transactions.index = terminal_transactions.TRANSACTION_ID
    return terminal_transactions
# print(transactions_df[transactions_df.TX_FRAUD == 1].head())
# print(transactions_df[transactions_df.TX_FRAUD == 0].TERMINAL_ID[0])
# Compute the delayed count/risk features terminal by terminal, then flatten
# the result back into a single frame ordered by transaction time.
# One pass is enough: get_count_risk_rolling_window derives its features only
# from TX_DATETIME and TX_FRAUD, so running it again recomputes the same values.
transactions_df = transactions_df.groupby('TERMINAL_ID').apply(
    lambda x: get_count_risk_rolling_window(x, delay_period=7, windows_size_in_days=[1, 7, 30], feature="TERMINAL_ID"))
transactions_df = transactions_df.sort_values('TX_DATETIME').reset_index(drop=True)
# print(transactions_df.head(10))