-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
63 lines (56 loc) · 2.35 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import glob
import os
import numpy as np
import argparse
def process(df_dataset, filename):
columns_to_drop = [
'Dst Port',
'Timestamp',
'Fwd PSH Flags',
'Bwd PSH Flags',
'Fwd URG Flags',
'Bwd URG Flags',
'Flow Byts/s', # some np.inf values
'Flow Pkts/s', # some np.inf values
'Dst IP', #These fields are in some days but not others
'Flow ID',
'Src IP',
'Src Port'
]
# Step 2-3
print(f'Labels: {df_dataset.Label.unique()}')
df_dataset.drop(df_dataset.loc[df_dataset["Label"] == "Label"].index, inplace=True)
mal_labels = set(df_dataset['Label'].unique())
mal_labels.remove('Benign')
df_dataset.replace({'Label': mal_labels}, 'Malicious', inplace=True)
print(f'two labels: {df_dataset.Label.unique()}')
#Step 4
print(f'protocols: {df_dataset.Protocol.unique()}')
df_dataset = df_dataset.astype({"Protocol": str})
df_dataset = pd.get_dummies(df_dataset, columns=['Protocol'], drop_first=True)
# making Label column the last column again
df_dataset.insert(len(df_dataset.columns)-1, 'Label', df_dataset.pop('Label'))
# Step 5-6
# df_dataset.drop(columns=columns_to_drop, inplace=True)
df_dataset.drop(df_dataset.filter(columns_to_drop), axis=1, inplace=True) # axis=1 for columns
df_dataset.dropna(inplace=True)
df_dataset.drop_duplicates(inplace=True)
bsum = (df_dataset["Label"].value_counts()[['Benign']].sum())
msum = (df_dataset["Label"].value_counts()[['Malicious']].sum())
print(f'Benign percentage: {bsum / (bsum + msum)}')
df_dataset.replace(to_replace="Benign", value=0, inplace=True)
df_dataset.replace(to_replace="Malicious", value=1, inplace=True)
print(df_dataset.info())
df_dataset.to_csv(filename, index=False)
return df_dataset
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data', default="Processed Traffic Data for ML Algorithms",
                        help="directory containing the raw per-day CSV files")
    args = parser.parse_args()
    # Concatenate every CSV in the data directory into one frame.
    # (Processes all days together; could add options to split by day.)
    # Fix: the old code globbed the literal string "args.data", which
    # ignored the -d option entirely.
    joined_files = os.path.join(args.data, "*.csv")
    joined_list = glob.glob(joined_files)
    if not joined_list:
        # pd.concat([]) raises an opaque ValueError; fail with a clear message
        raise SystemExit(f'No CSV files found in {args.data!r}')
    all_df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
    process(all_df, 'processed_data.csv')