run.py
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve
import pandas as pd
# from sklearn.ensemble import AdaBoostClassifier
from CUSBoost import CUSBoostClassifier
from rusboost import RusBoost
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import math
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold
# datasets = ['gpcr_dataset_1282.txt']
dataset = 'pima.txt'
print("dataset : ", dataset)

# Load the dataset; it has no header row and the last column holds the class label.
df = pd.read_csv(dataset, header=None)
df['label'] = df[df.shape[1] - 1]
# Drop the original label column (its index shifted by one after adding 'label').
df.drop([df.shape[1] - 2], axis=1, inplace=True)

# Encode the class labels as integers.
labelencoder = LabelEncoder()
df['label'] = labelencoder.fit_transform(df['label'])

X = np.array(df.drop(['label'], axis=1))
y = np.array(df['label'])

# Scale each sample to unit norm.
normalization_object = Normalizer()
X = normalization_object.fit_transform(X)

skf = StratifiedKFold(n_splits=5, shuffle=True)

top_auc = 0
mean_fpr = np.linspace(0, 1, 100)

# Parameters for cluster-based undersampling (see CUSBoost).
number_of_clusters = 23
percentage_to_choose_from_each_cluster = 0.5
# Small grid search over tree depth and number of boosting rounds.
for depth in range(2, 20, 10):
    for estimators in range(20, 50, 10):
        current_param_auc = []
        current_param_aupr = []
        tprs = []

        for train_index, test_index in skf.split(X, y):
            X_train = X[train_index]
            X_test = X[test_index]
            y_train = y[train_index]
            y_test = y[test_index]

            classifier = CUSBoostClassifier(depth=depth, n_estimators=estimators)
            # classifier = RusBoost(depth=depth, n_estimators=estimators)

            # Train on the training fold only and evaluate on the held-out fold.
            classifier.fit(X_train, y_train)
            predictions = classifier.predict_proba_samme(X_test)

            auc = roc_auc_score(y_test, predictions[:, 1])
            aupr = average_precision_score(y_test, predictions[:, 1])
            current_param_auc.append(auc)
            current_param_aupr.append(aupr)

            fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
            tprs.append(np.interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0

        current_mean_auc = np.mean(np.array(current_param_auc))
        current_mean_aupr = np.mean(np.array(current_param_aupr))

        # Keep the best parameter setting seen so far (by mean ROC AUC).
        if top_auc < current_mean_auc:
            top_auc = current_mean_auc
            best_depth = depth
            best_estimators = estimators
            best_auc = top_auc
            best_aupr = current_mean_aupr

            best_tpr = np.mean(tprs, axis=0)
            best_fpr = mean_fpr

            # Curves for plotting, taken from the last fold of this setting.
            best_precision, best_recall, _ = precision_recall_curve(y_test, predictions[:, 1])
            best_fpr, best_tpr, thresholds = roc_curve(y_test, predictions[:, 1])
print('ROC: ', top_auc, ' AUPR: ', best_aupr, ' for depth = ', best_depth, ' estimators = ', best_estimators)
print('plotting', dataset)

# plt.clf()
plt.plot(best_recall, best_precision, lw=2, color='blue',
         label='Precision-Recall curve')
plt.plot(best_fpr, best_tpr, lw=2, color='red',
         label='ROC curve')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.legend(loc="upper right")
plt.show()
# plt.plot(fpr_c[1], tpr_c[1], lw=2, color='red', label='ROC curve: Clustered sampling')
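
# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original script, and not the
# implementation used by CUSBoostClassifier): the KMeans import and the
# `number_of_clusters` / `percentage_to_choose_from_each_cluster` settings
# above relate to CUSBoost's cluster-based undersampling of the majority
# class. A minimal, hypothetical version of that sampling step could look
# like the helper below; the real logic lives in CUSBoost.py and may differ.
def cluster_based_undersample(X, y, majority_label,
                              n_clusters=number_of_clusters,
                              keep_fraction=percentage_to_choose_from_each_cluster):
    """Cluster the majority class with k-means and keep a fraction of each cluster."""
    majority_mask = (y == majority_label)
    X_maj, y_maj = X[majority_mask], y[majority_mask]
    X_min, y_min = X[~majority_mask], y[~majority_mask]

    # Partition the majority class into clusters.
    cluster_ids = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(X_maj)

    # Randomly keep a fixed fraction of the samples in every cluster.
    kept = []
    for cluster in np.unique(cluster_ids):
        members = np.where(cluster_ids == cluster)[0]
        n_keep = max(1, int(len(members) * keep_fraction))
        kept.extend(np.random.choice(members, size=n_keep, replace=False))

    # Recombine the reduced majority class with the untouched minority class.
    X_balanced = np.vstack([X_maj[kept], X_min])
    y_balanced = np.concatenate([y_maj[kept], y_min])
    return X_balanced, y_balanced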