label_leakage_test.py
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data.dataset import Dataset
from torch.utils.tensorboard import SummaryWriter
from splitLearning import SplitNN, SplitNNClient
from attack import norm_attack, direction_attack
class NumpyDataset(Dataset):
"""This class allows you to convert numpy.array to torch.Dataset
Args:
x (np.array):
y (np.array):
transform (torch.transform):
Attriutes
x (np.array):
y (np.array):
transform (torch.transform):
"""
def __init__(self, x, y=None, transform=None, return_idx=False):
self.x = x
self.y = y
self.transform = transform
self.return_idx = return_idx
def __getitem__(self, index):
x = self.x[index]
if self.y is not None:
y = self.y[index]
if self.transform is not None:
x = self.transform(x)
if not self.return_idx:
if self.y is not None:
return x, y
else:
return x
else:
if self.y is not None:
return index, x, y
else:
return index, x
def __len__(self):
"""get the number of rows of self.x"""
return len(self.x)
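# Illustrative usage sketch (hypothetical arrays, shapes chosen just for the example):
#   ds = NumpyDataset(np.zeros((4, 3)), np.ones((4, 1)))
#   x0, y0 = ds[0]                                           # one (feature, label) pair
#   idx, x = NumpyDataset(np.zeros((4, 3)), return_idx=True)[0]  # (index, x) when y is None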
config = {"batch_size": 128}
# The input seems to have only about 28 feature dimensions; the first linear layer maps it to 16 hidden units.
hidden_dim = 16
class FirstNet(nn.Module):
def __init__(self, train_features):
super(FirstNet, self).__init__()
self.L1 = nn.Linear(train_features.shape[-1], hidden_dim)
def forward(self, x):
x = self.L1(x)
x = nn.functional.relu(x)
return x
class SecondNet(nn.Module):
def __init__(self):
super(SecondNet, self).__init__()
self.L2 = nn.Linear(hidden_dim, 1)
def forward(self, x):
x = self.L2(x)
x = torch.sigmoid(x)
return x
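# In the split-learning setup, FirstNet would sit on the data-holding party and
# SecondNet (which sees the labels through the loss) on the label-holding party;
# only the 16-dim cut-layer activations and their gradients cross the boundary.
# This reading is inferred from how SplitNN/SplitNNClient are used below, not
# from their source.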
def torch_auc(label, pred):
# .cpu() so this also works when the tensors live on a CUDA device
return roc_auc_score(label.detach().cpu().numpy(), pred.detach().cpu().numpy())
def main(writer=None):
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device is ", device)
raw_df = pd.read_csv(
"creditcard.csv"
)
# Split the loaded data by class
raw_df_neg = raw_df[raw_df["Class"] == 0]
raw_df_pos = raw_df[raw_df["Class"] == 1]
down_df_neg = raw_df_neg # .sample(40000)
# Concatenate the two parts back together along the row axis. Why split and
# re-merge? Presumably so the negative class can be down-sampled (see the
# commented-out .sample(40000) above).
down_df = pd.concat([down_df_neg, raw_df_pos])
neg, pos = np.bincount(down_df["Class"])
total = neg + pos
print(
"Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n".format(
total, pos, 100 * pos / total
)
)
cleaned_df = down_df.copy()
# You don't want the `Time` column.
cleaned_df.pop("Time")
# The `Amount` column covers a huge range. Convert to log-space.
eps = 0.001 # 0 => 0.1¢
# Note: pop removes the column and returns its values in one step.
# Why add eps? So zero amounts map to log(eps) instead of log(0) = -inf.
cleaned_df["Log Amount"] = np.log(cleaned_df.pop("Amount") + eps)
# Use a utility from sklearn to split and shuffle our dataset.
train_df, test_df = train_test_split(cleaned_df, test_size=0.2)
train_df, val_df = train_test_split(train_df, test_size=0.2)
# Resulting fractions: train_df 64% | val_df 16% | test_df 20%
# Form np arrays of labels and features.
train_labels = np.array(train_df.pop("Class"))
val_labels = np.array(val_df.pop("Class"))
test_labels = np.array(test_df.pop("Class"))
train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)
# sklearn.preprocessing.StandardScaler standardizes the data: the usual scheme
# subtracts the mean and divides by the standard deviation, mapping each
# feature to zero mean and unit variance.
scaler = StandardScaler()
'''
fit_transform combines fit and transform: fit_transform(X_train) computes
X_train's statistics (mean and standard deviation) and applies the scaling to
X_train itself. Because the scaler now stores the training-set statistics,
the other splits can later be standardized with plain transform(), so every
split is scaled consistently with the training data.
'''
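# Tiny numeric illustration (hypothetical values, not from this dataset):
#   StandardScaler().fit_transform([[0.], [2.]])   ->  [[-1.], [1.]]   (mean 1, std 1)
#   ...then transform([[1.]]) on that fitted scaler ->  [[0.]]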
train_features = scaler.fit_transform(train_features)
# Standardize the remaining splits with the training-set statistics, so all data share one consistent scaling.
val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)
# np.clip truncates values outside a range, pinning them to the given bounds.
# Here the maximum is set to 5 and the minimum to -5.
# Note: since the data is now roughly standard normal (3-sigma rule), values
# above 5 or below -5 are almost nonexistent anyway.
train_features = np.clip(train_features, -5, 5)
val_features = np.clip(val_features, -5, 5)
test_features = np.clip(test_features, -5, 5)
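# Behavior of np.clip in one line (illustrative input):
#   np.clip([-7.0, 0.0, 6.0], -5, 5)  ->  [-5.0, 0.0, 5.0]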
print("Training labels shape:", train_labels.shape)
print("Validation labels shape:", val_labels.shape)
print("Test labels shape:", test_labels.shape)
print("Training features shape:", train_features.shape)
print("Validation features shape:", val_features.shape)
print("Test features shape:", test_features.shape)
# Convert the numpy data to torch-compatible datasets and feed them to DataLoaders.
# astype casts the labels to float64.
# .reshape(-1, 1): -1 lets numpy infer the length, 1 means one value per row,
# i.e. the labels become a single column.
train_dataset = NumpyDataset(
train_features, train_labels.astype(np.float64).reshape(-1, 1)
)
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=config["batch_size"], shuffle=True
)
test_dataset = NumpyDataset(
test_features, test_labels.astype(np.float64).reshape(-1, 1)
)
test_loader = torch.utils.data.DataLoader(
test_dataset, batch_size=config["batch_size"], shuffle=True
)
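# Each batch yielded by these loaders should be a (features, labels) pair with
# shapes roughly (128, n_features) and (128, 1), both float64 (the last batch
# of an epoch may be smaller).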
# -------------------------------------------------
# Data loading done; now build the model.
# -------------------------------------------------
# First layer: ~28 input dimensions into a linear layer with 16 hidden units,
# followed by a ReLU activation.
model_1 = FirstNet(train_features)
# Move the model onto the chosen device.
model_1 = model_1.to(device)
# Second layer: 16 input dimensions, 1 output dimension,
# with a sigmoid activation.
model_2 = SecondNet()
model_2 = model_2.to(device)
# Convert all floating-point parameters and buffers to double precision, matching the float64 numpy inputs.
model_1.double()
model_2.double()
# Build one Adam optimizer per sub-model; lr is the learning rate.
opt_1 = optim.Adam(model_1.parameters(), lr=1e-3)
opt_2 = optim.Adam(model_2.parameters(), lr=1e-3)
optimizers = [opt_1, opt_2]
# Loss: binary cross-entropy, averaged over the batch.
criterion = nn.BCELoss()
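# For reference, per-sample BCE with prediction p and label y is
#   -(y * log(p) + (1 - y) * log(1 - p)),
# which nn.BCELoss averages over the batch by default.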
client_1 = SplitNNClient(model_1, user_id=0)
client_2 = SplitNNClient(model_2, user_id=0)
# The trickier part starts here.
clients = [client_1, client_2]
splitnn = SplitNN(clients, optimizers)
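# Inferred from usage below (splitLearning's source is not shown here):
# splitnn(inputs) presumably chains client_1's and client_2's forward passes,
# and splitnn.backward(loss) runs backward on client_2 and hands the cut-layer
# gradient back to client_1.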
# -------------------------------------------------
# Model built; start training.
# -------------------------------------------------
# Switch to training mode.
splitnn.train()
for epoch in range(128):
epoch_loss = 0
epoch_outputs = []
epoch_labels = []
for i, data in enumerate(train_loader):
for opt in optimizers:
# Gradients accumulate by default, so clear them at the start of every iteration.
opt.zero_grad()
inputs, labels = data
inputs = inputs.to(device)
labels = labels.to(device)
outputs = splitnn(inputs)
loss = criterion(outputs, labels)
# loss.backward()
'''
A tensor's .grad attribute defaults to None. The first time backward() is
called to compute gradients for it, .grad becomes a Tensor holding those
gradients; any later backward() call accumulates into .grad rather than
overwriting it.
'''
splitnn.backward(loss)
epoch_loss += loss.item() / len(train_loader.dataset)
epoch_outputs.append(outputs)
epoch_labels.append(labels)
# Update the parameters of both models.
for opt in optimizers:
opt.step()
'''
sklearn provides an AUC utility that only needs the true labels and the
predictions; torch.cat() merges the per-batch outputs into a single tensor.
'''
auc = torch_auc(torch.cat(epoch_labels), torch.cat(epoch_outputs))
print(
f"epoch={epoch}, loss: {epoch_loss}, auc: {auc}"
)
if writer:
writer.add_scalar(
"Loss", scalar_value=epoch_loss, global_step=epoch)
writer.add_scalar("auc", scalar_value=auc, global_step=epoch)
# Also track AUC on the test data.
test_epoch_outputs = []
test_epoch_labels = []
for i, data in enumerate(test_loader):
inputs, labels = data
inputs = inputs.to(device)
labels = labels.to(device)
outputs = splitnn(inputs)
test_epoch_outputs.append(outputs)
test_epoch_labels.append(labels)
test_auc = torch_auc(torch.cat(test_epoch_labels), torch.cat(test_epoch_outputs))
if writer:
writer.add_scalar(
"test_auc", scalar_value=test_auc, global_step=epoch)
# Note: every attack run is itself a training pass!
# Attack 1: norm attack
# train_leak_auc = norm_attack(splitnn, train_loader, attack_criterion = nn.BCELoss(), device=device)
# print("Leak AUC is ", train_leak_auc)
# test_leak_auc = norm_attack(splitnn, test_loader, attack_criterion = nn.BCELoss(), device=device)
# print("Leak AUC is ", test_leak_auc)
# Attack 2: cosine (direction) attack
# train_leak_auc = direction_attack(splitnn, train_loader, attack_criterion=nn.BCELoss(), device=device)
# print("Leak AUC is ", train_leak_auc)
# test_leak_auc = direction_attack(splitnn, test_loader, attack_criterion=nn.BCELoss(), device=device)
# print("Leak AUC is ", test_leak_auc)
if __name__ == "__main__":
writer = SummaryWriter("efficiency3")
main(writer)
writer.close()