# util.py — GraphSAGE utilities: model components (aggregator/encoder/classifier),
# cora dataset loading, train/val/test splitting, a training loop, and loss plotting.
import os
import random
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
from torch.autograd import Variable
from torch_geometric.data import Data
import matplotlib.pyplot as plt
def gen_abspath(directory: str, rel_path: str) -> str:
    """Build an absolute path by joining *rel_path* onto the absolute form of *directory*."""
    base = os.path.abspath(directory)
    return os.path.join(base, rel_path)
class Accumulator:
    """Accumulate running sums over n variables (e.g. loss and sample counts)."""

    def __init__(self, n):
        # One float slot per tracked quantity, all starting at zero.
        self.data = [0.0] * n

    def add(self, *args):
        # Element-wise add the given values onto the stored sums.
        self.data = [total + float(value) for total, value in zip(self.data, args)]

    def reset(self):
        # Zero every slot while keeping the same number of variables.
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
class MeanAggregator(nn.Module):
    """
    Aggregates a node's embedding as the mean of its neighbors' embeddings.
    """
    def __init__(self, features, cuda=False, gcn=False):
        """
        Initializes the aggregator for a specific graph.

        :param features: function mapping LongTensor of node ids to FloatTensor of feature values.
        :param cuda: whether to use GPU
        :param gcn: whether to perform concatenation GraphSAGE-style, or add self-loops GCN-style
        """
        super(MeanAggregator, self).__init__()
        self.features = features
        self.cuda = cuda
        self.gcn = gcn

    def forward(self, nodes, to_neighs, num_sample=10):
        """
        :param nodes: list of nodes in a batch
        :param to_neighs: list of sets, each set is the set of neighbors for node in batch
        :param num_sample: number of neighbors to sample. No sampling if None.
        :return: FloatTensor of shape (len(nodes), feature_dim) with mean-aggregated features
        """
        # Local pointer to function (speed hack)
        _set = set
        if num_sample is not None:
            # Sample num_sample neighbors per node; keep all neighbors when
            # a node has fewer than num_sample of them.
            _sample = random.sample
            samp_neighs = [_set(_sample(list(to_neigh), num_sample))
                           if len(to_neigh) >= num_sample else to_neigh
                           for to_neigh in to_neighs]
        else:
            samp_neighs = to_neighs
        if self.gcn:
            # BUG FIX: sets do not support `+`; the original raised TypeError here.
            # Add a self-loop so the node's own features join the mean.
            samp_neighs = [_set(samp_neigh) | {nodes[i]} for i, samp_neigh in enumerate(samp_neighs)]
        unique_nodes_list = list(set.union(*samp_neighs))
        # Map each unique node id to a column index.
        unique_nodes = {n: i for i, n in enumerate(unique_nodes_list)}
        # Adjacency mask, shape (batch size, number of unique nodes).
        # (`Variable` is a no-op since torch 0.4; a plain tensor suffices.)
        mask = torch.zeros(len(samp_neighs), len(unique_nodes))
        # Flatten the neighbor lists into (row, column) coordinates:
        # row = position of the node in the batch, column = neighbor's index.
        column_indices = [unique_nodes[n] for samp_neigh in samp_neighs for n in samp_neigh]
        row_indices = [i for i in range(len(samp_neighs)) for j in range(len(samp_neighs[i]))]
        # Set mask[i, j] = 1 wherever batch node i has neighbor j.
        mask[row_indices, column_indices] = 1
        if self.cuda:
            mask = mask.cuda()
        # Per-row neighbor counts, kept 2-D so broadcasting divides each row.
        num_neigh = mask.sum(1, keepdim=True)
        # Turn the 0/1 mask into row-normalized averaging weights.
        mask = mask.div(num_neigh)
        # self.features maps node ids to their feature vectors.
        if self.cuda:
            embed_matrix = self.features(torch.LongTensor(unique_nodes_list).cuda())
        else:
            embed_matrix = self.features(torch.LongTensor(unique_nodes_list))
        # (batch, unique nodes) @ (unique nodes, feature dim) => (batch, feature dim)
        to_feats = mask.mm(embed_matrix)
        return to_feats
class Encoder(nn.Module):
    """
    Encodes a node using the 'convolutional' GraphSAGE approach.
    """
    def __init__(self, features, feature_dim, embed_dim,
                 adj_lists, aggregator, num_sample=10,
                 gcn=False, cuda=False):
        """
        :param features: feature lookup (node ids -> feature matrix rows)
        :param feature_dim: input feature dimension
        :param embed_dim: output embedding dimension
        :param adj_lists: dict mapping node id -> set of neighbor ids
        :param aggregator: aggregator used to build neighbor embeddings
        :param num_sample: number of neighbors to sample per node
        :param gcn: if True, use only aggregated neighbor information
        :param cuda: whether to run on GPU
        """
        super(Encoder, self).__init__()
        self.features = features
        self.feat_dim = feature_dim
        self.adj_lists = adj_lists
        self.aggregator = aggregator
        self.num_sample = num_sample
        self.gcn = gcn
        self.embed_dim = embed_dim
        self.cuda = cuda
        # Keep the aggregator's device flag in sync with the encoder's.
        self.aggregator.cuda = cuda
        # GCN mode consumes only neighbor features; GraphSAGE mode concatenates
        # self features, doubling the input width.
        in_dim = self.feat_dim if self.gcn else 2 * self.feat_dim
        self.weight = nn.Parameter(torch.FloatTensor(embed_dim, in_dim))
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        """
        Generates embeddings for a batch of nodes.

        :param nodes: list of nodes
        :return: FloatTensor of shape (embed_dim, len(nodes))
        """
        neighbor_sets = [self.adj_lists[int(node)] for node in nodes]
        neigh_feats = self.aggregator.forward(nodes, neighbor_sets, self.num_sample)
        if self.gcn:
            # Neighbor information only.
            combined = neigh_feats
        else:
            # Concatenate each node's own features with its neighbor summary.
            ids = torch.LongTensor(nodes)
            if self.cuda:
                ids = ids.cuda()
            self_feats = self.features(ids)
            combined = torch.cat([self_feats, neigh_feats], dim=1)
        # Linear projection followed by ReLU.
        return F.relu(self.weight.mm(combined.t()))
class SupervisedGraphSage(nn.Module):
    """Supervised node-classification head on top of a GraphSAGE encoder."""

    def __init__(self, num_classes, enc):
        super(SupervisedGraphSage, self).__init__()
        self.enc = enc
        self.xent = nn.CrossEntropyLoss()
        # Final linear layer: (num_classes, embed_dim).
        self.weight = nn.Parameter(torch.FloatTensor(num_classes, enc.embed_dim))
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        # Encoder output is (embed_dim, batch); score and transpose so the
        # result is (batch, num_classes).
        node_embeds = self.enc(nodes)
        return self.weight.mm(node_embeds).t()

    def loss(self, nodes, labels):
        # Cross-entropy between class scores and flattened labels.
        return self.xent(self.forward(nodes), labels.squeeze())
def net(features, num_feats, adj_lists, label_cnt, use_cuda):
    """Build a two-layer supervised GraphSAGE model.

    :param features: function mapping node ids to feature vectors
    :param num_feats: input feature dimension
    :param adj_lists: dict mapping node id -> set of neighbor ids
    :param label_cnt: number of output classes
    :param use_cuda: whether to run on GPU
    :return: a SupervisedGraphSage wrapping the second-layer encoder
    """
    agg1 = MeanAggregator(features, cuda=use_cuda)
    enc1 = Encoder(features, num_feats, 128, adj_lists, agg1, gcn=True, cuda=use_cuda)
    # Layer 2 consumes layer 1's embeddings (transposed to (batch, embed_dim)).
    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=use_cuda)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(), enc1.embed_dim, 128, adj_lists, agg2,
                   gcn=True, cuda=use_cuda)
    # BUG FIX: the Encoder attribute is `num_sample` (no trailing "s").
    # The original assigned `num_samples`, a dead attribute, so both layers
    # silently kept sampling the default 10 neighbors instead of 5.
    enc1.num_sample = 5
    enc2.num_sample = 5
    return SupervisedGraphSage(label_cnt, enc2)
def num_gpus():
    """Return the number of CUDA devices visible to this process."""
    count = torch.cuda.device_count()
    return count
def load_cora(edge_path, feat_path):
    """Load the cora dataset from its edge and node-feature files.

    :param edge_path: path to the edge (citations) file, two node ids per line
    :param feat_path: path to the node feature file: "node_id feat... label" per line
    :return edge_list: list of [src_index, dst_index] pairs
    :return noi_to_feat: dict mapping node index -> list of int features
    :return noi_to_label: dict mapping node index -> integer label
    :return label_map: dict mapping raw label string -> integer label
    """
    # noi: node_index (dense 0..N-1 ordering of nodes as they appear)
    node_to_noi = {}
    noi_to_feat = {}
    noi_to_label = {}
    label_map = {}
    with open(feat_path) as fp:
        for idx, raw in enumerate(fp):
            fields = raw.strip().split()
            node_to_noi[fields[0]] = idx
            # Everything between the node id and the trailing label is a feature.
            noi_to_feat[idx] = [int(v) for v in fields[1:-1]]
            # Assign label codes in first-seen order.
            if fields[-1] not in label_map:
                label_map[fields[-1]] = len(label_map)
            noi_to_label[idx] = label_map[fields[-1]]
    edge_list = []
    with open(edge_path) as fp:
        for raw in fp:
            fields = raw.strip().split()
            edge_list.append([node_to_noi[fields[0]], node_to_noi[fields[1]]])
    return edge_list, noi_to_feat, noi_to_label, label_map
def create_pyg_cora_data(edge_list, feat_dict, label_dict):
    """Pack the loaded cora arrays into a PyG ``Data`` object."""
    # Sort by node index so row i of x/y corresponds to node index i.
    feats = [feat for _, feat in sorted(feat_dict.items())]
    labels = [label for _, label in sorted(label_dict.items())]
    return Data(
        x=torch.tensor(feats, dtype=torch.float),
        y=torch.tensor(labels, dtype=torch.long),
        # PyG expects edge_index as a contiguous (2, num_edges) LongTensor.
        edge_index=torch.tensor(edge_list, dtype=torch.long).t().contiguous(),
    )
def random_split_cora_data(data, train_rate, val_rate):
    """Randomly split nodes into train/val/test boolean masks attached to ``data``."""
    assert train_rate + val_rate < 1
    num_nodes = data.num_nodes
    # Split sizes; the test set takes whatever remains.
    n_train = int(num_nodes * train_rate)
    n_val = int(num_nodes * val_rate)
    # Shuffle node indices, then carve consecutive slices per split.
    shuffled = torch.randperm(num_nodes)
    splits = (
        ("train_mask", shuffled[:n_train]),
        ("val_mask", shuffled[n_train:n_train + n_val]),
        ("test_mask", shuffled[n_train + n_val:]),
    )
    for attr_name, indices in splits:
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        mask[indices] = True
        setattr(data, attr_name, mask)
    return data
def train(model, optimizer, data, num_epoch):
    """Run a full-batch training loop for ``num_epoch`` epochs.

    Returns the (in-place trained) model and the per-epoch train/val loss
    histories. Note the validation loss is computed from the same forward
    pass as the training loss, i.e. before that epoch's optimizer step.
    """
    model.train()
    train_losses, val_losses = [], []
    for _ in range(num_epoch):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        # Track both curves for later plotting.
        train_losses.append(loss.item())
        val_losses.append(F.nll_loss(out[data.val_mask], data.y[data.val_mask]).item())
    return model, train_losses, val_losses
def plot_loss(train_loss, val_loss, title):
    """Plot the train and validation loss curves on one axes and display the figure."""
    for series, series_label in ((train_loss, 'train loss'), (val_loss, 'val loss')):
        plt.plot(series, label=series_label)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.title(title)
    plt.legend()
    plt.show()