-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpredictor.py
384 lines (346 loc) · 18.2 KB
/
predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
import pandas as pd
import networkx as nx
import json
import os
import numpy as np
import random
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score
from sklearn.model_selection import train_test_split
class Predictor():
    """Detect unlikely edges in a test graph using node2vec link prediction.

    Records are ingested from JSON-lines files into a training and a test
    DataFrame. Requested column pairs ("col1--col2") become edges between
    node IDs, where an ID is the position of the stringified column value in
    ``keymap``. node2vec embeddings of the training graph feed a logistic
    regression edge classifier; test edges that also occur in the training
    data are scored and reported when their probability is below
    ``threshold``.
    """

    def __init__(self, ret, in_out, numwalks, walklength, threshold_value, verbose):
        """Store the node2vec / link prediction settings and reset all state.

        ret: node2vec return parameter (p)
        in_out: node2vec in-out parameter (q)
        numwalks: number of random walks
        walklength: length of each random walk
        threshold_value: probability below which a test edge is reported
        verbose: when truthy, progress messages are printed
        """
        # Data Ingestion Variables
        self.json_data = []  # every record ingested so far (train and test), in list form
        self.json_folder = []  # every data file discovered by ingest_folder
        self._train_records = []  # records that make up traindb
        self._test_records = []  # records that make up testdb
        # Graph Specific Variables
        self.nodes = []  # the list node types requested by the user
        self.edges = []  # the list of edge types requested by the user
        self.TrainGraph = nx.Graph()  # the training Graph
        self.TestGraph = nx.Graph()  # the test Graph
        self.keymap = []  # node attribute strings; the list position is the node ID
        self._keymap_index = {}  # attribute string -> node ID, mirrors keymap
        self.edgestatus = True  # a boolean variable to ensure an edge can or cannot exist
        self.traindb = pd.DataFrame()  # a Pandas DataFrame with the training data
        self.testdb = pd.DataFrame()  # a Pandas DataFrame with the test data
        self.trainedgesattr = []  # training edge attributes, "value1--value2"
        self.testedgesattr = []  # test edge attributes, "value1--value2"
        # New Edge Variables
        self.affirmed_testedge_keymap = []  # (id1, id2) for test edges that also exist in the training data
        self.newedgeattr = []  # test edges ("value1--value2") that do not exist in the training data
        self.affirmed_testedge = []  # test edges ("value1--value2") that do exist in the training data
        # Splunk Variables - to be implemented later
        # self.credentials = {}
        # self.baseurl = ""
        # self.splunk_search = {}
        # Node2Vec Variables
        self.walklen = walklength  # walk length for node2vec random walks
        self.numwalk = numwalks  # number of walks
        self.p = ret  # return parameter
        self.q = in_out  # in-out parameter
        # Link Prediction Variables
        self.threshold = threshold_value  # the link prediction threshold
        self.hits = []  # the edges that are below the link prediction threshold
        self.edge_predictions = []  # list of probabilities for each test edge embedding
        # Graph Embedding Variables
        self.pos_edge_embeddings = []  # edge embeddings that exist in the training graph
        self.pos_edge_label = []  # labels corresponding to positive edge embeddings
        self.neg_edge_embeddings = []  # edge embeddings that don't exist in the training graph
        self.neg_edge_label = []  # labels of corresponding negative edges
        self.test_edge_embeddings = []  # edge embeddings in the testing graph
        self.test_edge_labels = []  # labels corresponding to the test edges
        # Utility variables
        self.verbose = verbose

    def ingest_file(self, filepath, type):
        """Load one JSON-lines file into the training or test DataFrame.

        filepath: location of the file; each non-blank line is one JSON document
        type: "train" adds the records to traindb, anything else to testdb

        Records are tracked per dataset so that training records never end up
        in testdb (and vice versa); json_data still collects every record.
        """
        if self.verbose:
            print(f"Beginning ingestion of data at file {filepath}")
        # The context manager closes the handle even if a line fails to parse.
        with open(filepath) as handle:
            records = [json.loads(line) for line in handle if line.strip()]
        self.json_data.extend(records)
        if type == "train":
            self._train_records.extend(records)
            self.traindb = pd.DataFrame(self._train_records)
        else:
            self._test_records.extend(records)
            self.testdb = pd.DataFrame(self._test_records)

    def ingest_folder(self, folderpath, type):
        """Ingest every file found under a folder, or the path itself if none is found.

        folderpath: location of the folder (or of a single file)
        type: dataset selector passed through to ingest_file
        """
        if self.verbose:
            print(f"Beginning ingestion of data at folder {folderpath}")
        # Collect the files of THIS call only; json_folder keeps the full
        # history, but files from earlier calls must not be ingested again.
        discovered = []
        for (root, _directory, files) in sorted(os.walk(folderpath)):
            for f in sorted(files):
                discovered.append(os.path.join(str(root), str(f)))
        if self.verbose:
            print(f"Done walking folder {folderpath}")
        self.json_folder.extend(discovered)
        for path in discovered:
            self.ingest_file(path, type)
        if not discovered:
            # If the folder is really just a single file
            if self.verbose:
                print(f'{folderpath} is a single file.')
            self.ingest_file(folderpath, type)

    def add_weights(self, input_Graph):
        """Set each node's 'weight' attribute to 1 / degree [Unused right now].

        input_Graph: NetworkX Graph whose nodes receive the weight
        Nodes with degree 0 get weight 0.0 instead of raising ZeroDivisionError.
        """
        for n in input_Graph.nodes():
            degree = input_Graph.degree(n)
            input_Graph.nodes[n]['weight'] = 1 / degree if degree else 0.0
        if self.verbose:
            print('Done adding weights to node in graph')

    def combine(self, first, second):
        """Return first + second (used to concatenate embedding / label lists)."""
        return first + second

    def return_nodeattr_fromid(self, input_id):
        """Return the node attribute string stored in keymap at position input_id."""
        return self.keymap[input_id]

    def create_edge(self, firstnode, secondnode):
        """Return the string "firstnode--secondnode"."""
        return str(firstnode) + "--" + str(secondnode)

    def return_node_degrees(self, input_Graph):
        """Return a numpy array with the degree of every node of input_Graph."""
        return np.array([input_Graph.degree(node) for node in input_Graph.nodes()])

    def createnode2vec(self):
        """Fit node2vec on the training graph and build the Hadamard edge embedder."""
        self.node2vec = Node2Vec(self.TrainGraph, walk_length=self.walklen, num_walks=self.numwalk,
                                 p=self.p, q=self.q)  # https://arxiv.org/pdf/1607.00653.pdf
        if self.verbose:
            print(f'Node2Vec instantiated with p={self.p}, q={self.q}, walklength={self.walklen}, numwalks={self.numwalk}')
        self.model = self.node2vec.fit(window=2, sample=0.1)
        # embed edges as the product of the node embeddings
        self.hadamard = HadamardEmbedder(keyed_vectors=self.model.wv)
        if self.verbose:
            print("Done with node2vec instantiation")

    def createlogreg(self):
        """Instantiate the Logistic Regression edge classifier."""
        self.edge_classifier = LogisticRegression(random_state=42, max_iter=100000)
        if self.verbose:
            print('Logistic regression classifier instantiated')

    def embed_edges(self, edges, label):
        """Embed edges with the Hadamard embedder.

        edges: iterable of (node1, node2) pairs
        label: value assigned to every embedded edge (either 0 or 1)
        Returns (embedded_edges, embedded_labels), two lists of equal length.
        """
        embedded_edges = []
        embedded_labels = []
        for n1, n2 in edges:
            embedded_edges.append(self.hadamard[str(n1), str(n2)])
            embedded_labels.append(label)
        return embedded_edges, embedded_labels

    def negative_embeds(self, ratio=10):
        """Sample node pairs that are not training edges and embed them as negatives.

        ratio: number of negative embeddings wanted per positive embedding
        Pairs are drawn uniformly from distinct nodes; the same pair may be
        drawn more than once.
        Raises ValueError when negatives are needed but every pair of distinct
        nodes is already connected (sampling could never finish).
        """
        target = len(self.pos_edge_embeddings) * ratio
        if len(self.neg_edge_embeddings) >= target:
            return
        nodes = list(self.TrainGraph.nodes())  # built once, not per draw
        proper_edges = sum(1 for u, v in self.TrainGraph.edges() if u != v)
        available = len(nodes) * (len(nodes) - 1) // 2 - proper_edges
        if available <= 0:
            raise ValueError("Training graph has no pair of distinct, non-adjacent nodes "
                             "to sample negative edges from")
        while len(self.neg_edge_embeddings) < target:
            random_node1, random_node2 = random.sample(nodes, 2)
            # has_edge covers both orientations of an undirected edge
            if self.TrainGraph.has_edge(random_node1, random_node2):
                continue
            self.neg_edge_embeddings.append(self.hadamard[str(random_node1), str(random_node2)])
            self.neg_edge_label.append(0.)

    def method_evaluation(self):
        """Evaluate the classifier on a held-out 20% split, not used yet.

        Returns (roc, ap): ROC AUC and average precision on the held-out part.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            self.combine(self.pos_edge_embeddings, self.neg_edge_embeddings),
            self.combine(self.pos_edge_label, self.neg_edge_label), test_size=0.20, random_state=42)
        self.fitpredict(X_train, y_train)
        y_preds = self.edge_classifier.predict_proba(X_test)[:, 1]
        self.roc = roc_auc_score(y_test, y_preds)
        self.ap = average_precision_score(y_test, y_preds)
        return self.roc, self.ap

    def fitpredict(self, Xlabels, Ylabels):
        """Fit the edge classifier.

        Xlabels: the edge embeddings to train on
        Ylabels: the label of each edge embedding
        """
        self.edge_classifier.fit(Xlabels, Ylabels)

    def fitpredictproba(self, embeds):
        """Return the second predict_proba column (the [:, 1] slice) for each embedded edge in embeds."""
        return self.edge_classifier.predict_proba(embeds)[:, 1]

    def linkpredict(self):
        """Embed training edges, sample negatives, fit the classifier, then run detect()."""
        if self.verbose:
            print('Beginning Link Prediction')
        self.pos_edge_embeddings, self.pos_edge_label = self.embed_edges(self.TrainGraph.edges(), 1)
        self.negative_embeds()
        self.fitpredict(self.combine(self.pos_edge_embeddings, self.neg_edge_embeddings),
                        self.combine(self.pos_edge_label, self.neg_edge_label))
        if self.verbose:
            print('Beginning Anomalous Edge Detection')
        self.detect()

    def detect(self):
        """Score the confirmed test edges and report those below the threshold."""
        self.newedge()  # check for new edges
        if self.verbose:
            print(f'Confirmed edges: {self.affirmed_testedge_keymap}')
        if not self.affirmed_testedge_keymap:
            # Nothing to score; predict_proba would fail on an empty input.
            self.test_edge_embeddings = []
            self.edge_predictions = []
            return
        self.test_edge_embeddings, _ = self.embed_edges(self.affirmed_testedge_keymap, 0)
        self.edge_predictions = self.fitpredictproba(self.test_edge_embeddings)
        # A dict keeps one prediction per distinct edge (the last one seen).
        pred_dictionary = dict(zip(self.affirmed_testedge_keymap, self.edge_predictions))
        for k, v in pred_dictionary.items():
            self.threshold_analysis(k, v)

    def return_edge_predictions(self):
        """Return edge_predictions."""
        return self.edge_predictions

    def threshold_analysis(self, input_edge, input_pred):
        """Report and record an edge whose prediction is below the threshold.

        input_edge: (id1, id2) pair of keymap node IDs
        input_pred: link prediction value for that edge
        A reported edge is printed and its "value1--value2" form is appended
        to hits.
        """
        if input_pred < self.threshold:
            final_edge = self.create_edge(self.return_nodeattr_fromid(int(input_edge[0])),
                                          self.return_nodeattr_fromid(int(input_edge[1])))
            self.hits.append(final_edge)
            print(f'Edge: {final_edge} below threshold with link prediction: {input_pred}')

    def checkerrors_edge(self, error_check):
        """Exit unless every requested edge has the form "node1--node2".

        error_check: list of edges to check
        Raises SystemExit when an edge is empty or is missing either node.
        """
        for e in error_check:
            text = str(e)
            if text == "":
                print('ERROR: Missing edge in input')
                raise SystemExit(1)
            # Same separator as split_edge, so what passes here can be split.
            parts = text.split("--")
            if len(parts) < 2 or parts[0] == "" or parts[1] == "":
                print('ERROR: Missing node within edge in input')
                raise SystemExit(1)

    def checkerrors_node(self, testnode):
        """Return True when testnode is a column of both traindb and testdb."""
        if testnode in self.traindb.columns and testnode in self.testdb.columns:
            if self.verbose:
                print(f'Found {testnode} in dataset')
            return True
        print(f'ERROR: Did not find node {testnode} in dataset')
        return False

    def feature_check(self, input_edges):
        """Validate the requested edges and save the valid ones.

        input_edges: comma separated edges, e.g. "Node1--Node2,Node2--Node3"
        Raises SystemExit when an edge is malformed or names an unknown column.
        """
        edgelist = input_edges.split(",")
        self.checkerrors_edge(edgelist)
        for edge in edgelist:
            n1, n2 = self.split_edge(edge)
            if not (self.checkerrors_node(n1) and self.checkerrors_node(n2)):
                raise SystemExit(1)
            self.save_edge(edge)
        if self.verbose:
            print('Feature check complete')

    def save_edge(self, good_edge):
        """Append good_edge to the list of requested edges."""
        self.edges.append(good_edge)

    def split_edge(self, input_edge):
        """Split an edge on "--" and return (node1, node2).

        Raises IndexError when input_edge contains no "--".
        """
        parts = str(input_edge).split("--")
        return parts[0], parts[1]

    def create_edge_from_ids(self, n1, n2):
        """Return the tuple (n1, n2)."""
        return (n1, n2)

    def return_nodevalues(self, r, requested_node):
        """Return the attribute named requested_node of row r."""
        return getattr(r, str(requested_node))

    def _keymap_lookup(self):
        """Return the attribute -> node ID dict, rebuilding it if keymap changed size."""
        index = getattr(self, "_keymap_index", None)
        if index is None or len(index) != len(self.keymap):
            index = {}
            for position, value in enumerate(self.keymap):
                # setdefault keeps the first position, like list.index does
                index.setdefault(value, position)
            self._keymap_index = index
        return index

    def add_nodehandler(self, node, attr, input_graph):
        """Return the node ID for attr, registering the node when needed.

        node: the node type (column name), stored in the node's props
        attr: attribute value of the node; it is compared and stored as str
        input_graph: graph that receives the node if it does not have it yet
        """
        key = str(attr)
        index = self._keymap_lookup()
        node_id = index.get(key)
        if node_id is None:
            node_id = len(self.keymap)
            self.keymap.append(key)
            index[key] = node_id
        # keymap is shared by both graphs, so a known attribute may still be
        # missing from this particular graph.
        if not input_graph.has_node(node_id):
            input_graph.add_node(node_id, props={str(node): key})
        return node_id

    def _populate_graph(self, frame, graph, edge_attrs):
        """Add one graph edge per (row, requested edge) of frame.

        frame: DataFrame to read, graph: graph to fill,
        edge_attrs: list that receives the "value1--value2" string of each edge
        """
        # Positional lookup works for any column name; attribute access on
        # itertuples rows fails for names that are not valid identifiers.
        positions = {name: position for position, name in enumerate(frame.columns)}
        requested = [self.split_edge(e) for e in self.edges]
        for row in frame.itertuples(index=False, name=None):
            for n1, n2 in requested:
                firstnode = row[positions[n1]]
                secondnode = row[positions[n2]]
                nodeid_firstnode = self.add_nodehandler(n1, str(firstnode), graph)
                nodeid_secondnode = self.add_nodehandler(n2, str(secondnode), graph)
                edge_attrs.append(self.create_edge(firstnode, secondnode))
                graph.add_edge(nodeid_firstnode, nodeid_secondnode)

    def process_train(self):
        """Create the training graph from traindb."""
        self._populate_graph(self.traindb, self.TrainGraph, self.trainedgesattr)
        if self.verbose:
            print(f"Number of Training Nodes: {len(self.TrainGraph.nodes())}")
            print(f"Number of Training Edges: {len(self.TrainGraph.edges())}")

    def process_test(self):
        """Create the test graph from testdb."""
        self._populate_graph(self.testdb, self.TestGraph, self.testedgesattr)
        if self.verbose:
            print(f"Number of Testing Nodes: {len(self.TestGraph.nodes())}")
            print(f"Number of Testing Edges: {len(self.TestGraph.edges())}")

    def newedge(self):
        """Sort the test edges into new edges and edges confirmed by the training data.

        A test edge is confirmed when it, or its reverse, is a training edge.
        Confirmed edges are appended to affirmed_testedge and, as keymap ID
        pairs, to affirmed_testedge_keymap; all others go to newedgeattr.
        """
        known = set(self.trainedgesattr)  # built once for O(1) membership tests
        for edg in self.testedgesattr:
            if edg in known or self.reverse_edge(edg) in known:
                self.affirmed_testedge.append(edg)
                n1, n2 = self.split_edge(edg)
                self.affirmed_testedge_keymap.append((self.get_keymap(str(n1)), self.get_keymap(str(n2))))
            else:
                if self.verbose:
                    print(f'Test Edge: {edg} not found in training graph')
                self.newedgeattr.append(edg)

    def reverse_edge(self, forward_edge):
        """Return forward_edge flipped, so that node1--node2 becomes node2--node1."""
        first, second = self.split_edge(forward_edge)
        return self.create_edge(second, first)

    def get_keymap(self, node):
        """Return the node ID (keymap position) of the attribute string node.

        Raises ValueError when node is not in keymap.
        """
        try:
            return self._keymap_lookup()[node]
        except KeyError:
            raise ValueError(f"{node!r} is not in keymap") from None