-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcomparaison_python.py
305 lines (212 loc) · 13.5 KB
/
comparaison_python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 18 17:01:32 2019
@author: bchassagno
"""
import os
import pyAgrum as gum
import pyAgrum.lib._utils.oslike as oslike
import re
import time
import pyAgrum.lib.bn_vs_bn as comp
from h2pc import H2PC
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import load_objects,save_objects
import pyAgrum.lib.ipython as gnb
from pyAgrum.lib.bn2scores import computeScores
import pickle
from independances import indepandance
from collections import OrderedDict
from matplotlib.lines import Line2D
def choose_graph_name(name_graphes):
    """Interactively ask the user to pick one of the available graph files.

    Parameters
    ----------
    name_graphes : iterable of str
        File names (with extension) of the candidate graph files.

    Returns
    -------
    str
        The chosen file name, with its original extension restored.
    """
    # map the extension-less display name shown to the user back to the real file name
    dico_name_graphes_formatted={os.path.splitext(graph)[0]:graph for graph in name_graphes}
    graph=input("choose one of the following graphs {} :\n".format(list(dico_name_graphes_formatted.keys())))
    if graph in list(dico_name_graphes_formatted.keys()):
        return dico_name_graphes_formatted[graph]
    else:
        print("Inserted name does not belong to those written")
        # BUG FIX: the recursive retry's result was previously dropped
        # (no `return`), so a single mistyped name made the function
        # return None even after a subsequent valid answer.
        return choose_graph_name(name_graphes)
def compute_average_distance(bn, nsamples,size,algorithm,score_measured):
    """Learn `nsamples` networks from fresh databases sampled from `bn` and
    collect the requested scores for each repetition.

    Parameters
    ----------
    bn : gum.BayesNet
        Reference (true) network used to generate the databases.
    nsamples : int
        Number of repetitions; one fresh database of `size` rows each.
    size : int
        Number of rows of each generated database.
    algorithm : tuple
        ``(name, (args, kwargs))`` describing the learning algorithm to run.
    score_measured : list of str
        Names of the scores to extract, in the order they must be stored.

    Returns
    -------
    numpy.ndarray
        Matrix of shape (len(score_measured), nsamples): one column of
        scores per repetition.

    Raises
    ------
    AssertionError
        If 100 attempts still end in a directed-cycle error.
    """
    #algorithm: (type, (args, kwargs))
    type_algorithm,args,kwargs=algorithm[0],algorithm[1][0],algorithm[1][1]
    scoring_matrix=np.empty(shape=(len(score_measured),nsamples))
    for repetition in range(nsamples):
        #generate a database of the required size
        gum.generateCSV(bn,os.path.join("databases","temp_base.csv"),size,False,with_labels=True)
        learner=gum.BNLearner(os.path.join("databases","temp_base.csv"))
        # method table of the pyAgrum learners; values are methods bound to `learner`
        dico_algorithm={'greedy_climbing':learner.useGreedyHillClimbing,'tabu_search':learner.useLocalSearchWithTabuList, 'miic':learner.useMIIC, '3off2':learner.use3off2}
        #with algos coming from pyagrum
        if type_algorithm in dico_algorithm.keys():
            detect_cycle,iteration=True,0
            #we repeat on a fresh database to work around occasional cycle errors
            while (detect_cycle and iteration<100):
                start_time=time.time()
                iteration+=1
                try:
                    # BUG FIX: the requested algorithm is configured here;
                    # previously useMIIC() was hard-coded for every algorithm.
                    dico_algorithm[type_algorithm](*args,**kwargs)
                    created_bn=learner.learnBN()
                except gum.InvalidDirectedCycle:
                    gum.generateCSV(bn,os.path.join("databases","temp_base.csv"),size,False,with_labels=True)
                    learner=gum.BNLearner(os.path.join("databases","temp_base.csv"))
                    # BUG FIX: rebuild the method table — its values were bound
                    # to the discarded learner, so retries silently reused it.
                    dico_algorithm={'greedy_climbing':learner.useGreedyHillClimbing,'tabu_search':learner.useLocalSearchWithTabuList, 'miic':learner.useMIIC, '3off2':learner.use3off2}
                else:
                    detect_cycle=False
            end_time=time.time()-start_time
            if iteration>=100:
                raise AssertionError("failure of miic to compute the bayesian network")
        #for the moment, only possible case is H2PC case, these lines won't be required after
        else:
            df=pd.read_csv(os.path.join("databases","temp_base.csv"))
            start_time=time.time()
            created_bn=H2PC(learner,df,*args,**kwargs).learnBN()
            end_time=time.time()-start_time
        #store results from the pyAgrum comparison tools in scoring_matrix
        scores_list=comp.GraphicalBNComparator(bn,created_bn).scores()
        scores_list.update(computeScores(created_bn,os.path.join("databases","temp_base.csv")))
        scores_list.update(comp.GraphicalBNComparator(bn,created_bn).hamming())
        # add timing, number of independence tests and specificity = tn/(tn+fp)
        scores_list.update({'time':end_time,'number_tests':indepandance.number_tests,'specificity':scores_list['count']['tn']/(scores_list['count']['tn'] +scores_list['count']['fp'])})
        #store results of each score, conserving the order of the given list
        ordered_score = OrderedDict((score, scores_list[score]) for score in score_measured)
        scoring_matrix[:,repetition]=list(ordered_score.values())
    return scoring_matrix
def learn_scores(bn,sample_size,score_measured=['dist2opt'],algorithms={'tabu_search':([],{})},nsamples=30):
    """Measure the requested scores for each algorithm on databases of each size.

    Parameters
    ----------
    bn : gum.BayesNet
        Reference network the databases are sampled from.
    sample_size : list of int
        Database sizes to evaluate.
    score_measured : list of str
        Score names to compute; must belong to the supported list below.
    algorithms : dict
        Maps an algorithm name to its ``(args, kwargs)`` configuration.
    nsamples : int
        Number of repetitions per (size, algorithm) pair.

    Returns
    -------
    numpy.ndarray
        Matrix of shape (len(sample_size), len(algorithms),
        len(score_measured), nsamples).

    Raises
    ------
    AssertionError
        For an unknown score or algorithm name, or when 'number_tests'
        is requested with anything but H2PC.
    """
    possible_scoring_distances=['recall','precision','fscore','dist2opt','bic','aic','mdl','time','number_tests','specificity','hamming','shd']
    for score in score_measured:
        if score not in possible_scoring_distances:
            raise AssertionError("distance score still not implemented, list of of possible computations is {}".format(possible_scoring_distances))
    # 'number_tests' is only tracked by the H2PC implementation.
    # BUG FIX: compare case-insensitively — the accepted spelling below is
    # 'H2PC', so the old lowercase comparison could never succeed.
    if ('number_tests' in score_measured) and "".join(list(algorithms.keys())).lower()!='h2pc':
        raise AssertionError ("we can only compute number of tests for h2pc")
    possible_algorithms=['greedy_climbing','tabu_search', 'H2PC', 'miic', '3off2']
    for algo in algorithms:
        if algo not in possible_algorithms:
            # BUG FIX: possible_algorithms is a list; the previous
            # list(possible_algorithms.keys()) raised AttributeError here.
            raise AssertionError("algorithm score still not implemented, list of of possible computations is {}".format(possible_algorithms))
    #matrix scoring all scores measured for each database
    matrix_scores=np.empty(shape=(len(sample_size),len(algorithms),len(score_measured),nsamples))
    for row_index, size in enumerate (sample_size):
        for column_index, algorithm in enumerate(algorithms.items()):
            print("nous en sommes a l'algo ", algorithm, "pour al taille suivante de database ", size)
            matrix_scores[row_index,column_index,Ellipsis]=compute_average_distance(bn, nsamples,size,algorithm,score_measured)
    return matrix_scores
def plot_score_algorithms(bn_name,sample_size,score_measured=['dist2opt'],algorithms={'tabu_search':([],{})},nsamples=10,with_boxplot=False):
    """
    Subplot score algorithms from a matrix score ( size*algo*scores).

    One subplot per score in `score_measured`, one curve (or boxplot) per
    algorithm, x-axis being the database sizes. The learned score matrix is
    also pickled under scores/matrice_scores_<bn_name>. Returns the figure.
    """
    bn=gum.loadBN(os.path.join("true_graphes_structures",bn_name))
    matrix_score=learn_scores(bn,sample_size,score_measured,algorithms,nsamples)
    # persist the raw results so plots can be regenerated without relearning
    save_objects(os.path.join('scores','matrice_scores_{}'.format(os.path.splitext(bn_name)[0])),matrix_score)
    #matrix_score=load_objects(os.path.join('scores','matrice_scores_alarm.bif'))
    # squeeze=False keeps ax 2-D even with a single score
    fig,ax = plt.subplots(nrows=len(score_measured), ncols=1, sharex=True,figsize =[6.4, 2*len(score_measured)],squeeze =False)
    #define set of colors, one per algorithm
    cmap = plt.get_cmap('gnuplot')
    colors = [cmap(i) for i in np.linspace(0.1, 1, len(algorithms))]
    mean_colors=colors[::-1]
    for index_score, score in enumerate(score_measured):
        for index_algo,mixed in enumerate(zip(colors,algorithms)):
            # slice: (sizes, repetitions) for this algorithm and score
            score_repetition=matrix_score[:,index_algo,index_score]
            mean_resampling=np.mean(score_repetition,axis=1)
            if with_boxplot:
                # NOTE(review): colors/mean_colors have len(algorithms) entries but
                # are indexed by index_score here — likely an IndexError when
                # len(score_measured) > len(algorithms); confirm intent.
                meanpointprops = dict(marker='D', markeredgecolor='white',markerfacecolor=mean_colors[index_score])
                bp=ax[index_score,0].boxplot(np.transpose(score_repetition),notch=True, sym=' ',patch_artist=True,showmeans=True,meanprops=meanpointprops)
                for box in bp['boxes']:
                    # change fill color
                    box.set( facecolor = colors[index_score] )
            else:
                # mixed is (color, algorithm_name)
                ax[index_score,0].plot(sample_size,mean_resampling, label=mixed[1],color=mixed[0],marker='+')
            ax[index_score,0].set_ylabel (score)
            ax[index_score,0].set_title("Score measure {} in relation with dataset size".format(score))
    #at the end of computation, delete temporary file created
    if os.path.exists(os.path.join("databases","temp_base.csv")):
        os.remove(os.path.join("databases","temp_base.csv"))
    plt.tick_params(axis='x',rotation=45)
    plt.xlabel ("data size")
    # shrink the bottom subplot to make room for the shared legend below it
    box = ax[len(score_measured)-1,0].get_position()
    ax[len(score_measured)-1,0].set_position([box.x0, box.y0 + box.height * 0.4,
    box.width, box.height * 0.6])
    ax[len(score_measured)-1,0].legend(bbox_to_anchor=(0.0, -1,1., .102), loc=3,ncol=4, mode="expand", fancybox=True, shadow=True)
    fig.suptitle('Some score measures for BN : {}'.format(os.path.splitext(bn_name)[0]),y=1.05,weight ="bold")
    fig.tight_layout()
    return fig
def compute_ratio(bn, nsamples,size,algorithms,score_measured):
    """Element-wise ratio of the score matrices of the two algorithms.

    The matrix of the second algorithm in `algorithms` is divided by the
    matrix of the first one; a tiny epsilon in the denominator guards
    against divisions by zero.
    """
    per_algo_matrices = [
        compute_average_distance(bn, nsamples, size, algo_item, score_measured)
        for algo_item in algorithms.items()
    ]
    denominator = per_algo_matrices[0] + 10**-10
    return per_algo_matrices[1] / denominator
def learn_ratio(bn,sample_size,score_measured=['dist2opt'],algorithms={'tabu_search':([20,50],{}),'greedy_climbing':([],{})},nsamples=30):
    """Compute score ratios (second algorithm over first) for each database size.

    Parameters
    ----------
    bn : gum.BayesNet
        Reference network the databases are sampled from.
    sample_size : list of int
        Database sizes to evaluate.
    score_measured : list of str
        Score names to compute; must belong to the supported list below.
    algorithms : dict
        Exactly two entries mapping an algorithm name to (args, kwargs).
    nsamples : int
        Number of repetitions per size.

    Returns
    -------
    numpy.ndarray
        Matrix of shape (len(sample_size), len(score_measured), nsamples).

    Raises
    ------
    AssertionError
        For an unknown score or algorithm name, or when `algorithms`
        does not contain exactly two entries.
    """
    possible_ratio_distances=['recall','precision','fscore','dist2opt','bic','aic','mdl','time','number_tests','specificity','hamming','shd']
    for score in score_measured:
        if score not in possible_ratio_distances:
            raise AssertionError("distance score still not implemented, list of of possible computations is {}".format(possible_ratio_distances))
    if len (algorithms)!=2:
        raise AssertionError("ratio is supposed to be between 2 distances only")
    possible_algorithms=['greedy_climbing','tabu_search', 'H2PC', 'miic', '3off2']
    for algo in algorithms:
        if algo not in possible_algorithms:
            # BUG FIX: possible_algorithms is a list; the previous
            # list(possible_algorithms.keys()) raised AttributeError here.
            raise AssertionError("algorithm score still not implemented, list of of possible computations is {}".format(possible_algorithms))
    #matrix scoring all scores measured for each database
    matrix_scores=np.empty(shape=(len(sample_size),len(score_measured),nsamples))
    for index_size, size in enumerate (sample_size):
        print("nous en sommes a la taille ",size)
        matrix_scores[index_size,Ellipsis]=compute_ratio(bn, nsamples,size,algorithms,score_measured)
    return matrix_scores
def plot_ratio_algorithms(bn_name,sample_size,score_measured=['dist2opt'],algorithms={'tabu_search':([],{})},nsamples=30):
    """
    Subplot score algorithms from a matrix score ( size*algo*scores).

    One boxplot subplot per score, showing the ratio of the second
    algorithm's scores over the first one's across database sizes.
    Returns the figure.
    """
    bn=gum.loadBN(os.path.join("true_graphes_structures",bn_name))
    matrix_ratio=learn_ratio(bn,sample_size,score_measured,algorithms,nsamples)
    """
    save_objects('matrice_scores',matrix_score)
    matrix_score=load_objects('matrice_scores')
    """
    # squeeze=False keeps ax 2-D even with a single score
    fig,ax = plt.subplots(nrows=len(score_measured), ncols=1, sharex=True,figsize =[6.4, 2*len(score_measured)],squeeze =False)
    #define set of colors, one per score
    cmap = plt.get_cmap('gnuplot')
    colors = [cmap(i) for i in np.linspace(0.1, 1, len(score_measured))]
    mean_colors=colors[::-1]
    for index_score, score in enumerate(score_measured):
        # slice: (sizes, repetitions) for this score
        score_repetition=matrix_ratio[:,index_score,:]
        # diamond markers show the mean of each box
        meanpointprops = dict(marker='D', markeredgecolor='white',markerfacecolor=mean_colors[index_score])
        bp=ax[index_score,0].boxplot(np.transpose(score_repetition), sym=' ',patch_artist=True,showmeans=True,meanprops=meanpointprops)
        for box in bp['boxes']:
            box.set( facecolor = colors[index_score] )
        ax[index_score,0].set_ylabel (score)
        ax[index_score,0].set_title("Score ratio measure {} in relation with dataset size".format(score))
        ax[index_score,0].yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
        alpha=0.5)
    # boxplot uses positional ticks; relabel them with the actual sizes
    ax[len(score_measured)-1,0].set_xticklabels(sample_size,rotation=45, fontsize=8)
    #at the end of computation, delete temporary file created
    if os.path.exists(os.path.join("databases","temp_base.csv")):
        os.remove(os.path.join("databases","temp_base.csv"))
    # shrink the bottom subplot to make room for the shared legend below it
    box = ax[len(score_measured)-1,0].get_position()
    ax[len(score_measured)-1,0].set_position([box.x0, box.y0 + box.height * 0.4,
    box.width, box.height * 0.6])
    # proxy artists: one mean-marker legend entry per measured score
    legend_elements=[Line2D([0], [0], marker='D', markeredgecolor='white',markerfacecolor=mean_colors[index_score],color="w",label="mean ratio for score: {}".format(score_measured[index_score])) for index_score in range (len(score_measured))]
    ax[len(score_measured)-1,0].legend(handles=legend_elements,bbox_to_anchor=(0.0, -1,1., .102), loc=3,ncol=4, mode="expand", fancybox=True, shadow=True)
    plt.xlabel("data_size")
    fig.suptitle('Ratio of scores of {} over {} for BN : {}'.format(list(algorithms.keys())[1],list(algorithms.keys())[0],os.path.splitext(bn_name)[0]),y=1.05,weight ="bold")
    fig.tight_layout()
    return fig
if __name__ == "__main__":
    #several sample sizes to look after
    sample_size=[1000,5000,10000]
    #storing of true graph structures: list the available .bif files and
    #let the user pick one interactively
    name_graph_files=os.listdir("true_graphes_structures")
    bn=choose_graph_name(name_graph_files)
    #fig=plot_score_algorithms(bn,sample_size,['recall'],algorithms={'tabu_search':([],{}),'greedy_climbing':([],{})},nsamples=10)
    #plt.savefig(os.path.join("figures","asia_scores"))
    # plot the ratio tabu_search/greedy_climbing for time and recall
    fig=plot_ratio_algorithms(bn,sample_size,score_measured=['time','recall'],algorithms={'tabu_search':([20,50],{}),'greedy_climbing':([],{})},nsamples=10)
    plt.savefig(os.path.join("figures","asia_ratios"))
    # exploratory snippets kept for reference (not executed)
    """
    learner=gum.BNLearner(os.path.join("databases","sample_asia.csv"))
    learner.useGreedyHillClimbing()
    bnHC=learner.learnBN()
    geHC=gum.EssentialGraph(bnHC).mixedGraph()
    bn1=gum.fastBN("A->B;A->C")
    bn2=gum.fastBN("B->A;C->A")
    print(gum.EssentialGraph(bn1).skeleton())
    print(gum.EssentialGraph(bn2).skeleton())
    """