12
12
from determine_multiplicity import determine_multiplicity
13
13
from input_output import read_GAF
14
14
from input_output import read_TSV
15
+ from segment import find_this_link
15
16
16
17
import segment
17
18
@@ -57,9 +58,9 @@ def bridge_with_long_reads(segments, names, copiesnumber, gafFile, supported_lin
57
58
read_GAF (gafFile , 0.7 , 0.1 , lines )
58
59
print ("Finished going through the gaf file." )
59
60
elif '.tsv' in gafFile :
60
- print ("Reading the gpa file..." )
61
+ print ("Reading the tsv file..." )
61
62
read_TSV (gafFile , names , lines )
62
- print ("Finished going through the gpa file." )
63
+ print ("Finished going through the tsv file." )
63
64
else :
64
65
print ("ERROR: input format of mapped read not recognized. It should be .gfa or .gpa" )
65
66
sys .exit ()
@@ -71,7 +72,7 @@ def bridge_with_long_reads(segments, names, copiesnumber, gafFile, supported_lin
71
72
longContigs = [True for i in range (len (names ))] #then all contigs that are in the middle of a read will be marked as False
72
73
bridges = [[[],[]] for i in range (len (haploidContigs ))] #bridges is a list inventoring at index haploidCOntigsNames[seg.names[0]] all the links left and right of the contig, supported by the gaf
73
74
minimum_supported_links = sparse .lil_matrix ((len (names )* 2 , len (names )* 2 )) #minimum_supported links is the list of all links between different contigs found at least once in the gaf file
74
- inventoriate_bridges (lines , bridges , minimum_supported_links , haploidContigsNames , longContigs , names )
75
+ inventoriate_bridges (lines , bridges , minimum_supported_links , haploidContigsNames , longContigs , names , segments )
75
76
76
77
#now, from all the bridges, build consensus bridges
77
78
consensus_bridges = [['' ,'' ] for i in range (len (haploidContigs ))] #consensus bridge is essentially the same as bridges, except there is only one bridge left at each side for each contig
@@ -124,74 +125,92 @@ def bridge_with_long_reads(segments, names, copiesnumber, gafFile, supported_lin
124
125
125
126
#input : the contigs and orientations of one alignment, the contig name -> index dictionary and the list of segments
#output : True if every contig and every link of the alignment exists in the graph, False otherwise
def _alignment_is_possible(contigs, orientations, names, segments):
    """Check that an alignment path can actually be followed on the graph.

    A warning is printed for every discrepancy that is found.

    Args:
        contigs: contig names of the alignment, in read order.
        orientations: string of '>' / '<' characters, one per contig.
        names: dictionary mapping a contig name to its index in ``segments``.
        segments: list of segments; each one is expected to expose ``links``
            and ``otherEndOfLinks``, both indexed by the end of the segment.

    Returns:
        True if all contigs are known and all consecutive links were found.
    """
    # a read naming a contig that is not in the graph cannot be placed on it:
    # report it instead of crashing on a KeyError further down
    unknown = [contig for contig in contigs if contig not in names]
    if unknown:
        print("WARNING: discrepancy between what's found in the alignment files and the inputted GFA graph. Contigs ", unknown, " not found in the gfa")
        return False

    possible = True
    for c in range(1, len(contigs)):
        or1 = '<>'.index(orientations[c - 1])  # end by which the previous contig is left
        or2 = '><'.index(orientations[c])  # end by which the current contig is entered
        previous = segments[names[contigs[c - 1]]]
        #check if the link actually exists (it should, if the aligner did its job correctly, but apparently sometimes SPAligner behaves strangely)
        #find_this_link is treated as returning -1 when the link is absent
        if -1 == find_this_link(segments[names[contigs[c]]], or2, previous.links[or1], previous.otherEndOfLinks[or1]):
            print("WARNING: discrepancy between what's found in the alignment files and the inputted GFA graph. Link ", contigs[c - 1:c + 1], orientations[c - 1:c + 1], " not found in the gfa")
            possible = False

    return possible


#input : a list of alignments of a gaf file
#output : the completed bridges list, with for each haploid contig a list of what was found left and right of the contig
def inventoriate_bridges(lines, bridges, minimum_supported_links, haploidContigsNames, longContigs, names, segments):
    """Record, for every alignment, the links it supports and the bridges it provides.

    Alignments that are not compatible with the graph (unknown contig or
    missing link) are reported with a warning and otherwise ignored.

    Args:
        lines: alignment paths, each a string such as ``">ctg1<ctg2>ctg3"``.
        bridges: modified in place. ``bridges[haploidContigsNames[x]]`` holds
            two lists of path strings, one per side of contig ``x``; each
            usable read containing ``x`` appends one string to both lists.
        minimum_supported_links: modified in place. 2D structure indexed by
            ``2 * names[contig] + end``; set to 1 (symmetrically) for every
            pair of contig ends that are consecutive in a usable read.
        haploidContigsNames: dictionary mapping haploid contig names to their
            index in ``bridges``.
        longContigs: modified in place. ``longContigs[names[x]]`` is set to
            False when ``x`` is strictly inside a usable read.
        names: dictionary mapping a contig name to its index in ``segments``.
        segments: list of segments of the graph.

    Returns:
        None.
    """
    for l, line in enumerate(lines):

        if (l + 1) % 1000 == 0:
            print("Inventoried ", l + 1, " long reads over ", len(lines), end=' \r ')

        contigs = re.split('[><]', line)
        orientations = "".join(re.findall("[<>]", line))
        del contigs[0]  #because the first element is always ''

        #first go through the alignment to make sure it is possible on the gfa
        if not _alignment_is_possible(contigs, orientations, names, segments):
            continue

        #then, only inventoriate the bridge if it is possible with respect to the graph
        last = len(contigs) - 1
        for c, contig in enumerate(contigs):

            if c > 0:
                or1 = '<>'.index(orientations[c - 1])
                or2 = '><'.index(orientations[c])
                end_of_previous = 2 * names[contigs[c - 1]] + or1
                end_of_current = 2 * names[contig] + or2

                # the link is stored in both directions
                minimum_supported_links[end_of_previous, end_of_current] = 1
                minimum_supported_links[end_of_current, end_of_previous] = 1

                if c < last:
                    # the contig is in the middle of a read
                    longContigs[names[contig]] = False

            if contig in haploidContigsNames:

                # what comes after the contig in the read, as written in the read
                after = "".join(orientations[c2] + contigs[c2] for c2 in range(c + 1, len(contigs)))
                # what comes before the contig, walked backward (so mirror the orientations)
                before = "".join(('<' if orientations[c2] == '>' else '>') + contigs[c2] for c2 in range(c - 1, -1, -1))

                # a contig read forward ('>') stores what follows it at side 1
                # and what precedes it at side 0; a contig read backward ('<')
                # stores them the other way round
                side_after = 1 if orientations[c] == ">" else 0
                contig_bridges = bridges[haploidContigsNames[contig]]
                contig_bridges[side_after].append(after)
                contig_bridges[1 - side_after].append(before)

195
214
#input : list of bridges for each haploid contig
196
215
#output : completed consensus_bridges, where there is max one bridge at each end of contig
197
216
def build_consensus_bridges (consensus_bridges , bridges , names , haploidContigs , haploidContigsNames ):
@@ -206,7 +225,7 @@ def build_consensus_bridges(consensus_bridges, bridges, names, haploidContigs, h
206
225
for c in range (len (bridges )) :
207
226
208
227
if (c )% 100 == 0 :
209
- print ("consensused " , c , " bridges out of " , len (consensus_bridges ))
228
+ print ("consensused " , c , " bridges out of " , len (consensus_bridges ), end = ' \r ' )
210
229
211
230
localContigs = [ [ re .split ('[><]' , bridges [c ][j ][k ])[1 :] for k in range (len (bridges [c ][j ])) ] for j in range (2 )]
212
231
localOrientations = [ [ "" .join (re .findall ("[<>]" , bridges [c ][j ][k ])) for k in range (len (bridges [c ][j ])) ] for j in range (2 )]
@@ -410,7 +429,7 @@ def unzip_graph_with_bridges(segments, non_overlapping_bridges, copiesnumber, ha
410
429
for se in range (l ) :
411
430
412
431
if (se )% 1000 == 0 :
413
- print ("Processed " , se , " contigs out of " , l , ", while untangling with long reads" )
432
+ print ("Processed " , se , " contigs out of " , l , ", while untangling with long reads" , end = ' \r ' )
414
433
s = segments [se ]
415
434
416
435
if s .names [0 ] in haploidContigsNames :
@@ -524,7 +543,7 @@ def unzip_graph_with_bridges(segments, non_overlapping_bridges, copiesnumber, ha
524
543
525
544
for alreadyDuplicatedContig in segments [oldContigsIndices [c - 1 ]].links [end0 ] :
526
545
if alreadyDuplicatedContig .names [0 ] == contigs [c ]:
527
- segment .add_link (segments [newContigsIndices [- 1 ]] , end0 , alreadyDuplicatedContig , end1 )
546
+ segment .add_link (segments [newContigsIndices [- 1 ]] , end0 , alreadyDuplicatedContig , end1 , CIGAR )
528
547
529
548
530
549
else :
0 commit comments