Skip to content

Commit 5aedf63

Browse files
author
Roland Faure
committed
Added securities in case inputted alignments are incompatible with the graph
1 parent f3271c5 commit 5aedf63

File tree

3 files changed

+82
-61
lines changed

3 files changed

+82
-61
lines changed

graphunzip.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -320,9 +320,11 @@ def main():
320320

321321
print("WARNING: all interaction matrices are empty, GraphUnzip does not do anything")
322322

323+
print("\nDone untangling, now merging all contigs that can be merged")
323324
merge_adjacent_contigs(segments)
324325

325-
# now exporting the output
326+
# now exporting the output
327+
print("Now exporting the result")
326328
io.export_to_GFA(segments, gfaFile, exportFile=outFile, merge_adjacent_contigs=merge)
327329

328330
if fastaFile != "None":

input_output.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import pickle #for writing files and reading them
1313
import re #to find all numbers in a mixed number/letters string (such as 31M1D4M), to split on several characters (<> in longReads_interactionMatrix)
1414
import shutil #to remove directories
15+
import sys
1516

1617
from segment import Segment
1718
from segment import compute_copiesNumber
@@ -150,9 +151,8 @@ def read_TSV(tsv_file, names, lines):
150151
alignment += contig[:-1]
151152

152153
if contig[:-1] not in names :
153-
print("Problem: ", line)
154-
while True :
155-
rien = 0
154+
print("ERROR: while reading the .tsv, I am coming across a contig that was not in the .gfa, namely ", contig[:-1], ". I recommend you check that you are using the same GFA that you aligned the long reads on.")
155+
sys.exit()
156156
lines += [alignment]
157157

158158

solve_with_long_reads.py

+76-57
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from determine_multiplicity import determine_multiplicity
1313
from input_output import read_GAF
1414
from input_output import read_TSV
15+
from segment import find_this_link
1516

1617
import segment
1718

@@ -57,9 +58,9 @@ def bridge_with_long_reads(segments, names, copiesnumber, gafFile, supported_lin
5758
read_GAF(gafFile, 0.7, 0.1, lines)
5859
print("Finished going through the gaf file.")
5960
elif '.tsv' in gafFile :
60-
print("Reading the gpa file...")
61+
print("Reading the tsv file...")
6162
read_TSV(gafFile, names, lines)
62-
print("Finished going through the gpa file.")
63+
print("Finished going through the tsv file.")
6364
else :
6465
print("ERROR: input format of mapped read not recognized. It should be .gfa or .gpa")
6566
sys.exit()
@@ -71,7 +72,7 @@ def bridge_with_long_reads(segments, names, copiesnumber, gafFile, supported_lin
7172
longContigs = [True for i in range(len(names))] #then all contigs that are in the middle of a read will be marked as False
7273
bridges = [[[],[]] for i in range(len(haploidContigs))] #bridges is a list inventoring at index haploidCOntigsNames[seg.names[0]] all the links left and right of the contig, supported by the gaf
7374
minimum_supported_links = sparse.lil_matrix((len(names)*2, len(names)*2)) #minimum_supported links is the list of all links between different contigs found at least once in the gaf file
74-
inventoriate_bridges(lines, bridges, minimum_supported_links, haploidContigsNames, longContigs, names)
75+
inventoriate_bridges(lines, bridges, minimum_supported_links, haploidContigsNames, longContigs, names, segments)
7576

7677
#now, from all the bridges, build consensus bridges
7778
consensus_bridges = [['',''] for i in range(len(haploidContigs))] #consensus bridge is essentially the same as bridges, except there is only one bridge left at each side for each contig
@@ -124,74 +125,92 @@ def bridge_with_long_reads(segments, names, copiesnumber, gafFile, supported_lin
124125

125126
#input : a list of alignments of a gaf file
126127
#output : the completed bridges list, with for each haploid contig a list of what was found left and right of the contig
127-
def inventoriate_bridges(lines, bridges, minimum_supported_links, haploidContigsNames, longContigs, names) :
128+
def inventoriate_bridges(lines, bridges, minimum_supported_links, haploidContigsNames, longContigs, names, segments) :
128129

129130

130131
for l, line in enumerate(lines) :
131132

132133
if (l+1) % 1000 == 0 :
133-
print("Inventoried ", l+1, " long reads over ", len(lines))
134+
print("Inventoried ", l+1, " long reads over ", len(lines), end = '\r')
134135

135136
contigs = re.split('[><]' , line)
136137
orientations = "".join(re.findall("[<>]", line))
137138
del contigs[0] #because the first element is always ''
138-
139+
140+
#first go through the alignment to make sure it is possible on the gfa
141+
possible = True
139142
for c, contig in enumerate(contigs) :
140-
141143
if c>0 :
142-
minimum_supported_links[2*names[contigs[c-1]] + '<>'.index(orientations[c-1]) , 2*names[contigs[c]] + '><'.index(orientations[c])] = 1
143-
minimum_supported_links[2*names[contigs[c]] + '><'.index(orientations[c]), 2*names[contigs[c-1]] + '<>'.index(orientations[c-1])] = 1
144+
or1 = '<>'.index(orientations[c-1])
145+
or2 = '><'.index(orientations[c])
146+
#check if the link actually exists (it should, if the aligner did its job correctly, but apparently sometimes SPAligner behaves strangely)
147+
if -1 == find_this_link(segments[names[contig]], or2, segments[names[contigs[c-1]]].links[or1], segments[names[contigs[c-1]]].otherEndOfLinks[or1]) :
148+
print ("WARNING: discrepancy between what's found in the alignment files and the inputted GFA graph. Link ", contigs[c-1:c+1], orientations[c-1:c+1], " not found in the gfa")
149+
possible = False
150+
151+
#then, only inventoriate the bridge if it is possible with respect to the graph
152+
if possible :
144153

145-
if c > 0 and c < len(contigs) - 1 :
146-
longContigs[names[contig]] = False
147-
148-
if contig in haploidContigsNames :
149-
154+
for c, contig in enumerate(contigs) :
150155

151-
if orientations[c] == ">" :
152-
r = 0
153-
#first look at what contigs are left of the contig of interest
154-
bridges[haploidContigsNames[contig]][1] += [""]
155-
for c2 in range(c+1, len(contigs)) :
156-
157-
bridges[haploidContigsNames[contig]][1][-1] += orientations[c2] + contigs[c2]
158-
159-
# if contigs[c2] in haploidContigsNames : #you can stop the bridge, you've reached the other side
160-
# break
161-
162-
#then look at what's left of the contig of interest (so mirror the orientations)
163-
bridges[haploidContigsNames[contig]][0] += [""]
164-
for c2 in range(c-1 , -1, -1) :
165-
166-
if orientations[c2] == '>' :
167-
bridges[haploidContigsNames[contig]][0][-1] += '<' + contigs[c2]
168-
else :
169-
bridges[haploidContigsNames[contig]][0][-1] += '>' + contigs[c2]
156+
if c>0 :
157+
158+
or1 = '<>'.index(orientations[c-1])
159+
or2 = '><'.index(orientations[c])
160+
161+
minimum_supported_links[2*names[contigs[c-1]] + or1, 2*names[contigs[c]] + or2] = 1
162+
minimum_supported_links[2*names[contigs[c]] + or2, 2*names[contigs[c-1]] + or1] = 1
163+
164+
if c > 0 and c < len(contigs) - 1 :
165+
longContigs[names[contig]] = False
166+
167+
if contig in haploidContigsNames :
168+
169+
170+
if orientations[c] == ">" :
171+
r = 0
172+
#first look at what contigs are left of the contig of interest
173+
bridges[haploidContigsNames[contig]][1] += [""]
174+
for c2 in range(c+1, len(contigs)) :
170175

171-
# if contigs[c2] in haploidContigsNames : #you can stop the bridge, you've reached the other side
172-
# break
176+
bridges[haploidContigsNames[contig]][1][-1] += orientations[c2] + contigs[c2]
173177

174-
else :
175-
176-
#first look at what contigs are left of the contig of interest
177-
bridges[haploidContigsNames[contig]][0] += [""]
178-
for c2 in range(c+1, len(contigs)) :
179-
bridges[haploidContigsNames[contig]][0][-1] += orientations[c2] + contigs[c2]
180-
# if contigs[c2] in haploidContigsNames : #you can stop the bridge, you've reached the other side
181-
# break
182-
183-
#then look at what's left of the contig of interest (so mirror the orientations)
184-
bridges[haploidContigsNames[contig]][1] += [""]
185-
for c2 in range(c-1 , -1, -1 ) :
178+
# if contigs[c2] in haploidContigsNames : #you can stop the bridge, you've reached the other side
179+
# break
180+
181+
#then look at what's left of the contig of interest (so mirror the orientations)
182+
bridges[haploidContigsNames[contig]][0] += [""]
183+
for c2 in range(c-1 , -1, -1) :
184+
185+
if orientations[c2] == '>' :
186+
bridges[haploidContigsNames[contig]][0][-1] += '<' + contigs[c2]
187+
else :
188+
bridges[haploidContigsNames[contig]][0][-1] += '>' + contigs[c2]
189+
190+
# if contigs[c2] in haploidContigsNames : #you can stop the bridge, you've reached the other side
191+
# break
192+
193+
else :
186194

187-
if orientations[c2] == '>' :
188-
bridges[haploidContigsNames[contig]][1][-1] += '<' + contigs[c2]
189-
else :
190-
bridges[haploidContigsNames[contig]][1][-1] += '>' + contigs[c2]
195+
#first look at what contigs are left of the contig of interest
196+
bridges[haploidContigsNames[contig]][0] += [""]
197+
for c2 in range(c+1, len(contigs)) :
198+
bridges[haploidContigsNames[contig]][0][-1] += orientations[c2] + contigs[c2]
199+
# if contigs[c2] in haploidContigsNames : #you can stop the bridge, you've reached the other side
200+
# break
191201

192-
# if contigs[c2] in haploidContigsNames : #you can stop the bridge, you've reached the other side
193-
# break
194-
202+
#then look at what's left of the contig of interest (so mirror the orientations)
203+
bridges[haploidContigsNames[contig]][1] += [""]
204+
for c2 in range(c-1 , -1, -1 ) :
205+
206+
if orientations[c2] == '>' :
207+
bridges[haploidContigsNames[contig]][1][-1] += '<' + contigs[c2]
208+
else :
209+
bridges[haploidContigsNames[contig]][1][-1] += '>' + contigs[c2]
210+
211+
# if contigs[c2] in haploidContigsNames : #you can stop the bridge, you've reached the other side
212+
# break
213+
195214
#input : list of bridges for each haploid contig
196215
#output : completed consensus_bridges, where there is max one bridge at each end of contig
197216
def build_consensus_bridges(consensus_bridges, bridges, names, haploidContigs, haploidContigsNames):
@@ -206,7 +225,7 @@ def build_consensus_bridges(consensus_bridges, bridges, names, haploidContigs, h
206225
for c in range(len(bridges)) :
207226

208227
if (c)%100 == 0 :
209-
print("consensused ", c, " bridges out of ", len(consensus_bridges))
228+
print("consensused ", c, " bridges out of ", len(consensus_bridges), end = '\r')
210229

211230
localContigs = [ [ re.split('[><]' , bridges[c][j][k])[1:] for k in range(len(bridges[c][j])) ] for j in range(2)]
212231
localOrientations = [ [ "".join(re.findall("[<>]", bridges[c][j][k])) for k in range(len(bridges[c][j])) ] for j in range(2)]
@@ -410,7 +429,7 @@ def unzip_graph_with_bridges(segments, non_overlapping_bridges, copiesnumber, ha
410429
for se in range(l) :
411430

412431
if (se)%1000 == 0 :
413-
print("Processed ", se, " contigs out of ", l, ", while untangling with long reads")
432+
print("Processed ", se, " contigs out of ", l, ", while untangling with long reads", end = '\r')
414433
s = segments[se]
415434

416435
if s.names[0] in haploidContigsNames :
@@ -524,7 +543,7 @@ def unzip_graph_with_bridges(segments, non_overlapping_bridges, copiesnumber, ha
524543

525544
for alreadyDuplicatedContig in segments[oldContigsIndices[c-1]].links[end0] :
526545
if alreadyDuplicatedContig.names[0] == contigs[c]:
527-
segment.add_link(segments[newContigsIndices[-1]] , end0, alreadyDuplicatedContig, end1)
546+
segment.add_link(segments[newContigsIndices[-1]] , end0, alreadyDuplicatedContig, end1, CIGAR)
528547

529548

530549
else :

0 commit comments

Comments
 (0)