-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathbigbwt
executable file
·245 lines (219 loc) · 10.9 KB
/
bigbwt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#!/usr/bin/env python3
import sys, time, argparse, subprocess, os.path
Description = """
Tool to build the BWT for higly repetive files using the approach
described in
"Prefix-Free Parsing for Building Big BWTs"
by Christina Boucher, Travis Gagie, Alan Kuhnle and Giovanni Manzini
Proc. WABI '18 (http://drops.dagstuhl.de/opus/volltexte/2018/9304/)
The input file cannot contain the characters 0, 1 or 2 which are
used internally by the algorithm. The character 0 is used as the EOF
in the output BWT. The dictionary and the parse should not be larger than 2GB.
Input files larger than 2GB are ok, but computing the BWT in the traditional way
takes 9n bytes of RAM. If this is a problem, just don't use the option -c and
check the correctness of the BWT by some other means (for example inverting it)
"""
dirname = os.path.dirname(os.path.abspath(__file__))
parse_exe = os.path.join(dirname, "newscan.x")
parseNT_exe = os.path.join(dirname, "newscanNT.x")
parsebwt_exe = os.path.join(dirname, "bwtparse")
parsebwt_exe64 = os.path.join(dirname, "bwtparse64")
pfbwt_exe = os.path.join(dirname, "pfbwt.x")
pfbwtNT_exe = os.path.join(dirname, "pfbwtNT.x")
pfbwt_exe64 = os.path.join(dirname, "pfbwt64.x")
pfbwtNT_exe64 = os.path.join(dirname, "pfbwtNT64.x")
bwt_exe = os.path.join(dirname, "simplebwt")
bwt_exe64 = os.path.join(dirname, "simplebwt64")
shasum_exe = "sha256sum"
def main():
parser = argparse.ArgumentParser(description=Description, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('input', help='input file name', type=str)
parser.add_argument('-w', '--wsize', help='sliding window size (def. 10)', default=10, type=int)
parser.add_argument('-p', '--mod', help='hash modulus (def. 100)', default=100, type=int)
parser.add_argument('-t', help='number of helper threads (def. None)', default=0, type=int)
parser.add_argument('-s', help='compute the start run-length sampled Suffix Array',action='store_true')
parser.add_argument('-e', help='compute the end run-length sampled Suffix Array',action='store_true')
parser.add_argument('-S', help='compute the full Suffix Array',action='store_true')
parser.add_argument('-k', help='keep temporary files',action='store_true')
parser.add_argument('-v', help='verbose',action='store_true')
parser.add_argument('-c', help='check BWT using SACA-K',action='store_true')
parser.add_argument('-f', help='read fasta',action='store_true')
parser.add_argument('--sum', help='compute output files sha256sum',action='store_true')
parser.add_argument('--parsing', help='stop after the parsing phase (debug only)',action='store_true')
parser.add_argument('--compress', help='compress output of the parsing phase (debug only)',action='store_true')
parser.add_argument('--probing','-P', help='use linear probing to avoid hash collisions during parsing',action='store_true', dest='probing')
args = parser.parse_args()
if args.f and args.t > 0 and (".fq" in args.input or ".fastq" in args.input or ".fnq" in args.input):
print("bigbwt does not current support FASTQ format! Exiting...")
return
if(args.S and (args.s or args.e)):
print("You can either compute the full SA or a sample of it, not both. Exiting...")
return
logfile_name = args.input + ".log"
# get main bigbwt directory
args.bigbwt_dir = os.path.split(sys.argv[0])[0]
print("Sending logging messages to file:", logfile_name)
with open(logfile_name,"a") as logfile:
# ---------- parsing of the input file
start0 = start = time.time()
if args.t>0:
command = "{exe} {file} -w {wsize} -p {modulus} -t {th}".format(
exe = os.path.join(args.bigbwt_dir,parse_exe),
wsize=args.wsize, modulus = args.mod, th=args.t, file=args.input)
else:
command = "{exe} {file} -w {wsize} -p {modulus}".format(
exe = os.path.join(args.bigbwt_dir,parseNT_exe),
wsize=args.wsize, modulus = args.mod, file=args.input)
if (args.s or args.e or args.S): command += " -s"
if args.v: command += " -v"
if args.parsing or args.compress: command += " -c"
if args.probing: command += " -P"
if args.f: command += " -f"
print("==== Parsing. Command:", command)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start))
if args.parsing:
# delete temporary parsing files
command = "rm -f {file}.parse_old {file}.last {file}.occ".format(file=args.input)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("==== Stopping after the parsing phase as requested")
return
elif args.compress:
# save parsing files
start = time.time()
command = "tar -cJf {file}.parse.txz {file}.parse {file}.dicz".format(file=args.input)
print("==== Compressing. Command:", command)
if(execute_command(command,logfile,logfile_name,env={"XZ_OPT":"-9"})!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start))
delete_temp_files(args,logfile,logfile_name)
print("==== Done: Parsing output xz-compressed as requested")
return
# ----------- computation of the BWT of the parsing
start = time.time()
parse_size = os.path.getsize(args.input+".parse")/4
if(parse_size >= (2**32-1) ):
print("Sorry, the parse contains %d words" % parse_size )
print("which is more than my current limit 2^32-2")
print("Please re-run the program with a larger modulus (currently %d)" % args.mod)
sys.exit(1)
elif(parse_size >= (2**31-1) ):
command = "{exe} {file}".format(
exe = os.path.join(args.bigbwt_dir,parsebwt_exe64), file=args.input)
else:
command = "{exe} {file}".format(
exe = os.path.join(args.bigbwt_dir,parsebwt_exe), file=args.input)
if (args.s or args.e or args.S): command += " -s"
if (args.t>0): command += " -t " + str(args.t)
print("==== Computing BWT of parsing. Command:", command)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start));
# ----------- compute final BWT using dictionary and BWT of parse
start = time.time()
if(os.path.getsize(args.input+".dict") >= (2**31-4) ):
# 64 bit version with and without threads
if args.t>0 and args.s==False and args.e==False:
command = "{exe} -w {wsize} {file} -t {th}".format(
exe = os.path.join(args.bigbwt_dir,pfbwt_exe64),
wsize=args.wsize, file=args.input, th=args.t)
else:
command = "{exe} -w {wsize} {file}".format(
exe = os.path.join(args.bigbwt_dir,pfbwtNT_exe64),
wsize=args.wsize, file=args.input)
else: # 32 bit version
if args.t>0 and args.s==False and args.e==False:
command = "{exe} -w {wsize} {file} -t {th}".format(
exe = os.path.join(args.bigbwt_dir,pfbwt_exe),
wsize=args.wsize, file=args.input, th=args.t)
else:
command = "{exe} -w {wsize} {file}".format(
exe = os.path.join(args.bigbwt_dir,pfbwtNT_exe),
wsize=args.wsize, file=args.input)
if args.s: command += " -s"
if args.e: command += " -e"
if args.S: command += " -S"
print("==== Computing final BWT. Command:", command)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start))
print("Total construction time: {0:.4f}".format(time.time()-start0))
# ---- compute sha256sum
if args.sum:
digest = file_digest(args.input +".bwt",logfile)
print("BWT {exe}: {digest}".format(exe=shasum_exe, digest=digest))
if args.S:
digest = file_digest(args.input +".sa",logfile)
print("SA {exe}: {digest}".format(exe=shasum_exe, digest=digest))
if args.s:
digest = file_digest(args.input +".ssa",logfile)
print("SSA {exe}: {digest}".format(exe=shasum_exe, digest=digest))
if args.e:
digest = file_digest(args.input +".esa",logfile)
print("ESA {exe}: {digest}".format(exe=shasum_exe, digest=digest))
# ---- delete intermediate files
delete_temp_files(args,logfile,logfile_name)
# --- start checking ---
if args.c:
start = time.time()
if(os.path.getsize(args.input)>= 2**31):
command = "{exe} {file}".format(
exe = os.path.join(args.bigbwt_dir,bwt_exe64), file=args.input)
else:
command = "{exe} {file}".format(
exe = os.path.join(args.bigbwt_dir,bwt_exe), file=args.input)
print("==== Computing BWT using sacak. Command:", command)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start));
command = "cmp {file}.bwt {file}.Bwt".format(file=args.input);
print("==== Comparing BWTs. Command:", command);
if(execute_command(command,logfile,logfile_name)):
print("BWTs match");
else:
print("BWTs differ");
# --- end checking ---
print("==== Done")
# delete intermediate files
def delete_temp_files(args,logfile,logfile_name):
if args.k==False:
print("==== Deleting temporary files.") # no need to show the command
command = "rm -f {file}.parse {file}.parse_old {file}.last {file}.bwlast {file}.dict {file}.ilist {file}.occ".format(file=args.input)
if(execute_command(command,logfile,logfile_name)!=True):
return
for i in range(args.t):
command = "rm -f {file}.{i}.parse_old {file}.{i}.last".format(file=args.input, i=i)
if(execute_command(command,logfile,logfile_name)!=True):
return
if args.s or args.S:
command = "rm -f {file}.sai {file}.bwsai".format(file=args.input);
if(execute_command(command,logfile,logfile_name)!=True):
return
for i in range(args.t):
command = "rm -f {file}.{i}.sai".format(file=args.input, i=i)
if(execute_command(command,logfile,logfile_name)!=True):
return
# compute hash digest for a file
def file_digest(name,logfile):
try:
hash_command = "{exe} {infile}".format(exe=shasum_exe, infile=name)
hashsum = subprocess.check_output(hash_command.split(),stderr=logfile)
hashsum = hashsum.decode("utf-8").split()[0]
except:
hashsum = "Error!"
return hashsum
# execute command: return True is everything OK, False otherwise
def execute_command(command,logfile,logfile_name,env=None):
try:
#subprocess.run(command.split(),stdout=logfile,stderr=logfile,check=True,env=env)
subprocess.check_call(command.split(),stdout=logfile,stderr=logfile,env=env)
except subprocess.CalledProcessError:
print("Error executing command line:")
print("\t"+ command)
print("Check log file: " + logfile_name)
return False
return True
if __name__ == '__main__':
main()