bigbwt

#!/usr/bin/env python3

import sys, time, argparse, subprocess, os.path

Description = """
Tool to build the BWT for higly repetive files using the approach
described in 

   "Prefix-Free Parsing for Building Big BWTs"
   by Christina Boucher, Travis Gagie, Alan Kuhnle and Giovanni Manzini
   Proc. WABI '18 (http://drops.dagstuhl.de/opus/volltexte/2018/9304/)
   
The input file cannot contain the characters 0, 1 or 2 which are
used internally by the algorithm. The character 0 is used as the EOF
in the output BWT. The dictionary and the parse should not be larger than 2GB. 

Input files larger than 2GB are ok, but computing the BWT in the traditional way
takes 9n bytes of RAM. If this is a problem, just don't use the option -c and
check the correctness of the BWT by some other means (for example inverting it)
"""

dirname = os.path.dirname(os.path.abspath(__file__))
parse_exe       =  os.path.join(dirname, "newscan.x")
parseNT_exe     =  os.path.join(dirname, "newscanNT.x")
parsebwt_exe    =  os.path.join(dirname, "bwtparse")
parsebwt_exe64  =  os.path.join(dirname, "bwtparse64")
pfbwt_exe       =  os.path.join(dirname, "pfbwt.x")
pfbwtNT_exe     =  os.path.join(dirname, "pfbwtNT.x")
pfbwt_exe64     =  os.path.join(dirname, "pfbwt64.x")
pfbwtNT_exe64   =  os.path.join(dirname, "pfbwtNT64.x")
bwt_exe         =  os.path.join(dirname, "simplebwt")
bwt_exe64       =  os.path.join(dirname, "simplebwt64")
shasum_exe      =  "sha256sum"


def main():
  parser = argparse.ArgumentParser(description=Description, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument('input', help='input file name', type=str)
  parser.add_argument('-w', '--wsize', help='sliding window size (def. 10)', default=10, type=int)
  parser.add_argument('-p', '--mod', help='hash modulus (def. 100)', default=100, type=int)
  parser.add_argument('-t', help='number of helper threads (def. None)', default=0, type=int)
  parser.add_argument('-s',  help='compute the start run-length sampled Suffix Array',action='store_true')
  parser.add_argument('-e',  help='compute the end run-length sampled Suffix Array',action='store_true')
  parser.add_argument('-S',  help='compute the full Suffix Array',action='store_true')
  parser.add_argument('-k',  help='keep temporary files',action='store_true')
  parser.add_argument('-v',  help='verbose',action='store_true')
  parser.add_argument('-c',  help='check BWT using SACA-K',action='store_true')
  parser.add_argument('-f', help='read fasta',action='store_true')
  parser.add_argument('--sum', help='compute output files sha256sum',action='store_true')
  parser.add_argument('--parsing',  help='stop after the parsing phase (debug only)',action='store_true')
  parser.add_argument('--compress',  help='compress output of the parsing phase (debug only)',action='store_true')
  parser.add_argument('--probing','-P', help='use linear probing to avoid hash collisions during parsing',action='store_true', dest='probing')
  args = parser.parse_args()

  if args.f and args.t > 0 and (".fq" in args.input or ".fastq" in args.input or ".fnq" in args.input): 
    print("bigbwt does not current support FASTQ format! Exiting...")
    return

  if(args.S and (args.s or args.e)):
    print("You can either compute the full SA or a sample of it, not both. Exiting...")
    return
    
  logfile_name = args.input + ".log"
  # get main bigbwt directory 
  args.bigbwt_dir = os.path.split(sys.argv[0])[0]  
  print("Sending logging messages to file:", logfile_name)
  with open(logfile_name,"a") as logfile:  
    
    # ---------- parsing of the input file 
    start0 = start = time.time()
    if args.t>0:
      command = "{exe} {file} -w {wsize} -p {modulus} -t {th}".format(
              exe = os.path.join(args.bigbwt_dir,parse_exe),
              wsize=args.wsize, modulus = args.mod, th=args.t, file=args.input)
    else:
      command = "{exe} {file} -w {wsize} -p {modulus}".format(
              exe = os.path.join(args.bigbwt_dir,parseNT_exe),
              wsize=args.wsize, modulus = args.mod, file=args.input)
    if (args.s or args.e or args.S): command += " -s"
    if args.v: command += " -v"
    if args.parsing or args.compress: command += " -c"
    if args.probing: command += " -P"
    if args.f: command += " -f"
    print("==== Parsing. Command:", command)
    if(execute_command(command,logfile,logfile_name)!=True):
      return
    print("Elapsed time: {0:.4f}".format(time.time()-start))
    if args.parsing:
      # delete temporary parsing files 
      command = "rm -f {file}.parse_old {file}.last {file}.occ".format(file=args.input)
      if(execute_command(command,logfile,logfile_name)!=True):
        return
      print("==== Stopping after the parsing phase as requested")      
      return
    elif args.compress:
      # save parsing files 
      start = time.time()
      command = "tar -cJf {file}.parse.txz {file}.parse {file}.dicz".format(file=args.input)
      print("==== Compressing. Command:", command)
      if(execute_command(command,logfile,logfile_name,env={"XZ_OPT":"-9"})!=True): 
        return
      print("Elapsed time: {0:.4f}".format(time.time()-start))
      delete_temp_files(args,logfile,logfile_name)
      print("==== Done: Parsing output xz-compressed as requested")
      return

    # ----------- computation of the BWT of the parsing
    start = time.time()
    parse_size = os.path.getsize(args.input+".parse")/4
    if(parse_size >=  (2**32-1) ):
      print("Sorry, the parse contains %d words" %  parse_size )
      print("which is more than my current limit 2^32-2")
      print("Please re-run the program with a larger modulus (currently %d)" % args.mod)
      sys.exit(1)  
    elif(parse_size >=  (2**31-1) ):
      command = "{exe} {file}".format(
              exe = os.path.join(args.bigbwt_dir,parsebwt_exe64), file=args.input)
    else:
      command = "{exe} {file}".format(
              exe = os.path.join(args.bigbwt_dir,parsebwt_exe), file=args.input)
    if (args.s or args.e or args.S): command += " -s"
    if (args.t>0): command += " -t " + str(args.t)
    print("==== Computing BWT of parsing. Command:", command)
    if(execute_command(command,logfile,logfile_name)!=True):
      return
    print("Elapsed time: {0:.4f}".format(time.time()-start));      
    
    # ----------- compute final BWT using dictionary and BWT of parse    
    start = time.time()     
    if(os.path.getsize(args.input+".dict") >=  (2**31-4) ):
      # 64 bit version with and without threads
      if args.t>0 and args.s==False and args.e==False:
        command = "{exe} -w {wsize} {file} -t {th}".format(
              exe = os.path.join(args.bigbwt_dir,pfbwt_exe64),
              wsize=args.wsize, file=args.input, th=args.t)
      else:
        command = "{exe} -w {wsize} {file}".format( 
                exe = os.path.join(args.bigbwt_dir,pfbwtNT_exe64),
                wsize=args.wsize, file=args.input)        
    else:  # 32 bit version
      if args.t>0 and args.s==False and args.e==False:
        command = "{exe} -w {wsize} {file} -t {th}".format(
                exe = os.path.join(args.bigbwt_dir,pfbwt_exe),
                wsize=args.wsize, file=args.input, th=args.t)
      else:
        command = "{exe} -w {wsize} {file}".format(
                exe = os.path.join(args.bigbwt_dir,pfbwtNT_exe),
                wsize=args.wsize, file=args.input)
    if args.s: command += " -s"
    if args.e: command += " -e"
    if args.S: command += " -S"
    
    print("==== Computing final BWT. Command:", command)
    if(execute_command(command,logfile,logfile_name)!=True):
      return
    print("Elapsed time: {0:.4f}".format(time.time()-start))      
    print("Total construction time: {0:.4f}".format(time.time()-start0))      

    # ---- compute sha256sum
    if args.sum:
      digest = file_digest(args.input +".bwt",logfile)
      print("BWT {exe}: {digest}".format(exe=shasum_exe, digest=digest))
      if args.S:
        digest = file_digest(args.input +".sa",logfile)
        print("SA  {exe}: {digest}".format(exe=shasum_exe, digest=digest))
      if args.s:
        digest = file_digest(args.input +".ssa",logfile)
        print("SSA {exe}: {digest}".format(exe=shasum_exe, digest=digest))
      if args.e:
        digest = file_digest(args.input +".esa",logfile)
        print("ESA {exe}: {digest}".format(exe=shasum_exe, digest=digest))
 
    # ---- delete intermediate files
    delete_temp_files(args,logfile,logfile_name)

    # --- start checking ---
    if args.c:
      start = time.time()
      if(os.path.getsize(args.input)>= 2**31):
        command = "{exe} {file}".format(
                  exe = os.path.join(args.bigbwt_dir,bwt_exe64), file=args.input)
      else:
        command = "{exe} {file}".format(
                  exe = os.path.join(args.bigbwt_dir,bwt_exe), file=args.input)
      print("==== Computing BWT using sacak. Command:", command)       
      if(execute_command(command,logfile,logfile_name)!=True):
        return   
      print("Elapsed time: {0:.4f}".format(time.time()-start));      
      command = "cmp {file}.bwt {file}.Bwt".format(file=args.input); 
      print("==== Comparing BWTs. Command:", command);
      if(execute_command(command,logfile,logfile_name)):
        print("BWTs match");
      else:
        print("BWTs differ");
    # --- end checking ---

  print("==== Done")

# delete intermediate files
def delete_temp_files(args,logfile,logfile_name):
    if args.k==False:  
      print("==== Deleting temporary files.") # no need to show the command
      command = "rm -f {file}.parse {file}.parse_old {file}.last {file}.bwlast {file}.dict {file}.ilist {file}.occ".format(file=args.input)
      if(execute_command(command,logfile,logfile_name)!=True):
        return
      for i in range(args.t):
        command = "rm -f {file}.{i}.parse_old {file}.{i}.last".format(file=args.input, i=i)
        if(execute_command(command,logfile,logfile_name)!=True):
          return
      if args.s or args.S:
        command = "rm -f {file}.sai {file}.bwsai".format(file=args.input);
        if(execute_command(command,logfile,logfile_name)!=True):
          return
        for i in range(args.t):
          command = "rm -f {file}.{i}.sai".format(file=args.input, i=i)
          if(execute_command(command,logfile,logfile_name)!=True):
            return


# compute hash digest for a file 
def file_digest(name,logfile):
    try:
      hash_command = "{exe} {infile}".format(exe=shasum_exe, infile=name)
      hashsum = subprocess.check_output(hash_command.split(),stderr=logfile)
      hashsum = hashsum.decode("utf-8").split()[0]
    except:
      hashsum = "Error!" 
    return hashsum  

# execute command: return True is everything OK, False otherwise
def execute_command(command,logfile,logfile_name,env=None):
  try:
    #subprocess.run(command.split(),stdout=logfile,stderr=logfile,check=True,env=env)
    subprocess.check_call(command.split(),stdout=logfile,stderr=logfile,env=env)
  except subprocess.CalledProcessError:
    print("Error executing command line:")
    print("\t"+ command)
    print("Check log file: " + logfile_name)
    return False
  return True


if __name__ == '__main__':
    main()