bump v1.3.9

benoukraflab · Mar 24, 2021 · 45b1961 · 45b1961
1 parent 5fc7e4a
commit 45b1961
Show file tree

Hide file tree

Showing 15 changed files with 1,978 additions and 1,781 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -3,6 +3,25 @@ NanoVar Changelog
 
 Release Summary:
 
+
+Version 1.3.9 - Mar 24, 2021
+    * Fixed nv_detect_algo insertion and deletion large size bug
+    * Added pysam >=0.15.4 into bioconda meta.yaml as prerequisite
+    * Added pybedtools >=0.8.2 prerequisite to fix RuntimeWarning buffering=1 error (Refer to https://github.com/daler/pybedtools/issues/322)
+    * Prevent repeated read-indexes by adjusting seed (Thanks to Geoffrey Woodland)
+    * Improve read cluster exception message (Thanks to Geoffrey Woodland)
+    * Unique ID of breakpoints identified by BLAST shortened to four characters to prevent mixing with minimap2 breakpoints
+    * Adjusted breakend filtering during mm clustering
+    * Improved breakpoint clustering algorithm to increase accuracy
+    * Added newline to last line of genome.sizes file
+    * Added genome check for BAM (Thanks to oneillkza, https://github.com/cytham/nanovar/issues/19#issuecomment-791599629)
+    * Modified argparse "usage" format
+    * Suppressed BAM index missing warning
+    * Suppressed TensorFlow INFO and WARNING logs
+    * Migrated to tensorflow-cpu/tensorflow-mkl to prevent cuda_driver.cc error
+    * Fixed FixedLocator warning
+
+
 Version 1.3.8 - May 24, 2020
     * Fixed file type detection (Thanks to jiadong324, https://github.com/cytham/nanovar/issues/9#issuecomment-626579853)
     * Fixed negative coordinates in VCF

diff --git a/nanovar/nanovar b/nanovar/nanovar
@@ -137,6 +137,7 @@ def main():
     filename = os.path.basename(file_path)
     read_suffix = ['.fa', '.fq', '.fasta', '.fastq', '.fa.gzip', '.fq.gzip', '.fa.gz', '.fq.gz', '.fasta.gz', '.fastq.gz']
     bam_suffix = '.bam'
+    contig_list = []
     if any(filename.lower().endswith(s) for s in read_suffix):
         input_name = os.path.basename(file_path).rsplit('.f', 1)[0]
         input_type = 'raw'
@@ -153,12 +154,18 @@ def main():
         else:
             logging.debug("Input FASTQ/FASTA file passed")
     elif filename.lower().endswith(bam_suffix):
+        save = pysam.set_verbosity(0)  # Suppress BAM index missing warning
         sam = pysam.AlignmentFile(file_path, "rb")
+        pysam.set_verbosity(save)  # Revert verbosity level
         try:
             assert sam.is_bam, "Error: Input BAM file is not a BAM file."
             input_name = os.path.basename(file_path).rsplit('.bam', 1)[0]
             input_type = 'bam'
             fastx_check = []
+            # Get BAM contigs from header
+            header = sam.header.to_dict()
+            for h in header['SQ']:
+                contig_list.append(h['SN'])
         except AssertionError:
             logging.critical("Error: Input BAM file is not a BAM file.")
             raise Exception("Error: Input BAM file is not a BAM file.")
@@ -200,6 +207,13 @@ def main():
         contig_len_dict[seq_record.id] = len(seq_record)
         total_gsize += len(seq_record)
 
+    # Check BAM contigs in reference genome
+    if input_type == 'bam':
+        for c in contig_list:
+            if c not in contig_len_dict:
+                logging.critical("Error: Contig %s in BAM is absent in reference genome" % c)
+                raise Exception("Error: Contig %s in BAM is absent in reference genome" % c)
+
     # Check contig id for invalid symbols
     contig_omit = checkcontignames(contig_len_dict)
 
@@ -327,13 +341,14 @@ def main():
         make_index(force, ref_path, wk_dir, ref_name, mdb, wmk, hsb)
 
     # Run hsblastn on INS and INV reads
-    hsba = align_hsb(ref_path, wk_dir, ref_name, threads_bt, hsb)
+    hsba = align_hsb(ref_path, wk_dir, ref_name, threads_bt, hsb, debug)
     sub_run = VariantDetect(wk_dir, hsba[1], splitpct, minalign, filter_path, minlen, buff, model_path,
                             total_gsize, contig_len_dict, score_threshold, file_path, input_name, ref_path, ref_name, hsba[0],
                             mincov, homo_t, het_t, debug, contig_omit)
 
     # Parsing INS and INV SVs and clustering
     sub_run.rlendict = run.rlendict
+    sub_run.seed = run.seed  # Try to prevent repeated read-indexes
     sub_run.parse_detect_hsb()
     logging.info('Parsing BAM and detecting INV and INS SVs')
     run.cluster_nn(add_out=sub_run.total_out)

diff --git a/nanovar/nv_align.py b/nanovar/nv_align.py
@@ -172,7 +172,7 @@ def align_mm(ref, read, wk_dir, read_name, ref_name, threads, mm, data_type, st)
 
 
 # HS-BLASTN alignment
-def align_hsb(ref, wk_dir, ref_name, threads, hsb):
+def align_hsb(ref, wk_dir, ref_name, threads, hsb, debug):
     obinary_path = os.path.join(wk_dir, str(ref_name) + '.counts.obinary')
     out_path = os.path.join(wk_dir, 'temp-%s-blast.tab' % ref_name)
     read_path = os.path.join(wk_dir, 'temp2.fa')
@@ -187,7 +187,8 @@ def align_hsb(ref, wk_dir, ref_name, threads, hsb):
     if exitcode != 0:
         logging.critical("Error: hs-blastn alignment failed")
         raise Exception("Error: hs-blastn alignment failed, see log")
-    os.remove(read_path)
+    if not debug:  # Remove temp2.fa
+        os.remove(read_path)
     return ["hs-blastn align -db " + ref + " -window_masker_db obinary_path -query " + read_path + " -out " + out_path +
             " -outfmt 6 -num_threads " + str(threads) + " -max_target_seqs 3 -gapopen 0 -gapextend 4 -penalty -3 -reward 2",
             out_path]