add consistent interface for adding abs and base paths
danielecook committed Nov 7, 2019
1 parent 55754ea commit 024f184
Showing 7 changed files with 98 additions and 97 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -71,6 +71,10 @@ __Benchmark__
0m58.738s
```

### fq-count

Count the number of reads in a FASTQ
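
A minimal usage sketch (file names here are hypothetical; the `-t/--header`, `-b/--basename`, and `-a/--absolute` flags are defined in `sc.nim`):

```
# print the header row, then one count row per FASTQ,
# with basename and absolute-path columns appended
sc fq-count -t -b -a sample_1.fq.gz sample_2.fq.gz
```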

### fq-meta

__Scenario:__ You are given an old, dusty hard drive packed with sequence data. Your collaborator says, "We have some great sequencing data here, if only someone could analyze it." You peek at the filesystem and discover that the FASTQs have been renamed, removing crucial information about how they were generated. Your collaborator, however, recalls certain details about which data was sequenced on which sequencer, and he has a list of sequencing barcodes and associated samples that you can match on. If only there were a way to determine the barcodes, sequencer, or other essential metadata for each FASTQ...
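
`fq-meta` prints one tab-separated metadata row per FASTQ (machine, flowcell, barcode, quality encoding, and so on, per the header in `src/fq_meta.nim`). A minimal usage sketch with hypothetical file names:

```
# sample the first 100 sequences of each file (-n to change),
# print the header, and append basename and absolute-path columns
sc fq-meta -t -b -a mystery_01.fq.gz mystery_02.fq.gz
```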
@@ -219,3 +223,10 @@ typical output.
* `GT` - Outputs genotypes as [[0, 0], [0, 1], [1, 1], ...
* `SGT` - Outputs genotypes as `0/0`, `0/1`, `1/1`, ...
* `TGT` - Outputs genotypes as `T/T`, `A/T`, `A/A`, ...


### Cross-compilation

```
nim c --cpu:i386 --os:linux --threads:on --compileOnly sc.nim
```
64 changes: 36 additions & 28 deletions sc.nim
@@ -38,33 +38,38 @@ proc get_vcf(vcf: string): string =
return "-"
return vcf


var p = newParser("sc"):
flag("--debug", help="Debug")
help(fmt"Sequence data utilities (Version {VERSION})")
command("fq-meta", group="FASTQ"):
help("Output metadata for FASTQ")
arg("fastq", nargs = -1, help="List of FASTQ files")
option("-n", "--lines", help="Number of sequences to sample (n_lines) for qual and index/barcode determination", default = "100")
flag("--header", help="Output the header")
flag("-s", "--symlinks", help="Follow symlinks")
flag("-t", "--header", help="Output the header")
flag("-b", "--basename", help="Add basename column")
flag("-a", "--absolute", help="Add column for absolute path")
run:
if opts.fastq.len == 0:
quit_error("No FASTQ specified", 3)
if opts.header:
output_header(fq_meta_header, opts.basename, opts.absolute)
if opts.fastq.len > 0:
for fastq in opts.fastq:
fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.symlinks, opts.header)
fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.basename, opts.absolute)
command("fq-count", group="FASTQ"):
help("Counts lines in a FASTQ")
flag("--header", help="Output just header")
flag("-t", "--header", help="Output the header")
flag("-b", "--basename", help="Add basename column")
flag("-a", "--absolute", help="Add column for absolute path")
arg("fastq", nargs = -1, help = "Input FASTQ")
run:
if opts.header:
echo fq_count_header
output_header(fq_meta_header, opts.basename, opts.absolute)
if opts.fastq.len == 0:
quit_error("No FASTQ specified", 3)
if opts.fastq.len > 0:
for fastq in opts.fastq:
fq_count.fq_count(fastq)
fq_count.fq_count(fastq, opts.basename, opts.absolute)

command("fq-dedup", group="FASTQ"):
help("Removes exact duplicates from FASTQ Files")
arg("fastq", nargs = 1, help = "Input FASTQ")
@@ -89,18 +94,20 @@ var p = newParser("sc"):

command("insert-size", group="BAM"):
help("Calculate insert-size metrics")
flag("--header", help="Output the header")
option("-d", "--dist", default="", help = "Output raw distribution(s)")
arg("bam", nargs = -1, help = "Input BAM")
flag("-t", "--header", help="Output the header")
flag("-b", "--basename", help="Add basename column")
flag("-a", "--absolute", help="Add column for absolute path")
flag("-v", "--verbose", help="Provide output")
run:
if opts.header:
echo insert_size.header
output_header(insert_size_header, opts.basename, opts.absolute)
if opts.bam.len == 0:
quit_error("No BAM specified", 3)
if opts.bam.len > 0:
for bam in opts.bam:
insert_size.cmd_insert_size(bam, opts.dist, opts.verbose)
insert_size.cmd_insert_size(bam, opts.dist, opts.verbose, opts.basename, opts.absolute)

command("vcf2tsv", group="VCF"):
help("Converts a VCF to TSV or CSV")
@@ -114,24 +121,25 @@ var p = newParser("sc"):
elif opts.vcf.len > 0:
vcf2tsv(opts.vcf, opts.long, opts.regions)

command("window", group="VCF"):
help("Generate windows from a VCF for parallel execution")
arg("vcf", nargs = 1, help = "Input VCF")
arg("width", nargs = 1)
run:
vcf_window(opts.vcf)
# command("window", group="VCF"):
# help("Generate windows from a VCF for parallel execution")
# arg("vcf", nargs = 1, help = "Input VCF")
# arg("width", nargs = 1)
# run:
# vcf_window(opts.vcf)

command("fasta", group="VCF"):
help("Convert a VCF to a FASTA file")
arg("vcf", nargs = 1, help="VCF to convert to JSON")
arg("region", nargs = -1, help="List of regions or bed files")
option("-s", "--samples", help="Set Samples", default="ALL")
option("-r", "--reference", help="Output full reference sequence")
flag("-f", "--force", help="Force output even if genotypes are not phased")
flag("-c", "--concat", help="Combine chromosomes into a single sequence")
flag("-m", "--merge", help="Merge samples into a single file and send to stdout")
run:
to_fasta(get_vcf(opts.vcf), opts.region, opts.samples, opts.force)
# command("fasta", group="VCF"):
# help("Convert a VCF to a FASTA file")
# arg("vcf", nargs = 1, help="VCF to convert to JSON")
# arg("region", nargs = -1, help="List of regions or bed files")
# option("-s", "--samples", help="Set Samples", default="ALL")
# option("-r", "--reference", help="Output full reference sequence")
# flag("-f", "--force", help="Force output even if genotypes are not phased")
# flag("-c", "--concat", help="Combine chromosomes into a single sequence")
# flag("-m", "--merge", help="Merge samples into a single file and send to stdout")
# run:
# to_fasta(get_vcf(opts.vcf), opts.region, opts.samples, opts.force)

# command("index-swap", group="BAM"):
# arg("BAM", nargs= -1, help="List of BAMs or CRAMs to examine")
# option("-s", "--sites", help="List of sites to check (required)")
2 changes: 1 addition & 1 deletion sc.nimble
@@ -7,7 +7,7 @@ license = "MIT"

# Dependencies

requires "argparse >= 0.7.1", "hts >= 0.2.8", "colorize", "zip >= 0.2.1"
requires "argparse >= 0.9.0", "hts >= 0.2.8", "colorize", "zip >= 0.2.1"
requires "https://github.com/danielecook/BitVector#b8cc21271c90cca96ed31f5d5383711dc96a8d3f"

bin = @["sc"]
37 changes: 3 additions & 34 deletions src/fq_count.nim
@@ -16,7 +16,7 @@ const fq_count_header* = ["reads",



proc fq_count*(fastq: string) =
proc fq_count*(fastq: string, basename: bool, absolute: bool) =
#[
Counts reads in a FASTQ
@@ -32,9 +32,6 @@ proc fq_count*(fastq: string) =
n_reads: int
#n_dups: int

var basename = lastPathPart(fastq)
var absolute_path = absolutePath(fastq)

let stream: Stream =
if fastq[^3 .. ^1] == ".gz":
newGZFileStream(fastq)
@@ -56,34 +53,6 @@ proc fq_count*(fastq: string) =
$(gc_cnt.float / (total_len - n_cnt).float),
$gc_cnt,
$n_cnt,
$total_len,
basename,
absolute_path]
$total_len]

echo output.join("\t")

# https://github.com/nim-lang/Nim/issues/9026#issuecomment-423632254
# var mf = memfiles.open(fn)
# var cs: cstring
# var linec, wordc, bytec: int
# var inWord: bool
# var s: string
# for slice in memSlices(mf):
# inc(linec)
# cs = cast[cstring](slice.data)
# let length = slice.size
# inc(bytec, length)
# var j = -1
# for i in 0..length-1:
# j = i
# if cs[i] in WhiteSpace:
# if inWord == true:
# inc(wordc)
# inWord = false
# else:
# inWord = true
# if j >= 0:
# inc(wordc)
# result.linec = linec
# result.wordc = wordc
# result.bytec = bytec + linec
output_w_fnames(output.join("\t"), fastq, basename, absolute)
32 changes: 8 additions & 24 deletions src/fq_meta.nim
@@ -9,7 +9,7 @@ import zip/gzipfiles
import utils/helpers

const qual = """!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"""
const fq_meta_header = ["machine",
const fq_meta_header* = ["machine",
"sequencer",
"prob_sequencer",
"flowcell",
@@ -24,9 +24,7 @@ const fq_meta_header = ["machine",
"qual_multiple",
"min_qual",
"max_qual",
"n_lines",
"basename",
"absolute_path"].join("\t")
"n_lines"].join("\t")

import tables

@@ -197,10 +195,10 @@ proc get_sequencer_name(sequencers: seq[string]): string =
elif sequencers.len > 0:
return sequencers[^1]

proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header: bool) =

proc fq_meta*(fastq: string, sample_n = 20, basename: bool, absolute: bool) =

var
fastq: string
sequence_id: string
machine: string
run: string
@@ -219,26 +217,13 @@ proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header:
barcodes = newSeq[string](sample_n)
i = 0

if fq_header == true:
echo fq_meta_header

if follow_symlinks and symlinkExists(fastq_in):
fastq = expandSymlink(fastq_in)
else:
fastq = fastq_in

var basename = lastPathPart(fastq)
var absolute_path = absolutePath(fastq)


let stream: Stream =
if fastq[^3 .. ^1] == ".gz":
if fastq.toLowerAscii()[^3 .. ^1] == ".gz":
newGZFileStream(fastq)
else:
newFileStream(fastq, fmRead)
if stream == nil:
quit_error("Unable to open file: " & fastq, 2)


while not stream.atEnd() and i < sample_n * 4:
line = stream.readLine()
@@ -275,7 +260,7 @@ proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header:
var fastq_scores_name = fastq_scores.mapIt(it.name).join(";")
let fastq_scores_phred = fastq_scores.mapIt(it.phred).deduplicate().join(";")

echo [machine,
let header_out = [machine,
sequencer,
sequencer_prob,
flowcell,
@@ -290,6 +275,5 @@ proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header:
$(fastq_scores.mapIt(it.name).len > 1),
(if qual_min >= 0: $qual_min else: ""),
(if qual_max >= 0: $qual_max else: ""),
$(i/4).int,
basename,
absolute_path].join("\t")
$(i/4).int].join("\t")
output_w_fnames(header_out, fastq, basename, absolute)
15 changes: 7 additions & 8 deletions src/insert_size.nim
@@ -10,11 +10,12 @@ import colorize
import re
import algorithm
import threadpool
import utils/helpers
#import ggplotnim

const INS_ARR = 10000

const header* = ["median",
const insert_size_header* = ["median",
"mean",
"std_dev",
"min",
@@ -23,8 +24,7 @@ const header* = ["median",
"n_reads",
"n_accept",
"n_use",
"sample",
"filename"].join("\t")
"sample"].join("\t")

const distribution_header = ["insert_size", "count", "sample", "filename"].join("\t")

@@ -93,7 +93,7 @@ proc freq_inserts(bamfile: string, contig: string, verbose: bool): chrom_freqs =
n_reads: n_reads,
n_accept: n_accept)

proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool) =
proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool, basename: bool, absolute: bool) =
#[
Calculates insert size
]#
@@ -181,7 +181,7 @@ proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool) =
let variance = (m.float - (n.float*(mean_insert_size^2))) / (n - 1).float
let std_dev = pow(variance, 0.5)

var result = [$median_insert_size,
var header_out = [$median_insert_size,
$math.round(mean_insert_size, 3),
$round(std_dev, 3),
$min_insert_size,
@@ -190,7 +190,6 @@ proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool) =
$n_reads,
$n_accept,
$(sum(inserts_trimmed)),
bam_sample(b),
fname]
echo result.join("\t")
bam_sample(b)]
output_w_fnames(header_out.join("\t"), bamfile, basename, absolute)

34 changes: 32 additions & 2 deletions src/utils/helpers.nim
@@ -3,6 +3,7 @@ import hts
import strutils
import strformat
import colorize
import sequtils

proc quit_error*(msg: string, error_code = 1) =
stderr.write_line fmt"Error {error_code}".bgWhite.fgRed & fmt": {msg}".fgRed
@@ -25,7 +26,6 @@ proc check_file_list*(files: seq[string]): bool =
missing_file = true
return missing_file


iterator variants*(vcf:VCF, regions: seq[string]): Variant =
## iterator over region or just the variants.
if regions.len == 0:
@@ -39,4 +39,34 @@ iterator variants*(vcf:VCF, regions: seq[string]): Variant =
for v in vcf.query(&"{toks[0]}:{parseInt(toks[1]) + 1}-{toks[2]}"):
yield v
else:
for v in vcf.query(region): yield v
for v in vcf.query(region): yield v

#====================#
# Headers and Output #
#====================#

proc output_header*(header: string, basename: bool, absolute: bool) =
var
basename_str = ""
absolute_str = ""
if basename == true:
basename_str = "basename"
if absolute == true:
absolute_str = "absolute"
echo [header, basename_str, absolute_str].filterIt(it.len > 0).join("\t")

proc get_absolute*(path: string): string =
if symlinkExists(path) == true:
return absolutePath(expandSymlink(path))
return absolutePath(path)

proc output_w_fnames*(header: string, path: string, basename: bool, absolute: bool) =
# Appends the basename and/or absolute path columns, if selected, in a consistent manner
var
basename_str = ""
absolute_str = ""
if basename == true:
basename_str = lastPathPart(path)
if absolute == true:
absolute_str = get_absolute(path)
echo [header, basename_str, absolute_str].filterIt(it.len > 0).join("\t")
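
Taken together, these helpers let each subcommand append the same opt-in columns in the same way. A usage-level sketch (paths are hypothetical):

```
sc fq-count    -t -b -a data/sample_1.fq.gz
sc fq-meta     -t -b -a data/sample_1.fq.gz
sc insert-size -t -b -a data/sample_1.bam
```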
