Merge pull request #15 from danielecook/development

Development
danielecook · Oct 22, 2019 · 642e4ee · 642e4ee
2 parents e306561 + 0e7845c
commit 642e4ee
Show file tree

Hide file tree

Showing 5 changed files with 127 additions and 32 deletions.
diff --git a/README.md b/README.md
@@ -5,12 +5,12 @@
 A collection of useful sequencing utilities.
 
 __FASTQ__
-* [#fq-dedup](fq-dedup)
-* [#fq-meta](fq-meta)
+* [fq-dedup](#fq-dedup)
+* [fq-meta](#fq-meta)
 
 __VCF__
-* [#json](json)
-* [#fasta](fasta)
+* [json](#json)
+* [fasta](#fasta)
 
 
 
@@ -37,10 +37,10 @@ checking a set of FASTQs to remove duplicates, you can use the following bash to
 not found.
 
 ```bash
-sc fq-dedup myfastq.fq.gz > myfastq.dedup.fq 2> result.err
-if [[ `head -n 1 result.err` == "No Duplicates Found" ]]; then
+(sc fq-dedup myfastq.fq.gz 2> dup.err) | gzip > out.fq.gz
+if [[ `head -n 1 dup.err` == "No Duplicates Found" ]]; then
     # Handle case where duplicates are not found (probably by moving file)
-    mv myfastq.fq.gz myfastq.dedup.fq
+    mv myout.fq.gz out.dedup.fq.gz
 fi
 ```
 

diff --git a/sc.nim b/sc.nim
@@ -15,6 +15,7 @@ import terminal
 import asyncfile
 import zip/gzipfiles
 import src/fq_meta
+import src/fq_count
 import src/fq_dedup
 #import src/index_swap
 import src/utils/helpers
@@ -208,8 +209,6 @@ proc to_json(vcf: string, region_list: seq[string], sample_set: string, info: st
                     tgt_set.add(gt)
                 j_format.add(tgt.name, out_fmt(tgt_set, tgt, zip, samples))
 
-        var jnode = newJObject()
-        var variant = newJObject()
         var json_out = %* { "CHROM": $rec.CHROM,
                     "POS": rec.POS,
                     "ID": $rec.ID,
@@ -250,7 +249,6 @@ proc to_fasta(vcf: string, region_list: seq[string], sample_set: string, force:
     var gts = new_seq[int32](10)
 
     var allele_set: seq[string]
-    var seq_add: cstring
     var sample: string
     # sample → chromosome_n → genotype
 
@@ -267,11 +265,9 @@ proc to_fasta(vcf: string, region_list: seq[string], sample_set: string, force:
             chrom_set.add(newFileStream(fmt"{sample}_{n_chrom}.fa", fmWrite))
         sequences[n_sample] = chrom_set
 
-    var n_variant = 0
     var n_chrom = 0
     var n_sample = 0
     var allele_out: string
-    var gt_set: seq[string]
     for rec in variants(v, region_list):
         allele_set = sequtils.concat(@[rec.REF], rec.ALT)
         # Record ploidy
@@ -317,12 +313,24 @@ var p = newParser("sc"):
         run:
             if opts.header:
                 # Allow user to output just the header if desired
-                echo fq_meta.header
+                echo fq_meta_header
             elif opts.fastq.len == 0:
                 quit_error("No FASTQ specified", 3)
             if opts.fastq.len > 0:
                 for fastq in opts.fastq:
                     fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.symlinks)
+    command("fq-count", group="FASTQ"):
+        help("Counts lines in a FASTQ")
+        flag("--header", help="Output just header")
+        arg("fastq", nargs = -1, help = "Input FASTQ")
+        run:
+            if opts.header:
+                echo fq_count_header
+            if opts.fastq.len == 0:
+                quit_error("No FASTQ specified", 3)
+            if opts.fastq.len > 0:
+                for fastq in opts.fastq:
+                    fq_count.fq_count(fastq)
     command("fq-dedup", group="FASTQ"):
         help("Removes exact duplicates from FASTQ Files")
         arg("fastq", nargs = 1, help = "Input FASTQ")

diff --git a/src/fq_count.nim b/src/fq_count.nim
@@ -0,0 +1,87 @@
+import memfiles
+import streams
+import zip/gzipfiles
+import utils/helpers
+import strformat
+import strutils
+import os
+
+const header* = ["reads",
+                "gc_content",
+                "gc_bases",
+                "bases",
+                "fname"].join("\t")
+
+
+
+proc fq_count*(fastq: string) =
+    #[
+        Deduplicates reads by ID in FASTQs
+
+        Based on Brent Pedersens implementation:
+        https://gist.github.com/brentp/640806
+    ]#
+
+    var 
+        i = 0
+        gc_cnt: int64
+        n_cnt: int64
+        total_len: int64
+        n_reads: int
+        #n_dups: int
+
+    var basename = lastPathPart(fastq)
+    var absolute_path = absolutePath(fastq)
+
+    let stream: Stream =
+        if fastq[^3 .. ^1] == ".gz":
+            newGZFileStream(fastq)
+        else:
+            newFileStream(fastq, fmRead)
+    if stream == nil:
+        quit_error("Unable to open file: " & fastq, 2)
+
+    for line in lines(stream):
+        i.inc()
+        if (i mod 4) == 1:
+            n_reads.inc()
+        if (i mod 4) == 2:
+            gc_cnt.inc(line.count("G") + line.count("C"))
+            n_cnt.inc(line.count("N"))
+            total_len.inc(line.len)
+
+    var output = [$n_reads,
+                $(gc_cnt.float / (total_len - n_cnt).float),
+                $gc_cnt,
+                $n_cnt,
+                $total_len,
+                basename,
+                absolute_path]
+
+    echo output.join("\t")
+
+    # https://github.com/nim-lang/Nim/issues/9026#issuecomment-423632254
+    # var mf = memfiles.open(fn)
+    # var cs: cstring
+    # var linec, wordc, bytec: int
+    # var inWord: bool
+    # var s: string
+    # for slice in memSlices(mf):
+    #   inc(linec)
+    #   cs = cast[cstring](slice.data)
+    #   let length = slice.size
+    #   inc(bytec, length)
+    #   var j = -1
+    #   for i in 0..length-1:
+    #     j = i
+    #     if cs[i] in WhiteSpace:
+    #       if inWord == true:
+    #         inc(wordc)
+    #         inWord = false
+    #     else:
+    #       inWord = true
+    #   if j >= 0:
+    #     inc(wordc)
+    # result.linec = linec
+    # result.wordc = wordc
+    # result.bytec = bytec + linec
diff --git a/src/fq_dedup.nim b/src/fq_dedup.nim
@@ -52,7 +52,7 @@ proc fq_dedup*(fastq: string) =
 
     if check.len == 0:
         stderr.writeLine("No Duplicates Found")
-        quit(0)
+        stderr.writeLine("Copying fq to stdout")
 
     stream.setPosition(0)
     var write_ln = true

diff --git a/src/fq_meta.nim b/src/fq_meta.nim
@@ -9,24 +9,24 @@ import zip/gzipfiles
 import utils/helpers
 
 const qual = """!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"""
-const header* = ["machine",
-                 "sequencer",
-                 "prob_sequencer",
-                 "flowcell",
-                 "flowcell_description",
-                 "run",
-                 "lane",
-                 "sequence_id",
-                 "index1",
-                 "index2",
-                 "qual_format",
-                 "qual_phred",
-                 "qual_multiple",
-                 "min_qual",
-                 "max_qual",
-                 "n_lines",
-                 "basename",
-                 "absolute_path"].join("\t")
+const fq_meta_header* = ["machine",
+                         "sequencer",
+                         "prob_sequencer",
+                         "flowcell",
+                         "flowcell_description",
+                         "run",
+                         "lane",
+                         "sequence_id",
+                         "index1",
+                         "index2",
+                         "qual_format",
+                         "qual_phred",
+                         "qual_multiple",
+                         "min_qual",
+                         "max_qual",
+                         "n_lines",
+                         "basename",
+                         "absolute_path"].join("\t")
 
 import tables