From 4b624265e1127cc42c143602815786da58084c6d Mon Sep 17 00:00:00 2001 From: Daniel Cook Date: Wed, 25 Sep 2019 14:25:59 +0100 Subject: [PATCH 1/4] cleanup unused vars --- README.md | 6 +++--- sc.nim | 18 +++++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 9367a2e..fcfd098 100644 --- a/README.md +++ b/README.md @@ -37,10 +37,10 @@ checking a set of FASTQs to remove duplicates, you can use the following bash to not found. ```bash -sc fq-dedup myfastq.fq.gz > myfastq.dedup.fq 2> result.err -if [[ `head -n 1 result.err` == "No Duplicates Found" ]]; then +(sc fq-dedup myfastq.fq.gz 2> dup.err) | gzip > out.fq.gz +if [[ `head -n 1 dup.err` == "No Duplicates Found" ]]; then # Handle case where duplicates are not found (probably by moving file) - mv myfastq.fq.gz myfastq.dedup.fq + mv myout.fq.gz out.dedup.fq.gz fi ``` diff --git a/sc.nim b/sc.nim index 1437722..76e74ba 100644 --- a/sc.nim +++ b/sc.nim @@ -16,6 +16,7 @@ import asyncfile import zip/gzipfiles import src/fq_meta import src/fq_dedup +import src/fq_count #import src/index_swap import src/utils/helpers from constants import ANN_header @@ -208,8 +209,6 @@ proc to_json(vcf: string, region_list: seq[string], sample_set: string, info: st tgt_set.add(gt) j_format.add(tgt.name, out_fmt(tgt_set, tgt, zip, samples)) - var jnode = newJObject() - var variant = newJObject() var json_out = %* { "CHROM": $rec.CHROM, "POS": rec.POS, "ID": $rec.ID, @@ -250,7 +249,6 @@ proc to_fasta(vcf: string, region_list: seq[string], sample_set: string, force: var gts = new_seq[int32](10) var allele_set: seq[string] - var seq_add: cstring var sample: string # sample → chromosome_n → genotype @@ -267,11 +265,9 @@ proc to_fasta(vcf: string, region_list: seq[string], sample_set: string, force: chrom_set.add(newFileStream(fmt"{sample}_{n_chrom}.fa", fmWrite)) sequences[n_sample] = chrom_set - var n_variant = 0 var n_chrom = 0 var n_sample = 0 var allele_out: string - var gt_set: seq[string] for rec in variants(v, region_list): allele_set = sequtils.concat(@[rec.REF], rec.ALT) # Record ploidy @@ -323,6 +319,18 @@ var p = newParser("sc"): if opts.fastq.len > 0: for fastq in opts.fastq: fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.symlinks) + command("fq-count", group="FASTQ"): + help("Counts lines in a FASTQ") + flag("--header", help="Output just header") + arg("fastq", nargs = -1, help = "Input FASTQ") + run: + if opts.header: + echo fq_count.header + if opts.fastq.len == 0: + quit_error("No FASTQ specified", 3) + if opts.fastq.len > 0: + for fastq in opts.fastq: + fq_count.fq_count(fastq) command("fq-dedup", group="FASTQ"): help("Removes exact duplicates from FASTQ Files") arg("fastq", nargs = 1, help = "Input FASTQ") From 9a78bf0a4b6daee9a4e888d62d22a0f965d9339d Mon Sep 17 00:00:00 2001 From: Daniel Cook Date: Wed, 25 Sep 2019 14:26:49 +0100 Subject: [PATCH 2/4] add fq_count workin progress --- src/fq_count.nim | 87 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 src/fq_count.nim diff --git a/src/fq_count.nim b/src/fq_count.nim new file mode 100644 index 0000000..dad45e8 --- /dev/null +++ b/src/fq_count.nim @@ -0,0 +1,87 @@ +import memfiles +import streams +import zip/gzipfiles +import utils/helpers +import strformat +import strutils +import os + +const header* = ["reads", + "gc_content", + "gc_bases", + "bases", + "fname"].join("\t") + + + +proc fq_count*(fastq: string) = + #[ + Deduplicates reads by ID in FASTQs + + Based on Brent Pedersens implementation: + https://gist.github.com/brentp/640806 + ]# + + var + i = 0 + gc_cnt: int64 + n_cnt: int64 + total_len: int64 + n_reads: int + #n_dups: int + + var basename = lastPathPart(fastq) + var absolute_path = absolutePath(fastq) + + let stream: Stream = + if fastq[^3 .. ^1] == ".gz": + newGZFileStream(fastq) + else: + newFileStream(fastq, fmRead) + if stream == nil: + quit_error("Unable to open file: " & fastq, 2) + + for line in lines(stream): + i.inc() + if (i mod 4) == 1: + n_reads.inc() + if (i mod 4) == 2: + gc_cnt.inc(line.count("G") + line.count("C")) + n_cnt.inc(line.count("N")) + total_len.inc(line.len) + + var output = [$n_reads, + $(gc_cnt.float / (total_len - n_cnt).float), + $gc_cnt, + $n_cnt, + $total_len, + basename, + absolute_path] + + echo output.join("\t") + + # https://github.com/nim-lang/Nim/issues/9026#issuecomment-423632254 + # var mf = memfiles.open(fn) + # var cs: cstring + # var linec, wordc, bytec: int + # var inWord: bool + # var s: string + # for slice in memSlices(mf): + # inc(linec) + # cs = cast[cstring](slice.data) + # let length = slice.size + # inc(bytec, length) + # var j = -1 + # for i in 0..length-1: + # j = i + # if cs[i] in WhiteSpace: + # if inWord == true: + # inc(wordc) + # inWord = false + # else: + # inWord = true + # if j >= 0: + # inc(wordc) + # result.linec = linec + # result.wordc = wordc + # result.bytec = bytec + linec From 285be29ca07e8863708c765a152f47e2dfd7e0f2 Mon Sep 17 00:00:00 2001 From: Daniel Cook Date: Wed, 25 Sep 2019 15:56:03 +0100 Subject: [PATCH 3/4] Add FQ Count --- sc.nim | 6 +++--- src/fq_meta.nim | 36 ++++++++++++++++++------------------ 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/sc.nim b/sc.nim index 76e74ba..569b14e 100644 --- a/sc.nim +++ b/sc.nim @@ -15,8 +15,8 @@ import terminal import asyncfile import zip/gzipfiles import src/fq_meta -import src/fq_dedup import src/fq_count +import src/fq_dedup #import src/index_swap import src/utils/helpers from constants import ANN_header @@ -313,7 +313,7 @@ var p = newParser("sc"): run: if opts.header: # Allow user to output just the header if desired - echo fq_meta.header + echo fq_meta_header elif opts.fastq.len == 0: quit_error("No FASTQ specified", 3) if opts.fastq.len > 0: @@ -325,7 +325,7 @@ var p = newParser("sc"): arg("fastq", nargs = -1, help = "Input FASTQ") run: if opts.header: - echo fq_count.header + echo fq_count_header if opts.fastq.len == 0: quit_error("No FASTQ specified", 3) if opts.fastq.len > 0: diff --git a/src/fq_meta.nim b/src/fq_meta.nim index 0becde8..9c912f5 100644 --- a/src/fq_meta.nim +++ b/src/fq_meta.nim @@ -9,24 +9,24 @@ import zip/gzipfiles import utils/helpers const qual = """!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~""" -const header* = ["machine", - "sequencer", - "prob_sequencer", - "flowcell", - "flowcell_description", - "run", - "lane", - "sequence_id", - "index1", - "index2", - "qual_format", - "qual_phred", - "qual_multiple", - "min_qual", - "max_qual", - "n_lines", - "basename", - "absolute_path"].join("\t") +const fq_meta_header* = ["machine", + "sequencer", + "prob_sequencer", + "flowcell", + "flowcell_description", + "run", + "lane", + "sequence_id", + "index1", + "index2", + "qual_format", + "qual_phred", + "qual_multiple", + "min_qual", + "max_qual", + "n_lines", + "basename", + "absolute_path"].join("\t") import tables From 0e7845c38fd56abde3da8f1abcb6ad357c5e4d69 Mon Sep 17 00:00:00 2001 From: Daniel Cook Date: Tue, 22 Oct 2019 12:01:23 +0100 Subject: [PATCH 4/4] output fq dedup regardless of dups being found. --- README.md | 8 ++++---- src/fq_dedup.nim | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fcfd098..6903453 100644 --- a/README.md +++ b/README.md @@ -5,12 +5,12 @@ A collection of useful sequencing utilities. __FASTQ__ -* [#fq-dedup](fq-dedup) -* [#fq-meta](fq-meta) +* [fq-dedup](#fq-dedup) +* [fq-meta](#fq-meta) __VCF__ -* [#json](json) -* [#fasta](fasta) +* [json](#json) +* [fasta](#fasta) diff --git a/src/fq_dedup.nim b/src/fq_dedup.nim index e31978b..f78cfe4 100644 --- a/src/fq_dedup.nim +++ b/src/fq_dedup.nim @@ -52,7 +52,7 @@ proc fq_dedup*(fastq: string) = if check.len == 0: stderr.writeLine("No Duplicates Found") - quit(0) + stderr.writeLine("Copying fq to stdout") stream.setPosition(0) var write_ln = true