Skip to content

Commit

Permalink
Merge pull request #15 from danielecook/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
danielecook authored Oct 22, 2019
2 parents e306561 + 0e7845c commit 642e4ee
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 32 deletions.
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
A collection of useful sequencing utilities.

__FASTQ__
* [#fq-dedup](fq-dedup)
* [#fq-meta](fq-meta)
* [fq-dedup](#fq-dedup)
* [fq-meta](#fq-meta)

__VCF__
* [#json](json)
* [#fasta](fasta)
* [json](#json)
* [fasta](#fasta)



Expand All @@ -37,10 +37,10 @@ checking a set of FASTQs to remove duplicates, you can use the following bash to
not found.

```bash
sc fq-dedup myfastq.fq.gz > myfastq.dedup.fq 2> result.err
if [[ `head -n 1 result.err` == "No Duplicates Found" ]]; then
(sc fq-dedup myfastq.fq.gz 2> dup.err) | gzip > out.fq.gz
if [[ `head -n 1 dup.err` == "No Duplicates Found" ]]; then
# Handle case where duplicates are not found (probably by moving file)
mv myfastq.fq.gz myfastq.dedup.fq
mv myout.fq.gz out.dedup.fq.gz
fi
```

Expand Down
20 changes: 14 additions & 6 deletions sc.nim
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import terminal
import asyncfile
import zip/gzipfiles
import src/fq_meta
import src/fq_count
import src/fq_dedup
#import src/index_swap
import src/utils/helpers
Expand Down Expand Up @@ -208,8 +209,6 @@ proc to_json(vcf: string, region_list: seq[string], sample_set: string, info: st
tgt_set.add(gt)
j_format.add(tgt.name, out_fmt(tgt_set, tgt, zip, samples))

var jnode = newJObject()
var variant = newJObject()
var json_out = %* { "CHROM": $rec.CHROM,
"POS": rec.POS,
"ID": $rec.ID,
Expand Down Expand Up @@ -250,7 +249,6 @@ proc to_fasta(vcf: string, region_list: seq[string], sample_set: string, force:
var gts = new_seq[int32](10)

var allele_set: seq[string]
var seq_add: cstring
var sample: string
# sample → chromosome_n → genotype

Expand All @@ -267,11 +265,9 @@ proc to_fasta(vcf: string, region_list: seq[string], sample_set: string, force:
chrom_set.add(newFileStream(fmt"{sample}_{n_chrom}.fa", fmWrite))
sequences[n_sample] = chrom_set

var n_variant = 0
var n_chrom = 0
var n_sample = 0
var allele_out: string
var gt_set: seq[string]
for rec in variants(v, region_list):
allele_set = sequtils.concat(@[rec.REF], rec.ALT)
# Record ploidy
Expand Down Expand Up @@ -317,12 +313,24 @@ var p = newParser("sc"):
run:
if opts.header:
# Allow user to output just the header if desired
echo fq_meta.header
echo fq_meta_header
elif opts.fastq.len == 0:
quit_error("No FASTQ specified", 3)
if opts.fastq.len > 0:
for fastq in opts.fastq:
fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.symlinks)
command("fq-count", group="FASTQ"):
help("Counts lines in a FASTQ")
flag("--header", help="Output just header")
arg("fastq", nargs = -1, help = "Input FASTQ")
run:
if opts.header:
echo fq_count_header
if opts.fastq.len == 0:
quit_error("No FASTQ specified", 3)
if opts.fastq.len > 0:
for fastq in opts.fastq:
fq_count.fq_count(fastq)
command("fq-dedup", group="FASTQ"):
help("Removes exact duplicates from FASTQ Files")
arg("fastq", nargs = 1, help = "Input FASTQ")
Expand Down
87 changes: 87 additions & 0 deletions src/fq_count.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import memfiles
import streams
import zip/gzipfiles
import utils/helpers
import strformat
import strutils
import os

const header* = ["reads",
"gc_content",
"gc_bases",
"bases",
"fname"].join("\t")



proc fq_count*(fastq: string) =
#[
Deduplicates reads by ID in FASTQs
Based on Brent Pedersens implementation:
https://gist.github.com/brentp/640806
]#

var
i = 0
gc_cnt: int64
n_cnt: int64
total_len: int64
n_reads: int
#n_dups: int

var basename = lastPathPart(fastq)
var absolute_path = absolutePath(fastq)

let stream: Stream =
if fastq[^3 .. ^1] == ".gz":
newGZFileStream(fastq)
else:
newFileStream(fastq, fmRead)
if stream == nil:
quit_error("Unable to open file: " & fastq, 2)

for line in lines(stream):
i.inc()
if (i mod 4) == 1:
n_reads.inc()
if (i mod 4) == 2:
gc_cnt.inc(line.count("G") + line.count("C"))
n_cnt.inc(line.count("N"))
total_len.inc(line.len)

var output = [$n_reads,
$(gc_cnt.float / (total_len - n_cnt).float),
$gc_cnt,
$n_cnt,
$total_len,
basename,
absolute_path]

echo output.join("\t")

# https://github.com/nim-lang/Nim/issues/9026#issuecomment-423632254
# var mf = memfiles.open(fn)
# var cs: cstring
# var linec, wordc, bytec: int
# var inWord: bool
# var s: string
# for slice in memSlices(mf):
# inc(linec)
# cs = cast[cstring](slice.data)
# let length = slice.size
# inc(bytec, length)
# var j = -1
# for i in 0..length-1:
# j = i
# if cs[i] in WhiteSpace:
# if inWord == true:
# inc(wordc)
# inWord = false
# else:
# inWord = true
# if j >= 0:
# inc(wordc)
# result.linec = linec
# result.wordc = wordc
# result.bytec = bytec + linec
2 changes: 1 addition & 1 deletion src/fq_dedup.nim
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ proc fq_dedup*(fastq: string) =

if check.len == 0:
stderr.writeLine("No Duplicates Found")
quit(0)
stderr.writeLine("Copying fq to stdout")

stream.setPosition(0)
var write_ln = true
Expand Down
36 changes: 18 additions & 18 deletions src/fq_meta.nim
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,24 @@ import zip/gzipfiles
import utils/helpers

const qual = """!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"""
const header* = ["machine",
"sequencer",
"prob_sequencer",
"flowcell",
"flowcell_description",
"run",
"lane",
"sequence_id",
"index1",
"index2",
"qual_format",
"qual_phred",
"qual_multiple",
"min_qual",
"max_qual",
"n_lines",
"basename",
"absolute_path"].join("\t")
const fq_meta_header* = ["machine",
"sequencer",
"prob_sequencer",
"flowcell",
"flowcell_description",
"run",
"lane",
"sequence_id",
"index1",
"index2",
"qual_format",
"qual_phred",
"qual_multiple",
"min_qual",
"max_qual",
"n_lines",
"basename",
"absolute_path"].join("\t")

import tables

Expand Down

0 comments on commit 642e4ee

Please # to comment.