Skip to content

Commit

Permalink
Merge pull request #12 from danielecook/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
danielecook authored Sep 20, 2019
2 parents 0221fc0 + 437f2d5 commit 50f280a
Show file tree
Hide file tree
Showing 14 changed files with 236 additions and 4 deletions.
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,22 @@ I intend to port some commands over from [VCF-kit](https://github.com/AndersenLa

## Tools

### fq-dedup

The `fq-dedup` command de-duplicates a FASTQ by read ID (e.g. `@@D00446:1:140101_HWI-D00446_0001_C8HN4ANXX:8:2210:1238:2018`). Ideally, this should never happen, and I honestly do not know how it happens. The command has to read through the entire FASTQ twice.

The command uses a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) to identify duplicates. If no duplicates are found during the first pass, the command will return 0 and print "No Duplicates Found" to `stderr`. If you are
checking a set of FASTQs to remove duplicates, you can use the following bash to handle cases where duplicates are
not found.

```bash
sc fq-dedup myfastq.fq.gz > myfastq.dedup.fq 2> result.err
if [[ `head -n 1 result.err` == "No Duplicates Found" ]]; then
# Handle case where duplicates are not found (probably by moving file)
mv myfastq.fq.gz myfastq.dedup.fq
fi
```

### fq-meta

__Scenario:__ You are given an old dusty hard drive packed with sequence data. Your collaborator says "We have some great sequencing data here, if only someone could analyze it." You peek at the filesystem and discover that FASTQs have been renamed, removing crucial information about how they were generated. Your collaborator, however, recalls certain details about which data was sequenced on which sequencer and he has a list of sequencing barcodes and associated samples that you can match on." If only there was a way to determine the barcodes, sequencer, or other essential metadata for each FASTQ...
Expand Down
6 changes: 6 additions & 0 deletions sc.nim
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import terminal
import asyncfile
import zip/gzipfiles
import src/fq_meta
import src/fq_dedup
#import src/index_swap
import src/utils/helpers
from constants import ANN_header
Expand Down Expand Up @@ -310,6 +311,11 @@ var p = newParser("sc"):
if opts.fastq.len > 0:
for fastq in opts.fastq:
fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.symlinks)
command("fq-dedup", group="FASTQ"):
help("Removes exact duplicates from FASTQ Files")
arg("fastq", nargs = 1, help = "Input FASTQ")
run:
fq_dedup.fq_dedup(opts.fastq)
command("json", group="VCF"):
help("Convert a VCF to JSON")
arg("vcf", nargs = 1, help="VCF to convert to JSON")
Expand Down
2 changes: 1 addition & 1 deletion sc.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ license = "MIT"

# Dependencies

requires "argparse >= 0.7.1", "hts >= 0.2.8", "colorize", "zip >= 0.2.1"
requires "argparse >= 0.7.1", "hts >= 0.2.8", "colorize", "zip >= 0.2.1", "bitvector >= 0.4.10"

bin = @["sc"]
skipDirs = @["test"]
Expand Down
29 changes: 26 additions & 3 deletions scripts/functional-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@ test -e ssshtest || wget -q https://raw.githubusercontent.com/ryanlayer/ssshtest

. ssshtest

PARENT_DIR="`git rev-parse --show-toplevel`"
PARENT_DIR=`git rev-parse --show-toplevel`
export PATH="${PATH}:${PARENT_DIR}"

#======#
# json #
#======#

set -o nounset
run test_json sc json "${PARENT_DIR}/tests/data/test.vcf.gz" X:17276844-17276844
assert_equal \"X\" "$(cat "$STDOUT_FILE" | jq '.CHROM')"
Expand All @@ -31,11 +35,30 @@ assert_equal 0.5 "$(cat $STDOUT_FILE | jq '.INFO.HOB')"
run format_dp sc json --format="DP" "${PARENT_DIR}/tests/data/test.vcf.gz" I:41947-41947 | jq '.FORMAT.DP|add'
assert_equal 2094 "$(cat $STDOUT_FILE | jq '.FORMAT.DP|add')"

#==========#
# fq-dedup #
#==========#

run fq_dedup sc fq-dedup tests/fastq/dup.fq
assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)"
assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)"

# fq-meta
run fq_dedup_gz sc fq-dedup tests/fastq/dup.fq.gz
assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)"
assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)"

#=========#
# fq-meta #
#=========#
curl https://raw.githubusercontent.com/10XGenomics/supernova/master/tenkit/lib/python/tenkit/illumina_instrument.py | \
grep -v 'import martian' | \
sed 's/martian.exit/exit/g' > illumina_instrument.py
sed 's/martian.exit/exit/g' | \
sed 's/f.readline()/f.readline().decode("utf-8")/g' | \
sed 's/exit/print/g' > "${PARENT_DIR}/illumina_instrument.py"

# Fix python script
echo "Fixing ${PARENT_DIR}/illumina_instrument.py"
2to3 --write --nobackups "${PARENT_DIR}/illumina_instrument.py"

function test_fq() {
gzip -c $1 > test.fq.gz
Expand Down
89 changes: 89 additions & 0 deletions src/fq_dedup.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import tables
import sequtils
import strutils
import utils/gz
import os
import re
import sets
import streams
import zip/gzipfiles
import utils/helpers
import bitvector
import bloom

type
H = uint64

proc fq_dedup*(fastq: string) =
#[
Deduplicates reads by ID in FASTQs
Based on Brent Pedersens implementation:
https://gist.github.com/brentp/640806
]#

var
i = 0
n_reads: int
n_dups: int
check = initCountTable[string]()
putative_false_positives = initCountTable[string]()

var bloom = newBloomFilter[uint32](1e8.int, 0.0001, 0, true)

let stream: Stream =
if fastq[^3 .. ^1] == ".gz":
newGZFileStream(fastq)
else:
newFileStream(fastq, fmRead)
if stream == nil:
quit_error("Unable to open file: " & fastq, 2)

# Iterate once to place dups in.
for record in stream.lines:
if i mod 4 == 0:
if bloom.lookup($record):
check.inc(record, 1)
bloom.insert($record)
i.inc()
if i mod 100000 == 0:
stderr.writeLine $i

n_reads = i div 4.int

if check.len == 0:
stderr.writeLine("No Duplicates Found")
quit(0)

stream.setPosition(0)
var write_ln = true
i = 0
for record in stream.lines:
i.inc()
if (i-1) mod 4 == 0:
if record in check == false:
echo record
write_ln = true
continue
else:
putative_false_positives.inc(record, 1)
if putative_false_positives[record] > 1:
write_ln = false
n_dups.inc(1)
continue
echo record
write_ln = true
elif write_ln:
echo record

stderr.writeLine("total_reads: ", n_reads)
stderr.writeLine("duplicates ", n_dups)
var fp = 0
for v in putative_false_positives.values():
if v == 1:
fp.inc(1)
stderr.writeLine("false-positive: ", fp)
stderr.writeLine("false-positive-rate: ", fp.float / n_dups.float)


stream.close()
1 change: 1 addition & 0 deletions src/private/murmer3.c
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<html><body>You are being <a href="https://raw.githubusercontent.com/MarcAzar/BitVector/master/src/private/murmur3.c">redirected</a>.</body></html>
1 change: 1 addition & 0 deletions src/private/murmer3.h
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<html><body>You are being <a href="https://raw.githubusercontent.com/MarcAzar/BitVector/master/src/private/murmur3.h">redirected</a>.</body></html>
36 changes: 36 additions & 0 deletions src/utils/bloom.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import stint, nimcrypto

type UInt2048 = StUint[2048]

iterator chunksForBloom(h: MDigest[256]): array[2, uint8] =
yield [h.data[0], h.data[1]]
yield [h.data[2], h.data[3]]
yield [h.data[4], h.data[5]]

proc chunkToBloomBits(chunk: array[2, uint8]): UInt2048 =
let h = chunk[0].int
let l = chunk[1].int
one(UInt2048) shl ((l + (h shl 8)) and 2047)

iterator bloomBits(h: MDigest[256]): UInt2048 =
for chunk in chunksForBloom(h):
yield chunkToBloomBits(chunk)

type BloomFilter* = object
value*: UInt2048

proc incl(f: var BloomFilter, h: MDigest[256]) = # Should this be public?
for bits in bloomBits(h):
f.value = f.value or bits

# TODO: The following 2 procs should be one genric, but it doesn't compile. Nim bug?
proc incl*(f: var BloomFilter, v: string) = f.incl(keccak256.digest(v))
proc incl*(f: var BloomFilter, v: openarray[byte]) = f.incl(keccak256.digest(v))

proc contains(f: BloomFilter, h: MDigest[256]): bool = # Should this be public?
for bits in bloomBits(h):
if (f.value and bits).isZero: return false
return true

template contains*[T](f: BloomFilter, v: openarray[T]): bool =
f.contains(keccak256.digest(v))
32 changes: 32 additions & 0 deletions tests/fastq/dup.fq
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
@t1
AGGA
+
AAAA
@t2
AGGA
+
AAAA
@t1
AGGA
+
AAAA
@t3
AGGA
+
AAAA
@t3
AGGA
+
AAJA
@t4
AGGA
+
AAJA
@t4
AGGG
+
AAAA
@t1
AGGA
+
AAAA
Binary file added tests/fastq/dup.fq.gz
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/fastq/illumina_2000_2500.fq
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@D00446:1:140101_HWI-D00446_0001_C8HN4ANXX:8:2210:1238:2018 1:Y:0:GCTCGGTA
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+
/////////////////////////////////////////////////////////////////////////////////////////////////////
4 changes: 4 additions & 0 deletions tests/fastq/illumina_3000_4000.fq
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@K00100:33:H300JBBXX:6:1101:1538:1527 1:N:0:GCCAAT
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+
/////////////////////////////////////////////////////////////////////////////////////////////////////
4 changes: 4 additions & 0 deletions tests/fastq/illumina_hiseq_x.fq
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@ST-E00314:132:HLCJTCCXX:6:2206:31213:47966 1:N:0
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
16 changes: 16 additions & 0 deletions tests/fastq/nodup.fq
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
@t1
AGGA
+
AAAA
@t2
AGGA
+
AAAA
@t3
AGGA
+
AAAA
@t4
AGGA
+
AAJA

0 comments on commit 50f280a

Please # to comment.