Merge pull request #12 from danielecook/development

Development
danielecook · Sep 20, 2019 · 50f280a · 50f280a
2 parents 0221fc0 + 437f2d5
commit 50f280a
Show file tree

Hide file tree

Showing 14 changed files with 236 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -18,6 +18,22 @@ I intend to port some commands over from [VCF-kit](https://github.com/AndersenLa
 
 ## Tools
 
+### fq-dedup
+
+The `fq-dedup` command de-duplicates a FASTQ by read ID (e.g. `@@D00446:1:140101_HWI-D00446_0001_C8HN4ANXX:8:2210:1238:2018`). Ideally, this should never happen, and I honestly do not know how it happens. The command has to read through the entire FASTQ twice. 
+
+The command uses a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) to identify duplicates. If no duplicates are found during the first pass, the command will return 0 and print "No Duplicates Found" to `stderr`. If you are
+checking a set of FASTQs to remove duplicates, you can use the following bash to handle cases where duplicates are
+not found.
+
+```bash
+sc fq-dedup myfastq.fq.gz > myfastq.dedup.fq 2> result.err
+if [[ `head -n 1 result.err` == "No Duplicates Found" ]]; then
+    # Handle case where duplicates are not found (probably by moving file)
+    mv myfastq.fq.gz myfastq.dedup.fq
+fi
+```
+
 ### fq-meta
 
 __Scenario:__ You are given an old dusty hard drive packed with sequence data. Your collaborator says "We have some great sequencing data here, if only someone could analyze it." You peek at the filesystem and discover that FASTQs have been renamed, removing crucial information about how they were generated. Your collaborator, however, recalls certain details about which data was sequenced on which sequencer and he has a list of sequencing barcodes and associated samples that you can match on." If only there was a way to determine the barcodes, sequencer, or other essential metadata for each FASTQ...

diff --git a/sc.nim b/sc.nim
@@ -15,6 +15,7 @@ import terminal
 import asyncfile
 import zip/gzipfiles
 import src/fq_meta
+import src/fq_dedup
 #import src/index_swap
 import src/utils/helpers
 from constants import ANN_header
@@ -310,6 +311,11 @@ var p = newParser("sc"):
             if opts.fastq.len > 0:
                 for fastq in opts.fastq:
                     fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.symlinks)
+    command("fq-dedup", group="FASTQ"):
+        help("Removes exact duplicates from FASTQ Files")
+        arg("fastq", nargs = 1, help = "Input FASTQ")
+        run:
+            fq_dedup.fq_dedup(opts.fastq)
     command("json", group="VCF"):
         help("Convert a VCF to JSON")
         arg("vcf", nargs = 1, help="VCF to convert to JSON")

diff --git a/sc.nimble b/sc.nimble
@@ -7,7 +7,7 @@ license       = "MIT"
 
 # Dependencies
 
-requires "argparse >= 0.7.1", "hts >= 0.2.8", "colorize", "zip >= 0.2.1"
+requires "argparse >= 0.7.1", "hts >= 0.2.8", "colorize", "zip >= 0.2.1",  "bitvector >= 0.4.10"
 
 bin = @["sc"]
 skipDirs = @["test"]

diff --git a/scripts/functional-tests.sh b/scripts/functional-tests.sh
@@ -3,9 +3,13 @@ test -e ssshtest || wget -q https://raw.githubusercontent.com/ryanlayer/ssshtest
 
 . ssshtest
 
-PARENT_DIR="`git rev-parse --show-toplevel`"
+PARENT_DIR=`git rev-parse --show-toplevel`
 export PATH="${PATH}:${PARENT_DIR}"
 
+#======#
+# json #
+#======#
+
 set -o nounset
 run test_json sc json "${PARENT_DIR}/tests/data/test.vcf.gz" X:17276844-17276844
 assert_equal \"X\" "$(cat "$STDOUT_FILE" | jq '.CHROM')"
@@ -31,11 +35,30 @@ assert_equal 0.5 "$(cat $STDOUT_FILE | jq '.INFO.HOB')"
 run format_dp sc json --format="DP" "${PARENT_DIR}/tests/data/test.vcf.gz" I:41947-41947 | jq '.FORMAT.DP|add'
 assert_equal 2094 "$(cat $STDOUT_FILE | jq '.FORMAT.DP|add')"
 
+#==========#
+# fq-dedup #
+#==========#
+
+run fq_dedup sc fq-dedup tests/fastq/dup.fq
+assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)"
+assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)"
 
-# fq-meta
+run fq_dedup_gz sc fq-dedup tests/fastq/dup.fq.gz
+assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)"
+assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)"
+
+#=========#
+# fq-meta #
+#=========#
 curl https://raw.githubusercontent.com/10XGenomics/supernova/master/tenkit/lib/python/tenkit/illumina_instrument.py | \
         grep -v 'import martian' | \
-        sed 's/martian.exit/exit/g' > illumina_instrument.py
+        sed 's/martian.exit/exit/g' | \
+        sed 's/f.readline()/f.readline().decode("utf-8")/g' | \
+        sed 's/exit/print/g' > "${PARENT_DIR}/illumina_instrument.py"
+
+# Fix python script
+echo "Fixing ${PARENT_DIR}/illumina_instrument.py"
+2to3 --write --nobackups "${PARENT_DIR}/illumina_instrument.py"
 
 function test_fq() {
     gzip -c $1 > test.fq.gz

diff --git a/src/fq_dedup.nim b/src/fq_dedup.nim
@@ -0,0 +1,89 @@
+import tables
+import sequtils
+import strutils
+import utils/gz
+import os
+import re
+import sets
+import streams
+import zip/gzipfiles
+import utils/helpers
+import bitvector
+import bloom
+
+type
+    H = uint64
+
+proc fq_dedup*(fastq: string) =
+    #[
+        Deduplicates reads by ID in FASTQs
+
+        Based on Brent Pedersens implementation:
+        https://gist.github.com/brentp/640806
+    ]#
+
+    var 
+        i = 0
+        n_reads: int
+        n_dups: int
+        check = initCountTable[string]()
+        putative_false_positives = initCountTable[string]()
+
+    var bloom = newBloomFilter[uint32](1e8.int, 0.0001, 0, true)
+
+    let stream: Stream =
+        if fastq[^3 .. ^1] == ".gz":
+            newGZFileStream(fastq)
+        else:
+            newFileStream(fastq, fmRead)
+    if stream == nil:
+        quit_error("Unable to open file: " & fastq, 2)
+
+    # Iterate once to place dups in.
+    for record in stream.lines:
+        if i mod 4 == 0:
+            if bloom.lookup($record):
+                check.inc(record, 1)
+            bloom.insert($record)
+        i.inc()
+        if i mod 100000 == 0:
+            stderr.writeLine $i
+
+    n_reads = i div 4.int
+
+    if check.len == 0:
+        stderr.writeLine("No Duplicates Found")
+        quit(0)
+
+    stream.setPosition(0)
+    var write_ln = true
+    i = 0
+    for record in stream.lines:
+        i.inc()
+        if (i-1) mod 4 == 0:
+            if record in check == false:
+                echo record
+                write_ln = true
+                continue
+            else:
+                putative_false_positives.inc(record, 1)
+                if putative_false_positives[record] > 1:
+                    write_ln = false
+                    n_dups.inc(1)
+                    continue
+                echo record
+                write_ln = true
+        elif write_ln:
+            echo record
+
+    stderr.writeLine("total_reads: ", n_reads)
+    stderr.writeLine("duplicates ", n_dups)
+    var fp = 0
+    for v in putative_false_positives.values():
+        if v == 1:
+            fp.inc(1)
+    stderr.writeLine("false-positive: ", fp)
+    stderr.writeLine("false-positive-rate: ", fp.float / n_dups.float)
+
+
+    stream.close()
diff --git a/src/private/murmer3.c b/src/private/murmer3.c
@@ -0,0 +1 @@
+<html><body>You are being <a href="https://raw.githubusercontent.com/MarcAzar/BitVector/master/src/private/murmur3.c">redirected</a>.</body></html>
diff --git a/src/private/murmer3.h b/src/private/murmer3.h
@@ -0,0 +1 @@
+<html><body>You are being <a href="https://raw.githubusercontent.com/MarcAzar/BitVector/master/src/private/murmur3.h">redirected</a>.</body></html>
diff --git a/src/utils/bloom.nim b/src/utils/bloom.nim
@@ -0,0 +1,36 @@
+import stint, nimcrypto
+
+type UInt2048 = StUint[2048]
+
+iterator chunksForBloom(h: MDigest[256]): array[2, uint8] =
+  yield [h.data[0], h.data[1]]
+  yield [h.data[2], h.data[3]]
+  yield [h.data[4], h.data[5]]
+
+proc chunkToBloomBits(chunk: array[2, uint8]): UInt2048 =
+  let h = chunk[0].int
+  let l = chunk[1].int
+  one(UInt2048) shl ((l + (h shl 8)) and 2047)
+
+iterator bloomBits(h: MDigest[256]): UInt2048 =
+  for chunk in chunksForBloom(h):
+    yield chunkToBloomBits(chunk)
+
+type BloomFilter* = object
+  value*: UInt2048
+
+proc incl(f: var BloomFilter, h: MDigest[256]) = # Should this be public?
+  for bits in bloomBits(h):
+    f.value = f.value or bits
+
+# TODO: The following 2 procs should be one genric, but it doesn't compile. Nim bug?
+proc incl*(f: var BloomFilter, v: string) = f.incl(keccak256.digest(v))
+proc incl*(f: var BloomFilter, v: openarray[byte]) = f.incl(keccak256.digest(v))
+
+proc contains(f: BloomFilter, h: MDigest[256]): bool = # Should this be public?
+  for bits in bloomBits(h):
+    if (f.value and bits).isZero: return false
+  return true
+
+template contains*[T](f: BloomFilter, v: openarray[T]): bool =
+  f.contains(keccak256.digest(v))
diff --git a/tests/fastq/dup.fq b/tests/fastq/dup.fq
@@ -0,0 +1,32 @@
+@t1
+AGGA
++
+AAAA
+@t2
+AGGA
++
+AAAA
+@t1
+AGGA
++
+AAAA
+@t3
+AGGA
++
+AAAA
+@t3
+AGGA
++
+AAJA
+@t4
+AGGA
++
+AAJA
+@t4
+AGGG
++
+AAAA
+@t1
+AGGA
++
+AAAA
diff --git a/tests/fastq/dup.fq.gz b/tests/fastq/dup.fq.gz
diff --git a/tests/fastq/illumina_2000_2500.fq b/tests/fastq/illumina_2000_2500.fq
@@ -0,0 +1,4 @@
+@D00446:1:140101_HWI-D00446_0001_C8HN4ANXX:8:2210:1238:2018 1:Y:0:GCTCGGTA
+CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
++
+/////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tests/fastq/illumina_3000_4000.fq b/tests/fastq/illumina_3000_4000.fq
@@ -0,0 +1,4 @@
+@K00100:33:H300JBBXX:6:1101:1538:1527 1:N:0:GCCAAT
+CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
++
+/////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tests/fastq/illumina_hiseq_x.fq b/tests/fastq/illumina_hiseq_x.fq
@@ -0,0 +1,4 @@
+@ST-E00314:132:HLCJTCCXX:6:2206:31213:47966 1:N:0
+GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
++
+!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
diff --git a/tests/fastq/nodup.fq b/tests/fastq/nodup.fq
@@ -0,0 +1,16 @@
+@t1
+AGGA
++
+AAAA
+@t2
+AGGA
++
+AAAA
+@t3
+AGGA
++
+AAAA
+@t4
+AGGA
++
+AAJA
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		<html><body>You are being <a href="https://raw.githubusercontent.com/MarcAzar/BitVector/master/src/private/murmur3.c">redirected</a>.</body></html>
-Original file line number
+Diff line change
@@ -0,0 +1,32 @@
+    @t1
+    AGGA
+    +
+    AAAA
+    @t2
+    AGGA
+    +
+    AAAA
+    @t1
+    AGGA
+    +
+    AAAA
+    @t3
+    AGGA
+    +
+    AAAA
+    @t3
+    AGGA
+    +
+    AAJA
+    @t4
+    AGGA
+    +
+    AAJA
+    @t4
+    AGGG
+    +
+    AAAA
+    @t1
+    AGGA
+    +
+    AAAA