diff --git a/README.md b/README.md index df918fb..932ddf4 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,22 @@ I intend to port some commands over from [VCF-kit](https://github.com/AndersenLa ## Tools +### fq-dedup + +The `fq-dedup` command de-duplicates a FASTQ by read ID (e.g. `@@D00446:1:140101_HWI-D00446_0001_C8HN4ANXX:8:2210:1238:2018`). Ideally, this should never happen, and I honestly do not know how it happens. The command has to read through the entire FASTQ twice. + +The command uses a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) to identify duplicates. If no duplicates are found during the first pass, the command will return 0 and print "No Duplicates Found" to `stderr`. If you are +checking a set of FASTQs to remove duplicates, you can use the following bash to handle cases where duplicates are +not found. + +```bash +sc fq-dedup myfastq.fq.gz > myfastq.dedup.fq 2> result.err +if [[ `head -n 1 result.err` == "No Duplicates Found" ]]; then + # Handle case where duplicates are not found (probably by moving file) + mv myfastq.fq.gz myfastq.dedup.fq +fi +``` + ### fq-meta __Scenario:__ You are given an old dusty hard drive packed with sequence data. Your collaborator says "We have some great sequencing data here, if only someone could analyze it." You peek at the filesystem and discover that FASTQs have been renamed, removing crucial information about how they were generated. Your collaborator, however, recalls certain details about which data was sequenced on which sequencer and he has a list of sequencing barcodes and associated samples that you can match on." If only there was a way to determine the barcodes, sequencer, or other essential metadata for each FASTQ... diff --git a/sc.nim b/sc.nim index 5f4b34a..10a9e6b 100644 --- a/sc.nim +++ b/sc.nim @@ -15,6 +15,7 @@ import terminal import asyncfile import zip/gzipfiles import src/fq_meta +import src/fq_dedup #import src/index_swap import src/utils/helpers from constants import ANN_header @@ -310,6 +311,11 @@ var p = newParser("sc"): if opts.fastq.len > 0: for fastq in opts.fastq: fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.symlinks) + command("fq-dedup", group="FASTQ"): + help("Removes exact duplicates from FASTQ Files") + arg("fastq", nargs = 1, help = "Input FASTQ") + run: + fq_dedup.fq_dedup(opts.fastq) command("json", group="VCF"): help("Convert a VCF to JSON") arg("vcf", nargs = 1, help="VCF to convert to JSON") diff --git a/sc.nimble b/sc.nimble index fc2b652..c4b729f 100644 --- a/sc.nimble +++ b/sc.nimble @@ -7,7 +7,7 @@ license = "MIT" # Dependencies -requires "argparse >= 0.7.1", "hts >= 0.2.8", "colorize", "zip >= 0.2.1" +requires "argparse >= 0.7.1", "hts >= 0.2.8", "colorize", "zip >= 0.2.1", "bitvector >= 0.4.10" bin = @["sc"] skipDirs = @["test"] diff --git a/scripts/functional-tests.sh b/scripts/functional-tests.sh index 961f8c0..5a9aa4f 100644 --- a/scripts/functional-tests.sh +++ b/scripts/functional-tests.sh @@ -3,9 +3,13 @@ test -e ssshtest || wget -q https://raw.githubusercontent.com/ryanlayer/ssshtest . ssshtest -PARENT_DIR="`git rev-parse --show-toplevel`" +PARENT_DIR=`git rev-parse --show-toplevel` export PATH="${PATH}:${PARENT_DIR}" +#======# +# json # +#======# + set -o nounset run test_json sc json "${PARENT_DIR}/tests/data/test.vcf.gz" X:17276844-17276844 assert_equal \"X\" "$(cat "$STDOUT_FILE" | jq '.CHROM')" @@ -31,11 +35,30 @@ assert_equal 0.5 "$(cat $STDOUT_FILE | jq '.INFO.HOB')" run format_dp sc json --format="DP" "${PARENT_DIR}/tests/data/test.vcf.gz" I:41947-41947 | jq '.FORMAT.DP|add' assert_equal 2094 "$(cat $STDOUT_FILE | jq '.FORMAT.DP|add')" +#==========# +# fq-dedup # +#==========# + +run fq_dedup sc fq-dedup tests/fastq/dup.fq +assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)" +assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)" -# fq-meta +run fq_dedup_gz sc fq-dedup tests/fastq/dup.fq.gz +assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)" +assert_equal 4 "$(cat $STDOUT_FILE | grep '@' | wc -l)" + +#=========# +# fq-meta # +#=========# curl https://raw.githubusercontent.com/10XGenomics/supernova/master/tenkit/lib/python/tenkit/illumina_instrument.py | \ grep -v 'import martian' | \ - sed 's/martian.exit/exit/g' > illumina_instrument.py + sed 's/martian.exit/exit/g' | \ + sed 's/f.readline()/f.readline().decode("utf-8")/g' | \ + sed 's/exit/print/g' > "${PARENT_DIR}/illumina_instrument.py" + +# Fix python script +echo "Fixing ${PARENT_DIR}/illumina_instrument.py" +2to3 --write --nobackups "${PARENT_DIR}/illumina_instrument.py" function test_fq() { gzip -c $1 > test.fq.gz diff --git a/src/fq_dedup.nim b/src/fq_dedup.nim new file mode 100644 index 0000000..9fb546c --- /dev/null +++ b/src/fq_dedup.nim @@ -0,0 +1,89 @@ +import tables +import sequtils +import strutils +import utils/gz +import os +import re +import sets +import streams +import zip/gzipfiles +import utils/helpers +import bitvector +import bloom + +type + H = uint64 + +proc fq_dedup*(fastq: string) = + #[ + Deduplicates reads by ID in FASTQs + + Based on Brent Pedersens implementation: + https://gist.github.com/brentp/640806 + ]# + + var + i = 0 + n_reads: int + n_dups: int + check = initCountTable[string]() + putative_false_positives = initCountTable[string]() + + var bloom = newBloomFilter[uint32](1e8.int, 0.0001, 0, true) + + let stream: Stream = + if fastq[^3 .. ^1] == ".gz": + newGZFileStream(fastq) + else: + newFileStream(fastq, fmRead) + if stream == nil: + quit_error("Unable to open file: " & fastq, 2) + + # Iterate once to place dups in. + for record in stream.lines: + if i mod 4 == 0: + if bloom.lookup($record): + check.inc(record, 1) + bloom.insert($record) + i.inc() + if i mod 100000 == 0: + stderr.writeLine $i + + n_reads = i div 4.int + + if check.len == 0: + stderr.writeLine("No Duplicates Found") + quit(0) + + stream.setPosition(0) + var write_ln = true + i = 0 + for record in stream.lines: + i.inc() + if (i-1) mod 4 == 0: + if record in check == false: + echo record + write_ln = true + continue + else: + putative_false_positives.inc(record, 1) + if putative_false_positives[record] > 1: + write_ln = false + n_dups.inc(1) + continue + echo record + write_ln = true + elif write_ln: + echo record + + stderr.writeLine("total_reads: ", n_reads) + stderr.writeLine("duplicates ", n_dups) + var fp = 0 + for v in putative_false_positives.values(): + if v == 1: + fp.inc(1) + stderr.writeLine("false-positive: ", fp) + stderr.writeLine("false-positive-rate: ", fp.float / n_dups.float) + + + stream.close() \ No newline at end of file diff --git a/src/private/murmer3.c b/src/private/murmer3.c new file mode 100644 index 0000000..508940c --- /dev/null +++ b/src/private/murmer3.c @@ -0,0 +1 @@ +You are being redirected. \ No newline at end of file diff --git a/src/private/murmer3.h b/src/private/murmer3.h new file mode 100644 index 0000000..73ed0d3 --- /dev/null +++ b/src/private/murmer3.h @@ -0,0 +1 @@ +You are being redirected. \ No newline at end of file diff --git a/src/utils/bloom.nim b/src/utils/bloom.nim new file mode 100644 index 0000000..367a7ee --- /dev/null +++ b/src/utils/bloom.nim @@ -0,0 +1,36 @@ +import stint, nimcrypto + +type UInt2048 = StUint[2048] + +iterator chunksForBloom(h: MDigest[256]): array[2, uint8] = + yield [h.data[0], h.data[1]] + yield [h.data[2], h.data[3]] + yield [h.data[4], h.data[5]] + +proc chunkToBloomBits(chunk: array[2, uint8]): UInt2048 = + let h = chunk[0].int + let l = chunk[1].int + one(UInt2048) shl ((l + (h shl 8)) and 2047) + +iterator bloomBits(h: MDigest[256]): UInt2048 = + for chunk in chunksForBloom(h): + yield chunkToBloomBits(chunk) + +type BloomFilter* = object + value*: UInt2048 + +proc incl(f: var BloomFilter, h: MDigest[256]) = # Should this be public? + for bits in bloomBits(h): + f.value = f.value or bits + +# TODO: The following 2 procs should be one genric, but it doesn't compile. Nim bug? +proc incl*(f: var BloomFilter, v: string) = f.incl(keccak256.digest(v)) +proc incl*(f: var BloomFilter, v: openarray[byte]) = f.incl(keccak256.digest(v)) + +proc contains(f: BloomFilter, h: MDigest[256]): bool = # Should this be public? + for bits in bloomBits(h): + if (f.value and bits).isZero: return false + return true + +template contains*[T](f: BloomFilter, v: openarray[T]): bool = + f.contains(keccak256.digest(v)) \ No newline at end of file diff --git a/tests/fastq/dup.fq b/tests/fastq/dup.fq new file mode 100644 index 0000000..e3eb946 --- /dev/null +++ b/tests/fastq/dup.fq @@ -0,0 +1,32 @@ +@t1 +AGGA ++ +AAAA +@t2 +AGGA ++ +AAAA +@t1 +AGGA ++ +AAAA +@t3 +AGGA ++ +AAAA +@t3 +AGGA ++ +AAJA +@t4 +AGGA ++ +AAJA +@t4 +AGGG ++ +AAAA +@t1 +AGGA ++ +AAAA diff --git a/tests/fastq/dup.fq.gz b/tests/fastq/dup.fq.gz new file mode 100644 index 0000000..9a30cd4 Binary files /dev/null and b/tests/fastq/dup.fq.gz differ diff --git a/tests/fastq/illumina_2000_2500.fq b/tests/fastq/illumina_2000_2500.fq new file mode 100644 index 0000000..bf469e6 --- /dev/null +++ b/tests/fastq/illumina_2000_2500.fq @@ -0,0 +1,4 @@ +@D00446:1:140101_HWI-D00446_0001_C8HN4ANXX:8:2210:1238:2018 1:Y:0:GCTCGGTA +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC ++ +///////////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file diff --git a/tests/fastq/illumina_3000_4000.fq b/tests/fastq/illumina_3000_4000.fq new file mode 100644 index 0000000..945e429 --- /dev/null +++ b/tests/fastq/illumina_3000_4000.fq @@ -0,0 +1,4 @@ +@K00100:33:H300JBBXX:6:1101:1538:1527 1:N:0:GCCAAT +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC ++ +///////////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file diff --git a/tests/fastq/illumina_hiseq_x.fq b/tests/fastq/illumina_hiseq_x.fq new file mode 100644 index 0000000..2cfc669 --- /dev/null +++ b/tests/fastq/illumina_hiseq_x.fq @@ -0,0 +1,4 @@ +@ST-E00314:132:HLCJTCCXX:6:2206:31213:47966 1:N:0 +GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT ++ +!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 diff --git a/tests/fastq/nodup.fq b/tests/fastq/nodup.fq new file mode 100644 index 0000000..b3ab646 --- /dev/null +++ b/tests/fastq/nodup.fq @@ -0,0 +1,16 @@ +@t1 +AGGA ++ +AAAA +@t2 +AGGA ++ +AAAA +@t3 +AGGA ++ +AAAA +@t4 +AGGA ++ +AAJA \ No newline at end of file