-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #12 from danielecook/development
Development
- Loading branch information
Showing
14 changed files
with
236 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import tables | ||
import sequtils | ||
import strutils | ||
import utils/gz | ||
import os | ||
import re | ||
import sets | ||
import streams | ||
import zip/gzipfiles | ||
import utils/helpers | ||
import bitvector | ||
import bloom | ||
|
||
type | ||
H = uint64 | ||
|
||
proc fq_dedup*(fastq: string) = | ||
#[ | ||
Deduplicates reads by ID in FASTQs | ||
Based on Brent Pedersens implementation: | ||
https://gist.github.com/brentp/640806 | ||
]# | ||
|
||
var | ||
i = 0 | ||
n_reads: int | ||
n_dups: int | ||
check = initCountTable[string]() | ||
putative_false_positives = initCountTable[string]() | ||
|
||
var bloom = newBloomFilter[uint32](1e8.int, 0.0001, 0, true) | ||
|
||
let stream: Stream = | ||
if fastq[^3 .. ^1] == ".gz": | ||
newGZFileStream(fastq) | ||
else: | ||
newFileStream(fastq, fmRead) | ||
if stream == nil: | ||
quit_error("Unable to open file: " & fastq, 2) | ||
|
||
# Iterate once to place dups in. | ||
for record in stream.lines: | ||
if i mod 4 == 0: | ||
if bloom.lookup($record): | ||
check.inc(record, 1) | ||
bloom.insert($record) | ||
i.inc() | ||
if i mod 100000 == 0: | ||
stderr.writeLine $i | ||
|
||
n_reads = i div 4.int | ||
|
||
if check.len == 0: | ||
stderr.writeLine("No Duplicates Found") | ||
quit(0) | ||
|
||
stream.setPosition(0) | ||
var write_ln = true | ||
i = 0 | ||
for record in stream.lines: | ||
i.inc() | ||
if (i-1) mod 4 == 0: | ||
if record in check == false: | ||
echo record | ||
write_ln = true | ||
continue | ||
else: | ||
putative_false_positives.inc(record, 1) | ||
if putative_false_positives[record] > 1: | ||
write_ln = false | ||
n_dups.inc(1) | ||
continue | ||
echo record | ||
write_ln = true | ||
elif write_ln: | ||
echo record | ||
|
||
stderr.writeLine("total_reads: ", n_reads) | ||
stderr.writeLine("duplicates ", n_dups) | ||
var fp = 0 | ||
for v in putative_false_positives.values(): | ||
if v == 1: | ||
fp.inc(1) | ||
stderr.writeLine("false-positive: ", fp) | ||
stderr.writeLine("false-positive-rate: ", fp.float / n_dups.float) | ||
|
||
|
||
stream.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
<html><body>You are being <a href="https://raw.githubusercontent.com/MarcAzar/BitVector/master/src/private/murmur3.c">redirected</a>.</body></html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
<html><body>You are being <a href="https://raw.githubusercontent.com/MarcAzar/BitVector/master/src/private/murmur3.h">redirected</a>.</body></html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import stint, nimcrypto | ||
|
||
type UInt2048 = StUint[2048] | ||
|
||
iterator chunksForBloom(h: MDigest[256]): array[2, uint8] = | ||
yield [h.data[0], h.data[1]] | ||
yield [h.data[2], h.data[3]] | ||
yield [h.data[4], h.data[5]] | ||
|
||
proc chunkToBloomBits(chunk: array[2, uint8]): UInt2048 = | ||
let h = chunk[0].int | ||
let l = chunk[1].int | ||
one(UInt2048) shl ((l + (h shl 8)) and 2047) | ||
|
||
iterator bloomBits(h: MDigest[256]): UInt2048 = | ||
for chunk in chunksForBloom(h): | ||
yield chunkToBloomBits(chunk) | ||
|
||
type BloomFilter* = object | ||
value*: UInt2048 | ||
|
||
proc incl(f: var BloomFilter, h: MDigest[256]) = # Should this be public? | ||
for bits in bloomBits(h): | ||
f.value = f.value or bits | ||
|
||
# TODO: The following 2 procs should be one genric, but it doesn't compile. Nim bug? | ||
proc incl*(f: var BloomFilter, v: string) = f.incl(keccak256.digest(v)) | ||
proc incl*(f: var BloomFilter, v: openarray[byte]) = f.incl(keccak256.digest(v)) | ||
|
||
proc contains(f: BloomFilter, h: MDigest[256]): bool = # Should this be public? | ||
for bits in bloomBits(h): | ||
if (f.value and bits).isZero: return false | ||
return true | ||
|
||
template contains*[T](f: BloomFilter, v: openarray[T]): bool = | ||
f.contains(keccak256.digest(v)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
@t1 | ||
AGGA | ||
+ | ||
AAAA | ||
@t2 | ||
AGGA | ||
+ | ||
AAAA | ||
@t1 | ||
AGGA | ||
+ | ||
AAAA | ||
@t3 | ||
AGGA | ||
+ | ||
AAAA | ||
@t3 | ||
AGGA | ||
+ | ||
AAJA | ||
@t4 | ||
AGGA | ||
+ | ||
AAJA | ||
@t4 | ||
AGGG | ||
+ | ||
AAAA | ||
@t1 | ||
AGGA | ||
+ | ||
AAAA |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
@D00446:1:140101_HWI-D00446_0001_C8HN4ANXX:8:2210:1238:2018 1:Y:0:GCTCGGTA | ||
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC | ||
+ | ||
///////////////////////////////////////////////////////////////////////////////////////////////////// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
@K00100:33:H300JBBXX:6:1101:1538:1527 1:N:0:GCCAAT | ||
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC | ||
+ | ||
///////////////////////////////////////////////////////////////////////////////////////////////////// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
@ST-E00314:132:HLCJTCCXX:6:2206:31213:47966 1:N:0 | ||
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT | ||
+ | ||
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
@t1 | ||
AGGA | ||
+ | ||
AAAA | ||
@t2 | ||
AGGA | ||
+ | ||
AAAA | ||
@t3 | ||
AGGA | ||
+ | ||
AAAA | ||
@t4 | ||
AGGA | ||
+ | ||
AAJA |