Skip to content

Commit

Permalink
finish insert-size
Browse files Browse the repository at this point in the history
  • Loading branch information
danielecook committed Nov 7, 2019
1 parent d81c22c commit c38a813
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 133 deletions.
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ __FASTQ__
* [fq-dedup](#fq-dedup)
* [fq-meta](#fq-meta)

__BAM__

* [insert-size](#insert-size)

__VCF__
* [json](#json)
* [fasta](#fasta)
Expand Down Expand Up @@ -201,9 +205,9 @@ sc insert-size --header input.bam

__Output__

| median | mean | max_all | n_reads | n_accept | n_use | filename | |
|---------:|--------:|----------:|----------:|-----------:|--------:|-----------:|:-------------------|
| 195 | 212.621 | 2 | 249229747 | 2062509 | 1021195 | 1016055 | input.bam |
| median | mean | min | max_all | n_reads | n_accept | n_use | sample | filename |
|---------:|--------:|------:|----------:|----------:|-----------:|--------:|:---------------|:---------------|
| 195 | 212.621 | 2 | 249229747 | 2062509 | 1021195 | 1016055 | sample_A | sample_A.bam |


## Compilation
Expand Down
3 changes: 2 additions & 1 deletion sc.nim
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,15 @@ var p = newParser("sc"):
option("-d", "--dist", default="", help = "Output raw distribution(s)")
option("-j", "--threads", default="1", help = "Number of threads")
arg("bam", nargs = -1, help = "Input BAM")
flag("-v", "--verbose", help="Provide output")
run:
if opts.header:
echo insert_size.header
if opts.bam.len == 0:
quit_error("No BAM specified", 3)
if opts.bam.len > 0:
for bam in opts.bam:
insert_size.cmd_insert_size(bam, opts.dist, opts.threads.parseInt().int8)
insert_size.cmd_insert_size(bam, opts.dist, opts.threads.parseInt(), opts.verbose)

command("vcf2tsv", group="VCF"):
help("Converts a VCF to TSV or CSV")
Expand Down
155 changes: 26 additions & 129 deletions src/insert_size.nim
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import strformat
import math
import stats
import os
import colorize
import re
import threadpool
#import ggplotnim
Expand All @@ -20,7 +21,7 @@ const header* = ["median",
"sample",
"filename"].join("\t")

const distribution_header = ["insert_size", "count", "bam"].join("\t")
const distribution_header = ["insert_size", "count", "sample", "filename"].join("\t")

proc accept_record(r: Record): bool =
if (r.flag.pair == false or
Expand Down Expand Up @@ -66,15 +67,15 @@ type
proc `$`(s: chrom_freqs): string =
fmt"(inserts: ${s.inserts[0..5]})"

proc freq_inserts(bamfile: string, contig: string, contig_length: uint32): chrom_freqs =
proc freq_inserts(bamfile: string, contig: string, verbose: bool): chrom_freqs =
var b: Bam
var n_reads = 0
var n_accept = 0
let ins_arr = 10000
var inserts = new_seq[int64](ins_arr)
var overflow: seq[int64]
open(b, bamfile, index=true)
for record in b.query(contig, start=0, stop=contig_length.int):
for record in b.query(contig):
n_reads += 1
if record.accept_record():
n_accept += 1
Expand All @@ -83,58 +84,35 @@ proc freq_inserts(bamfile: string, contig: string, contig_length: uint32): chrom
inserts[insert_val-1] += 1
else:
overflow.add(insert_val)
echo fmt"FETCHING {contig}"
var result = chrom_freqs(inserts: inserts,
close(b)
if verbose:
stderr.write_line fmt"{contig} complete".fgBlue
return chrom_freqs(inserts: inserts,
overflow: overflow,
n_reads: n_reads,
n_accept: n_accept)
echo result
return result


proc cmd_insert_size*(bamfile: string, distfile: string, threads: int8) =
# [ ] TODO: Reimplement this with a count table?
# create procs for MAD (median abs. dev)
# and use to calculate width...?
# proc for getting median also?
# Proc for freq proc.
# https://nimble.directory/pkg/rbtree
# https://nimble.directory/pkg/binaryheap

# https://leetcode.com/problems/find-median-from-data-stream/solution/
var b: Bam
let fname = bam_file.lastPathPart().changeFileExt("")
let ins_arr = 10000
var inserts = new_seq[int64](ins_arr)
var inserts_trimmed = new_seq[int64](ins_arr)
var overflow: seq[int64]
var n_reads = 0
var n_accept = 0
proc cmd_insert_size*(bamfile: string, distfile: string, threads: int, verbose: bool) =
#[
Calculates insert size
]#
let fname = bam_file.lastPathPart()
const ins_arr = 10000
var
b: Bam
inserts = new_seq[int64](ins_arr)
inserts_trimmed = new_seq[int64](ins_arr)
overflow: seq[int64]
n_reads = 0
n_accept = 0

# Option 1: 0m2.951s parallelization
# Can potentially parallelize here across contigs
# open(b, bamfile, threads=threads, index=true)
# for record in b:
# n_reads += 1
# if record.accept_record():
# n_accept += 1
# var insert_val = abs(record.isize)
# if insert_val <= inserts.len:
# inserts[insert_val-1] += 1
# else:
# overflow.add(insert_val)

# echo inserts[0..50]

# Option 2: parallelize across chromosomes
# Time: 1.980
open(b, bamfile, index=true)
var freq_results = newSeq[FlowVar[chrom_freqs]](b.hdr.targets().len)
for idx, contig in b.hdr.targets():
freq_results[idx] = spawn freq_inserts(bamfile, contig.name, contig.length)
freq_results[idx] = spawn freq_inserts(bamfile, contig.name, verbose)
sync()


# Compile the results
for idx, contig in b.hdr.targets():
var result = ^freq_results[idx]
for i, v in result.inserts:
Expand All @@ -143,11 +121,9 @@ proc cmd_insert_size*(bamfile: string, distfile: string, threads: int8) =
overflow.add(v)
n_reads += result.n_reads
n_accept += result.n_accept
echo inserts[0..50]
close(b)


# Now calculate the median insert size
# Add an option to include 'overflow length'
# Calculate the median insert size
var total_length = sum(inserts) + overflow.len

# Identify midpoint index
Expand Down Expand Up @@ -183,7 +159,7 @@ proc cmd_insert_size*(bamfile: string, distfile: string, threads: int8) =
var f = open(distfile, fmWrite)
f.writeLine(distribution_header)
for idx, val in inserts_trimmed.filterIt(it > 0):
f.writeLine([$idx, $val, fname].join("\t"))
f.writeLine([$idx, $val, bam_sample(b), fname].join("\t"))
f.close()

var result = [$median_insert_size,
Expand All @@ -196,83 +172,4 @@ proc cmd_insert_size*(bamfile: string, distfile: string, threads: int8) =
bam_sample(b),
fname]
echo result.join("\t")
# var mad = new_seq[int64](cumulative_sum.len)
# for idx, val in inserts:
# echo inserts[median_insert_size], "<- mid freq"
# mad[idx] = abs(val - inserts[median_insert_size])
# var mad_val = median_freq(mad)*deviations

# # set inserts to 0 if outside mad
# for idx, val in inserts:
# if val <= (mad_val + median_insert_size):
# inserts[idx] = 0
# echo inserts
# echo (sum(inserts).float/inserts.len.float)
# # Now get MAD

# var percentile = new_seq[float](ins_arr)
# var max_insert_size = 0
# for record in b:
# if record.accept_record():
# var idx = abs(record.isize)
# if idx <= inserts.len:
# inserts[idx-1] += 1
# else:
# if idx > max_insert_size:
# max_insert_size = idx

# var total_length = 0.int64
# var trimmed_count = 0.int64
# var trimmed_length = 0.int64
# for idx, val in inserts:
# total_length += val.int64*(idx+1)

# # Trim 1 pct from end.
# var running_total = 0i64
# var max_idx = 0
# for idx, val in inserts:
# var length_add = val.int64*(idx+1)
# running_total += length_add
# var pct = running_total.float / total_length.float
# if pct <= 0.995:
# percentile[idx] = pct
# trimmed_count += val.int64
# trimmed_length += length_add
# insert_lengths[idx] = length_add
# max_idx = idx
# else:
# percentile[idx] = 1.0

# var median_insert_size: int64
# for idx, val in percentile:
# if val <= 0.50:
# median_insert_size = idx + 1
# else:
# break
# echo median_insert_size, "<- median"
# # Stats
# let mode_insert_size = inserts.find(max(inserts))+1
# let min_insert_size = inserts.find(inserts.filterIt( it > 0 )[0]) + 1
# let mean_insert_size = (trimmed_length.float / trimmed_count.float)

# # Standard dev
# let n = sum(inserts[1..max_idx])
# var m = 0.int64
# for idx, val in inserts[1..max_idx]:
# m += val * (idx + 1)^2
# let variance = (m.float - (n.float*(mean_insert_size^2))) / (n - 1).float
# let std_dev = pow(variance, 0.5)

# var result = [
# $min_insert_size,
# $mean_insert_size,
# $median_insert_size,
# $mode_insert_size,
# $std_dev,
# $max_insert_size,
# bamfile
# ]

# echo $result


0 comments on commit c38a813

Please # to comment.