From c38a8138ab94033e136d0cc6e2286becfb0b9529 Mon Sep 17 00:00:00 2001 From: Daniel Cook Date: Thu, 7 Nov 2019 20:38:06 +0000 Subject: [PATCH] finish insert-size --- README.md | 10 ++- sc.nim | 3 +- src/insert_size.nim | 155 ++++++++------------------------------------ 3 files changed, 35 insertions(+), 133 deletions(-) diff --git a/README.md b/README.md index 63a51be..1ff8b5f 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,10 @@ __FASTQ__ * [fq-dedup](#fq-dedup) * [fq-meta](#fq-meta) +__BAM__ + +* [insert-size](#insert-size) + __VCF__ * [json](#json) * [fasta](#fasta) @@ -201,9 +205,9 @@ sc insert-size --header input.bam __Output__ -| median | mean | max_all | n_reads | n_accept | n_use | filename | | -|---------:|--------:|----------:|----------:|-----------:|--------:|-----------:|:-------------------| -| 195 | 212.621 | 2 | 249229747 | 2062509 | 1021195 | 1016055 | input.bam | +| median | mean | min | max_all | n_reads | n_accept | n_use | sample | filename | +|---------:|--------:|------:|----------:|----------:|-----------:|--------:|:---------------|:---------------| +| 195 | 212.621 | 2 | 249229747 | 2062509 | 1021195 | 1016055 | sample_A | sample_A.bam | ## Compilation diff --git a/sc.nim b/sc.nim index fc41291..3e57225 100644 --- a/sc.nim +++ b/sc.nim @@ -95,6 +95,7 @@ var p = newParser("sc"): option("-d", "--dist", default="", help = "Output raw distribution(s)") option("-j", "--threads", default="1", help = "Number of threads") arg("bam", nargs = -1, help = "Input BAM") + flag("-v", "--verbose", help="Provide output") run: if opts.header: echo insert_size.header @@ -102,7 +103,7 @@ var p = newParser("sc"): quit_error("No BAM specified", 3) if opts.bam.len > 0: for bam in opts.bam: - insert_size.cmd_insert_size(bam, opts.dist, opts.threads.parseInt().int8) + insert_size.cmd_insert_size(bam, opts.dist, opts.threads.parseInt(), opts.verbose) command("vcf2tsv", group="VCF"): help("Converts a VCF to TSV or CSV") diff --git a/src/insert_size.nim b/src/insert_size.nim index 88bd1d9..19edc3e 100644 --- a/src/insert_size.nim +++ b/src/insert_size.nim @@ -6,6 +6,7 @@ import strformat import math import stats import os +import colorize import re import threadpool #import ggplotnim @@ -20,7 +21,7 @@ const header* = ["median", "sample", "filename"].join("\t") -const distribution_header = ["insert_size", "count", "bam"].join("\t") +const distribution_header = ["insert_size", "count", "sample", "filename"].join("\t") proc accept_record(r: Record): bool = if (r.flag.pair == false or @@ -66,7 +67,7 @@ type proc `$`(s: chrom_freqs): string = fmt"(inserts: ${s.inserts[0..5]})" -proc freq_inserts(bamfile: string, contig: string, contig_length: uint32): chrom_freqs = +proc freq_inserts(bamfile: string, contig: string, verbose: bool): chrom_freqs = var b: Bam var n_reads = 0 var n_accept = 0 @@ -74,7 +75,7 @@ proc freq_inserts(bamfile: string, contig: string, contig_length: uint32): chrom var inserts = new_seq[int64](ins_arr) var overflow: seq[int64] open(b, bamfile, index=true) - for record in b.query(contig, start=0, stop=contig_length.int): + for record in b.query(contig): n_reads += 1 if record.accept_record(): n_accept += 1 @@ -83,58 +84,35 @@ proc freq_inserts(bamfile: string, contig: string, contig_length: uint32): chrom inserts[insert_val-1] += 1 else: overflow.add(insert_val) - echo fmt"FETCHING {contig}" - var result = chrom_freqs(inserts: inserts, + close(b) + if verbose: + stderr.write_line fmt"{contig} complete".fgBlue + return chrom_freqs(inserts: inserts, overflow: overflow, n_reads: n_reads, n_accept: n_accept) - echo result - return result - - -proc cmd_insert_size*(bamfile: string, distfile: string, threads: int8) = - # [ ] TODO: Reimplement this with a count table? - # create procs for MAD (median abs. dev) - # and use to calculate width...? - # proc for getting median also? - # Proc for freq proc. - # https://nimble.directory/pkg/rbtree - # https://nimble.directory/pkg/binaryheap - # https://leetcode.com/problems/find-median-from-data-stream/solution/ - var b: Bam - let fname = bam_file.lastPathPart().changeFileExt("") - let ins_arr = 10000 - var inserts = new_seq[int64](ins_arr) - var inserts_trimmed = new_seq[int64](ins_arr) - var overflow: seq[int64] - var n_reads = 0 - var n_accept = 0 +proc cmd_insert_size*(bamfile: string, distfile: string, threads: int, verbose: bool) = + #[ + Calculates insert size + ]# + let fname = bam_file.lastPathPart() + const ins_arr = 10000 + var + b: Bam + inserts = new_seq[int64](ins_arr) + inserts_trimmed = new_seq[int64](ins_arr) + overflow: seq[int64] + n_reads = 0 + n_accept = 0 - # Option 1: 0m2.951s parallelization - # Can potentially parallelize here across contigs - # open(b, bamfile, threads=threads, index=true) - # for record in b: - # n_reads += 1 - # if record.accept_record(): - # n_accept += 1 - # var insert_val = abs(record.isize) - # if insert_val <= inserts.len: - # inserts[insert_val-1] += 1 - # else: - # overflow.add(insert_val) - - # echo inserts[0..50] - - # Option 2: parallelize across chromosomes - # Time: 1.980 open(b, bamfile, index=true) var freq_results = newSeq[FlowVar[chrom_freqs]](b.hdr.targets().len) for idx, contig in b.hdr.targets(): - freq_results[idx] = spawn freq_inserts(bamfile, contig.name, contig.length) + freq_results[idx] = spawn freq_inserts(bamfile, contig.name, verbose) sync() - + # Compile the results for idx, contig in b.hdr.targets(): var result = ^freq_results[idx] for i, v in result.inserts: @@ -143,11 +121,9 @@ proc cmd_insert_size*(bamfile: string, distfile: string, threads: int8) = overflow.add(v) n_reads += result.n_reads n_accept += result.n_accept - echo inserts[0..50] + close(b) - - # Now calculate the median insert size - # Add an option to include 'overflow length' + # Calculate the median insert size var total_length = sum(inserts) + overflow.len # Identify midpoint index @@ -183,7 +159,7 @@ proc cmd_insert_size*(bamfile: string, distfile: string, threads: int8) = var f = open(distfile, fmWrite) f.writeLine(distribution_header) for idx, val in inserts_trimmed.filterIt(it > 0): - f.writeLine([$idx, $val, fname].join("\t")) + f.writeLine([$idx, $val, bam_sample(b), fname].join("\t")) f.close() var result = [$median_insert_size, @@ -196,83 +172,4 @@ proc cmd_insert_size*(bamfile: string, distfile: string, threads: int8) = bam_sample(b), fname] echo result.join("\t") - # var mad = new_seq[int64](cumulative_sum.len) - # for idx, val in inserts: - # echo inserts[median_insert_size], "<- mid freq" - # mad[idx] = abs(val - inserts[median_insert_size]) - # var mad_val = median_freq(mad)*deviations - - # # set inserts to 0 if outside mad - # for idx, val in inserts: - # if val <= (mad_val + median_insert_size): - # inserts[idx] = 0 - # echo inserts - # echo (sum(inserts).float/inserts.len.float) - # # Now get MAD - - # var percentile = new_seq[float](ins_arr) - # var max_insert_size = 0 - # for record in b: - # if record.accept_record(): - # var idx = abs(record.isize) - # if idx <= inserts.len: - # inserts[idx-1] += 1 - # else: - # if idx > max_insert_size: - # max_insert_size = idx - - # var total_length = 0.int64 - # var trimmed_count = 0.int64 - # var trimmed_length = 0.int64 - # for idx, val in inserts: - # total_length += val.int64*(idx+1) - - # # Trim 1 pct from end. - # var running_total = 0i64 - # var max_idx = 0 - # for idx, val in inserts: - # var length_add = val.int64*(idx+1) - # running_total += length_add - # var pct = running_total.float / total_length.float - # if pct <= 0.995: - # percentile[idx] = pct - # trimmed_count += val.int64 - # trimmed_length += length_add - # insert_lengths[idx] = length_add - # max_idx = idx - # else: - # percentile[idx] = 1.0 - - # var median_insert_size: int64 - # for idx, val in percentile: - # if val <= 0.50: - # median_insert_size = idx + 1 - # else: - # break - # echo median_insert_size, "<- median" - # # Stats - # let mode_insert_size = inserts.find(max(inserts))+1 - # let min_insert_size = inserts.find(inserts.filterIt( it > 0 )[0]) + 1 - # let mean_insert_size = (trimmed_length.float / trimmed_count.float) - - # # Standard dev - # let n = sum(inserts[1..max_idx]) - # var m = 0.int64 - # for idx, val in inserts[1..max_idx]: - # m += val * (idx + 1)^2 - # let variance = (m.float - (n.float*(mean_insert_size^2))) / (n - 1).float - # let std_dev = pow(variance, 0.5) - - # var result = [ - # $min_insert_size, - # $mean_insert_size, - # $median_insert_size, - # $mode_insert_size, - # $std_dev, - # $max_insert_size, - # bamfile - # ] - - # echo $result -