From 024f18439e5c4d714c88d0c6502a2f992482023b Mon Sep 17 00:00:00 2001 From: Daniel Cook Date: Thu, 7 Nov 2019 23:22:51 +0000 Subject: [PATCH] add consistent interface for adding abs and base paths --- README.md | 11 ++++++++ sc.nim | 64 ++++++++++++++++++++++++------------------- sc.nimble | 2 +- src/fq_count.nim | 37 ++----------------------- src/fq_meta.nim | 32 ++++++---------------- src/insert_size.nim | 15 +++++----- src/utils/helpers.nim | 34 +++++++++++++++++++++-- 7 files changed, 98 insertions(+), 97 deletions(-) diff --git a/README.md b/README.md index 37cb2d0..53e7d02 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,10 @@ __Benchmark__ 0m58.738s ``` +### fq-count + +Count the number of reads in a FASTQ + ### fq-meta __Scenario:__ You are given an old dusty hard drive packed with sequence data. Your collaborator says "We have some great sequencing data here, if only someone could analyze it." You peek at the filesystem and discover that FASTQs have been renamed, removing crucial information about how they were generated. Your collaborator, however, recalls certain details about which data was sequenced on which sequencer and he has a list of sequencing barcodes and associated samples that you can match on." If only there was a way to determine the barcodes, sequencer, or other essential metadata for each FASTQ... @@ -219,3 +223,10 @@ typical output. * `GT` - Outputs genotypes as [[0, 0], [0, 1], [1, 1], ... * `SGT` - Outputs genotypes as `0/0`, `0/1`, `1/1`, ... * `TGT` - Outputs genotypes as `T/T`, `A/T`, `A/A`, ... + + +### Cross-compilation + +``` +nim c --cpu:i386 --os:linux --threads:on --compileOnly sc.nim +``` \ No newline at end of file diff --git a/sc.nim b/sc.nim index 1825388..ef7cfb6 100644 --- a/sc.nim +++ b/sc.nim @@ -38,6 +38,7 @@ proc get_vcf(vcf: string): string = return "-" return vcf + var p = newParser("sc"): flag("--debug", help="Debug") help(fmt"Sequence data utilities (Version {VERSION})") @@ -45,26 +46,30 @@ var p = newParser("sc"): help("Output metadata for FASTQ") arg("fastq", nargs = -1, help="List of FASTQ files") option("-n", "--lines", help="Number of sequences to sample (n_lines) for qual and index/barcode determination", default = "100") - flag("--header", help="Output the header") - flag("-s", "--symlinks", help="Follow symlinks") + flag("-t", "--header", help="Output the header") + flag("-b", "--basename", help="Add basename column") + flag("-a", "--absolute", help="Add column for absolute path") run: - if opts.fastq.len == 0: - quit_error("No FASTQ specified", 3) + if opts.header: + output_header(fq_meta_header, opts.basename, opts.absolute) if opts.fastq.len > 0: for fastq in opts.fastq: - fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.symlinks, opts.header) + fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.basename, opts.absolute) command("fq-count", group="FASTQ"): help("Counts lines in a FASTQ") - flag("--header", help="Output just header") + flag("-t", "--header", help="Output the header") + flag("-b", "--basename", help="Add basename column") + flag("-a", "--absolute", help="Add column for absolute path") arg("fastq", nargs = -1, help = "Input FASTQ") run: if opts.header: - echo fq_count_header + output_header(fq_meta_header, opts.basename, opts.absolute) if opts.fastq.len == 0: quit_error("No FASTQ specified", 3) if opts.fastq.len > 0: for fastq in opts.fastq: - fq_count.fq_count(fastq) + fq_count.fq_count(fastq, opts.basename, opts.absolute) + command("fq-dedup", group="FASTQ"): help("Removes exact duplicates from FASTQ Files") arg("fastq", nargs = 1, help = "Input FASTQ") @@ -89,18 +94,20 @@ var p = newParser("sc"): command("insert-size", group="BAM"): help("Calculate insert-size metrics") - flag("--header", help="Output the header") option("-d", "--dist", default="", help = "Output raw distribution(s)") arg("bam", nargs = -1, help = "Input BAM") + flag("-t", "--header", help="Output the header") + flag("-b", "--basename", help="Add basename column") + flag("-a", "--absolute", help="Add column for absolute path") flag("-v", "--verbose", help="Provide output") run: if opts.header: - echo insert_size.header + output_header(insert_size_header, opts.basename, opts.absolute) if opts.bam.len == 0: quit_error("No BAM specified", 3) if opts.bam.len > 0: for bam in opts.bam: - insert_size.cmd_insert_size(bam, opts.dist, opts.verbose) + insert_size.cmd_insert_size(bam, opts.dist, opts.verbose, opts.basename, opts.absolute) command("vcf2tsv", group="VCF"): help("Converts a VCF to TSV or CSV") @@ -114,24 +121,25 @@ var p = newParser("sc"): elif opts.vcf.len > 0: vcf2tsv(opts.vcf, opts.long, opts.regions) - command("window", group="VCF"): - help("Generate windows from a VCF for parallel execution") - arg("vcf", nargs = 1, help = "Input VCF") - arg("width", nargs = 1) - run: - vcf_window(opts.vcf) + # command("window", group="VCF"): + # help("Generate windows from a VCF for parallel execution") + # arg("vcf", nargs = 1, help = "Input VCF") + # arg("width", nargs = 1) + # run: + # vcf_window(opts.vcf) - command("fasta", group="VCF"): - help("Convert a VCF to a FASTA file") - arg("vcf", nargs = 1, help="VCF to convert to JSON") - arg("region", nargs = -1, help="List of regions or bed files") - option("-s", "--samples", help="Set Samples", default="ALL") - option("-r", "--reference", help="Output full reference sequence") - flag("-f", "--force", help="Force output even if genotypes are not phased") - flag("-c", "--concat", help="Combine chromosomes into a single sequence") - flag("-m", "--merge", help="Merge samples into a single file and send to stdout") - run: - to_fasta(get_vcf(opts.vcf), opts.region, opts.samples, opts.force) + # command("fasta", group="VCF"): + # help("Convert a VCF to a FASTA file") + # arg("vcf", nargs = 1, help="VCF to convert to JSON") + # arg("region", nargs = -1, help="List of regions or bed files") + # option("-s", "--samples", help="Set Samples", default="ALL") + # option("-r", "--reference", help="Output full reference sequence") + # flag("-f", "--force", help="Force output even if genotypes are not phased") + # flag("-c", "--concat", help="Combine chromosomes into a single sequence") + # flag("-m", "--merge", help="Merge samples into a single file and send to stdout") + # run: + # to_fasta(get_vcf(opts.vcf), opts.region, opts.samples, opts.force) + # command("index-swap", group="BAM"): # arg("BAM", nargs= -1, help="List of BAMs or CRAMs to examine") # option("-s", "--sites", help="List of sites to check (required)") diff --git a/sc.nimble b/sc.nimble index 0a1aa99..aca46f6 100644 --- a/sc.nimble +++ b/sc.nimble @@ -7,7 +7,7 @@ license = "MIT" # Dependencies -requires "argparse >= 0.7.1", "hts >= 0.2.8", "colorize", "zip >= 0.2.1" +requires "argparse >= 0.9.0", "hts >= 0.2.8", "colorize", "zip >= 0.2.1" requires "https://github.com/danielecook/BitVector#b8cc21271c90cca96ed31f5d5383711dc96a8d3f" bin = @["sc"] diff --git a/src/fq_count.nim b/src/fq_count.nim index e888cb5..fb7aa77 100644 --- a/src/fq_count.nim +++ b/src/fq_count.nim @@ -16,7 +16,7 @@ const fq_count_header* = ["reads", -proc fq_count*(fastq: string) = +proc fq_count*(fastq: string, basename: bool, absolute: bool) = #[ Deduplicates reads by ID in FASTQs @@ -32,9 +32,6 @@ proc fq_count*(fastq: string) = n_reads: int #n_dups: int - var basename = lastPathPart(fastq) - var absolute_path = absolutePath(fastq) - let stream: Stream = if fastq[^3 .. ^1] == ".gz": newGZFileStream(fastq) @@ -56,34 +53,6 @@ proc fq_count*(fastq: string) = $(gc_cnt.float / (total_len - n_cnt).float), $gc_cnt, $n_cnt, - $total_len, - basename, - absolute_path] + $total_len] - echo output.join("\t") - - # https://github.com/nim-lang/Nim/issues/9026#issuecomment-423632254 - # var mf = memfiles.open(fn) - # var cs: cstring - # var linec, wordc, bytec: int - # var inWord: bool - # var s: string - # for slice in memSlices(mf): - # inc(linec) - # cs = cast[cstring](slice.data) - # let length = slice.size - # inc(bytec, length) - # var j = -1 - # for i in 0..length-1: - # j = i - # if cs[i] in WhiteSpace: - # if inWord == true: - # inc(wordc) - # inWord = false - # else: - # inWord = true - # if j >= 0: - # inc(wordc) - # result.linec = linec - # result.wordc = wordc - # result.bytec = bytec + linec + output_w_fnames(output.join("\t"), fastq, basename, absolute) \ No newline at end of file diff --git a/src/fq_meta.nim b/src/fq_meta.nim index be63f0f..cd59b3f 100644 --- a/src/fq_meta.nim +++ b/src/fq_meta.nim @@ -9,7 +9,7 @@ import zip/gzipfiles import utils/helpers const qual = """!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~""" -const fq_meta_header = ["machine", +const fq_meta_header* = ["machine", "sequencer", "prob_sequencer", "flowcell", @@ -24,9 +24,7 @@ const fq_meta_header = ["machine", "qual_multiple", "min_qual", "max_qual", - "n_lines", - "basename", - "absolute_path"].join("\t") + "n_lines"].join("\t") import tables @@ -197,10 +195,10 @@ proc get_sequencer_name(sequencers: seq[string]): string = elif sequencers.len > 0: return sequencers[^1] -proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header: bool) = + +proc fq_meta*(fastq: string, sample_n = 20, basename: bool, absolute: bool) = var - fastq: string sequence_id: string machine: string run: string @@ -219,26 +217,13 @@ proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header: barcodes = newSeq[string](sample_n) i = 0 - if fq_header == true: - echo fq_meta_header - - if follow_symlinks and symlinkExists(fastq_in): - fastq = expandSymlink(fastq_in) - else: - fastq = fastq_in - - var basename = lastPathPart(fastq) - var absolute_path = absolutePath(fastq) - - let stream: Stream = - if fastq[^3 .. ^1] == ".gz": + if fastq.toLowerAscii()[^3 .. ^1] == ".gz": newGZFileStream(fastq) else: newFileStream(fastq, fmRead) if stream == nil: quit_error("Unable to open file: " & fastq, 2) - while not stream.atEnd() and i < sample_n * 4: line = stream.readLine() @@ -275,7 +260,7 @@ proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header: var fastq_scores_name = fastq_scores.mapIt(it.name).join(";") let fastq_scores_phred = fastq_scores.mapIt(it.phred).deduplicate().join(";") - echo [machine, + let header_out = [machine, sequencer, sequencer_prob, flowcell, @@ -290,6 +275,5 @@ proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header: $(fastq_scores.mapIt(it.name).len > 1), (if qual_min >= 0: $qual_min else: ""), (if qual_max >= 0: $qual_max else: ""), - $(i/4).int, - basename, - absolute_path].join("\t") + $(i/4).int].join("\t") + output_w_fnames(header_out, fastq, basename, absolute) diff --git a/src/insert_size.nim b/src/insert_size.nim index 26d090d..75c56f8 100644 --- a/src/insert_size.nim +++ b/src/insert_size.nim @@ -10,11 +10,12 @@ import colorize import re import algorithm import threadpool +import utils/helpers #import ggplotnim const INS_ARR = 10000 -const header* = ["median", +const insert_size_header* = ["median", "mean", "std_dev", "min", @@ -23,8 +24,7 @@ const header* = ["median", "n_reads", "n_accept", "n_use", - "sample", - "filename"].join("\t") + "sample"].join("\t") const distribution_header = ["insert_size", "count", "sample", "filename"].join("\t") @@ -93,7 +93,7 @@ proc freq_inserts(bamfile: string, contig: string, verbose: bool): chrom_freqs = n_reads: n_reads, n_accept: n_accept) -proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool) = +proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool, basename: bool, absolute: bool) = #[ Calculates insert size ]# @@ -181,7 +181,7 @@ proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool) = let variance = (m.float - (n.float*(mean_insert_size^2))) / (n - 1).float let std_dev = pow(variance, 0.5) - var result = [$median_insert_size, + var header_out = [$median_insert_size, $math.round(mean_insert_size, 3), $round(std_dev, 3), $min_insert_size, @@ -190,7 +190,6 @@ proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool) = $n_reads, $n_accept, $(sum(inserts_trimmed)), - bam_sample(b), - fname] - echo result.join("\t") + bam_sample(b)] + output_w_fnames(header_out.join("\t"), bamfile, basename, absolute) diff --git a/src/utils/helpers.nim b/src/utils/helpers.nim index 342e5d1..81e9384 100644 --- a/src/utils/helpers.nim +++ b/src/utils/helpers.nim @@ -3,6 +3,7 @@ import hts import strutils import strformat import colorize +import sequtils proc quit_error*(msg: string, error_code = 1) = stderr.write_line fmt"Error {error_code}".bgWhite.fgRed & fmt": {msg}".fgRed @@ -25,7 +26,6 @@ proc check_file_list*(files: seq[string]): bool = missing_file = true return missing_file - iterator variants*(vcf:VCF, regions: seq[string]): Variant = ## iterator over region or just the variants. if regions.len == 0: @@ -39,4 +39,34 @@ iterator variants*(vcf:VCF, regions: seq[string]): Variant = for v in vcf.query(&"{toks[0]}:{parseInt(toks[1]) + 1}-{toks[2]}"): yield v else: - for v in vcf.query(region): yield v \ No newline at end of file + for v in vcf.query(region): yield v + +#====================# +# Headers and Output # +#====================# + +proc output_header*(header: string, basename: bool, absolute: bool) = + var + basename_str = "" + absolute_str = "" + if basename == true: + basename_str = "basename" + if absolute == true: + absolute_str = "absolute" + echo [header, basename_str, absolute_str].filterIt(it.len > 0).join("\t") + +proc get_absolute*(path: string): string = + if symlinkExists(path) == true: + return absolutePath(expandSymlink(path)) + return absolutePath(path) + +proc output_w_fnames*(header: string, path: string, basename: bool, absolute: bool) = + # Attaches the basename or absolute if selected in a consistant manner + var + basename_str = "" + absolute_str = "" + if basename == true: + basename_str = lastPathPart(path) + if absolute == true: + absolute_str = get_absolute(path) + echo [header, basename_str, absolute_str].filterIt(it.len > 0).join("\t") \ No newline at end of file