add consistent interface for adding abs and base paths
danielecook committed Nov 7, 2019
1 parent 55754ea commit 024f184
Showing 7 changed files with 98 additions and 97 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -71,6 +71,10 @@ __Benchmark__
0m58.738s
```

### fq-count

Count the number of reads in a FASTQ
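
A minimal usage sketch (file names here are hypothetical; the `-t/--header`, `-b/--basename`, and `-a/--absolute` flags are defined in `sc.nim`):

```
# print the header row, then one count row per FASTQ,
# with basename and absolute-path columns appended
sc fq-count -t -b -a sample_1.fq.gz sample_2.fq.gz
```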

### fq-meta

__Scenario:__ You are given an old, dusty hard drive packed with sequence data. Your collaborator says, "We have some great sequencing data here, if only someone could analyze it." You peek at the filesystem and discover that the FASTQs have been renamed, removing crucial information about how they were generated. Your collaborator, however, recalls certain details about which data was sequenced on which sequencer, and he has a list of sequencing barcodes and associated samples that you can match on. If only there were a way to determine the barcodes, sequencer, or other essential metadata for each FASTQ...
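
`fq-meta` prints one tab-separated metadata row per FASTQ (machine, flowcell, barcode, quality encoding, and so on, per the header in `src/fq_meta.nim`). A minimal usage sketch with hypothetical file names:

```
# sample the first 100 sequences of each file (-n to change),
# print the header, and append basename and absolute-path columns
sc fq-meta -t -b -a mystery_01.fq.gz mystery_02.fq.gz
```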
@@ -219,3 +223,10 @@ typical output.
* `GT` - Outputs genotypes as [[0, 0], [0, 1], [1, 1], ...
* `SGT` - Outputs genotypes as `0/0`, `0/1`, `1/1`, ...
* `TGT` - Outputs genotypes as `T/T`, `A/T`, `A/A`, ...


### Cross-compilation

```
nim c --cpu:i386 --os:linux --threads:on --compileOnly sc.nim
```
64 changes: 36 additions & 28 deletions sc.nim
@@ -38,33 +38,38 @@ proc get_vcf(vcf: string): string =
return "-"
return vcf


var p = newParser("sc"):
flag("--debug", help="Debug")
help(fmt"Sequence data utilities (Version {VERSION})")
command("fq-meta", group="FASTQ"):
help("Output metadata for FASTQ")
arg("fastq", nargs = -1, help="List of FASTQ files")
option("-n", "--lines", help="Number of sequences to sample (n_lines) for qual and index/barcode determination", default = "100")
flag("--header", help="Output the header")
flag("-s", "--symlinks", help="Follow symlinks")
flag("-t", "--header", help="Output the header")
flag("-b", "--basename", help="Add basename column")
flag("-a", "--absolute", help="Add column for absolute path")
run:
if opts.fastq.len == 0:
quit_error("No FASTQ specified", 3)
if opts.header:
output_header(fq_meta_header, opts.basename, opts.absolute)
if opts.fastq.len > 0:
for fastq in opts.fastq:
fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.symlinks, opts.header)
fq_meta.fq_meta(fastq, parseInt(opts.lines), opts.basename, opts.absolute)
command("fq-count", group="FASTQ"):
help("Counts lines in a FASTQ")
flag("--header", help="Output just header")
flag("-t", "--header", help="Output the header")
flag("-b", "--basename", help="Add basename column")
flag("-a", "--absolute", help="Add column for absolute path")
arg("fastq", nargs = -1, help = "Input FASTQ")
run:
if opts.header:
echo fq_count_header
output_header(fq_meta_header, opts.basename, opts.absolute)
if opts.fastq.len == 0:
quit_error("No FASTQ specified", 3)
if opts.fastq.len > 0:
for fastq in opts.fastq:
fq_count.fq_count(fastq)
fq_count.fq_count(fastq, opts.basename, opts.absolute)

command("fq-dedup", group="FASTQ"):
help("Removes exact duplicates from FASTQ Files")
arg("fastq", nargs = 1, help = "Input FASTQ")
@@ -89,18 +94,20 @@ var p = newParser("sc"):

command("insert-size", group="BAM"):
help("Calculate insert-size metrics")
flag("--header", help="Output the header")
option("-d", "--dist", default="", help = "Output raw distribution(s)")
arg("bam", nargs = -1, help = "Input BAM")
flag("-t", "--header", help="Output the header")
flag("-b", "--basename", help="Add basename column")
flag("-a", "--absolute", help="Add column for absolute path")
flag("-v", "--verbose", help="Provide output")
run:
if opts.header:
echo insert_size.header
output_header(insert_size_header, opts.basename, opts.absolute)
if opts.bam.len == 0:
quit_error("No BAM specified", 3)
if opts.bam.len > 0:
for bam in opts.bam:
insert_size.cmd_insert_size(bam, opts.dist, opts.verbose)
insert_size.cmd_insert_size(bam, opts.dist, opts.verbose, opts.basename, opts.absolute)

command("vcf2tsv", group="VCF"):
help("Converts a VCF to TSV or CSV")
@@ -114,24 +121,25 @@ var p = newParser("sc"):
elif opts.vcf.len > 0:
vcf2tsv(opts.vcf, opts.long, opts.regions)

command("window", group="VCF"):
help("Generate windows from a VCF for parallel execution")
arg("vcf", nargs = 1, help = "Input VCF")
arg("width", nargs = 1)
run:
vcf_window(opts.vcf)
# command("window", group="VCF"):
# help("Generate windows from a VCF for parallel execution")
# arg("vcf", nargs = 1, help = "Input VCF")
# arg("width", nargs = 1)
# run:
# vcf_window(opts.vcf)

command("fasta", group="VCF"):
help("Convert a VCF to a FASTA file")
arg("vcf", nargs = 1, help="VCF to convert to JSON")
arg("region", nargs = -1, help="List of regions or bed files")
option("-s", "--samples", help="Set Samples", default="ALL")
option("-r", "--reference", help="Output full reference sequence")
flag("-f", "--force", help="Force output even if genotypes are not phased")
flag("-c", "--concat", help="Combine chromosomes into a single sequence")
flag("-m", "--merge", help="Merge samples into a single file and send to stdout")
run:
to_fasta(get_vcf(opts.vcf), opts.region, opts.samples, opts.force)
# command("fasta", group="VCF"):
# help("Convert a VCF to a FASTA file")
# arg("vcf", nargs = 1, help="VCF to convert to JSON")
# arg("region", nargs = -1, help="List of regions or bed files")
# option("-s", "--samples", help="Set Samples", default="ALL")
# option("-r", "--reference", help="Output full reference sequence")
# flag("-f", "--force", help="Force output even if genotypes are not phased")
# flag("-c", "--concat", help="Combine chromosomes into a single sequence")
# flag("-m", "--merge", help="Merge samples into a single file and send to stdout")
# run:
# to_fasta(get_vcf(opts.vcf), opts.region, opts.samples, opts.force)

# command("index-swap", group="BAM"):
# arg("BAM", nargs= -1, help="List of BAMs or CRAMs to examine")
# option("-s", "--sites", help="List of sites to check (required)")
2 changes: 1 addition & 1 deletion sc.nimble
@@ -7,7 +7,7 @@ license = "MIT"

# Dependencies

requires "argparse >= 0.7.1", "hts >= 0.2.8", "colorize", "zip >= 0.2.1"
requires "argparse >= 0.9.0", "hts >= 0.2.8", "colorize", "zip >= 0.2.1"
requires "https://github.com/danielecook/BitVector#b8cc21271c90cca96ed31f5d5383711dc96a8d3f"

bin = @["sc"]
37 changes: 3 additions & 34 deletions src/fq_count.nim
@@ -16,7 +16,7 @@ const fq_count_header* = ["reads",



proc fq_count*(fastq: string) =
proc fq_count*(fastq: string, basename: bool, absolute: bool) =
#[
Counts reads in a FASTQ
@@ -32,9 +32,6 @@ proc fq_count*(fastq: string) =
n_reads: int
#n_dups: int

var basename = lastPathPart(fastq)
var absolute_path = absolutePath(fastq)

let stream: Stream =
if fastq[^3 .. ^1] == ".gz":
newGZFileStream(fastq)
@@ -56,34 +53,6 @@ proc fq_count*(fastq: string) =
$(gc_cnt.float / (total_len - n_cnt).float),
$gc_cnt,
$n_cnt,
$total_len,
basename,
absolute_path]
$total_len]

echo output.join("\t")

# https://github.com/nim-lang/Nim/issues/9026#issuecomment-423632254
# var mf = memfiles.open(fn)
# var cs: cstring
# var linec, wordc, bytec: int
# var inWord: bool
# var s: string
# for slice in memSlices(mf):
# inc(linec)
# cs = cast[cstring](slice.data)
# let length = slice.size
# inc(bytec, length)
# var j = -1
# for i in 0..length-1:
# j = i
# if cs[i] in WhiteSpace:
# if inWord == true:
# inc(wordc)
# inWord = false
# else:
# inWord = true
# if j >= 0:
# inc(wordc)
# result.linec = linec
# result.wordc = wordc
# result.bytec = bytec + linec
output_w_fnames(output.join("\t"), fastq, basename, absolute)
32 changes: 8 additions & 24 deletions src/fq_meta.nim
@@ -9,7 +9,7 @@ import zip/gzipfiles
import utils/helpers

const qual = """!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"""
const fq_meta_header = ["machine",
const fq_meta_header* = ["machine",
"sequencer",
"prob_sequencer",
"flowcell",
@@ -24,9 +24,7 @@ const fq_meta_header = ["machine",
"qual_multiple",
"min_qual",
"max_qual",
"n_lines",
"basename",
"absolute_path"].join("\t")
"n_lines"].join("\t")

import tables

@@ -197,10 +195,10 @@ proc get_sequencer_name(sequencers: seq[string]): string =
elif sequencers.len > 0:
return sequencers[^1]

proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header: bool) =

proc fq_meta*(fastq: string, sample_n = 20, basename: bool, absolute: bool) =

var
fastq: string
sequence_id: string
machine: string
run: string
@@ -219,26 +217,13 @@ proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header:
barcodes = newSeq[string](sample_n)
i = 0

if fq_header == true:
echo fq_meta_header

if follow_symlinks and symlinkExists(fastq_in):
fastq = expandSymlink(fastq_in)
else:
fastq = fastq_in

var basename = lastPathPart(fastq)
var absolute_path = absolutePath(fastq)


let stream: Stream =
if fastq[^3 .. ^1] == ".gz":
if fastq.toLowerAscii()[^3 .. ^1] == ".gz":
newGZFileStream(fastq)
else:
newFileStream(fastq, fmRead)
if stream == nil:
quit_error("Unable to open file: " & fastq, 2)


while not stream.atEnd() and i < sample_n * 4:
line = stream.readLine()
@@ -275,7 +260,7 @@ proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header:
var fastq_scores_name = fastq_scores.mapIt(it.name).join(";")
let fastq_scores_phred = fastq_scores.mapIt(it.phred).deduplicate().join(";")

echo [machine,
let header_out = [machine,
sequencer,
sequencer_prob,
flowcell,
@@ -290,6 +275,5 @@ proc fq_meta*(fastq_in: string, sample_n = 20, follow_symlinks: bool, fq_header:
$(fastq_scores.mapIt(it.name).len > 1),
(if qual_min >= 0: $qual_min else: ""),
(if qual_max >= 0: $qual_max else: ""),
$(i/4).int,
basename,
absolute_path].join("\t")
$(i/4).int].join("\t")
output_w_fnames(header_out, fastq, basename, absolute)
15 changes: 7 additions & 8 deletions src/insert_size.nim
@@ -10,11 +10,12 @@ import colorize
import re
import algorithm
import threadpool
import utils/helpers
#import ggplotnim

const INS_ARR = 10000

const header* = ["median",
const insert_size_header* = ["median",
"mean",
"std_dev",
"min",
@@ -23,8 +24,7 @@ const header* = ["median",
"n_reads",
"n_accept",
"n_use",
"sample",
"filename"].join("\t")
"sample"].join("\t")

const distribution_header = ["insert_size", "count", "sample", "filename"].join("\t")

@@ -93,7 +93,7 @@ proc freq_inserts(bamfile: string, contig: string, verbose: bool): chrom_freqs =
n_reads: n_reads,
n_accept: n_accept)

proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool) =
proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool, basename: bool, absolute: bool) =
#[
Calculates insert size
]#
@@ -181,7 +181,7 @@ proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool) =
let variance = (m.float - (n.float*(mean_insert_size^2))) / (n - 1).float
let std_dev = pow(variance, 0.5)

var result = [$median_insert_size,
var header_out = [$median_insert_size,
$math.round(mean_insert_size, 3),
$round(std_dev, 3),
$min_insert_size,
@@ -190,7 +190,6 @@ proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool) =
$n_reads,
$n_accept,
$(sum(inserts_trimmed)),
bam_sample(b),
fname]
echo result.join("\t")
bam_sample(b)]
output_w_fnames(header_out.join("\t"), bamfile, basename, absolute)

34 changes: 32 additions & 2 deletions src/utils/helpers.nim
@@ -3,6 +3,7 @@ import hts
import strutils
import strformat
import colorize
import sequtils

proc quit_error*(msg: string, error_code = 1) =
stderr.write_line fmt"Error {error_code}".bgWhite.fgRed & fmt": {msg}".fgRed
@@ -25,7 +26,6 @@ proc check_file_list*(files: seq[string]): bool =
missing_file = true
return missing_file


iterator variants*(vcf:VCF, regions: seq[string]): Variant =
## iterator over region or just the variants.
if regions.len == 0:
@@ -39,4 +39,34 @@ iterator variants*(vcf:VCF, regions: seq[string]): Variant =
for v in vcf.query(&"{toks[0]}:{parseInt(toks[1]) + 1}-{toks[2]}"):
yield v
else:
for v in vcf.query(region): yield v
for v in vcf.query(region): yield v

#====================#
# Headers and Output #
#====================#

proc output_header*(header: string, basename: bool, absolute: bool) =
var
basename_str = ""
absolute_str = ""
if basename == true:
basename_str = "basename"
if absolute == true:
absolute_str = "absolute"
echo [header, basename_str, absolute_str].filterIt(it.len > 0).join("\t")

proc get_absolute*(path: string): string =
if symlinkExists(path) == true:
return absolutePath(expandSymlink(path))
return absolutePath(path)

proc output_w_fnames*(header: string, path: string, basename: bool, absolute: bool) =
# Appends the basename and/or absolute path columns, if selected, in a consistent manner
var
basename_str = ""
absolute_str = ""
if basename == true:
basename_str = lastPathPart(path)
if absolute == true:
absolute_str = get_absolute(path)
echo [header, basename_str, absolute_str].filterIt(it.len > 0).join("\t")
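
Taken together, these helpers let each subcommand append the same opt-in columns in the same way. A usage-level sketch (paths are hypothetical):

```
sc fq-count    -t -b -a data/sample_1.fq.gz
sc fq-meta     -t -b -a data/sample_1.fq.gz
sc insert-size -t -b -a data/sample_1.bam
```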
