diff --git a/DESCRIPTION b/DESCRIPTION index 48b7985..7e2972f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: SeqArray Type: Package Title: Big Data Management of Whole-genome Sequence Variant Calls -Version: 1.21.2 -Date: 2018-06-11 +Version: 1.21.3 +Date: 2018-07-29 Depends: R (>= 3.3.0), gdsfmt (>= 1.10.1) Imports: methods, parallel, IRanges, GenomicRanges, GenomeInfoDb, Biostrings, S4Vectors diff --git a/NEWS b/NEWS index ac1a480..4c6cfda 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -CHANGES IN VERSION 1.21.1-1.21.2 +CHANGES IN VERSION 1.21.1-1.21.3 ------------------------- UTILITIES @@ -17,6 +17,8 @@ NEW FEATURES o a new function `seqCheck()` for checking the data integrity of a SeqArray file + o `seqGDS2SNP()` exports dosage GDS files + BUG FIXES o `seqVCF2GDS()` and `seqVCF_Header()` are able to import site-only VCF diff --git a/R/Conversion.R b/R/Conversion.R index 344df9e..b4c02aa 100644 --- a/R/Conversion.R +++ b/R/Conversion.R @@ -320,13 +320,14 @@ seqGDS2VCF <- function(gdsfile, vcf.fn, info.var=NULL, fmt.var=NULL, # Convert a SeqArray GDS file to a SNP GDS file # -seqGDS2SNP <- function(gdsfile, out.gdsfn, - compress.geno="ZIP_RA", compress.annotation="LZMA_RA", +seqGDS2SNP <- function(gdsfile, out.gdsfn, dosage=FALSE, + compress.geno="LZMA_RA", compress.annotation="LZMA_RA", optimize=TRUE, verbose=TRUE) { # check stopifnot(is.character(gdsfile) | inherits(gdsfile, "SeqVarGDSClass")) stopifnot(is.character(out.gdsfn), length(out.gdsfn)==1L) + stopifnot(is.logical(dosage) | is.character(dosage), length(dosage)==1L) stopifnot(is.character(compress.geno), length(compress.geno)==1L) stopifnot(is.character(compress.annotation), length(compress.annotation)==1L) stopifnot(is.logical(optimize), length(optimize)==1L) @@ -335,7 +336,10 @@ seqGDS2SNP <- function(gdsfile, out.gdsfn, if (verbose) { cat(date(), "\n", sep="") - cat("SeqArray GDS to SNP GDS Format:\n") + if (isTRUE(dosage) | is.character(dosage)) + cat("SeqArray GDS to SNP GDS Dosage Format:\n") + else + cat("SeqArray GDS to SNP GDS Format:\n") } # if it is a file name @@ -345,12 +349,27 @@ seqGDS2SNP <- function(gdsfile, out.gdsfn, on.exit({ seqClose(gdsfile) }) } + # dosage variable name + if (isTRUE(dosage) | is.character(dosage)) + { + if (isTRUE(dosage)) + dosage <- "annotation/format/DS" + else if (substr(dosage, 1L, 18L) != "annotation/format/") + dosage <- paste0("annotation/format/", dosage) + } + if (verbose) { dm <- .seldim(gdsfile) cat(" # of samples: ", .pretty(dm[2L]), "\n", sep="") cat(" # of variants: ", .pretty(dm[3L]), "\n", sep="") - cat(" genotype compression: ", compress.geno, "\n", sep="") + if (is.character(dosage)) + { + cat(" dosage compression: ", compress.geno, ", from [", + dosage, "]\n", sep="") + } else { + cat(" genotype compression: ", compress.geno, "\n", sep="") + } cat(" annotation compression: ", compress.annotation, "\n", sep="") } @@ -358,19 +377,16 @@ seqGDS2SNP <- function(gdsfile, out.gdsfn, gfile <- createfn.gds(out.gdsfn) # close the file at the end on.exit({ - closefn.gds(gfile) - if (verbose) - cat("Done.\n", date(), "\n", sep="") - if (optimize) - { - if (verbose) cat("Optimize the access efficiency ...\n") - cleanup.gds(out.gdsfn, verbose=verbose) - if (verbose) cat(date(), "\n", sep="") - } + if (!is.null(gfile)) closefn.gds(gfile) }, add=TRUE) # add a flag - put.attr.gdsn(gfile$root, "FileFormat", "SNP_ARRAY") + if (!is.character(dosage)) + { + put.attr.gdsn(gfile$root, "FileFormat", "SNP_ARRAY") + } else { + put.attr.gdsn(gfile$root, "FileFormat", "IMPUTED_DOSAGE") + } # add "sample.id" sampid <- seqGetData(gdsfile, "sample.id") @@ -402,13 +418,34 @@ seqGDS2SNP <- function(gdsfile, out.gdsfn, compress=compress.annotation, closezip=TRUE) # add "genotype" - gGeno <- add.gdsn(gfile, "genotype", storage="bit2", - valdim=c(length(sampid), 0L), compress=compress.geno) - put.attr.gdsn(gGeno, "sample.order") + if (!is.character(dosage)) + { + gGeno <- add.gdsn(gfile, "genotype", storage="bit2", + valdim=c(length(sampid), 0L), compress=compress.geno) + put.attr.gdsn(gGeno, "sample.order") + seqApply(gdsfile, "$dosage", as.is=gGeno, .useraw=TRUE, .progress=verbose, + FUN = .cfunction("FC_GDS2SNP")) + } else { + .cfunction("FC_SetNumSamp")(length(sampid)) + gGeno <- add.gdsn(gfile, "genotype", storage="float32", + valdim=c(length(sampid), 0L), compress=compress.geno) + put.attr.gdsn(gGeno, "sample.order") + seqApply(gdsfile, dosage, as.is=gGeno, .progress=verbose, + FUN = .cfunction("FC_GDS2Dosage")) + } - seqApply(gdsfile, "$dosage", as.is=gGeno, .useraw=TRUE, .progress=verbose, - FUN = .cfunction("FC_GDS2SNP")) readmode.gdsn(gGeno) + closefn.gds(gfile) + gfile <- NULL + if (verbose) + cat("Done.\n", date(), "\n", sep="") + + if (optimize) + { + if (verbose) cat("Optimize the access efficiency ...\n") + cleanup.gds(out.gdsfn, verbose=verbose) + if (verbose) cat(date(), "\n", sep="") + } # output invisible(normalizePath(out.gdsfn)) diff --git a/man/seqGDS2SNP.Rd b/man/seqGDS2SNP.Rd index 9930754..70727e6 100644 --- a/man/seqGDS2SNP.Rd +++ b/man/seqGDS2SNP.Rd @@ -5,13 +5,16 @@ Converts a SeqArray GDS file to a SNP GDS file. } \usage{ -seqGDS2SNP(gdsfile, out.gdsfn, compress.geno="ZIP_RA", +seqGDS2SNP(gdsfile, out.gdsfn, dosage=FALSE, compress.geno="LZMA_RA", compress.annotation="LZMA_RA", optimize=TRUE, verbose=TRUE) } \arguments{ \item{gdsfile}{character (GDS file name), or a \code{\link{SeqVarGDSClass}} object} \item{out.gdsfn}{the file name, output a file of VCF format} + \item{dosage}{a logical value, or characters for the variable name of + dosage in the SeqArray file; if \code{FALSE} exports genotypes, + otherwise exports dosages} \item{compress.geno}{the compression method for "genotype"; optional values are defined in the function \code{add.gdsn}} \item{compress.annotation}{the compression method for the GDS variables, diff --git a/src/ConvToGDS.cpp b/src/ConvToGDS.cpp index 819efc6..0fc4829 100755 --- a/src/ConvToGDS.cpp +++ b/src/ConvToGDS.cpp @@ -2,7 +2,7 @@ // // ConvToGDS.cpp: format conversion // -// Copyright (C) 2015-2017 Xiuwen Zheng +// Copyright (C) 2015-2018 Xiuwen Zheng // // This file is part of SeqArray. // @@ -288,4 +288,36 @@ COREARRAY_DLL_EXPORT SEXP FC_GDS2SNP(SEXP geno) return geno; } + +// ====================================================================== +// SeqArray GDS --> Dosage GDS +// ====================================================================== + +static int FC_Num_Sample = 0; + +COREARRAY_DLL_EXPORT SEXP FC_SetNumSamp(SEXP num) +{ + FC_Num_Sample = Rf_asInteger(num); + return R_NilValue; +} + +COREARRAY_DLL_EXPORT SEXP FC_GDS2Dosage(SEXP dosage) +{ + int n = LENGTH(dosage); + if (n < FC_Num_Sample) + { + dosage = NEW_NUMERIC(FC_Num_Sample); + double *dst = REAL(dosage); + for (int i=0; i < FC_Num_Sample; i++) + dst[i] = R_NaN; + } else if (n > FC_Num_Sample) + { + double *src = REAL(dosage); + dosage = NEW_NUMERIC(FC_Num_Sample); + double *dst = REAL(dosage); + memcpy(dst, src, sizeof(double)*FC_Num_Sample); + } + return dosage; +} + } // extern "C"