Skip to content

Commit

Permalink
seqGDS2SNP() exports dosage GDS files
Browse files Browse the repository at this point in the history
  • Loading branch information
zhengxwen committed Jul 30, 2018
1 parent a1844e5 commit 1af80d7
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 24 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: SeqArray
Type: Package
Title: Big Data Management of Whole-genome Sequence Variant Calls
Version: 1.21.2
Date: 2018-06-11
Version: 1.21.3
Date: 2018-07-29
Depends: R (>= 3.3.0), gdsfmt (>= 1.10.1)
Imports: methods, parallel, IRanges, GenomicRanges, GenomeInfoDb, Biostrings,
S4Vectors
Expand Down
4 changes: 3 additions & 1 deletion NEWS
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CHANGES IN VERSION 1.21.1-1.21.2
CHANGES IN VERSION 1.21.1-1.21.3
-------------------------

UTILITIES
Expand All @@ -17,6 +17,8 @@ NEW FEATURES
o a new function `seqCheck()` for checking the data integrity of a SeqArray
file

o `seqGDS2SNP()` exports dosage GDS files

BUG FIXES

o `seqVCF2GDS()` and `seqVCF_Header()` are able to import site-only VCF
Expand Down
75 changes: 56 additions & 19 deletions R/Conversion.R
Original file line number Diff line number Diff line change
Expand Up @@ -320,13 +320,14 @@ seqGDS2VCF <- function(gdsfile, vcf.fn, info.var=NULL, fmt.var=NULL,
# Convert a SeqArray GDS file to a SNP GDS file
#

seqGDS2SNP <- function(gdsfile, out.gdsfn,
compress.geno="ZIP_RA", compress.annotation="LZMA_RA",
seqGDS2SNP <- function(gdsfile, out.gdsfn, dosage=FALSE,
compress.geno="LZMA_RA", compress.annotation="LZMA_RA",
optimize=TRUE, verbose=TRUE)
{
# check
stopifnot(is.character(gdsfile) | inherits(gdsfile, "SeqVarGDSClass"))
stopifnot(is.character(out.gdsfn), length(out.gdsfn)==1L)
stopifnot(is.logical(dosage) | is.character(dosage), length(dosage)==1L)
stopifnot(is.character(compress.geno), length(compress.geno)==1L)
stopifnot(is.character(compress.annotation), length(compress.annotation)==1L)
stopifnot(is.logical(optimize), length(optimize)==1L)
Expand All @@ -335,7 +336,10 @@ seqGDS2SNP <- function(gdsfile, out.gdsfn,
if (verbose)
{
cat(date(), "\n", sep="")
cat("SeqArray GDS to SNP GDS Format:\n")
if (isTRUE(dosage) | is.character(dosage))
cat("SeqArray GDS to SNP GDS Dosage Format:\n")
else
cat("SeqArray GDS to SNP GDS Format:\n")
}

# if it is a file name
Expand All @@ -345,32 +349,44 @@ seqGDS2SNP <- function(gdsfile, out.gdsfn,
on.exit({ seqClose(gdsfile) })
}

# dosage variable name
if (isTRUE(dosage) | is.character(dosage))
{
if (isTRUE(dosage))
dosage <- "annotation/format/DS"
else if (substr(dosage, 1L, 18L) != "annotation/format/")
dosage <- paste0("annotation/format/", dosage)
}

if (verbose)
{
dm <- .seldim(gdsfile)
cat(" # of samples: ", .pretty(dm[2L]), "\n", sep="")
cat(" # of variants: ", .pretty(dm[3L]), "\n", sep="")
cat(" genotype compression: ", compress.geno, "\n", sep="")
if (is.character(dosage))
{
cat(" dosage compression: ", compress.geno, ", from [",
dosage, "]\n", sep="")
} else {
cat(" genotype compression: ", compress.geno, "\n", sep="")
}
cat(" annotation compression: ", compress.annotation, "\n", sep="")
}

# create GDS file
gfile <- createfn.gds(out.gdsfn)
# close the file at the end
on.exit({
closefn.gds(gfile)
if (verbose)
cat("Done.\n", date(), "\n", sep="")
if (optimize)
{
if (verbose) cat("Optimize the access efficiency ...\n")
cleanup.gds(out.gdsfn, verbose=verbose)
if (verbose) cat(date(), "\n", sep="")
}
if (!is.null(gfile)) closefn.gds(gfile)
}, add=TRUE)

# add a flag
put.attr.gdsn(gfile$root, "FileFormat", "SNP_ARRAY")
if (!is.character(dosage))
{
put.attr.gdsn(gfile$root, "FileFormat", "SNP_ARRAY")
} else {
put.attr.gdsn(gfile$root, "FileFormat", "IMPUTED_DOSAGE")
}

# add "sample.id"
sampid <- seqGetData(gdsfile, "sample.id")
Expand Down Expand Up @@ -402,13 +418,34 @@ seqGDS2SNP <- function(gdsfile, out.gdsfn,
compress=compress.annotation, closezip=TRUE)

# add "genotype"
gGeno <- add.gdsn(gfile, "genotype", storage="bit2",
valdim=c(length(sampid), 0L), compress=compress.geno)
put.attr.gdsn(gGeno, "sample.order")
if (!is.character(dosage))
{
gGeno <- add.gdsn(gfile, "genotype", storage="bit2",
valdim=c(length(sampid), 0L), compress=compress.geno)
put.attr.gdsn(gGeno, "sample.order")
seqApply(gdsfile, "$dosage", as.is=gGeno, .useraw=TRUE, .progress=verbose,
FUN = .cfunction("FC_GDS2SNP"))
} else {
.cfunction("FC_SetNumSamp")(length(sampid))
gGeno <- add.gdsn(gfile, "genotype", storage="float32",
valdim=c(length(sampid), 0L), compress=compress.geno)
put.attr.gdsn(gGeno, "sample.order")
seqApply(gdsfile, dosage, as.is=gGeno, .progress=verbose,
FUN = .cfunction("FC_GDS2Dosage"))
}

seqApply(gdsfile, "$dosage", as.is=gGeno, .useraw=TRUE, .progress=verbose,
FUN = .cfunction("FC_GDS2SNP"))
readmode.gdsn(gGeno)
closefn.gds(gfile)
gfile <- NULL
if (verbose)
cat("Done.\n", date(), "\n", sep="")

if (optimize)
{
if (verbose) cat("Optimize the access efficiency ...\n")
cleanup.gds(out.gdsfn, verbose=verbose)
if (verbose) cat(date(), "\n", sep="")
}

# output
invisible(normalizePath(out.gdsfn))
Expand Down
5 changes: 4 additions & 1 deletion man/seqGDS2SNP.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@
Converts a SeqArray GDS file to a SNP GDS file.
}
\usage{
seqGDS2SNP(gdsfile, out.gdsfn, compress.geno="ZIP_RA",
seqGDS2SNP(gdsfile, out.gdsfn, dosage=FALSE, compress.geno="LZMA_RA",
compress.annotation="LZMA_RA", optimize=TRUE, verbose=TRUE)
}
\arguments{
\item{gdsfile}{character (GDS file name), or
a \code{\link{SeqVarGDSClass}} object}
\item{out.gdsfn}{the file name, output a file of VCF format}
\item{dosage}{a logical value, or characters for the variable name of
dosage in the SeqArray file; if \code{FALSE} exports genotypes,
otherwise exports dosages}
\item{compress.geno}{the compression method for "genotype"; optional
values are defined in the function \code{add.gdsn}}
\item{compress.annotation}{the compression method for the GDS variables,
Expand Down
34 changes: 33 additions & 1 deletion src/ConvToGDS.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// ConvToGDS.cpp: format conversion
//
// Copyright (C) 2015-2017 Xiuwen Zheng
// Copyright (C) 2015-2018 Xiuwen Zheng
//
// This file is part of SeqArray.
//
Expand Down Expand Up @@ -288,4 +288,36 @@ COREARRAY_DLL_EXPORT SEXP FC_GDS2SNP(SEXP geno)
return geno;
}


// ======================================================================
// SeqArray GDS --> Dosage GDS
// ======================================================================

static int FC_Num_Sample = 0;

COREARRAY_DLL_EXPORT SEXP FC_SetNumSamp(SEXP num)
{
FC_Num_Sample = Rf_asInteger(num);
return R_NilValue;
}

COREARRAY_DLL_EXPORT SEXP FC_GDS2Dosage(SEXP dosage)
{
int n = LENGTH(dosage);
if (n < FC_Num_Sample)
{
dosage = NEW_NUMERIC(FC_Num_Sample);
double *dst = REAL(dosage);
for (int i=0; i < FC_Num_Sample; i++)
dst[i] = R_NaN;
} else if (n > FC_Num_Sample)
{
double *src = REAL(dosage);
dosage = NEW_NUMERIC(FC_Num_Sample);
double *dst = REAL(dosage);
memcpy(dst, src, sizeof(double)*FC_Num_Sample);
}
return dosage;
}

} // extern "C"

0 comments on commit 1af80d7

Please # to comment.