
Merge pull request #3 from lifeomic/javadoc
Javadoc
dmmiller612 authored Nov 8, 2017
2 parents 29dca63 + 9e0730f commit e0555f9
Showing 5 changed files with 138 additions and 9 deletions.
14 changes: 11 additions & 3 deletions README.md
@@ -4,13 +4,22 @@ Spark VCF data source implementation in native spark without Hadoop-bam.
# Introduction

Spark VCF allows you to natively load VCFs into an Apache Spark DataFrame/Dataset. To get started with Spark-VCF, you can
-clone or download this repository, then run `mvn package` and use the jar. In the very near future, spark-vcf will
-be added to Maven Central.
+clone or download this repository, then run `mvn package` and use the jar. We are also now in Maven Central.

Since spark-vcf is written specifically for Spark, it comes with large performance gains over frameworks like ADAM.

# Getting Started

To install spark-vcf, add the following to your pom:

```
<dependency>
    <groupId>com.lifeomic</groupId>
    <artifactId>spark-vcf</artifactId>
    <version>0.1.0</version>
</dependency>
```
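
If you build with sbt instead of Maven, the equivalent coordinate should be the following (an assumption: the bare `spark-vcf` artifactId above suggests the jar is published without a Scala-version suffix, hence `%` rather than `%%`):

```scala
libraryDependencies += "com.lifeomic" % "spark-vcf" % "0.1.0"
```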

Getting started with Spark VCF is as simple as:

```scala
...
```

@@ -64,7 +73,6 @@

```scala
val sparkConf = new SparkConf()
```
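
Both snippets above are collapsed in this diff view. As a minimal, hedged sketch of the intended usage — assuming the data source is addressed by the package containing `DefaultSource` (`com.lifeomic.variants`, see below) and that the path is illustrative:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("spark-vcf-example")
  .getOrCreate()

// Load a VCF into a DataFrame via the spark-vcf data source.
val vcf = spark.read
  .format("com.lifeomic.variants") // assumed from the DefaultSource package
  .load("path/to/example.vcf")     // hypothetical path

vcf.printSchema()
```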

# TODO
-* put release in Maven Central
* Provide performance benchmarks compared to ADAM
* Get Travis CI set up

89 changes: 86 additions & 3 deletions pom.xml
@@ -4,9 +4,9 @@
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>
-    <groupId>spark-vcf</groupId>
+    <groupId>com.lifeomic</groupId>
    <artifactId>spark-vcf</artifactId>
-    <version>0.1.0-SNAPSHOT</version>
+    <version>0.2.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>Spark VCF</name>
@@ -28,6 +28,15 @@
        <tag>master</tag>
    </scm>

    <developers>
        <developer>
            <name>Derek Miller</name>
            <email>derek.miller@lifeomic.com</email>
            <organization>Lifeomic</organization>
            <organizationUrl>https://lifeomic.com</organizationUrl>
        </developer>
    </developers>

    <properties>
        <java.version>1.8</java.version>
        <scala.version>2.11.8</scala.version>
@@ -37,6 +46,17 @@
        <jackson.version>2.6.5</jackson.version>
    </properties>

    <distributionManagement>
        <snapshotRepository>
            <id>ossrh</id>
            <url>https://oss.sonatype.org/content/repositories/snapshots</url>
        </snapshotRepository>
        <repository>
            <id>ossrh</id>
            <url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
        </repository>
    </distributionManagement>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
@@ -99,11 +119,12 @@
            <version>7.9.0</version>
        </dependency>


    </dependencies>

    <build>

        <finalName>spark-vcf</finalName>

        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
@@ -115,6 +136,34 @@
                </configuration>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <version>2.2.1</version>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <goals>
                            <goal>jar-no-fork</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-javadoc-plugin</artifactId>
                <version>3.0.0-M1</version>
                <executions>
                    <execution>
                        <id>attach-javadocs</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>build-helper-maven-plugin</artifactId>
@@ -158,12 +207,46 @@
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>attach-scaladocs</id>
                        <phase>package</phase>
                        <goals>
                            <goal>doc-jar</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>

            <plugin>
                <groupId>org.sonatype.plugins</groupId>
                <artifactId>nexus-staging-maven-plugin</artifactId>
                <version>1.6.7</version>
                <extensions>true</extensions>
                <configuration>
                    <serverId>ossrh</serverId>
                    <nexusUrl>https://oss.sonatype.org/</nexusUrl>
                    <autoReleaseAfterClose>true</autoReleaseAfterClose>
                </configuration>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-gpg-plugin</artifactId>
                <version>1.5</version>
                <executions>
                    <execution>
                        <id>sign-artifacts</id>
                        <phase>verify</phase>
                        <goals>
                            <goal>sign</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

        </plugins>

    </build>
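
Taken together, the added plugins wire up an OSSRH release flow: source and javadoc/scaladoc jars are attached during `package`, artifacts are GPG-signed during `verify`, and the nexus-staging plugin closes and auto-releases the staging repository. Assuming a GPG key and an `ossrh` server entry in `settings.xml` (neither is shown in this diff), a release would then be cut with a plain `mvn clean deploy`.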
13 changes: 13 additions & 0 deletions src/main/scala/com/lifeomic/variants/DefaultSource.scala
@@ -30,10 +30,23 @@ import org.apache.spark.sql.types.StructType

class DefaultSource extends RelationProvider with SchemaRelationProvider {

    /**
     * Creates a relation
     * @param sqlContext Spark SQL context
     * @param parameters parameters for the job
     * @return base relation
     */
    override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
        createRelation(sqlContext, parameters, null)
    }

    /**
     * Creates a relation with a user-supplied schema
     * @param sqlContext Spark SQL context
     * @param parameters parameters for the job
     * @param schema user-defined schema
     * @return base relation
     */
    override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = {
        createPrivateRelation(sqlContext, parameters)
    }
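
Given a `SparkSession` named `spark`, a hedged sketch of how these two overloads are reached (the format name is assumed from the package; the schema and path are hypothetical):

```scala
import org.apache.spark.sql.types._

// Hypothetical user schema; field names are illustrative only.
val customSchema = StructType(Seq(
  StructField("chrom", StringType),
  StructField("pos", IntegerType)
))

// Without .schema(...), Spark invokes the RelationProvider overload.
val df = spark.read
  .format("com.lifeomic.variants")
  .load("path/to/example.vcf")

// With .schema(...), Spark invokes the SchemaRelationProvider overload.
// Note that, as the diff shows, the supplied schema is not forwarded
// to createPrivateRelation.
val dfWithSchema = spark.read
  .format("com.lifeomic.variants")
  .schema(customSchema)
  .load("path/to/example.vcf")
```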
14 changes: 14 additions & 0 deletions src/main/scala/com/lifeomic/variants/VCFFunctions.scala
@@ -4,6 +4,11 @@ import com.lifeomic.variants.VCFConstants._

object VCFFunctions {

    /**
     * Returns a meta row of the key, value and number
     * @param t header prefix to strip, e.g. "##FORMAT=" or "##INFO="
     * @return function from a raw header line to (id, (type, number))
     */
    def metaHandler(t: String) : (String) => (String, (String, String)) = (item: String) => {
        val z = item.replace("<", "").replace(t, "")
        val filtered = z.split(",").filter(item => item.startsWith(ID) || item.startsWith(TYPE) || item.startsWith(NUMBER))
@@ -26,6 +31,15 @@
        (key, (value, number))
    }
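
To illustrate the return shape (a sketch — the middle of the function is collapsed above, so the exact parsing of ID/Type/Number is assumed from the javadoc and the tuple returned on the last line):

```scala
import com.lifeomic.variants.VCFFunctions

// "##INFO=" is the variant actually used in VCFResourceRelation below;
// the "##FORMAT=" prefix here is assumed to behave symmetrically.
val handler = VCFFunctions.metaHandler("##FORMAT=")

val line = "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"

// Assumed result: the ID keyed to its Type and Number, i.e.
// ("GT", ("String", "1"))
val (id, (vcfType, number)) = handler(line)
```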

    /**
     * Extends fields for format and info columns
     * @param mapFlag whether to return the column as a map
     * @param map parameter map
     * @param schFields column fields as (name, type name, sql type) triples
     * @param start start index
     * @param end end index
     * @return
     */
    def fieldsExtended(mapFlag: Boolean,
                       map: Map[String, String],
                       schFields: Array[(String, String, String)],
17 changes: 14 additions & 3 deletions src/main/scala/com/lifeomic/variants/VCFResourceRelation.scala
@@ -32,7 +32,15 @@ import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import com.lifeomic.variants.VCFConstants._


/**
 * Spark VCF resource relation
 * @param sqlContext Spark SQL context
 * @param path path of the VCF file(s)
 * @param useFormatTypes type checking for formats, plus type casting
 * @param useFormatAsMap use the format column as a map
 * @param useAnnotationTypes type casting for info fields
 * @param useAnnotationAsMap use annotations as a map
 */
class VCFResourceRelation(
    override val sqlContext: SQLContext,
    path: String,
@@ -75,8 +83,7 @@
    private val annotations = vcf.filter(col(TEXT_VALUE).startsWith("##INFO")).map(_.getString(1)).rdd.map(VCFFunctions.metaHandler("##INFO="))
    private var annotationCount = 1


-    /**
+    /*
     * order is
     * 0. chromosome
     * 1. position
@@ -93,6 +100,10 @@
     */
    override val schema: StructType = inferSchema()

    /**
     * Runs the VCF queries and converts them to an RDD of rows
     * @return RDD of Spark SQL Row objects
     */
    override def buildScan(): RDD[Row] = {
        val schFields = schema.fields.map(item => (item.name, item.dataType.typeName, item.dataType.sql.toLowerCase))
        val annotateCount = annotationCount
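
For a sense of what the `schFields` triples look like, here is a self-contained sketch (field names are hypothetical):

```scala
import org.apache.spark.sql.types._

val exampleSchema = StructType(Seq(
  StructField("chrom", StringType),
  StructField("pos", IntegerType)
))

// (name, typeName, sql-type) triples, as computed in buildScan:
val schFields = exampleSchema.fields.map(item =>
  (item.name, item.dataType.typeName, item.dataType.sql.toLowerCase))
// Array(("chrom", "string", "string"), ("pos", "integer", "int"))
```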
