From f108dd3faa97996f2e0b456e3cc2e99921f036e1 Mon Sep 17 00:00:00 2001 From: dmmiller612 Date: Tue, 7 Nov 2017 07:37:48 -0500 Subject: [PATCH 1/3] Maven stuff --- pom.xml | 65 +++++++++++++++++++ .../com/lifeomic/variants/DefaultSource.scala | 17 +++++ .../com/lifeomic/variants/VCFFunctions.scala | 14 ++++ .../variants/VCFResourceRelation.scala | 14 +++- 4 files changed, 109 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 78c022c..0bed629 100644 --- a/pom.xml +++ b/pom.xml @@ -37,6 +37,17 @@ 2.6.5 + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + + ossrh + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + org.scala-lang @@ -164,6 +175,60 @@ + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.7 + true + + ossrh + https://oss.sonatype.org/ + true + + + + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + attach-sources + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9.1 + + + attach-javadocs + + jar + + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.5 + + + sign-artifacts + verify + + sign + + + + + diff --git a/src/main/scala/com/lifeomic/variants/DefaultSource.scala b/src/main/scala/com/lifeomic/variants/DefaultSource.scala index 999a176..423ada1 100644 --- a/src/main/scala/com/lifeomic/variants/DefaultSource.scala +++ b/src/main/scala/com/lifeomic/variants/DefaultSource.scala @@ -28,12 +28,29 @@ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType +/** + * Default source for spark-vcf. + * To use this default source, you will need to register it in spark with spark.read.format("com.lifeomic.variants") + */ class DefaultSource extends RelationProvider with SchemaRelationProvider { + /** + * Creates relation + * @param sqlContext spark sql context + * @param parameters parameters for job + * @return + */ override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { createRelation(sqlContext, parameters, null) } + /** + * Creates relation with user schema + * @param sqlContext spark sql context + * @param parameters parameters for job + * @param schema user defined schema + * @return + */ override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = { createPrivateRelation(sqlContext, parameters) } diff --git a/src/main/scala/com/lifeomic/variants/VCFFunctions.scala b/src/main/scala/com/lifeomic/variants/VCFFunctions.scala index 7a061de..eca5aad 100644 --- a/src/main/scala/com/lifeomic/variants/VCFFunctions.scala +++ b/src/main/scala/com/lifeomic/variants/VCFFunctions.scala @@ -4,6 +4,11 @@ import com.lifeomic.variants.VCFConstants._ object VCFFunctions { + /** + * Returns a meta row of the key, value and number + * @param t formatType + * @return + */ def metaHandler(t: String) : (String) => (String, (String, String)) = (item: String) => { val z = item.replace("<", "").replace(t, "") val filtered = z.split(",").filter(item => item.startsWith(ID) || item.startsWith(TYPE) || item.startsWith(NUMBER)) @@ -26,6 +31,15 @@ object VCFFunctions { (key, (value, number)) } + /** + * Extends fields for format and info columns + * @param mapFlag should use map or not + * @param map parameter map + * @param schFields column fields + * @param start start index + * @param end end index + * @return + */ def fieldsExtended(mapFlag: Boolean, map: Map[String, String], schFields: Array[(String, String, String)], diff --git a/src/main/scala/com/lifeomic/variants/VCFResourceRelation.scala b/src/main/scala/com/lifeomic/variants/VCFResourceRelation.scala index b644631..6ebe27c 100644 --- a/src/main/scala/com/lifeomic/variants/VCFResourceRelation.scala +++ b/src/main/scala/com/lifeomic/variants/VCFResourceRelation.scala @@ -32,7 +32,15 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.functions._ import com.lifeomic.variants.VCFConstants._ - +/** + * Spark vcf resource relation + * @param sqlContext Spark sql context + * @param path path of vcf file(s) + * @param useFormatTypes Type checking for formats, plus casting types + * @param useFormatAsMap Use the format column as a map + * @param useAnnotationTypes Type casting for info fields + * @param useAnnotationAsMap use annotations as a map + */ class VCFResourceRelation( override val sqlContext: SQLContext, path: String, @@ -93,6 +101,10 @@ class VCFResourceRelation( */ override val schema: StructType = inferSchema() + /** + * Runs the vcf queries and converts them to an rdd of rows + * @return rdd of a spark sql row + */ override def buildScan(): RDD[Row] = { val schFields = schema.fields.map(item => (item.name, item.dataType.typeName, item.dataType.sql.toLowerCase)) val annotateCount = annotationCount From a07c71ddf15a13e1f9e3679a2fe0275cb3ce200c Mon Sep 17 00:00:00 2001 From: dmmiller612 Date: Tue, 7 Nov 2017 21:59:51 -0500 Subject: [PATCH 2/3] fixed pom issues, deployed to maven central --- README.md | 14 +++- pom.xml | 78 ++++++++++++------- .../com/lifeomic/variants/DefaultSource.scala | 8 +- .../variants/VCFResourceRelation.scala | 2 +- 4 files changed, 62 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 1a767af..c038191 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,22 @@ Spark VCF data source implementation in native spark without Hadoop-bam. # Introduction Spark VCF allows you to natively load VCFs into an Apache Spark Dataframe/Dataset. To get started with Spark-VCF, you can -clone or download this repository, then run `mvn package` and use the jar. In the very near future, spark-vcf will -be added to Maven Central. +clone or download this repository, then run `mvn package` and use the jar. We are also now in Maven central. Since spark-vcf is written specifically for Spark, it comes with large performance gains over frameworks like ADAM. # Getting Started +To install spark vcf, add the following to your pom: + +``` + + com.lifeomic + spark-vcf + 0.1.0 + +``` + Getting started with Spark VCF is as simple as: ```scala @@ -64,7 +73,6 @@ val sparkConf = new SparkConf() ``` # TODO -* put release in Maven Central * Provide performance benchmarks compared to ADAM * Get Travis CI set up diff --git a/pom.xml b/pom.xml index 0bed629..345aa45 100644 --- a/pom.xml +++ b/pom.xml @@ -4,9 +4,9 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - spark-vcf + com.lifeomic spark-vcf - 0.1.0-SNAPSHOT + 0.1.0 jar Spark VCF @@ -28,6 +28,15 @@ master + + + Derek Miller + derek.miller@lifeomic.com + Lifeomic + https://lifeomic.com + + + 1.8 2.11.8 @@ -110,11 +119,12 @@ 7.9.0 - + spark-vcf + org.apache.maven.plugins @@ -126,6 +136,34 @@ + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + attach-sources + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.0.0-M1 + + + attach-javadocs + + jar + + + + + org.codehaus.mojo build-helper-maven-plugin @@ -169,6 +207,13 @@ testCompile + + attach-scaladocs + package + + doc-jar + + ${scala.version} @@ -187,33 +232,6 @@ - - org.apache.maven.plugins - maven-source-plugin - 2.2.1 - - - attach-sources - - jar-no-fork - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.9.1 - - - attach-javadocs - - jar - - - - - org.apache.maven.plugins maven-gpg-plugin diff --git a/src/main/scala/com/lifeomic/variants/DefaultSource.scala b/src/main/scala/com/lifeomic/variants/DefaultSource.scala index 423ada1..fa324cb 100644 --- a/src/main/scala/com/lifeomic/variants/DefaultSource.scala +++ b/src/main/scala/com/lifeomic/variants/DefaultSource.scala @@ -28,17 +28,13 @@ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType -/** - * Default source for spark-vcf. - * To use this default source, you will need to register it in spark with spark.read.format("com.lifeomic.variants") - */ class DefaultSource extends RelationProvider with SchemaRelationProvider { /** * Creates relation * @param sqlContext spark sql context * @param parameters parameters for job - * @return + * @return Base relation */ override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { createRelation(sqlContext, parameters, null) @@ -49,7 +45,7 @@ class DefaultSource extends RelationProvider with SchemaRelationProvider { * @param sqlContext spark sql context * @param parameters parameters for job * @param schema user defined schema - * @return + * @return Base relation */ override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = { createPrivateRelation(sqlContext, parameters) diff --git a/src/main/scala/com/lifeomic/variants/VCFResourceRelation.scala b/src/main/scala/com/lifeomic/variants/VCFResourceRelation.scala index 6ebe27c..ee10535 100644 --- a/src/main/scala/com/lifeomic/variants/VCFResourceRelation.scala +++ b/src/main/scala/com/lifeomic/variants/VCFResourceRelation.scala @@ -84,7 +84,7 @@ class VCFResourceRelation( private var annotationCount = 1 - /** + /* * order is * 0. chromosome * 1. position From 8bcf8bdc70b87595a62752153006fb501cb58c08 Mon Sep 17 00:00:00 2001 From: dmmiller612 Date: Tue, 7 Nov 2017 22:00:22 -0500 Subject: [PATCH 3/3] bumped pom --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 345aa45..437ddfc 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 com.lifeomic spark-vcf - 0.1.0 + 0.2.0-SNAPSHOT jar Spark VCF