diff --git a/.github/workflows/quneys-dsl-githubactions.yml b/.github/workflows/quenya-dsl-githubactions.yml similarity index 95% rename from .github/workflows/quneys-dsl-githubactions.yml rename to .github/workflows/quenya-dsl-githubactions.yml index 822860b..7e7447e 100644 --- a/.github/workflows/quneys-dsl-githubactions.yml +++ b/.github/workflows/quenya-dsl-githubactions.yml @@ -28,8 +28,8 @@ jobs: cache: sbt - name: Build and test scala version run: | - PGPASSWORD="postgres" psql -c 'create database almaren;' -U postgres -h localhost - PGPASSWORD="postgres" psql -c "ALTER USER postgres PASSWORD 'foo' ;" -U postgres -h localhost + PGPASSWORD="postgres" psql -c 'create database almaren;' -U postgres -h localhost + PGPASSWORD="postgres" psql -c "ALTER USER postgres PASSWORD 'postgres' ;" -U postgres -h localhost PGPASSWORD="postgres" psql -c 'create role runner;' -U postgres -h localhost PGPASSWORD="postgres" psql -c 'ALTER ROLE "runner" WITH LOGIN SUPERUSER INHERIT CREATEDB CREATEROLE REPLICATION;' -U postgres -h localhost sbt ++2.12.10 test diff --git a/README.md b/README.md index 8515aab..cdcf3a8 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,24 @@ -# Quenya DSL +# Quenya-DSL -[![Build Status](https://github.com/music-of-the-ainur/quenya-dsl/actions/workflows/quneys-dsl-githubactions.yml/badge.svg)](https://github.com/music-of-the-ainur/quenya-dsl/actions/workflows/quneys-dsl-githubactions.yml) +[![Build Status](https://github.com/music-of-the-ainur/quenya-dsl/actions/workflows/quenya-dsl-githubactions.yml/badge.svg)](https://github.com/music-of-the-ainur/quenya-dsl/actions/workflows/quenya-dsl-githubactions.yml) -Adding Quenya DSL dependency to your sbt build: +Adding Quenya-DSL dependency to your sbt build: ``` -libraryDependencies += "com.github.music-of-the-ainur" %% "quenya-dsl" % "1.2.2-3.3" +libraryDependencies += "com.github.music-of-the-ainur" %% "quenya-dsl" % "1.2.2-$SPARK_VERSION" ``` To run in spark-shell: ``` -spark-shell --packages "com.github.music-of-the-ainur:quenya-dsl_2.13:1.2.2-3.3" +spark-shell --packages "com.github.music-of-the-ainur:quenya-dsl_2.12:1.2.2-$SPARK_VERSION" ``` -### Connector Usage -#### Maven / Ivy Package Usage -The connector is also 
available from the -[Maven Central](https://mvnrepository.com/artifact/com.github.music-of-the-ainur) -repository. It can be used using the `--packages` option or the -`spark.jars.packages` configuration property. Use the following value +Quenya-DSL is available in [Maven Central](https://mvnrepository.com/artifact/com.github.music-of-the-ainur) +repository. -| version | Connector Artifact | +| versions | Connector Artifact | |----------------------------|-----------------------------------------------------------| | Spark 3.3.x and scala 2.13 | `com.github.music-of-the-ainur:quenya-dsl_2.13:1.2.2-3.3` | | Spark 3.3.x and scala 2.12 | `com.github.music-of-the-ainur:quenya-dsl_2.12:1.2.2-3.3` | @@ -32,7 +28,7 @@ repository. It can be used using the `--packages` option or the | Spark 2.4.x and scala 2.11 | `com.github.music-of-the-ainur:quenya-dsl_2.11:1.2.2-2.4` | ## Introduction -Quenya DSL(Domain Specific Language) is a language that simplifies the task to parser complex semi-structured data. +Quenya-DSL (Domain Specific Language) is a language that simplifies the task of parsing complex semi-structured data. ```scala @@ -155,7 +151,7 @@ Output: ## DSL Generator -You can generate a DSL based on a DataFrame: +You can generate the DSL from an existing DataFrame: ```scala import com.github.music.of.the.ainur.quenya.QuenyaDSL @@ -165,6 +161,18 @@ val quenyaDsl = QuenyaDSL quenyaDsl.printDsl(df) ``` +### getDsl +You can generate and assign a DSL to a variable based on a DataFrame: + +```scala +import com.github.music.of.the.ainur.quenya.QuenyaDSL + +val df:DataFrame = ... +val quenyaDsl = QuenyaDSL +val dsl = quenyaDsl.getDsl(df) +``` + + json: ``` { @@ -201,6 +209,50 @@ weapon@weapon You can _alias_ using the fully qualified name using ```printDsl(df,true)```, you should turn on in case of name conflict. +## How to Handle Special Characters + + + +Use the literal backtick **``** to handle special characters like space, semicolon, hyphen and colon. 
+Example: + + + +json: +``` +{ + "name":{ + "name One":"Mithrandir", + "Last-Name":"Olórin", + "nick:Names":[ + "Gandalf the Grey", + "Gandalf the White" + ] + }, + "race":"Maiar", + "age":"immortal", + "weapon;name":[ + "Glamdring", + "Narya", + "Wizard Staff" + ] +} +``` + + + +DSL: +``` +age$age:StringType +`name.Last-Name`$`Last-Name`:StringType +`name.name One`$`name-One`:StringType +`name.nick:Names`@`nick:Names` + `nick:Names`$`nick:Names`:StringType +race$race:StringType +`weapon;name`@`weapon;name` + `weapon;name`$`weapon_name`:StringType +``` + ## Backus–Naur form ``` @@ -216,14 +268,6 @@ You can _alias_ using the fully qualified name using ```printDsl(df,true)```, yo | DoubleType | FloatType | ByteType | IntegerType | LongType | ShortType ``` -## Requirements - -| Software | Version | -|--------------|-----------| -| Java | 8 | -| Scala | 2.11/2.12 | -| Apache Spark | 2.4 | - ## Author Daniel Mantovani [daniel.mantovani@modak.com](mailto:daniel.mantovani@modak.com) diff --git a/build.sbt b/build.sbt index 5617c48..276290d 100644 --- a/build.sbt +++ b/build.sbt @@ -32,6 +32,12 @@ ThisBuild / developers := List( name = "Daniel Mantovani", email = "daniel.mantovani@modakanalytics.com", url = url("https://github.com/music-of-the-ainur") + ), + Developer( + id = "ChinthapallyAkanksha", + name = "Akanksha Chinthapally", + email = "akanksha.chinthapally@modak.com", + url = url("https://github.com/music-of-the-ainur") ) ) diff --git a/src/main/scala/com/github/music/of/the/ainur/quenya/compiler/CombinatorParser.scala b/src/main/scala/com/github/music/of/the/ainur/quenya/compiler/CombinatorParser.scala index b89a2ee..efe7581 100644 --- a/src/main/scala/com/github/music/of/the/ainur/quenya/compiler/CombinatorParser.scala +++ b/src/main/scala/com/github/music/of/the/ainur/quenya/compiler/CombinatorParser.scala @@ -6,7 +6,6 @@ import scala.util.parsing.combinator._ /* Spark DSL Backus-Naur form. 
- ::= \{"[\r\n]*".r \} ::= "[\s\t]*".r ::= "a-zA-Z0-9_.".r [ element ] @@ -15,9 +14,8 @@ import scala.util.parsing.combinator._ <@> ::= @ <$> ::= $ : ::= "0-9a-zA-Z_".r - ::= BinaryType | BooleanType | StringType | TimestampType | DecimalType + ::= BinaryType | BooleanType | StringType | TimestampType | DecimalType | DoubleType | FloatType | ByteType | IntegerType | LongType | ShortType - */ private[quenya] trait CombinatorParser { val parser = ParserQuenyaDsl @@ -30,9 +28,16 @@ private[quenya] trait CombinatorParser { } } -object ParserQuenyaDsl extends JavaTokenParsers { +trait ParserUtil { + def removeLiteral(content: String, literal: String): String = { + if (content.head.toString == literal && content.last.toString == literal) + content.substring(1, content.length - 1) + else content + } +} +object ParserQuenyaDsl extends JavaTokenParsers with ParserUtil { override val skipWhitespace = false - + def dsl: Parser[List[Statement]] = repsep(expression,"""[\n\r]*""".r) ^^ (List() ++ _ ) def expression: Parser[Statement] = precedence ~ col ~ operator ^^ { case prec ~ cl ~ op => @@ -45,16 +50,21 @@ object ParserQuenyaDsl extends JavaTokenParsers { case al:String => Statement(prec,cl,AT,al) } } - def precedence: Parser[Int] = """[\t\s]*""".r ^^ (prec => prec.replaceAll(" ","\t").count(_ == '\t')) - def col: Parser[StateSelect] = """[0-9A-Za-z._]+""".r ~ opt(element) ^^ { - case a ~ Some(b) => StateSelect(a,b) - case a ~ None => StateSelect(a,None) + def precedence: Parser[Int] = """^[\t\s]*""".r ^^ (prec => prec.replaceAll(" ","\t").count(_ == '\t')) + def col: Parser[StateSelect] = """[\w.]+|`[\w. 
\-:;$]+`""".r ~ opt(element) ^^ { + case a ~ Some(b) => createStateSelect(a,b) + case a ~ None => createStateSelect(a,None) } + private def createStateSelect(name: String, element: Option[String]): StateSelect = + StateSelect(removeLiteral(name,"`"),element) + def element: Parser[Option[String]] = "[" ~> opt("""\d+""".r) <~ "]" def operator: Parser[Any] = at | dollar def at: Parser[String] = "@" ~> alias def dollar : Parser[Any] = "$" ~> alias ~ opt(":") ~ datatype - def alias : Parser[String] = "[0-9a-zA-Z_]+".r + def alias : Parser[String] = """\w+|`[\w \-:;$]+`""".r ^^ { + alias => removeLiteral(alias,"`") + } def datatype : Parser[Option[DataType]] = ("BinaryType" ^^ (dt => Some(BinaryType)) | "FloatType" ^^ (dt => Some(FloatType)) | "ByteType" ^^ (dt => Some(ByteType)) @@ -67,4 +77,4 @@ object ParserQuenyaDsl extends JavaTokenParsers { | "DoubleType" ^^ (dt => Some(DoubleType)) | "ShortType" ^^ (dt => Some(ShortType)) | "" ^^ (dt => None)) -} +} \ No newline at end of file diff --git a/src/test/resources/data.csv b/src/test/resources/data.csv index 886052d..e39b9dd 100644 --- a/src/test/resources/data.csv +++ b/src/test/resources/data.csv @@ -1,10 +1,10 @@ age,LastName,nameOne,nickNames,race,weapon -3500,Olórin,Mithrandir,Gandalf the Grey,Maiar,Glamdring -3500,Olórin,Mithrandir,Gandalf the Grey,Maiar,Narya -3500,Olórin,Mithrandir,Gandalf the Grey,Maiar,Wizard Staff +3500,Olórin,Mithrandir,Gandalf the Grey,Maiar,Glamdring +3500,Olórin,Mithrandir,Gandalf the Grey,Maiar,Narya +3500,Olórin,Mithrandir,Gandalf the Grey,Maiar,Wizard Staff 4500,"",Ilmarë,"",Ainur,Powers of the Ainur 3500,"",Morgoth,Bauglir,Ainur,Powers of the Ainur 3500,"",Morgoth,Bauglir,Ainur,Grond 3500,"",Morgoth,Bauglir,Ainur,Mace 3500,"",Morgoth,Bauglir,Ainur,Sword -3500,"",Manwë,"King of Arda,",Ainur,Powers of the Ainur +3500,"",Manwë,"King of Arda,",Ainur,Powers of the Ainur \ No newline at end of file diff --git a/src/test/resources/data.json b/src/test/resources/data.json new file mode 100644 
index 0000000..63f1191 --- /dev/null +++ b/src/test/resources/data.json @@ -0,0 +1,50 @@ +{ + "Coffee": { + "sub region": [ + { + "id": 1, + "full name": "John Doe" + }, + { + "id": 2, + "name": "Don Joeh" + } + ], + "country": { + "id": 2, + "company": "ACME" + } + }, + "brewing": { + "sub-region": [ + { + "id": 1, + "name": "John Doe" + }, + { + "id": 2, + "name": "Don Joeh" + } + ], + "world:country": { + "id": 2, + "company": "ACME" + } + }, + "brewing2": { + "sub;region": [ + { + "id": 1, + "name": "John Doe" + }, + { + "id": 2, + "name": "Don Joeh" + } + ], + "world;country": { + "id": 2, + "company": "ACME" + } + } +} \ No newline at end of file diff --git a/src/test/resources/data/complexData.parquet/._SUCCESS.crc b/src/test/resources/data/complexData.parquet/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/src/test/resources/data/complexData.parquet/._SUCCESS.crc differ diff --git a/src/test/resources/data/complexData.parquet/.part-00000-f0e17416-ae5a-4295-9514-c1c7010257da-c000.snappy.parquet.crc b/src/test/resources/data/complexData.parquet/.part-00000-f0e17416-ae5a-4295-9514-c1c7010257da-c000.snappy.parquet.crc new file mode 100644 index 0000000..10efee3 Binary files /dev/null and b/src/test/resources/data/complexData.parquet/.part-00000-f0e17416-ae5a-4295-9514-c1c7010257da-c000.snappy.parquet.crc differ diff --git a/src/test/resources/data/complexData.parquet/_SUCCESS b/src/test/resources/data/complexData.parquet/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/src/test/resources/data/complexData.parquet/part-00000-f0e17416-ae5a-4295-9514-c1c7010257da-c000.snappy.parquet b/src/test/resources/data/complexData.parquet/part-00000-f0e17416-ae5a-4295-9514-c1c7010257da-c000.snappy.parquet new file mode 100644 index 0000000..4a93a07 Binary files /dev/null and b/src/test/resources/data/complexData.parquet/part-00000-f0e17416-ae5a-4295-9514-c1c7010257da-c000.snappy.parquet differ diff --git 
a/src/test/scala/com/github/music/of/the/ainur/quenya/Test.scala b/src/test/scala/com/github/music/of/the/ainur/quenya/Test.scala index 225d6f9..c90ba41 100644 --- a/src/test/scala/com/github/music/of/the/ainur/quenya/Test.scala +++ b/src/test/scala/com/github/music/of/the/ainur/quenya/Test.scala @@ -1,22 +1,25 @@ package com.github.music.of.the.ainur.quenya -import org.scalatest._ +import org.apache.spark.sql.functions.col +import org.scalatest.BeforeAndAfter +import org.apache.spark.sql.{AnalysisException, Column, DataFrame, SparkSession} import org.scalatest.funsuite.AnyFunSuite -import org.apache.spark.sql.SparkSession class Test extends AnyFunSuite with BeforeAndAfter { val spark = SparkSession.builder() .master("local[*]") .config("spark.sql.shuffle.partitions", "1") .getOrCreate() - + spark.sparkContext.setLogLevel("ERROR") + // Test case for DSL + val data = Seq( -"""{"name": {"nameOne": "Mithrandir","LastName": "Olórin","nickNames": ["Gandalf the Grey","Gandalf the White"]}, "race": "Maiar","age": 3500,"weapon": ["Glamdring", "Narya", "Wizard Staff"]}""", -"""{"name": {"nameOne": "Ilmarë","LastName": null, "nickNames": null}, "race": "Ainur","age": 4500,"weapon": ["Powers of the Ainur"]}""", -"""{"name": {"nameOne": "Morgoth","LastName": null, "nickNames": ["Bauglir","Belegurth","Belegûr","The Great Enemy","The Black Foe"]}, "race": "Ainur","age": 3500,"weapon": ["Powers of the Ainur","Grond","Mace","Sword"]}""", -"""{"name": {"nameOne": "Manwë","LastName": null, "nickNames": ["King of Arda,","Lord of the Breath of Arda","King of the Valar"]}, "race": "Ainur","age": 3500,"weapon": ["Powers of the Ainur"]}""") + """{"name": {"nameOne": "Mithrandir","LastName": "Olórin","nickNames": ["Gandalf the Grey","Gandalf the White"]}, "race": "Maiar","age": 3500,"weapon": ["Glamdring", "Narya", "Wizard Staff"]}""", + """{"name": {"nameOne": "Ilmarë","LastName": null, "nickNames": null}, "race": "Ainur","age": 4500,"weapon": ["Powers of the Ainur"]}""", + """{"name": 
{"nameOne": "Morgoth","LastName": null, "nickNames": ["Bauglir","Belegurth","Belegûr","The Great Enemy","The Black Foe"]}, "race": "Ainur","age": 3500,"weapon": ["Powers of the Ainur","Grond","Mace","Sword"]}""", + """{"name": {"nameOne": "Manwë","LastName": null, "nickNames": ["King of Arda,","Lord of the Breath of Arda","King of the Valar"]}, "race": "Ainur","age": 3500,"weapon": ["Powers of the Ainur"]}""") import spark.implicits._ @@ -24,40 +27,24 @@ class Test extends AnyFunSuite with BeforeAndAfter { val quenyaDsl = QuenyaDSL + val dsl = quenyaDsl.compile( + """ + |age$age:LongType + |name.LastName$LastName:StringType + |name.nameOne$nameOne:StringType + |name.nickNames[0]$nickNames:StringType + |race$race:StringType + |weapon@weapon + | weapon$weapon:StringType""".stripMargin) - val dsl = quenyaDsl.compile(""" - |age$age:LongType - |name.LastName$LastName:StringType - |name.nameOne$nameOne:StringType - |name.nickNames[0]$nickNames:StringType - |race$race:StringType - |weapon@weapon - | weapon$weapon:StringType""".stripMargin) - - val dslDf = quenyaDsl.execute(dsl,df) - val csvDf = spark.read.option("header","true").csv("src/test/resources/data.csv") - - val dslCount = dslDf.count() - val csvCount = csvDf.count() - - test("number of records should match") { - assert(dslCount == csvCount) - } + val dslDf = quenyaDsl.execute(dsl, df) + val csvDf = spark.read.option("header", "true").csv("src/test/resources/data.csv") - val diff = dslDf.as("dsl").join(csvDf.as("csv"), - $"dsl.age" <=> $"csv.age" && - $"dsl.LastName" <=> $"csv.LastName" && - $"dsl.nameOne" <=> $"csv.nameOne" && - $"dsl.nickNames" <=> $"csv.nickNames" && - $"dsl.race" <=> $"csv.race" && - $"dsl.weapon" <=> $"csv.weapon","leftanti").count() + test(dslDf, csvDf, "Test for Dsl") - test("data should be exactly the same") { - assert(diff == 0) - } - - - val dslMatch = """age$age:LongType + // Test case for getDsl method + val dslMatch = + """age$age:LongType name.LastName$LastName:StringType 
name.nameOne$nameOne:StringType name.nickNames@nickNames @@ -66,13 +53,83 @@ race$race:StringType weapon@weapon weapon$weapon:StringType""" - test("DSL generate") { + test("Test for DSL Generate") { assert(dslMatch == quenyaDsl.getDsl(df)) } + //Test case for Dsl containing special characters + val complexJsonDf = spark.read.option("multiLine", "true").json("src/test/resources/data.json") + + // For columns containing special characters like ;,:,- and spaces etc , need to include them in `` + + val complexDsl = quenyaDsl.compile( + """Coffee.country.company$`coffee-company`:StringType + |Coffee.country.id$`coffee-country-id`:LongType + |`Coffee.sub region`@`sub:region` + | `sub region.full name`$`sub_region_full-name`:StringType + | `sub region.id`$`sub_region-id`:LongType + | `sub region.name`$`sub$region-name`:StringType + |`brewing.sub-region`@`sub-region` + | `sub-region.id`$`brewing_sub_region:id`:LongType + | `sub-region.name`$`brewing:sub-region:name`:StringType + |`brewing.world:country.company`$company1:StringType + |`brewing.world:country.id`$id1:LongType + |`brewing2.sub;region`@`sub;region` + | `sub;region.id2`$id2:LongType + | `sub;region.name`$name2:StringType + |`brewing2.world;country.company2`$company:StringType + |`brewing2.world;country.id`$`id$f`:LongType""".stripMargin) + + val complexJsonData = quenyaDsl.execute(complexDsl, complexJsonDf) + complexJsonData.coalesce(1).write.mode("overwrite").parquet("src/test/resources/data/complexData.parquet") + + val complexDataParquet = spark.read.parquet("src/test/resources/data/complexData.parquet") + test(complexJsonData, complexDataParquet, "Test for Complex Dsl with Special Characters") + + def test(df1: DataFrame, df2: DataFrame, name: String): Unit = { + testCount(df1, df2, name) + testCompare(df1, df2, name) + } + + def testCount(df1: DataFrame, df2: DataFrame, name: String): Unit = { + val count1 = df1.count() + val count2 = df2.count() + val count3 = spark.emptyDataFrame.count() + test(s"Count 
Test:$name should match of count: ${df1.count()}") { + assert(count1 == count2) + } + test(s"Count Test:$name should not match") { + assert(count1 != count3) + } + } + + // Doesn't support nested type and we don't need it :) + def testCompare(df1: DataFrame, df2: DataFrame, name: String): Unit = { + val diff = compare(df1, df2) + test(s"Compare Test:$name should be zero") { + assert(diff == 0) + } + test(s"Compare Test:$name, should not be able to join") { + assertThrows[AnalysisException] { + compare(df2, spark.emptyDataFrame) + } + } + } + + private def compare(df1: DataFrame, df2: DataFrame): Long = { + val diff = df1.as("df1").join(df2.as("df2"), joinExpression(df1), "leftanti") + diff.show(false) + diff.count + } + + private def joinExpression(df1: DataFrame): Column = + df1.schema.fields + .map(field => col(s"df1.${field.name}") <=> col(s"df2.${field.name}")) + .reduce((col1, col2) => col1.and(col2)) + after { spark.stop() } -} +} \ No newline at end of file