-
Notifications
You must be signed in to change notification settings - Fork 123
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adds the csv-vs-parquet config file to examples and docs Adds page for workload-suite documentation Adds pages for examples
- Loading branch information
Showing
12 changed files
with
301 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
--- | ||
layout: page | ||
title: CSV vs. Parquet | ||
--- | ||
|
||
This is an experimental setup for benchmarking the performance of some | ||
simple SQL queries over the same dataset stored in CSV and Parquet. | ||
|
||
In this case, only one spark submit is needed. Within that spark-submit, several workload-suites get run serially. | ||
|
||
The first workload suite first generates data using `data-generation-kmeans`. Then a second workload in that suite | ||
picks up that dataset and writes it out as Parquet. | ||
|
||
In the second suite, four different workloads are set up in one workload block. Each of them runs a SQL query. | ||
Under the hood, spark-bench will take the two parameter lists and cross-join them to create, in this instance, four | ||
workload configurations. | ||
|
||
```hocon | ||
spark-bench = { | ||
spark-submit-config = [{ | ||
spark-home = "XXXXXXX" // PATH TO YOUR SPARK INSTALLATION | ||
spark-args = { | ||
master = "XXXXXXX" // FILL IN YOUR MASTER HERE | ||
executor-memory = "XXXXXXX" // FILL IN YOUR EXECUTOR MEMORY | ||
} | ||
conf = { | ||
// Any configuration you need for your setup goes here, like: | ||
// "spark.dynamicAllocation.enabled" = "false" | ||
} | ||
suites-parallel = false | ||
workload-suites = [ | ||
{ | ||
descr = "Generate a dataset, then take that same dataset and write it out to Parquet format" | ||
benchmark-output = "hdfs:///tmp/csv-vs-parquet/results-data-gen.csv" | ||
// We need to generate the dataset first through the data generator, then we take that dataset and convert it to Parquet. | ||
parallel = false | ||
workloads = [ | ||
{ | ||
name = "data-generation-kmeans" | ||
rows = 10000000 | ||
cols = 24 | ||
output = "hdfs:///tmp/csv-vs-parquet/kmeans-data.csv" | ||
}, | ||
{ | ||
name = "sql" | ||
query = "select * from input" | ||
input = "hdfs:///tmp/csv-vs-parquet/kmeans-data.csv" | ||
output = "hdfs:///tmp/csv-vs-parquet/kmeans-data.parquet" | ||
} | ||
] | ||
}, | ||
{ | ||
descr = "Run two different SQL queries over the dataset in two different formats" | ||
benchmark-output = "hdfs:///tmp/csv-vs-parquet/results-sql.csv" | ||
parallel = false | ||
repeat = 10 | ||
workloads = [ | ||
{ | ||
name = "sql" | ||
input = ["hdfs:///tmp/csv-vs-parquet/kmeans-data.csv", "hdfs:///tmp/csv-vs-parquet/kmeans-data.parquet"] | ||
query = ["select * from input", "select `0`, `22` from input where `0` < -0.9"] | ||
cache = false | ||
} | ||
] | ||
} | ||
] | ||
}] | ||
} | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
--- | ||
layout: page | ||
title: Minimal Configuration File | ||
--- | ||
|
||
This is an example of a really minimal but complete configuration file. This one just runs one tiny instance of the SparkPi workload. | ||
|
||
## Setting spark-home and master | ||
Spark-Bench must know the location of your Spark installation and must have master specified. | ||
You can specify these variables in your config file or in your environment | ||
|
||
## Spark variables set in config file | ||
```hocon | ||
spark-bench = { | ||
spark-submit-config = [{ | ||
spark-home = "/path/to/your/spark/installation" | ||
spark-args = { | ||
master = "XXXXXX" // replace with local, yarn, whatever | ||
} | ||
workload-suites = [ | ||
{ | ||
descr = "One run of SparkPi and that's it!" | ||
benchmark-output = "console" | ||
workloads = [ | ||
{ | ||
name = "sparkpi" | ||
slices = 10 | ||
} | ||
] | ||
} | ||
] | ||
}] | ||
} | ||
``` | ||
## Spark variables set in environment | ||
```bash | ||
export SPARK_HOME=/path/to/your/spark/installation | ||
export SPARK_MASTER_HOST=XXXX # replace with local, yarn, whatever | ||
``` | ||
|
||
```hocon | ||
// spark-home and master will be picked up from the environment | ||
spark-bench = { | ||
spark-submit-config = [{ | ||
workload-suites = [ | ||
{ | ||
descr = "One run of SparkPi and that's it!" | ||
benchmark-output = "console" | ||
workloads = [ | ||
{ | ||
name = "sparkpi" | ||
slices = 10 | ||
} | ||
] | ||
} | ||
] | ||
}] | ||
} | ||
``` | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
--- | ||
layout: page | ||
title: Workload-Suite Configuration | ||
--- | ||
|
||
Workload suites are exactly what they sound like. They are logical groups of workloads. | ||
Workload suites can be composed with each other for benchmarking tasks or to simulate different cluster use cases. | ||
|
||
<!-- START doctoc generated TOC please keep comment here to allow auto update --> | ||
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE --> | ||
|
||
|
||
- [Parameters](#parameters) | ||
- [benchmark-output](#benchmark-output) | ||
- [descr](#descr) | ||
- [parallel](#parallel) | ||
- [repeat](#repeat) | ||
|
||
<!-- END doctoc generated TOC please keep comment here to allow auto update --> | ||
|
||
## Parameters | ||
|
||
| Name | Required | Description | | ||
| ---------- | ----- | --- | | ||
| benchmark-output | yes | path to the file where benchmark results should be stored, or use `"console"` to print to the terminal | | ||
| descr | yes | Human-readable string description of what the suite intends to do | | ||
| parallel | no | Whether the workloads in the suite run serially or in parallel. Defaults to `false`. | | ||
| repeat | no | How many times the workloads in the suite should be repeated. | | ||
|
||
## benchmark-output | ||
|
||
Control where results are output by using `benchmark-output`. While each workload can output the results of its particular algorithm | ||
using the configurable parameter `output` within a workload block, `benchmark-output` collects the _benchmark results_ in one place. | ||
|
||
For example, in the following configuration, the contents of `output` will be the dataset generated by running the query over the input. | ||
The contents of `benchmark-output` will be one single line containing the _timing results_ of the sql run. | ||
```hocon | ||
workload-suites = [ | ||
{ | ||
descr = "One run of a SQL query" | ||
benchmark-output = "hdfs:///tmp/sql-benchmark-results.csv" | ||
workloads = [ | ||
{ | ||
name = "sql" | ||
input = "/tmp/generated-kmeans-data.parquet" | ||
output = "/tmp/sql-query-results.parquet" | ||
query = "select `0` from input where `0` < -0.9" | ||
} | ||
] | ||
} | ||
] | ||
``` | ||
|
||
## descr | ||
|
||
`descr` is simply a human-readable string that gets added to benchmark results. | ||
|
||
## parallel | ||
|
||
The parameter `parallel` is a boolean that controls whether the workloads within the suite run serially or are launched in parallel. | ||
The default is `false`, meaning that workloads will run serially by default. | ||
|
||
## repeat | ||
|
||
`repeat` controls how many times the _suite_ repeats. | ||
For example, say a suite contains the workloads A, B, a different instance of B, and C. | ||
Let's also say it's running serially, and repeat is 2. | ||
This setup will run like this: | ||
```text | ||
A | ||
B | ||
B | ||
C | ||
--- | ||
A | ||
B | ||
B | ||
C | ||
--- | ||
Done | ||
``` | ||
And it will NOT run like this: | ||
```text | ||
// Will NOT run like this! | ||
A | ||
A | ||
-- | ||
B | ||
B | ||
-- | ||
B | ||
B | ||
-- | ||
C | ||
C | ||
--- | ||
Done | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
--- | ||
layout: page | ||
title: Examples | ||
permalink: /examples/ | ||
--- | ||
|
||
Spark-Bench is best understood by example. The following pages show and explain the configuration files | ||
from the examples included in the distribution. | ||
|
||
<ul> | ||
{% for page in site.examples %} | ||
<li> | ||
<h3> | ||
<a class="page-link" href="{{ page.url | relative_url }}">{{ page.title | escape }}</a> | ||
</h3> | ||
</li> | ||
{% endfor %} | ||
</ul> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
spark-bench = { | ||
spark-submit-config = [{ | ||
spark-home = "XXXXXXX" // PATH TO YOUR SPARK INSTALLATION | ||
spark-args = { | ||
master = "XXXXXXX" // FILL IN YOUR MASTER HERE | ||
executor-memory = "XXXXXXX" // FILL IN YOUR EXECUTOR MEMORY | ||
} | ||
conf = { | ||
// Any configuration you need for your setup goes here, like: | ||
// "spark.dynamicAllocation.enabled" = "false" | ||
} | ||
suites-parallel = false | ||
workload-suites = [ | ||
{ | ||
descr = "Generate a dataset, then take that same dataset and write it out to Parquet format" | ||
benchmark-output = "hdfs:///tmp/csv-vs-parquet/results-data-gen.csv" | ||
// We need to generate the dataset first through the data generator, then we take that dataset and convert it to Parquet. | ||
parallel = false | ||
workloads = [ | ||
{ | ||
name = "data-generation-kmeans" | ||
rows = 10000000 | ||
cols = 24 | ||
output = "hdfs:///tmp/csv-vs-parquet/kmeans-data.csv" | ||
}, | ||
{ | ||
name = "sql" | ||
query = "select * from input" | ||
input = "hdfs:///tmp/csv-vs-parquet/kmeans-data.csv" | ||
output = "hdfs:///tmp/csv-vs-parquet/kmeans-data.parquet" | ||
} | ||
] | ||
}, | ||
{ | ||
descr = "Run two different SQL queries over the dataset in two different formats" | ||
benchmark-output = "hdfs:///tmp/csv-vs-parquet/results-sql.csv" | ||
parallel = false | ||
repeat = 10 | ||
workloads = [ | ||
{ | ||
name = "sql" | ||
input = ["hdfs:///tmp/csv-vs-parquet/kmeans-data.csv", "hdfs:///tmp/csv-vs-parquet/kmeans-data.parquet"] | ||
query = ["select * from input", "select `0`, `22` from input where `0` < -0.9"] | ||
cache = false | ||
} | ||
] | ||
} | ||
] | ||
}] | ||
} |
This file was deleted.
Oops, something went wrong.
File renamed without changes.
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.