-
Notifications
You must be signed in to change notification settings - Fork 54
/
Copy pathDatabricks in 5 minutes.sql
105 lines (70 loc) · 3.35 KB
/
Databricks in 5 minutes.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
-- Databricks notebook source
-- MAGIC %md
-- MAGIC # Databricks in 5 minutes an introduction
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ## Create a quickstart cluster
-- MAGIC
-- MAGIC 1. In the sidebar, right-click the **Clusters** button and open the link in a new window.
-- MAGIC 1. On the Clusters page, click **Create Cluster**.
-- MAGIC 1. Name the cluster **Quickstart**.
-- MAGIC 1. In the Databricks Runtime Version drop-down, select **7.3 LTS (Scala 2.12, Spark 3.0.1)**.
-- MAGIC 1. Click **Create Cluster**.
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ## Attach the notebook to the cluster and run all commands in the notebook
-- MAGIC
-- MAGIC 1. Return to this notebook.
-- MAGIC 1. In the notebook menu bar, select **<img src="http://docs.databricks.com/_static/images/notebooks/detached.png"/></a> > Quickstart**.
-- MAGIC 1. When the cluster changes from <img src="http://docs.databricks.com/_static/images/clusters/cluster-starting.png"/></a> to <img src="http://docs.databricks.com/_static/images/clusters/cluster-running.png"/></a>, click **<img src="http://docs.databricks.com/_static/images/notebooks/run-all.png"/></a> Run All**.
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ## The next command creates a table from a Databricks dataset
-- COMMAND ----------
DROP TABLE IF EXISTS diamonds;
CREATE TABLE diamonds
USING csv
OPTIONS (path "/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv", header "true")
-- COMMAND ----------
SELECT * from diamonds
-- COMMAND ----------
-- MAGIC %python
-- MAGIC diamonds = spark.read.csv("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv", header="true", inferSchema="true")
-- MAGIC diamonds.write.format("delta").save("/delta/diamonds")
-- COMMAND ----------
DROP TABLE IF EXISTS diamonds;
CREATE TABLE diamonds USING DELTA LOCATION '/delta/diamonds/'
-- COMMAND ----------
SELECT * from diamonds
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ## The next command manipulates the data and displays the results
-- MAGIC
-- MAGIC Specifically, the command:
-- MAGIC 1. Selects color and price columns, averages the price, and groups and orders by color.
-- MAGIC 1. Displays a table of the results.
-- COMMAND ----------
SELECT color, avg(price) AS price FROM diamonds GROUP BY color ORDER BY color
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ## Convert the table to a chart
-- MAGIC
-- MAGIC Under the table, click the bar chart <img src="http://docs.databricks.com/_static/images/notebooks/chart-button.png"/></a> icon.
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ## Repeat the same operations using Python DataFrame API.
-- MAGIC This is a SQL notebook; by default command statements are passed to a SQL interpreter. To pass command statements to a Python interpreter, include the `%python` magic command.
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ## The next command creates a DataFrame from a Databricks dataset
-- COMMAND ----------
-- MAGIC %python
-- MAGIC diamonds = spark.read.csv("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv", header="true", inferSchema="true")
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ## The next command manipulates the data and displays the results
-- COMMAND ----------
-- MAGIC %python
-- MAGIC from pyspark.sql.functions import avg
-- MAGIC
-- MAGIC display(diamonds.select("color","price").groupBy("color").agg(avg("price")).sort("color"))