-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathoptimizer.py
34 lines (27 loc) · 900 Bytes
/
optimizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from pyspark.sql import SparkSession
import sys, time
# Expect exactly one flag on the command line:
#   "Y" -> disable Spark's automatic broadcast-join selection
#   "N" -> leave the optimizer's default join selection in place
if len(sys.argv) < 2:
    # Fail with a usage hint instead of a bare IndexError on sys.argv[1].
    raise SystemExit("Usage: %s <Y|N>" % sys.argv[0])
disabled = sys.argv[1]
# Validate before starting Spark so a bad flag does not spin up a session.
if disabled not in ("Y", "N"):
    raise ValueError("This setting is not available.")
spark = SparkSession.builder.appName('query1-sql').getOrCreate()
if disabled == "Y":
    # A threshold of -1 turns off automatic broadcast joins.
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")
# Both inputs are Parquet files on HDFS; one reader is reused for the two loads.
parquet_reader = spark.read.format("parquet")
df1 = parquet_reader.load("hdfs://master:9000/ratings.parquet")
df2 = parquet_reader.load("hdfs://master:9000/movie_genres.parquet")
# Expose the DataFrames to Spark SQL under the table names the query uses.
# createOrReplaceTempView replaces registerTempTable (deprecated since Spark 2.0).
df1.createOrReplaceTempView("ratings")
df2.createOrReplaceTempView("movie_genres")
# Join the first 100 movie_genres rows against ratings on id_movie.
# The fragments are concatenated as-is, so the spacing inside each one matters.
sqlString = "".join((
    "SELECT * ",
    "FROM ",
    " (SELECT * FROM movie_genres LIMIT 100) as g, ",
    " ratings as r ",
    "WHERE ",
    " r.id_movie = g.id_movie",
))
# Label for the final report: join-type selection counts as "enabled" only for "N".
mode = "enabled" if disabled == 'N' else "disabled"

# Time the query as run through show(); building the DataFrame from the SQL
# text happens inside the timed window.
t1 = time.time()
result = spark.sql(sqlString)
result.show()
t2 = time.time()

# Print the query plan after t2 so it does not count towards the measurement.
plan_df = spark.sql(sqlString)
plan_df.explain()

elapsed = t2 - t1
print("Time with choosing join type %s is %.4f sec." % (mode, elapsed))