-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEC2_analyser.py
191 lines (172 loc) · 6.53 KB
/
EC2_analyser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime
import numpy as np
import subprocess
def check_folder(dir):
    """Make sure the directory ``dir`` exists, creating it when it is missing.

    Missing intermediate directories are created as well, and a directory
    that already exists is left untouched.

    Args:
        dir: Path of the directory to ensure. (The name shadows the builtin
            ``dir``; it is kept so existing keyword callers keep working.)

    Raises:
        OSError: If the path cannot be created, for example because it
            already exists as a regular file.
    """
    try:
        os.makedirs(dir)
    except OSError:
        # makedirs also fails when the directory is already there (possibly
        # created by another process a moment ago). Only a path that is
        # still not a directory afterwards is a real error.
        if not os.path.isdir(dir):
            raise
def get_data(start_time="2017-09-17T00:00:00", end_time="2017-10-17T00:00:00",
             instances=None):
    """Download EC2 spot price history into the ``Data`` folder.

    For every instance type the ``aws ec2 describe-spot-price-history``
    command is run with ``--output text`` and its standard output is stored
    in ``Data/<instance type>.txt``. A status line is printed per instance.

    Args:
        start_time: Start of the timeframe, formatted "YYYY-MM-DDTHH:MM:SS".
        end_time: End of the timeframe, formatted "YYYY-MM-DDTHH:MM:SS".
        instances: Iterable of instance type names. When omitted, the
            default list of m4 and c4 machines below is used.
    """
    # Check if data directory is present, otherwise create it
    check_folder("Data")

    if instances is None:
        # Add more machines to this list if you like
        instances = ["m4.large", "m4.xlarge", "m4.2xlarge", "m4.4xlarge",
                     "m4.10xlarge", "m4.16xlarge", "c4.large", "c4.xlarge",
                     "c4.2xlarge", "c4.4xlarge", "c4.8xlarge"]

    print("Started downloading historical EC2 data, this process could take a while")
    for instance in instances:
        instance = str(instance)
        print("Now downloading data for: " + instance)
        # An argument list (no shell) keeps the values from being
        # re-interpreted by a shell.
        cmd = ["aws", "ec2", "describe-spot-price-history",
               "--instance-types", instance,
               "--start-time", str(start_time),
               "--end-time", str(end_time),
               "--output", "text"]
        target = os.path.join("Data", instance + ".txt")
        try:
            # The file handle replaces the shell's "> file" redirection.
            with open(target, "w") as out_file:
                status = subprocess.call(cmd, stdout=out_file)
        except OSError as err:
            # Raised when the aws executable cannot be started or the
            # output file cannot be opened.
            print("Downloading of " + instance + " data failed: " + str(err))
            continue
        if status == 0:
            print("Downloading of " + instance + " data successful")
        else:
            print("Downloading of " + instance + " data failed (exit code "
                  + str(status) + ")")
# Weekday number (Monday = 0 ... Sunday = 6) to English day name.
_DAY_NAMES = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}


def get_day_name(row):
    """Return the English weekday name for a row's ``DayofWeek`` value.

    Args:
        row: Mapping-like object (for example a DataFrame row) that holds
            the weekday number under the key ``'DayofWeek'``, where 0 is
            Monday and 6 is Sunday.

    Returns:
        The day name as a string, or ``None`` when the value is not one of
        the numbers 0 to 6.
    """
    # A single table lookup replaces the chain of seven comparisons.
    return _DAY_NAMES.get(row['DayofWeek'])
def load_data_txt(path):
    """Load one spot price history text file into a cleaned DataFrame.

    The file is read as tab-separated text without a header row. The first
    column ("Info") is dropped, only rows whose ``OS`` equals "Linux/UNIX"
    are kept, and the ``OS`` column is removed afterwards.

    Args:
        path: Location of the tab-separated text file.

    Returns:
        A pandas DataFrame with the columns ``VPC``, ``InstanceType``,
        ``SpotPrice``, ``Time`` (converted to datetimes), ``DayofWeek``
        (Monday = 0) and ``DayNameofWeek`` (English day name). The kept
        rows retain their original row labels.
    """
    # Give column names
    column_names = ["Info", "VPC", "InstanceType", "OS", "SpotPrice", "Time"]

    # Read in the txt file
    print("Read in the file at " + str(path))
    df = pd.read_csv(path, sep="\t", decimal='.', names=column_names)

    print("Convert dates")
    # Convert time column to datetime. 'Time' deliberately stays a regular
    # column (it is not made the index) because it is read by column name
    # below.
    df['Time'] = pd.to_datetime(df['Time'])

    # Remove unnecessary columns and non-Linux rows
    df = df[df.columns[1:]]
    df = df[df['OS'] == "Linux/UNIX"].drop('OS', axis=1)

    print("Add weekday information to dataframe for day-by-day analysis")
    # Vectorised datetime accessors also work when no rows are left, where
    # a row-wise apply would not yield a plain column.
    df['DayofWeek'] = df['Time'].dt.dayofweek
    df['DayNameofWeek'] = df['Time'].dt.day_name()
    return df
def visualize_data_per_week(df, machine, number):
    """Plot spot price against time for one slice of data and save it.

    One line is drawn per distinct ``VPC`` value, each in a random colour.
    The image is written to ``Images/Plot_of_<machine>_week_<number>.png``;
    the ``Images`` directory has to exist already.

    Args:
        df: DataFrame with the columns ``VPC``, ``Time`` and ``SpotPrice``.
        machine: Instance type name (string) used in the file name.
        number: Week number used in the file name.
    """
    # Make a new figure
    figure = plt.figure()
    try:
        # Plot all VPCs in this figure
        for vpc in df['VPC'].unique():
            subset = df[df['VPC'] == vpc]
            plt.plot(subset['Time'], subset['SpotPrice'], c=np.random.rand(3,))

        # Set labels
        plt.xlabel('Time')
        plt.ylabel('Spot price')

        text = machine + "_week_" + str(number)
        plt.savefig('Images/' + 'Plot_of_' + "%s.png" % text)
    finally:
        # Release the figure; otherwise every call leaves one open.
        plt.close(figure)
def visualize_data_per_month(df, machine):
    """Plot spot price against time for the whole data set and save it.

    One line is drawn per distinct ``VPC`` value, each in a random colour.
    The image is written to ``Images/Plot_of_<machine>_month_<month>.png``,
    where ``<month>`` is the month name of the first row's ``Time`` value;
    the ``Images`` directory has to exist already.

    Args:
        df: DataFrame with the columns ``VPC``, ``Time`` and ``SpotPrice``.
            It needs at least one row, because the month name is taken
            from the first one.
        machine: Instance type name (string) used in the file name.
    """
    # Make a new figure
    figure = plt.figure()
    try:
        # Plot all VPCs in this figure
        for vpc in df['VPC'].unique():
            subset = df[df['VPC'] == vpc]
            plt.plot(subset['Time'], subset['SpotPrice'], c=np.random.rand(3,))

        # Set labels
        plt.xlabel('Time')
        plt.ylabel('Spot price')

        month_name = df.iloc[0]["Time"].strftime("%B")
        text = machine + "_month_" + str(month_name)
        plt.savefig('Images/' + 'Plot_of_' + "%s.png" % text)
    finally:
        # Release the figure; otherwise every call leaves one open.
        plt.close(figure)
def get_daily_averages(data):
    """Compute per-weekday spot price statistics.

    Args:
        data: DataFrame with the columns ``DayNameofWeek``, ``DayofWeek``
            and ``SpotPrice``.

    Returns:
        A tuple ``(mean, median, min_val, max_val)``. Each element is a
        DataFrame with the single column ``SpotPrice``, indexed by
        ``DayNameofWeek`` and ordered by weekday number.
    """
    per_day = data.groupby('DayNameofWeek')

    # Groups come back sorted by name; order them by weekday number instead.
    weekday_order = per_day['DayofWeek'].min().sort_values().index

    # Aggregate only the price column. Selecting it by name avoids
    # averaging text or timestamp columns and does not depend on the
    # position of the column in the frame.
    prices = per_day[['SpotPrice']]
    mean = prices.mean().reindex(weekday_order)
    median = prices.median().reindex(weekday_order)
    min_val = prices.min().reindex(weekday_order)
    max_val = prices.max().reindex(weekday_order)
    return mean, median, min_val, max_val
def visualize_stats(df, type, machine):
    """Draw a bar chart of one statistic and save it as an image.

    The image is written to ``Images/Plot of stats_<machine>_<type>.png``;
    the ``Images`` directory has to exist already.

    Args:
        df: Data to plot as bars (anything offering ``.plot(kind='bar')``).
        type: Name of the statistic (string), used in the title and the
            file name. (The name shadows the builtin ``type``; it is kept
            so existing keyword callers keep working.)
        machine: Instance type name (string), used in the title and the
            file name.
    """
    figure = plt.figure()
    try:
        # Hand pandas the axes explicitly so the bars land on the figure
        # created here rather than on a separate one.
        axes = figure.add_subplot(111)
        df.plot(kind='bar', ax=axes)
        plt.xticks(rotation=20)
        plt.title(str(type) + " price of " + str(machine))

        text = "stats_" + machine + "_" + type
        plt.savefig('Images/' + 'Plot of ' + "%s.png" % text)
    finally:
        # Release the figure; otherwise every call leaves one open.
        plt.close(figure)
def main():
    """Run the interactive analysis from the command line.

    Steps: optionally download the data, let the user pick a file from the
    ``Data`` folder, load it, plot it per quarter ("week") and as a whole,
    then print and plot the per-weekday statistics. The run stops early,
    with a message, on an unrecognised answer, a missing ``Data`` folder
    or a file without usable records.
    """
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        ask = raw_input
    except NameError:
        ask = input

    print("Program started")

    ######## DATA GATHERING
    answer = ask("Is data already present? Typ Y/N ")
    if answer in ("y", "Y"):
        print("Perfect, let's use that data!")
    elif answer in ("n", "N"):
        get_data()
    else:
        print("Wrong answer, terminating")
        # Actually stop here instead of carrying on with the analysis.
        return

    ######## DATA LOADING
    if not os.path.isdir("Data"):
        print("No Data folder found, terminating")
        return
    print("The following data is available in Data folder:")
    for file_name in os.listdir("Data"):
        print(file_name)

    path = ask("Please enter the name of the .txt file you want to analyse: ")
    print("Loading data file " + str(path))
    data = load_data_txt('Data/' + path)
    if len(data.index) == 0:
        # The steps below read the first row, so an empty set cannot be used.
        print("No usable records found in " + str(path) + ", terminating")
        return

    machine = data.iloc[0]["InstanceType"]
    print("Data loaded successfully for machine " + str(machine))

    ######## DATA DIVISION IN WEEKS
    # The rows are split into four equally sized consecutive parts.
    q = int(round(len(data.index) * 0.25))
    weeks = [data[0:q], data[q:2 * q], data[2 * q:3 * q], data[3 * q:]]

    ####### DATA VISUALIZATION
    check_folder("Images")
    print("Visualize data per week")
    for number, week in enumerate(weeks):
        visualize_data_per_week(week, machine, number)

    print("Visualize data for the whole month")
    visualize_data_per_month(data, machine)
    print("Done visualizing")

    ###### DATA STATISTICS
    mean, median, min_val, max_val = get_daily_averages(data)

    print("Data statistical analyses")
    print("For machine " + machine + " the descriptive statistics are:")
    print("----------------------------------------------")
    print("-- MEAN -- ")
    print(mean)
    print("-- MEDIAN -- ")
    print(median)
    print("-- MINIMUM -- ")
    print(min_val)
    print("-- MAXIMUM -- ")
    print(max_val)
    print("----------------------------------------------")

    visualize_stats(mean, "Mean", machine)
    visualize_stats(median, "Median", machine)
    visualize_stats(min_val, "Minimum", machine)
    visualize_stats(max_val, "Maximum", machine)

    print("Program finished successfully")


if __name__ == '__main__':
    main()