-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataManager.py
113 lines (88 loc) · 3.81 KB
/
dataManager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import pickle
import pandas as pd
import Quandl as q
from pathlib import Path
import datetime as dt
import os
import time
import sys
import utility as util
# Module-level configuration. All values are computed once, at import time,
# so a long-running process keeps the year it was started in.
curr_year = dt.date.today().year
last_year = curr_year - 1
# Keys of util.months are used as the month part of the Quandl contract codes
# built in get_leanhog_contract_data (presumably futures month letters such as
# 'K' for May -- confirm against utility.months).
all_months = list(util.months.keys())
# Cache file holding contract data for the completed years in hist_years.
hist_data_filename = "data.hist.pickle"
# 1998 through last year; range() excludes its stop value, so curr_year is not included.
hist_years = range(1998, curr_year)
# Cache file holding contract data for the years in curr_years.
data_filename = "data.pickle"
# Maximum age of data_filename before load_data downloads it again.
threshold = dt.timedelta(days=1)
# The current year and the next one; range() excludes its stop value.
curr_years = range(curr_year, curr_year + 2) # range(stop) param is not inclusive
def load_data():
    """Return historical and current contract data joined column-wise.

    Historical data (the years in ``hist_years``) is read from
    ``hist_data_filename``, or downloaded when that file does not exist.  If
    no historical column name ends with last year's number, last year's
    contracts are downloaded, appended to the historical frame and the
    historical cache file is rewritten ("rollover").

    Current data (the years in ``curr_years``) is downloaded again whenever
    ``data_filename`` is missing or older than ``threshold``; otherwise it is
    read from that file.

    Returns:
        pandas.DataFrame: the historical columns followed by the current
        columns, concatenated along axis 1.
    """
    # Make sure we have contract data for the completed years (hist_years).
    hist_df = load_existing_data_file(hist_years, hist_data_filename)

    # The first run of a new year finds a historical cache that stops one year
    # short, so last year's contracts have to be rolled into it.
    last_year_suffix = str(last_year)
    if not any(column.endswith(last_year_suffix) for column in hist_df.columns):
        # The year that is missing is last_year, so that is the one to report.
        print('Loaded historical data does not include \'{0}\' (is it January?). Performing rollover...'.format(last_year))
        last_year_df = get_leanhog_contract_data([last_year])
        util.print_df(last_year_df)
        hist_df = pd.concat([hist_df, last_year_df], axis=1)
        util.print_df(hist_df.columns)
        with open(hist_data_filename, 'wb') as fi:
            pickle.dump(hist_df, fi)

    # If the current-data file doesn't exist or is out of date, download it
    # from Quandl; otherwise reuse the cached copy.
    delta = get_existing_file_age(data_filename)
    if delta is None or delta > threshold:
        curr_df = get_and_save_data_from_quandl(curr_years, data_filename)
    else:
        # NOTE(review): right after a rollover a still-fresh cache written last
        # year may also hold last year's columns, which would then appear twice
        # in the result -- confirm whether that case needs handling.
        curr_df = load_existing_data_file(curr_years, data_filename)

    return pd.concat([hist_df, curr_df], axis=1)
def load_existing_data_file(years, filename):
    """Load contract data from a pickle file, creating the file if needed.

    Args:
        years: iterable of contract years to download when ``filename`` does
            not exist; unused when the file is already present.
        filename: path of the pickle cache file.

    Returns:
        The object stored in ``filename`` when it exists; otherwise the
        result of ``get_and_save_data_from_quandl(years, filename)``, which
        also writes the file.
    """
    if Path(filename).is_file():
        print('Loading data from file \'{0}\''.format(filename))
        # NOTE: unpickling can execute arbitrary code, so this must only ever
        # be pointed at cache files written by this program.
        return pd.read_pickle(filename)

    print('Datafile \'{0}\' does not exist. Creating...'.format(filename))
    return get_and_save_data_from_quandl(years, filename)
def get_and_save_data_from_quandl(years, filename):
    """Download contract data for ``years`` and cache it in ``filename``.

    Args:
        years: iterable of contract years passed to
            ``get_leanhog_contract_data``.
        filename: path of the pickle file to write; an existing file is
            overwritten.

    Returns:
        The freshly downloaded data, i.e. the same object that was pickled.
    """
    contract_df = get_leanhog_contract_data(years)
    # The file is only opened after the download returned, so a failed
    # download leaves any existing cache file untouched.
    with open(filename, 'wb') as fi:
        pickle.dump(contract_df, fi)
    return contract_df
def get_leanhog_contract_data(years, authtoken="dXzByDoZicZy-WFvPyTf"):
    """Download settle prices for every monthly ``LN`` contract in ``years``.

    One Quandl dataset code ``CME/LN<month><year>`` is requested for each
    combination of ``years`` and ``all_months``.  Of the returned columns
    only the ``"CME.LN<month><year> - Settle"`` ones are kept, and they are
    renamed to the short form ``"LN<month><year>"``.

    Args:
        years: iterable of integer contract years.
        authtoken: Quandl API token sent with the request.  The default is
            the token this module has always used.
            NOTE(review): that token is committed in source; consider
            supplying it from configuration instead.

    Returns:
        pandas.DataFrame: one column per contract that Quandl actually
        returned, in request order.  An empty DataFrame when ``years`` yields
        no contracts (no request is made in that case).
    """
    print("Loading contracts from Quandl web service....")
    print(' loading years: {0}'.format(years))

    record_codes = []    # dataset codes to request from Quandl
    settle_columns = []  # settle-price columns expected back, in request order
    rename_map = {}      # Quandl column name -> short contract name

    for year in years:
        for month in all_months:
            record_codes.append("CME/LN{0}{1}".format(month, year))
            # May contracts didn't start until 2008, so no settle column is
            # expected for them before that (the code is still requested, as
            # it always has been).
            if month == 'K' and year < 2008:
                continue
            settle_column = "CME.LN{0}{1} - Settle".format(month, year)
            settle_columns.append(settle_column)
            rename_map[settle_column] = "LN{0}{1}".format(month, year)

    # Nothing to ask for: skip the web request entirely.
    if not record_codes:
        return pd.DataFrame()

    df = q.get(record_codes, authtoken=authtoken)

    # Take out any expected columns that Quandl didn't return, possibly a
    # contract year that hasn't started.
    available = [name for name in settle_columns if name in df.columns]
    # rename() returns a new frame, which avoids mutating a column selection
    # in place.
    return df[available].rename(columns=rename_map)
def get_existing_file_age(filename):
    """Return the time elapsed since ``filename`` was last modified.

    Also prints the file's last-modified date (local time, MM/DD/YYYY).

    Args:
        filename: path of the file to inspect.

    Returns:
        datetime.timedelta between the file's modification time and now, or
        None when ``filename`` is not an existing regular file.
    """
    if not Path(filename).is_file():
        return None

    modified_at = os.path.getmtime(filename)  # seconds since the epoch
    stamp = time.strftime('%m/%d/%Y', time.localtime(modified_at))
    print("{0} was last updated on: {1}".format(filename, stamp))
    return dt.timedelta(seconds=time.time() - modified_at)