'''
Last Updated June 30, 2015
by Laura Kurup
https://github.com/laurakurup
Description: Create a pandas dataframe and csv file from the U.S. Census
Decennial Census API, which offers access to population data by sex, age,
race, etc., and housing data by occupancy, vacancy status, and tenure.
Documentation: https://github.com/laurakurup/census-api/raw/master/README.md
Please check out README.md for a complete explanation of the variables
and options below.
Happy querying!
'''
''' Configure your request '''
# census API key --> request at http://www.census.gov/developers/
census_api_key = 'YOUR-KEY-HERE'
# csv file of the variables you want (relative path from python's working directory)
# Get the template: https://github.com/laurakurup/census-api/raw/master/census_variables_sample.csv
variables_csv = 'census_variables.csv'
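# for reference, the variables csv needs (at least) 'year', 'variable', and
# 'column_name' columns -- the row below is illustrative:
#   year,variable,column_name
#   2010,P0010001,population_total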
# 'state', 'county', 'metro' or 'metro-micro'
location_type = 'state'
# maximum variables per request
api_variable_limit = 50
# append the year to your column names (True / False)
# e.g. 'housing_renter' becomes 'housing_renter_2010'
add_year = True
# subdirectory for data exports (relative to python's working directory)
# if not using, leave as empty string
subdirectory = 'data/census/'
# add a timestamp to your csv export filename (True / False)
# this keeps me from overwriting my data :)
time_stamp = True
''' Imports '''
import pandas as pd
import urllib2
import datetime
import time
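# note: this script is written for Python 2 (print statements, urllib2, xrange)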
''' Process your variables '''
# read in the data
df = pd.read_csv(variables_csv)
# creates a list of dictionaries from relevant columns
variables_list = df[['year', 'variable', 'column_name']].to_dict(orient='records')
# create a list of the years to query (each year requires a separate API call)
years_to_query = list(set([item['year'] for item in variables_list]))
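# e.g. with the illustrative csv row above, variables_list would be
# [{'year': 2010, 'variable': 'P0010001', 'column_name': 'population_total'}]
# and years_to_query would be [2010]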
''' Set up the dataframe based on location type '''
# create a dataframe of the location names and FIPS codes
if location_type == 'metro' or location_type == 'metro-micro':
    fips_codes = 'https://github.com/laurakurup/data/raw/master/us_metro_areas/us_metro_areas_fips_codes.csv'
    df = pd.read_csv(fips_codes, dtype={'state_fips': object, 'place_fips': object})
elif location_type == 'state':
    fips_codes = 'https://raw.githubusercontent.com/laurakurup/data/master/us_states/us_states_fips_codes.csv'
    df = pd.read_csv(fips_codes, dtype={'state_fips': object})
else:
    fips_codes = 'https://github.com/laurakurup/data/raw/master/us_counties/us_counties_fips_codes.csv'
    df = pd.read_csv(fips_codes, dtype={'state_fips': object, 'county_fips': object})
# if 'metro' only, drop the micropolitan areas, leaving the 685 metropolitan areas
if location_type == 'metro':
    df = df[df['metro_micro'] != 'micropolitan']
    df = df.reset_index(drop=True)
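# at this point df has one row per location; the FIPS columns referenced
# below ('state_fips', plus 'place_fips' or 'county_fips') come from the
# csv above, e.g. Alabama's state_fips is '01'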
''' Optional: Test your query '''
# save some time! uncomment the df.head(10) line below to cut your request to 10 locations
# if everything is running smoothly, rerun it on the full list :)
# df = df.head(10)
''' Get the data '''
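# for reference, the API returns a JSON array of arrays: the first row holds
# the variable names and the second holds the values for one location, e.g.
# [["P0010001", "state"],
#  ["4779736", "01"]]
# pd.read_json() loads this as a two-row dataframe, so after transposing,
# column 1 holds the values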
# function that will request and save the data for each source URL
def get_census_data(source_url):
    # start timing the request
    start_time = time.time()
    # print the location about to be requested (parsed out of the URL)
    if location_type == 'state':
        print "requesting data for " + source_url.split('=')[3] + ' . . .'
    else:
        print "requesting data for " + source_url.split('=')[4].split('&')[0] + ' ' + source_url.split('=')[3].split('&')[0] + ' . . .'
    try:
        # read json for the source URL
        response = pd.read_json(source_url)
        # transpose for easy saving
        response = response.transpose()
        # if location type is state, save all but the last value (the state FIPS)
        if location_type == 'state':
            data = list(response[1])[:-1]
        # otherwise, save all but the last two values (the state and place FIPS)
        else:
            data = list(response[1])[:-2]
        # print a success message with the length of time
        print("success! %s seconds" % (round(time.time() - start_time, 5)))
        print '---------------------------------------------------'
    # if the API returns an HTTPError, print the error message and the source URL
    except urllib2.HTTPError, error:
        contents = error.read()
        print 'ERROR MESSAGE FROM API: ' + contents
        print source_url
        print '---------------------------------------------------'
        # save 'error' for the variables requested
        data = ['error'] * len(source_url.split(','))
    # for other errors, print the source URL
    # this will happen if the location does not exist for the year requested
    except:
        print 'ERROR - NO RESPONSE FOR URL:'
        print source_url
        print '---------------------------------------------------'
        # save 'error' for the variables requested
        data = ['error'] * len(source_url.split(','))
    # return a list of the response values
    return data
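# for reference, a built source URL looks like this (placeholder key,
# illustrative variable):
# http://api.census.gov/data/2010/sf1?key=YOUR-KEY-HERE&get=P0010001&for=state:01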
# to build the source URLs, iterate through each year
for year in years_to_query:
    # create a list of the variables for the year
    new_variables_list = [item for item in variables_list if item['year'] == year]
    # iterate through the variables in batches that fit within the api limit
    for i in xrange(0, len(new_variables_list), api_variable_limit):
        # create a list of the next batch of variables to request
        list_within_limit = new_variables_list[i:i+api_variable_limit]
        print 'Starting request for Census ' + str(year) + ' variables ' + str(i+1) + ' through ' + str(i+len(list_within_limit)) + ':'
        print '---------------------------------------------------'
        # create a string of variables to build the source URL (comma separated, no spaces)
        variables_str = ','.join([item['variable'] for item in list_within_limit])
        # build the source URL based on location type
        if location_type == 'metro' or location_type == 'metro-micro':
            df['source_url'] = 'http://api.census.gov/data/' + str(year) + '/sf1?key=' + census_api_key + '&get=' + variables_str + '&for=place:' + df['place_fips'] + '&in=state:' + df['state_fips']
        elif location_type == 'state':
            df['source_url'] = 'http://api.census.gov/data/' + str(year) + '/sf1?key=' + census_api_key + '&get=' + variables_str + '&for=state:' + df['state_fips']
        else:
            df['source_url'] = 'http://api.census.gov/data/' + str(year) + '/sf1?key=' + census_api_key + '&get=' + variables_str + '&for=county:' + df['county_fips'] + '&in=state:' + df['state_fips']
        # run the get_census_data() function on each source URL, create a new column of results
        df['new_data'] = [get_census_data(item) for item in df['source_url']]
        print 'Request complete for Census ' + str(year) + ' variables ' + str(i+1) + ' through ' + str(i+len(list_within_limit))
        print '---------------------------------------------------'
        # iterate through the variables that were requested to create columns
        for n, item in enumerate(list_within_limit):
            # create column names for each variable with the year (if add_year == True)
            if add_year == True:
                new_column_name = item['column_name'] + '_' + str(item['year'])
            # create column names for each variable without the year (if add_year == False)
            else:
                new_column_name = item['column_name']
            # create the new column with the data returned from the API
            df[new_column_name] = [row[n] for row in df['new_data']]
# print success message when everything is complete
print 'FINISHED! Request complete for all ' + str(len(variables_list)) + ' variables'
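# the result is one column per requested variable, e.g. with add_year = True
# the sample 'population_total' row above becomes a 'population_total_2010' column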
''' Clean up and export to csv'''
# drop unneeded columns (used during API calls)
df = df.drop('source_url', 1)
df = df.drop('new_data', 1)
# construct the csv file name with date and time added (if time_stamp == True)
# or without date and time (if time_stamp == False)
if time_stamp == True:
    file_name = subdirectory + 'census-data-by-' + location_type + '-' + datetime.datetime.strftime(datetime.datetime.now(), '%Y.%m.%d-%I.%M%p') + '.csv'
else:
    file_name = subdirectory + 'census-data-by-' + location_type + '.csv'
# save data to csv
df.to_csv(file_name, index=False)
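# with the settings above, the export lands at a path like
# data/census/census-data-by-state-2015.06.30-10.30AM.csv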