# -*- coding: utf-8 -*-
# Source repository: NYPL/oral-history-data
# This script downloads all oral history interview and collection metadata
# directly from the oral history website. It creates:
# - neighborhoods.json and neighborhoods.csv
# - interviews.json and interviews.csv
# - an individual .json file per interview, containing additional metadata and annotations
# Requires Python 2 (uses urllib2 and print statements).
# Usage:
#   python get_metadata.py -out "/Volumes/Seagate Backup Plus Drive/oral_history/"
import argparse
import csv
import json
import os
import urllib2
# Input
parser = argparse.ArgumentParser()
parser.add_argument('-out', dest="OUTPUT_DIR", default="tmp/", help="Path to output directory")
parser.add_argument('-url', dest="BASE_URL", default="http://oralhistory.nypl.org", help="Base URL to pull data from")
parser.add_argument('-overwrite', dest="OVERWRITE", action="store_true", help="Overwrite existing files")
args = parser.parse_args()
DIR = args.OUTPUT_DIR
INTERVIEW_DIR = DIR + "interviews/"
URL = args.BASE_URL
OVERWRITE = args.OVERWRITE
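# Note: paths are built by simple string concatenation below, so OUTPUT_DIR
# should end with a trailing slash (as in the usage example above).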
# Config fields for csv
NEIGHBORHOOD_FIELDS = [
    {"field": "slug", "header": "id"},
    {"field": "title", "header": "title"},
    {"field": "subtitle", "header": "subtitle"},
    {"field": "long_description", "header": "description"},
    {"field": "image.url", "header": "image_url"},
    {"field": "image.thumb.url", "header": "thumb_url"}
]
INTERVIEW_FIELDS = [
    {"field": "slug", "header": "id"},
    {"field": "storyteller_name", "header": "storyteller_name"},
    {"field": "interviewer_name", "header": "interviewer_name"},
    {"field": "summary", "header": "summary"},
    {"field": "url", "header": "audio_url"},
    {"field": "neighborhood.slug", "header": "neighborhood_id"},
    {"field": "notes", "header": "notes"},
    {"field": "place_of_birth", "header": "place_of_birth"},
    {"field": "date_of_birth", "header": "date_of_birth"},
    {"field": "location", "header": "location"},
    {"field": "occupations", "header": "occupations"},
    {"field": "image.url", "header": "image_url"},
    {"field": "image.thumb.url", "header": "thumb_url"}
]
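# Dot-separated field paths are resolved as nested lookups in toCSV below;
# for example, "image.thumb.url" reads item["image"]["thumb"]["url"].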
# Make sure sub-directories exist
if not os.path.exists(INTERVIEW_DIR):
    os.makedirs(INTERVIEW_DIR)
def getData(filename, url, overwrite=False):
    # Download the file if it does not exist yet, or if overwrite is requested
    data = None
    if not os.path.isfile(filename) or overwrite:
        print "Downloading %s" % url
        data = json.loads(urllib2.urlopen(url).read())
        with open(filename, 'w') as f:
            json.dump(data, f)
    # Otherwise read from the previously downloaded file
    if data is None:
        # print "Opening %s" % filename
        with open(filename, 'rb') as f:
            data = json.load(f)
    return data
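# For example, getData(DIR + "neighborhoods.json", URL + "/neighborhoods.json")
# fetches the JSON once and reuses the cached copy on subsequent runs,
# unless overwrite is True.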
def toCSV(fields, filename, data, overwrite=False):
    if not os.path.isfile(filename) or overwrite:
        print "Writing to %s" % filename
        headers = [field["header"] for field in fields]
        with open(filename, 'wb') as f:
            fw = csv.writer(f)
            fw.writerow(headers)
            for item in data:
                row = []
                for field in fields:
                    # Walk the dot-separated path into nested objects
                    fieldParts = field["field"].split(".")
                    value = ""
                    obj = None
                    for fp in fieldParts:
                        if obj is None:
                            obj = item[fp]
                        else:
                            obj = obj[fp]
                        if type(obj) is not dict:
                            value = obj
                    # Guard against null fields in the source JSON
                    if value is None:
                        value = ""
                    row.append(value.encode('utf-8').strip())
                fw.writerow(row)
# First, get the neighborhoods
neighborhoods = getData(DIR + "neighborhoods.json", URL + "/neighborhoods.json", OVERWRITE)
print "Loaded %s neighborhoods" % len(neighborhoods)
# Next, get the interviews
interviews = getData(DIR + "interviews.json", URL + "/interviews.json", OVERWRITE)
interviewCount = len(interviews)
print "Loaded %s interviews" % interviewCount
# Download each interview
interviewsData = []
for interview in interviews:
    url = interview["url"]
    filename = INTERVIEW_DIR + interview["slug"] + ".json"
    interviewData = getData(filename, url, OVERWRITE)
    interviewsData.append(interviewData)
# Finally, convert neighborhoods and interviews to csv
toCSV(NEIGHBORHOOD_FIELDS, DIR + "neighborhoods.csv", neighborhoods, OVERWRITE)
toCSV(INTERVIEW_FIELDS, DIR + "interviews.csv", interviewsData, OVERWRITE)
print "Done."