-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
88 lines (77 loc) · 2.67 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var async = require('async');
// Years to scrape: the half-open range [2009, 2010), i.e. only 2009 as written.
// Widen the loop bounds to scrape additional years.
var years = [];
for (var year = 2009; year < 2010; year++)
  years.push(year);

// Scrape every year with scrapeMoviesForYear, then merge the per-year arrays
// and write them to movies.json as a single flat JSON array.
async.mapSeries(years, scrapeMoviesForYear, function(err, results) {
  // Fail loudly rather than writing a partial or corrupt movies.json when a
  // scrape reports an error through the callback.
  if (err)
    throw err;

  // results holds one array of movies per scraped year; flatten them.
  var movies = [];
  results.forEach(function(movies_for_year) {
    movies = movies.concat(movies_for_year);
  });

  // To append to an existing file instead of overwriting it, seed this list
  // with JSON.parse(fs.readFileSync('movies.json')).
  var complete_movie_list = [];
  complete_movie_list = complete_movie_list.concat(movies);
  fs.writeFileSync('movies.json', JSON.stringify(complete_movie_list), {encoding: 'utf8'});
});
/**
 * Scrapes Wikipedia's "List of American films of <year>" page and collects
 * the title and link of every film row found in its "wikitable" tables.
 *
 * The request is delayed by one second so that consecutive calls do not hit
 * Wikipedia back to back.
 *
 * Failures (request error, non-200 status, no wikitable in the response,
 * unexpected rowspan layout) are thrown from inside the asynchronous
 * callbacks instead of being passed to `callback`, so they surface as
 * uncaught exceptions.
 *
 * @param {number} year - Year whose film list should be scraped.
 * @param {function} callback - Invoked as callback(null, movies), where
 *   movies is an array of {title, href} objects. href is undefined when no
 *   href attribute is found at the expected depth inside the title cell.
 */
function scrapeMoviesForYear(year, callback) {
  // setTimeout() so wikipedia doesn't hate us for slamming their servers
  setTimeout(function() {
    request('https://en.wikipedia.org/wiki/List_of_American_films_of_' + year,
      function(err, res, body) {
        if (err)
          throw err;
        if (res.statusCode !== 200)
          throw new Error('wikipedia returned an error response: ' + res.statusCode);

        var $ = cheerio.load(body);
        var tables = $('table.wikitable');
        if (!tables.length) {
          // Dump the response so the unexpected page can be inspected.
          console.log(body);
          throw new Error('Did not find a table w/ class "wikitable" in Wikipedia\'s response');
        }

        var movies = [];
        tables.each(function(tableIx, table) {
          $(table).find('tr').each(function(rowIx, row) {
            // the first row just has headings
            if (rowIx === 0)
              return;

            var cells = $(row).find('td');

            // Leading cells carrying a rowspan are skipped (presumably the
            // date/month cells that span several film rows -- confirm against
            // the page layout). At most two such cells are tolerated.
            var title_cell = $(cells[0]);
            if (title_cell.attr('rowspan'))
              title_cell = $(cells[1]);
            if (title_cell.attr('rowspan'))
              title_cell = $(cells[2]);
            if (title_cell.attr('rowspan'))
              throw new Error('Unexpected: a 3 cells in a row with rowspans');

            // Strip sort-key helper markup before reading the text, so it can
            // neither leak into the title nor make an empty cell look filled.
            title_cell.find('.sortkey').remove();

            // often there are empty rows with just rowspans
            // perhaps leftover from when there was an anticipated release in that month
            var title = title_cell.text().trim();
            if (!title)
              return;

            var movie_data = {
              // Trimmed so the cell's surrounding whitespace/newlines are not stored.
              title: title,
              // Reads the href of the first grandchild of the cell; assumes
              // the link is nested one element deep (e.g. <td><i><a href>).
              href: title_cell.children().children().attr('href')
            };
            movies.push(movie_data);

            // Progress output for each film found.
            console.log(movie_data.title + ':', movie_data.href);
          });
        });

        callback(null, movies);
      });
  }, 1000);
}
/**
 * Converts the multi-line text of a cell into a single comma-delimited string.
 *
 * Each line of the cell's text becomes one entry. Entries are trimmed and
 * blank lines are dropped, so padded lines, empty lines and CRLF line endings
 * do not produce stray spaces or empty ", ," segments.
 *
 * @param {{text: function(): string}} cell - Object exposing a text() method
 *   (e.g. a cheerio selection).
 * @returns {?string} Entries joined with ", ", or null when the cell holds no
 *   non-whitespace text.
 */
function toCommaDelimitedList(cell) {
  var entries = cell.text()
    .split(/\r?\n/)
    .map(function(entry) {
      return entry.trim();
    })
    .filter(function(entry) {
      return entry.length > 0;
    });

  return entries.length ? entries.join(', ') : null;
}