forked from zotero/translators
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathThe Age.js
109 lines (97 loc) · 3.21 KB
/
The Age.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
{
"translatorID":"efb3c424-daa9-40c9-8ee2-983d2802b27a",
"translatorType":4,
"label":"The Age",
"creator":"Michael Berkowitz",
"target":"^http://(www|search).theage.com.au/",
"minVersion":"1.0.0b4.r5",
"maxVersion":"",
"priority":100,
"inRepository":true,
"lastUpdated":"2007-08-14 22:15:00"
}
/**
 * Decides what kind of page the translator is looking at, from the URL alone.
 *
 * @param {Document} doc Page document (not inspected here).
 * @param {string} url Address of the current page.
 * @returns {string|undefined} "multiple" when the URL contains "siteSearch.ac",
 *     "newspaperArticle" when it otherwise contains "html", and undefined
 *     when neither marker is present.
 */
function detectWeb(doc, url) {
	// The search-page check comes first: a search URL that also contains
	// "html" must still be reported as a result list.
	var isSearchPage = url.indexOf("siteSearch.ac") >= 0;
	if (isSearchPage) {
		return "multiple";
	}

	var looksLikeArticle = url.indexOf("html") >= 0;
	return looksLikeArticle ? "newspaperArticle" : undefined;
}
/**
 * Downloads one article page and saves it as a "newspaperArticle" item.
 *
 * The page text is parsed with regular expressions:
 *   - the <HEADLINE> element supplies the title,
 *   - <meta name="..." content="..." /> tags supply section, date, authors
 *     and tags,
 *   - the "Description" meta tag supplies the abstract.
 * Each of those pieces is optional: a page that lacks one produces an item
 * without that field instead of throwing part-way through the callback.
 *
 * After the item is filled in, newItem.complete() and then Zotero.done() are
 * called.
 *
 * @param {string} url Address of the article page to fetch.
 */
function scrape(url) {
	Zotero.Utilities.HTTP.doGet(url, function(text) {
		var newItem = new Zotero.Item("newspaperArticle");
		newItem.ISSN = "0312-6307";
		newItem.url = url;
		newItem.publicationTitle = "The Age";
		Zotero.debug(url);

		// title: only the text before the first " - " separator is kept
		var headline = /<HEADLINE>(.*)<\/HEADLINE>/.exec(text);
		if (headline) {
			newItem.title = Zotero.Utilities.unescapeHTML(Zotero.Utilities.capitalizeTitle(headline[1]).split(" - ")[0]);
		} else {
			Zotero.debug("The Age: no <HEADLINE> element found in " + url);
		}

		// meta tags (the abstract is handled separately further down).
		// [^"]* instead of .* keeps two tags that share a line from being
		// swallowed into a single match.
		var metaPattern = /name="([^"]*)"\s+content="([^"]*)"\s+\/>/g;
		var metaInfo = {};
		var tag;
		while ((tag = metaPattern.exec(text)) !== null) {
			// A repeated name keeps its last value and is handled only once below.
			metaInfo[tag[1]] = tag[2];
		}
		var hasMeta = function(name) {
			return Object.prototype.hasOwnProperty.call(metaInfo, name);
		};

		if (hasMeta("sitecategories")) {
			newItem.section = metaInfo.sitecategories.split(",")[0];
		}

		if (hasMeta("publishdate")) {
			// only the part before the first whitespace is used as the date
			newItem.date = metaInfo.publishdate.split(/\s+/)[0];
		}

		if (hasMeta("byline")) {
			// everything after the first comma is ignored; the names are
			// separated by " and "
			var authors = metaInfo.byline.split(",")[0].split(" and ");
			for (var j = 0; j < authors.length; j++) {
				// skip blank names so an empty byline does not add an empty creator
				if (/\S/.test(authors[j])) {
					newItem.creators.push(Zotero.Utilities.cleanAuthor(authors[j], "author"));
				}
			}
		}

		if (hasMeta("keywords")) {
			var keywords = metaInfo.keywords.split(",");
			for (var k = 0; k < keywords.length; k++) {
				// strip the whitespace left over from ", " separators before
				// capitalising, otherwise the "first letter" is a space
				var keyword = keywords[k].replace(/^\s+|\s+$/g, "");
				if (keyword.length > 1) {
					newItem.tags.push(Zotero.Utilities.unescapeHTML(keyword.charAt(0).toUpperCase() + keyword.substring(1).toLowerCase()));
				}
			}
		}

		// abstract
		var description = /"Description"\s+content="([^"]*)"/.exec(text);
		if (description) {
			// The last three characters are dropped, exactly as before.
			// NOTE(review): presumably a trailing "..." on the site's
			// descriptions - confirm against a live page.
			newItem.abstractNote = Zotero.Utilities.unescapeHTML(description[1].substring(0, description[1].length - 3));
		}

		newItem.complete();
		Zotero.done();
	}, function() {});
}
/**
 * Translator entry point: works out which article URLs to save and hands each
 * one to scrape().
 *
 * On a search page (URL contains "siteSearch.ac") the result links are
 * collected, the user picks from them via Zotero.selectItems(), and the real
 * article address is read from the text following "u=" in each chosen link.
 * On any other page the current URL itself is scraped.
 *
 * Zotero.wait() is only called when at least one scrape() has been started;
 * if the selection is cancelled or yields nothing usable the function returns
 * without waiting, so the translation is not left pending.
 *
 * @param {Document} doc Document of the current page.
 * @param {string} url Address of the current page.
 */
function doWeb(doc, url) {
	var URLS = [];
	if (url.indexOf("siteSearch.ac") !== -1) {
		var titles = {};
		var results = doc.evaluate('//div[@class="searchresults"]/dl/dt/a', doc, null, XPathResult.ANY_TYPE, null);
		var link;
		while ((link = results.iterateNext())) {
			titles[link.href] = link.textContent;
		}

		var items = Zotero.selectItems(titles);
		if (!items) {
			// selection cancelled: nothing to scrape and nothing to wait for
			return;
		}

		for (var href in items) {
			// The article address follows the first "u=" and ends at the next "&".
			var start = href.indexOf("u=");
			if (start === -1) {
				Zotero.debug("The Age: no u= parameter in " + href);
				continue;
			}
			var encoded = href.substring(start + 2).split("&")[0];
			var target;
			try {
				// decodes every percent-escape, in either letter case
				target = decodeURIComponent(encoded);
			} catch (e) {
				// malformed escape sequence: fall back to decoding only the
				// two escapes that were handled historically
				target = encoded.replace(/%3A/g, ":").replace(/%2F/g, "/");
			}
			if (target) {
				URLS.push(target);
			}
		}
	} else {
		URLS.push(url);
	}
	Zotero.debug(URLS);

	if (URLS.length === 0) {
		return;
	}

	// NOTE(review): scrape() calls Zotero.done() after every article, so with
	// several selected results completion may be signalled before the later
	// ones have finished - confirm against the Zotero version in use.
	for (var i = 0; i < URLS.length; i++) {
		scrape(URLS[i]);
	}
	Zotero.wait();
}