-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathindex.js
107 lines (87 loc) · 2.52 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
'use strict';
const request = require('request');
const cheerio = require('cheerio');
/**
 * Extracts the value of a single configured field from a loaded document.
 *
 * `selectors[field]` is iterated as a list of partial selector descriptors.
 * Each descriptor may enable any of `text`, `attribute` and `html`; the
 * pieces they produce are concatenated, in that order, onto one string.
 * A descriptor with `group: true` instead returns immediately with an array
 * holding one object per matched element, each built by applying
 * `groupSelectors` to that element (anything accumulated before it is
 * discarded).
 *
 * @param {Function} $ - Query function called as `$(selector)`; its result
 *   must expose `first()`, `text()`, `attr()`, `html()` and, for groups,
 *   `length` plus numeric indexing (e.g. what `cheerio.load` returns).
 * @param {string} field - Key of `selectors` to extract.
 * @param {Object} selectors - Map of field name to a list of descriptors.
 * @param {boolean} [nested=false] - True when called for a group item;
 *   group descriptors are rejected at this level.
 * @returns {string|Object[]} Concatenated string, or the array of group items.
 * @throws {Error} If a group descriptor is nested inside another group, or
 *   a group descriptor has no `groupSelectors`.
 */
function extractField($, field, selectors, nested = false) {
  let fieldOutput = '';

  for (const partialSelector of selectors[field]) {
    const {
      selector,
      text = false,
      html = false,
      attribute = null,
      group = false,
      groupSelectors
    } = partialSelector;

    if (text) {
      fieldOutput += $(selector)
        .first()
        .text()
        .replace(/\r?\n/g, '')
        .trim();
    }

    if (attribute) {
      const attributeValue = $(selector)
        .first()
        .attr(attribute);
      // A missing attribute yields undefined; appending it would put the
      // literal text "undefined" into the output.
      if (attributeValue !== undefined && attributeValue !== null) {
        fieldOutput += attributeValue;
      }
    }

    if (html) {
      const markup = $(selector).html();
      // No match yields null; appending it would put the literal text
      // "null" into the output.
      if (markup !== undefined && markup !== null) {
        fieldOutput += markup;
      }
    }

    if (group) {
      if (nested) {
        throw new Error('Nested group selectors are not allowed.');
      }
      if (!groupSelectors) {
        throw new Error('Group selection missing group selectors.');
      }

      const groupFields = [];
      const all = $(selector);
      const selectorProperties = Object.keys(groupSelectors);

      for (let i = 0, l = all.length; i < l; i++) {
        // Each matched element is loaded on its own so the group selectors
        // are evaluated against that element only.
        const item = cheerio.load(all[i]);
        const fields = {};
        for (const property of selectorProperties) {
          fields[property] = extractField(item, property, groupSelectors, true);
        }
        groupFields.push(fields);
      }

      return groupFields;
    }
  }

  return fieldOutput;
}
/**
 * Fetches `originUrl` and extracts the configured fields from its HTML.
 *
 * @param {string} originUrl - Address to request; used as the `uri` option.
 * @param {Object} [config={}] - Scraper configuration.
 * @param {Object} [config.selectors={}] - Map of output field name to the
 *   list of selector descriptors passed to `extractField`.
 * @param {Object} [config.httpOptions={}] - Extra options spread over the
 *   default request options (`method: 'GET'`, `uri: originUrl`), so they
 *   can override either default.
 * @returns {Promise<Object>} Resolves with one entry per key of
 *   `config.selectors`. Rejects on a request error, a status code other
 *   than 200, a content type not containing `text/html`, or an error thrown
 *   while parsing or extracting.
 */
module.exports = (originUrl, config = {}) =>
  // The executor is deliberately not async: a synchronous throw inside a
  // plain executor rejects the promise instead of leaving it pending.
  new Promise((resolve, reject) => {
    const { selectors = {}, httpOptions = {} } = config;
    const requestOptions = {
      method: 'GET',
      uri: originUrl,
      ...httpOptions
    };

    const reqStream = request(requestOptions, (error, response, body) => {
      if (error) {
        reject(error);
        return;
      }
      // Parsing sits inside the try so a failure there rejects the promise
      // rather than escaping the callback as an uncaught exception.
      try {
        const $ = cheerio.load(body);
        const extractedFields = {};
        for (const property of Object.keys(selectors)) {
          extractedFields[property] = extractField($, property, selectors);
        }
        resolve(extractedFields);
      } catch (err) {
        reject(err);
      }
    });

    // Validate as soon as headers arrive, before the body is downloaded.
    reqStream.on('response', response => {
      const contentType = response.headers['content-type'];
      let failure = null;

      if (response.statusCode !== 200) {
        failure = new Error(`Http status code ${response.statusCode}`);
      } else if (!/text\/html/.test(contentType)) {
        failure = new Error(`Unsupported content type ${contentType}`);
      }

      if (failure) {
        reject(failure);
        // Stop the transfer; the body of a rejected response is not needed.
        reqStream.abort();
      }
    });
  });