-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample_behaviors.js
136 lines (124 loc) · 6.25 KB
/
example_behaviors.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/*********************** Example Behaviors ***********************/
// example: find all the <a href>s on the page and add them to the crawl queue
/**
 * Example behavior that announces outgoing links as DISCOVERED_OUTLINK events.
 *
 * - window context: on PAGE_CAPTURE, dispatches one DISCOVERED_OUTLINK per <a> element
 *   that has a non-empty href, and logs every DISCOVERED_OUTLINK it receives.
 * - puppeteer context: on PAGE_SETUP, turns on request interception and dispatches a
 *   DISCOVERED_OUTLINK for every intercepted request whose URL ends in ".html".
 *
 * Every hook is called as (event, BehaviorBus, context). `BehaviorEvent` is not defined
 * in this file and must be available in the surrounding scope when a hook runs.
 */
const DiscoverOutlinksBehavior = {
    schema: 'BehaviorSchema@0.1.0',
    name: 'DiscoverOutlinksBehavior',
    hooks: {
        window: {
            // '*': async (event, BehaviorBus, window) => {
            //     console.log(`[window] -> [DiscoverOutlinksBehavior] ${JSON.stringify(event)}`);
            // },
            PAGE_CAPTURE: async (event, BehaviorBus, window) => {
                console.log(`[window] -> [DiscoverOutlinksBehavior] 🔍 Discovering outlinks...`);
                // Only anchors that actually carry an href are outlinks; a bare <a> (named anchor,
                // JS-only button, ...) has an empty .href and must not be queued as a URL.
                for (const elem of window.document.querySelectorAll('a[href]')) {
                    if (!elem.href) continue;
                    BehaviorBus.dispatchEvent(new BehaviorEvent('DISCOVERED_OUTLINK', {url: elem.href, elem}));
                }
            },
            DISCOVERED_OUTLINK: async (event, BehaviorBus, window) => {
                console.log(`[window] -> [DiscoverOutlinksBehavior] ➕ Found a new outlink to add to crawl! ${event.url}`);
                // browsertrix driver itself would also listen for this event and use it to add URLs to the crawl queue
            },
        },
        puppeteer: {
            // '*': async (event, BehaviorBus, page) => {
            //     console.log(`[puppeteer] -> [DiscoverOutlinksBehavior] ${JSON.stringify(event)}`);
            // },
            // can also optionally implement handlers that run in other contexts (if driver implements that context)
            PAGE_SETUP: async (event, BehaviorBus, page) => {
                console.log(`[puppeteer] -> [DiscoverOutlinksBehavior] 🔧 Setting up page for outlink discovery...`);
                await page.setRequestInterception(true);
                page.on('request', (request) => {
                    // Let the request proceed right away. continue() returns a promise that can reject
                    // (e.g. if the request was already handled elsewhere), so attach a handler instead
                    // of leaving it floating: an unhandled rejection would take down the Node process.
                    request.continue().catch((err) => {
                        console.warn(`[puppeteer] -> [DiscoverOutlinksBehavior] ⚠️ Could not continue request ${request.url()}: ${err?.message ?? err}`);
                    });
                    if (request.url().endsWith('.html')) {
                        BehaviorBus.dispatchEvent(new BehaviorEvent('DISCOVERED_OUTLINK', {url: request.url()}));
                        // consumes/broadcasts events to all contexts using same shared BehaviorBus
                        // so the DISCOVERED_OUTLINK handler above would fire even though it's bound in a different context
                    }
                });
            },
        },
    },
};
// example: behavior to extract a page's article text content
/**
 * Example behavior that publishes the text of the page body.
 *
 * window context, PAGE_CAPTURE: reads `window.document.body.innerText` once and dispatches
 * two events carrying that text, in this order:
 *   1. FS_WRITE_FILE   {path: 'body_text.txt', content}
 *   2. DISCOVERED_TEXT {selector: 'body', text}
 *
 * `BehaviorEvent` is not defined in this file and must be available in the surrounding scope.
 */
const ExtractArticleTextBehavior = {
    schema: 'BehaviorSchema@0.1.0',
    name: 'ExtractArticleTextBehavior',
    hooks: {
        window: {
            // debugging aid: a catch-all hook that logs every event this behavior sees
            // '*': async (event, BehaviorBus, window) => {
            //     console.log(`[window] -> [ExtractArticleTextBehavior] ${JSON.stringify(event)}`);
            // },
            PAGE_CAPTURE: async (event, BehaviorBus, window) => {
                console.log(`[window] -> [ExtractArticleTextBehavior] 📄 Extracting article text...`);

                const { body } = window.document;
                const bodyText = body.innerText;

                // [event type, payload] pairs, emitted strictly in this order
                const outgoing = [
                    ['FS_WRITE_FILE', {path: 'body_text.txt', content: bodyText}],
                    // browsertrix could listen for this to build a full-text-search index in the WARC if it wants
                    ['DISCOVERED_TEXT', {selector: 'body', text: bodyText}],
                ];
                for (const [type, payload] of outgoing) {
                    BehaviorBus.dispatchEvent(new BehaviorEvent(type, payload));
                }
            },
        },
    },
};
// example: behavior to expand comments on reddit, facebook, and github
/**
 * Example behavior that opens collapsed <details> sections on supported sites.
 *
 * - window context, PAGE_LOAD: queries each selector in the current document and opens every match.
 * - puppeteer context, PAGE_LOAD: runs the same selectors through puppeteer's `pierce/` query
 *   handler and opens every match inside the page.
 *
 * Both contexts only act when the page's hostname is facebook.com, reddit.com or github.com
 * (or a subdomain of one of them).
 */
const ExpandCommentsBehavior = {
    schema: 'BehaviorSchema@0.1.0',
    name: 'ExpandCommentsBehavior',

    // private helper methods that behavior can use internally

    // open one element (sets its `open` property)
    _expand: (elem) => { elem.open = true; },

    // Open every element of a list. Used as the page.$$eval() callback, which receives the
    // whole array of matches (not one element at a time) and is executed inside the page,
    // so this function must stay self-contained and cannot reference anything from this module.
    _expandAll: (elems) => {
        for (const elem of elems) {
            elem.open = true;
        }
    },

    // true when page_url is an absolute URL whose host is one of the supported sites
    _shouldRun: (page_url) => {
        let hostname;
        try {
            hostname = new URL(page_url).hostname;
        } catch {
            return false;  // not a parseable absolute URL -> nothing to expand
        }
        // compare hostnames rather than substrings: matches www./old./gist. subdomains and
        // ignores URLs that merely mention one of the domains in their path or query string
        return ['facebook.com', 'reddit.com', 'github.com'].some(
            (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
        );
    },

    // CSS selectors to expand on page_url; empty when the behavior should not run there
    _selectors: (page_url) => {
        if (ExpandCommentsBehavior._shouldRun(page_url)) {
            return ['article details', 'div.js-discussion details:not(.details-overlay)', '.markdown-body details'];
        }
        return [];
    },

    hooks: {
        window: {
            // '*': async (event, BehaviorBus, window) => {
            //     console.log(`[window] -> [ExpandCommentsBehavior] ${JSON.stringify(event)}`);
            // },
            PAGE_LOAD: async (event, BehaviorBus, window) => {
                console.log(`[window] -> [ExpandCommentsBehavior] 💬 Expanding comments...`);
                // expand all <details> sections in Github READMEs, HedgeDoc pages, etc.
                for (const selector of ExpandCommentsBehavior._selectors(window.location.href)) {
                    for (const elem of window.document.querySelectorAll(selector)) {
                        ExpandCommentsBehavior._expand(elem);
                    }
                }
            },
        },
        puppeteer: {
            // '*': async (event, BehaviorBus, page) => {
            //     console.log(`[puppeteer] -> [ExpandCommentsBehavior] ${JSON.stringify(event)}`);
            // },
            PAGE_LOAD: async (event, BehaviorBus, page) => {
                console.log(`[puppeteer] -> [ExpandCommentsBehavior] 💬 Expanding comments...`);
                // if driver offers a puppeteer context the behavior can use its extra powers to pierce nested iframes/shadow doms/etc
                for (const selector of ExpandCommentsBehavior._selectors(page.url())) {
                    // $$eval hands the callback an array of all matches, hence _expandAll rather than _expand
                    await page.$$eval(`pierce/${selector}`, ExpandCommentsBehavior._expandAll);
                }
            },
        },
    },
};
const BEHAVIORS = [DiscoverOutlinksBehavior, ExtractArticleTextBehavior, ExpandCommentsBehavior]
var all_exports = { DiscoverOutlinksBehavior, ExtractArticleTextBehavior, ExpandCommentsBehavior, BEHAVIORS }
if (globalThis.navigator) {
// loaded from browser, running in window
console.log(`[window] importing src/example_behaviors.js ...`);
for (const key of Object.keys(all_exports)) {
console.log(`[window] loaded window.${key}`);
globalThis[key] = all_exports[key];
}
} else {
// loaded from node, running in puppeteer
console.log(`[puppeteer] importing src/example_behaviors.js ...`);
for (const key of Object.keys(all_exports)) {
console.log(`[puppeteer] loaded global.${key}`);
}
}
export { BEHAVIORS, DiscoverOutlinksBehavior, ExtractArticleTextBehavior, ExpandCommentsBehavior }