-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample_browsertrix_driver.js
74 lines (63 loc) · 3.18 KB
/
example_browsertrix_driver.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
// Usage
/*
Usage: run the code below in the context of the page, or inject a <script> tag pointing to this file:
<script src="src/example_browsertrix_driver.js"></script>
It uses window.Browsertrix, a hypothetical API that would be provided by the Browsertrix extension.
*/
/**
 * Pause an async flow for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns {Promise<void>} Promise that fulfills (with no value) once the timer fires.
 */
const sleep = (ms) => {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
};
/**
 * Example behavior driver that forwards BehaviorBus events to `window.Browsertrix`,
 * a hypothetical API that would be provided by the Browsertrix extension.
 *
 * Shape:
 *   - name / schema: identify the driver and the driver schema version it targets.
 *   - state.warc_file: whatever `window.Browsertrix.saveWarc()` last returned (null until then).
 *   - hooks.<context>.<EVENT_TYPE>: async handlers invoked as (event, BehaviorBus, page).
 */
const BrowsertrixInBrowserCrawlDriver = {
  name: 'BrowsertrixInBrowserCrawlDriver',
  schema: 'BehaviorDriverSchema@0.1.0',
  state: {
    warc_file: null,
  },
  hooks: {
    browser: {
      /**
       * Write the event's content into an OPFS scratch file and attach it to the WARC.
       *
       * NOTE(review): assumes FS_WRITE_FILE events carry `path` and `content`
       * (the previous code read undeclared `path`/`content` variables and threw a
       * ReferenceError) - confirm against the event schema.
       */
      FS_WRITE_FILE: async (event, BehaviorBus, page) => {
        const { path, content } = event;
        const opfsRoot = await window.navigator.storage.getDirectory();
        // NOTE(review): every write reuses the single scratch entry "fast".
        const fileHandle = await opfsRoot.getFileHandle("fast", { create: true });
        // createSyncAccessHandle() only exists inside dedicated workers; this hook runs
        // against `window`, so use the async writable stream instead.
        const writable = await fileHandle.createWritable();
        await writable.write(content);
        // close() is what commits the written data to the file.
        await writable.close();
        // Hand over the file handle: the previous code passed an already-closed
        // sync access handle, which can no longer be read from.
        await window.Browsertrix.addExtraFileToWarc(path, fileHandle);
      },
      /** Queue a discovered outlink (event.url) for crawling. */
      DISCOVERED_OUTLINK: async (event, BehaviorBus, page) => {
        await window.Browsertrix.addLinkToCrawlQueue(event.url);
      },
      /** Add discovered page text (event.text) to the search index. */
      DISCOVERED_TEXT: async (event, BehaviorBus, page) => {
        await window.Browsertrix.addTextToSearchIndex(event.text);
      },
      /** Save the WARC and remember the result in the driver state. */
      PAGE_CAPTURE_COMPLETE: async (event, BehaviorBus, page) => {
        // Refer to this object by its real name (was the undefined `BrowsertrixCrawlDriver`).
        BrowsertrixInBrowserCrawlDriver.state.warc_file = await window.Browsertrix.saveWarc();
      },
    },
    serviceworker: {
      /**
       * Upload the WARC previously stored in the driver state.
       *
       * NOTE(review): this hook still goes through `window`, and reads state written by
       * the `browser` hook - confirm both are available in the context that runs it.
       */
      PAGE_CAPTURE_COMPLETE: async (event, BehaviorBus, page) => {
        await window.Browsertrix.uploadWarc(BrowsertrixInBrowserCrawlDriver.state.warc_file);
      },
    },
  },
};
/**
 * Set up a WindowBehaviorBus on the current page with the Browsertrix driver plus
 * the given behaviors, then emit the page lifecycle events.
 *
 * PAGE_SETUP is emitted as soon as the bus exists. PAGE_LOAD, PAGE_CAPTURE and
 * PAGE_CAPTURE_COMPLETE follow once the page has loaded, 5 seconds apart. The
 * returned promise resolves after PAGE_SETUP; it does not wait for the rest.
 *
 * @param {string} url - URL to archive.
 * @param {Array<object>} [behaviors] - Behaviors to register next to the driver.
 *   When omitted, `window.BEHAVIORS` is read after the behaviors script has loaded.
 * @returns {Promise<void>}
 * @throws {Error} If one of the injected scripts fails to load.
 */
const crawlInBrowsertrixInBrowser = async (url, behaviors) => {
  // Append a <script> tag and resolve only once it has actually loaded.
  const injectScript = (src) => new Promise((resolve, reject) => {
    const tag = document.createElement("script");
    tag.src = src;
    tag.addEventListener('load', () => resolve(tag), { once: true });
    tag.addEventListener('error', () => reject(new Error(`Failed to load script: ${src}`)), { once: true });
    document.head.appendChild(tag);
  });

  // navigate to the url we want to archive in the browser, unless we are already
  // on it: re-assigning the current URL reloads the page and discards this script
  const targetHref = new URL(url, window.location.href).href;
  if (window.location.href !== targetHref) {
    window.location.href = url;
  }

  // inject the behavior_bus.js implementation + example_behaviors.js into the page,
  // waiting for each one so the globals they define exist before they are used
  await injectScript("src/behavior_bus.js");
  await injectScript("src/example_behaviors.js");

  // the caller may have read window.BEHAVIORS before the script above defined it
  const activeBehaviors = behaviors ?? window.BEHAVIORS ?? [];

  // initialize the WindowBehaviorBus bus
  const BehaviorBus = window.initWindowBehaviorBus([BrowsertrixInBrowserCrawlDriver, ...activeBehaviors]);
  const emitAsDriver = (event) => BehaviorBus.emit(event, {path: [BrowsertrixInBrowserCrawlDriver.name]});
  emitAsDriver({type: 'PAGE_SETUP', url});

  // run the page lifecycle events
  const runPageLifecycle = async () => {
    emitAsDriver({type: 'PAGE_LOAD', url: window.location.href});
    await sleep(5000);
    emitAsDriver({type: 'PAGE_CAPTURE', url: window.location.href});
    await sleep(5000);
    emitAsDriver({type: 'PAGE_CAPTURE_COMPLETE', url: window.location.href});
  };
  const startPageLifecycle = () => {
    runPageLifecycle().catch((err) => console.error('Page lifecycle failed:', err));
  };

  if (document.readyState === 'complete') {
    // 'load' has already fired (likely, after awaiting the scripts) and will not fire again
    startPageLifecycle();
  } else {
    window.addEventListener('load', startPageLifecycle, { once: true });
  }
};
// run the example
// Top-level `await` is a SyntaxError in a classic <script src="..."> (the usage
// documented for this file), so chain on the promise and report failures instead.
crawlInBrowsertrixInBrowser('https://example.com', window.BEHAVIORS)
  .catch((err) => console.error('Browsertrix in-browser crawl failed:', err));