Skip to content

Commit

Permalink
(Tentative) Move SSL capture to ScoopProxy
Browse files Browse the repository at this point in the history
Implements #138

---

- Removes `crip` dependency, dedicated certificates capture step and associated options.
- Intercepts certificate chain at `ScoopProxy` level using `socket.getPeerCertificate()` to assemble a PEM on the fly. Runs once per origin.
- Removes duplicate processing of `noarchive` checks

---

**Still working through:** Certificate interception currently happens in `ScoopProxy.onResponse()`. It should happen in `ScoopProxy.onConnected()` instead, but in some cases that appears to be _"too early"_. TBD, but this version works.
  • Loading branch information
matteocargnelutti committed Apr 11, 2023
1 parent 671cf85 commit 5389649
Show file tree
Hide file tree
Showing 9 changed files with 106 additions and 178 deletions.
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,6 @@ Options:
--network-idle-timeout <number> Max time Scoop will wait for the in-browser networking tasks to complete, in ms. (default: 20000)
--behaviors-timeout <number> Max time Scoop will wait for the browser behaviors to complete, in ms. (default: 20000)
--capture-video-as-attachment-timeout <number> Max time Scoop will wait for the video capture process to complete, in ms. (default: 30000)
--capture-certificates-as-attachment-timeout <number> Max time Scoop will wait for the certificates capture process to complete, in ms. (default: 10000)
--capture-window-x <number> Width of the browser window Scoop will open to capture, in pixels. (default: 1600)
--capture-window-y <number> Height of the browser window Scoop will open to capture, in pixels. (default: 900)
--max-capture-size <number> Size limit for the capture's exchanges list, in bytes. (default: 209715200)
Expand All @@ -201,7 +200,6 @@ Options:
--proxy-verbose <bool> Should Scoop's HTTP proxy output logs to the console? (choices: "true", "false", default: "false")
--public-ip-resolver-endpoint <string> API endpoint to be used to resolve the client's IP address. Used in the context of the provenance summary. (default: "https://icanhazip.com")
--yt-dlp-path <string> Path to the yt-dlp executable. Used for capturing videos. (default: "[library]/executables/yt-dlp")
--crip-path <string> Path to the crip executable. Used for capturing SSL/TLS certificates. (default: "[library]/executables/crip")
--log-level <string> Controls Scoop CLI's verbosity. (choices: "silent", "trace", "debug", "info", "warn", "error", default: "info")
-h, --help Show options list.
```
Expand Down Expand Up @@ -362,7 +360,7 @@ Namely:
- Same goes for certificates, captured as attachments via [crip](https://github.com/Hakky54/certificate-ripper).
- Favicons may be captured out-of-band using [curl](https://curl.se/), if not intercepted during capture.

Exchanges captured in that context still go through Scoop's HTTP proxy, with the exception of _crip_.
Exchanges captured in that context still go through Scoop's HTTP proxy.

```mermaid
flowchart LR
Expand Down
167 changes: 35 additions & 132 deletions Scoop.js
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,17 @@ export class Scoop {
* cpuArchitecture: ?string,
* blockedRequests: Array.<{match: string, rule: string}>,
* noArchiveUrls: string[],
* certificates: Array.<{host: string, pem: string}>,
* certificates: Object.<string, string>,
* ytDlpHash: string,
* cripHash: string,
* options: ScoopOptions,
* }}
*
*/
provenanceInfo = {
blockedRequests: [],
noArchiveUrls: [],
certificates: []
certificates: {} /// Key: host, Value: PEM string
}

/**
Expand Down Expand Up @@ -385,30 +386,6 @@ export class Scoop {
})
}

// Push step: noarchive directive detection
// TODO: Move this logic back to ScoopProxy.intercept() when new proxy implementation is ready.
steps.push({
name: 'Detecting "noarchive" directive',
alwaysRun: true,
webPageOnly: true,
main: async () => {
for (const exchange of this.intercepter.exchanges) {
this.intercepter.checkExchangeForNoArchive(exchange)
}
}
})

// Push step: certs capture
if (options.captureCertificatesAsAttachment) {
steps.push({
name: 'Capturing certificates info',
alwaysRun: options.attachmentsBypassLimits,
main: async () => {
await this.#captureCertificatesAsAttachment()
}
})
}

// Push step: Provenance summary
if (options.provenanceSummary) {
steps.push({
Expand Down Expand Up @@ -1038,98 +1015,6 @@ export class Scoop {
this.addGeneratedExchange(url, httpHeaders, body, isEntryPoint, description)
}

/**
* Runs `crip` against the different origins the capture process encountered.
* Captures certificates as `file:///[origin].pem`).
* Populates `this.provenanceInfo.certificates`.
*
* @returns {Promise<void>}
* @private
*/
async #captureCertificatesAsAttachment () {
const { captureCertificatesAsAttachmentTimeout, cripPath } = this.options

//
// Start timeout timer
//
let timeIsOut = false
const timer = setTimeout(() => { timeIsOut = true }, captureCertificatesAsAttachmentTimeout)

//
// Check that `crip` is available
//
try {
await exec(cripPath)
} catch (err) {
this.log.trace(err)
throw new Error('"crip" executable is not available or cannot be executed.')
}

//
// Pull certs
//
const processedHosts = new Map()

for (const exchange of this.intercepter.exchanges) {
const url = new URL(exchange.url)

if (timeIsOut) {
throw new Error('Capture certificates at attachment timeout reached')
}

if (url.protocol !== 'https:' || processedHosts.get(url.host) === true) {
continue
}

if (this.blocklist.find(searchBlocklistFor(`https://${url.host}`))) {
this.log.warn(`${url.host} matched against blocklist - skipped trying to pull its certificate.`)
continue
}

try {
const cripOptions = [
'print',
'-u', `https://${url.host}`,
'-f', 'pem'
]

let timeout = captureCertificatesAsAttachmentTimeout

if (processedHosts.length > 0) { // Timeout per request decreases as we go through the list.
timeout = captureCertificatesAsAttachmentTimeout / processedHosts.length
}

const spawnOptions = {
timeout: timeout > 1000 ? timeout : 1000,
maxBuffer: 1024 * 1024 * 128
}

const pem = await exec(cripPath, cripOptions, spawnOptions)

processedHosts.set(url.host, true)

if (!pem) {
throw new Error(`crip did not return a PEM for ${url.host}.`)
}

// Add to generated exchanges
const fileUrl = `file:///${url.host}.pem`
const httpHeaders = new Headers({ 'content-type': 'application/x-pem-file' })
const body = Buffer.from(pem)
const isEntryPoint = false
await this.addGeneratedExchange(fileUrl, httpHeaders, body, isEntryPoint)

// Add to `this.provenanceInfo.certificates`
this.provenanceInfo.certificates.push({ host: url.host, pem })
} catch (err) {
this.log.trace(err)
this.log.warn(`Certificates could not be extracted for ${url.host}`)
}
}

clearTimeout(timer)
}

/**
* Populates `this.provenanceInfo`, which is then used to generate a `file:///provenance-summary.html` exchange and entry point.
* That property is also used by `scoopToWACZ()` to populate the `extras` field of `datapackage.json`.
Expand All @@ -1148,7 +1033,6 @@ export class Scoop {
const osInfo = await getOSInfo()
const userAgent = await page.evaluate(() => window.navigator.userAgent) // Source user agent from the browser in case it was altered during capture
let ytDlpHash = ''
let cripHash = ''

// Grab public IP address
try {
Expand Down Expand Up @@ -1183,18 +1067,6 @@ export class Scoop {
this.log.trace(err)
}

// Compute crip hash
try {
cripHash = createHash('sha256')
.update(await readFile(this.options.cripPath))
.digest('hex')

cripHash = `sha256:${cripHash}`
} catch (err) {
this.log.warn('Could not compute SHA256 hash of crip executable')
this.log.trace(err)
}

// Gather provenance info
this.provenanceInfo = {
...this.provenanceInfo,
Expand All @@ -1207,7 +1079,6 @@ export class Scoop {
osVersion: osInfo.version,
cpuArchitecture: os.machine(),
ytDlpHash,
cripHash,
options: structuredClone(this.options)
}

Expand Down Expand Up @@ -1235,6 +1106,38 @@ export class Scoop {
}
}

/**
* Adds an SSL certificate to the capture as:
* - An entry in `provenanceInfo.certificates`
* - A generated exchange (file:///{host}.pem)
* @param {string} host
* @param {string} pem
* @returns {Promise<void>}
*/
async addCertificate (host, pem) {
host = `${host}`
pem = `${pem}`

if (host.length < 3 || !host.includes('.')) {
throw new Error('"host" must be a valid network host.')
}

if (!pem.startsWith('-----BEGIN CERTIFICATE-----') ||
!pem.endsWith('-----END CERTIFICATE-----\n')) {
throw new Error('"pem" must be a valid certificate.')
}

// Save as generated exchange
const fileUrl = `file:///${host}.pem`
const httpHeaders = new Headers({ 'content-type': 'application/x-pem-file' })
const body = Buffer.from(pem)
const isEntryPoint = false
await this.addGeneratedExchange(fileUrl, httpHeaders, body, isEntryPoint)

// Save in provenance info (if successful)
this.provenanceInfo.certificates[host] = pem
}

/**
* Generates a ScoopGeneratedExchange for generated content and adds it to `exchanges`.
* Unless `force` argument is passed, generated exchanges count towards time / size limits.
Expand Down
15 changes: 12 additions & 3 deletions Scoop.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ import express from 'express'
import { FIXTURES_PATH } from './constants.js'
import { isPNG, getDimensions } from './utils/png.js'
import { isPDF, getPageCount } from './utils/pdf.js'
import { defaults } from './options.js'
import { defaults, testDefaults } from './options.js'
import { Scoop } from './Scoop.js'

await test('Scoop - capture of a web page.', async (t) => {
await test('Scoop - capture of a (local) web page.', async (t) => {
const app = express()
const PORT = 3000
const URL = `http://localhost:${PORT}`
Expand Down Expand Up @@ -103,7 +103,16 @@ await test('Scoop - capture of a web page.', async (t) => {
server.close()
})

await test('Scoop - capture of a non-web resource.', async (t) => {
// Accounts for tests that can't be run locally.
// This suite captures the live `https://example.com` (presumably requires outbound network access).
await test('Scoop - capture of a (remote) web page.', async (t) => {
// Capturing an HTTPS origin should yield its certificate in provenance info and as a generated exchange.
await t.test('Scoop captures SSL certificates', async (_t) => {
const capture = await Scoop.capture('https://example.com', testDefaults)
// `provenanceInfo.certificates` is looked up by host
assert(capture.provenanceInfo.certificates['example.com'])
// The matching generated exchange is looked up as `{host}.pem`
assert(capture.extractGeneratedExchanges()['example.com.pem'])
})
})

await test('Scoop - capture of a (local) non-web resource.', async (t) => {
const app = express()
const PORT = 3000
const URL = `http://localhost:${PORT}`
Expand Down
6 changes: 3 additions & 3 deletions assets/templates/provenance-summary.njk
Original file line number Diff line number Diff line change
Expand Up @@ -165,14 +165,14 @@
</section>
{% endif %}

{% if certificates.length %}
{% if certificates|length %}
<section>
<h2>SSL/TLS Certificates</h2>

<p>The following certificates were intercepted by Scoop's proxy from the different origins encountered during capture.</p>

{% for cert in certificates %}
<li><a href="{{cert.host}}.pem">{{ cert.host }}</a></li>
{% for host, pem in certificates %}
<li><a href="{{host}}.pem">{{ host }}</a></li>
{% endfor %}

</section>
Expand Down
14 changes: 0 additions & 14 deletions bin/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,6 @@ program.addOption(
.default(defaults.captureVideoAsAttachmentTimeout)
)

program.addOption(
new Option(
'--capture-certificates-as-attachment-timeout <number>',
'Max time Scoop will wait for the certificates capture process to complete, in ms.')
.default(defaults.captureCertificatesAsAttachmentTimeout)
)

//
// Dimensions
//
Expand Down Expand Up @@ -283,13 +276,6 @@ program.addOption(
.default(defaults.ytDlpPath)
)

program.addOption(
new Option(
'--crip-path <string>',
'Path to the crip executable. Used for capturing SSL/TLS certificates.')
.default(defaults.cripPath)
)

program.addOption(
new Option('--log-level <string>', 'Controls Scoop CLI\'s verbosity.')
.choices(['silent', 'trace', 'debug', 'info', 'warn', 'error'])
Expand Down
Loading

0 comments on commit 5389649

Please # to comment.