diff --git a/README.md b/README.md index 6aea136b1..9c6e5287c 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ can use one of the many wrappers available on npm, like npm install ws ``` -### Opt-in for performance and spec compliance +### Opt-in for performance There are 2 optional modules that can be installed along side with the ws module. These modules are binary addons which improve certain operations. @@ -67,7 +67,7 @@ necessarily need to have a C++ compiler installed on your machine. operations such as masking and unmasking the data payload of the WebSocket frames. - `npm install --save-optional utf-8-validate`: Allows to efficiently check if a - message contains valid UTF-8 as required by the spec. + message contains valid UTF-8. ## API docs diff --git a/lib/validation.js b/lib/validation.js index 32db5a570..d8693fdb9 100644 --- a/lib/validation.js +++ b/lib/validation.js @@ -1,16 +1,5 @@ 'use strict'; -try { - const isValidUTF8 = require('utf-8-validate'); - - exports.isValidUTF8 = - typeof isValidUTF8 === 'object' - ? isValidUTF8.Validation.isValidUTF8 // utf-8-validate@<3.0.0 - : isValidUTF8; -} catch (e) /* istanbul ignore next */ { - exports.isValidUTF8 = () => true; -} - /** * Checks if a status code is allowed in a close frame. * @@ -18,7 +7,7 @@ try { * @return {Boolean} `true` if the status code is valid, else `false` * @public */ -exports.isValidStatusCode = (code) => { +function isValidStatusCode(code) { return ( (code >= 1000 && code <= 1014 && @@ -27,4 +16,89 @@ exports.isValidStatusCode = (code) => { code !== 1006) || (code >= 3000 && code <= 4999) ); -}; +} + +/** + * Checks if a given buffer contains only correct UTF-8. + * Ported from https://www.cl.cam.ac.uk/%7Emgk25/ucs/utf8_check.c by + * Markus Kuhn. + * + * @param {Buffer} buf The buffer to check + * @return {Boolean} `true` if `buf` contains only correct UTF-8, else `false` + * @public + */ +function _isValidUTF8(buf) { + const len = buf.length; + let i = 0; + + while (i < len) { + if (buf[i] < 0x80) { + // 0xxxxxxx + i++; + } else if ((buf[i] & 0xe0) === 0xc0) { + // 110xxxxx 10xxxxxx + if ( + i + 1 === len || + (buf[i + 1] & 0xc0) !== 0x80 || + (buf[i] & 0xfe) === 0xc0 // Overlong + ) { + return false; + } else { + i += 2; + } + } else if ((buf[i] & 0xf0) === 0xe0) { + // 1110xxxx 10xxxxxx 10xxxxxx + if ( + i + 2 >= len || + (buf[i + 1] & 0xc0) !== 0x80 || + (buf[i + 2] & 0xc0) !== 0x80 || + (buf[i] === 0xe0 && (buf[i + 1] & 0xe0) === 0x80) || // Overlong + (buf[i] === 0xed && (buf[i + 1] & 0xe0) === 0xa0) // Surrogate (U+D800 - U+DFFF) + ) { + return false; + } else { + i += 3; + } + } else if ((buf[i] & 0xf8) === 0xf0) { + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + if ( + i + 3 >= len || + (buf[i + 1] & 0xc0) !== 0x80 || + (buf[i + 2] & 0xc0) !== 0x80 || + (buf[i + 3] & 0xc0) !== 0x80 || + (buf[i] === 0xf0 && (buf[i + 1] & 0xf0) === 0x80) || // Overlong + (buf[i] === 0xf4 && buf[i + 1] > 0x8f) || + buf[i] > 0xf4 // > U+10FFFF + ) { + return false; + } else { + i += 4; + } + } else { + return false; + } + } + + return true; +} + +try { + let isValidUTF8 = require('utf-8-validate'); + + /* istanbul ignore if */ + if (typeof isValidUTF8 === 'object') { + isValidUTF8 = isValidUTF8.Validation.isValidUTF8; // utf-8-validate@<3.0.0 + } + + module.exports = { + isValidStatusCode, + isValidUTF8(buf) { + return buf.length < 150 ? _isValidUTF8(buf) : isValidUTF8(buf); + } + }; +} catch (e) /* istanbul ignore next */ { + module.exports = { + isValidStatusCode, + isValidUTF8: _isValidUTF8 + }; +} diff --git a/test/validation.test.js b/test/validation.test.js new file mode 100644 index 000000000..5718b12f0 --- /dev/null +++ b/test/validation.test.js @@ -0,0 +1,52 @@ +'use strict'; + +const assert = require('assert'); + +const { isValidUTF8 } = require('../lib/validation'); + +describe('extension', () => { + describe('isValidUTF8', () => { + it('returns false if it finds invalid bytes', () => { + assert.strictEqual(isValidUTF8(Buffer.from([0xf8])), false); + }); + + it('returns false for overlong encodings', () => { + assert.strictEqual(isValidUTF8(Buffer.from([0xc0, 0xa0])), false); + assert.strictEqual(isValidUTF8(Buffer.from([0xe0, 0x80, 0xa0])), false); + assert.strictEqual( + isValidUTF8(Buffer.from([0xf0, 0x80, 0x80, 0xa0])), + false + ); + }); + + it('returns false for code points in the range U+D800 - U+DFFF', () => { + for (let i = 0xa0; i < 0xc0; i++) { + for (let j = 0x80; j < 0xc0; j++) { + assert.strictEqual(isValidUTF8(Buffer.from([0xed, i, j])), false); + } + } + }); + + it('returns false for code points greater than U+10FFFF', () => { + assert.strictEqual( + isValidUTF8(Buffer.from([0xf4, 0x90, 0x80, 0x80])), + false + ); + assert.strictEqual( + isValidUTF8(Buffer.from([0xf5, 0x80, 0x80, 0x80])), + false + ); + }); + + it('returns true for a well-formed UTF-8 byte sequence', () => { + // prettier-ignore + const buf = Buffer.from([ + 0xe2, 0x82, 0xAC, // € + 0xf0, 0x90, 0x8c, 0x88, // 𐍈 + 0x24 // $ + ]); + + assert.strictEqual(isValidUTF8(buf), true); + }); + }); +});