Skip to content

Commit

Permalink
[fix] Make UTF-8 validation work even if utf-8-validate is not installed
Browse files Browse the repository at this point in the history
Fixes #1868
  • Loading branch information
lpinca committed Apr 17, 2021
1 parent 114de9e commit 23ba6b2
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 15 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ can use one of the many wrappers available on npm, like
npm install ws
```

### Opt-in for performance and spec compliance
### Opt-in for performance

There are 2 optional modules that can be installed alongside the ws
module. These modules are binary addons which improve certain operations.
Expand All @@ -67,7 +67,7 @@ necessarily need to have a C++ compiler installed on your machine.
operations such as masking and unmasking the data payload of the WebSocket
frames.
- `npm install --save-optional utf-8-validate`: Allows efficiently checking whether a
message contains valid UTF-8 as required by the spec.
message contains valid UTF-8.

## API docs

Expand Down
100 changes: 87 additions & 13 deletions lib/validation.js
Original file line number Diff line number Diff line change
@@ -1,24 +1,13 @@
'use strict';

// Expose `isValidUTF8` from the optional `utf-8-validate` package when it can
// be loaded.
try {
const isValidUTF8 = require('utf-8-validate');

// When the package exports an object (utf-8-validate before 3.0.0) the
// validator is read from `Validation.isValidUTF8`; otherwise the export itself
// is used as the validator.
exports.isValidUTF8 =
typeof isValidUTF8 === 'object'
? isValidUTF8.Validation.isValidUTF8 // utf-8-validate@<3.0.0
: isValidUTF8;
} catch (e) /* istanbul ignore next */ {
// Reached when loading the package throws. The stub below returns `true` for
// any input, so no UTF-8 checking takes place on this path.
exports.isValidUTF8 = () => true;
}

/**
* Checks if a status code is allowed in a close frame.
*
* @param {Number} code The status code
* @return {Boolean} `true` if the status code is valid, else `false`
* @public
*/
exports.isValidStatusCode = (code) => {
function isValidStatusCode(code) {
return (
(code >= 1000 &&
code <= 1014 &&
Expand All @@ -27,4 +16,89 @@ exports.isValidStatusCode = (code) => {
code !== 1006) ||
(code >= 3000 && code <= 4999)
);
};
}

/**
 * Determines whether every byte sequence in a buffer is well-formed UTF-8.
 * The checks follow `utf8_check.c` by Markus Kuhn
 * (https://www.cl.cam.ac.uk/%7Emgk25/ucs/utf8_check.c).
 *
 * @param {Buffer} buf The bytes to inspect
 * @return {Boolean} `false` as soon as a malformed sequence is met, `true` if
 *     the end of `buf` is reached without finding one
 * @public
 */
function _isValidUTF8(buf) {
  const end = buf.length;
  let pos = 0;

  // Trailing bytes of a multi-byte sequence must have the form 10xxxxxx.
  const isContinuation = (byte) => (byte & 0xc0) === 0x80;

  while (pos < end) {
    const lead = buf[pos];

    // 0xxxxxxx: single byte.
    if (lead < 0x80) {
      pos++;
      continue;
    }

    // 110xxxxx 10xxxxxx
    if ((lead & 0xe0) === 0xc0) {
      if (pos + 1 === end) return false; // Truncated
      if (!isContinuation(buf[pos + 1])) return false;
      if ((lead & 0xfe) === 0xc0) return false; // Overlong

      pos += 2;
      continue;
    }

    // 1110xxxx 10xxxxxx 10xxxxxx
    if ((lead & 0xf0) === 0xe0) {
      if (pos + 2 >= end) return false; // Truncated

      const second = buf[pos + 1];

      if (!isContinuation(second)) return false;
      if (!isContinuation(buf[pos + 2])) return false;
      if (lead === 0xe0 && (second & 0xe0) === 0x80) return false; // Overlong
      // Surrogate (U+D800 - U+DFFF)
      if (lead === 0xed && (second & 0xe0) === 0xa0) return false;

      pos += 3;
      continue;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if ((lead & 0xf8) === 0xf0) {
      if (pos + 3 >= end) return false; // Truncated

      const second = buf[pos + 1];

      if (!isContinuation(second)) return false;
      if (!isContinuation(buf[pos + 2])) return false;
      if (!isContinuation(buf[pos + 3])) return false;
      if (lead === 0xf0 && (second & 0xf0) === 0x80) return false; // Overlong
      // > U+10FFFF
      if (lead === 0xf4 && second > 0x8f) return false;
      if (lead > 0xf4) return false;

      pos += 4;
      continue;
    }

    // Any other lead byte cannot start a sequence.
    return false;
  }

  return true;
}

// Decide once, at load time, which `isValidUTF8` implementation is exported.
let validation;

try {
  const addon = require('utf-8-validate');

  // When the package exports an object (utf-8-validate before 3.0.0) the
  // validator is read from `Validation.isValidUTF8`; otherwise the export
  // itself is the validator.
  /* istanbul ignore next */
  const nativeIsValidUTF8 =
    typeof addon === 'object'
      ? addon.Validation.isValidUTF8 // utf-8-validate@<3.0.0
      : addon;

  validation = {
    isValidStatusCode,
    // Buffers shorter than 150 bytes are checked by the JavaScript
    // implementation; everything else is passed to the addon.
    // NOTE(review): presumably the cutoff avoids addon call overhead on small
    // inputs - the reason is not stated in this file.
    isValidUTF8: (buf) => {
      if (buf.length < 150) return _isValidUTF8(buf);

      return nativeIsValidUTF8(buf);
    }
  };
} catch (e) /* istanbul ignore next */ {
  // Loading the optional addon threw: the JavaScript implementation handles
  // every buffer regardless of size.
  validation = {
    isValidStatusCode,
    isValidUTF8: _isValidUTF8
  };
}

module.exports = validation;
52 changes: 52 additions & 0 deletions test/validation.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
'use strict';

const assert = require('assert');

const { isValidUTF8 } = require('../lib/validation');

// Tests for the helpers exported by `lib/validation`. Every buffer used here
// is only a few bytes long.
describe('validation', () => {
  describe('isValidUTF8', () => {
    it('returns false if it finds invalid bytes', () => {
      assert.strictEqual(isValidUTF8(Buffer.from([0xf8])), false);
      // A continuation byte cannot start a sequence.
      assert.strictEqual(isValidUTF8(Buffer.from([0x80])), false);
    });

    it('returns false for truncated multi-byte sequences', () => {
      // Each buffer ends one byte before its sequence is complete.
      assert.strictEqual(isValidUTF8(Buffer.from([0xc2])), false);
      assert.strictEqual(isValidUTF8(Buffer.from([0xe2, 0x82])), false);
      assert.strictEqual(isValidUTF8(Buffer.from([0xf0, 0x90, 0x8c])), false);
    });

    it('returns false for overlong encodings', () => {
      assert.strictEqual(isValidUTF8(Buffer.from([0xc0, 0xa0])), false);
      assert.strictEqual(isValidUTF8(Buffer.from([0xe0, 0x80, 0xa0])), false);
      assert.strictEqual(
        isValidUTF8(Buffer.from([0xf0, 0x80, 0x80, 0xa0])),
        false
      );
    });

    it('returns false for code points in the range U+D800 - U+DFFF', () => {
      // 0xed followed by 0xa0..0xbf encodes a surrogate, whatever the third
      // byte is.
      for (let i = 0xa0; i < 0xc0; i++) {
        for (let j = 0x80; j < 0xc0; j++) {
          assert.strictEqual(isValidUTF8(Buffer.from([0xed, i, j])), false);
        }
      }
    });

    it('returns false for code points greater than U+10FFFF', () => {
      assert.strictEqual(
        isValidUTF8(Buffer.from([0xf4, 0x90, 0x80, 0x80])),
        false
      );
      assert.strictEqual(
        isValidUTF8(Buffer.from([0xf5, 0x80, 0x80, 0x80])),
        false
      );
    });

    it('returns true for a well-formed UTF-8 byte sequence', () => {
      // prettier-ignore
      const buf = Buffer.from([
        0xe2, 0x82, 0xAC, // €
        0xf0, 0x90, 0x8c, 0x88, // 𐍈
        0x24 // $
      ]);

      assert.strictEqual(isValidUTF8(buf), true);
    });

    it('returns true for an empty buffer', () => {
      assert.strictEqual(isValidUTF8(Buffer.alloc(0)), true);
    });
  });
});

0 comments on commit 23ba6b2

Please sign in to comment.