[fix] Make UTF-8 validation work even if utf-8-validate is not installed

Fixes #1868
websockets · Apr 17, 2021 · 23ba6b2 · 23ba6b2
1 parent 114de9e
commit 23ba6b2
Show file tree

Hide file tree

Showing 3 changed files with 141 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -56,7 +56,7 @@ can use one of the many wrappers available on npm, like
 npm install ws
 ```
 
-### Opt-in for performance and spec compliance
+### Opt-in for performance
 
 There are 2 optional modules that can be installed along side with the ws
 module. These modules are binary addons which improve certain operations.
@@ -67,7 +67,7 @@ necessarily need to have a C++ compiler installed on your machine.
   operations such as masking and unmasking the data payload of the WebSocket
   frames.
 - `npm install --save-optional utf-8-validate`: Allows to efficiently check if a
-  message contains valid UTF-8 as required by the spec.
+  message contains valid UTF-8.
 
 ## API docs
 

diff --git a/lib/validation.js b/lib/validation.js
@@ -1,24 +1,13 @@
 'use strict';
 
-try {
-  const isValidUTF8 = require('utf-8-validate');
-
-  exports.isValidUTF8 =
-    typeof isValidUTF8 === 'object'
-      ? isValidUTF8.Validation.isValidUTF8 // utf-8-validate@<3.0.0
-      : isValidUTF8;
-} catch (e) /* istanbul ignore next */ {
-  exports.isValidUTF8 = () => true;
-}
-
 /**
  * Checks if a status code is allowed in a close frame.
  *
  * @param {Number} code The status code
  * @return {Boolean} `true` if the status code is valid, else `false`
  * @public
  */
-exports.isValidStatusCode = (code) => {
+function isValidStatusCode(code) {
   return (
     (code >= 1000 &&
       code <= 1014 &&
@@ -27,4 +16,89 @@ exports.isValidStatusCode = (code) => {
       code !== 1006) ||
     (code >= 3000 && code <= 4999)
   );
-};
+}
+
+/**
+ * Checks if a given buffer contains only correct UTF-8 (no truncated, overlong,
+ * surrogate or > U+10FFFF sequences). Ported from
+ * https://www.cl.cam.ac.uk/%7Emgk25/ucs/utf8_check.c by Markus Kuhn.
+ *
+ * @param {Buffer} buf The buffer to check
+ * @return {Boolean} `true` if `buf` contains only correct UTF-8, else `false`
+ * @public
+ */
+function _isValidUTF8(buf) {
+  const len = buf.length;
+  let i = 0;
+
+  while (i < len) {
+    if (buf[i] < 0x80) {
+      // 0xxxxxxx
+      i++;
+    } else if ((buf[i] & 0xe0) === 0xc0) {
+      // 110xxxxx 10xxxxxx
+      if (
+        i + 1 === len || // Truncated sequence
+        (buf[i + 1] & 0xc0) !== 0x80 || // Not a continuation byte
+        (buf[i] & 0xfe) === 0xc0 // Overlong (lead byte 0xc0 or 0xc1)
+      ) {
+        return false;
+      } else {
+        i += 2;
+      }
+    } else if ((buf[i] & 0xf0) === 0xe0) {
+      // 1110xxxx 10xxxxxx 10xxxxxx
+      if (
+        i + 2 >= len || // Truncated sequence
+        (buf[i + 1] & 0xc0) !== 0x80 ||
+        (buf[i + 2] & 0xc0) !== 0x80 ||
+        (buf[i] === 0xe0 && (buf[i + 1] & 0xe0) === 0x80) || // Overlong
+        (buf[i] === 0xed && (buf[i + 1] & 0xe0) === 0xa0) // Surrogate (U+D800 - U+DFFF)
+      ) {
+        return false;
+      } else {
+        i += 3;
+      }
+    } else if ((buf[i] & 0xf8) === 0xf0) {
+      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      if (
+        i + 3 >= len || // Truncated sequence
+        (buf[i + 1] & 0xc0) !== 0x80 ||
+        (buf[i + 2] & 0xc0) !== 0x80 ||
+        (buf[i + 3] & 0xc0) !== 0x80 ||
+        (buf[i] === 0xf0 && (buf[i + 1] & 0xf0) === 0x80) || // Overlong
+        (buf[i] === 0xf4 && buf[i + 1] > 0x8f) || // > U+10FFFF
+        buf[i] > 0xf4 // > U+10FFFF
+      ) {
+        return false;
+      } else {
+        i += 4;
+      }
+    } else {
+      return false; // Stray continuation byte (10xxxxxx) or lead byte 0xf8 - 0xff
+    }
+  }
+
+  return true;
+}
+
+try {
+  let isValidUTF8 = require('utf-8-validate'); // Optional addon; a failed load lands in the catch below
+
+  /* istanbul ignore if */
+  if (typeof isValidUTF8 === 'object') {
+    isValidUTF8 = isValidUTF8.Validation.isValidUTF8; // utf-8-validate@<3.0.0
+  }
+
+  module.exports = {
+    isValidStatusCode,
+    isValidUTF8(buf) { // Under 150 bytes: JS implementation, otherwise the addon
+      return buf.length < 150 ? _isValidUTF8(buf) : isValidUTF8(buf);
+    }
+  };
+} catch (e) /* istanbul ignore next */ {
+  module.exports = {
+    isValidStatusCode,
+    isValidUTF8: _isValidUTF8 // Addon could not be loaded: use the JS implementation
+  };
+}
diff --git a/test/validation.test.js b/test/validation.test.js
@@ -0,0 +1,52 @@
+'use strict';
+
+const assert = require('assert');
+
+const { isValidUTF8 } = require('../lib/validation');
+
+describe('extension', () => { // NOTE(review): label says 'extension' but this suite covers lib/validation; confirm
+  describe('isValidUTF8', () => {
+    it('returns false if it finds invalid bytes', () => {
+      assert.strictEqual(isValidUTF8(Buffer.from([0xf8])), false); // 0xf8 matches no lead-byte pattern
+    });
+
+    it('returns false for overlong encodings', () => {
+      assert.strictEqual(isValidUTF8(Buffer.from([0xc0, 0xa0])), false); // U+0020 in 2 bytes
+      assert.strictEqual(isValidUTF8(Buffer.from([0xe0, 0x80, 0xa0])), false); // U+0020 in 3 bytes
+      assert.strictEqual(
+        isValidUTF8(Buffer.from([0xf0, 0x80, 0x80, 0xa0])), // U+0020 in 4 bytes
+        false
+      );
+    });
+
+    it('returns false for code points in the range U+D800 - U+DFFF', () => {
+      for (let i = 0xa0; i < 0xc0; i++) {
+        for (let j = 0x80; j < 0xc0; j++) {
+          assert.strictEqual(isValidUTF8(Buffer.from([0xed, i, j])), false);
+        }
+      }
+    });
+
+    it('returns false for code points greater than U+10FFFF', () => {
+      assert.strictEqual(
+        isValidUTF8(Buffer.from([0xf4, 0x90, 0x80, 0x80])), // U+110000
+        false
+      );
+      assert.strictEqual(
+        isValidUTF8(Buffer.from([0xf5, 0x80, 0x80, 0x80])), // Lead byte above 0xf4
+        false
+      );
+    });
+
+    it('returns true for a well-formed UTF-8 byte sequence', () => {
+      // prettier-ignore
+      const buf = Buffer.from([
+        0xe2, 0x82, 0xAC, // €
+        0xf0, 0x90, 0x8c, 0x88, // 𐍈
+        0x24 // $
+      ]);
+
+      assert.strictEqual(isValidUTF8(buf), true);
+    });
+  });
+});