buffer: add SIMD Neon optimization for byteLength

anonrig · kvakil · lemire · anonrig · commit 9494cb77997a · 2023-05-16T20:33:08.000-04:00
Co-authored-by: Keyhan Vakil &lt;kvakil@sylph.kvakil.me&gt;
Co-authored-by: Daniel Lemire &lt;daniel@lemire.me&gt;
diff --git a/node.gyp b/node.gyp
@@ -121,6 +121,7 @@
       'src/node_report_utils.cc',
       'src/node_sea.cc',
       'src/node_serdes.cc',
+      'src/node_simd.cc',
       'src/node_shadow_realm.cc',
       'src/node_snapshotable.cc',
       'src/node_sockaddr.cc',
diff --git a/src/node_buffer.cc b/src/node_buffer.cc
@@ -26,6 +26,7 @@
 #include "node_external_reference.h"
 #include "node_i18n.h"
 #include "node_internals.h"
+#include "node_simd.h"
 
 #include "env-inl.h"
 #include "simdutf.h"
@@ -743,14 +744,8 @@ void SlowByteLengthUtf8(const FunctionCallbackInfo<Value>& args) {
 
 uint32_t FastByteLengthUtf8(Local<Value> receiver,
                             const v8::FastOneByteString& source) {
-  uint32_t result = 0;
-  uint32_t length = source.length;
-  const uint8_t* data = reinterpret_cast<const uint8_t*>(source.data);
-  for (uint32_t i = 0; i < length; ++i) {
-    result += (data[i] >> 7);
-  }
-  result += length;
-  return result;
+  return node::simd::utf8_byte_length(
+      reinterpret_cast<const uint8_t*>(source.data), source.length);
 }
 
 static v8::CFunction fast_byte_length_utf8(
diff --git a/src/node_simd.cc b/src/node_simd.cc
@@ -0,0 +1,58 @@
+#include "node_simd.h"
+
+#if NODE_HAS_SIMD_NEON
+#include <arm_neon.h>
+#endif
+
+namespace node {
+namespace simd {
+
+#if NODE_HAS_SIMD_NEON
+uint32_t utf8_byte_length(const uint8_t* data, size_t length) {
+  uint64_t result{0};
+
+  const int lanes = sizeof(uint8x16_t);
+  const int max_sra_count = 256 / lanes; // Avoid overflowing vaddvq_u8.
+  const int unrolls = max_sra_count;
+  const int unrolled_lanes = lanes * unrolls;
+
+  const uint8_t *unroll_end = data + (length / unrolled_lanes) * unrolled_lanes;
+  uint32_t length_after_unroll = length % unrolled_lanes;
+  for (; data < unroll_end;) {
+    uint8x16_t acc = {};
+    for (int i = 0; i < unrolls; ++i, data += lanes) {
+      uint8x16_t chunk = vld1q_u8(data);
+      acc = vsraq_n_u8(acc, chunk, 7);
+    }
+    result += vaddvq_u8(acc);
+  }
+
+  const uint8_t *simd_end = data + (length_after_unroll / lanes) * lanes;
+  uint32_t length_after_simd = length % lanes;
+  uint8x16_t acc = {};
+  for (; data < simd_end; data += lanes) {
+    uint8x16_t chunk = vld1q_u8(data);
+    acc = vsraq_n_u8(acc, chunk, 7);
+  }
+  result += vaddvq_u8(acc);
+
+  const uint8_t *scalar_end = data + length_after_simd;
+  for (; data < scalar_end; data += 1) {
+    result += *data >> 7;
+  }
+
+  return result + length;
+}
+#else
+uint32_t utf8_byte_length(const uint8_t* data, size_t length) {
+  uint32_t result = 0;
+  for (uint32_t i = 0; i < length; ++i) {
+    result += (data[i] >> 7);
+  }
+  result += length;
+  return result;
+}
+#endif
+
+}  // namespace simd
+}  // namespace node
diff --git a/src/node_simd.h b/src/node_simd.h
@@ -0,0 +1,22 @@
+#ifndef SRC_NODE_SIMD_H_
+#define SRC_NODE_SIMD_H_
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define NODE_HAS_SIMD_NEON 1
+#endif
+
+#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
+
+#include <stdlib.h>
+
+namespace node {
+namespace simd {
+
+uint32_t utf8_byte_length(const uint8_t* input, size_t length);
+
+}  // namespace simd
+}  // namespace node
+
+#endif  // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
+
+#endif  // SRC_NODE_SIMD_H_