Skip to content

Commit 9494cb7

Browse files
anonrig, kvakil, and lemire
committed
buffer: add SIMD Neon optimization for byteLength
Co-authored-by: Keyhan Vakil <kvakil@sylph.kvakil.me> Co-authored-by: Daniel Lemire <daniel@lemire.me>
1 parent c9ec72d commit 9494cb7

File tree

4 files changed

+84
-8
lines changed

4 files changed

+84
-8
lines changed

node.gyp

+1
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,7 @@
121121
'src/node_report_utils.cc',
122122
'src/node_sea.cc',
123123
'src/node_serdes.cc',
124+
'src/node_simd.cc',
124125
'src/node_shadow_realm.cc',
125126
'src/node_snapshotable.cc',
126127
'src/node_sockaddr.cc',

src/node_buffer.cc

+3-8
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
#include "node_external_reference.h"
2727
#include "node_i18n.h"
2828
#include "node_internals.h"
29+
#include "node_simd.h"
2930

3031
#include "env-inl.h"
3132
#include "simdutf.h"
@@ -743,14 +744,8 @@ void SlowByteLengthUtf8(const FunctionCallbackInfo<Value>& args) {
743744

744745
// Returns the UTF-8 byte length of a v8::FastOneByteString: its length plus
// one extra byte for every character whose top bit is set.
// In this diff view, a line following an "NNN-" marker belongs to the removed
// scalar implementation and a line following an "NNN+" marker to its
// replacement, which hands the string's bytes to
// node::simd::utf8_byte_length. `receiver` is not used by either version.
uint32_t FastByteLengthUtf8(Local<Value> receiver,
745746
const v8::FastOneByteString& source) {
746-
uint32_t result = 0;
747-
uint32_t length = source.length;
748-
const uint8_t* data = reinterpret_cast<const uint8_t*>(source.data);
749-
for (uint32_t i = 0; i < length; ++i) {
750-
// data[i] >> 7 is 1 exactly when the byte's top bit is set, else 0.
result += (data[i] >> 7);
751-
}
752-
result += length;
753-
return result;
747+
return node::simd::utf8_byte_length(
748+
reinterpret_cast<const uint8_t*>(source.data), source.length);
754749
}
755750

756751
static v8::CFunction fast_byte_length_utf8(

src/node_simd.cc

+58
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,58 @@
1+
#include "node_simd.h"
2+
3+
#if NODE_HAS_SIMD_NEON
4+
#include <arm_neon.h>
5+
#endif
6+
7+
namespace node {
8+
namespace simd {
9+
10+
#if NODE_HAS_SIMD_NEON
// Returns `length` plus the number of bytes in [data, data + length) whose
// most significant bit is set, truncated to 32 bits. For a string stored one
// byte per character this is its UTF-8 size: every byte >= 0x80 needs two
// bytes, every other byte needs one.
//
// NEON implementation: processes 16 bytes per vector, first in fully unrolled
// 256-byte blocks, then in leftover 16-byte chunks, then byte by byte.
uint32_t utf8_byte_length(const uint8_t* data, size_t length) {
  uint64_t result{0};

  const int lanes = sizeof(uint8x16_t);
  // Each vsraq_n_u8 below adds (byte >> 7), i.e. 0 or 1, to every 8-bit lane,
  // so after `unrolls` accumulations a single lane holds at most `unrolls`.
  const int unrolls = 256 / lanes;
  const int unrolled_lanes = lanes * unrolls;

  const uint8_t* unroll_end = data + (length / unrolled_lanes) * unrolled_lanes;
  const uint32_t length_after_unroll = length % unrolled_lanes;
  while (data < unroll_end) {
    uint8x16_t acc = {};
    for (int i = 0; i < unrolls; ++i, data += lanes) {
      const uint8x16_t chunk = vld1q_u8(data);
      acc = vsraq_n_u8(acc, chunk, 7);
    }
    // The sum over all lanes can reach lanes * unrolls == 256, which does not
    // fit the 8-bit result of vaddvq_u8 (it would wrap to 0 for a block made
    // only of high-bit bytes). vaddlvq_u8 widens the sum to 16 bits.
    result += vaddlvq_u8(acc);
  }

  // Fewer than `unrolls` whole vectors remain, handled with one accumulator.
  const uint8_t* simd_end = data + (length_after_unroll / lanes) * lanes;
  const uint32_t length_after_simd = length % lanes;
  uint8x16_t acc = {};
  for (; data < simd_end; data += lanes) {
    const uint8x16_t chunk = vld1q_u8(data);
    acc = vsraq_n_u8(acc, chunk, 7);
  }
  result += vaddlvq_u8(acc);

  // Remaining tail of fewer than `lanes` bytes.
  const uint8_t* scalar_end = data + length_after_simd;
  for (; data < scalar_end; ++data) {
    result += *data >> 7;
  }

  return static_cast<uint32_t>(result + length);
}
#else
// Returns `length` plus the number of bytes in [data, data + length) whose
// most significant bit is set, truncated to 32 bits. For a string stored one
// byte per character this is its UTF-8 size: every byte >= 0x80 needs two
// bytes, every other byte needs one.
//
// Portable scalar implementation.
uint32_t utf8_byte_length(const uint8_t* data, size_t length) {
  uint32_t result = 0;
  // Index with size_t: a 32-bit index would wrap around before reaching a
  // `length` above UINT32_MAX and the loop would never terminate.
  for (size_t i = 0; i < length; ++i) {
    result += (data[i] >> 7);
  }
  result += static_cast<uint32_t>(length);
  return result;
}
#endif
56+
57+
} // namespace simd
58+
} // namespace node

src/node_simd.h

+22
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
#ifndef SRC_NODE_SIMD_H_
2+
#define SRC_NODE_SIMD_H_
3+
4+
#if defined(__aarch64__) || defined(_M_ARM64)
5+
#define NODE_HAS_SIMD_NEON 1
6+
#endif
7+
8+
#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
9+
10+
#include <stdlib.h>
11+
12+
namespace node {
13+
namespace simd {
14+
15+
// Returns `length` plus the number of bytes in [input, input + length) whose
// most significant bit is set, as a 32-bit value. Defined in node_simd.cc,
// which selects a NEON or a scalar implementation via NODE_HAS_SIMD_NEON.
uint32_t utf8_byte_length(const uint8_t* input, size_t length);
16+
17+
} // namespace simd
18+
} // namespace node
19+
20+
#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
21+
22+
#endif // SRC_NODE_SIMD_H_

0 commit comments

Comments
 (0)