Skip to content

Commit 0e2ef75

Browse files
anonrig, kvakil, lemire
committed
buffer: add SIMD Neon optimization for byteLength
Co-authored-by: Keyhan Vakil <[email protected]> Co-authored-by: Daniel Lemire <[email protected]>
1 parent c9ec72d commit 0e2ef75

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed

src/node_buffer.cc

Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,14 @@
3838
#include <cstring>
3939
#include <climits>
4040

41+
#if defined(__aarch64__) || defined(_M_ARM64)
42+
#define NODE_HAS_SIMD_NEON 1
43+
#endif
44+
45+
#if NODE_HAS_SIMD_NEON
46+
#include <arm_neon.h>
47+
#endif
48+
4149
#define THROW_AND_RETURN_UNLESS_BUFFER(env, obj) \
4250
THROW_AND_RETURN_IF_NOT_BUFFER(env, obj, "argument") \
4351

@@ -741,6 +749,45 @@ void SlowByteLengthUtf8(const FunctionCallbackInfo<Value>& args) {
741749
args.GetReturnValue().Set(args[0].As<String>()->Utf8Length(env->isolate()));
742750
}
743751

752+
#if NODE_HAS_SIMD_NEON
753+
// Computes the UTF-8 byte length of a one-byte string using NEON.
//
// Each source character occupies exactly one byte. The value returned is
// `source.length` plus the number of bytes whose top bit is set, i.e. every
// character >= 0x80 is counted as two output bytes and every other character
// as one.
//
// `receiver` is not used. `source` supplies the byte pointer (`data`) and the
// character count (`length`).
uint32_t FastByteLengthUtf8(Local<Value> receiver,
                            const v8::FastOneByteString& source) {
  const auto* data = reinterpret_cast<const uint8_t*>(source.data);
  const auto length = source.length;

  constexpr int kLanes = sizeof(uint8x16_t);  // 16 bytes per vector
  const uint8_t remaining = length % kLanes;
  const auto* simd_end = data + (length / kLanes) * kLanes;
  const auto threshold = vdupq_n_u8(0x80);

  // Holds FOUR times the number of high-bit bytes seen by the vector loop
  // (see the nibble trick below). It is kept in 64 bits because a 32-bit
  // accumulator wraps once 2^30 such bytes have been counted, even though
  // the final answer may still be representable as uint32_t.
  uint64_t scaled_count{0};

  for (; data < simd_end; data += kLanes) {
    // Load 16 bytes.
    const uint8x16_t input = vld1q_u8(data);

    // 0xFF in every lane whose byte is >= 0x80, 0x00 in the others.
    const uint8x16_t with_highbit = vcgeq_u8(input, threshold);

    // View the mask as eight u16 lanes, shift right by 4 and narrow to u8:
    // each output byte keeps one nibble from each of two neighbouring mask
    // bytes, so every high-bit input byte contributes exactly four set bits.
    const uint8x8_t highbits =
        vshrn_n_u16(vreinterpretq_u16_u8(with_highbit), 4);

    // Per-byte population count: 0, 4 or 8.
    const uint8x8_t bitsperbyte = vcnt_u8(highbits);

    // Horizontal add of the eight counts (at most 64 per iteration).
    scaled_count += vaddlv_u8(bitsperbyte);
  }

  // Undo the factor of four introduced by the nibble packing.
  uint32_t result = static_cast<uint32_t>(scaled_count / 4);

  // Scalar tail: fewer than 16 bytes are left after the vector loop.
  for (uint8_t j = 0; j < remaining; j++) {
    result += (simd_end[j] >> 7);
  }

  return result + length;
}
790+
#else
744791
uint32_t FastByteLengthUtf8(Local<Value> receiver,
745792
const v8::FastOneByteString& source) {
746793
uint32_t result = 0;
@@ -752,6 +799,7 @@ uint32_t FastByteLengthUtf8(Local<Value> receiver,
752799
result += length;
753800
return result;
754801
}
802+
#endif
755803

756804
// File-local v8::CFunction built from FastByteLengthUtf8.
// NOTE(review): presumably handed to V8 as the fast-call variant of the
// byteLength binding; the registration site is not visible here.
static v8::CFunction fast_byte_length_utf8 =
    v8::CFunction::Make(FastByteLengthUtf8);

0 commit comments

Comments
 (0)