diff --git a/Cargo.toml b/Cargo.toml index 97936957..b7506683 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,9 +22,12 @@ std = [] # expose SIMD implementations in basic::imp::* and compat::imp::* public_imp = [] -# aarch64 NEON SIMD implementation - requires nightly +# aarch64 Neon SIMD implementation - requires nightly aarch64_neon = [] +# arm Neon SIMD implementation - requires nightly +arm_neon = [] + # deprecated - does not do anything hints = [] diff --git a/src/implementation/arm/mod.rs b/src/implementation/arm/mod.rs new file mode 100644 index 00000000..90f95f15 --- /dev/null +++ b/src/implementation/arm/mod.rs @@ -0,0 +1,40 @@ +#[cfg(all(feature = "arm_neon", target_feature = "neon"))] +pub(crate) mod neon; + +#[inline] +#[cfg(all(feature = "arm_neon", target_feature = "neon"))] +pub(crate) unsafe fn validate_utf8_basic(input: &[u8]) -> Result<(), crate::basic::Utf8Error> { + if input.len() < super::helpers::SIMD_CHUNK_SIZE { + return super::validate_utf8_basic_fallback(input); + } + + validate_utf8_basic_neon(input) +} + +#[inline(never)] +#[cfg(all(feature = "arm_neon", target_feature = "neon"))] +unsafe fn validate_utf8_basic_neon(input: &[u8]) -> Result<(), crate::basic::Utf8Error> { + neon::validate_utf8_basic(input) +} + +#[cfg(not(all(feature = "arm_neon", target_feature = "neon")))] +pub(crate) use super::validate_utf8_basic_fallback as validate_utf8_basic; + +#[inline] +#[cfg(all(feature = "arm_neon", target_feature = "neon"))] +pub(crate) unsafe fn validate_utf8_compat(input: &[u8]) -> Result<(), crate::compat::Utf8Error> { + if input.len() < super::helpers::SIMD_CHUNK_SIZE { + return super::validate_utf8_compat_fallback(input); + } + + validate_utf8_compat_neon(input) +} + +#[inline(never)] +#[cfg(all(feature = "arm_neon", target_feature = "neon"))] +unsafe fn validate_utf8_compat_neon(input: &[u8]) -> Result<(), crate::compat::Utf8Error> { + neon::validate_utf8_compat(input) +} + +#[cfg(not(all(feature = "arm_neon", target_feature = "neon")))] +pub(crate) use super::validate_utf8_compat_fallback as validate_utf8_compat; diff --git a/src/implementation/arm/neon.rs b/src/implementation/arm/neon.rs new file mode 100644 index 00000000..472ad014 --- /dev/null +++ b/src/implementation/arm/neon.rs @@ -0,0 +1,235 @@ +//! Contains the arm UTF-8 validation implementation. + +use core::arch::arm::{ + uint8x16_t, vandq_u8, vcgtq_u8, vdupq_n_u8, veorq_u8, vextq_u8, vget_high_u32, vget_lane_u32, + vget_low_u32, vld1q_u8, vmovq_n_u8, vorr_u32, vorrq_u8, vpmax_u32, vqsubq_u8, + vreinterpretq_u32_u8, vshrq_n_u8, +}; + +use crate::implementation::helpers::Utf8CheckAlgorithm; + +// arm Neon SIMD primitives + +type SimdU8Value = crate::implementation::helpers::SimdU8Value; + +impl SimdU8Value { + #[inline] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::cast_possible_wrap)] + unsafe fn from_32_cut_off_leading( + _v0: u8, + _v1: u8, + _v2: u8, + _v3: u8, + _v4: u8, + _v5: u8, + _v6: u8, + _v7: u8, + _v8: u8, + _v9: u8, + _v10: u8, + _v11: u8, + _v12: u8, + _v13: u8, + _v14: u8, + _v15: u8, + v16: u8, + v17: u8, + v18: u8, + v19: u8, + v20: u8, + v21: u8, + v22: u8, + v23: u8, + v24: u8, + v25: u8, + v26: u8, + v27: u8, + v28: u8, + v29: u8, + v30: u8, + v31: u8, + ) -> Self { + let arr: [u8; 16] = [ + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + ]; + Self::from(vld1q_u8(arr.as_ptr())) + } + + #[inline] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::cast_possible_wrap)] + unsafe fn repeat_16( + v0: u8, + v1: u8, + v2: u8, + v3: u8, + v4: u8, + v5: u8, + v6: u8, + v7: u8, + v8: u8, + v9: u8, + v10: u8, + v11: u8, + v12: u8, + v13: u8, + v14: u8, + v15: u8, + ) -> Self { + let arr: [u8; 16] = [ + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + ]; + Self::from(vld1q_u8(arr.as_ptr())) + } + + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn load_from(ptr: *const u8) -> Self { + // WORKAROUND for https://github.com/rust-lang/stdarch/issues/1148 + // The vld1q_u8 intrinsic is currently broken, it treats it as individual + // byte loads so the compiler sometimes decides it is a better to load + // individual bytes to "optimize" a subsequent SIMD shuffle + // + // This code forces a full 128-bit load. + let mut dst = core::mem::MaybeUninit::::uninit(); + core::ptr::copy_nonoverlapping( + ptr.cast::(), + dst.as_mut_ptr().cast::(), + core::mem::size_of::(), + ); + Self::from(dst.assume_init()) + } + + #[inline] + #[allow(clippy::too_many_arguments)] + unsafe fn lookup_16( + self, + v0: u8, + v1: u8, + v2: u8, + v3: u8, + v4: u8, + v5: u8, + v6: u8, + v7: u8, + v8: u8, + v9: u8, + v10: u8, + v11: u8, + v12: u8, + v13: u8, + v14: u8, + v15: u8, + ) -> Self { + unimplemented!(); + } + + #[inline] + #[allow(clippy::cast_possible_wrap)] + unsafe fn splat(val: u8) -> Self { + Self::from(vmovq_n_u8(val)) + } + + #[inline] + #[allow(clippy::cast_possible_wrap)] + unsafe fn splat0() -> Self { + Self::from(vdupq_n_u8(0)) + } + + #[inline] + unsafe fn or(self, b: Self) -> Self { + Self::from(vorrq_u8(self.0, b.0)) + } + + #[inline] + unsafe fn and(self, b: Self) -> Self { + Self::from(vandq_u8(self.0, b.0)) + } + + #[inline] + unsafe fn xor(self, b: Self) -> Self { + Self::from(veorq_u8(self.0, b.0)) + } + + #[inline] + unsafe fn saturating_sub(self, b: Self) -> Self { + Self::from(vqsubq_u8(self.0, b.0)) + } + + // ugly but shr requires const generics + + #[allow(clippy::cast_lossless)] + #[inline] + unsafe fn shr4(self) -> Self { + Self::from(vshrq_n_u8(self.0, 4)) + } + + // ugly but prev requires const generics + + #[allow(clippy::cast_lossless)] + #[inline] + unsafe fn prev1(self, prev: Self) -> Self { + Self::from(vextq_u8(prev.0, self.0, 16 - 1)) + } + + // ugly but prev requires const generics + + #[allow(clippy::cast_lossless)] + #[inline] + unsafe fn prev2(self, prev: Self) -> Self { + Self::from(vextq_u8(prev.0, self.0, 16 - 2)) + } + + // ugly but prev requires const generics + + #[allow(clippy::cast_lossless)] + #[inline] + unsafe fn prev3(self, prev: Self) -> Self { + Self::from(vextq_u8(prev.0, self.0, 16 - 3)) + } + + #[inline] + unsafe fn unsigned_gt(self, other: Self) -> Self { + Self::from(vcgtq_u8(self.0, other.0)) + } + + #[inline] + unsafe fn any_bit_set(self) -> bool { + let tmp = vreinterpretq_u32_u8(self.0); + let tmp = vorr_u32(vget_low_u32(tmp), vget_high_u32(tmp)); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; + } + + #[inline] + unsafe fn is_ascii(self) -> bool { + unimplemented!(); + } +} + +impl From for SimdU8Value { + #[inline] + fn from(val: uint8x16_t) -> Self { + Self { 0: val } + } +} + +impl Utf8CheckAlgorithm { + #[inline] + unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value { + let is_third_byte = prev2.unsigned_gt(SimdU8Value::splat(0b1110_0000 - 1)); + let is_fourth_byte = prev3.unsigned_gt(SimdU8Value::splat(0b1111_0000 - 1)); + + is_third_byte.or(is_fourth_byte) + } +} + +#[inline] +unsafe fn simd_prefetch(ptr: *const u8) { + // _pld intrinsic currently not available, potential benefit also unknown +} + +const PREFETCH: bool = false; +use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk; +simd_input_128_bit!("neon"); +algorithm_simd!("neon"); diff --git a/src/implementation/mod.rs b/src/implementation/mod.rs index 3d3d8181..d0fa3a54 100644 --- a/src/implementation/mod.rs +++ b/src/implementation/mod.rs @@ -49,12 +49,33 @@ pub(super) use aarch64::validate_utf8_basic; #[cfg(target_arch = "aarch64")] pub(super) use aarch64::validate_utf8_compat; +// arm implementation + +#[cfg(target_arch = "arm")] +pub(crate) mod arm; + +#[cfg(target_arch = "arm")] +pub(super) use arm::validate_utf8_basic; + +#[cfg(target_arch = "arm")] +pub(super) use arm::validate_utf8_compat; + // fallback for unsupported architectures -#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] +#[cfg(not(any( + target_arch = "x86", + target_arch = "x86_64", + target_arch = "aarch64", + target_arch = "arm" +)))] pub(super) use validate_utf8_basic_fallback as validate_utf8_basic; -#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] +#[cfg(not(any( + target_arch = "x86", + target_arch = "x86_64", + target_arch = "aarch64", + target_arch = "arm" +)))] pub(super) use validate_utf8_compat_fallback as validate_utf8_compat; // fallback method implementations diff --git a/src/lib.rs b/src/lib.rs index a0391c46..79ce5f6a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,9 +14,16 @@ #![cfg_attr(not(feature = "std"), no_std)] #![cfg_attr(docsrs, feature(doc_cfg))] #![cfg_attr( - all(feature = "aarch64_neon", target_arch = "aarch64"), + any( + all(feature = "aarch64_neon", target_arch = "aarch64"), + all(feature = "arm_neon", target_arch = "arm") + ), feature(stdsimd) )] +#![cfg_attr( + all(feature = "arm_neon", target_arch = "arm"), + feature(arm_target_feature) +)] //! Blazingly fast API-compatible UTF-8 validation for Rust using SIMD extensions, based on the implementation from //! [simdjson](https://github.com/simdjson/simdjson). Originally ported to Rust by the developers of [simd-json.rs](https://simd-json.rs), but now heavily improved.