From 3e0b363e62e576b5d13a055a12fa52d634955d15 Mon Sep 17 00:00:00 2001 From: SakiTakamachi Date: Thu, 5 Jun 2025 13:13:33 +0900 Subject: [PATCH] use XSSE for bcmath --- ext/bcmath/libbcmath/src/convert.c | 20 +- ext/bcmath/libbcmath/src/simd.h | 59 - ext/bcmath/libbcmath/src/str2num.c | 36 +- ext/bcmath/libbcmath/src/xsse.h | 1965 ++++++++++++++++++++++++++++ 4 files changed, 1993 insertions(+), 87 deletions(-) delete mode 100644 ext/bcmath/libbcmath/src/simd.h create mode 100644 ext/bcmath/libbcmath/src/xsse.h diff --git a/ext/bcmath/libbcmath/src/convert.c b/ext/bcmath/libbcmath/src/convert.c index 5438b4c1c44e5..ca5b84efde1b2 100644 --- a/ext/bcmath/libbcmath/src/convert.c +++ b/ext/bcmath/libbcmath/src/convert.c @@ -17,22 +17,22 @@ #include "bcmath.h" #include "convert.h" #include "private.h" -#include "simd.h" +#include "xsse.h" char *bc_copy_and_toggle_bcd(char *restrict dest, const char *source, const char *source_end) { const size_t bulk_shift = SWAR_REPEAT('0'); -#ifdef HAVE_BC_SIMD_128 +#ifdef XSSE2 /* SIMD SSE2 or NEON bulk shift + copy */ - bc_simd_128_t shift_vector = bc_simd_set_8x16('0'); - while (source + sizeof(bc_simd_128_t) <= source_end) { - bc_simd_128_t bytes = bc_simd_load_8x16((const bc_simd_128_t *) source); - bytes = bc_simd_xor_8x16(bytes, shift_vector); - bc_simd_store_8x16((bc_simd_128_t *) dest, bytes); - - source += sizeof(bc_simd_128_t); - dest += sizeof(bc_simd_128_t); + __m128i shift_vector = _mm_set1_epi8('0'); + while (source + sizeof(__m128i) <= source_end) { + __m128i bytes = _mm_loadu_si128((const __m128i *) source); + bytes = _mm_xor_si128(bytes, shift_vector); + _mm_storeu_si128((__m128i *) dest, bytes); + + source += sizeof(__m128i); + dest += sizeof(__m128i); } #endif diff --git a/ext/bcmath/libbcmath/src/simd.h b/ext/bcmath/libbcmath/src/simd.h deleted file mode 100644 index af38f8349618c..0000000000000 --- a/ext/bcmath/libbcmath/src/simd.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - +----------------------------------------------------------------------+ - | Copyright (c) The PHP Group | - +----------------------------------------------------------------------+ - | This source file is subject to version 3.01 of the PHP license, | - | that is bundled with this package in the file LICENSE, and is | - | available through the world-wide-web at the following url: | - | https://www.php.net/license/3_01.txt | - | If you did not receive a copy of the PHP license and are unable to | - | obtain it through the world-wide-web, please send a note to | - | license@php.net so we can mail you a copy immediately. 
| - +----------------------------------------------------------------------+ - | Authors: Saki Takamachi | - +----------------------------------------------------------------------+ -*/ - - -#ifndef _BCMATH_SIMD_H_ -#define _BCMATH_SIMD_H_ - -#ifdef __SSE2__ -# include - typedef __m128i bc_simd_128_t; -# define HAVE_BC_SIMD_128 -# define bc_simd_set_8x16(x) _mm_set1_epi8(x) -# define bc_simd_load_8x16(ptr) _mm_loadu_si128((const __m128i *) (ptr)) -# define bc_simd_xor_8x16(a, b) _mm_xor_si128(a, b) -# define bc_simd_store_8x16(ptr, val) _mm_storeu_si128((__m128i *) (ptr), val) -# define bc_simd_add_8x16(a, b) _mm_add_epi8(a, b) -# define bc_simd_cmpeq_8x16(a, b) _mm_cmpeq_epi8(a, b) -# define bc_simd_cmplt_8x16(a, b) _mm_cmplt_epi8(a, b) -# define bc_simd_movemask_8x16(a) _mm_movemask_epi8(a) - -#elif defined(__aarch64__) || defined(_M_ARM64) -# include - typedef int8x16_t bc_simd_128_t; -# define HAVE_BC_SIMD_128 -# define bc_simd_set_8x16(x) vdupq_n_s8(x) -# define bc_simd_load_8x16(ptr) vld1q_s8((const int8_t *) (ptr)) -# define bc_simd_xor_8x16(a, b) veorq_s8(a, b) -# define bc_simd_store_8x16(ptr, val) vst1q_s8((int8_t *) (ptr), val) -# define bc_simd_add_8x16(a, b) vaddq_s8(a, b) -# define bc_simd_cmpeq_8x16(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b))) -# define bc_simd_cmplt_8x16(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b))) - static inline int bc_simd_movemask_8x16(int8x16_t vec) - { - /** - * based on code from - * https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - */ - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s8(vec), 7)); - uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); - } -#endif - -#endif diff --git a/ext/bcmath/libbcmath/src/str2num.c b/ext/bcmath/libbcmath/src/str2num.c index 945de0cf60003..11d71ae492ad6 100644 --- a/ext/bcmath/libbcmath/src/str2num.c +++ b/ext/bcmath/libbcmath/src/str2num.c @@ -32,7 +32,7 @@ #include "bcmath.h" #include "convert.h" #include "private.h" -#include "simd.h" +#include "xsse.h" #include #include @@ -40,20 +40,20 @@ static inline const char *bc_count_digits(const char *str, const char *end) { /* Process in bulk */ -#ifdef HAVE_BC_SIMD_128 - const bc_simd_128_t offset = bc_simd_set_8x16((signed char) (SCHAR_MIN - '0')); +#ifdef XSSE2 + const __m128i offset = _mm_set1_epi8((signed char) (SCHAR_MIN - '0')); /* we use the less than comparator, so add 1 */ - const bc_simd_128_t threshold = bc_simd_set_8x16(SCHAR_MIN + ('9' + 1 - '0')); + const __m128i threshold = _mm_set1_epi8(SCHAR_MIN + ('9' + 1 - '0')); - while (str + sizeof(bc_simd_128_t) <= end) { - bc_simd_128_t bytes = bc_simd_load_8x16((const bc_simd_128_t *) str); + while (str + sizeof(__m128i) <= end) { + __m128i bytes = _mm_loadu_si128((const __m128i *) str); /* Wrapping-add the offset to the bytes, such that all bytes below '0' are positive and others are negative. * More specifically, '0' will be -128 and '9' will be -119. */ - bytes = bc_simd_add_8x16(bytes, offset); + bytes = _mm_add_epi8(bytes, offset); /* Now mark all bytes that are <= '9', i.e. <= -119, i.e. < -118, i.e. the threshold. 
*/ - bytes = bc_simd_cmplt_8x16(bytes, threshold); + bytes = _mm_cmplt_epi8(bytes, threshold); - int mask = bc_simd_movemask_8x16(bytes); + int mask = _mm_movemask_epi8(bytes); if (mask != 0xffff) { /* At least one of the bytes is not within range. Move to the first offending byte. */ #ifdef PHP_HAVE_BUILTIN_CTZL @@ -63,7 +63,7 @@ static inline const char *bc_count_digits(const char *str, const char *end) #endif } - str += sizeof(bc_simd_128_t); + str += sizeof(__m128i); } #endif @@ -77,19 +77,19 @@ static inline const char *bc_count_digits(const char *str, const char *end) static inline const char *bc_skip_zero_reverse(const char *scanner, const char *stop) { /* Check in bulk */ -#ifdef HAVE_BC_SIMD_128 - const bc_simd_128_t c_zero_repeat = bc_simd_set_8x16('0'); - while (scanner - sizeof(bc_simd_128_t) >= stop) { - scanner -= sizeof(bc_simd_128_t); - bc_simd_128_t bytes = bc_simd_load_8x16((const bc_simd_128_t *) scanner); +#ifdef XSSE2 + const __m128i c_zero_repeat = _mm_set1_epi8('0'); + while (scanner - sizeof(__m128i) >= stop) { + scanner -= sizeof(__m128i); + __m128i bytes = _mm_loadu_si128((const __m128i *) scanner); /* Checks if all numeric strings are equal to '0'. */ - bytes = bc_simd_cmpeq_8x16(bytes, c_zero_repeat); + bytes = _mm_cmpeq_epi8(bytes, c_zero_repeat); - int mask = bc_simd_movemask_8x16(bytes); + int mask = _mm_movemask_epi8(bytes); /* The probability of having 16 trailing 0s in a row is very low, so we use EXPECTED. */ if (EXPECTED(mask != 0xffff)) { /* Move the pointer back and check each character in loop. */ - scanner += sizeof(bc_simd_128_t); + scanner += sizeof(__m128i); break; } } diff --git a/ext/bcmath/libbcmath/src/xsse.h b/ext/bcmath/libbcmath/src/xsse.h new file mode 100644 index 0000000000000..a0d86e335e3f7 --- /dev/null +++ b/ext/bcmath/libbcmath/src/xsse.h @@ -0,0 +1,1965 @@ +/******************************************************************************** + * MIT License + * Copyright (c) 2025 Saki Takamachi + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + ********************************************************************************* + * This is a wrapper library that allows you to use SSE2, SSE3, SSSE3, SSE4.1, + * and SSE4.2 APIs directly with NEON. + * Please note that APIs using `__m64`, `__m128`, or `__m128d` are not supported. + * + * In an SSE environment, if `__SSE2__` is defined, the `XSSE2` macro will be + * defined as well. 
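+ * (Consumer code can then guard its SIMD paths with `#ifdef XSSE2` and call the
+ * familiar `_mm_*` intrinsics on both x86 and AArch64, as the updated convert.c
+ * and str2num.c in this patch do.)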
+ * Similarly, `XSSE3` through `XSSE4_2` will also be defined if their + * corresponding features are available.(Note that plain `SSE` is not supported.) + * + * In a NEON environment, all of the macros `XSSE2`, `XSSE3`, `XSSSE3`, + * `XSSE4_1`, and `XSSE4_2` are defined. + *********************************************************************************/ + + +#ifndef XSSE_H +#define XSSE_H + +#define XSSE_VERSION 20000 + +#ifdef _MSC_VER +# define XSSE_FORCE_INLINE __forceinline +# define XSSE_UNREACHABLE() __assume(0) +# define XSSE_EXPECTED(x) (x) +# define XSSE_UNEXPECTED(x) (x) +# define XSSE_ATTR_CONST +# define XSSE_MEMCPY(p, vp, s) memcpy((p), (vp), (s)) +#elif defined(__GNUC__) || defined(__clang__) +# define XSSE_FORCE_INLINE inline __attribute__((always_inline)) +# define XSSE_UNREACHABLE() __builtin_unreachable() +# define XSSE_EXPECTED(x) __builtin_expect(!!(x), 1) +# define XSSE_UNEXPECTED(x) __builtin_expect(!!(x), 0) +# define XSSE_ATTR_CONST __attribute__((const)) +# define XSSE_HAS_MACRO_EXTENSION +# define XSSE_MEMCPY(p, vp, s) __builtin_memcpy((p), (vp), (s)) +# ifdef __OPTIMIZE__ +# define XSSE_IS_OPTIMIZE +# endif +#else +# define XSSE_FORCE_INLINE inline +# define XSSE_UNREACHABLE() do {} while (0) +# define XSSE_EXPECTED(x) (x) +# define XSSE_UNEXPECTED(x) (x) +# define XSSE_ATTR_CONST +# define XSSE_MEMCPY(p, vp, s) memcpy((p), (vp), (s)) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +# define XSSE_ATTR_OPTIMIZE_O2 __attribute__((optimize("O2"))) +#else +# define XSSE_ATTR_OPTIMIZE_O2 +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) +#include +#include +typedef int8x16_t __m128i; +#endif + + +/***************************************************************************** + * * + * SSE2 * + * * + *****************************************************************************/ + +#if defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) +#include +#define XSSE2 + + +#elif defined(__aarch64__) || defined(_M_ARM64) +#define XSSE2 + +/***************************************************************************** + * Load / Store * + *****************************************************************************/ + +#define _mm_set_epi8(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) \ + ((int8x16_t) { \ + (int8_t) (x15), (int8_t) (x14), (int8_t) (x13), (int8_t) (x12), \ + (int8_t) (x11), (int8_t) (x10), (int8_t) (x9), (int8_t) (x8), \ + (int8_t) (x7), (int8_t) (x6), (int8_t) (x5), (int8_t) (x4), \ + (int8_t) (x3), (int8_t) (x2), (int8_t) (x1), (int8_t) (x0) }) +#define _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7) \ + (vreinterpretq_s8_s16((int16x8_t) { \ + (int16_t) (x7), (int16_t) (x6), (int16_t) (x5), (int16_t) (x4), \ + (int16_t) (x3), (int16_t) (x2), (int16_t) (x1), (int16_t) (x0) })) +#define _mm_set_epi32(x0, x1, x2, x3) \ + (vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x3), (int32_t) (x2), (int32_t) (x1), (int32_t) (x0) })) +#define _mm_set_epi64x(x0, x1) (vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) })) +#define _mm_set1_epi8(x) (vdupq_n_s8((int8_t) (x))) +#define _mm_set1_epi16(x) (vreinterpretq_s8_s16(vdupq_n_s16((int16_t) (x)))) +#define _mm_set1_epi32(x) (vreinterpretq_s8_s32(vdupq_n_s32((int32_t) (x)))) +#define _mm_set1_epi64x(x) (vreinterpretq_s8_s64(vdupq_n_s64((int64_t) (x)))) + +#define _mm_setr_epi8(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) \ + ((int8x16_t) { \ + (int8_t) (x0), (int8_t) (x1), (int8_t) (x2), (int8_t) (x3), \ + (int8_t) (x4), (int8_t) (x5), (int8_t) 
(x6), (int8_t) (x7), \ + (int8_t) (x8), (int8_t) (x9), (int8_t) (x10), (int8_t) (x11), \ + (int8_t) (x12), (int8_t) (x13), (int8_t) (x14), (int8_t) (x15) }) +#define _mm_setr_epi16(x0, x1, x2, x3, x4, x5, x6, x7) \ + (vreinterpretq_s8_s16((int16x8_t) { \ + (int16_t) (x0), (int16_t) (x1), (int16_t) (x2), (int16_t) (x3), \ + (int16_t) (x4), (int16_t) (x5), (int16_t) (x6), (int16_t) (x7) })) +#define _mm_setr_epi32(x0, x1, x2, x3) \ + (vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x0), (int32_t) (x1), (int32_t) (x2), (int32_t) (x3) })) + +#define _mm_setzero_si128() (vdupq_n_s8(0)) +#define _mm_undefined_si128() _mm_setzero_si128() + +#define _mm_load_si128(x) (vld1q_s8((const int8_t*) (x))) +static XSSE_FORCE_INLINE __m128i _mm_loadu_si16(const void *ptr) +{ + int16_t val; + XSSE_MEMCPY(&val, ptr, 2); + return vreinterpretq_s8_s16((int16x8_t) { + (int16_t) val, (int16_t) 0, (int16_t) 0, (int16_t) 0, + (int16_t) 0, (int16_t) 0, (int16_t) 0, (int16_t) 0 + }); +} +static XSSE_FORCE_INLINE __m128i _mm_loadu_si32(const void *ptr) +{ + int32_t val; + XSSE_MEMCPY(&val, ptr, 4); + return vreinterpretq_s8_s32((int32x4_t) { + (int32_t) val, (int32_t) 0, (int32_t) 0, (int32_t) 0 + }); +} +static XSSE_FORCE_INLINE __m128i _mm_loadu_si64(const void *ptr) +{ + int64_t val; + XSSE_MEMCPY(&val, ptr, 8); + return vreinterpretq_s8_s64((int64x2_t) { + (int64_t) val, (int64_t) 0 + }); +} +#define _mm_loadu_si128(x) _mm_load_si128(x) +static XSSE_FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *x) +{ + int64x1_t zero = vdup_n_s64(0); + int64x1_t x64 = vld1_s64((const int64_t*) x); + return vreinterpretq_s8_s64(vcombine_s64(x64, zero)); +} + +#define _mm_storel_epi64(to, x) (vst1_u64((uint64_t*) (to), vget_low_u64(vreinterpretq_u64_s8(x)))) +#define _mm_store_si128(to, x) (vst1q_s8((int8_t*) (to), x)) +static XSSE_FORCE_INLINE void _mm_storeu_si16(void *ptr, __m128i x) +{ + int16_t val = (int16_t) vgetq_lane_s16(vreinterpretq_s16_s8(x), 0); + XSSE_MEMCPY(ptr, &val, sizeof(val)); +} +static XSSE_FORCE_INLINE void _mm_storeu_si32(void *ptr, __m128i x) +{ + int32_t val = (int32_t) vgetq_lane_s32(vreinterpretq_s32_s8(x), 0); + XSSE_MEMCPY(ptr, &val, sizeof(val)); +} +static XSSE_FORCE_INLINE void _mm_storeu_si64(void *ptr, __m128i x) +{ + int64_t val = (int64_t) vgetq_lane_s64(vreinterpretq_s64_s8(x), 0); + XSSE_MEMCPY(ptr, &val, sizeof(val)); +} +#define _mm_storeu_si128(to, x) _mm_store_si128(to, x) +#define _mm_stream_si128(to, x) _mm_store_si128(to, x) +#define _mm_stream_si32(to, x) (*(int32_t*) (to) = (int32_t) (x)) +#define _mm_stream_si64(to, x) (*(int64_t*) (to) = (int64_t) (x)) + + +/***************************************************************************** + * Bit shift / Bit wise * + *****************************************************************************/ + +#define _mm_or_si128(a, b) (vorrq_s8((a), (b))) +#define _mm_xor_si128(a, b) (veorq_s8((a), (b))) +#define _mm_and_si128(a, b) (vandq_s8((a), (b))) +#define _mm_andnot_si128(a, b) (vbicq_s8((b), (a))) + +#define _mm_slli_epi16(x, count) (vreinterpretq_s8_u16(vshlq_n_u16(vreinterpretq_u16_s8(x), (count)))) +#define _mm_slli_epi32(x, count) (vreinterpretq_s8_u32(vshlq_n_u32(vreinterpretq_u32_s8(x), (count)))) +#define _mm_slli_epi64(x, count) (vreinterpretq_s8_u64(vshlq_n_u64(vreinterpretq_u64_s8(x), (count)))) +static XSSE_FORCE_INLINE __m128i _mm_sll_epi16(__m128i x, __m128i count) +{ + uint16x8_t x16 = vreinterpretq_u16_s8(x); + uint16_t shift = (uint16_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFF); + int16x8_t shifts = 
vdupq_n_s16((int16_t) shift); + return vreinterpretq_s8_u16(vshlq_u16(x16, shifts)); +} +static XSSE_FORCE_INLINE __m128i _mm_sll_epi32(__m128i x, __m128i count) +{ + uint32x4_t x32 = vreinterpretq_u32_s8(x); + uint32_t shift = (uint32_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFFFFFF); + int32x4_t shifts = vdupq_n_s32((int32_t) shift); + return vreinterpretq_s8_u32(vshlq_u32(x32, shifts)); +} +static XSSE_FORCE_INLINE __m128i _mm_sll_epi64(__m128i x, __m128i count) +{ + uint64x2_t x64 = vreinterpretq_u64_s8(x); + uint64_t shift = (uint64_t) vgetq_lane_s64(vreinterpretq_s64_s8(count), 0); + int64x2_t shifts = vdupq_n_s64((int64_t) shift); + return vreinterpretq_s8_u64(vshlq_u64(x64, shifts)); +} + +static XSSE_FORCE_INLINE __m128i _mm_slli_si128(__m128i x, int imm) +{ + switch (imm) { + case 0:; + int8x16_t x2 = x; + return x2; + case 1: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 15)); + case 2: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 14)); + case 3: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 13)); + case 4: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 12)); + case 5: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 11)); + case 6: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 10)); + case 7: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 9)); + case 8: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 8)); + case 9: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 7)); + case 10: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 6)); + case 11: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 5)); + case 12: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 4)); + case 13: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 3)); + case 14: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 2)); + case 15: return vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 1)); + default: return vreinterpretq_s8_u8(vdupq_n_u8(0)); + } +} +#define _mm_bslli_si128(x, imm) _mm_slli_si128(x, imm) + +#define _mm_srai_epi16(x, count) (vreinterpretq_s8_s16(vshrq_n_s16(vreinterpretq_s16_s8(x), (count)))) +#define _mm_srai_epi32(x, count) (vreinterpretq_s8_s32(vshrq_n_s32(vreinterpretq_s32_s8(x), (count)))) +static XSSE_FORCE_INLINE __m128i _mm_sra_epi16(__m128i x, __m128i count) +{ + int16x8_t x16 = vreinterpretq_s16_s8(x); + uint16_t shift = (uint16_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFF); + int16x8_t shifts = vdupq_n_s16(-(int16_t) shift); + return vreinterpretq_s8_s16(vshlq_s16(x16, shifts)); +} +static XSSE_FORCE_INLINE __m128i _mm_sra_epi32(__m128i x, __m128i count) +{ + int32x4_t x32 = vreinterpretq_s32_s8(x); + uint32_t shift = (uint32_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFFFFFF); + int32x4_t shifts = vdupq_n_s32(-(int32_t) shift); + return vreinterpretq_s8_s32(vshlq_s32(x32, shifts)); +} + +#define _mm_srli_epi16(x, count) (vreinterpretq_s8_u16(vshrq_n_u16(vreinterpretq_u16_s8(x), (count)))) +#define _mm_srli_epi32(x, count) (vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(x), (count)))) +#define _mm_srli_epi64(x, count) (vreinterpretq_s8_u64(vshrq_n_u64(vreinterpretq_u64_s8(x), (count)))) +static XSSE_FORCE_INLINE __m128i _mm_srl_epi16(__m128i 
x, __m128i count) +{ + uint16x8_t x16 = vreinterpretq_u16_s8(x); + uint16_t shift = (uint16_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFF); + int16x8_t shifts = vdupq_n_s16(-(int16_t) shift); + return vreinterpretq_s8_u16(vshlq_u16(x16, shifts)); +} +static XSSE_FORCE_INLINE __m128i _mm_srl_epi32(__m128i x, __m128i count) +{ + uint32x4_t x32 = vreinterpretq_u32_s8(x); + uint32_t shift = (uint32_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFFFFFF); + int32x4_t shifts = vdupq_n_s32(-(int32_t) shift); + return vreinterpretq_s8_u32(vshlq_u32(x32, shifts)); +} +static XSSE_FORCE_INLINE __m128i _mm_srl_epi64(__m128i x, __m128i count) +{ + uint64x2_t x64 = vreinterpretq_u64_s8(x); + uint64_t shift = (uint64_t) vgetq_lane_s64(vreinterpretq_s64_s8(count), 0); + int64x2_t shifts = vdupq_n_s64(-(int64_t) shift); + return vreinterpretq_s8_u64(vshlq_u64(x64, shifts)); +} + +static XSSE_FORCE_INLINE __m128i _mm_srli_si128(__m128i x, int imm) +{ + switch (imm) { + case 0: return x; + case 1: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 1)); + case 2: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 2)); + case 3: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 3)); + case 4: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 4)); + case 5: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 5)); + case 6: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 6)); + case 7: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 7)); + case 8: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 8)); + case 9: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 9)); + case 10: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 10)); + case 11: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 11)); + case 12: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 12)); + case 13: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 13)); + case 14: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 14)); + case 15: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 15)); + default: return vreinterpretq_s8_u8(vdupq_n_u8(0)); + } +} +#define _mm_bsrli_si128(x, imm) _mm_srli_si128(x, imm) + + +/***************************************************************************** + * Integer Arithmetic Operations * + *****************************************************************************/ + +/** + * In practice, there is no problem, but a runtime error for signed integer overflow is triggered by UBSAN, + * so perform the calculation as unsigned. Since it is optimized at compile time, there are no unnecessary casts at runtime. 
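+ *
+ * For example, adding 0x7F and 0x01 in an 8-bit lane wraps to 0x80, exactly as the
+ * SSE2 instruction does; performing the addition on unsigned lanes keeps that
+ * wrap-around well-defined in C instead of relying on signed overflow.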
+ */ +#define _mm_add_epi8(a, b) (vreinterpretq_s8_u8(vaddq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b)))) +#define _mm_add_epi16(a, b) (vreinterpretq_s8_u16(vaddq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)))) +#define _mm_add_epi32(a, b) (vreinterpretq_s8_u32(vaddq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)))) +#define _mm_add_epi64(a, b) (vreinterpretq_s8_u64(vaddq_u64(vreinterpretq_u64_s8(a), vreinterpretq_u64_s8(b)))) + +#define _mm_adds_epi8(a, b) (vqaddq_s8((a), (b))) +#define _mm_adds_epi16(a, b) (vreinterpretq_s8_s16(vqaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))) +#define _mm_adds_epu8(a, b) (vreinterpretq_s8_u8(vqaddq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b)))) +#define _mm_adds_epu16(a, b) (vreinterpretq_s8_u16(vqaddq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)))) + +#define _mm_avg_epu8(a, b) (vreinterpretq_s8_u8(vrhaddq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b)))) +#define _mm_avg_epu16(a, b) (vreinterpretq_s8_u16(vrhaddq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)))) + +static XSSE_FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) +{ + int16x8_t a16 = vreinterpretq_s16_s8(a); + int16x8_t b16 = vreinterpretq_s16_s8(b); + + int16x4_t a_lo = vget_low_s16(a16); + int16x4_t a_hi = vget_high_s16(a16); + int16x4_t b_lo = vget_low_s16(b16); + int16x4_t b_hi = vget_high_s16(b16); + + int32x4_t mul_lo = vmull_s16(a_lo, b_lo); + int32x4_t mul_hi = vmull_s16(a_hi, b_hi); + return vreinterpretq_s8_s32(vpaddq_s32(mul_lo, mul_hi)); +} + +#define _mm_max_epu8(a, b) (vreinterpretq_s8_u8(vmaxq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b)))) +#define _mm_max_epi16(a, b) (vreinterpretq_s8_s16(vmaxq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))) +#define _mm_min_epu8(a, b) (vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b)))) +#define _mm_min_epi16(a, b) (vreinterpretq_s8_s16(vminq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))) + +static XSSE_FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + int16x8_t a16 = vreinterpretq_s16_s8(a); + int16x8_t b16 = vreinterpretq_s16_s8(b); + + int16x4_t a_lo = vget_low_s16(a16); + int16x4_t a_hi = vget_high_s16(a16); + int16x4_t b_lo = vget_low_s16(b16); + int16x4_t b_hi = vget_high_s16(b16); + + int32x4_t mul_lo = vmull_s16(a_lo, b_lo); + int32x4_t mul_hi = vmull_s16(a_hi, b_hi); + + int16x4_t narrowed_lo = vshrn_n_s32(mul_lo, 16); + int16x4_t narrowed_hi = vshrn_n_s32(mul_hi, 16); + + return vreinterpretq_s8_s16(vcombine_s16(narrowed_lo, narrowed_hi)); +} +static XSSE_FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) +{ + uint16x8_t a16 = vreinterpretq_u16_s8(a); + uint16x8_t b16 = vreinterpretq_u16_s8(b); + + uint16x4_t a_lo = vget_low_u16(a16); + uint16x4_t a_hi = vget_high_u16(a16); + uint16x4_t b_lo = vget_low_u16(b16); + uint16x4_t b_hi = vget_high_u16(b16); + + uint32x4_t mul_lo = vmull_u16(a_lo, b_lo); + uint32x4_t mul_hi = vmull_u16(a_hi, b_hi); + + uint16x4_t narrowed_lo = vshrn_n_u32(mul_lo, 16); + uint16x4_t narrowed_hi = vshrn_n_u32(mul_hi, 16); + + return vreinterpretq_s8_u16(vcombine_u16(narrowed_lo, narrowed_hi)); +} +static XSSE_FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + int16x8_t a16 = vreinterpretq_s16_s8(a); + int16x8_t b16 = vreinterpretq_s16_s8(b); + return vreinterpretq_s8_s16(vmulq_s16(a16, b16)); +} +static XSSE_FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + uint32x4_t a32 = vreinterpretq_u32_s8(a); + uint32x4_t b32 = vreinterpretq_u32_s8(b); + 
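/* Emulates SSE2 pmuludq: vuzp1q_u32 gathers the even (0 and 2) 32-bit lanes of a and b, and vmull_u32 widen-multiplies them into two 64-bit products. */
+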
uint32x4_t evens = vuzp1q_u32(a32, b32); + uint32x2_t lo = vget_low_u32(evens); + uint32x2_t hi = vget_high_u32(evens); + + return vreinterpretq_s8_u64(vmull_u32(lo, hi)); +} +static XSSE_FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) +{ + uint8x16_t ua = vreinterpretq_u8_s8(a); + uint8x16_t ub = vreinterpretq_u8_s8(b); + uint16x8_t abs_diffs_16 = vpaddlq_u8(vabdq_u8(ua, ub)); + uint32x4_t abs_diffs_32 = vpaddlq_u16(abs_diffs_16); + uint64x2_t abs_diffs_64 = vpaddlq_u32(abs_diffs_32); + + return vreinterpretq_s8_u16((uint16x8_t) { + (int16_t) vgetq_lane_u64(abs_diffs_64, 0), 0, 0, 0, + (int16_t) vgetq_lane_u64(abs_diffs_64, 1), 0, 0, 0 + }); +} + +#define _mm_sub_epi8(a, b) (vreinterpretq_s8_u8(vsubq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b)))) +#define _mm_sub_epi16(a, b) (vreinterpretq_s8_u16(vsubq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)))) +#define _mm_sub_epi32(a, b) (vreinterpretq_s8_u32(vsubq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)))) +#define _mm_sub_epi64(a, b) (vreinterpretq_s8_u64(vsubq_u64(vreinterpretq_u64_s8(a), vreinterpretq_u64_s8(b)))) + +#define _mm_subs_epi8(a, b) (vqsubq_s8((a), (b))) +#define _mm_subs_epi16(a, b) (vreinterpretq_s8_s16(vqsubq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))) +#define _mm_subs_epu8(a, b) (vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b)))) +#define _mm_subs_epu16(a, b) (vreinterpretq_s8_u16(vqsubq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)))) + + +/***************************************************************************** + * Comparison * + *****************************************************************************/ + +#define _mm_cmpeq_epi8(a, b) (vreinterpretq_s8_u8(vceqq_s8((a), (b)))) +#define _mm_cmpeq_epi16(a, b) (vreinterpretq_s8_u16(vceqq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))) +#define _mm_cmpeq_epi32(a, b) (vreinterpretq_s8_u32(vceqq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)))) + +#define _mm_cmplt_epi8(a, b) (vreinterpretq_s8_u8(vcltq_s8((a), (b)))) +#define _mm_cmplt_epi16(a, b) (vreinterpretq_s8_u16(vcltq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))) +#define _mm_cmplt_epi32(a, b) (vreinterpretq_s8_u32(vcltq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)))) + +#define _mm_cmpgt_epi8(a, b) (vreinterpretq_s8_u8(vcgtq_s8((a), (b)))) +#define _mm_cmpgt_epi16(a, b) (vreinterpretq_s8_u16(vcgtq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))) +#define _mm_cmpgt_epi32(a, b) (vreinterpretq_s8_u32(vcgtq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)))) + + +/***************************************************************************** + * Convert * + *****************************************************************************/ + +#define _mm_cvtsi32_si128(x) (vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x), 0, 0, 0 })) +#define _mm_cvtsi64_si128(x) (vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x), 0 })) +#define _mm_cvtsi128_si32(x) (vgetq_lane_s32(vreinterpretq_s32_s8(x), 0)) +#define _mm_cvtsi128_si64(x) (vgetq_lane_s64(vreinterpretq_s64_s8(x), 0)) + + +/***************************************************************************** + * Others * + *****************************************************************************/ + +#define _mm_packs_epi16(a, b) (vcombine_s8(vqmovn_s16(vreinterpretq_s16_s8(a)), vqmovn_s16(vreinterpretq_s16_s8(b)))) +#define _mm_packs_epi32(a, b) \ + (vreinterpretq_s8_s16(vcombine_s16(vqmovn_s32(vreinterpretq_s32_s8(a)), vqmovn_s32(vreinterpretq_s32_s8(b))))) 
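+/* As in SSE2, the pack macros narrow with saturation: _mm_packs_epi16 clamps
+ * int16 lanes to [-128, 127], _mm_packs_epi32 clamps int32 lanes to
+ * [-32768, 32767], and _mm_packus_epi16 clamps int16 lanes to [0, 255]
+ * (e.g. 300 becomes 255 and -5 becomes 0). */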
+#define _mm_packus_epi16(a, b) \ + (vreinterpretq_s8_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_s8(a)), vqmovun_s16(vreinterpretq_s16_s8(b))))) + +#define _mm_extract_epi16(x, imm) (vgetq_lane_s16(vreinterpretq_s16_s8(x), (imm))) +#define _mm_insert_epi16(x, val, imm) (vreinterpretq_s8_s16(vsetq_lane_s16((int16_t) (val), vreinterpretq_s16_s8(x), (imm)))) + +static XSSE_FORCE_INLINE void _mm_maskmoveu_si128(__m128i x, __m128i mask, char* dest) +{ + uint8x16_t origin = vld1q_u8((uint8_t*) dest); + + uint8x16_t repeat_0x00 = vdupq_n_u8(0); + uint8x16_t repeat_0x80 = vdupq_n_u8(0x80); + uint8x16_t umask = vreinterpretq_u8_s8(mask); + uint8x16_t ux = vreinterpretq_u8_s8(x); + + uint8x16_t mask_bits = vandq_u8(umask, repeat_0x80); + uint8x16_t mask_bytes = vsubq_u8(repeat_0x00, vshrq_n_u8(mask_bits, 7)); + vst1q_u8((uint8_t*) dest, vbslq_u8(mask_bytes, ux, origin)); +} +static XSSE_FORCE_INLINE int _mm_movemask_epi8(__m128i x) +{ + /** + * based on code from + * https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon + */ + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s8(x), 7)); + uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +} + +#define _MM_SHUFFLE(a, b, c, d) (((a) << 6) | ((b) << 4) | ((c) << 2) | (d)) +#ifdef XSSE_HAS_MACRO_EXTENSION +#define _mm_shuffle_epi32(x, imm) __extension__({ \ + int32x4_t __xsse_tmp = vreinterpretq_s32_s8(x); \ + vreinterpretq_s8_s32((int32x4_t) { \ + (int32_t) vgetq_lane_s32(__xsse_tmp, ((imm) >> 0) & 0x3), \ + (int32_t) vgetq_lane_s32(__xsse_tmp, ((imm) >> 2) & 0x3), \ + (int32_t) vgetq_lane_s32(__xsse_tmp, ((imm) >> 4) & 0x3), \ + (int32_t) vgetq_lane_s32(__xsse_tmp, ((imm) >> 6) & 0x3) \ + }); \ + }) +#define _mm_shufflehi_epi16(x, imm) __extension__({ \ + int16x8_t __xsse_tmp = vreinterpretq_s16_s8(x); \ + vreinterpretq_s8_s16(vcombine_s16( \ + vget_low_s16(__xsse_tmp), \ + (int16x4_t) { \ + (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 0) & 0x3) + 4), \ + (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 2) & 0x3) + 4), \ + (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 4) & 0x3) + 4), \ + (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 6) & 0x3) + 4) \ + } \ + )); \ + }) +#define _mm_shufflelo_epi16(x, imm) __extension__({ \ + int16x8_t __xsse_tmp = vreinterpretq_s16_s8(x); \ + vreinterpretq_s8_s16(vcombine_s16( \ + (int16x4_t) { \ + (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 0) & 0x3)), \ + (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 2) & 0x3)), \ + (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 4) & 0x3)), \ + (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 6) & 0x3)) \ + }, \ + vget_high_s16(__xsse_tmp) \ + )); \ + }) +#else +static XSSE_FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i x, int imm) +{ + int32x4_t vec = vreinterpretq_s32_s8(x); + int32_t arr[4]; + vst1q_s32(arr, vec); + + return vreinterpretq_s8_s32((int32x4_t) { + arr[(imm >> 0) & 0x3], + arr[(imm >> 2) & 0x3], + arr[(imm >> 4) & 0x3], + arr[(imm >> 6) & 0x3] + }); +} +static XSSE_FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i x, int imm) +{ + int16x8_t vec = vreinterpretq_s16_s8(x); + int16_t arr[8]; + vst1q_s16(arr, vec); + + return vreinterpretq_s8_s16((int16x8_t) { + arr[0], arr[1], 
arr[2], arr[3], + arr[((imm >> 0) & 0x3) + 4], + arr[((imm >> 2) & 0x3) + 4], + arr[((imm >> 4) & 0x3) + 4], + arr[((imm >> 6) & 0x3) + 4] + }); +} +static XSSE_FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i x, int imm) +{ + int16x8_t vec = vreinterpretq_s16_s8(x); + int16_t arr[8]; + vst1q_s16(arr, vec); + + return vreinterpretq_s8_s16((int16x8_t) { + arr[((imm >> 0) & 0x3)], + arr[((imm >> 2) & 0x3)], + arr[((imm >> 4) & 0x3)], + arr[((imm >> 6) & 0x3)], + arr[4], arr[5], arr[6], arr[7] + }); +} +#endif + +#define _mm_unpackhi_epi8(a, b) (vzip2q_s8((a), (b))) +#define _mm_unpackhi_epi16(a, b) (vreinterpretq_s8_s16(vzip2q_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))) +#define _mm_unpackhi_epi32(a, b) (vreinterpretq_s8_s32(vzip2q_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)))) +#define _mm_unpackhi_epi64(a, b) (vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(a), vreinterpretq_s64_s8(b)))) + +#define _mm_unpacklo_epi8(a, b) (vzip1q_s8((a), (b))) +#define _mm_unpacklo_epi16(a, b) (vreinterpretq_s8_s16(vzip1q_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))) +#define _mm_unpacklo_epi32(a, b) (vreinterpretq_s8_s32(vzip1q_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)))) +#define _mm_unpacklo_epi64(a, b) (vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(a), vreinterpretq_s64_s8(b)))) + +#define _mm_move_epi64(x) (vreinterpretq_s8_s64((int64x2_t) { vgetq_lane_s64(vreinterpretq_s64_s8(x), 0), 0 })) + +#define _mm_clflush(x) ((void) 0) +static XSSE_FORCE_INLINE void _mm_mfence(void) +{ + __asm__ __volatile__("dsb sy" ::: "memory"); +} +static XSSE_FORCE_INLINE void _mm_lfence(void) +{ + __asm__ __volatile__("dsb ld" ::: "memory"); +} +static XSSE_FORCE_INLINE void _mm_pause(void) +{ + __asm__ __volatile__("yield"); +} + +#endif /* SSE2 */ + + +/***************************************************************************** + * * + * SSE3 * + * * + *****************************************************************************/ + +#if defined(__SSE3__) +#include +#define XSSE3 + + +#elif defined(__aarch64__) || defined(_M_ARM64) +#define XSSE3 + +/***************************************************************************** + * Load / Store * + *****************************************************************************/ + +#define _mm_lddqu_si128(x) _mm_load_si128(x) + +#endif /* SSE3 */ + + +/***************************************************************************** + * * + * SSSE3 * + * * + *****************************************************************************/ + +#if defined(__SSSE3__) +#include +#define XSSSE3 + + +#elif defined(__aarch64__) || defined(_M_ARM64) +#define XSSSE3 + +/***************************************************************************** + * Bit shift / Bit wise * + *****************************************************************************/ + +static XSSE_FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm) +{ + switch (imm) { + case 0:; + int8x16_t b2 = b; + return b2; + case 1: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 1)); + case 2: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 2)); + case 3: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 3)); + case 4: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 4)); + case 5: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 5)); + case 6: return 
vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 6)); + case 7: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 7)); + case 8: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 8)); + case 9: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 9)); + case 10: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 10)); + case 11: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 11)); + case 12: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 12)); + case 13: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 13)); + case 14: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 14)); + case 15: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(b), vreinterpretq_u8_s8(a), 15)); + case 16:; + int8x16_t a2 = a; + return a2; + case 17: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 1)); + case 18: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 2)); + case 19: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 3)); + case 20: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 4)); + case 21: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 5)); + case 22: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 6)); + case 23: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 7)); + case 24: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 8)); + case 25: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 9)); + case 26: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 10)); + case 27: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 11)); + case 28: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 12)); + case 29: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 13)); + case 30: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 14)); + case 31: return vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(a), vdupq_n_u8(0), 15)); + default: return vdupq_n_s8(0); + } +} + + +/***************************************************************************** + * Integer Arithmetic Operations * + *****************************************************************************/ + +#define _mm_abs_epi8(x) (vabsq_s8(x)) +#define _mm_abs_epi16(x) (vreinterpretq_s8_s16(vabsq_s16(vreinterpretq_s16_s8(x)))) +#define _mm_abs_epi32(x) (vreinterpretq_s8_s32(vabsq_s32(vreinterpretq_s32_s8(x)))) + +#define _mm_hadd_epi16(a, b) (vreinterpretq_s8_u16(vpaddq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)))) +#define _mm_hadd_epi32(a, b) (vreinterpretq_s8_u32(vpaddq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)))) +static XSSE_FORCE_INLINE __m128i _mm_hadds_epi16(__m128i a, __m128i b) +{ + int16x8_t a16 = vreinterpretq_s16_s8(a); + int16x8_t b16 = vreinterpretq_s16_s8(b); + int16x8_t evens = vuzp1q_s16(a16, b16); + int16x8_t odds = vuzp2q_s16(a16, b16); + return vreinterpretq_s8_s16(vqaddq_s16(evens, odds)); +} + +static XSSE_FORCE_INLINE __m128i _mm_hsub_epi16(__m128i a, __m128i b) +{ + uint16x8_t a16 = vreinterpretq_u16_s8(a); + uint16x8_t b16 = vreinterpretq_u16_s8(b); + 
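/* Horizontal subtract, as in SSE2 phsubw: deinterleave into even/odd 16-bit lanes and subtract odd from even, using unsigned lanes so the wrap-around stays well-defined. */
+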
uint16x8_t evens = vuzp1q_u16(a16, b16); + uint16x8_t odds = vuzp2q_u16(a16, b16); + return vreinterpretq_s8_u16(vsubq_u16(evens, odds)); +} +static XSSE_FORCE_INLINE __m128i _mm_hsub_epi32(__m128i a, __m128i b) +{ + uint32x4_t a32 = vreinterpretq_u32_s8(a); + uint32x4_t b32 = vreinterpretq_u32_s8(b); + uint32x4_t evens = vuzp1q_u32(a32, b32); + uint32x4_t odds = vuzp2q_u32(a32, b32); + return vreinterpretq_s8_u32(vsubq_u32(evens, odds)); +} +static XSSE_FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i a, __m128i b) +{ + int16x8_t a16 = vreinterpretq_s16_s8(a); + int16x8_t b16 = vreinterpretq_s16_s8(b); + int16x8_t evens = vuzp1q_s16(a16, b16); + int16x8_t odds = vuzp2q_s16(a16, b16); + return vreinterpretq_s8_s16(vqsubq_s16(evens, odds)); +} + +static XSSE_FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i a, __m128i b) +{ + uint8x16_t ua = vreinterpretq_u8_s8(a); + uint8x8_t ua_lo = vget_low_u8(ua); + uint8x8_t ua_hi = vget_high_u8(ua); + + uint16x8_t wide_ua_lo = vmovl_u8(ua_lo); + uint16x8_t wide_ua_hi = vmovl_u8(ua_hi); + + int16x8_t wide_a_lo = vreinterpretq_s16_u16(wide_ua_lo); + int16x8_t wide_a_hi = vreinterpretq_s16_u16(wide_ua_hi); + + int8x8_t b_lo = vget_low_s8(b); + int8x8_t b_hi = vget_high_s8(b); + + int16x8_t wide_b_lo = vmovl_s8(b_lo); + int16x8_t wide_b_hi = vmovl_s8(b_hi); + + int16x8_t mul_lo = vmulq_s16(wide_a_lo, wide_b_lo); + int16x8_t mul_hi = vmulq_s16(wide_a_hi, wide_b_hi); + + uint16x8_t umul_lo = vreinterpretq_u16_s16(mul_lo); + uint16x8_t umul_hi = vreinterpretq_u16_s16(mul_hi); + + return vreinterpretq_s8_u16(vpaddq_u16(umul_lo, umul_hi)); +} + +static XSSE_FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + int16x8_t a16 = vreinterpretq_s16_s8(a); + int16x8_t b16 = vreinterpretq_s16_s8(b); + + int16x4_t a_lo = vget_low_s16(a16); + int16x4_t a_hi = vget_high_s16(a16); + int16x4_t b_lo = vget_low_s16(b16); + int16x4_t b_hi = vget_high_s16(b16); + + int32x4_t mul_lo = vmull_s16(a_lo, b_lo); + int32x4_t mul_hi = vmull_s16(a_hi, b_hi); + + int32x4_t repeat_0x4000 = vdupq_n_s32(0x4000); + + int32x4_t add_lo = vaddq_s32(mul_lo, repeat_0x4000); + int32x4_t add_hi = vaddq_s32(mul_hi, repeat_0x4000); + + int32x4_t lo = vshrq_n_s32(add_lo, 15); + int32x4_t hi = vshrq_n_s32(add_hi, 15); + + int16x4_t narrowed_lo = vqmovn_s32(lo); + int16x4_t narrowed_hi = vqmovn_s32(hi); + + return vreinterpretq_s8_s16(vcombine_s16(narrowed_lo, narrowed_hi)); +} + +static XSSE_FORCE_INLINE __m128i _mm_sign_epi8(__m128i a, __m128i b) +{ + int8x16_t repeat_0x00 = vdupq_n_s8(0); + int8x16_t repeat_0x01 = vdupq_n_s8(1); + + uint8x16_t upos_cmp_ret = vcltq_s8(repeat_0x00, b); + uint8x16_t uneg_cmp_ret = vcltq_s8(b, repeat_0x00); + + int8x16_t pos_cmp_ret = vreinterpretq_s8_u8(upos_cmp_ret); + int8x16_t pos_mask = vandq_s8(pos_cmp_ret, repeat_0x01); + int8x16_t neg_mask = vreinterpretq_s8_u8(uneg_cmp_ret); + + int8x16_t mask = vorrq_s8(pos_mask, neg_mask); + + return vmulq_s8(a, mask); +} +static XSSE_FORCE_INLINE __m128i _mm_sign_epi16(__m128i a, __m128i b) +{ + int16x8_t a16 = vreinterpretq_s16_s8(a); + int16x8_t b16 = vreinterpretq_s16_s8(b); + + int16x8_t repeat_0x00 = vdupq_n_s16(0); + int16x8_t repeat_0x01 = vdupq_n_s16(1); + + uint16x8_t upos_cmp_ret = vcltq_s16(repeat_0x00, b16); + uint16x8_t uneg_cmp_ret = vcltq_s16(b16, repeat_0x00); + + int16x8_t pos_cmp_ret = vreinterpretq_s16_u16(upos_cmp_ret); + int16x8_t pos_mask = vandq_s16(pos_cmp_ret, repeat_0x01); + int16x8_t neg_mask = vreinterpretq_s16_u16(uneg_cmp_ret); + + int16x8_t mask = vorrq_s16(pos_mask, neg_mask); + + return 
vreinterpretq_s8_s16(vmulq_s16(a16, mask)); +} +static XSSE_FORCE_INLINE __m128i _mm_sign_epi32(__m128i a, __m128i b) +{ + int32x4_t a32 = vreinterpretq_s32_s8(a); + int32x4_t b32 = vreinterpretq_s32_s8(b); + + int32x4_t repeat_0x00 = vdupq_n_s32(0); + int32x4_t repeat_0x01 = vdupq_n_s32(1); + + uint32x4_t upos_cmp_ret = vcltq_s32(repeat_0x00, b32); + uint32x4_t uneg_cmp_ret = vcltq_s32(b32, repeat_0x00); + + int32x4_t pos_cmp_ret = vreinterpretq_s32_u32(upos_cmp_ret); + int32x4_t pos_mask = vandq_s32(pos_cmp_ret, repeat_0x01); + int32x4_t neg_mask = vreinterpretq_s32_u32(uneg_cmp_ret); + + int32x4_t mask = vorrq_s32(pos_mask, neg_mask); + + return vreinterpretq_s8_s32(vmulq_s32(a32, mask)); +} + + +/***************************************************************************** + * Others * + *****************************************************************************/ + +static XSSE_FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + uint8x16_t index = vreinterpretq_u8_s8(b); + uint8x16_t repeat_0x0F = vdupq_n_u8(0x0F); + uint8x16_t repeat_0x80 = vdupq_n_u8(0x80); + + uint8x16_t and_mask = vandq_u8(index, repeat_0x0F); + uint8x16_t cmp_mask = vcgeq_u8(index, repeat_0x80); + uint8x16_t masked_index = vorrq_u8(and_mask, cmp_mask); + return vqtbl1q_s8(a, masked_index); +} + +#endif /* SSSE3 */ + + +/***************************************************************************** + * * + * SSE4.1 * + * * + *****************************************************************************/ + +#if defined(__SSE4_1__) +#include +#define XSSE4_1 + + +#elif defined(__aarch64__) || defined(_M_ARM64) +#define XSSE4_1 + +/***************************************************************************** + * Load / Store * + *****************************************************************************/ + +#define _mm_stream_load_si128(x) _mm_load_si128(x) + + +/***************************************************************************** + * Integer Arithmetic Operations * + *****************************************************************************/ + +#define _mm_max_epi8(a, b) (vmaxq_s8((a), (b))) +#define _mm_max_epi32(a, b) (vreinterpretq_s8_s32(vmaxq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)))) +#define _mm_max_epu16(a, b) (vreinterpretq_s8_u16(vmaxq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)))) +#define _mm_max_epu32(a, b) (vreinterpretq_s8_u32(vmaxq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)))) + +#define _mm_min_epi8(a, b) (vminq_s8((a), (b))) +#define _mm_min_epi32(a, b) (vreinterpretq_s8_s32(vminq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)))) +#define _mm_min_epu16(a, b) (vreinterpretq_s8_u16(vminq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)))) +#define _mm_min_epu32(a, b) (vreinterpretq_s8_u32(vminq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)))) + +static XSSE_FORCE_INLINE __m128i _mm_minpos_epu16(__m128i x) +{ + uint16x8_t x16 = vreinterpretq_u16_s8(x); + + uint16x4_t lo = vget_low_u16(x16); + uint16x4_t hi = vget_high_u16(x16); + uint16x4_t min4 = vmin_u16(lo, hi); + + uint16x4_t min2 = vpmin_u16(min4, min4); + uint16x4_t min1 = vpmin_u16(min2, min2); + + uint16_t min = (uint16_t) vget_lane_u16(min1, 0); + uint16x8_t repeat_min = vdupq_n_u16(min); + + uint16x8_t cmp = vceqq_u16(x16, repeat_min); + uint8x8_t narrowed_cmp = vmovn_u16(cmp); + uint64x1_t cmp_64 = vreinterpret_u64_u8(narrowed_cmp); + + uint64_t index_mask8 = (uint64_t) vget_lane_u64(cmp_64, 0); + + uint16_t index = 0; +#if (defined(__has_builtin) && 
__has_builtin(__builtin_ctzll)) || defined(__GNUC__) || defined(__clang__) + index = (uint16_t) __builtin_ctzll(index_mask8) / 8; +#else + for (int i = 0; i < 8; i++) { + if (index_mask8 & 1) { + index = (uint16_t) i; + break; + } + index_mask8 >>= 8; + } +#endif + return vreinterpretq_s8_u16((uint16x8_t) { min, index, 0, 0, 0, 0, 0, 0 }); +} + +static XSSE_FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + int32x4_t a32 = vreinterpretq_s32_s8(a); + int32x4_t b32 = vreinterpretq_s32_s8(b); + int32x4_t evens = vuzp1q_s32(a32, b32); + int32x2_t lo = vget_low_s32(evens); + int32x2_t hi = vget_high_s32(evens); + return vreinterpretq_s8_s64(vmull_s32(lo, hi)); +} +static XSSE_FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + int32x4_t a32 = vreinterpretq_s32_s8(a); + int32x4_t b32 = vreinterpretq_s32_s8(b); + return vreinterpretq_s8_s32(vmulq_s32(a32, b32)); +} + +static XSSE_FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm8) +{ + uint8x16_t ua = vreinterpretq_u8_s8(a); + + uint8x8_t a_sliced_0_4; + uint8x8_t a_sliced_1_5; + uint8x8_t a_sliced_2_6; + uint8x8_t a_sliced_3_7; + uint8x16_t a_pre_sliced_0_4; + uint8x16_t a_pre_sliced_1_5; + uint8x16_t a_pre_sliced_2_6; + uint8x16_t a_pre_sliced_3_7; + if ((imm8 >> 2) & 1) { + a_pre_sliced_0_4 = vextq_u8(ua, ua, 4); + a_pre_sliced_1_5 = vextq_u8(ua, ua, 5); + a_pre_sliced_2_6 = vextq_u8(ua, ua, 6); + a_pre_sliced_3_7 = vextq_u8(ua, ua, 7); + a_sliced_0_4 = vget_low_u8(a_pre_sliced_0_4); + } else { + a_pre_sliced_1_5 = vextq_u8(ua, ua, 1); + a_pre_sliced_2_6 = vextq_u8(ua, ua, 2); + a_pre_sliced_3_7 = vextq_u8(ua, ua, 3); + a_sliced_0_4 = vget_low_u8(ua); + } + a_sliced_1_5 = vget_low_u8(a_pre_sliced_1_5); + a_sliced_2_6 = vget_low_u8(a_pre_sliced_2_6); + a_sliced_3_7 = vget_low_u8(a_pre_sliced_3_7); + + uint32x4_t b32 = vreinterpretq_u32_s8(b); + uint8x8_t b_slicedx2; + switch (imm8 & 0x03) { + case 0: + b_slicedx2 = vreinterpret_u8_u32(vdup_n_u32(vgetq_lane_u32(b32, 0))); + break; + case 1: + b_slicedx2 = vreinterpret_u8_u32(vdup_n_u32(vgetq_lane_u32(b32, 1))); + break; + case 2: + b_slicedx2 = vreinterpret_u8_u32(vdup_n_u32(vgetq_lane_u32(b32, 2))); + break; + case 3: + b_slicedx2 = vreinterpret_u8_u32(vdup_n_u32(vgetq_lane_u32(b32, 3))); + break; + } + + uint16x8_t abs_diffs_0_4 = vabdl_u8(a_sliced_0_4, b_slicedx2); + uint16x8_t abs_diffs_1_5 = vabdl_u8(a_sliced_1_5, b_slicedx2); + uint16x8_t abs_diffs_2_6 = vabdl_u8(a_sliced_2_6, b_slicedx2); + uint16x8_t abs_diffs_3_7 = vabdl_u8(a_sliced_3_7, b_slicedx2); + + uint16x8_t abs_diffs_0_4_2_6 = vpaddq_u16(abs_diffs_0_4, abs_diffs_2_6); + uint16x8_t abs_diffs_1_5_3_7 = vpaddq_u16(abs_diffs_1_5, abs_diffs_3_7); + + uint32x4_t abs_diffs_0_4_2_6x32 = vreinterpretq_u32_u16(abs_diffs_0_4_2_6); + uint32x4_t abs_diffs_1_5_3_7x32 = vreinterpretq_u32_u16(abs_diffs_1_5_3_7); + uint32x4_t abs_diffs_0_1_2_3x32 = vtrn1q_u32(abs_diffs_0_4_2_6x32, abs_diffs_1_5_3_7x32); + uint32x4_t abs_diffs_4_5_6_7x32 = vtrn2q_u32(abs_diffs_0_4_2_6x32, abs_diffs_1_5_3_7x32); + + uint16x8_t abs_diffs_0_1_2_3 = vreinterpretq_u16_u32(abs_diffs_0_1_2_3x32); + uint16x8_t abs_diffs_4_5_6_7 = vreinterpretq_u16_u32(abs_diffs_4_5_6_7x32); + + return vreinterpretq_s8_u16(vpaddq_u16(abs_diffs_0_1_2_3, abs_diffs_4_5_6_7)); +} + + +/***************************************************************************** + * Comparison * + *****************************************************************************/ + +#define _mm_cmpeq_epi64(a, b) (vreinterpretq_s8_u64(vceqq_s64(vreinterpretq_s64_s8(a), 
vreinterpretq_s64_s8(b)))) + + +/***************************************************************************** + * Convert * + *****************************************************************************/ + +#define _mm_cvtepi8_epi16(x) (vreinterpretq_s8_s16(vmovl_s8(vget_low_s8(x)))) +#define _mm_cvtepi8_epi32(x) (vreinterpretq_s8_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))))) +#define _mm_cvtepi8_epi64(x) \ + (vreinterpretq_s8_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))))))) +#define _mm_cvtepi16_epi32(x) (vreinterpretq_s8_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s8(x))))) +#define _mm_cvtepi16_epi64(x) \ + (vreinterpretq_s8_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s8(x))))))) +#define _mm_cvtepi32_epi64(x) (vreinterpretq_s8_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(x))))) + +#define _mm_cvtepu8_epi16(x) (vreinterpretq_s8_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(x))))) +#define _mm_cvtepu8_epi32(x) (vreinterpretq_s8_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(x))))))) +#define _mm_cvtepu8_epi64(x) \ + (vreinterpretq_s8_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(x))))))))) +#define _mm_cvtepu16_epi32(x) (vreinterpretq_s8_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s8(x))))) +#define _mm_cvtepu16_epi64(x) \ + (vreinterpretq_s8_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s8(x))))))) +#define _mm_cvtepu32_epi64(x) (vreinterpretq_s8_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s8(x))))) + + +/***************************************************************************** + * Tests * + *****************************************************************************/ + +static XSSE_FORCE_INLINE int _mm_test_all_ones(__m128i x) +{ + uint64x2_t x64 = vreinterpretq_u64_s8(x); + return (vgetq_lane_u64(x64, 0) == ~0ULL) && (vgetq_lane_u64(x64, 1) == ~0ULL); +} +static XSSE_FORCE_INLINE int _mm_test_all_zeros(__m128i mask, __m128i x) +{ + int8x16_t masked = vandq_s8(x, mask); + uint64x2_t masked64 = vreinterpretq_u64_s8(masked); + return (vgetq_lane_u64(masked64, 0) == 0) && (vgetq_lane_u64(masked64, 1) == 0); +} +static XSSE_FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i mask, __m128i x) +{ + int8x16_t and = vandq_s8(x, mask); + uint64x2_t and64 = vreinterpretq_u64_s8(and); + int has_ones = (vgetq_lane_u64(and64, 0) | vgetq_lane_u64(and64, 1)) != 0; + + int8x16_t andnot = vbicq_s8(x, mask); + uint64x2_t andnot64 = vreinterpretq_u64_s8(andnot); + int has_zeros = (vgetq_lane_u64(andnot64, 0) | vgetq_lane_u64(andnot64, 1)) != 0; + + return has_ones && has_zeros; +} +static XSSE_FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int8x16_t andnot = vbicq_s8(b, a); + uint64x2_t andnot64 = vreinterpretq_u64_s8(andnot); + return (vgetq_lane_u64(andnot64, 0) == 0) && (vgetq_lane_u64(andnot64, 1) == 0); +} +#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) +#define _mm_testz_si128(a, b) _mm_test_all_zeros(a, b) + + +/***************************************************************************** + * Others * + *****************************************************************************/ + +#define _mm_packus_epi32(a, b) \ + (vreinterpretq_s8_u16(vcombine_u16(vqmovun_s32(vreinterpretq_s32_s8(a)), vqmovun_s32(vreinterpretq_s32_s8(b))))) + +#define _mm_extract_epi8(x, imm) (vgetq_lane_s8((x), (imm))) +#define _mm_extract_epi32(x, imm) (vgetq_lane_s32(vreinterpretq_s32_s8(x), (imm))) +#define _mm_extract_epi64(x, imm) 
(vgetq_lane_s64(vreinterpretq_s64_s8(x), (imm))) + +#define _mm_insert_epi8(x, val, imm) (vsetq_lane_s8((int8_t) (val), (x), (imm))) +#define _mm_insert_epi32(x, val, imm) (vreinterpretq_s8_s32(vsetq_lane_s32((int32_t) (val), vreinterpretq_s32_s8(x), (imm)))) +#define _mm_insert_epi64(x, val, imm) (vreinterpretq_s8_s64(vsetq_lane_s64((int64_t) (val), vreinterpretq_s64_s8(x), (imm)))) + +#define _mm_blend_epi16(a, b, imm8) \ + (vreinterpretq_s8_s16(vbslq_s16( \ + (uint16x8_t){ \ + (imm8 & (1 << 0)) ? 0xFFFF : 0x0000, \ + (imm8 & (1 << 1)) ? 0xFFFF : 0x0000, \ + (imm8 & (1 << 2)) ? 0xFFFF : 0x0000, \ + (imm8 & (1 << 3)) ? 0xFFFF : 0x0000, \ + (imm8 & (1 << 4)) ? 0xFFFF : 0x0000, \ + (imm8 & (1 << 5)) ? 0xFFFF : 0x0000, \ + (imm8 & (1 << 6)) ? 0xFFFF : 0x0000, \ + (imm8 & (1 << 7)) ? 0xFFFF : 0x0000 \ + }, \ + vreinterpretq_s16_s8(b), \ + vreinterpretq_s16_s8(a) \ + ))) +static XSSE_FORCE_INLINE __m128i _mm_blendv_epi8(__m128i a, __m128i b, __m128i mask) +{ + uint8x16_t umask = vreinterpretq_u8_s8(mask); + uint8x16_t repeat_0x80 = vdupq_n_u8(0x80); + uint8x16_t mask_fill = vcgeq_u8(umask, repeat_0x80); + return vbslq_s8(mask_fill, b, a); +} + +#endif /* SSE4_1 */ + + +/***************************************************************************** + * * + * SSE4.2 * + * * + *****************************************************************************/ + +#if defined(__SSE4_2__) +#include +#define XSSE4_2 + + +#elif defined(__aarch64__) || defined(_M_ARM64) +#define XSSE4_2 + +/***************************************************************************** + * Comparison * + *****************************************************************************/ + +#define _mm_cmpgt_epi64(a, b) (vreinterpretq_s8_u64(vcgtq_s64(vreinterpretq_s64_s8(a), vreinterpretq_s64_s8(b)))) + + +/***************************************************************************** + * Packed Compare * + *****************************************************************************/ + +/* [1:0]*/ +#define _SIDD_UBYTE_OPS 0x00 +#define _SIDD_UWORD_OPS 0x01 +#define _SIDD_SBYTE_OPS 0x02 +#define _SIDD_SWORD_OPS 0x03 + +/* [3:2] */ +#define _SIDD_CMP_EQUAL_ANY 0x00 +#define _SIDD_CMP_RANGES 0x04 +#define _SIDD_CMP_EQUAL_EACH 0x08 +#define _SIDD_CMP_EQUAL_ORDERED 0x0C + +/* [5:4] */ +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_NEGATIVE_POLARITY 0x10 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 + +/* [6] */ +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +/* [6] */ +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + +typedef struct { + int8_t cf; + int8_t zf; + int8_t sf; + uint8x16_t mask; +} _xsse_pcmp_str_result_t; + +XSSE_ATTR_CONST +static XSSE_FORCE_INLINE int _xsse_seach_most_significant_byte_index(uint64_t mask) +{ +#if (defined(__has_builtin) && __has_builtin(__builtin_clzll)) || defined(__GNUC__) || defined(__clang__) + return (int) 7 - (__builtin_clzll(mask) / 8); +#else + for (int i = 0; i < 8; ++i) { + uint8_t byte = (mask >> ((7 - i) * 8)) & 0xFF; + if (byte != 0) { + return 7 - i; + } + } + XSSE_UNREACHABLE(); +#endif +} + +XSSE_ATTR_CONST +static XSSE_FORCE_INLINE int _xsse_seach_least_significant_byte_index(uint64_t mask) +{ +#if (defined(__has_builtin) && __has_builtin(__builtin_ctzll)) || defined(__GNUC__) || defined(__clang__) + return (int) __builtin_ctzll(mask) / 8; +#else + for (int i = 0; i < 8; i++) { + if (mask & 0xFF) { + return i; + } + mask >>= 8; + } + XSSE_UNREACHABLE(); +#endif +} + +XSSE_ATTR_CONST +static 
XSSE_FORCE_INLINE int _xsse_seach_most_significant_word_index(uint64_t mask) +{ +#if (defined(__has_builtin) && __has_builtin(__builtin_clzll)) || defined(__GNUC__) || defined(__clang__) + return (int) 3 - (__builtin_clzll(mask) / 16); +#else + for (int i = 0; i < 4; ++i) { + uint16_t byte = (mask >> ((3 - i) * 16)) & 0xFFFF; + if (byte != 0) { + return 3 - i; + } + } + XSSE_UNREACHABLE(); +#endif +} + +XSSE_ATTR_CONST +static XSSE_FORCE_INLINE int _xsse_seach_least_significant_word_index(uint64_t mask) +{ +#if (defined(__has_builtin) && __has_builtin(__builtin_ctzll)) || defined(__GNUC__) || defined(__clang__) + return (int) __builtin_ctzll(mask) / 16; +#else + for (int i = 0; i < 4; i++) { + if (mask & 0xFFFF) { + return i; + } + mask >>= 16; + } + XSSE_UNREACHABLE(); +#endif +} + +#ifndef XSSE_IS_OPTIMIZE +XSSE_ATTR_OPTIMIZE_O2 +#endif +XSSE_ATTR_CONST +static XSSE_FORCE_INLINE _xsse_pcmp_str_result_t _xsse_pcmp_str_core(const __m128i a, int la, const __m128i b, int lb, const int imm8, const int check_null) +{ + if (imm8 & 0x01) { + /* word */ +#define XSSE_PCMP_ANY_WORD(i, l) \ + tmp_cmp_##i = (imm8 & 0x02) ? vceqq_s16(vdupq_n_s16(vgetq_lane_s16(a16, l)), b16) : vceqq_u16(vdupq_n_u16(vgetq_lane_u16(ua16, l)), ub16) +#define XSSE_PCMP_RANGES_WORD(i, l) \ + tmp_cmp_##i = (imm8 & 0x02) \ + ? vandq_u16(vcleq_s16(vdupq_n_s16(vgetq_lane_s16(a16, (l))), b16), vcleq_s16(b16, vdupq_n_s16(vgetq_lane_s16(a16, (l + 1))))) \ + : vandq_u16(vcleq_u16(vdupq_n_u16(vgetq_lane_u16(ua16, (l))), ub16), vcleq_u16(ub16, vdupq_n_u16(vgetq_lane_u16(ua16, (l + 1))))) +#define XSSE_PCMP_EACH_WORD() cmp_ret = (imm8 & 0x02) ? vceqq_s16(a16, b16) : vceqq_u16(ua16, ub16) +#define XSSE_PCMP_ORDERED_WORD(i, l) \ + tmp_cmp_##i = (imm8 & 0x02) \ + ? vextq_u16(vbicq_u16(vceqq_s16(vdupq_n_s16(vgetq_lane_s16(a16, l)), b16), b_invalid_mask), repeat_full_bits, l) \ + : vextq_u16(vbicq_u16(vceqq_u16(vdupq_n_u16(vgetq_lane_u16(ua16, l)), ub16), b_invalid_mask), repeat_full_bits, l) + +#define XSSE_VORR_U16(a, b) tmp_cmp_##a = vorrq_u16(tmp_cmp_##a, tmp_cmp_##b); +#define XSSE_VAND_U16(a, b) tmp_cmp_##a = vandq_u16(tmp_cmp_##a, tmp_cmp_##b); + + if (check_null) { + uint16x8_t repeat_nul = vdupq_n_u16(0); + uint16x8_t cmp_nul_a = vceqq_u16(vreinterpretq_u16_s8(a), repeat_nul); + uint16x8_t cmp_nul_b = vceqq_u16(vreinterpretq_u16_s8(b), repeat_nul); + + uint64_t low_a = vgetq_lane_u64(vreinterpretq_u64_u16(cmp_nul_a), 0); + uint64_t high_a = vgetq_lane_u64(vreinterpretq_u64_u16(cmp_nul_a), 1); + if (low_a != 0) { + la = _xsse_seach_least_significant_word_index(low_a); + } else if (high_a != 0) { + la = _xsse_seach_least_significant_word_index(high_a) + 4; + } else { + la = 8; + } + + uint64_t low_b = vgetq_lane_u64(vreinterpretq_u64_u16(cmp_nul_b), 0); + uint64_t high_b = vgetq_lane_u64(vreinterpretq_u64_u16(cmp_nul_b), 1); + if (low_b != 0) { + lb = _xsse_seach_least_significant_word_index(low_b); + } else if (high_b != 0) { + lb = _xsse_seach_least_significant_word_index(high_b) + 4; + } else { + lb = 8; + } + } + + uint16x8_t cmp_ret; + uint16x8_t lanes = { 0, 1, 2, 3, 4, 5, 6, 7 }; + uint16x8_t a_threshold = vdupq_n_u16(la); + uint16x8_t b_threshold = vdupq_n_u16(lb); + uint16x8_t a_invalid_mask = vcgeq_u16(lanes, a_threshold); + uint16x8_t b_invalid_mask = vcgeq_u16(lanes, b_threshold); + + int16x8_t a16, b16; + uint16x8_t ua16, ub16; + if (imm8 & 0x02) { + a16 = vreinterpretq_s16_s8(a); + b16 = vreinterpretq_s16_s8(b); + } else { + ua16 = vreinterpretq_u16_s8(a); + ub16 = vreinterpretq_u16_s8(b); + } + + /* mode */ + switch 
((imm8 >> 2) & 0x03) { + case _SIDD_CMP_EQUAL_ANY >> 2: + { + cmp_ret = vdupq_n_u16(0); + if (la >= 8) { + uint16x8_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3, tmp_cmp_4, tmp_cmp_5, tmp_cmp_6, tmp_cmp_7; + XSSE_PCMP_ANY_WORD(0, 0); XSSE_PCMP_ANY_WORD(1, 1); XSSE_PCMP_ANY_WORD(2, 2); XSSE_PCMP_ANY_WORD(3, 3); + XSSE_PCMP_ANY_WORD(4, 4); XSSE_PCMP_ANY_WORD(5, 5); XSSE_PCMP_ANY_WORD(6, 6); XSSE_PCMP_ANY_WORD(7, 7); + + XSSE_VORR_U16(0, 1); XSSE_VORR_U16(2, 3); XSSE_VORR_U16(4, 5); XSSE_VORR_U16(6, 7); + XSSE_VORR_U16(0, 2); XSSE_VORR_U16(4, 6); XSSE_VORR_U16(0, 4); + cmp_ret = vorrq_u16(cmp_ret, tmp_cmp_0); + } else { + int checked = 0; + int la2 = la; + if (la2 >= 4) { + uint16x8_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3; + XSSE_PCMP_ANY_WORD(0, 0); XSSE_PCMP_ANY_WORD(1, 1); XSSE_PCMP_ANY_WORD(2, 2); XSSE_PCMP_ANY_WORD(3, 3); + XSSE_VORR_U16(0, 1); XSSE_VORR_U16(2, 3); XSSE_VORR_U16(0, 2); + cmp_ret = vorrq_u16(cmp_ret, tmp_cmp_0); + checked = 4; + la2 -= 4; + } + if (la2 >= 2) { + uint16x8_t tmp_cmp_0, tmp_cmp_1; + if (checked == 4) { + XSSE_PCMP_ANY_WORD(0, 4); XSSE_PCMP_ANY_WORD(1, 5); + } else { + XSSE_PCMP_ANY_WORD(0, 0); XSSE_PCMP_ANY_WORD(1, 1); + } + XSSE_VORR_U16(0, 1); + cmp_ret = vorrq_u16(cmp_ret, tmp_cmp_0); + checked += 2; + la2 -= 2; + } + if (la2 >= 1) { + uint16x8_t tmp_cmp_0; + switch (checked) { + case 6: XSSE_PCMP_ANY_WORD(0, 6); break; + case 4: XSSE_PCMP_ANY_WORD(0, 4); break; + case 2: XSSE_PCMP_ANY_WORD(0, 2); break; + case 0: XSSE_PCMP_ANY_WORD(0, 0); break; + default: XSSE_UNREACHABLE(); + } + cmp_ret = vorrq_u16(cmp_ret, tmp_cmp_0); + } + } + cmp_ret = vbicq_u16(cmp_ret, b_invalid_mask); + } + break; + + case _SIDD_CMP_RANGES >> 2: + { + cmp_ret = vdupq_n_u16(0); + if (la >= 8) { + uint16x8_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3; + XSSE_PCMP_RANGES_WORD(0, 0); XSSE_PCMP_RANGES_WORD(1, 2); XSSE_PCMP_RANGES_WORD(2, 4); XSSE_PCMP_RANGES_WORD(3, 6); + XSSE_VORR_U16(0, 1); XSSE_VORR_U16(2, 3); XSSE_VORR_U16(0, 2); + cmp_ret = vorrq_u16(cmp_ret, tmp_cmp_0); + } else { + int checked = 0; + int la2 = la; + if (la2 >= 4) { + uint16x8_t tmp_cmp_0, tmp_cmp_1; + XSSE_PCMP_RANGES_WORD(0, 0); XSSE_PCMP_RANGES_WORD(1, 2); + XSSE_VORR_U16(0, 1); + cmp_ret = vorrq_u16(cmp_ret, tmp_cmp_0); + checked = 4; + la2 -= 4; + } + if (la2 >= 2) { + uint16x8_t tmp_cmp_0; + if (checked == 4) { + XSSE_PCMP_RANGES_WORD(0, 4); + } else { + XSSE_PCMP_RANGES_WORD(0, 0); + } + cmp_ret = vorrq_u16(cmp_ret, tmp_cmp_0); + } + } + cmp_ret = vbicq_u16(cmp_ret, b_invalid_mask); + } + break; + + case _SIDD_CMP_EQUAL_EACH >> 2: + { + uint16x8_t and_invalid_mask = vandq_u16(a_invalid_mask, b_invalid_mask); + uint16x8_t xor_invalid_mask = veorq_u16(a_invalid_mask, b_invalid_mask); + XSSE_PCMP_EACH_WORD(); + cmp_ret = vorrq_u16(cmp_ret, and_invalid_mask); + cmp_ret = vbicq_u16(cmp_ret, xor_invalid_mask); + } + break; + + case _SIDD_CMP_EQUAL_ORDERED >> 2: + { + cmp_ret = vdupq_n_u16(0xFFFF); + uint16x8_t repeat_full_bits = vdupq_n_u16(0xFFFF); + if (la >= 8) { + uint16x8_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3, tmp_cmp_4, tmp_cmp_5, tmp_cmp_6, tmp_cmp_7; + XSSE_PCMP_ORDERED_WORD(0, 0); XSSE_PCMP_ORDERED_WORD(1, 1); XSSE_PCMP_ORDERED_WORD(2, 2); XSSE_PCMP_ORDERED_WORD(3, 3); + XSSE_PCMP_ORDERED_WORD(4, 4); XSSE_PCMP_ORDERED_WORD(5, 5); XSSE_PCMP_ORDERED_WORD(6, 6); XSSE_PCMP_ORDERED_WORD(7, 7); + + XSSE_VAND_U16(0, 1); XSSE_VAND_U16(2, 3); XSSE_VAND_U16(4, 5); XSSE_VAND_U16(6, 7); + XSSE_VAND_U16(0, 2); XSSE_VAND_U16(4, 6); XSSE_VAND_U16(0, 4); + cmp_ret = vandq_u16(cmp_ret, tmp_cmp_0); + } else 
{ + int checked = 0; + int la2 = la; + if (la2 >= 4) { + uint16x8_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3; + XSSE_PCMP_ORDERED_WORD(0, 0); XSSE_PCMP_ORDERED_WORD(1, 1); XSSE_PCMP_ORDERED_WORD(2, 2); XSSE_PCMP_ORDERED_WORD(3, 3); + XSSE_VAND_U16(0, 1); XSSE_VAND_U16(2, 3); XSSE_VAND_U16(0, 2); + cmp_ret = vandq_u16(cmp_ret, tmp_cmp_0); + checked = 4; + la2 -= 4; + } + if (la2 >= 2) { + uint16x8_t tmp_cmp_0, tmp_cmp_1; + if (checked == 4) { + XSSE_PCMP_ORDERED_WORD(0, 4); XSSE_PCMP_ORDERED_WORD(1, 5); + } else { + XSSE_PCMP_ORDERED_WORD(0, 0); XSSE_PCMP_ORDERED_WORD(1, 1); + } + XSSE_VAND_U16(0, 1); + cmp_ret = vandq_u16(cmp_ret, tmp_cmp_0); + checked += 2; + la2 -= 2; + } + if (la2 >= 1) { + uint16x8_t tmp_cmp_0; + switch (checked) { + case 6: XSSE_PCMP_ORDERED_WORD(0, 6); break; + case 4: XSSE_PCMP_ORDERED_WORD(0, 4); break; + case 2: XSSE_PCMP_ORDERED_WORD(0, 2); break; + case 0: XSSE_PCMP_ORDERED_WORD(0, 0); break; + default: XSSE_UNREACHABLE(); + } + cmp_ret = vandq_u16(cmp_ret, tmp_cmp_0); + } + } + } + break; + } + + /* negate */ + if (imm8 & _SIDD_NEGATIVE_POLARITY) { + uint16x8_t not_cmp_ret = vmvnq_u16(cmp_ret); + if (imm8 & _SIDD_MASKED_NEGATIVE_POLARITY) { + cmp_ret = vbslq_u16(cmp_ret, not_cmp_ret, b_invalid_mask); + } else { + cmp_ret = not_cmp_ret; + } + } + + uint64x2_t cmp_ret_64 = vreinterpretq_u64_u16(cmp_ret); + + _xsse_pcmp_str_result_t result; + result.cf = (vgetq_lane_u64(cmp_ret_64, 0) | vgetq_lane_u64(cmp_ret_64, 1)) != 0; + result.zf = lb < 8; + result.sf = la < 8; + result.mask = vreinterpretq_u8_u16(cmp_ret); + return result; + +#undef XSSE_VAND_U16 +#undef XSSE_VORR_U16 +#undef XSSE_PCMP_ORDERED_WORD +#undef XSSE_PCMP_EACH_WORD +#undef XSSE_PCMP_ANY_WORD +#undef XSSE_VORR_U16 +#undef XSSE_VAND_U16 + } else { + /* byte */ +#define XSSE_PCMP_ANY_BYTE(i, l) \ + tmp_cmp_##i = (imm8 & 0x02) ? vceqq_s8(vdupq_n_s8(vgetq_lane_s8(a, l)), b) : vceqq_u8(vdupq_n_u8(vgetq_lane_u8(ua, l)), ub) +#define XSSE_PCMP_RANGES_BYTE(i, l) \ + tmp_cmp_##i = (imm8 & 0x02) \ + ? vandq_u8(vcleq_s8(vdupq_n_s8(vgetq_lane_s8(a, (l))), b), vcleq_s8(b, vdupq_n_s8(vgetq_lane_s8(a, (l + 1))))) \ + : vandq_u8(vcleq_u8(vdupq_n_u8(vgetq_lane_u8(ua, (l))), ub), vcleq_u8(ub, vdupq_n_u8(vgetq_lane_u8(ua, (l + 1))))) +#define XSSE_PCMP_EACH_BYTE() cmp_ret = (imm8 & 0x02) ? vceqq_s8(a, b) : vceqq_u8(ua, ub) +#define XSSE_PCMP_ORDERED_BYTE(i, l) \ + tmp_cmp_##i = (imm8 & 0x02) \ + ? 
vextq_u8(vbicq_u8(vceqq_s8(vdupq_n_s8(vgetq_lane_s8(a, l)), b), b_invalid_mask), repeat_full_bits, l) \ + : vextq_u8(vbicq_u8(vceqq_u8(vdupq_n_u8(vgetq_lane_u8(ua, l)), ub), b_invalid_mask), repeat_full_bits, l) + +#define XSSE_VORR_U8(a, b) tmp_cmp_##a = vorrq_u8(tmp_cmp_##a, tmp_cmp_##b); +#define XSSE_VAND_U8(a, b) tmp_cmp_##a = vandq_u8(tmp_cmp_##a, tmp_cmp_##b); + + if (check_null) { + uint8x16_t repeat_nul = vdupq_n_u8(0); + uint8x16_t cmp_nul_a = vceqq_u8(vreinterpretq_u8_s8(a), repeat_nul); + uint8x16_t cmp_nul_b = vceqq_u8(vreinterpretq_u8_s8(b), repeat_nul); + + uint64_t low_a = vgetq_lane_u64(vreinterpretq_u64_u8(cmp_nul_a), 0); + uint64_t high_a = vgetq_lane_u64(vreinterpretq_u64_u8(cmp_nul_a), 1); + if (low_a != 0) { + la = _xsse_seach_least_significant_byte_index(low_a); + } else if (high_a != 0) { + la = _xsse_seach_least_significant_byte_index(high_a) + 8; + } else { + la = 16; + } + + uint64_t low_b = vgetq_lane_u64(vreinterpretq_u64_u8(cmp_nul_b), 0); + uint64_t high_b = vgetq_lane_u64(vreinterpretq_u64_u8(cmp_nul_b), 1); + if (low_b != 0) { + lb = _xsse_seach_least_significant_byte_index(low_b); + } else if (high_b != 0) { + lb = _xsse_seach_least_significant_byte_index(high_b) + 8; + } else { + lb = 16; + } + } + + uint8x16_t cmp_ret; + uint8x16_t lanes = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + uint8x16_t a_threshold = vdupq_n_u8(la); + uint8x16_t b_threshold = vdupq_n_u8(lb); + uint8x16_t a_invalid_mask = vcgeq_u8(lanes, a_threshold); + uint8x16_t b_invalid_mask = vcgeq_u8(lanes, b_threshold); + + uint8x16_t ua, ub; + if ((imm8 & 0x02) == 0) { + ua = vreinterpretq_u8_s8(a); + ub = vreinterpretq_u8_s8(b); + } + + /* mode */ + switch ((imm8 >> 2) & 0x03) { + case _SIDD_CMP_EQUAL_ANY >> 2: + { + cmp_ret = vdupq_n_u8(0); + if (la >= 16) { + uint8x16_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3, tmp_cmp_4, tmp_cmp_5, tmp_cmp_6, tmp_cmp_7; + uint8x16_t tmp_cmp_8, tmp_cmp_9, tmp_cmp_10, tmp_cmp_11, tmp_cmp_12, tmp_cmp_13, tmp_cmp_14, tmp_cmp_15; + XSSE_PCMP_ANY_BYTE(0, 0); XSSE_PCMP_ANY_BYTE(1, 1); XSSE_PCMP_ANY_BYTE(2, 2); XSSE_PCMP_ANY_BYTE(3, 3); + XSSE_PCMP_ANY_BYTE(4, 4); XSSE_PCMP_ANY_BYTE(5, 5); XSSE_PCMP_ANY_BYTE(6, 6); XSSE_PCMP_ANY_BYTE(7, 7); + XSSE_PCMP_ANY_BYTE(8, 8); XSSE_PCMP_ANY_BYTE(9, 9); XSSE_PCMP_ANY_BYTE(10, 10); XSSE_PCMP_ANY_BYTE(11, 11); + XSSE_PCMP_ANY_BYTE(12, 12); XSSE_PCMP_ANY_BYTE(13, 13); XSSE_PCMP_ANY_BYTE(14, 14); XSSE_PCMP_ANY_BYTE(15, 15); + + XSSE_VORR_U8(0, 1); XSSE_VORR_U8(2, 3); XSSE_VORR_U8(4, 5); XSSE_VORR_U8(6, 7); + XSSE_VORR_U8(8, 9); XSSE_VORR_U8(10, 11); XSSE_VORR_U8(12, 13); XSSE_VORR_U8(14, 15); + XSSE_VORR_U8(0, 2); XSSE_VORR_U8(4, 6); XSSE_VORR_U8(8, 10); XSSE_VORR_U8(12, 14); + XSSE_VORR_U8(0, 4); XSSE_VORR_U8(8, 12); XSSE_VORR_U8(0, 8); + cmp_ret = vorrq_u8(cmp_ret, tmp_cmp_0); + } else { + int checked = 0; + int la2 = la; + if (la2 >= 8) { + uint8x16_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3, tmp_cmp_4, tmp_cmp_5, tmp_cmp_6, tmp_cmp_7; + XSSE_PCMP_ANY_BYTE(0, 0); XSSE_PCMP_ANY_BYTE(1, 1); XSSE_PCMP_ANY_BYTE(2, 2); XSSE_PCMP_ANY_BYTE(3, 3); + XSSE_PCMP_ANY_BYTE(4, 4); XSSE_PCMP_ANY_BYTE(5, 5); XSSE_PCMP_ANY_BYTE(6, 6); XSSE_PCMP_ANY_BYTE(7, 7); + + XSSE_VORR_U8(0, 1); XSSE_VORR_U8(2, 3); XSSE_VORR_U8(4, 5); XSSE_VORR_U8(6, 7); + XSSE_VORR_U8(0, 2); XSSE_VORR_U8(4, 6); XSSE_VORR_U8(0, 4); + cmp_ret = vorrq_u8(cmp_ret, tmp_cmp_0); + checked = 8; + la2 -= 8; + } + if (la2 >= 4) { + uint8x16_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3; + if (checked == 8) { + XSSE_PCMP_ANY_BYTE(0, 8); XSSE_PCMP_ANY_BYTE(1, 
9); XSSE_PCMP_ANY_BYTE(2, 10); XSSE_PCMP_ANY_BYTE(3, 11); + } else { + XSSE_PCMP_ANY_BYTE(0, 0); XSSE_PCMP_ANY_BYTE(1, 1); XSSE_PCMP_ANY_BYTE(2, 2); XSSE_PCMP_ANY_BYTE(3, 3); + } + XSSE_VORR_U8(0, 1); XSSE_VORR_U8(2, 3); XSSE_VORR_U8(0, 2); + cmp_ret = vorrq_u8(cmp_ret, tmp_cmp_0); + checked += 4; + la2 -= 4; + } + if (la2 >= 2) { + uint8x16_t tmp_cmp_0, tmp_cmp_1; + switch (checked) { + case 12: XSSE_PCMP_ANY_BYTE(0, 12); XSSE_PCMP_ANY_BYTE(1, 13); break; + case 8: XSSE_PCMP_ANY_BYTE(0, 8); XSSE_PCMP_ANY_BYTE(1, 9); break; + case 4: XSSE_PCMP_ANY_BYTE(0, 4); XSSE_PCMP_ANY_BYTE(1, 5); break; + case 0: XSSE_PCMP_ANY_BYTE(0, 0); XSSE_PCMP_ANY_BYTE(1, 1); break; + default: XSSE_UNREACHABLE(); + } + XSSE_VORR_U8(0, 1); + cmp_ret = vorrq_u8(cmp_ret, tmp_cmp_0); + checked += 2; + la2 -= 2; + } + if (la2 == 1) { + uint8x16_t tmp_cmp_0; + switch (checked) { + case 14: XSSE_PCMP_ANY_BYTE(0, 14); break; + case 12: XSSE_PCMP_ANY_BYTE(0, 12); break; + case 10: XSSE_PCMP_ANY_BYTE(0, 10); break; + case 8: XSSE_PCMP_ANY_BYTE(0, 8); break; + case 6: XSSE_PCMP_ANY_BYTE(0, 6); break; + case 4: XSSE_PCMP_ANY_BYTE(0, 4); break; + case 2: XSSE_PCMP_ANY_BYTE(0, 2); break; + case 0: XSSE_PCMP_ANY_BYTE(0, 0); break; + default: XSSE_UNREACHABLE(); + } + cmp_ret = vorrq_u8(cmp_ret, tmp_cmp_0); + } + } + cmp_ret = vbicq_u8(cmp_ret, b_invalid_mask); + } + break; + + case _SIDD_CMP_RANGES >> 2: + { + cmp_ret = vdupq_n_u8(0); + if (la >= 16) { + uint8x16_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3, tmp_cmp_4, tmp_cmp_5, tmp_cmp_6, tmp_cmp_7; + XSSE_PCMP_RANGES_BYTE(0, 0); XSSE_PCMP_RANGES_BYTE(1, 2); XSSE_PCMP_RANGES_BYTE(2, 4); XSSE_PCMP_RANGES_BYTE(3, 6); + XSSE_PCMP_RANGES_BYTE(4, 8); XSSE_PCMP_RANGES_BYTE(5, 10); XSSE_PCMP_RANGES_BYTE(6, 12); XSSE_PCMP_RANGES_BYTE(7, 14); + + XSSE_VORR_U8(0, 1); XSSE_VORR_U8(2, 3); XSSE_VORR_U8(4, 5); XSSE_VORR_U8(6, 7); + XSSE_VORR_U8(0, 2); XSSE_VORR_U8(4, 6); XSSE_VORR_U8(0, 4); + cmp_ret = vorrq_u8(cmp_ret, tmp_cmp_0); + } else { + int checked = 0; + int la2 = la; + if (la2 >= 8) { + uint8x16_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3; + XSSE_PCMP_RANGES_BYTE(0, 0); XSSE_PCMP_RANGES_BYTE(1, 2); XSSE_PCMP_RANGES_BYTE(2, 4); XSSE_PCMP_RANGES_BYTE(3, 6); + + XSSE_VORR_U8(0, 1); XSSE_VORR_U8(2, 3); XSSE_VORR_U8(0, 2); + cmp_ret = vorrq_u8(cmp_ret, tmp_cmp_0); + checked = 8; + la2 -= 8; + } + if (la2 >= 4) { + uint8x16_t tmp_cmp_0, tmp_cmp_1; + if (checked == 8) { + XSSE_PCMP_RANGES_BYTE(0, 8); XSSE_PCMP_RANGES_BYTE(1, 10); + } else { + XSSE_PCMP_RANGES_BYTE(0, 0); XSSE_PCMP_RANGES_BYTE(1, 2); + } + XSSE_VORR_U8(0, 1); + cmp_ret = vorrq_u8(cmp_ret, tmp_cmp_0); + checked += 4; + la2 -= 4; + } + if (la2 >= 2) { + uint8x16_t tmp_cmp_0; + switch (checked) { + case 12: XSSE_PCMP_RANGES_BYTE(0, 12); break; + case 8: XSSE_PCMP_RANGES_BYTE(0, 8); break; + case 4: XSSE_PCMP_RANGES_BYTE(0, 4); break; + case 0: XSSE_PCMP_RANGES_BYTE(0, 0); break; + default: XSSE_UNREACHABLE(); + } + cmp_ret = vorrq_u8(cmp_ret, tmp_cmp_0); + } + } + cmp_ret = vbicq_u8(cmp_ret, b_invalid_mask); + } + break; + + case _SIDD_CMP_EQUAL_EACH >> 2: + { + uint8x16_t and_invalid_mask = vandq_u8(a_invalid_mask, b_invalid_mask); + uint8x16_t xor_invalid_mask = veorq_u8(a_invalid_mask, b_invalid_mask); + XSSE_PCMP_EACH_BYTE(); + cmp_ret = vorrq_u8(cmp_ret, and_invalid_mask); + cmp_ret = vbicq_u8(cmp_ret, xor_invalid_mask); + } + break; + + case _SIDD_CMP_EQUAL_ORDERED >> 2: + { + cmp_ret = vdupq_n_u8(0xFF); + uint8x16_t repeat_full_bits = vdupq_n_u8(0xFF); + if (la >= 16) { + uint8x16_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, 
tmp_cmp_3, tmp_cmp_4, tmp_cmp_5, tmp_cmp_6, tmp_cmp_7; + uint8x16_t tmp_cmp_8, tmp_cmp_9, tmp_cmp_10, tmp_cmp_11, tmp_cmp_12, tmp_cmp_13, tmp_cmp_14, tmp_cmp_15; + XSSE_PCMP_ORDERED_BYTE(0, 0); XSSE_PCMP_ORDERED_BYTE(1, 1); XSSE_PCMP_ORDERED_BYTE(2, 2); XSSE_PCMP_ORDERED_BYTE(3, 3); + XSSE_PCMP_ORDERED_BYTE(4, 4); XSSE_PCMP_ORDERED_BYTE(5, 5); XSSE_PCMP_ORDERED_BYTE(6, 6); XSSE_PCMP_ORDERED_BYTE(7, 7); + XSSE_PCMP_ORDERED_BYTE(8, 8); XSSE_PCMP_ORDERED_BYTE(9, 9); XSSE_PCMP_ORDERED_BYTE(10, 10); XSSE_PCMP_ORDERED_BYTE(11, 11); + XSSE_PCMP_ORDERED_BYTE(12, 12); XSSE_PCMP_ORDERED_BYTE(13, 13); XSSE_PCMP_ORDERED_BYTE(14, 14); XSSE_PCMP_ORDERED_BYTE(15, 15); + + XSSE_VAND_U8(0, 1); XSSE_VAND_U8(2, 3); XSSE_VAND_U8(4, 5); XSSE_VAND_U8(6, 7); + XSSE_VAND_U8(8, 9); XSSE_VAND_U8(10, 11); XSSE_VAND_U8(12, 13); XSSE_VAND_U8(14, 15); + XSSE_VAND_U8(0, 2); XSSE_VAND_U8(4, 6); XSSE_VAND_U8(8, 10); XSSE_VAND_U8(12, 14); + XSSE_VAND_U8(0, 4); XSSE_VAND_U8(8, 12); XSSE_VAND_U8(0, 8); + cmp_ret = vandq_u8(cmp_ret, tmp_cmp_0); + } else { + int checked = 0; + int la2 = la; + if (la2 >= 8) { + uint8x16_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3, tmp_cmp_4, tmp_cmp_5, tmp_cmp_6, tmp_cmp_7; + XSSE_PCMP_ORDERED_BYTE(0, 0); XSSE_PCMP_ORDERED_BYTE(1, 1); XSSE_PCMP_ORDERED_BYTE(2, 2); XSSE_PCMP_ORDERED_BYTE(3, 3); + XSSE_PCMP_ORDERED_BYTE(4, 4); XSSE_PCMP_ORDERED_BYTE(5, 5); XSSE_PCMP_ORDERED_BYTE(6, 6); XSSE_PCMP_ORDERED_BYTE(7, 7); + + XSSE_VAND_U8(0, 1); XSSE_VAND_U8(2, 3); XSSE_VAND_U8(4, 5); XSSE_VAND_U8(6, 7); + XSSE_VAND_U8(0, 2); XSSE_VAND_U8(4, 6); XSSE_VAND_U8(0, 4); + cmp_ret = vandq_u8(cmp_ret, tmp_cmp_0); + checked = 8; + la2 -= 8; + } + if (la2 >= 4) { + uint8x16_t tmp_cmp_0, tmp_cmp_1, tmp_cmp_2, tmp_cmp_3; + if (checked == 8) { + XSSE_PCMP_ORDERED_BYTE(0, 8); XSSE_PCMP_ORDERED_BYTE(1, 9); XSSE_PCMP_ORDERED_BYTE(2, 10); XSSE_PCMP_ORDERED_BYTE(3, 11); + } else { + XSSE_PCMP_ORDERED_BYTE(0, 0); XSSE_PCMP_ORDERED_BYTE(1, 1); XSSE_PCMP_ORDERED_BYTE(2, 2); XSSE_PCMP_ORDERED_BYTE(3, 3); + } + XSSE_VAND_U8(0, 1); XSSE_VAND_U8(2, 3); XSSE_VAND_U8(0, 2); + cmp_ret = vandq_u8(cmp_ret, tmp_cmp_0); + checked += 4; + la2 -= 4; + } + if (la2 >= 2) { + uint8x16_t tmp_cmp_0, tmp_cmp_1; + switch (checked) { + case 12: XSSE_PCMP_ORDERED_BYTE(0, 12); XSSE_PCMP_ORDERED_BYTE(1, 13); break; + case 8: XSSE_PCMP_ORDERED_BYTE(0, 8); XSSE_PCMP_ORDERED_BYTE(1, 9); break; + case 4: XSSE_PCMP_ORDERED_BYTE(0, 4); XSSE_PCMP_ORDERED_BYTE(1, 5); break; + case 0: XSSE_PCMP_ORDERED_BYTE(0, 0); XSSE_PCMP_ORDERED_BYTE(1, 1); break; + default: XSSE_UNREACHABLE(); + } + XSSE_VAND_U8(0, 1); + cmp_ret = vandq_u8(cmp_ret, tmp_cmp_0); + checked += 2; + la2 -= 2; + } + if (la2 == 1) { + uint8x16_t tmp_cmp_0; + switch (checked) { + case 14: XSSE_PCMP_ORDERED_BYTE(0, 14); break; + case 12: XSSE_PCMP_ORDERED_BYTE(0, 12); break; + case 10: XSSE_PCMP_ORDERED_BYTE(0, 10); break; + case 8: XSSE_PCMP_ORDERED_BYTE(0, 8); break; + case 6: XSSE_PCMP_ORDERED_BYTE(0, 6); break; + case 4: XSSE_PCMP_ORDERED_BYTE(0, 4); break; + case 2: XSSE_PCMP_ORDERED_BYTE(0, 2); break; + case 0: XSSE_PCMP_ORDERED_BYTE(0, 0); break; + default: XSSE_UNREACHABLE(); + } + cmp_ret = vandq_u8(cmp_ret, tmp_cmp_0); + } + } + } + break; + } + + /* negate */ + if (imm8 & _SIDD_NEGATIVE_POLARITY) { + uint8x16_t not_cmp_ret = vmvnq_u8(cmp_ret); + if (imm8 & _SIDD_MASKED_NEGATIVE_POLARITY) { + cmp_ret = vbslq_u8(cmp_ret, not_cmp_ret, b_invalid_mask); + } else { + cmp_ret = not_cmp_ret; + } + } + + uint64x2_t cmp_ret_64 = vreinterpretq_u64_u8(cmp_ret); + + _xsse_pcmp_str_result_t 
result; + result.cf = (vgetq_lane_u64(cmp_ret_64, 0) | vgetq_lane_u64(cmp_ret_64, 1)) != 0; + result.zf = lb < 16; + result.sf = la < 16; + result.mask = cmp_ret; + return result; + +#undef XSSE_PCMP_ANY_BYTE +#undef XSSE_PCMP_RANGES_BYTE +#undef XSSE_PCMP_EACH_BYTE +#undef XSSE_PCMP_EACH_BYTE +#undef XSSE_PCMP_ORDERED_BYTE +#undef XSSE_VORR_U8 +#undef XSSE_VAND_U8 + } +} + +XSSE_ATTR_CONST +static XSSE_FORCE_INLINE int _xsse_pcmp_str_a(const __m128i a, const int la, const __m128i b, const int lb, const int imm8, const int check_null) +{ + _xsse_pcmp_str_result_t result = _xsse_pcmp_str_core(a, la, b, lb, imm8 & 0x3F, check_null); + return result.cf == 0 && result.zf == 0; +} + +XSSE_ATTR_CONST +static XSSE_FORCE_INLINE int _xsse_pcmp_str_c(const __m128i a, const int la, const __m128i b, const int lb, const int imm8, const int check_null) +{ + _xsse_pcmp_str_result_t result = _xsse_pcmp_str_core(a, la, b, lb, imm8 & 0x3F, check_null); + return result.cf; +} + +XSSE_ATTR_CONST +static XSSE_FORCE_INLINE int _xsse_pcmp_str_i(const __m128i a, const int la, const __m128i b, const int lb, const int imm8, const int check_null) +{ + _xsse_pcmp_str_result_t result = _xsse_pcmp_str_core(a, la, b, lb, imm8 & 0x3F, check_null); + + uint64_t low = vgetq_lane_u64(vreinterpretq_u64_u8(result.mask), 0); + uint64_t high = vgetq_lane_u64(vreinterpretq_u64_u8(result.mask), 1); + if (imm8 & 0x01) { + if (imm8 & _SIDD_MOST_SIGNIFICANT) { + if (high != 0) { + return _xsse_seach_most_significant_word_index(high) + 4; + } else if (low != 0) { + return _xsse_seach_most_significant_word_index(low); + } else { + return 8; + } + } else { + if (low != 0) { + return _xsse_seach_least_significant_word_index(low); + } else if (high != 0) { + return _xsse_seach_least_significant_word_index(high) + 4; + } else { + return 8; + } + } + } else { + if (imm8 & _SIDD_MOST_SIGNIFICANT) { + if (high != 0) { + return _xsse_seach_most_significant_byte_index(high) + 8; + } else if (low != 0) { + return _xsse_seach_most_significant_byte_index(low); + } else { + return 16; + } + } else { + if (low != 0) { + return _xsse_seach_least_significant_byte_index(low); + } else if (high != 0) { + return _xsse_seach_least_significant_byte_index(high) + 8; + } else { + return 16; + } + } + } +} + +XSSE_ATTR_CONST +static XSSE_FORCE_INLINE __m128i _xsse_pcmp_str_m(const __m128i a, const int la, const __m128i b, const int lb, const int imm8, const int check_null) +{ + _xsse_pcmp_str_result_t result = _xsse_pcmp_str_core(a, la, b, lb, imm8 & 0x3F, check_null); + + if (imm8 & _SIDD_UNIT_MASK) { + return vreinterpretq_s8_u8(result.mask); + } + + if (imm8 & 0x01) { + uint32x4_t high_bits = vreinterpretq_u32_u16(vshrq_n_u16(vreinterpretq_u16_u8(result.mask), 15)); + uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(high_bits, high_bits, 15)); + uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 30)); + + return vreinterpretq_s8_u8((uint8x16_t) { + (vgetq_lane_u8(paired64, 0) | (vgetq_lane_u8(paired64, 8) << 4)), 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }); + } else { + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(result.mask, 7)); + uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + + return vreinterpretq_s8_u8((uint8x16_t) { + vgetq_lane_u8(paired64, 0), vgetq_lane_u8(paired64, 8), 0, 0, 0, 0, 0, 0, + 0, 0, 
0, 0, 0, 0, 0, 0 + }); + } +} + +XSSE_ATTR_CONST +static XSSE_FORCE_INLINE int _xsse_pcmp_str_o(const __m128i a, const int la, const __m128i b, const int lb, const int imm8, const int check_null) +{ + _xsse_pcmp_str_result_t result = _xsse_pcmp_str_core(a, la, b, lb, imm8 & 0x3F, check_null); + uint64_t low = vgetq_lane_u64(vreinterpretq_u64_u8(result.mask), 0); + return low & 1; +} + +XSSE_ATTR_CONST +static XSSE_FORCE_INLINE int _xsse_pcmp_str_s(const __m128i a, const int la, const __m128i b, const int lb, const int imm8, const int check_null) +{ + _xsse_pcmp_str_result_t result = _xsse_pcmp_str_core(a, la, b, lb, imm8 & 0x3F, check_null); + return result.sf; +} + +XSSE_ATTR_CONST +static XSSE_FORCE_INLINE int _xsse_pcmp_str_z(const __m128i a, const int la, const __m128i b, const int lb, const int imm8, const int check_null) +{ + _xsse_pcmp_str_result_t result = _xsse_pcmp_str_core(a, la, b, lb, imm8 & 0x3F, check_null); + return result.zf; +} + +#define _mm_cmpestra(a, la, b, lb, imm8) _xsse_pcmp_str_a(a, la, b, lb, imm8, 0) +#define _mm_cmpestrc(a, la, b, lb, imm8) _xsse_pcmp_str_c(a, la, b, lb, imm8, 0) +#define _mm_cmpestri(a, la, b, lb, imm8) _xsse_pcmp_str_i(a, la, b, lb, imm8, 0) +#define _mm_cmpestrm(a, la, b, lb, imm8) _xsse_pcmp_str_m(a, la, b, lb, imm8, 0) +#define _mm_cmpestro(a, la, b, lb, imm8) _xsse_pcmp_str_o(a, la, b, lb, imm8, 0) +#define _mm_cmpestrs(a, la, b, lb, imm8) _xsse_pcmp_str_s(a, la, b, lb, imm8, 0) +#define _mm_cmpestrz(a, la, b, lb, imm8) _xsse_pcmp_str_z(a, la, b, lb, imm8, 0) + +#define _mm_cmpistra(a, b, imm8) _xsse_pcmp_str_a(a, 0, b, 0, imm8, 1) +#define _mm_cmpistrc(a, b, imm8) _xsse_pcmp_str_c(a, 0, b, 0, imm8, 1) +#define _mm_cmpistri(a, b, imm8) _xsse_pcmp_str_i(a, 0, b, 0, imm8, 1) +#define _mm_cmpistrm(a, b, imm8) _xsse_pcmp_str_m(a, 0, b, 0, imm8, 1) +#define _mm_cmpistro(a, b, imm8) _xsse_pcmp_str_o(a, 0, b, 0, imm8, 1) +#define _mm_cmpistrs(a, b, imm8) _xsse_pcmp_str_s(a, 0, b, 0, imm8, 1) +#define _mm_cmpistrz(a, b, imm8) _xsse_pcmp_str_z(a, 0, b, 0, imm8, 1) + + +/***************************************************************************** + * CRC * + *****************************************************************************/ + +#ifdef __ARM_FEATURE_CRC32 +#include +#define _mm_crc32_u8(crc, v) (__crc32cb(crc, v)) +#define _mm_crc32_u16(crc, v) (__crc32ch(crc, v)) +#define _mm_crc32_u32(crc, v) (__crc32cw(crc, v)) +#define _mm_crc32_u64(crc, v) (__crc32cd(crc, v)) +#else +static XSSE_FORCE_INLINE int _mm_crc32_u8(unsigned int crc, unsigned char v) +{ + crc ^= v; + static const uint32_t crc32_nibble_tbl[] = { + 0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, + 0x417b1dbc, 0x5125dad3, 0x61c69362, 0x7198540d, + 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9, + 0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75 + }; + crc = (crc >> 4) ^ crc32_nibble_tbl[crc & 0x0F]; + crc = (crc >> 4) ^ crc32_nibble_tbl[crc & 0x0F]; + return crc; +} +static XSSE_FORCE_INLINE int _mm_crc32_u16(unsigned int crc, unsigned short v) +{ + crc = _mm_crc32_u8(crc, (unsigned char) (v & 0xFF)); + crc = _mm_crc32_u8(crc, (unsigned char) (v >> 8)); + return crc; +} +static XSSE_FORCE_INLINE int _mm_crc32_u32(unsigned int crc, unsigned int v) +{ + crc = _mm_crc32_u16(crc, (unsigned short) (v & 0xFFFF)); + crc = _mm_crc32_u16(crc, (unsigned short) (v >> 16)); + return crc; +} +static XSSE_FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ + unsigned int crc32 = (unsigned int) crc; + crc32 = _mm_crc32_u32(crc32, (unsigned int) (v & 0xFFFFFFFF)); + crc32 = 
_mm_crc32_u32(crc32, (unsigned int) (v >> 32));
+	return crc32;
+}
+#endif /* __ARM_FEATURE_CRC32 */
+
+#endif /* SSE4_2 */
+
+#endif /* XSSE_H */
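
Note (not part of the patch): a minimal, self-contained sketch of how the emulated SSE4.2 intrinsics above can be exercised. It assumes xsse.h is on the include path; the test strings, the flag combination, and the CRC seed are illustrative only. The same source is expected to build on x86 with SSE4.2 enabled (native intrinsics) and on AArch64 (the NEON fallbacks defined in this header).

#include <stdio.h>
#include "xsse.h"

int main(void)
{
	/* Character set to search for (implicit length, NUL-terminated). */
	const char set[16]  = "0123456789";
	/* Haystack: the first byte that belongs to the set is '5' at index 3. */
	const char text[16] = "abc5def";

	__m128i a = _mm_loadu_si128((const __m128i *) set);
	__m128i b = _mm_loadu_si128((const __m128i *) text);

	/* "Equal any" search: index of the first byte of `text` found in `set`. */
	int idx = _mm_cmpistri(a, b,
			_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
	printf("first digit at index %d\n", idx);   /* expected: 3 */

	/* CRC-32C of one 32-bit word, as computed by the x86 crc32 instruction;
	 * on ARM this maps to __crc32cw or to the nibble-table fallback above. */
	unsigned int crc = _mm_crc32_u32(0xFFFFFFFFu, 0x12345678u);
	printf("crc32c = 0x%08x\n", crc);
	return 0;
}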