5252#elif defined(_MSC_VER)
5353#include < intrin.h>
5454#define HAVE_AVX2_DISPATCH
55- #define HAVE_AVX2_DISPATCH_MSVC
56- #elif defined(__AVX2__)
57- #define HAVE_AVX2_NATIVELY
5855#endif
5956#elif defined(USE_NEON_OPTIMIZATIONS)
6057#include " include_sse2neon.h"
@@ -3003,16 +3000,14 @@ CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
30033000
30043001// ---- AVX2 helpers for int32 narrowing (runtime dispatch) ----
30053002
3006- #if defined(HAVE_AVX2_DISPATCH) || defined(HAVE_AVX2_NATIVELY)
3007- #if defined(HAVE_AVX2_DISPATCH) && !defined(HAVE_AVX2_DISPATCH_MSVC )
3003+ #if defined(HAVE_AVX2_DISPATCH)
3004+ #if !defined(_MSC_VER )
30083005__attribute__ ((target(" avx2" )))
30093006#endif
30103007static void GDALCopyWordsInt32ToUInt8_AVX2 (const int32_t *CPL_RESTRICT pSrc,
30113008 uint8_t *CPL_RESTRICT pDst,
30123009 GPtrDiff_t nWordCount)
30133010{
3014- const __m256i ymm_zero = _mm256_setzero_si256 ();
3015- const __m256i ymm_255 = _mm256_set1_epi32 (255 );
30163011 const __m256i permuteIdx = _mm256_setr_epi32 (0 , 4 , 1 , 5 , 2 , 6 , 3 , 7 );
30173012 GPtrDiff_t n = 0 ;
30183013 for (; n < nWordCount - 31 ; n += 32 )
@@ -3026,15 +3021,7 @@ static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
30263021 __m256i v3 = _mm256_loadu_si256 (
30273022 reinterpret_cast <const __m256i *>(pSrc + n + 24 ));
30283023 // Clamp to [0, 255]
3029- v0 = _mm256_max_epi32 (v0, ymm_zero);
3030- v1 = _mm256_max_epi32 (v1, ymm_zero);
3031- v2 = _mm256_max_epi32 (v2, ymm_zero);
3032- v3 = _mm256_max_epi32 (v3, ymm_zero);
3033- v0 = _mm256_min_epi32 (v0, ymm_255);
3034- v1 = _mm256_min_epi32 (v1, ymm_255);
3035- v2 = _mm256_min_epi32 (v2, ymm_255);
3036- v3 = _mm256_min_epi32 (v3, ymm_255);
3037- // Pack int32→int16→uint8, then fix cross-lane ordering
3024+ // Pack int32 -> int16 -> uint8, then fix cross-lane ordering
30383025 __m256i ab16 = _mm256_packs_epi32 (v0, v1);
30393026 __m256i cd16 = _mm256_packs_epi32 (v2, v3);
30403027 __m256i bytes = _mm256_packus_epi16 (ab16, cd16);
@@ -3048,17 +3035,14 @@ static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
30483035 : static_cast <uint8_t >(pSrc[n]);
30493036 }
30503037}
3051- #endif // HAVE_AVX2_DISPATCH || HAVE_AVX2_NATIVELY
30523038
3053- #if defined(HAVE_AVX2_DISPATCH) || defined(HAVE_AVX2_NATIVELY)
3054- #if defined(HAVE_AVX2_DISPATCH) && !defined(HAVE_AVX2_DISPATCH_MSVC)
3039+ #if !defined(_MSC_VER)
30553040__attribute__ ((target(" avx2" )))
30563041#endif
30573042static void GDALCopyWordsInt32ToUInt16_AVX2 (const int32_t *CPL_RESTRICT pSrc,
30583043 uint16_t *CPL_RESTRICT pDst,
30593044 GPtrDiff_t nWordCount)
30603045{
3061- const __m256i ymm_zero = _mm256_setzero_si256 ();
30623046 // _mm256_packus_epi32(v0, v1) produces per-lane interleaved result:
30633047 // [v0_lo4, v1_lo4, v0_hi4, v1_hi4] (in uint16 pairs per 32-bit lane)
30643048 // Permute to deinterleave: all v0 values first, then all v1 values
@@ -3071,8 +3055,6 @@ static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
30713055 __m256i v1 =
30723056 _mm256_loadu_si256 (reinterpret_cast <const __m256i *>(pSrc + n + 8 ));
30733057 // Clamp to [0, 65535]: _mm256_packus_epi32 saturates uint
3074- v0 = _mm256_max_epi32 (v0, ymm_zero);
3075- v1 = _mm256_max_epi32 (v1, ymm_zero);
30763058 __m256i packed = _mm256_packus_epi32 (v0, v1);
30773059 // Fix cross-lane interleave from packus
30783060 packed = _mm256_permutevar8x32_epi32 (packed, permuteIdx);
@@ -3085,7 +3067,7 @@ static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
30853067 : static_cast <uint16_t >(pSrc[n]);
30863068 }
30873069}
3088- #endif // HAVE_AVX2_DISPATCH || HAVE_AVX2_NATIVELY
3070+ #endif // HAVE_AVX2_DISPATCH
30893071
30903072// ---- int32 -> uint8 with clamping to [0, 255] ----
30913073template <>
@@ -3103,14 +3085,10 @@ CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
31033085 GDALCopyWordsInt32ToUInt8_AVX2 (pSrcData, pDstData, nWordCount);
31043086 return ;
31053087 }
3106- #elif defined(HAVE_AVX2_NATIVELY)
3107- GDALCopyWordsInt32ToUInt8_AVX2 (pSrcData, pDstData, nWordCount);
3108- return ;
31093088#endif
31103089#ifdef HAVE_SSE2
31113090 // SSE2 path: 16 pixels per iteration
31123091 decltype (nWordCount) n = 0 ;
3113- const __m128i xmm_255 = _mm_set1_epi32 (255 );
31143092 for (; n < nWordCount - 15 ; n += 16 )
31153093 {
31163094 __m128i v0 = _mm_loadu_si128 (
@@ -3121,26 +3099,7 @@ CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
31213099 reinterpret_cast <const __m128i *>(pSrcData + n + 8 ));
31223100 __m128i v3 = _mm_loadu_si128 (
31233101 reinterpret_cast <const __m128i *>(pSrcData + n + 12 ));
3124- // Clamp to [0, 255] using SSE2 arithmetic:
3125- // max(v, 0): zero out negatives via sign bit mask
3126- v0 = _mm_andnot_si128 (_mm_srai_epi32 (v0, 31 ), v0);
3127- v1 = _mm_andnot_si128 (_mm_srai_epi32 (v1, 31 ), v1);
3128- v2 = _mm_andnot_si128 (_mm_srai_epi32 (v2, 31 ), v2);
3129- v3 = _mm_andnot_si128 (_mm_srai_epi32 (v3, 31 ), v3);
3130- // min(v, 255): blend 255 where v > 255
3131- __m128i gt0 = _mm_cmpgt_epi32 (v0, xmm_255);
3132- __m128i gt1 = _mm_cmpgt_epi32 (v1, xmm_255);
3133- __m128i gt2 = _mm_cmpgt_epi32 (v2, xmm_255);
3134- __m128i gt3 = _mm_cmpgt_epi32 (v3, xmm_255);
3135- v0 = _mm_or_si128 (_mm_andnot_si128 (gt0, v0),
3136- _mm_and_si128 (gt0, xmm_255));
3137- v1 = _mm_or_si128 (_mm_andnot_si128 (gt1, v1),
3138- _mm_and_si128 (gt1, xmm_255));
3139- v2 = _mm_or_si128 (_mm_andnot_si128 (gt2, v2),
3140- _mm_and_si128 (gt2, xmm_255));
3141- v3 = _mm_or_si128 (_mm_andnot_si128 (gt3, v3),
3142- _mm_and_si128 (gt3, xmm_255));
3143- // Values in [0, 255]: pack int32→int16→uint8
3102+ // Saturating packs clamp to [0, 255]: int32->int16->uint8
31443103 __m128i lo16 = _mm_packs_epi32 (v0, v1);
31453104 __m128i hi16 = _mm_packs_epi32 (v2, v3);
31463105 __m128i bytes = _mm_packus_epi16 (lo16, hi16);
@@ -3180,9 +3139,6 @@ CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
31803139 GDALCopyWordsInt32ToUInt16_AVX2 (pSrcData, pDstData, nWordCount);
31813140 return ;
31823141 }
3183- #elif defined(HAVE_AVX2_NATIVELY)
3184- GDALCopyWordsInt32ToUInt16_AVX2 (pSrcData, pDstData, nWordCount);
3185- return ;
31863142#endif
31873143 decltype (nWordCount) n = 0 ;
31883144#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
@@ -3193,8 +3149,6 @@ CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
31933149 reinterpret_cast <const __m128i *>(pSrcData + n));
31943150 __m128i v1 = _mm_loadu_si128 (
31953151 reinterpret_cast <const __m128i *>(pSrcData + n + 4 ));
3196- v0 = _mm_max_epi32 (v0, _mm_setzero_si128 ());
3197- v1 = _mm_max_epi32 (v1, _mm_setzero_si128 ());
31983152 __m128i packed = _mm_packus_epi32 (v0, v1);
31993153 _mm_storeu_si128 (reinterpret_cast <__m128i *>(pDstData + n), packed);
32003154 }
@@ -3219,7 +3173,7 @@ CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
32193173 _mm_and_si128 (gt0, xmm_65535));
32203174 v1 = _mm_or_si128 (_mm_andnot_si128 (gt1, v1),
32213175 _mm_and_si128 (gt1, xmm_65535));
3222- // Shift [0, 65535] → [-32768, 32767] for _mm_packs_epi32
3176+ // Shift [0, 65535] -> [-32768, 32767] for _mm_packs_epi32
32233177 v0 = _mm_sub_epi32 (v0, xmm_bias32);
32243178 v1 = _mm_sub_epi32 (v1, xmm_bias32);
32253179 __m128i packed = _mm_packs_epi32 (v0, v1);
0 commit comments