Skip to content

Commit d33430d

Browse files
authored
Merge pull request #14236 from rouault/optimize_CPLHaveRuntimeAVX2
Optimize CPLHaveRuntimeAVX2() on gcc and Simplify rasterio.cpp wrt AVX2 use
2 parents 4f22285 + 135cd91 commit d33430d

File tree

3 files changed

+71
-84
lines changed

3 files changed

+71
-84
lines changed

gcore/rasterio.cpp

Lines changed: 7 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,6 @@
5252
#elif defined(_MSC_VER)
5353
#include <intrin.h>
5454
#define HAVE_AVX2_DISPATCH
55-
#define HAVE_AVX2_DISPATCH_MSVC
56-
#elif defined(__AVX2__)
57-
#define HAVE_AVX2_NATIVELY
5855
#endif
5956
#elif defined(USE_NEON_OPTIMIZATIONS)
6057
#include "include_sse2neon.h"
@@ -3003,16 +3000,14 @@ CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
30033000

30043001
// ---- AVX2 helpers for int32 narrowing (runtime dispatch) ----
30053002

3006-
#if defined(HAVE_AVX2_DISPATCH) || defined(HAVE_AVX2_NATIVELY)
3007-
#if defined(HAVE_AVX2_DISPATCH) && !defined(HAVE_AVX2_DISPATCH_MSVC)
3003+
#if defined(HAVE_AVX2_DISPATCH)
3004+
#if !defined(_MSC_VER)
30083005
__attribute__((target("avx2")))
30093006
#endif
30103007
static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
30113008
uint8_t *CPL_RESTRICT pDst,
30123009
GPtrDiff_t nWordCount)
30133010
{
3014-
const __m256i ymm_zero = _mm256_setzero_si256();
3015-
const __m256i ymm_255 = _mm256_set1_epi32(255);
30163011
const __m256i permuteIdx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
30173012
GPtrDiff_t n = 0;
30183013
for (; n < nWordCount - 31; n += 32)
@@ -3026,15 +3021,7 @@ static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
30263021
__m256i v3 = _mm256_loadu_si256(
30273022
reinterpret_cast<const __m256i *>(pSrc + n + 24));
30283023
// Clamp to [0, 255]
3029-
v0 = _mm256_max_epi32(v0, ymm_zero);
3030-
v1 = _mm256_max_epi32(v1, ymm_zero);
3031-
v2 = _mm256_max_epi32(v2, ymm_zero);
3032-
v3 = _mm256_max_epi32(v3, ymm_zero);
3033-
v0 = _mm256_min_epi32(v0, ymm_255);
3034-
v1 = _mm256_min_epi32(v1, ymm_255);
3035-
v2 = _mm256_min_epi32(v2, ymm_255);
3036-
v3 = _mm256_min_epi32(v3, ymm_255);
3037-
// Pack int32→int16→uint8, then fix cross-lane ordering
3024+
// Pack int32 -> int16 -> uint8, then fix cross-lane ordering
30383025
__m256i ab16 = _mm256_packs_epi32(v0, v1);
30393026
__m256i cd16 = _mm256_packs_epi32(v2, v3);
30403027
__m256i bytes = _mm256_packus_epi16(ab16, cd16);
@@ -3048,17 +3035,14 @@ static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
30483035
: static_cast<uint8_t>(pSrc[n]);
30493036
}
30503037
}
3051-
#endif // HAVE_AVX2_DISPATCH || HAVE_AVX2_NATIVELY
30523038

3053-
#if defined(HAVE_AVX2_DISPATCH) || defined(HAVE_AVX2_NATIVELY)
3054-
#if defined(HAVE_AVX2_DISPATCH) && !defined(HAVE_AVX2_DISPATCH_MSVC)
3039+
#if !defined(_MSC_VER)
30553040
__attribute__((target("avx2")))
30563041
#endif
30573042
static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
30583043
uint16_t *CPL_RESTRICT pDst,
30593044
GPtrDiff_t nWordCount)
30603045
{
3061-
const __m256i ymm_zero = _mm256_setzero_si256();
30623046
// _mm256_packus_epi32(v0, v1) produces per-lane interleaved result:
30633047
// [v0_lo4, v1_lo4, v0_hi4, v1_hi4] (in uint16 pairs per 32-bit lane)
30643048
// Permute to deinterleave: all v0 values first, then all v1 values
@@ -3071,8 +3055,6 @@ static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
30713055
__m256i v1 =
30723056
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
30733057
// Clamp to [0, 65535]: _mm256_packus_epi32 saturates uint
3074-
v0 = _mm256_max_epi32(v0, ymm_zero);
3075-
v1 = _mm256_max_epi32(v1, ymm_zero);
30763058
__m256i packed = _mm256_packus_epi32(v0, v1);
30773059
// Fix cross-lane interleave from packus
30783060
packed = _mm256_permutevar8x32_epi32(packed, permuteIdx);
@@ -3085,7 +3067,7 @@ static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
30853067
: static_cast<uint16_t>(pSrc[n]);
30863068
}
30873069
}
3088-
#endif // HAVE_AVX2_DISPATCH || HAVE_AVX2_NATIVELY
3070+
#endif // HAVE_AVX2_DISPATCH
30893071

30903072
// ---- int32 -> uint8 with clamping to [0, 255] ----
30913073
template <>
@@ -3103,14 +3085,10 @@ CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
31033085
GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
31043086
return;
31053087
}
3106-
#elif defined(HAVE_AVX2_NATIVELY)
3107-
GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
3108-
return;
31093088
#endif
31103089
#ifdef HAVE_SSE2
31113090
// SSE2 path: 16 pixels per iteration
31123091
decltype(nWordCount) n = 0;
3113-
const __m128i xmm_255 = _mm_set1_epi32(255);
31143092
for (; n < nWordCount - 15; n += 16)
31153093
{
31163094
__m128i v0 = _mm_loadu_si128(
@@ -3121,26 +3099,7 @@ CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
31213099
reinterpret_cast<const __m128i *>(pSrcData + n + 8));
31223100
__m128i v3 = _mm_loadu_si128(
31233101
reinterpret_cast<const __m128i *>(pSrcData + n + 12));
3124-
// Clamp to [0, 255] using SSE2 arithmetic:
3125-
// max(v, 0): zero out negatives via sign bit mask
3126-
v0 = _mm_andnot_si128(_mm_srai_epi32(v0, 31), v0);
3127-
v1 = _mm_andnot_si128(_mm_srai_epi32(v1, 31), v1);
3128-
v2 = _mm_andnot_si128(_mm_srai_epi32(v2, 31), v2);
3129-
v3 = _mm_andnot_si128(_mm_srai_epi32(v3, 31), v3);
3130-
// min(v, 255): blend 255 where v > 255
3131-
__m128i gt0 = _mm_cmpgt_epi32(v0, xmm_255);
3132-
__m128i gt1 = _mm_cmpgt_epi32(v1, xmm_255);
3133-
__m128i gt2 = _mm_cmpgt_epi32(v2, xmm_255);
3134-
__m128i gt3 = _mm_cmpgt_epi32(v3, xmm_255);
3135-
v0 = _mm_or_si128(_mm_andnot_si128(gt0, v0),
3136-
_mm_and_si128(gt0, xmm_255));
3137-
v1 = _mm_or_si128(_mm_andnot_si128(gt1, v1),
3138-
_mm_and_si128(gt1, xmm_255));
3139-
v2 = _mm_or_si128(_mm_andnot_si128(gt2, v2),
3140-
_mm_and_si128(gt2, xmm_255));
3141-
v3 = _mm_or_si128(_mm_andnot_si128(gt3, v3),
3142-
_mm_and_si128(gt3, xmm_255));
3143-
// Values in [0, 255]: pack int32→int16→uint8
3102+
// Values in [0, 255]: pack int32->int16->uint8
31443103
__m128i lo16 = _mm_packs_epi32(v0, v1);
31453104
__m128i hi16 = _mm_packs_epi32(v2, v3);
31463105
__m128i bytes = _mm_packus_epi16(lo16, hi16);
@@ -3180,9 +3139,6 @@ CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
31803139
GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
31813140
return;
31823141
}
3183-
#elif defined(HAVE_AVX2_NATIVELY)
3184-
GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
3185-
return;
31863142
#endif
31873143
decltype(nWordCount) n = 0;
31883144
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
@@ -3193,8 +3149,6 @@ CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
31933149
reinterpret_cast<const __m128i *>(pSrcData + n));
31943150
__m128i v1 = _mm_loadu_si128(
31953151
reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3196-
v0 = _mm_max_epi32(v0, _mm_setzero_si128());
3197-
v1 = _mm_max_epi32(v1, _mm_setzero_si128());
31983152
__m128i packed = _mm_packus_epi32(v0, v1);
31993153
_mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
32003154
}
@@ -3219,7 +3173,7 @@ CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
32193173
_mm_and_si128(gt0, xmm_65535));
32203174
v1 = _mm_or_si128(_mm_andnot_si128(gt1, v1),
32213175
_mm_and_si128(gt1, xmm_65535));
3222-
// Shift [0, 65535] [-32768, 32767] for _mm_packs_epi32
3176+
// Shift [0, 65535] -> [-32768, 32767] for _mm_packs_epi32
32233177
v0 = _mm_sub_epi32(v0, xmm_bias32);
32243178
v1 = _mm_sub_epi32(v1, xmm_bias32);
32253179
__m128i packed = _mm_packs_epi32(v0, v1);

port/cpl_cpu_features.cpp

Lines changed: 47 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -146,43 +146,45 @@ static bool CPLDetectRuntimeAVX()
146146
}
147147

148148
bool bCPLHasAVX = false;
149-
static void CPLHaveRuntimeAVXInitialize() __attribute__((constructor));
149+
static void CPLHaveRuntimeAVXInitialize() __attribute__((constructor(101)));
150150

151151
static void CPLHaveRuntimeAVXInitialize()
152152
{
153153
bCPLHasAVX = CPLDetectRuntimeAVX();
154154
}
155155

156-
#elif defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
157-
(defined(_M_IX86) || defined(_M_X64))
158-
// _xgetbv available only in Visual Studio 2010 SP1 or later
159-
156+
#elif defined(_MSC_VER)
160157
bool CPLHaveRuntimeAVX()
161158
{
162-
int cpuinfo[4] = {0, 0, 0, 0};
163-
CPL_CPUID(1, cpuinfo);
164-
165-
// Check OSXSAVE feature.
166-
if ((cpuinfo[REG_ECX] & (1 << CPUID_OSXSAVE_ECX_BIT)) == 0)
159+
static const bool bHasAVX = []() -> bool
167160
{
168-
return false;
169-
}
161+
int cpuinfo[4] = {0, 0, 0, 0};
162+
CPL_CPUID(1, cpuinfo);
170163

171-
// Check AVX feature.
172-
if ((cpuinfo[REG_ECX] & (1 << CPUID_AVX_ECX_BIT)) == 0)
173-
{
174-
return false;
175-
}
164+
// Check OSXSAVE feature.
165+
if ((cpuinfo[REG_ECX] & (1 << CPUID_OSXSAVE_ECX_BIT)) == 0)
166+
{
167+
return false;
168+
}
176169

177-
// Issue XGETBV and check the XMM and YMM state bit.
178-
unsigned __int64 xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
179-
if ((xcrFeatureMask & (BIT_XMM_STATE | BIT_YMM_STATE)) !=
180-
(BIT_XMM_STATE | BIT_YMM_STATE))
181-
{
182-
return false;
183-
}
170+
// Check AVX feature.
171+
if ((cpuinfo[REG_ECX] & (1 << CPUID_AVX_ECX_BIT)) == 0)
172+
{
173+
return false;
174+
}
184175

185-
return true;
176+
// Issue XGETBV and check the XMM and YMM state bit.
177+
unsigned __int64 xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
178+
if ((xcrFeatureMask & (BIT_XMM_STATE | BIT_YMM_STATE)) !=
179+
(BIT_XMM_STATE | BIT_YMM_STATE))
180+
{
181+
return false;
182+
}
183+
184+
return true;
185+
}();
186+
187+
return bHasAVX;
186188
}
187189

188190
#else
@@ -194,9 +196,23 @@ bool CPLHaveRuntimeAVX()
194196

195197
#endif
196198

197-
#endif // defined(HAVE_AVX_AT_COMPILE_TIME) && !defined(CPLHaveRuntimeAVX)
199+
#endif // defined(HAVE_AVX_AT_COMPILE_TIME) && !defined(HAVE_INLINE_AVX)
200+
201+
#if defined(HAVE_AVX2_AT_COMPILE_TIME) && !defined(HAVE_INLINE_AVX2)
202+
203+
#if defined(__GNUC__)
198204

199-
#ifdef HAVE_AVX2_AT_COMPILE_TIME
205+
bool bCPLHasAVX2 = false;
206+
// Use 102 because CPLHaveRuntimeAVXInitialize() uses 101, so we are run
207+
// afterwards
208+
static void CPLHaveRuntimeAVX2Initialize() __attribute__((constructor(102)));
209+
210+
static void CPLHaveRuntimeAVX2Initialize()
211+
{
212+
bCPLHasAVX2 = CPLHaveRuntimeAVX() && __builtin_cpu_supports("avx2");
213+
}
214+
215+
#else
200216

201217
/************************************************************************/
202218
/* CPLHaveRuntimeAVX2() */
@@ -206,11 +222,9 @@ bool CPLHaveRuntimeAVX2()
206222
{
207223
static const bool bHasAVX2 = []() -> bool
208224
{
225+
#if defined(_MSC_VER)
209226
if (!CPLHaveRuntimeAVX())
210227
return false;
211-
#if defined(__GNUC__) || defined(__clang__)
212-
return __builtin_cpu_supports("avx2");
213-
#elif defined(_MSC_VER)
214228
int cpuInfo[4] = {};
215229
__cpuidex(cpuInfo, 7, 0);
216230
return (cpuInfo[REG_EBX] & (1 << 5)) != 0; // EBX bit 5 = AVX2
@@ -221,6 +235,8 @@ bool CPLHaveRuntimeAVX2()
221235
return bHasAVX2;
222236
}
223237

224-
#endif // HAVE_AVX2_AT_COMPILE_TIME
238+
#endif
239+
240+
#endif // defined(HAVE_AVX2_AT_COMPILE_TIME) && !defined(HAVE_INLINE_AVX2)
225241

226242
//! @endcond

port/cpl_cpu_features.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,26 @@ bool CPLHaveRuntimeAVX();
8383
#endif
8484

8585
#ifdef HAVE_AVX2_AT_COMPILE_TIME
86+
#if __AVX2__
87+
#define HAVE_INLINE_AVX2
88+
89+
static bool inline CPLHaveRuntimeAVX2()
90+
{
91+
return true;
92+
}
93+
#elif defined(__GNUC__)
94+
extern bool bCPLHasAVX2;
95+
96+
static bool inline CPLHaveRuntimeAVX2()
97+
{
98+
return bCPLHasAVX2;
99+
}
100+
#else
86101
bool CPLHaveRuntimeAVX2();
87102
#endif
88103

104+
#endif
105+
89106
//! @endcond
90107

91108
#endif // CPL_CPU_FEATURES_H

0 commit comments

Comments
 (0)