Skip to content

Commit 7c42b98

Browse files
committed
Fix: Type-casting UBs of movemask bitsets
1 parent f26ffc8 commit 7c42b98

File tree

3 files changed

+28
-28
lines changed

3 files changed

+28
-28
lines changed

include/stringzilla/find.h

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -905,7 +905,7 @@ SZ_PUBLIC sz_cptr_t sz_find_westmere(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
905905
sz_locate_needle_anomalies_(n, n_length, &offset_first, &offset_mid, &offset_last);
906906

907907
// Broadcast those characters into XMM registers.
908-
int matches;
908+
sz_u32_vec_t matches_vec;
909909
sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
910910
n_first_vec.xmm = _mm_set1_epi8(n[offset_first]);
911911
n_mid_vec.xmm = _mm_set1_epi8(n[offset_mid]);
@@ -916,14 +916,14 @@ SZ_PUBLIC sz_cptr_t sz_find_westmere(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
916916
h_first_vec.xmm = _mm_lddqu_si128((__m128i const *)(h + offset_first));
917917
h_mid_vec.xmm = _mm_lddqu_si128((__m128i const *)(h + offset_mid));
918918
h_last_vec.xmm = _mm_lddqu_si128((__m128i const *)(h + offset_last));
919-
matches = //
919+
matches_vec.i32 = //
920920
_mm_movemask_epi8(_mm_cmpeq_epi8(h_first_vec.xmm, n_first_vec.xmm)) &
921921
_mm_movemask_epi8(_mm_cmpeq_epi8(h_mid_vec.xmm, n_mid_vec.xmm)) &
922922
_mm_movemask_epi8(_mm_cmpeq_epi8(h_last_vec.xmm, n_last_vec.xmm));
923-
while (matches) {
924-
int potential_offset = sz_u32_ctz(matches);
923+
while (matches_vec.u32) {
924+
int potential_offset = sz_u32_ctz(matches_vec.u32);
925925
if (sz_equal_westmere(h + potential_offset, n, n_length)) return h + potential_offset;
926-
matches &= matches - 1;
926+
matches_vec.u32 &= matches_vec.u32 - 1;
927927
}
928928
}
929929

@@ -940,7 +940,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_westmere(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
940940
sz_locate_needle_anomalies_(n, n_length, &offset_first, &offset_mid, &offset_last);
941941

942942
// Broadcast those characters into XMM registers.
943-
int matches;
943+
sz_u32_vec_t matches_vec;
944944
sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
945945
n_first_vec.xmm = _mm_set1_epi8(n[offset_first]);
946946
n_mid_vec.xmm = _mm_set1_epi8(n[offset_mid]);
@@ -953,15 +953,15 @@ SZ_PUBLIC sz_cptr_t sz_rfind_westmere(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
953953
h_first_vec.xmm = _mm_lddqu_si128((__m128i const *)(h_reversed + offset_first));
954954
h_mid_vec.xmm = _mm_lddqu_si128((__m128i const *)(h_reversed + offset_mid));
955955
h_last_vec.xmm = _mm_lddqu_si128((__m128i const *)(h_reversed + offset_last));
956-
matches = //
956+
matches_vec.i32 = //
957957
_mm_movemask_epi8(_mm_cmpeq_epi8(h_first_vec.xmm, n_first_vec.xmm)) &
958958
_mm_movemask_epi8(_mm_cmpeq_epi8(h_mid_vec.xmm, n_mid_vec.xmm)) &
959959
_mm_movemask_epi8(_mm_cmpeq_epi8(h_last_vec.xmm, n_last_vec.xmm));
960-
while (matches) {
961-
int potential_offset = sz_u32_clz(matches) - 16;
960+
while (matches_vec.u32) {
961+
int potential_offset = sz_u32_clz(matches_vec.u32) - 16;
962962
if (sz_equal_westmere(h + h_length - n_length - potential_offset, n, n_length))
963963
return h + h_length - n_length - potential_offset;
964-
matches &= ~(1 << (15 - potential_offset));
964+
matches_vec.u32 &= ~(1u << (15 - potential_offset));
965965
}
966966
}
967967

@@ -1029,7 +1029,7 @@ SZ_PUBLIC sz_cptr_t sz_find_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n
10291029
sz_locate_needle_anomalies_(n, n_length, &offset_first, &offset_mid, &offset_last);
10301030

10311031
// Broadcast those characters into YMM registers.
1032-
int matches;
1032+
sz_u32_vec_t matches_vec;
10331033
sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
10341034
n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]);
10351035
n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]);
@@ -1040,14 +1040,14 @@ SZ_PUBLIC sz_cptr_t sz_find_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n
10401040
h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first));
10411041
h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid));
10421042
h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last));
1043-
matches = //
1043+
matches_vec.i32 = //
10441044
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) &
10451045
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) &
10461046
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm));
1047-
while (matches) {
1048-
int potential_offset = sz_u32_ctz(matches);
1047+
while (matches_vec.u32) {
1048+
int potential_offset = sz_u32_ctz(matches_vec.u32);
10491049
if (sz_equal_haswell(h + potential_offset, n, n_length)) return h + potential_offset;
1050-
matches &= matches - 1;
1050+
matches_vec.u32 &= matches_vec.u32 - 1;
10511051
}
10521052
}
10531053

@@ -1065,7 +1065,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
10651065
sz_locate_needle_anomalies_(n, n_length, &offset_first, &offset_mid, &offset_last);
10661066

10671067
// Broadcast those characters into YMM registers.
1068-
int matches;
1068+
sz_u32_vec_t matches_vec;
10691069
sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
10701070
n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]);
10711071
n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]);
@@ -1078,15 +1078,15 @@ SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
10781078
h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first));
10791079
h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid));
10801080
h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last));
1081-
matches = //
1081+
matches_vec.i32 = //
10821082
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) &
10831083
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) &
10841084
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm));
1085-
while (matches) {
1086-
int potential_offset = sz_u32_clz(matches);
1085+
while (matches_vec.u32) {
1086+
int potential_offset = sz_u32_clz(matches_vec.u32);
10871087
if (sz_equal_haswell(h + h_length - n_length - potential_offset, n, n_length))
10881088
return h + h_length - n_length - potential_offset;
1089-
matches &= ~(1 << (31 - potential_offset));
1089+
matches_vec.u32 &= ~(1u << (31 - potential_offset));
10901090
}
10911091
}
10921092

@@ -1388,9 +1388,8 @@ SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
13881388
int potential_offset = sz_u64_clz(matches);
13891389
if (n_length <= 3 || sz_equal_skylake(h + h_length - n_length - potential_offset, n, n_length))
13901390
return h + h_length - n_length - potential_offset;
1391-
sz_assert_((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 &&
1392-
"The bit must be set before we squash it");
1393-
matches &= ~((sz_u64_t)1 << (63 - potential_offset));
1391+
sz_assert_((matches & (1ull << (63 - potential_offset))) != 0 && "The bit must be set before we squash it");
1392+
matches &= ~(1ull << (63 - potential_offset));
13941393
}
13951394
}
13961395

@@ -1409,9 +1408,8 @@ SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
14091408
int potential_offset = sz_u64_clz(matches);
14101409
if (n_length <= 3 || sz_equal_skylake(h + 64 - potential_offset - 1, n, n_length))
14111410
return h + 64 - potential_offset - 1;
1412-
sz_assert_((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 &&
1413-
"The bit must be set before we squash it");
1414-
matches &= ~((sz_u64_t)1 << (63 - potential_offset));
1411+
sz_assert_((matches & (1ull << (63 - potential_offset))) != 0 && "The bit must be set before we squash it");
1412+
matches &= ~(1ull << (63 - potential_offset));
14151413
}
14161414
}
14171415

include/stringzilla/intersect.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_serial(
288288
// Allocate memory for the hash table and initialize it with 0xFF.
289289
// The higher is the `hash_table_slots` multiple - the more memory we will use,
290290
// but the less likely the collisions will be.
291-
sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count) * (1 << SZ_SEQUENCE_INTERSECT_BUDGET);
291+
sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count) * (1u << SZ_SEQUENCE_INTERSECT_BUDGET);
292292
sz_size_t const bytes_per_entry = sizeof(sz_size_t) + sizeof(sz_u64_t);
293293
sz_size_t *const table_positions = (sz_size_t *)alloc->allocate(hash_table_slots * bytes_per_entry, alloc);
294294
if (!table_positions) return sz_bad_alloc_k;
@@ -414,7 +414,7 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_ice(
414414
// Allocate memory for the hash table and initialize it with 0xFF.
415415
// The higher is the `hash_table_slots` multiple - the more memory we will use,
416416
// but the less likely the collisions will be.
417-
sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count) * (1 << SZ_SEQUENCE_INTERSECT_BUDGET);
417+
sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count) * (1u << SZ_SEQUENCE_INTERSECT_BUDGET);
418418
sz_size_t const bytes_per_entry = sizeof(sz_size_t) + sizeof(sz_u64_t);
419419
sz_size_t *table_positions = (sz_size_t *)alloc->allocate(hash_table_slots * bytes_per_entry, alloc);
420420
if (!table_positions) return sz_bad_alloc_k;

include/stringzilla/types.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,7 @@ typedef union sz_u16_vec_t {
840840
*/
841841
typedef union sz_u32_vec_t {
842842
sz_u32_t u32;
843+
sz_i32_t i32;
843844
sz_u16_t u16s[2];
844845
sz_i16_t i16s[2];
845846
sz_u8_t u8s[4];
@@ -852,6 +853,7 @@ typedef union sz_u32_vec_t {
852853
*/
853854
typedef union sz_u64_vec_t {
854855
sz_u64_t u64;
856+
sz_i64_t i64;
855857
sz_u32_t u32s[2];
856858
sz_i32_t i32s[2];
857859
sz_u16_t u16s[4];

0 commit comments

Comments
 (0)