Commit 88e4427

Use inline assembly in strlen for vector loads (#593)
This commit is a refinement of #586 that uses inline assembly to perform vector loads instead of a C-defined load. This avoids undefined behavior in LLVM, where C code may not read either before or after an allocation. When `strlen` is not inlined, as it currently isn't, there is no realistic path by which a compiler could prove that a load is out-of-bounds, so the issue is unlikely to matter in practice, but it is nevertheless still UB. The eventual goal is to move these SIMD routines into header files to avoid needing multiple builds of libc itself; in that situation inlining becomes possible, and a compiler could much more easily see the UB, which could cause problems.

Inline assembly unfortunately doesn't work with vector output parameters on Clang 19 and Clang 20 due to an ICE. This was fixed in llvm/llvm-project#146574 for Clang 21, which means the SIMD routines are now excluded on Clang 19 and Clang 20 to avoid compilation errors there.
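As a sketch of the distinction (not the committed code; `load_c` and `load_asm` are hypothetical names), the two load styles can be contrasted like this, assuming a wasm SIMD128 target:

#include <stdint.h>
#include <wasm_simd128.h>

// C-level load: the dereference is an object access in C, so a 16-byte
// read that extends past the end of the allocation containing s is UB,
// even though the aligned wasm v128.load itself is harmless.
static v128_t load_c(const char *s)
{
	uintptr_t align = (uintptr_t)s % sizeof(v128_t);
	const v128_t *v = (const v128_t *)((uintptr_t)s - align);
	return *v;
}

// Inline-assembly load: emits the same v128.load instruction, but its
// semantics are opaque to the compiler, so there is no C-level access
// for the optimizer to reason about.
static v128_t load_asm(const char *s)
{
	uintptr_t align = (uintptr_t)s % sizeof(v128_t);
	uintptr_t v = (uintptr_t)s - align;
	v128_t chunk;
	__asm__(
		"local.get %1\n"
		"v128.load 0\n"
		"local.set %0\n"
		: "=r"(chunk)
		: "r"(v)
		: "memory");
	return chunk;
}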
1 parent 553305f commit 88e4427

File tree

1 file changed: +21 -9

libc-top-half/musl/src/string/strlen.c

Lines changed: 21 additions & 9 deletions
@@ -14,17 +14,28 @@
 size_t strlen(const char *s)
 {
 #if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
-	// strlen must stop as soon as it finds the terminator.
-	// Aligning ensures loads beyond the terminator are safe.
-	// Casting through uintptr_t makes this implementation-defined,
-	// rather than undefined behavior.
+	// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574) which
+	// results in an ICE when inline assembly is used with a vector result.
+#if __clang_major__ != 19 && __clang_major__ != 20
+	// Note that reading before/after the allocation of a pointer is UB in
+	// C, so inline assembly is used to generate the exact machine
+	// instruction we want with opaque semantics to the compiler to avoid
+	// the UB.
 	uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-	const v128_t *v = (v128_t *)((uintptr_t)s - align);
+	uintptr_t v = (uintptr_t)s - align;
 
 	for (;;) {
+		v128_t chunk;
+		__asm__ (
+			"local.get %1\n"
+			"v128.load 0\n"
+			"local.set %0\n"
+			: "=r"(chunk)
+			: "r"(v)
+			: "memory");
 		// Bitmask is slow on AArch64, all_true is much faster.
-		if (!wasm_i8x16_all_true(*v)) {
-			const v128_t cmp = wasm_i8x16_eq(*v, (v128_t){});
+		if (!wasm_i8x16_all_true(chunk)) {
+			const v128_t cmp = wasm_i8x16_eq(chunk, (v128_t){});
 			// Clear the bits corresponding to align (little-endian)
 			// so we can count trailing zeros.
 			int mask = wasm_i8x16_bitmask(cmp) >> align << align;
@@ -35,12 +46,13 @@ size_t strlen(const char *s)
 			// it's as if we didn't find anything.
 			if (mask) {
 				// Find the offset of the first one bit (little-endian).
-				return (char *)v - s + __builtin_ctz(mask);
+				return v - (uintptr_t)s + __builtin_ctz(mask);
 			}
 		}
 		align = 0;
-		v++;
+		v += sizeof(v128_t);
 	}
+#endif
 #endif
 
 	const char *a = s;
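A worked example of the bitmask arithmetic in the diff above may help. Suppose `s` begins at byte 3 of the aligned 16-byte chunk and the NUL terminator sits at byte 7 of that chunk, so `strlen(s)` is 4; the `>> align << align` pair clears any matches from bytes that precede `s`. A minimal sketch of just that integer arithmetic (plain bits standing in for the SIMD bitmask, with made-up values):

#include <assert.h>

int main(void)
{
	// s starts at byte 3 of the chunk; the real NUL is at byte 7.
	unsigned align = 3;
	// Pretend the comparison also matched a stray zero byte at
	// index 1, which lies before the start of s.
	int raw = (1 << 1) | (1 << 7);
	// Shifting right then left clears the low `align` bits,
	// discarding the false match from before the string begins.
	int mask = raw >> align << align;
	assert(mask == 1 << 7);
	// ctz gives the chunk-relative index of the terminator;
	// subtracting s's offset within the chunk yields the length.
	assert(__builtin_ctz(mask) - align == 4);
	return 0;
}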
