diff --git a/benchmarks/Dockerfile.emscripten b/benchmarks/Dockerfile.emscripten
deleted file mode 100644
index b41451b5..00000000
--- a/benchmarks/Dockerfile.emscripten
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM emscripten/emsdk:4.0.10
-
-WORKDIR /
-COPY benchmark.c .
-COPY sightglass.h .
-WORKDIR /benchmark
-RUN emcc ../benchmark.c -O3 -g -DNDEBUG -I.. -o benchmark.wasm
-# We output the Wasm file to the `/benchmark` directory, where the client
-# expects it.
diff --git a/benchmarks/blake3-simd/Dockerfile b/benchmarks/blake3-simd/Dockerfile
index 4d6bf678..509b3024 100644
--- a/benchmarks/blake3-simd/Dockerfile
+++ b/benchmarks/blake3-simd/Dockerfile
@@ -1,26 +1,66 @@
-FROM emscripten/emsdk:4.0.10
+# This two-stage (multi-stage) Dockerfile keeps the APT packages and the wasi-sdk download in
+# cached layers, so they are not fetched again on every build.
+
+# First, retrieve wasi-sdk:
+
+FROM ubuntu:24.04 AS builder
+WORKDIR /
+RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
+
+# Download and extract wasi-sdk 28 (the x86_64 Linux release; it unpacks to /wasi-sdk-28.0-x86_64-linux).
+RUN wget https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-28/wasi-sdk-28.0-x86_64-linux.tar.gz
+RUN tar xvf wasi-sdk-28.0-x86_64-linux.tar.gz
+
+# Second, compile the benchmark to Wasm.
+
+FROM ubuntu:24.04
+WORKDIR /
+COPY --from=builder /wasi-sdk-28.0-x86_64-linux /wasi-sdk/
+RUN apt-get update && apt-get install -y git patch && rm -rf /var/lib/apt/lists/*
+
+# Set common env vars. LD names `wasm-ld` because the generic `lld` driver refuses to run under that name.
+ENV CC=/wasi-sdk/bin/clang
+ENV CXX=/wasi-sdk/bin/clang++
+ENV LD=/wasi-sdk/bin/wasm-ld
+ENV CFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
+ENV CXXFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
+# Prepend wasi-sdk's bin while keeping the system PATH so `git`/`patch` resolve.
+ENV PATH=/wasi-sdk/bin:$PATH
 
 WORKDIR /usr/src
 RUN git clone https://github.com/BLAKE3-team/BLAKE3.git
 
+WORKDIR /usr/src/BLAKE3
+RUN git checkout 8aa5145039b972ba30e98e788752d37d14568824
+# BLAKE3's dispatcher only selects its SSE2 kernels when IS_X86 is defined (it is
+# not on wasm). This patch forces the SSE2 path on wasm and exposes the SSE2
+# function declarations there.
+COPY blake3-wasm-sse2.patch .
+RUN patch -p1 < blake3-wasm-sse2.patch
+
 WORKDIR /usr/src/BLAKE3/c
 COPY benchmark.c .
 COPY sightglass.h .
+COPY wasm_sse_compat.h .
+# wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm. `wasm_sse_compat.h`
+# reimplements the ones blake3_sse2.c uses on top of `<wasm_simd128.h>`; we expose
+# it under the x86 intrinsic-header name so blake3_sse2.c's `#include <immintrin.h>`
+# resolves to it (the shim dir is first on the include path).
+RUN mkdir -p /shim && cp wasm_sse_compat.h /shim/ && \
+    for h in immintrin emmintrin tmmintrin xmmintrin; do \
+      echo '#include "wasm_sse_compat.h"' > /shim/$h.h; \
+    done
+
 RUN mkdir /benchmark
-# I am not sure that all of the parameters passed below are needed; this is what I received. It
-# would be preferable if the blake3 would tell us what SIMD parameters they plan to use for their
-# NPM package (https://www.npmjs.com/package/blake3), which currently uses only scalar instructions.
-# The benchmark reads its workload from disk into the heap, and runs under a
-# plain WASI runtime (no JS), so we use a fixed linear memory large enough for
-# the workload and disable memory growth (growth would import an Emscripten JS
-# function the runtime does not provide).
-RUN emcc -O3 -s STANDALONE_WASM=1 \
-    -s INITIAL_MEMORY=67108864 -s ALLOW_MEMORY_GROWTH=0 -s TOTAL_STACK=131072 \
-    -s "EXPORTED_FUNCTIONS=['_main']" \
-    -msimd128 -msse4.1 -msse4.2 \
+# Build BLAKE3 with its hand-written SSE2 implementation (blake3_sse2.c), lowered
+# to Wasm SIMD by the compat header above. `-DBLAKE3_NO_SSE41/AVX2/AVX512` keep the
+# (patched) dispatcher and headers limited to the SSE2 tier. The benchmark reads
+# its workload from disk into the heap under a plain WASI runtime (no JS);
+# wasi-libc's `malloc` grows linear memory natively.
+RUN $CC $CFLAGS -O3 -g -msimd128 -I/shim \
     -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512 \
+    -Wl,-z,stack-size=131072 \
     -o /benchmark/benchmark.wasm \
-    blake3.c blake3_sse2.c blake3_dispatch.c \
-    blake3_portable.c benchmark.c
+    blake3.c blake3_dispatch.c blake3_portable.c blake3_sse2.c benchmark.c
 # We output the Wasm file to the `/benchmark` directory, where the client
 # expects it.
diff --git a/benchmarks/blake3-simd/README.md b/benchmarks/blake3-simd/README.md
index b6d645f5..0b072d39 100644
--- a/benchmarks/blake3-simd/README.md
+++ b/benchmarks/blake3-simd/README.md
@@ -1,4 +1,6 @@
 # BLAKE3
 
 This benchmark is similar to [../blake3-scalar] and should return the same hash result, but the
-build is configured to compile the C version of BLAKE3 to Wasm SIMD operations using Emscripten.
+build compiles BLAKE3's hand-written SSE2 implementation (`blake3_sse2.c`): `wasm_sse_compat.h`
+maps its x86 SSE2 intrinsics onto Wasm SIMD (via `<wasm_simd128.h>`), and a small patch forces
+BLAKE3's runtime dispatcher to select the SSE2 kernels on wasm.
diff --git a/benchmarks/blake3-simd/benchmark.c b/benchmarks/blake3-simd/benchmark.c
index 7edd70e0..28c4a30f 100644
--- a/benchmarks/blake3-simd/benchmark.c
+++ b/benchmarks/blake3-simd/benchmark.c
@@ -1,46 +1,23 @@
 #include "blake3.h"
+#include <fcntl.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
+#include <unistd.h>
 #include "sightglass.h"
 
-// This benchmark is built with Emscripten (for the SSE2 -> Wasm SIMD blake3
-// implementation), whose libc filesystem does not reach WASI preopened
-// directories. So, like the splay benchmark, we read the workload from disk by
-// calling the WASI `path_open`/`fd_read` syscalls directly. This lets the input
-// be resized without recompiling, similar to the blake3-scalar benchmark.
-#define WASI_IMPORT(name) \
-    __attribute__((import_module("wasi_snapshot_preview1"), import_name(name)))
-
-typedef struct {
-    const void *buf;
-    size_t len;
-} wasi_iovec_t;
-
-WASI_IMPORT("path_open")
-int wasi_path_open(int fd, int dirflags, const char *path, size_t path_len,
-                   int oflags, uint64_t rights_base, uint64_t rights_inheriting,
-                   int fdflags, int *opened_fd);
-
-WASI_IMPORT("fd_read")
-int wasi_fd_read(int fd, const wasi_iovec_t *iovs, size_t iovs_len, size_t *nread);
-
-// The benchmark directory is preopened by the runner as the first preopen (fd 3).
-#define PREOPEN_FD 3
-#define RIGHT_FD_READ (1ULL << 1)
-#define RIGHT_FD_SEEK (1ULL << 2)
+// The workload is read from `default.input` at runtime so it can be resized
+// without recompiling (like the blake3-scalar benchmark). The C BLAKE3
+// hand-written SSE2 implementation is compiled to Wasm SIMD via the SSE->Wasm
+// shim in wasm_sse_compat.h.
 
 int main()
 {
     const char *path = "default.input";
-    int fd = -1;
-    int rc = wasi_path_open(PREOPEN_FD, 0, path, strlen(path), 0,
-                            RIGHT_FD_READ | RIGHT_FD_SEEK,
-                            RIGHT_FD_READ | RIGHT_FD_SEEK, 0, &fd);
-    if (rc != 0 || fd < 0)
+    int fd = open(path, O_RDONLY);
+    if (fd < 0)
     {
-        fprintf(stderr, "failed to open default.input (rc=%d)\n", rc);
+        fprintf(stderr, "failed to open default.input\n");
         return 1;
     }
 
@@ -54,18 +31,17 @@ int main()
             cap *= 2;
             buffer = (unsigned char *)realloc(buffer, cap);
         }
-        wasi_iovec_t iov = {buffer + len, cap - len};
-        size_t nread = 0;
-        rc = wasi_fd_read(fd, &iov, 1, &nread);
-        if (rc != 0)
+        ssize_t nread = read(fd, buffer + len, cap - len);
+        if (nread < 0)
         {
-            fprintf(stderr, "fd_read failed (rc=%d)\n", rc);
+            fprintf(stderr, "read failed\n");
             return 1;
         }
         if (nread == 0)
             break;
-        len += nread;
+        len += (size_t)nread;
     }
+    close(fd);
 
     fprintf(stderr, "[blake3] hashing ./default.input\n");
     fprintf(stderr, "[blake3] input size = %zu\n", len);
diff --git a/benchmarks/blake3-simd/benchmark.wasm b/benchmarks/blake3-simd/benchmark.wasm
index d736513e..27269ca0 100755
Binary files a/benchmarks/blake3-simd/benchmark.wasm and b/benchmarks/blake3-simd/benchmark.wasm differ
diff --git a/benchmarks/blake3-simd/blake3-wasm-sse2.patch b/benchmarks/blake3-simd/blake3-wasm-sse2.patch
new file mode 100644
index 00000000..d60511ca
--- /dev/null
+++ b/benchmarks/blake3-simd/blake3-wasm-sse2.patch
@@ -0,0 +1,133 @@
+diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c
+index 14dfbbe..32d69a2 100644
+--- a/c/blake3_dispatch.c
++++ b/c/blake3_dispatch.c
+@@ -164,16 +164,22 @@ static
+   }
+ }
+ #endif
+ 
+ void blake3_compress_in_place(uint32_t cv[8],
+                               const uint8_t block[BLAKE3_BLOCK_LEN],
+                               uint8_t block_len, uint64_t counter,
+                               uint8_t flags) {
++#if defined(__wasm__)
++  // wasm has no runtime CPU dispatch; always use the SSE2 implementation, which
++  // wasm_sse_compat.h compiles to Wasm SIMD.
++  blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
++  return;
++#endif
+ #if defined(IS_X86)
+   const enum cpu_feature features = get_cpu_features();
+   MAYBE_UNUSED(features);
+ #if !defined(BLAKE3_NO_AVX512)
+   if (features & AVX512VL) {
+     blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
+     return;
+   }
+@@ -193,16 +199,20 @@ void blake3_compress_in_place(uint32_t cv[8],
+ #endif
+   blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
+ }
+ 
+ void blake3_compress_xof(const uint32_t cv[8],
+                          const uint8_t block[BLAKE3_BLOCK_LEN],
+                          uint8_t block_len, uint64_t counter, uint8_t flags,
+                          uint8_t out[64]) {
++#if defined(__wasm__)
++  blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
++  return;
++#endif
+ #if defined(IS_X86)
+   const enum cpu_feature features = get_cpu_features();
+   MAYBE_UNUSED(features);
+ #if !defined(BLAKE3_NO_AVX512)
+   if (features & AVX512VL) {
+     blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
+     return;
+   }
+@@ -246,16 +256,21 @@ void blake3_xof_many(const uint32_t cv[8],
+     blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i);
+   }
+ }
+ 
+ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+                       size_t blocks, const uint32_t key[8], uint64_t counter,
+                       bool increment_counter, uint8_t flags,
+                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
++#if defined(__wasm__)
++  blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
++                        increment_counter, flags, flags_start, flags_end, out);
++  return;
++#endif
+ #if defined(IS_X86)
+   const enum cpu_feature features = get_cpu_features();
+   MAYBE_UNUSED(features);
+ #if !defined(BLAKE3_NO_AVX512)
+   if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
+     blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
+                             increment_counter, flags, flags_start, flags_end,
+                             out);
+@@ -296,16 +311,19 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+ 
+   blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
+                             increment_counter, flags, flags_start, flags_end,
+                             out);
+ }
+ 
+ // The dynamically detected SIMD degree of the current platform.
+ size_t blake3_simd_degree(void) {
++#if defined(__wasm__)
++  return 4; // the SSE2 implementation processes 4 inputs at a time
++#endif
+ #if defined(IS_X86)
+   const enum cpu_feature features = get_cpu_features();
+   MAYBE_UNUSED(features);
+ #if !defined(BLAKE3_NO_AVX512)
+   if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
+     return 16;
+   }
+ #endif
+diff --git a/c/blake3_impl.h b/c/blake3_impl.h
+index 88e71e4..f89f9f7 100644
+--- a/c/blake3_impl.h
++++ b/c/blake3_impl.h
+@@ -68,16 +68,19 @@ enum blake3_flags {
+     #endif
+   #else
+     #define BLAKE3_USE_NEON 0
+   #endif
+ #endif
+ 
+ #if defined(IS_X86)
+ #define MAX_SIMD_DEGREE 16
++#elif defined(__wasm__)
++// We force the SSE2 implementation on wasm; it processes 4 inputs at a time.
++#define MAX_SIMD_DEGREE 4
+ #elif BLAKE3_USE_NEON == 1
+ #define MAX_SIMD_DEGREE 4
+ #else
+ #define MAX_SIMD_DEGREE 1
+ #endif
+ 
+ // There are some places where we want a static size that's equal to the
+ // MAX_SIMD_DEGREE, but also at least 2.
+@@ -249,17 +252,17 @@ void blake3_compress_xof_portable(const uint32_t cv[8],
+                                   uint8_t flags, uint8_t out[64]);
+ 
+ void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                                size_t blocks, const uint32_t key[8],
+                                uint64_t counter, bool increment_counter,
+                                uint8_t flags, uint8_t flags_start,
+                                uint8_t flags_end, uint8_t *out);
+ 
+-#if defined(IS_X86)
++#if defined(IS_X86) || defined(__wasm__)
+ #if !defined(BLAKE3_NO_SSE2)
+ void blake3_compress_in_place_sse2(uint32_t cv[8],
+                                    const uint8_t block[BLAKE3_BLOCK_LEN],
+                                    uint8_t block_len, uint64_t counter,
+                                    uint8_t flags);
+ void blake3_compress_xof_sse2(const uint32_t cv[8],
+                               const uint8_t block[BLAKE3_BLOCK_LEN],
+                               uint8_t block_len, uint64_t counter,
diff --git a/benchmarks/blake3-simd/wasm_sse_compat.h b/benchmarks/blake3-simd/wasm_sse_compat.h
new file mode 100644
index 00000000..7b525c0e
--- /dev/null
+++ b/benchmarks/blake3-simd/wasm_sse_compat.h
@@ -0,0 +1,75 @@
+// x86 SSE2 -> WebAssembly SIMD compatibility shim for BLAKE3's blake3_sse2.c.
+//
+// wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm (its
+// `<emmintrin.h>` #errors, and `-msse2` is rejected). BLAKE3's hand-written
+// SSE2 kernels are written against those intrinsics, so this header provides
+// exactly the `_mm_*` intrinsics blake3_sse2.c uses, implemented with the native
+// Wasm SIMD intrinsics from `<wasm_simd128.h>`. On a native (x86) build it just
+// includes the real header, so the same source compiles either way.
+#pragma once
+
+#if defined(__wasm_simd128__)
+
+#include <wasm_simd128.h>
+#include <stdint.h>
+
+typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
+typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
+
+// --- Arithmetic / logical --------------------------------------------------
+static inline __m128i _mm_add_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_add((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_sub_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_sub((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_and_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_and((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_or_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_or((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_xor_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_xor((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_andnot_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_andnot((v128_t)(b), (v128_t)(a)); } // (~a) & b
+static inline __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_eq((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_gt((v128_t)(a), (v128_t)(b)); }
+
+// --- Splat / set -----------------------------------------------------------
+static inline __m128i _mm_set1_epi16(short x) { return (__m128i)wasm_i16x8_splat(x); }
+static inline __m128i _mm_set1_epi32(int x) { return (__m128i)wasm_i32x4_splat(x); }
+// `set` is high-lane-first; `make` is low-lane-first.
+#define _mm_set_epi32(e3, e2, e1, e0)  ((__m128i)wasm_i32x4_make((e0), (e1), (e2), (e3)))
+#define _mm_setr_epi32(e0, e1, e2, e3) ((__m128i)wasm_i32x4_make((e0), (e1), (e2), (e3)))
+#define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) \
+  ((__m128i)wasm_i16x8_make((e0), (e1), (e2), (e3), (e4), (e5), (e6), (e7)))
+
+// --- Load / store (unaligned) ----------------------------------------------
+static inline __m128i _mm_loadu_si128(const __m128i *p) { return (__m128i)wasm_v128_load(p); }
+static inline void _mm_storeu_si128(__m128i *p, __m128i a) { wasm_v128_store(p, (v128_t)(a)); }
+
+// --- Reinterpret casts ------------------------------------------------------
+static inline __m128 _mm_castsi128_ps(__m128i a) { return (__m128)(a); }
+static inline __m128i _mm_castps_si128(__m128 a) { return (__m128i)(a); }
+
+// --- Shifts: SSE2 zeroes every lane for counts > 31; Wasm wraps the count mod 32 ---
+static inline __m128i _mm_slli_epi32(__m128i a, int c) { return (unsigned)c > 31u ? (__m128i)wasm_i32x4_splat(0) : (__m128i)wasm_i32x4_shl((v128_t)(a), (uint32_t)c); }
+static inline __m128i _mm_srli_epi32(__m128i a, int c) { return (unsigned)c > 31u ? (__m128i)wasm_i32x4_splat(0) : (__m128i)wasm_u32x4_shr((v128_t)(a), (uint32_t)c); }
+
+// --- Interleave (unpack) ----------------------------------------------------
+static inline __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23); }
+static inline __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31); }
+static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); }
+static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); }
+
+// --- Shuffles (immediate); macros so lane indices stay compile-time ---------
+#define _mm_shuffle_epi32(a, imm) \
+  ((__m128i)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(a), (imm) & 3, ((imm) >> 2) & 3, ((imm) >> 4) & 3, ((imm) >> 6) & 3))
+#define _mm_shuffle_ps(a, b, imm) \
+  ((__m128)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(b), (imm) & 3, ((imm) >> 2) & 3, 4 + (((imm) >> 4) & 3), 4 + (((imm) >> 6) & 3)))
+#define _mm_shufflelo_epi16(a, imm) \
+  ((__m128i)wasm_i16x8_shuffle((v128_t)(a), (v128_t)(a), (imm) & 3, ((imm) >> 2) & 3, ((imm) >> 4) & 3, ((imm) >> 6) & 3, 4, 5, 6, 7))
+#define _mm_shufflehi_epi16(a, imm) \
+  ((__m128i)wasm_i16x8_shuffle((v128_t)(a), (v128_t)(a), 0, 1, 2, 3, 4 + ((imm) & 3), 4 + (((imm) >> 2) & 3), 4 + (((imm) >> 4) & 3), 4 + (((imm) >> 6) & 3)))
+
+// --- Misc -------------------------------------------------------------------
+#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+#define _MM_HINT_T0 3
+#define _mm_prefetch(p, hint) ((void)(p)) // no prefetch hint on wasm
+
+#elif defined(__wasm__) // wasm without SIMD: fail with a clear message, not missing `_mm_*` names.
+#error "wasm_sse_compat.h requires Wasm SIMD on wasm targets; compile with -msimd128"
+#else // native x86 build uses the real intrinsics.
+#include <immintrin.h>
+#endif
diff --git a/benchmarks/hashset/HashSet.cpp b/benchmarks/hashset/HashSet.cpp
index f18c53a1..5c478a59 100644
--- a/benchmarks/hashset/HashSet.cpp
+++ b/benchmarks/hashset/HashSet.cpp
@@ -29,7 +29,7 @@
 #include <sightglass.h>
 
 // Compile with: xcrun clang++ -o HashSet HashSet.cpp -O2 -W -framework Foundation -licucore -std=c++11 -fvisibility=hidden -DNDEBUG=1
-// Or for wasm: em++ -o HashSet.js -o HashSet.html HashSet.cpp -O2 -W -std=c++11 -DNDEBUG=1 -g1 -s WASM=1 -s TOTAL_MEMORY=52428800
+// The Wasm build is defined by this benchmark's Dockerfile (wasi-sdk clang++).
 
 #define ALWAYS_INLINE inline __attribute__((__always_inline__))
 
diff --git a/benchmarks/intgemm-simd/Dockerfile b/benchmarks/intgemm-simd/Dockerfile
index a00164ea..b46e2035 100644
--- a/benchmarks/intgemm-simd/Dockerfile
+++ b/benchmarks/intgemm-simd/Dockerfile
@@ -1,4 +1,32 @@
-FROM emscripten/emsdk:4.0.10
+# This two-stage (multi-stage) Dockerfile keeps the APT packages and the wasi-sdk download in
+# cached layers, so they are not fetched again on every build.
+
+# First, retrieve wasi-sdk:
+
+FROM ubuntu:24.04 AS builder
+WORKDIR /
+RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
+
+# Download and extract wasi-sdk 28 (the x86_64 Linux release; it unpacks to /wasi-sdk-28.0-x86_64-linux).
+RUN wget https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-28/wasi-sdk-28.0-x86_64-linux.tar.gz
+RUN tar xvf wasi-sdk-28.0-x86_64-linux.tar.gz
+
+# Second, compile the benchmark to Wasm.
+
+FROM ubuntu:24.04
+WORKDIR /
+COPY --from=builder /wasi-sdk-28.0-x86_64-linux /wasi-sdk/
+RUN apt-get update && apt-get install -y git cmake make patch && rm -rf /var/lib/apt/lists/*
+
+# Set common env vars. LD names `wasm-ld` because the generic `lld` driver refuses to run under that name.
+ENV CC=/wasi-sdk/bin/clang
+ENV CXX=/wasi-sdk/bin/clang++
+ENV LD=/wasi-sdk/bin/wasm-ld
+ENV CFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
+ENV CXXFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
+# Prepend wasi-sdk's bin while keeping the system PATH so `git`, `cmake`, `make`
+# and `patch` still resolve.
+ENV PATH=/wasi-sdk/bin:$PATH
 
 WORKDIR /usr/src
 RUN git clone https://github.com/kpu/intgemm.git
@@ -7,22 +35,42 @@ WORKDIR /usr/src/intgemm
 RUN git checkout be3053515a8a04d19c6959a370eaf8b5a6eab686
 COPY benchmark.cpp .
 COPY sightglass.h .
-
 COPY patch-intgemm.diff .
+COPY wasm_sse_compat.h .
 RUN patch -p 1 < patch-intgemm.diff
 
-# Building static library
+# wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm. `wasm_sse_compat.h`
+# reimplements the `_mm_*` intrinsics intgemm uses on top of `<wasm_simd128.h>`.
+# We expose it under the x86 intrinsic-header names in a shim directory placed
+# first on the include path, so intgemm's `#include <emmintrin.h>` (etc.) resolve
+# to the compat header instead of clang's x86-only headers (which #error on wasm).
+RUN mkdir -p /shim && cp wasm_sse_compat.h /shim/ && \
+    for h in emmintrin tmmintrin xmmintrin smmintrin pmmintrin nmmintrin immintrin; do \
+      echo '#include "wasm_sse_compat.h"' > /shim/$h.h; \
+    done
+
+# Flags shared by the library and the final link: emit Wasm SIMD (`-msimd128`),
+# resolve x86 intrinsic headers to the compat shim, and define `WASM` (activates
+# intgemm's patched SSSE3-only CPU selection — there is no runtime CPUID on Wasm).
+# intgemm's AVX2/AVX512 feature-detection fails under wasi-sdk, so those kernels
+# compile out. `-Wno-unknown-warning-option` keeps intgemm's `-Werror` build from
+# choking on its Emscripten-only `-Wno-error=pthreads-mem-growth` flag.
+# `-fno-exceptions` matches the other C++ benchmarks (gcc-loops, hashset) and
+# avoids pulling in the C++ exception runtime, which wasi-sdk does not link by
+# default; intgemm's only `throw` is already guarded behind `__EXCEPTIONS`.
+ENV WASM_FLAGS="-O3 -msimd128 -fno-exceptions -I/shim -DWASM -Wno-unknown-warning-option"
+
+# Build the static library.
 RUN mkdir build
 WORKDIR /usr/src/intgemm/build
-RUN emcmake cmake .. -DCOMPILE_WASM=1 -DCMAKE_CXX_FLAGS="-msimd128 -mssse3 -O3"
-RUN emmake make intgemm
+RUN cmake .. -DCMAKE_TOOLCHAIN_FILE=/wasi-sdk/share/cmake/wasi-sdk.cmake \
+    -DCOMPILE_WASM=1 -DCMAKE_CXX_FLAGS="$WASM_FLAGS"
+RUN make intgemm
 
 WORKDIR /usr/src/intgemm
 RUN mkdir /benchmark
-RUN emcc -O3 -s STANDALONE_WASM=1 \
-    -s INITIAL_MEMORY=33554432 -s MAXIMUM_MEMORY=33554432 \
-    -s ALLOW_MEMORY_GROWTH=0 -s TOTAL_STACK=2097152 \
-    -msimd128 -mssse3 -O3 -Ibuild \
+RUN $CXX $CXXFLAGS $WASM_FLAGS -Ibuild \
+    -Wl,-z,stack-size=2097152 \
     -o /benchmark/benchmark.wasm \
     benchmark.cpp build/libintgemm.a
 # We output the Wasm file to the `/benchmark` directory, where the client
diff --git a/benchmarks/intgemm-simd/README.md b/benchmarks/intgemm-simd/README.md
index 43b4b8dd..059a96c3 100644
--- a/benchmarks/intgemm-simd/README.md
+++ b/benchmarks/intgemm-simd/README.md
@@ -4,4 +4,8 @@ Tests integer matrix multiplication.
 
 See https://github.com/kpu/intgemm
 
-Using older emscripten 2.0 to avoid special pmaddwd sequences. Update emscripten image in the future and moar speed.
\ No newline at end of file
+Built with wasi-sdk (clang). intgemm's kernels are written with x86 SSE/SSSE3
+intrinsics, which wasi-sdk's clang cannot compile for Wasm, so `wasm_sse_compat.h`
+reimplements the intrinsics intgemm uses on top of `<wasm_simd128.h>` and is
+substituted for the x86 intrinsic headers at build time (see the `Dockerfile`).
+`-DWASM` selects intgemm's SSSE3 kernels, since there is no runtime CPUID on Wasm.
\ No newline at end of file
diff --git a/benchmarks/intgemm-simd/benchmark.cpp b/benchmarks/intgemm-simd/benchmark.cpp
index 308c2eb2..ac405e9d 100644
--- a/benchmarks/intgemm-simd/benchmark.cpp
+++ b/benchmarks/intgemm-simd/benchmark.cpp
@@ -20,44 +20,29 @@
 #include <iostream>
 #include <random>
 
-// This benchmark is built with Emscripten, whose libc filesystem does not reach
-// WASI preopened directories. So, like the splay and blake3-simd benchmarks, we
-// read the workload size from disk by calling the WASI `path_open`/`fd_read`
-// syscalls directly. This lets the workload be resized without recompiling.
-#define WASI_IMPORT(name) \
-    __attribute__((import_module("wasi_snapshot_preview1"), import_name(name)))
-
-typedef struct {
-    const void *buf;
-    size_t len;
-} wasi_iovec_t;
-
-WASI_IMPORT("path_open")
-int wasi_path_open(int fd, int dirflags, const char *path, size_t path_len,
-                   int oflags, uint64_t rights_base, uint64_t rights_inheriting,
-                   int fdflags, int *opened_fd);
-
-WASI_IMPORT("fd_read")
-int wasi_fd_read(int fd, const wasi_iovec_t *iovs, size_t iovs_len, size_t *nread);
+#include <fcntl.h>
+#include <unistd.h>
 
+// The dominant matrix's row count is read from `default.input` at runtime so the
+// workload can be resized without recompiling (like the quicksort and
+// blake3-simd benchmarks).
 static int read_int_from_file()
 {
-    const char *path = "default.input"; // preopen fd 3 is the benchmark dir
-    int fd = -1;
-    if (wasi_path_open(3, 0, path, strlen(path), 0, (1ULL << 1) | (1ULL << 2),
-                       (1ULL << 1) | (1ULL << 2), 0, &fd) != 0 || fd < 0) {
+    const char *path = "default.input";
+    int fd = open(path, O_RDONLY);
+    if (fd < 0) {
         std::cerr << "failed to open default.input" << std::endl;
         abort();
     }
     char buf[64] = {0};
     size_t total = 0;
     for (;;) {
-        wasi_iovec_t iov = {buf + total, sizeof(buf) - 1 - total};
-        size_t nread = 0;
-        if (wasi_fd_read(fd, &iov, 1, &nread) != 0) { abort(); }
+        ssize_t nread = read(fd, buf + total, sizeof(buf) - 1 - total);
+        if (nread < 0) { abort(); }
         if (nread == 0 || total >= sizeof(buf) - 1) break;
-        total += nread;
+        total += (size_t)nread;
     }
+    close(fd);
     buf[total] = '\0';
     return atoi(buf);
 }
diff --git a/benchmarks/intgemm-simd/benchmark.wasm b/benchmarks/intgemm-simd/benchmark.wasm
index af04210f..64fda66c 100755
Binary files a/benchmarks/intgemm-simd/benchmark.wasm and b/benchmarks/intgemm-simd/benchmark.wasm differ
diff --git a/benchmarks/intgemm-simd/patch-intgemm.diff b/benchmarks/intgemm-simd/patch-intgemm.diff
index 0102a570..4d2b6a44 100644
--- a/benchmarks/intgemm-simd/patch-intgemm.diff
+++ b/benchmarks/intgemm-simd/patch-intgemm.diff
@@ -2,7 +2,12 @@ diff --git a/intgemm/intgemm.cc b/intgemm/intgemm.cc
 index 58e4bc5..70679ee 100644
 --- a/intgemm/intgemm.cc
 +++ b/intgemm/intgemm.cc
-@@ -120,7 +120,11 @@ CPUType GetCPUID() {
+@@ -115,17 +115,21 @@ CPUType GetCPUID() {
+ #ifdef INTGEMM_CPUID_ENVIRONMENT
+     std::min(RealCPUID(), EnvironmentCPUID());
+ #else
+     RealCPUID();
+ #endif
    return kLocalCPU;
  }
  
@@ -14,11 +19,21 @@ index 58e4bc5..70679ee 100644
  
  void UnsupportedCPUError() {
  #if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
+   throw UnsupportedCPU();
+ #else
+   fprintf(stderr, "intgemm does not support this CPU.\n");
+   abort();
+ #endif
 diff --git a/intgemm/intgemm.h b/intgemm/intgemm.h
 index 26febb5..616fe0a 100644
 --- a/intgemm/intgemm.h
 +++ b/intgemm/intgemm.h
-@@ -150,8 +150,13 @@ CPUType GetCPUID();
+@@ -145,18 +145,23 @@ CPUType GetCPUID();
+  *
+  * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit)
+  *
+  * sse2 if the CPU supports SSE2
+  *
   * unsupported otherwise
   */
  template <class T> T ChooseCPU(T avx512vnni, T avx512bw, T avx2, T ssse3, T sse2, T unsupported) {
@@ -32,3 +47,37 @@ index 26febb5..616fe0a 100644
  }
  
  struct TileInfo {
+   const Index a_rows;
+   const Index a_cols;
+   const Index b_rows;
+   const Index b_cols;
+ };
+diff --git a/intgemm/types.h b/intgemm/types.h
+index 44fb4e2..048339e 100644
+--- a/intgemm/types.h
++++ b/intgemm/types.h
+@@ -14,18 +14,23 @@
+ #include <avx2intrin.h>
+ #include <smmintrin.h>
+ #include <avx512fintrin.h>
+ #include <avx512dqintrin.h>
+ #include <avx512bwintrin.h>
+ #include <avx512vnniintrin.h>
+ #endif
+ 
+-#if (defined(_MSC_VER) && !defined(__clang__)) || defined(__INTEL_COMPILER)
++#if (defined(_MSC_VER) && !defined(__clang__)) || defined(__INTEL_COMPILER) || defined(__wasm__)
+ /* Real MSVC does not appear to have target attributes but is also fine with
++ * just using intrinsics anywhere.  The same is true for our wasm build, which
++ * compiles the SSE intrinsics through wasm_sse_compat.h and has no notion of
++ * x86 target attributes.
++ * Original comment follows:
++ * Real MSVC does not appear to have target attributes but is also fine with
+  * just using intrinsics anywhere.  clang-cl pretending to be MSVC requires
+  * target attributes, so it's excluded from the above.
+  *
+  * The Intel compiler has a bug whereby constructors with target attributes do
+  * not link.  Like this program doesn't compile with icpc:
+  * class Foo {
+  *   public:
+  *     __attribute__ ((target ("avx2"))) Foo() {}
diff --git a/benchmarks/intgemm-simd/wasm_sse_compat.h b/benchmarks/intgemm-simd/wasm_sse_compat.h
new file mode 100644
index 00000000..fc5cca44
--- /dev/null
+++ b/benchmarks/intgemm-simd/wasm_sse_compat.h
@@ -0,0 +1,167 @@
+// x86 SSE/SSSE3 -> WebAssembly SIMD compatibility shim for intgemm.
+//
+// wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm (its
+// `<emmintrin.h>`/`<tmmintrin.h>` #error out, and `-msse*` are rejected), and
+// intgemm's kernels are written against those intrinsics. Rather than pull in a
+// whole library, this header provides exactly the `_mm_*` intrinsics intgemm
+// uses, implemented with the native Wasm SIMD intrinsics from
+// `<wasm_simd128.h>`. On a native (x86) target it just includes the real
+// headers, so the same intgemm source builds either way.
+//
+// Note: __m128, __m128i and __m128d are defined as *distinct* vector types
+// (different element types) on purpose. intgemm overloads functions on them
+// (e.g. callbacks' run_callbacks), so they must not collapse to a single type
+// the way wasm's `v128_t` would.
+#pragma once
+
+#if defined(__wasm_simd128__)
+
+#include <wasm_simd128.h>
+#include <cstdint>
+
+typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
+typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
+typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
+
+// Reinterpret (bit-cast) helpers between our typed vectors and wasm's v128_t.
+#define INTGEMM_W(x) ((v128_t)(x))
+
+// --- Load / store / set / zero ---------------------------------------------
+static inline __m128 _mm_load_ps(const float *p) { return (__m128)wasm_v128_load(p); } // same load as _mm_loadu_ps; no separate aligned variant is used
+static inline __m128 _mm_loadu_ps(const float *p) { return (__m128)wasm_v128_load(p); }
+static inline void _mm_storeu_ps(float *p, __m128 a) { wasm_v128_store(p, INTGEMM_W(a)); }
+static inline __m128i _mm_set1_epi8(int8_t v) { return (__m128i)wasm_i8x16_splat(v); }
+static inline __m128i _mm_set1_epi16(int16_t v) { return (__m128i)wasm_i16x8_splat(v); }
+static inline __m128i _mm_set1_epi32(int32_t v) { return (__m128i)wasm_i32x4_splat(v); }
+static inline __m128 _mm_set1_ps(float v) { return (__m128)wasm_f32x4_splat(v); }
+static inline __m128d _mm_set1_pd(double v) { return (__m128d)wasm_f64x2_splat(v); }
+static inline __m128i _mm_setzero_si128(void) { return (__m128i)wasm_i64x2_const(0, 0); } // the three setzero variants share one all-zero constant, cast to each type
+static inline __m128 _mm_setzero_ps(void) { return (__m128)wasm_i64x2_const(0, 0); }
+static inline __m128d _mm_setzero_pd(void) { return (__m128d)wasm_i64x2_const(0, 0); }
+
+// --- Integer arithmetic ----------------------------------------------------
+static inline __m128i _mm_add_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_add(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_add_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_add(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_add_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_add(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_sub_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_sub(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_adds_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_add_sat(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_abs_epi8(__m128i a) { return (__m128i)wasm_i8x16_abs(INTGEMM_W(a)); }
+static inline __m128i _mm_max_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_max(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_max_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_max(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_mullo_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_mul(INTGEMM_W(a), INTGEMM_W(b)); }
+
+// Signed 16x16-bit multiply per lane, keeping only the upper 16 bits of each 32-bit product.
+static inline __m128i _mm_mulhi_epi16(__m128i a, __m128i b) {
+  const v128_t va = INTGEMM_W(a), vb = INTGEMM_W(b);
+  const v128_t wide_hi = wasm_i32x4_extmul_high_i16x8(va, vb); // 32-bit products of lanes 4..7
+  const v128_t wide_lo = wasm_i32x4_extmul_low_i16x8(va, vb);  // 32-bit products of lanes 0..3
+  return (__m128i)wasm_i16x8_shuffle(wide_lo, wide_hi, 1, 3, 5, 7, 9, 11, 13, 15); // odd 16-bit lanes = high halves
+}
+// Widening unsigned multiply of 32-bit lanes 0 and 2, giving two 64-bit products.
+static inline __m128i _mm_mul_epu32(__m128i a, __m128i b) {
+  const v128_t rhs_even = wasm_i32x4_shuffle(INTGEMM_W(b), INTGEMM_W(b), 0, 2, 0, 2);
+  const v128_t lhs_even = wasm_i32x4_shuffle(INTGEMM_W(a), INTGEMM_W(a), 0, 2, 0, 2);
+  return (__m128i)wasm_u64x2_extmul_low_u32x4(lhs_even, rhs_even); // multiplies lanes 0..1, i.e. the original lanes 0 and 2
+}
+// Per byte: unsigned(a) * signed(b); each adjacent pair of products is then summed with signed saturation.
+static inline __m128i _mm_maddubs_epi16(__m128i a, __m128i b) {
+  const v128_t u_lo = wasm_u16x8_extend_low_u8x16(INTGEMM_W(a));  // a bytes 0..7, zero-extended
+  const v128_t s_lo = wasm_i16x8_extend_low_i8x16(INTGEMM_W(b));  // b bytes 0..7, sign-extended
+  const v128_t u_hi = wasm_u16x8_extend_high_u8x16(INTGEMM_W(a)); // a bytes 8..15, zero-extended
+  const v128_t s_hi = wasm_i16x8_extend_high_i8x16(INTGEMM_W(b)); // b bytes 8..15, sign-extended
+  const v128_t prod_lo = wasm_i16x8_mul(u_lo, s_lo);
+  const v128_t prod_hi = wasm_i16x8_mul(u_hi, s_hi);
+  const v128_t odd_terms = wasm_i16x8_shuffle(prod_lo, prod_hi, 1, 3, 5, 7, 9, 11, 13, 15);
+  const v128_t even_terms = wasm_i16x8_shuffle(prod_lo, prod_hi, 0, 2, 4, 6, 8, 10, 12, 14);
+  return (__m128i)wasm_i16x8_add_sat(even_terms, odd_terms);
+}
+// Multiply signed 16-bit lanes and add each adjacent pair of products into one 32-bit lane (a single wasm dot op).
+static inline __m128i _mm_madd_epi16(__m128i a, __m128i b) {
+  return (__m128i)wasm_i32x4_dot_i16x8(INTGEMM_W(a), INTGEMM_W(b));
+}
+// Per byte: yields -a where b < 0, 0 where b == 0, and a unchanged where b > 0.
+static inline __m128i _mm_sign_epi8(__m128i a, __m128i b) {
+  const v128_t zero = wasm_i8x16_splat(0);
+  const v128_t b_is_zero = wasm_i8x16_eq(INTGEMM_W(b), zero);
+  const v128_t b_is_neg = wasm_i8x16_lt(INTGEMM_W(b), zero);
+  const v128_t minus_a = wasm_i8x16_neg(INTGEMM_W(a));
+  const v128_t signed_a = wasm_v128_bitselect(minus_a, INTGEMM_W(a), b_is_neg); // -a where b < 0, else a
+  return (__m128i)wasm_v128_andnot(signed_a, b_is_zero); // clear the lanes where b == 0
+}
+
+// --- Float / double arithmetic. NOTE(review): wasm min/max propagate NaN while SSE min/max return the second operand; confirm no NaN inputs. ---
+static inline __m128 _mm_add_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_add(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_sub_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_sub(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_mul_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_mul(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_div_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_div(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_min_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_min(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_max_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_max(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128d _mm_add_pd(__m128d a, __m128d b) { return (__m128d)wasm_f64x2_add(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128d _mm_sub_pd(__m128d a, __m128d b) { return (__m128d)wasm_f64x2_sub(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128d _mm_mul_pd(__m128d a, __m128d b) { return (__m128d)wasm_f64x2_mul(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128d _mm_max_pd(__m128d a, __m128d b) { return (__m128d)wasm_f64x2_max(INTGEMM_W(a), INTGEMM_W(b)); }
+
+// --- Conversions ------------------------------------------------------------
+static inline __m128 _mm_cvtepi32_ps(__m128i a) { return (__m128)wasm_f32x4_convert_i32x4(INTGEMM_W(a)); }
+static inline __m128i _mm_cvttps_epi32(__m128 a) { return (__m128i)wasm_i32x4_trunc_sat_f32x4(INTGEMM_W(a)); } // NOTE(review): trunc_sat saturates and maps NaN to 0, SSE yields 0x80000000; confirm inputs stay in range
+// Round to nearest (ties to even), then convert; this matches SSE's default rounding mode for in-range inputs (same out-of-range caveat as above).
+static inline __m128i _mm_cvtps_epi32(__m128 a) { return (__m128i)wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(INTGEMM_W(a))); }
+static inline __m128 _mm_castsi128_ps(__m128i a) { return (__m128)(a); } // pure bit reinterpretation, no value conversion
+
+// --- Bitwise ----------------------------------------------------------------
+static inline __m128i _mm_and_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_and(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_or_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_or(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_xor_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_xor(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_and_ps(__m128 a, __m128 b) { return (__m128)wasm_v128_and(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_andnot_ps(__m128 a, __m128 b) { return (__m128)wasm_v128_andnot(INTGEMM_W(b), INTGEMM_W(a)); } // (~a) & b
+
+// --- Comparisons (produce all-ones / all-zeros masks) ----------------------
+static inline __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_eq(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_gt(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_cmplt_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_lt(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_gt(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_cmplt_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_lt(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_cmplt_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_lt(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_cmpneq_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_ne(INTGEMM_W(a), INTGEMM_W(b)); }
+
+// --- Saturating narrowing (pack) -------------------------------------------
+static inline __m128i _mm_packs_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_narrow_i16x8(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_packs_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_narrow_i32x4(INTGEMM_W(a), INTGEMM_W(b)); }
+
+// --- Interleave (unpack); every width is a byte shuffle where indices 0..15 select bytes of a and 16..31 bytes of b ---
+static inline __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); }
+static inline __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); }
+static inline __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23); }
+static inline __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31); }
+static inline __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23); }
+static inline __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31); }
+static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); }
+static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); }
+
+// --- Shifts by a (compile-time or runtime) count. Wasm reduces the count modulo the lane width; SSE does not, so clamp here. ---
+static inline __m128i _mm_slli_epi16(__m128i a, int c) { return (unsigned)c < 16 ? (__m128i)wasm_i16x8_shl(INTGEMM_W(a), c) : (__m128i)wasm_i64x2_const(0, 0); } // SSE: count > 15 yields 0
+static inline __m128i _mm_srli_epi16(__m128i a, int c) { return (unsigned)c < 16 ? (__m128i)wasm_u16x8_shr(INTGEMM_W(a), c) : (__m128i)wasm_i64x2_const(0, 0); } // SSE: count > 15 yields 0
+static inline __m128i _mm_srai_epi16(__m128i a, int c) { return (__m128i)wasm_i16x8_shr(INTGEMM_W(a), (unsigned)c < 16 ? c : 15); } // SSE: count > 15 fills every lane with its sign bit
+static inline __m128i _mm_srai_epi32(__m128i a, int c) { return (__m128i)wasm_i32x4_shr(INTGEMM_W(a), (unsigned)c < 32 ? c : 31); } // SSE: count > 31 fills every lane with its sign bit
+
+// --- Shuffles (immediate); macros so the lane indices stay compile-time. Each 2-bit field of imm picks one lane; in _mm_shuffle_ps indices 4..7 address b. ---
+#define _mm_shuffle_epi32(a, imm) \
+  ((__m128i)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(a), (imm) & 3, ((imm) >> 2) & 3, ((imm) >> 4) & 3, ((imm) >> 6) & 3))
+#define _mm_shuffle_ps(a, b, imm) \
+  ((__m128)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(b), (imm) & 3, ((imm) >> 2) & 3, 4 + (((imm) >> 4) & 3), 4 + (((imm) >> 6) & 3)))
+// Byte-wise right shift of the whole 128-bit value; indices >= 16 select the zero vector, so imm must be a constant in 0..16.
+#define _mm_srli_si128(a, imm) \
+  ((__m128i)wasm_i8x16_shuffle((v128_t)(a), wasm_i8x16_splat(0), \
+    (imm) + 0, (imm) + 1, (imm) + 2, (imm) + 3, (imm) + 4, (imm) + 5, (imm) + 6, (imm) + 7, \
+    (imm) + 8, (imm) + 9, (imm) + 10, (imm) + 11, (imm) + 12, (imm) + 13, (imm) + 14, (imm) + 15))
+
+#undef INTGEMM_W
+
+#else // !__wasm_simd128__ : native x86 build uses the real intrinsics.
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+#include <smmintrin.h>
+
+#endif
diff --git a/benchmarks/noop/Dockerfile b/benchmarks/noop/Dockerfile
deleted file mode 120000
index edc75978..00000000
--- a/benchmarks/noop/Dockerfile
+++ /dev/null
@@ -1 +0,0 @@
-../Dockerfile.emscripten
\ No newline at end of file
diff --git a/benchmarks/noop/Dockerfile b/benchmarks/noop/Dockerfile
new file mode 100644
index 00000000..f6883aee
--- /dev/null
+++ b/benchmarks/noop/Dockerfile
@@ -0,0 +1,34 @@
+# This two-phase Dockerfile allows us to avoid re-downloading APT packages and wasi-sdk with every
+# build.
+
+# First, retrieve wasi-sdk:
+
+FROM ubuntu:24.04 AS builder
+WORKDIR /
+RUN apt update && apt install -y wget
+
+# Download and extract wasi-sdk.
+RUN wget https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-28/wasi-sdk-28.0-x86_64-linux.tar.gz
+RUN tar xvf wasi-sdk-28.0-x86_64-linux.tar.gz
+
+# Second, compile the benchmark to Wasm.
+
+FROM ubuntu:24.04
+WORKDIR /
+COPY --from=builder /wasi-sdk-28.0-x86_64-linux /wasi-sdk/
+
+# Set common env vars; wasi-sdk's bin is prepended to the system PATH rather than replacing it.
+ENV CC=/wasi-sdk/bin/clang
+ENV CXX=/wasi-sdk/bin/clang++
+ENV LD=/wasi-sdk/bin/lld
+ENV CFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
+ENV CXXFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
+ENV PATH=/wasi-sdk/bin:$PATH
+
+# Compile `benchmark.c` to `./benchmark.wasm`.
+COPY benchmark.c .
+COPY sightglass.h .
+WORKDIR /benchmark
+RUN $CC $CFLAGS ../benchmark.c -O3 -g -DNDEBUG -I.. -o benchmark.wasm
+# We output the Wasm file to the `/benchmark` directory, where the client
+# expects it.
diff --git a/benchmarks/noop/benchmark.wasm b/benchmarks/noop/benchmark.wasm
index 7449a879..522a60de 100755
Binary files a/benchmarks/noop/benchmark.wasm and b/benchmarks/noop/benchmark.wasm differ