bytecodealliance · cfallin · Jun 25, 2026 · Jun 25, 2026
diff --git a/benchmarks/Dockerfile.emscripten b/benchmarks/Dockerfile.emscripten
diff --git a/benchmarks/blake3-simd/Dockerfile b/benchmarks/blake3-simd/Dockerfile
@@ -1,26 +1,66 @@
-FROM emscripten/emsdk:4.0.10
+# This two-phase Dockerfile allows us to avoid re-downloading APT packages and wasi-sdk with every
+# build.
+
+# First, retrieve wasi-sdk:
+
+FROM ubuntu:24.04 AS builder
+WORKDIR /
+RUN apt update && apt install -y wget
+
+# Download and extract wasi-sdk.
+RUN wget https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-28/wasi-sdk-28.0-x86_64-linux.tar.gz
+RUN tar xvf wasi-sdk-28.0-x86_64-linux.tar.gz
+
+# Second, compile the benchmark to Wasm.
+
+FROM ubuntu:24.04
+WORKDIR /
+COPY --from=builder /wasi-sdk-28.0-x86_64-linux /wasi-sdk/
+RUN apt update && apt install -y git patch
+
+# Set common env vars.
+ENV CC=/wasi-sdk/bin/clang
+ENV CXX=/wasi-sdk/bin/clang++
+ENV LD=/wasi-sdk/bin/lld
+ENV CFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
+ENV CXXFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
+# Prepend wasi-sdk's bin while keeping the system PATH so `git`/`patch` resolve.
+ENV PATH=/wasi-sdk/bin:$PATH
 
 WORKDIR /usr/src
 RUN git clone https://github.com/BLAKE3-team/BLAKE3.git
 
+WORKDIR /usr/src/BLAKE3
+RUN git checkout 8aa5145039b972ba30e98e788752d37d14568824
+# BLAKE3's dispatcher only selects its SSE2 kernels when IS_X86 is defined (it is
+# not on wasm). This patch forces the SSE2 path on wasm and exposes the SSE2
+# function declarations there.
+COPY blake3-wasm-sse2.patch .
+RUN patch -p1 < blake3-wasm-sse2.patch
+
 WORKDIR /usr/src/BLAKE3/c
 COPY benchmark.c .
 COPY sightglass.h .
+COPY wasm_sse_compat.h .
+# wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm. `wasm_sse_compat.h`
+# reimplements the ones blake3_sse2.c uses on top of `<wasm_simd128.h>`; we expose
+# it under the x86 intrinsic-header name so blake3_sse2.c's `#include <immintrin.h>`
+# resolves to it (the shim dir is first on the include path).
+RUN mkdir -p /shim && cp wasm_sse_compat.h /shim/ && \
+    for h in immintrin emmintrin tmmintrin xmmintrin; do \
+      echo '#include "wasm_sse_compat.h"' > /shim/$h.h; \
+    done
+
 RUN mkdir /benchmark
-# I am not sure that all of the parameters passed below are needed; this is what I received. It
-# would be preferable if the blake3 would tell us what SIMD parameters they plan to use for their
-# NPM package (https://www.npmjs.com/package/blake3), which currently uses only scalar instructions.
-# The benchmark reads its workload from disk into the heap, and runs under a
-# plain WASI runtime (no JS), so we use a fixed linear memory large enough for
-# the workload and disable memory growth (growth would import an Emscripten JS
-# function the runtime does not provide).
-RUN emcc -O3 -s STANDALONE_WASM=1 \
-    -s INITIAL_MEMORY=67108864 -s ALLOW_MEMORY_GROWTH=0 -s TOTAL_STACK=131072 \
-    -s "EXPORTED_FUNCTIONS=['_main']" \
-    -msimd128 -msse4.1 -msse4.2 \
+# Build BLAKE3 with its hand-written SSE2 implementation (blake3_sse2.c), lowered
+# to Wasm SIMD by the compat header above. `-DBLAKE3_NO_SSE41/AVX2/AVX512` keep the
+# (patched) dispatcher and headers limited to the SSE2 tier. The benchmark reads
+# its workload from disk into the heap under a plain WASI runtime (no JS);
+# wasi-libc's `malloc` grows linear memory natively.
+RUN $CC $CFLAGS -O3 -g -msimd128 -I/shim \
     -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512 \
+    -Wl,-z,stack-size=131072 \
     -o /benchmark/benchmark.wasm \
-    blake3.c blake3_sse2.c blake3_dispatch.c \
-    blake3_portable.c benchmark.c
+    blake3.c blake3_dispatch.c blake3_portable.c blake3_sse2.c benchmark.c
 # We output the Wasm file to the `/benchmark` directory, where the client
 # expects it.
diff --git a/benchmarks/blake3-simd/README.md b/benchmarks/blake3-simd/README.md
@@ -1,4 +1,6 @@
 # BLAKE3
 
 This benchmark is similar to [../blake3-scalar] and should return the same hash result, but the
-build is configured to compile the C version of BLAKE3 to Wasm SIMD operations using Emscripten.
+build compiles BLAKE3's hand-written SSE2 implementation (`blake3_sse2.c`): `wasm_sse_compat.h`
+maps its x86 SSE2 intrinsics onto Wasm SIMD (via `<wasm_simd128.h>`), and a small patch forces
+BLAKE3's runtime dispatcher to select the SSE2 kernels on wasm.
diff --git a/benchmarks/blake3-simd/benchmark.c b/benchmarks/blake3-simd/benchmark.c
@@ -1,46 +1,23 @@
 #include "blake3.h"
+#include <fcntl.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
+#include <unistd.h>
 #include "sightglass.h"
 
-// This benchmark is built with Emscripten (for the SSE2 -> Wasm SIMD blake3
-// implementation), whose libc filesystem does not reach WASI preopened
-// directories. So, like the splay benchmark, we read the workload from disk by
-// calling the WASI `path_open`/`fd_read` syscalls directly. This lets the input
-// be resized without recompiling, similar to the blake3-scalar benchmark.
-#define WASI_IMPORT(name) \
-    __attribute__((import_module("wasi_snapshot_preview1"), import_name(name)))
-
-typedef struct {
-    const void *buf;
-    size_t len;
-} wasi_iovec_t;
-
-WASI_IMPORT("path_open")
-int wasi_path_open(int fd, int dirflags, const char *path, size_t path_len,
-                   int oflags, uint64_t rights_base, uint64_t rights_inheriting,
-                   int fdflags, int *opened_fd);
-
-WASI_IMPORT("fd_read")
-int wasi_fd_read(int fd, const wasi_iovec_t *iovs, size_t iovs_len, size_t *nread);
-
-// The benchmark directory is preopened by the runner as the first preopen (fd 3).
-#define PREOPEN_FD 3
-#define RIGHT_FD_READ (1ULL << 1)
-#define RIGHT_FD_SEEK (1ULL << 2)
+// The workload is read from `default.input` at runtime so it can be resized
+// without recompiling (like the blake3-scalar benchmark). The C BLAKE3
+// hand-written SSE2 implementation is compiled to Wasm SIMD via the SSE->Wasm
+// shim in wasm_sse_compat.h.
 
 int main()
 {
     const char *path = "default.input";
-    int fd = -1;
-    int rc = wasi_path_open(PREOPEN_FD, 0, path, strlen(path), 0,
-                            RIGHT_FD_READ | RIGHT_FD_SEEK,
-                            RIGHT_FD_READ | RIGHT_FD_SEEK, 0, &fd);
-    if (rc != 0 || fd < 0)
+    int fd = open(path, O_RDONLY);
+    if (fd < 0)
     {
-        fprintf(stderr, "failed to open default.input (rc=%d)\n", rc);
+        fprintf(stderr, "failed to open default.input\n");
         return 1;
     }
 
@@ -54,18 +31,17 @@ int main()
             cap *= 2;
             buffer = (unsigned char *)realloc(buffer, cap);
         }
-        wasi_iovec_t iov = {buffer + len, cap - len};
-        size_t nread = 0;
-        rc = wasi_fd_read(fd, &iov, 1, &nread);
-        if (rc != 0)
+        ssize_t nread = read(fd, buffer + len, cap - len);
+        if (nread < 0)
         {
-            fprintf(stderr, "fd_read failed (rc=%d)\n", rc);
+            fprintf(stderr, "read failed\n");
             return 1;
         }
         if (nread == 0)
             break;
-        len += nread;
+        len += (size_t)nread;
     }
+    close(fd);
 
     fprintf(stderr, "[blake3] hashing ./default.input\n");
     fprintf(stderr, "[blake3] input size = %zu\n", len);

diff --git a/benchmarks/blake3-simd/benchmark.wasm b/benchmarks/blake3-simd/benchmark.wasm
diff --git a/benchmarks/blake3-simd/blake3-wasm-sse2.patch b/benchmarks/blake3-simd/blake3-wasm-sse2.patch
@@ -0,0 +1,133 @@
+diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c
+index 14dfbbe..32d69a2 100644
+--- a/c/blake3_dispatch.c
++++ b/c/blake3_dispatch.c
+@@ -164,16 +164,22 @@ static
+   }
+ }
+ #endif
+
+ void blake3_compress_in_place(uint32_t cv[8],
+                               const uint8_t block[BLAKE3_BLOCK_LEN],
+                               uint8_t block_len, uint64_t counter,
+                               uint8_t flags) {
++#if defined(__wasm__)
++  // wasm has no runtime CPU dispatch; always use the SSE2 implementation, which
++  // wasm_sse_compat.h compiles to Wasm SIMD.
++  blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
++  return;
++#endif
+ #if defined(IS_X86)
+   const enum cpu_feature features = get_cpu_features();
+   MAYBE_UNUSED(features);
+ #if !defined(BLAKE3_NO_AVX512)
+   if (features & AVX512VL) {
+     blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
+     return;
+   }
+@@ -193,16 +199,20 @@ void blake3_compress_in_place(uint32_t cv[8],
+ #endif
+   blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
+ }
+
+ void blake3_compress_xof(const uint32_t cv[8],
+                          const uint8_t block[BLAKE3_BLOCK_LEN],
+                          uint8_t block_len, uint64_t counter, uint8_t flags,
+                          uint8_t out[64]) {
++#if defined(__wasm__)
++  blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
++  return;
++#endif
+ #if defined(IS_X86)
+   const enum cpu_feature features = get_cpu_features();
+   MAYBE_UNUSED(features);
+ #if !defined(BLAKE3_NO_AVX512)
+   if (features & AVX512VL) {
+     blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
+     return;
+   }
+@@ -246,16 +256,21 @@ void blake3_xof_many(const uint32_t cv[8],
+     blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i);
+   }
+ }
+
+ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+                       size_t blocks, const uint32_t key[8], uint64_t counter,
+                       bool increment_counter, uint8_t flags,
+                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
++#if defined(__wasm__)
++  blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
++                        increment_counter, flags, flags_start, flags_end, out);
++  return;
++#endif
+ #if defined(IS_X86)
+   const enum cpu_feature features = get_cpu_features();
+   MAYBE_UNUSED(features);
+ #if !defined(BLAKE3_NO_AVX512)
+   if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
+     blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
+                             increment_counter, flags, flags_start, flags_end,
+                             out);
+@@ -296,16 +311,19 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+
+   blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
+                             increment_counter, flags, flags_start, flags_end,
+                             out);
+ }
+
+ // The dynamically detected SIMD degree of the current platform.
+ size_t blake3_simd_degree(void) {
++#if defined(__wasm__)
++  return 4; // the SSE2 implementation processes 4 inputs at a time
++#endif
+ #if defined(IS_X86)
+   const enum cpu_feature features = get_cpu_features();
+   MAYBE_UNUSED(features);
+ #if !defined(BLAKE3_NO_AVX512)
+   if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
+     return 16;
+   }
+ #endif
+diff --git a/c/blake3_impl.h b/c/blake3_impl.h
+index 88e71e4..f89f9f7 100644
+--- a/c/blake3_impl.h
++++ b/c/blake3_impl.h
+@@ -68,16 +68,19 @@ enum blake3_flags {
+     #endif
+   #else
+     #define BLAKE3_USE_NEON 0
+   #endif
+ #endif
+
+ #if defined(IS_X86)
+ #define MAX_SIMD_DEGREE 16
++#elif defined(__wasm__)
++// We force the SSE2 implementation on wasm; it processes 4 inputs at a time.
++#define MAX_SIMD_DEGREE 4
+ #elif BLAKE3_USE_NEON == 1
+ #define MAX_SIMD_DEGREE 4
+ #else
+ #define MAX_SIMD_DEGREE 1
+ #endif
+
+ // There are some places where we want a static size that's equal to the
+ // MAX_SIMD_DEGREE, but also at least 2.
+@@ -249,17 +252,17 @@ void blake3_compress_xof_portable(const uint32_t cv[8],
+                                   uint8_t flags, uint8_t out[64]);
+
+ void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                                size_t blocks, const uint32_t key[8],
+                                uint64_t counter, bool increment_counter,
+                                uint8_t flags, uint8_t flags_start,
+                                uint8_t flags_end, uint8_t *out);
+
+-#if defined(IS_X86)
++#if defined(IS_X86) || defined(__wasm__)
+ #if !defined(BLAKE3_NO_SSE2)
+ void blake3_compress_in_place_sse2(uint32_t cv[8],
+                                    const uint8_t block[BLAKE3_BLOCK_LEN],
+                                    uint8_t block_len, uint64_t counter,
+                                    uint8_t flags);
+ void blake3_compress_xof_sse2(const uint32_t cv[8],
+                               const uint8_t block[BLAKE3_BLOCK_LEN],
+                               uint8_t block_len, uint64_t counter,
diff --git a/benchmarks/blake3-simd/wasm_sse_compat.h b/benchmarks/blake3-simd/wasm_sse_compat.h
@@ -0,0 +1,75 @@
+// x86 SSE2 -> WebAssembly SIMD compatibility shim for BLAKE3's blake3_sse2.c.
+//
+// wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm (its
+// `<emmintrin.h>` #errors, and `-msse2` is rejected). BLAKE3's hand-written
+// SSE2 kernels are written against those intrinsics, so this header provides
+// exactly the `_mm_*` intrinsics blake3_sse2.c uses, implemented with the native
+// Wasm SIMD intrinsics from `<wasm_simd128.h>`. On a native (x86) build it just
+// includes the real header, so the same source compiles either way.
+#pragma once
+
+#if defined(__wasm_simd128__)
+
+#include <wasm_simd128.h>
+#include <stdint.h>
+
+typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
+typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
+
+// --- Arithmetic / logical --------------------------------------------------
+static inline __m128i _mm_add_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_add((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_sub_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_sub((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_and_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_and((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_or_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_or((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_xor_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_xor((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_andnot_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_andnot((v128_t)(b), (v128_t)(a)); } // (~a) & b
+static inline __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_eq((v128_t)(a), (v128_t)(b)); }
+static inline __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_gt((v128_t)(a), (v128_t)(b)); }
+
+// --- Splat / set -----------------------------------------------------------
+static inline __m128i _mm_set1_epi16(short x) { return (__m128i)wasm_i16x8_splat(x); }
+static inline __m128i _mm_set1_epi32(int x) { return (__m128i)wasm_i32x4_splat(x); }
+// `set` is high-lane-first; `make` is low-lane-first.
+#define _mm_set_epi32(e3, e2, e1, e0)  ((__m128i)wasm_i32x4_make((e0), (e1), (e2), (e3)))
+#define _mm_setr_epi32(e0, e1, e2, e3) ((__m128i)wasm_i32x4_make((e0), (e1), (e2), (e3)))
+#define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) \
+  ((__m128i)wasm_i16x8_make((e0), (e1), (e2), (e3), (e4), (e5), (e6), (e7)))
+
+// --- Load / store (unaligned) ----------------------------------------------
+static inline __m128i _mm_loadu_si128(const __m128i *p) { return (__m128i)wasm_v128_load(p); }
+static inline void _mm_storeu_si128(__m128i *p, __m128i a) { wasm_v128_store(p, (v128_t)(a)); }
+
+// --- Reinterpret casts ------------------------------------------------------
+static inline __m128 _mm_castsi128_ps(__m128i a) { return (__m128)(a); }
+static inline __m128i _mm_castps_si128(__m128 a) { return (__m128i)(a); }
+
+// --- Shifts (immediate or runtime count) -----------------------------------
+static inline __m128i _mm_slli_epi32(__m128i a, int c) { return (__m128i)wasm_i32x4_shl((v128_t)(a), c); }
+static inline __m128i _mm_srli_epi32(__m128i a, int c) { return (__m128i)wasm_u32x4_shr((v128_t)(a), c); }
+
+// --- Interleave (unpack) ----------------------------------------------------
+static inline __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23); }
+static inline __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31); }
+static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); }
+static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); }
+
+// --- Shuffles (immediate); macros so lane indices stay compile-time ---------
+#define _mm_shuffle_epi32(a, imm) \
+  ((__m128i)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(a), (imm) & 3, ((imm) >> 2) & 3, ((imm) >> 4) & 3, ((imm) >> 6) & 3))
+#define _mm_shuffle_ps(a, b, imm) \
+  ((__m128)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(b), (imm) & 3, ((imm) >> 2) & 3, 4 + (((imm) >> 4) & 3), 4 + (((imm) >> 6) & 3)))
+#define _mm_shufflelo_epi16(a, imm) \
+  ((__m128i)wasm_i16x8_shuffle((v128_t)(a), (v128_t)(a), (imm) & 3, ((imm) >> 2) & 3, ((imm) >> 4) & 3, ((imm) >> 6) & 3, 4, 5, 6, 7))
+#define _mm_shufflehi_epi16(a, imm) \
+  ((__m128i)wasm_i16x8_shuffle((v128_t)(a), (v128_t)(a), 0, 1, 2, 3, 4 + ((imm) & 3), 4 + (((imm) >> 2) & 3), 4 + (((imm) >> 4) & 3), 4 + (((imm) >> 6) & 3)))
+
+// --- Misc -------------------------------------------------------------------
+#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+#define _MM_HINT_T0 3
+#define _mm_prefetch(p, hint) ((void)(p)) // no prefetch hint on wasm
+
+#else // !__wasm_simd128__ : native x86 build uses the real intrinsics.
+
+#include <immintrin.h>
+
+#endif
diff --git a/benchmarks/hashset/HashSet.cpp b/benchmarks/hashset/HashSet.cpp
@@ -29,7 +29,7 @@
 #include <sightglass.h>
 
 // Compile with: xcrun clang++ -o HashSet HashSet.cpp -O2 -W -framework Foundation -licucore -std=c++11 -fvisibility=hidden -DNDEBUG=1
-// Or for wasm: em++ -o HashSet.js -o HashSet.html HashSet.cpp -O2 -W -std=c++11 -DNDEBUG=1 -g1 -s WASM=1 -s TOTAL_MEMORY=52428800
+// The Wasm build is defined by this benchmark's Dockerfile (wasi-sdk clang++).
 
 #define ALWAYS_INLINE inline __attribute__((__always_inline__))