Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions benchmarks/Dockerfile.emscripten

This file was deleted.

68 changes: 54 additions & 14 deletions benchmarks/blake3-simd/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,26 +1,66 @@
FROM emscripten/emsdk:4.0.10
# This two-phase Dockerfile allows us to avoid re-downloading APT packages and wasi-sdk with every
# build.

# First, retrieve wasi-sdk:

FROM ubuntu:24.04 AS builder
WORKDIR /
RUN apt update && apt install -y wget

# Download and extract wasi-sdk.
RUN wget https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-28/wasi-sdk-28.0-x86_64-linux.tar.gz
RUN tar xvf wasi-sdk-28.0-x86_64-linux.tar.gz

# Second, compile the benchmark to Wasm.

FROM ubuntu:24.04
WORKDIR /
COPY --from=builder /wasi-sdk-28.0-x86_64-linux /wasi-sdk/
RUN apt update && apt install -y git patch

# Set common env vars.
ENV CC=/wasi-sdk/bin/clang
ENV CXX=/wasi-sdk/bin/clang++
ENV LD=/wasi-sdk/bin/lld
ENV CFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
ENV CXXFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
# Prepend wasi-sdk's bin while keeping the system PATH so `git`/`patch` resolve.
ENV PATH=/wasi-sdk/bin:$PATH

WORKDIR /usr/src
RUN git clone https://github.com/BLAKE3-team/BLAKE3.git

WORKDIR /usr/src/BLAKE3
RUN git checkout 8aa5145039b972ba30e98e788752d37d14568824
# BLAKE3's dispatcher only selects its SSE2 kernels when IS_X86 is defined (it is
# not on wasm). This patch forces the SSE2 path on wasm and exposes the SSE2
# function declarations there.
COPY blake3-wasm-sse2.patch .
RUN patch -p1 < blake3-wasm-sse2.patch

WORKDIR /usr/src/BLAKE3/c
COPY benchmark.c .
COPY sightglass.h .
COPY wasm_sse_compat.h .
# wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm. `wasm_sse_compat.h`
# reimplements the ones blake3_sse2.c uses on top of `<wasm_simd128.h>`; we expose
# it under the x86 intrinsic-header name so blake3_sse2.c's `#include <immintrin.h>`
# resolves to it (the shim dir is first on the include path).
RUN mkdir -p /shim && cp wasm_sse_compat.h /shim/ && \
for h in immintrin emmintrin tmmintrin xmmintrin; do \
echo '#include "wasm_sse_compat.h"' > /shim/$h.h; \
done

RUN mkdir /benchmark
# I am not sure that all of the parameters passed below are needed; this is what I received. It
# would be preferable if the blake3 would tell us what SIMD parameters they plan to use for their
# NPM package (https://www.npmjs.com/package/blake3), which currently uses only scalar instructions.
# The benchmark reads its workload from disk into the heap, and runs under a
# plain WASI runtime (no JS), so we use a fixed linear memory large enough for
# the workload and disable memory growth (growth would import an Emscripten JS
# function the runtime does not provide).
RUN emcc -O3 -s STANDALONE_WASM=1 \
-s INITIAL_MEMORY=67108864 -s ALLOW_MEMORY_GROWTH=0 -s TOTAL_STACK=131072 \
-s "EXPORTED_FUNCTIONS=['_main']" \
-msimd128 -msse4.1 -msse4.2 \
# Build BLAKE3 with its hand-written SSE2 implementation (blake3_sse2.c), lowered
# to Wasm SIMD by the compat header above. `-DBLAKE3_NO_SSE41/AVX2/AVX512` keep the
# (patched) dispatcher and headers limited to the SSE2 tier. The benchmark reads
# its workload from disk into the heap under a plain WASI runtime (no JS);
# wasi-libc's `malloc` grows linear memory natively.
RUN $CC $CFLAGS -O3 -g -msimd128 -I/shim \
-DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512 \
-Wl,-z,stack-size=131072 \
-o /benchmark/benchmark.wasm \
blake3.c blake3_sse2.c blake3_dispatch.c \
blake3_portable.c benchmark.c
blake3.c blake3_dispatch.c blake3_portable.c blake3_sse2.c benchmark.c
# We output the Wasm file to the `/benchmark` directory, where the client
# expects it.
4 changes: 3 additions & 1 deletion benchmarks/blake3-simd/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# BLAKE3

This benchmark is similar to [../blake3-scalar] and should return the same hash result, but the
build is configured to compile the C version of BLAKE3 to Wasm SIMD operations using Emscripten.
build compiles BLAKE3's hand-written SSE2 implementation (`blake3_sse2.c`): `wasm_sse_compat.h`
maps its x86 SSE2 intrinsics onto Wasm SIMD (via `<wasm_simd128.h>`), and a small patch forces
BLAKE3's runtime dispatcher to select the SSE2 kernels on wasm.
52 changes: 14 additions & 38 deletions benchmarks/blake3-simd/benchmark.c
Original file line number Diff line number Diff line change
@@ -1,46 +1,23 @@
#include "blake3.h"
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "sightglass.h"

// This benchmark is built with Emscripten (for the SSE2 -> Wasm SIMD blake3
// implementation), whose libc filesystem does not reach WASI preopened
// directories. So, like the splay benchmark, we read the workload from disk by
// calling the WASI `path_open`/`fd_read` syscalls directly. This lets the input
// be resized without recompiling, similar to the blake3-scalar benchmark.
#define WASI_IMPORT(name) \
__attribute__((import_module("wasi_snapshot_preview1"), import_name(name)))

typedef struct {
const void *buf;
size_t len;
} wasi_iovec_t;

WASI_IMPORT("path_open")
int wasi_path_open(int fd, int dirflags, const char *path, size_t path_len,
int oflags, uint64_t rights_base, uint64_t rights_inheriting,
int fdflags, int *opened_fd);

WASI_IMPORT("fd_read")
int wasi_fd_read(int fd, const wasi_iovec_t *iovs, size_t iovs_len, size_t *nread);

// The benchmark directory is preopened by the runner as the first preopen (fd 3).
#define PREOPEN_FD 3
#define RIGHT_FD_READ (1ULL << 1)
#define RIGHT_FD_SEEK (1ULL << 2)
// The workload is read from `default.input` at runtime so it can be resized
// without recompiling (like the blake3-scalar benchmark). The C BLAKE3
// hand-written SSE2 implementation is compiled to Wasm SIMD via the SSE->Wasm
// shim in wasm_sse_compat.h.

int main()
{
const char *path = "default.input";
int fd = -1;
int rc = wasi_path_open(PREOPEN_FD, 0, path, strlen(path), 0,
RIGHT_FD_READ | RIGHT_FD_SEEK,
RIGHT_FD_READ | RIGHT_FD_SEEK, 0, &fd);
if (rc != 0 || fd < 0)
int fd = open(path, O_RDONLY);
if (fd < 0)
{
fprintf(stderr, "failed to open default.input (rc=%d)\n", rc);
fprintf(stderr, "failed to open default.input\n");
return 1;
}

Expand All @@ -54,18 +31,17 @@ int main()
cap *= 2;
buffer = (unsigned char *)realloc(buffer, cap);
}
wasi_iovec_t iov = {buffer + len, cap - len};
size_t nread = 0;
rc = wasi_fd_read(fd, &iov, 1, &nread);
if (rc != 0)
ssize_t nread = read(fd, buffer + len, cap - len);
if (nread < 0)
{
fprintf(stderr, "fd_read failed (rc=%d)\n", rc);
fprintf(stderr, "read failed\n");
return 1;
}
if (nread == 0)
break;
len += nread;
len += (size_t)nread;
}
close(fd);

fprintf(stderr, "[blake3] hashing ./default.input\n");
fprintf(stderr, "[blake3] input size = %zu\n", len);
Expand Down
Binary file modified benchmarks/blake3-simd/benchmark.wasm
Binary file not shown.
133 changes: 133 additions & 0 deletions benchmarks/blake3-simd/blake3-wasm-sse2.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c
index 14dfbbe..32d69a2 100644
--- a/c/blake3_dispatch.c
+++ b/c/blake3_dispatch.c
@@ -164,16 +164,22 @@ static
}
}
#endif

void blake3_compress_in_place(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
+#if defined(__wasm__)
+ // wasm has no runtime CPU dispatch; always use the SSE2 implementation, which
+ // wasm_sse_compat.h compiles to Wasm SIMD.
+ blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
+ return;
+#endif
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
return;
}
@@ -193,16 +199,20 @@ void blake3_compress_in_place(uint32_t cv[8],
#endif
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}

void blake3_compress_xof(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64]) {
+#if defined(__wasm__)
+ blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
+ return;
+#endif
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
return;
}
@@ -246,16 +256,21 @@ void blake3_xof_many(const uint32_t cv[8],
blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i);
}
}

void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+#if defined(__wasm__)
+ blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end, out);
+ return;
+#endif
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
@@ -296,16 +311,19 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,

blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
}

// The dynamically detected SIMD degree of the current platform.
size_t blake3_simd_degree(void) {
+#if defined(__wasm__)
+ return 4; // the SSE2 implementation processes 4 inputs at a time
+#endif
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
return 16;
}
#endif
diff --git a/c/blake3_impl.h b/c/blake3_impl.h
index 88e71e4..f89f9f7 100644
--- a/c/blake3_impl.h
+++ b/c/blake3_impl.h
@@ -68,16 +68,19 @@ enum blake3_flags {
#endif
#else
#define BLAKE3_USE_NEON 0
#endif
#endif

#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
+#elif defined(__wasm__)
+// We force the SSE2 implementation on wasm; it processes 4 inputs at a time.
+#define MAX_SIMD_DEGREE 4
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif

// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
@@ -249,17 +252,17 @@ void blake3_compress_xof_portable(const uint32_t cv[8],
uint8_t flags, uint8_t out[64]);

void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);

-#if defined(IS_X86)
+#if defined(IS_X86) || defined(__wasm__)
#if !defined(BLAKE3_NO_SSE2)
void blake3_compress_in_place_sse2(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
75 changes: 75 additions & 0 deletions benchmarks/blake3-simd/wasm_sse_compat.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// x86 SSE2 -> WebAssembly SIMD compatibility shim for BLAKE3's blake3_sse2.c.
//
// wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm (its
// `<emmintrin.h>` #errors, and `-msse2` is rejected). BLAKE3's hand-written
// SSE2 kernels are written against those intrinsics, so this header provides
// exactly the `_mm_*` intrinsics blake3_sse2.c uses, implemented with the native
// Wasm SIMD intrinsics from `<wasm_simd128.h>`. On a native (x86) build it just
// includes the real header, so the same source compiles either way.
#pragma once

#if defined(__wasm_simd128__)

#include <wasm_simd128.h>
#include <stdint.h>

typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));

// --- Arithmetic / logical --------------------------------------------------
static inline __m128i _mm_add_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_add((v128_t)(a), (v128_t)(b)); }
static inline __m128i _mm_sub_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_sub((v128_t)(a), (v128_t)(b)); }
static inline __m128i _mm_and_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_and((v128_t)(a), (v128_t)(b)); }
static inline __m128i _mm_or_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_or((v128_t)(a), (v128_t)(b)); }
static inline __m128i _mm_xor_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_xor((v128_t)(a), (v128_t)(b)); }
static inline __m128i _mm_andnot_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_andnot((v128_t)(b), (v128_t)(a)); } // (~a) & b
static inline __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_eq((v128_t)(a), (v128_t)(b)); }
static inline __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_gt((v128_t)(a), (v128_t)(b)); }

// --- Splat / set -----------------------------------------------------------
static inline __m128i _mm_set1_epi16(short x) { return (__m128i)wasm_i16x8_splat(x); }
static inline __m128i _mm_set1_epi32(int x) { return (__m128i)wasm_i32x4_splat(x); }
// `set` is high-lane-first; `make` is low-lane-first.
#define _mm_set_epi32(e3, e2, e1, e0) ((__m128i)wasm_i32x4_make((e0), (e1), (e2), (e3)))
#define _mm_setr_epi32(e0, e1, e2, e3) ((__m128i)wasm_i32x4_make((e0), (e1), (e2), (e3)))
#define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) \
((__m128i)wasm_i16x8_make((e0), (e1), (e2), (e3), (e4), (e5), (e6), (e7)))

// --- Load / store (unaligned) ----------------------------------------------
static inline __m128i _mm_loadu_si128(const __m128i *p) { return (__m128i)wasm_v128_load(p); }
static inline void _mm_storeu_si128(__m128i *p, __m128i a) { wasm_v128_store(p, (v128_t)(a)); }

// --- Reinterpret casts ------------------------------------------------------
static inline __m128 _mm_castsi128_ps(__m128i a) { return (__m128)(a); }
static inline __m128i _mm_castps_si128(__m128 a) { return (__m128i)(a); }

// --- Shifts (immediate or runtime count) -----------------------------------
static inline __m128i _mm_slli_epi32(__m128i a, int c) { return (__m128i)wasm_i32x4_shl((v128_t)(a), c); }
static inline __m128i _mm_srli_epi32(__m128i a, int c) { return (__m128i)wasm_u32x4_shr((v128_t)(a), c); }

// --- Interleave (unpack) ----------------------------------------------------
static inline __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23); }
static inline __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31); }
static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); }
static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); }

// --- Shuffles (immediate); macros so lane indices stay compile-time ---------
#define _mm_shuffle_epi32(a, imm) \
((__m128i)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(a), (imm) & 3, ((imm) >> 2) & 3, ((imm) >> 4) & 3, ((imm) >> 6) & 3))
#define _mm_shuffle_ps(a, b, imm) \
((__m128)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(b), (imm) & 3, ((imm) >> 2) & 3, 4 + (((imm) >> 4) & 3), 4 + (((imm) >> 6) & 3)))
#define _mm_shufflelo_epi16(a, imm) \
((__m128i)wasm_i16x8_shuffle((v128_t)(a), (v128_t)(a), (imm) & 3, ((imm) >> 2) & 3, ((imm) >> 4) & 3, ((imm) >> 6) & 3, 4, 5, 6, 7))
#define _mm_shufflehi_epi16(a, imm) \
((__m128i)wasm_i16x8_shuffle((v128_t)(a), (v128_t)(a), 0, 1, 2, 3, 4 + ((imm) & 3), 4 + (((imm) >> 2) & 3), 4 + (((imm) >> 4) & 3), 4 + (((imm) >> 6) & 3)))

// --- Misc -------------------------------------------------------------------
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
#define _MM_HINT_T0 3
#define _mm_prefetch(p, hint) ((void)(p)) // no prefetch hint on wasm

#else // !__wasm_simd128__ : native x86 build uses the real intrinsics.

#include <immintrin.h>

#endif
2 changes: 1 addition & 1 deletion benchmarks/hashset/HashSet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#include <sightglass.h>

// Compile with: xcrun clang++ -o HashSet HashSet.cpp -O2 -W -framework Foundation -licucore -std=c++11 -fvisibility=hidden -DNDEBUG=1
// Or for wasm: em++ -o HashSet.js -o HashSet.html HashSet.cpp -O2 -W -std=c++11 -DNDEBUG=1 -g1 -s WASM=1 -s TOTAL_MEMORY=52428800
// The Wasm build is defined by this benchmark's Dockerfile (wasi-sdk clang++).

#define ALWAYS_INLINE inline __attribute__((__always_inline__))

Expand Down
Loading
Loading