diff --git a/benchmarks/Dockerfile.emscripten b/benchmarks/Dockerfile.emscripten deleted file mode 100644 index b41451b5..00000000 --- a/benchmarks/Dockerfile.emscripten +++ /dev/null @@ -1,9 +0,0 @@ -FROM emscripten/emsdk:4.0.10 - -WORKDIR / -COPY benchmark.c . -COPY sightglass.h . -WORKDIR /benchmark -RUN emcc ../benchmark.c -O3 -g -DNDEBUG -I.. -o benchmark.wasm -# We output the Wasm file to the `/benchmark` directory, where the client -# expects it. diff --git a/benchmarks/blake3-simd/Dockerfile b/benchmarks/blake3-simd/Dockerfile index 4d6bf678..509b3024 100644 --- a/benchmarks/blake3-simd/Dockerfile +++ b/benchmarks/blake3-simd/Dockerfile @@ -1,26 +1,66 @@ -FROM emscripten/emsdk:4.0.10 +# This two-phase Dockerfile allows us to avoid re-downloading APT packages and wasi-sdk with every +# build. + +# First, retrieve wasi-sdk: + +FROM ubuntu:24.04 AS builder +WORKDIR / +RUN apt update && apt install -y wget + +# Download and extract wasi-sdk. +RUN wget https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-28/wasi-sdk-28.0-x86_64-linux.tar.gz +RUN tar xvf wasi-sdk-28.0-x86_64-linux.tar.gz + +# Second, compile the benchmark to Wasm. + +FROM ubuntu:24.04 +WORKDIR / +COPY --from=builder /wasi-sdk-28.0-x86_64-linux /wasi-sdk/ +RUN apt update && apt install -y git patch + +# Set common env vars. +ENV CC=/wasi-sdk/bin/clang +ENV CXX=/wasi-sdk/bin/clang++ +ENV LD=/wasi-sdk/bin/lld +ENV CFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot +ENV CXXFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot +# Prepend wasi-sdk's bin while keeping the system PATH so `git`/`patch` resolve. +ENV PATH=/wasi-sdk/bin:$PATH WORKDIR /usr/src RUN git clone https://github.com/BLAKE3-team/BLAKE3.git +WORKDIR /usr/src/BLAKE3 +RUN git checkout 8aa5145039b972ba30e98e788752d37d14568824 +# BLAKE3's dispatcher only selects its SSE2 kernels when IS_X86 is defined (it is +# not on wasm). This patch forces the SSE2 path on wasm and exposes the SSE2 +# function declarations there. 
+COPY blake3-wasm-sse2.patch . +RUN patch -p1 < blake3-wasm-sse2.patch + WORKDIR /usr/src/BLAKE3/c COPY benchmark.c . COPY sightglass.h . +COPY wasm_sse_compat.h . +# wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm. `wasm_sse_compat.h` +# reimplements the ones blake3_sse2.c uses on top of `<wasm_simd128.h>`; we expose +# it under the x86 intrinsic-header name so blake3_sse2.c's `#include <immintrin.h>` +# resolves to it (the shim dir is first on the include path). +RUN mkdir -p /shim && cp wasm_sse_compat.h /shim/ && \ + for h in immintrin emmintrin tmmintrin xmmintrin; do \ + echo '#include "wasm_sse_compat.h"' > /shim/$h.h; \ + done + RUN mkdir /benchmark -# I am not sure that all of the parameters passed below are needed; this is what I received. It -# would be preferable if the blake3 would tell us what SIMD parameters they plan to use for their -# NPM package (https://www.npmjs.com/package/blake3), which currently uses only scalar instructions. -# The benchmark reads its workload from disk into the heap, and runs under a -# plain WASI runtime (no JS), so we use a fixed linear memory large enough for -# the workload and disable memory growth (growth would import an Emscripten JS -# function the runtime does not provide). -RUN emcc -O3 -s STANDALONE_WASM=1 \ - -s INITIAL_MEMORY=67108864 -s ALLOW_MEMORY_GROWTH=0 -s TOTAL_STACK=131072 \ - -s "EXPORTED_FUNCTIONS=['_main']" \ - -msimd128 -msse4.1 -msse4.2 \ +# Build BLAKE3 with its hand-written SSE2 implementation (blake3_sse2.c), lowered +# to Wasm SIMD by the compat header above. `-DBLAKE3_NO_SSE41/AVX2/AVX512` keep the +# (patched) dispatcher and headers limited to the SSE2 tier. The benchmark reads +# its workload from disk into the heap under a plain WASI runtime (no JS); +# wasi-libc's `malloc` grows linear memory natively. 
+RUN $CC $CFLAGS -O3 -g -msimd128 -I/shim \ -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512 \ + -Wl,-z,stack-size=131072 \ -o /benchmark/benchmark.wasm \ - blake3.c blake3_sse2.c blake3_dispatch.c \ - blake3_portable.c benchmark.c + blake3.c blake3_dispatch.c blake3_portable.c blake3_sse2.c benchmark.c # We output the Wasm file to the `/benchmark` directory, where the client # expects it. diff --git a/benchmarks/blake3-simd/README.md b/benchmarks/blake3-simd/README.md index b6d645f5..0b072d39 100644 --- a/benchmarks/blake3-simd/README.md +++ b/benchmarks/blake3-simd/README.md @@ -1,4 +1,6 @@ # BLAKE3 This benchmark is similar to [../blake3-scalar] and should return the same hash result, but the -build is configured to compile the C version of BLAKE3 to Wasm SIMD operations using Emscripten. +build compiles BLAKE3's hand-written SSE2 implementation (`blake3_sse2.c`): `wasm_sse_compat.h` +maps its x86 SSE2 intrinsics onto Wasm SIMD (via `<wasm_simd128.h>`), and a small patch forces +BLAKE3's runtime dispatcher to select the SSE2 kernels on wasm. diff --git a/benchmarks/blake3-simd/benchmark.c b/benchmarks/blake3-simd/benchmark.c index 7edd70e0..28c4a30f 100644 --- a/benchmarks/blake3-simd/benchmark.c +++ b/benchmarks/blake3-simd/benchmark.c @@ -1,46 +1,23 @@ #include "blake3.h" +#include #include #include #include -#include +#include #include "sightglass.h" -// This benchmark is built with Emscripten (for the SSE2 -> Wasm SIMD blake3 -// implementation), whose libc filesystem does not reach WASI preopened -// directories. So, like the splay benchmark, we read the workload from disk by -// calling the WASI `path_open`/`fd_read` syscalls directly. This lets the input -// be resized without recompiling, similar to the blake3-scalar benchmark. 
-#define WASI_IMPORT(name) \ - __attribute__((import_module("wasi_snapshot_preview1"), import_name(name))) - -typedef struct { - const void *buf; - size_t len; -} wasi_iovec_t; - -WASI_IMPORT("path_open") -int wasi_path_open(int fd, int dirflags, const char *path, size_t path_len, - int oflags, uint64_t rights_base, uint64_t rights_inheriting, - int fdflags, int *opened_fd); - -WASI_IMPORT("fd_read") -int wasi_fd_read(int fd, const wasi_iovec_t *iovs, size_t iovs_len, size_t *nread); - -// The benchmark directory is preopened by the runner as the first preopen (fd 3). -#define PREOPEN_FD 3 -#define RIGHT_FD_READ (1ULL << 1) -#define RIGHT_FD_SEEK (1ULL << 2) +// The workload is read from `default.input` at runtime so it can be resized +// without recompiling (like the blake3-scalar benchmark). The C BLAKE3 +// hand-written SSE2 implementation is compiled to Wasm SIMD via the SSE->Wasm +// shim in wasm_sse_compat.h. int main() { const char *path = "default.input"; - int fd = -1; - int rc = wasi_path_open(PREOPEN_FD, 0, path, strlen(path), 0, - RIGHT_FD_READ | RIGHT_FD_SEEK, - RIGHT_FD_READ | RIGHT_FD_SEEK, 0, &fd); - if (rc != 0 || fd < 0) + int fd = open(path, O_RDONLY); + if (fd < 0) { - fprintf(stderr, "failed to open default.input (rc=%d)\n", rc); + fprintf(stderr, "failed to open default.input\n"); return 1; } @@ -54,18 +31,17 @@ int main() cap *= 2; buffer = (unsigned char *)realloc(buffer, cap); } - wasi_iovec_t iov = {buffer + len, cap - len}; - size_t nread = 0; - rc = wasi_fd_read(fd, &iov, 1, &nread); - if (rc != 0) + ssize_t nread = read(fd, buffer + len, cap - len); + if (nread < 0) { - fprintf(stderr, "fd_read failed (rc=%d)\n", rc); + fprintf(stderr, "read failed\n"); return 1; } if (nread == 0) break; - len += nread; + len += (size_t)nread; } + close(fd); fprintf(stderr, "[blake3] hashing ./default.input\n"); fprintf(stderr, "[blake3] input size = %zu\n", len); diff --git a/benchmarks/blake3-simd/benchmark.wasm b/benchmarks/blake3-simd/benchmark.wasm 
index d736513e..27269ca0 100755 Binary files a/benchmarks/blake3-simd/benchmark.wasm and b/benchmarks/blake3-simd/benchmark.wasm differ diff --git a/benchmarks/blake3-simd/blake3-wasm-sse2.patch b/benchmarks/blake3-simd/blake3-wasm-sse2.patch new file mode 100644 index 00000000..d60511ca --- /dev/null +++ b/benchmarks/blake3-simd/blake3-wasm-sse2.patch @@ -0,0 +1,133 @@ +diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c +index 14dfbbe..32d69a2 100644 +--- a/c/blake3_dispatch.c ++++ b/c/blake3_dispatch.c +@@ -164,16 +164,22 @@ static + } + } + #endif + + void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { ++#if defined(__wasm__) ++ // wasm has no runtime CPU dispatch; always use the SSE2 implementation, which ++ // wasm_sse_compat.h compiles to Wasm SIMD. ++ blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); ++ return; ++#endif + #if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); + #if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); + return; + } +@@ -193,16 +199,20 @@ void blake3_compress_in_place(uint32_t cv[8], + #endif + blake3_compress_in_place_portable(cv, block, block_len, counter, flags); + } + + void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]) { ++#if defined(__wasm__) ++ blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); ++ return; ++#endif + #if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); + #if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); + return; + } +@@ -246,16 +256,21 @@ void blake3_xof_many(const uint32_t cv[8], + blake3_compress_xof(cv, block, 
block_len, counter + i, flags, out + 64*i); + } + } + + void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { ++#if defined(__wasm__) ++ blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, ++ increment_counter, flags, flags_start, flags_end, out); ++ return; ++#endif + #if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); + #if !defined(BLAKE3_NO_AVX512) + if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); +@@ -296,16 +311,19 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + } + + // The dynamically detected SIMD degree of the current platform. + size_t blake3_simd_degree(void) { ++#if defined(__wasm__) ++ return 4; // the SSE2 implementation processes 4 inputs at a time ++#endif + #if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); + #if !defined(BLAKE3_NO_AVX512) + if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + return 16; + } + #endif +diff --git a/c/blake3_impl.h b/c/blake3_impl.h +index 88e71e4..f89f9f7 100644 +--- a/c/blake3_impl.h ++++ b/c/blake3_impl.h +@@ -68,16 +68,19 @@ enum blake3_flags { + #endif + #else + #define BLAKE3_USE_NEON 0 + #endif + #endif + + #if defined(IS_X86) + #define MAX_SIMD_DEGREE 16 ++#elif defined(__wasm__) ++// We force the SSE2 implementation on wasm; it processes 4 inputs at a time. 
++#define MAX_SIMD_DEGREE 4 + #elif BLAKE3_USE_NEON == 1 + #define MAX_SIMD_DEGREE 4 + #else + #define MAX_SIMD_DEGREE 1 + #endif + + // There are some places where we want a static size that's equal to the + // MAX_SIMD_DEGREE, but also at least 2. +@@ -249,17 +252,17 @@ void blake3_compress_xof_portable(const uint32_t cv[8], + uint8_t flags, uint8_t out[64]); + + void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); + +-#if defined(IS_X86) ++#if defined(IS_X86) || defined(__wasm__) + #if !defined(BLAKE3_NO_SSE2) + void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, diff --git a/benchmarks/blake3-simd/wasm_sse_compat.h b/benchmarks/blake3-simd/wasm_sse_compat.h new file mode 100644 index 00000000..7b525c0e --- /dev/null +++ b/benchmarks/blake3-simd/wasm_sse_compat.h @@ -0,0 +1,75 @@ +// x86 SSE2 -> WebAssembly SIMD compatibility shim for BLAKE3's blake3_sse2.c. +// +// wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm (its +// `<immintrin.h>` #errors, and `-msse2` is rejected). BLAKE3's hand-written +// SSE2 kernels are written against those intrinsics, so this header provides +// exactly the `_mm_*` intrinsics blake3_sse2.c uses, implemented with the native +// Wasm SIMD intrinsics from `<wasm_simd128.h>`. On a native (x86) build it just +// includes the real header, so the same source compiles either way. 
+#pragma once + +#if defined(__wasm_simd128__) + +#include +#include + +typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__)); +typedef float __m128 __attribute__((__vector_size__(16), __may_alias__)); + +// --- Arithmetic / logical -------------------------------------------------- +static inline __m128i _mm_add_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_add((v128_t)(a), (v128_t)(b)); } +static inline __m128i _mm_sub_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_sub((v128_t)(a), (v128_t)(b)); } +static inline __m128i _mm_and_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_and((v128_t)(a), (v128_t)(b)); } +static inline __m128i _mm_or_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_or((v128_t)(a), (v128_t)(b)); } +static inline __m128i _mm_xor_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_xor((v128_t)(a), (v128_t)(b)); } +static inline __m128i _mm_andnot_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_andnot((v128_t)(b), (v128_t)(a)); } // (~a) & b +static inline __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_eq((v128_t)(a), (v128_t)(b)); } +static inline __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_gt((v128_t)(a), (v128_t)(b)); } + +// --- Splat / set ----------------------------------------------------------- +static inline __m128i _mm_set1_epi16(short x) { return (__m128i)wasm_i16x8_splat(x); } +static inline __m128i _mm_set1_epi32(int x) { return (__m128i)wasm_i32x4_splat(x); } +// `set` is high-lane-first; `make` is low-lane-first. 
+#define _mm_set_epi32(e3, e2, e1, e0) ((__m128i)wasm_i32x4_make((e0), (e1), (e2), (e3))) +#define _mm_setr_epi32(e0, e1, e2, e3) ((__m128i)wasm_i32x4_make((e0), (e1), (e2), (e3))) +#define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) \ + ((__m128i)wasm_i16x8_make((e0), (e1), (e2), (e3), (e4), (e5), (e6), (e7))) + +// --- Load / store (unaligned) ---------------------------------------------- +static inline __m128i _mm_loadu_si128(const __m128i *p) { return (__m128i)wasm_v128_load(p); } +static inline void _mm_storeu_si128(__m128i *p, __m128i a) { wasm_v128_store(p, (v128_t)(a)); } + +// --- Reinterpret casts ------------------------------------------------------ +static inline __m128 _mm_castsi128_ps(__m128i a) { return (__m128)(a); } +static inline __m128i _mm_castps_si128(__m128 a) { return (__m128i)(a); } + +// --- Shifts (immediate or runtime count) ----------------------------------- +static inline __m128i _mm_slli_epi32(__m128i a, int c) { return (__m128i)wasm_i32x4_shl((v128_t)(a), c); } +static inline __m128i _mm_srli_epi32(__m128i a, int c) { return (__m128i)wasm_u32x4_shr((v128_t)(a), c); } + +// --- Interleave (unpack) ---------------------------------------------------- +static inline __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23); } +static inline __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31); } +static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); } +static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle((v128_t)(a), (v128_t)(b), 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); } + +// --- Shuffles (immediate); macros 
so lane indices stay compile-time --------- +#define _mm_shuffle_epi32(a, imm) \ + ((__m128i)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(a), (imm) & 3, ((imm) >> 2) & 3, ((imm) >> 4) & 3, ((imm) >> 6) & 3)) +#define _mm_shuffle_ps(a, b, imm) \ + ((__m128)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(b), (imm) & 3, ((imm) >> 2) & 3, 4 + (((imm) >> 4) & 3), 4 + (((imm) >> 6) & 3))) +#define _mm_shufflelo_epi16(a, imm) \ + ((__m128i)wasm_i16x8_shuffle((v128_t)(a), (v128_t)(a), (imm) & 3, ((imm) >> 2) & 3, ((imm) >> 4) & 3, ((imm) >> 6) & 3, 4, 5, 6, 7)) +#define _mm_shufflehi_epi16(a, imm) \ + ((__m128i)wasm_i16x8_shuffle((v128_t)(a), (v128_t)(a), 0, 1, 2, 3, 4 + ((imm) & 3), 4 + (((imm) >> 2) & 3), 4 + (((imm) >> 4) & 3), 4 + (((imm) >> 6) & 3))) + +// --- Misc ------------------------------------------------------------------- +#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) +#define _MM_HINT_T0 3 +#define _mm_prefetch(p, hint) ((void)(p)) // no prefetch hint on wasm + +#else // !__wasm_simd128__ : native x86 build uses the real intrinsics. + +#include + +#endif diff --git a/benchmarks/hashset/HashSet.cpp b/benchmarks/hashset/HashSet.cpp index f18c53a1..5c478a59 100644 --- a/benchmarks/hashset/HashSet.cpp +++ b/benchmarks/hashset/HashSet.cpp @@ -29,7 +29,7 @@ #include // Compile with: xcrun clang++ -o HashSet HashSet.cpp -O2 -W -framework Foundation -licucore -std=c++11 -fvisibility=hidden -DNDEBUG=1 -// Or for wasm: em++ -o HashSet.js -o HashSet.html HashSet.cpp -O2 -W -std=c++11 -DNDEBUG=1 -g1 -s WASM=1 -s TOTAL_MEMORY=52428800 +// The Wasm build is defined by this benchmark's Dockerfile (wasi-sdk clang++). 
#define ALWAYS_INLINE inline __attribute__((__always_inline__)) diff --git a/benchmarks/intgemm-simd/Dockerfile b/benchmarks/intgemm-simd/Dockerfile index a00164ea..b46e2035 100644 --- a/benchmarks/intgemm-simd/Dockerfile +++ b/benchmarks/intgemm-simd/Dockerfile @@ -1,4 +1,32 @@ -FROM emscripten/emsdk:4.0.10 +# This two-phase Dockerfile allows us to avoid re-downloading APT packages and wasi-sdk with every +# build. + +# First, retrieve wasi-sdk: + +FROM ubuntu:24.04 AS builder +WORKDIR / +RUN apt update && apt install -y wget + +# Download and extract wasi-sdk. +RUN wget https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-28/wasi-sdk-28.0-x86_64-linux.tar.gz +RUN tar xvf wasi-sdk-28.0-x86_64-linux.tar.gz + +# Second, compile the benchmark to Wasm. + +FROM ubuntu:24.04 +WORKDIR / +COPY --from=builder /wasi-sdk-28.0-x86_64-linux /wasi-sdk/ +RUN apt update && apt install -y git cmake make patch + +# Set common env vars. +ENV CC=/wasi-sdk/bin/clang +ENV CXX=/wasi-sdk/bin/clang++ +ENV LD=/wasi-sdk/bin/lld +ENV CFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot +ENV CXXFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot +# Prepend wasi-sdk's bin while keeping the system PATH so `git`, `cmake`, `make` +# and `patch` still resolve. +ENV PATH=/wasi-sdk/bin:$PATH WORKDIR /usr/src RUN git clone https://github.com/kpu/intgemm.git @@ -7,22 +35,42 @@ WORKDIR /usr/src/intgemm RUN git checkout be3053515a8a04d19c6959a370eaf8b5a6eab686 COPY benchmark.cpp . COPY sightglass.h . - COPY patch-intgemm.diff . +COPY wasm_sse_compat.h . RUN patch -p 1 < patch-intgemm.diff -# Building static library +# wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm. `wasm_sse_compat.h` +# reimplements the `_mm_*` intrinsics intgemm uses on top of `<wasm_simd128.h>`. +# We expose it under the x86 intrinsic-header names in a shim directory placed +# first on the include path, so intgemm's `#include <emmintrin.h>` (etc.) resolve +# to the compat header instead of clang's x86-only headers (which #error on wasm). 
+RUN mkdir -p /shim && cp wasm_sse_compat.h /shim/ && \ + for h in emmintrin tmmintrin xmmintrin smmintrin pmmintrin nmmintrin immintrin; do \ + echo '#include "wasm_sse_compat.h"' > /shim/$h.h; \ + done + +# Flags shared by the library and the final link: emit Wasm SIMD (`-msimd128`), +# resolve x86 intrinsic headers to the compat shim, and define `WASM` (activates +# intgemm's patched SSSE3-only CPU selection — there is no runtime CPUID on Wasm). +# intgemm's AVX2/AVX512 feature-detection fails under wasi-sdk, so those kernels +# compile out. `-Wno-unknown-warning-option` keeps intgemm's `-Werror` build from +# choking on its Emscripten-only `-Wno-error=pthreads-mem-growth` flag. +# `-fno-exceptions` matches the other C++ benchmarks (gcc-loops, hashset) and +# avoids pulling in the C++ exception runtime, which wasi-sdk does not link by +# default; intgemm's only `throw` is already guarded behind `__EXCEPTIONS`. +ENV WASM_FLAGS="-O3 -msimd128 -fno-exceptions -I/shim -DWASM -Wno-unknown-warning-option" + +# Build the static library. RUN mkdir build WORKDIR /usr/src/intgemm/build -RUN emcmake cmake .. -DCOMPILE_WASM=1 -DCMAKE_CXX_FLAGS="-msimd128 -mssse3 -O3" -RUN emmake make intgemm +RUN cmake .. 
-DCMAKE_TOOLCHAIN_FILE=/wasi-sdk/share/cmake/wasi-sdk.cmake \ + -DCOMPILE_WASM=1 -DCMAKE_CXX_FLAGS="$WASM_FLAGS" +RUN make intgemm WORKDIR /usr/src/intgemm RUN mkdir /benchmark -RUN emcc -O3 -s STANDALONE_WASM=1 \ - -s INITIAL_MEMORY=33554432 -s MAXIMUM_MEMORY=33554432 \ - -s ALLOW_MEMORY_GROWTH=0 -s TOTAL_STACK=2097152 \ - -msimd128 -mssse3 -O3 -Ibuild \ +RUN $CXX $CXXFLAGS $WASM_FLAGS -Ibuild \ + -Wl,-z,stack-size=2097152 \ -o /benchmark/benchmark.wasm \ benchmark.cpp build/libintgemm.a # We output the Wasm file to the `/benchmark` directory, where the client diff --git a/benchmarks/intgemm-simd/README.md b/benchmarks/intgemm-simd/README.md index 43b4b8dd..059a96c3 100644 --- a/benchmarks/intgemm-simd/README.md +++ b/benchmarks/intgemm-simd/README.md @@ -4,4 +4,8 @@ Tests integer matrix multiplication. See https://github.com/kpu/intgemm -Using older emscripten 2.0 to avoid special pmaddwd sequences. Update emscripten image in the future and moar speed. \ No newline at end of file +Built with wasi-sdk (clang). intgemm's kernels are written with x86 SSE/SSSE3 +intrinsics, which wasi-sdk's clang cannot compile for Wasm, so `wasm_sse_compat.h` +reimplements the intrinsics intgemm uses on top of `<wasm_simd128.h>` and is +substituted for the x86 intrinsic headers at build time (see the `Dockerfile`). +`-DWASM` selects intgemm's SSSE3 kernels, since there is no runtime CPUID on Wasm. \ No newline at end of file diff --git a/benchmarks/intgemm-simd/benchmark.cpp b/benchmarks/intgemm-simd/benchmark.cpp index 308c2eb2..ac405e9d 100644 --- a/benchmarks/intgemm-simd/benchmark.cpp +++ b/benchmarks/intgemm-simd/benchmark.cpp @@ -20,44 +20,29 @@ #include #include -// This benchmark is built with Emscripten, whose libc filesystem does not reach -// WASI preopened directories. So, like the splay and blake3-simd benchmarks, we -// read the workload size from disk by calling the WASI `path_open`/`fd_read` -// syscalls directly. This lets the workload be resized without recompiling. 
-#define WASI_IMPORT(name) \ - __attribute__((import_module("wasi_snapshot_preview1"), import_name(name))) - -typedef struct { - const void *buf; - size_t len; -} wasi_iovec_t; - -WASI_IMPORT("path_open") -int wasi_path_open(int fd, int dirflags, const char *path, size_t path_len, - int oflags, uint64_t rights_base, uint64_t rights_inheriting, - int fdflags, int *opened_fd); - -WASI_IMPORT("fd_read") -int wasi_fd_read(int fd, const wasi_iovec_t *iovs, size_t iovs_len, size_t *nread); +#include +#include +// The dominant matrix's row count is read from `default.input` at runtime so the +// workload can be resized without recompiling (like the quicksort and +// blake3-simd benchmarks). static int read_int_from_file() { - const char *path = "default.input"; // preopen fd 3 is the benchmark dir - int fd = -1; - if (wasi_path_open(3, 0, path, strlen(path), 0, (1ULL << 1) | (1ULL << 2), - (1ULL << 1) | (1ULL << 2), 0, &fd) != 0 || fd < 0) { + const char *path = "default.input"; + int fd = open(path, O_RDONLY); + if (fd < 0) { std::cerr << "failed to open default.input" << std::endl; abort(); } char buf[64] = {0}; size_t total = 0; for (;;) { - wasi_iovec_t iov = {buf + total, sizeof(buf) - 1 - total}; - size_t nread = 0; - if (wasi_fd_read(fd, &iov, 1, &nread) != 0) { abort(); } + ssize_t nread = read(fd, buf + total, sizeof(buf) - 1 - total); + if (nread < 0) { abort(); } if (nread == 0 || total >= sizeof(buf) - 1) break; - total += nread; + total += (size_t)nread; } + close(fd); buf[total] = '\0'; return atoi(buf); } diff --git a/benchmarks/intgemm-simd/benchmark.wasm b/benchmarks/intgemm-simd/benchmark.wasm index af04210f..64fda66c 100755 Binary files a/benchmarks/intgemm-simd/benchmark.wasm and b/benchmarks/intgemm-simd/benchmark.wasm differ diff --git a/benchmarks/intgemm-simd/patch-intgemm.diff b/benchmarks/intgemm-simd/patch-intgemm.diff index 0102a570..4d2b6a44 100644 --- a/benchmarks/intgemm-simd/patch-intgemm.diff +++ 
b/benchmarks/intgemm-simd/patch-intgemm.diff @@ -2,7 +2,12 @@ diff --git a/intgemm/intgemm.cc b/intgemm/intgemm.cc index 58e4bc5..70679ee 100644 --- a/intgemm/intgemm.cc +++ b/intgemm/intgemm.cc -@@ -120,7 +120,11 @@ CPUType GetCPUID() { +@@ -115,17 +115,21 @@ CPUType GetCPUID() { + #ifdef INTGEMM_CPUID_ENVIRONMENT + std::min(RealCPUID(), EnvironmentCPUID()); + #else + RealCPUID(); + #endif return kLocalCPU; } @@ -14,11 +19,21 @@ index 58e4bc5..70679ee 100644 void UnsupportedCPUError() { #if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS) + throw UnsupportedCPU(); + #else + fprintf(stderr, "intgemm does not support this CPU.\n"); + abort(); + #endif diff --git a/intgemm/intgemm.h b/intgemm/intgemm.h index 26febb5..616fe0a 100644 --- a/intgemm/intgemm.h +++ b/intgemm/intgemm.h -@@ -150,8 +150,13 @@ CPUType GetCPUID(); +@@ -145,18 +145,23 @@ CPUType GetCPUID(); + * + * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit) + * + * sse2 if the CPU supports SSE2 + * * unsupported otherwise */ template T ChooseCPU(T avx512vnni, T avx512bw, T avx2, T ssse3, T sse2, T unsupported) { @@ -32,3 +47,37 @@ index 26febb5..616fe0a 100644 } struct TileInfo { + const Index a_rows; + const Index a_cols; + const Index b_rows; + const Index b_cols; + }; +diff --git a/intgemm/types.h b/intgemm/types.h +index 44fb4e2..048339e 100644 +--- a/intgemm/types.h ++++ b/intgemm/types.h +@@ -14,18 +14,23 @@ + #include + #include + #include + #include + #include + #include + #endif + +-#if (defined(_MSC_VER) && !defined(__clang__)) || defined(__INTEL_COMPILER) ++#if (defined(_MSC_VER) && !defined(__clang__)) || defined(__INTEL_COMPILER) || defined(__wasm__) + /* Real MSVC does not appear to have target attributes but is also fine with ++ * just using intrinsics anywhere. The same is true for our wasm build, which ++ * compiles the SSE intrinsics through wasm_sse_compat.h and has no notion of ++ * x86 target attributes. 
++ * Original comment follows: ++ * Real MSVC does not appear to have target attributes but is also fine with + * just using intrinsics anywhere. clang-cl pretending to be MSVC requires + * target attributes, so it's excluded from the above. + * + * The Intel compiler has a bug whereby constructors with target attributes do + * not link. Like this program doesn't compile with icpc: + * class Foo { + * public: + * __attribute__ ((target ("avx2"))) Foo() {} diff --git a/benchmarks/intgemm-simd/wasm_sse_compat.h b/benchmarks/intgemm-simd/wasm_sse_compat.h new file mode 100644 index 00000000..fc5cca44 --- /dev/null +++ b/benchmarks/intgemm-simd/wasm_sse_compat.h @@ -0,0 +1,167 @@ +// x86 SSE/SSSE3 -> WebAssembly SIMD compatibility shim for intgemm. +// +// wasi-sdk's clang cannot compile x86 SSE intrinsics for wasm (its +// ``/`` #error out, and `-msse*` are rejected), and +// intgemm's kernels are written against those intrinsics. Rather than pull in a +// whole library, this header provides exactly the `_mm_*` intrinsics intgemm +// uses, implemented with the native Wasm SIMD intrinsics from +// ``. On a native (x86) target it just includes the real +// headers, so the same intgemm source builds either way. +// +// Note: __m128, __m128i and __m128d are defined as *distinct* vector types +// (different element types) on purpose. intgemm overloads functions on them +// (e.g. callbacks' run_callbacks), so they must not collapse to a single type +// the way wasm's `v128_t` would. +#pragma once + +#if defined(__wasm_simd128__) + +#include +#include + +typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__)); +typedef float __m128 __attribute__((__vector_size__(16), __may_alias__)); +typedef double __m128d __attribute__((__vector_size__(16), __may_alias__)); + +// Reinterpret (bit-cast) helpers between our typed vectors and wasm's v128_t. 
+#define INTGEMM_W(x) ((v128_t)(x)) + +// --- Load / store / set / zero --------------------------------------------- +static inline __m128 _mm_load_ps(const float *p) { return (__m128)wasm_v128_load(p); } +static inline __m128 _mm_loadu_ps(const float *p) { return (__m128)wasm_v128_load(p); } +static inline void _mm_storeu_ps(float *p, __m128 a) { wasm_v128_store(p, INTGEMM_W(a)); } +static inline __m128i _mm_set1_epi8(int8_t v) { return (__m128i)wasm_i8x16_splat(v); } +static inline __m128i _mm_set1_epi16(int16_t v) { return (__m128i)wasm_i16x8_splat(v); } +static inline __m128i _mm_set1_epi32(int32_t v) { return (__m128i)wasm_i32x4_splat(v); } +static inline __m128 _mm_set1_ps(float v) { return (__m128)wasm_f32x4_splat(v); } +static inline __m128d _mm_set1_pd(double v) { return (__m128d)wasm_f64x2_splat(v); } +static inline __m128i _mm_setzero_si128(void) { return (__m128i)wasm_i64x2_const(0, 0); } +static inline __m128 _mm_setzero_ps(void) { return (__m128)wasm_i64x2_const(0, 0); } +static inline __m128d _mm_setzero_pd(void) { return (__m128d)wasm_i64x2_const(0, 0); } + +// --- Integer arithmetic ---------------------------------------------------- +static inline __m128i _mm_add_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_add(INTGEMM_W(a), INTGEMM_W(b)); } +static inline __m128i _mm_add_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_add(INTGEMM_W(a), INTGEMM_W(b)); } +static inline __m128i _mm_add_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_add(INTGEMM_W(a), INTGEMM_W(b)); } +static inline __m128i _mm_sub_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_sub(INTGEMM_W(a), INTGEMM_W(b)); } +static inline __m128i _mm_adds_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_add_sat(INTGEMM_W(a), INTGEMM_W(b)); } +static inline __m128i _mm_abs_epi8(__m128i a) { return (__m128i)wasm_i8x16_abs(INTGEMM_W(a)); } +static inline __m128i _mm_max_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_max(INTGEMM_W(a), 
INTGEMM_W(b)); } +static inline __m128i _mm_max_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_max(INTGEMM_W(a), INTGEMM_W(b)); } +static inline __m128i _mm_mullo_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_mul(INTGEMM_W(a), INTGEMM_W(b)); } + +// High 16 bits of signed 16-bit products. +static inline __m128i _mm_mulhi_epi16(__m128i a, __m128i b) { + v128_t lo = wasm_i32x4_extmul_low_i16x8(INTGEMM_W(a), INTGEMM_W(b)); + v128_t hi = wasm_i32x4_extmul_high_i16x8(INTGEMM_W(a), INTGEMM_W(b)); + // Pick the high 16 bits (odd 16-bit lanes) of each 32-bit product. + return (__m128i)wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15); +} +// Unsigned 32x32 -> 64-bit products of the even 32-bit lanes (0 and 2). +static inline __m128i _mm_mul_epu32(__m128i a, __m128i b) { + v128_t ea = wasm_i32x4_shuffle(INTGEMM_W(a), INTGEMM_W(a), 0, 2, 0, 2); + v128_t eb = wasm_i32x4_shuffle(INTGEMM_W(b), INTGEMM_W(b), 0, 2, 0, 2); + return (__m128i)wasm_u64x2_extmul_low_u32x4(ea, eb); +} +// Multiply adjacent unsigned*signed bytes, add pairs with signed saturation. +static inline __m128i _mm_maddubs_epi16(__m128i a, __m128i b) { + v128_t alo = wasm_u16x8_extend_low_u8x16(INTGEMM_W(a)); + v128_t ahi = wasm_u16x8_extend_high_u8x16(INTGEMM_W(a)); + v128_t blo = wasm_i16x8_extend_low_i8x16(INTGEMM_W(b)); + v128_t bhi = wasm_i16x8_extend_high_i8x16(INTGEMM_W(b)); + v128_t plo = wasm_i16x8_mul(alo, blo); // products of bytes 0..7 + v128_t phi = wasm_i16x8_mul(ahi, bhi); // products of bytes 8..15 + v128_t even = wasm_i16x8_shuffle(plo, phi, 0, 2, 4, 6, 8, 10, 12, 14); + v128_t odd = wasm_i16x8_shuffle(plo, phi, 1, 3, 5, 7, 9, 11, 13, 15); + return (__m128i)wasm_i16x8_add_sat(even, odd); +} +// Multiply 16-bit pairs and add adjacent into 32-bit lanes. +static inline __m128i _mm_madd_epi16(__m128i a, __m128i b) { + return (__m128i)wasm_i32x4_dot_i16x8(INTGEMM_W(a), INTGEMM_W(b)); +} +// Negate/zero each byte of `a` according to the sign of the matching byte of `b`. 
+static inline __m128i _mm_sign_epi8(__m128i a, __m128i b) {
+  v128_t z = wasm_i8x16_splat(0);
+  v128_t neg = wasm_i8x16_neg(INTGEMM_W(a));
+  v128_t ltz = wasm_i8x16_lt(INTGEMM_W(b), z); // b < 0
+  v128_t eqz = wasm_i8x16_eq(INTGEMM_W(b), z); // b == 0
+  v128_t r = wasm_v128_bitselect(neg, INTGEMM_W(a), ltz); // -a where b < 0, otherwise a
+  return (__m128i)wasm_v128_andnot(r, eqz); // zero where b == 0
+}
+
+// --- Float / double arithmetic (NOTE(review): wasm min/max may treat NaN differently from SSE; confirm callers never pass NaN) ---
+static inline __m128 _mm_add_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_add(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_sub_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_sub(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_mul_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_mul(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_div_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_div(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_min_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_min(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_max_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_max(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128d _mm_add_pd(__m128d a, __m128d b) { return (__m128d)wasm_f64x2_add(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128d _mm_sub_pd(__m128d a, __m128d b) { return (__m128d)wasm_f64x2_sub(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128d _mm_mul_pd(__m128d a, __m128d b) { return (__m128d)wasm_f64x2_mul(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128d _mm_max_pd(__m128d a, __m128d b) { return (__m128d)wasm_f64x2_max(INTGEMM_W(a), INTGEMM_W(b)); }
+
+// --- Conversions (NOTE(review): trunc_sat saturates out-of-range inputs, unlike SSE's 0x80000000 result; confirm callers stay in range) ---
+static inline __m128 _mm_cvtepi32_ps(__m128i a) { return (__m128)wasm_f32x4_convert_i32x4(INTGEMM_W(a)); }
+static inline __m128i _mm_cvttps_epi32(__m128 a) { return (__m128i)wasm_i32x4_trunc_sat_f32x4(INTGEMM_W(a)); }
+// Round to nearest (ties to even), then convert, matching SSE's default mode.
+static inline __m128i _mm_cvtps_epi32(__m128 a) { return (__m128i)wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_nearest(INTGEMM_W(a))); }
+static inline __m128 _mm_castsi128_ps(__m128i a) { return (__m128)(a); }
+
+// --- Bitwise ----------------------------------------------------------------
+static inline __m128i _mm_and_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_and(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_or_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_or(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_xor_si128(__m128i a, __m128i b) { return (__m128i)wasm_v128_xor(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_and_ps(__m128 a, __m128 b) { return (__m128)wasm_v128_and(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_andnot_ps(__m128 a, __m128 b) { return (__m128)wasm_v128_andnot(INTGEMM_W(b), INTGEMM_W(a)); } // (~a) & b: wasm_v128_andnot(x, y) is x & ~y, hence the swapped operands
+
+// --- Comparisons (produce all-ones / all-zeros masks) ----------------------
+static inline __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_eq(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_gt(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_cmplt_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_lt(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_gt(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_cmplt_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i32x4_lt(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_cmplt_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_lt(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128 _mm_cmpneq_ps(__m128 a, __m128 b) { return (__m128)wasm_f32x4_ne(INTGEMM_W(a), INTGEMM_W(b)); }
+
+// --- Saturating narrowing (pack): `a` fills the low half of the result, `b` the high half ---
+static inline __m128i _mm_packs_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_narrow_i16x8(INTGEMM_W(a), INTGEMM_W(b)); }
+static inline __m128i _mm_packs_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i16x8_narrow_i32x4(INTGEMM_W(a), INTGEMM_W(b)); }
+
+// --- Interleave (unpack): byte shuffles; indices 0..15 pick from `a`, 16..31 from `b` ---
+static inline __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); }
+static inline __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); }
+static inline __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23); }
+static inline __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31); }
+static inline __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23); }
+static inline __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31); }
+static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); }
+static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { return (__m128i)wasm_i8x16_shuffle(INTGEMM_W(a), INTGEMM_W(b), 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); }
+
+// --- Shifts by a (compile-time or runtime) count. Wasm takes the count modulo the lane width, so counts past the lane width are handled explicitly to match SSE: 0 for logical shifts, sign fill for arithmetic ones. Only the low 8 bits of `c` are used, like an imm8. ---
+static inline __m128i _mm_slli_epi16(__m128i a, int c) { return (c & 0xFF) > 15 ? (__m128i)wasm_i64x2_const(0, 0) : (__m128i)wasm_i16x8_shl(INTGEMM_W(a), c & 0xFF); }
+static inline __m128i _mm_srli_epi16(__m128i a, int c) { return (c & 0xFF) > 15 ? (__m128i)wasm_i64x2_const(0, 0) : (__m128i)wasm_u16x8_shr(INTGEMM_W(a), c & 0xFF); }
+static inline __m128i _mm_srai_epi16(__m128i a, int c) { return (__m128i)wasm_i16x8_shr(INTGEMM_W(a), (c & 0xFF) > 15 ? 15 : (c & 0xFF)); }
+static inline __m128i _mm_srai_epi32(__m128i a, int c) { return (__m128i)wasm_i32x4_shr(INTGEMM_W(a), (c & 0xFF) > 31 ? 31 : (c & 0xFF)); }
+
+// --- Shuffles (immediate); macros so the lane indices stay compile-time. In _mm_shuffle_ps the two low result lanes come from `a` and the two high ones from `b` (indices 4..7). ---
+#define _mm_shuffle_epi32(a, imm) \
+  ((__m128i)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(a), (imm) & 3, ((imm) >> 2) & 3, ((imm) >> 4) & 3, ((imm) >> 6) & 3))
+#define _mm_shuffle_ps(a, b, imm) \
+  ((__m128)wasm_i32x4_shuffle((v128_t)(a), (v128_t)(b), (imm) & 3, ((imm) >> 2) & 3, 4 + (((imm) >> 4) & 3), 4 + (((imm) >> 6) & 3)))
+// Byte-wise shift right of the whole 128-bit value (shifting in zeros). `imm` must be a constant in 0..16 so every index stays within the 32 shuffle lanes; indices 16 and up read the zero vector.
+#define _mm_srli_si128(a, imm) \
+  ((__m128i)wasm_i8x16_shuffle((v128_t)(a), wasm_i8x16_splat(0), \
+      (imm) + 0, (imm) + 1, (imm) + 2, (imm) + 3, (imm) + 4, (imm) + 5, (imm) + 6, (imm) + 7, \
+      (imm) + 8, (imm) + 9, (imm) + 10, (imm) + 11, (imm) + 12, (imm) + 13, (imm) + 14, (imm) + 15))
+
+#undef INTGEMM_W
+
+#else // !__wasm_simd128__ : native x86 build uses the real intrinsics. NOTE(review): the four header names below were missing from this text and are reconstructed; confirm against the original file.
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+
+#endif
diff --git a/benchmarks/noop/Dockerfile b/benchmarks/noop/Dockerfile
deleted file mode 120000
index edc75978..00000000
--- a/benchmarks/noop/Dockerfile
+++ /dev/null
@@ -1 +0,0 @@
-../Dockerfile.emscripten
\ No newline at end of file
diff --git a/benchmarks/noop/Dockerfile b/benchmarks/noop/Dockerfile
new file mode 100644
index 00000000..f6883aee
--- /dev/null
+++ b/benchmarks/noop/Dockerfile
@@ -0,0 +1,34 @@
+# This two-phase Dockerfile allows us to avoid re-downloading APT packages and wasi-sdk with every
+# build.
+
+# First, retrieve wasi-sdk:
+
+FROM ubuntu:24.04 AS builder
+WORKDIR /
+RUN apt update && apt install -y wget
+
+# Download and extract wasi-sdk.
+RUN wget https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-28/wasi-sdk-28.0-x86_64-linux.tar.gz
+RUN tar xvf wasi-sdk-28.0-x86_64-linux.tar.gz
+
+# Second, compile the benchmark to Wasm.
+
+FROM ubuntu:24.04
+WORKDIR /
+COPY --from=builder /wasi-sdk-28.0-x86_64-linux /wasi-sdk/
+
+# Set common env vars. wasi-sdk's bin is prepended to PATH and the system PATH is kept, so standard tools still resolve.
+ENV CC=/wasi-sdk/bin/clang
+ENV CXX=/wasi-sdk/bin/clang++
+ENV LD=/wasi-sdk/bin/lld
+ENV CFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
+ENV CXXFLAGS=--sysroot=/wasi-sdk/share/wasi-sysroot
+ENV PATH=/wasi-sdk/bin:$PATH
+
+# Copy the sources into `/`, then compile `benchmark.c` from `/benchmark` to `./benchmark.wasm`.
+COPY benchmark.c .
+COPY sightglass.h .
+WORKDIR /benchmark
+RUN $CC $CFLAGS ../benchmark.c -O3 -g -DNDEBUG -I.. -o benchmark.wasm
+# We output the Wasm file to the `/benchmark` directory, where the client
+# expects it.
diff --git a/benchmarks/noop/benchmark.wasm b/benchmarks/noop/benchmark.wasm
index 7449a879..522a60de 100755
Binary files a/benchmarks/noop/benchmark.wasm and b/benchmarks/noop/benchmark.wasm differ