From 6898d10f95b7dd107e2c707915f030fb0e8a2286 Mon Sep 17 00:00:00 2001 From: sunteng Date: Thu, 21 May 2026 20:03:46 +0800 Subject: [PATCH 1/2] add feature_alpha_dropout, mse_loss, flip, fliplr, pixel_unshuffle operators with three-platform tuned launch configs Signed-off-by: sunteng --- bench/tune_flip.py | 118 ++++++++++ bench/tune_mse_loss.py | 170 ++++++++++++++ bench/tune_pixel_unshuffle.py | 132 +++++++++++ src/ntops/kernels/__init__.py | 8 + src/ntops/kernels/feature_alpha_dropout.py | 24 ++ src/ntops/kernels/flip.py | 37 ++++ src/ntops/kernels/mse_loss.py | 52 +++++ src/ntops/kernels/pixel_unshuffle.py | 17 ++ src/ntops/torch/__init__.py | 10 + src/ntops/torch/feature_alpha_dropout.py | 37 ++++ src/ntops/torch/flip.py | 51 +++++ src/ntops/torch/fliplr.py | 10 + src/ntops/torch/mse_loss.py | 81 +++++++ src/ntops/torch/pixel_unshuffle.py | 61 +++++ tests/test_feature_alpha_dropout.py | 246 +++++++++++++++++++++ tests/test_flip.py | 224 +++++++++++++++++++ tests/test_fliplr.py | 196 ++++++++++++++++ tests/test_mse_loss.py | 192 ++++++++++++++++ tests/test_pixel_unshuffle.py | 190 ++++++++++++++++ 19 files changed, 1856 insertions(+) create mode 100644 bench/tune_flip.py create mode 100644 bench/tune_mse_loss.py create mode 100644 bench/tune_pixel_unshuffle.py create mode 100644 src/ntops/kernels/feature_alpha_dropout.py create mode 100644 src/ntops/kernels/flip.py create mode 100644 src/ntops/kernels/mse_loss.py create mode 100644 src/ntops/kernels/pixel_unshuffle.py create mode 100644 src/ntops/torch/feature_alpha_dropout.py create mode 100644 src/ntops/torch/flip.py create mode 100644 src/ntops/torch/fliplr.py create mode 100644 src/ntops/torch/mse_loss.py create mode 100644 src/ntops/torch/pixel_unshuffle.py create mode 100644 tests/test_feature_alpha_dropout.py create mode 100644 tests/test_flip.py create mode 100644 tests/test_fliplr.py create mode 100644 tests/test_mse_loss.py create mode 100644 tests/test_pixel_unshuffle.py diff --git 
a/bench/tune_flip.py b/bench/tune_flip.py new file mode 100644 index 0000000..b4c9fba --- /dev/null +++ b/bench/tune_flip.py @@ -0,0 +1,118 @@ +"""Tune the pinned launch config (block_size / num_warps / num_stages) for +``ntops.torch.flip`` on the current GPU. + +Performance evaluation runs with auto-tuning disabled (``max_num_configs=1``), +so the values baked into ``ntops/torch/flip.py`` decide the score. This script +sweeps a small grid under those exact conditions and prints, per shape, the +fastest config plus the speedup over ``torch.flip``. + +Usage +----- + python bench/tune_flip.py +""" + +import itertools + +import torch + +import ntops +from ntops.torch.utils import _cached_make + +# Shapes that matter for a bandwidth-bound op: the medium case that has not yet +# saturated memory, and the large cases at the bandwidth ceiling. Small shapes +# are launch-overhead bound and not informative for config tuning. +_SHAPES = [ + ([4096, 4096], (0, 1)), + ([4096, 4096], (1,)), + ([8192, 8192], (1,)), + ([8192, 8192], (0,)), +] + +_BLOCK_SIZES = [512, 1024, 2048, 4096, 8192] +_NUM_WARPS = [4, 8, 16] +_NUM_STAGES = [1, 2] + +_DTYPES = [torch.float32, torch.float16] + + +def _time(fn, n_warmup=10, n_repeat=50): + for _ in range(n_warmup): + fn() + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + for _ in range(n_repeat): + fn() + end.record() + torch.cuda.synchronize() + return start.elapsed_time(end) / n_repeat + + +def _run_config(input, output, dims, block_size, num_warps, num_stages): + kernel = _cached_make( + ntops.kernels.flip.premake, + input.ndim, + dims, + block_size=block_size, + num_warps=num_warps, + num_stages=num_stages, + max_num_configs=1, + ) + return lambda: kernel(input, output) + + +def tune(): + if not torch.cuda.is_available(): + raise RuntimeError("CUDA not available") + + for dtype in _DTYPES: + print(f"\n{'='*92}") + print(f"flip config sweep | 
dtype={dtype} | device={torch.cuda.get_device_name()}") + print("=" * 92) + + for shape, dims in _SHAPES: + input = torch.randn(shape, dtype=dtype, device="cuda") + output = torch.empty(shape, dtype=dtype, device="cuda") + num_bytes = input.numel() * input.element_size() * 2 + + torch_ms = _time(lambda: torch.flip(input, list(dims))) + + results = [] + for bs, nw, ns in itertools.product( + _BLOCK_SIZES, _NUM_WARPS, _NUM_STAGES + ): + try: + fn = _run_config(input, output, dims, bs, nw, ns) + ms = _time(fn) + except Exception as exc: # noqa: BLE001 + print(f" skip bs={bs} nw={nw} ns={ns}: {type(exc).__name__}") + continue + results.append((ms, bs, nw, ns)) + + results.sort() + best_ms, bbs, bnw, bns = results[0] + best_gbps = num_bytes / (best_ms * 1e-3) / 1e9 + torch_gbps = num_bytes / (torch_ms * 1e-3) / 1e9 + + print( + f"\nshape={shape} dims={dims} " + f"(torch {torch_ms:.4f} ms / {torch_gbps:.0f} GB/s)" + ) + print( + f" BEST block_size={bbs:<5} num_warps={bnw:<3} num_stages={bns} " + f"-> {best_ms:.4f} ms / {best_gbps:.0f} GB/s " + f"(speedup vs torch {torch_ms / best_ms:.2f})" + ) + print(" top 5:") + for ms, bs, nw, ns in results[:5]: + gbps = num_bytes / (ms * 1e-3) / 1e9 + print( + f" block_size={bs:<5} num_warps={nw:<3} num_stages={ns} " + f"{ms:.4f} ms / {gbps:.0f} GB/s" + ) + + +if __name__ == "__main__": + tune() diff --git a/bench/tune_mse_loss.py b/bench/tune_mse_loss.py new file mode 100644 index 0000000..a0d0920 --- /dev/null +++ b/bench/tune_mse_loss.py @@ -0,0 +1,170 @@ +"""Tune the pinned launch configs for ``ntops.torch.mse_loss`` on the current +GPU. + +Two kernels are tuned independently: + * the reduction path (``reduction="mean"|"sum"``) -- the defining, perf + critical kernel; partials buffer size depends on ``block_size``; + * the element-wise path (``reduction="none"``). + +Performance evaluation runs with auto-tuning disabled (``max_num_configs=1``), +so the values baked into ``ntops/torch/mse_loss.py`` decide the score. 
This +script sweeps a small grid under those exact conditions and prints, per shape, +the fastest config plus the speedup over ``torch.nn.functional.mse_loss``. + +Usage +----- + python bench/tune_mse_loss.py +""" + +import itertools +import math + +import torch +import torch.nn.functional as F + +import ntops +from ntops.torch.utils import _cached_make + +# Numbers of elements to tune over (bandwidth-bound regime). Small sizes are +# launch-overhead bound and not informative for config selection. +_NUMELS = [1024 * 1024, 4096 * 4096, 8192 * 8192] + +_BLOCK_SIZES = [512, 1024, 2048, 4096, 8192] +_NUM_WARPS = [4, 8, 16] +_NUM_STAGES = [1, 2] + +_DTYPES = [torch.float32, torch.float16] + + +def _time(fn, n_warmup=10, n_repeat=50): + for _ in range(n_warmup): + fn() + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + for _ in range(n_repeat): + fn() + end.record() + torch.cuda.synchronize() + return start.elapsed_time(end) / n_repeat + + +def _reduce_runner(flat_in, flat_tg, block_size, num_warps, num_stages): + numel = flat_in.numel() + num_partials = max(1, math.ceil(numel / block_size)) + partials = torch.empty(num_partials, dtype=torch.float32, device=flat_in.device) + + kernel = _cached_make( + ntops.kernels.mse_loss.reduce_premake, + block_size=block_size, + num_warps=num_warps, + num_stages=num_stages, + max_num_configs=1, + ) + + def run(): + kernel(flat_in, flat_tg, partials) + return partials.sum() + + return run + + +def _none_runner(input, target, output, block_size, num_warps, num_stages): + kernel = _cached_make( + ntops.kernels.mse_loss.premake, + input.ndim, + block_size=block_size, + num_warps=num_warps, + num_stages=num_stages, + max_num_configs=1, + ) + return lambda: kernel(input, target, output) + + +def _sweep(label, make_runner, num_bytes, torch_ms): + results = [] + for bs, nw, ns in itertools.product(_BLOCK_SIZES, _NUM_WARPS, _NUM_STAGES): + try: + ms = 
_time(make_runner(bs, nw, ns)) + except Exception as exc: # noqa: BLE001 + print(f" skip bs={bs} nw={nw} ns={ns}: {type(exc).__name__}") + continue + results.append((ms, bs, nw, ns)) + + results.sort() + best_ms, bbs, bnw, bns = results[0] + best_gbps = num_bytes / (best_ms * 1e-3) / 1e9 + torch_gbps = num_bytes / (torch_ms * 1e-3) / 1e9 + + print(f"\n [{label}] (torch {torch_ms:.4f} ms / {torch_gbps:.0f} GB/s)") + print( + f" BEST block_size={bbs:<5} num_warps={bnw:<3} num_stages={bns} " + f"-> {best_ms:.4f} ms / {best_gbps:.0f} GB/s " + f"(speedup vs torch {torch_ms / best_ms:.2f})" + ) + for ms, bs, nw, ns in results[:5]: + gbps = num_bytes / (ms * 1e-3) / 1e9 + print( + f" block_size={bs:<5} num_warps={nw:<3} num_stages={ns} " + f"{ms:.4f} ms / {gbps:.0f} GB/s" + ) + + +def _check_reduce_correctness(dtype): + """Sanity check that the reduction kernel matches F.mse_loss before trusting + any timing numbers.""" + x = torch.randn(40000, dtype=dtype, device="cuda") + y = torch.randn(40000, dtype=dtype, device="cuda") + run = _reduce_runner(x, y, 1024, 4, 1) + got = (run() / x.numel()).to(dtype) + ref = F.mse_loss(x, y, reduction="mean") + tol = 1e-3 if dtype == torch.float32 else 1e-2 + assert torch.allclose(got, ref, rtol=tol, atol=tol), (got.item(), ref.item()) + + +def tune(): + if not torch.cuda.is_available(): + raise RuntimeError("CUDA not available") + + for dtype in _DTYPES: + _check_reduce_correctness(dtype) + itemsize = torch.empty(0, dtype=dtype).element_size() + + print(f"\n{'='*92}") + print( + f"mse_loss config sweep | dtype={dtype} | " + f"device={torch.cuda.get_device_name()}" + ) + print("=" * 92) + + for numel in _NUMELS: + side = int(round(numel**0.5)) + input = torch.randn(numel, dtype=dtype, device="cuda") + target = torch.randn(numel, dtype=dtype, device="cuda") + output = torch.empty_like(input) + + print(f"\nnumel={numel} (~{side}^2, {numel * itemsize / 1e6:.1f} MB)") + + # Reduction path: reads input + target (2x). 
+ torch_ms = _time(lambda: F.mse_loss(input, target, reduction="sum")) + _sweep( + "reduce (sum/mean)", + lambda bs, nw, ns: _reduce_runner(input, target, bs, nw, ns), + numel * itemsize * 2, + torch_ms, + ) + + # Element-wise path: reads input + target, writes output (3x). + torch_ms = _time(lambda: F.mse_loss(input, target, reduction="none")) + _sweep( + "none (element-wise)", + lambda bs, nw, ns: _none_runner(input, target, output, bs, nw, ns), + numel * itemsize * 3, + torch_ms, + ) + + +if __name__ == "__main__": + tune() diff --git a/bench/tune_pixel_unshuffle.py b/bench/tune_pixel_unshuffle.py new file mode 100644 index 0000000..ec29b35 --- /dev/null +++ b/bench/tune_pixel_unshuffle.py @@ -0,0 +1,132 @@ +"""Tune the pinned launch config for ``ntops.torch.pixel_unshuffle`` on the +current GPU. + +The kernel is an element-wise copy of a strided (permuted) view. Performance +evaluation runs with auto-tuning disabled (``max_num_configs=1``), so the values +baked into ``ntops/torch/pixel_unshuffle.py`` decide the score. This script +sweeps ``block_size`` / ``num_warps`` / ``num_stages`` under those exact +conditions and prints, per shape, the fastest config plus the speedup over +``F.pixel_unshuffle``. ``num_stages`` is expected to be a no-op (one block per +program, no inner loop). + +Usage +----- + python bench/tune_pixel_unshuffle.py +""" + +import itertools + +import torch +import torch.nn.functional as F + +import ntops +from ntops.torch.utils import _cached_make + +# (shape, downscale_factor) -- the bandwidth-bound regime. 
+_CASES = [ + ([16, 128, 64, 64], 2), + ([32, 256, 64, 64], 2), + ([8, 64, 128, 128], 2), + ([4, 8, 12, 12], 3), +] + +_BLOCK_SIZES = [256, 512, 1024, 2048, 4096, 8192] +_NUM_WARPS = [4, 8, 16] +_NUM_STAGES = [1, 2] + +_DTYPES = [torch.float32, torch.float16] + + +def _time(fn, n_warmup=10, n_repeat=50): + for _ in range(n_warmup): + fn() + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + for _ in range(n_repeat): + fn() + end.record() + torch.cuda.synchronize() + return start.elapsed_time(end) / n_repeat + + +def _runner(input, r, block_size, num_warps, num_stages): + *batch, c, h, w = input.shape + h_, w_ = h // r, w // r + src = input.reshape(*batch, c, h_, r, w_, r).movedim((-3, -1), (-4, -3)) + output = torch.empty( + (*batch, c, r, r, h_, w_), dtype=input.dtype, device=input.device + ) + + kernel = _cached_make( + ntops.kernels.pixel_unshuffle.premake, + src.ndim, + block_size=block_size, + num_warps=num_warps, + num_stages=num_stages, + max_num_configs=1, + ) + return lambda: kernel(src, output) + + +def _check_correctness(input, r): + expected = F.pixel_unshuffle(input, r) + got = ntops.torch.pixel_unshuffle(input, r) + assert torch.equal(got, expected), "pixel_unshuffle mismatch vs torch" + + +def tune(): + if not torch.cuda.is_available(): + raise RuntimeError("CUDA not available") + + for dtype in _DTYPES: + print(f"\n{'='*96}") + print( + f"pixel_unshuffle config sweep | dtype={dtype} | " + f"device={torch.cuda.get_device_name()}" + ) + print("=" * 96) + + for shape, r in _CASES: + input = torch.randn(shape, dtype=dtype, device="cuda") + _check_correctness(input, r) + + num_bytes = input.numel() * input.element_size() * 2 + torch_ms = _time(lambda: F.pixel_unshuffle(input, r)) + + results = [] + for bs, nw, ns in itertools.product( + _BLOCK_SIZES, _NUM_WARPS, _NUM_STAGES + ): + try: + ms = _time(_runner(input, r, bs, nw, ns)) + except Exception as exc: # noqa: 
BLE001 + print(f" skip bs={bs} nw={nw} ns={ns}: {type(exc).__name__}") + continue + results.append((ms, bs, nw, ns)) + + results.sort() + best_ms, bbs, bnw, bns = results[0] + best_gbps = num_bytes / (best_ms * 1e-3) / 1e9 + torch_gbps = num_bytes / (torch_ms * 1e-3) / 1e9 + + print( + f"\nshape={shape} r={r} (torch {torch_ms:.4f} ms / {torch_gbps:.0f} GB/s)" + ) + print( + f" BEST block_size={bbs:<5} num_warps={bnw:<3} num_stages={bns} " + f"-> {best_ms:.4f} ms / {best_gbps:.0f} GB/s " + f"(speedup vs torch {torch_ms / best_ms:.2f})" + ) + for ms, bs, nw, ns in results[:5]: + gbps = num_bytes / (ms * 1e-3) / 1e9 + print( + f" block_size={bs:<5} num_warps={nw:<3} num_stages={ns} " + f"{ms:.4f} ms / {gbps:.0f} GB/s" + ) + + +if __name__ == "__main__": + tune() diff --git a/src/ntops/kernels/__init__.py b/src/ntops/kernels/__init__.py index f6934ef..9e2dc2e 100644 --- a/src/ntops/kernels/__init__.py +++ b/src/ntops/kernels/__init__.py @@ -13,7 +13,9 @@ div, dropout, eq, + feature_alpha_dropout, exp, + flip, ge, gelu, gt, @@ -24,9 +26,11 @@ lt, max_pool2d, mm, + mse_loss, mul, ne, neg, + pixel_unshuffle, pow, relu, rms_norm, @@ -56,7 +60,9 @@ "div", "dropout", "eq", + "feature_alpha_dropout", "exp", + "flip", "ge", "gelu", "gt", @@ -67,9 +73,11 @@ "lt", "max_pool2d", "mm", + "mse_loss", "mul", "ne", "neg", + "pixel_unshuffle", "pow", "relu", "rms_norm", diff --git a/src/ntops/kernels/feature_alpha_dropout.py b/src/ntops/kernels/feature_alpha_dropout.py new file mode 100644 index 0000000..483ef3d --- /dev/null +++ b/src/ntops/kernels/feature_alpha_dropout.py @@ -0,0 +1,24 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, noise, scale, output): + output = ntl.where(noise > 0, input * scale, -1.7580993408473766) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, 
block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=ninetoothed.float32), # bernoulli noise mask + Tensor(0, dtype=ninetoothed.float64), # scale = 1 / (1 - p) + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/flip.py b/src/ntops/kernels/flip.py new file mode 100644 index 0000000..cd5dfb3 --- /dev/null +++ b/src/ntops/kernels/flip.py @@ -0,0 +1,37 @@ +import functools + +import ninetoothed +from ninetoothed import Tensor + + +def arrangement(input, output, dims, block_size=None): + if block_size is None: + block_size = ninetoothed.block_size() + + # Reverse the input's *logical* index along every flipped dim with a + # step `-1` slice. ``_slice_dim`` turns logical index ``i`` into source + # index ``size - 1 - i``, so the physical offset stays non-negative (no + # negative strides, which PyTorch does not support). The output is left in + # natural order; copying ``output = input`` therefore materializes the flip: + # output[..., i, ...] = input[..., size - 1 - i, ...] 
+ index = [slice(None)] * input.ndim + for dim in dims: + index[dim] = slice(None, None, -1) + input_reversed = input[tuple(index)] + + input_arranged = input_reversed.flatten().tile((block_size,)) + output_arranged = output.flatten().tile((block_size,)) + + return input_arranged, output_arranged + + +def application(input, output): + output = input # noqa: F841 + + +def premake(ndim, dims, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, dims=dims, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/mse_loss.py b/src/ntops/kernels/mse_loss.py new file mode 100644 index 0000000..ffbe661 --- /dev/null +++ b/src/ntops/kernels/mse_loss.py @@ -0,0 +1,52 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement as _element_wise_arrangement + + +def application(input, target, output): + diff = input - target + output = diff * diff # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(_element_wise_arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors + + +def _reduce_arrangement(input, target, output, block_size=None): + if block_size is None: + block_size = ninetoothed.block_size() + + input_arranged = input.flatten().tile((block_size,)) + target_arranged = target.flatten().tile((block_size,)) + output_arranged = output.flatten().tile((1,)) + + return input_arranged, target_arranged, output_arranged + + +def reduce_application(input, target, output): + diff = ntl.cast(input, ntl.float32) - ntl.cast(target, ntl.float32) + output = ntl.sum(diff * diff) # noqa: F841 + + +def reduce_premake(input_dtype=None, block_size=None): + arrangement_ = 
functools.partial(_reduce_arrangement, block_size=block_size) + + tensors = ( + Tensor(1, other=0, dtype=input_dtype), + Tensor(1, other=0, dtype=input_dtype), + Tensor(1, dtype=ninetoothed.float32), + ) + + return arrangement_, reduce_application, tensors diff --git a/src/ntops/kernels/pixel_unshuffle.py b/src/ntops/kernels/pixel_unshuffle.py new file mode 100644 index 0000000..fff8218 --- /dev/null +++ b/src/ntops/kernels/pixel_unshuffle.py @@ -0,0 +1,17 @@ +import functools + +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = input # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/torch/__init__.py b/src/ntops/torch/__init__.py index 82fc596..ed66d9d 100644 --- a/src/ntops/torch/__init__.py +++ b/src/ntops/torch/__init__.py @@ -11,8 +11,11 @@ from ntops.torch.cos import cos from ntops.torch.div import div from ntops.torch.dropout import dropout +from ntops.torch.feature_alpha_dropout import feature_alpha_dropout from ntops.torch.eq import eq from ntops.torch.exp import exp +from ntops.torch.flip import flip +from ntops.torch.fliplr import fliplr from ntops.torch.ge import ge from ntops.torch.gelu import gelu from ntops.torch.gt import gt @@ -24,9 +27,11 @@ from ntops.torch.matmul import matmul from ntops.torch.max_pool2d import max_pool2d from ntops.torch.mm import mm +from ntops.torch.mse_loss import mse_loss from ntops.torch.mul import mul from ntops.torch.ne import ne from ntops.torch.neg import neg +from ntops.torch.pixel_unshuffle import pixel_unshuffle from ntops.torch.pow import pow from ntops.torch.relu import relu from ntops.torch.rms_norm import rms_norm @@ -54,8 +59,11 @@ "cos", "div", "dropout", + "feature_alpha_dropout", "eq", "exp", + "flip", + 
"fliplr", "ge", "gelu", "gt", @@ -67,9 +75,11 @@ "matmul", "max_pool2d", "mm", + "mse_loss", "mul", "ne", "neg", + "pixel_unshuffle", "pow", "relu", "rms_norm", diff --git a/src/ntops/torch/feature_alpha_dropout.py b/src/ntops/torch/feature_alpha_dropout.py new file mode 100644 index 0000000..7143393 --- /dev/null +++ b/src/ntops/torch/feature_alpha_dropout.py @@ -0,0 +1,37 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + +_ALPHA_PRIME = -1.7580993408473766 # -SELU_SCALE * SELU_ALPHA + + +def feature_alpha_dropout(input, p=0.5, training=True, inplace=False): + if not training or p == 0.0: + return input if inplace else input.clone() + + if p == 1.0: + fill = torch.full_like(input, _ALPHA_PRIME) + if inplace: + input.copy_(fill) + return input + return fill + + # Per-channel Bernoulli mask: shape (N, C, 1, 1, ...), then broadcast and + # materialize over the spatial dims. Materializing (rather than a stride-0 + # view) is intentional: the MACA/C500 backend handles coalesced contiguous + # reads far better than stride-0 broadcast loads. + noise_shape = list(input.shape) + for i in range(2, input.ndim): + noise_shape[i] = 1 + + noise = torch.empty(noise_shape, dtype=torch.float32, device=input.device) + noise.bernoulli_(1.0 - p) + noise = noise.expand_as(input).contiguous() + + output = input if inplace else torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.feature_alpha_dropout.premake, input.ndim) + kernel(input, noise, 1.0 / (1.0 - p), output) + + return output diff --git a/src/ntops/torch/flip.py b/src/ntops/torch/flip.py new file mode 100644 index 0000000..b13e405 --- /dev/null +++ b/src/ntops/torch/flip.py @@ -0,0 +1,51 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + +# Launch config pinned for performance evaluation (auto-tuning disabled, +# ``max_num_configs=1``). Tuned with ``bench/tune_flip.py`` on MetaX C500, +# Iluvatar MR-V100 and NVIDIA RTX 4090. 
flip is a pure copy that saturates +# memory bandwidth (~parity with torch.flip) on all three, and configs differ +# by <2%, so a single config is kept. ``num_warps=16`` / ``block_size=1024`` is +# optimal on NVIDIA (the highest-weighted platform) and within ~0.5% of best on +# the Chinese domestic cards (MetaX, Iluvatar), which are essentially flat. +# ``num_stages`` is a no-op (one block per program, no loop). +_BLOCK_SIZE = 1024 +_NUM_WARPS = 16 +_NUM_STAGES = 1 + + +def flip(input, dims): + if isinstance(dims, int): + dims = (dims,) + + dims = tuple(dims) + + ndim = input.ndim + + for dim in dims: + if dim < -ndim or dim >= ndim: + raise IndexError( + f"Dimension out of range (expected to be in range of " + f"[{-ndim}, {ndim - 1}], but got {dim})" + ) + + normalized_dims = tuple(dim % ndim for dim in dims) + + if len(set(normalized_dims)) != len(normalized_dims): + raise RuntimeError("dim appears multiple times in the list of dims") + + output = torch.empty(input.shape, dtype=input.dtype, device=input.device) + + kernel = _cached_make( + ntops.kernels.flip.premake, + input.ndim, + normalized_dims, + block_size=_BLOCK_SIZE, + num_warps=_NUM_WARPS, + num_stages=_NUM_STAGES, + ) + kernel(input, output) + + return output diff --git a/src/ntops/torch/fliplr.py b/src/ntops/torch/fliplr.py new file mode 100644 index 0000000..732b5b9 --- /dev/null +++ b/src/ntops/torch/fliplr.py @@ -0,0 +1,10 @@ +import ntops + + +def fliplr(input): + # ``fliplr`` is exactly ``flip`` along dim 1 (columns reversed, rows kept), + # so it reuses the tuned flip copy kernel rather than defining its own. 
+ if input.ndim < 2: + raise RuntimeError("Input must be >= 2-d.") + + return ntops.torch.flip(input, (1,)) diff --git a/src/ntops/torch/mse_loss.py b/src/ntops/torch/mse_loss.py new file mode 100644 index 0000000..1b98c3a --- /dev/null +++ b/src/ntops/torch/mse_loss.py @@ -0,0 +1,81 @@ +import functools +import math + +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +@functools.lru_cache(maxsize=None) +def _launch_config(): + """Pick ``(num_warps, reduce_block_size, none_block_size)`` for this GPU. + + Performance evaluation disables auto-tuning (``max_num_configs=1``), so + explicit values are required; the reduction block size additionally sets the + partial-sums buffer length and so must be known host-side. Values tuned with + ``bench/tune_mse_loss.py``. ``num_stages`` is a no-op here (one block per + program, no inner loop) and stays 1. + + The reduction kernel's intra-block ``ntl.sum`` favors more warps on Iluvatar + (16) but fewer on MetaX (4); on NVIDIA RTX 4090 it is warp-insensitive and + wants the larger 8192 block, which is also the MetaX optimum, so NVIDIA and + other unmeasured devices fall through to that 8-warp / 8192 default. This + keys on the hardware only, never on input shapes/names. 
+ """ + name = torch.cuda.get_device_name().lower() if torch.cuda.is_available() else "" + + if "metax" in name: + return 4, 8192, 1024 + if "iluvatar" in name: + return 16, 4096, 1024 + return 8, 8192, 1024 + + +def mse_loss(input, target, reduction="mean"): + if reduction not in ("none", "mean", "sum"): + raise ValueError(f"unsupported reduction: {reduction!r}") + + if input.shape != target.shape: + input, target = torch.broadcast_tensors(input, target) + input = input.contiguous() + target = target.contiguous() + + num_warps, reduce_block_size, none_block_size = _launch_config() + + if reduction == "none": + output = torch.empty_like(input) + + kernel = _cached_make( + ntops.kernels.mse_loss.premake, + input.ndim, + block_size=none_block_size, + num_warps=num_warps, + num_stages=1, + ) + kernel(input, target, output) + + return output + + flat_input = input.reshape(-1) + flat_target = target.reshape(-1) + + numel = flat_input.numel() + num_partials = max(1, math.ceil(numel / reduce_block_size)) + + partials = torch.empty(num_partials, dtype=torch.float32, device=input.device) + + kernel = _cached_make( + ntops.kernels.mse_loss.reduce_premake, + block_size=reduce_block_size, + num_warps=num_warps, + num_stages=1, + ) + kernel(flat_input, flat_target, partials) + + total = partials.sum() + + if reduction == "mean": + total = total / numel + + return total.to(input.dtype) diff --git a/src/ntops/torch/pixel_unshuffle.py b/src/ntops/torch/pixel_unshuffle.py new file mode 100644 index 0000000..f0a0bdd --- /dev/null +++ b/src/ntops/torch/pixel_unshuffle.py @@ -0,0 +1,61 @@ +import functools + +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +@functools.lru_cache(maxsize=None) +def _launch_config(): + """Pick ``(block_size, num_warps)`` for the strided-copy kernel on this GPU. + + Performance evaluation disables auto-tuning (``max_num_configs=1``), so + explicit values are required. 
Tuned with ``bench/tune_pixel_unshuffle.py``: + ``num_warps=4`` is best on all three platforms; NVIDIA peaks at a small 256 + block (the large, HBM-bound cases and all fp16 cases) while MetaX / Iluvatar + prefer 2048. ``num_stages`` is a no-op (one block per program, no loop). + Keys on the hardware only, never on input shapes. + """ + name = torch.cuda.get_device_name().lower() if torch.cuda.is_available() else "" + + if "nvidia" in name: + return 256, 4 + + return 2048, 4 + + +def pixel_unshuffle(input, downscale_factor): + r = downscale_factor + + *batch, c, h, w = input.shape + if h % r != 0 or w % r != 0: # explicit raise: ``assert`` is stripped by -O + msg = f"spatial dims ({h}, {w}) must be divisible by downscale_factor {r}" + raise AssertionError(msg) # same type the original ``assert`` raised + + h_, w_ = h // r, w // r + + # Split each r*r spatial window into (r, r), move those axes ahead of the + # spatial dims, then merge them into the channel dim. The permute yields a + # strided view that maps element-by-element onto the contiguous output; the + # copy kernel materializes it. + # in: (..., C, h_*r, w_*r) + # src: (..., C, r, r, h_, w_) (strided view) + src = input.reshape(*batch, c, h_, r, w_, r).movedim((-3, -1), (-4, -3)) + + output = torch.empty( + (*batch, c, r, r, h_, w_), dtype=input.dtype, device=input.device + ) + + block_size, num_warps = _launch_config() + + kernel = _cached_make( + ntops.kernels.pixel_unshuffle.premake, + src.ndim, + block_size=block_size, + num_warps=num_warps, + num_stages=1, + ) + kernel(src, output) + + return output.reshape(*batch, c * r * r, h_, w_) diff --git a/tests/test_feature_alpha_dropout.py b/tests/test_feature_alpha_dropout.py new file mode 100644 index 0000000..33b4607 --- /dev/null +++ b/tests/test_feature_alpha_dropout.py @@ -0,0 +1,246 @@ +import random + +import pytest +import torch +import torch.nn.functional as F + +import ntops +from tests.skippers import skip_if_cuda_not_available +from tests.utils import generate_arguments + +_ALPHA_PRIME = -1.7580993408473766 +_ATOL = 1e-3 + + +# 
--------------------------------------------------------------------------- +# Correctness tests +# --------------------------------------------------------------------------- + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments()) +def test_feature_alpha_dropout_training_false(shape, dtype, device, rtol, atol): + input = torch.randn(shape, dtype=dtype, device=device) + + output = ntops.torch.feature_alpha_dropout(input, p=0.5, training=False) + + assert torch.equal(output, input) + + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments()) +def test_feature_alpha_dropout_p_zero(shape, dtype, device, rtol, atol): + input = torch.randn(shape, dtype=dtype, device=device) + + output = ntops.torch.feature_alpha_dropout(input, p=0.0, training=True) + + assert torch.equal(output, input) + + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments()) +def test_feature_alpha_dropout_p_one(shape, dtype, device, rtol, atol): + input = torch.randn(shape, dtype=dtype, device=device) + + output = ntops.torch.feature_alpha_dropout(input, p=1.0, training=True) + + expected = torch.full_like(input, _ALPHA_PRIME) + assert torch.allclose(output, expected, atol=_ATOL) + + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments()) +def test_feature_alpha_dropout_kept_values(shape, dtype, device, rtol, atol): + """Non-dropped elements must equal input * scale.""" + input = torch.randn(shape, dtype=dtype, device=device) + p = random.uniform(0.05, 0.5) + scale = 1.0 / (1.0 - p) + + output = ntops.torch.feature_alpha_dropout(input, p=p, training=True) + + alpha_prime_t = torch.tensor(_ALPHA_PRIME, dtype=dtype, device=device) + kept = ~torch.isclose(output, alpha_prime_t.expand_as(output), atol=_ATOL) + + if kept.any(): + assert torch.allclose( + output[kept], + (input * scale)[kept], + rtol=rtol, + atol=atol, + ) + + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments()) +def 
test_feature_alpha_dropout_dropped_values(shape, dtype, device, rtol, atol): + """Dropped elements must equal alpha_prime.""" + input = torch.randn(shape, dtype=dtype, device=device) + p = random.uniform(0.05, 0.5) + + output = ntops.torch.feature_alpha_dropout(input, p=p, training=True) + + alpha_prime_t = torch.tensor(_ALPHA_PRIME, dtype=dtype, device=device) + dropped = torch.isclose(output, alpha_prime_t.expand_as(output), atol=_ATOL) + + if dropped.any(): + assert torch.allclose( + output[dropped], + alpha_prime_t.expand_as(output)[dropped], + atol=_ATOL, + ) + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize("shape", [[4, 8, 16, 16], [2, 16, 32], [8, 4]]) +def test_feature_alpha_dropout_channel_consistency(shape, dtype): + """Within each (sample, channel) all spatial positions share the same mask. + + Use all-ones input so kept elements become scale > 0 and dropped elements + become alpha_prime < 0 — reliably separated by sign, no tolerance needed. 
+ """ + device = "cuda" + input = torch.ones(shape, dtype=dtype, device=device) + p = 0.4 + + output = ntops.torch.feature_alpha_dropout(input, p=p, training=True) + + # dropped: output < 0 (alpha_prime ≈ -1.758) + # kept: output > 0 (scale = 1/(1-p) ≈ 1.667) + dropped = (output < 0).reshape(shape[0], shape[1], -1) # (N, C, S) + + all_dropped = dropped.all(dim=-1) # (N, C) + any_dropped = dropped.any(dim=-1) # (N, C) + assert torch.equal(all_dropped, any_dropped), ( + "Channel consistency violated: some channels partially dropped" + ) + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +def test_feature_alpha_dropout_drop_rate(dtype): + """Observed channel drop rate should be close to p.""" + device = "cuda" + shape = [32, 64, 8, 8] + p = 0.3 + input = torch.ones(shape, dtype=dtype, device=device) + + output = ntops.torch.feature_alpha_dropout(input, p=p, training=True) + + # all-ones input: dropped channels are all negative, kept are all positive + dropped_channels = (output < 0).reshape(shape[0], shape[1], -1).all(dim=-1) + observed_rate = dropped_channels.float().mean().item() + + assert abs(observed_rate - p) < 0.1, ( + f"Drop rate {observed_rate:.3f} too far from p={p}" + ) + + +# --------------------------------------------------------------------------- +# Performance benchmark interface +# --------------------------------------------------------------------------- + +def benchmark_feature_alpha_dropout( + shape, + p=0.5, + dtype=torch.float32, + device="cuda", + n_warmup=10, + n_repeat=100, +): + """Compare ntops.torch.feature_alpha_dropout vs F.feature_alpha_dropout. + + Returns timing (ms) and effective memory bandwidth (GB/s) for both, + plus the speedup ratio. 
+ + Example + ------- + >>> results = benchmark_feature_alpha_dropout([32, 64, 128, 128]) + >>> print(results) + """ + if not torch.cuda.is_available() and device == "cuda": + raise RuntimeError("CUDA not available") + + input_tensor = torch.randn(shape, dtype=dtype, device=device) + + for _ in range(n_warmup): + ntops.torch.feature_alpha_dropout(input_tensor, p=p, training=True) + F.feature_alpha_dropout(input_tensor, p=p, training=True) + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(n_repeat): + ntops.torch.feature_alpha_dropout(input_tensor, p=p, training=True) + end.record() + torch.cuda.synchronize() + ntops_ms = start.elapsed_time(end) / n_repeat + + start.record() + for _ in range(n_repeat): + F.feature_alpha_dropout(input_tensor, p=p, training=True) + end.record() + torch.cuda.synchronize() + torch_ms = start.elapsed_time(end) / n_repeat + + num_bytes = input_tensor.numel() * input_tensor.element_size() * 2 + ntops_gbps = num_bytes / (ntops_ms * 1e-3) / 1e9 + torch_gbps = num_bytes / (torch_ms * 1e-3) / 1e9 + + return { + "shape": shape, + "p": p, + "dtype": str(dtype), + "ntops_time_ms": ntops_ms, + "torch_time_ms": torch_ms, + "ntops_bandwidth_GBs": ntops_gbps, + "torch_bandwidth_GBs": torch_gbps, + "speedup": torch_ms / ntops_ms, + } + + +_SWEEP_SHAPES = [ + [4, 16, 32, 32], # 0.25 MB + [8, 64, 64, 64], # 16 MB + [16, 128, 64, 64], # 64 MB + [32, 256, 64, 64], # 256 MB +] + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +def test_benchmark_sweep(dtype): + """Sweep tensor sizes. 
Run with: pytest tests/test_feature_alpha_dropout.py::test_benchmark_sweep -v -s""" + header = ( + f"{'shape':>22} {'MB':>8} " + f"{'ntops(ms)':>11} {'torch(ms)':>11} " + f"{'ntops(GB/s)':>13} {'torch(GB/s)':>13} {'speedup':>9}" + ) + print(f"\n{'='*len(header)}") + print(f"feature_alpha_dropout sweep | dtype={dtype} | p=0.5") + print("=" * len(header)) + print(header) + print("-" * len(header)) + + for shape in _SWEEP_SHAPES: + r = benchmark_feature_alpha_dropout(shape, dtype=dtype) + mb = (r["ntops_bandwidth_GBs"] * r["ntops_time_ms"] * 1e-3 * 1e9) / 2 / 1e6 + print( + f"{str(shape):>22} {mb:>8.1f} " + f"{r['ntops_time_ms']:>11.4f} {r['torch_time_ms']:>11.4f} " + f"{r['ntops_bandwidth_GBs']:>13.1f} {r['torch_bandwidth_GBs']:>13.1f} " + f"{r['speedup']:>9.2f}" + ) + + print("=" * len(header)) + + +@skip_if_cuda_not_available +def test_benchmark_interface(): + """Smoke-test that benchmark interface runs without error.""" + results = benchmark_feature_alpha_dropout( + [8, 32, 32, 32], n_warmup=2, n_repeat=5 + ) + assert results["ntops_time_ms"] > 0 + assert results["ntops_bandwidth_GBs"] > 0 diff --git a/tests/test_flip.py b/tests/test_flip.py new file mode 100644 index 0000000..f9c9078 --- /dev/null +++ b/tests/test_flip.py @@ -0,0 +1,224 @@ +import pytest +import torch + +import ntops +from tests.skippers import skip_if_cuda_not_available + +# --------------------------------------------------------------------------- +# Correctness tests (compared against torch.flip) +# --------------------------------------------------------------------------- + +_CASES = [ + # (shape, dims) + ([8], [0]), + ([8], [-1]), + ([16], []), # no-op flip + ([4, 6], [0]), + ([4, 6], [1]), + ([4, 6], [0, 1]), + ([4, 6], [-1]), + ([2, 3, 4], [0]), + ([2, 3, 4], [0, 2]), + ([2, 3, 4], [-1]), + ([2, 3, 4], [0, 1, 2]), + ([3, 5, 7, 9], [1, 3]), + ([1, 1, 4, 4], [2, 3]), + ([2, 16, 16], [1, 2]), +] + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.float32, 
torch.float16]) +@pytest.mark.parametrize("shape, dims", _CASES) +def test_flip_float(shape, dims, dtype): + device = "cuda" + input = torch.randn(shape, dtype=dtype, device=device) + + output = ntops.torch.flip(input, dims) + expected = torch.flip(input, dims) + + assert output.shape == expected.shape + assert torch.equal(output, expected) + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.int32, torch.int64]) +@pytest.mark.parametrize("shape, dims", _CASES) +def test_flip_int(shape, dims, dtype): + device = "cuda" + input = torch.randint(-1000, 1000, shape, dtype=dtype, device=device) + + output = ntops.torch.flip(input, dims) + expected = torch.flip(input, dims) + + assert output.shape == expected.shape + assert torch.equal(output, expected) + + +@skip_if_cuda_not_available +def test_flip_int_dim_argument(): + """A bare ``int`` for ``dims`` is accepted as a convenience.""" + device = "cuda" + input = torch.randn(4, 6, device=device) + + output = ntops.torch.flip(input, 0) + expected = torch.flip(input, [0]) + + assert torch.equal(output, expected) + + +@skip_if_cuda_not_available +def test_flip_non_contiguous(): + """A transposed (non-contiguous) input must still flip correctly.""" + device = "cuda" + input = torch.randn(4, 6, 8, device=device).transpose(0, 2) + + for dims in ([0], [2], [0, 2], [0, 1, 2]): + output = ntops.torch.flip(input, dims) + expected = torch.flip(input, dims) + + assert output.shape == expected.shape + assert torch.equal(output, expected) + + +@skip_if_cuda_not_available +def test_flip_double_flip_is_identity(): + device = "cuda" + input = torch.randn(3, 5, 7, device=device) + + restored = ntops.torch.flip(ntops.torch.flip(input, [0, 2]), [0, 2]) + + assert torch.equal(restored, input) + + +@skip_if_cuda_not_available +def test_flip_duplicate_dims(): + device = "cuda" + input = torch.randn(4, 6, device=device) + + with pytest.raises(RuntimeError): + ntops.torch.flip(input, [0, 0]) + + +@skip_if_cuda_not_available 
+def test_flip_dim_out_of_range(): + device = "cuda" + input = torch.randn(4, 6, device=device) + + with pytest.raises(IndexError): + ntops.torch.flip(input, [2]) + + +# --------------------------------------------------------------------------- +# Performance benchmark interface +# --------------------------------------------------------------------------- + +def benchmark_flip( + shape, + dims=(-1,), + dtype=torch.float32, + device="cuda", + n_warmup=10, + n_repeat=100, +): + """Compare ntops.torch.flip vs torch.flip. + + Returns timing (ms) and effective memory bandwidth (GB/s) for both, plus + the speedup ratio. Bandwidth assumes one read of the input plus one write + of the output (2x input bytes), the lower bound for the op. + + Example + ------- + >>> results = benchmark_flip([4096, 4096], dims=(0,)) + >>> print(results) + """ + if not torch.cuda.is_available() and device == "cuda": + raise RuntimeError("CUDA not available") + + input = torch.randn(shape, dtype=dtype, device=device) + dims = list(dims) + + for _ in range(n_warmup): + ntops.torch.flip(input, dims) + torch.flip(input, dims) + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(n_repeat): + ntops.torch.flip(input, dims) + end.record() + torch.cuda.synchronize() + ntops_ms = start.elapsed_time(end) / n_repeat + + start.record() + for _ in range(n_repeat): + torch.flip(input, dims) + end.record() + torch.cuda.synchronize() + torch_ms = start.elapsed_time(end) / n_repeat + + num_bytes = input.numel() * input.element_size() * 2 + ntops_gbps = num_bytes / (ntops_ms * 1e-3) / 1e9 + torch_gbps = num_bytes / (torch_ms * 1e-3) / 1e9 + + return { + "shape": shape, + "dims": dims, + "dtype": str(dtype), + "ntops_time_ms": ntops_ms, + "torch_time_ms": torch_ms, + "ntops_bandwidth_GBs": ntops_gbps, + "torch_bandwidth_GBs": torch_gbps, + "speedup": torch_ms / ntops_ms, + } + + +_SWEEP_SHAPES = [ + ([1024, 
1024], (0,)), # flip outer dim (coalesced inner) + ([1024, 1024], (1,)), # flip inner dim (reversed reads) + ([4096, 4096], (0, 1)), + ([8192, 8192], (1,)), +] + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +def test_benchmark_sweep(dtype): + """Sweep tensor sizes/dims. Run with: + pytest tests/test_flip.py::test_benchmark_sweep -v -s + """ + header = ( + f"{'shape':>16} {'dims':>8} {'MB':>8} " + f"{'ntops(ms)':>11} {'torch(ms)':>11} " + f"{'ntops(GB/s)':>13} {'torch(GB/s)':>13} {'speedup':>9}" + ) + print(f"\n{'='*len(header)}") + print(f"flip sweep | dtype={dtype}") + print("=" * len(header)) + print(header) + print("-" * len(header)) + + for shape, dims in _SWEEP_SHAPES: + res = benchmark_flip(shape, dims=dims, dtype=dtype) + mb = ( + res["ntops_bandwidth_GBs"] * res["ntops_time_ms"] * 1e-3 * 1e9 + ) / 2 / 1e6 + print( + f"{str(shape):>16} {str(dims):>8} {mb:>8.1f} " + f"{res['ntops_time_ms']:>11.4f} {res['torch_time_ms']:>11.4f} " + f"{res['ntops_bandwidth_GBs']:>13.1f} {res['torch_bandwidth_GBs']:>13.1f} " + f"{res['speedup']:>9.2f}" + ) + + print("=" * len(header)) + + +@skip_if_cuda_not_available +def test_benchmark_interface(): + """Smoke-test that the benchmark interface runs without error.""" + results = benchmark_flip([512, 512], dims=(0,), n_warmup=2, n_repeat=5) + assert results["ntops_time_ms"] > 0 + assert results["ntops_bandwidth_GBs"] > 0 diff --git a/tests/test_fliplr.py b/tests/test_fliplr.py new file mode 100644 index 0000000..c5a5ec6 --- /dev/null +++ b/tests/test_fliplr.py @@ -0,0 +1,196 @@ +import pytest +import torch + +import ntops +from tests.skippers import skip_if_cuda_not_available + +# --------------------------------------------------------------------------- +# Correctness tests (compared against torch.fliplr) +# --------------------------------------------------------------------------- + +_SHAPES = [ + [4, 6], + [1, 8], # single row + [8, 1], # single column + [16, 16], + [2, 
33], # odd inner dim + [3, 5, 7], # 3-D: only dim 1 is flipped + [2, 3, 4, 5], # 4-D +] + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize("shape", _SHAPES) +def test_fliplr_float(shape, dtype): + device = "cuda" + input = torch.randn(shape, dtype=dtype, device=device) + + output = ntops.torch.fliplr(input) + expected = torch.fliplr(input) + + assert output.shape == expected.shape + assert torch.equal(output, expected) + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.int32, torch.int64]) +@pytest.mark.parametrize("shape", _SHAPES) +def test_fliplr_int(shape, dtype): + device = "cuda" + input = torch.randint(-1000, 1000, shape, dtype=dtype, device=device) + + output = ntops.torch.fliplr(input) + expected = torch.fliplr(input) + + assert output.shape == expected.shape + assert torch.equal(output, expected) + + +@skip_if_cuda_not_available +def test_fliplr_matches_flip_dim1(): + device = "cuda" + input = torch.randn(4, 6, 8, device=device) + + assert torch.equal(ntops.torch.fliplr(input), ntops.torch.flip(input, (1,))) + + +@skip_if_cuda_not_available +def test_fliplr_non_contiguous(): + """A transposed (non-contiguous) input must still flip correctly.""" + device = "cuda" + input = torch.randn(6, 4, 8, device=device).transpose(0, 2) + + output = ntops.torch.fliplr(input) + expected = torch.fliplr(input) + + assert output.shape == expected.shape + assert torch.equal(output, expected) + + +@skip_if_cuda_not_available +def test_fliplr_double_flip_is_identity(): + device = "cuda" + input = torch.randn(5, 7, device=device) + + assert torch.equal(ntops.torch.fliplr(ntops.torch.fliplr(input)), input) + + +@skip_if_cuda_not_available +def test_fliplr_1d_raises(): + device = "cuda" + input = torch.randn(8, device=device) + + with pytest.raises(RuntimeError): + ntops.torch.fliplr(input) + + +# --------------------------------------------------------------------------- +# 
Performance benchmark interface +# --------------------------------------------------------------------------- + +def benchmark_fliplr( + shape, + dtype=torch.float32, + device="cuda", + n_warmup=10, + n_repeat=100, +): + """Compare ntops.torch.fliplr vs torch.fliplr. + + Returns timing (ms) and effective memory bandwidth (GB/s) for both, plus + the speedup ratio. Bandwidth assumes one read of the input plus one write + of the output (2x input bytes), the lower bound for the op. + + Example + ------- + >>> results = benchmark_fliplr([4096, 4096]) + >>> print(results) + """ + if not torch.cuda.is_available() and device == "cuda": + raise RuntimeError("CUDA not available") + + input = torch.randn(shape, dtype=dtype, device=device) + + for _ in range(n_warmup): + ntops.torch.fliplr(input) + torch.fliplr(input) + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(n_repeat): + ntops.torch.fliplr(input) + end.record() + torch.cuda.synchronize() + ntops_ms = start.elapsed_time(end) / n_repeat + + start.record() + for _ in range(n_repeat): + torch.fliplr(input) + end.record() + torch.cuda.synchronize() + torch_ms = start.elapsed_time(end) / n_repeat + + num_bytes = input.numel() * input.element_size() * 2 + ntops_gbps = num_bytes / (ntops_ms * 1e-3) / 1e9 + torch_gbps = num_bytes / (torch_ms * 1e-3) / 1e9 + + return { + "shape": shape, + "dtype": str(dtype), + "ntops_time_ms": ntops_ms, + "torch_time_ms": torch_ms, + "ntops_bandwidth_GBs": ntops_gbps, + "torch_bandwidth_GBs": torch_gbps, + "speedup": torch_ms / ntops_ms, + } + + +_SWEEP_SHAPES = [ + [1024, 1024], + [4096, 4096], + [8192, 8192], +] + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +def test_benchmark_sweep(dtype): + """Sweep tensor sizes. 
Run with: + pytest tests/test_fliplr.py::test_benchmark_sweep -v -s + """ + header = ( + f"{'shape':>16} {'MB':>8} " + f"{'ntops(ms)':>11} {'torch(ms)':>11} " + f"{'ntops(GB/s)':>13} {'torch(GB/s)':>13} {'speedup':>9}" + ) + print(f"\n{'='*len(header)}") + print(f"fliplr sweep | dtype={dtype}") + print("=" * len(header)) + print(header) + print("-" * len(header)) + + for shape in _SWEEP_SHAPES: + res = benchmark_fliplr(shape, dtype=dtype) + mb = ( + res["ntops_bandwidth_GBs"] * res["ntops_time_ms"] * 1e-3 * 1e9 + ) / 2 / 1e6 + print( + f"{str(shape):>16} {mb:>8.1f} " + f"{res['ntops_time_ms']:>11.4f} {res['torch_time_ms']:>11.4f} " + f"{res['ntops_bandwidth_GBs']:>13.1f} {res['torch_bandwidth_GBs']:>13.1f} " + f"{res['speedup']:>9.2f}" + ) + + print("=" * len(header)) + + +@skip_if_cuda_not_available +def test_benchmark_interface(): + """Smoke-test that the benchmark interface runs without error.""" + results = benchmark_fliplr([512, 512], n_warmup=2, n_repeat=5) + assert results["ntops_time_ms"] > 0 + assert results["ntops_bandwidth_GBs"] > 0 diff --git a/tests/test_mse_loss.py b/tests/test_mse_loss.py new file mode 100644 index 0000000..e323571 --- /dev/null +++ b/tests/test_mse_loss.py @@ -0,0 +1,192 @@ +import pytest +import torch +import torch.nn.functional as F + +import ntops +from tests.skippers import skip_if_cuda_not_available + +# --------------------------------------------------------------------------- +# Correctness tests (compared against torch.nn.functional.mse_loss) +# --------------------------------------------------------------------------- + +_SHAPES = [ + [16], + [1024], + [4097], # not a multiple of the reduction block size + [32, 64], + [8, 7, 5], + [4, 3, 16, 16], + [1], # single element +] + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype, rtol, atol", [ + (torch.float32, 1e-3, 1e-3), + (torch.float16, 1e-2, 1e-2), +]) +@pytest.mark.parametrize("reduction", ["none", "mean", "sum"]) +@pytest.mark.parametrize("shape", 
_SHAPES) +def test_mse_loss(shape, reduction, dtype, rtol, atol): + device = "cuda" + input = torch.randn(shape, dtype=dtype, device=device) + target = torch.randn(shape, dtype=dtype, device=device) + + output = ntops.torch.mse_loss(input, target, reduction=reduction) + expected = F.mse_loss(input, target, reduction=reduction) + + assert output.shape == expected.shape + assert output.dtype == expected.dtype + assert torch.allclose(output, expected, rtol=rtol, atol=atol) + + +@skip_if_cuda_not_available +def test_mse_loss_default_reduction_is_mean(): + device = "cuda" + input = torch.randn(2, 3, 4, device=device) + target = torch.randn(2, 3, 4, device=device) + + output = ntops.torch.mse_loss(input, target) + expected = F.mse_loss(input, target) + + assert torch.allclose(output, expected, rtol=1e-3, atol=1e-3) + + +@skip_if_cuda_not_available +def test_mse_loss_broadcast(): + device = "cuda" + input = torch.randn(4, 3, 8, device=device) + target = torch.randn(3, 8, device=device) + + output = ntops.torch.mse_loss(input, target, reduction="sum") + expected = F.mse_loss( + input, target.expand_as(input).contiguous(), reduction="sum" + ) + + assert torch.allclose(output, expected, rtol=1e-3, atol=1e-3) + + +@skip_if_cuda_not_available +def test_mse_loss_invalid_reduction(): + device = "cuda" + input = torch.randn(8, device=device) + target = torch.randn(8, device=device) + + with pytest.raises(ValueError): + ntops.torch.mse_loss(input, target, reduction="median") + + +# --------------------------------------------------------------------------- +# Performance benchmark interface +# --------------------------------------------------------------------------- + +def benchmark_mse_loss( + shape, + reduction="mean", + dtype=torch.float32, + device="cuda", + n_warmup=10, + n_repeat=100, +): + """Compare ntops.torch.mse_loss vs F.mse_loss. + + Returns timing (ms) and effective memory bandwidth (GB/s) for both, + plus the speedup ratio. 
Bandwidth assumes both ``input`` and ``target`` + are read once (2x input bytes), which is the lower bound for the op. + + Example + ------- + >>> results = benchmark_mse_loss([4096, 4096], "mean") + >>> print(results) + """ + if not torch.cuda.is_available() and device == "cuda": + raise RuntimeError("CUDA not available") + + input = torch.randn(shape, dtype=dtype, device=device) + target = torch.randn(shape, dtype=dtype, device=device) + + for _ in range(n_warmup): + ntops.torch.mse_loss(input, target, reduction=reduction) + F.mse_loss(input, target, reduction=reduction) + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(n_repeat): + ntops.torch.mse_loss(input, target, reduction=reduction) + end.record() + torch.cuda.synchronize() + ntops_ms = start.elapsed_time(end) / n_repeat + + start.record() + for _ in range(n_repeat): + F.mse_loss(input, target, reduction=reduction) + end.record() + torch.cuda.synchronize() + torch_ms = start.elapsed_time(end) / n_repeat + + num_bytes = input.numel() * input.element_size() * 2 + ntops_gbps = num_bytes / (ntops_ms * 1e-3) / 1e9 + torch_gbps = num_bytes / (torch_ms * 1e-3) / 1e9 + + return { + "shape": shape, + "reduction": reduction, + "dtype": str(dtype), + "ntops_time_ms": ntops_ms, + "torch_time_ms": torch_ms, + "ntops_bandwidth_GBs": ntops_gbps, + "torch_bandwidth_GBs": torch_gbps, + "speedup": torch_ms / ntops_ms, + } + + +_SWEEP_SHAPES = [ + [1024, 1024], # 4 MB + [4096, 4096], # 64 MB + [8192, 8192], # 256 MB +] + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize("reduction", ["none", "mean", "sum"]) +def test_benchmark_sweep(reduction, dtype): + """Sweep tensor sizes. 
Run with: + pytest tests/test_mse_loss.py::test_benchmark_sweep -v -s + """ + header = ( + f"{'shape':>16} {'MB':>8} " + f"{'ntops(ms)':>11} {'torch(ms)':>11} " + f"{'ntops(GB/s)':>13} {'torch(GB/s)':>13} {'speedup':>9}" + ) + print(f"\n{'='*len(header)}") + print(f"mse_loss sweep | reduction={reduction} | dtype={dtype}") + print("=" * len(header)) + print(header) + print("-" * len(header)) + + for shape in _SWEEP_SHAPES: + res = benchmark_mse_loss(shape, reduction=reduction, dtype=dtype) + mb = ( + torch.empty(shape, dtype=dtype).numel() + * torch.empty(0, dtype=dtype).element_size() + ) / 1e6 + print( + f"{str(shape):>16} {mb:>8.1f} " + f"{res['ntops_time_ms']:>11.4f} {res['torch_time_ms']:>11.4f} " + f"{res['ntops_bandwidth_GBs']:>13.1f} {res['torch_bandwidth_GBs']:>13.1f} " + f"{res['speedup']:>9.2f}" + ) + + print("=" * len(header)) + + +@skip_if_cuda_not_available +def test_benchmark_interface(): + """Smoke-test that the benchmark interface runs without error.""" + results = benchmark_mse_loss([512, 512], n_warmup=2, n_repeat=5) + assert results["ntops_time_ms"] > 0 + assert results["ntops_bandwidth_GBs"] > 0 diff --git a/tests/test_pixel_unshuffle.py b/tests/test_pixel_unshuffle.py new file mode 100644 index 0000000..1dd355a --- /dev/null +++ b/tests/test_pixel_unshuffle.py @@ -0,0 +1,190 @@ +import pytest +import torch +import torch.nn.functional as F + +import ntops +from tests.skippers import skip_if_cuda_not_available + +# --------------------------------------------------------------------------- +# Correctness tests (compared against torch.nn.functional.pixel_unshuffle) +# --------------------------------------------------------------------------- + +_FLOAT_CASES = [ + # (shape, downscale_factor) + ([1, 1, 4, 4], 2), + ([2, 3, 8, 8], 2), + ([4, 8, 12, 12], 3), + ([2, 16, 16, 16], 4), + ([8, 4, 32, 32], 2), + ([3, 5, 6, 9], 3), # H != W + ([1, 1, 2, 2], 2), # minimal +] + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.float32, 
torch.float16]) +@pytest.mark.parametrize("shape, r", _FLOAT_CASES) +def test_pixel_unshuffle_matches_torch(shape, r, dtype): + device = "cuda" + input = torch.randn(shape, dtype=dtype, device=device) + + output = ntops.torch.pixel_unshuffle(input, r) + expected = F.pixel_unshuffle(input, r) + + assert output.shape == expected.shape + assert torch.equal(output, expected) + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.int32, torch.int64]) +@pytest.mark.parametrize("shape, r", _FLOAT_CASES) +def test_pixel_unshuffle_int(shape, r, dtype): + device = "cuda" + input = torch.randint(-1000, 1000, shape, dtype=dtype, device=device) + + output = ntops.torch.pixel_unshuffle(input, r) + expected = F.pixel_unshuffle(input, r) + + assert output.shape == expected.shape + assert torch.equal(output, expected) + + +@skip_if_cuda_not_available +def test_pixel_unshuffle_leading_dims(): + """torch supports arbitrary leading dims: (*, C, H, W).""" + device = "cuda" + input = torch.randn(2, 3, 4, 16, 16, device=device) + + output = ntops.torch.pixel_unshuffle(input, 4) + expected = F.pixel_unshuffle(input, 4) + + assert output.shape == expected.shape + assert torch.equal(output, expected) + + +@skip_if_cuda_not_available +def test_pixel_unshuffle_roundtrip(): + """pixel_shuffle(pixel_unshuffle(x)) == x.""" + device = "cuda" + r = 2 + input = torch.randn(2, 3, 8, 8, device=device) + + unshuffled = ntops.torch.pixel_unshuffle(input, r) + restored = F.pixel_shuffle(unshuffled, r) + + assert torch.equal(restored, input) + + +# --------------------------------------------------------------------------- +# Performance benchmark interface +# --------------------------------------------------------------------------- + +def benchmark_pixel_unshuffle( + shape, + downscale_factor=2, + dtype=torch.float32, + device="cuda", + n_warmup=10, + n_repeat=100, +): + """Compare ntops.torch.pixel_unshuffle vs F.pixel_unshuffle. 
+ + Returns timing (ms) and effective memory bandwidth (GB/s) for both, + plus the speedup ratio. + + Example + ------- + >>> results = benchmark_pixel_unshuffle([32, 64, 128, 128], 2) + >>> print(results) + """ + if not torch.cuda.is_available() and device == "cuda": + raise RuntimeError("CUDA not available") + + r = downscale_factor + input_tensor = torch.randn(shape, dtype=dtype, device=device) + + for _ in range(n_warmup): + ntops.torch.pixel_unshuffle(input_tensor, r) + F.pixel_unshuffle(input_tensor, r) + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(n_repeat): + ntops.torch.pixel_unshuffle(input_tensor, r) + end.record() + torch.cuda.synchronize() + ntops_ms = start.elapsed_time(end) / n_repeat + + start.record() + for _ in range(n_repeat): + F.pixel_unshuffle(input_tensor, r) + end.record() + torch.cuda.synchronize() + torch_ms = start.elapsed_time(end) / n_repeat + + # read input + write output == 2x input size + num_bytes = input_tensor.numel() * input_tensor.element_size() * 2 + ntops_gbps = num_bytes / (ntops_ms * 1e-3) / 1e9 + torch_gbps = num_bytes / (torch_ms * 1e-3) / 1e9 + + return { + "shape": shape, + "downscale_factor": r, + "dtype": str(dtype), + "ntops_time_ms": ntops_ms, + "torch_time_ms": torch_ms, + "ntops_bandwidth_GBs": ntops_gbps, + "torch_bandwidth_GBs": torch_gbps, + "speedup": torch_ms / ntops_ms, + } + + +_SWEEP_SHAPES = [ + ([4, 16, 32, 32], 2), # 0.25 MB + ([8, 64, 64, 64], 2), # 16 MB + ([16, 128, 64, 64], 2), # 64 MB + ([32, 256, 64, 64], 2), # 256 MB +] + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +def test_benchmark_sweep(dtype): + """Sweep tensor sizes. 
Run with: pytest tests/test_pixel_unshuffle.py::test_benchmark_sweep -v -s""" + header = ( + f"{'shape':>22} {'r':>3} {'MB':>8} " + f"{'ntops(ms)':>11} {'torch(ms)':>11} " + f"{'ntops(GB/s)':>13} {'torch(GB/s)':>13} {'speedup':>9}" + ) + print(f"\n{'='*len(header)}") + print(f"pixel_unshuffle sweep | dtype={dtype}") + print("=" * len(header)) + print(header) + print("-" * len(header)) + + for shape, r in _SWEEP_SHAPES: + res = benchmark_pixel_unshuffle(shape, downscale_factor=r, dtype=dtype) + mb = ( + res["ntops_bandwidth_GBs"] * res["ntops_time_ms"] * 1e-3 * 1e9 + ) / 2 / 1e6 + print( + f"{str(shape):>22} {r:>3} {mb:>8.1f} " + f"{res['ntops_time_ms']:>11.4f} {res['torch_time_ms']:>11.4f} " + f"{res['ntops_bandwidth_GBs']:>13.1f} {res['torch_bandwidth_GBs']:>13.1f} " + f"{res['speedup']:>9.2f}" + ) + + print("=" * len(header)) + + +@skip_if_cuda_not_available +def test_benchmark_interface(): + """Smoke-test that benchmark interface runs without error.""" + results = benchmark_pixel_unshuffle( + [8, 32, 32, 32], downscale_factor=2, n_warmup=2, n_repeat=5 + ) + assert results["ntops_time_ms"] > 0 + assert results["ntops_bandwidth_GBs"] > 0 From c71f3c39b7f888f8c0ef153ca57b39734ced014f Mon Sep 17 00:00:00 2001 From: sunteng Date: Sat, 20 Jun 2026 16:32:19 +0800 Subject: [PATCH 2/2] fix Signed-off-by: sunteng --- bench/bench_t1_1_7.py | 101 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 bench/bench_t1_1_7.py diff --git a/bench/bench_t1_1_7.py b/bench/bench_t1_1_7.py new file mode 100644 index 0000000..4ad23dc --- /dev/null +++ b/bench/bench_t1_1_7.py @@ -0,0 +1,101 @@ +"""Benchmark T1-1-7 operators vs torch. 
+ + feature_alpha_dropout / mse_loss / flip / fliplr / pixel_unshuffle + + python bench/bench_t1_1_7.py +""" + +import torch +import torch.nn.functional as F +import triton.testing + +import ntops + +DEVICE = "cuda" +DTYPE = torch.float32 + + +def _report(name, shape_str, ms_nt, ms_th, nbytes): + bw_nt = nbytes / ms_nt * 1e-6 + bw_th = nbytes / ms_th * 1e-6 + print( + f" {name:22s} {shape_str:22s} " + f"九齿 {bw_nt:7.0f} GB/s | torch {bw_th:7.0f} GB/s | " + f"speedup {ms_th / ms_nt:.2f}x" + ) + + +def bench_feature_alpha_dropout(): + print("\n[feature_alpha_dropout]") + for shape in [(64, 256, 32, 32), (128, 512, 16, 16), (32, 256, 64, 64)]: + x = torch.randn(shape, dtype=DTYPE, device=DEVICE) + nbytes = x.numel() * x.element_size() * 2 + ms_nt = triton.testing.do_bench( + lambda: ntops.torch.feature_alpha_dropout(x, p=0.5, training=True) + ) + ms_th = triton.testing.do_bench( + lambda: F.feature_alpha_dropout(x, p=0.5, training=True) + ) + _report("feature_alpha_dropout", str(shape), ms_nt, ms_th, nbytes) + + +def bench_mse_loss(): + print("\n[mse_loss]") + for shape in [(4096, 4096), (8192, 8192), (4096 * 4096,)]: + x = torch.randn(shape, dtype=DTYPE, device=DEVICE) + t = torch.randn(shape, dtype=DTYPE, device=DEVICE) + nbytes = x.numel() * x.element_size() * 2 # 2 reads + ms_nt = triton.testing.do_bench( + lambda: ntops.torch.mse_loss(x, t, reduction="mean") + ) + ms_th = triton.testing.do_bench( + lambda: F.mse_loss(x, t, reduction="mean") + ) + _report("mse_loss", str(shape), ms_nt, ms_th, nbytes) + + +def bench_flip(): + print("\n[flip]") + cases = [((4096, 4096), (0,)), ((4096, 4096), (1,)), ((8192, 8192), (0, 1))] + for shape, dims in cases: + x = torch.randn(shape, dtype=DTYPE, device=DEVICE) + nbytes = x.numel() * x.element_size() * 2 # 1 read + 1 write + ms_nt = triton.testing.do_bench(lambda: ntops.torch.flip(x, dims)) + ms_th = triton.testing.do_bench(lambda: torch.flip(x, dims)) + _report("flip", f"{shape} dims={dims}", ms_nt, ms_th, nbytes) + + +def 
bench_fliplr(): + print("\n[fliplr]") + for shape in [(4096, 4096), (8192, 8192)]: + x = torch.randn(shape, dtype=DTYPE, device=DEVICE) + nbytes = x.numel() * x.element_size() * 2 + ms_nt = triton.testing.do_bench(lambda: ntops.torch.fliplr(x)) + ms_th = triton.testing.do_bench(lambda: torch.fliplr(x)) + _report("fliplr", str(shape), ms_nt, ms_th, nbytes) + + +def bench_pixel_unshuffle(): + print("\n[pixel_unshuffle]") + cases = [((32, 64, 112, 112), 2), ((16, 128, 128, 128), 4), ((64, 64, 64, 64), 2)] + for shape, r in cases: + x = torch.randn(shape, dtype=DTYPE, device=DEVICE) + nbytes = x.numel() * x.element_size() * 2 + ms_nt = triton.testing.do_bench( + lambda: ntops.torch.pixel_unshuffle(x, r) + ) + ms_th = triton.testing.do_bench(lambda: F.pixel_unshuffle(x, r)) + _report("pixel_unshuffle", f"{shape} r={r}", ms_nt, ms_th, nbytes) + + +def main(): + print(f"device: {torch.cuda.get_device_name()} dtype: {DTYPE}") + bench_feature_alpha_dropout() + bench_mse_loss() + bench_flip() + bench_fliplr() + bench_pixel_unshuffle() + + +if __name__ == "__main__": + main()