From 6898d10f95b7dd107e2c707915f030fb0e8a2286 Mon Sep 17 00:00:00 2001
From: sunteng <steng2009@163.com>
Date: Thu, 21 May 2026 20:03:46 +0800
Subject: [PATCH 1/2] add feature_alpha_dropout, mse_loss, flip, fliplr,
 pixel_unshuffle operators with three-platform tuned launch configs

Signed-off-by: sunteng <steng2009@163.com>
---
 bench/tune_flip.py                         | 118 ++++++++++
 bench/tune_mse_loss.py                     | 170 ++++++++++++++
 bench/tune_pixel_unshuffle.py              | 132 +++++++++++
 src/ntops/kernels/__init__.py              |   8 +
 src/ntops/kernels/feature_alpha_dropout.py |  24 ++
 src/ntops/kernels/flip.py                  |  37 ++++
 src/ntops/kernels/mse_loss.py              |  52 +++++
 src/ntops/kernels/pixel_unshuffle.py       |  17 ++
 src/ntops/torch/__init__.py                |  10 +
 src/ntops/torch/feature_alpha_dropout.py   |  37 ++++
 src/ntops/torch/flip.py                    |  51 +++++
 src/ntops/torch/fliplr.py                  |  10 +
 src/ntops/torch/mse_loss.py                |  81 +++++++
 src/ntops/torch/pixel_unshuffle.py         |  61 +++++
 tests/test_feature_alpha_dropout.py        | 246 +++++++++++++++++++++
 tests/test_flip.py                         | 224 +++++++++++++++++++
 tests/test_fliplr.py                       | 196 ++++++++++++++++
 tests/test_mse_loss.py                     | 192 ++++++++++++++++
 tests/test_pixel_unshuffle.py              | 190 ++++++++++++++++
 19 files changed, 1856 insertions(+)
 create mode 100644 bench/tune_flip.py
 create mode 100644 bench/tune_mse_loss.py
 create mode 100644 bench/tune_pixel_unshuffle.py
 create mode 100644 src/ntops/kernels/feature_alpha_dropout.py
 create mode 100644 src/ntops/kernels/flip.py
 create mode 100644 src/ntops/kernels/mse_loss.py
 create mode 100644 src/ntops/kernels/pixel_unshuffle.py
 create mode 100644 src/ntops/torch/feature_alpha_dropout.py
 create mode 100644 src/ntops/torch/flip.py
 create mode 100644 src/ntops/torch/fliplr.py
 create mode 100644 src/ntops/torch/mse_loss.py
 create mode 100644 src/ntops/torch/pixel_unshuffle.py
 create mode 100644 tests/test_feature_alpha_dropout.py
 create mode 100644 tests/test_flip.py
 create mode 100644 tests/test_fliplr.py
 create mode 100644 tests/test_mse_loss.py
 create mode 100644 tests/test_pixel_unshuffle.py

diff --git a/bench/tune_flip.py b/bench/tune_flip.py
new file mode 100644
index 0000000..b4c9fba
--- /dev/null
+++ b/bench/tune_flip.py
@@ -0,0 +1,118 @@
+"""Tune the pinned launch config (block_size / num_warps / num_stages) for
+``ntops.torch.flip`` on the current GPU.
+
+Performance evaluation runs with auto-tuning disabled (``max_num_configs=1``),
+so the values baked into ``ntops/torch/flip.py`` decide the score. This script
+sweeps a small grid under those exact conditions and prints, per shape, the
+fastest config plus the speedup over ``torch.flip``.
+
+Usage
+-----
+    python bench/tune_flip.py
+"""
+
+import itertools
+
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+# Shapes that matter for a bandwidth-bound op: the medium case that has not yet
+# saturated memory, and the large cases at the bandwidth ceiling. Small shapes
+# are launch-overhead bound and not informative for config tuning.
+_SHAPES = [
+    ([4096, 4096], (0, 1)),
+    ([4096, 4096], (1,)),
+    ([8192, 8192], (1,)),
+    ([8192, 8192], (0,)),
+]
+
+_BLOCK_SIZES = [512, 1024, 2048, 4096, 8192]
+_NUM_WARPS = [4, 8, 16]
+_NUM_STAGES = [1, 2]
+
+_DTYPES = [torch.float32, torch.float16]
+
+
+def _time(fn, n_warmup=10, n_repeat=50):
+    """Return the mean duration of one ``fn()`` call in ms, via CUDA events."""
+    for _ in range(n_warmup):  # untimed warm-up
+        fn()
+    torch.cuda.synchronize()
+
+    tic, toc = (torch.cuda.Event(enable_timing=True) for _ in range(2))
+    tic.record()
+    for _ in range(n_repeat):
+        fn()
+    toc.record()
+    torch.cuda.synchronize()
+    return tic.elapsed_time(toc) / n_repeat
+
+
+def _run_config(input, output, dims, block_size, num_warps, num_stages):
+    """Return a zero-arg callable running flip with one pinned launch config."""
+    launch = dict(block_size=block_size, num_warps=num_warps, num_stages=num_stages)
+    flip_kernel = _cached_make(
+        ntops.kernels.flip.premake, input.ndim, dims, **launch, max_num_configs=1
+    )
+
+    def run():
+        return flip_kernel(input, output)
+
+    return run
+
+
+def tune():
+    """Sweep launch configs for flip per dtype/shape; print the best and the top 5."""
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available")
+
+    grid = list(itertools.product(_BLOCK_SIZES, _NUM_WARPS, _NUM_STAGES))
+    for dtype in _DTYPES:
+        print(f"\n{'='*92}")
+        print(f"flip config sweep | dtype={dtype} | device={torch.cuda.get_device_name()}")
+        print("=" * 92)
+
+        for shape, dims in _SHAPES:
+            input = torch.randn(shape, dtype=dtype, device="cuda")
+            output = torch.empty(shape, dtype=dtype, device="cuda")
+            num_bytes = input.numel() * input.element_size() * 2  # read + write
+
+            torch_ms = _time(lambda: torch.flip(input, list(dims)))
+
+            results = []
+            for bs, nw, ns in grid:
+                try:
+                    ms = _time(_run_config(input, output, dims, bs, nw, ns))
+                except Exception as exc:  # noqa: BLE001
+                    print(f"  skip bs={bs} nw={nw} ns={ns}: {type(exc).__name__}")
+                    continue
+                results.append((ms, bs, nw, ns))
+
+            torch_gbps = num_bytes / (torch_ms * 1e-3) / 1e9
+            print(
+                f"\nshape={shape} dims={dims}  "
+                f"(torch {torch_ms:.4f} ms / {torch_gbps:.0f} GB/s)"
+            )
+            if not results:  # every config failed; results[0] would raise IndexError
+                print("  no config ran successfully")
+                continue
+
+            results.sort()
+            best_ms, bbs, bnw, bns = results[0]
+            best_gbps = num_bytes / (best_ms * 1e-3) / 1e9
+            print(
+                f"  BEST  block_size={bbs:<5} num_warps={bnw:<3} num_stages={bns}  "
+                f"-> {best_ms:.4f} ms / {best_gbps:.0f} GB/s  "
+                f"(speedup vs torch {torch_ms / best_ms:.2f})"
+            )
+            print("  top 5:")
+            for ms, bs, nw, ns in results[:5]:
+                gbps = num_bytes / (ms * 1e-3) / 1e9
+                config = f"block_size={bs:<5} num_warps={nw:<3} num_stages={ns}"
+                print(f"    {config}  {ms:.4f} ms / {gbps:.0f} GB/s")
+
+
+if __name__ == "__main__":
+    tune()
diff --git a/bench/tune_mse_loss.py b/bench/tune_mse_loss.py
new file mode 100644
index 0000000..a0d0920
--- /dev/null
+++ b/bench/tune_mse_loss.py
@@ -0,0 +1,170 @@
+"""Tune the pinned launch configs for ``ntops.torch.mse_loss`` on the current
+GPU.
+
+Two kernels are tuned independently:
+  * the reduction path (``reduction="mean"|"sum"``) -- the defining, perf
+    critical kernel; partials buffer size depends on ``block_size``;
+  * the element-wise path (``reduction="none"``).
+
+Performance evaluation runs with auto-tuning disabled (``max_num_configs=1``),
+so the values baked into ``ntops/torch/mse_loss.py`` decide the score. This
+script sweeps a small grid under those exact conditions and prints, per shape,
+the fastest config plus the speedup over ``torch.nn.functional.mse_loss``.
+
+Usage
+-----
+    python bench/tune_mse_loss.py
+"""
+
+import itertools
+import math
+
+import torch
+import torch.nn.functional as F
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+# Numbers of elements to tune over (bandwidth-bound regime). Small sizes are
+# launch-overhead bound and not informative for config selection.
+_NUMELS = [1024 * 1024, 4096 * 4096, 8192 * 8192]
+
+_BLOCK_SIZES = [512, 1024, 2048, 4096, 8192]
+_NUM_WARPS = [4, 8, 16]
+_NUM_STAGES = [1, 2]
+
+_DTYPES = [torch.float32, torch.float16]
+
+
+def _time(fn, n_warmup=10, n_repeat=50):
+    """Return the mean duration of one ``fn()`` call in ms, via CUDA events."""
+    for _ in range(n_warmup):  # untimed warm-up
+        fn()
+    torch.cuda.synchronize()
+
+    tic, toc = (torch.cuda.Event(enable_timing=True) for _ in range(2))
+    tic.record()
+    for _ in range(n_repeat):
+        fn()
+    toc.record()
+    torch.cuda.synchronize()
+    return tic.elapsed_time(toc) / n_repeat
+
+
+def _reduce_runner(flat_in, flat_tg, block_size, num_warps, num_stages):
+    """Return a thunk running the reduction kernel plus the final partials sum.
+
+    One float32 partial is allocated per ``block_size`` chunk (at least one).
+    """
+    block_count = max(1, math.ceil(flat_in.numel() / block_size))
+    scratch = torch.empty(block_count, dtype=torch.float32, device=flat_in.device)
+    launch = dict(block_size=block_size, num_warps=num_warps, num_stages=num_stages)
+    reduce_kernel = _cached_make(
+        ntops.kernels.mse_loss.reduce_premake, **launch, max_num_configs=1
+    )
+
+    def run():
+        # The timed work covers the kernel and torch's sum over the partials.
+        reduce_kernel(flat_in, flat_tg, scratch)
+        return scratch.sum()
+
+    return run
+
+
+def _none_runner(input, target, output, block_size, num_warps, num_stages):
+    """Return a thunk running the element-wise (``reduction="none"``) kernel.
+
+    The kernel is requested with exactly the given launch config.
+    """
+    launch = dict(block_size=block_size, num_warps=num_warps, num_stages=num_stages)
+    elementwise = _cached_make(
+        ntops.kernels.mse_loss.premake, input.ndim, **launch, max_num_configs=1
+    )
+    return lambda: elementwise(input, target, output)
+
+
+def _sweep(label, make_runner, num_bytes, torch_ms):
+    """Time every grid config built by ``make_runner(bs, nw, ns)``; print the ranking."""
+    results = []
+    for bs, nw, ns in itertools.product(_BLOCK_SIZES, _NUM_WARPS, _NUM_STAGES):
+        try:
+            ms = _time(make_runner(bs, nw, ns))
+        except Exception as exc:  # noqa: BLE001
+            print(f"  skip bs={bs} nw={nw} ns={ns}: {type(exc).__name__}")
+            continue
+        results.append((ms, bs, nw, ns))
+
+    torch_gbps = num_bytes / (torch_ms * 1e-3) / 1e9
+    print(f"\n  [{label}]  (torch {torch_ms:.4f} ms / {torch_gbps:.0f} GB/s)")
+    if not results:  # every config failed; results[0] would raise IndexError
+        print("    no config ran successfully")
+        return
+
+    results.sort()
+    best_ms, bbs, bnw, bns = results[0]
+    best_gbps = num_bytes / (best_ms * 1e-3) / 1e9
+    best = f"block_size={bbs:<5} num_warps={bnw:<3} num_stages={bns}"
+    speedup = f"(speedup vs torch {torch_ms / best_ms:.2f})"
+    print(f"    BEST  {best}  -> {best_ms:.4f} ms / {best_gbps:.0f} GB/s  {speedup}")
+    for ms, bs, nw, ns in results[:5]:
+        gbps = num_bytes / (ms * 1e-3) / 1e9
+        config = f"block_size={bs:<5} num_warps={nw:<3} num_stages={ns}"
+        print(f"      {config}  {ms:.4f} ms / {gbps:.0f} GB/s")
+
+
+def _check_reduce_correctness(dtype):
+    """Assert the reduction kernel's mean agrees with ``F.mse_loss`` on random
+    data, so the timings that follow are not measuring a wrong kernel."""
+    lhs, rhs = (torch.randn(40000, dtype=dtype, device="cuda") for _ in range(2))
+    total = _reduce_runner(lhs, rhs, 1024, 4, 1)()
+    got = (total / lhs.numel()).to(dtype)
+    ref = F.mse_loss(lhs, rhs, reduction="mean")
+    tol = 1e-3 if dtype == torch.float32 else 1e-2
+
+    assert torch.allclose(got, ref, rtol=tol, atol=tol), (got.item(), ref.item())
+
+
+def tune():
+    """Check the reduction kernel, then sweep both mse_loss kernel paths per size."""
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available")
+
+    rule = "=" * 92
+    for dtype in _DTYPES:
+        _check_reduce_correctness(dtype)
+        itemsize = torch.empty(0, dtype=dtype).element_size()
+
+        print(f"\n{rule}")
+        device_name = torch.cuda.get_device_name()
+        print(f"mse_loss config sweep | dtype={dtype} | device={device_name}")
+        print(rule)
+
+        for numel in _NUMELS:
+            side = int(round(numel**0.5))
+            input = torch.randn(numel, dtype=dtype, device="cuda")
+            target = torch.randn(numel, dtype=dtype, device="cuda")
+            output = torch.empty_like(input)
+            print(f"\nnumel={numel} (~{side}^2, {numel * itemsize / 1e6:.1f} MB)")
+            # (label, torch reduction, runner factory, tensors moved per element):
+            # the reduce path reads input + target; "none" also writes the output.
+            paths = (
+                (
+                    "reduce (sum/mean)",
+                    "sum",
+                    lambda bs, nw, ns: _reduce_runner(input, target, bs, nw, ns),
+                    2,
+                ),
+                (
+                    "none (element-wise)",
+                    "none",
+                    lambda bs, nw, ns: _none_runner(input, target, output, bs, nw, ns),
+                    3,
+                ),
+            )
+            for label, reduction, make_runner, passes in paths:
+                torch_ms = _time(lambda: F.mse_loss(input, target, reduction=reduction))
+                _sweep(label, make_runner, numel * itemsize * passes, torch_ms)
+
+
+if __name__ == "__main__":
+    tune()
diff --git a/bench/tune_pixel_unshuffle.py b/bench/tune_pixel_unshuffle.py
new file mode 100644
index 0000000..ec29b35
--- /dev/null
+++ b/bench/tune_pixel_unshuffle.py
@@ -0,0 +1,132 @@
+"""Tune the pinned launch config for ``ntops.torch.pixel_unshuffle`` on the
+current GPU.
+
+The kernel is an element-wise copy of a strided (permuted) view. Performance
+evaluation runs with auto-tuning disabled (``max_num_configs=1``), so the values
+baked into ``ntops/torch/pixel_unshuffle.py`` decide the score. This script
+sweeps ``block_size`` / ``num_warps`` / ``num_stages`` under those exact
+conditions and prints, per shape, the fastest config plus the speedup over
+``F.pixel_unshuffle``. ``num_stages`` is expected to be a no-op (one block per
+program, no inner loop).
+
+Usage
+-----
+    python bench/tune_pixel_unshuffle.py
+"""
+
+import itertools
+
+import torch
+import torch.nn.functional as F
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+# (shape, downscale_factor) -- the bandwidth-bound regime.
+_CASES = [
+    ([16, 128, 64, 64], 2),
+    ([32, 256, 64, 64], 2),
+    ([8, 64, 128, 128], 2),
+    ([4, 8, 12, 12], 3),
+]
+
+_BLOCK_SIZES = [256, 512, 1024, 2048, 4096, 8192]
+_NUM_WARPS = [4, 8, 16]
+_NUM_STAGES = [1, 2]
+
+_DTYPES = [torch.float32, torch.float16]
+
+
+def _time(fn, n_warmup=10, n_repeat=50):
+    """Return the mean duration of one ``fn()`` call in ms, via CUDA events."""
+    for _ in range(n_warmup):  # untimed warm-up
+        fn()
+    torch.cuda.synchronize()
+
+    tic, toc = (torch.cuda.Event(enable_timing=True) for _ in range(2))
+    tic.record()
+    for _ in range(n_repeat):
+        fn()
+    toc.record()
+    torch.cuda.synchronize()
+    return tic.elapsed_time(toc) / n_repeat
+
+
+def _runner(input, r, block_size, num_warps, num_stages):
+    """Return a thunk running the copy kernel with one pinned launch config.
+
+    The source view and the output buffer are built once, outside the thunk.
+    """
+    *lead, channels, height, width = input.shape
+    rows, cols = height // r, width // r
+    windows = input.reshape(*lead, channels, rows, r, cols, r)
+    src = windows.movedim((-3, -1), (-4, -3))
+    dst_shape = (*lead, channels, r, r, rows, cols)
+    output = torch.empty(dst_shape, dtype=input.dtype, device=input.device)
+
+    launch = dict(block_size=block_size, num_warps=num_warps, num_stages=num_stages)
+    copy = _cached_make(
+        ntops.kernels.pixel_unshuffle.premake, src.ndim, **launch, max_num_configs=1
+    )
+    return lambda: copy(src, output)
+
+
+def _check_correctness(input, r):
+    """Assert ``ntops.torch.pixel_unshuffle`` exactly equals the torch reference."""
+    pair = (F.pixel_unshuffle(input, r), ntops.torch.pixel_unshuffle(input, r))
+    assert torch.equal(*pair), "pixel_unshuffle mismatch vs torch"
+
+
+def tune():
+    """Sweep launch configs for pixel_unshuffle per dtype/case; print the ranking."""
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available")
+
+    grid = list(itertools.product(_BLOCK_SIZES, _NUM_WARPS, _NUM_STAGES))
+    for dtype in _DTYPES:
+        print(f"\n{'='*96}")
+        print(
+            f"pixel_unshuffle config sweep | dtype={dtype} | "
+            f"device={torch.cuda.get_device_name()}"
+        )
+        print("=" * 96)
+
+        for shape, r in _CASES:
+            input = torch.randn(shape, dtype=dtype, device="cuda")
+            _check_correctness(input, r)  # never time a kernel that is wrong
+
+            num_bytes = input.numel() * input.element_size() * 2  # read + write
+            torch_ms = _time(lambda: F.pixel_unshuffle(input, r))
+            results = []
+            for bs, nw, ns in grid:
+                try:
+                    ms = _time(_runner(input, r, bs, nw, ns))
+                except Exception as exc:  # noqa: BLE001
+                    print(f"  skip bs={bs} nw={nw} ns={ns}: {type(exc).__name__}")
+                    continue
+                results.append((ms, bs, nw, ns))
+
+            torch_gbps = num_bytes / (torch_ms * 1e-3) / 1e9
+            print(
+                f"\nshape={shape} r={r}  (torch {torch_ms:.4f} ms / {torch_gbps:.0f} GB/s)"
+            )
+            if not results:  # every config failed; results[0] would raise IndexError
+                print("  no config ran successfully")
+                continue
+
+            results.sort()
+            best_ms, bbs, bnw, bns = results[0]
+            best_gbps = num_bytes / (best_ms * 1e-3) / 1e9
+            print(
+                f"  BEST  block_size={bbs:<5} num_warps={bnw:<3} num_stages={bns}  "
+                f"-> {best_ms:.4f} ms / {best_gbps:.0f} GB/s  "
+                f"(speedup vs torch {torch_ms / best_ms:.2f})"
+            )
+            for ms, bs, nw, ns in results[:5]:
+                gbps = num_bytes / (ms * 1e-3) / 1e9
+                config = f"block_size={bs:<5} num_warps={nw:<3} num_stages={ns}"
+                print(f"    {config}  {ms:.4f} ms / {gbps:.0f} GB/s")
+
+
+if __name__ == "__main__":
+    tune()
diff --git a/src/ntops/kernels/__init__.py b/src/ntops/kernels/__init__.py
index f6934ef..9e2dc2e 100644
--- a/src/ntops/kernels/__init__.py
+++ b/src/ntops/kernels/__init__.py
@@ -13,7 +13,9 @@
     div,
     dropout,
     eq,
+    feature_alpha_dropout,
     exp,
+    flip,
     ge,
     gelu,
     gt,
@@ -24,9 +26,11 @@
     lt,
     max_pool2d,
     mm,
+    mse_loss,
     mul,
     ne,
     neg,
+    pixel_unshuffle,
     pow,
     relu,
     rms_norm,
@@ -56,7 +60,9 @@
     "div",
     "dropout",
     "eq",
+    "feature_alpha_dropout",
     "exp",
+    "flip",
     "ge",
     "gelu",
     "gt",
@@ -67,9 +73,11 @@
     "lt",
     "max_pool2d",
     "mm",
+    "mse_loss",
     "mul",
     "ne",
     "neg",
+    "pixel_unshuffle",
     "pow",
     "relu",
     "rms_norm",
diff --git a/src/ntops/kernels/feature_alpha_dropout.py b/src/ntops/kernels/feature_alpha_dropout.py
new file mode 100644
index 0000000..483ef3d
--- /dev/null
+++ b/src/ntops/kernels/feature_alpha_dropout.py
@@ -0,0 +1,24 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, noise, scale, output):  # input * scale where noise > 0
+    output = ntl.where(noise > 0, input * scale, -1.7580993408473766)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for feature alpha dropout.
+
+    Tensors, in order: input, float32 noise mask, 0-d float64 scale, output.
+    """
+    source = Tensor(ndim, dtype=dtype)
+    mask = Tensor(ndim, dtype=ninetoothed.float32)
+    factor = Tensor(0, dtype=ninetoothed.float64)
+    result = Tensor(ndim, dtype=dtype)
+    arrange = functools.partial(arrangement, block_size=block_size)
+    return arrange, application, (source, mask, factor, result)
diff --git a/src/ntops/kernels/flip.py b/src/ntops/kernels/flip.py
new file mode 100644
index 0000000..cd5dfb3
--- /dev/null
+++ b/src/ntops/kernels/flip.py
@@ -0,0 +1,37 @@
+import functools
+
+import ninetoothed
+from ninetoothed import Tensor
+
+
+def arrangement(input, output, dims, block_size=None):
+    """Tile the reversed input and the output into matching 1-D blocks.
+
+    Each dim in ``dims`` is indexed with a step ``-1`` slice; ninetoothed is
+    expected to resolve that as logical index ``i`` -> source index
+    ``size - 1 - i`` rather than as a negative stride, which PyTorch tensors
+    do not support. The output keeps its natural order, so a tile-to-tile
+    copy materializes
+    ``output[..., i, ...] = input[..., size - 1 - i, ...]``.
+    """
+    tile_shape = (ninetoothed.block_size() if block_size is None else block_size,)
+
+    selector = [slice(None) for _ in range(input.ndim)]
+    backwards = slice(None, None, -1)
+    for dim in dims:
+        selector[dim] = backwards
+    flipped = input[tuple(selector)]
+
+    return flipped.flatten().tile(tile_shape), output.flatten().tile(tile_shape)
+
+
+def application(input, output):  # plain copy; the reversal is in ``arrangement``
+    output = input  # noqa: F841
+
+
+def premake(ndim, dims, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for flipping along ``dims``."""
+    source, destination = (Tensor(ndim, dtype=dtype) for _ in range(2))
+    arrange = functools.partial(arrangement, dims=dims, block_size=block_size)
+
+    return arrange, application, (source, destination)
diff --git a/src/ntops/kernels/mse_loss.py b/src/ntops/kernels/mse_loss.py
new file mode 100644
index 0000000..ffbe661
--- /dev/null
+++ b/src/ntops/kernels/mse_loss.py
@@ -0,0 +1,52 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement as _element_wise_arrangement
+
+
+def application(input, target, output):  # element-wise squared difference
+    diff = input - target
+    output = diff * diff  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the element-wise kernel.
+
+    Tensors, in order: input, target, output -- all ``ndim``-dimensional and
+    sharing ``dtype``.
+    """
+    prediction, reference, result = (Tensor(ndim, dtype=dtype) for _ in range(3))
+    arrange = functools.partial(_element_wise_arrangement, block_size=block_size)
+
+    return arrange, application, (prediction, reference, result)
+
+
+def _reduce_arrangement(input, target, output, block_size=None):
+    """Tile input/target into ``block_size`` chunks and output into single elements."""
+    if block_size is None:
+        block_size = ninetoothed.block_size()
+    tiles = [tensor.flatten().tile((block_size,)) for tensor in (input, target)]
+
+    # The output is split into one-element tiles.
+    partial_slots = output.flatten().tile((1,))
+    return (*tiles, partial_slots)
+
+
+def reduce_application(input, target, output):  # sum of squared differences
+    diff = ntl.cast(input, ntl.float32) - ntl.cast(target, ntl.float32)  # fp32 math
+    output = ntl.sum(diff * diff)  # noqa: F841
+
+
+def reduce_premake(input_dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the reduction kernel.
+
+    Input and target are 1-D with ``other=0``; the output is 1-D float32.
+    """
+    operands = tuple(Tensor(1, other=0, dtype=input_dtype) for _ in range(2))
+    partial_sums = Tensor(1, dtype=ninetoothed.float32)
+    arrange = functools.partial(_reduce_arrangement, block_size=block_size)
+
+    return arrange, reduce_application, (*operands, partial_sums)
diff --git a/src/ntops/kernels/pixel_unshuffle.py b/src/ntops/kernels/pixel_unshuffle.py
new file mode 100644
index 0000000..fff8218
--- /dev/null
+++ b/src/ntops/kernels/pixel_unshuffle.py
@@ -0,0 +1,17 @@
+import functools
+
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):  # element-wise copy of input into output
+    output = input  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the copy kernel."""
+    source, destination = (Tensor(ndim, dtype=dtype) for _ in range(2))
+    arrange = functools.partial(arrangement, block_size=block_size)
+
+    return arrange, application, (source, destination)
diff --git a/src/ntops/torch/__init__.py b/src/ntops/torch/__init__.py
index 82fc596..ed66d9d 100644
--- a/src/ntops/torch/__init__.py
+++ b/src/ntops/torch/__init__.py
@@ -11,8 +11,11 @@
 from ntops.torch.cos import cos
 from ntops.torch.div import div
 from ntops.torch.dropout import dropout
+from ntops.torch.feature_alpha_dropout import feature_alpha_dropout
 from ntops.torch.eq import eq
 from ntops.torch.exp import exp
+from ntops.torch.flip import flip
+from ntops.torch.fliplr import fliplr
 from ntops.torch.ge import ge
 from ntops.torch.gelu import gelu
 from ntops.torch.gt import gt
@@ -24,9 +27,11 @@
 from ntops.torch.matmul import matmul
 from ntops.torch.max_pool2d import max_pool2d
 from ntops.torch.mm import mm
+from ntops.torch.mse_loss import mse_loss
 from ntops.torch.mul import mul
 from ntops.torch.ne import ne
 from ntops.torch.neg import neg
+from ntops.torch.pixel_unshuffle import pixel_unshuffle
 from ntops.torch.pow import pow
 from ntops.torch.relu import relu
 from ntops.torch.rms_norm import rms_norm
@@ -54,8 +59,11 @@
     "cos",
     "div",
     "dropout",
+    "feature_alpha_dropout",
     "eq",
     "exp",
+    "flip",
+    "fliplr",
     "ge",
     "gelu",
     "gt",
@@ -67,9 +75,11 @@
     "matmul",
     "max_pool2d",
     "mm",
+    "mse_loss",
     "mul",
     "ne",
     "neg",
+    "pixel_unshuffle",
     "pow",
     "relu",
     "rms_norm",
diff --git a/src/ntops/torch/feature_alpha_dropout.py b/src/ntops/torch/feature_alpha_dropout.py
new file mode 100644
index 0000000..7143393
--- /dev/null
+++ b/src/ntops/torch/feature_alpha_dropout.py
@@ -0,0 +1,37 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+_ALPHA_PRIME = -1.7580993408473766  # -SELU_SCALE * SELU_ALPHA
+
+
+def feature_alpha_dropout(input, p=0.5, training=True, inplace=False):
+    """Drop whole channels to ``_ALPHA_PRIME``; scale kept ones by ``1 / (1 - p)``.
+
+    One Bernoulli(1 - p) draw is made per index of the first two dims and
+    shared by all trailing dims. With ``training=False`` or ``p == 0`` the
+    input is returned as is (cloned unless ``inplace``). Raises ``ValueError``
+    when ``p`` is outside ``[0, 1]``.
+    """
+    if p < 0.0 or p > 1.0:
+        raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
+
+    if not training or p == 0.0:
+        return input if inplace else input.clone()
+    if p == 1.0:  # every channel is dropped; 1 / (1 - p) would divide by zero
+        if inplace:
+            return input.fill_(_ALPHA_PRIME)
+        return torch.full_like(input, _ALPHA_PRIME)
+
+    # Materialize the per-channel mask over the full shape (not a stride-0 view):
+    # the MACA/C500 backend handles contiguous reads far better than broadcasts.
+    noise_shape = list(input.shape[:2]) + [1] * (input.ndim - 2)
+    noise = torch.empty(noise_shape, dtype=torch.float32, device=input.device)
+    noise = noise.bernoulli_(1.0 - p).expand_as(input).contiguous()
+
+    output = input if inplace else torch.empty_like(input)
+    kernel = _cached_make(ntops.kernels.feature_alpha_dropout.premake, input.ndim)
+    kernel(input, noise, 1.0 / (1.0 - p), output)
+
+    return output
diff --git a/src/ntops/torch/flip.py b/src/ntops/torch/flip.py
new file mode 100644
index 0000000..b13e405
--- /dev/null
+++ b/src/ntops/torch/flip.py
@@ -0,0 +1,51 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+# Launch config pinned for performance evaluation (auto-tuning disabled,
+# ``max_num_configs=1``). Tuned with ``bench/tune_flip.py`` on MetaX C500,
+# Iluvatar MR-V100 and NVIDIA RTX 4090. flip is a pure copy that saturates
+# memory bandwidth (~parity with torch.flip) on all three, and configs differ
+# by <2%, so a single config is kept. ``num_warps=16`` / ``block_size=1024`` is
+# optimal on NVIDIA (the highest-weighted platform) and within ~0.5% of best on
+# the domestic (MetaX / Iluvatar) cards, which are essentially flat.
+# ``num_stages`` is a no-op (one block per program, no loop).
+_BLOCK_SIZE = 1024
+_NUM_WARPS = 16
+_NUM_STAGES = 1
+
+
+def flip(input, dims):
+    """Return a new tensor with ``input`` reversed along every dim in ``dims``.
+
+    ``dims`` may be an int or an iterable of ints; negative dims wrap around.
+
+    Raises:
+        IndexError: if a dim is outside ``[-ndim, ndim - 1]``.
+        RuntimeError: if the same dim is named more than once.
+    """
+    dims = (dims,) if isinstance(dims, int) else tuple(dims)
+    ndim = input.ndim
+
+    for dim in dims:
+        if not -ndim <= dim < ndim:
+            raise IndexError(
+                f"Dimension out of range (expected to be in range of "
+                f"[{-ndim}, {ndim - 1}], but got {dim})"
+            )
+
+    # Sorted so that e.g. (0, 1) and (1, 0) -- the same flip -- request the same
+    # kernel (assuming ``_cached_make`` caches per argument tuple).
+    normalized_dims = tuple(sorted(dim % ndim for dim in dims))
+    if len(set(normalized_dims)) != len(normalized_dims):
+        raise RuntimeError("dim appears multiple times in the list of dims")
+
+    output = torch.empty(input.shape, dtype=input.dtype, device=input.device)
+    launch = dict(block_size=_BLOCK_SIZE, num_warps=_NUM_WARPS, num_stages=_NUM_STAGES)
+    kernel = _cached_make(
+        ntops.kernels.flip.premake, input.ndim, normalized_dims, **launch
+    )
+    kernel(input, output)
+
+    return output
diff --git a/src/ntops/torch/fliplr.py b/src/ntops/torch/fliplr.py
new file mode 100644
index 0000000..732b5b9
--- /dev/null
+++ b/src/ntops/torch/fliplr.py
@@ -0,0 +1,10 @@
+import ntops
+
+
+def fliplr(input):
+    """Reverse dim 1 (the columns) of ``input`` using the shared flip kernel;
+    raise ``RuntimeError`` when ``input`` has fewer than two dims."""
+    if input.ndim <= 1:
+        raise RuntimeError("Input must be >= 2-d.")
+
+    return ntops.torch.flip(input, (1,))
diff --git a/src/ntops/torch/mse_loss.py b/src/ntops/torch/mse_loss.py
new file mode 100644
index 0000000..1b98c3a
--- /dev/null
+++ b/src/ntops/torch/mse_loss.py
@@ -0,0 +1,81 @@
+import functools
+import math
+
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+@functools.lru_cache(maxsize=None)
+def _launch_config():
+    """Return ``(num_warps, reduce_block_size, none_block_size)`` for this GPU.
+
+    Auto-tuning is disabled during performance evaluation
+    (``max_num_configs=1``), so explicit values are needed, and the reduction
+    block size must be known on the host because it sizes the partial-sums
+    buffer. Values come from ``bench/tune_mse_loss.py``: MetaX -> 4 warps /
+    8192, Iluvatar -> 16 warps / 4096, anything else (including NVIDIA) ->
+    8 warps / 8192; the element-wise block is 1024 everywhere. Selection is by
+    device name only and is cached for the life of the process.
+    """
+    if not torch.cuda.is_available():
+        device = ""
+    else:
+        device = torch.cuda.get_device_name().lower()
+
+    tuned = (("metax", (4, 8192, 1024)), ("iluvatar", (16, 4096, 1024)))
+    for vendor, config in tuned:
+        if vendor in device:
+            return config
+
+    return 8, 8192, 1024
+
+
+def mse_loss(input, target, reduction="mean"):
+    """Return the squared error between ``input`` and ``target``.
+
+    ``reduction`` is ``"none"`` (element-wise), ``"sum"`` or ``"mean"`` (0-d,
+    accumulated in float32, returned in ``input.dtype``); anything else raises
+    ``ValueError``. Mismatched shapes are broadcast. An empty input gives 0 for
+    ``"sum"`` and NaN for ``"mean"``.
+    """
+    if reduction not in ("none", "mean", "sum"):
+        raise ValueError(f"unsupported reduction: {reduction!r}")
+
+    if input.shape != target.shape:
+        input, target = (t.contiguous() for t in torch.broadcast_tensors(input, target))
+
+    num_warps, reduce_block_size, none_block_size = _launch_config()
+
+    if reduction == "none":
+        output = torch.empty_like(input)
+        kernel = _cached_make(
+            ntops.kernels.mse_loss.premake,
+            input.ndim,
+            block_size=none_block_size,
+            num_warps=num_warps,
+            num_stages=1,
+        )
+        kernel(input, target, output)
+        return output
+
+    numel = input.numel()
+    if numel == 0:
+        # Nothing to reduce: avoid relying on an uninitialized partials buffer.
+        total = torch.zeros((), dtype=torch.float32, device=input.device)
+    else:
+        num_partials = math.ceil(numel / reduce_block_size)
+        partials = torch.empty(num_partials, dtype=torch.float32, device=input.device)
+        kernel = _cached_make(
+            ntops.kernels.mse_loss.reduce_premake,
+            block_size=reduce_block_size,
+            num_warps=num_warps,
+            num_stages=1,
+        )
+        kernel(input.reshape(-1), target.reshape(-1), partials)
+        total = partials.sum()
+
+    if reduction == "mean":
+        total = total / numel  # 0 / 0 -> NaN for an empty input
+    return total.to(input.dtype)
diff --git a/src/ntops/torch/pixel_unshuffle.py b/src/ntops/torch/pixel_unshuffle.py
new file mode 100644
index 0000000..f0a0bdd
--- /dev/null
+++ b/src/ntops/torch/pixel_unshuffle.py
@@ -0,0 +1,61 @@
+import functools
+
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+@functools.lru_cache(maxsize=None)
+def _launch_config():
+    """Return ``(block_size, num_warps)`` for the strided-copy kernel on this GPU.
+
+    Auto-tuning is disabled during performance evaluation
+    (``max_num_configs=1``), so the values are fixed here from
+    ``bench/tune_pixel_unshuffle.py`` results: 4 warps everywhere, a 256 block
+    when the device name contains "nvidia" and 2048 otherwise (MetaX /
+    Iluvatar preferred 2048). The choice depends on the device name only.
+    """
+    if torch.cuda.is_available():
+        device = torch.cuda.get_device_name().lower()
+    else:
+        device = ""
+
+    block_size = 256 if "nvidia" in device else 2048
+    return block_size, 4
+
+
+def pixel_unshuffle(input, downscale_factor):
+    """Fold each ``r x r`` spatial window of ``input`` into the channel dim.
+
+    Maps ``(..., C, h_ * r, w_ * r)`` to ``(..., C * r * r, h_, w_)`` where
+    ``r = downscale_factor``.
+
+    Raises:
+        AssertionError: if a spatial size is not divisible by ``r``.
+    """
+    r = downscale_factor
+
+    *batch, c, h, w = input.shape
+    assert h % r == 0 and w % r == 0, (
+        f"spatial dims ({h}, {w}) must be divisible by downscale_factor {r}"
+    )
+
+    h_, w_ = h // r, w // r
+
+    # View the input as (..., C, h_, r, w_, r), then move both ``r`` axes in
+    # front of the spatial ones: (..., C, r, r, h_, w_). That strided view maps
+    # element-by-element onto a contiguous buffer of the same shape, so a plain
+    # copy kernel performs the whole rearrangement.
+    windows = input.reshape(*batch, c, h_, r, w_, r)
+    src = windows.movedim((-3, -1), (-4, -3))
+    unfolded_shape = (*batch, c, r, r, h_, w_)
+    output = torch.empty(unfolded_shape, dtype=input.dtype, device=input.device)
+
+    block_size, num_warps = _launch_config()
+    launch = dict(block_size=block_size, num_warps=num_warps, num_stages=1)
+    kernel = _cached_make(ntops.kernels.pixel_unshuffle.premake, src.ndim, **launch)
+    kernel(src, output)
+
+    # Merge (C, r, r) into a single channel dim of size C * r * r.
+    return output.reshape(*batch, c * r * r, h_, w_)
diff --git a/tests/test_feature_alpha_dropout.py b/tests/test_feature_alpha_dropout.py
new file mode 100644
index 0000000..33b4607
--- /dev/null
+++ b/tests/test_feature_alpha_dropout.py
@@ -0,0 +1,246 @@
+import random
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+_ALPHA_PRIME = -1.7580993408473766  # value the tests expect in dropped elements
+_ATOL = 1e-3  # absolute tolerance when matching outputs against _ALPHA_PRIME
+
+
+# ---------------------------------------------------------------------------
+# Correctness tests
+# ---------------------------------------------------------------------------
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_feature_alpha_dropout_training_false(shape, dtype, device, rtol, atol):
+    """With ``training=False`` the input must come back unchanged."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    result = ntops.torch.feature_alpha_dropout(x, p=0.5, training=False)
+    assert torch.equal(result, x)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_feature_alpha_dropout_p_zero(shape, dtype, device, rtol, atol):
+    """With ``p=0.0`` nothing is dropped, so the input comes back unchanged."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    result = ntops.torch.feature_alpha_dropout(x, p=0.0, training=True)
+    assert torch.equal(result, x)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_feature_alpha_dropout_p_one(shape, dtype, device, rtol, atol):
+    """With ``p=1.0`` every element must be replaced by alpha_prime."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    result = ntops.torch.feature_alpha_dropout(x, p=1.0, training=True)
+    target = torch.full_like(x, _ALPHA_PRIME)
+    assert torch.allclose(result, target, atol=_ATOL)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_feature_alpha_dropout_kept_values(shape, dtype, device, rtol, atol):
+    """Every element that was not dropped must be ``input * scale``."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+    p = random.uniform(0.05, 0.5)
+
+    result = ntops.torch.feature_alpha_dropout(x, p=p, training=True)
+
+    # Anything not (approximately) alpha_prime counts as a kept element.
+    alpha_prime = torch.tensor(_ALPHA_PRIME, dtype=dtype, device=device)
+    is_dropped = torch.isclose(result, alpha_prime.expand_as(result), atol=_ATOL)
+    kept_mask = ~is_dropped
+    if not kept_mask.any():
+        return
+
+    rescaled = x * (1.0 / (1.0 - p))
+    assert torch.allclose(
+        result[kept_mask], rescaled[kept_mask], rtol=rtol, atol=atol
+    )
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_feature_alpha_dropout_dropped_values(shape, dtype, device, rtol, atol):
+    """Elements that are not ``input * scale`` must equal alpha_prime."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    p = random.uniform(0.05, 0.5)
+    scale = 1.0 / (1.0 - p)
+
+    output = ntops.torch.feature_alpha_dropout(input, p=p, training=True)
+
+    # Locate dropped elements via the scaled input rather than via alpha_prime
+    # itself; otherwise the assertion below would be trivially true.
+    alpha_prime_t = torch.tensor(_ALPHA_PRIME, dtype=dtype, device=device)
+    dropped = ~torch.isclose(output, input * scale, rtol=rtol, atol=atol)
+
+    assert torch.allclose(
+        output[dropped], alpha_prime_t.expand_as(output)[dropped], atol=_ATOL
+    )
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+@pytest.mark.parametrize("shape", [[4, 8, 16, 16], [2, 16, 32], [8, 4]])
+def test_feature_alpha_dropout_channel_consistency(shape, dtype):
+    """A (sample, channel) slice is either dropped entirely or kept entirely.
+
+    With an all-ones input, kept elements equal ``scale > 0`` and dropped
+    elements equal ``alpha_prime < 0``, so the sign alone tells them apart and
+    no tolerance is involved.
+    """
+    n, c = shape[0], shape[1]
+    ones = torch.ones(shape, dtype=dtype, device="cuda")
+
+    result = ntops.torch.feature_alpha_dropout(ones, p=0.4, training=True)
+
+    # dropped -> negative (alpha_prime ≈ -1.758)
+    # kept    -> positive (scale = 1/(1-p) ≈ 1.667)
+    is_dropped = (result < 0).reshape(n, c, -1)  # (N, C, S)
+    fully = is_dropped.all(dim=-1)  # (N, C)
+    partly = is_dropped.any(dim=-1)  # (N, C)
+
+    # "all" and "any" agree exactly when no channel is only partially dropped.
+    message = "Channel consistency violated: some channels partially dropped"
+    assert torch.equal(fully, partly), message
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+def test_feature_alpha_dropout_drop_rate(dtype):
+    """The fraction of dropped channels should land within 0.1 of ``p``."""
+    n, c, p = 32, 64, 0.3
+    ones = torch.ones([n, c, 8, 8], dtype=dtype, device="cuda")
+
+    result = ntops.torch.feature_alpha_dropout(ones, p=p, training=True)
+
+    # The input is all ones, so a dropped channel is negative everywhere and a
+    # kept channel is positive everywhere.
+    per_channel = (result < 0).reshape(n, c, -1)
+    channel_dropped = per_channel.all(dim=-1)
+    observed_rate = channel_dropped.float().mean().item()
+
+    deviation = abs(observed_rate - p)
+    failure = f"Drop rate {observed_rate:.3f} too far from p={p}"
+    assert deviation < 0.1, failure
+
+
+# ---------------------------------------------------------------------------
+# Performance benchmark interface
+# ---------------------------------------------------------------------------
+
+def benchmark_feature_alpha_dropout(
+    shape,
+    p=0.5,
+    dtype=torch.float32,
+    device="cuda",
+    n_warmup=10,
+    n_repeat=100,
+):
+    """Benchmark the ntops ``feature_alpha_dropout`` against the torch one.
+
+    Each implementation is warmed up, then timed over ``n_repeat`` calls with
+    CUDA events. The result dict reports mean time per call (ms), effective
+    bandwidth (GB/s, counting 2x the input bytes) and the torch/ntops speedup.
+
+    Example
+    -------
+    >>> results = benchmark_feature_alpha_dropout([32, 64, 128, 128])
+    >>> print(results)
+    """
+    if device == "cuda" and not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available")
+
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    # Warm-up calls for both implementations; not part of the timings.
+    for _ in range(n_warmup):
+        ntops.torch.feature_alpha_dropout(x, p=p, training=True)
+        F.feature_alpha_dropout(x, p=p, training=True)
+    torch.cuda.synchronize()
+
+    tic = torch.cuda.Event(enable_timing=True)
+    toc = torch.cuda.Event(enable_timing=True)
+
+    tic.record()
+    for _ in range(n_repeat):
+        ntops.torch.feature_alpha_dropout(x, p=p, training=True)
+    toc.record()
+    torch.cuda.synchronize()
+    ntops_ms = tic.elapsed_time(toc) / n_repeat
+
+    tic.record()
+    for _ in range(n_repeat):
+        F.feature_alpha_dropout(x, p=p, training=True)
+    toc.record()
+    torch.cuda.synchronize()
+    torch_ms = tic.elapsed_time(toc) / n_repeat
+
+    # Bandwidth is computed over 2x the input bytes.
+    traffic = 2 * x.numel() * x.element_size()
+    return {
+        "shape": shape,
+        "p": p,
+        "dtype": str(dtype),
+        "ntops_time_ms": ntops_ms,
+        "torch_time_ms": torch_ms,
+        "ntops_bandwidth_GBs": traffic / (ntops_ms * 1e-3) / 1e9,
+        "torch_bandwidth_GBs": traffic / (torch_ms * 1e-3) / 1e9,
+        "speedup": torch_ms / ntops_ms,
+    }
+
+
+_SWEEP_SHAPES = [
+    [4, 16, 32, 32],     # 0.25 MiB of fp32 input
+    [8, 64, 64, 64],     # 8 MiB of fp32 input
+    [16, 128, 64, 64],   # 32 MiB of fp32 input
+    [32, 256, 64, 64],   # 128 MiB of fp32 input
+]
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+def test_benchmark_sweep(dtype):
+    """Print the size sweep; run: pytest tests/test_feature_alpha_dropout.py::test_benchmark_sweep -v -s"""
+    header = (
+        f"{'shape':>22} {'MB':>8} "
+        f"{'ntops(ms)':>11} {'torch(ms)':>11} "
+        f"{'ntops(GB/s)':>13} {'torch(GB/s)':>13} {'speedup':>9}"
+    )
+    print(f"\n{'='*len(header)}")
+    print(f"feature_alpha_dropout sweep | dtype={dtype} | p=0.5")
+    print("=" * len(header))
+    print(header)
+    print("-" * len(header))
+
+    for shape in _SWEEP_SHAPES:
+        r = benchmark_feature_alpha_dropout(shape, dtype=dtype)
+        moved = r["ntops_bandwidth_GBs"] * r["ntops_time_ms"] * 1e-3 * 1e9
+        mb = moved / 2 / 1e6
+        print(
+            f"{str(shape):>22} {mb:>8.1f} "
+            f"{r['ntops_time_ms']:>11.4f} {r['torch_time_ms']:>11.4f} "
+            f"{r['ntops_bandwidth_GBs']:>13.1f} {r['torch_bandwidth_GBs']:>13.1f} "
+            f"{r['speedup']:>9.2f}"
+        )
+    print("=" * len(header))
+
+
+@skip_if_cuda_not_available
+def test_benchmark_interface():
+    """A short benchmark run must report positive time and bandwidth."""
+    shape = [8, 32, 32, 32]
+    report = benchmark_feature_alpha_dropout(shape, n_warmup=2, n_repeat=5)
+
+    for key in ("ntops_time_ms", "ntops_bandwidth_GBs"):
+        assert report[key] > 0
diff --git a/tests/test_flip.py b/tests/test_flip.py
new file mode 100644
index 0000000..f9c9078
--- /dev/null
+++ b/tests/test_flip.py
@@ -0,0 +1,224 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+
+# ---------------------------------------------------------------------------
+# Correctness tests (compared against torch.flip)
+# ---------------------------------------------------------------------------
+
+_CASES = [
+    # (shape, dims)
+    ([8], [0]),
+    ([8], [-1]),             # negative dim
+    ([16], []),              # no-op flip
+    ([4, 6], [0]),
+    ([4, 6], [1]),
+    ([4, 6], [0, 1]),        # every dim of a 2-D input
+    ([4, 6], [-1]),
+    ([2, 3, 4], [0]),
+    ([2, 3, 4], [0, 2]),     # non-adjacent dims
+    ([2, 3, 4], [-1]),
+    ([2, 3, 4], [0, 1, 2]),  # every dim of a 3-D input
+    ([3, 5, 7, 9], [1, 3]),  # odd sizes
+    ([1, 1, 4, 4], [2, 3]),  # singleton leading dims
+    ([2, 16, 16], [1, 2]),
+]
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+@pytest.mark.parametrize("shape, dims", _CASES)
+def test_flip_float(shape, dims, dtype):
+    """Floating-point inputs must flip exactly like ``torch.flip``."""
+    x = torch.randn(shape, dtype=dtype, device="cuda")
+
+    actual = ntops.torch.flip(x, dims)
+    reference = torch.flip(x, dims)
+
+    assert actual.shape == reference.shape
+    assert torch.equal(actual, reference)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.int32, torch.int64])
+@pytest.mark.parametrize("shape, dims", _CASES)
+def test_flip_int(shape, dims, dtype):
+    """Integer inputs must flip exactly like ``torch.flip``."""
+    x = torch.randint(-1000, 1000, shape, dtype=dtype, device="cuda")
+
+    actual = ntops.torch.flip(x, dims)
+    reference = torch.flip(x, dims)
+
+    assert actual.shape == reference.shape
+    assert torch.equal(actual, reference)
+
+
+@skip_if_cuda_not_available
+def test_flip_int_dim_argument():
+    """Passing a plain ``int`` as ``dims`` behaves like a one-element list."""
+    x = torch.randn(4, 6, device="cuda")
+
+    # ntops gets the bare int; the torch reference gets the list form.
+    actual = ntops.torch.flip(x, 0)
+    reference = torch.flip(x, [0])
+
+    assert torch.equal(actual, reference)
+
+
+@skip_if_cuda_not_available
+def test_flip_non_contiguous():
+    """Flipping a transposed, non-contiguous view must match ``torch.flip``."""
+    base = torch.randn(4, 6, 8, device="cuda")
+    view = base.transpose(0, 2)
+    dim_sets = ([0], [2], [0, 2], [0, 1, 2])
+
+    for dims in dim_sets:
+        actual = ntops.torch.flip(view, dims)
+        reference = torch.flip(view, dims)
+        assert actual.shape == reference.shape
+        assert torch.equal(actual, reference)
+
+
+@skip_if_cuda_not_available
+def test_flip_double_flip_is_identity():
+    """Flipping twice over the same dims must reproduce the input."""
+    x = torch.randn(3, 5, 7, device="cuda")
+
+    once = ntops.torch.flip(x, [0, 2])
+    twice = ntops.torch.flip(once, [0, 2])
+    assert torch.equal(twice, x)
+
+
+@skip_if_cuda_not_available
+def test_flip_duplicate_dims():
+    """Listing the same dim twice must raise ``RuntimeError``."""
+    x = torch.randn(4, 6, device="cuda")
+
+    with pytest.raises(RuntimeError):
+        ntops.torch.flip(x, [0, 0])
+
+
+@skip_if_cuda_not_available
+def test_flip_dim_out_of_range():
+    """A dim index beyond the input's rank must raise ``IndexError``."""
+    x = torch.randn(4, 6, device="cuda")
+
+    with pytest.raises(IndexError):
+        ntops.torch.flip(x, [2])
+
+
+# ---------------------------------------------------------------------------
+# Performance benchmark interface
+# ---------------------------------------------------------------------------
+
+def benchmark_flip(
+    shape,
+    dims=(-1,),
+    dtype=torch.float32,
+    device="cuda",
+    n_warmup=10,
+    n_repeat=100,
+):
+    """Benchmark ``ntops.torch.flip`` against ``torch.flip``.
+
+    Each implementation is warmed up, then timed over ``n_repeat`` calls with
+    CUDA events. The result dict reports the mean time per call (ms), the
+    effective bandwidth (GB/s) and the torch/ntops speedup. Bandwidth counts
+    one read of the input plus one write of the output (2x input bytes).
+
+    Example
+    -------
+    >>> results = benchmark_flip([4096, 4096], dims=(0,))
+    >>> print(results)
+    """
+    if device == "cuda" and not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available")
+
+    x = torch.randn(shape, dtype=dtype, device=device)
+    dims = list(dims)
+
+    # Warm-up calls for both implementations; not part of the timings.
+    for _ in range(n_warmup):
+        ntops.torch.flip(x, dims)
+        torch.flip(x, dims)
+    torch.cuda.synchronize()
+
+    tic = torch.cuda.Event(enable_timing=True)
+    toc = torch.cuda.Event(enable_timing=True)
+
+    tic.record()
+    for _ in range(n_repeat):
+        ntops.torch.flip(x, dims)
+    toc.record()
+    torch.cuda.synchronize()
+    ntops_ms = tic.elapsed_time(toc) / n_repeat
+
+    tic.record()
+    for _ in range(n_repeat):
+        torch.flip(x, dims)
+    toc.record()
+    torch.cuda.synchronize()
+    torch_ms = tic.elapsed_time(toc) / n_repeat
+
+    # Bytes moved per call under the read-once / write-once model.
+    traffic = 2 * x.numel() * x.element_size()
+    return {
+        "shape": shape,
+        "dims": dims,
+        "dtype": str(dtype),
+        "ntops_time_ms": ntops_ms,
+        "torch_time_ms": torch_ms,
+        "ntops_bandwidth_GBs": traffic / (ntops_ms * 1e-3) / 1e9,
+        "torch_bandwidth_GBs": traffic / (torch_ms * 1e-3) / 1e9,
+        "speedup": torch_ms / ntops_ms,
+    }
+
+
+_SWEEP_SHAPES = [
+    ([1024, 1024], (0,)),       # flip outer dim (coalesced inner)
+    ([1024, 1024], (1,)),       # flip inner dim (reversed reads)
+    ([4096, 4096], (0, 1)),     # flip both dims
+    ([8192, 8192], (1,)),       # largest case, inner dim only
+]
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+def test_benchmark_sweep(dtype):
+    """Print one benchmark row per ``_SWEEP_SHAPES`` entry. Run with:
+    pytest tests/test_flip.py::test_benchmark_sweep -v -s
+    """
+    header = (
+        f"{'shape':>16} {'dims':>8} {'MB':>8} "
+        f"{'ntops(ms)':>11} {'torch(ms)':>11} "
+        f"{'ntops(GB/s)':>13} {'torch(GB/s)':>13} {'speedup':>9}"
+    )
+    print(f"\n{'='*len(header)}")
+    print(f"flip sweep | dtype={dtype}")
+    print("=" * len(header))
+    print(header)
+    print("-" * len(header))
+
+    for shape, dims in _SWEEP_SHAPES:
+        res = benchmark_flip(shape, dims=dims, dtype=dtype)
+        # Bandwidth x time recovers the 2x-input byte count; halve it for MB.
+        moved = res["ntops_bandwidth_GBs"] * res["ntops_time_ms"] * 1e-3 * 1e9
+        mb = moved / 2 / 1e6
+        print(
+            f"{str(shape):>16} {str(dims):>8} {mb:>8.1f} "
+            f"{res['ntops_time_ms']:>11.4f} {res['torch_time_ms']:>11.4f} "
+            f"{res['ntops_bandwidth_GBs']:>13.1f} {res['torch_bandwidth_GBs']:>13.1f} "
+            f"{res['speedup']:>9.2f}"
+        )
+
+    print("=" * len(header))
+
+
+@skip_if_cuda_not_available
+def test_benchmark_interface():
+    """A short ``benchmark_flip`` run must report positive time and bandwidth."""
+    report = benchmark_flip([512, 512], dims=(0,), n_warmup=2, n_repeat=5)
+    for key in ("ntops_time_ms", "ntops_bandwidth_GBs"):
+        assert report[key] > 0
diff --git a/tests/test_fliplr.py b/tests/test_fliplr.py
new file mode 100644
index 0000000..c5a5ec6
--- /dev/null
+++ b/tests/test_fliplr.py
@@ -0,0 +1,196 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+
+# ---------------------------------------------------------------------------
+# Correctness tests (compared against torch.fliplr)
+# ---------------------------------------------------------------------------
+
+_SHAPES = [
+    [4, 6],          # small 2-D input
+    [1, 8],          # single row
+    [8, 1],          # single column
+    [16, 16],        # square
+    [2, 33],         # odd inner dim
+    [3, 5, 7],       # 3-D: only dim 1 is flipped
+    [2, 3, 4, 5],    # 4-D
+]
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+@pytest.mark.parametrize("shape", _SHAPES)
+def test_fliplr_float(shape, dtype):
+    """Floating-point inputs must flip exactly like ``torch.fliplr``."""
+    x = torch.randn(shape, dtype=dtype, device="cuda")
+
+    actual = ntops.torch.fliplr(x)
+    reference = torch.fliplr(x)
+
+    assert actual.shape == reference.shape
+    assert torch.equal(actual, reference)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.int32, torch.int64])
+@pytest.mark.parametrize("shape", _SHAPES)
+def test_fliplr_int(shape, dtype):
+    """Integer inputs must flip exactly like ``torch.fliplr``."""
+    x = torch.randint(-1000, 1000, shape, dtype=dtype, device="cuda")
+
+    actual = ntops.torch.fliplr(x)
+    reference = torch.fliplr(x)
+
+    assert actual.shape == reference.shape
+    assert torch.equal(actual, reference)
+
+
+@skip_if_cuda_not_available
+def test_fliplr_matches_flip_dim1():
+    """``fliplr`` must agree with ``flip`` over dim 1."""
+    x = torch.randn(4, 6, 8, device="cuda")
+    via_fliplr = ntops.torch.fliplr(x)
+    assert torch.equal(via_fliplr, ntops.torch.flip(x, (1,)))
+
+
+@skip_if_cuda_not_available
+def test_fliplr_non_contiguous():
+    """``fliplr`` of a transposed, non-contiguous view must match torch."""
+    base = torch.randn(6, 4, 8, device="cuda")
+    view = base.transpose(0, 2)
+
+    actual = ntops.torch.fliplr(view)
+    reference = torch.fliplr(view)
+
+    assert actual.shape == reference.shape
+    assert torch.equal(actual, reference)
+
+
+@skip_if_cuda_not_available
+def test_fliplr_double_flip_is_identity():
+    """Applying ``fliplr`` twice must reproduce the input."""
+    x = torch.randn(5, 7, device="cuda")
+    twice = ntops.torch.fliplr(ntops.torch.fliplr(x))
+    assert torch.equal(twice, x)
+
+
+@skip_if_cuda_not_available
+def test_fliplr_1d_raises():
+    """A 1-D input must be rejected with ``RuntimeError``."""
+    x = torch.randn(8, device="cuda")
+
+    with pytest.raises(RuntimeError):
+        ntops.torch.fliplr(x)
+
+
+# ---------------------------------------------------------------------------
+# Performance benchmark interface
+# ---------------------------------------------------------------------------
+
+def benchmark_fliplr(
+    shape,
+    dtype=torch.float32,
+    device="cuda",
+    n_warmup=10,
+    n_repeat=100,
+):
+    """Benchmark ``ntops.torch.fliplr`` against ``torch.fliplr``.
+
+    Each implementation is warmed up, then timed over ``n_repeat`` calls with
+    CUDA events. The result dict reports the mean time per call (ms), the
+    effective bandwidth (GB/s) and the torch/ntops speedup. Bandwidth counts
+    one read of the input plus one write of the output (2x input bytes).
+
+    Example
+    -------
+    >>> results = benchmark_fliplr([4096, 4096])
+    >>> print(results)
+    """
+    if device == "cuda" and not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available")
+
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    # Warm-up calls for both implementations; not part of the timings.
+    for _ in range(n_warmup):
+        ntops.torch.fliplr(x)
+        torch.fliplr(x)
+    torch.cuda.synchronize()
+
+    tic = torch.cuda.Event(enable_timing=True)
+    toc = torch.cuda.Event(enable_timing=True)
+
+    tic.record()
+    for _ in range(n_repeat):
+        ntops.torch.fliplr(x)
+    toc.record()
+    torch.cuda.synchronize()
+    ntops_ms = tic.elapsed_time(toc) / n_repeat
+
+    tic.record()
+    for _ in range(n_repeat):
+        torch.fliplr(x)
+    toc.record()
+    torch.cuda.synchronize()
+    torch_ms = tic.elapsed_time(toc) / n_repeat
+
+    # Bytes moved per call under the read-once / write-once model.
+    traffic = 2 * x.numel() * x.element_size()
+    return {
+        "shape": shape,
+        "dtype": str(dtype),
+        "ntops_time_ms": ntops_ms,
+        "torch_time_ms": torch_ms,
+        "ntops_bandwidth_GBs": traffic / (ntops_ms * 1e-3) / 1e9,
+        "torch_bandwidth_GBs": traffic / (torch_ms * 1e-3) / 1e9,
+        "speedup": torch_ms / ntops_ms,
+    }
+
+
+_SWEEP_SHAPES = [
+    [1024, 1024],  # 4 MiB at fp32
+    [4096, 4096],  # 64 MiB at fp32
+    [8192, 8192],  # 256 MiB at fp32
+]
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+def test_benchmark_sweep(dtype):
+    """Print one benchmark row per ``_SWEEP_SHAPES`` entry. Run with:
+    pytest tests/test_fliplr.py::test_benchmark_sweep -v -s
+    """
+    header = (
+        f"{'shape':>16} {'MB':>8} "
+        f"{'ntops(ms)':>11} {'torch(ms)':>11} "
+        f"{'ntops(GB/s)':>13} {'torch(GB/s)':>13} {'speedup':>9}"
+    )
+    print(f"\n{'='*len(header)}")
+    print(f"fliplr sweep | dtype={dtype}")
+    print("=" * len(header))
+    print(header)
+    print("-" * len(header))
+
+    for shape in _SWEEP_SHAPES:
+        res = benchmark_fliplr(shape, dtype=dtype)
+        # Bandwidth x time recovers the 2x-input byte count; halve it for MB.
+        moved = res["ntops_bandwidth_GBs"] * res["ntops_time_ms"] * 1e-3 * 1e9
+        mb = moved / 2 / 1e6
+        print(
+            f"{str(shape):>16} {mb:>8.1f} "
+            f"{res['ntops_time_ms']:>11.4f} {res['torch_time_ms']:>11.4f} "
+            f"{res['ntops_bandwidth_GBs']:>13.1f} {res['torch_bandwidth_GBs']:>13.1f} "
+            f"{res['speedup']:>9.2f}"
+        )
+
+    print("=" * len(header))
+
+
+@skip_if_cuda_not_available
+def test_benchmark_interface():
+    """A short ``benchmark_fliplr`` run must report positive time and bandwidth."""
+    report = benchmark_fliplr([512, 512], n_warmup=2, n_repeat=5)
+    for key in ("ntops_time_ms", "ntops_bandwidth_GBs"):
+        assert report[key] > 0
diff --git a/tests/test_mse_loss.py b/tests/test_mse_loss.py
new file mode 100644
index 0000000..e323571
--- /dev/null
+++ b/tests/test_mse_loss.py
@@ -0,0 +1,192 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+
+# ---------------------------------------------------------------------------
+# Correctness tests (compared against torch.nn.functional.mse_loss)
+# ---------------------------------------------------------------------------
+
+_SHAPES = [
+    [16],            # small 1-D input
+    [1024],
+    [4097],          # not a multiple of the reduction block size
+    [32, 64],        # 2-D
+    [8, 7, 5],       # 3-D with odd sizes
+    [4, 3, 16, 16],  # 4-D
+    [1],             # single element
+]
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype, rtol, atol", [
+    (torch.float32, 1e-3, 1e-3),
+    (torch.float16, 1e-2, 1e-2),
+])
+@pytest.mark.parametrize("reduction", ["none", "mean", "sum"])
+@pytest.mark.parametrize("shape", _SHAPES)
+def test_mse_loss(shape, reduction, dtype, rtol, atol):
+    """Shape, dtype and values must match ``F.mse_loss`` for each reduction."""
+    x = torch.randn(shape, dtype=dtype, device="cuda")
+    y = torch.randn(shape, dtype=dtype, device="cuda")
+
+    actual = ntops.torch.mse_loss(x, y, reduction=reduction)
+    reference = F.mse_loss(x, y, reduction=reduction)
+
+    assert actual.shape == reference.shape
+    assert actual.dtype == reference.dtype
+    assert torch.allclose(actual, reference, rtol=rtol, atol=atol)
+
+
+@skip_if_cuda_not_available
+def test_mse_loss_default_reduction_is_mean():
+    """Omitting ``reduction`` must give the same result as torch's default."""
+    x = torch.randn(2, 3, 4, device="cuda")
+    y = torch.randn(2, 3, 4, device="cuda")
+
+    actual = ntops.torch.mse_loss(x, y)
+    reference = F.mse_loss(x, y)
+
+    assert torch.allclose(actual, reference, rtol=1e-3, atol=1e-3)
+
+
+@skip_if_cuda_not_available
+def test_mse_loss_broadcast():
+    """A ``target`` of lower rank must broadcast against ``input``."""
+    x = torch.randn(4, 3, 8, device="cuda")
+    y = torch.randn(3, 8, device="cuda")
+
+    actual = ntops.torch.mse_loss(x, y, reduction="sum")
+
+    # The reference gets an explicitly expanded, contiguous target.
+    y_full = y.expand_as(x).contiguous()
+    reference = F.mse_loss(x, y_full, reduction="sum")
+    assert torch.allclose(actual, reference, rtol=1e-3, atol=1e-3)
+
+
+@skip_if_cuda_not_available
+def test_mse_loss_invalid_reduction():
+    """An unknown ``reduction`` name must raise ``ValueError``."""
+    x = torch.randn(8, device="cuda")
+    y = torch.randn(8, device="cuda")
+
+    with pytest.raises(ValueError):
+        ntops.torch.mse_loss(x, y, reduction="median")
+
+
+# ---------------------------------------------------------------------------
+# Performance benchmark interface
+# ---------------------------------------------------------------------------
+
+def benchmark_mse_loss(
+    shape,
+    reduction="mean",
+    dtype=torch.float32,
+    device="cuda",
+    n_warmup=10,
+    n_repeat=100,
+):
+    """Benchmark ``ntops.torch.mse_loss`` against ``F.mse_loss``.
+
+    Each implementation is warmed up, then timed over ``n_repeat`` calls with
+    CUDA events. The result dict reports the mean time per call (ms), the
+    effective bandwidth (GB/s) and the torch/ntops speedup. Bandwidth counts
+    one read each of ``input`` and ``target`` (2x input bytes).
+
+    Example
+    -------
+    >>> results = benchmark_mse_loss([4096, 4096], "mean")
+    >>> print(results)
+    """
+    if device == "cuda" and not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available")
+
+    x = torch.randn(shape, dtype=dtype, device=device)
+    y = torch.randn(shape, dtype=dtype, device=device)
+
+    # Warm-up calls for both implementations; not part of the timings.
+    for _ in range(n_warmup):
+        ntops.torch.mse_loss(x, y, reduction=reduction)
+        F.mse_loss(x, y, reduction=reduction)
+    torch.cuda.synchronize()
+
+    tic = torch.cuda.Event(enable_timing=True)
+    toc = torch.cuda.Event(enable_timing=True)
+
+    tic.record()
+    for _ in range(n_repeat):
+        ntops.torch.mse_loss(x, y, reduction=reduction)
+    toc.record()
+    torch.cuda.synchronize()
+    ntops_ms = tic.elapsed_time(toc) / n_repeat
+
+    tic.record()
+    for _ in range(n_repeat):
+        F.mse_loss(x, y, reduction=reduction)
+    toc.record()
+    torch.cuda.synchronize()
+    torch_ms = tic.elapsed_time(toc) / n_repeat
+
+    # Bytes read per call: the two operands, each once.
+    traffic = 2 * x.numel() * x.element_size()
+    return {
+        "shape": shape,
+        "reduction": reduction,
+        "dtype": str(dtype),
+        "ntops_time_ms": ntops_ms,
+        "torch_time_ms": torch_ms,
+        "ntops_bandwidth_GBs": traffic / (ntops_ms * 1e-3) / 1e9,
+        "torch_bandwidth_GBs": traffic / (torch_ms * 1e-3) / 1e9,
+        "speedup": torch_ms / ntops_ms,
+    }
+
+
+_SWEEP_SHAPES = [
+    [1024, 1024],      # 4 MiB per fp32 tensor
+    [4096, 4096],      # 64 MiB per fp32 tensor
+    [8192, 8192],      # 256 MiB per fp32 tensor
+]
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+@pytest.mark.parametrize("reduction", ["none", "mean", "sum"])
+def test_benchmark_sweep(reduction, dtype):
+    """Sweep tensor sizes. Run with:
+    pytest tests/test_mse_loss.py::test_benchmark_sweep -v -s
+    """
+    header = (
+        f"{'shape':>16} {'MB':>8} "
+        f"{'ntops(ms)':>11} {'torch(ms)':>11} "
+        f"{'ntops(GB/s)':>13} {'torch(GB/s)':>13} {'speedup':>9}"
+    )
+    print(f"\n{'='*len(header)}")
+    print(f"mse_loss sweep | reduction={reduction} | dtype={dtype}")
+    print("=" * len(header))
+    print(header)
+    print("-" * len(header))
+
+    # Size each case from its shape alone instead of allocating a host tensor
+    # of that shape (up to 256 MB here) just to count its elements.
+    itemsize = torch.empty(0, dtype=dtype).element_size()
+    for shape in _SWEEP_SHAPES:
+        res = benchmark_mse_loss(shape, reduction=reduction, dtype=dtype)
+        mb = torch.Size(shape).numel() * itemsize / 1e6
+        print(
+            f"{str(shape):>16} {mb:>8.1f} "
+            f"{res['ntops_time_ms']:>11.4f} {res['torch_time_ms']:>11.4f} "
+            f"{res['ntops_bandwidth_GBs']:>13.1f} {res['torch_bandwidth_GBs']:>13.1f} "
+            f"{res['speedup']:>9.2f}"
+        )
+
+    print("=" * len(header))
+
+
+@skip_if_cuda_not_available
+def test_benchmark_interface():
+    """A short ``benchmark_mse_loss`` run must report positive time and bandwidth."""
+    report = benchmark_mse_loss([512, 512], n_warmup=2, n_repeat=5)
+    for key in ("ntops_time_ms", "ntops_bandwidth_GBs"):
+        assert report[key] > 0
diff --git a/tests/test_pixel_unshuffle.py b/tests/test_pixel_unshuffle.py
new file mode 100644
index 0000000..1dd355a
--- /dev/null
+++ b/tests/test_pixel_unshuffle.py
@@ -0,0 +1,190 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+
+# ---------------------------------------------------------------------------
+# Correctness tests (compared against torch.nn.functional.pixel_unshuffle)
+# ---------------------------------------------------------------------------
+
+_FLOAT_CASES = [
+    # (shape, downscale_factor); despite the name, the int test reuses these too
+    ([1, 1, 4, 4], 2),
+    ([2, 3, 8, 8], 2),
+    ([4, 8, 12, 12], 3),
+    ([2, 16, 16, 16], 4),
+    ([8, 4, 32, 32], 2),
+    ([3, 5, 6, 9], 3),       # H != W
+    ([1, 1, 2, 2], 2),       # minimal
+]
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+@pytest.mark.parametrize("shape, r", _FLOAT_CASES)
+def test_pixel_unshuffle_matches_torch(shape, r, dtype):
+    """Floating-point inputs must match ``F.pixel_unshuffle`` exactly."""
+    x = torch.randn(shape, dtype=dtype, device="cuda")
+
+    actual = ntops.torch.pixel_unshuffle(x, r)
+    reference = F.pixel_unshuffle(x, r)
+
+    assert actual.shape == reference.shape
+    assert torch.equal(actual, reference)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.int32, torch.int64])
+@pytest.mark.parametrize("shape, r", _FLOAT_CASES)
+def test_pixel_unshuffle_int(shape, r, dtype):
+    """Integer inputs must match ``F.pixel_unshuffle`` exactly."""
+    x = torch.randint(-1000, 1000, shape, dtype=dtype, device="cuda")
+
+    actual = ntops.torch.pixel_unshuffle(x, r)
+    reference = F.pixel_unshuffle(x, r)
+
+    assert actual.shape == reference.shape
+    assert torch.equal(actual, reference)
+
+
+@skip_if_cuda_not_available
+def test_pixel_unshuffle_leading_dims():
+    """Inputs shaped ``(*, C, H, W)`` with extra leading dims must work."""
+    # Two leading dims ahead of (C, H, W) = (4, 16, 16).
+    x = torch.randn(2, 3, 4, 16, 16, device="cuda")
+
+    actual = ntops.torch.pixel_unshuffle(x, 4)
+    reference = F.pixel_unshuffle(x, 4)
+
+    assert actual.shape == reference.shape
+    assert torch.equal(actual, reference)
+
+
+@skip_if_cuda_not_available
+def test_pixel_unshuffle_roundtrip():
+    """``F.pixel_shuffle`` must undo ``ntops.torch.pixel_unshuffle``."""
+    r = 2
+    x = torch.randn(2, 3, 8, 8, device="cuda")
+
+    # Unshuffle with ntops, then shuffle back with the torch reference op.
+    unshuffled = ntops.torch.pixel_unshuffle(x, r)
+    round_tripped = F.pixel_shuffle(unshuffled, r)
+
+    assert torch.equal(round_tripped, x)
+
+
+# ---------------------------------------------------------------------------
+# Performance benchmark interface
+# ---------------------------------------------------------------------------
+
+def benchmark_pixel_unshuffle(
+    shape,
+    downscale_factor=2,
+    dtype=torch.float32,
+    device="cuda",
+    n_warmup=10,
+    n_repeat=100,
+):
+    """Benchmark ``ntops.torch.pixel_unshuffle`` against ``F.pixel_unshuffle``.
+
+    Each implementation is warmed up, then timed over ``n_repeat`` calls with
+    CUDA events. The result dict reports the mean time per call (ms), the
+    effective bandwidth (GB/s) and the torch/ntops speedup. Bandwidth counts
+    one read of the input plus one write of the output (2x input bytes).
+
+    Example
+    -------
+    >>> results = benchmark_pixel_unshuffle([32, 64, 128, 128], 2)
+    >>> print(results)
+    """
+    if device == "cuda" and not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available")
+
+    r = downscale_factor
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    # Warm-up calls for both implementations; not part of the timings.
+    for _ in range(n_warmup):
+        ntops.torch.pixel_unshuffle(x, r)
+        F.pixel_unshuffle(x, r)
+    torch.cuda.synchronize()
+
+    tic = torch.cuda.Event(enable_timing=True)
+    toc = torch.cuda.Event(enable_timing=True)
+
+    tic.record()
+    for _ in range(n_repeat):
+        ntops.torch.pixel_unshuffle(x, r)
+    toc.record()
+    torch.cuda.synchronize()
+    ntops_ms = tic.elapsed_time(toc) / n_repeat
+
+    tic.record()
+    for _ in range(n_repeat):
+        F.pixel_unshuffle(x, r)
+    toc.record()
+    torch.cuda.synchronize()
+    torch_ms = tic.elapsed_time(toc) / n_repeat
+
+    # read input + write output == 2x input size
+    traffic = 2 * x.numel() * x.element_size()
+    return {
+        "shape": shape,
+        "downscale_factor": r,
+        "dtype": str(dtype),
+        "ntops_time_ms": ntops_ms,
+        "torch_time_ms": torch_ms,
+        "ntops_bandwidth_GBs": traffic / (ntops_ms * 1e-3) / 1e9,
+        "torch_bandwidth_GBs": traffic / (torch_ms * 1e-3) / 1e9,
+        "speedup": torch_ms / ntops_ms,
+    }
+
+
+_SWEEP_SHAPES = [
+    ([4, 16, 32, 32], 2),     # 0.25 MiB of fp32 input
+    ([8, 64, 64, 64], 2),     # 8 MiB of fp32 input
+    ([16, 128, 64, 64], 2),   # 32 MiB of fp32 input
+    ([32, 256, 64, 64], 2),   # 128 MiB of fp32 input
+]
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
+def test_benchmark_sweep(dtype):
+    """Print the size sweep; run: pytest tests/test_pixel_unshuffle.py::test_benchmark_sweep -v -s"""
+    header = (
+        f"{'shape':>22} {'r':>3} {'MB':>8} "
+        f"{'ntops(ms)':>11} {'torch(ms)':>11} "
+        f"{'ntops(GB/s)':>13} {'torch(GB/s)':>13} {'speedup':>9}"
+    )
+    print(f"\n{'='*len(header)}")
+    print(f"pixel_unshuffle sweep | dtype={dtype}")
+    print("=" * len(header))
+    print(header)
+    print("-" * len(header))
+
+    for shape, r in _SWEEP_SHAPES:
+        res = benchmark_pixel_unshuffle(shape, downscale_factor=r, dtype=dtype)
+        # Bandwidth x time recovers the 2x-input byte count; halve it for MB.
+        moved = res["ntops_bandwidth_GBs"] * res["ntops_time_ms"] * 1e-3 * 1e9
+        mb = moved / 2 / 1e6
+        print(
+            f"{str(shape):>22} {r:>3} {mb:>8.1f} "
+            f"{res['ntops_time_ms']:>11.4f} {res['torch_time_ms']:>11.4f} "
+            f"{res['ntops_bandwidth_GBs']:>13.1f} {res['torch_bandwidth_GBs']:>13.1f} "
+            f"{res['speedup']:>9.2f}"
+        )
+
+    print("=" * len(header))
+
+
+@skip_if_cuda_not_available
+def test_benchmark_interface():
+    """A short benchmark run must report positive time and bandwidth."""
+    report = benchmark_pixel_unshuffle(
+        [8, 32, 32, 32], downscale_factor=2, n_warmup=2, n_repeat=5
+    )
+    for key in ("ntops_time_ms", "ntops_bandwidth_GBs"):
+        assert report[key] > 0

From c71f3c39b7f888f8c0ef153ca57b39734ced014f Mon Sep 17 00:00:00 2001
From: sunteng <steng2009@163.com>
Date: Sat, 20 Jun 2026 16:32:19 +0800
Subject: [PATCH 2/2] fix

Signed-off-by: sunteng <steng2009@163.com>
---
 bench/bench_t1_1_7.py | 101 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 bench/bench_t1_1_7.py

diff --git a/bench/bench_t1_1_7.py b/bench/bench_t1_1_7.py
new file mode 100644
index 0000000..4ad23dc
--- /dev/null
+++ b/bench/bench_t1_1_7.py
@@ -0,0 +1,101 @@
+"""Benchmark T1-1-7 operators vs torch.
+
+    feature_alpha_dropout / mse_loss / flip / fliplr / pixel_unshuffle
+
+    python bench/bench_t1_1_7.py
+"""
+
+import torch
+import torch.nn.functional as F
+import triton.testing
+
+import ntops
+
+DEVICE = "cuda"
+DTYPE = torch.float32
+
+
+def _report(name, shape_str, ms_nt, ms_th, nbytes):
+    """Print one row: GB/s for ntops and torch, plus the torch/ntops time ratio."""
+    # bytes per millisecond scaled by 1e-6 gives GB/s
+    gbps_ntops, gbps_torch = nbytes / ms_nt * 1e-6, nbytes / ms_th * 1e-6
+    ratio = ms_th / ms_nt
+    line = f"  {name:22s} {shape_str:22s} 九齿 {gbps_ntops:7.0f} GB/s | "
+    line += f"torch {gbps_torch:7.0f} GB/s | speedup {ratio:.2f}x"
+    print(line)
+
+
+def bench_feature_alpha_dropout():
+    """Benchmark feature_alpha_dropout (p=0.5, training=True): ntops vs torch."""
+    print("\n[feature_alpha_dropout]")
+    opts = {"p": 0.5, "training": True}
+    for dims in ((64, 256, 32, 32), (128, 512, 16, 16), (32, 256, 64, 64)):
+        inp = torch.randn(dims, dtype=DTYPE, device=DEVICE)
+        traffic = inp.numel() * inp.element_size() * 2  # input size counted twice
+        t_ntops = triton.testing.do_bench(
+            lambda: ntops.torch.feature_alpha_dropout(inp, **opts)
+        )
+        t_torch = triton.testing.do_bench(lambda: F.feature_alpha_dropout(inp, **opts))
+        _report("feature_alpha_dropout", str(dims), t_ntops, t_torch, traffic)
+
+
+def bench_mse_loss():
+    """Benchmark mean-reduced mse_loss on 2-D and 1-D inputs: ntops vs torch."""
+    print("\n[mse_loss]")
+    bench = triton.testing.do_bench
+    for dims in ((4096, 4096), (8192, 8192), (4096 * 4096,)):
+        pred = torch.randn(dims, dtype=DTYPE, device=DEVICE)
+        target = torch.randn(dims, dtype=DTYPE, device=DEVICE)
+        # Only the two input reads are counted; the reduced output is not
+        # included in the byte total.
+        traffic = pred.numel() * pred.element_size() * 2
+        t_ntops = bench(lambda: ntops.torch.mse_loss(pred, target, reduction="mean"))
+        t_torch = bench(lambda: F.mse_loss(pred, target, reduction="mean"))
+        _report("mse_loss", str(dims), t_ntops, t_torch, traffic)
+
+
+def bench_flip():
+    """Benchmark flip over dim 0, dim 1 and both dims: ntops vs torch."""
+    print("\n[flip]")
+    for size, axes in [((4096, 4096), (0,)), ((4096, 4096), (1,)), ((8192, 8192), (0, 1))]:
+        src = torch.randn(size, dtype=DTYPE, device=DEVICE)
+        traffic = src.numel() * src.element_size() * 2  # one read plus one write
+        nt_ms = triton.testing.do_bench(lambda: ntops.torch.flip(src, axes))
+        th_ms = triton.testing.do_bench(lambda: torch.flip(src, axes))
+        _report("flip", f"{size} dims={axes}", nt_ms, th_ms, traffic)
+
+
+def bench_fliplr():
+    """Benchmark fliplr on two square matrices: ntops vs torch."""
+    print("\n[fliplr]")
+    for size in ((4096, 4096), (8192, 8192)):
+        src = torch.randn(size, dtype=DTYPE, device=DEVICE)
+        nt_ms = triton.testing.do_bench(lambda: ntops.torch.fliplr(src))
+        th_ms = triton.testing.do_bench(lambda: torch.fliplr(src))
+        _report("fliplr", str(size), nt_ms, th_ms, src.numel() * src.element_size() * 2)
+
+
+def bench_pixel_unshuffle():
+    """Benchmark pixel_unshuffle with downscale factors 2 and 4: ntops vs torch."""
+    print("\n[pixel_unshuffle]")
+    specs = (((32, 64, 112, 112), 2), ((16, 128, 128, 128), 4), ((64, 64, 64, 64), 2))
+    for size, scale in specs:
+        src = torch.randn(size, dtype=DTYPE, device=DEVICE)
+        # Input size counted twice (presumably one read plus one write).
+        traffic = src.numel() * src.element_size() * 2
+        nt_ms = triton.testing.do_bench(lambda: ntops.torch.pixel_unshuffle(src, scale))
+        th_ms = triton.testing.do_bench(lambda: F.pixel_unshuffle(src, scale))
+        _report("pixel_unshuffle", f"{size} r={scale}", nt_ms, th_ms, traffic)
+
+
+def main():
+    """Print the CUDA device name and dtype, then run every operator benchmark."""
+    print(f"device: {torch.cuda.get_device_name()}  dtype: {DTYPE}")
+    benchmarks = [bench_feature_alpha_dropout, bench_mse_loss, bench_flip]
+    benchmarks += [bench_fliplr, bench_pixel_unshuffle]
+    for run_benchmark in benchmarks:
+        run_benchmark()
+
+
+if __name__ == "__main__":  # run the benchmarks only when executed as a script
+    main()