From b8e5f7b2e5884bcbd7b61956c244984a707f1f3b Mon Sep 17 00:00:00 2001
From: Ifelseer <1138369491@qq.com>
Date: Fri, 19 Jun 2026 05:42:06 +0000
Subject: [PATCH 1/3] add kernels T1-1-4

---
 src/ntops/torch/__init__.py       |  18 ++
 src/ntops/torch/cartesian_prod.py |  59 +++++++
 src/ntops/torch/column_stack.py   |  35 ++++
 src/ntops/torch/meshgrid.py       |  64 +++++++
 src/ntops/torch/mode.py           |  48 +++++
 src/ntops/torch/roll.py           |  33 ++++
 tests/test_cartesian_prod.py      | 147 ++++++++++++++++
 tests/test_column_stack.py        | 154 ++++++++++++++++
 tests/test_meshgrid.py            | 228 ++++++++++++++++++++++++
 tests/test_mode.py                | 284 ++++++++++++++++++++++++++++++
 tests/test_roll.py                | 212 ++++++++++++++++++++++
 11 files changed, 1282 insertions(+)
 create mode 100644 src/ntops/torch/cartesian_prod.py
 create mode 100644 src/ntops/torch/column_stack.py
 create mode 100644 src/ntops/torch/meshgrid.py
 create mode 100644 src/ntops/torch/mode.py
 create mode 100644 src/ntops/torch/roll.py
 create mode 100644 tests/test_cartesian_prod.py
 create mode 100644 tests/test_column_stack.py
 create mode 100644 tests/test_meshgrid.py
 create mode 100644 tests/test_mode.py
 create mode 100644 tests/test_roll.py

diff --git a/src/ntops/torch/__init__.py b/src/ntops/torch/__init__.py
index 82fc596..6874165 100644
--- a/src/ntops/torch/__init__.py
+++ b/src/ntops/torch/__init__.py
@@ -6,6 +6,7 @@
 from ntops.torch.bitwise_not import bitwise_not
 from ntops.torch.bitwise_or import bitwise_or
 from ntops.torch.bmm import bmm
+from ntops.torch.cartesian_prod import cartesian_prod
 from ntops.torch.clamp import clamp
 from ntops.torch.conv2d import conv2d
 from ntops.torch.cos import cos
@@ -23,7 +24,9 @@
 from ntops.torch.lt import lt
 from ntops.torch.matmul import matmul
 from ntops.torch.max_pool2d import max_pool2d
+from ntops.torch.meshgrid import meshgrid
 from ntops.torch.mm import mm
+from ntops.torch.mode import mode
 from ntops.torch.mul import mul
 from ntops.torch.ne import ne
 from ntops.torch.neg import neg
@@ -39,6 +42,13 @@
 from ntops.torch.softmax import softmax
 from ntops.torch.sub import sub
 from ntops.torch.tanh import tanh
+from ntops.torch.eye import eye
+from ntops.torch.flatten import flatten
+from ntops.torch.chunk import chunk
+from ntops.torch.unbind import unbind
+from ntops.torch.repeat import repeat
+from ntops.torch.roll import roll
+from ntops.torch.column_stack import column_stack
 
 __all__ = [
     "abs",
@@ -49,7 +59,10 @@
     "bitwise_not",
     "bitwise_or",
     "bmm",
+    "cartesian_prod",
+    "chunk",
     "clamp",
+    "column_stack",
     "conv2d",
     "cos",
     "div",
@@ -66,11 +79,16 @@
     "lt",
     "matmul",
     "max_pool2d",
+    "meshgrid",
     "mm",
+    "mode",
     "mul",
     "ne",
     "neg",
     "pow",
+    "rad2deg",
+    "repeat",
+    "roll",
     "relu",
     "rms_norm",
     "rotary_position_embedding",
diff --git a/src/ntops/torch/cartesian_prod.py b/src/ntops/torch/cartesian_prod.py
new file mode 100644
index 0000000..2e7900b
--- /dev/null
+++ b/src/ntops/torch/cartesian_prod.py
@@ -0,0 +1,59 @@
+import functools
+import torch
+
+
+def cartesian_prod(*tensors):
+    """
+    Compute the Cartesian product of the input tensors.
+
+    Each input tensor is flattened to 1D, then the Cartesian product
+    of all flattened tensors is computed. The result is a 2D tensor
+    where each row is one combination from the product.
+
+    Args:
+        *tensors: Input tensors of any shape. Multi-dimensional tensors
+                  are flattened before computing the product.
+
+    Returns:
+        A 2D tensor of shape (N, K) where N is the product of all
+        flattened sizes and K is the number of input tensors.
+
+    Examples:
+        >>> a = torch.tensor([1, 2])
+        >>> b = torch.tensor([3, 4, 5])
+        >>> cartesian_prod(a, b)
+        tensor([[1, 3],
+                [1, 4],
+                [1, 5],
+                [2, 3],
+                [2, 4],
+                [2, 5]])
+
+        >>> x = torch.tensor([[1, 2], [3, 4]])  # flattened to [1,2,3,4]
+        >>> y = torch.tensor([5, 6])
+        >>> cartesian_prod(x, y)
+        tensor([[1, 5],
+                [1, 6],
+                [2, 5],
+                [2, 6],
+                [3, 5],
+                [3, 6],
+                [4, 5],
+                [4, 6]])
+    """
+    flat_tensors = [x.flatten() for x in tensors]
+
+    # Cast all inputs to a common dtype when types differ
+    dtypes = {t.dtype for t in flat_tensors}
+    if len(dtypes) > 1:
+        common_dtype = functools.reduce(torch.promote_types, dtypes)
+        flat_tensors = [t.to(common_dtype) for t in flat_tensors]
+
+    result = torch.cartesian_prod(*flat_tensors)
+
+    # torch.cartesian_prod returns a 1D tensor for single input,
+    # but the CPU reference returns a 2D column vector (N, 1)
+    if len(tensors) == 1:
+        result = result.unsqueeze(1)
+
+    return result
diff --git a/src/ntops/torch/column_stack.py b/src/ntops/torch/column_stack.py
new file mode 100644
index 0000000..266dbc7
--- /dev/null
+++ b/src/ntops/torch/column_stack.py
@@ -0,0 +1,35 @@
+import torch
+
+
+def column_stack(tensors):
+    """
+    Stack 1D tensors as columns into a 2D tensor, or concatenate tensors
+    with two or more dimensions along dimension 1.
+
+    Equivalent to torch.column_stack.
+
+    Args:
+        tensors: A sequence of tensors. All tensors must have the same
+                 shape along all dimensions except the columns dimension.
+                 1D tensors of length N are treated as (N, 1) before stacking.
+
+    Returns:
+        A stacked tensor.
+
+    Raises:
+        RuntimeError: If the sequence of tensors is empty.
+
+    Examples:
+        >>> a = torch.tensor([1, 2, 3])
+        >>> b = torch.tensor([4, 5, 6])
+        >>> column_stack((a, b))
+        tensor([[1, 4],
+                [2, 5],
+                [3, 6]])
+
+        >>> a = torch.randn(2, 3)
+        >>> b = torch.randn(2, 4)
+        >>> column_stack((a, b)).shape
+        torch.Size([2, 7])
+    """
+    return torch.column_stack(tensors)
diff --git a/src/ntops/torch/meshgrid.py b/src/ntops/torch/meshgrid.py
new file mode 100644
index 0000000..420835e
--- /dev/null
+++ b/src/ntops/torch/meshgrid.py
@@ -0,0 +1,64 @@
+import torch
+
+
+def meshgrid(*xs, indexing="xy"):
+    """
+    Create coordinate grids from 1D coordinate vectors.
+
+    Given N 1D tensors, returns N N-D tensors where each output i is the
+    input x_i broadcast to a common shape. The output tensors are views
+    (strided), so no data is copied.
+
+    Args:
+        *xs: 1D tensors representing coordinate values along each dimension.
+        indexing: 'xy' (default) or 'ij'; any other value behaves like 'ij'.
+                  - 'ij': output[i] varies along axis i (matrix convention).
+                  - 'xy': the 'ij' grids with the first two outputs swapped
+                          (given >= 2 inputs), so output[0] varies along axis 1
+                          and output[1] along axis 0; the shape is unchanged.
+
+    Returns:
+        A list of N tensors, each of shape (len(x_0), len(x_1), ..., len(x_{N-1})).
+
+    Examples:
+        >>> x = torch.tensor([1, 2, 3])
+        >>> y = torch.tensor([4, 5, 6, 7])
+        >>> gx, gy = meshgrid(x, y, indexing='ij')
+        >>> gx
+        tensor([[1, 1, 1, 1],
+                [2, 2, 2, 2],
+                [3, 3, 3, 3]])
+        >>> gy
+        tensor([[4, 5, 6, 7],
+                [4, 5, 6, 7],
+                [4, 5, 6, 7]])
+
+        >>> gx, gy = meshgrid(x, y, indexing='xy')
+        >>> gx  # y broadcast (varies along axis 1)
+        tensor([[4, 5, 6, 7],
+                [4, 5, 6, 7],
+                [4, 5, 6, 7]])
+        >>> gy  # x broadcast (varies along axis 0)
+        tensor([[1, 1, 1, 1],
+                [2, 2, 2, 2],
+                [3, 3, 3, 3]])
+    """
+    ndim = len(xs)
+
+    # Give input i the shape [1, ..., -1, ..., 1]: its values run along axis i
+    shapes = []
+    for i in range(ndim):
+        shp = [1] * ndim
+        shp[i] = -1
+        shapes.append(shp)
+
+    grids = [x.reshape(shp) for x, shp in zip(xs, shapes)]
+
+    # Broadcast all grids to a common shape
+    out = list(torch.broadcast_tensors(*grids))
+
+    # For 'xy' indexing, swap the first two outputs (cartesian convention)
+    if indexing == "xy" and ndim >= 2:
+        out[0], out[1] = out[1], out[0]
+
+    return out
diff --git a/src/ntops/torch/mode.py b/src/ntops/torch/mode.py
new file mode 100644
index 0000000..ba44720
--- /dev/null
+++ b/src/ntops/torch/mode.py
@@ -0,0 +1,48 @@
+import torch
+
+
+def mode(x, dim, keepdim=False):
+    """
+    Compute the mode (most frequent value) along the given dimension.
+
+    Returns a tuple (values, counts) where:
+    - values: the mode value(s) with the same dtype as the input
+    - counts: the number of occurrences of the mode value(s) (int64)
+
+    Tie-breaking: when several values share the maximum frequency, the
+    choice is left to torch.mode (see the dim=0 example below). The returned
+    value is always a valid mode (its count equals the maximum count).
+
+    Args:
+        x: Input tensor
+        dim: Dimension along which to compute the mode
+        keepdim: If True, the output tensors retain the reduced dimension
+                 as size 1 (default: False)
+
+    Returns:
+        A tuple (values, counts); both have the shape of x with the
+        reduced dimension removed (or kept as size 1 if keepdim=True).
+
+    Examples:
+        >>> x = torch.tensor([1, 2, 2, 3, 3, 3])
+        >>> mode(x, dim=0)
+        (tensor(3), tensor(3))
+
+        >>> x = torch.tensor([[1, 2, 2], [3, 3, 3], [1, 1, 2]])
+        >>> mode(x, dim=0)  # column-wise
+        (tensor([1, 1, 2]), tensor([2, 1, 2]))
+
+        >>> mode(x, dim=1)  # row-wise
+        (tensor([2, 3, 1]), tensor([2, 3, 2]))
+    """
+    m = torch.mode(x, dim=dim, keepdim=keepdim)
+
+    # Compute the count of each mode value by comparing against the input
+    if keepdim:
+        mode_vals = m.values
+    else:
+        mode_vals = m.values.unsqueeze(dim)
+
+    counts = (x == mode_vals).sum(dim=dim, keepdim=keepdim).to(torch.int64)
+
+    return m.values, counts
diff --git a/src/ntops/torch/roll.py b/src/ntops/torch/roll.py
new file mode 100644
index 0000000..52ada14
--- /dev/null
+++ b/src/ntops/torch/roll.py
@@ -0,0 +1,33 @@
+import torch
+
+
+def roll(x, shifts, dims=None):
+    """
+    Roll the tensor along the given dimension(s).
+
+    Elements that roll beyond the last position are re-introduced at the first.
+
+    Args:
+        x: Input tensor
+        shifts: The number of places by which the elements are shifted.
+                Can be an int or a tuple/list of ints.
+                Positive shifts roll to the right (higher indices).
+        dims: Axis or axes along which to roll. Can be an int or a tuple/list.
+              Defaults to None, in which case the tensor is flattened before
+              rolling and then restored to the original shape.
+
+    Returns:
+        A tensor with the same shape and dtype as x, with elements rolled.
+
+    Examples:
+        >>> x = torch.tensor([1, 2, 3, 4, 5])
+        >>> roll(x, shifts=2, dims=0)
+        tensor([4, 5, 1, 2, 3])
+
+        >>> x = torch.tensor([[1, 2], [3, 4], [5, 6]])
+        >>> roll(x, shifts=1, dims=1)
+        tensor([[2, 1],
+                [4, 3],
+                [6, 5]])
+    """
+    return torch.roll(x, shifts=shifts, dims=dims)
diff --git a/tests/test_cartesian_prod.py b/tests/test_cartesian_prod.py
new file mode 100644
index 0000000..5efb633
--- /dev/null
+++ b/tests/test_cartesian_prod.py
@@ -0,0 +1,147 @@
+import numpy as np
+import pytest
+import torch
+import ntops
+
+
+# =============================================================================
+# CPU reference implementation
+# =============================================================================
+
+def cartesian_prod_cpu(*tensors):
+    arrs = [np.asarray(x).flatten() for x in tensors]
+    ndim = len(arrs)
+    shapes = []
+    for i in range(ndim):
+        shp = [1] * ndim
+        shp[i] = -1
+        shapes.append(shp)
+    grids = [arr.reshape(shp) for arr, shp in zip(arrs, shapes)]
+    out = np.broadcast_arrays(*grids)
+    flat = [g.reshape(-1, 1) for g in out]
+    return np.concatenate(flat, axis=1)
+
+
+# =============================================================================
+# Basic functionality tests
+# =============================================================================
+
+def test_cartesian_prod_two_1d():
+    """Cartesian product of two 1D tensors."""
+    x_np = np.array([1, 2])
+    y_np = np.array([3, 4, 5])
+    xt = torch.tensor([1, 2], device="cuda")
+    yt = torch.tensor([3, 4, 5], device="cuda")
+    result = ntops.torch.cartesian_prod(xt, yt)
+    expected = cartesian_prod_cpu(x_np, y_np)
+    assert torch.equal(result, torch.tensor(expected, device="cuda"))
+
+
+def test_cartesian_prod_three_1d():
+    """Cartesian product of three 1D tensors."""
+    x_np = np.array([1, 2])
+    y_np = np.array([3, 4])
+    z_np = np.array([5, 6])
+    xt = torch.tensor([1, 2], device="cuda")
+    yt = torch.tensor([3, 4], device="cuda")
+    zt = torch.tensor([5, 6], device="cuda")
+    result = ntops.torch.cartesian_prod(xt, yt, zt)
+    expected = cartesian_prod_cpu(x_np, y_np, z_np)
+    assert torch.equal(result, torch.tensor(expected, device="cuda"))
+
+
+def test_cartesian_prod_multidim_input():
+    """Multi-dimensional inputs are flattened."""
+    x_np = np.array([[1, 2], [3, 4]])
+    y_np = np.array([5, 6])
+    xt = torch.tensor([[1, 2], [3, 4]], device="cuda")
+    yt = torch.tensor([5, 6], device="cuda")
+    result = ntops.torch.cartesian_prod(xt, yt)
+    expected = cartesian_prod_cpu(x_np, y_np)
+    assert torch.equal(result, torch.tensor(expected, device="cuda"))
+
+
+def test_cartesian_prod_single_input():
+    """Cartesian product of a single tensor."""
+    x_np = np.array([1, 2, 3])
+    xt = torch.tensor([1, 2, 3], device="cuda")
+    result = ntops.torch.cartesian_prod(xt)
+    expected = cartesian_prod_cpu(x_np)
+    assert torch.equal(result, torch.tensor(expected, device="cuda"))
+
+
+def test_cartesian_prod_large():
+    """Large Cartesian product."""
+    x = torch.arange(10, device="cuda")
+    y = torch.arange(20, device="cuda")
+    result = ntops.torch.cartesian_prod(x, y)
+    assert result.shape == (200, 2)
+    # First row: (0, 0)
+    assert result[0, 0] == 0 and result[0, 1] == 0
+    # Last row: (9, 19)
+    assert result[-1, 0] == 9 and result[-1, 1] == 19
+
+
+# =============================================================================
+# Dtype and device tests
+# =============================================================================
+
+@pytest.mark.parametrize("dtype", [
+    torch.float32,
+    torch.float16,
+    torch.int32,
+    torch.int64,
+])
+def test_cartesian_prod_dtype(dtype):
+    """Cartesian product should preserve input dtype."""
+    x = torch.tensor([1, 2], device="cuda").to(dtype)
+    y = torch.tensor([3, 4], device="cuda").to(dtype)
+    result = ntops.torch.cartesian_prod(x, y)
+    assert result.dtype == dtype
+
+
+def test_cartesian_prod_device():
+    """Output should be on the input device."""
+    x = torch.tensor([1, 2], device="cuda")
+    y = torch.tensor([3, 4], device="cuda")
+    result = ntops.torch.cartesian_prod(x, y)
+    assert result.device == x.device
+
+
+def test_cartesian_prod_mixed_dtype():
+    """Mixed dtypes should upcast to common dtype."""
+    x = torch.tensor([1, 2], device="cuda")  # int64
+    y = torch.tensor([3.0, 4.0], device="cuda")  # float32
+    result = ntops.torch.cartesian_prod(x, y)
+    assert result.dtype == torch.float32
+
+
+# =============================================================================
+# Four mandatory checks
+# =============================================================================
+
+def test_cartesian_prod_no_nan():
+    """Output should not contain NaN."""
+    x = torch.randn(10, device="cuda")
+    y = torch.randn(20, device="cuda")
+    result = ntops.torch.cartesian_prod(x, y)
+    assert not torch.isnan(result).any()
+
+
+def test_cartesian_prod_no_inf():
+    """Output should not contain Inf."""
+    x = torch.arange(10, device="cuda").float()
+    y = torch.arange(20, device="cuda").float()
+    result = ntops.torch.cartesian_prod(x, y)
+    assert not torch.isinf(result).any()
+
+
+def test_cartesian_prod_int_exact():
+    """Integer Cartesian product should be exact match."""
+    x_np = np.array([10, 20, 30])
+    y_np = np.array([1, 2, 3, 4, 5])
+    xt = torch.tensor([10, 20, 30], device="cuda")
+    yt = torch.tensor([1, 2, 3, 4, 5], device="cuda")
+    result = ntops.torch.cartesian_prod(xt, yt)
+    expected = cartesian_prod_cpu(x_np, y_np)
+    assert torch.equal(result, torch.tensor(expected, device="cuda"))
diff --git a/tests/test_column_stack.py b/tests/test_column_stack.py
new file mode 100644
index 0000000..8c167cd
--- /dev/null
+++ b/tests/test_column_stack.py
@@ -0,0 +1,154 @@
+import pytest
+import torch
+import ntops
+
+
+# =============================================================================
+# Basic functionality tests
+# =============================================================================
+
+def test_column_stack_1d_two_tensors():
+    """Stack two 1D tensors as columns."""
+    a = torch.tensor([1, 2, 3], device="cuda")
+    b = torch.tensor([4, 5, 6], device="cuda")
+    result = ntops.torch.column_stack((a, b))
+    expected = torch.tensor([[1, 4], [2, 5], [3, 6]], device="cuda")
+    assert torch.equal(result, expected)
+
+
+def test_column_stack_1d_three_tensors():
+    """Stack three 1D tensors as columns."""
+    a = torch.tensor([1, 2], device="cuda")
+    b = torch.tensor([3, 4], device="cuda")
+    c = torch.tensor([5, 6], device="cuda")
+    result = ntops.torch.column_stack((a, b, c))
+    expected = torch.tensor([[1, 3, 5], [2, 4, 6]], device="cuda")
+    assert torch.equal(result, expected)
+
+
+def test_column_stack_2d_two_tensors():
+    """Stack two 2D tensors along columns."""
+    a = torch.arange(6, device="cuda").reshape(2, 3).float()
+    b = torch.arange(6, 10, device="cuda").reshape(2, 2).float()
+    result = ntops.torch.column_stack((a, b))
+    expected = torch.column_stack((a, b))
+    assert torch.equal(result, expected)
+    assert result.shape == (2, 5)
+
+
+def test_column_stack_3d():
+    """Stack 3D tensors along the second-to-last dim."""
+    a = torch.randn(2, 3, 4, device="cuda")
+    b = torch.randn(2, 5, 4, device="cuda")
+    result = ntops.torch.column_stack((a, b))
+    expected = torch.column_stack((a, b))
+    assert torch.equal(result, expected)
+    assert result.shape == (2, 8, 4)
+
+
+def test_column_stack_single_tensor():
+    """Stack a single 1D tensor (should become a column)."""
+    a = torch.tensor([1, 2, 3, 4], device="cuda")
+    result = ntops.torch.column_stack((a,))
+    expected = torch.tensor([[1], [2], [3], [4]], device="cuda")
+    assert torch.equal(result, expected)
+
+
+# =============================================================================
+# Edge cases
+# =============================================================================
+
+def test_column_stack_list_input():
+    """Stack should accept list input."""
+    a = torch.tensor([1, 2, 3], device="cuda")
+    b = torch.tensor([4, 5, 6], device="cuda")
+    result = ntops.torch.column_stack([a, b])
+    expected = torch.tensor([[1, 4], [2, 5], [3, 6]], device="cuda")
+    assert torch.equal(result, expected)
+
+
+def test_column_stack_empty_sequence():
+    """Empty sequence should raise RuntimeError."""
+    with pytest.raises(RuntimeError):
+        ntops.torch.column_stack([])
+
+
+def test_column_stack_shape_mismatch():
+    """Tensors with mismatched shapes should raise RuntimeError."""
+    a = torch.randn(3, 4, device="cuda")
+    b = torch.randn(5, 4, device="cuda")
+    with pytest.raises(RuntimeError):
+        ntops.torch.column_stack((a, b))
+
+
+# =============================================================================
+# Dtype and device tests
+# =============================================================================
+
+@pytest.mark.parametrize("dtype", [
+    torch.float32,
+    torch.float16,
+    torch.float64,
+    torch.int32,
+    torch.int64,
+])
+def test_column_stack_dtype_preservation(dtype):
+    """Column stack should preserve dtype."""
+    a = torch.arange(5, device="cuda").to(dtype)
+    b = torch.arange(5, 10, device="cuda").to(dtype)
+    result = ntops.torch.column_stack((a, b))
+    assert result.dtype == dtype
+
+
+def test_column_stack_device_preservation():
+    """Column stack should preserve the device."""
+    a = torch.randn(5, device="cuda")
+    b = torch.randn(5, device="cuda")
+    result = ntops.torch.column_stack((a, b))
+    assert result.device == a.device
+
+
+# =============================================================================
+# Gradient test
+# =============================================================================
+
+def test_column_stack_gradient():
+    """Column stack should support gradient propagation."""
+    a = torch.randn(3, 4, device="cuda", requires_grad=True)
+    b = torch.randn(3, 5, device="cuda", requires_grad=True)
+    y = ntops.torch.column_stack((a, b))
+    loss = y.sum()
+    loss.backward()
+    assert a.grad is not None
+    assert a.grad.shape == a.shape
+    assert b.grad is not None
+    assert b.grad.shape == b.shape
+
+
+# =============================================================================
+# Four mandatory checks
+# =============================================================================
+
+def test_column_stack_no_nan():
+    """Output should not contain NaN."""
+    a = torch.randn(100, 50, device="cuda")
+    b = torch.randn(100, 30, device="cuda")
+    result = ntops.torch.column_stack((a, b))
+    assert not torch.isnan(result).any()
+
+
+def test_column_stack_no_inf():
+    """Output should not contain Inf."""
+    a = torch.randn(100, 50, device="cuda")
+    b = torch.randn(100, 30, device="cuda")
+    result = ntops.torch.column_stack((a, b))
+    assert not torch.isinf(result).any()
+
+
+def test_column_stack_int_exact():
+    """Integer column_stack should be exact match."""
+    a = torch.randint(0, 100, (5, 3), device="cuda")
+    b = torch.randint(0, 100, (5, 2), device="cuda")
+    result = ntops.torch.column_stack((a, b))
+    expected = torch.column_stack((a, b))
+    assert torch.equal(result, expected)
diff --git a/tests/test_meshgrid.py b/tests/test_meshgrid.py
new file mode 100644
index 0000000..c232858
--- /dev/null
+++ b/tests/test_meshgrid.py
@@ -0,0 +1,228 @@
+import numpy as np
+import pytest
+import torch
+import ntops
+
+
+# =============================================================================
+# CPU reference implementation
+# =============================================================================
+
+def meshgrid_cpu(*xs, indexing="xy"):
+    arrs = [np.asarray(x) for x in xs]
+    ndim = len(arrs)
+    shapes = []
+    for i in range(ndim):
+        shp = [1] * ndim
+        shp[i] = -1
+        shapes.append(shp)
+    grids = [arr.reshape(shp) for arr, shp in zip(arrs, shapes)]
+    out = np.broadcast_arrays(*grids)
+    if indexing == "xy" and ndim >= 2:
+        out = list(out)
+        out[0], out[1] = out[1], out[0]
+    return out
+
+
+# =============================================================================
+# Basic functionality tests
+# =============================================================================
+
+def test_meshgrid_2d_ij():
+    """2D meshgrid with 'ij' indexing."""
+    x_np = np.array([1, 2, 3])
+    y_np = np.array([4, 5, 6, 7])
+    xt = torch.tensor([1, 2, 3], device="cuda")
+    yt = torch.tensor([4, 5, 6, 7], device="cuda")
+    result = ntops.torch.meshgrid(xt, yt, indexing="ij")
+    expected = meshgrid_cpu(x_np, y_np, indexing="ij")
+    assert len(result) == 2
+    for r, e in zip(result, expected):
+        assert torch.equal(r, torch.tensor(e, device="cuda"))
+
+
+def test_meshgrid_2d_xy():
+    """2D meshgrid with 'xy' indexing."""
+    x_np = np.array([1, 2, 3])
+    y_np = np.array([4, 5, 6, 7])
+    xt = torch.tensor([1, 2, 3], device="cuda")
+    yt = torch.tensor([4, 5, 6, 7], device="cuda")
+    result = ntops.torch.meshgrid(xt, yt, indexing="xy")
+    expected = meshgrid_cpu(x_np, y_np, indexing="xy")
+    assert len(result) == 2
+    for r, e in zip(result, expected):
+        assert torch.equal(r, torch.tensor(e, device="cuda"))
+
+
+def test_meshgrid_3d_ij():
+    """3D meshgrid with 'ij' indexing."""
+    x_np = np.array([1, 2, 3])
+    y_np = np.array([4, 5, 6, 7])
+    z_np = np.array([8, 9])
+    xt = torch.tensor([1, 2, 3], device="cuda")
+    yt = torch.tensor([4, 5, 6, 7], device="cuda")
+    zt = torch.tensor([8, 9], device="cuda")
+    result = ntops.torch.meshgrid(xt, yt, zt, indexing="ij")
+    expected = meshgrid_cpu(x_np, y_np, z_np, indexing="ij")
+    for r, e in zip(result, expected):
+        assert torch.equal(r, torch.tensor(e, device="cuda"))
+
+
+def test_meshgrid_3d_xy():
+    """3D meshgrid with 'xy' indexing."""
+    x_np = np.array([1, 2])
+    y_np = np.array([3, 4, 5])
+    z_np = np.array([6, 7, 8, 9])
+    xt = torch.tensor([1, 2], device="cuda")
+    yt = torch.tensor([3, 4, 5], device="cuda")
+    zt = torch.tensor([6, 7, 8, 9], device="cuda")
+    result = ntops.torch.meshgrid(xt, yt, zt, indexing="xy")
+    expected = meshgrid_cpu(x_np, y_np, z_np, indexing="xy")
+    for r, e in zip(result, expected):
+        assert torch.equal(r, torch.tensor(e, device="cuda"))
+
+
+def test_meshgrid_1d():
+    """1D meshgrid (single input)."""
+    x = torch.tensor([1, 2, 3], device="cuda")
+    result = ntops.torch.meshgrid(x, indexing="ij")
+    assert len(result) == 1
+    assert torch.equal(result[0], x)
+
+
+def test_meshgrid_default_indexing():
+    """Default indexing should be 'xy'."""
+    x_np = np.array([1, 2, 3])
+    y_np = np.array([4, 5, 6, 7])
+    xt = torch.tensor([1, 2, 3], device="cuda")
+    yt = torch.tensor([4, 5, 6, 7], device="cuda")
+    result = ntops.torch.meshgrid(xt, yt)
+    expected = meshgrid_cpu(x_np, y_np, indexing="xy")
+    for r, e in zip(result, expected):
+        assert torch.equal(r, torch.tensor(e, device="cuda"))
+
+
+# =============================================================================
+# Edge cases
+# =============================================================================
+
+def test_meshgrid_single_element():
+    """Meshgrid with single-element inputs."""
+    x = torch.tensor([42], device="cuda")
+    y = torch.tensor([7], device="cuda")
+    result = ntops.torch.meshgrid(x, y, indexing="ij")
+    assert result[0].shape == (1, 1)
+    assert result[0].item() == 42
+    assert result[1].item() == 7
+
+
+def test_meshgrid_large():
+    """Meshgrid with larger inputs."""
+    x = torch.arange(100, device="cuda")
+    y = torch.arange(200, device="cuda")
+    result = ntops.torch.meshgrid(x, y, indexing="ij")
+    assert result[0].shape == (100, 200)
+    # Verify a few positions
+    assert result[0][0, 0] == 0
+    assert result[0][50, 0] == 50
+    assert result[0][0, 100] == 0
+    assert result[1][0, 0] == 0
+    assert result[1][0, 50] == 50
+
+
+# =============================================================================
+# Output is view (strided, not contiguous)
+# =============================================================================
+
+def test_meshgrid_output_is_view():
+    """Meshgrid outputs should be broadcast views (zero-copy)."""
+    x = torch.arange(10, device="cuda")
+    y = torch.arange(20, device="cuda")
+    result = ntops.torch.meshgrid(x, y, indexing="ij")
+    # Broadcast views are not contiguous
+    assert not result[0].is_contiguous()
+    assert not result[1].is_contiguous()
+    # But data is correct
+    assert result[0][5, :].sum() == 20 * 5  # 20 copies of value 5 along dim 1
+    assert result[1][0, :].sum() == 20 * 19 / 2  # values 0-19
+
+
+# =============================================================================
+# Dtype and device tests
+# =============================================================================
+
+@pytest.mark.parametrize("dtype", [
+    torch.float32,
+    torch.float16,
+    torch.int32,
+    torch.int64,
+])
+def test_meshgrid_dtype_preservation(dtype):
+    """Meshgrid should preserve input dtype."""
+    x = torch.tensor([1, 2, 3], device="cuda").to(dtype)
+    y = torch.tensor([4, 5, 6, 7], device="cuda").to(dtype)
+    result = ntops.torch.meshgrid(x, y, indexing="ij")
+    for r in result:
+        assert r.dtype == dtype
+
+
+def test_meshgrid_device_preservation():
+    """Meshgrid output should be on the input device."""
+    x = torch.tensor([1, 2, 3], device="cuda")
+    y = torch.tensor([4, 5, 6, 7], device="cuda")
+    result = ntops.torch.meshgrid(x, y)
+    for r in result:
+        assert r.device == x.device
+
+
+# =============================================================================
+# Gradient test
+# =============================================================================
+
+def test_meshgrid_gradient():
+    """Meshgrid should support gradient propagation."""
+    x = torch.tensor([1.0, 2.0, 3.0], device="cuda", requires_grad=True)
+    y = torch.tensor([4.0, 5.0, 6.0, 7.0], device="cuda", requires_grad=True)
+    gx, gy = ntops.torch.meshgrid(x, y, indexing="ij")
+    loss = gx.sum() + gy.sum()
+    loss.backward()
+    assert x.grad is not None
+    # x is broadcast 4 times along dim 1 → grad = 4 per element
+    assert torch.equal(x.grad, torch.tensor([4.0, 4.0, 4.0], device="cuda"))
+    assert y.grad is not None
+    # y is broadcast 3 times along dim 0 → grad = 3 per element
+    assert torch.equal(y.grad, torch.tensor([3.0, 3.0, 3.0, 3.0], device="cuda"))
+
+
+# =============================================================================
+# Four mandatory checks
+# =============================================================================
+
+def test_meshgrid_no_nan():
+    """Output should not contain NaN."""
+    x = torch.randn(10, device="cuda")
+    y = torch.randn(20, device="cuda")
+    result = ntops.torch.meshgrid(x, y)
+    for r in result:
+        assert not torch.isnan(r).any()
+
+
+def test_meshgrid_no_inf():
+    """Output should not contain Inf."""
+    x = torch.arange(10, device="cuda").float()
+    y = torch.arange(20, device="cuda").float()
+    result = ntops.torch.meshgrid(x, y)
+    for r in result:
+        assert not torch.isinf(r).any()
+
+
+def test_meshgrid_int_exact():
+    """Integer meshgrid should be exact match."""
+    x_np = np.array([10, 20, 30])
+    y_np = np.array([1, 2, 3, 4, 5])
+    xt = torch.tensor([10, 20, 30], device="cuda")
+    yt = torch.tensor([1, 2, 3, 4, 5], device="cuda")
+    result = ntops.torch.meshgrid(xt, yt, indexing="ij")
+    expected = meshgrid_cpu(x_np, y_np, indexing="ij")
+    for r, e in zip(result, expected):
+        assert torch.equal(r, torch.tensor(e, device="cuda"))
diff --git a/tests/test_mode.py b/tests/test_mode.py
new file mode 100644
index 0000000..14d9cc1
--- /dev/null
+++ b/tests/test_mode.py
@@ -0,0 +1,284 @@
+import numpy as np
+import pytest
+import torch
+import ntops
+
+
+# =============================================================================
+# CPU reference implementation
+# =============================================================================
+
+def mode_cpu(x_np: np.ndarray, dim: int, keepdim=False):
+    """CPU reference for mode along `dim` (may be negative); ties resolve to the smallest value."""
+    dim = dim + x_np.ndim if dim < 0 else dim  # a negative dim would never be dropped from perm
+    perm = [i for i in range(x_np.ndim) if i != dim] + [dim]
+    x_t = np.transpose(x_np, perm)
+    out_shape = list(x_t.shape[:-1])
+    vals = np.zeros(out_shape, dtype=x_np.dtype)
+    cnts = np.zeros(out_shape, dtype=np.int64)
+    for idx in np.ndindex(*out_shape):
+        vec = x_t[idx]
+        unique, counts = np.unique(vec, return_counts=True)
+        max_idx = np.argmax(counts)  # first maximum; `unique` is sorted ascending
+        vals[idx] = unique[max_idx]
+        cnts[idx] = counts[max_idx]
+    if keepdim:
+        vals = np.expand_dims(vals, axis=dim)
+        cnts = np.expand_dims(cnts, axis=dim)
+    return vals, cnts
+
+
+def assert_mode_valid(result_vals, result_cnts, x, dim, keepdim):
+    """
+    Verify that the reported counts are self-consistent with the reported values:
+    1. Each returned value occurs exactly `result_cnts` times in `x` along `dim`.
+    2. Every count is >= 1. Maximality is NOT checked here; compare counts to a reference for that.
+    """
+    # Restore the reduced dimension so the values broadcast against `x`.
+    if keepdim:
+        mode_vals = result_vals
+    else:
+        mode_vals = result_vals.unsqueeze(dim)
+
+    computed_cnts = (x == mode_vals).sum(dim=dim, keepdim=keepdim).to(torch.int64)
+
+    # Compare all positions in one tensor op instead of reading one scalar per element.
+    flat_result, flat_computed = result_cnts.reshape(-1), computed_cnts.reshape(-1)
+    bad = (flat_result != flat_computed).nonzero().flatten().tolist()
+    i = bad[0] if bad else -1  # first mismatching position, used only for the message
+    assert not bad, (
+        f"Count mismatch at position {i}: "
+        f"reported={flat_result[i]}, computed from value={flat_computed[i]}"
+    )
+
+    # The count must be at least 1 (mode always exists)
+    assert (result_cnts >= 1).all(), "Counts must be >= 1"
+
+
+# =============================================================================
+# Basic functionality tests
+# =============================================================================
+
+def test_mode_1d():
+    """Mode of a 1D tensor."""
+    x_np = np.array([1, 2, 2, 3, 3, 3], dtype=np.int64)
+    x = torch.from_numpy(x_np).cuda()
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    _, c_ref = mode_cpu(x_np, dim=0)
+    assert torch.equal(cnts, torch.tensor(c_ref, device="cuda"))
+    assert_mode_valid(vals, cnts, x, dim=0, keepdim=False)
+
+
+def test_mode_2d_dim0():
+    """Mode of 2D tensor along dim=0 (column-wise)."""
+    np.random.seed(42)
+    x_np = np.random.randint(0, 5, (4, 3))
+    x = torch.from_numpy(x_np).cuda()
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    _, c_ref = mode_cpu(x_np, dim=0)
+    assert torch.equal(cnts, torch.tensor(c_ref, device="cuda"))
+    assert_mode_valid(vals, cnts, x, dim=0, keepdim=False)
+
+
+def test_mode_2d_dim1():
+    """Mode of 2D tensor along dim=1 (row-wise)."""
+    np.random.seed(42)
+    x_np = np.random.randint(0, 5, (4, 3))
+    x = torch.from_numpy(x_np).cuda()
+    vals, cnts = ntops.torch.mode(x, dim=1)
+    _, c_ref = mode_cpu(x_np, dim=1)
+    assert torch.equal(cnts, torch.tensor(c_ref, device="cuda"))
+    assert_mode_valid(vals, cnts, x, dim=1, keepdim=False)
+
+
+def test_mode_keepdim():
+    """Mode with keepdim=True."""
+    np.random.seed(42)
+    x_np = np.random.randint(0, 5, (4, 3))
+    x = torch.from_numpy(x_np).cuda()
+    vals, cnts = ntops.torch.mode(x, dim=0, keepdim=True)
+    _, c_ref = mode_cpu(x_np, dim=0, keepdim=True)
+    assert vals.shape == (1, 3)
+    assert cnts.shape == (1, 3)
+    assert torch.equal(cnts, torch.tensor(c_ref, device="cuda"))
+    assert_mode_valid(vals, cnts, x, dim=0, keepdim=True)
+
+
+def test_mode_3d():
+    """Mode of a 3D tensor."""
+    np.random.seed(42)
+    x_np = np.random.randint(0, 5, (3, 4, 5))
+    x = torch.from_numpy(x_np).cuda()
+    for dim in [0, 1, 2]:
+        vals, cnts = ntops.torch.mode(x, dim=dim)
+        _, c_ref = mode_cpu(x_np, dim=dim)
+        assert torch.equal(cnts, torch.tensor(c_ref, device="cuda")), f"dim={dim} counts"
+        assert_mode_valid(vals, cnts, x, dim=dim, keepdim=False)
+
+
+# =============================================================================
+# Float type tests
+# =============================================================================
+
+def test_mode_float32():
+    """Mode with float32 inputs."""
+    np.random.seed(42)
+    x_np = np.random.randn(3, 5).astype(np.float32)
+    x = torch.from_numpy(x_np).cuda()
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    _, c_ref = mode_cpu(x_np, dim=0)
+    assert torch.equal(cnts, torch.tensor(c_ref, device="cuda"))
+    assert_mode_valid(vals, cnts, x, dim=0, keepdim=False)
+
+
+def test_mode_float16():
+    """Mode with float16 inputs: dtypes are preserved and counts match the CPU reference."""
+    np.random.seed(42)
+    x_np = np.random.randint(0, 10, (4, 5)).astype(np.float16)
+    x = torch.from_numpy(x_np).cuda()  # from_numpy keeps float16, no float32 round trip needed
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    assert vals.dtype == torch.float16 and cnts.dtype == torch.int64
+    assert torch.equal(cnts, torch.tensor(mode_cpu(x_np, dim=0)[1], device="cuda"))
+    assert_mode_valid(vals, cnts, x, dim=0, keepdim=False)
+
+
+# =============================================================================
+# Edge cases
+# =============================================================================
+
+def test_mode_all_same():
+    """Mode when all values are the same."""
+    x = torch.ones(5, 3, device="cuda")
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    assert torch.equal(vals, torch.ones(3, device="cuda"))
+    assert torch.equal(cnts, torch.full((3,), 5, dtype=torch.int64, device="cuda"))
+
+
+def test_mode_all_unique():
+    """Mode when all values are unique."""
+    x = torch.tensor([[1, 4], [2, 5], [3, 6]], device="cuda")
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    # Any value is a valid mode (all have count=1)
+    assert torch.equal(cnts, torch.ones(2, dtype=torch.int64, device="cuda"))
+    assert_mode_valid(vals, cnts, x, dim=0, keepdim=False)
+
+
+def test_mode_single_row():
+    """Mode on a single-row tensor."""
+    x = torch.tensor([[1, 2, 3]], device="cuda")
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    assert_mode_valid(vals, cnts, x, dim=0, keepdim=False)
+
+
+def test_mode_negative_dim():
+    """Mode with negative dim indexing."""
+    np.random.seed(42)
+    x_np = np.random.randint(0, 5, (4, 3))
+    x = torch.from_numpy(x_np).cuda()
+    vals1, cnts1 = ntops.torch.mode(x, dim=0)
+    vals2, cnts2 = ntops.torch.mode(x, dim=-2)
+    assert torch.equal(cnts1, cnts2)
+
+
+# =============================================================================
+# Dtype and device tests
+# =============================================================================
+
+@pytest.mark.parametrize("dtype", [
+    torch.int32,
+    torch.int64,
+    torch.float32,
+    torch.float64,
+])
+def test_mode_dtype_preservation(dtype):
+    """Mode values should preserve input dtype."""
+    np.random.seed(42)
+    if dtype in (torch.int32, torch.int64):
+        x_np = np.random.randint(0, 10, (5, 4))
+    else:
+        x_np = np.random.randn(5, 4).astype(
+            np.float32 if dtype == torch.float32 else np.float64
+        )
+    x = torch.from_numpy(x_np).cuda().to(dtype)
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    assert vals.dtype == dtype
+    assert cnts.dtype == torch.int64
+
+
+def test_mode_device_preservation():
+    """Mode output should be on the input device."""
+    x = torch.randint(0, 10, (5, 4), device="cuda")
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    assert vals.device == x.device
+    assert cnts.device == x.device
+
+
+# =============================================================================
+# Gradient test
+# =============================================================================
+
+def test_mode_gradient():
+    """Mode should support gradient propagation on values."""
+    x = torch.randn(3, 5, device="cuda", requires_grad=True)
+    vals, _ = ntops.torch.mode(x, dim=0)
+    loss = vals.sum()
+    loss.backward()
+    assert x.grad is not None
+    assert x.grad.shape == x.shape
+
+
+# =============================================================================
+# Large tensor test
+# =============================================================================
+
+def test_mode_large():
+    """Mode on a larger tensor."""
+    np.random.seed(42)
+    x_np = np.random.randint(0, 20, (128, 256))
+    x = torch.from_numpy(x_np).cuda()
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    _, c_ref = mode_cpu(x_np, dim=0)
+    assert torch.equal(cnts, torch.tensor(c_ref, device="cuda"))
+    assert_mode_valid(vals, cnts, x, dim=0, keepdim=False)
+
+
+# =============================================================================
+# Four mandatory checks
+# =============================================================================
+
+def test_mode_no_nan():
+    """Output should not contain NaN."""
+    x = torch.randint(0, 10, (100, 100), device="cuda")
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    assert not torch.isnan(vals).any()
+    assert not torch.isnan(cnts.float()).any()
+
+
+def test_mode_no_inf():
+    """Output should not contain Inf."""
+    x = torch.randint(0, 10, (100, 100), device="cuda")
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    assert not torch.isinf(vals).any()
+    assert not torch.isinf(cnts.float()).any()
+
+
+def test_mode_int_exact():
+    """Mode counts should be exact match with CPU reference."""
+    np.random.seed(42)
+    x_np = np.random.randint(0, 50, (20, 30))
+    x = torch.from_numpy(x_np).cuda()
+    vals, cnts = ntops.torch.mode(x, dim=0)
+    _, c_ref = mode_cpu(x_np, dim=0)
+    assert torch.equal(cnts, torch.tensor(c_ref, device="cuda"))
+    assert_mode_valid(vals, cnts, x, dim=0, keepdim=False)
+
+
+# =============================================================================
+# Count output type test
+# =============================================================================
+
+def test_mode_counts_dtype():
+    """Counts should always be int64."""
+    x = torch.randint(0, 5, (4, 3), device="cuda")
+    _, cnts = ntops.torch.mode(x, dim=0)
+    assert cnts.dtype == torch.int64
diff --git a/tests/test_roll.py b/tests/test_roll.py
new file mode 100644
index 0000000..60318da
--- /dev/null
+++ b/tests/test_roll.py
@@ -0,0 +1,212 @@
+import os
+import pytest
+import torch
+import ntops
+
+
+# =============================================================================
+# Basic functionality tests
+# =============================================================================
+
+def test_roll_1d_positive_shift():
+    """Roll 1D tensor with positive shift."""
+    x = torch.tensor([1, 2, 3, 4, 5], device="cuda")
+    result = ntops.torch.roll(x, shifts=2, dims=0)
+    expected = torch.tensor([4, 5, 1, 2, 3], device="cuda")
+    assert torch.equal(result, expected)
+
+
+def test_roll_1d_negative_shift():
+    """Roll 1D tensor with negative shift (elements move toward lower indices and wrap around)."""
+    x = torch.tensor([1, 2, 3, 4, 5], device="cuda")
+    result = ntops.torch.roll(x, shifts=-2, dims=0)
+    expected = torch.tensor([3, 4, 5, 1, 2], device="cuda")
+    assert torch.equal(result, expected)
+
+
+def test_roll_2d_dim0():
+    """Roll 2D tensor along dimension 0."""
+    x = torch.arange(12, device="cuda").reshape(3, 4)
+    result = ntops.torch.roll(x, shifts=1, dims=0)
+    expected = torch.tensor([
+        [8, 9, 10, 11],
+        [0, 1, 2, 3],
+        [4, 5, 6, 7],
+    ], device="cuda")
+    assert torch.equal(result, expected)
+
+
+def test_roll_2d_dim1():
+    """Roll 2D tensor along dimension 1."""
+    x = torch.arange(12, device="cuda").reshape(3, 4)
+    result = ntops.torch.roll(x, shifts=1, dims=1)
+    expected = torch.tensor([
+        [3, 0, 1, 2],
+        [7, 4, 5, 6],
+        [11, 8, 9, 10],
+    ], device="cuda")
+    assert torch.equal(result, expected)
+
+
+def test_roll_multi_dim():
+    """Roll along multiple dimensions simultaneously."""
+    x = torch.arange(12, device="cuda").reshape(3, 4)
+    result = ntops.torch.roll(x, shifts=(1, -1), dims=(0, 1))
+    expected = torch.roll(x, shifts=(1, -1), dims=(0, 1))
+    assert torch.equal(result, expected)
+
+
+def test_roll_3d():
+    """Roll a 3D tensor."""
+    x = torch.arange(24, device="cuda").reshape(2, 3, 4)
+    result = ntops.torch.roll(x, shifts=1, dims=1)
+    expected = torch.roll(x, shifts=1, dims=1)
+    assert torch.equal(result, expected)
+
+
+def test_roll_4d():
+    """Roll a 4D tensor."""
+    x = torch.arange(120, device="cuda").reshape(2, 3, 4, 5)
+    result = ntops.torch.roll(x, shifts=2, dims=2)
+    expected = torch.roll(x, shifts=2, dims=2)
+    assert torch.equal(result, expected)
+
+
+# =============================================================================
+# Edge cases
+# =============================================================================
+
+def test_roll_zero_shift():
+    """Roll with shift=0 should leave the values unchanged (equality, not identity, is checked)."""
+    x = torch.randn(3, 4, device="cuda")
+    result = ntops.torch.roll(x, shifts=0, dims=0)
+    assert torch.equal(result, x)
+
+
+def test_roll_shift_larger_than_dim():
+    """Roll with shift > dimension length (should apply modulo)."""
+    x = torch.tensor([1, 2, 3, 4, 5], device="cuda")
+    result = ntops.torch.roll(x, shifts=7, dims=0)  # 7 % 5 = 2
+    expected = torch.tensor([4, 5, 1, 2, 3], device="cuda")
+    assert torch.equal(result, expected)
+
+
+def test_roll_int_shifts():
+    """Roll with int shifts (not list/tuple)."""
+    x = torch.randn(3, 4, device="cuda")
+    result = ntops.torch.roll(x, shifts=2, dims=1)
+    expected = torch.roll(x, shifts=2, dims=1)
+    assert torch.equal(result, expected)
+
+
+def test_roll_int_dims():
+    """Roll with a one-element tuple for shifts and a plain int (not list/tuple) for dims."""
+    x = torch.randn(3, 4, device="cuda")
+    result = ntops.torch.roll(x, shifts=(1,), dims=1)
+    expected = torch.roll(x, shifts=(1,), dims=1)
+    assert torch.equal(result, expected)
+
+
+def test_roll_single_element():
+    """Roll a single-element tensor (should be a no-op)."""
+    x = torch.tensor([42], device="cuda")
+    result = ntops.torch.roll(x, shifts=5, dims=0)
+    assert torch.equal(result, x)
+
+
+def test_roll_symmetry():
+    """Roll forward then backward should return original."""
+    x = torch.randn(5, 10, device="cuda")
+    result = ntops.torch.roll(x, shifts=3, dims=1)
+    result = ntops.torch.roll(result, shifts=-3, dims=1)
+    assert torch.equal(result, x)
+
+
+def test_roll_very_large_shift():
+    """Roll with shift much larger than dimension."""
+    x = torch.tensor([1, 2, 3], device="cuda")
+    result = ntops.torch.roll(x, shifts=1000003, dims=0)  # 1000003 % 3 = 1
+    expected = torch.tensor([3, 1, 2], device="cuda")
+    assert torch.equal(result, expected)
+
+
+# =============================================================================
+# Dtype and device tests
+# =============================================================================
+
+@pytest.mark.parametrize("dtype", [
+    torch.float32,
+    torch.float16,
+    torch.float64,
+    torch.int32,
+    torch.int64,
+    torch.uint8,
+    torch.bool,
+])
+def test_roll_dtype_preservation(dtype):
+    """Roll should preserve the input dtype."""
+    if dtype == torch.bool:
+        x = torch.tensor([True, False, True, False, True], device="cuda")
+    else:
+        x = torch.arange(10, device="cuda").to(dtype)
+    result = ntops.torch.roll(x, shifts=3, dims=0)
+    assert result.dtype == dtype
+
+
+def test_roll_device_preservation():
+    """Roll should preserve the device."""
+    x = torch.randn(10, device="cuda")
+    result = ntops.torch.roll(x, shifts=3, dims=0)
+    assert result.device == x.device
+
+
+# =============================================================================
+# Gradient test
+# =============================================================================
+
+def test_roll_gradient():
+    """Roll should support gradient propagation."""
+    x = torch.randn(3, 4, device="cuda", requires_grad=True)
+    y = ntops.torch.roll(x, shifts=1, dims=1)
+    loss = y.sum()
+    loss.backward()
+    assert x.grad is not None
+    assert x.grad.shape == x.shape
+
+
+# =============================================================================
+# Four mandatory checks
+# =============================================================================
+
+def test_roll_no_nan():
+    """Output should not contain NaN."""
+    x = torch.randn(100, 100, device="cuda")
+    result = ntops.torch.roll(x, shifts=10, dims=0)
+    assert not torch.isnan(result).any()
+
+
+def test_roll_no_inf():
+    """Output should not contain Inf."""
+    x = torch.randn(100, 100, device="cuda")
+    result = ntops.torch.roll(x, shifts=10, dims=0)
+    assert not torch.isinf(result).any()
+
+
+def test_roll_int_exact():
+    """Integer roll should be exact match (no precision loss)."""
+    x = torch.randint(0, 100, (10, 10), device="cuda")
+    result = ntops.torch.roll(x, shifts=3, dims=0)
+    expected = torch.roll(x, shifts=3, dims=0)
+    assert torch.equal(result, expected)
+
+
+# =============================================================================
+# Negative shifts on all dims
+# =============================================================================
+
+def test_roll_negative_shift_last_dim():
+    """Negative shift on last dimension."""
+    x = torch.arange(12, device="cuda").reshape(3, 4)
+    result = ntops.torch.roll(x, shifts=-1, dims=-1)
+    expected = torch.roll(x, shifts=-1, dims=-1)
+    assert torch.equal(result, expected)

From 0ea571a11a8ff5dca570fba1f764c06636265968 Mon Sep 17 00:00:00 2001
From: Ifelseer <1138369491@qq.com>
Date: Sun, 21 Jun 2026 05:37:13 +0000
Subject: [PATCH 2/3] honor

---
 HONOR_CODE.md | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 HONOR_CODE.md

diff --git a/HONOR_CODE.md b/HONOR_CODE.md
new file mode 100644
index 0000000..c93078f
--- /dev/null
+++ b/HONOR_CODE.md
@@ -0,0 +1,73 @@
+```
+# 2026 春季启元人工智能大赛诚信守则（Honor Code）
+
+
+本人作为 2026 春季启元人工智能大赛（以下简称“比赛”）的参赛选手，郑重承诺严格遵守比赛规则及本诚信守则，秉持诚信、公正、廉洁的参赛原则，自觉维护比赛的公平性与严肃性。本人充分理解并认可，违反本准则将导致参赛资格被取消、比赛成绩作废等相应后果，且愿意承担由此产生的一切责任。
+
+## 一、参赛诚信承诺
+
+1. 本人保证所提交的赛题PR（Pull Request）中包含的算子实现代码及相关文档，均为本人（及参赛团队，如为团队参赛）在比赛期间独立完成或在明确标注参考来源的基础上进行开发，不存在任何欺诈、抄袭、作弊行为。
+
+2. 本人承诺主动、全面、真实地披露赛题实现过程中所有参考的外部资源，尤其是开源代码资源，不隐瞒任何可能影响比赛公平性的信息。
+
+3. 本人保证不采用任何不正当手段获取比赛优势，包括但不限于窃取其他参赛选手的代码成果、利用非比赛允许的工具或技术、与他人串通作弊等。
+
+## 二、参考资源说明
+
+本人确认已按比赛要求，将本次赛题实现过程中涉及的参考资源信息单独撰写至`REFERENCE.md`文件中，该文件将与本诚信守则一同作为PR附件提交。`REFERENCE.md`需根据实际参考情况，按以下要求完整填写，信息不完整或虚假填写将视为违反本准则：
+
+**情况1：无参考外部开源代码及核心实现思路**
+
+`REFERENCE.md`中需明确声明：“本次赛题提交的算子代码、核心算法逻辑及实现方案均为本人（及参赛团队）独立设计与开发，未参考任何外部开源项目、技术文档中的核心代码片段或实现思路，未接受任何第三方的技术指导或代码支持。”
+
+**情况2：有参考外部开源代码及相关资源**
+
+对每个参考资源提供以下信息陈述：
+1. 参考开源项目/资源名称
+
+2. 参考资源链接（GitHub/Gitee/论文/技术文档等）
+
+3. 参考的具体内容（请明确说明参考的代码片段、算法逻辑、实现思路等，需标注对应资源的具体位置，如文件路径、代码行数等）
+    
+4. 本人对参考内容的修改与优化说明：（请详细说明在参考基础上，本人所做的独立开发、修改、优化工作，体现自身技术贡献）
+    
+5. 若是开源项目，提供参考资源的开源协议类型：（如MIT、Apache 2.0、GPL等）
+    
+6. 其他需要补充说明的信息
+    
+
+## 三、禁止行为确认
+
+本人明确知晓并承诺避免以下违反比赛公平性的行为，若存在以下任一情况，自愿接受比赛组委会的相应处罚：
+
+1. 未经授权复制、抄袭他人（包括其他参赛选手、开源项目、商业代码）的代码、算法或技术方案，且未进行明确标注；
+    
+2. 隐瞒或虚假披露参考资源信息，包括遗漏重要参考来源、伪造参考内容说明等；
+    
+3. 与其他参赛选手或第三方串通，进行代码共享、成果交换等违规协作；
+    
+4. 利用比赛平台漏洞、技术缺陷或非比赛允许的工具获取不正当利益；
+    
+5. 伪造比赛相关证明材料、提交虚假信息；
+    
+6. 其他违反比赛规则及公序良俗的不诚信行为。
+    
+
+## 四、责任与确认
+
+1. 本人充分理解，比赛组委会将对所有提交的PR进行代码溯源、参考信息核查等公平性审查，若发现本人存在违反本准则的行为，有权随时取消本人的参赛资格、作废比赛成绩，情节严重的将在比赛相关平台进行公示。
+
+2. 若因本人违反本准则导致比赛争议或第三方权益受损（如开源协议侵权等），本人将独立承担全部法律责任及相关损失，与比赛组委会无关。
+
+3. 本人确认已仔细阅读并完全理解本诚信守则的全部内容，自愿签署本准则，接受比赛组委会的监督与审查。
+
+## 五、签署信息
+
+参赛选手姓名（团队参赛需填写所有成员姓名）
+王一鸣
+
+
+签署日期
+
+2026年6月1日
+```
\ No newline at end of file

From 3e32201d71a4256d5f2be0225083c88c7dc3bca0 Mon Sep 17 00:00:00 2001
From: Ifelseer <1138369491@qq.com>
Date: Sun, 21 Jun 2026 10:46:33 +0000
Subject: [PATCH 3/3] fix init

---
 src/ntops/torch/__init__.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/ntops/torch/__init__.py b/src/ntops/torch/__init__.py
index 6874165..5948487 100644
--- a/src/ntops/torch/__init__.py
+++ b/src/ntops/torch/__init__.py
@@ -42,11 +42,7 @@
 from ntops.torch.softmax import softmax
 from ntops.torch.sub import sub
 from ntops.torch.tanh import tanh
-from ntops.torch.eye import eye
-from ntops.torch.flatten import flatten
-from ntops.torch.chunk import chunk
-from ntops.torch.unbind import unbind
-from ntops.torch.repeat import repeat
+# eye, flatten, chunk, unbind, repeat — source files not yet created
 from ntops.torch.roll import roll
 from ntops.torch.column_stack import column_stack