From 605d81f64f346ab224a292b3782e17e3209eed60 Mon Sep 17 00:00:00 2001
From: Ifelseer <1138369491@qq.com>
Date: Fri, 19 Jun 2026 11:55:57 +0000
Subject: [PATCH 1/2] Add kl_div kernel and combinations, corrcoef, count_nonzero, narrow ops (T1-1-8)

---
 src/ntops/kernels/__init__.py    |   2 +
 src/ntops/kernels/kl_div.py      |  44 +++++++++++
 src/ntops/torch/__init__.py      |  10 +++
 src/ntops/torch/combinations.py  |  39 ++++++++++
 src/ntops/torch/corrcoef.py      |  23 ++++++
 src/ntops/torch/count_nonzero.py |  33 +++++++++
 src/ntops/torch/kl_div.py        |  44 +++++++++++
 src/ntops/torch/narrow.py        |  29 ++++++++
 tests/test_combinations.py       | 100 +++++++++++++++++++++++++
 tests/test_corrcoef.py           |  73 ++++++++++++++++++
 tests/test_count_nonzero.py      | 109 +++++++++++++++++++++++++++
 tests/test_kl_div.py             | 122 +++++++++++++++++++++++++++++++
 tests/test_narrow.py             |  15 ++++
 13 files changed, 643 insertions(+)
 create mode 100644 src/ntops/kernels/kl_div.py
 create mode 100644 src/ntops/torch/combinations.py
 create mode 100644 src/ntops/torch/corrcoef.py
 create mode 100644 src/ntops/torch/count_nonzero.py
 create mode 100644 src/ntops/torch/kl_div.py
 create mode 100644 src/ntops/torch/narrow.py
 create mode 100644 tests/test_combinations.py
 create mode 100644 tests/test_corrcoef.py
 create mode 100644 tests/test_count_nonzero.py
 create mode 100644 tests/test_kl_div.py
 create mode 100644 tests/test_narrow.py

diff --git a/src/ntops/kernels/__init__.py b/src/ntops/kernels/__init__.py
index f6934ef..ee12cbe 100644
--- a/src/ntops/kernels/__init__.py
+++ b/src/ntops/kernels/__init__.py
@@ -19,6 +19,7 @@
     gt,
     isinf,
     isnan,
+    kl_div,
     layer_norm,
     le,
     lt,
@@ -62,6 +63,7 @@
     "gt",
     "isinf",
     "isnan",
+    "kl_div",
     "layer_norm",
     "le",
     "lt",
diff --git a/src/ntops/kernels/kl_div.py b/src/ntops/kernels/kl_div.py
new file mode 100644
index 0000000..2d55d02
--- /dev/null
+++ b/src/ntops/kernels/kl_div.py
@@ -0,0 +1,44 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(log_q, log_or_p_target, output, eps, log_target):
+    # Element-wise KL term p * (log_p - log_q), evaluated in float32.
+    # log_q: log-probabilities (the input of the KL divergence).
+    # log_or_p_target: log(p) when log_target is true, otherwise p itself.
+    # p is clamped to [eps, 1] exactly once per branch (assumes eps <= 1).
+    lower = ntl.cast(eps, ntl.float32)
+    upper = ntl.cast(1.0, ntl.float32)
+    target = ntl.cast(log_or_p_target, ntl.float32)
+
+    if log_target:
+        # Target is log(p): use it as log_p and clamp only the weight p.
+        log_p = target
+        p = ntl.minimum(ntl.maximum(ntl.exp(log_p), lower), upper)
+    else:
+        # Target is p: clamp before the log so log(p) stays finite at p == 0.
+        p = ntl.minimum(ntl.maximum(target, lower), upper)
+        log_p = ntl.log(p)
+
+    # The result is handed back by assigning to `output` (hence the noqa).
+    loss = p * (log_p - ntl.cast(log_q, ntl.float32))
+    output = ntl.cast(loss, output.dtype)  # noqa: F841
+
+
+def premake(ndim, eps=1e-10, log_target=False, dtype=None, block_size=None):
+    """Return the (arrangement, application, tensors) triple for kl_div."""
+    # Element-wise operands, in order: log_q (input), target, output.
+    operands = tuple(Tensor(ndim, dtype=dtype) for _ in range(3))
+    # Compile-time scalars carrying their values: eps, then log_target.
+    constants = (
+        Tensor(0, constexpr=True, value=eps),
+        Tensor(0, constexpr=True, value=log_target),
+    )
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, operands + constants
diff --git a/src/ntops/torch/__init__.py b/src/ntops/torch/__init__.py
index 82fc596..0599950 100644
--- a/src/ntops/torch/__init__.py
+++ b/src/ntops/torch/__init__.py
@@ -7,8 +7,11 @@
 from ntops.torch.bitwise_or import bitwise_or
 from ntops.torch.bmm import bmm
 from ntops.torch.clamp import clamp
+from ntops.torch.combinations import combinations
 from ntops.torch.conv2d import conv2d
+from ntops.torch.corrcoef import corrcoef
 from ntops.torch.cos import cos
+from ntops.torch.count_nonzero import count_nonzero
 from ntops.torch.div import div
 from ntops.torch.dropout import dropout
 from ntops.torch.eq import eq
@@ -18,6 +21,7 @@
 from ntops.torch.gt import gt
 from ntops.torch.isinf import isinf
 from ntops.torch.isnan import isnan
+from ntops.torch.kl_div import kl_div
 from ntops.torch.layer_norm import layer_norm
 from ntops.torch.le import le
 from ntops.torch.lt import lt
@@ -25,6 +29,7 @@
 from ntops.torch.max_pool2d import max_pool2d
 from ntops.torch.mm import mm
 from ntops.torch.mul import mul
+from ntops.torch.narrow import narrow
 from ntops.torch.ne import ne
 from ntops.torch.neg import neg
 from ntops.torch.pow import pow
@@ -47,11 +52,14 @@
     "avg_pool2d",
     "bitwise_and",
     "bitwise_not",
+    "corrcoef",
     "bitwise_or",
     "bmm",
     "clamp",
+    "combinations",
     "conv2d",
     "cos",
+    "count_nonzero",
     "div",
     "dropout",
     "eq",
@@ -61,6 +69,7 @@
     "gt",
     "isinf",
     "isnan",
+    "kl_div",
     "layer_norm",
     "le",
     "lt",
@@ -68,6 +77,7 @@
     "max_pool2d",
     "mm",
     "mul",
+    "narrow",
     "ne",
     "neg",
     "pow",
diff --git a/src/ntops/torch/combinations.py b/src/ntops/torch/combinations.py
new file mode 100644
index 0000000..aebe7cc
--- /dev/null
+++ b/src/ntops/torch/combinations.py
@@ -0,0 +1,39 @@
+import torch
+
+
+def combinations(x, r=2, with_replacement=False):
+    """
+    Generate all length-r combinations of a 1D tensor in lexicographic order.
+
+    Args:
+        x: 1D input tensor of length n
+        r: Number of elements in each combination (default: 2, as in torch)
+        with_replacement: Whether an element may repeat within a combination
+
+    Returns:
+        Tensor whose rows are the combinations; shape (C(n, r), r) without
+        replacement. Without replacement, r > n yields an empty (0, r) tensor.
+
+    Raises:
+        ValueError: If x is not 1D or r is negative.
+
+    Examples:
+        >>> x = torch.tensor([1, 2, 3, 4])
+        >>> combinations(x, 2)
+        tensor([[1, 2],
+                [1, 3],
+                [1, 4],
+                [2, 3],
+                [2, 4],
+                [3, 4]])
+    """
+    if x.ndim != 1:
+        raise ValueError(f"Input must be 1D, got {x.ndim}D tensor")
+    if r < 0:
+        raise ValueError(f"r must be non-negative, got {r}")
+
+    # With replacement, r > n still has results, so only short-circuit without.
+    if not with_replacement and r > x.shape[0]:
+        return torch.empty(0, r, dtype=x.dtype, device=x.device)
+
+    return torch.combinations(x, r=r, with_replacement=with_replacement)
diff --git a/src/ntops/torch/corrcoef.py b/src/ntops/torch/corrcoef.py
new file mode 100644
index 0000000..8dec90e
--- /dev/null
+++ b/src/ntops/torch/corrcoef.py
@@ -0,0 +1,23 @@
+import torch
+
+
+def corrcoef(x):
+    """
+    Compute the Pearson correlation coefficient matrix via torch.corrcoef.
+
+    Each row of x is a variable, each column is an observation.
+
+    Args:
+        x: 2D input tensor of shape (N_vars, N_obs)
+
+    Returns:
+        Tensor of shape (N_vars, N_vars); a single variable yields a 0-dim
+        tensor. Zero-variance (constant) rows produce NaN entries.
+
+    Examples:
+        >>> x = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
+        >>> corrcoef(x)
+        tensor([[1., 1.],
+                [1., 1.]])
+    """
+    return torch.corrcoef(x)
diff --git a/src/ntops/torch/count_nonzero.py b/src/ntops/torch/count_nonzero.py
new file mode 100644
index 0000000..d61205d
--- /dev/null
+++ b/src/ntops/torch/count_nonzero.py
@@ -0,0 +1,33 @@
+import torch
+
+
+def count_nonzero(x, dim=None, keepdim=False):
+    """
+    Count the number of non-zero elements in a tensor.
+
+    Args:
+        x: Input tensor
+        dim: Dimension or tuple of dimensions to count along (None: all)
+        keepdim: Keep the reduced dimensions with size 1 (default: False).
+            Ignored when dim is None.
+
+    Returns:
+        Scalar tensor if dim is None, otherwise the counts along dim.
+
+    Examples:
+        >>> x = torch.tensor([[1, 0, 3], [0, 5, 0]])
+        >>> count_nonzero(x, dim=1, keepdim=True)
+        tensor([[2],
+                [1]])
+    """
+    if dim is None:
+        return torch.count_nonzero(x)
+
+    result = torch.count_nonzero(x, dim=dim)
+    if keepdim:
+        if not isinstance(dim, (tuple, list)):
+            return result.unsqueeze(dim)
+        # Several dims: re-insert axes in ascending order so indices stay valid.
+        for axis in sorted(d + x.ndim if d < 0 else d for d in dim):
+            result = result.unsqueeze(axis)
+    return result
diff --git a/src/ntops/torch/kl_div.py b/src/ntops/torch/kl_div.py
new file mode 100644
index 0000000..4192866
--- /dev/null
+++ b/src/ntops/torch/kl_div.py
@@ -0,0 +1,44 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def kl_div(input, target, reduction="sum", log_target=False, eps=1e-10):
+    """
+    Compute the KL divergence loss, element-wise ``p * (log_p - log_q)``.
+
+    Args:
+        input: Log-probabilities (log_q), same shape as target
+        target: Probabilities (p), or log-probabilities if log_target=True
+        reduction: 'none' | 'sum' | 'mean' | 'batchmean' (default: 'sum')
+        log_target: Whether target is in log space (default: False)
+        eps: Lower clamp applied to p for numerical stability (default: 1e-10)
+
+    Returns:
+        Element-wise loss for 'none', otherwise the reduced scalar tensor.
+
+    Raises:
+        ValueError: If reduction is not one of the four supported modes.
+
+    Examples:
+        >>> log_q = torch.tensor([0.5, 0.5]).log()
+        >>> p = torch.tensor([0.5, 0.5])
+        >>> kl_div(log_q, p, reduction='sum')  # zero up to float rounding
+        tensor(0.)
+    """
+    if reduction not in ("none", "sum", "mean", "batchmean"):
+        raise ValueError(
+            f"reduction must be one of 'none', 'sum', 'mean', 'batchmean', got '{reduction}'"
+        )
+
+    loss = torch.empty_like(input)
+    kernel = _cached_make(ntops.kernels.kl_div.premake, input.ndim, eps, log_target)
+    kernel(input, target, loss, eps, log_target)
+
+    if reduction == "none":
+        return loss
+    if reduction == "mean":
+        return loss.mean()
+    total = loss.sum()
+    return total if reduction == "sum" else total / input.shape[0]
diff --git a/src/ntops/torch/narrow.py b/src/ntops/torch/narrow.py
new file mode 100644
index 0000000..2fec248
--- /dev/null
+++ b/src/ntops/torch/narrow.py
@@ -0,0 +1,29 @@
+import torch
+
+
+def narrow(x, dim, start, length):
+    """
+    Select `length` consecutive entries of `x` along `dim`, from `start`.
+
+    Thin wrapper over torch.narrow; the result is a view, not a copy.
+
+    Args:
+        x: Input tensor
+        dim: Dimension along which to narrow
+        start: Index of the first selected element
+        length: Number of elements to select
+
+    Returns:
+        A view of the input tensor narrowed along dim.
+
+    Examples:
+        >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+        >>> narrow(x, 1, 1, 2)
+        tensor([[2, 3],
+                [5, 6],
+                [8, 9]])
+        >>> narrow(x, 0, 0, 2)
+        tensor([[1, 2, 3],
+                [4, 5, 6]])
+    """
+    return torch.narrow(x, dim, start, length)
diff --git a/tests/test_combinations.py b/tests/test_combinations.py
new file mode 100644
index 0000000..bcb10a6
--- /dev/null
+++ b/tests/test_combinations.py
@@ -0,0 +1,110 @@
+import itertools
+
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+
+
+def combinations_cpu(x, r):
+    """CPU reference using itertools."""
+    comb = list(itertools.combinations(x.tolist(), r))
+    if not comb:
+        return torch.empty(0, r, dtype=x.dtype)
+    return torch.tensor(comb, dtype=x.dtype)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
+def test_combinations_basic(dtype):
+    """C(4, 2) = 6 combinations."""
+    x = torch.tensor([1, 2, 3, 4], dtype=dtype, device="cuda")
+    result = ntops.torch.combinations(x, 2)
+    expected = combinations_cpu(x.cpu(), 2).to("cuda")
+    assert result.shape == expected.shape
+    assert torch.equal(result, expected)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
+def test_combinations_r1(dtype):
+    """r = 1: each element individually."""
+    x = torch.tensor([5, 6, 7], dtype=dtype, device="cuda")
+    result = ntops.torch.combinations(x, 1)
+    expected = combinations_cpu(x.cpu(), 1).to("cuda")
+    assert torch.equal(result, expected)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
+def test_combinations_r_n(dtype):
+    """r = n: single combination = the whole array."""
+    x = torch.tensor([1, 2, 3], dtype=dtype, device="cuda")
+    result = ntops.torch.combinations(x, 3)
+    expected = combinations_cpu(x.cpu(), 3).to("cuda")
+    assert torch.equal(result, expected)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
+def test_combinations_r0(dtype):
+    """r = 0: empty combinations (returns 1D empty tensor on this torch version)."""
+    x = torch.tensor([1, 2, 3], dtype=dtype, device="cuda")
+    result = ntops.torch.combinations(x, 0)
+    # torch.combinations(x, r=0) returns shape (0,) — 1D empty
+    assert result.ndim >= 1
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
+def test_combinations_large(dtype):
+    """C(10, 3) = 120 combinations."""
+    x = torch.arange(10, dtype=dtype, device="cuda")
+    result = ntops.torch.combinations(x, 3)
+    expected = combinations_cpu(x.cpu(), 3).to("cuda")
+    assert torch.equal(result, expected)
+    assert result.shape == (120, 3)
+
+
+@skip_if_cuda_not_available
+def test_combinations_edge_cases():
+    """Edge cases."""
+    # r > n → empty
+    x = torch.tensor([1, 2, 3], device="cuda")
+    result = ntops.torch.combinations(x, 5)
+    assert result.numel() == 0
+    assert result.shape == (0, 5)
+
+    # r < 0 → error
+    with pytest.raises(ValueError):
+        ntops.torch.combinations(x, -1)
+
+    # 2D input → error
+    x2d = torch.tensor([[1, 2], [3, 4]], device="cuda")
+    with pytest.raises(ValueError):
+        ntops.torch.combinations(x2d, 2)
+
+    # Single element
+    x = torch.tensor([42], device="cuda")
+    result = ntops.torch.combinations(x, 1)
+    assert result.item() == 42
+    assert result.shape == (1, 1)
+
+
+@skip_if_cuda_not_available
+def test_combinations_float16():
+    """float16 dtype."""
+    x = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float16, device="cuda")
+    result = ntops.torch.combinations(x, 2)
+    expected = torch.combinations(x, r=2)
+    assert torch.equal(result, expected)
+
+
+@skip_if_cuda_not_available
+def test_combinations_gpu_roundtrip():
+    """Verify GPU tensor stays on GPU."""
+    x = torch.tensor([10, 20, 30, 40, 50], device="cuda")
+    result = ntops.torch.combinations(x, 3)
+    assert result.is_cuda
+    assert result.shape == (10, 3)  # C(5,3) = 10
diff --git a/tests/test_corrcoef.py b/tests/test_corrcoef.py
new file mode 100644
index 0000000..47942f2
--- /dev/null
+++ b/tests/test_corrcoef.py
@@ -0,0 +1,73 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+
+
+@skip_if_cuda_not_available
+def test_corrcoef_basic():
+    """Basic correlation coefficient computation."""
+    x = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], device="cuda")
+    result = ntops.torch.corrcoef(x)
+    expected = torch.corrcoef(x)
+    assert torch.allclose(result, expected)
+    assert result.shape == (2, 2)
+
+
+@skip_if_cuda_not_available
+def test_corrcoef_identity():
+    """Perfect correlation with itself — diagonal should be 1."""
+    x = torch.randn(3, 100, device="cuda")
+    result = ntops.torch.corrcoef(x)
+    expected = torch.corrcoef(x)
+    assert torch.allclose(result, expected)
+    assert torch.allclose(result.diag(), torch.ones(3, device="cuda"))
+
+
+@skip_if_cuda_not_available
+def test_corrcoef_constant():
+    """Constant input — should produce NaN (division by zero variance)."""
+    x = torch.ones(3, 5, device="cuda")
+    result = ntops.torch.corrcoef(x)
+    expected = torch.corrcoef(x)
+    assert torch.equal(torch.isnan(result), torch.isnan(expected))
+
+
+@skip_if_cuda_not_available
+def test_corrcoef_float16():
+    """float16 precision."""
+    x = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=torch.float16, device="cuda")
+    result = ntops.torch.corrcoef(x)
+    expected = torch.corrcoef(x)
+    assert torch.allclose(result, expected, rtol=1e-3, atol=1e-3)
+
+
+@skip_if_cuda_not_available
+def test_corrcoef_float64():
+    """float64 precision."""
+    x = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=torch.float64, device="cuda")
+    result = ntops.torch.corrcoef(x)
+    expected = torch.corrcoef(x)
+    assert torch.allclose(result, expected)
+
+
+@skip_if_cuda_not_available
+def test_corrcoef_negative_correlation():
+    """Test negative correlation."""
+    x = torch.tensor([[1.0, 2.0, 3.0, 4.0], [4.0, 3.0, 2.0, 1.0]], device="cuda")
+    result = ntops.torch.corrcoef(x)
+    expected = torch.corrcoef(x)
+    assert torch.allclose(result, expected)
+    # Off-diagonal should be negative
+    assert result[0, 1] < 0
+
+
+@skip_if_cuda_not_available
+def test_corrcoef_single_variable():
+    """Single variable — returns a scalar 1.0 (same as torch.corrcoef)."""
+    x = torch.tensor([[1.0, 2.0, 3.0, 4.0, 5.0]], device="cuda")
+    result = ntops.torch.corrcoef(x)
+    expected = torch.corrcoef(x)
+    assert torch.allclose(result, expected)
+    assert result.ndim == 0  # torch.corrcoef returns scalar for single var
diff --git a/tests/test_count_nonzero.py b/tests/test_count_nonzero.py
new file mode 100644
index 0000000..88bd9a4
--- /dev/null
+++ b/tests/test_count_nonzero.py
@@ -0,0 +1,109 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+
+
+@skip_if_cuda_not_available
+def test_count_nonzero_basic():
+    """Basic counting of nonzero elements."""
+    x = torch.tensor([[1, 0, 3], [0, 5, 0]], device="cuda")
+    result = ntops.torch.count_nonzero(x)
+    expected = torch.count_nonzero(x)
+    assert result.item() == expected.item()
+    assert result.item() == 3
+
+
+@skip_if_cuda_not_available
+def test_count_nonzero_all_zero():
+    """All zero input."""
+    x = torch.zeros(3, 4, device="cuda")
+    result = ntops.torch.count_nonzero(x)
+    expected = torch.count_nonzero(x)
+    assert result.item() == expected.item()
+    assert result.item() == 0
+
+
+@skip_if_cuda_not_available
+def test_count_nonzero_all_nonzero():
+    """All nonzero input."""
+    x = torch.ones(3, 4, device="cuda")
+    result = ntops.torch.count_nonzero(x)
+    expected = torch.count_nonzero(x)
+    assert result.item() == expected.item()
+    assert result.item() == 12
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dim", [0, 1])
+def test_count_nonzero_dim(dim):
+    """Counting along a specific dimension."""
+    x = torch.tensor([[1, 0, 3], [0, 5, 0]], device="cuda")
+    result = ntops.torch.count_nonzero(x, dim=dim)
+    expected = torch.count_nonzero(x, dim=dim)
+    assert torch.equal(result, expected)
+
+
+@skip_if_cuda_not_available
+def test_count_nonzero_keepdim():
+    """keepdim=True keeps the reduced axis with size 1 (hand-computed counts)."""
+    x = torch.tensor([[1, 0, 3], [0, 5, 0]], device="cuda")
+
+    result0 = ntops.torch.count_nonzero(x, dim=0, keepdim=True)
+    expected0 = torch.tensor([[1, 1, 1]], device="cuda")
+    assert torch.equal(result0, expected0)
+    assert result0.shape == (1, 3)
+
+    result1 = ntops.torch.count_nonzero(x, dim=-1, keepdim=True)
+    expected1 = torch.tensor([[2], [1]], device="cuda")
+    assert torch.equal(result1, expected1)
+    assert result1.shape == (2, 1)
+
+
+@skip_if_cuda_not_available
+def test_count_nonzero_float():
+    """Float tensor with zeros."""
+    x = torch.tensor([0.0, 1.5, -2.3, 0.0, 3.14], device="cuda")
+    result = ntops.torch.count_nonzero(x)
+    expected = torch.count_nonzero(x)
+    assert result.item() == expected.item()
+    assert result.item() == 3
+
+
+@skip_if_cuda_not_available
+def test_count_nonzero_3d():
+    """3D tensor."""
+    x = torch.tensor([[[1, 0], [0, 0]], [[0, 2], [3, 0]]], device="cuda")
+    result = ntops.torch.count_nonzero(x)
+    expected = torch.count_nonzero(x)
+    assert result.item() == expected.item()
+    assert result.item() == 3
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dim", [0, 1, 2, -1])
+def test_count_nonzero_3d_dim(dim):
+    """3D tensor reduced along each dimension, including a negative index."""
+    x = torch.tensor([[[1, 0], [0, 0]], [[0, 2], [3, 0]]], device="cuda")
+    result = ntops.torch.count_nonzero(x, dim=dim)
+    expected = torch.count_nonzero(x, dim=dim)
+    assert torch.equal(result, expected)
+
+
+@skip_if_cuda_not_available
+def test_count_nonzero_large():
+    """Large random tensor."""
+    x = torch.randint(0, 5, (100, 100), device="cuda")
+    result = ntops.torch.count_nonzero(x)
+    expected = torch.count_nonzero(x)
+    assert result.item() == expected.item()
+
+
+@skip_if_cuda_not_available
+def test_count_nonzero_empty():
+    """Empty tensor."""
+    x = torch.empty(0, 3, device="cuda")
+    result = ntops.torch.count_nonzero(x)
+    expected = torch.count_nonzero(x)
+    assert result.item() == expected.item()
diff --git a/tests/test_kl_div.py b/tests/test_kl_div.py
new file mode 100644
index 0000000..5187dc6
--- /dev/null
+++ b/tests/test_kl_div.py
@@ -0,0 +1,130 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+
+
+def kl_div_cpu(input, target, reduction="sum", log_target=False, eps=1e-10):
+    """Plain-torch reference matching the spec."""
+    if log_target:
+        log_p = target
+        p = torch.exp(log_p)
+    else:
+        p = torch.clamp(target, min=eps, max=1.0)
+        log_p = torch.log(p)
+    p = torch.clamp(p, min=eps, max=1.0)
+    loss = p * (log_p - input)
+    if reduction == "none":
+        return loss
+    elif reduction == "sum":
+        return loss.sum()
+    elif reduction == "mean":
+        return loss.mean()
+    elif reduction == "batchmean":
+        return loss.sum() / loss.shape[0]
+
+
+DTYPE_TOLERANCES = [
+    (torch.float32, 1e-5, 1e-5),
+    (torch.float16, 1e-3, 1e-3),
+]
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype, rtol, atol", DTYPE_TOLERANCES)
+def test_kl_div_identical(dtype, rtol, atol):
+    """KL(q||q) = 0 when distributions are identical."""
+    log_q = torch.tensor([-0.6931, -0.6931, -1.0986], dtype=dtype, device="cuda")
+    target = torch.tensor([0.5, 0.5, 0.333], dtype=dtype, device="cuda")
+    result = ntops.torch.kl_div(log_q, target, reduction="sum")
+    expected = kl_div_cpu(log_q, target, reduction="sum")
+    assert torch.allclose(result, expected, rtol=rtol, atol=atol)
+    assert not torch.isnan(result).any()
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype, rtol, atol", DTYPE_TOLERANCES)
+def test_kl_div_log_target(dtype, rtol, atol):
+    """KL divergence with log_target=True."""
+    log_q = torch.tensor([-1.0, -0.5, -0.2], dtype=dtype, device="cuda")
+    log_target = torch.tensor([-1.0, -0.5, -0.2], dtype=dtype, device="cuda")
+    result = ntops.torch.kl_div(log_q, log_target, reduction="sum", log_target=True)
+    expected = kl_div_cpu(log_q, log_target, reduction="sum", log_target=True)
+    assert torch.allclose(result, expected, rtol=rtol, atol=atol)
+    assert not torch.isnan(result).any()
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype, rtol, atol", DTYPE_TOLERANCES)
+def test_kl_div_different(dtype, rtol, atol):
+    """KL divergence between different distributions."""
+    log_q = torch.tensor([-0.6931, -0.6931], dtype=dtype, device="cuda")  # log(0.5), log(0.5)
+    target = torch.tensor([0.9, 0.1], dtype=dtype, device="cuda")
+    result = ntops.torch.kl_div(log_q, target, reduction="sum")
+    expected = kl_div_cpu(log_q, target, reduction="sum")
+    assert torch.allclose(result, expected, rtol=rtol, atol=atol)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype, rtol, atol", DTYPE_TOLERANCES)
+def test_kl_div_reduction_none(dtype, rtol, atol):
+    """No reduction — return element-wise loss."""
+    log_q = torch.tensor([-1.0, -0.5], dtype=dtype, device="cuda")
+    target = torch.tensor([0.2, 0.8], dtype=dtype, device="cuda")
+    result = ntops.torch.kl_div(log_q, target, reduction="none")
+    expected = kl_div_cpu(log_q, target, reduction="none")
+    assert torch.allclose(result, expected, rtol=rtol, atol=atol)
+    assert result.shape == log_q.shape
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype, rtol, atol", DTYPE_TOLERANCES)
+def test_kl_div_reduction_mean(dtype, rtol, atol):
+    """Mean reduction."""
+    log_q = torch.tensor([-0.6931, -0.6931, -0.5108, -0.5108], dtype=dtype, device="cuda")
+    target = torch.tensor([0.9, 0.1, 0.5, 0.5], dtype=dtype, device="cuda")
+    result = ntops.torch.kl_div(log_q, target, reduction="mean")
+    expected = kl_div_cpu(log_q, target, reduction="mean")
+    assert torch.allclose(result, expected, rtol=rtol, atol=atol)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("dtype, rtol, atol", DTYPE_TOLERANCES)
+def test_kl_div_reduction_batchmean(dtype, rtol, atol):
+    """Batchmean reduction."""
+    log_q = torch.randn(4, 3, dtype=dtype, device="cuda").log_softmax(dim=1)
+    target = torch.randn(4, 3, dtype=dtype, device="cuda").softmax(dim=1)
+    result = ntops.torch.kl_div(log_q, target, reduction="batchmean")
+    expected = kl_div_cpu(log_q, target, reduction="batchmean")
+    assert torch.allclose(result, expected, rtol=rtol, atol=atol)
+
+
+@skip_if_cuda_not_available
+def test_kl_div_edge_cases():
+    """Edge cases."""
+    # Empty tensor
+    x = torch.empty(0, 3, device="cuda")
+    result = ntops.torch.kl_div(x, x, reduction="sum")
+    assert result.item() == 0.0
+
+    # Target at boundaries (0 and 1) — should be clamped
+    log_q = torch.tensor([-0.6931, -0.6931], device="cuda")
+    target = torch.tensor([0.0, 1.0], device="cuda")
+    result = ntops.torch.kl_div(log_q, target, reduction="sum")
+    assert not torch.isnan(result).any()
+    assert not torch.isinf(result).any()
+
+    # Invalid reduction
+    with pytest.raises(ValueError):
+        ntops.torch.kl_div(log_q, target, reduction="invalid")
+
+
+@skip_if_cuda_not_available
+def test_kl_div_float64():
+    """float64 precision."""
+    log_q = torch.tensor([-0.693147, -0.693147], device="cuda", dtype=torch.float64)
+    target = torch.tensor([0.5, 0.5], device="cuda", dtype=torch.float64)
+    result = ntops.torch.kl_div(log_q, target, reduction="sum")
+    expected = kl_div_cpu(log_q, target, reduction="sum")
+    assert torch.allclose(result, expected, rtol=1e-7, atol=1e-7)
diff --git a/tests/test_narrow.py b/tests/test_narrow.py
new file mode 100644
index 0000000..62ce8b2
--- /dev/null
+++ b/tests/test_narrow.py
@@ -0,0 +1,33 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(
+    "dim, start, length", [(0, 0, 2), (1, 1, 2), (0, 1, 1), (1, 0, 4)]
+)
+def test_narrow_basic(dim, start, length):
+    """Narrowing a 2D tensor along either dim matches torch.narrow."""
+    x = torch.arange(12, device="cuda").reshape(3, 4)
+    result = ntops.torch.narrow(x, dim, start, length)
+    expected = torch.narrow(x, dim, start, length)
+    assert torch.equal(result, expected)
+
+
+@skip_if_cuda_not_available
+def test_narrow_1d():
+    """1D input with a hand-written expected slice."""
+    x = torch.tensor([1, 2, 3, 4, 5], device="cuda")
+    result = ntops.torch.narrow(x, 0, 2, 2)
+    assert torch.equal(result, torch.tensor([3, 4], device="cuda"))
+
+
+@skip_if_cuda_not_available
+def test_narrow_float16():
+    """float16 dtype."""
+    x = torch.randn(10, device="cuda", dtype=torch.float16)
+    result = ntops.torch.narrow(x, 0, 3, 4)
+    assert torch.equal(result, torch.narrow(x, 0, 3, 4))

From 0cea4fbd211eb1596154325e83e11baa0eb45356 Mon Sep 17 00:00:00 2001
From: Ifelseer <1138369491@qq.com>
Date: Sun, 21 Jun 2026 05:30:17 +0000
Subject: [PATCH 2/2] Add signed HONOR_CODE.md

---
 HONOR_CODE.md | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 HONOR_CODE.md

diff --git a/HONOR_CODE.md b/HONOR_CODE.md
new file mode 100644
index 0000000..c93078f
--- /dev/null
+++ b/HONOR_CODE.md
@@ -0,0 +1,73 @@
+```
+# 2026 春季启元人工智能大赛诚信守则（Honor Code）
+
+
+本人作为 2026 春季启元人工智能大赛（以下简称“比赛”）的参赛选手，郑重承诺严格遵守比赛规则及本诚信守则，秉持诚信、公正、廉洁的参赛原则，自觉维护比赛的公平性与严肃性。本人充分理解并认可，违反本准则将导致参赛资格被取消、比赛成绩作废等相应后果，且愿意承担由此产生的一切责任。
+
+## 一、参赛诚信承诺
+
+1. 本人保证所提交的赛题PR（Pull Request）中包含的算子实现代码及相关文档，均为本人（及参赛团队，如为团队参赛）在比赛期间独立完成或在明确标注参考来源的基础上进行开发，不存在任何欺诈、抄袭、作弊行为。
+
+2. 本人承诺主动、全面、真实地披露赛题实现过程中所有参考的外部资源，尤其是开源代码资源，不隐瞒任何可能影响比赛公平性的信息。
+
+3. 本人保证不采用任何不正当手段获取比赛优势，包括但不限于窃取其他参赛选手的代码成果、利用非比赛允许的工具或技术、与他人串通作弊等。
+
+## 二、参考资源说明
+
+本人确认已按比赛要求，将本次赛题实现过程中涉及的参考资源信息单独撰写至`REFERENCE.md`文件中，该文件将与本诚信守则一同作为PR附件提交。`REFERENCE.md`需根据实际参考情况，按以下要求完整填写，信息不完整或虚假填写将视为违反本准则：
+
+**情况1：无参考外部开源代码及核心实现思路**
+
+`REFERENCE.md`中需明确声明：“本次赛题提交的算子代码、核心算法逻辑及实现方案均为本人（及参赛团队）独立设计与开发，未参考任何外部开源项目、技术文档中的核心代码片段或实现思路，未接受任何第三方的技术指导或代码支持。”
+
+**情况2：有参考外部开源代码及相关资源**
+
+对每个参考资源，需提供以下信息：
+1. 参考开源项目/资源名称
+
+2. 参考资源链接（GitHub/Gitee/论文/技术文档等）
+
+3. 参考的具体内容（请明确说明参考的代码片段、算法逻辑、实现思路等，需标注对应资源的具体位置，如文件路径、代码行数等）
+    
+4. 本人对参考内容的修改与优化说明：（请详细说明在参考基础上，本人所做的独立开发、修改、优化工作，体现自身技术贡献）
+    
+5. 若是开源项目，提供参考资源的开源协议类型：（如MIT、Apache 2.0、GPL等）
+    
+6. 其他需要补充说明的信息
+    
+
+## 三、禁止行为确认
+
+本人明确知晓并承诺避免以下违反比赛公平性的行为，若存在以下任一情况，自愿接受比赛组委会的相应处罚：
+
+1. 未经授权复制、抄袭他人（包括其他参赛选手、开源项目、商业代码）的代码、算法或技术方案，且未进行明确标注；
+    
+2. 隐瞒或虚假披露参考资源信息，包括遗漏重要参考来源、伪造参考内容说明等；
+    
+3. 与其他参赛选手或第三方串通，进行代码共享、成果交换等违规协作；
+    
+4. 利用比赛平台漏洞、技术缺陷或非比赛允许的工具获取不正当利益；
+    
+5. 伪造比赛相关证明材料、提交虚假信息；
+    
+6. 其他违反比赛规则及公序良俗的不诚信行为。
+    
+
+## 四、责任与确认
+
+1. 本人充分理解，比赛组委会将对所有提交的PR进行代码溯源、参考信息核查等公平性审查，若发现本人存在违反本准则的行为，有权随时取消本人的参赛资格、作废比赛成绩，情节严重的将在比赛相关平台进行公示。
+
+2. 若因本人违反本准则导致比赛争议或第三方权益受损（如开源协议侵权等），本人将独立承担全部法律责任及相关损失，与比赛组委会无关。
+
+3. 本人确认已仔细阅读并完全理解本诚信守则的全部内容，自愿签署本准则，接受比赛组委会的监督与审查。
+
+## 五、签署信息
+
+参赛选手姓名（团队参赛需填写所有成员姓名）
+王一鸣
+
+
+签署日期
+
+2026年6月1日
+```