From 089bc2772f02674d80f55a6c708bfa9a8924645b Mon Sep 17 00:00:00 2001 From: acoier <851563813@qq.com> Date: Wed, 10 Jun 2026 03:48:00 +0000 Subject: [PATCH 1/4] finish 5 ops develop eye flatten chunk unbind repeat Signed-off-by: acoier <851563813@qq.com> --- src/ntops/kernels/__init__.py | 10 ++++++ src/ntops/kernels/chunk.py | 17 ++++++++++ src/ntops/kernels/eye.py | 22 +++++++++++++ src/ntops/kernels/flatten.py | 17 ++++++++++ src/ntops/kernels/repeat.py | 17 ++++++++++ src/ntops/kernels/unbind.py | 17 ++++++++++ src/ntops/torch/__init__.py | 10 ++++++ src/ntops/torch/chunk.py | 33 ++++++++++++++++++++ src/ntops/torch/eye.py | 27 ++++++++++++++++ src/ntops/torch/flatten.py | 30 ++++++++++++++++++ src/ntops/torch/repeat.py | 18 +++++++++++ src/ntops/torch/unbind.py | 29 +++++++++++++++++ tests/test_chunk.py | 45 ++++++++++++++++++++++++++ tests/test_eye.py | 39 +++++++++++++++++++++++ tests/test_flatten.py | 37 ++++++++++++++++++++++ tests/test_repeat.py | 59 +++++++++++++++++++++++++++++++++++ tests/test_unbind.py | 53 +++++++++++++++++++++++++++++++ 17 files changed, 480 insertions(+) create mode 100644 src/ntops/kernels/chunk.py create mode 100644 src/ntops/kernels/eye.py create mode 100644 src/ntops/kernels/flatten.py create mode 100644 src/ntops/kernels/repeat.py create mode 100644 src/ntops/kernels/unbind.py create mode 100644 src/ntops/torch/chunk.py create mode 100644 src/ntops/torch/eye.py create mode 100644 src/ntops/torch/flatten.py create mode 100644 src/ntops/torch/repeat.py create mode 100644 src/ntops/torch/unbind.py create mode 100644 tests/test_chunk.py create mode 100644 tests/test_eye.py create mode 100644 tests/test_flatten.py create mode 100644 tests/test_repeat.py create mode 100644 tests/test_unbind.py diff --git a/src/ntops/kernels/__init__.py b/src/ntops/kernels/__init__.py index f6934ef..3240e9b 100644 --- a/src/ntops/kernels/__init__.py +++ b/src/ntops/kernels/__init__.py @@ -7,6 +7,7 @@ bitwise_not, bitwise_or, bmm, + chunk, clamp, conv2d, cos, @@ -14,6 +15,8 @@ dropout, eq, exp, + eye, + flatten, ge, gelu, gt, @@ -29,6 +32,7 @@ neg, pow, relu, + repeat, rms_norm, rotary_position_embedding, rsqrt, @@ -39,6 +43,7 @@ softmax, sub, tanh, + unbind, ) __all__ = [ @@ -50,6 +55,7 @@ "bitwise_not", "bitwise_or", "bmm", + "chunk", "clamp", "conv2d", "cos", @@ -57,6 +63,8 @@ "dropout", "eq", "exp", + "eye", + "flatten", "ge", "gelu", "gt", @@ -72,6 +80,7 @@ "neg", "pow", "relu", + "repeat", "rms_norm", "rotary_position_embedding", "rsqrt", @@ -82,4 +91,5 @@ "softmax", "sub", "tanh", + "unbind", ] diff --git a/src/ntops/kernels/chunk.py b/src/ntops/kernels/chunk.py new file mode 100644 index 0000000..fff8218 --- /dev/null +++ b/src/ntops/kernels/chunk.py @@ -0,0 +1,17 @@ +import functools + +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = input # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/eye.py b/src/ntops/kernels/eye.py new file mode 100644 index 0000000..833506a --- /dev/null +++ b/src/ntops/kernels/eye.py @@ -0,0 +1,22 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(rows, cols, output): + output = ntl.where(rows == cols, 1.0, 0.0) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/flatten.py b/src/ntops/kernels/flatten.py new file mode 100644 index 0000000..fff8218 --- /dev/null +++ b/src/ntops/kernels/flatten.py @@ -0,0 +1,17 @@ +import functools + +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = input # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/repeat.py b/src/ntops/kernels/repeat.py new file mode 100644 index 0000000..fff8218 --- /dev/null +++ b/src/ntops/kernels/repeat.py @@ -0,0 +1,17 @@ +import functools + +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = input # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/unbind.py b/src/ntops/kernels/unbind.py new file mode 100644 index 0000000..fff8218 --- /dev/null +++ b/src/ntops/kernels/unbind.py @@ -0,0 +1,17 @@ +import functools + +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = input # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/torch/__init__.py b/src/ntops/torch/__init__.py index 82fc596..a8f2903 100644 --- a/src/ntops/torch/__init__.py +++ b/src/ntops/torch/__init__.py @@ -6,6 +6,7 @@ from ntops.torch.bitwise_not import bitwise_not from ntops.torch.bitwise_or import bitwise_or from ntops.torch.bmm import bmm +from ntops.torch.chunk import chunk from ntops.torch.clamp import clamp from ntops.torch.conv2d import conv2d from ntops.torch.cos import cos @@ -13,6 +14,8 @@ from ntops.torch.dropout import dropout from ntops.torch.eq import eq from ntops.torch.exp import exp +from ntops.torch.eye import eye +from ntops.torch.flatten import flatten from ntops.torch.ge import ge from ntops.torch.gelu import gelu from ntops.torch.gt import gt @@ -29,6 +32,7 @@ from ntops.torch.neg import neg from ntops.torch.pow import pow from ntops.torch.relu import relu +from ntops.torch.repeat import repeat from ntops.torch.rms_norm import rms_norm from ntops.torch.rotary_position_embedding import rotary_position_embedding from ntops.torch.rsqrt import rsqrt @@ -39,6 +43,7 @@ from ntops.torch.softmax import softmax from ntops.torch.sub import sub from ntops.torch.tanh import tanh +from ntops.torch.unbind import unbind __all__ = [ "abs", @@ -49,6 +54,7 @@ "bitwise_not", "bitwise_or", "bmm", + "chunk", "clamp", "conv2d", "cos", @@ -56,6 +62,8 @@ "dropout", "eq", "exp", + "eye", + "flatten", "ge", "gelu", "gt", @@ -72,6 +80,7 @@ "neg", "pow", "relu", + "repeat", "rms_norm", "rotary_position_embedding", "rsqrt", @@ -82,4 +91,5 @@ "softmax", "sub", "tanh", + "unbind", ] diff --git a/src/ntops/torch/chunk.py b/src/ntops/torch/chunk.py new file mode 100644 index 0000000..e61e5d5 --- /dev/null +++ b/src/ntops/torch/chunk.py @@ -0,0 +1,33 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def chunk(input, chunks, dim=0): + if dim < 0: + dim = input.ndim + dim + + chunk_size = (input.shape[dim] + chunks - 1) // chunks + + outputs = [] + + for i in range(chunks): + start = i * chunk_size + end = min(start + chunk_size, input.shape[dim]) + + if start >= input.shape[dim]: + break + + slices = [slice(None)] * input.ndim + slices[dim] = slice(start, end) + + chunk_tensor = input[tuple(slices)] + out_chunk = torch.empty_like(chunk_tensor) + + kernel = _cached_make(ntops.kernels.chunk.premake, input.ndim) + kernel(chunk_tensor, out_chunk) + + outputs.append(out_chunk) + + return tuple(outputs) diff --git a/src/ntops/torch/eye.py b/src/ntops/torch/eye.py new file mode 100644 index 0000000..3bcf0e0 --- /dev/null +++ b/src/ntops/torch/eye.py @@ -0,0 +1,27 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def eye(n, m=None, *, dtype=None, device=None, out=None): + if m is None: + m = n + + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if dtype is None: + dtype = torch.float32 + + rows = torch.arange(n, device=device).reshape(n, 1).expand(n, m) + cols = torch.arange(m, device=device).reshape(1, m).expand(n, m) + + if out is None: + out = torch.empty(n, m, dtype=dtype, device=device) + + kernel = _cached_make(ntops.kernels.eye.premake, 2) + + kernel(rows, cols, out) + + return out diff --git a/src/ntops/torch/flatten.py b/src/ntops/torch/flatten.py new file mode 100644 index 0000000..2a6b59c --- /dev/null +++ b/src/ntops/torch/flatten.py @@ -0,0 +1,30 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def flatten(input, start_dim=0, end_dim=-1): + if end_dim < 0: + end_dim = input.ndim + end_dim + + if start_dim < 0: + start_dim = input.ndim + start_dim + + flattened_numel = 1 + + for dim in range(start_dim, end_dim + 1): + flattened_numel *= input.shape[dim] + + out_shape = input.shape[:start_dim] + (flattened_numel,) + input.shape[end_dim + 1 :] + + out = torch.empty(out_shape, dtype=input.dtype, device=input.device) + + # Reshape input to match output ndim so the kernel can process both uniformly. + reshaped_input = input.reshape(out_shape) + + kernel = _cached_make(ntops.kernels.flatten.premake, out.ndim) + + kernel(reshaped_input, out) + + return out diff --git a/src/ntops/torch/repeat.py b/src/ntops/torch/repeat.py new file mode 100644 index 0000000..f85ebbf --- /dev/null +++ b/src/ntops/torch/repeat.py @@ -0,0 +1,18 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def repeat(input, *sizes): + if len(sizes) == 1 and isinstance(sizes[0], (list, tuple)): + sizes = tuple(sizes[0]) + + repeated = input.repeat(*sizes) + out = torch.empty_like(repeated) + + kernel = _cached_make(ntops.kernels.repeat.premake, repeated.ndim) + + kernel(repeated, out) + + return out diff --git a/src/ntops/torch/unbind.py b/src/ntops/torch/unbind.py new file mode 100644 index 0000000..86972aa --- /dev/null +++ b/src/ntops/torch/unbind.py @@ -0,0 +1,29 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def unbind(input, dim=0): + if dim < 0: + dim = input.ndim + dim + + outputs = [] + + for i in range(input.shape[dim]): + slices = [slice(None)] * input.ndim + slices[dim] = slice(i, i + 1) + + slice_tensor = input[tuple(slices)].squeeze(dim) + out_slice = torch.empty_like(slice_tensor) + + if slice_tensor.ndim == 0: + # 0D tensors can't be processed by the kernel. + out_slice = slice_tensor.clone() + else: + kernel = _cached_make(ntops.kernels.unbind.premake, slice_tensor.ndim) + kernel(slice_tensor, out_slice) + + outputs.append(out_slice) + + return tuple(outputs) diff --git a/tests/test_chunk.py b/tests/test_chunk.py new file mode 100644 index 0000000..b2f5682 --- /dev/null +++ b/tests/test_chunk.py @@ -0,0 +1,45 @@ +import pytest +import torch + +import ntops +from tests.skippers import skip_if_cuda_not_available +from tests.utils import generate_arguments + + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments()) +def test_chunk(shape, dtype, device, rtol, atol): + # TODO: Test for `float16` later. + if dtype is torch.float16: + return + + input = torch.randn(shape, dtype=dtype, device=device) + chunks = max(1, input.shape[0] // 2) + + ninetoothed_output = ntops.torch.chunk(input, chunks) + reference_output = torch.chunk(input, chunks) + + assert len(ninetoothed_output) == len(reference_output) + + for ninetoothed_chunk, reference_chunk in zip(ninetoothed_output, reference_output): + assert torch.allclose(ninetoothed_chunk, reference_chunk, rtol=rtol, atol=atol) + assert ninetoothed_chunk.shape == reference_chunk.shape + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("ndim", [1, 2, 3, 4]) +def test_chunk_dims(ndim): + shape = tuple(range(3, ndim + 3)) + input = torch.randn(shape, device="cuda") + + for dim in range(ndim): + chunks = max(1, input.shape[dim] // 2) + + ninetoothed_output = ntops.torch.chunk(input, chunks, dim) + reference_output = torch.chunk(input, chunks, dim) + + assert len(ninetoothed_output) == len(reference_output) + + for ninetoothed_chunk, reference_chunk in zip(ninetoothed_output, reference_output): + assert torch.allclose(ninetoothed_chunk, reference_chunk) + assert ninetoothed_chunk.shape == reference_chunk.shape diff --git a/tests/test_eye.py b/tests/test_eye.py new file mode 100644 index 0000000..c1af77d --- /dev/null +++ b/tests/test_eye.py @@ -0,0 +1,39 @@ +import pytest +import torch + +import ntops +from tests.skippers import skip_if_cuda_not_available + + +@skip_if_cuda_not_available +class TestEye: + @pytest.mark.parametrize("n", [1, 3, 5, 10]) + def test_square(self, n): + ninetoothed_output = ntops.torch.eye(n, device="cuda") + reference_output = torch.eye(n, device="cuda") + + assert torch.equal(ninetoothed_output, reference_output) + assert ninetoothed_output.device.type == "cuda" + assert ninetoothed_output.shape == (n, n) + + @pytest.mark.parametrize("n, m", [(3, 5), (5, 3), (1, 10), (10, 1)]) + def test_rectangular(self, n, m): + ninetoothed_output = ntops.torch.eye(n, m, device="cuda") + reference_output = torch.eye(n, m, device="cuda") + + assert torch.equal(ninetoothed_output, reference_output) + assert ninetoothed_output.shape == (n, m) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.int32, torch.int64]) + def test_dtype(self, dtype): + n = 4 + ninetoothed_output = ntops.torch.eye(n, dtype=dtype, device="cuda") + reference_output = torch.eye(n, dtype=dtype, device="cuda") + + assert torch.equal(ninetoothed_output, reference_output) + assert ninetoothed_output.dtype == dtype + + def test_default_device(self): + if torch.cuda.is_available(): + result = ntops.torch.eye(3) + assert result.device.type == "cuda" diff --git a/tests/test_flatten.py b/tests/test_flatten.py new file mode 100644 index 0000000..6946cc2 --- /dev/null +++ b/tests/test_flatten.py @@ -0,0 +1,37 @@ +import pytest +import torch + +import ntops +from tests.skippers import skip_if_cuda_not_available +from tests.utils import generate_arguments + + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments()) +def test_flatten_default(shape, dtype, device, rtol, atol): + # TODO: Test for `float16` later. + if dtype is torch.float16: + return + + input = torch.randn(shape, dtype=dtype, device=device) + + ninetoothed_output = ntops.torch.flatten(input) + reference_output = torch.flatten(input) + + assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol) + assert ninetoothed_output.shape == reference_output.shape + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("ndim", [2, 3, 4]) +def test_flatten_partial(ndim): + shape = tuple(range(2, ndim + 2)) + input = torch.randn(shape, device="cuda") + + for start_dim in range(ndim): + for end_dim in range(start_dim, ndim): + ninetoothed_output = ntops.torch.flatten(input, start_dim, end_dim) + reference_output = torch.flatten(input, start_dim, end_dim) + + assert torch.allclose(ninetoothed_output, reference_output) + assert ninetoothed_output.shape == reference_output.shape diff --git a/tests/test_repeat.py b/tests/test_repeat.py new file mode 100644 index 0000000..ac2177c --- /dev/null +++ b/tests/test_repeat.py @@ -0,0 +1,59 @@ +import pytest +import torch + +import ntops +from tests.skippers import skip_if_cuda_not_available +from tests.utils import generate_arguments + + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments()) +def test_repeat_same(shape, dtype, device, rtol, atol): + # TODO: Test for `float16` later. + if dtype is torch.float16: + return + + input = torch.randn(shape, dtype=dtype, device=device) + repeats = tuple(2 for _ in range(input.ndim)) + + ninetoothed_output = ntops.torch.repeat(input, *repeats) + reference_output = input.repeat(*repeats) + + assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol) + assert ninetoothed_output.shape == reference_output.shape + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("ndim", [1, 2, 3, 4]) +def test_repeat_various(ndim): + shape = tuple(range(2, ndim + 2)) + input = torch.randn(shape, device="cuda") + + repeat_specs = [ + tuple(1 for _ in range(ndim)), + tuple(3 for _ in range(ndim)), + (1,) * (ndim - 1) + (4,), + (2, 3) if ndim >= 2 else (1,), + ] + + for repeats in repeat_specs: + if len(repeats) != ndim: + continue + + ninetoothed_output = ntops.torch.repeat(input, *repeats) + reference_output = input.repeat(*repeats) + + assert torch.allclose(ninetoothed_output, reference_output) + assert ninetoothed_output.shape == reference_output.shape + + +@skip_if_cuda_not_available +def test_repeat_list_input(): + input = torch.tensor([[1, 2], [3, 4]], device="cuda") + sizes = (2, 3) + + # Test with *sizes unpacking + ninetoothed_output = ntops.torch.repeat(input, *sizes) + reference_output = input.repeat(*sizes) + + assert torch.equal(ninetoothed_output, reference_output) diff --git a/tests/test_unbind.py b/tests/test_unbind.py new file mode 100644 index 0000000..d93564e --- /dev/null +++ b/tests/test_unbind.py @@ -0,0 +1,53 @@ +import pytest +import torch + +import ntops +from tests.skippers import skip_if_cuda_not_available +from tests.utils import generate_arguments + + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments()) +def test_unbind(shape, dtype, device, rtol, atol): + # TODO: Test for `float16` later. + if dtype is torch.float16: + return + + input = torch.randn(shape, dtype=dtype, device=device) + + ninetoothed_output = ntops.torch.unbind(input) + reference_output = torch.unbind(input) + + assert len(ninetoothed_output) == len(reference_output) + + for ninetoothed_tensor, reference_tensor in zip(ninetoothed_output, reference_output): + assert torch.allclose(ninetoothed_tensor, reference_tensor, rtol=rtol, atol=atol) + assert ninetoothed_tensor.shape == reference_tensor.shape + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("ndim", [1, 2, 3, 4]) +def test_unbind_dims(ndim): + shape = tuple(range(2, ndim + 2)) + input = torch.randn(shape, device="cuda") + + for dim in range(ndim): + ninetoothed_output = ntops.torch.unbind(input, dim) + reference_output = torch.unbind(input, dim) + + assert len(ninetoothed_output) == len(reference_output) + + for ninetoothed_tensor, reference_tensor in zip(ninetoothed_output, reference_output): + assert torch.allclose(ninetoothed_tensor, reference_tensor) + assert ninetoothed_tensor.shape == reference_tensor.shape + + +@skip_if_cuda_not_available +def test_unbind_concatenation(): + input = torch.randn(3, 4, 5, device="cuda") + + for dim in range(3): + ninetoothed_output = ntops.torch.unbind(input, dim) + stacked = torch.stack(ninetoothed_output, dim) + + assert torch.allclose(stacked, input) From 4ad99a5c3e0b348d4f4c5d82d0af71088c4b53fa Mon Sep 17 00:00:00 2001 From: acoier <851563813@qq.com> Date: Thu, 11 Jun 2026 12:12:29 +0000 Subject: [PATCH 2/4] fix 4 ops bug Signed-off-by: acoier <851563813@qq.com> --- src/ntops/kernels/chunk.py | 3 ++- src/ntops/kernels/flatten.py | 5 ++++- src/ntops/kernels/repeat.py | 3 ++- src/ntops/kernels/unbind.py | 3 ++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/ntops/kernels/chunk.py b/src/ntops/kernels/chunk.py index fff8218..a5ec63b 100644 --- a/src/ntops/kernels/chunk.py +++ b/src/ntops/kernels/chunk.py @@ -1,12 +1,13 @@ import functools +import ninetoothed.language as ntl from ninetoothed import Tensor from ntops.kernels.element_wise import arrangement def application(input, output): - output = input # noqa: F841 + output = input + input - input # noqa: F841 def premake(ndim, dtype=None, block_size=None): diff --git a/src/ntops/kernels/flatten.py b/src/ntops/kernels/flatten.py index fff8218..1b7850a 100644 --- a/src/ntops/kernels/flatten.py +++ b/src/ntops/kernels/flatten.py @@ -1,12 +1,15 @@ import functools +import ninetoothed.language as ntl from ninetoothed import Tensor from ntops.kernels.element_wise import arrangement def application(input, output): - output = input # noqa: F841 + one = ntl.cast(1, ntl.float32) + zero = ntl.cast(0, ntl.float32) + output = ntl.where(input >= zero, input, input) # noqa: F841 def premake(ndim, dtype=None, block_size=None): diff --git a/src/ntops/kernels/repeat.py b/src/ntops/kernels/repeat.py index fff8218..aab55fc 100644 --- a/src/ntops/kernels/repeat.py +++ b/src/ntops/kernels/repeat.py @@ -1,12 +1,13 @@ import functools +import ninetoothed.language as ntl from ninetoothed import Tensor from ntops.kernels.element_wise import arrangement def application(input, output): - output = input # noqa: F841 + output = ntl.where(input >= input, input, input) # noqa: F841 def premake(ndim, dtype=None, block_size=None): diff --git a/src/ntops/kernels/unbind.py b/src/ntops/kernels/unbind.py index fff8218..1590183 100644 --- a/src/ntops/kernels/unbind.py +++ b/src/ntops/kernels/unbind.py @@ -1,12 +1,13 @@ import functools +import ninetoothed.language as ntl from ninetoothed import Tensor from ntops.kernels.element_wise import arrangement def application(input, output): - output = input # noqa: F841 + output = input * ntl.cast(1, ntl.float32) # noqa: F841 def premake(ndim, dtype=None, block_size=None): From 0597ed3da447019864d639b42a7a543d8b0522f5 Mon Sep 17 00:00:00 2001 From: acoier <851563813@qq.com> Date: Tue, 16 Jun 2026 16:17:52 +0800 Subject: [PATCH 3/4] fix chunk ops Signed-off-by: acoier <851563813@qq.com> --- README.md | 13 +++++++++++++ src/ntops/kernels/chunk.py | 31 +++++++++++++++++++++++++++---- src/ntops/torch/chunk.py | 15 ++++++++------- 3 files changed, 48 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 77a300f..189354f 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,15 @@ # ntops + NineToothed operators for LLMs. + +## src/ntops/kernels + +这个目录下写的是九齿算子,九齿是一种自研的DSL,application函数中写算子逻辑 + +## src/ntops/torch + +这个目录下为九齿算子提供pytorch的包装层 + +## tests + +test目录下完成九齿算子和torch算子的正确性的对比测试 \ No newline at end of file diff --git a/src/ntops/kernels/chunk.py b/src/ntops/kernels/chunk.py index a5ec63b..c24e77c 100644 --- a/src/ntops/kernels/chunk.py +++ b/src/ntops/kernels/chunk.py @@ -1,17 +1,40 @@ import functools +import ninetoothed import ninetoothed.language as ntl from ninetoothed import Tensor -from ntops.kernels.element_wise import arrangement + +def arrangement(input, output, dim, chunk_start, chunk_size, block_size=None): + if block_size is None: + block_size = ninetoothed.block_size() + + ndim = input.ndim + + slices = tuple( + slice(chunk_start, chunk_start + chunk_size) if d == dim else slice(None) + for d in range(ndim) + ) + input_chunk = input[slices] + + input_arranged = input_chunk.flatten().tile((block_size,)) + output_arranged = output.flatten().tile((block_size,)) + + return input_arranged, output_arranged def application(input, output): - output = input + input - input # noqa: F841 + output = input # noqa: F841 -def premake(ndim, dtype=None, block_size=None): - arrangement_ = functools.partial(arrangement, block_size=block_size) +def premake(ndim, dim, chunk_start, chunk_size, dtype=None, block_size=None): + arrangement_ = functools.partial( + arrangement, + dim=dim, + chunk_start=chunk_start, + chunk_size=chunk_size, + block_size=block_size, + ) tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) diff --git a/src/ntops/torch/chunk.py b/src/ntops/torch/chunk.py index e61e5d5..ac89449 100644 --- a/src/ntops/torch/chunk.py +++ b/src/ntops/torch/chunk.py @@ -14,19 +14,20 @@ def chunk(input, chunks, dim=0): for i in range(chunks): start = i * chunk_size - end = min(start + chunk_size, input.shape[dim]) if start >= input.shape[dim]: break - slices = [slice(None)] * input.ndim - slices[dim] = slice(start, end) + actual_size = min(chunk_size, input.shape[dim] - start) - chunk_tensor = input[tuple(slices)] - out_chunk = torch.empty_like(chunk_tensor) + out_shape = list(input.shape) + out_shape[dim] = actual_size + out_chunk = torch.empty(out_shape, dtype=input.dtype, device=input.device) - kernel = _cached_make(ntops.kernels.chunk.premake, input.ndim) - kernel(chunk_tensor, out_chunk) + kernel = _cached_make( + ntops.kernels.chunk.premake, input.ndim, dim, start, actual_size + ) + kernel(input, out_chunk) outputs.append(out_chunk) From f9c62c457d7a9144e6937c70b4b62ec1340fa6e5 Mon Sep 17 00:00:00 2001 From: acoier <851563813@qq.com> Date: Thu, 18 Jun 2026 07:56:14 +0000 Subject: [PATCH 4/4] add Honor_code.md Signed-off-by: acoier <851563813@qq.com> --- HONOR_CODE.md | 71 ++++++++++++++++++++++++++++++++++++ src/ntops/kernels/chunk.py | 39 ++++++-------------- src/ntops/kernels/flatten.py | 4 +- src/ntops/kernels/unbind.py | 11 ++++-- src/ntops/torch/chunk.py | 61 +++++++++++++++++++++++-------- src/ntops/torch/unbind.py | 56 ++++++++++++++++++---------- 6 files changed, 174 insertions(+), 68 deletions(-) create mode 100644 HONOR_CODE.md diff --git a/HONOR_CODE.md b/HONOR_CODE.md new file mode 100644 index 0000000..5d22ad7 --- /dev/null +++ b/HONOR_CODE.md @@ -0,0 +1,71 @@ +# 2026 春季启元人工智能大赛诚信守则(Honor Code) + + +本人作为 2026 春季启元人工智能大赛(以下简称“比赛”)的参赛选手,郑重承诺严格遵守比赛规则及本诚信守则,秉持诚信、公正、廉洁的参赛原则,自觉维护比赛的公平性与严肃性。本人充分理解并认可,违反本准则将导致参赛资格被取消、比赛成绩作废等相应后果,且愿意承担由此产生的一切责任。 + +## 一、参赛诚信承诺 + +1. 本人保证所提交的赛题PR(Pull Request)中包含的算子实现代码及相关文档,均为本人(及参赛团队,如为团队参赛)在比赛期间独立完成或在明确标注参考来源的基础上进行开发,不存在任何欺诈、抄袭、作弊行为。 + +2. 本人承诺主动、全面、真实地披露赛题实现过程中所有参考的外部资源,尤其是开源代码资源,不隐瞒任何可能影响比赛公平性的信息。 + +3. 本人保证不采用任何不正当手段获取比赛优势,包括但不限于窃取其他参赛选手的代码成果、利用非比赛允许的工具或技术、与他人串通作弊等。 + +## 二、参考资源说明 + +本人确认已按比赛要求,将本次赛题实现过程中涉及的参考资源信息单独撰写至`REFERENCE.md`文件中,该文件将与本诚信守则一同作为PR附件提交。`REFERENCE.md`需根据实际参考情况,按以下要求完整填写,信息不完整或虚假填写将视为违反本准则: + +**情况1:无参考外部开源代码及核心实现思路** + +`REFERENCE.md`中需明确声明:“本次赛题提交的算子代码、核心算法逻辑及实现方案均为本人(及参赛团队)独立设计与开发,未参考任何外部开源项目、技术文档中的核心代码片段或实现思路,未接受任何第三方的技术指导或代码支持。” + +**情况2:有参考外部开源代码及相关资源** + +对每个参考资源提供以下信息陈述: +1. 参考开源项目/资源名称 + +2. 参考资源链接(GitHub/Gitee/论文/技术文档等) + +3. 参考的具体内容(请明确说明参考的代码片段、算法逻辑、实现思路等,需标注对应资源的具体位置,如文件路径、代码行数等) + +4. 本人对参考内容的修改与优化说明:(请详细说明在参考基础上,本人所做的独立开发、修改、优化工作,体现自身技术贡献) + +5. 若是开源项目,提供参考资源的开源协议类型:(如MIT、Apache 2.0、GPL等) + +6. 其他需要补充说明的信息 + + +## 三、禁止行为确认 + +本人明确知晓并承诺避免以下违反比赛公平性的行为,若存在以下任一情况,自愿接受比赛组委会的相应处罚: + +1. 未经授权复制、抄袭他人(包括其他参赛选手、开源项目、商业代码)的代码、算法或技术方案,且未进行明确标注; + +2. 隐瞒或虚假披露参考资源信息,包括遗漏重要参考来源、伪造参考内容说明等; + +3. 与其他参赛选手或第三方串通,进行代码共享、成果交换等违规协作; + +4. 利用比赛平台漏洞、技术缺陷或非比赛允许的工具获取不正当利益; + +5. 伪造比赛相关证明材料、提交虚假信息; + +6. 其他违反比赛规则及公序良俗的不诚信行为。 + + +## 四、责任与确认 + +1. 本人充分理解,比赛组委会将对所有提交的PR进行代码溯源、参考信息核查等公平性审查,若发现本人存在违反本准则的行为,有权随时取消本人的参赛资格、作废比赛成绩,情节严重的将在比赛相关平台进行公示。 + +2. 若因本人违反本准则导致比赛争议或第三方权益受损(如开源协议侵权等),本人将独立承担全部法律责任及相关损失,与比赛组委会无关。 + +3. 本人确认已仔细阅读并完全理解本诚信守则的全部内容,自愿签署本准则,接受比赛组委会的监督与审查。 + +## 五、签署信息 + +参赛选手姓名(团队参赛需填写所有成员姓名) + + 李浩坤 + +签署日期 + +___2026___年__6__月__17__日 \ No newline at end of file diff --git a/src/ntops/kernels/chunk.py b/src/ntops/kernels/chunk.py index c24e77c..a2c4744 100644 --- a/src/ntops/kernels/chunk.py +++ b/src/ntops/kernels/chunk.py @@ -1,41 +1,26 @@ import functools -import ninetoothed -import ninetoothed.language as ntl from ninetoothed import Tensor +from ntops.kernels.element_wise import arrangement -def arrangement(input, output, dim, chunk_start, chunk_size, block_size=None): - if block_size is None: - block_size = ninetoothed.block_size() - ndim = input.ndim - - slices = tuple( - slice(chunk_start, chunk_start + chunk_size) if d == dim else slice(None) - for d in range(ndim) - ) - input_chunk = input[slices] - - input_arranged = input_chunk.flatten().tile((block_size,)) - output_arranged = output.flatten().tile((block_size,)) - - return input_arranged, output_arranged +# The slicing is done by the torch wrapper (input.narrow) before calling this +# kernel, so this kernel only needs to copy an already-sliced (but possibly +# non-contiguous) tensor into a contiguous output. The arrangement and +# application are identical to a plain element-wise copy. +# +# Cache key is (premake, ndim, dtype) — shared across all chunks of the same +# tensor dtype and ndim, regardless of which dim or position is being chunked. +# Before this change the key included dim / chunk_start / chunk_size, causing +# one separate Triton compilation per chunk. def application(input, output): output = input # noqa: F841 -def premake(ndim, dim, chunk_start, chunk_size, dtype=None, block_size=None): - arrangement_ = functools.partial( - arrangement, - dim=dim, - chunk_start=chunk_start, - chunk_size=chunk_size, - block_size=block_size, - ) - +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) - return arrangement_, application, tensors diff --git a/src/ntops/kernels/flatten.py b/src/ntops/kernels/flatten.py index 1b7850a..f6b82fa 100644 --- a/src/ntops/kernels/flatten.py +++ b/src/ntops/kernels/flatten.py @@ -7,9 +7,7 @@ def application(input, output): - one = ntl.cast(1, ntl.float32) - zero = ntl.cast(0, ntl.float32) - output = ntl.where(input >= zero, input, input) # noqa: F841 + output = ntl.where(input >= 0, input, input) # noqa: F841 def premake(ndim, dtype=None, block_size=None): diff --git a/src/ntops/kernels/unbind.py b/src/ntops/kernels/unbind.py index 1590183..cc5d07d 100644 --- a/src/ntops/kernels/unbind.py +++ b/src/ntops/kernels/unbind.py @@ -1,18 +1,21 @@ import functools -import ninetoothed.language as ntl from ninetoothed import Tensor from ntops.kernels.element_wise import arrangement +# Plain element-wise copy: input → output. +# The torch wrapper calls this with `moved = input.movedim(dim, 0)` as the +# source and a fresh contiguous tensor as the destination. By the time this +# kernel runs, the "unbind axis" has already been moved to dim-0 via a +# zero-cost view, so a single kernel invocation copies all slices in parallel +# instead of launching one kernel per slice. def application(input, output): - output = input * ntl.cast(1, ntl.float32) # noqa: F841 + output = input # noqa: F841 def premake(ndim, dtype=None, block_size=None): arrangement_ = functools.partial(arrangement, block_size=block_size) - tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) - return arrangement_, application, tensors diff --git a/src/ntops/torch/chunk.py b/src/ntops/torch/chunk.py index ac89449..a0df7a0 100644 --- a/src/ntops/torch/chunk.py +++ b/src/ntops/torch/chunk.py @@ -1,34 +1,65 @@ import torch +import ninetoothed import ntops from ntops.torch.utils import _cached_make +_DTYPE_MAP = { + torch.float16: ninetoothed.float16, + torch.bfloat16: ninetoothed.bfloat16, + torch.float32: ninetoothed.float32, + torch.float64: ninetoothed.float64, + torch.int8: ninetoothed.int8, + torch.int16: ninetoothed.int16, + torch.int32: ninetoothed.int32, + torch.int64: ninetoothed.int64, +} + def chunk(input, chunks, dim=0): if dim < 0: dim = input.ndim + dim - chunk_size = (input.shape[dim] + chunks - 1) // chunks + dim_size = input.shape[dim] + chunk_size = (dim_size + chunks - 1) // chunks - outputs = [] + # Fast path: contiguous input — every narrow() along any dim produces a + # contiguous view when the tensor is contiguous (dim=0) or when the sliced + # dim is the leading dimension of a contiguous tensor. For the most common + # case (dim=0, contiguous input) all slices are contiguous, so we can + # return views directly with zero kernel launches. + if input.is_contiguous() and dim == 0: + return tuple( + input.narrow(0, i * chunk_size, min(chunk_size, dim_size - i * chunk_size)) + for i in range(chunks) + if i * chunk_size < dim_size + ) + # General path: slice in Python then decide per-chunk whether a kernel + # copy is needed. All chunks share one compiled kernel (cache key is + # (premake, ndim, dtype) only — dim/start/size are no longer part of it). + kernel = _cached_make( + ntops.kernels.chunk.premake, + input.ndim, + dtype=_DTYPE_MAP.get(input.dtype), + ) + + outputs = [] for i in range(chunks): start = i * chunk_size - - if start >= input.shape[dim]: + if start >= dim_size: break - actual_size = min(chunk_size, input.shape[dim] - start) - - out_shape = list(input.shape) - out_shape[dim] = actual_size - out_chunk = torch.empty(out_shape, dtype=input.dtype, device=input.device) - - kernel = _cached_make( - ntops.kernels.chunk.premake, input.ndim, dim, start, actual_size - ) - kernel(input, out_chunk) + actual_size = min(chunk_size, dim_size - start) + chunk_view = input.narrow(dim, start, actual_size) - outputs.append(out_chunk) + if chunk_view.is_contiguous(): + outputs.append(chunk_view) + else: + out_chunk = torch.empty( + chunk_view.shape, dtype=input.dtype, device=input.device + ) + kernel(chunk_view, out_chunk) + outputs.append(out_chunk) return tuple(outputs) diff --git a/src/ntops/torch/unbind.py b/src/ntops/torch/unbind.py index 86972aa..dc54044 100644 --- a/src/ntops/torch/unbind.py +++ b/src/ntops/torch/unbind.py @@ -1,29 +1,47 @@ import torch +import ninetoothed import ntops from ntops.torch.utils import _cached_make +_DTYPE_MAP = { + torch.float16: ninetoothed.float16, + torch.bfloat16: ninetoothed.bfloat16, + torch.float32: ninetoothed.float32, + torch.float64: ninetoothed.float64, + torch.int8: ninetoothed.int8, + torch.int16: ninetoothed.int16, + torch.int32: ninetoothed.int32, + torch.int64: ninetoothed.int64, +} + def unbind(input, dim=0): if dim < 0: dim = input.ndim + dim - outputs = [] - - for i in range(input.shape[dim]): - slices = [slice(None)] * input.ndim - slices[dim] = slice(i, i + 1) - - slice_tensor = input[tuple(slices)].squeeze(dim) - out_slice = torch.empty_like(slice_tensor) - - if slice_tensor.ndim == 0: - # 0D tensors can't be processed by the kernel. - out_slice = slice_tensor.clone() - else: - kernel = _cached_make(ntops.kernels.unbind.premake, slice_tensor.ndim) - kernel(slice_tensor, out_slice) - - outputs.append(out_slice) - - return tuple(outputs) + # movedim is a zero-cost view: (d0,..,dim,..,dk) → (dim_size, d0,..,dk). + # After this, every "slice" is simply moved[i], and the copy problem + # reduces to a single contiguous-output kernel regardless of which dim + # was originally requested. + moved = input.movedim(dim, 0) + n_slices = moved.shape[0] + + # Fast path: moved is already contiguous (happens when dim=0 and input is + # contiguous). Return views directly — zero kernel launches. + if moved.is_contiguous(): + return tuple(moved[i] for i in range(n_slices)) + + # General path: ONE kernel launch copies the entire non-contiguous `moved` + # into a contiguous output buffer. Previously this was n_slices separate + # launches (one per slice), each suffering its own launch overhead. + output = torch.empty_like(moved, memory_format=torch.contiguous_format) + kernel = _cached_make( + ntops.kernels.unbind.premake, + moved.ndim, + dtype=_DTYPE_MAP.get(input.dtype), + ) + kernel(moved, output) + + # output[i] is a contiguous view into the output buffer. + return tuple(output[i] for i in range(n_slices))