diff --git a/HONOR_CODE.md b/HONOR_CODE.md new file mode 100644 index 0000000..5d22ad7 --- /dev/null +++ b/HONOR_CODE.md @@ -0,0 +1,71 @@ +# 2026 春季启元人工智能大赛诚信守则(Honor Code) + + +本人作为 2026 春季启元人工智能大赛(以下简称“比赛”)的参赛选手,郑重承诺严格遵守比赛规则及本诚信守则,秉持诚信、公正、廉洁的参赛原则,自觉维护比赛的公平性与严肃性。本人充分理解并认可,违反本准则将导致参赛资格被取消、比赛成绩作废等相应后果,且愿意承担由此产生的一切责任。 + +## 一、参赛诚信承诺 + +1. 本人保证所提交的赛题PR(Pull Request)中包含的算子实现代码及相关文档,均为本人(及参赛团队,如为团队参赛)在比赛期间独立完成或在明确标注参考来源的基础上进行开发,不存在任何欺诈、抄袭、作弊行为。 + +2. 本人承诺主动、全面、真实地披露赛题实现过程中所有参考的外部资源,尤其是开源代码资源,不隐瞒任何可能影响比赛公平性的信息。 + +3. 本人保证不采用任何不正当手段获取比赛优势,包括但不限于窃取其他参赛选手的代码成果、利用非比赛允许的工具或技术、与他人串通作弊等。 + +## 二、参考资源说明 + +本人确认已按比赛要求,将本次赛题实现过程中涉及的参考资源信息单独撰写至`REFERENCE.md`文件中,该文件将与本诚信守则一同作为PR附件提交。`REFERENCE.md`需根据实际参考情况,按以下要求完整填写,信息不完整或虚假填写将视为违反本准则: + +**情况1:无参考外部开源代码及核心实现思路** + +`REFERENCE.md`中需明确声明:“本次赛题提交的算子代码、核心算法逻辑及实现方案均为本人(及参赛团队)独立设计与开发,未参考任何外部开源项目、技术文档中的核心代码片段或实现思路,未接受任何第三方的技术指导或代码支持。” + +**情况2:有参考外部开源代码及相关资源** + +对每个参考资源提供以下信息陈述: +1. 参考开源项目/资源名称 + +2. 参考资源链接(GitHub/Gitee/论文/技术文档等) + +3. 参考的具体内容(请明确说明参考的代码片段、算法逻辑、实现思路等,需标注对应资源的具体位置,如文件路径、代码行数等) + +4. 本人对参考内容的修改与优化说明:(请详细说明在参考基础上,本人所做的独立开发、修改、优化工作,体现自身技术贡献) + +5. 若是开源项目,提供参考资源的开源协议类型:(如MIT、Apache 2.0、GPL等) + +6. 其他需要补充说明的信息 + + +## 三、禁止行为确认 + +本人明确知晓并承诺避免以下违反比赛公平性的行为,若存在以下任一情况,自愿接受比赛组委会的相应处罚: + +1. 未经授权复制、抄袭他人(包括其他参赛选手、开源项目、商业代码)的代码、算法或技术方案,且未进行明确标注; + +2. 隐瞒或虚假披露参考资源信息,包括遗漏重要参考来源、伪造参考内容说明等; + +3. 与其他参赛选手或第三方串通,进行代码共享、成果交换等违规协作; + +4. 利用比赛平台漏洞、技术缺陷或非比赛允许的工具获取不正当利益; + +5. 伪造比赛相关证明材料、提交虚假信息; + +6. 其他违反比赛规则及公序良俗的不诚信行为。 + + +## 四、责任与确认 + +1. 本人充分理解,比赛组委会将对所有提交的PR进行代码溯源、参考信息核查等公平性审查,若发现本人存在违反本准则的行为,有权随时取消本人的参赛资格、作废比赛成绩,情节严重的将在比赛相关平台进行公示。 + +2. 若因本人违反本准则导致比赛争议或第三方权益受损(如开源协议侵权等),本人将独立承担全部法律责任及相关损失,与比赛组委会无关。 + +3. 
本人确认已仔细阅读并完全理解本诚信守则的全部内容,自愿签署本准则,接受比赛组委会的监督与审查。 + +## 五、签署信息 + +参赛选手姓名(团队参赛需填写所有成员姓名) + + 李浩坤 + +签署日期 + +___2026___年__6__月__17__日 \ No newline at end of file diff --git a/src/ntops/kernels/__init__.py b/src/ntops/kernels/__init__.py index f6934ef..8d0c187 100644 --- a/src/ntops/kernels/__init__.py +++ b/src/ntops/kernels/__init__.py @@ -9,6 +9,7 @@ bmm, clamp, conv2d, + copysign, cos, div, dropout, @@ -20,14 +21,18 @@ isinf, isnan, layer_norm, + lcm, le, + lgamma, lt, max_pool2d, mm, mul, ne, neg, + nextafter, pow, + rad2deg, relu, rms_norm, rotary_position_embedding, @@ -52,6 +57,7 @@ "bmm", "clamp", "conv2d", + "copysign", "cos", "div", "dropout", @@ -63,14 +69,18 @@ "isinf", "isnan", "layer_norm", + "lcm", "le", + "lgamma", "lt", "max_pool2d", "mm", "mul", "ne", "neg", + "nextafter", "pow", + "rad2deg", "relu", "rms_norm", "rotary_position_embedding", diff --git a/src/ntops/kernels/copysign.py b/src/ntops/kernels/copysign.py new file mode 100644 index 0000000..15e7cf4 --- /dev/null +++ b/src/ntops/kernels/copysign.py @@ -0,0 +1,60 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +# copysign(input, other) = magnitude of input + sign of other. +# +# Magnitude: ntl.abs(input) handles all float types directly. +# Sign detection: cast other to a same-width signed integer and check < 0. +# - Signed int comparison checks the MSB, which is exactly the IEEE 754 +# sign bit for all standard float widths (16 / 32 / 64 bit). +# - This correctly identifies -0.0 as negative (0x8000... as signed int +# is INT_MIN, which is < 0), unlike the float comparison `other < 0` +# which treats -0.0 as 0. 
def premake(ndim, dtype=None, block_size=None):
    """Return the arrangement, application and tensor specs for ``copysign``.

    The application is picked by float width because the sign test bitcasts
    ``other`` to a signed integer of the same size: the two 16-bit float
    dtypes share ``application_f16``, ``float32`` uses ``application_f32``,
    and any other ``dtype`` (including ``None``) falls through to
    ``application_f64``.
    """
    half_width_dtypes = (ninetoothed.float16, ninetoothed.bfloat16)

    if dtype in half_width_dtypes:
        kernel_body = application_f16
    elif dtype == ninetoothed.float32:
        kernel_body = application_f32
    else:
        kernel_body = application_f64

    # `input`, `other` and `output` all share the same rank and dtype.
    tensor_specs = tuple(Tensor(ndim, dtype=dtype) for _ in range(3))

    return (
        functools.partial(arrangement, block_size=block_size),
        kernel_body,
        tensor_specs,
    )
# Element-wise lcm for int8 inputs.
#
# Both operands are widened to int32 before any arithmetic, so the
# intermediate product cannot wrap inside the kernel; only the final cast
# back to `output.dtype` narrows the value.
#
# The GCD is computed with a fixed number of Euclidean steps (no
# data-dependent loop exit). Twelve steps exceed the worst case for 8-bit
# magnitudes: consecutive Fibonacci numbers 89 and 55 need nine steps.
def application_i8(input, other, output):
    w = ntl.int32
    abs_a = ntl.abs(ntl.cast(input, w))
    abs_b = ntl.abs(ntl.cast(other, w))
    # Zero only when both magnitudes are zero; used to force lcm(0, 0) == 0.
    or_ab = abs_a | abs_b
    # Order the pair so that a >= b before starting the Euclidean steps.
    a = ntl.where(abs_a >= abs_b, abs_a, abs_b)
    b = ntl.where(abs_a >= abs_b, abs_b, abs_a)
    zero = ntl.cast(0, w)
    for _ in range(12):
        # Once b has reached 0 the lane is finished: divide by 1 instead so
        # the remainder stays 0 and `a` keeps holding the GCD.
        b_safe = ntl.where(b != 0, b, zero + 1)
        r = a % b_safe
        a = ntl.where(b != 0, b, a)
        b = r
    # a == 0 only when both inputs were 0; substitute 1 to keep the division
    # below well defined (the result is overridden with 0 in that case).
    gcd = ntl.where(a == 0, zero + 1, a)
    # Divide before multiplying to keep the intermediate value small.
    output = ntl.cast(  # noqa: F841
        ntl.where(or_ab == 0, zero, (abs_a // gcd) * abs_b), output.dtype
    )
def premake(ndim, dtype=None, block_size=None):
    """Return the arrangement, application and tensor specs for ``lcm``.

    The application is selected by integer width; any ``dtype`` that is not
    ``int64``, ``int32`` or ``int16`` (including ``None``) uses the int8
    variant.
    """
    # Checked in order; the first dtype that compares equal wins.
    candidates = (
        (ninetoothed.int64, application_i64),
        (ninetoothed.int32, application_i32),
        (ninetoothed.int16, application_i16),
    )
    kernel_body = next(
        (body for candidate, body in candidates if dtype == candidate),
        application_i8,
    )

    # `input`, `other` and `output` all share the same rank and dtype.
    tensor_specs = tuple(Tensor(ndim, dtype=dtype) for _ in range(3))

    return (
        functools.partial(arrangement, block_size=block_size),
        kernel_body,
        tensor_specs,
    )
def premake(ndim, dtype=None, block_size=None):
    """Return the arrangement, application and tensor specs for ``lgamma``.

    ``float16`` and ``bfloat16`` go through ``application_narrow``, which
    promotes to float32 around the libdevice call; every other ``dtype``
    (including ``None``) uses ``application_wide``.
    """
    needs_promotion = dtype in (ninetoothed.float16, ninetoothed.bfloat16)
    kernel_body = application_narrow if needs_promotion else application_wide

    # One spec for `input`, one for `output`; same rank and dtype.
    tensor_specs = tuple(Tensor(ndim, dtype=dtype) for _ in range(2))

    return (
        functools.partial(arrangement, block_size=block_size),
        kernel_body,
        tensor_specs,
    )
# nextafter for 16-bit floats, implemented on the raw bit patterns viewed as
# int16 (selected by `premake` for both float16 and bfloat16).
#
# Observation from bit patterns (int16 = bitcast of the 16-bit float):
#   positive floats: int16 order == float order   (+3.0 -> 16896 > +1.0 -> 15360)
#   negative floats: int16 order is REVERSED      (-3.0 -> -15872, -2.998 -> -15873)
# The example values above are float16 encodings.
#
# So to advance one ULP toward `other`:
#   a > 0, a < b  ->  a_i + 1   (moving up in both orderings)
#   a > 0, a > b  ->  a_i - 1
#   a < 0, a < b  ->  a_i - 1   (moving up in float but down in int16)
#   a < 0, a > b  ->  a_i + 1
#
# Special cases (checked before the general step, in this priority order):
#   NaN  : return the bits of whichever input is NaN (`input` takes priority)
#   equal: return `other`'s bit pattern (covers +0 == -0)
#   zero : return the smallest subnormal carrying the sign bit of `other`
def application_f16(input, other, output):
    i16 = ntl.int16
    one = ntl.cast(1, i16)
    zero_i = ntl.cast(0, i16)

    # Reinterpret the float bits as int16 without changing them.
    a_i = ntl.cast(input, i16, bitcast=True)
    b_i = ntl.cast(other, i16, bitcast=True)

    # NaN is the only value that compares unequal to itself. Keep the bits of
    # the offending operand so the result is a NaN as well.
    a_nan = input != input
    b_nan = other != other
    nan_bits = ntl.where(a_nan, a_i, b_i)

    # Leaving zero: the result is the smallest subnormal. Bit pattern 1 is
    # +min_subnormal; -32767 is 0x8001, i.e. the same magnitude with the sign
    # bit set. A negative int16 view of `other` means its sign bit is set.
    from_zero = ntl.where(b_i < zero_i, ntl.cast(-32767, i16), one)

    # General case: a non-negative int16 view means a positive float (integer
    # order matches float order); a negative view means a negative float
    # (integer order is reversed), so the step sign is flipped.
    # When `going_up` is false here, `input > other`: the equal and NaN cases
    # are resolved before `step` is used.
    going_up = input < other
    a_i_positive = a_i >= zero_i
    step = ntl.where(
        a_i_positive,
        ntl.where(going_up, one, -one),
        ntl.where(going_up, -one, one),
    )

    # Priority: NaN, then equality, then stepping away from zero, then the
    # general one-ULP step.
    # NOTE(review): the zero test compares against a float16 zero even when
    # this application runs on bfloat16 data; presumably this relies on
    # implicit promotion in the comparison - confirm.
    result_i = ntl.where(
        a_nan | b_nan,
        nan_bits,
        ntl.where(
            input == other,
            b_i,
            ntl.where(input == ntl.cast(0, ntl.float16), from_zero, a_i + step),
        ),
    )
    # Reinterpret the int16 result as the output float type.
    output = ntl.cast(result_i, output.dtype, bitcast=True)  # noqa: F841
+++ b/src/ntops/torch/__init__.py @@ -8,6 +8,7 @@ from ntops.torch.bmm import bmm from ntops.torch.clamp import clamp from ntops.torch.conv2d import conv2d +from ntops.torch.copysign import copysign from ntops.torch.cos import cos from ntops.torch.div import div from ntops.torch.dropout import dropout @@ -19,7 +20,9 @@ from ntops.torch.isinf import isinf from ntops.torch.isnan import isnan from ntops.torch.layer_norm import layer_norm +from ntops.torch.lcm import lcm from ntops.torch.le import le +from ntops.torch.lgamma import lgamma from ntops.torch.lt import lt from ntops.torch.matmul import matmul from ntops.torch.max_pool2d import max_pool2d @@ -27,7 +30,9 @@ from ntops.torch.mul import mul from ntops.torch.ne import ne from ntops.torch.neg import neg +from ntops.torch.nextafter import nextafter from ntops.torch.pow import pow +from ntops.torch.rad2deg import rad2deg from ntops.torch.relu import relu from ntops.torch.rms_norm import rms_norm from ntops.torch.rotary_position_embedding import rotary_position_embedding @@ -51,6 +56,7 @@ "bmm", "clamp", "conv2d", + "copysign", "cos", "div", "dropout", @@ -62,7 +68,9 @@ "isinf", "isnan", "layer_norm", + "lcm", "le", + "lgamma", "lt", "matmul", "max_pool2d", @@ -70,7 +78,9 @@ "mul", "ne", "neg", + "nextafter", "pow", + "rad2deg", "relu", "rms_norm", "rotary_position_embedding", diff --git a/src/ntops/torch/copysign.py b/src/ntops/torch/copysign.py new file mode 100644 index 0000000..2c0b427 --- /dev/null +++ b/src/ntops/torch/copysign.py @@ -0,0 +1,27 @@ +import torch + +import ninetoothed +import ntops +from ntops.torch.utils import _cached_make + +_DTYPE_MAP = { + torch.float16: ninetoothed.float16, + torch.bfloat16: ninetoothed.bfloat16, + torch.float32: ninetoothed.float32, + torch.float64: ninetoothed.float64, +} + + +def copysign(input, other, *, out=None): + if out is None: + out = torch.empty_like(input) + + kernel = _cached_make( + ntops.kernels.copysign.premake, + input.ndim, + 
def lgamma(input, *, out=None):
    """Compute the element-wise log-gamma of ``input`` on the ninetoothed kernel.

    Args:
        input: Source tensor. Floating-point tensors are passed to the kernel
            as-is. Tensors whose dtype is in ``_INT_TYPES`` are converted to
            the current default floating dtype first.
        out: Optional destination tensor. When omitted, a new tensor shaped
            like ``input`` is allocated in the dtype the kernel runs in.

    Returns:
        ``out`` (or the freshly allocated tensor) holding the result.
    """
    if input.dtype in _INT_TYPES:
        # torch promotes integer inputs of floating-point unary ops to
        # `torch.get_default_dtype()`, which is float32 only by default.
        # Converting once here also lets every integer dtype share the
        # kernel already compiled for that floating dtype.
        input = input.to(torch.get_default_dtype())

    if out is None:
        out = torch.empty_like(input)

    kernel = _cached_make(
        ntops.kernels.lgamma.premake,
        input.ndim,
        dtype=_DTYPE_MAP.get(input.dtype),
    )
    kernel(input, out)

    return out
@skip_if_cuda_not_available
@pytest.mark.parametrize(*generate_arguments())
def test_copysign(shape, dtype, device, rtol, atol):
    """Compare ``ntops.torch.copysign`` with ``torch.copysign``, signed zeros included."""
    input = torch.randn(shape, dtype=dtype, device=device)
    other = torch.randn(shape, dtype=dtype, device=device)

    # `randn` practically never yields a zero, so plant -0.0 and +0.0 into
    # roughly half of `other`. Taking the sign from -0.0 is exactly the case
    # a plain `other < 0` comparison would get wrong.
    selector = torch.rand(shape, device=device)
    positive_zero = torch.zeros_like(other)
    other = torch.where(selector < 0.25, -positive_zero, other)
    other = torch.where(selector > 0.75, positive_zero, other)

    ninetoothed_output = ntops.torch.copysign(input, other)
    reference_output = torch.copysign(input, other)

    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
    # `allclose` cannot tell -0.0 from +0.0, so compare the sign bits as well.
    assert torch.equal(
        torch.signbit(ninetoothed_output), torch.signbit(reference_output)
    )
@skip_if_cuda_not_available
@pytest.mark.parametrize(*generate_arguments(False))
def test_lgamma_int(shape, dtype, device, rtol, atol):
    """Compare ``ntops.torch.lgamma`` with ``torch.lgamma`` on integer tensors.

    The parametrized ``rtol`` / ``atol`` are not used; the comparison runs
    with a fixed tolerance of ``1e-4``.
    """
    tolerance = 1e-4

    # Integers drawn from [1, 20); torch.lgamma returns a floating result.
    samples = torch.randint(1, 20, size=shape, dtype=dtype, device=device)

    expected = torch.lgamma(samples)
    actual = ntops.torch.lgamma(samples)

    assert torch.allclose(actual, expected, rtol=tolerance, atol=tolerance)
@skip_if_cuda_not_available
@pytest.mark.parametrize(*generate_arguments())
def test_nextafter_float(shape, dtype, device, rtol, atol):
    """Check that ``ntops.torch.nextafter`` matches ``torch.nextafter`` exactly.

    ``rtol`` and ``atol`` are supplied by the shared parametrization but are
    deliberately unused here.
    """
    input = torch.randn(shape, dtype=dtype, device=device)
    other = torch.randn(shape, dtype=dtype, device=device)

    ninetoothed_output = ntops.torch.nextafter(input, other)
    reference_output = torch.nextafter(input, other)

    # nextafter moves a value by a single ULP, far below any rtol/atol, so a
    # tolerance-based comparison would also accept `input` returned unchanged.
    # Inputs are finite, which makes an exact comparison well defined.
    assert torch.equal(ninetoothed_output, reference_output)
+ if dtype is torch.float16: + return + input = torch.randn(shape, dtype=dtype, device=device) + + ninetoothed_output = ntops.torch.rad2deg(input) + reference_output = torch.rad2deg(input) + + assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol) diff --git a/tests/utils.py b/tests/utils.py index ac1949f..9f592ea 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -9,7 +9,7 @@ def generate_arguments(use_float=True): if use_float: dtype_arr = (torch.float32, torch.float16) else: - dtype_arr = (torch.bool, torch.int8, torch.int16, torch.int32) + dtype_arr = (torch.int8, torch.int16, torch.int32, torch.int64) for ndim in range(1, 5): for dtype in dtype_arr: