diff --git a/HONOR_CODE.md b/HONOR_CODE.md
new file mode 100644
index 0000000..5d22ad7
--- /dev/null
+++ b/HONOR_CODE.md
@@ -0,0 +1,71 @@
+# 2026 春季启元人工智能大赛诚信守则（Honor Code）
+
+
+本人作为 2026 春季启元人工智能大赛（以下简称“比赛”）的参赛选手，郑重承诺严格遵守比赛规则及本诚信守则，秉持诚信、公正、廉洁的参赛原则，自觉维护比赛的公平性与严肃性。本人充分理解并认可，违反本守则将导致参赛资格被取消、比赛成绩作废等相应后果，且愿意承担由此产生的一切责任。
+
+## 一、参赛诚信承诺
+
+1. 本人保证所提交的赛题PR（Pull Request）中包含的算子实现代码及相关文档，均为本人（及参赛团队，如为团队参赛）在比赛期间独立完成或在明确标注参考来源的基础上进行开发，不存在任何欺诈、抄袭、作弊行为。
+
+2. 本人承诺主动、全面、真实地披露赛题实现过程中所有参考的外部资源，尤其是开源代码资源，不隐瞒任何可能影响比赛公平性的信息。
+
+3. 本人保证不采用任何不正当手段获取比赛优势，包括但不限于窃取其他参赛选手的代码成果、利用非比赛允许的工具或技术、与他人串通作弊等。
+
+## 二、参考资源说明
+
+本人确认已按比赛要求，将本次赛题实现过程中涉及的参考资源信息单独撰写至`REFERENCE.md`文件中，该文件将与本诚信守则一同作为PR附件提交。`REFERENCE.md`需根据实际参考情况，按以下要求完整填写，信息不完整或虚假填写将视为违反本守则：
+
+**情况1：无参考外部开源代码及核心实现思路**
+
+`REFERENCE.md`中需明确声明：“本次赛题提交的算子代码、核心算法逻辑及实现方案均为本人（及参赛团队）独立设计与开发，未参考任何外部开源项目、技术文档中的核心代码片段或实现思路，未接受任何第三方的技术指导或代码支持。”
+
+**情况2：有参考外部开源代码及相关资源**
+
+对每个参考资源，请逐项提供以下信息：
+1. 参考开源项目/资源名称
+
+2. 参考资源链接（GitHub/Gitee/论文/技术文档等）
+
+3. 参考的具体内容（请明确说明参考的代码片段、算法逻辑、实现思路等，需标注对应资源的具体位置，如文件路径、代码行数等）
+    
+4. 本人对参考内容的修改与优化说明：（请详细说明在参考基础上，本人所做的独立开发、修改、优化工作，体现自身技术贡献）
+    
+5. 若是开源项目，提供参考资源的开源协议类型：（如MIT、Apache 2.0、GPL等）
+    
+6. 其他需要补充说明的信息
+    
+
+## 三、禁止行为确认
+
+本人明确知晓并承诺避免以下违反比赛公平性的行为，若存在以下任一情况，自愿接受比赛组委会的相应处罚：
+
+1. 未经授权复制、抄袭他人（包括其他参赛选手、开源项目、商业代码）的代码、算法或技术方案，且未进行明确标注；
+    
+2. 隐瞒或虚假披露参考资源信息，包括遗漏重要参考来源、伪造参考内容说明等；
+    
+3. 与其他参赛选手或第三方串通，进行代码共享、成果交换等违规协作；
+    
+4. 利用比赛平台漏洞、技术缺陷或非比赛允许的工具获取不正当利益；
+    
+5. 伪造比赛相关证明材料、提交虚假信息；
+    
+6. 其他违反比赛规则及公序良俗的不诚信行为。
+    
+
+## 四、责任与确认
+
+1. 本人充分理解，比赛组委会将对所有提交的PR进行代码溯源、参考信息核查等公平性审查，若发现本人存在违反本守则的行为，有权随时取消本人的参赛资格、作废比赛成绩，情节严重的将在比赛相关平台进行公示。
+
+2. 若因本人违反本守则导致比赛争议或第三方权益受损（如开源协议侵权等），本人将独立承担全部法律责任及相关损失，与比赛组委会无关。
+
+3. 本人确认已仔细阅读并完全理解本诚信守则的全部内容，自愿签署本守则，接受比赛组委会的监督与审查。
+
+## 五、签署信息
+
+参赛选手姓名（团队参赛需填写所有成员姓名）
+
+    李浩坤
+
+签署日期
+
+___2026___年__6__月__17__日
\ No newline at end of file
diff --git a/src/ntops/kernels/__init__.py b/src/ntops/kernels/__init__.py
index f6934ef..8d0c187 100644
--- a/src/ntops/kernels/__init__.py
+++ b/src/ntops/kernels/__init__.py
@@ -9,6 +9,7 @@
     bmm,
     clamp,
     conv2d,
+    copysign,
     cos,
     div,
     dropout,
@@ -20,14 +21,18 @@
     isinf,
     isnan,
     layer_norm,
+    lcm,
     le,
+    lgamma,
     lt,
     max_pool2d,
     mm,
     mul,
     ne,
     neg,
+    nextafter,
     pow,
+    rad2deg,
     relu,
     rms_norm,
     rotary_position_embedding,
@@ -52,6 +57,7 @@
     "bmm",
     "clamp",
     "conv2d",
+    "copysign",
     "cos",
     "div",
     "dropout",
@@ -63,14 +69,18 @@
     "isinf",
     "isnan",
     "layer_norm",
+    "lcm",
     "le",
+    "lgamma",
     "lt",
     "max_pool2d",
     "mm",
     "mul",
     "ne",
     "neg",
+    "nextafter",
     "pow",
+    "rad2deg",
     "relu",
     "rms_norm",
     "rotary_position_embedding",
diff --git a/src/ntops/kernels/copysign.py b/src/ntops/kernels/copysign.py
new file mode 100644
index 0000000..15e7cf4
--- /dev/null
+++ b/src/ntops/kernels/copysign.py
@@ -0,0 +1,58 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+# copysign(input, other): the magnitude of `input` combined with the sign of
+# `other`.
+#
+# Magnitude: ntl.abs(input).
+# Sign: reinterpret the bits of `other` as a signed integer of the same width
+# and test `< 0`.
+#   - A signed integer is negative exactly when its most significant bit is
+#     set, and that bit is the IEEE 754 sign bit for 16 / 32 / 64-bit floats.
+#   - -0.0 is therefore detected as negative (its bit pattern is INT_MIN),
+#     whereas the float comparison `other < 0` would treat it as zero.
+#
+# There is one application per float width because the bitcast target must be
+# as wide as the source:
+#   float16 / bfloat16  (16-bit)  ->  int16
+#   float32             (32-bit)  ->  int32
+#   float64             (64-bit)  ->  int64
+
+
+def application_f16(input, other, output):
+    magnitude = ntl.abs(input)
+    sign_bit_set = ntl.cast(other, ntl.int16, bitcast=True) < 0
+    output = ntl.where(sign_bit_set, -magnitude, magnitude)  # noqa: F841
+
+
+def application_f32(input, other, output):
+    magnitude = ntl.abs(input)
+    sign_bit_set = ntl.cast(other, ntl.int32, bitcast=True) < 0
+    output = ntl.where(sign_bit_set, -magnitude, magnitude)  # noqa: F841
+
+
+def application_f64(input, other, output):
+    magnitude = ntl.abs(input)
+    sign_bit_set = ntl.cast(other, ntl.int64, bitcast=True) < 0
+    output = ntl.where(sign_bit_set, -magnitude, magnitude)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    # Dtypes other than the 16-bit and 32-bit floats (including the default
+    # `None`) select the 64-bit application.
+    if dtype == ninetoothed.float32:
+        application = application_f32
+    elif dtype in (ninetoothed.float16, ninetoothed.bfloat16):
+        application = application_f16
+    else:
+        application = application_f64
+
+    tensors = tuple(Tensor(ndim, dtype=dtype) for _ in range(3))
+
+    return functools.partial(arrangement, block_size=block_size), application, tensors
diff --git a/src/ntops/kernels/lcm.py b/src/ntops/kernels/lcm.py
new file mode 100644
index 0000000..6700ef9
--- /dev/null
+++ b/src/ntops/kernels/lcm.py
@@ -0,0 +1,137 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+# Element-wise lcm built on the Euclidean GCD: gcd(a, b) = gcd(b, a % b)
+# until b == 0.
+#
+# int8 / int16 / int32 inputs are computed in int32 and int64 inputs in
+# int64, each with its own fixed number of Euclidean steps. This avoids two
+# problems of a single native-width implementation:
+#   1. Computing in the native dtype overflows and the abs of the wrapped
+#      value is wrong, e.g. int8: 11*17=187 wraps to -69, ntl.abs gives 69
+#      while torch.lcm returns -69.
+#   2. 32 iterations are insufficient for int64, whose Fibonacci worst case
+#      needs ~91 steps; int8/int16/int32 each need fewer.
+#
+# Iteration count per dtype (Fibonacci adversarial bound, with a >= b):
+#   int8:  abs <= 128,     max ~10 steps  -> 12
+#   int16: abs <= 32768,   max ~23 steps  -> 24
+#   int32: abs <= 2^31,    max ~45 steps  -> 48
+#   int64: abs <= 2^63,    max ~92 steps  -> 96
+#
+# lcm = (|a| / gcd) * |b|, computed in the wide type and then cast to the
+# output dtype. The cast wraps on overflow, which is meant to match
+# torch.lcm for all dtypes.
+#
+# The four applications differ only in the compute type `w` and in the loop
+# count.
+
+
+def application_i8(input, other, output):
+    # int8 operands are widened to int32 for the whole computation.
+    w = ntl.int32
+    abs_a = ntl.abs(ntl.cast(input, w))
+    abs_b = ntl.abs(ntl.cast(other, w))
+    # Zero only when both operands are zero.
+    or_ab = abs_a | abs_b
+    # Order the pair so that a >= b before the Euclidean steps.
+    a = ntl.where(abs_a >= abs_b, abs_a, abs_b)
+    b = ntl.where(abs_a >= abs_b, abs_b, abs_a)
+    zero = ntl.cast(0, w)
+    for _ in range(12):
+        # Divide by 1 instead of 0; once b is 0, a and b no longer change.
+        b_safe = ntl.where(b != 0, b, zero + 1)
+        r = a % b_safe
+        a = ntl.where(b != 0, b, a)
+        b = r
+    # a is expected to hold the gcd here; map 0 to 1 so the division is defined.
+    gcd = ntl.where(a == 0, zero + 1, a)
+    output = ntl.cast(  # noqa: F841
+        ntl.where(or_ab == 0, zero, (abs_a // gcd) * abs_b), output.dtype
+    )
+
+
+def application_i16(input, other, output):
+    # int16 operands are widened to int32; 24 Euclidean steps.
+    w = ntl.int32
+    abs_a = ntl.abs(ntl.cast(input, w))
+    abs_b = ntl.abs(ntl.cast(other, w))
+    or_ab = abs_a | abs_b
+    a = ntl.where(abs_a >= abs_b, abs_a, abs_b)
+    b = ntl.where(abs_a >= abs_b, abs_b, abs_a)
+    zero = ntl.cast(0, w)
+    for _ in range(24):
+        b_safe = ntl.where(b != 0, b, zero + 1)
+        r = a % b_safe
+        a = ntl.where(b != 0, b, a)
+        b = r
+    gcd = ntl.where(a == 0, zero + 1, a)
+    output = ntl.cast(  # noqa: F841
+        ntl.where(or_ab == 0, zero, (abs_a // gcd) * abs_b), output.dtype
+    )
+
+
+def application_i32(input, other, output):
+    # int32 operands are computed in int32; 48 Euclidean steps.
+    w = ntl.int32
+    abs_a = ntl.abs(ntl.cast(input, w))
+    abs_b = ntl.abs(ntl.cast(other, w))
+    or_ab = abs_a | abs_b
+    a = ntl.where(abs_a >= abs_b, abs_a, abs_b)
+    b = ntl.where(abs_a >= abs_b, abs_b, abs_a)
+    zero = ntl.cast(0, w)
+    for _ in range(48):
+        b_safe = ntl.where(b != 0, b, zero + 1)
+        r = a % b_safe
+        a = ntl.where(b != 0, b, a)
+        b = r
+    gcd = ntl.where(a == 0, zero + 1, a)
+    output = ntl.cast(  # noqa: F841
+        ntl.where(or_ab == 0, zero, (abs_a // gcd) * abs_b), output.dtype
+    )
+
+
+def application_i64(input, other, output):
+    # int64 operands are computed in int64; 96 Euclidean steps.
+    w = ntl.int64
+    abs_a = ntl.abs(ntl.cast(input, w))
+    abs_b = ntl.abs(ntl.cast(other, w))
+    or_ab = abs_a | abs_b
+    a = ntl.where(abs_a >= abs_b, abs_a, abs_b)
+    b = ntl.where(abs_a >= abs_b, abs_b, abs_a)
+    zero = ntl.cast(0, w)
+    for _ in range(96):
+        b_safe = ntl.where(b != 0, b, zero + 1)
+        r = a % b_safe
+        a = ntl.where(b != 0, b, a)
+        b = r
+    gcd = ntl.where(a == 0, zero + 1, a)
+    output = ntl.cast(  # noqa: F841
+        ntl.where(or_ab == 0, zero, (abs_a // gcd) * abs_b), output.dtype
+    )
+
+
+def premake(ndim, dtype=None, block_size=None):
+    # Any dtype other than int64 / int32 / int16 (including the default
+    # `None`) selects the int8 application.
+    arrangement_ = functools.partial(arrangement, block_size=block_size)
+    if dtype == ninetoothed.int64:
+        application = application_i64
+    elif dtype == ninetoothed.int32:
+        application = application_i32
+    elif dtype == ninetoothed.int16:
+        application = application_i16
+    else:
+        application = application_i8
+    tensors = (
+        Tensor(ndim, dtype=dtype),
+        Tensor(ndim, dtype=dtype),
+        Tensor(ndim, dtype=dtype),
+    )
+    return arrangement_, application, tensors
diff --git a/src/ntops/kernels/lgamma.py b/src/ntops/kernels/lgamma.py
new file mode 100644
index 0000000..dd6bfa8
--- /dev/null
+++ b/src/ntops/kernels/lgamma.py
@@ -0,0 +1,36 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+from ninetoothed.language import libdevice
+
+from ntops.kernels.element_wise import arrangement
+
+
+# libdevice.lgamma accepts float32 and float64 only, so:
+#
+# - float32 / float64 call libdevice.lgamma directly (application_wide).
+# - float16 / bfloat16 are promoted to float32 for the call and the result
+#   is cast back down (application_narrow).
+# - Integer inputs never reach this module as integers: the torch wrapper
+#   converts them to float32 first and runs the float32 kernel, so no
+#   per-integer-dtype kernels need to be compiled.
+
+
+def application_narrow(input, output):
+    result_f32 = libdevice.lgamma(ntl.cast(input, ntl.float32))
+    output = ntl.cast(result_f32, output.dtype)  # noqa: F841
+
+
+def application_wide(input, output):
+    output = libdevice.lgamma(input)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    is_narrow = dtype in (ninetoothed.float16, ninetoothed.bfloat16)
+    application = application_narrow if is_narrow else application_wide
+
+    tensors = tuple(Tensor(ndim, dtype=dtype) for _ in range(2))
+
+    return functools.partial(arrangement, block_size=block_size), application, tensors
diff --git a/src/ntops/kernels/nextafter.py b/src/ntops/kernels/nextafter.py
new file mode 100644
index 0000000..2b0fceb
--- /dev/null
+++ b/src/ntops/kernels/nextafter.py
@@ -0,0 +1,109 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+from ninetoothed.language import libdevice
+
+from ntops.kernels.element_wise import arrangement
+
+
+# nextafter for integers: the neighbouring value toward `other` is input ± 1;
+# equal operands return `other` unchanged.
+def application_int(input, other, output):
+    output = ntl.where(  # noqa: F841
+        input == other,
+        other,
+        ntl.where(input < other, input + 1, input - 1),
+    )
+
+
+# nextafter for float16 / bfloat16.
+# libdevice.nextafter does not accept narrow float types, so the step is taken
+# on the int16 bit pattern instead.
+#
+# Observation from bit patterns (int16 = bitcast of float16):
+#   positive floats: int16 order == float order  (+3.0 -> 16896 > +1.0 -> 15360)
+#   negative floats: int16 order is REVERSED     (-3.0 -> -15872, -2.998 -> -15873)
+#
+# So to advance one ULP toward other (a = input, b = other):
+#   a > 0, a < b  ->  a_i + 1   (moving up in both orderings)
+#   a > 0, a > b  ->  a_i - 1
+#   a < 0, a < b  ->  a_i - 1   (moving up in float but down in int16)
+#   a < 0, a > b  ->  a_i + 1
+#
+# Special cases (checked before the general step):
+#   NaN  : propagate whichever input is NaN (a takes priority)
+#   equal: return b's bit pattern (handles +0 == -0 correctly)
+#   zero : return ±min_subnormal with b's sign (bitcast b < 0 detects -0.0)
+def application_f16(input, other, output):
+    i16 = ntl.int16
+    one = ntl.cast(1, i16)
+    zero_i = ntl.cast(0, i16)
+
+    a_i = ntl.cast(input, i16, bitcast=True)
+    b_i = ntl.cast(other, i16, bitcast=True)
+
+    # NaN propagation: keep NaN bits of the offending operand
+    a_nan = input != input
+    b_nan = other != other
+    nan_bits = ntl.where(a_nan, a_i, b_i)
+
+    # Zero -> smallest subnormal; sign of b determined by its sign bit.
+    # -32767 as int16 is the bit pattern 0x8001 (sign bit plus lowest bit).
+    from_zero = ntl.where(b_i < zero_i, ntl.cast(-32767, i16), one)
+
+    # General case: positive int16 means positive float (same order),
+    # negative int16 means negative float (reversed order) -> flip step sign.
+    going_up = input < other
+    a_i_positive = a_i >= zero_i
+    step = ntl.where(
+        a_i_positive,
+        ntl.where(going_up, one, -one),
+        ntl.where(going_up, -one, one),
+    )
+
+    result_i = ntl.where(
+        a_nan | b_nan,
+        nan_bits,
+        ntl.where(
+            input == other,
+            b_i,
+            ntl.where(input == ntl.cast(0, ntl.float16), from_zero, a_i + step),
+        ),
+    )
+    output = ntl.cast(result_i, output.dtype, bitcast=True)  # noqa: F841
+
+
+# float32: libdevice.nextafter is called directly.
+def application_f32(input, other, output):
+    output = libdevice.nextafter(input, other)  # noqa: F841
+
+
+# float64: same body as application_f32, kept as a separate application.
+def application_f64(input, other, output):
+    output = libdevice.nextafter(input, other)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    # Dispatch on dtype; anything not listed below (including the default
+    # `None`) falls through to the float64 application.
+    arrangement_ = functools.partial(arrangement, block_size=block_size)
+
+    int_types = (ninetoothed.int8, ninetoothed.int16, ninetoothed.int32, ninetoothed.int64)
+    if dtype in int_types:
+        application = application_int
+    elif dtype in (ninetoothed.float16, ninetoothed.bfloat16):
+        application = application_f16
+    elif dtype == ninetoothed.float32:
+        application = application_f32
+    else:
+        application = application_f64
+
+    tensors = (
+        Tensor(ndim, dtype=dtype),
+        Tensor(ndim, dtype=dtype),
+        Tensor(ndim, dtype=dtype),
+    )
+
+    return arrangement_, application, tensors
diff --git a/src/ntops/kernels/rad2deg.py b/src/ntops/kernels/rad2deg.py
new file mode 100644
index 0000000..9371f22
--- /dev/null
+++ b/src/ntops/kernels/rad2deg.py
@@ -0,0 +1,16 @@
+import functools
+
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # 57.29577951308232 is 180 / pi, the number of degrees in one radian.
+    output = input * 57.29577951308232  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    tensors = tuple(Tensor(ndim, dtype=dtype) for _ in range(2))
+
+    return functools.partial(arrangement, block_size=block_size), application, tensors
diff --git a/src/ntops/torch/__init__.py b/src/ntops/torch/__init__.py
index 82fc596..5732b72 100644
--- a/src/ntops/torch/__init__.py
+++ b/src/ntops/torch/__init__.py
@@ -8,6 +8,7 @@
 from ntops.torch.bmm import bmm
 from ntops.torch.clamp import clamp
 from ntops.torch.conv2d import conv2d
+from ntops.torch.copysign import copysign
 from ntops.torch.cos import cos
 from ntops.torch.div import div
 from ntops.torch.dropout import dropout
@@ -19,7 +20,9 @@
 from ntops.torch.isinf import isinf
 from ntops.torch.isnan import isnan
 from ntops.torch.layer_norm import layer_norm
+from ntops.torch.lcm import lcm
 from ntops.torch.le import le
+from ntops.torch.lgamma import lgamma
 from ntops.torch.lt import lt
 from ntops.torch.matmul import matmul
 from ntops.torch.max_pool2d import max_pool2d
@@ -27,7 +30,9 @@
 from ntops.torch.mul import mul
 from ntops.torch.ne import ne
 from ntops.torch.neg import neg
+from ntops.torch.nextafter import nextafter
 from ntops.torch.pow import pow
+from ntops.torch.rad2deg import rad2deg
 from ntops.torch.relu import relu
 from ntops.torch.rms_norm import rms_norm
 from ntops.torch.rotary_position_embedding import rotary_position_embedding
@@ -51,6 +56,7 @@
     "bmm",
     "clamp",
     "conv2d",
+    "copysign",
     "cos",
     "div",
     "dropout",
@@ -62,7 +68,9 @@
     "isinf",
     "isnan",
     "layer_norm",
+    "lcm",
     "le",
+    "lgamma",
     "lt",
     "matmul",
     "max_pool2d",
@@ -70,7 +78,9 @@
     "mul",
     "ne",
     "neg",
+    "nextafter",
     "pow",
+    "rad2deg",
     "relu",
     "rms_norm",
     "rotary_position_embedding",
diff --git a/src/ntops/torch/copysign.py b/src/ntops/torch/copysign.py
new file mode 100644
index 0000000..2c0b427
--- /dev/null
+++ b/src/ntops/torch/copysign.py
@@ -0,0 +1,43 @@
+import torch
+
+import ninetoothed
+import ntops
+from ntops.torch.utils import _cached_make
+
+_DTYPE_MAP = {
+    torch.float16: ninetoothed.float16,
+    torch.bfloat16: ninetoothed.bfloat16,
+    torch.float32: ninetoothed.float32,
+    torch.float64: ninetoothed.float64,
+}
+
+
+def copysign(input, other, *, out=None):
+    """Return a tensor with the magnitude of ``input`` and the sign of ``other``.
+
+    Args:
+        input: Floating-point tensor supplying the magnitudes.
+        other: Tensor supplying the signs, or a Python number that is expanded
+            to a tensor shaped and typed like ``input``.
+        out: Optional tensor to write into; allocated like ``input`` if omitted.
+
+    Returns:
+        ``out``, holding the element-wise result.
+    """
+    if not isinstance(other, torch.Tensor):
+        # torch.copysign also accepts a number for `other`; the kernel only
+        # takes tensors, so materialise the scalar first.
+        other = torch.full_like(input, other)
+
+    if out is None:
+        out = torch.empty_like(input)
+
+    kernel = _cached_make(
+        ntops.kernels.copysign.premake,
+        input.ndim,
+        dtype=_DTYPE_MAP.get(input.dtype),
+    )
+
+    kernel(input, other, out)
+
+    return out
diff --git a/src/ntops/torch/lcm.py b/src/ntops/torch/lcm.py
new file mode 100644
index 0000000..2cb187d
--- /dev/null
+++ b/src/ntops/torch/lcm.py
@@ -0,0 +1,81 @@
+import torch
+
+import ninetoothed
+import ntops
+from ntops.torch.utils import _cached_make
+
+_NUM_STAGES = 1
+
+
+def _block_size_for(torch_dtype):
+    """Return the kernel block size: 32 for int64, 512 for every other dtype."""
+    if torch_dtype == torch.int64:
+        return 32
+    return 512
+
+
+def _num_warps_for(torch_dtype):
+    """Return the warp count: 1 for int64, 4 for every other dtype."""
+    if torch_dtype == torch.int64:
+        return 1
+    return 4
+
+
+def _to_nt(torch_dtype):
+    """Map a torch integer dtype to its ninetoothed dtype, or None if unmapped."""
+    mapping = {
+        torch.int8: ninetoothed.int8,
+        torch.int16: ninetoothed.int16,
+        torch.int32: ninetoothed.int32,
+        torch.int64: ninetoothed.int64,
+    }
+    return mapping.get(torch_dtype)
+
+
+def lcm(input, other, *, out=None):
+    """Compute the element-wise least common multiple of two integer tensors.
+
+    Args:
+        input: First integer tensor.
+        other: Second integer tensor.
+        out: Optional tensor to write into; allocated like ``input`` if omitted.
+
+    Returns:
+        ``out``, holding the element-wise result.
+    """
+    if out is None:
+        out = torch.empty_like(input)
+
+    # Contiguous operands of identical shape share one memory layout, so they
+    # can be viewed as 1-D and served by a single kernel regardless of rank.
+    # The shape check keeps mismatched operands out of this path, where
+    # `view([n])` would either raise or pair up the wrong elements.
+    can_flatten = (
+        input.ndim != 1
+        and input.shape == other.shape == out.shape
+        and input.is_contiguous()
+        and other.is_contiguous()
+        and out.is_contiguous()
+    )
+    if can_flatten:
+        n = input.numel()
+        in_view = input.view([n])
+        other_view = other.view([n])
+        out_view = out.view([n])
+    else:
+        in_view = input
+        other_view = other
+        out_view = out
+
+    kernel = _cached_make(
+        ntops.kernels.lcm.premake,
+        in_view.ndim,
+        dtype=_to_nt(input.dtype),
+        block_size=_block_size_for(input.dtype),
+        num_warps=_num_warps_for(input.dtype),
+        num_stages=_NUM_STAGES,
+    )
+
+    kernel(in_view, other_view, out_view)
+
+    return out
diff --git a/src/ntops/torch/lgamma.py b/src/ntops/torch/lgamma.py
new file mode 100644
index 0000000..5c646e6
--- /dev/null
+++ b/src/ntops/torch/lgamma.py
@@ -0,0 +1,45 @@
+import torch
+
+import ninetoothed
+import ntops
+from ntops.torch.utils import _cached_make
+
+_DTYPE_MAP = {
+    torch.float16: ninetoothed.float16,
+    torch.bfloat16: ninetoothed.bfloat16,
+    torch.float32: ninetoothed.float32,
+    torch.float64: ninetoothed.float64,
+}
+
+_INT_TYPES = {torch.int8, torch.int16, torch.int32, torch.int64}
+
+
+def lgamma(input, *, out=None):
+    """Compute the element-wise log-gamma of ``input``.
+
+    Integer tensors are converted to float32 up front and produce a float32
+    result, so every integer dtype reuses the single float32 kernel instead
+    of compiling one kernel per integer dtype.
+
+    Args:
+        input: Tensor to evaluate.
+        out: Optional tensor to write into. If omitted, it is allocated like
+            ``input`` (as float32 for integer inputs).
+
+    Returns:
+        ``out``, holding the element-wise result.
+    """
+    if input.dtype in _INT_TYPES:
+        kernel_input = input.to(torch.float32)
+        nt_dtype = ninetoothed.float32
+    else:
+        kernel_input = input
+        nt_dtype = _DTYPE_MAP.get(input.dtype)
+
+    if out is None:
+        out = torch.empty_like(input, dtype=kernel_input.dtype)
+
+    kernel = _cached_make(ntops.kernels.lgamma.premake, input.ndim, dtype=nt_dtype)
+    kernel(kernel_input, out)
+
+    return out
diff --git a/src/ntops/torch/nextafter.py b/src/ntops/torch/nextafter.py
new file mode 100644
index 0000000..918f438
--- /dev/null
+++ b/src/ntops/torch/nextafter.py
@@ -0,0 +1,36 @@
+import torch
+
+import ninetoothed
+import ntops
+from ntops.torch.utils import _cached_make
+
+_DTYPE_MAP = {
+    torch.int8: ninetoothed.int8,
+    torch.int16: ninetoothed.int16,
+    torch.int32: ninetoothed.int32,
+    torch.int64: ninetoothed.int64,
+    torch.float16: ninetoothed.float16,
+    torch.bfloat16: ninetoothed.bfloat16,
+    torch.float32: ninetoothed.float32,
+    torch.float64: ninetoothed.float64,
+}
+
+
+def nextafter(input, other, *, out=None):
+    """Return the next representable value after ``input`` toward ``other``.
+
+    Args:
+        input: Tensor of starting values.
+        other: Tensor of target values giving the direction of the step.
+        out: Optional tensor to write into; allocated like ``input`` if omitted.
+
+    Returns:
+        ``out``, holding the element-wise result.
+    """
+    result = torch.empty_like(input) if out is None else out
+    nt_dtype = _DTYPE_MAP.get(input.dtype)
+
+    launch = _cached_make(ntops.kernels.nextafter.premake, input.ndim, dtype=nt_dtype)
+    launch(input, other, result)
+
+    return result
diff --git a/src/ntops/torch/rad2deg.py b/src/ntops/torch/rad2deg.py
new file mode 100644
index 0000000..f6896b1
--- /dev/null
+++ b/src/ntops/torch/rad2deg.py
@@ -0,0 +1,22 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def rad2deg(input, *, out=None):
+    """Convert angles from radians to degrees, element-wise.
+
+    Args:
+        input: Tensor of angles in radians.
+        out: Optional tensor to write into; allocated like ``input`` if omitted.
+
+    Returns:
+        ``out``, holding the element-wise result.
+    """
+    result = torch.empty_like(input) if out is None else out
+
+    launch = _cached_make(ntops.kernels.rad2deg.premake, input.ndim)
+    launch(input, result)
+
+    return result
diff --git a/tests/test_copysign.py b/tests/test_copysign.py
new file mode 100644
index 0000000..c9fabe0
--- /dev/null
+++ b/tests/test_copysign.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_copysign(shape, dtype, device, rtol, atol):
+    """Check ntops.torch.copysign against torch.copysign on random inputs."""
+    magnitude_source = torch.randn(shape, dtype=dtype, device=device)
+    sign_source = torch.randn(shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.copysign(magnitude_source, sign_source)
+    expected = torch.copysign(magnitude_source, sign_source)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_lcm.py b/tests/test_lcm.py
new file mode 100644
index 0000000..f2e29fe
--- /dev/null
+++ b/tests/test_lcm.py
@@ -0,0 +1,26 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments(False))
+def test_lcm(shape, dtype, device, rtol, atol):
+    """Check ntops.torch.lcm against torch.lcm on integers in [-100, 100]."""
+    bound = 100
+
+    def random_operand():
+        return torch.randint(
+            -bound, bound + 1, size=shape, dtype=dtype, device=device
+        )
+
+    input = random_operand()
+    other = random_operand()
+
+    actual = ntops.torch.lcm(input, other)
+    expected = torch.lcm(input, other)
+
+    assert torch.equal(actual, expected)
diff --git a/tests/test_lgamma.py b/tests/test_lgamma.py
new file mode 100644
index 0000000..f422259
--- /dev/null
+++ b/tests/test_lgamma.py
@@ -0,0 +1,31 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_lgamma_float(shape, dtype, device, rtol, atol):
+    """Check lgamma on positive float inputs against torch.lgamma."""
+    input = torch.rand(shape, dtype=dtype, device=device) * 5 + 0.1
+
+    actual = ntops.torch.lgamma(input)
+    expected = torch.lgamma(input)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments(False))
+def test_lgamma_int(shape, dtype, device, rtol, atol):
+    """Check lgamma on integer inputs drawn from [1, 20) against torch.lgamma."""
+    input = torch.randint(1, 20, size=shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.lgamma(input)
+    expected = torch.lgamma(input)
+
+    # Fixed tolerances: the parametrized rtol/atol are not used for this test.
+    assert torch.allclose(actual, expected, rtol=1e-4, atol=1e-4)
diff --git a/tests/test_nextafter.py b/tests/test_nextafter.py
new file mode 100644
index 0000000..29f5509
--- /dev/null
+++ b/tests/test_nextafter.py
@@ -0,0 +1,42 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+def _int_nextafter_ref(input, other):
+    """Integer reference: move each element of ``input`` one unit toward ``other``.
+
+    torch.nextafter does not support integer tensors, so the expected values
+    are built by hand. Elements equal to ``other`` are left unchanged.
+    """
+    step = (input < other).to(input.dtype) - (input > other).to(input.dtype)
+    return input + step
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_nextafter_float(shape, dtype, device, rtol, atol):
+    """Check ntops.torch.nextafter against torch.nextafter on random floats."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    other = torch.randn(shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.nextafter(input, other)
+    expected = torch.nextafter(input, other)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments(False))
+def test_nextafter_int(shape, dtype, device, rtol, atol):
+    """Check integer nextafter against the hand-written reference."""
+    input = torch.randint(-100, 100, size=shape, dtype=dtype, device=device)
+    other = torch.randint(-100, 100, size=shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.nextafter(input, other)
+    expected = _int_nextafter_ref(input, other)
+
+    assert torch.equal(actual, expected)
diff --git a/tests/test_rad2deg.py b/tests/test_rad2deg.py
new file mode 100644
index 0000000..1e7ba1b
--- /dev/null
+++ b/tests/test_rad2deg.py
@@ -0,0 +1,22 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_rad2deg(shape, dtype, device, rtol, atol):
+    """Check ntops.torch.rad2deg against torch.rad2deg on random inputs."""
+    if dtype is torch.float16:
+        # Report the gap as a skip; a bare `return` would count as a pass.
+        pytest.skip("TODO: Test for `float16` later.")
+
+    input = torch.randn(shape, dtype=dtype, device=device)
+
+    ninetoothed_output = ntops.torch.rad2deg(input)
+    reference_output = torch.rad2deg(input)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/utils.py b/tests/utils.py
index ac1949f..9f592ea 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -9,7 +9,7 @@ def generate_arguments(use_float=True):
     if use_float:
         dtype_arr = (torch.float32, torch.float16)
     else:
-        dtype_arr = (torch.bool, torch.int8, torch.int16, torch.int32)
+        dtype_arr = (torch.int8, torch.int16, torch.int32, torch.int64)
 
     for ndim in range(1, 5):
         for dtype in dtype_arr: