diff --git a/src/ntops/kernels/__init__.py b/src/ntops/kernels/__init__.py
index f6934ef..b4c52c0 100644
--- a/src/ntops/kernels/__init__.py
+++ b/src/ntops/kernels/__init__.py
@@ -7,38 +7,61 @@
     bitwise_not,
     bitwise_or,
     bmm,
+    cast,
+    ceil,
     clamp,
     conv2d,
     cos,
     div,
     dropout,
+    elu,
     eq,
     exp,
+    floor,
     ge,
     gelu,
+    gelu_backward,
     gt,
+    hardswish,
+    hardtanh,
     isinf,
     isnan,
     layer_norm,
     le,
+    leaky_relu,
+    log,
+    log_softmax,
     lt,
     max_pool2d,
+    mish,
     mm,
     mul,
     ne,
     neg,
     pow,
+    prelu,
+    reciprocal,
     relu,
+    relu6,
+    relu_backward,
     rms_norm,
     rotary_position_embedding,
     rsqrt,
     scaled_dot_product_attention,
+    selu,
     sigmoid,
+    sigmoid_backward,
     silu,
     sin,
     softmax,
+    softplus,
+    softsign,
+    sqrt,
     sub,
     tanh,
+    tanh_backward,
+    tanhshrink,
+    where,
 )
 
 __all__ = [
@@ -50,36 +73,59 @@
     "bitwise_not",
     "bitwise_or",
     "bmm",
+    "cast",
+    "ceil",
     "clamp",
     "conv2d",
     "cos",
     "div",
     "dropout",
+    "elu",
     "eq",
     "exp",
+    "floor",
     "ge",
     "gelu",
+    "gelu_backward",
     "gt",
+    "hardswish",
+    "hardtanh",
     "isinf",
     "isnan",
     "layer_norm",
     "le",
+    "leaky_relu",
+    "log",
+    "log_softmax",
     "lt",
     "max_pool2d",
+    "mish",
     "mm",
     "mul",
     "ne",
     "neg",
     "pow",
+    "prelu",
+    "reciprocal",
     "relu",
+    "relu6",
+    "relu_backward",
     "rms_norm",
     "rotary_position_embedding",
     "rsqrt",
     "scaled_dot_product_attention",
+    "selu",
     "sigmoid",
+    "sigmoid_backward",
     "silu",
     "sin",
     "softmax",
+    "softplus",
+    "softsign",
+    "sqrt",
     "sub",
     "tanh",
+    "tanh_backward",
+    "tanhshrink",
+    "where",
 ]
diff --git a/src/ntops/kernels/cast.py b/src/ntops/kernels/cast.py
new file mode 100644
index 0000000..dc6e64e
--- /dev/null
+++ b/src/ntops/kernels/cast.py
@@ -0,0 +1,25 @@
+import functools
+
+import ninetoothed
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # Plain assignment; the two tensors may have different dtypes (see premake).
+    output = input  # noqa: F841
+
+
+def premake(ndim, input_dtype=None, output_dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the cast kernel.
+
+    The input tensor is declared with ``input_dtype`` and the output tensor
+    with ``output_dtype``; ``block_size`` is bound into the arrangement.
+    """
+    source_spec = Tensor(ndim, dtype=input_dtype)
+    target_spec = Tensor(ndim, dtype=output_dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (source_spec, target_spec)
diff --git a/src/ntops/kernels/ceil.py b/src/ntops/kernels/ceil.py
new file mode 100644
index 0000000..ac4ce92
--- /dev/null
+++ b/src/ntops/kernels/ceil.py
@@ -0,0 +1,26 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # Round up in float32, then convert back to the input's dtype.
+    input_f32 = ntl.cast(input, ntl.float32)
+    rounded = ntl.ceil(input_f32)
+    output = ntl.cast(rounded, input.dtype)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the ceil kernel.
+
+    ``ndim`` and ``dtype`` describe both the input and the output tensor;
+    ``block_size`` is bound into the element-wise arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, output_spec)
diff --git a/src/ntops/kernels/elu.py b/src/ntops/kernels/elu.py
new file mode 100644
index 0000000..f68c44a
--- /dev/null
+++ b/src/ntops/kernels/elu.py
@@ -0,0 +1,28 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, alpha, output):
+    # The negative branch alpha * (exp(x) - 1) is evaluated in float32.
+    input_f32 = ntl.cast(input, ntl.float32)
+    negative_branch = ntl.cast(alpha * (ntl.exp(input_f32) - 1), input.dtype)
+    output = ntl.where(input >= 0, input, negative_branch)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the elu kernel.
+
+    The input and output tensors use ``ndim`` and ``dtype``; ``alpha`` is a
+    0-dimensional float64 tensor. ``block_size`` is bound into the arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    alpha_spec = Tensor(0, dtype=ninetoothed.float64)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, alpha_spec, output_spec)
diff --git a/src/ntops/kernels/floor.py b/src/ntops/kernels/floor.py
new file mode 100644
index 0000000..c36c65e
--- /dev/null
+++ b/src/ntops/kernels/floor.py
@@ -0,0 +1,26 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # Round down in float32, then convert back to the input's dtype.
+    input_f32 = ntl.cast(input, ntl.float32)
+    rounded = ntl.floor(input_f32)
+    output = ntl.cast(rounded, input.dtype)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the floor kernel.
+
+    ``ndim`` and ``dtype`` describe both the input and the output tensor;
+    ``block_size`` is bound into the element-wise arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, output_spec)
diff --git a/src/ntops/kernels/gelu_backward.py b/src/ntops/kernels/gelu_backward.py
new file mode 100644
index 0000000..85fd36e
--- /dev/null
+++ b/src/ntops/kernels/gelu_backward.py
@@ -0,0 +1,31 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(grad_output, input, grad_input):
+    # d/dx [x * Phi(x)] = Phi(x) + x * phi(x), evaluated in float32, where
+    # 0.7071067811865476 is 1/sqrt(2) and 0.3989422804014327 is 1/sqrt(2*pi).
+    x = ntl.cast(input, ntl.float32)
+    normal_cdf = 0.5 * (1.0 + ntl.erf(x * 0.7071067811865476))
+    normal_pdf = ntl.exp(-0.5 * x * x) * 0.3989422804014327
+    derivative = ntl.cast(normal_cdf + x * normal_pdf, grad_output.dtype)
+    grad_input = grad_output * derivative  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for gelu_backward.
+
+    All three tensors (``grad_output``, ``input``, ``grad_input``) are declared
+    with ``ndim`` and ``dtype``; ``block_size`` is bound into the arrangement.
+    """
+    grad_output_spec = Tensor(ndim, dtype=dtype)
+    input_spec = Tensor(ndim, dtype=dtype)
+    grad_input_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+    tensors = (grad_output_spec, input_spec, grad_input_spec)
+
+    return bound_arrangement, application, tensors
diff --git a/src/ntops/kernels/hardswish.py b/src/ntops/kernels/hardswish.py
new file mode 100644
index 0000000..f75c4bd
--- /dev/null
+++ b/src/ntops/kernels/hardswish.py
@@ -0,0 +1,25 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # hardswish(x) = x * clamp(x + 3, 0, 6) / 6
+    gate = ntl.clamp(input + 3.0, 0.0, 6.0)
+    output = input * gate / 6.0  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the hardswish kernel.
+
+    ``ndim`` and ``dtype`` describe both the input and the output tensor;
+    ``block_size`` is bound into the element-wise arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, output_spec)
diff --git a/src/ntops/kernels/hardtanh.py b/src/ntops/kernels/hardtanh.py
new file mode 100644
index 0000000..0494579
--- /dev/null
+++ b/src/ntops/kernels/hardtanh.py
@@ -0,0 +1,30 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, min_val, max_val, output):
+    # Limit every element to the closed range [min_val, max_val].
+    clipped = ntl.clamp(input, min_val, max_val)
+    output = clipped  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the hardtanh kernel.
+
+    The input and output tensors use ``ndim`` and ``dtype``; ``min_val`` and
+    ``max_val`` are 0-dimensional float64 tensors. ``block_size`` is bound
+    into the arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    min_spec = Tensor(0, dtype=ninetoothed.float64)
+    max_spec = Tensor(0, dtype=ninetoothed.float64)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+    tensors = (input_spec, min_spec, max_spec, output_spec)
+
+    return bound_arrangement, application, tensors
diff --git a/src/ntops/kernels/leaky_relu.py b/src/ntops/kernels/leaky_relu.py
new file mode 100644
index 0000000..f8038ed
--- /dev/null
+++ b/src/ntops/kernels/leaky_relu.py
@@ -0,0 +1,27 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, negative_slope, output):
+    # Non-negative elements pass through; the rest are scaled by the slope.
+    scaled = negative_slope * input
+    output = ntl.where(input >= 0, input, scaled)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the leaky_relu kernel.
+
+    The input and output tensors use ``ndim`` and ``dtype``; ``negative_slope``
+    is a 0-dimensional float64 tensor. ``block_size`` is bound into the arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    slope_spec = Tensor(0, dtype=ninetoothed.float64)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, slope_spec, output_spec)
diff --git a/src/ntops/kernels/log.py b/src/ntops/kernels/log.py
new file mode 100644
index 0000000..178a27e
--- /dev/null
+++ b/src/ntops/kernels/log.py
@@ -0,0 +1,25 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # The logarithm is evaluated on a float32 copy of the input.
+    input_f32 = ntl.cast(input, ntl.float32)
+    output = ntl.log(input_f32)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the log kernel.
+
+    ``ndim`` and ``dtype`` describe both the input and the output tensor;
+    ``block_size`` is bound into the element-wise arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, output_spec)
diff --git a/src/ntops/kernels/log_softmax.py b/src/ntops/kernels/log_softmax.py
new file mode 100644
index 0000000..0eb92a6
--- /dev/null
+++ b/src/ntops/kernels/log_softmax.py
@@ -0,0 +1,58 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.reduction import arrangement
+
+
+def _exp(x, dtype):
+    # float16 inputs are exponentiated in float32; the result is cast back.
+    exp_dtype = dtype if dtype != ntl.float16 else ntl.float32
+    return ntl.cast(ntl.exp(ntl.cast(x, exp_dtype)), dtype)
+
+
+def application(input, output):
+    dtype = output.dtype.dtype
+    prev_max = ntl.cast(float("-inf"), dtype)
+    denominator = ntl.cast(0, dtype)
+
+    # First pass: running maximum and rescaled sum of exponentials, so the
+    # denominator is always expressed relative to the current maximum.
+    for i in range(input.shape[0]):
+        input_i = ntl.cast(input[i], dtype)
+        curr_max = ntl.cast(ntl.maximum(prev_max, ntl.max(input_i)), dtype)
+        input_max_diff_exp = _exp(input_i - curr_max, dtype)
+        prev_curr_max_diff_exp = _exp(prev_max - curr_max, dtype)
+        denominator = denominator * prev_curr_max_diff_exp + ntl.sum(input_max_diff_exp)
+        prev_max = curr_max
+
+    # The shift max + log(sum) is the same for every block, so it is computed
+    # once here instead of on each iteration of the second pass.
+    log_dtype = dtype if dtype != ntl.float16 else ntl.float32
+    max_value = ntl.cast(prev_max, log_dtype)
+    log_denominator = ntl.log(ntl.cast(denominator, log_dtype))
+
+    # Second pass: log_softmax(x) = x - max - log(sum(exp(x - max))).
+    for i in range(input.shape[0]):
+        shifted = ntl.cast(input[i], log_dtype) - max_value - log_denominator
+        output[i] = ntl.cast(shifted, dtype)
+
+
+def premake(ndim, dim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for log_softmax.
+
+    The reduction arrangement is bound to ``dim`` and ``block_size``. The
+    input tensor is declared with ``other=-inf`` and a constexpr shape; the
+    output tensor only uses ``ndim`` and ``dtype``.
+    """
+    arrangement_ = functools.partial(arrangement, dim=dim, block_size=block_size)
+
+    tensors = (
+        Tensor(
+            ndim, dtype=dtype, other=float("-inf"), shape_options={"constexpr": True}
+        ),
+        Tensor(ndim, dtype=dtype),
+    )
+
+    return arrangement_, application, tensors
diff --git a/src/ntops/kernels/mish.py b/src/ntops/kernels/mish.py
new file mode 100644
index 0000000..794be67
--- /dev/null
+++ b/src/ntops/kernels/mish.py
@@ -0,0 +1,31 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # mish(x) = x * tanh(softplus(x)), evaluated in float32.
+    input_f32 = ntl.cast(input, ntl.float32)
+    sp = ntl.log(1 + ntl.exp(input_f32))
+    # softplus is never negative, so tanh(sp) can be written with exp(-2 * sp),
+    # which lies in [0, 1]. Forming exp(sp) directly overflows for large
+    # inputs and turns the quotient into inf / inf = NaN.
+    exp_neg_2sp = ntl.exp(-2 * sp)
+    tanh_sp = (1 - exp_neg_2sp) / (1 + exp_neg_2sp)
+    output = ntl.cast(input_f32 * tanh_sp, input.dtype)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the mish kernel.
+
+    ``ndim`` and ``dtype`` describe both the input and the output tensor;
+    ``block_size`` is bound into the element-wise arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, output_spec)
diff --git a/src/ntops/kernels/prelu.py b/src/ntops/kernels/prelu.py
new file mode 100644
index 0000000..41cdaf5
--- /dev/null
+++ b/src/ntops/kernels/prelu.py
@@ -0,0 +1,27 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, weight, output):
+    # Non-negative elements pass through; the rest are multiplied by weight.
+    scaled = weight * input
+    output = ntl.where(input >= 0, input, scaled)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the prelu kernel.
+
+    All three tensors (``input``, ``weight``, ``output``) are declared with
+    ``ndim`` and ``dtype``; ``block_size`` is bound into the arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    weight_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+    tensors = (input_spec, weight_spec, output_spec)
+
+    return bound_arrangement, application, tensors
diff --git a/src/ntops/kernels/reciprocal.py b/src/ntops/kernels/reciprocal.py
new file mode 100644
index 0000000..64cbef7
--- /dev/null
+++ b/src/ntops/kernels/reciprocal.py
@@ -0,0 +1,26 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # Divide in float32, then convert back to the input's dtype.
+    input_f32 = ntl.cast(input, ntl.float32)
+    inverse = 1.0 / input_f32
+    output = ntl.cast(inverse, input.dtype)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the reciprocal kernel.
+
+    ``ndim`` and ``dtype`` describe both the input and the output tensor;
+    ``block_size`` is bound into the element-wise arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, output_spec)
diff --git a/src/ntops/kernels/relu6.py b/src/ntops/kernels/relu6.py
new file mode 100644
index 0000000..c73364e
--- /dev/null
+++ b/src/ntops/kernels/relu6.py
@@ -0,0 +1,25 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # relu6(x) = min(max(x, 0), 6)
+    clipped = ntl.clamp(input, 0.0, 6.0)
+    output = clipped  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the relu6 kernel.
+
+    ``ndim`` and ``dtype`` describe both the input and the output tensor;
+    ``block_size`` is bound into the element-wise arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, output_spec)
diff --git a/src/ntops/kernels/relu_backward.py b/src/ntops/kernels/relu_backward.py
new file mode 100644
index 0000000..ea52ce2
--- /dev/null
+++ b/src/ntops/kernels/relu_backward.py
@@ -0,0 +1,27 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(grad_output, input, grad_input):
+    # The gradient passes through only where the forward input was strictly
+    # positive; at exactly zero it is 0, matching PyTorch's relu gradient.
+    grad_input = ntl.where(input > 0, grad_output, 0.0)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for relu_backward.
+
+    All three tensors (``grad_output``, ``input``, ``grad_input``) are declared
+    with ``ndim`` and ``dtype``; ``block_size`` is bound into the arrangement.
+    """
+    grad_output_spec = Tensor(ndim, dtype=dtype)
+    input_spec = Tensor(ndim, dtype=dtype)
+    grad_input_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+    tensors = (grad_output_spec, input_spec, grad_input_spec)
+
+    return bound_arrangement, application, tensors
diff --git a/src/ntops/kernels/selu.py b/src/ntops/kernels/selu.py
new file mode 100644
index 0000000..91ce384
--- /dev/null
+++ b/src/ntops/kernels/selu.py
@@ -0,0 +1,33 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, alpha, scale, output):
+    # selu(x) = scale * (x if x > 0 else alpha * (exp(x) - 1)); the
+    # exponential branch is evaluated in float32.
+    input_f32 = ntl.cast(input, ntl.float32)
+    negative_branch = ntl.cast(alpha * (ntl.exp(input_f32) - 1), input.dtype)
+    selected = ntl.where(input > 0, input, negative_branch)
+    output = scale * selected  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the selu kernel.
+
+    The input and output tensors use ``ndim`` and ``dtype``; ``alpha`` and
+    ``scale`` are 0-dimensional float64 tensors. ``block_size`` is bound
+    into the arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    alpha_spec = Tensor(0, dtype=ninetoothed.float64)
+    scale_spec = Tensor(0, dtype=ninetoothed.float64)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+    tensors = (input_spec, alpha_spec, scale_spec, output_spec)
+
+    return bound_arrangement, application, tensors
diff --git a/src/ntops/kernels/sigmoid_backward.py b/src/ntops/kernels/sigmoid_backward.py
new file mode 100644
index 0000000..aab0944
--- /dev/null
+++ b/src/ntops/kernels/sigmoid_backward.py
@@ -0,0 +1,27 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(grad_output, output, grad_input):
+    # d sigmoid / dx = y * (1 - y), where y is the forward output.
+    scaled_grad = grad_output * output
+    grad_input = scaled_grad * (1 - output)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for sigmoid_backward.
+
+    All three tensors (``grad_output``, ``output``, ``grad_input``) are declared
+    with ``ndim`` and ``dtype``; ``block_size`` is bound into the arrangement.
+    """
+    grad_output_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    grad_input_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+    tensors = (grad_output_spec, output_spec, grad_input_spec)
+
+    return bound_arrangement, application, tensors
diff --git a/src/ntops/kernels/softplus.py b/src/ntops/kernels/softplus.py
new file mode 100644
index 0000000..67222db
--- /dev/null
+++ b/src/ntops/kernels/softplus.py
@@ -0,0 +1,32 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, beta, threshold, output):
+    # softplus(x) = log(1 + exp(beta * x)) / beta, evaluated in float32.
+    input_f32 = ntl.cast(input, ntl.float32)
+    smooth = ntl.cast(ntl.log(1 + ntl.exp(input_f32 * beta)) / beta, input.dtype)
+    # Above the threshold the input itself is returned instead.
+    output = ntl.where(input * beta > threshold, input, smooth)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the softplus kernel.
+
+    The input and output tensors use ``ndim`` and ``dtype``; ``beta`` and
+    ``threshold`` are 0-dimensional float64 tensors. ``block_size`` is bound
+    into the arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    beta_spec = Tensor(0, dtype=ninetoothed.float64)
+    threshold_spec = Tensor(0, dtype=ninetoothed.float64)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+    tensors = (input_spec, beta_spec, threshold_spec, output_spec)
+
+    return bound_arrangement, application, tensors
diff --git a/src/ntops/kernels/softsign.py b/src/ntops/kernels/softsign.py
new file mode 100644
index 0000000..d2862d0
--- /dev/null
+++ b/src/ntops/kernels/softsign.py
@@ -0,0 +1,25 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # softsign(x) = x / (1 + |x|)
+    denominator = 1 + ntl.abs(input)
+    output = input / denominator  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the softsign kernel.
+
+    ``ndim`` and ``dtype`` describe both the input and the output tensor;
+    ``block_size`` is bound into the element-wise arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, output_spec)
diff --git a/src/ntops/kernels/sqrt.py b/src/ntops/kernels/sqrt.py
new file mode 100644
index 0000000..b929292
--- /dev/null
+++ b/src/ntops/kernels/sqrt.py
@@ -0,0 +1,25 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # The square root is evaluated on a float32 copy of the input.
+    input_f32 = ntl.cast(input, ntl.float32)
+    output = ntl.sqrt(input_f32)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the sqrt kernel.
+
+    ``ndim`` and ``dtype`` describe both the input and the output tensor;
+    ``block_size`` is bound into the element-wise arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, output_spec)
diff --git a/src/ntops/kernels/tanh_backward.py b/src/ntops/kernels/tanh_backward.py
new file mode 100644
index 0000000..079ac82
--- /dev/null
+++ b/src/ntops/kernels/tanh_backward.py
@@ -0,0 +1,27 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(grad_output, output, grad_input):
+    # d tanh / dx = 1 - y * y, where y is the forward output.
+    local_grad = 1 - output * output
+    grad_input = grad_output * local_grad  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for tanh_backward.
+
+    All three tensors (``grad_output``, ``output``, ``grad_input``) are declared
+    with ``ndim`` and ``dtype``; ``block_size`` is bound into the arrangement.
+    """
+    grad_output_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    grad_input_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+    tensors = (grad_output_spec, output_spec, grad_input_spec)
+
+    return bound_arrangement, application, tensors
diff --git a/src/ntops/kernels/tanhshrink.py b/src/ntops/kernels/tanhshrink.py
new file mode 100644
index 0000000..37fcc3e
--- /dev/null
+++ b/src/ntops/kernels/tanhshrink.py
@@ -0,0 +1,31 @@
+import functools
+
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(input, output):
+    # tanhshrink(x) = x - tanh(x), evaluated in float32.
+    input_f32 = ntl.cast(input, ntl.float32)
+    # tanh(|x|) is built from exp(-2 * |x|), which lies in [0, 1]. Dividing
+    # exp(x) - exp(-x) by exp(x) + exp(-x) overflows to inf / inf = NaN once
+    # |x| is large enough for exp to overflow.
+    exp_neg = ntl.exp(-2 * ntl.abs(input_f32))
+    tanh_abs = (1 - exp_neg) / (1 + exp_neg)
+    tanh_val = ntl.where(input_f32 < 0, -tanh_abs, tanh_abs)
+    output = ntl.cast(input_f32 - tanh_val, input.dtype)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the tanhshrink kernel.
+
+    ``ndim`` and ``dtype`` describe both the input and the output tensor;
+    ``block_size`` is bound into the element-wise arrangement.
+    """
+    input_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+
+    return bound_arrangement, application, (input_spec, output_spec)
diff --git a/src/ntops/kernels/where.py b/src/ntops/kernels/where.py
new file mode 100644
index 0000000..e13e682
--- /dev/null
+++ b/src/ntops/kernels/where.py
@@ -0,0 +1,30 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application(condition, input, other, output):
+    # Any non-zero condition element selects input; zero selects other.
+    mask = condition != 0
+    output = ntl.where(mask, input, other)  # noqa: F841
+
+
+def premake(ndim, dtype=None, block_size=None):
+    """Return ``(arrangement, application, tensors)`` for the where kernel.
+
+    ``condition`` is declared as an int8 tensor; ``input``, ``other`` and
+    the output use ``dtype``. All four tensors share ``ndim``, and
+    ``block_size`` is bound into the arrangement.
+    """
+    condition_spec = Tensor(ndim, dtype=ninetoothed.int8)
+    input_spec = Tensor(ndim, dtype=dtype)
+    other_spec = Tensor(ndim, dtype=dtype)
+    output_spec = Tensor(ndim, dtype=dtype)
+    bound_arrangement = functools.partial(arrangement, block_size=block_size)
+    tensors = (condition_spec, input_spec, other_spec, output_spec)
+
+    return bound_arrangement, application, tensors
diff --git a/src/ntops/torch/__init__.py b/src/ntops/torch/__init__.py
index 82fc596..828f5dc 100644
--- a/src/ntops/torch/__init__.py
+++ b/src/ntops/torch/__init__.py
@@ -6,39 +6,62 @@
 from ntops.torch.bitwise_not import bitwise_not
 from ntops.torch.bitwise_or import bitwise_or
 from ntops.torch.bmm import bmm
+from ntops.torch.cast import cast
+from ntops.torch.ceil import ceil
 from ntops.torch.clamp import clamp
 from ntops.torch.conv2d import conv2d
 from ntops.torch.cos import cos
 from ntops.torch.div import div
 from ntops.torch.dropout import dropout
+from ntops.torch.elu import elu
 from ntops.torch.eq import eq
 from ntops.torch.exp import exp
+from ntops.torch.floor import floor
 from ntops.torch.ge import ge
 from ntops.torch.gelu import gelu
+from ntops.torch.gelu_backward import gelu_backward
 from ntops.torch.gt import gt
+from ntops.torch.hardswish import hardswish
+from ntops.torch.hardtanh import hardtanh
 from ntops.torch.isinf import isinf
 from ntops.torch.isnan import isnan
 from ntops.torch.layer_norm import layer_norm
 from ntops.torch.le import le
+from ntops.torch.leaky_relu import leaky_relu
+from ntops.torch.log import log
+from ntops.torch.log_softmax import log_softmax
 from ntops.torch.lt import lt
 from ntops.torch.matmul import matmul
 from ntops.torch.max_pool2d import max_pool2d
+from ntops.torch.mish import mish
 from ntops.torch.mm import mm
 from ntops.torch.mul import mul
 from ntops.torch.ne import ne
 from ntops.torch.neg import neg
 from ntops.torch.pow import pow
+from ntops.torch.prelu import prelu
+from ntops.torch.reciprocal import reciprocal
 from ntops.torch.relu import relu
+from ntops.torch.relu6 import relu6
+from ntops.torch.relu_backward import relu_backward
 from ntops.torch.rms_norm import rms_norm
 from ntops.torch.rotary_position_embedding import rotary_position_embedding
 from ntops.torch.rsqrt import rsqrt
 from ntops.torch.scaled_dot_product_attention import scaled_dot_product_attention
+from ntops.torch.selu import selu
 from ntops.torch.sigmoid import sigmoid
+from ntops.torch.sigmoid_backward import sigmoid_backward
 from ntops.torch.silu import silu
 from ntops.torch.sin import sin
 from ntops.torch.softmax import softmax
+from ntops.torch.softplus import softplus
+from ntops.torch.softsign import softsign
+from ntops.torch.sqrt import sqrt
 from ntops.torch.sub import sub
 from ntops.torch.tanh import tanh
+from ntops.torch.tanh_backward import tanh_backward
+from ntops.torch.tanhshrink import tanhshrink
+from ntops.torch.where import where
 
 __all__ = [
     "abs",
@@ -49,37 +72,60 @@
     "bitwise_not",
     "bitwise_or",
     "bmm",
+    "cast",
+    "ceil",
     "clamp",
     "conv2d",
     "cos",
     "div",
     "dropout",
+    "elu",
     "eq",
     "exp",
+    "floor",
     "ge",
     "gelu",
+    "gelu_backward",
     "gt",
+    "hardswish",
+    "hardtanh",
     "isinf",
     "isnan",
     "layer_norm",
     "le",
+    "leaky_relu",
+    "log",
+    "log_softmax",
     "lt",
     "matmul",
     "max_pool2d",
+    "mish",
     "mm",
     "mul",
     "ne",
     "neg",
     "pow",
+    "prelu",
+    "reciprocal",
     "relu",
+    "relu6",
+    "relu_backward",
     "rms_norm",
     "rotary_position_embedding",
     "rsqrt",
     "scaled_dot_product_attention",
+    "selu",
     "sigmoid",
+    "sigmoid_backward",
     "silu",
     "sin",
     "softmax",
+    "softplus",
+    "softsign",
+    "sqrt",
     "sub",
     "tanh",
+    "tanh_backward",
+    "tanhshrink",
+    "where",
 ]
diff --git a/src/ntops/torch/cast.py b/src/ntops/torch/cast.py
new file mode 100644
index 0000000..69473d6
--- /dev/null
+++ b/src/ntops/torch/cast.py
@@ -0,0 +1,18 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def cast(input, dtype):
+    """Return a new tensor holding ``input`` converted to ``dtype``.
+
+    The output is allocated with ``torch.empty_like(input, dtype=dtype)`` and
+    filled by the cast kernel built for the two dtypes.
+    """
+    result = torch.empty_like(input, dtype=dtype)
+    make_kwargs = {"input_dtype": input.dtype, "output_dtype": dtype}
+    launch = _cached_make(ntops.kernels.cast.premake, input.ndim, **make_kwargs)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/ceil.py b/src/ntops/torch/ceil.py
new file mode 100644
index 0000000..ebb08d3
--- /dev/null
+++ b/src/ntops/torch/ceil.py
@@ -0,0 +1,16 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def ceil(input):
+    """Return a new tensor with the ceil kernel applied to ``input``.
+
+    The result is allocated with ``torch.empty_like(input)``.
+    """
+    result = torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.ceil.premake, input.ndim)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/elu.py b/src/ntops/torch/elu.py
new file mode 100644
index 0000000..be7f497
--- /dev/null
+++ b/src/ntops/torch/elu.py
@@ -0,0 +1,17 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def elu(input, alpha=1.0, inplace=False):
+    """Apply the elu kernel to ``input`` with the given ``alpha``.
+
+    With ``inplace`` set, ``input`` itself is used as the output buffer;
+    otherwise a new tensor is allocated with ``torch.empty_like(input)``.
+    """
+    result = input if inplace else torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.elu.premake, input.ndim)
+    launch(input, alpha, result)
+
+    return result
diff --git a/src/ntops/torch/floor.py b/src/ntops/torch/floor.py
new file mode 100644
index 0000000..410688f
--- /dev/null
+++ b/src/ntops/torch/floor.py
@@ -0,0 +1,16 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def floor(input):
+    """Return a new tensor with the floor kernel applied to ``input``.
+
+    The result is allocated with ``torch.empty_like(input)``.
+    """
+    result = torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.floor.premake, input.ndim)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/gelu_backward.py b/src/ntops/torch/gelu_backward.py
new file mode 100644
index 0000000..8c8f31e
--- /dev/null
+++ b/src/ntops/torch/gelu_backward.py
@@ -0,0 +1,17 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def gelu_backward(grad_output, input):
+    """Return the gradient computed by the gelu_backward kernel.
+
+    The result is allocated with ``torch.empty_like(grad_output)``; the
+    kernel is built for ``grad_output.ndim``.
+    """
+    result = torch.empty_like(grad_output)
+    launch = _cached_make(ntops.kernels.gelu_backward.premake, grad_output.ndim)
+    launch(grad_output, input, result)
+
+    return result
diff --git a/src/ntops/torch/hardswish.py b/src/ntops/torch/hardswish.py
new file mode 100644
index 0000000..fa3b5a7
--- /dev/null
+++ b/src/ntops/torch/hardswish.py
@@ -0,0 +1,17 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def hardswish(input, inplace=False):
+    """Apply the hardswish kernel to ``input`` and return the result.
+
+    With ``inplace`` set, ``input`` itself is used as the output buffer;
+    otherwise a new tensor is allocated with ``torch.empty_like(input)``.
+    """
+    result = input if inplace else torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.hardswish.premake, input.ndim)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/hardtanh.py b/src/ntops/torch/hardtanh.py
new file mode 100644
index 0000000..419324c
--- /dev/null
+++ b/src/ntops/torch/hardtanh.py
@@ -0,0 +1,17 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def hardtanh(input, min_val=-1.0, max_val=1.0, inplace=False):
+    """Apply the hardtanh kernel to ``input`` with ``min_val`` and ``max_val``.
+
+    With ``inplace`` set, ``input`` itself is used as the output buffer;
+    otherwise a new tensor is allocated with ``torch.empty_like(input)``.
+    """
+    result = input if inplace else torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.hardtanh.premake, input.ndim)
+    launch(input, min_val, max_val, result)
+
+    return result
diff --git a/src/ntops/torch/leaky_relu.py b/src/ntops/torch/leaky_relu.py
new file mode 100644
index 0000000..c647dd7
--- /dev/null
+++ b/src/ntops/torch/leaky_relu.py
@@ -0,0 +1,17 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def leaky_relu(input, negative_slope=0.01, inplace=False):
+    """Apply the leaky_relu kernel to ``input`` with ``negative_slope``.
+
+    With ``inplace`` set, ``input`` itself is used as the output buffer;
+    otherwise a new tensor is allocated with ``torch.empty_like(input)``.
+    """
+    result = input if inplace else torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.leaky_relu.premake, input.ndim)
+    launch(input, negative_slope, result)
+
+    return result
diff --git a/src/ntops/torch/log.py b/src/ntops/torch/log.py
new file mode 100644
index 0000000..a1bfcbd
--- /dev/null
+++ b/src/ntops/torch/log.py
@@ -0,0 +1,16 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def log(input):
+    """Return a new tensor with the log kernel applied to ``input``.
+
+    The result is allocated with ``torch.empty_like(input)``.
+    """
+    result = torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.log.premake, input.ndim)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/log_softmax.py b/src/ntops/torch/log_softmax.py
new file mode 100644
index 0000000..7552bb9
--- /dev/null
+++ b/src/ntops/torch/log_softmax.py
@@ -0,0 +1,16 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def log_softmax(input, dim=-1):
+    """Return a new tensor with the log_softmax kernel applied along ``dim``.
+
+    The result is allocated with ``torch.empty_like(input)``.
+    """
+    result = torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.log_softmax.premake, input.ndim, dim=dim)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/mish.py b/src/ntops/torch/mish.py
new file mode 100644
index 0000000..0112644
--- /dev/null
+++ b/src/ntops/torch/mish.py
@@ -0,0 +1,17 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def mish(input, inplace=False):
+    """Apply the mish kernel to ``input`` and return the result.
+
+    With ``inplace`` set, ``input`` itself is used as the output buffer;
+    otherwise a new tensor is allocated with ``torch.empty_like(input)``.
+    """
+    result = input if inplace else torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.mish.premake, input.ndim)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/prelu.py b/src/ntops/torch/prelu.py
new file mode 100644
index 0000000..d81b7b4
--- /dev/null
+++ b/src/ntops/torch/prelu.py
@@ -0,0 +1,30 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def prelu(input, weight):
+    """Apply the prelu kernel to ``input`` using the slopes in ``weight``.
+
+    The kernel is element-wise and declares ``weight`` with as many
+    dimensions as ``input``, so a ``weight`` of a different shape is first
+    broadcast to the input's shape: a single slope applies everywhere, and
+    several slopes are lined up with dimension 1 (the channel dimension in
+    ``torch.nn.functional.prelu``). A full-shape ``weight`` is passed as is.
+    """
+    if weight.shape != input.shape:
+        broadcast_shape = [1] * input.ndim
+        if weight.numel() > 1 and input.ndim >= 2:
+            broadcast_shape[1] = weight.numel()
+
+        # expand_as returns a view, so the slopes are not copied.
+        weight = weight.reshape(broadcast_shape).expand_as(input)
+
+    output = torch.empty_like(input)
+
+    kernel = _cached_make(ntops.kernels.prelu.premake, input.ndim)
+
+    kernel(input, weight, output)
+
+    return output
diff --git a/src/ntops/torch/reciprocal.py b/src/ntops/torch/reciprocal.py
new file mode 100644
index 0000000..bbcc16c
--- /dev/null
+++ b/src/ntops/torch/reciprocal.py
@@ -0,0 +1,16 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def reciprocal(input):
+    """Return a new tensor with the reciprocal kernel applied to ``input``.
+
+    The result is allocated with ``torch.empty_like(input)``.
+    """
+    result = torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.reciprocal.premake, input.ndim)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/relu6.py b/src/ntops/torch/relu6.py
new file mode 100644
index 0000000..218294c
--- /dev/null
+++ b/src/ntops/torch/relu6.py
@@ -0,0 +1,17 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def relu6(input, inplace=False):
+    """Apply the relu6 kernel to ``input`` and return the result.
+
+    With ``inplace`` set, ``input`` itself is used as the output buffer;
+    otherwise a new tensor is allocated with ``torch.empty_like(input)``.
+    """
+    result = input if inplace else torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.relu6.premake, input.ndim)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/relu_backward.py b/src/ntops/torch/relu_backward.py
new file mode 100644
index 0000000..ce9bd83
--- /dev/null
+++ b/src/ntops/torch/relu_backward.py
@@ -0,0 +1,17 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def relu_backward(grad_output, input):
+    """Return the gradient computed by the relu_backward kernel.
+
+    The result is allocated with ``torch.empty_like(grad_output)``; the
+    kernel is built for ``grad_output.ndim``.
+    """
+    result = torch.empty_like(grad_output)
+    launch = _cached_make(ntops.kernels.relu_backward.premake, grad_output.ndim)
+    launch(grad_output, input, result)
+
+    return result
diff --git a/src/ntops/torch/selu.py b/src/ntops/torch/selu.py
new file mode 100644
index 0000000..ed7b877
--- /dev/null
+++ b/src/ntops/torch/selu.py
@@ -0,0 +1,29 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def selu(input, inplace=False):
+    """Run the ntops ``selu`` kernel on ``input`` with fixed constants.
+
+    Args:
+        input: Tensor read by the kernel; its ``ndim`` is passed to
+            ``_cached_make`` together with the kernel's ``premake``.
+        inplace: When truthy, ``input`` itself is used as the output
+            tensor; otherwise a new tensor is allocated.
+
+    Returns:
+        ``input`` when ``inplace`` is truthy, else a tensor allocated with
+        ``torch.empty_like(input)``; either way it is handed to the kernel
+        as its final (output) argument.
+    """
+    # Fixed alpha and scale values forwarded to the kernel, in that order.
+    selu_alpha = 1.6732632423543772
+    selu_scale = 1.0507009873554805
+
+    result = input if inplace else torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.selu.premake, input.ndim)
+    launch(input, selu_alpha, selu_scale, result)
+
+    return result
diff --git a/src/ntops/torch/sigmoid_backward.py b/src/ntops/torch/sigmoid_backward.py
new file mode 100644
index 0000000..00e7102
--- /dev/null
+++ b/src/ntops/torch/sigmoid_backward.py
@@ -0,0 +1,23 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def sigmoid_backward(grad_output, output):
+    """Run the ntops ``sigmoid_backward`` kernel.
+
+    Args:
+        grad_output: First kernel argument; it also supplies the ``ndim``
+            passed to ``_cached_make`` and the template for the result.
+        output: Second kernel argument, forwarded unchanged.
+
+    Returns:
+        A tensor allocated with ``torch.empty_like(grad_output)`` and
+        handed to the kernel as its final (output) argument.
+    """
+    result = torch.empty_like(grad_output)
+    launch = _cached_make(ntops.kernels.sigmoid_backward.premake, grad_output.ndim)
+    launch(grad_output, output, result)
+
+    return result
diff --git a/src/ntops/torch/softplus.py b/src/ntops/torch/softplus.py
new file mode 100644
index 0000000..73cf60c
--- /dev/null
+++ b/src/ntops/torch/softplus.py
@@ -0,0 +1,24 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def softplus(input, beta=1.0, threshold=20.0):
+    """Run the ntops ``softplus`` kernel on ``input``.
+
+    Args:
+        input: Tensor read by the kernel; its ``ndim`` is passed to
+            ``_cached_make`` together with the kernel's ``premake``.
+        beta: Forwarded to the kernel unchanged (default ``1.0``).
+        threshold: Forwarded to the kernel unchanged (default ``20.0``).
+
+    Returns:
+        A tensor allocated with ``torch.empty_like(input)`` and handed to
+        the kernel as its final (output) argument.
+    """
+    result = torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.softplus.premake, input.ndim)
+    launch(input, beta, threshold, result)
+
+    return result
diff --git a/src/ntops/torch/softsign.py b/src/ntops/torch/softsign.py
new file mode 100644
index 0000000..98e7b52
--- /dev/null
+++ b/src/ntops/torch/softsign.py
@@ -0,0 +1,22 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def softsign(input):
+    """Run the ntops ``softsign`` kernel on ``input``.
+
+    Args:
+        input: Tensor read by the kernel; its ``ndim`` is passed to
+            ``_cached_make`` together with the kernel's ``premake``.
+
+    Returns:
+        A tensor allocated with ``torch.empty_like(input)`` and handed to
+        the kernel as its final (output) argument.
+    """
+    result = torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.softsign.premake, input.ndim)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/sqrt.py b/src/ntops/torch/sqrt.py
new file mode 100644
index 0000000..f6654bd
--- /dev/null
+++ b/src/ntops/torch/sqrt.py
@@ -0,0 +1,22 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def sqrt(input):
+    """Run the ntops ``sqrt`` kernel on ``input``.
+
+    Args:
+        input: Tensor read by the kernel; its ``ndim`` is passed to
+            ``_cached_make`` together with the kernel's ``premake``.
+
+    Returns:
+        A tensor allocated with ``torch.empty_like(input)`` and handed to
+        the kernel as its final (output) argument.
+    """
+    result = torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.sqrt.premake, input.ndim)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/tanh_backward.py b/src/ntops/torch/tanh_backward.py
new file mode 100644
index 0000000..05e76e5
--- /dev/null
+++ b/src/ntops/torch/tanh_backward.py
@@ -0,0 +1,23 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def tanh_backward(grad_output, output):
+    """Run the ntops ``tanh_backward`` kernel.
+
+    Args:
+        grad_output: First kernel argument; it also supplies the ``ndim``
+            passed to ``_cached_make`` and the template for the result.
+        output: Second kernel argument, forwarded unchanged.
+
+    Returns:
+        A tensor allocated with ``torch.empty_like(grad_output)`` and
+        handed to the kernel as its final (output) argument.
+    """
+    result = torch.empty_like(grad_output)
+    launch = _cached_make(ntops.kernels.tanh_backward.premake, grad_output.ndim)
+    launch(grad_output, output, result)
+
+    return result
diff --git a/src/ntops/torch/tanhshrink.py b/src/ntops/torch/tanhshrink.py
new file mode 100644
index 0000000..2701cee
--- /dev/null
+++ b/src/ntops/torch/tanhshrink.py
@@ -0,0 +1,22 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def tanhshrink(input):
+    """Run the ntops ``tanhshrink`` kernel on ``input``.
+
+    Args:
+        input: Tensor read by the kernel; its ``ndim`` is passed to
+            ``_cached_make`` together with the kernel's ``premake``.
+
+    Returns:
+        A tensor allocated with ``torch.empty_like(input)`` and handed to
+        the kernel as its final (output) argument.
+    """
+    result = torch.empty_like(input)
+    launch = _cached_make(ntops.kernels.tanhshrink.premake, input.ndim)
+    launch(input, result)
+
+    return result
diff --git a/src/ntops/torch/where.py b/src/ntops/torch/where.py
new file mode 100644
index 0000000..e889bf5
--- /dev/null
+++ b/src/ntops/torch/where.py
@@ -0,0 +1,25 @@
+import torch
+
+import ntops
+from ntops.torch.utils import _cached_make
+
+
+def where(condition, input, other):
+    """Run the ntops ``where`` kernel on ``condition``, ``input`` and ``other``.
+
+    Args:
+        condition: Converted with ``.to(torch.int8)`` before reaching the kernel.
+        input: Tensor read by the kernel; its ``ndim`` is passed to
+            ``_cached_make`` and it is the template for the result.
+        other: Forwarded to the kernel unchanged.
+
+    Returns:
+        A tensor allocated with ``torch.empty_like(input)`` and handed to
+        the kernel as its final (output) argument.
+    """
+    result = torch.empty_like(input)
+    mask = condition.to(torch.int8)
+    launch = _cached_make(ntops.kernels.where.premake, input.ndim)
+    launch(mask, input, other, result)
+
+    return result
diff --git a/tests/test_cast.py b/tests/test_cast.py
new file mode 100644
index 0000000..a7863eb
--- /dev/null
+++ b/tests/test_cast.py
@@ -0,0 +1,18 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_cast(shape, dtype, device, rtol, atol):
+    """Check ntops cast to float32 against ``Tensor.to(torch.float32)``."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.cast(x, torch.float32)
+    expected = x.to(torch.float32)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_ceil.py b/tests/test_ceil.py
new file mode 100644
index 0000000..4c020b1
--- /dev/null
+++ b/tests/test_ceil.py
@@ -0,0 +1,18 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_ceil(shape, dtype, device, rtol, atol):
+    """Check ntops ceil against ``torch.ceil`` on normally distributed inputs."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.ceil(x)
+    expected = torch.ceil(x)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_elu.py b/tests/test_elu.py
new file mode 100644
index 0000000..256b597
--- /dev/null
+++ b/tests/test_elu.py
@@ -0,0 +1,24 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_elu(shape, inplace, dtype, device, rtol, atol):
+    """Check ntops elu against F.elu in both inplace modes."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # In-place ops overwrite and return their argument, so the reference gets its
+    # own copy; sharing ``input`` would make both results the same tensor and the
+    # comparison below vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.elu(input, inplace=inplace)
+    reference_output = F.elu(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_floor.py b/tests/test_floor.py
new file mode 100644
index 0000000..569c1f1
--- /dev/null
+++ b/tests/test_floor.py
@@ -0,0 +1,18 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_floor(shape, dtype, device, rtol, atol):
+    """Check ntops floor against ``torch.floor`` on normally distributed inputs."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.floor(x)
+    expected = torch.floor(x)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_gelu_backward.py b/tests/test_gelu_backward.py
new file mode 100644
index 0000000..f0370ac
--- /dev/null
+++ b/tests/test_gelu_backward.py
@@ -0,0 +1,23 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_gelu_backward(shape, dtype, device, rtol, atol):
+    """Check ntops gelu_backward against the autograd gradient of F.gelu."""
+    grad = torch.randn(shape, dtype=dtype, device=device)
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.gelu_backward(grad, x)
+
+    # Reference: backpropagate ``grad`` through GELU on a gradient-tracking copy.
+    x_ref = x.clone().requires_grad_(True)
+    torch.nn.functional.gelu(x_ref).backward(grad)
+    expected = x_ref.grad
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_hardswish.py b/tests/test_hardswish.py
new file mode 100644
index 0000000..6f797ba
--- /dev/null
+++ b/tests/test_hardswish.py
@@ -0,0 +1,24 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_hardswish(shape, inplace, dtype, device, rtol, atol):
+    """Check ntops hardswish against F.hardswish in both inplace modes."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # In-place ops overwrite and return their argument, so the reference gets its
+    # own copy; sharing ``input`` would make both results the same tensor and the
+    # comparison below vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.hardswish(input, inplace=inplace)
+    reference_output = F.hardswish(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_hardtanh.py b/tests/test_hardtanh.py
new file mode 100644
index 0000000..c1c9fb4
--- /dev/null
+++ b/tests/test_hardtanh.py
@@ -0,0 +1,24 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_hardtanh(shape, inplace, dtype, device, rtol, atol):
+    """Check ntops hardtanh against F.hardtanh in both inplace modes."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # In-place ops overwrite and return their argument, so the reference gets its
+    # own copy; sharing ``input`` would make both results the same tensor and the
+    # comparison below vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.hardtanh(input, inplace=inplace)
+    reference_output = F.hardtanh(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_leaky_relu.py b/tests/test_leaky_relu.py
new file mode 100644
index 0000000..d01c8e0
--- /dev/null
+++ b/tests/test_leaky_relu.py
@@ -0,0 +1,24 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_leaky_relu(shape, inplace, dtype, device, rtol, atol):
+    """Check ntops leaky_relu against F.leaky_relu in both inplace modes."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # In-place ops overwrite and return their argument, so the reference gets its
+    # own copy; sharing ``input`` would make both results the same tensor and the
+    # comparison below vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.leaky_relu(input, inplace=inplace)
+    reference_output = F.leaky_relu(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_log.py b/tests/test_log.py
new file mode 100644
index 0000000..79355e3
--- /dev/null
+++ b/tests/test_log.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_log(shape, dtype, device, rtol, atol):
+    """Check ntops log against ``torch.log`` on positive inputs."""
+    # ``torch.rand`` samples from [0, 1); the offset keeps inputs away from zero.
+    x = torch.rand(shape, dtype=dtype, device=device) + 0.01
+
+    actual = ntops.torch.log(x)
+    expected = torch.log(x)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_log_softmax.py b/tests/test_log_softmax.py
new file mode 100644
index 0000000..27afc97
--- /dev/null
+++ b/tests/test_log_softmax.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_log_softmax(shape, dtype, device, rtol, atol):
+    """Check ntops log_softmax against ``F.log_softmax`` along the last dim."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.log_softmax(x, dim=-1)
+    expected = F.log_softmax(x, dim=-1)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_mish.py b/tests/test_mish.py
new file mode 100644
index 0000000..0486f97
--- /dev/null
+++ b/tests/test_mish.py
@@ -0,0 +1,24 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_mish(shape, inplace, dtype, device, rtol, atol):
+    """Check ntops mish against F.mish in both inplace modes."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # In-place ops overwrite and return their argument, so the reference gets its
+    # own copy; sharing ``input`` would make both results the same tensor and the
+    # comparison below vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.mish(input, inplace=inplace)
+    reference_output = F.mish(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_reciprocal.py b/tests/test_reciprocal.py
new file mode 100644
index 0000000..03c7ef6
--- /dev/null
+++ b/tests/test_reciprocal.py
@@ -0,0 +1,20 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_reciprocal(shape, dtype, device, rtol, atol):
+    """Check ntops reciprocal against ``torch.reciprocal``."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+    # Push every nonzero sample 0.1 further from zero to avoid huge reciprocals.
+    x = x + 0.1 * torch.sign(x)
+
+    actual = ntops.torch.reciprocal(x)
+    expected = torch.reciprocal(x)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_relu6.py b/tests/test_relu6.py
new file mode 100644
index 0000000..a77cb93
--- /dev/null
+++ b/tests/test_relu6.py
@@ -0,0 +1,24 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_relu6(shape, inplace, dtype, device, rtol, atol):
+    """Check ntops relu6 against F.relu6 in both inplace modes."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # In-place ops overwrite and return their argument, so the reference gets its
+    # own copy; sharing ``input`` would make both results the same tensor and the
+    # comparison below vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.relu6(input, inplace=inplace)
+    reference_output = F.relu6(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_relu_backward.py b/tests/test_relu_backward.py
new file mode 100644
index 0000000..0eeb3cb
--- /dev/null
+++ b/tests/test_relu_backward.py
@@ -0,0 +1,22 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_relu_backward(shape, dtype, device, rtol, atol):
+    """Check ntops relu_backward against a masked copy of the incoming gradient."""
+    grad = torch.randn(shape, dtype=dtype, device=device)
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.relu_backward(grad, x)
+    # NOTE(review): the reference lets the gradient through where ``x >= 0``;
+    # confirm this matches the kernel's treatment of exact zeros.
+    keep = x >= 0
+    expected = torch.where(keep, grad, torch.zeros_like(grad))
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_selu.py b/tests/test_selu.py
new file mode 100644
index 0000000..a900a2c
--- /dev/null
+++ b/tests/test_selu.py
@@ -0,0 +1,24 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_selu(shape, inplace, dtype, device, rtol, atol):
+    """Check ntops selu against F.selu in both inplace modes."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # In-place ops overwrite and return their argument, so the reference gets its
+    # own copy; sharing ``input`` would make both results the same tensor and the
+    # comparison below vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.selu(input, inplace=inplace)
+    reference_output = F.selu(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_sigmoid_backward.py b/tests/test_sigmoid_backward.py
new file mode 100644
index 0000000..080affc
--- /dev/null
+++ b/tests/test_sigmoid_backward.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_sigmoid_backward(shape, dtype, device, rtol, atol):
+    """Check ntops sigmoid_backward against ``grad * y * (1 - y)``."""
+    grad = torch.randn(shape, dtype=dtype, device=device)
+    y = torch.sigmoid(torch.randn(shape, dtype=dtype, device=device))
+
+    actual = ntops.torch.sigmoid_backward(grad, y)
+    expected = grad * y * (1 - y)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_softplus.py b/tests/test_softplus.py
new file mode 100644
index 0000000..0522b82
--- /dev/null
+++ b/tests/test_softplus.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_softplus(shape, dtype, device, rtol, atol):
+    """Check ntops softplus (default arguments) against ``F.softplus``."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.softplus(x)
+    expected = F.softplus(x)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_softsign.py b/tests/test_softsign.py
new file mode 100644
index 0000000..615f5f6
--- /dev/null
+++ b/tests/test_softsign.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_softsign(shape, dtype, device, rtol, atol):
+    """Check ntops softsign against ``F.softsign``."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.softsign(x)
+    expected = F.softsign(x)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_sqrt.py b/tests/test_sqrt.py
new file mode 100644
index 0000000..58094b7
--- /dev/null
+++ b/tests/test_sqrt.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_sqrt(shape, dtype, device, rtol, atol):
+    """Check ntops sqrt against ``torch.sqrt`` on positive inputs."""
+    # ``torch.rand`` samples from [0, 1); the offset keeps inputs away from zero.
+    x = torch.rand(shape, dtype=dtype, device=device) + 0.01
+
+    actual = ntops.torch.sqrt(x)
+    expected = torch.sqrt(x)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_tanh_backward.py b/tests/test_tanh_backward.py
new file mode 100644
index 0000000..a304c43
--- /dev/null
+++ b/tests/test_tanh_backward.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_tanh_backward(shape, dtype, device, rtol, atol):
+    """Check ntops tanh_backward against ``grad * (1 - y * y)``."""
+    grad = torch.randn(shape, dtype=dtype, device=device)
+    y = torch.tanh(torch.randn(shape, dtype=dtype, device=device))
+
+    actual = ntops.torch.tanh_backward(grad, y)
+    expected = grad * (1 - y * y)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_tanhshrink.py b/tests/test_tanhshrink.py
new file mode 100644
index 0000000..6f94bbd
--- /dev/null
+++ b/tests/test_tanhshrink.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_tanhshrink(shape, dtype, device, rtol, atol):
+    """Check ntops tanhshrink against ``F.tanhshrink``."""
+    x = torch.randn(shape, dtype=dtype, device=device)
+
+    actual = ntops.torch.tanhshrink(x)
+    expected = F.tanhshrink(x)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
diff --git a/tests/test_where.py b/tests/test_where.py
new file mode 100644
index 0000000..1c59506
--- /dev/null
+++ b/tests/test_where.py
@@ -0,0 +1,20 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_where(shape, dtype, device, rtol, atol):
+    """Check ntops where against ``torch.where`` using a ``first > 0`` mask."""
+    first = torch.randn(shape, dtype=dtype, device=device)
+    second = torch.randn(shape, dtype=dtype, device=device)
+    mask = first > 0
+
+    actual = ntops.torch.where(mask, first, second)
+    expected = torch.where(mask, first, second)
+
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)