diff --git a/src/ntops/kernels/__init__.py b/src/ntops/kernels/__init__.py index f6934ef..b4c52c0 100644 --- a/src/ntops/kernels/__init__.py +++ b/src/ntops/kernels/__init__.py @@ -7,38 +7,61 @@ bitwise_not, bitwise_or, bmm, + cast, + ceil, clamp, conv2d, cos, div, dropout, + elu, eq, exp, + floor, ge, gelu, + gelu_backward, gt, + hardswish, + hardtanh, isinf, isnan, layer_norm, le, + leaky_relu, + log, + log_softmax, lt, max_pool2d, + mish, mm, mul, ne, neg, pow, + prelu, + reciprocal, relu, + relu6, + relu_backward, rms_norm, rotary_position_embedding, rsqrt, scaled_dot_product_attention, + selu, sigmoid, + sigmoid_backward, silu, sin, softmax, + softplus, + softsign, + sqrt, sub, tanh, + tanh_backward, + tanhshrink, + where, ) __all__ = [ @@ -50,36 +73,59 @@ "bitwise_not", "bitwise_or", "bmm", + "cast", + "ceil", "clamp", "conv2d", "cos", "div", "dropout", + "elu", "eq", "exp", + "floor", "ge", "gelu", + "gelu_backward", "gt", + "hardswish", + "hardtanh", "isinf", "isnan", "layer_norm", "le", + "leaky_relu", + "log", + "log_softmax", "lt", "max_pool2d", + "mish", "mm", "mul", "ne", "neg", "pow", + "prelu", + "reciprocal", "relu", + "relu6", + "relu_backward", "rms_norm", "rotary_position_embedding", "rsqrt", "scaled_dot_product_attention", + "selu", "sigmoid", + "sigmoid_backward", "silu", "sin", "softmax", + "softplus", + "softsign", + "sqrt", "sub", "tanh", + "tanh_backward", + "tanhshrink", + "where", ] diff --git a/src/ntops/kernels/cast.py b/src/ntops/kernels/cast.py new file mode 100644 index 0000000..dc6e64e --- /dev/null +++ b/src/ntops/kernels/cast.py @@ -0,0 +1,21 @@ +import functools + +import ninetoothed +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = input # noqa: F841 + + +def premake(ndim, input_dtype=None, output_dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=input_dtype), 
+ Tensor(ndim, dtype=output_dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/ceil.py b/src/ntops/kernels/ceil.py new file mode 100644 index 0000000..ac4ce92 --- /dev/null +++ b/src/ntops/kernels/ceil.py @@ -0,0 +1,18 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = ntl.cast(ntl.ceil(ntl.cast(input, ntl.float32)), input.dtype) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/elu.py b/src/ntops/kernels/elu.py new file mode 100644 index 0000000..f68c44a --- /dev/null +++ b/src/ntops/kernels/elu.py @@ -0,0 +1,25 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, alpha, output): + input_f32 = ntl.cast(input, ntl.float32) + result = ntl.where(input >= 0, input, ntl.cast(alpha * (ntl.exp(input_f32) - 1), input.dtype)) + output = result # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(0, dtype=ninetoothed.float64), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/floor.py b/src/ntops/kernels/floor.py new file mode 100644 index 0000000..c36c65e --- /dev/null +++ b/src/ntops/kernels/floor.py @@ -0,0 +1,18 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = 
ntl.cast(ntl.floor(ntl.cast(input, ntl.float32)), input.dtype) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/gelu_backward.py b/src/ntops/kernels/gelu_backward.py new file mode 100644 index 0000000..85fd36e --- /dev/null +++ b/src/ntops/kernels/gelu_backward.py @@ -0,0 +1,25 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(grad_output, input, grad_input): + input_f32 = ntl.cast(input, ntl.float32) + cdf = 0.5 * (1.0 + ntl.erf(input_f32 * 0.7071067811865476)) + pdf = ntl.exp(-0.5 * input_f32 * input_f32) * 0.3989422804014327 + grad_input = grad_output * ntl.cast(cdf + input_f32 * pdf, grad_output.dtype) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/hardswish.py b/src/ntops/kernels/hardswish.py new file mode 100644 index 0000000..f75c4bd --- /dev/null +++ b/src/ntops/kernels/hardswish.py @@ -0,0 +1,18 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = input * ntl.clamp(input + 3.0, 0.0, 6.0) / 6.0 # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/hardtanh.py 
b/src/ntops/kernels/hardtanh.py new file mode 100644 index 0000000..0494579 --- /dev/null +++ b/src/ntops/kernels/hardtanh.py @@ -0,0 +1,24 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, min_val, max_val, output): + output = ntl.clamp(input, min_val, max_val) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(0, dtype=ninetoothed.float64), + Tensor(0, dtype=ninetoothed.float64), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/leaky_relu.py b/src/ntops/kernels/leaky_relu.py new file mode 100644 index 0000000..f8038ed --- /dev/null +++ b/src/ntops/kernels/leaky_relu.py @@ -0,0 +1,23 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, negative_slope, output): + output = ntl.where(input >= 0, input, negative_slope * input) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(0, dtype=ninetoothed.float64), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/log.py b/src/ntops/kernels/log.py new file mode 100644 index 0000000..178a27e --- /dev/null +++ b/src/ntops/kernels/log.py @@ -0,0 +1,18 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = ntl.log(ntl.cast(input, ntl.float32)) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ 
= functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/log_softmax.py b/src/ntops/kernels/log_softmax.py new file mode 100644 index 0000000..0eb92a6 --- /dev/null +++ b/src/ntops/kernels/log_softmax.py @@ -0,0 +1,44 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.reduction import arrangement + + +def _exp(x, dtype): + exp_dtype = dtype if dtype != ntl.float16 else ntl.float32 + return ntl.cast(ntl.exp(ntl.cast(x, exp_dtype)), dtype) + + +def application(input, output): + dtype = output.dtype.dtype + prev_max = ntl.cast(float("-inf"), dtype) + denominator = ntl.cast(0, dtype) + + for i in range(input.shape[0]): + input_i = ntl.cast(input[i], dtype) + curr_max = ntl.cast(ntl.maximum(prev_max, ntl.max(input_i)), dtype) + input_max_diff_exp = _exp(input_i - curr_max, dtype) + prev_curr_max_diff_exp = _exp(prev_max - curr_max, dtype) + denominator = denominator * prev_curr_max_diff_exp + ntl.sum(input_max_diff_exp) + prev_max = curr_max + + log_dtype = dtype if dtype != ntl.float16 else ntl.float32 + + for i in range(input.shape[0]): + log_denominator = ntl.log(ntl.cast(denominator, log_dtype)) + output[i] = ntl.cast(ntl.cast(input[i], log_dtype) - ntl.cast(prev_max, log_dtype) - log_denominator, dtype) + + +def premake(ndim, dim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, dim=dim, block_size=block_size) + + tensors = ( + Tensor( + ndim, dtype=dtype, other=float("-inf"), shape_options={"constexpr": True} + ), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/mish.py b/src/ntops/kernels/mish.py new file mode 100644 index 0000000..794be67 --- /dev/null +++ b/src/ntops/kernels/mish.py @@ -0,0 +1,24 @@ +import functools + +import ninetoothed.language as ntl +from 
ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + input_f32 = ntl.cast(input, ntl.float32) + sp = ntl.log(1 + ntl.exp(input_f32)) + # sp >= 0, so exp(-2 * sp) lies in (0, 1]; the old exp(sp) form overflowed to NaN for large inputs. + exp_neg_2sp = ntl.exp(-2 * sp) + tanh_sp = (1 - exp_neg_2sp) / (1 + exp_neg_2sp) + result = ntl.cast(input_f32 * tanh_sp, input.dtype) + output = result # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/prelu.py b/src/ntops/kernels/prelu.py new file mode 100644 index 0000000..41cdaf5 --- /dev/null +++ b/src/ntops/kernels/prelu.py @@ -0,0 +1,22 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, weight, output): + output = ntl.where(input >= 0, input, weight * input) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/reciprocal.py b/src/ntops/kernels/reciprocal.py new file mode 100644 index 0000000..64cbef7 --- /dev/null +++ b/src/ntops/kernels/reciprocal.py @@ -0,0 +1,18 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = ntl.cast(1.0 / ntl.cast(input, ntl.float32), input.dtype) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return
arrangement_, application, tensors diff --git a/src/ntops/kernels/relu6.py b/src/ntops/kernels/relu6.py new file mode 100644 index 0000000..c73364e --- /dev/null +++ b/src/ntops/kernels/relu6.py @@ -0,0 +1,18 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = ntl.clamp(input, 0.0, 6.0) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/relu_backward.py b/src/ntops/kernels/relu_backward.py new file mode 100644 index 0000000..ea52ce2 --- /dev/null +++ b/src/ntops/kernels/relu_backward.py @@ -0,0 +1,23 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(grad_output, input, grad_input): + # ReLU's gradient at exactly zero is zero, so the comparison must be strict. + grad_input = ntl.where(input > 0, grad_output, 0.0) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/selu.py b/src/ntops/kernels/selu.py new file mode 100644 index 0000000..91ce384 --- /dev/null +++ b/src/ntops/kernels/selu.py @@ -0,0 +1,26 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, alpha, scale, output): + input_f32 = ntl.cast(input, ntl.float32) + result = scale * ntl.where(input > 0, input, ntl.cast(alpha * (ntl.exp(input_f32) - 1), input.dtype)) + output = result # noqa: F841 + + +def
premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(0, dtype=ninetoothed.float64), + Tensor(0, dtype=ninetoothed.float64), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/sigmoid_backward.py b/src/ntops/kernels/sigmoid_backward.py new file mode 100644 index 0000000..aab0944 --- /dev/null +++ b/src/ntops/kernels/sigmoid_backward.py @@ -0,0 +1,22 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(grad_output, output, grad_input): + grad_input = grad_output * output * (1 - output) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/softplus.py b/src/ntops/kernels/softplus.py new file mode 100644 index 0000000..67222db --- /dev/null +++ b/src/ntops/kernels/softplus.py @@ -0,0 +1,27 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, beta, threshold, output): + input_f32 = ntl.cast(input, ntl.float32) + softplus_val = ntl.log(1 + ntl.exp(input_f32 * beta)) / beta + result = ntl.where(input * beta > threshold, input, ntl.cast(softplus_val, input.dtype)) + output = result # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(0, dtype=ninetoothed.float64), + Tensor(0, dtype=ninetoothed.float64), + Tensor(ndim, dtype=dtype), + ) + + return 
arrangement_, application, tensors diff --git a/src/ntops/kernels/softsign.py b/src/ntops/kernels/softsign.py new file mode 100644 index 0000000..d2862d0 --- /dev/null +++ b/src/ntops/kernels/softsign.py @@ -0,0 +1,18 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = input / (1 + ntl.abs(input)) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/sqrt.py b/src/ntops/kernels/sqrt.py new file mode 100644 index 0000000..b929292 --- /dev/null +++ b/src/ntops/kernels/sqrt.py @@ -0,0 +1,18 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + output = ntl.sqrt(ntl.cast(input, ntl.float32)) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/tanh_backward.py b/src/ntops/kernels/tanh_backward.py new file mode 100644 index 0000000..079ac82 --- /dev/null +++ b/src/ntops/kernels/tanh_backward.py @@ -0,0 +1,22 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(grad_output, output, grad_input): + grad_input = grad_output * (1 - output * output) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=dtype), + Tensor(ndim, 
dtype=dtype), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/tanhshrink.py b/src/ntops/kernels/tanhshrink.py new file mode 100644 index 0000000..37fcc3e --- /dev/null +++ b/src/ntops/kernels/tanhshrink.py @@ -0,0 +1,23 @@ +import functools + +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): + input_f32 = ntl.cast(input, ntl.float32) + # exp(-2 * |x|) lies in (0, 1], so a large |x| no longer produces inf / inf = NaN. + exp_neg_2abs = ntl.exp(-2 * ntl.abs(input_f32)) + tanh_abs = (1 - exp_neg_2abs) / (1 + exp_neg_2abs) + tanh_val = ntl.where(input_f32 < 0, -tanh_abs, tanh_abs) + output = ntl.cast(input_f32 - tanh_val, input.dtype) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/where.py b/src/ntops/kernels/where.py new file mode 100644 index 0000000..e13e682 --- /dev/null +++ b/src/ntops/kernels/where.py @@ -0,0 +1,24 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(condition, input, other, output): + output = ntl.where(condition != 0, input, other) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = ( + Tensor(ndim, dtype=ninetoothed.int8), + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/torch/__init__.py b/src/ntops/torch/__init__.py index 82fc596..828f5dc 100644 --- a/src/ntops/torch/__init__.py +++ b/src/ntops/torch/__init__.py @@ -6,39 +6,62 @@ from ntops.torch.bitwise_not import bitwise_not from ntops.torch.bitwise_or
import bitwise_or from ntops.torch.bmm import bmm +from ntops.torch.cast import cast +from ntops.torch.ceil import ceil from ntops.torch.clamp import clamp from ntops.torch.conv2d import conv2d from ntops.torch.cos import cos from ntops.torch.div import div from ntops.torch.dropout import dropout +from ntops.torch.elu import elu from ntops.torch.eq import eq from ntops.torch.exp import exp +from ntops.torch.floor import floor from ntops.torch.ge import ge from ntops.torch.gelu import gelu +from ntops.torch.gelu_backward import gelu_backward from ntops.torch.gt import gt +from ntops.torch.hardswish import hardswish +from ntops.torch.hardtanh import hardtanh from ntops.torch.isinf import isinf from ntops.torch.isnan import isnan from ntops.torch.layer_norm import layer_norm from ntops.torch.le import le +from ntops.torch.leaky_relu import leaky_relu +from ntops.torch.log import log +from ntops.torch.log_softmax import log_softmax from ntops.torch.lt import lt from ntops.torch.matmul import matmul from ntops.torch.max_pool2d import max_pool2d +from ntops.torch.mish import mish from ntops.torch.mm import mm from ntops.torch.mul import mul from ntops.torch.ne import ne from ntops.torch.neg import neg from ntops.torch.pow import pow +from ntops.torch.prelu import prelu +from ntops.torch.reciprocal import reciprocal from ntops.torch.relu import relu +from ntops.torch.relu6 import relu6 +from ntops.torch.relu_backward import relu_backward from ntops.torch.rms_norm import rms_norm from ntops.torch.rotary_position_embedding import rotary_position_embedding from ntops.torch.rsqrt import rsqrt from ntops.torch.scaled_dot_product_attention import scaled_dot_product_attention +from ntops.torch.selu import selu from ntops.torch.sigmoid import sigmoid +from ntops.torch.sigmoid_backward import sigmoid_backward from ntops.torch.silu import silu from ntops.torch.sin import sin from ntops.torch.softmax import softmax +from ntops.torch.softplus import softplus +from 
ntops.torch.softsign import softsign +from ntops.torch.sqrt import sqrt from ntops.torch.sub import sub from ntops.torch.tanh import tanh +from ntops.torch.tanh_backward import tanh_backward +from ntops.torch.tanhshrink import tanhshrink +from ntops.torch.where import where __all__ = [ "abs", @@ -49,37 +72,60 @@ "bitwise_not", "bitwise_or", "bmm", + "cast", + "ceil", "clamp", "conv2d", "cos", "div", "dropout", + "elu", "eq", "exp", + "floor", "ge", "gelu", + "gelu_backward", "gt", + "hardswish", + "hardtanh", "isinf", "isnan", "layer_norm", "le", + "leaky_relu", + "log", + "log_softmax", "lt", "matmul", "max_pool2d", + "mish", "mm", "mul", "ne", "neg", "pow", + "prelu", + "reciprocal", "relu", + "relu6", + "relu_backward", "rms_norm", "rotary_position_embedding", "rsqrt", "scaled_dot_product_attention", + "selu", "sigmoid", + "sigmoid_backward", "silu", "sin", "softmax", + "softplus", + "softsign", + "sqrt", "sub", "tanh", + "tanh_backward", + "tanhshrink", + "where", ] diff --git a/src/ntops/torch/cast.py b/src/ntops/torch/cast.py new file mode 100644 index 0000000..69473d6 --- /dev/null +++ b/src/ntops/torch/cast.py @@ -0,0 +1,19 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def cast(input, dtype): + output = torch.empty_like(input, dtype=dtype) + + kernel = _cached_make( + ntops.kernels.cast.premake, + input.ndim, + input_dtype=input.dtype, + output_dtype=dtype, + ) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/ceil.py b/src/ntops/torch/ceil.py new file mode 100644 index 0000000..ebb08d3 --- /dev/null +++ b/src/ntops/torch/ceil.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def ceil(input): + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.ceil.premake, input.ndim) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/elu.py b/src/ntops/torch/elu.py new file mode 100644 index 0000000..be7f497 --- /dev/null 
+++ b/src/ntops/torch/elu.py @@ -0,0 +1,17 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def elu(input, alpha=1.0, inplace=False): + if inplace: + output = input + else: + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.elu.premake, input.ndim) + + kernel(input, alpha, output) + + return output diff --git a/src/ntops/torch/floor.py b/src/ntops/torch/floor.py new file mode 100644 index 0000000..410688f --- /dev/null +++ b/src/ntops/torch/floor.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def floor(input): + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.floor.premake, input.ndim) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/gelu_backward.py b/src/ntops/torch/gelu_backward.py new file mode 100644 index 0000000..8c8f31e --- /dev/null +++ b/src/ntops/torch/gelu_backward.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def gelu_backward(grad_output, input): + grad_input = torch.empty_like(grad_output) + + kernel = _cached_make(ntops.kernels.gelu_backward.premake, grad_output.ndim) + + kernel(grad_output, input, grad_input) + + return grad_input diff --git a/src/ntops/torch/hardswish.py b/src/ntops/torch/hardswish.py new file mode 100644 index 0000000..fa3b5a7 --- /dev/null +++ b/src/ntops/torch/hardswish.py @@ -0,0 +1,17 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def hardswish(input, inplace=False): + if inplace: + output = input + else: + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.hardswish.premake, input.ndim) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/hardtanh.py b/src/ntops/torch/hardtanh.py new file mode 100644 index 0000000..419324c --- /dev/null +++ b/src/ntops/torch/hardtanh.py @@ -0,0 +1,17 @@ +import torch + +import ntops +from ntops.torch.utils 
import _cached_make + + +def hardtanh(input, min_val=-1.0, max_val=1.0, inplace=False): + if inplace: + output = input + else: + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.hardtanh.premake, input.ndim) + + kernel(input, min_val, max_val, output) + + return output diff --git a/src/ntops/torch/leaky_relu.py b/src/ntops/torch/leaky_relu.py new file mode 100644 index 0000000..c647dd7 --- /dev/null +++ b/src/ntops/torch/leaky_relu.py @@ -0,0 +1,17 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def leaky_relu(input, negative_slope=0.01, inplace=False): + if inplace: + output = input + else: + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.leaky_relu.premake, input.ndim) + + kernel(input, negative_slope, output) + + return output diff --git a/src/ntops/torch/log.py b/src/ntops/torch/log.py new file mode 100644 index 0000000..a1bfcbd --- /dev/null +++ b/src/ntops/torch/log.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def log(input): + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.log.premake, input.ndim) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/log_softmax.py b/src/ntops/torch/log_softmax.py new file mode 100644 index 0000000..7552bb9 --- /dev/null +++ b/src/ntops/torch/log_softmax.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def log_softmax(input, dim=-1): + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.log_softmax.premake, input.ndim, dim=dim) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/mish.py b/src/ntops/torch/mish.py new file mode 100644 index 0000000..0112644 --- /dev/null +++ b/src/ntops/torch/mish.py @@ -0,0 +1,17 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def mish(input, inplace=False): + if inplace: + output = input 
+ else: + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.mish.premake, input.ndim) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/prelu.py b/src/ntops/torch/prelu.py new file mode 100644 index 0000000..d81b7b4 --- /dev/null +++ b/src/ntops/torch/prelu.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def prelu(input, weight): + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.prelu.premake, input.ndim) + + kernel(input, weight, output) + + return output diff --git a/src/ntops/torch/reciprocal.py b/src/ntops/torch/reciprocal.py new file mode 100644 index 0000000..bbcc16c --- /dev/null +++ b/src/ntops/torch/reciprocal.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def reciprocal(input): + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.reciprocal.premake, input.ndim) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/relu6.py b/src/ntops/torch/relu6.py new file mode 100644 index 0000000..218294c --- /dev/null +++ b/src/ntops/torch/relu6.py @@ -0,0 +1,17 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def relu6(input, inplace=False): + if inplace: + output = input + else: + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.relu6.premake, input.ndim) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/relu_backward.py b/src/ntops/torch/relu_backward.py new file mode 100644 index 0000000..ce9bd83 --- /dev/null +++ b/src/ntops/torch/relu_backward.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def relu_backward(grad_output, input): + grad_input = torch.empty_like(grad_output) + + kernel = _cached_make(ntops.kernels.relu_backward.premake, grad_output.ndim) + + kernel(grad_output, input, grad_input) + + return grad_input diff 
--git a/src/ntops/torch/selu.py b/src/ntops/torch/selu.py new file mode 100644 index 0000000..ed7b877 --- /dev/null +++ b/src/ntops/torch/selu.py @@ -0,0 +1,20 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def selu(input, inplace=False): + alpha = 1.6732632423543772 + scale = 1.0507009873554805 + + if inplace: + output = input + else: + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.selu.premake, input.ndim) + + kernel(input, alpha, scale, output) + + return output diff --git a/src/ntops/torch/sigmoid_backward.py b/src/ntops/torch/sigmoid_backward.py new file mode 100644 index 0000000..00e7102 --- /dev/null +++ b/src/ntops/torch/sigmoid_backward.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def sigmoid_backward(grad_output, output): + grad_input = torch.empty_like(grad_output) + + kernel = _cached_make(ntops.kernels.sigmoid_backward.premake, grad_output.ndim) + + kernel(grad_output, output, grad_input) + + return grad_input diff --git a/src/ntops/torch/softplus.py b/src/ntops/torch/softplus.py new file mode 100644 index 0000000..73cf60c --- /dev/null +++ b/src/ntops/torch/softplus.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def softplus(input, beta=1.0, threshold=20.0): + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.softplus.premake, input.ndim) + + kernel(input, beta, threshold, output) + + return output diff --git a/src/ntops/torch/softsign.py b/src/ntops/torch/softsign.py new file mode 100644 index 0000000..98e7b52 --- /dev/null +++ b/src/ntops/torch/softsign.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def softsign(input): + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.softsign.premake, input.ndim) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/sqrt.py 
b/src/ntops/torch/sqrt.py new file mode 100644 index 0000000..f6654bd --- /dev/null +++ b/src/ntops/torch/sqrt.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def sqrt(input): + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.sqrt.premake, input.ndim) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/tanh_backward.py b/src/ntops/torch/tanh_backward.py new file mode 100644 index 0000000..05e76e5 --- /dev/null +++ b/src/ntops/torch/tanh_backward.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def tanh_backward(grad_output, output): + grad_input = torch.empty_like(grad_output) + + kernel = _cached_make(ntops.kernels.tanh_backward.premake, grad_output.ndim) + + kernel(grad_output, output, grad_input) + + return grad_input diff --git a/src/ntops/torch/tanhshrink.py b/src/ntops/torch/tanhshrink.py new file mode 100644 index 0000000..2701cee --- /dev/null +++ b/src/ntops/torch/tanhshrink.py @@ -0,0 +1,14 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def tanhshrink(input): + output = torch.empty_like(input) + + kernel = _cached_make(ntops.kernels.tanhshrink.premake, input.ndim) + + kernel(input, output) + + return output diff --git a/src/ntops/torch/where.py b/src/ntops/torch/where.py new file mode 100644 index 0000000..e889bf5 --- /dev/null +++ b/src/ntops/torch/where.py @@ -0,0 +1,15 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +def where(condition, input, other): + output = torch.empty_like(input) + condition_int8 = condition.to(torch.int8) + + kernel = _cached_make(ntops.kernels.where.premake, input.ndim) + + kernel(condition_int8, input, other, output) + + return output diff --git a/tests/test_cast.py b/tests/test_cast.py new file mode 100644 index 0000000..a7863eb --- /dev/null +++ b/tests/test_cast.py @@ -0,0 +1,17 @@ +import pytest +import torch + 
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_cast(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.cast` to float32 against `Tensor.to`."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+
+    ninetoothed_output = ntops.torch.cast(input, torch.float32)
+    reference_output = input.to(torch.float32)
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_ceil.py b/tests/test_ceil.py
new file mode 100644
index 0000000..4c020b1
--- /dev/null
+++ b/tests/test_ceil.py
@@ -0,0 +1,18 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_ceil(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.ceil` against `torch.ceil` on random input."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+
+    ninetoothed_output = ntops.torch.ceil(input)
+    reference_output = torch.ceil(input)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_elu.py b/tests/test_elu.py
new file mode 100644
index 0000000..256b597
--- /dev/null
+++ b/tests/test_elu.py
@@ -0,0 +1,23 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_elu(shape, inplace, dtype, device, rtol, atol):
+    """Compare `ntops.torch.elu` with `F.elu` (both `inplace` modes)."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # Clone for the reference: with `inplace=True` the op under test may
+    # overwrite `input`, which would otherwise make the comparison vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.elu(input, inplace=inplace)
+    reference_output = F.elu(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_floor.py b/tests/test_floor.py
new file
mode 100644
index 0000000..569c1f1
--- /dev/null
+++ b/tests/test_floor.py
@@ -0,0 +1,18 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_floor(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.floor` against `torch.floor` on random input."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+
+    ninetoothed_output = ntops.torch.floor(input)
+    reference_output = torch.floor(input)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_gelu_backward.py b/tests/test_gelu_backward.py
new file mode 100644
index 0000000..f0370ac
--- /dev/null
+++ b/tests/test_gelu_backward.py
@@ -0,0 +1,23 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_gelu_backward(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.gelu_backward` against the autograd gradient of GELU."""
+    grad_output = torch.randn(shape, dtype=dtype, device=device)
+    input = torch.randn(shape, dtype=dtype, device=device)
+
+    ninetoothed_output = ntops.torch.gelu_backward(grad_output, input)
+
+    # Use PyTorch autograd for reference
+    input_ref = input.clone().requires_grad_(True)
+    torch.nn.functional.gelu(input_ref).backward(grad_output)
+    reference_output = input_ref.grad
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_hardswish.py b/tests/test_hardswish.py
new file mode 100644
index 0000000..6f797ba
--- /dev/null
+++ b/tests/test_hardswish.py
@@ -0,0 +1,23 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_hardswish(shape, inplace, dtype, device, rtol, atol):
+    """Compare `ntops.torch.hardswish` with `F.hardswish` (both `inplace` modes)."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # Clone for the reference: with `inplace=True` the op under test may
+    # overwrite `input`, which would otherwise make the comparison vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.hardswish(input, inplace=inplace)
+    reference_output = F.hardswish(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_hardtanh.py b/tests/test_hardtanh.py
new file mode 100644
index 0000000..c1c9fb4
--- /dev/null
+++ b/tests/test_hardtanh.py
@@ -0,0 +1,23 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_hardtanh(shape, inplace, dtype, device, rtol, atol):
+    """Compare `ntops.torch.hardtanh` with `F.hardtanh` (both `inplace` modes)."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # Clone for the reference: with `inplace=True` the op under test may
+    # overwrite `input`, which would otherwise make the comparison vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.hardtanh(input, inplace=inplace)
+    reference_output = F.hardtanh(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_leaky_relu.py b/tests/test_leaky_relu.py
new file mode 100644
index 0000000..d01c8e0
--- /dev/null
+++ b/tests/test_leaky_relu.py
@@ -0,0 +1,23 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_leaky_relu(shape, inplace, dtype, device, rtol, atol):
+    """Compare `ntops.torch.leaky_relu` with `F.leaky_relu` (both `inplace` modes)."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # Clone for the reference: with `inplace=True` the op under test may
+    # overwrite `input`, which would otherwise make the comparison vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.leaky_relu(input, inplace=inplace)
+    reference_output = F.leaky_relu(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_log.py b/tests/test_log.py
new file mode 100644
index 0000000..79355e3
--- /dev/null
+++ b/tests/test_log.py
@@ -0,0 +1,18 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_log(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.log` against `torch.log` on strictly positive input."""
+    input = torch.rand(shape, dtype=dtype, device=device) + 0.01
+
+    ninetoothed_output = ntops.torch.log(input)
+    reference_output = torch.log(input)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_log_softmax.py b/tests/test_log_softmax.py
new file mode 100644
index 0000000..27afc97
--- /dev/null
+++ b/tests/test_log_softmax.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_log_softmax(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.log_softmax` against `F.log_softmax` along the last dim."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+
+    ninetoothed_output = ntops.torch.log_softmax(input, dim=-1)
+    reference_output = F.log_softmax(input, dim=-1)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_mish.py b/tests/test_mish.py
new file mode 100644
index 0000000..0486f97
--- /dev/null
+++ b/tests/test_mish.py
@@ -0,0 +1,23 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_mish(shape, inplace, dtype, device, rtol, atol):
+    """Compare `ntops.torch.mish` with `F.mish` (both `inplace` modes)."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # Clone for the reference: with `inplace=True` the op under test may
+    # overwrite `input`, which would otherwise make the comparison vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.mish(input, inplace=inplace)
+    reference_output = F.mish(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_reciprocal.py b/tests/test_reciprocal.py
new file mode 100644
index 0000000..03c7ef6
--- /dev/null
+++ b/tests/test_reciprocal.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_reciprocal(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.reciprocal` against `torch.reciprocal` away from zero."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    input = input + 0.1 * torch.sign(input)
+
+    ninetoothed_output = ntops.torch.reciprocal(input)
+    reference_output = torch.reciprocal(input)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_relu6.py b/tests/test_relu6.py
new file mode 100644
index 0000000..a77cb93
--- /dev/null
+++ b/tests/test_relu6.py
@@ -0,0 +1,23 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_relu6(shape, inplace, dtype, device, rtol, atol):
+    """Compare `ntops.torch.relu6` with `F.relu6` (both `inplace` modes)."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # Clone for the reference: with `inplace=True` the op under test may
+    # overwrite `input`, which would otherwise make the comparison vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.relu6(input, inplace=inplace)
+    reference_output = F.relu6(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_relu_backward.py b/tests/test_relu_backward.py
new file mode 100644
index 0000000..0eeb3cb
--- /dev/null
+++ b/tests/test_relu_backward.py
@@ -0,0 +1,21 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_relu_backward(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.relu_backward` against a `torch.where` reference."""
+    grad_output = torch.randn(shape, dtype=dtype, device=device)
+    input = torch.randn(shape, dtype=dtype, device=device)
+
+    ninetoothed_output = ntops.torch.relu_backward(grad_output, input)
+    # NOTE(review): this reference passes the gradient through at input == 0,
+    # while PyTorch's own ReLU gradient is zero there; confirm the kernel's choice.
+    reference_output = torch.where(input >= 0, grad_output, torch.zeros_like(grad_output))
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_selu.py b/tests/test_selu.py
new file mode 100644
index 0000000..a900a2c
--- /dev/null
+++ b/tests/test_selu.py
@@ -0,0 +1,23 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize("inplace", (False, True))
+@pytest.mark.parametrize(*generate_arguments())
+def test_selu(shape, inplace, dtype, device, rtol, atol):
+    """Compare `ntops.torch.selu` with `F.selu` (both `inplace` modes)."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    # Clone for the reference: with `inplace=True` the op under test may
+    # overwrite `input`, which would otherwise make the comparison vacuous.
+    reference_input = input.clone()
+
+    ninetoothed_output = ntops.torch.selu(input, inplace=inplace)
+    reference_output = F.selu(reference_input, inplace=inplace)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_sigmoid_backward.py b/tests/test_sigmoid_backward.py
new file mode 100644
index 0000000..080affc
--- /dev/null
+++ b/tests/test_sigmoid_backward.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_sigmoid_backward(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.sigmoid_backward` against the closed-form gradient."""
+    grad_output = torch.randn(shape, dtype=dtype, device=device)
+    output = torch.sigmoid(torch.randn(shape, dtype=dtype, device=device))
+
+    ninetoothed_output = ntops.torch.sigmoid_backward(grad_output, output)
+    reference_output = grad_output * output * (1 - output)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_softplus.py b/tests/test_softplus.py
new file mode 100644
index 0000000..0522b82
--- /dev/null
+++ b/tests/test_softplus.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_softplus(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.softplus` against `F.softplus` on random input."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+
+    ninetoothed_output = ntops.torch.softplus(input)
+    reference_output = F.softplus(input)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_softsign.py b/tests/test_softsign.py
new file mode 100644
index 0000000..615f5f6
--- /dev/null
+++ b/tests/test_softsign.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_softsign(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.softsign` against `F.softsign` on random input."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+
+    ninetoothed_output = ntops.torch.softsign(input)
+    reference_output = F.softsign(input)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_sqrt.py b/tests/test_sqrt.py
new file mode 100644
index 0000000..58094b7
--- /dev/null
+++ b/tests/test_sqrt.py
@@ -0,0 +1,18 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_sqrt(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.sqrt` against `torch.sqrt` on strictly positive input."""
+    input = torch.rand(shape, dtype=dtype, device=device) + 0.01
+
+    ninetoothed_output = ntops.torch.sqrt(input)
+    reference_output = torch.sqrt(input)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_tanh_backward.py b/tests/test_tanh_backward.py
new file mode 100644
index 0000000..a304c43
--- /dev/null
+++ b/tests/test_tanh_backward.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_tanh_backward(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.tanh_backward` against the closed-form gradient."""
+    grad_output = torch.randn(shape, dtype=dtype, device=device)
+    output = torch.tanh(torch.randn(shape, dtype=dtype, device=device))
+
+    ninetoothed_output = ntops.torch.tanh_backward(grad_output, output)
+    reference_output = grad_output * (1 - output * output)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_tanhshrink.py b/tests/test_tanhshrink.py
new file mode 100644
index 0000000..6f94bbd
--- /dev/null
+++ b/tests/test_tanhshrink.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_tanhshrink(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.tanhshrink` against `F.tanhshrink` on random input."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+
+    ninetoothed_output = ntops.torch.tanhshrink(input)
+    reference_output = F.tanhshrink(input)
+
+    assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol)
diff --git a/tests/test_where.py b/tests/test_where.py
new file mode 100644
index 0000000..1c59506
--- /dev/null
+++ b/tests/test_where.py
@@ -0,0 +1,19 @@
+import pytest
+import torch
+
+import ntops
+from tests.skippers import skip_if_cuda_not_available
+from tests.utils import generate_arguments
+
+
+@skip_if_cuda_not_available
+@pytest.mark.parametrize(*generate_arguments())
+def test_where(shape, dtype, device, rtol, atol):
+    """Check `ntops.torch.where` against `torch.where` for the mask `input > 0`."""
+    input = torch.randn(shape, dtype=dtype, device=device)
+    other = torch.randn(shape, dtype=dtype, device=device)
+    mask = input > 0
+
+    actual = ntops.torch.where(mask, input, other)
+    expected = torch.where(mask, input, other)
+    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)