The AI CUDA Engineer 👷

31_ELU

Level 1 • Task 31
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(x: torch.Tensor, alpha: float) -> torch.Tensor:
    """
    Applies ELU activation to the input tensor.

    Args:
        x (torch.Tensor): Input tensor of any shape.
        alpha (float): The alpha parameter for the ELU function.

    Returns:
        torch.Tensor: Output tensor with ELU applied, same shape as input.
    """
    return F.elu(x, alpha=alpha)


class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """

    def __init__(self, alpha):
        """
        Initializes the ELU model.

        Args:
            alpha (float): The alpha parameter for the ELU function.
        """
        super(Model, self).__init__()
        self.alpha = alpha

    def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.
            fn (callable): Function that applies the activation; defaults to module_fn.
        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return fn(x, self.alpha)


batch_size = 16
dim = 16384
alpha = 1.0


def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]


def get_init_inputs():
    return [alpha]
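
A minimal sketch of how this harness is typically exercised, assuming the definitions above are in scope (the call pattern itself is an assumption, not part of the task definition):

model = Model(*get_init_inputs())   # Model(alpha=1.0)
x, = get_inputs()                   # one random (16, 16384) tensor
y = model(x)                        # forward defaults to fn=module_fn, i.e. F.elu(x, alpha)
assert y.shape == x.shape

The original PyTorch reference for the same task, which fixes alpha at construction time instead of threading it through a functional interface, follows.
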
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """
    def __init__(self, alpha: float = 1.0):
        """
        Initializes the ELU model.

        Args:
            alpha (float, optional): The alpha parameter for the ELU function. Defaults to 1.0.
        """
        super(Model, self).__init__()
        self.alpha = alpha
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.

        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return F.elu(x, alpha=self.alpha)

batch_size = 16
dim = 16384

def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]

def get_init_inputs():
    return [1.0]  # Provide alpha value for initialization

Kernel Information

Related Kernels (Level 1, Task 31 • 31_ELU)

Rank  Kernel Name  Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇 31_elu_shared_base 0.01 1.14 4.80
🥇 hybrid_elu_optimized_base 0.01 1.14 4.80
🥇 31_elu_vectorized_base 0.01 1.14 4.80
🥇 vec_shared_elu_base 0.01 1.14 4.80
🥇 31_elu_grid_stride_base_base 0.01 1.14 4.80
🥇 31_elu_vectorized_edit_1 0.01 1.14 4.80
🥇 elu_unroll_kernel_base 0.01 1.14 4.80
🥇 ldg_elu_128_base 0.01 1.14 4.80
9 31_ELU 0.01 0.97 4.12
9 31_elu_aligned_coalesced_base 0.01 0.97 4.12
9 hybrid_elu_base 0.01 0.97 4.12
9 31_elu_optimized_indexing_base 0.01 0.97 4.12
9 31_elu_reduced_divergence_base 0.01 0.97 4.12
9 elu_hybrid_base 0.01 0.97 4.12
9 31_elu_coalesced_base 0.01 0.97 4.12
9 31_elu_shared_mem_base 0.01 0.97 4.12
9 modular_elu_base 0.01 0.97 4.12
9 elu_vec4_shared_base 0.01 0.97 4.12
9 elu_tuned_blocksize_base 0.01 0.97 4.12
9 branchless_elu_vectorized_base 0.01 0.97 4.12
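
The Speedup (Native) and Speedup (Compile) columns are runtime ratios against PyTorch baselines (presumably eager execution and torch.compile, respectively), so a value of 1.14 means the kernel runs roughly 1.14× faster than the eager baseline. The leaderboard does not spell out its timing harness; the sketch below shows one standard way such numbers can be measured with CUDA events (the helper name and iteration counts are assumptions), given some compiled elu callable such as the extension built at the end of this page.

import torch

def time_ms(fn, x, iters=100, warmup=10):
    # Warm up to exclude one-time costs (context init, compilation, caches).
    for _ in range(warmup):
        fn(x)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn(x)
    end.record()
    torch.cuda.synchronize()                 # wait for all queued kernels to finish
    return start.elapsed_time(end) / iters   # average milliseconds per call

# speedup = time_ms(reference, x) / time_ms(candidate, x)

One kernel implementation for this task follows.
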
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// Elementwise ELU: f(x) = x for x > 0, alpha * (exp(x) - 1) otherwise.
__global__ void elu_kernel(const float* x, float* out, float alpha, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        const float val = x[idx];  // load once from global memory
        out[idx] = (val > 0.0f) ? val : alpha * (expf(val) - 1.0f);
    }
}

torch::Tensor elu_cuda(torch::Tensor x, float alpha) {
    CHECK_INPUT(x);

    auto out = torch::empty_like(x);
    int n = x.numel();

    // One thread per element; round the block count up so all n elements are covered.
    const int threads = 256;
    const int blocks = (n + threads - 1) / threads;

    // data_ptr<float>() asserts the dtype, so non-float32 inputs fail loudly here.
    elu_kernel<<<blocks, threads>>>(x.data_ptr<float>(), out.data_ptr<float>(), alpha, n);

    return out;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &elu_cuda, "ELU activation (CUDA)");
}
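
A hedged sketch of compiling and sanity-checking this kernel from Python with torch.utils.cpp_extension.load_inline. The extension name elu_ext is arbitrary, and the PYBIND11_MODULE block is omitted because load_inline generates the bindings itself when functions= is passed.

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load_inline

# The kernel and launcher from above, minus the PYBIND11_MODULE block.
cuda_src = r"""
#include <torch/extension.h>

__global__ void elu_kernel(const float* x, float* out, float alpha, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        const float val = x[idx];
        out[idx] = (val > 0.0f) ? val : alpha * (expf(val) - 1.0f);
    }
}

torch::Tensor elu_cuda(torch::Tensor x, float alpha) {
    TORCH_CHECK(x.is_cuda() && x.is_contiguous(), "x must be a contiguous CUDA float tensor");
    auto out = torch::empty_like(x);
    int n = x.numel();
    const int threads = 256;
    elu_kernel<<<(n + threads - 1) / threads, threads>>>(
        x.data_ptr<float>(), out.data_ptr<float>(), alpha, n);
    return out;
}
"""

ext = load_inline(
    name="elu_ext",
    cpp_sources="torch::Tensor elu_cuda(torch::Tensor x, float alpha);",
    cuda_sources=cuda_src,
    functions=["elu_cuda"],
)

x = torch.randn(16, 16384, device="cuda")
torch.testing.assert_close(ext.elu_cuda(x, 1.0), F.elu(x, alpha=1.0))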