← Back to Leaderboard

The AI CUDA Engineer 👷

31_ELUvec_shared_elu_base

Level 1 • Task 31
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(x: torch.Tensor, alpha: float) -> torch.Tensor:
    """
    Applies ELU activation to the input tensor.

    Args:
        x (torch.Tensor): Input tensor of any shape.
        alpha (float): The alpha parameter for the ELU function.

    Returns:
        torch.Tensor: Output tensor with ELU applied, same shape as input.
    """
    return F.elu(x, alpha=alpha)


class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """

    def __init__(self, alpha):
        """
        Initializes the ELU model.

        Args:
            alpha (float): The alpha parameter for the ELU function.
        """
        super(Model, self).__init__()
        self.alpha = alpha

    def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.

        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return fn(x, self.alpha)


batch_size = 16
dim = 16384
alpha = 1.0


def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]


def get_init_inputs():
    return [alpha]
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """
    def __init__(self, alpha: float = 1.0):
        """
        Initializes the ELU model.

        Args:
            alpha (float, optional): The alpha parameter for the ELU function. Defaults to 1.0.
        """
        super(Model, self).__init__()
        self.alpha = alpha
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.

        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return F.elu(x, alpha=self.alpha)

batch_size = 16
dim = 16384

def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]

def get_init_inputs():
    return [1.0]  # Provide alpha value for initialization

Kernel Information

Related Kernels (Level 1, Task 31 • 31_ELU)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 31_elu_shared_base 0.01 1.14 4.80
🥇 hybrid_elu_optimized_base 0.01 1.14 4.80
🥇 31_elu_vectorized_base 0.01 1.14 4.80
🥇 vec_shared_elu_base 0.01 1.14 4.80
🥇 31_elu_grid_stride_base_base 0.01 1.14 4.80
🥇 31_elu_vectorized_edit_1 0.01 1.14 4.80
🥇 elu_unroll_kernel_base 0.01 1.14 4.80
🥇 ldg_elu_128_base 0.01 1.14 4.80
9 31_ELU 0.01 0.97 4.12
9 31_elu_aligned_coalesced_base 0.01 0.97 4.12
9 hybrid_elu_base 0.01 0.97 4.12
9 31_elu_optimized_indexing_base 0.01 0.97 4.12
9 31_elu_reduced_divergence_base 0.01 0.97 4.12
9 elu_hybrid_base 0.01 0.97 4.12
9 31_elu_coalesced_base 0.01 0.97 4.12
9 31_elu_shared_mem_base 0.01 0.97 4.12
9 modular_elu_base 0.01 0.97 4.12
9 elu_vec4_shared_base 0.01 0.97 4.12
9 elu_tuned_blocksize_base 0.01 0.97 4.12
9 branchless_elu_vectorized_base 0.01 0.97 4.12
/*
Combined CUDA kernel for ELU activation using both vectorized load/store (float4) and shared memory tiling.
This kernel processes the bulk of the data in groups of 4 floats for improved memory throughput, while
handling any leftover elements (if the total number of elements is not a multiple of 4) separately.
*/

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// Kernel: Process vectorized data using shared memory tiling
// Each thread loads a float4 element into shared memory, computes ELU, and writes back.
__global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {
    extern __shared__ float4 tile[]; // Shared memory allocated dynamically
    int tid = threadIdx.x;
    int globalIdx = blockIdx.x * blockDim.x + tid;

    // Load vectorized data from global memory into shared memory
    if (globalIdx < n4) {
        tile[tid] = x[globalIdx];
    }
    __syncthreads();

    // Compute the ELU activation on the tile
    if (globalIdx < n4) {
        float4 val = tile[tid];
        float4 result;
        result.x = (val.x > 0.f) ? val.x : alpha * (expf(val.x) - 1.f);
        result.y = (val.y > 0.f) ? val.y : alpha * (expf(val.y) - 1.f);
        result.z = (val.z > 0.f) ? val.z : alpha * (expf(val.z) - 1.f);
        result.w = (val.w > 0.f) ? val.w : alpha * (expf(val.w) - 1.f);
        tile[tid] = result;  // Write result back into shared memory
    }
    __syncthreads();

    // Write results from shared memory back to global memory
    if (globalIdx < n4) {
        out[globalIdx] = tile[tid];
    }
}

// Kernel: Process the tail elements that are not a multiple of 4
__global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {
    int globalIdx = blockIdx.x * blockDim.x + threadIdx.x + offset;
    if (globalIdx < n) {
        float val = x[globalIdx];
        out[globalIdx] = (val > 0.f) ? val : alpha * (expf(val) - 1.f);
    }
}

// Host interface function
// It dispatches two kernel calls: one for the vectorized portion and one for any remaining tail elements.
torch::Tensor elu_cuda_combined(torch::Tensor x, float alpha) {
    CHECK_INPUT(x);

    auto out = torch::empty_like(x);
    int n = x.numel();

    // Determine the number of float4 groups
    int n4 = n / 4;            // number of vectorizable groups
    int remainder = n % 4;       // remaining elements

    const int threads = 256;
    int blocks_vec = (n4 + threads - 1) / threads;
    size_t sharedMemSize = threads * sizeof(float4);

    // If there is at least one vectorized element, process it using the shared memory kernel
    if (n4 > 0) {
        elu_kernel_vec_shared<<<blocks_vec, threads, sharedMemSize>>>(
            reinterpret_cast<const float4*>(x.data_ptr<float>()),
            reinterpret_cast<float4*>(out.data_ptr<float>()),
            alpha,
            n4
        );
    }

    // Process any remaining tail elements with a scalar kernel
    if (remainder > 0) {
        int tail_offset = n4 * 4;
        int blocks_tail = (remainder + threads - 1) / threads;
        elu_kernel_tail<<<blocks_tail, threads>>>(
            x.data_ptr<float>(),
            out.data_ptr<float>(),
            alpha,
            tail_offset,
            n
        );
    }

    return out;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &elu_cuda_combined, "Combined ELU activation with shared memory and vectorized load (CUDA)");
}
Performance Metrics
Metric Value Unit Variance Samples
Analysis Rules
Rule Description
Operation / Metric Value Unit
aten::to
CPU Time 403027.87 μs
Device Time 39.97 μs
Self CPU Time 32.54 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 402995.32 μs
Device Time 39.97 μs
Self CPU Time 79.42 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::empty_strided
CPU Time 418625.62 μs
Device Time 0.00 μs
Self CPU Time 16017.96 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceGetStreamPriorityRange
CPU Time 402416.37 μs
Device Time 0.00 μs
Self CPU Time 402416.37 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 422654.72 μs
Device Time 19340.26 μs
Self CPU Time 422654.72 μs
Self Device Time 19340.26 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
elu_kernel_vec_shared(float4 const*, float4*, float, int)
CPU Time 0.00 μs
Device Time 21039.58 μs
Self CPU Time 0.00 μs
Self Device Time 21039.58 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaEventRecord
CPU Time 19595.32 μs
Device Time 37184.07 μs
Self CPU Time 19595.32 μs
Self Device Time 37184.07 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 58291.27 μs
Device Time 551878.97 μs
Self CPU Time 10499.63 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 47793.11 μs
Device Time 551878.97 μs
Self CPU Time 13364.32 μs
Self Device Time 551878.97 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 551878.97 μs
Self CPU Time 0.00 μs
Self Device Time 551878.97 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45284 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:12:35 bugprone-macro-parentheses
12 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:13:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
13 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:69: warning: 2 adjacent parameters of 'elu_kernel_vec_shared' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:75: note: the first parameter in the range is 'alpha'
18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:86: note: the last parameter in the range is 'n4'
18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {
| ^~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:82: note: 'float' and 'int' may be implicitly converted
18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:20:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
20 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:21:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | int globalIdx = blockIdx.x * blockDim.x + tid;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:61: warning: 3 adjacent parameters of 'elu_kernel_tail' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:67: note: the first parameter in the range is 'alpha'
48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:90: note: the last parameter in the range is 'n'
48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:74: note: 'float' and 'int' may be implicitly converted
48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:49:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
49 | int globalIdx = blockIdx.x * blockDim.x + threadIdx.x + offset;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:58:47: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
58 | torch::Tensor elu_cuda_combined(torch::Tensor x, float alpha) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:62:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
62 | int n = x.numel();
| ^