Kernel Details - vec_shared_elu

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(x: torch.Tensor, alpha: float) -> torch.Tensor:
    """
    Applies ELU activation to the input tensor.

    Args:
        x (torch.Tensor): Input tensor of any shape.
        alpha (float): The alpha parameter for the ELU function.

    Returns:
        torch.Tensor: Output tensor with ELU applied, same shape as input.
    """
    return F.elu(x, alpha=alpha)


class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """

    def __init__(self, alpha):
        """
        Initializes the ELU model.

        Args:
            alpha (float): The alpha parameter for the ELU function.
        """
        super(Model, self).__init__()
        self.alpha = alpha

    def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.

        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return fn(x, self.alpha)


batch_size = 16
dim = 16384
alpha = 1.0


def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]


def get_init_inputs():
    return [alpha]

import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """
    def __init__(self, alpha: float = 1.0):
        """
        Initializes the ELU model.

        Args:
            alpha (float, optional): The alpha parameter for the ELU function. Defaults to 1.0.
        """
        super(Model, self).__init__()
        self.alpha = alpha
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.

        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return F.elu(x, alpha=self.alpha)

batch_size = 16
dim = 16384

def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]

def get_init_inputs():
    return [1.0]  # Provide alpha value for initialization

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	31_ELU
Level ID	1
Task ID	31
Kernel Name	vec_shared_elu_base
CUDA Speedup (Native)	1.136x
CUDA Speedup (Compile)	4.802x
CUDA Runtime	0.006 ms
PyTorch Runtime (Native)	0.007 ms
PyTorch Runtime (Compile)	0.029 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	o3-mini-2025-01-31
Temperature	1.00

View Experiment Progress Details

Related Kernels (Level 1, Task 31 • 31_ELU)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	31_elu_shared_base	0.01	1.14	4.80
🥇	hybrid_elu_optimized_base	0.01	1.14	4.80
🥇	31_elu_vectorized_base	0.01	1.14	4.80
🥇	vec_shared_elu_base	0.01	1.14	4.80
🥇	31_elu_grid_stride_base_base	0.01	1.14	4.80
🥇	31_elu_vectorized_edit_1	0.01	1.14	4.80
🥇	elu_unroll_kernel_base	0.01	1.14	4.80
🥇	ldg_elu_128_base	0.01	1.14	4.80
9	31_ELU	0.01	0.97	4.12
9	31_elu_aligned_coalesced_base	0.01	0.97	4.12
9	hybrid_elu_base	0.01	0.97	4.12
9	31_elu_optimized_indexing_base	0.01	0.97	4.12
9	31_elu_reduced_divergence_base	0.01	0.97	4.12
9	elu_hybrid_base	0.01	0.97	4.12
9	31_elu_coalesced_base	0.01	0.97	4.12
9	31_elu_shared_mem_base	0.01	0.97	4.12
9	modular_elu_base	0.01	0.97	4.12
9	elu_vec4_shared_base	0.01	0.97	4.12
9	elu_tuned_blocksize_base	0.01	0.97	4.12
9	branchless_elu_vectorized_base	0.01	0.97	4.12

/*
Combined CUDA kernel for ELU activation using both vectorized load/store (float4) and shared memory tiling.
This kernel processes the bulk of the data in groups of 4 floats for improved memory throughput, while
handling any leftover elements (if the total number of elements is not a multiple of 4) separately.
*/

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// Kernel: Process vectorized data using shared memory tiling
// Each thread loads a float4 element into shared memory, computes ELU, and writes back.
__global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {
    extern __shared__ float4 tile[]; // Shared memory allocated dynamically
    int tid = threadIdx.x;
    int globalIdx = blockIdx.x * blockDim.x + tid;

    // Load vectorized data from global memory into shared memory
    if (globalIdx < n4) {
        tile[tid] = x[globalIdx];
    }
    __syncthreads();

    // Compute the ELU activation on the tile
    if (globalIdx < n4) {
        float4 val = tile[tid];
        float4 result;
        result.x = (val.x > 0.f) ? val.x : alpha * (expf(val.x) - 1.f);
        result.y = (val.y > 0.f) ? val.y : alpha * (expf(val.y) - 1.f);
        result.z = (val.z > 0.f) ? val.z : alpha * (expf(val.z) - 1.f);
        result.w = (val.w > 0.f) ? val.w : alpha * (expf(val.w) - 1.f);
        tile[tid] = result;  // Write result back into shared memory
    }
    __syncthreads();

    // Write results from shared memory back to global memory
    if (globalIdx < n4) {
        out[globalIdx] = tile[tid];
    }
}

// Kernel: Process the tail elements that are not a multiple of 4
__global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {
    int globalIdx = blockIdx.x * blockDim.x + threadIdx.x + offset;
    if (globalIdx < n) {
        float val = x[globalIdx];
        out[globalIdx] = (val > 0.f) ? val : alpha * (expf(val) - 1.f);
    }
}

// Host interface function
// It dispatches two kernel calls: one for the vectorized portion and one for any remaining tail elements.
torch::Tensor elu_cuda_combined(torch::Tensor x, float alpha) {
    CHECK_INPUT(x);

    auto out = torch::empty_like(x);
    int n = x.numel();

    // Determine the number of float4 groups
    int n4 = n / 4;            // number of vectorizable groups
    int remainder = n % 4;       // remaining elements

    const int threads = 256;
    int blocks_vec = (n4 + threads - 1) / threads;
    size_t sharedMemSize = threads * sizeof(float4);

    // If there is at least one vectorized element, process it using the shared memory kernel
    if (n4 > 0) {
        elu_kernel_vec_shared<<<blocks_vec, threads, sharedMemSize>>>(
            reinterpret_cast<const float4*>(x.data_ptr<float>()),
            reinterpret_cast<float4*>(out.data_ptr<float>()),
            alpha,
            n4
        );
    }

    // Process any remaining tail elements with a scalar kernel
    if (remainder > 0) {
        int tail_offset = n4 * 4;
        int blocks_tail = (remainder + threads - 1) / threads;
        elu_kernel_tail<<<blocks_tail, threads>>>(
            x.data_ptr<float>(),
            out.data_ptr<float>(),
            alpha,
            tail_offset,
            n
        );
    }

    return out;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &elu_cuda_combined, "Combined ELU activation with shared memory and vectorized load (CUDA)");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples

Analysis Rules

Rule	Description

Operation / Metric	Value	Unit
aten::to
CPU Time	403027.87	μs
Device Time	39.97	μs
Self CPU Time	32.54	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_to_copy
CPU Time	402995.32	μs
Device Time	39.97	μs
Self CPU Time	79.42	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::empty_strided
CPU Time	418625.62	μs
Device Time	0.00	μs
Self CPU Time	16017.96	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaDeviceGetStreamPriorityRange
CPU Time	402416.37	μs
Device Time	0.00	μs
Self CPU Time	402416.37	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaLaunchKernel
CPU Time	422654.72	μs
Device Time	19340.26	μs
Self CPU Time	422654.72	μs
Self Device Time	19340.26	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
elu_kernel_vec_shared(float4 const, float4, float, int)
CPU Time	0.00	μs
Device Time	21039.58	μs
Self CPU Time	0.00	μs
Self Device Time	21039.58	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaEventRecord
CPU Time	19595.32	μs
Device Time	37184.07	μs
Self CPU Time	19595.32	μs
Self Device Time	37184.07	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::zero_
CPU Time	58291.27	μs
Device Time	551878.97	μs
Self CPU Time	10499.63	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::fill_
CPU Time	47793.11	μs
Device Time	551878.97	μs
Self CPU Time	13364.32	μs
Self Device Time	551878.97	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char, 1>)
CPU Time	0.00	μs
Device Time	551878.97	μs
Self CPU Time	0.00	μs
Self Device Time	551878.97	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45284 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:12:35 bugprone-macro-parentheses

12 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")

| ^

| ()

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:13:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]

13 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")

| ^

| ()

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:69: warning: 2 adjacent parameters of 'elu_kernel_vec_shared' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]

18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {

| ^~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:75: note: the first parameter in the range is 'alpha'

18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:86: note: the last parameter in the range is 'n4'

18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {

| ^~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:82: note: 'float' and 'int' may be implicitly converted

18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:20:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

20 | int tid = threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:21:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

21 | int globalIdx = blockIdx.x * blockDim.x + tid;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:61: warning: 3 adjacent parameters of 'elu_kernel_tail' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]

48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:67: note: the first parameter in the range is 'alpha'

48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:90: note: the last parameter in the range is 'n'

48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:74: note: 'float' and 'int' may be implicitly converted

48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:49:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

49 | int globalIdx = blockIdx.x * blockDim.x + threadIdx.x + offset;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:58:47: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

58 | torch::Tensor elu_cuda_combined(torch::Tensor x, float alpha) {

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:62:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

62 | int n = x.numel();

| ^

The AI CUDA Engineer 👷

`31_ELU` • `vec_shared_elu_base`

Kernel Information

Related Kernels (Level 1, Task 31 • 31_ELU)

The AI CUDA Engineer 👷

31_ELU • vec_shared_elu_base

Kernel Information

Related Kernels (Level 1, Task 31 • 31_ELU)

`31_ELU` • `vec_shared_elu_base`