
The AI CUDA Engineer 👷

31_ELU • hybrid_elu_optimized_base

Level 1 • Task 31
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(x: torch.Tensor, alpha: float) -> torch.Tensor:
    """
    Applies ELU activation to the input tensor.

    Args:
        x (torch.Tensor): Input tensor of any shape.
        alpha (float): The alpha parameter for the ELU function.

    Returns:
        torch.Tensor: Output tensor with ELU applied, same shape as input.
    """
    return F.elu(x, alpha=alpha)


class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """

    def __init__(self, alpha):
        """
        Initializes the ELU model.

        Args:
            alpha (float): The alpha parameter for the ELU function.
        """
        super(Model, self).__init__()
        self.alpha = alpha

    def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.
            fn (callable): Function that applies the ELU activation; defaults to module_fn.

        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return fn(x, self.alpha)


batch_size = 16
dim = 16384
alpha = 1.0


def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]


def get_init_inputs():
    return [alpha]
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """
    def __init__(self, alpha: float = 1.0):
        """
        Initializes the ELU model.

        Args:
            alpha (float, optional): The alpha parameter for the ELU function. Defaults to 1.0.
        """
        super(Model, self).__init__()
        self.alpha = alpha
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.

        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return F.elu(x, alpha=self.alpha)

batch_size = 16
dim = 16384

def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]

def get_init_inputs():
    return [1.0]  # Provide alpha value for initialization
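
Both reference implementations above and the CUDA kernel below compute the same elementwise function. For reference, and matching the expression (val > 0) ? val : alpha * (expf(val) - 1) used in the kernel, ELU with parameter alpha is:

\[
\mathrm{ELU}_{\alpha}(x) =
\begin{cases}
x, & x > 0 \\
\alpha \left(e^{x} - 1\right), & x \le 0
\end{cases}
\]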

Kernel Information

Related Kernels (Level 1, Task 31 • 31_ELU)

Rank Kernel Name Runtime (ms) Speedup (Native) Speedup (Compile)
🥇 31_elu_shared_base 0.01 1.14 4.80
🥇 hybrid_elu_optimized_base 0.01 1.14 4.80
🥇 31_elu_vectorized_base 0.01 1.14 4.80
🥇 vec_shared_elu_base 0.01 1.14 4.80
🥇 31_elu_grid_stride_base_base 0.01 1.14 4.80
🥇 31_elu_vectorized_edit_1 0.01 1.14 4.80
🥇 elu_unroll_kernel_base 0.01 1.14 4.80
🥇 ldg_elu_128_base 0.01 1.14 4.80
9 31_ELU 0.01 0.97 4.12
9 31_elu_aligned_coalesced_base 0.01 0.97 4.12
9 hybrid_elu_base 0.01 0.97 4.12
9 31_elu_optimized_indexing_base 0.01 0.97 4.12
9 31_elu_reduced_divergence_base 0.01 0.97 4.12
9 elu_hybrid_base 0.01 0.97 4.12
9 31_elu_coalesced_base 0.01 0.97 4.12
9 31_elu_shared_mem_base 0.01 0.97 4.12
9 modular_elu_base 0.01 0.97 4.12
9 elu_vec4_shared_base 0.01 0.97 4.12
9 elu_tuned_blocksize_base 0.01 0.97 4.12
9 branchless_elu_vectorized_base 0.01 0.97 4.12
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// Vectorized kernel: each thread stages one float4 in shared memory, then applies ELU to its four components
__global__ void elu_kernel_vec4_shared(const float4* x, float4* out, float alpha, int n4) {
    extern __shared__ float4 tile[];
    int tid = threadIdx.x;
    int globalIdx = blockIdx.x * blockDim.x + tid;

    // Load input data using vectorized reads
    if (globalIdx < n4) {
        tile[tid] = x[globalIdx];
    }
    __syncthreads();

    if (globalIdx < n4) {
        float4 val = tile[tid];
        float4 result;
        
        result.x = (val.x > 0) ? val.x : alpha * (expf(val.x) - 1);
        result.y = (val.y > 0) ? val.y : alpha * (expf(val.y) - 1);
        result.z = (val.z > 0) ? val.z : alpha * (expf(val.z) - 1);
        result.w = (val.w > 0) ? val.w : alpha * (expf(val.w) - 1);
        
        out[globalIdx] = result;
    }
}

// Regular kernel for remaining elements
__global__ void elu_kernel_remainder(const float* x, float* out, float alpha, int start, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx + start < n) {
        float val = x[idx + start];
        out[idx + start] = (val > 0) ? val : alpha * (expf(val) - 1);
    }
}

torch::Tensor elu_cuda_hybrid(torch::Tensor x, float alpha) {
    CHECK_INPUT(x);
    auto out = torch::empty_like(x);
    
    int n = x.numel();
    int n4 = n / 4;  // Number of float4 elements
    int remainder = n % 4;  // Remaining elements
    
    const int threads = 256;
    const int blocks = (n4 + threads - 1) / threads;
    
    // Process main part using vectorized loads and shared memory
    if (n4 > 0) {
        size_t sharedMemSize = threads * sizeof(float4);
        elu_kernel_vec4_shared<<<blocks, threads, sharedMemSize>>>(
            reinterpret_cast<const float4*>(x.data_ptr<float>()),
            reinterpret_cast<float4*>(out.data_ptr<float>()),
            alpha,
            n4
        );
    }
    
    // Process remaining elements
    if (remainder > 0) {
        const int remainder_blocks = (remainder + threads - 1) / threads;
        elu_kernel_remainder<<<remainder_blocks, threads>>>(
            x.data_ptr<float>(),
            out.data_ptr<float>(),
            alpha,
            n4 * 4,
            n
        );
    }
    
    return out;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &elu_cuda_hybrid, "Hybrid ELU activation (CUDA)");
}
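
The listing above does not include a build or test harness. As a minimal sketch of how the extension could be exercised (assuming the CUDA source is saved locally as hybrid_elu.cu and a CUDA-capable GPU with a C++ toolchain is available; the file name and extension name here are illustrative, not part of the original submission), it could be JIT-compiled with torch.utils.cpp_extension.load and checked against F.elu. Note that for the benchmark shape (batch_size=16, dim=16384) the element count 262,144 is divisible by 4, so only the vectorized float4 kernel launches and the remainder kernel is skipped.

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# Hypothetical harness: file name and extension name are placeholders.
elu_ext = load(
    name="hybrid_elu_optimized",  # arbitrary module name
    sources=["hybrid_elu.cu"],    # the CUDA source shown above, saved to disk
    verbose=True,
)

x = torch.randn(16, 16384, device="cuda")  # matches get_inputs(): contiguous float32
alpha = 1.0

out = elu_ext.forward(x, alpha)  # exposed via PYBIND11_MODULE above
ref = F.elu(x, alpha=alpha)

# The kernel and F.elu should agree to within float32 rounding error.
print(torch.allclose(out, ref, atol=1e-6))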
Performance Metrics
Operation | CPU Time (μs) | Device Time (μs) | Self CPU Time (μs) | Self Device Time (μs)
aten::to | 465508.37 | 39.97 | 33.03 | 0.00
aten::_to_copy | 465475.34 | 39.97 | 87.53 | 0.00
aten::empty_strided | 477143.18 | 0.00 | 12121.18 | 0.00
cudaDeviceGetStreamPriorityRange | 464828.38 | 0.00 | 464828.38 | 0.00
cudaLaunchKernel | 333666.85 | 15032.45 | 333666.85 | 15032.45
elu_kernel_vec4_shared(float4 const*, float4*, float, int) | 0.00 | 20934.33 | 0.00 | 20934.33
cudaEventRecord | 15081.17 | 28951.38 | 15081.17 | 28951.38
aten::zero_ | 49456.48 | 431566.27 | 8344.39 | 0.00
aten::fill_ | 41114.12 | 431566.27 | 12079.31 | 431566.27
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | 0.00 | 431566.27 | 0.00 | 431566.27

All CPU, device, self CPU, and self device memory usage figures were 0 B for every operation above.
Status: Completed
45283 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b8_s2_hybrid_elu_optimized/base/base.cu:6:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
6 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b8_s2_hybrid_elu_optimized/base/base.cu:7:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
7 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b8_s2_hybrid_elu_optimized/base/base.cu:11:70: warning: 2 adjacent parameters of 'elu_kernel_vec4_shared' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
11 | __global__ void elu_kernel_vec4_shared(const float4* x, float4* out, float alpha, int n4) {
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b8_s2_hybrid_elu_optimized/base/base.cu:11:76: note: the first parameter in the range is 'alpha'
11 | __global__ void elu_kernel_vec4_shared(const float4* x, float4* out, float alpha, int n4) {
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b8_s2_hybrid_elu_optimized/base/base.cu:11:87: note: the last parameter in the range is 'n4'
11 | __global__ void elu_kernel_vec4_shared(const float4* x, float4* out, float alpha, int n4) {
| ^~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b8_s2_hybrid_elu_optimized/base/base.cu:11:83: note: 'float' and 'int' may be implicitly converted
11 | __global__ void elu_kernel_vec4_shared(const float4* x, float4* out, float alpha, int n4) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b8_s2_hybrid_elu_optimized/base/base.cu:13:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
13 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b8_s2_hybrid_elu_optimized/base/base.cu:14:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
14 | int globalIdx = blockIdx.x * blockDim.x + tid;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b8_s2_hybrid_elu_optimized/base/base.cu:37:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
37 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b8_s2_hybrid_elu_optimized/base/base.cu:44:45: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
44 | torch::Tensor elu_cuda_hybrid(torch::Tensor x, float alpha) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b8_s2_hybrid_elu_optimized/base/base.cu:48:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
48 | int n = x.numel();
| ^