
The AI CUDA Engineer 👷

31_ELU • 31_elu_vectorized_edit_1

Level 1 • Task 31
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(x: torch.Tensor, alpha: float) -> torch.Tensor:
    """
    Applies ELU activation to the input tensor.

    Args:
        x (torch.Tensor): Input tensor of any shape.
        alpha (float): The alpha parameter for the ELU function.

    Returns:
        torch.Tensor: Output tensor with ELU applied, same shape as input.
    """
    return F.elu(x, alpha=alpha)


class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """

    def __init__(self, alpha):
        """
        Initializes the ELU model.

        Args:
            alpha (float): The alpha parameter for the ELU function.
        """
        super(Model, self).__init__()
        self.alpha = alpha

    def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.

        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return fn(x, self.alpha)


batch_size = 16
dim = 16384
alpha = 1.0


def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]


def get_init_inputs():
    return [alpha]


import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """
    def __init__(self, alpha: float = 1.0):
        """
        Initializes the ELU model.

        Args:
            alpha (float, optional): The alpha parameter for the ELU function. Defaults to 1.0.
        """
        super(Model, self).__init__()
        self.alpha = alpha
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.

        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return F.elu(x, alpha=self.alpha)

batch_size = 16
dim = 16384

def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]

def get_init_inputs():
    return [1.0]  # Provide alpha value for initialization
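
Both listings compute the same function; as a minimal sanity check (a sketch, assuming either Model definition above plus get_inputs is in scope):

import torch
import torch.nn.functional as F

model = Model(alpha=1.0)   # reference model from the listing above
x, = get_inputs()          # random (16, 16384) input tensor
# The model's forward path and the direct functional call invoke the same
# PyTorch operator, so the outputs should match exactly.
assert torch.equal(model(x), F.elu(x, alpha=1.0))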

Kernel Information

Related Kernels (Level 1, Task 31 • 31_ELU)

Rank  Kernel Name  Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇 31_elu_shared_base 0.01 1.14 4.80
🥇 hybrid_elu_optimized_base 0.01 1.14 4.80
🥇 31_elu_vectorized_base 0.01 1.14 4.80
🥇 vec_shared_elu_base 0.01 1.14 4.80
🥇 31_elu_grid_stride_base_base 0.01 1.14 4.80
🥇 31_elu_vectorized_edit_1 0.01 1.14 4.80
🥇 elu_unroll_kernel_base 0.01 1.14 4.80
🥇 ldg_elu_128_base 0.01 1.14 4.80
9 31_ELU 0.01 0.97 4.12
9 31_elu_aligned_coalesced_base 0.01 0.97 4.12
9 hybrid_elu_base 0.01 0.97 4.12
9 31_elu_optimized_indexing_base 0.01 0.97 4.12
9 31_elu_reduced_divergence_base 0.01 0.97 4.12
9 elu_hybrid_base 0.01 0.97 4.12
9 31_elu_coalesced_base 0.01 0.97 4.12
9 31_elu_shared_mem_base 0.01 0.97 4.12
9 modular_elu_base 0.01 0.97 4.12
9 elu_vec4_shared_base 0.01 0.97 4.12
9 elu_tuned_blocksize_base 0.01 0.97 4.12
9 branchless_elu_vectorized_base 0.01 0.97 4.12
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

__global__ void elu_kernel_vectorized(const float4* x, float4* out, float alpha, int n4) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (tid < n4) {
        float4 in = x[tid];
        float4 result;
        
        // Process four elements at once
        result.x = (in.x > 0) ? in.x : alpha * (expf(in.x) - 1);
        result.y = (in.y > 0) ? in.y : alpha * (expf(in.y) - 1);
        result.z = (in.z > 0) ? in.z : alpha * (expf(in.z) - 1);
        result.w = (in.w > 0) ? in.w : alpha * (expf(in.w) - 1);
        
        out[tid] = result;
    }
}

// Handle remaining elements
__global__ void elu_kernel_remainder(const float* x, float* out, float alpha, int start, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = start + tid;
    
    if (idx < n) {
        float val = x[idx];
        out[idx] = (val > 0) ? val : alpha * (expf(val) - 1);
    }
}

torch::Tensor elu_cuda_vectorized(torch::Tensor x, float alpha) {
    CHECK_INPUT(x);
    
    auto out = torch::empty_like(x);
    int n = x.numel();
    int n4 = n / 4;
    
    const int threads = 512;
    const int blocks = (n4 + threads - 1) / threads;
    
    // Process blocks of 4 elements
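    // Note: the reinterpret_cast below treats the float data pointer as float4*,
    // which assumes the storage is 16-byte aligned. Freshly allocated contiguous
    // tensors (including the torch::empty_like output) satisfy this, but a
    // contiguous view with a nonzero storage offset might not, so a more defensive
    // version would verify the alignment before taking the vectorized path.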
    if (n4 > 0) {
        elu_kernel_vectorized<<<blocks, threads>>>(
            reinterpret_cast<const float4*>(x.data_ptr<float>()),
            reinterpret_cast<float4*>(out.data_ptr<float>()),
            alpha,
            n4
        );
    }
    
    // Handle remaining elements
    int remaining = n - (n4 * 4);
    if (remaining > 0) {
        int remainder_blocks = (remaining + threads - 1) / threads;
        elu_kernel_remainder<<<remainder_blocks, threads>>>(
            x.data_ptr<float>(),
            out.data_ptr<float>(),
            alpha,
            n4 * 4,
            n
        );
    }
    
    return out;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &elu_cuda_vectorized, "ELU activation vectorized (CUDA)");
}
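
The kernel implements ELU, f(x) = x for x > 0 and alpha * (exp(x) - 1) otherwise, matching torch.nn.functional.elu. As a rough sketch of how the extension could be built and checked against the PyTorch reference (the file name elu_vectorized.cu and the extension name are assumptions, not part of the original listing):

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# JIT-compile the CUDA listing above; the source path and module name are
# placeholders chosen for this sketch.
elu_ext = load(name="elu_vectorized", sources=["elu_vectorized.cu"], verbose=True)

x = torch.randn(16, 16384, device="cuda")
alpha = 1.0

out = elu_ext.forward(x, alpha)   # custom vectorized kernel
ref = F.elu(x, alpha=alpha)       # PyTorch reference

# expf(x) - 1 versus PyTorch's expm1-based ELU can differ by a few float ulps,
# hence the explicit tolerance.
print(torch.allclose(out, ref, atol=1e-6))
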
Performance Metrics
Operation | CPU Time (μs) | Device Time (μs) | Self CPU Time (μs) | Self Device Time (μs)
aten::to | 519617.42 | 40.09 | 29.45 | 0.00
aten::_to_copy | 519587.97 | 40.09 | 81.27 | 0.00
aten::empty_strided | 539676.93 | 0.00 | 20503.90 | 0.00
cudaDeviceGetStreamPriorityRange | 518975.61 | 0.00 | 518975.61 | 0.00
cudaLaunchKernel | 507463.29 | 22919.32 | 507463.29 | 22919.32
elu_kernel_vectorized(float4 const*, float4*, float, int) | 0.00 | 32499.61 | 0.00 | 32499.61
cudaEventRecord | 24351.05 | 44064.52 | 24351.05 | 44064.52
aten::zero_ | 68291.90 | 653403.09 | 14983.42 | 0.00
aten::fill_ | 53309.91 | 653403.09 | 16847.10 | 653403.09
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | 0.00 | 653403.09 | 0.00 | 653403.09

All CPU and device memory usage counters (total and self) were reported as 0 B for every operation.
Status: Completed
45283 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:6:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
6 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:7:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
7 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:10:69: warning: 2 adjacent parameters of 'elu_kernel_vectorized' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
10 | __global__ void elu_kernel_vectorized(const float4* x, float4* out, float alpha, int n4) {
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:10:75: note: the first parameter in the range is 'alpha'
10 | __global__ void elu_kernel_vectorized(const float4* x, float4* out, float alpha, int n4) {
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:10:86: note: the last parameter in the range is 'n4'
10 | __global__ void elu_kernel_vectorized(const float4* x, float4* out, float alpha, int n4) {
| ^~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:10:82: note: 'float' and 'int' may be implicitly converted
10 | __global__ void elu_kernel_vectorized(const float4* x, float4* out, float alpha, int n4) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:11:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
11 | int tid = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:28:66: warning: 3 adjacent parameters of 'elu_kernel_remainder' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
28 | __global__ void elu_kernel_remainder(const float* x, float* out, float alpha, int start, int n) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:28:72: note: the first parameter in the range is 'alpha'
28 | __global__ void elu_kernel_remainder(const float* x, float* out, float alpha, int start, int n) {
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:28:94: note: the last parameter in the range is 'n'
28 | __global__ void elu_kernel_remainder(const float* x, float* out, float alpha, int start, int n) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:28:79: note: 'float' and 'int' may be implicitly converted
28 | __global__ void elu_kernel_remainder(const float* x, float* out, float alpha, int start, int n) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:29:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
29 | int tid = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:38:49: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
38 | torch::Tensor elu_cuda_vectorized(torch::Tensor x, float alpha) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:42:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
42 | int n = x.numel();
| ^