
The AI CUDA Engineer 👷

31_ELU • 31_elu_vectorized_edit_1

Level 1 • Task 31
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(x: torch.Tensor, alpha: float) -> torch.Tensor:
    """
    Applies ELU activation to the input tensor.

    Args:
        x (torch.Tensor): Input tensor of any shape.
        alpha (float): The alpha parameter for the ELU function.

    Returns:
        torch.Tensor: Output tensor with ELU applied, same shape as input.
    """
    return F.elu(x, alpha=alpha)


class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """

    def __init__(self, alpha):
        """
        Initializes the ELU model.

        Args:
            alpha (float): The alpha parameter for the ELU function.
        """
        super(Model, self).__init__()
        self.alpha = alpha

    def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.

        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return fn(x, self.alpha)


batch_size = 16
dim = 16384
alpha = 1.0


def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]


def get_init_inputs():
    return [alpha]


import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """
    Simple model that performs an ELU activation.
    """
    def __init__(self, alpha: float = 1.0):
        """
        Initializes the ELU model.

        Args:
            alpha (float, optional): The alpha parameter for the ELU function. Defaults to 1.0.
        """
        super(Model, self).__init__()
        self.alpha = alpha
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies ELU activation to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of any shape.

        Returns:
            torch.Tensor: Output tensor with ELU applied, same shape as input.
        """
        return F.elu(x, alpha=self.alpha)

batch_size = 16
dim = 16384

def get_inputs():
    x = torch.randn(batch_size, dim)
    return [x]

def get_init_inputs():
    return [1.0]  # Provide alpha value for initialization
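
Both listings compute the same function; as a minimal sanity check (a sketch, assuming either Model definition above plus get_inputs is in scope):

import torch
import torch.nn.functional as F

model = Model(alpha=1.0)   # reference model from the listing above
x, = get_inputs()          # random (16, 16384) input tensor
# The model's forward path and the direct functional call invoke the same
# PyTorch operator, so the outputs should match exactly.
assert torch.equal(model(x), F.elu(x, alpha=1.0))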

Kernel Information

Related Kernels (Level 1, Task 31 • 31_ELU)

Rank  Kernel Name  Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇 31_elu_shared_base 0.01 1.14 4.80
🥇 hybrid_elu_optimized_base 0.01 1.14 4.80
🥇 31_elu_vectorized_base 0.01 1.14 4.80
🥇 vec_shared_elu_base 0.01 1.14 4.80
🥇 31_elu_grid_stride_base_base 0.01 1.14 4.80
🥇 31_elu_vectorized_edit_1 0.01 1.14 4.80
🥇 elu_unroll_kernel_base 0.01 1.14 4.80
🥇 ldg_elu_128_base 0.01 1.14 4.80
9 31_ELU 0.01 0.97 4.12
9 31_elu_aligned_coalesced_base 0.01 0.97 4.12
9 hybrid_elu_base 0.01 0.97 4.12
9 31_elu_optimized_indexing_base 0.01 0.97 4.12
9 31_elu_reduced_divergence_base 0.01 0.97 4.12
9 elu_hybrid_base 0.01 0.97 4.12
9 31_elu_coalesced_base 0.01 0.97 4.12
9 31_elu_shared_mem_base 0.01 0.97 4.12
9 modular_elu_base 0.01 0.97 4.12
9 elu_vec4_shared_base 0.01 0.97 4.12
9 elu_tuned_blocksize_base 0.01 0.97 4.12
9 branchless_elu_vectorized_base 0.01 0.97 4.12
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

__global__ void elu_kernel_vectorized(const float4* x, float4* out, float alpha, int n4) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (tid < n4) {
        float4 in = x[tid];
        float4 result;
        
        // Process four elements at once
        result.x = (in.x > 0) ? in.x : alpha * (expf(in.x) - 1);
        result.y = (in.y > 0) ? in.y : alpha * (expf(in.y) - 1);
        result.z = (in.z > 0) ? in.z : alpha * (expf(in.z) - 1);
        result.w = (in.w > 0) ? in.w : alpha * (expf(in.w) - 1);
        
        out[tid] = result;
    }
}

// Handle remaining elements
__global__ void elu_kernel_remainder(const float* x, float* out, float alpha, int start, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = start + tid;
    
    if (idx < n) {
        float val = x[idx];
        out[idx] = (val > 0) ? val : alpha * (expf(val) - 1);
    }
}

torch::Tensor elu_cuda_vectorized(torch::Tensor x, float alpha) {
    CHECK_INPUT(x);
    
    auto out = torch::empty_like(x);
    int n = x.numel();
    int n4 = n / 4;
    
    const int threads = 512;
    const int blocks = (n4 + threads - 1) / threads;
    
    // Process blocks of 4 elements
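    // Note: the reinterpret_cast below treats the float data pointer as float4*,
    // which assumes the storage is 16-byte aligned. Freshly allocated contiguous
    // tensors (including the torch::empty_like output) satisfy this, but a
    // contiguous view with a nonzero storage offset might not, so a more defensive
    // version would verify the alignment before taking the vectorized path.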
    if (n4 > 0) {
        elu_kernel_vectorized<<<blocks, threads>>>(
            reinterpret_cast<const float4*>(x.data_ptr<float>()),
            reinterpret_cast<float4*>(out.data_ptr<float>()),
            alpha,
            n4
        );
    }
    
    // Handle remaining elements
    int remaining = n - (n4 * 4);
    if (remaining > 0) {
        int remainder_blocks = (remaining + threads - 1) / threads;
        elu_kernel_remainder<<<remainder_blocks, threads>>>(
            x.data_ptr<float>(),
            out.data_ptr<float>(),
            alpha,
            n4 * 4,
            n
        );
    }
    
    return out;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &elu_cuda_vectorized, "ELU activation vectorized (CUDA)");
}
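
The kernel implements ELU, f(x) = x for x > 0 and alpha * (exp(x) - 1) otherwise, matching torch.nn.functional.elu. As a rough sketch of how the extension could be built and checked against the PyTorch reference (the file name elu_vectorized.cu and the extension name are assumptions, not part of the original listing):

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# JIT-compile the CUDA listing above; the source path and module name are
# placeholders chosen for this sketch.
elu_ext = load(name="elu_vectorized", sources=["elu_vectorized.cu"], verbose=True)

x = torch.randn(16, 16384, device="cuda")
alpha = 1.0

out = elu_ext.forward(x, alpha)   # custom vectorized kernel
ref = F.elu(x, alpha=alpha)       # PyTorch reference

# expf(x) - 1 versus PyTorch's expm1-based ELU can differ by a few float ulps,
# hence the explicit tolerance.
print(torch.allclose(out, ref, atol=1e-6))
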
Performance Metrics
Operation | CPU Time (μs) | Device Time (μs) | Self CPU Time (μs) | Self Device Time (μs)
aten::to | 519617.42 | 40.09 | 29.45 | 0.00
aten::_to_copy | 519587.97 | 40.09 | 81.27 | 0.00
aten::empty_strided | 539676.93 | 0.00 | 20503.90 | 0.00
cudaDeviceGetStreamPriorityRange | 518975.61 | 0.00 | 518975.61 | 0.00
cudaLaunchKernel | 507463.29 | 22919.32 | 507463.29 | 22919.32
elu_kernel_vectorized(float4 const*, float4*, float, int) | 0.00 | 32499.61 | 0.00 | 32499.61
cudaEventRecord | 24351.05 | 44064.52 | 24351.05 | 44064.52
aten::zero_ | 68291.90 | 653403.09 | 14983.42 | 0.00
aten::fill_ | 53309.91 | 653403.09 | 16847.10 | 653403.09
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | 0.00 | 653403.09 | 0.00 | 653403.09

All CPU and device memory usage counters (total and self) were reported as 0 B for every operation.
Status: Completed
45283 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:6:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
6 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:7:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
7 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:10:69: warning: 2 adjacent parameters of 'elu_kernel_vectorized' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
10 | __global__ void elu_kernel_vectorized(const float4* x, float4* out, float alpha, int n4) {
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:10:75: note: the first parameter in the range is 'alpha'
10 | __global__ void elu_kernel_vectorized(const float4* x, float4* out, float alpha, int n4) {
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:10:86: note: the last parameter in the range is 'n4'
10 | __global__ void elu_kernel_vectorized(const float4* x, float4* out, float alpha, int n4) {
| ^~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:10:82: note: 'float' and 'int' may be implicitly converted
10 | __global__ void elu_kernel_vectorized(const float4* x, float4* out, float alpha, int n4) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:11:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
11 | int tid = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:28:66: warning: 3 adjacent parameters of 'elu_kernel_remainder' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
28 | __global__ void elu_kernel_remainder(const float* x, float* out, float alpha, int start, int n) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:28:72: note: the first parameter in the range is 'alpha'
28 | __global__ void elu_kernel_remainder(const float* x, float* out, float alpha, int start, int n) {
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:28:94: note: the last parameter in the range is 'n'
28 | __global__ void elu_kernel_remainder(const float* x, float* out, float alpha, int start, int n) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:28:79: note: 'float' and 'int' may be implicitly converted
28 | __global__ void elu_kernel_remainder(const float* x, float* out, float alpha, int start, int n) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:29:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
29 | int tid = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:38:49: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
38 | torch::Tensor elu_cuda_vectorized(torch::Tensor x, float alpha) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_31/b3_s3_31_elu_vectorized/edit_1/edit_1.cu:42:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
42 | int n = x.numel();
| ^