← Back to Leaderboard

The AI CUDA Engineer 👷

88_MinGPTNewGeluoptimized_gelu_combined_edit_1

Level 1 • Task 88
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


def module_fn(x: torch.Tensor) -> torch.Tensor:
    """
    Implementation of the Gaussian Error Linear Units (GELU) activation function currently in Google BERT repo (identical to OpenAI GPT).

    Args:
        x (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Output tensor.
    """
    return (
        0.5
        * x
        * (
            1.0
            + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))
        )
    )


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x, fn=module_fn):
        return fn(x)


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# From https://github.com/karpathy/minGPT/blob/master/mingpt/model.py


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x):
        return (
            0.5
            * x
            * (
                1.0
                + torch.tanh(
                    math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
                )
            )
        )


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []

Kernel Information

Related Kernels (Level 1, Task 88 • 88_MinGPTNewGelu)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 88_MinGPTNewGelu_shared_base_base 0.02 5.72 2.99
🥇 gelu_uniform_flow_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_indexing_base 0.02 5.72 2.99
🥇 gelu_tile_inline_base 0.02 5.72 2.99
🥇 optimized_gelu_kernel_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_base 0.02 5.72 2.99
🥇 88_mingptnewgelu_shared_tile_base 0.02 5.72 2.99
🥇 gelu_vectorized_base 0.02 5.72 2.99
🥇 gelu_modular_base_base 0.02 5.72 2.99
🥇 optimized_gelu_manual_unroll_base 0.02 5.72 2.99
🥇 modular_gelu_device_base 0.02 5.72 2.99
🥇 optimized_gelu_combined_edit_1 0.02 5.72 2.99
🥇 gelu_optimized_block_size_base 0.02 5.72 2.99
🥇 combined_gelu_modular_vectorized_edit_1 0.02 5.72 2.99
🥇 optimized_gelu_combined_base 0.02 5.72 2.99
🥇 gelu_vectorized_tuned_edit_1 0.02 5.72 2.99
🥇 modular_gelu_kernel_base 0.02 5.72 2.99
🥇 gelu_vectorized_tuned_base 0.02 5.72 2.99
🥇 gelu_vectorized_base 0.02 5.72 2.99
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

// Device inline function to compute GELU activation.
__device__ __forceinline__ float gelu_activation(float x) {
    const float sqrt_2_over_pi = 0.7978845608f; // sqrt(2/pi)
    const float coeff = 0.044715f;
    float x_cubed = x * x * x;
    float inner = (x + coeff * x_cubed) * sqrt_2_over_pi;
    // Use faster approximation for tanh using exponential intrinsics
    float exp2x = __expf(2.0f * inner);
    float tanh_approx = (exp2x - 1.0f) / (exp2x + 1.0f);
    return 0.5f * x * (1.0f + tanh_approx);
}

// Kernel to process input in float4 vectorized chunks
__global__ void gelu_kernel_vector(const float4* __restrict__ x, float4* __restrict__ y, int vec_size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < vec_size) {
        float4 v = __ldg(&x[i]);
        v.x = gelu_activation(v.x);
        v.y = gelu_activation(v.y);
        v.z = gelu_activation(v.z);
        v.w = gelu_activation(v.w);
        y[i] = v;
    }
}

// Fallback scalar kernel for remaining elements
__global__ void gelu_kernel_scalar(const float* __restrict__ x, float* __restrict__ y, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] = gelu_activation(x[i]);
    }
}

// Forward function accessible from Python
torch::Tensor gelu_forward(torch::Tensor x) {
    TORCH_CHECK(x.is_cuda(), "Input tensor must be on CUDA");
    TORCH_CHECK(x.is_contiguous(), "Input tensor must be contiguous");

    auto y = torch::empty_like(x);
    int n = x.numel();

    // Process most of the tensor with vectorized float4 loads/stores
    int vec_size = n / 4;  // number of float4 vectors
    int remainder = n % 4;

    const int threads = 256;
    if(vec_size > 0) {
        int blocks = (vec_size + threads - 1) / threads;
        const float4* x_vec = reinterpret_cast<const float4*>(x.data_ptr<float>());
        float4* y_vec = reinterpret_cast<float4*>(y.data_ptr<float>());
        gelu_kernel_vector<<<blocks, threads>>>(x_vec, y_vec, vec_size);
    }

    // Process any remaining elements with the scalar kernel
    if(remainder > 0) {
        int offset = vec_size * 4;
        int blocks_rem = (remainder + threads - 1) / threads;
        gelu_kernel_scalar<<<blocks_rem, threads>>>(x.data_ptr<float>() + offset, y.data_ptr<float>() + offset, remainder);
    }

    return y;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &gelu_forward, "Optimized GELU combined CUDA implementation");
}
Performance Metrics
Metric Value Unit Variance Samples
Analysis Rules
Rule Description
Operation / Metric Value Unit
aten::to
CPU Time 257717.09 μs
Device Time 1608.35 μs
Self CPU Time 34.58 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 257682.51 μs
Device Time 1608.35 μs
Self CPU Time 87.76 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::empty_strided
CPU Time 272427.59 μs
Device Time 0.00 μs
Self CPU Time 16344.06 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceGetStreamPriorityRange
CPU Time 248708.83 μs
Device Time 0.00 μs
Self CPU Time 248708.83 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 501555.22 μs
Device Time 19766.29 μs
Self CPU Time 501555.22 μs
Self Device Time 19766.29 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
gelu_kernel_vector(float4 const*, float4*, int)
CPU Time 0.00 μs
Device Time 98070.52 μs
Self CPU Time 0.00 μs
Self Device Time 98070.52 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaEventRecord
CPU Time 17841.15 μs
Device Time 38138.60 μs
Self CPU Time 17841.15 μs
Self Device Time 38138.60 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 78504.69 μs
Device Time 566087.74 μs
Self CPU Time 13376.76 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 65131.15 μs
Device Time 566087.74 μs
Self CPU Time 14992.54 μs
Self Device Time 566087.74 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 566087.74 μs
Self CPU Time 0.00 μs
Self Device Time 566087.74 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45278 warnings generated when compiling for host.
Suppressed 45321 warnings (45274 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b4_s1_optimized_gelu_combined/edit_1/edit_1.cu:20:13 bugprone-narrowing-conversions
20 | int i = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b4_s1_optimized_gelu_combined/edit_1/edit_1.cu:33:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
33 | int i = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b4_s1_optimized_gelu_combined/edit_1/edit_1.cu:40:42: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
40 | torch::Tensor gelu_forward(torch::Tensor x) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b4_s1_optimized_gelu_combined/edit_1/edit_1.cu:45:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
45 | int n = x.numel();
| ^