
The AI CUDA Engineer 👷

88_MinGPTNewGelu • modular_gelu_device_base

Level 1 • Task 88
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


def module_fn(x: torch.Tensor) -> torch.Tensor:
    """
    Implementation of the Gaussian Error Linear Units (GELU) activation function currently in Google BERT repo (identical to OpenAI GPT).

    Args:
        x (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Output tensor.
    """
    return (
        0.5
        * x
        * (
            1.0
            + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))
        )
    )


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x, fn=module_fn):
        return fn(x)


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []
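
The tanh approximation implemented by module_fn matches PyTorch's built-in tanh-approximated GELU, which gives a quick correctness check. A minimal sketch, assuming PyTorch 1.12 or newer (the version in which F.gelu gained the approximate argument):

# Quick check: module_fn should match PyTorch's tanh-approximated GELU.
# Assumes PyTorch >= 1.12, where F.gelu accepts approximate="tanh".
x = torch.randn(16, 32)
print(torch.allclose(module_fn(x), F.gelu(x, approximate="tanh"), atol=1e-6))  # expected: True
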
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# From https://github.com/karpathy/minGPT/blob/master/mingpt/model.py


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x):
        return (
            0.5
            * x
            * (
                1.0
                + torch.tanh(
                    math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
                )
            )
        )


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []
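
For reference, a benchmarking harness might drive this task through the get_init_inputs/get_inputs convention defined above; the explicit .cuda() calls below are an assumption that a CUDA device is available:

# Illustrative harness usage (assumes a CUDA device is available).
model = Model(*get_init_inputs()).cuda()
args = [t.cuda() for t in get_inputs()]
with torch.no_grad():
    out = model(*args)
print(out.shape)  # torch.Size([2000, 2000])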

Kernel Information

Related Kernels (Level 1, Task 88 • 88_MinGPTNewGelu)

Rank  Kernel Name  Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇 88_MinGPTNewGelu_shared_base_base 0.02 5.72 2.99
🥇 gelu_uniform_flow_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_indexing_base 0.02 5.72 2.99
🥇 gelu_tile_inline_base 0.02 5.72 2.99
🥇 optimized_gelu_kernel_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_base 0.02 5.72 2.99
🥇 88_mingptnewgelu_shared_tile_base 0.02 5.72 2.99
🥇 gelu_vectorized_base 0.02 5.72 2.99
🥇 gelu_modular_base_base 0.02 5.72 2.99
🥇 optimized_gelu_manual_unroll_base 0.02 5.72 2.99
🥇 modular_gelu_device_base 0.02 5.72 2.99
🥇 optimized_gelu_combined_edit_1 0.02 5.72 2.99
🥇 gelu_optimized_block_size_base 0.02 5.72 2.99
🥇 combined_gelu_modular_vectorized_edit_1 0.02 5.72 2.99
🥇 optimized_gelu_combined_base 0.02 5.72 2.99
🥇 gelu_vectorized_tuned_edit_1 0.02 5.72 2.99
🥇 modular_gelu_kernel_base 0.02 5.72 2.99
🥇 gelu_vectorized_tuned_base 0.02 5.72 2.99
🥇 gelu_vectorized_base 0.02 5.72 2.99
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

// Device function to compute GELU activation for a scalar value
__device__ __forceinline__ float compute_gelu(float x) {
    const float sqrt_2_over_pi = 0.7978845608f; // sqrt(2/pi)
    const float coeff = 0.044715f;
    float x_cubed = x * x * x;
    float inner = (x + coeff * x_cubed) * sqrt_2_over_pi;
    return 0.5f * x * (1.0f + tanhf(inner));
}

// Device function to compute GELU activation for a float4 vector
__device__ __forceinline__ float4 compute_gelu_vector(const float4 v) {
    float4 out;
    out.x = compute_gelu(v.x);
    out.y = compute_gelu(v.y);
    out.z = compute_gelu(v.z);
    out.w = compute_gelu(v.w);
    return out;
}

// Kernel to process input in vectorized float4 chunks
__global__ void gelu_kernel_vector(const float4* __restrict__ x, float4* __restrict__ y, int vec_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < vec_size) {
        // Load data using read-only cache
        float4 input = __ldg(&x[idx]);
        // Apply the modular GELU operation
        float4 output = compute_gelu_vector(input);
        y[idx] = output;
    }
}

// Fallback scalar kernel for remaining elements
__global__ void gelu_kernel_scalar(const float* __restrict__ x, float* __restrict__ y, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        y[idx] = compute_gelu(x[idx]);
    }
}

// Forward function exposed to Python
torch::Tensor gelu_forward(torch::Tensor x) {
    TORCH_CHECK(x.is_cuda(), "Input tensor must be on CUDA");
    TORCH_CHECK(x.is_contiguous(), "Input tensor must be contiguous");

    auto y = torch::empty_like(x);
    int n = x.numel();

    // Process the bulk of data using vectorized operations
    int vec_size = n / 4;  // number of float4 vectors
    int remainder = n % 4;

    const int threads = 256;
    if (vec_size > 0) {
        int blocks = (vec_size + threads - 1) / threads;
        const float4* x_vec = reinterpret_cast<const float4*>(x.data_ptr<float>());
        float4* y_vec = reinterpret_cast<float4*>(y.data_ptr<float>());
        gelu_kernel_vector<<<blocks, threads>>>(x_vec, y_vec, vec_size);
    }

    // Process any remaining elements with the scalar kernel
    if (remainder > 0) {
        int offset = vec_size * 4;
        int blocks = (remainder + threads - 1) / threads;
        gelu_kernel_scalar<<<blocks, threads>>>(x.data_ptr<float>() + offset, y.data_ptr<float>() + offset, remainder);
    }

    return y;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &gelu_forward, "Modular GELU CUDA implementation");
}
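
One way to build and sanity-check this extension is torch.utils.cpp_extension.load. The sketch below assumes the CUDA source above is saved as modular_gelu.cu; the file and module names are illustrative, not part of the submission:

import math
import torch
from torch.utils.cpp_extension import load

# JIT-compile the CUDA source above (file and module names are assumptions).
gelu_ext = load(name="modular_gelu", sources=["modular_gelu.cu"], verbose=True)

x = torch.randn(2000, 2000, device="cuda")
y = gelu_ext.forward(x)

# Compare against the eager tanh-approximation reference from the task definition.
ref = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3)))
print(torch.allclose(y, ref, atol=1e-5))  # expected: True
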
Performance Metrics
Operation | CPU Time (μs) | Device Time (μs) | Self CPU Time (μs) | Self Device Time (μs)
aten::to | 757421.37 | 1702.42 | 35.24 | 0.00
aten::_to_copy | 757386.13 | 1702.42 | 84.83 | 0.00
aten::empty_strided | 771236.03 | 0.00 | 15410.31 | 0.00
cudaDeviceGetStreamPriorityRange | 743875.29 | 0.00 | 743875.29 | 0.00
cudaLaunchKernel | 471341.29 | 19074.59 | 471341.29 | 19074.59
gelu_kernel_vector(float4 const*, float4*, int) | 0.00 | 95280.43 | 0.00 | 95280.43
cudaEventRecord | 15521.00 | 36806.85 | 15521.00 | 36806.85
aten::zero_ | 75629.24 | 545925.29 | 11954.77 | 0.00
aten::fill_ | 63675.96 | 545925.29 | 15381.36 | 545925.29
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | 0.00 | 545925.29 | 0.00 | 545925.29

All CPU, device, self CPU, and self device memory usage values are 0 B for every operation.
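
A profile of this shape can be reproduced with torch.profiler; a minimal sketch, assuming the extension from the previous snippet is already loaded as gelu_ext:

from torch.profiler import profile, ProfilerActivity

x = torch.randn(2000, 2000, device="cuda")
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    _ = gelu_ext.forward(x)
    torch.cuda.synchronize()
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
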
Status: Completed
45278 warnings generated when compiling for host.
Suppressed 45321 warnings (45274 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b5_s2_modular_gelu_device/base/base.cu:27:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
27 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b5_s2_modular_gelu_device/base/base.cu:39:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
39 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b5_s2_modular_gelu_device/base/base.cu:46:42: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
46 | torch::Tensor gelu_forward(torch::Tensor x) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b5_s2_modular_gelu_device/base/base.cu:51:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
51 | int n = x.numel();
| ^
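
A sketch of how these clang-tidy findings could be addressed, reusing compute_gelu and the headers from the listing above. This is an illustrative, scalar-only variant, not part of the submitted kernel:

// bugprone-narrowing-conversions: keep the thread index unsigned instead of
// narrowing the unsigned blockIdx/blockDim/threadIdx product into an int.
__global__ void gelu_kernel_scalar_fixed(const float* __restrict__ x, float* __restrict__ y, int n) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < static_cast<unsigned int>(n)) {
        y[idx] = compute_gelu(x[idx]);
    }
}

// performance-unnecessary-value-param: take the tensor by const reference, and
// keep numel() in int64_t so any narrowing to int is explicit.
torch::Tensor gelu_forward_fixed(const torch::Tensor& x) {
    auto y = torch::empty_like(x);
    const int64_t n = x.numel();
    const int threads = 256;
    const int blocks = static_cast<int>((n + threads - 1) / threads);
    gelu_kernel_scalar_fixed<<<blocks, threads>>>(
        x.data_ptr<float>(), y.data_ptr<float>(), static_cast<int>(n));
    return y;
}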