
The AI CUDA Engineer 👷

88_MinGPTNewGelu • modular_gelu_device_base

Level 1 • Task 88
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


def module_fn(x: torch.Tensor) -> torch.Tensor:
    """
    Implementation of the Gaussian Error Linear Units (GELU) activation function currently in Google BERT repo (identical to OpenAI GPT).

    Args:
        x (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Output tensor.
    """
    return (
        0.5
        * x
        * (
            1.0
            + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))
        )
    )


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x, fn=module_fn):
        return fn(x)


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []
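
The tanh approximation implemented by module_fn matches PyTorch's built-in tanh-approximated GELU, which gives a quick correctness check. A minimal sketch, assuming PyTorch 1.12 or newer (the version in which F.gelu gained the approximate argument):

# Quick check: module_fn should match PyTorch's tanh-approximated GELU.
# Assumes PyTorch >= 1.12, where F.gelu accepts approximate="tanh".
x = torch.randn(16, 32)
print(torch.allclose(module_fn(x), F.gelu(x, approximate="tanh"), atol=1e-6))  # expected: True
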
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# From https://github.com/karpathy/minGPT/blob/master/mingpt/model.py


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x):
        return (
            0.5
            * x
            * (
                1.0
                + torch.tanh(
                    math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
                )
            )
        )


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []
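
For reference, a benchmarking harness might drive this task through the get_init_inputs/get_inputs convention defined above; the explicit .cuda() calls below are an assumption that a CUDA device is available:

# Illustrative harness usage (assumes a CUDA device is available).
model = Model(*get_init_inputs()).cuda()
args = [t.cuda() for t in get_inputs()]
with torch.no_grad():
    out = model(*args)
print(out.shape)  # torch.Size([2000, 2000])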

Kernel Information

Related Kernels (Level 1, Task 88 • 88_MinGPTNewGelu)

Rank  Kernel Name  Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇 88_MinGPTNewGelu_shared_base_base 0.02 5.72 2.99
🥇 gelu_uniform_flow_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_indexing_base 0.02 5.72 2.99
🥇 gelu_tile_inline_base 0.02 5.72 2.99
🥇 optimized_gelu_kernel_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_base 0.02 5.72 2.99
🥇 88_mingptnewgelu_shared_tile_base 0.02 5.72 2.99
🥇 gelu_vectorized_base 0.02 5.72 2.99
🥇 gelu_modular_base_base 0.02 5.72 2.99
🥇 optimized_gelu_manual_unroll_base 0.02 5.72 2.99
🥇 modular_gelu_device_base 0.02 5.72 2.99
🥇 optimized_gelu_combined_edit_1 0.02 5.72 2.99
🥇 gelu_optimized_block_size_base 0.02 5.72 2.99
🥇 combined_gelu_modular_vectorized_edit_1 0.02 5.72 2.99
🥇 optimized_gelu_combined_base 0.02 5.72 2.99
🥇 gelu_vectorized_tuned_edit_1 0.02 5.72 2.99
🥇 modular_gelu_kernel_base 0.02 5.72 2.99
🥇 gelu_vectorized_tuned_base 0.02 5.72 2.99
🥇 gelu_vectorized_base 0.02 5.72 2.99
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

// Device function to compute GELU activation for a scalar value
__device__ __forceinline__ float compute_gelu(float x) {
    const float sqrt_2_over_pi = 0.7978845608f; // sqrt(2/pi)
    const float coeff = 0.044715f;
    float x_cubed = x * x * x;
    float inner = (x + coeff * x_cubed) * sqrt_2_over_pi;
    return 0.5f * x * (1.0f + tanhf(inner));
}

// Device function to compute GELU activation for a float4 vector
__device__ __forceinline__ float4 compute_gelu_vector(const float4 v) {
    float4 out;
    out.x = compute_gelu(v.x);
    out.y = compute_gelu(v.y);
    out.z = compute_gelu(v.z);
    out.w = compute_gelu(v.w);
    return out;
}

// Kernel to process input in vectorized float4 chunks
__global__ void gelu_kernel_vector(const float4* __restrict__ x, float4* __restrict__ y, int vec_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < vec_size) {
        // Load data using read-only cache
        float4 input = __ldg(&x[idx]);
        // Apply the modular GELU operation
        float4 output = compute_gelu_vector(input);
        y[idx] = output;
    }
}

// Fallback scalar kernel for remaining elements
__global__ void gelu_kernel_scalar(const float* __restrict__ x, float* __restrict__ y, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        y[idx] = compute_gelu(x[idx]);
    }
}

// Forward function exposed to Python
torch::Tensor gelu_forward(torch::Tensor x) {
    TORCH_CHECK(x.is_cuda(), "Input tensor must be on CUDA");
    TORCH_CHECK(x.is_contiguous(), "Input tensor must be contiguous");

    auto y = torch::empty_like(x);
    int n = x.numel();

    // Process the bulk of data using vectorized operations
    int vec_size = n / 4;  // number of float4 vectors
    int remainder = n % 4;

    const int threads = 256;
    if (vec_size > 0) {
        int blocks = (vec_size + threads - 1) / threads;
        const float4* x_vec = reinterpret_cast<const float4*>(x.data_ptr<float>());
        float4* y_vec = reinterpret_cast<float4*>(y.data_ptr<float>());
        gelu_kernel_vector<<<blocks, threads>>>(x_vec, y_vec, vec_size);
    }

    // Process any remaining elements with the scalar kernel
    if (remainder > 0) {
        int offset = vec_size * 4;
        int blocks = (remainder + threads - 1) / threads;
        gelu_kernel_scalar<<<blocks, threads>>>(x.data_ptr<float>() + offset, y.data_ptr<float>() + offset, remainder);
    }

    return y;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &gelu_forward, "Modular GELU CUDA implementation");
}
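
One way to build and sanity-check this extension is torch.utils.cpp_extension.load. The sketch below assumes the CUDA source above is saved as modular_gelu.cu; the file and module names are illustrative, not part of the submission:

import math
import torch
from torch.utils.cpp_extension import load

# JIT-compile the CUDA source above (file and module names are assumptions).
gelu_ext = load(name="modular_gelu", sources=["modular_gelu.cu"], verbose=True)

x = torch.randn(2000, 2000, device="cuda")
y = gelu_ext.forward(x)

# Compare against the eager tanh-approximation reference from the task definition.
ref = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3)))
print(torch.allclose(y, ref, atol=1e-5))  # expected: True
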
Performance Metrics
Operation | CPU Time (μs) | Device Time (μs) | Self CPU Time (μs) | Self Device Time (μs)
aten::to | 757421.37 | 1702.42 | 35.24 | 0.00
aten::_to_copy | 757386.13 | 1702.42 | 84.83 | 0.00
aten::empty_strided | 771236.03 | 0.00 | 15410.31 | 0.00
cudaDeviceGetStreamPriorityRange | 743875.29 | 0.00 | 743875.29 | 0.00
cudaLaunchKernel | 471341.29 | 19074.59 | 471341.29 | 19074.59
gelu_kernel_vector(float4 const*, float4*, int) | 0.00 | 95280.43 | 0.00 | 95280.43
cudaEventRecord | 15521.00 | 36806.85 | 15521.00 | 36806.85
aten::zero_ | 75629.24 | 545925.29 | 11954.77 | 0.00
aten::fill_ | 63675.96 | 545925.29 | 15381.36 | 545925.29
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | 0.00 | 545925.29 | 0.00 | 545925.29

All CPU, device, self CPU, and self device memory usage values are 0 B for every operation.
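
A profile of this shape can be reproduced with torch.profiler; a minimal sketch, assuming the extension from the previous snippet is already loaded as gelu_ext:

from torch.profiler import profile, ProfilerActivity

x = torch.randn(2000, 2000, device="cuda")
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    _ = gelu_ext.forward(x)
    torch.cuda.synchronize()
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
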
Status: Completed
45278 warnings generated when compiling for host.
Suppressed 45321 warnings (45274 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b5_s2_modular_gelu_device/base/base.cu:27:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
27 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b5_s2_modular_gelu_device/base/base.cu:39:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
39 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b5_s2_modular_gelu_device/base/base.cu:46:42: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
46 | torch::Tensor gelu_forward(torch::Tensor x) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b5_s2_modular_gelu_device/base/base.cu:51:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
51 | int n = x.numel();
| ^
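
A sketch of how these clang-tidy findings could be addressed, reusing compute_gelu and the headers from the listing above. This is an illustrative, scalar-only variant, not part of the submitted kernel:

// bugprone-narrowing-conversions: keep the thread index unsigned instead of
// narrowing the unsigned blockIdx/blockDim/threadIdx product into an int.
__global__ void gelu_kernel_scalar_fixed(const float* __restrict__ x, float* __restrict__ y, int n) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < static_cast<unsigned int>(n)) {
        y[idx] = compute_gelu(x[idx]);
    }
}

// performance-unnecessary-value-param: take the tensor by const reference, and
// keep numel() in int64_t so any narrowing to int is explicit.
torch::Tensor gelu_forward_fixed(const torch::Tensor& x) {
    auto y = torch::empty_like(x);
    const int64_t n = x.numel();
    const int threads = 256;
    const int blocks = static_cast<int>((n + threads - 1) / threads);
    gelu_kernel_scalar_fixed<<<blocks, threads>>>(
        x.data_ptr<float>(), y.data_ptr<float>(), static_cast<int>(n));
    return y;
}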