Kernel Details - optimized_gelu_combined_edit

import torch
import torch.nn as nn
import torch.nn.functional as F
import math


def module_fn(x: torch.Tensor) -> torch.Tensor:
    """
    Implementation of the Gaussian Error Linear Units (GELU) activation function currently in Google BERT repo (identical to OpenAI GPT).

    Args:
        x (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Output tensor.
    """
    return (
        0.5
        * x
        * (
            1.0
            + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))
        )
    )


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x, fn=module_fn):
        return fn(x)


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# From https://github.com/karpathy/minGPT/blob/master/mingpt/model.py


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x):
        return (
            0.5
            * x
            * (
                1.0
                + torch.tanh(
                    math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
                )
            )
        )


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	88_MinGPTNewGelu
Level ID	1
Task ID	88
Kernel Name	optimized_gelu_combined_edit_1
CUDA Speedup (Native)	5.716x
CUDA Speedup (Compile)	2.987x
CUDA Runtime	0.017 ms
PyTorch Runtime (Native)	0.097 ms
PyTorch Runtime (Compile)	0.051 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0
Temperature	0.00

View Experiment Progress Details

Related Kernels (Level 1, Task 88 • 88_MinGPTNewGelu)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	88_MinGPTNewGelu_shared_base_base	0.02	5.72	2.99
🥇	gelu_uniform_flow_base	0.02	5.72	2.99
🥇	gelu_kernel_optimized_indexing_base	0.02	5.72	2.99
🥇	gelu_tile_inline_base	0.02	5.72	2.99
🥇	optimized_gelu_kernel_base	0.02	5.72	2.99
🥇	gelu_kernel_optimized_base	0.02	5.72	2.99
🥇	gelu_kernel_optimized_base	0.02	5.72	2.99
🥇	88_mingptnewgelu_shared_tile_base	0.02	5.72	2.99
🥇	gelu_vectorized_base	0.02	5.72	2.99
🥇	gelu_modular_base_base	0.02	5.72	2.99
🥇	optimized_gelu_manual_unroll_base	0.02	5.72	2.99
🥇	modular_gelu_device_base	0.02	5.72	2.99
🥇	optimized_gelu_combined_edit_1	0.02	5.72	2.99
🥇	gelu_optimized_block_size_base	0.02	5.72	2.99
🥇	combined_gelu_modular_vectorized_edit_1	0.02	5.72	2.99
🥇	optimized_gelu_combined_base	0.02	5.72	2.99
🥇	gelu_vectorized_tuned_edit_1	0.02	5.72	2.99
🥇	modular_gelu_kernel_base	0.02	5.72	2.99
🥇	gelu_vectorized_tuned_base	0.02	5.72	2.99
🥇	gelu_vectorized_base	0.02	5.72	2.99

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

// Device inline function to compute GELU activation.
__device__ __forceinline__ float gelu_activation(float x) {
    const float sqrt_2_over_pi = 0.7978845608f; // sqrt(2/pi)
    const float coeff = 0.044715f;
    float x_cubed = x * x * x;
    float inner = (x + coeff * x_cubed) * sqrt_2_over_pi;
    // Use faster approximation for tanh using exponential intrinsics
    float exp2x = __expf(2.0f * inner);
    float tanh_approx = (exp2x - 1.0f) / (exp2x + 1.0f);
    return 0.5f * x * (1.0f + tanh_approx);
}

// Kernel to process input in float4 vectorized chunks
__global__ void gelu_kernel_vector(const float4* __restrict__ x, float4* __restrict__ y, int vec_size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < vec_size) {
        float4 v = __ldg(&x[i]);
        v.x = gelu_activation(v.x);
        v.y = gelu_activation(v.y);
        v.z = gelu_activation(v.z);
        v.w = gelu_activation(v.w);
        y[i] = v;
    }
}

// Fallback scalar kernel for remaining elements
__global__ void gelu_kernel_scalar(const float* __restrict__ x, float* __restrict__ y, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] = gelu_activation(x[i]);
    }
}

// Forward function accessible from Python
torch::Tensor gelu_forward(torch::Tensor x) {
    TORCH_CHECK(x.is_cuda(), "Input tensor must be on CUDA");
    TORCH_CHECK(x.is_contiguous(), "Input tensor must be contiguous");

    auto y = torch::empty_like(x);
    int n = x.numel();

    // Process most of the tensor with vectorized float4 loads/stores
    int vec_size = n / 4;  // number of float4 vectors
    int remainder = n % 4;

    const int threads = 256;
    if(vec_size > 0) {
        int blocks = (vec_size + threads - 1) / threads;
        const float4* x_vec = reinterpret_cast<const float4*>(x.data_ptr<float>());
        float4* y_vec = reinterpret_cast<float4*>(y.data_ptr<float>());
        gelu_kernel_vector<<<blocks, threads>>>(x_vec, y_vec, vec_size);
    }

    // Process any remaining elements with the scalar kernel
    if(remainder > 0) {
        int offset = vec_size * 4;
        int blocks_rem = (remainder + threads - 1) / threads;
        gelu_kernel_scalar<<<blocks_rem, threads>>>(x.data_ptr<float>() + offset, y.data_ptr<float>() + offset, remainder);
    }

    return y;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &gelu_forward, "Optimized GELU combined CUDA implementation");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples

Analysis Rules

Rule	Description

Operation / Metric	Value	Unit
aten::to
CPU Time	257717.09	μs
Device Time	1608.35	μs
Self CPU Time	34.58	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_to_copy
CPU Time	257682.51	μs
Device Time	1608.35	μs
Self CPU Time	87.76	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::empty_strided
CPU Time	272427.59	μs
Device Time	0.00	μs
Self CPU Time	16344.06	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaDeviceGetStreamPriorityRange
CPU Time	248708.83	μs
Device Time	0.00	μs
Self CPU Time	248708.83	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaLaunchKernel
CPU Time	501555.22	μs
Device Time	19766.29	μs
Self CPU Time	501555.22	μs
Self Device Time	19766.29	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
gelu_kernel_vector(float4 const, float4, int)
CPU Time	0.00	μs
Device Time	98070.52	μs
Self CPU Time	0.00	μs
Self Device Time	98070.52	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaEventRecord
CPU Time	17841.15	μs
Device Time	38138.60	μs
Self CPU Time	17841.15	μs
Self Device Time	38138.60	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::zero_
CPU Time	78504.69	μs
Device Time	566087.74	μs
Self CPU Time	13376.76	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::fill_
CPU Time	65131.15	μs
Device Time	566087.74	μs
Self CPU Time	14992.54	μs
Self Device Time	566087.74	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char, 1>)
CPU Time	0.00	μs
Device Time	566087.74	μs
Self CPU Time	0.00	μs
Self Device Time	566087.74	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45278 warnings generated when compiling for host.
Suppressed 45321 warnings (45274 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b4_s1_optimized_gelu_combined/edit_1/edit_1.cu:20:13 bugprone-narrowing-conversions

20 | int i = blockIdx.x * blockDim.x + threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b4_s1_optimized_gelu_combined/edit_1/edit_1.cu:33:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

33 | int i = blockIdx.x * blockDim.x + threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b4_s1_optimized_gelu_combined/edit_1/edit_1.cu:40:42: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

40 | torch::Tensor gelu_forward(torch::Tensor x) {

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_88/b4_s1_optimized_gelu_combined/edit_1/edit_1.cu:45:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

45 | int n = x.numel();

| ^

The AI CUDA Engineer 👷

`88_MinGPTNewGelu` • `optimized_gelu_combined_edit_1`

Kernel Information

Related Kernels (Level 1, Task 88 • 88_MinGPTNewGelu)

The AI CUDA Engineer 👷

88_MinGPTNewGelu • optimized_gelu_combined_edit_1

Kernel Information

Related Kernels (Level 1, Task 88 • 88_MinGPTNewGelu)

`88_MinGPTNewGelu` • `optimized_gelu_combined_edit_1`