
The AI CUDA Engineer 👷

88_MinGPTNewGelu • modular_gelu_kernel_base

Level 1 • Task 88
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


def module_fn(x: torch.Tensor) -> torch.Tensor:
    """
    Implementation of the Gaussian Error Linear Units (GELU) activation function currently in Google BERT repo (identical to OpenAI GPT).

    Args:
        x (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Output tensor.
    """
    return (
        0.5
        * x
        * (
            1.0
            + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))
        )
    )


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x, fn=module_fn):
        return fn(x)


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# From https://github.com/karpathy/minGPT/blob/master/mingpt/model.py


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x):
        return (
            0.5
            * x
            * (
                1.0
                + torch.tanh(
                    math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
                )
            )
        )


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []
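
Both listings above implement the tanh approximation of GELU, 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))). Since PyTorch 1.12 the same formula is available as the built-in tanh-approximate GELU, which gives a quick way to sanity-check the reference. A minimal sketch, assuming a recent PyTorch build (the tolerance is illustrative):

import torch
import torch.nn.functional as F

x = torch.randn(2000, 2000)
ref = module_fn(x)                        # hand-written tanh approximation above
builtin = F.gelu(x, approximate="tanh")   # PyTorch's built-in tanh-approximate GELU
assert torch.allclose(ref, builtin, atol=1e-5)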

Kernel Information

Related Kernels (Level 1, Task 88 • 88_MinGPTNewGelu)

Rank Kernel Name Runtime (ms) Speedup (Native) Speedup (Compile)
🥇 88_MinGPTNewGelu_shared_base_base 0.02 5.72 2.99
🥇 gelu_uniform_flow_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_indexing_base 0.02 5.72 2.99
🥇 gelu_tile_inline_base 0.02 5.72 2.99
🥇 optimized_gelu_kernel_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_base 0.02 5.72 2.99
🥇 gelu_kernel_optimized_base 0.02 5.72 2.99
🥇 88_mingptnewgelu_shared_tile_base 0.02 5.72 2.99
🥇 gelu_vectorized_base 0.02 5.72 2.99
🥇 gelu_modular_base_base 0.02 5.72 2.99
🥇 optimized_gelu_manual_unroll_base 0.02 5.72 2.99
🥇 modular_gelu_device_base 0.02 5.72 2.99
🥇 optimized_gelu_combined_edit_1 0.02 5.72 2.99
🥇 gelu_optimized_block_size_base 0.02 5.72 2.99
🥇 combined_gelu_modular_vectorized_edit_1 0.02 5.72 2.99
🥇 optimized_gelu_combined_base 0.02 5.72 2.99
🥇 gelu_vectorized_tuned_edit_1 0.02 5.72 2.99
🥇 modular_gelu_kernel_base 0.02 5.72 2.99
🥇 gelu_vectorized_tuned_base 0.02 5.72 2.99
🥇 gelu_vectorized_base 0.02 5.72 2.99
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

// Device function for computing the GELU activation
__device__ float gelu_activation(float x) {
    const float sqrt_2_over_pi = 0.7978845608f;
    const float coeff = 0.044715f;
    float x_cubed = x * x * x;
    float inner = x + coeff * x_cubed;
    inner *= sqrt_2_over_pi;
    float tanh_val = tanhf(inner);
    return 0.5f * x * (1.0f + tanh_val);
}

// Kernel that applies the GELU activation using modular device functions
__global__ void gelu_kernel_modular(const float* __restrict__ x, float* __restrict__ y, int n) {
    extern __shared__ float shared_x[];
    const int tid = threadIdx.x;
    const int blockSize = blockDim.x;
    const int base = blockIdx.x * blockSize * 4; // unroll factor of 4

    // Load data into shared memory with manual unroll
    #pragma unroll
    for (int i = 0; i < 4; i++) {
        int idx = base + tid + i * blockSize;
        if (idx < n) {
            shared_x[tid + i * blockSize] = x[idx];
        }
    }
    __syncthreads();

    // Process elements using shared memory
    #pragma unroll
    for (int i = 0; i < 4; i++) {
        int idx = base + tid + i * blockSize;
        if (idx < n) {
            y[idx] = gelu_activation(shared_x[tid + i * blockSize]);
        }
    }
}

// Host function to launch the modular GELU kernel
torch::Tensor gelu_forward_modular(torch::Tensor x) {
    TORCH_CHECK(x.is_cuda(), "Input tensor must be on CUDA");
    TORCH_CHECK(x.is_contiguous(), "Input tensor must be contiguous");
    
    auto y = torch::empty_like(x);
    int n = x.numel();
    
    const int threads = 256;
    const int unroll_factor = 4;
    int blocks = (n + threads * unroll_factor - 1) / (threads * unroll_factor);
    size_t shared_mem_size = threads * unroll_factor * sizeof(float);
    
    gelu_kernel_modular<<<blocks, threads, shared_mem_size>>>(
        x.data_ptr<float>(),
        y.data_ptr<float>(),
        n
    );
    
    return y;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &gelu_forward_modular, "Modular GELU forward CUDA implementation using device functions");
}
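
For the benchmark shape (batch_size = dim = 2000) the kernel sees n = 4,000,000 elements. With 256 threads per block and an unroll factor of 4, each block covers 1,024 elements, so the host code launches ceil(4,000,000 / 1,024) = 3,907 blocks and requests 256 * 4 * 4 B = 4 KiB of dynamic shared memory per block.

A minimal sketch of how the extension could be built and checked against the PyTorch reference, assuming the CUDA source above is saved as modular_gelu_kernel.cu (the file name and tolerance are illustrative):

import math
import torch
from torch.utils.cpp_extension import load

# Compile the source; the PYBIND11_MODULE above exposes the "forward" function.
gelu_ext = load(name="modular_gelu_kernel", sources=["modular_gelu_kernel.cu"])

def gelu_reference(x):
    # Same tanh approximation as the Python reference code.
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3))))

x = torch.randn(2000, 2000, device="cuda")
y = gelu_ext.forward(x)
assert torch.allclose(y, gelu_reference(x), atol=1e-5)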
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 2.834 inst/cycle 0.001 5
Executed Ipc Elapsed 2.272 inst/cycle 0.000 5
Issue Slots Busy 71.026 % 0.281 5
Issued Ipc Active 2.842 inst/cycle 0.000 5
SM Busy 71.026 % 0.281 5
Memory Throughput 1366419646163.334 byte/second 56180136070418857984.000 5
Mem Busy 33.810 % 0.029 5
Max Bandwidth 40.900 % 0.032 5
L1/TEX Hit Rate 0.000 % 0.000 5
L2 Hit Rate 50.320 % 0.023 5
Mem Pipes Busy 32.504 % 0.034 5
Warp Cycles Per Issued Instruction 19.098 cycle 0.001 5
Warp Cycles Per Executed Instruction 19.136 cycle 0.001 5
Avg. Active Threads Per Warp 25.350 0.000 5
Avg. Not Predicated Off Threads Per Warp 24.340 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 10.000 block 0.000 5
Block Limit Shared Mem 20.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 85.666 % 0.029 5
Achieved Active Warps Per SM 54.826 warp 0.012 5
Analysis Rules
Rule Description
INF HighPipeUtilization FMA is the highest-utilized pipeline (31.3%) based on active cycles, taking into account the rates of its different instructions. It executes 32-bit floating point (FADD, FMUL, FMAD, ...) and integer (IMUL, IMAD) operations. It is well-utilized, but should not be a bottleneck.
WRN Occupancy This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (85.8%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
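As a cross-check on the occupancy figures: achieved occupancy is the ratio of achieved active warps per SM to the theoretical maximum, i.e. 54.826 / 64 ≈ 85.67 %, matching the 85.666 % reported in the metrics table above.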
Operation / Metric Value Unit
aten::to
CPU Time 396667.22 μs
Device Time 1540.99 μs
Self CPU Time 32.85 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 396634.37 μs
Device Time 1540.99 μs
Self CPU Time 97.31 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::empty_strided
CPU Time 413144.93 μs
Device Time 0.00 μs
Self CPU Time 18120.76 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceGetStreamPriorityRange
CPU Time 394482.14 μs
Device Time 0.00 μs
Self CPU Time 394482.14 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 580984.12 μs
Device Time 22369.01 μs
Self CPU Time 580984.12 μs
Self Device Time 22369.01 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
gelu_kernel_modular(float const*, float*, int)
CPU Time 0.00 μs
Device Time 112270.07 μs
Self CPU Time 0.00 μs
Self Device Time 112270.07 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaEventRecord
CPU Time 17773.85 μs
Device Time 43174.30 μs
Self CPU Time 17773.85 μs
Self Device Time 43174.30 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 88537.97 μs
Device Time 639275.84 μs
Self CPU Time 13764.59 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 74775.35 μs
Device Time 639275.84 μs
Self CPU Time 15683.24 μs
Self Device Time 639275.84 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 639354.31 μs
Self CPU Time 0.00 μs
Self Device Time 639354.31 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45280 warnings generated when compiling for host.
Suppressed 45321 warnings (45274 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b10_s3_modular_gelu_kernel/base/base.cu:20:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
20 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b10_s3_modular_gelu_kernel/base/base.cu:21:27: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | const int blockSize = blockDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b10_s3_modular_gelu_kernel/base/base.cu:22:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
22 | const int base = blockIdx.x * blockSize * 4; // unroll factor of 4
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b10_s3_modular_gelu_kernel/base/base.cu:45:50: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
45 | torch::Tensor gelu_forward_modular(torch::Tensor x) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b10_s3_modular_gelu_kernel/base/base.cu:50:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
50 | int n = x.numel();
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b10_s3_modular_gelu_kernel/base/base.cu:55:30: warning: performing an implicit widening conversion to type 'unsigned long' of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]
55 | size_t shared_mem_size = threads * unroll_factor * sizeof(float);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b10_s3_modular_gelu_kernel/base/base.cu:55:30: note: make conversion explicit to silence this warning
55 | size_t shared_mem_size = threads * unroll_factor * sizeof(float);
| ^~~~~~~~~~~~~~~~~~~~~~~
| static_cast<unsigned long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b10_s3_modular_gelu_kernel/base/base.cu:55:30: note: perform multiplication in a wider type
55 | size_t shared_mem_size = threads * unroll_factor * sizeof(float);
| ^~~~~~~
| static_cast<long>( )