The AI CUDA Engineer 👷

88_MinGPTNewGelu • gelu_kernel_optimized_base

Level 1 • Task 88
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


def module_fn(x: torch.Tensor) -> torch.Tensor:
    """
    Implementation of the Gaussian Error Linear Units (GELU) activation function currently in Google BERT repo (identical to OpenAI GPT).

    Args:
        x (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Output tensor.
    """
    return (
        0.5
        * x
        * (
            1.0
            + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))
        )
    )


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x, fn=module_fn):
        return fn(x)


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []
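
As a quick sanity check (a minimal sketch, not part of the archived harness; it assumes module_fn from the listing above is in scope), the tanh approximation should match PyTorch's built-in approximate GELU:

import torch
import torch.nn.functional as F

x = torch.randn(2000, 2000)
# F.gelu with approximate="tanh" uses the same tanh-based formula as module_fn
assert torch.allclose(module_fn(x), F.gelu(x, approximate="tanh"), atol=1e-6)

The second listing below is the original module form from minGPT.
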
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# From https://github.com/karpathy/minGPT/blob/master/mingpt/model.py


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x):
        return (
            0.5
            * x
            * (
                1.0
                + torch.tanh(
                    math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
                )
            )
        )


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []
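
Either listing can be exercised directly; a minimal usage sketch, assuming the definitions above are in scope:

model = Model()
(x,) = get_inputs()
y = model(x)
print(y.shape)  # torch.Size([2000, 2000])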

Kernel Information

Related Kernels (Level 1, Task 88 • 88_MinGPTNewGelu)

Rank | Kernel Name | Runtime (ms) | Speedup (Native) | Speedup (Compile)
🥇 | 88_MinGPTNewGelu_shared_base_base | 0.02 | 5.72 | 2.99
🥇 | gelu_uniform_flow_base | 0.02 | 5.72 | 2.99
🥇 | gelu_kernel_optimized_indexing_base | 0.02 | 5.72 | 2.99
🥇 | gelu_tile_inline_base | 0.02 | 5.72 | 2.99
🥇 | optimized_gelu_kernel_base | 0.02 | 5.72 | 2.99
🥇 | gelu_kernel_optimized_base | 0.02 | 5.72 | 2.99
🥇 | gelu_kernel_optimized_base | 0.02 | 5.72 | 2.99
🥇 | 88_mingptnewgelu_shared_tile_base | 0.02 | 5.72 | 2.99
🥇 | gelu_vectorized_base | 0.02 | 5.72 | 2.99
🥇 | gelu_modular_base_base | 0.02 | 5.72 | 2.99
🥇 | optimized_gelu_manual_unroll_base | 0.02 | 5.72 | 2.99
🥇 | modular_gelu_device_base | 0.02 | 5.72 | 2.99
🥇 | optimized_gelu_combined_edit_1 | 0.02 | 5.72 | 2.99
🥇 | gelu_optimized_block_size_base | 0.02 | 5.72 | 2.99
🥇 | combined_gelu_modular_vectorized_edit_1 | 0.02 | 5.72 | 2.99
🥇 | optimized_gelu_combined_base | 0.02 | 5.72 | 2.99
🥇 | gelu_vectorized_tuned_edit_1 | 0.02 | 5.72 | 2.99
🥇 | modular_gelu_kernel_base | 0.02 | 5.72 | 2.99
🥇 | gelu_vectorized_tuned_base | 0.02 | 5.72 | 2.99
🥇 | gelu_vectorized_base | 0.02 | 5.72 | 2.99
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

// Device function for GELU activation computation
__device__ float compute_gelu(float x) {
    const float sqrt_2_over_pi = 0.7978845608f;  // sqrt(2 / pi)
    const float coeff = 0.044715f;               // cubic-term coefficient from the GELU paper
    float x_cubed = x * x * x;
    float inner = x + coeff * x_cubed;
    inner *= sqrt_2_over_pi;
    float tanh_val = tanhf(inner);
    return 0.5f * x * (1.0f + tanh_val);
}

// Optimized kernel that applies the GELU activation using shared memory
__global__ void gelu_kernel_optimized(const float* __restrict__ x, float* __restrict__ y, int n) {
    extern __shared__ float shared_x[];

    const int tid = threadIdx.x;
    const int gid = blockIdx.x * blockDim.x * 4 + tid;

    // Load data into shared memory
    #pragma unroll
    for (int i = 0; i < 4; i++) {
        const int idx = gid + i * blockDim.x;
        if (idx < n) {
            shared_x[tid + i * blockDim.x] = x[idx];
        }
    }
    __syncthreads();

    // Process elements using shared memory
    #pragma unroll
    for (int i = 0; i < 4; i++) {
        const int idx = gid + i * blockDim.x;
        if (idx < n) {
            y[idx] = compute_gelu(shared_x[tid + i * blockDim.x]);
        }
    }
}

// Torch binding to launch optimized GELU kernel
torch::Tensor gelu_forward(torch::Tensor x) {
    TORCH_CHECK(x.is_cuda(), "Input tensor must be on CUDA");
    TORCH_CHECK(x.is_contiguous(), "Input tensor must be contiguous");
    
    auto y = torch::empty_like(x);
    int n = x.numel();
    
    const int threads = 256;
    // Each thread processes 4 elements, so round the grid size up by threads * 4
    int blocks = (n + threads * 4 - 1) / (threads * 4);
    
    // Allocate shared memory for the block
    size_t shared_mem_size = threads * 4 * sizeof(float);
    
    gelu_kernel_optimized<<<blocks, threads, shared_mem_size>>>(
        x.data_ptr<float>(),
        y.data_ptr<float>(),
        n
    );
    
    return y;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &gelu_forward, "Optimized GELU forward CUDA implementation");
}
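
Since the listing defines its own PYBIND11_MODULE, one way to try it standalone is torch.utils.cpp_extension.load. This is a sketch, not part of the archive; the file and extension names are illustrative assumptions:

import torch
from torch.utils.cpp_extension import load

# Assumes the CUDA listing above has been saved as gelu_kernel_optimized.cu
# (hypothetical file name).
mod = load(name="gelu_kernel_optimized", sources=["gelu_kernel_optimized.cu"])

x = torch.randn(2000, 2000, device="cuda")
y = mod.forward(x)
# Cross-check against PyTorch's tanh-approximate GELU
print(torch.allclose(y, torch.nn.functional.gelu(x, approximate="tanh"), atol=1e-5))
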
Performance Metrics
Metric | Value | Unit | Variance | Samples
Executed Ipc Active | 2.856 | inst/cycle | 0.000 | 5
Executed Ipc Elapsed | 2.256 | inst/cycle | 0.000 | 5
Issue Slots Busy | 71.524 | % | 0.027 | 5
Issued Ipc Active | 2.862 | inst/cycle | 0.000 | 5
SM Busy | 71.524 | % | 0.027 | 5
Memory Throughput | 1361378074709.264 | byte/second | 131282521743893102592.000 | 5
Mem Busy | 33.626 | % | 0.083 | 5
Max Bandwidth | 40.700 | % | 0.134 | 5
L1/TEX Hit Rate | 0.000 | % | 0.000 | 5
L2 Hit Rate | 50.540 | % | 0.043 | 5
Mem Pipes Busy | 32.252 | % | 0.037 | 5
Warp Cycles Per Issued Instruction | 19.050 | cycle | 0.006 | 5
Warp Cycles Per Executed Instruction | 19.090 | cycle | 0.006 | 5
Avg. Active Threads Per Warp | 25.350 | | 0.000 | 5
Avg. Not Predicated Off Threads Per Warp | 24.340 | | 0.000 | 5
Max Active Clusters | 0.000 | cluster | 0.000 | 5
Max Cluster Size | 8.000 | block | 0.000 | 5
Overall GPU Occupancy | 0.000 | % | 0.000 | 5
Cluster Occupancy | 0.000 | % | 0.000 | 5
Block Limit SM | 32.000 | block | 0.000 | 5
Block Limit Registers | 10.000 | block | 0.000 | 5
Block Limit Shared Mem | 20.000 | block | 0.000 | 5
Block Limit Warps | 8.000 | block | 0.000 | 5
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5
Theoretical Occupancy | 100.000 | % | 0.000 | 5
Achieved Occupancy | 85.600 | % | 0.005 | 5
Achieved Active Warps Per SM | 54.784 | warp | 0.002 | 5
Analysis Rules
Rule Description
INF HighPipeUtilization FMA is the highest-utilized pipeline (31.6%) based on active cycles, taking into account the rates of its different instructions. It executes 32-bit floating point (FADD, FMUL, FMAD, ...) and integer (IMUL, IMAD) operations. It is well-utilized, but should not be a bottleneck.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (85.6%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
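
The occupancy gap flagged above follows directly from the warp counts in the metrics table; a one-line check:

theoretical_warps = 64.0  # Theoretical Active Warps per SM, from the table
achieved_warps = 54.784   # Achieved Active Warps Per SM, from the table
print(achieved_warps / theoretical_warps)  # 0.856, i.e. the 85.6% achieved occupancy
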
Operation / Metric | Value | Unit
aten::to
CPU Time 306800.69 μs
Device Time 1644.47 μs
Self CPU Time 39.81 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 306760.88 μs
Device Time 1644.47 μs
Self CPU Time 89.30 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::empty_strided
CPU Time 322656.93 μs
Device Time 0.00 μs
Self CPU Time 17350.13 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceGetStreamPriorityRange
CPU Time 303589.51 μs
Device Time 0.00 μs
Self CPU Time 303589.51 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 522395.96 μs
Device Time 21118.46 μs
Self CPU Time 522395.96 μs
Self Device Time 21118.46 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
gelu_kernel_optimized(float const*, float*, int)
CPU Time 0.00 μs
Device Time 105522.71 μs
Self CPU Time 0.00 μs
Self Device Time 105522.71 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaEventRecord
CPU Time 18684.16 μs
Device Time 40594.80 μs
Self CPU Time 18684.16 μs
Self Device Time 40594.80 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 80223.35 μs
Device Time 601821.36 μs
Self CPU Time 11994.76 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 68233.18 μs
Device Time 601821.36 μs
Self CPU Time 14891.09 μs
Self Device Time 601821.36 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 601821.36 μs
Self CPU Time 0.00 μs
Self Device Time 601821.36 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
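
As a rough back-of-the-envelope check on the Memory Throughput figure above (the runtime is taken from the related-kernels table, so treat this as an estimate):

n = 2000 * 2000                 # elements per launch, from get_inputs()
bytes_moved = n * 4 * 2         # one float32 read plus one float32 write per element
runtime_s = 0.02e-3             # ~0.02 ms runtime from the leaderboard table
print(bytes_moved / runtime_s)  # ~1.6e12 B/s, the same order as the measured ~1.36 TB/s
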
Status: Completed
45281 warnings generated when compiling for host.
Suppressed 45321 warnings (45274 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b4_s2_gelu_kernel_optimized/base/base.cu:21:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b4_s2_gelu_kernel_optimized/base/base.cu:22:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
22 | const int gid = blockIdx.x * blockDim.x * 4 + tid;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b4_s2_gelu_kernel_optimized/base/base.cu:27:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
27 | const int idx = gid + i * blockDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b4_s2_gelu_kernel_optimized/base/base.cu:37:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
37 | const int idx = gid + i * blockDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b4_s2_gelu_kernel_optimized/base/base.cu:45:42: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
45 | torch::Tensor gelu_forward(torch::Tensor x) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b4_s2_gelu_kernel_optimized/base/base.cu:50:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
50 | int n = x.numel();
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b4_s2_gelu_kernel_optimized/base/base.cu:56:30: warning: performing an implicit widening conversion to type 'unsigned long' of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]
56 | size_t shared_mem_size = threads * 4 * sizeof(float);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b4_s2_gelu_kernel_optimized/base/base.cu:56:30: note: make conversion explicit to silence this warning
56 | size_t shared_mem_size = threads * 4 * sizeof(float);
| ^~~~~~~~~~~
| static_cast<unsigned long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b4_s2_gelu_kernel_optimized/base/base.cu:56:30: note: perform multiplication in a wider type
56 | size_t shared_mem_size = threads * 4 * sizeof(float);
| ^~~~~~~
| static_cast<long>( )