
The AI CUDA Engineer 👷

88_MinGPTNewGelu • 88_MinGPTNewGelu_shared_base_base

Level 1 • Task 88
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


def module_fn(x: torch.Tensor) -> torch.Tensor:
    """
    Implementation of the Gaussian Error Linear Units (GELU) activation function currently in Google BERT repo (identical to OpenAI GPT).

    Args:
        x (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Output tensor.
    """
    return (
        0.5
        * x
        * (
            1.0
            + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))
        )
    )


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x, fn=module_fn):
        return fn(x)


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []
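
Since PyTorch 1.12, the same tanh approximation is built into torch.nn.functional.gelu via approximate="tanh", which gives a quick way to sanity-check the formula above. A minimal check (a sketch, assuming module_fn, batch_size, and dim from the listing above are in scope):

import torch
import torch.nn.functional as F

x = torch.randn(batch_size, dim)
# The hand-written tanh formula should agree with PyTorch's built-in
# tanh-approximated GELU to within float32 rounding error.
assert torch.allclose(module_fn(x), F.gelu(x, approximate="tanh"), atol=1e-6)

The original minGPT form of the same module, from which the task takes its name, follows.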
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# From https://github.com/karpathy/minGPT/blob/master/mingpt/model.py


class Model(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x):
        return (
            0.5
            * x
            * (
                1.0
                + torch.tanh(
                    math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
                )
            )
        )


batch_size = 2000
dim = 2000


def get_inputs():
    return [torch.randn(batch_size, dim)]


def get_init_inputs():
    return []

Kernel Information

Related Kernels (Level 1, Task 88 • 88_MinGPTNewGelu)

Rank | Kernel Name | Runtime (ms) | Speedup (Native) | Speedup (Compile)
🥇 | 88_MinGPTNewGelu_shared_base_base | 0.02 | 5.72 | 2.99
🥇 | gelu_uniform_flow_base | 0.02 | 5.72 | 2.99
🥇 | gelu_kernel_optimized_indexing_base | 0.02 | 5.72 | 2.99
🥇 | gelu_tile_inline_base | 0.02 | 5.72 | 2.99
🥇 | optimized_gelu_kernel_base | 0.02 | 5.72 | 2.99
🥇 | gelu_kernel_optimized_base | 0.02 | 5.72 | 2.99
🥇 | gelu_kernel_optimized_base | 0.02 | 5.72 | 2.99
🥇 | 88_mingptnewgelu_shared_tile_base | 0.02 | 5.72 | 2.99
🥇 | gelu_vectorized_base | 0.02 | 5.72 | 2.99
🥇 | gelu_modular_base_base | 0.02 | 5.72 | 2.99
🥇 | optimized_gelu_manual_unroll_base | 0.02 | 5.72 | 2.99
🥇 | modular_gelu_device_base | 0.02 | 5.72 | 2.99
🥇 | optimized_gelu_combined_edit_1 | 0.02 | 5.72 | 2.99
🥇 | gelu_optimized_block_size_base | 0.02 | 5.72 | 2.99
🥇 | combined_gelu_modular_vectorized_edit_1 | 0.02 | 5.72 | 2.99
🥇 | optimized_gelu_combined_base | 0.02 | 5.72 | 2.99
🥇 | gelu_vectorized_tuned_edit_1 | 0.02 | 5.72 | 2.99
🥇 | modular_gelu_kernel_base | 0.02 | 5.72 | 2.99
🥇 | gelu_vectorized_tuned_base | 0.02 | 5.72 | 2.99
🥇 | gelu_vectorized_base | 0.02 | 5.72 | 2.99
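
The speedup columns compare each kernel against PyTorch baselines, so the baseline runtimes can be recovered by multiplication. A rough reading of the top row (a sketch; the mapping of the two columns to an eager-PyTorch baseline and a torch.compile baseline is an assumption):

runtime_ms = 0.02
native_baseline_ms = runtime_ms * 5.72   # ~0.114 ms, assumed eager baseline
compile_baseline_ms = runtime_ms * 2.99  # ~0.060 ms, assumed compiled baseline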
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

__global__ void gelu_kernel_shared(const float* __restrict__ x, float* __restrict__ y, int n) {
    const float sqrt_2_over_pi = 0.7978845608f;
    const float coeff = 0.044715f;
    
    extern __shared__ float shared_x[];
    
    const int tid = threadIdx.x;
    // Each block covers a contiguous tile of blockDim.x * 4 elements,
    // with each thread handling 4 elements spaced blockDim.x apart.
    const int gid = blockIdx.x * blockDim.x * 4 + tid;
    
    // Load data into shared memory
    #pragma unroll
    for (int i = 0; i < 4; i++) {
        const int idx = gid + i * blockDim.x;
        if (idx < n) {
            shared_x[tid + i * blockDim.x] = x[idx];
        }
    }
    // Wait until the whole tile is loaded before any thread reads it
    __syncthreads();
    
    // Process elements using shared memory
    #pragma unroll
    for (int i = 0; i < 4; i++) {
        const int idx = gid + i * blockDim.x;
        if (idx < n) {
            float xi = shared_x[tid + i * blockDim.x];
            float x_cubed = xi * xi * xi;
            float inner = xi + coeff * x_cubed;
            inner *= sqrt_2_over_pi;
            float tanh_val = tanhf(inner);
            y[idx] = 0.5f * xi * (1.0f + tanh_val);
        }
    }
}

torch::Tensor gelu_forward(torch::Tensor x) {
    TORCH_CHECK(x.is_cuda(), "Input tensor must be on CUDA");
    TORCH_CHECK(x.is_contiguous(), "Input tensor must be contiguous");
    
    auto y = torch::empty_like(x);
    int n = x.numel();
    
    const int threads = 256;
    // Ceil-divide: each block processes threads * 4 elements
    int blocks = (n + threads * 4 - 1) / (threads * 4);
    
    // Allocate shared memory for the block
    size_t shared_mem_size = threads * 4 * sizeof(float);
    
    gelu_kernel_shared<<<blocks, threads, shared_mem_size>>>(
        x.data_ptr<float>(),
        y.data_ptr<float>(),
        n
    );
    
    return y;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &gelu_forward, "GELU forward CUDA implementation with shared memory");
}
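
One way to exercise this kernel from Python is torch.utils.cpp_extension.load_inline, which compiles the sources at call time and auto-generates the bindings. A minimal sketch, assuming the CUDA source above is held in a string cuda_src with the PYBIND11_MODULE block removed (load_inline emits its own module from the functions argument):

import torch
from torch.utils.cpp_extension import load_inline

# cuda_src: gelu_kernel_shared and gelu_forward from above,
# minus the PYBIND11_MODULE block
ext = load_inline(
    name="gelu_shared",
    cpp_sources="torch::Tensor gelu_forward(torch::Tensor x);",
    cuda_sources=cuda_src,
    functions=["gelu_forward"],
)

x = torch.randn(2000, 2000, device="cuda")
y = ext.gelu_forward(x)  # should match the tanh GELU reference above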
Performance Metrics
Metric | Value | Unit | Variance | Samples
Executed Ipc Active | 2.852 | inst/cycle | 0.000 | 5
Executed Ipc Elapsed | 2.264 | inst/cycle | 0.001 | 5
Issue Slots Busy | 71.460 | % | 0.025 | 5
Issued Ipc Active | 2.860 | inst/cycle | 0.000 | 5
SM Busy | 71.460 | % | 0.025 | 5
Memory Throughput | 1360166362187.542 | byte/second | 124489954523502608384.000 | 5
Mem Busy | 33.646 | % | 0.084 | 5
Max Bandwidth | 40.706 | % | 0.106 | 5
L1/TEX Hit Rate | 0.000 | % | 0.000 | 5
L2 Hit Rate | 50.362 | % | 0.015 | 5
Mem Pipes Busy | 32.346 | % | 0.114 | 5
Warp Cycles Per Issued Instruction | 19.090 | cycle | 0.008 | 5
Warp Cycles Per Executed Instruction | 19.132 | cycle | 0.008 | 5
Avg. Active Threads Per Warp | 25.350 | | 0.000 | 5
Avg. Not Predicated Off Threads Per Warp | 24.340 | | 0.000 | 5
Max Active Clusters | 0.000 | cluster | 0.000 | 5
Max Cluster Size | 8.000 | block | 0.000 | 5
Overall GPU Occupancy | 0.000 | % | 0.000 | 5
Cluster Occupancy | 0.000 | % | 0.000 | 5
Block Limit SM | 32.000 | block | 0.000 | 5
Block Limit Registers | 10.000 | block | 0.000 | 5
Block Limit Shared Mem | 20.000 | block | 0.000 | 5
Block Limit Warps | 8.000 | block | 0.000 | 5
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5
Theoretical Occupancy | 100.000 | % | 0.000 | 5
Achieved Occupancy | 85.626 | % | 0.006 | 5
Achieved Active Warps Per SM | 54.800 | warp | 0.003 | 5
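
These throughput figures line up with the leaderboard runtime: the kernel reads and writes each float32 element exactly once. A back-of-the-envelope check against the table (a sketch, not profiler output):

n = 2000 * 2000                     # batch_size * dim elements per launch
bytes_moved = n * 4 * 2             # float32, one read + one write = 32 MB
throughput = 1_360_166_362_187      # byte/second, Memory Throughput above
print(bytes_moved / throughput * 1e6)  # ~23.5 us, close to the ~0.02 ms runtime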
Analysis Rules
Rule | Description
INF HighPipeUtilization | FMA is the highest-utilized pipeline (31.6%) based on active cycles, taking into account the rates of its different instructions. It executes 32-bit floating point (FADD, FMUL, FMAD, ...) and integer (IMUL, IMAD) operations. It is well-utilized, but should not be a bottleneck.
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (85.6%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
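
The achieved figure in that warning follows directly from the warp counts in the metrics table:

$$\text{Achieved Occupancy} = \frac{\text{Achieved Active Warps Per SM}}{\text{Theoretical Active Warps per SM}} = \frac{54.8}{64} \approx 85.6\,\%$$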
Operation | CPU Time (μs) | Device Time (μs) | Self CPU Time (μs) | Self Device Time (μs)
aten::to | 596227.59 | 1562.02 | 41.82 | 0.00
aten::_to_copy | 596185.77 | 1562.02 | 105.73 | 0.00
aten::empty_strided | 612907.76 | 0.00 | 16876.48 | 0.00
cudaDeviceGetStreamPriorityRange | 590394.31 | 0.00 | 590394.31 | 0.00
cudaLaunchKernel | 538657.61 | 20807.95 | 538657.61 | 20807.95
gelu_kernel_shared(float const*, float*, int) | 0.00 | 104542.62 | 0.00 | 104542.62
cudaEventRecord | 18399.08 | 40191.97 | 18399.08 | 40191.97
aten::zero_ | 83305.24 | 596310.49 | 13034.94 | 0.00
aten::fill_ | 70271.55 | 596310.49 | 15149.01 | 596310.49
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | 0.00 | 596310.49 | 0.00 | 596310.49

CPU Memory Usage, Device Memory Usage, Self CPU Memory Usage, and Self Device Memory Usage are 0 B for every operation above.
Status: Completed
45281 warnings generated when compiling for host.
Suppressed 45321 warnings (45274 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b2_s3_88_MinGPTNewGelu_shared_base/base/base.cu:12:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
12 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b2_s3_88_MinGPTNewGelu_shared_base/base/base.cu:13:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
13 | const int gid = blockIdx.x * blockDim.x * 4 + tid;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b2_s3_88_MinGPTNewGelu_shared_base/base/base.cu:18:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
18 | const int idx = gid + i * blockDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b2_s3_88_MinGPTNewGelu_shared_base/base/base.cu:28:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
28 | const int idx = gid + i * blockDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b2_s3_88_MinGPTNewGelu_shared_base/base/base.cu:40:42: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
40 | torch::Tensor gelu_forward(torch::Tensor x) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b2_s3_88_MinGPTNewGelu_shared_base/base/base.cu:45:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
45 | int n = x.numel();
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b2_s3_88_MinGPTNewGelu_shared_base/base/base.cu:51:30: warning: performing an implicit widening conversion to type 'unsigned long' of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]
51 | size_t shared_mem_size = threads * 4 * sizeof(float);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b2_s3_88_MinGPTNewGelu_shared_base/base/base.cu:51:30: note: make conversion explicit to silence this warning
51 | size_t shared_mem_size = threads * 4 * sizeof(float);
| ^~~~~~~~~~~
| static_cast<unsigned long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_88/b2_s3_88_MinGPTNewGelu_shared_base/base/base.cu:51:30: note: perform multiplication in a wider type
51 | size_t shared_mem_size = threads * 4 * sizeof(float);
| ^~~~~~~
| static_cast<long>( )