← Back to Leaderboard

The AI CUDA Engineer 👷

12_Gemm_Multiply_LeakyReLU12_gemm_warp_primitives_base

Level 2 • Task 12
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    multiplier: float,
    negative_slope: float,
    weight: torch.Tensor,
    bias: torch.Tensor,
) -> torch.Tensor:
    """
    Applies linear transformation, multiplies by scalar, and applies LeakyReLU.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_features)
        multiplier (float): Scalar multiplier
        negative_slope (float): Negative slope for LeakyReLU
        weight (torch.Tensor): Weight matrix of shape (out_features, in_features)
        bias (torch.Tensor): Bias vector of shape (out_features)

    Returns:
        torch.Tensor: Output tensor of shape (batch_size, out_features)
    """
    x = F.linear(x, weight, bias)
    x = x * multiplier
    x = F.leaky_relu(x, negative_slope=negative_slope)
    return x


class Model(nn.Module):
    """
    Simple model that performs a Gemm, multiplies the result, and applies LeakyReLU.
    """

    def __init__(self, in_features, out_features, multiplier, negative_slope):
        super(Model, self).__init__()
        gemm = nn.Linear(in_features, out_features)
        self.weight = gemm.weight
        self.bias = gemm.bias

    def forward(self, x, fn=module_fn):
        return fn(x, multiplier, negative_slope, self.weight, self.bias)


batch_size = 128
in_features = 1024
out_features = 512
multiplier = 2.0
negative_slope = 0.1


def get_inputs():
    return [torch.randn(batch_size, in_features)]


def get_init_inputs():
    return [in_features, out_features, multiplier, negative_slope]
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs a Gemm, multiplies the result, and applies LeakyReLU.
    """
    def __init__(self, in_features, out_features, multiplier, negative_slope):
        super(Model, self).__init__()
        self.gemm = nn.Linear(in_features, out_features)
        self.multiplier = multiplier
        self.leaky_relu = nn.LeakyReLU(negative_slope)

    def forward(self, x):
        x = self.gemm(x)
        x = x * self.multiplier
        x = self.leaky_relu(x)
        return x

batch_size = 128
in_features = 1024
out_features = 512
multiplier = 2.0
negative_slope = 0.1

def get_inputs():
    return [torch.randn(batch_size, in_features)]

def get_init_inputs():
    return [in_features, out_features, multiplier, negative_slope]

Kernel Information

Related Kernels (Level 2, Task 12 • 12_Gemm_Multiply_LeakyReLU)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 12_gemm_warp_primitives_base 0.03 1.30 2.32
🥈 12_gemm_ldg_optimization_base 0.04 1.18 2.12
🥈 12_gemm_warp_vec4_edit_1_base_edit_1 0.04 1.18 2.12
🥈 12_gemm_ldg_optimization_edit_1 0.04 1.18 2.12
5 12_gemm_warp_vec4_edit_1_base_base 0.04 1.04 1.86
6 gemm_tiled_grid_edit_1 0.05 0.83 1.49
7 12_gemm_constant_memory_edit_1 0.05 0.80 1.43
8 gemm_unrolled_base 0.05 0.77 1.38
8 gemm_unrolled_edit_1 0.05 0.77 1.38
8 12_gemm_constant_memory_base 0.05 0.77 1.38
11 gemm_tiled_grid_block_32_base 0.07 0.60 1.08
12 gemm_tiled_grid_base 0.07 0.59 1.06
12 gemm_tiled_shared_base 0.07 0.59 1.06
12 gemm_tiled_grid_block_32_edit_1 0.07 0.59 1.06
15 12_gemm_tiled_coalesced_edit_1 0.07 0.58 1.05
16 gemm_tiled_streamed_base 0.07 0.56 1.00
17 optimized_gemm_leakyrelu_base 0.07 0.55 0.99
17 atomic_optimization_gemm_edit_1 0.07 0.55 0.99
17 optimized_gemm_leakyrelu_edit_1 0.07 0.55 0.99
20 optimized_thread_block_indexing_gemm_base 0.08 0.51 0.91
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

__device__ __inline__ float warp_reduce_sum(float val) {
    for (int offset = 16; offset > 0; offset /= 2) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}

__global__ void module_fn_kernel(
    const float* __restrict__ x,
    const float* __restrict__ weight,
    const float* __restrict__ bias,
    float* __restrict__ output,
    const int batch_size,
    const int in_features,
    const int out_features,
    const float multiplier,
    const float negative_slope
) {
    const int row = blockIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;
    const int lane_id = threadIdx.x;
    
    if (row >= batch_size || col >= out_features) return;

    const float* x_row = x + row * in_features;
    const float* weight_col = weight + col * in_features;
    
    float thread_sum = 0.0f;
    for (int k = lane_id; k < in_features; k += 32) {
        thread_sum += x_row[k] * weight_col[k];
    }
    
    float sum = warp_reduce_sum(thread_sum);
    
    if (lane_id == 0) {
        sum += bias[col];
        sum *= multiplier;
        output[row * out_features + col] = sum > 0 ? sum : sum * negative_slope;
    }
}

torch::Tensor module_fn_forward(
    torch::Tensor x,
    float multiplier,
    float negative_slope,
    torch::Tensor weight,
    torch::Tensor bias
) {
    TORCH_CHECK(x.device().is_cuda(), "x must be a CUDA tensor");
    TORCH_CHECK(weight.device().is_cuda(), "weight must be a CUDA tensor");
    TORCH_CHECK(bias.device().is_cuda(), "bias must be a CUDA tensor");

    const int batch_size = x.size(0);
    const int in_features = x.size(1);
    const int out_features = weight.size(0);

    TORCH_CHECK(weight.size(1) == in_features, "Weight in_features must match x in_features");
    TORCH_CHECK(bias.size(0) == out_features, "Bias size must match weight out_features");

    auto output = torch::zeros({batch_size, out_features}, x.options());

    dim3 block(32, 16);
    dim3 grid(
        batch_size,
        (out_features + block.y - 1) / block.y
    );

    module_fn_kernel<<<grid, block>>>(
        x.data_ptr<float>(),
        weight.data_ptr<float>(),
        bias.data_ptr<float>(),
        output.data_ptr<float>(),
        batch_size,
        in_features,
        out_features,
        multiplier,
        negative_slope
    );

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &module_fn_forward, "Module function forward CUDA with warp primitives");
}
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 2.274 inst/cycle 0.000 5
Executed Ipc Elapsed 2.078 inst/cycle 0.000 5
Issue Slots Busy 56.968 % 0.022 5
Issued Ipc Active 2.278 inst/cycle 0.000 5
SM Busy 56.968 % 0.022 5
Memory Throughput 85160445830.716 byte/second 246532052853514400.000 5
Mem Busy 77.578 % 0.088 5
Max Bandwidth 75.448 % 0.091 5
L1/TEX Hit Rate 53.290 % 0.034 5
L2 Hit Rate 96.776 % 1.255 5
Mem Pipes Busy 75.448 % 0.091 5
Warp Cycles Per Issued Instruction 25.522 cycle 0.019 5
Warp Cycles Per Executed Instruction 25.560 cycle 0.019 5
Avg. Active Threads Per Warp 29.690 0.000 5
Avg. Not Predicated Off Threads Per Warp 28.880 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 4.000 block 0.000 5
Block Limit Shared Mem 16.000 block 0.000 5
Block Limit Warps 4.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 90.940 % 0.019 5
Achieved Active Warps Per SM 58.198 warp 0.008 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (22.6%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
INF Occupancy This kernel's theoretical occupancy is not impacted by any block limit.
Operation / Metric Value Unit
aten::to
CPU Time 427551.51 μs
Device Time 201.92 μs
Self CPU Time 64.12 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zeros
CPU Time 5527234.80 μs
Device Time 111875.26 μs
Self CPU Time 128742.20 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 6689145.67 μs
Device Time 6241978.99 μs
Self CPU Time 252220.72 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 6436927.04 μs
Device Time 6241978.99 μs
Self CPU Time 348120.83 μs
Self Device Time 6241978.99 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 6378798.01 μs
Device Time 271994.24 μs
Self CPU Time 6378798.01 μs
Self Device Time 271994.24 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
module_fn_kernel(float const*, float const*, float const*, float*, int, int, int, float, float)
CPU Time 0.00 μs
Device Time 2106335.81 μs
Self CPU Time 0.00 μs
Self Device Time 2106335.81 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 6130103.74 μs
Self CPU Time 0.00 μs
Self Device Time 6130103.74 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45292 warnings generated when compiling for host.
Suppressed 45325 warnings (45278 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:13:5 bugprone-easily-swappable-parameters
13 | const float* __restrict__ x,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
14 | const float* __restrict__ weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
15 | const float* __restrict__ bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:13:31: note: the first parameter in the range is 'x'
13 | const float* __restrict__ x,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:15:31: note: the last parameter in the range is 'bias'
15 | const float* __restrict__ bias,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:17:5: warning: 2 adjacent parameters of 'module_fn_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
17 | const int batch_size,
| ^~~~~~~~~~~~~~~~~~~~~
18 | const int in_features,
| ~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:17:15: note: the first parameter in the range is 'batch_size'
17 | const int batch_size,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:18:15: note: the last parameter in the range is 'in_features'
18 | const int in_features,
| ^~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:19:5: warning: 2 adjacent parameters of 'module_fn_kernel' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
19 | const int out_features,
| ^~~~~~~~~~~~~~~~~~~~~~~
20 | const float multiplier,
| ~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:19:15: note: the first parameter in the range is 'out_features'
19 | const int out_features,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:20:17: note: the last parameter in the range is 'multiplier'
20 | const float multiplier,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:20:5: note: 'const int' and 'const float' may be implicitly converted: 'const int' (as 'int') -> 'const float' (as 'float'), 'const float' (as 'float') -> 'const int' (as 'int')
20 | const float multiplier,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:23:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
23 | const int row = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:24:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
24 | const int col = blockIdx.y * blockDim.y + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:25:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
25 | const int lane_id = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:29:26: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
29 | const float* x_row = x + row * in_features;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:29:30: note: make conversion explicit to silence this warning
4 | const float* x_row = x + row * in_features;
| ^~~~~~~~~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:29:30: note: perform multiplication in a wider type
29 | const float* x_row = x + row * in_features;
| ^~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:30:31: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
30 | const float* weight_col = weight + col * in_features;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:30:40: note: make conversion explicit to silence this warning
30 | const float* weight_col = weight + col * in_features;
| ^~~~~~~~~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:30:40: note: perform multiplication in a wider type
30 | const float* weight_col = weight + col * in_features;
| ^~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:47:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
47 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:50:19: warning: the parameter 'weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
50 | torch::Tensor weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:51:19: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
51 | torch::Tensor bias
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:57:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
57 | const int batch_size = x.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:58:29: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
58 | const int in_features = x.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_12/b3_s1_12_gemm_warp_primitives/base/base.cu:59:30: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
59 | const int out_features = weight.size(0);
| ^