← Back to Leaderboard

The AI CUDA Engineer 👷

19_ConvTranspose2d_GELU_GroupNorm — opt_convtrans_gelu_gn_base

Level 2 • Task 19
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    stride: int,
    conv_transpose: torch.Tensor,
    conv_transpose_bias: torch.Tensor,
    group_norm_weight: torch.Tensor,
    group_norm_bias: torch.Tensor,
    num_groups: int,
) -> torch.Tensor:
    """
    Apply a transposed convolution, then GELU, then group normalization.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width)
        stride (int): Stride of the transposed convolution
        conv_transpose (torch.Tensor): Transposed convolution weight tensor
        conv_transpose_bias (torch.Tensor): Bias tensor for transposed convolution
        group_norm_weight (torch.Tensor): Weight tensor for group normalization
        group_norm_bias (torch.Tensor): Bias tensor for group normalization
        num_groups (int): Number of groups for group normalization

    Returns:
        torch.Tensor: Output tensor after applying transposed convolution, GELU and group norm
    """
    conv_out = F.conv_transpose2d(
        x, conv_transpose, bias=conv_transpose_bias, stride=stride
    )
    activated = F.gelu(conv_out)
    return F.group_norm(
        activated,
        num_groups=num_groups,
        weight=group_norm_weight,
        bias=group_norm_bias,
    )


class Model(nn.Module):
    """
    Model that performs a transposed convolution, applies GELU, and normalizes with GroupNorm.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, stride, groups, num_groups
    ):
        super(Model, self).__init__()
        # Build reference layers only to harvest their initialized parameters;
        # the actual computation is delegated to the functional `module_fn`.
        ref_conv = nn.ConvTranspose2d(
            in_channels, out_channels, kernel_size, stride=stride
        )
        ref_norm = nn.GroupNorm(num_groups=num_groups, num_channels=out_channels)
        self.conv_transpose_parameter = ref_conv.weight
        # Shift both biases by a small constant so they are guaranteed nonzero.
        self.conv_transpose_bias = nn.Parameter(
            ref_conv.bias + torch.ones_like(ref_conv.bias) * 0.02
        )
        self.group_norm_weight = ref_norm.weight
        self.group_norm_bias = nn.Parameter(
            ref_norm.bias + torch.ones_like(ref_norm.bias) * 0.02
        )

    def forward(self, x, stride, num_groups, fn=module_fn):
        """Run the functional pipeline with this module's parameters."""
        return fn(
            x,
            stride,
            self.conv_transpose_parameter,
            self.conv_transpose_bias,
            self.group_norm_weight,
            self.group_norm_bias,
            num_groups,
        )


# Benchmark configuration shared by get_inputs / get_init_inputs.
batch_size = 128
in_channels = 32
out_channels = 64
height, width = 32, 32
kernel_size = 4
stride = 2
groups = 8
num_groups = 8


def get_inputs():
    """Forward-call arguments: an input batch plus runtime hyperparameters."""
    x = torch.randn(batch_size, in_channels, height, width)
    return [x, stride, num_groups]


def get_init_inputs():
    """Constructor arguments for Model."""
    return [in_channels, out_channels, kernel_size, stride, groups, num_groups]
import torch
import torch.nn as nn

class Model(nn.Module):
    """Transposed convolution followed by GELU and GroupNorm.

    Module-based counterpart of the functional implementation: the layers own
    their parameters and forward takes only the input tensor.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, groups, num_groups):
        super(Model, self).__init__()
        self.conv_transpose = nn.ConvTranspose2d(in_channels, out_channels, kernel_size, stride=stride)
        self.group_norm = nn.GroupNorm(num_groups=num_groups, num_channels=out_channels)
        # Nudge both biases away from zero (mirrors the functional variant).
        for layer in (self.conv_transpose, self.group_norm):
            layer.bias = nn.Parameter(layer.bias + torch.ones_like(layer.bias) * 0.02)

    def forward(self, x):
        """Return the group-normalized GELU of the transposed convolution of x."""
        return self.group_norm(torch.nn.functional.gelu(self.conv_transpose(x)))

# Benchmark configuration (module-based variant).
batch_size = 128
in_channels = 32
out_channels = 64
height, width = 32, 32
kernel_size = 4
stride = 2
groups = 8
num_groups = 8

def get_inputs():
    """Single-tensor forward input for the module-based Model."""
    return [torch.randn(batch_size, in_channels, height, width)]

def get_init_inputs():
    """Constructor arguments for Model."""
    return [in_channels, out_channels, kernel_size, stride, groups, num_groups]

Kernel Information

Related Kernels (Level 2, Task 19 • 19_ConvTranspose2d_GELU_GroupNorm)

#include <torch/extension.h>
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <pybind11/pybind11.h>
#include <cmath>

// Fused kernel: applies GELU activation and Group Normalization in one pass
// over the output of a transposed convolution. Each block owns one
// (sample, group) pair; mean/variance are computed with warp shuffle
// reductions to minimize warp divergence.
//
// NOTE(review): assumes `in`/`out` are contiguous NCHW float32 buffers and
// that C is divisible by num_groups — confirm at every call site.

__global__ void fused_gelu_group_norm_kernel(
    const float* __restrict__ in,
    float* __restrict__ out,
    int N, int C, int H, int W,
    int num_groups,
    const float* __restrict__ gn_weight,
    const float* __restrict__ gn_bias,
    float eps) {

  // Each block processes one group from one sample
  int group_id = blockIdx.x; // flattened (sample, group) index
  int n = group_id / num_groups;
  int g = group_id % num_groups;
  int channels_per_group = C / num_groups;
  int group_elems = channels_per_group * H * W;

  // Offset of this group's first element in the contiguous NCHW buffer
  int base = n * C * H * W + g * channels_per_group * H * W;

  // First pass: each thread applies GELU and accumulates sum / sum-of-squares
  float local_sum = 0.0f;
  float local_sum_sq = 0.0f;

  // Block-stride loop over the group's elements
  for (int idx = threadIdx.x; idx < group_elems; idx += blockDim.x) {
      float val = in[base + idx];
      // GELU via the tanh approximation; 0.7978845608 = sqrt(2/pi)
      float gelu_val = 0.5f * val * (1.0f + tanhf(0.7978845608f * (val + 0.044715f * val * val * val)));
      out[base + idx] = gelu_val; // stash activated value; re-read in pass 2
      local_sum += gelu_val;
      local_sum_sq += gelu_val * gelu_val;
  }

  // Intra-warp reduction of both accumulators via shuffle-down
  int lane = threadIdx.x & 31;
  int warpId = threadIdx.x >> 5;
  for (int offset = 16; offset > 0; offset /= 2) {
      local_sum += __shfl_down_sync(0xffffffff, local_sum, offset);
      local_sum_sq += __shfl_down_sync(0xffffffff, local_sum_sq, offset);
  }

  // One slot per warp for the partial sums (max 32 warps per block)
  __shared__ float shared_sum[32];
  __shared__ float shared_sum_sq[32];
  if (lane == 0) {
      shared_sum[warpId] = local_sum;
      shared_sum_sq[warpId] = local_sum_sq;
  }
  __syncthreads();

  // Final reduction across warps, done by the first warp only; threads with
  // index >= num_warps contribute zeros, so the shuffle result is unaffected.
  float group_sum = 0.0f;
  float group_sum_sq = 0.0f;
  int num_warps = (blockDim.x + 31) / 32;
  if (threadIdx.x < num_warps) {
      group_sum = shared_sum[threadIdx.x];
      group_sum_sq = shared_sum_sq[threadIdx.x];
  }
  if (threadIdx.x < 32) {
      for (int offset = 16; offset > 0; offset /= 2) {
          group_sum += __shfl_down_sync(0xffffffff, group_sum, offset);
          group_sum_sq += __shfl_down_sync(0xffffffff, group_sum_sq, offset);
      }
  }
  float mean, var;
  if (threadIdx.x == 0) {
      mean = group_sum / group_elems;
      // Biased variance (divide by N), E[x^2] - E[x]^2 form
      var = group_sum_sq / group_elems - mean * mean;
      shared_sum[0] = mean;      // reuse shared memory to broadcast
      shared_sum_sq[0] = var;
  }
  __syncthreads();
  mean = shared_sum[0];
  var = shared_sum_sq[0];

  // Second pass: normalize the stashed activations and apply the per-channel
  // affine transform (gn_weight/gn_bias are indexed by absolute channel)
  for (int idx = threadIdx.x; idx < group_elems; idx += blockDim.x) {
      float gelu_val = out[base + idx];
      // Recover the absolute channel index from the group-relative offset
      int ch_rel = idx / (H * W);
      int channel = g * channels_per_group + ch_rel;
      float w = gn_weight[channel];
      float b = gn_bias[channel];
      float norm_val = (gelu_val - mean) / sqrtf(var + eps);
      out[base + idx] = norm_val * w + b;
  }
}


// Host entry point: runs ConvTranspose2d through ATen, then launches the
// fused GELU + GroupNorm kernel on the result.
//
// Args (mirroring the Python functional signature):
//   x                     - input tensor (N, C_in, H, W), float32
//   stride                - stride of the transposed convolution
//   conv_transpose_weight - ConvTranspose2d weight tensor
//   conv_transpose_bias   - ConvTranspose2d bias tensor
//   group_norm_weight     - per-channel GroupNorm scale
//   group_norm_bias       - per-channel GroupNorm shift
//   num_groups            - number of normalization groups (must divide C_out)
//
// Returns the normalized tensor with the same shape as the conv output.

torch::Tensor forward(
    torch::Tensor x,
    int64_t stride,
    torch::Tensor conv_transpose_weight,
    torch::Tensor conv_transpose_bias,
    torch::Tensor group_norm_weight,
    torch::Tensor group_norm_bias,
    int64_t num_groups) {

    // The kernel indexes with raw NCHW arithmetic, so force contiguous layouts.
    x = x.contiguous();
    conv_transpose_weight = conv_transpose_weight.contiguous();
    conv_transpose_bias = conv_transpose_bias.contiguous();
    group_norm_weight = group_norm_weight.contiguous();
    group_norm_bias = group_norm_bias.contiguous();

    if (!x.is_cuda()) x = x.cuda();
    if (!conv_transpose_weight.is_cuda()) conv_transpose_weight = conv_transpose_weight.cuda();
    if (!conv_transpose_bias.is_cuda()) conv_transpose_bias = conv_transpose_bias.cuda();
    if (!group_norm_weight.is_cuda()) group_norm_weight = group_norm_weight.cuda();
    if (!group_norm_bias.is_cuda()) group_norm_bias = group_norm_bias.cuda();

    // The kernel reads raw float* buffers; reject other dtypes up front.
    TORCH_CHECK(x.scalar_type() == at::kFloat,
                "forward: only float32 input is supported");

    // Transposed convolution via PyTorch's optimized (cuDNN-backed) op.
    auto conv_out = at::conv_transpose2d(x, conv_transpose_weight, conv_transpose_bias, {stride});

    // Allocate output tensor for the fused activation + normalization.
    auto output = at::empty_like(conv_out);
    int N = static_cast<int>(conv_out.size(0));
    int C = static_cast<int>(conv_out.size(1));
    int H = static_cast<int>(conv_out.size(2));
    int W = static_cast<int>(conv_out.size(3));

    int groups = static_cast<int>(num_groups);
    // The kernel assumes an exact partition of channels into groups.
    TORCH_CHECK(groups > 0 && C % groups == 0,
                "forward: num_groups must evenly divide the channel count");

    int total_groups = N * groups;  // one block per (sample, group) pair
    int block = 256;                // threads per block
    float eps = 1e-5f;              // matches nn.GroupNorm's default epsilon

    const float* conv_ptr = conv_out.data_ptr<float>();
    float* out_ptr = output.data_ptr<float>();
    const float* gn_weight_ptr = group_norm_weight.data_ptr<float>();
    const float* gn_bias_ptr = group_norm_bias.data_ptr<float>();

    // Launch with no dynamic shared memory: the kernel declares statically
    // sized __shared__ arrays, so the previous dynamic allocation was unused.
    fused_gelu_group_norm_kernel<<<total_groups, block>>>(
        conv_ptr, out_ptr, N, C, H, W, groups, gn_weight_ptr, gn_bias_ptr, eps);

    // Surface launch failures without a full device sync: the profile showed
    // cudaDeviceSynchronize dominating host time, and consumers of the tensor
    // synchronize implicitly on the current stream anyway.
    cudaError_t err = cudaGetLastError();
    TORCH_CHECK(err == cudaSuccess,
                "fused_gelu_group_norm_kernel launch failed: ", cudaGetErrorString(err));
    return output;
}

// Expose the fused forward pass to Python through the torch extension binding.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Fused ConvTranspose2d with GELU and GroupNorm (CUDA)");
}
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 1.366 inst/cycle 0.000 5
Executed Ipc Elapsed 1.318 inst/cycle 0.000 5
Issue Slots Busy 34.150 % 0.004 5
Issued Ipc Active 1.366 inst/cycle 0.000 5
SM Busy 34.150 % 0.004 5
Memory Throughput 2217372032610.408 byte/second 146927895373659979776.000 5
Mem Busy 35.744 % 0.039 5
Max Bandwidth 66.156 % 0.131 5
L1/TEX Hit Rate 33.088 % 0.000 5
L2 Hit Rate 49.828 % 0.000 5
Mem Pipes Busy 13.190 % 0.006 5
Warp Cycles Per Issued Instruction 43.876 cycle 0.004 5
Warp Cycles Per Executed Instruction 43.882 cycle 0.005 5
Avg. Active Threads Per Warp 31.890 0.000 5
Avg. Not Predicated Off Threads Per Warp 29.760 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 8.000 block 0.000 5
Block Limit Shared Mem 23.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 93.684 % 0.002 5
Achieved Active Warps Per SM 59.956 warp 0.001 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (27.2%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
INF Occupancy This kernel's theoretical occupancy is not impacted by any block limit.
Operation / Metric Value Unit
aten::conv_transpose2d
CPU Time 506401.53 μs
Device Time 2307105.75 μs
Self CPU Time 13198.40 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::convolution
CPU Time 493203.14 μs
Device Time 2307105.75 μs
Self CPU Time 17150.05 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_convolution
CPU Time 476053.09 μs
Device Time 2307105.75 μs
Self CPU Time 34023.74 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::cudnn_convolution_transpose
CPU Time 365437.57 μs
Device Time 1442073.63 μs
Self CPU Time 194168.19 μs
Self Device Time 1442073.63 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceSynchronize
CPU Time 4133033.75 μs
Device Time 37031.43 μs
Self CPU Time 4133033.75 μs
Self Device Time 37031.43 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
fused_gelu_group_norm_kernel(float const*, float*, int, int, int, int, int, float const*, float const*, float)
CPU Time 0.00 μs
Device Time 1705858.87 μs
Self CPU Time 0.00 μs
Self Device Time 1705858.87 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45298 warnings generated when compiling for host.
Suppressed 45327 warnings (45280 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:15:5 bugprone-easily-swappable-parameters
15 | int N, int C, int H, int W,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:15:9: note: the first parameter in the range is 'N'
15 | int N, int C, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:15:16: note: the last parameter in the range is 'C'
15 | int N, int C, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:15:26: warning: 2 adjacent parameters of 'fused_gelu_group_norm_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
15 | int N, int C, int H, int W,
| ^~~~~~
16 | int num_groups,
| ~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:15:30: note: the first parameter in the range is 'W'
15 | int N, int C, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:16:9: note: the last parameter in the range is 'num_groups'
16 | int num_groups,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:17:5: warning: 2 adjacent parameters of 'fused_gelu_group_norm_kernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
17 | const float* __restrict__ gn_weight,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
18 | const float* __restrict__ gn_bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:17:31: note: the first parameter in the range is 'gn_weight'
17 | const float* __restrict__ gn_weight,
| ^~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:18:31: note: the last parameter in the range is 'gn_bias'
18 | const float* __restrict__ gn_bias,
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:22:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
22 | int group_id = blockIdx.x; // group index across all samples
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:36:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
36 | for (int idx = threadIdx.x; idx < group_elems; idx += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:36:57: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
36 | for (int idx = threadIdx.x; idx < group_elems; idx += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:46:14: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
46 | int lane = threadIdx.x & 31;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:47:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
47 | int warpId = threadIdx.x >> 5;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:65:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
65 | int num_warps = (blockDim.x + 31) / 32;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:78:26: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
78 | mean = group_sum / group_elems;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:79:28: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
79 | var = group_sum_sq / group_elems - mean * mean;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:88:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
88 | for (int idx = threadIdx.x; idx < group_elems; idx += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:88:57: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
88 | for (int idx = threadIdx.x; idx < group_elems; idx += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:130:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
130 | int N = conv_out.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:131:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
131 | int C = conv_out.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:132:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
132 | int H = conv_out.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:133:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
133 | int W = conv_out.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b1_s3_opt_convtrans_gelu_gn/base/base.cu:135:18: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
135 | int groups = num_groups;
| ^