Kernel Details - fused_lockfree_groupnorm_base

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    conv_weight: torch.Tensor,
    conv_bias: torch.Tensor,
    bias: torch.Tensor,
    scale: torch.Tensor,
    group_norm_weight: torch.Tensor,
    group_norm_bias: torch.Tensor,
    num_groups: int,
) -> torch.Tensor:
    """
    Run a 2D convolution, add a bias, multiply by a scale, apply a sigmoid
    and finish with group normalization.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width)
        conv_weight (torch.Tensor): Weight passed to ``F.conv2d``
        conv_bias (torch.Tensor): Bias passed to ``F.conv2d``
        bias (torch.Tensor): Tensor added to the convolution output
        scale (torch.Tensor): Tensor the shifted output is multiplied by
        group_norm_weight (torch.Tensor): Weight passed to ``F.group_norm``
        group_norm_bias (torch.Tensor): Bias passed to ``F.group_norm``
        num_groups (int): Number of groups passed to ``F.group_norm``

    Returns:
        torch.Tensor: The group-normalized sigmoid activations
    """
    conv_out = F.conv2d(x, conv_weight, bias=conv_bias)
    # Shift first, then scale: the order matters and mirrors the fused kernel.
    activated = torch.sigmoid((conv_out + bias) * scale)
    return F.group_norm(
        activated,
        num_groups,
        weight=group_norm_weight,
        bias=group_norm_bias,
    )


class Model(nn.Module):
    """
    Parameter container for the conv -> add bias -> scale -> sigmoid ->
    group norm pipeline; the computation itself is delegated to ``fn``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        num_groups,
        bias_shape,
        scale_shape,
    ):
        super().__init__()

        # The Conv2d layer is only used for its initialised parameters.
        conv = nn.Conv2d(in_channels, out_channels, kernel_size)
        self.conv_weight = conv.weight
        # Offset the conv bias by 0.02 (intended to keep it nonzero).
        conv_bias_offset = torch.ones_like(conv.bias) * 0.02
        self.conv_bias = nn.Parameter(conv.bias + conv_bias_offset)

        self.bias = nn.Parameter(torch.randn(bias_shape) * 0.02)
        self.scale = nn.Parameter(torch.randn(scale_shape) * 0.02)

        # Likewise, GroupNorm is only used for its initialised parameters.
        group_norm = nn.GroupNorm(num_groups, out_channels)
        self.group_norm_weight = group_norm.weight
        # Offset the group-norm bias by 0.02 (intended to keep it nonzero).
        gn_bias_offset = torch.ones_like(group_norm.bias) * 0.02
        self.group_norm_bias = nn.Parameter(group_norm.bias + gn_bias_offset)

    def forward(self, x, num_groups, fn=module_fn):
        """Call ``fn`` with ``x``, the stored parameters and ``num_groups``."""
        stored_params = (
            self.conv_weight,
            self.conv_bias,
            self.bias,
            self.scale,
            self.group_norm_weight,
            self.group_norm_bias,
        )
        # Positional call: ``fn`` may be any implementation with this argument order.
        return fn(x, *stored_params, num_groups)


# Benchmark problem size used by get_inputs() / get_init_inputs().
batch_size = 128
in_channels = 3
out_channels = 16
height = width = 32
kernel_size = 3
num_groups = 8
# bias and scale both hold one value per output channel, shaped (C, 1, 1).
bias_shape = scale_shape = (out_channels, 1, 1)


def get_inputs():
    """Return ``[x, num_groups]``: a random input batch and the group count."""
    x = torch.randn(batch_size, in_channels, height, width)
    return [x, num_groups]


def get_init_inputs():
    """Return the constructor arguments, in the order of ``Model.__init__``."""
    init_args = [
        in_channels,
        out_channels,
        kernel_size,
        num_groups,
        bias_shape,
        scale_shape,
    ]
    return init_args

import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Convolution followed by a bias add, a scale multiply, a sigmoid and
    group normalization.
    """
    def __init__(self, in_channels, out_channels, kernel_size, num_groups, bias_shape, scale_shape):
        super().__init__()

        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
        # Offset the conv bias by 0.02 and re-register it as a parameter.
        conv_bias_offset = torch.ones_like(self.conv.bias) * 0.02
        self.conv.bias = nn.Parameter(self.conv.bias + conv_bias_offset)

        self.bias = nn.Parameter(torch.randn(bias_shape) * 0.02)
        self.scale = nn.Parameter(torch.randn(scale_shape) * 0.02)

        self.group_norm = nn.GroupNorm(num_groups, out_channels)
        # Same 0.02 offset for the group-norm bias.
        gn_bias_offset = torch.ones_like(self.group_norm.bias) * 0.02
        self.group_norm.bias = nn.Parameter(self.group_norm.bias + gn_bias_offset)

    def forward(self, x):
        """Apply conv, add ``bias``, multiply by ``scale``, sigmoid, group norm."""
        shifted = self.conv(x) + self.bias
        activated = torch.sigmoid(shifted * self.scale)
        return self.group_norm(activated)

# Benchmark problem size used by get_inputs() / get_init_inputs().
batch_size = 128
in_channels = 3
out_channels = 16
height = width = 32
kernel_size = 3
num_groups = 8
# bias and scale both hold one value per output channel, shaped (C, 1, 1).
bias_shape = scale_shape = (out_channels, 1, 1)

def get_inputs():
    """Return a one-element list holding a random input batch."""
    x = torch.randn(batch_size, in_channels, height, width)
    return [x]

def get_init_inputs():
    """Return the constructor arguments, in the order of ``Model.__init__``."""
    init_args = [
        in_channels,
        out_channels,
        kernel_size,
        num_groups,
        bias_shape,
        scale_shape,
    ]
    return init_args

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	21_Conv2d_Add_Scale_Sigmoid_GroupNorm
Level ID	2
Task ID	21
Kernel Name	fused_lockfree_groupnorm_base_base
CUDA Speedup (Native)	1.704x
CUDA Speedup (Compile)	1.459x
CUDA Runtime	0.044 ms
PyTorch Runtime (Native)	0.075 ms
PyTorch Runtime (Compile)	0.064 ms
Correct	True
Max Diff (vs. Reference)	0.007000
Model	bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0
Temperature	1.00

View Experiment Progress Details

Related Kernels (Level 2, Task 21 • 21_Conv2d_Add_Scale_Sigmoid_GroupNorm)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	shared_memory_coalesced_access_kernel_base	0.04	1.79	1.53
🥈	fused_elem_groupnorm_reduced_sync_base_base	0.04	1.74	1.49
🥈	fused_optimized_warp_base	0.04	1.74	1.49
🥈	optimized_memory_access_kernel_base	0.04	1.74	1.49
🥈	atomic_optimized_kernel_base_base	0.04	1.74	1.49
🥈	optimized_memory_access_kernel_base	0.04	1.74	1.49
🥈	optimized_fused_kernel_base	0.04	1.74	1.49
8	fused_warp_reduce_groupnorm_base	0.04	1.70	1.46
8	fused_warp_groupnorm_base	0.04	1.70	1.46
8	shared_memory_reuse_kernel_base	0.04	1.70	1.46
8	fused_lockfree_groupnorm_base_base	0.04	1.70	1.46
8	fused_stride_kernel_base	0.04	1.70	1.46
8	fused_elem_groupnorm_no_atomic_base	0.04	1.70	1.46
8	fused_elem_groupnorm_min_sync_base_base	0.04	1.70	1.46
8	unrolled_fused_kernel_base_base	0.04	1.70	1.46
16	optimized_modular_kernel_base	0.04	1.67	1.43
16	fused_strided_groupnorm_base_base	0.04	1.67	1.43
18	fused_sigmoid_groupnorm_base	0.05	1.63	1.40
19	fused_sigmoid_groupnorm_base	0.05	1.60	1.37
19	block_size_optimized_kernel_base_base	0.05	1.60	1.37

#include <torch/extension.h>
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

// Tensor validation helpers. The argument is parenthesised so that an
// expression argument expands with the intended precedence.
#define CHECK_CUDA(x) TORCH_CHECK((x).is_cuda(), #x " must be a CUDA tensor.")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK((x).is_contiguous(), #x " must be contiguous.")
// Wrapped in do/while(0) so both checks form ONE statement; otherwise
// `if (cond) CHECK_INPUT(t);` would only guard the first check.
#define CHECK_INPUT(x) do { CHECK_CUDA(x); CHECK_CONTIGUOUS(x); } while (0)
// Number of lanes combined by the shuffle-based warp reduction.
#define WARP_SIZE 32
// Threads per block; the kernel's strided loops and shared-memory layout
// are written in terms of this value.
#define BLOCK_SIZE 256

// Adds up `val` over the lanes of a warp using shuffle-down exchanges with
// offsets WARP_SIZE/2, ..., 1. Lane 0 ends up holding the full warp total;
// the values returned on other lanes are partial sums only.
__device__ __forceinline__ float warp_reduce_sum(float val) {
    constexpr unsigned int kFullWarpMask = 0xffffffff;
    #pragma unroll
    for (int offset = WARP_SIZE >> 1; offset >= 1; offset >>= 1) {
        const float from_upper_lane = __shfl_down_sync(kFullWarpMask, val, offset);
        val += from_upper_lane;
    }
    return val;
}

// Fused (x + bias) * scale -> sigmoid -> group normalisation.
//
// Each thread block processes one (sample, group) pair, derived from
// blockIdx.x, in two passes over that group's elements:
//   1. apply bias/scale/sigmoid, store the activations to y4 and accumulate
//      the sum and sum of squares;
//   2. after a block-wide reduction, re-read the activations from y4 and
//      write the normalised, affine-transformed result back.
//
// Requirements visible from the indexing below:
//   * blockDim.x must equal BLOCK_SIZE (loop stride and shared-memory layout);
//   * dynamic shared memory must hold 2 * BLOCK_SIZE / WARP_SIZE floats;
//   * channels_per_group * H * W must be a multiple of 4, because the data
//     is accessed as float4 and any remainder would be skipped.
// N is not read by the kernel; it is kept for interface stability.
__global__ void fused_lockfree_kernel(
    const float4* __restrict__ x4,   // input tensor as float4
    float4* __restrict__ y4,         // output tensor as float4
    const float* __restrict__ bias,
    const float* __restrict__ scale,
    const float* __restrict__ gn_weight,
    const float* __restrict__ gn_bias,
    const int N, const int C, const int H, const int W,
    const int num_groups,
    const bool bias_broadcast,
    const bool scale_broadcast,
    const float eps) {

    // Dynamic shared memory: per-warp sums first, per-warp sums of squares after.
    extern __shared__ float shmem[];
    float* warp_sums = shmem;
    float* warp_sq_sums = &shmem[BLOCK_SIZE/WARP_SIZE];

    const int tid = threadIdx.x;
    const int wid = tid / WARP_SIZE;
    const int lane = tid % WARP_SIZE;
    const int group_idx = blockIdx.x % num_groups;
    const int sample_idx = blockIdx.x / num_groups;
    const int channels_per_group = C / num_groups;
    const int first_channel = group_idx * channels_per_group;

    const int hw = H * W;  // hoisted: used for every per-element channel lookup
    const int group_size = channels_per_group * hw;
    const int group_size4 = group_size / 4;
    // Base offsets are computed in 64-bit so that large N*C*H*W cannot
    // overflow a 32-bit int before the division by 4.
    const long long sample_offset = static_cast<long long>(sample_idx) * C * hw / 4;
    const long long group_offset = static_cast<long long>(first_channel) * hw / 4;
    const long long base4 = sample_offset + group_offset;

    float local_sum = 0.0f;
    float local_sq_sum = 0.0f;

    // Pass 1: activation + per-thread partial statistics.
    #pragma unroll 4
    for (int i = tid; i < group_size4; i += BLOCK_SIZE) {
        const long long idx4 = base4 + i;
        float4 in4 = x4[idx4];

        float vals[4] = {in4.x, in4.y, in4.z, in4.w};
        float4 out4;
        float* out_vals = reinterpret_cast<float*>(&out4);

        #pragma unroll
        for (int j = 0; j < 4; j++) {
            // A float4 may straddle a channel boundary, so the channel is
            // resolved per scalar element.
            const int c = ((i * 4 + j) / hw) + first_channel;
            const float b = bias_broadcast ? bias[0] : bias[c];
            const float s = scale_broadcast ? scale[0] : scale[c];

            float val = (vals[j] + b) * s;
            val = 1.0f / (1.0f + expf(-val));
            out_vals[j] = val;

            local_sum += val;
            local_sq_sum += val * val;
        }

        y4[idx4] = out4;
    }

    // Block reduction, stage 1: reduce within each warp.
    local_sum = warp_reduce_sum(local_sum);
    local_sq_sum = warp_reduce_sum(local_sq_sum);

    if (lane == 0) {
        warp_sums[wid] = local_sum;
        warp_sq_sums[wid] = local_sq_sum;
    }
    __syncthreads();

    // Stage 2: the first warp reduces the per-warp partials.
    if (wid == 0) {
        if (lane < BLOCK_SIZE/WARP_SIZE) {
            local_sum = warp_sums[lane];
            local_sq_sum = warp_sq_sums[lane];
        } else {
            local_sum = 0.0f;
            local_sq_sum = 0.0f;
        }

        local_sum = warp_reduce_sum(local_sum);
        local_sq_sum = warp_reduce_sum(local_sq_sum);

        if (lane == 0) {
            warp_sums[0] = local_sum;
            warp_sq_sums[0] = local_sq_sum;
        }
    }
    __syncthreads();

    const float mean = warp_sums[0] / group_size;
    // E[x^2] - mean^2 can round to a small negative number; clamp it so that
    // rsqrtf never sees a negative argument (which would produce NaN).
    const float variance = fmaxf((warp_sq_sums[0] / group_size) - (mean * mean), 0.0f);
    const float inv_std = rsqrtf(variance + eps);

    // Pass 2: normalise. The loop bounds and stride match pass 1, so every
    // thread re-reads exactly the y4 elements it wrote itself.
    #pragma unroll 4
    for (int i = tid; i < group_size4; i += BLOCK_SIZE) {
        const long long idx4 = base4 + i;
        float4 val4 = y4[idx4];
        float* vals = reinterpret_cast<float*>(&val4);

        #pragma unroll
        for (int j = 0; j < 4; j++) {
            const int c = ((i * 4 + j) / hw) + first_channel;
            const float gamma = gn_weight[c];
            const float beta = gn_bias[c];

            vals[j] = gamma * ((vals[j] - mean) * inv_std) + beta;
        }

        y4[idx4] = val4;
    }
}

// Host-side launcher for fused_lockfree_kernel.
//
// x   : 4-D float tensor (N, C, H, W), read by the kernel as float4.
// y   : output tensor with the same number of elements as x.
// bias, scale        : read at index 0 when the matching *_broadcast flag is
//                      set, otherwise indexed by channel.
// gn_weight, gn_bias : indexed by channel.
// One block of BLOCK_SIZE threads is launched per (sample, group) pair.
//
// Raises (via TORCH_CHECK) when the shapes cannot be handled by the kernel
// or when the launch itself fails.
// NOTE(review): float4 access also needs 16-byte-aligned base pointers;
// confirm this for tensors that carry a storage offset.
void fused_lockfree_cuda(
    at::Tensor x,
    at::Tensor bias,
    at::Tensor scale,
    at::Tensor y,
    at::Tensor gn_weight,
    at::Tensor gn_bias,
    int64_t num_groups,
    bool bias_broadcast,
    bool scale_broadcast,
    float eps) {

    TORCH_CHECK(x.dim() == 4, "x must be a 4-D (N, C, H, W) tensor.");

    const int N = x.size(0);
    const int C = x.size(1);
    const int H = x.size(2);
    const int W = x.size(3);

    TORCH_CHECK(num_groups > 0 && C % num_groups == 0,
                "Number of channels (", C, ") must be divisible by num_groups (",
                num_groups, ").");
    TORCH_CHECK(y.numel() == x.numel(), "y must have as many elements as x.");

    // The kernel indexes these tensors up to channel C-1 (or index 0 when
    // broadcasting); reject anything that would be read out of bounds.
    TORCH_CHECK(bias.numel() >= (bias_broadcast ? 1 : C),
                "bias has too few elements for ", C, " channels.");
    TORCH_CHECK(scale.numel() >= (scale_broadcast ? 1 : C),
                "scale has too few elements for ", C, " channels.");
    TORCH_CHECK(gn_weight.numel() >= C && gn_bias.numel() >= C,
                "gn_weight and gn_bias need at least ", C, " elements.");

    // Nothing to compute; also avoids launching a grid with zero blocks.
    if (x.numel() == 0) {
        return;
    }

    // The kernel walks each group as float4 and would silently skip a
    // remainder, so the per-group element count must be a multiple of 4.
    const int64_t group_size = static_cast<int64_t>(C / num_groups) * H * W;
    TORCH_CHECK(group_size % 4 == 0,
                "channels_per_group * H * W (", group_size,
                ") must be a multiple of 4 for the float4 kernel.");

    const int total_blocks = static_cast<int>(N * num_groups);
    // Two arrays (sums, sums of squares) with one float per warp.
    const size_t shared_mem_size = (2 * BLOCK_SIZE/WARP_SIZE) * sizeof(float);

    fused_lockfree_kernel<<<total_blocks, BLOCK_SIZE, shared_mem_size>>>(
        reinterpret_cast<float4*>(x.data_ptr<float>()),
        reinterpret_cast<float4*>(y.data_ptr<float>()),
        bias.data_ptr<float>(),
        scale.data_ptr<float>(),
        gn_weight.data_ptr<float>(),
        gn_bias.data_ptr<float>(),
        N, C, H, W,
        num_groups,
        bias_broadcast,
        scale_broadcast,
        eps);

    // Surface launch-configuration errors here instead of at a later,
    // unrelated CUDA call.
    const cudaError_t launch_status = cudaGetLastError();
    TORCH_CHECK(launch_status == cudaSuccess,
                "fused_lockfree_kernel launch failed: ",
                cudaGetErrorString(launch_status));
}

// Extension entry point: runs at::conv2d, then the fused
// bias-add / scale / sigmoid / group-norm kernel on the convolution output.
//
// All tensors must be contiguous CUDA tensors; conv_bias is only validated
// when it is defined. bias and scale are treated as broadcast scalars when
// they hold exactly one element, otherwise they are indexed per channel.
// Returns a new tensor shaped like the convolution output.
at::Tensor module_fn_forward(
    at::Tensor x,
    at::Tensor conv_weight,
    at::Tensor conv_bias,
    at::Tensor bias,
    at::Tensor scale,
    at::Tensor gn_weight,
    at::Tensor gn_bias,
    int64_t num_groups) {

    CHECK_INPUT(x);
    CHECK_INPUT(conv_weight);
    // Braces are required: CHECK_INPUT may expand to more than one statement,
    // and every check must stay behind the defined() guard.
    if (conv_bias.defined()) {
        CHECK_INPUT(conv_bias);
    }
    CHECK_INPUT(bias);
    CHECK_INPUT(scale);
    CHECK_INPUT(gn_weight);
    CHECK_INPUT(gn_bias);

    // The kernel indexes its input as densely packed NCHW, so force that
    // layout on the convolution result (a no-op when already contiguous).
    x = at::conv2d(x, conv_weight, conv_bias).contiguous();
    at::Tensor y = at::empty_like(x);

    const bool bias_broadcast = (bias.numel() == 1);
    const bool scale_broadcast = (scale.numel() == 1);
    const float eps = 1e-5f;

    fused_lockfree_cuda(x, bias, scale, y, gn_weight, gn_bias,
                       num_groups, bias_broadcast, scale_broadcast, eps);

    return y;
}

// Python bindings: exposes module_fn_forward as `forward` on the extension module.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &module_fn_forward, "Fused lock-free group norm kernel (CUDA)");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples
Executed Ipc Active	2.432	inst/cycle	0.000	5
Executed Ipc Elapsed	1.932	inst/cycle	0.000	5
Issue Slots Busy	61.080	%	0.167	5
Issued Ipc Active	2.444	inst/cycle	0.000	5
SM Busy	61.080	%	0.167	5
Memory Throughput	542417185755.870	byte/second	388453711015542080.000	5
Mem Busy	24.510	%	0.001	5
Max Bandwidth	29.560	%	0.002	5
L1/TEX Hit Rate	61.660	%	0.000	5
L2 Hit Rate	68.748	%	0.132	5
Mem Pipes Busy	16.052	%	0.001	5
Warp Cycles Per Issued Instruction	14.374	cycle	0.000	5
Warp Cycles Per Executed Instruction	14.446	cycle	0.000	5
Avg. Active Threads Per Warp	30.580		0.000	5
Avg. Not Predicated Off Threads Per Warp	26.970		0.000	5
Max Active Clusters	0.000	cluster	0.000	5
Max Cluster Size	8.000	block	0.000	5
Overall GPU Occupancy	0.000	%	0.000	5
Cluster Occupancy	0.000	%	0.000	5
Block Limit SM	32.000	block	0.000	5
Block Limit Registers	6.000	block	0.000	5
Block Limit Shared Mem	14.000	block	0.000	5
Block Limit Warps	8.000	block	0.000	5
Theoretical Active Warps per SM	48.000	warp	0.000	5
Theoretical Occupancy	75.000	%	0.000	5
Achieved Occupancy	55.276	%	0.006	5
Achieved Active Warps Per SM	35.376	warp	0.003	5

Analysis Rules

Rule	Description
INF HighPipeUtilization	ALU is the highest-utilized pipeline (55.2%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
WRN Occupancy	This kernel's theoretical occupancy (75.0%) is limited by the number of required registers. The difference between calculated theoretical (75.0%) and measured achieved occupancy (55.3%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.

Rule

Description

INF HighPipeUtilization

ALU is the highest-utilized pipeline (55.2%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.

WRN Occupancy

This kernel's theoretical occupancy (75.0%) is limited by the number of required registers. The difference between calculated theoretical (75.0%) and measured achieved occupancy (55.3%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.

Operation / Metric	Value	Unit
aten::fill_
CPU Time	143917.26	μs
Device Time	1111776.14	μs
Self CPU Time	32388.36	μs
Self Device Time	1111776.14	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::zero_
CPU Time	170009.17	μs
Device Time	1111776.14	μs
Self CPU Time	26117.85	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::conv2d
CPU Time	1221769.31	μs
Device Time	394456.71	μs
Self CPU Time	24810.51	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::convolution
CPU Time	1196958.80	μs
Device Time	394456.71	μs
Self CPU Time	30406.04	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_convolution
CPU Time	1166552.76	μs
Device Time	394456.71	μs
Self CPU Time	59242.60	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::cudnn_convolution
CPU Time	969790.39	μs
Device Time	282467.05	μs
Self CPU Time	221586.07	μs
Self Device Time	282467.05	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaLaunchKernel
CPU Time	945472.00	μs
Device Time	29379.77	μs
Self CPU Time	945472.00	μs
Self Device Time	29379.77	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char, 1>)
CPU Time	0.00	μs
Device Time	1111776.14	μs
Self CPU Time	0.00	μs
Self Device Time	1111776.14	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45308 warnings generated when compiling for host.
Suppressed 45328 warnings (45281 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b5_s1_fused_lockfree_groupnorm_base/base/base.cu:7:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]

7 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor.")

| ^

| ()

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b5_s1_fused_lockfree_groupnorm_base/base/base.cu:8:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]

8 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous.")

| ^

| ()

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b5_s1_fused_lockfree_groupnorm_base/base/base.cu:24:5: warning: 4 adjacent parameters of 'fused_lockfree_kernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]

24 | const float* __restrict__ bias,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

25 | const float* __restrict__ scale,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

26 | const float* __restrict__ gn_weight,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

27 | const float* __restrict__ gn_bias,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b5_s1_fused_lockfree_groupnorm_base/base/base.cu:24:31: note: the first parameter in the range is 'bias'

24 | const float* __restrict__ bias,

| ^~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b5_s1_fused_lockfree_groupnorm_base/base/base.cu:27:31: note: the last parameter in the range is 'gn_bias'

27 | const float* __restrict__ gn_bias,

| ^~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b5_s1_fused_lockfree_groupnorm_base/base/base.cu:28:5: warning: 2 adjacent parameters of 'fused_lockfree_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]