← Back to Leaderboard

The AI CUDA Engineer 👷

20_ConvTranspose3d_Sum_ResidualAdd_Multiply_ResidualAddcoalesced_vectorized_access_opt_base

Level 2 • Task 20
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    stride: int,
    padding: int,
    output_padding: int,
    conv_transpose: torch.Tensor,
    conv_transpose_bias: torch.Tensor,
    bias: torch.Tensor,
) -> torch.Tensor:
    """
    Applies a 3D transposed convolution followed by bias addition and residual operations.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width)
        stride (int): Stride of the transposed convolution
        padding (int): Padding of the transposed convolution
        output_padding (int): Additional size added to output shape
        conv_transpose (torch.Tensor): Transposed convolution weight tensor
        conv_transpose_bias (torch.Tensor): Bias tensor for transposed convolution
        bias (torch.Tensor): Bias tensor for addition

    Returns:
        torch.Tensor: Output tensor after applying operations
    """
    x = F.conv_transpose3d(
        x,
        conv_transpose,
        bias=conv_transpose_bias,
        stride=stride,
        padding=padding,
        output_padding=output_padding,
    )
    original_x = x.clone().detach()
    x = x + bias
    x = x + original_x
    x = x * original_x
    x = x + original_x
    return x


class Model(nn.Module):
    """
    Model that performs a 3D transposed convolution, followed by a sum,
    a residual add, a multiplication, and another residual add.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        output_padding,
        bias_shape,
    ):
        super(Model, self).__init__()
        conv_transpose = nn.ConvTranspose3d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
        )
        self.conv_transpose_parameter = conv_transpose.weight
        self.conv_transpose_bias = nn.Parameter(
            conv_transpose.bias + torch.ones_like(conv_transpose.bias) * 0.02
        )  # make sure its nonzero
        self.bias_parameter = nn.Parameter(torch.randn(bias_shape) * 0.02)

    def forward(self, x, stride, padding, output_padding, fn=module_fn):
        return fn(
            x,
            stride,
            padding,
            output_padding,
            self.conv_transpose_parameter,
            self.conv_transpose_bias,
            self.bias_parameter,
        )


batch_size = 16
in_channels = 32
out_channels = 64
depth, height, width = 16, 32, 32
kernel_size = 3
stride = 2
padding = 1
output_padding = 1
bias_shape = (out_channels, 1, 1, 1)


def get_inputs():
    return [
        torch.randn(batch_size, in_channels, depth, height, width),
        stride,
        padding,
        output_padding,
    ]


def get_init_inputs():
    return [
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        output_padding,
        bias_shape,
    ]
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Model that performs a 3D transposed convolution, followed by a sum, 
    a residual add, a multiplication, and another residual add.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, output_padding, bias_shape):
        super(Model, self).__init__()
        self.conv_transpose = nn.ConvTranspose3d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, output_padding=output_padding)
        self.conv_transpose.bias = nn.Parameter(self.conv_transpose.bias + torch.ones_like(self.conv_transpose.bias) * 0.02)
        self.bias = nn.Parameter(torch.randn(bias_shape)*0.02)

    def forward(self, x):
        x = self.conv_transpose(x)
        original_x = x.clone().detach()
        x = x + self.bias
        x = x + original_x
        x = x * original_x
        x = x + original_x
        return x

batch_size = 16
in_channels = 32
out_channels = 64
depth, height, width = 16, 32, 32
kernel_size = 3
stride = 2
padding = 1
output_padding = 1
bias_shape = (out_channels, 1, 1, 1)

def get_inputs():
    return [torch.randn(batch_size, in_channels, depth, height, width)]

def get_init_inputs():
    return [in_channels, out_channels, kernel_size, stride, padding, output_padding, bias_shape]

Kernel Information

Related Kernels (Level 2, Task 20 • 20_ConvTranspose3d_Sum_ResidualAdd_Multiply_ResidualAdd)

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define VECTOR_SIZE 4

// Store bias in constant memory (up to 16384 channels)
__constant__ float c_bias[16384];

// Kernel: Each block processes one (batch, channel) pair.
// We vectorize accesses along the spatial dimension by loading/storing float4's.
__global__ void fused_operations_kernel(
    const float* __restrict__ conv_output,
    float* __restrict__ output,
    int num_vec,       // number of vectorized elements per channel (spatial_size / 4)
    int spatial_size,  // original spatial size for the channel
    int N,
    int C
) {
    // Determine (batch, channel) pair from blockIdx.x
    int nc = blockIdx.x;
    int n = nc / C;
    int c = nc % C;
    
    // Load channel bias from constant memory
    float bias_val = c_bias[c];
    
    // base index for this (n, c) slice
    int base_idx = n * C * spatial_size + c * spatial_size;

    // Each thread processes multiple vectorized chunks
    for (int i = threadIdx.x; i < num_vec; i += blockDim.x) {
        int vec_idx = base_idx + i * VECTOR_SIZE;
        // Load 4 contiguous floats (aligned load) which ensures coalescing
        float4 orig = *reinterpret_cast<const float4*>(&conv_output[vec_idx]);
        float4 res;
        // Compute element-wise operation: original * (2*original + bias + 1)
        res.x = orig.x * (2.0f * orig.x + bias_val + 1.0f);
        res.y = orig.y * (2.0f * orig.y + bias_val + 1.0f);
        res.z = orig.z * (2.0f * orig.z + bias_val + 1.0f);
        res.w = orig.w * (2.0f * orig.w + bias_val + 1.0f);
        
        // Write back result with aligned store
        *reinterpret_cast<float4*>(&output[vec_idx]) = res;
    }
}


torch::Tensor forward(
    torch::Tensor x,
    int stride,
    int padding,
    int output_padding,
    torch::Tensor conv_transpose,
    torch::Tensor conv_transpose_bias,
    torch::Tensor bias
) {
    // Perform the 3D transposed convolution using PyTorch's optimized kernel
    auto conv_result = torch::conv_transpose3d(
        x,
        conv_transpose,
        conv_transpose_bias,
        stride,
        padding,
        output_padding
    );

    auto sizes = conv_result.sizes();
    const int N = sizes[0];
    const int C = sizes[1];
    const int spatial_size = sizes[2] * sizes[3] * sizes[4];

    // Ensure that spatial_size is a multiple of 4 for vectorized (float4) operations
    TORCH_CHECK((spatial_size % VECTOR_SIZE) == 0, "Spatial size must be a multiple of 4 for vectorization");
    int num_vec = spatial_size / VECTOR_SIZE;

    // Copy bias to constant memory
    cudaMemcpyToSymbol(c_bias, bias.data_ptr<float>(), C * sizeof(float));
    
    auto output = torch::empty_like(conv_result);

    // Launch one block per (batch, channel) pair; threads within block process vectorized elements
    const int blocks = N * C;
    const int threads_per_block = 256;
    fused_operations_kernel<<<blocks, threads_per_block>>>(
        conv_result.data_ptr<float>(),
        output.data_ptr<float>(),
        num_vec,
        spatial_size,
        N,
        C
    );

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Fused ConvTranspose3D with Coalesced Vectorized Global Memory Access");
}
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 0.300 inst/cycle 0.000 5
Executed Ipc Elapsed 0.276 inst/cycle 0.000 5
Issue Slots Busy 7.436 % 0.001 5
Issued Ipc Active 0.300 inst/cycle 0.000 5
SM Busy 7.436 % 0.001 5
Memory Throughput 2855876310969.562 byte/second 92224376896522911744.000 5
Mem Busy 45.422 % 0.025 5
Max Bandwidth 85.196 % 0.082 5
L1/TEX Hit Rate 0.000 % 0.000 5
L2 Hit Rate 50.854 % 0.005 5
Mem Pipes Busy 5.434 % 0.000 5
Warp Cycles Per Issued Instruction 203.420 cycle 0.367 5
Warp Cycles Per Executed Instruction 203.530 cycle 0.367 5
Avg. Active Threads Per Warp 32.000 0.000 5
Avg. Not Predicated Off Threads Per Warp 31.930 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 10.000 block 0.000 5
Block Limit Shared Mem 32.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 94.426 % 0.004 5
Achieved Active Warps Per SM 60.432 warp 0.002 5
Analysis Rules
Rule Description
WRN HighPipeUtilization All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
INF Occupancy This kernel's theoretical occupancy is not impacted by any block limit.
Operation / Metric Value Unit
aten::conv_transpose3d
CPU Time 4665975.70 μs
Device Time 4660114.84 μs
Self CPU Time 7209.68 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::convolution
CPU Time 4658766.02 μs
Device Time 4660114.84 μs
Self CPU Time 9924.70 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_convolution
CPU Time 4648841.32 μs
Device Time 4660114.84 μs
Self CPU Time 23988.95 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::cudnn_convolution_transpose
CPU Time 4572370.06 μs
Device Time 2834250.62 μs
Self CPU Time 145809.51 μs
Self Device Time 2834250.62 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 1880386.39 μs
Device Time 64474.45 μs
Self CPU Time 1880386.39 μs
Self Device Time 64474.45 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::add_
CPU Time 45231.56 μs
Device Time 1825864.22 μs
Self CPU Time 18069.04 μs
Self Device Time 1825864.22 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45291 warnings generated when compiling for host.
Suppressed 45327 warnings (45280 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:15:5 bugprone-easily-swappable-parameters
15 | int num_vec, // number of vectorized elements per channel (spatial_size / 4)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16 | int spatial_size, // original spatial size for the channel
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
17 | int N,
| ~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:15:9: note: the first parameter in the range is 'num_vec'
15 | int num_vec, // number of vectorized elements per channel (spatial_size / 4)
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:17:9: note: the last parameter in the range is 'N'
17 | int N,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:21:14: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | int nc = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:32:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
32 | for (int i = threadIdx.x; i < num_vec; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:32:49: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
32 | for (int i = threadIdx.x; i < num_vec; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:50:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
50 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:54:19: warning: the parameter 'conv_transpose' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
54 | torch::Tensor conv_transpose,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:55:5: warning: 2 adjacent parameters of 'forward' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
55 | torch::Tensor conv_transpose_bias,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
56 | torch::Tensor bias
| ~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:55:19: note: the first parameter in the range is 'conv_transpose_bias'
55 | torch::Tensor conv_transpose_bias,
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:56:19: note: the last parameter in the range is 'bias'
56 | torch::Tensor bias
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:56:19: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
56 | torch::Tensor bias
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:69:19: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
69 | const int N = sizes[0];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:70:19: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
70 | const int C = sizes[1];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_20/b5_s1_coalesced_vectorized_access_opt/base/base.cu:71:30: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
71 | const int spatial_size = sizes[2] * sizes[3] * sizes[4];
| ^