← Back to Leaderboard

The AI CUDA Engineer 👷

57_conv_transposed_2D__square_input__square_kernel • mapped_3d_bias_conv_transpose2d_base

Level 1 • Task 57
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    stride: int,
    padding: int,
    output_padding: int,
    groups: int,
) -> torch.Tensor:
    """
    Apply a 2D transposed convolution by delegating to ``F.conv_transpose2d``.

    The same scalar ``stride``, ``padding`` and ``output_padding`` values are
    forwarded unchanged to ``F.conv_transpose2d``.

    Args:
        x (torch.Tensor): Input tensor.
        weight (torch.Tensor): Weight tensor.
        bias (torch.Tensor): Bias tensor; forwarded as-is (``Model`` passes
            ``None`` here when it was built without a bias).
        stride (int): Stride for the convolution.
        padding (int): Padding for the convolution.
        output_padding (int): Additional size added to one side of the output shape.
        groups (int): Number of groups for the convolution.

    Returns:
        torch.Tensor: Output tensor after convolution.
    """
    # Collect the keyword options once so the call site stays short.
    conv_options = {
        "stride": stride,
        "padding": padding,
        "output_padding": output_padding,
        "groups": groups,
    }
    return F.conv_transpose2d(x, weight, bias, **conv_options)


class Model(nn.Module):
    """
    Transposed 2D convolution (square input, square kernel) in functional form.

    The module owns the weight (and optional bias) as parameters and hands
    them, together with the stored hyper-parameters, to a pluggable function
    in ``forward``.

    Args:
        in_channels (int): Number of channels in the input tensor.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int): Size of the square convolution kernel.
        stride (int): Stride of the convolution.
        padding (int): Padding applied to the input.
        output_padding (int): Additional size added to one side of the output shape.
        groups (int): Number of blocked connections from input channels to output channels.
        bias (bool): If `True`, adds a learnable bias to the output.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        padding: int,
        output_padding: int,
        groups: int,
        bias: bool,
    ):
        super().__init__()

        # A throwaway ConvTranspose2d is built only to obtain parameters that
        # were initialized by nn.ConvTranspose2d; the layer itself is not kept.
        reference_layer = nn.ConvTranspose2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            bias=bias,
        )

        # Copy the initialized parameters (weight first, then bias).
        self.weight = nn.Parameter(reference_layer.weight.clone())
        if bias:
            self.bias = nn.Parameter(reference_layer.bias.clone())
        else:
            self.bias = None

        # Hyper-parameters forwarded to the functional implementation.
        self.stride = stride
        self.padding = padding
        self.groups = groups
        self.output_padding = output_padding

    def forward(
        self,
        x: torch.Tensor,
        fn=module_fn,
    ) -> torch.Tensor:
        """
        Performs the transposed 2D convolution.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
            fn: Callable invoked as ``fn(x, weight, bias, stride, padding,
                output_padding, groups)``; defaults to ``module_fn``.

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
        """
        call_args = (
            x,
            self.weight,
            self.bias,
            self.stride,
            self.padding,
            self.output_padding,
            self.groups,
        )
        return fn(*call_args)


# Constants
# Benchmark configuration read by get_inputs() and get_init_inputs() below.
# batch_size, in_channels, height and width give the shape of the random input;
# the remaining values are the constructor arguments returned for Model.
batch_size = 16
in_channels = 32
out_channels = 64
kernel_size = 3
width = 128
height = 128
stride = 1
padding = 0
output_padding = 0
groups = 1
bias = False  # no bias parameter is created for this configuration


def get_inputs():
    """Return a one-element list holding a random input tensor.

    The tensor is drawn with ``torch.randn`` and has shape
    ``(batch_size, in_channels, height, width)`` taken from the module-level
    constants.
    """
    input_shape = (batch_size, in_channels, height, width)
    return [torch.randn(*input_shape)]


def get_init_inputs():
    """Return the positional constructor arguments for ``Model`` as a list.

    The order matches ``Model.__init__``: in_channels, out_channels,
    kernel_size, stride, padding, output_padding, groups, bias.
    """
    init_args = (
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        output_padding,
        groups,
        bias,
    )
    return list(init_args)
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    Thin wrapper around ``nn.ConvTranspose2d`` for a square input and square kernel.

    Args:
        in_channels (int): Number of channels in the input tensor.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int): Size of the square convolution kernel.
        stride (int, optional): Stride of the convolution. Defaults to 1.
        padding (int, optional): Padding applied to the input. Defaults to 0.
        output_padding (int, optional): Additional size added to one side of the output shape. Defaults to 0.
        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        output_padding: int = 0,
        groups: int = 1,
        bias: bool = False,
    ):
        super().__init__()

        # Optional settings are gathered first, then passed through unchanged.
        layer_options = dict(
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            bias=bias,
        )
        self.conv_transpose2d = nn.ConvTranspose2d(
            in_channels, out_channels, kernel_size, **layer_options
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Run the wrapped transposed 2D convolution on ``x``.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
        """
        result = self.conv_transpose2d(x)
        return result


# Test code
# Configuration read by get_inputs() and get_init_inputs() below:
# batch_size, in_channels, height and width shape the random input tensor,
# and the remaining values are returned as Model's constructor arguments.
batch_size = 16
in_channels = 32
out_channels = 64
kernel_size = 3
width = 128
height = 128
stride = 1
padding = 0
output_padding = 0
groups = 1
bias = False  # the wrapped ConvTranspose2d is built without a bias term


def get_inputs():
    """Build the forward-pass inputs: a single ``torch.randn`` tensor in a list.

    The tensor shape is ``(batch_size, in_channels, height, width)``, read from
    the module-level configuration.
    """
    dims = [batch_size, in_channels, height, width]
    sample = torch.randn(*dims)
    return [sample]


def get_init_inputs():
    """Return Model's constructor arguments, in positional order, as a list.

    Order: in_channels, out_channels, kernel_size, stride, padding,
    output_padding, groups, bias.
    """
    constructor_args = []
    constructor_args.extend((in_channels, out_channels, kernel_size))
    constructor_args.extend((stride, padding, output_padding))
    constructor_args.extend((groups, bias))
    return constructor_args

Kernel Information

Related Kernels (Level 1, Task 57 • 57_conv_transposed_2D__square_input__square_kernel)

#include <torch/extension.h>

// Adds a per-channel bias to an output tensor in place.
//
// Thread mapping: blockIdx.z enumerates (batch, channel) pairs as n * C_out + c,
// while the x and y block/thread indices cover the spatial dimensions
// (W_out and H_out respectively). The element address is computed assuming a
// dense NCHW layout, so the caller must pass a contiguous output buffer.
//
// Parameters:
//   output - pointer to the output tensor (read and written)
//   bias   - pointer to the bias tensor, indexed by output channel
//   N      - batch size
//   C_out  - number of output channels
//   H_out  - output height
//   W_out  - output width
__global__ void add_bias_kernel_3d(
    float* output,
    const float* bias,
    int N,
    int C_out,
    int H_out,
    int W_out) {
  // Decode the batch and channel indices from the grid's z-dimension.
  const int plane = static_cast<int>(blockIdx.z);
  const int n = plane / C_out;
  const int c = plane % C_out;

  // Spatial indices from 2D block/thread indexing.
  const int w = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  const int h = static_cast<int>(blockIdx.y * blockDim.y + threadIdx.y);

  // Bounds check: spatial tiles may overhang the tensor, and the batch guard
  // protects against a grid whose z-extent exceeds N * C_out.
  if (n < N && h < H_out && w < W_out) {
    // Compute the linear NCHW index in 64-bit arithmetic; the product
    // N * C_out * H_out * W_out can exceed the range of a 32-bit int.
    const long long offset =
        ((static_cast<long long>(n) * C_out + c) * H_out + h) * W_out + w;
    output[offset] += bias[c];
  }
}

// Forward function definition.
//
// Computes a 2D transposed convolution with the same scalar stride, padding
// and output_padding applied to both spatial dimensions. The convolution
// itself is delegated to at::conv_transpose2d; when a bias is supplied it is
// added afterwards by add_bias_kernel_3d.
//
// Arguments:
//   x              - input tensor (must be CUDA and contiguous)
//   weight         - weight tensor (must be CUDA and contiguous)
//   bias           - optional bias tensor (must be CUDA and contiguous if given)
//   stride, padding, output_padding, groups - convolution hyper-parameters
//
// Returns the output tensor. Raises (via TORCH_CHECK) on invalid inputs or on
// a failed kernel launch.
torch::Tensor conv_transpose2d_forward(
    torch::Tensor x,
    torch::Tensor weight,
    torch::optional<torch::Tensor> bias,
    int64_t stride,
    int64_t padding,
    int64_t output_padding,
    int64_t groups) {

  // Ensure inputs are on CUDA and contiguous
  TORCH_CHECK(x.is_cuda(), "Input tensor must be on CUDA");
  TORCH_CHECK(weight.is_cuda(), "Weight tensor must be on CUDA");
  TORCH_CHECK(x.is_contiguous(), "Input tensor must be contiguous");
  TORCH_CHECK(weight.is_contiguous(), "Weight tensor must be contiguous");

  if (bias.has_value()) {
    TORCH_CHECK(bias.value().is_cuda(), "Bias tensor must be on CUDA");
    TORCH_CHECK(bias.value().is_contiguous(), "Bias tensor must be contiguous");
  }

  // Run the convolution WITHOUT the bias. The bias is added exactly once by
  // the custom kernel below; passing it here as well would apply it twice.
  const torch::optional<torch::Tensor> no_bias;
  auto output = at::conv_transpose2d(
      x,
      weight,
      no_bias,
      {stride, stride},                 // stride
      {padding, padding},               // padding
      {output_padding, output_padding}, // output_padding
      groups
  );

  // If bias is provided, add it using the 3D-mapped kernel. An empty output
  // is skipped because a grid with a zero-sized dimension cannot be launched.
  if (bias.has_value() && output.numel() > 0) {
    // The kernel addresses the buffer as dense NCHW.
    TORCH_CHECK(output.is_contiguous(), "Output tensor must be contiguous");

    // Take every dimension from the output itself. weight.size(1) is only
    // C_out / groups, so it must not be used as the channel count.
    const int64_t batch = output.size(0);
    const int64_t channels = output.size(1);

    // The convolution no longer sees the bias, so validate its length here
    // before the kernel indexes it by channel.
    TORCH_CHECK(bias.value().numel() == channels,
                "Bias must have one element per output channel (expected ",
                channels, ", got ", bias.value().numel(), ")");

    // gridDim.z is limited to 65535 blocks on CUDA devices.
    TORCH_CHECK(batch * channels <= 65535,
                "N * C_out = ", batch * channels,
                " exceeds the maximum grid z-dimension (65535)");

    const int N = static_cast<int>(batch);
    const int C_out = static_cast<int>(channels);
    const int H_out = static_cast<int>(output.size(2));
    const int W_out = static_cast<int>(output.size(3));

    // Define 2D block size for spatial dimensions
    dim3 block(16, 16);
    // Grid dimensions: x dimension for W, y dimension for H, and z dimension for (N * C_out)
    dim3 grid(
        (W_out + block.x - 1) / block.x,
        (H_out + block.y - 1) / block.y,
        N * C_out
    );

    // NOTE(review): this launch uses the default stream, while the convolution
    // above runs on PyTorch's current stream - confirm they are the same in
    // every calling context.
    add_bias_kernel_3d<<<grid, block>>>(
        output.data_ptr<float>(),
        bias.value().data_ptr<float>(),
        N,
        C_out,
        H_out,
        W_out
    );

    // Surface launch-configuration errors instead of silently dropping them.
    cudaError_t status = cudaGetLastError();
    TORCH_CHECK(status == cudaSuccess,
                "add_bias_kernel_3d launch failed: ", cudaGetErrorString(status));

    // Wait for the kernel and report any execution error.
    status = cudaDeviceSynchronize();
    TORCH_CHECK(status == cudaSuccess,
                "add_bias_kernel_3d execution failed: ", cudaGetErrorString(status));
  }

  return output;
}

// Pybind11 module definition.
// Registers conv_transpose2d_forward under the Python-visible name "forward";
// the module name is taken from the TORCH_EXTENSION_NAME macro.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &conv_transpose2d_forward, "ConvTranspose2d forward (CUDA) - 3D mapped bias addition");
}
Performance Metrics
Metric Value Unit Variance Samples
Analysis Rules
Rule Description
Operation / Metric Value Unit
aten::conv_transpose2d
CPU Time 1632298.77 μs
Device Time 1139256.18 μs
Self CPU Time 11286.11 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::convolution
CPU Time 1621012.66 μs
Device Time 1139256.18 μs
Self CPU Time 14787.90 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_convolution
CPU Time 1606224.76 μs
Device Time 1139256.18 μs
Self CPU Time 18490.07 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::cudnn_convolution_transpose
CPU Time 1587734.69 μs
Device Time 1139256.18 μs
Self CPU Time 198937.10 μs
Self Device Time 1139256.18 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 946440.79 μs
Device Time 0.00 μs
Self CPU Time 946440.79 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 598709.61 μs
Self CPU Time 0.00 μs
Self Device Time 598709.61 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45290 warnings generated when compiling for host.
Suppressed 45327 warnings (45280 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:8:5 bugprone-easily-swappable-parameters
8 | int N, // batch size
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 | int C_out, // number of output channels
| ~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:8:9: note: the first parameter in the range is 'N'
8 | int N, // batch size
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:9:9: note: the last parameter in the range is 'C_out'
9 | int C_out, // number of output channels
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:13:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
13 | int idx = blockIdx.z;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:18:11: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
18 | int w = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:19:11: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
19 | int h = blockIdx.y * blockDim.y + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:31:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
31 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:32:19: warning: the parameter 'weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
32 | torch::Tensor weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:63:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
63 | int N = x.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:64:17: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
64 | int C_out = weight.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:65:17: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
65 | int H_out = output.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_57/b10_s2_mapped_3d_bias_conv_transpose2d/base/base.cu:66:17: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
66 | int W_out = output.size(3);
| ^