57_conv_transposed_2D__square_input__square_kernel
• mapped_3d_bias_conv_transpose2d_base
import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    stride: int,
    padding: int,
    output_padding: int,
    groups: int,
) -> torch.Tensor:
    """
    Apply a 2D transposed convolution by delegating to ``F.conv_transpose2d``.

    The same integer ``stride``, ``padding`` and ``output_padding`` values are
    handed to the functional call unchanged.

    Args:
        x (torch.Tensor): Input tensor.
        weight (torch.Tensor): Weight tensor of the transposed convolution.
        bias (torch.Tensor): Bias tensor, forwarded as-is to the functional call.
        stride (int): Stride for the convolution.
        padding (int): Padding for the convolution.
        output_padding (int): Additional size added to one side of the output shape.
        groups (int): Number of groups for the convolution.

    Returns:
        torch.Tensor: Output tensor after convolution.
    """
    # Collect the keyword options once so the call site stays short.
    conv_options = {
        "stride": stride,
        "padding": padding,
        "output_padding": output_padding,
        "groups": groups,
    }
    result = F.conv_transpose2d(x, weight, bias, **conv_options)
    return result
class Model(nn.Module):
    """
    Performs a transposed 2D convolution with square input and square kernel.

    The weight (and optional bias) are initialized by a temporary
    ``nn.ConvTranspose2d`` and copied onto this module as its own parameters,
    so that ``forward`` can pass them to a functional implementation.

    Args:
        in_channels (int): Number of channels in the input tensor.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int): Size of the square convolution kernel.
        stride (int): Stride of the convolution.
        padding (int): Padding applied to the input.
        output_padding (int): Additional size added to one side of the output shape.
        groups (int): Number of blocked connections from input channels to output channels.
        bias (bool): If `True`, adds a learnable bias to the output.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        padding: int,
        output_padding: int,
        groups: int,
        bias: bool,
    ):
        # Zero-argument super() does not look up the global name ``Model``, so
        # construction keeps working even if that name is later rebound to a
        # different class.
        super().__init__()
        conv = nn.ConvTranspose2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            bias=bias,
        )
        # Copy the initialized parameters
        self.weight = nn.Parameter(conv.weight.clone())
        # ``self.bias`` is None when the layer is built without a bias term.
        self.bias = nn.Parameter(conv.bias.clone()) if bias else None
        self.stride = stride
        self.padding = padding
        self.groups = groups
        self.output_padding = output_padding

    def forward(
        self,
        x: torch.Tensor,
        fn=module_fn,
    ) -> torch.Tensor:
        """
        Performs the transposed 2D convolution.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
            fn: Callable invoked as ``fn(x, weight, bias, stride, padding,
                output_padding, groups)``; defaults to ``module_fn``.

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
        """
        return fn(
            x,
            self.weight,
            self.bias,
            self.stride,
            self.padding,
            self.output_padding,
            self.groups,
        )
# Benchmark configuration (module-level names read by get_inputs/get_init_inputs)
batch_size = 16
in_channels, out_channels = 32, 64
kernel_size = 3
height = width = 128
stride = 1
padding = 0
output_padding = 0
groups = 1
bias = False


def get_inputs():
    """Return the forward-pass inputs: a single random (batch, channels, height, width) tensor."""
    return [torch.randn(batch_size, in_channels, height, width)]


def get_init_inputs():
    """Return the module-level configuration values as a list, in constructor-argument order."""
    init_args = (
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        output_padding,
        groups,
        bias,
    )
    return list(init_args)
import torch
import torch.nn as nn
class Model(nn.Module):
    """
    Wraps a single ``nn.ConvTranspose2d`` layer (square input, square kernel).

    Args:
        in_channels (int): Number of channels in the input tensor.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int): Size of the square convolution kernel.
        stride (int, optional): Stride of the convolution. Defaults to 1.
        padding (int, optional): Padding applied to the input. Defaults to 0.
        output_padding (int, optional): Additional size added to one side of the output shape. Defaults to 0.
        groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        output_padding: int = 0,
        groups: int = 1,
        bias: bool = False,
    ):
        super().__init__()
        # Gather the optional settings so the layer construction reads as one line.
        layer_options = dict(
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            bias=bias,
        )
        self.conv_transpose2d = nn.ConvTranspose2d(
            in_channels, out_channels, kernel_size, **layer_options
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Run the wrapped transposed convolution on ``x``.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
        """
        upsampled = self.conv_transpose2d(x)
        return upsampled
# Test configuration
batch_size = 16
in_channels = 32
out_channels = 64
kernel_size = 3
width, height = 128, 128
stride, padding, output_padding = 1, 0, 0
groups = 1
bias = False


def get_inputs():
    """Build the list of forward-pass inputs: one random tensor sized from the test configuration."""
    sample_shape = (batch_size, in_channels, height, width)
    return [torch.randn(*sample_shape)]


def get_init_inputs():
    """List the constructor arguments for ``Model`` in positional order."""
    return [
        in_channels, out_channels, kernel_size,
        stride, padding, output_padding,
        groups, bias,
    ]
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <limits>
// Adds a per-channel bias, in place, to a float tensor addressed as NCHW.
//
// Thread mapping: blockIdx.z enumerates (batch, channel) planes as
// plane = n * C_out + c, while the x/y block and thread indices cover the
// spatial dimensions (W_out and H_out). Each thread updates at most one element.
//
// Parameters:
//   output - tensor data, updated in place; indexed as [n][c][h][w]
//   bias   - one value per output channel (read at index c)
//   N, C_out, H_out, W_out - tensor dimensions
__global__ void add_bias_kernel_3d(
    float* output,       // pointer to the output tensor
    const float* bias,   // pointer to the bias tensor
    int N,               // batch size
    int C_out,           // number of output channels
    int H_out,           // output height
    int W_out) {         // output width
    // blockIdx.z is already the flattened (n, c) plane index, so only the
    // channel needs to be recovered from it (for the bias lookup).
    const int plane = blockIdx.z;
    if (plane >= N * C_out) {
        return;  // defensive: ignore planes beyond the tensor if the grid is oversized
    }
    const int c = plane % C_out;

    // Spatial coordinates from 2D block indexing
    const int w = blockIdx.x * blockDim.x + threadIdx.x;
    const int h = blockIdx.y * blockDim.y + threadIdx.y;

    // Edge blocks may extend past the tensor; only in-range threads write.
    if (h < H_out && w < W_out) {
        // Compute the linear offset in 64-bit arithmetic so it cannot wrap
        // for tensors with more than 2^31 - 1 elements.
        const long long offset =
            (static_cast<long long>(plane) * H_out + h) * W_out + w;
        output[offset] += bias[c];
    }
}
// Forward function definition
//
// Computes a 2D transposed convolution with at::conv_transpose2d and, when a
// bias is supplied, adds it with add_bias_kernel_3d.
//
// Parameters:
//   x              - input tensor; must be a contiguous CUDA tensor
//   weight         - weight tensor; must be a contiguous CUDA tensor
//   bias           - optional bias; when present must be a contiguous CUDA
//                    float32 tensor with one element per output channel
//   stride, padding, output_padding - applied identically to both spatial dims
//   groups         - number of convolution groups
//
// Returns the convolution output (with the bias added exactly once).
// Raises c10::Error (via TORCH_CHECK) when a precondition is violated.
torch::Tensor conv_transpose2d_forward(
    torch::Tensor x,
    torch::Tensor weight,
    torch::optional<torch::Tensor> bias,
    int64_t stride,
    int64_t padding,
    int64_t output_padding,
    int64_t groups) {
    // Ensure inputs are on CUDA and contiguous
    TORCH_CHECK(x.is_cuda(), "Input tensor must be on CUDA");
    TORCH_CHECK(weight.is_cuda(), "Weight tensor must be on CUDA");
    TORCH_CHECK(x.is_contiguous(), "Input tensor must be contiguous");
    TORCH_CHECK(weight.is_contiguous(), "Weight tensor must be contiguous");
    if (bias.has_value()) {
        TORCH_CHECK(bias.value().is_cuda(), "Bias tensor must be on CUDA");
        TORCH_CHECK(bias.value().is_contiguous(), "Bias tensor must be contiguous");
    }

    // Run the main computation WITHOUT the bias: the bias is added below by
    // the custom kernel, and passing it here as well would apply it twice.
    auto output = at::conv_transpose2d(
        x,
        weight,
        torch::optional<torch::Tensor>(),   // no bias at this stage
        {stride, stride},                   // stride
        {padding, padding},                 // padding
        {output_padding, output_padding},   // output_padding
        groups
    );

    if (!bias.has_value()) {
        return output;
    }
    const torch::Tensor& bias_t = bias.value();

    // The kernel reads and writes raw float pointers.
    TORCH_CHECK(
        output.scalar_type() == torch::kFloat && bias_t.scalar_type() == torch::kFloat,
        "Bias kernel only supports float32 tensors");

    // The kernel indexes the buffer as dense NCHW.
    output = output.contiguous();

    // Take every dimension from the output itself. weight.size(1) is only
    // out_channels / groups, so it is not the channel count when groups > 1.
    const int64_t N = output.size(0);
    const int64_t C_out = output.size(1);
    const int64_t H_out = output.size(2);
    const int64_t W_out = output.size(3);
    TORCH_CHECK(
        bias_t.numel() == C_out,
        "Bias must have one element per output channel");

    // A zero-sized grid is an invalid launch configuration; nothing to add anyway.
    if (output.numel() == 0) {
        return output;
    }

    // The kernel receives its dimensions as 32-bit ints; reject tensors whose
    // element count does not fit so none of the narrowing casts below can wrap.
    TORCH_CHECK(
        output.numel() <= static_cast<int64_t>(std::numeric_limits<int>::max()),
        "Output tensor is too large for the bias kernel");

    // Define 2D block size for spatial dimensions
    const dim3 block(16, 16);
    const int64_t grid_x = (W_out + block.x - 1) / block.x;
    const int64_t grid_y = (H_out + block.y - 1) / block.y;
    const int64_t grid_z = N * C_out;
    // CUDA limits the y and z grid dimensions to 65535 blocks each.
    TORCH_CHECK(
        grid_y <= 65535 && grid_z <= 65535,
        "Bias kernel grid exceeds CUDA limits (output height / 16 and N * C_out must each be <= 65535)");

    // Grid dimensions: x dimension for W, y dimension for H, and z dimension for (N * C_out)
    const dim3 grid(
        static_cast<unsigned int>(grid_x),
        static_cast<unsigned int>(grid_y),
        static_cast<unsigned int>(grid_z));

    // Launch on PyTorch's current stream so the kernel is ordered after the
    // convolution above and before later work queued on the same stream; this
    // makes a device-wide synchronization unnecessary.
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    add_bias_kernel_3d<<<grid, block, 0, stream>>>(
        output.data_ptr<float>(),
        bias_t.data_ptr<float>(),
        static_cast<int>(N),
        static_cast<int>(C_out),
        static_cast<int>(H_out),
        static_cast<int>(W_out)
    );
    // Surface launch-configuration errors instead of silently returning.
    C10_CUDA_KERNEL_LAUNCH_CHECK();

    return output;
}
// Python bindings: expose conv_transpose2d_forward to Python under the name "forward".
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "forward",
        &conv_transpose2d_forward,
        "ConvTranspose2d forward (CUDA) - 3D mapped bias addition");
}
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Rule | Description |
---|---|
Operation / Metric | Value | Unit |
---|---|---|
aten::conv_transpose2d | ||
CPU Time | 1632298.77 | μs |
Device Time | 1139256.18 | μs |
Self CPU Time | 11286.11 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::convolution | ||
CPU Time | 1621012.66 | μs |
Device Time | 1139256.18 | μs |
Self CPU Time | 14787.90 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_convolution | ||
CPU Time | 1606224.76 | μs |
Device Time | 1139256.18 | μs |
Self CPU Time | 18490.07 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::cudnn_convolution_transpose | ||
CPU Time | 1587734.69 | μs |
Device Time | 1139256.18 | μs |
Self CPU Time | 198937.10 | μs |
Self Device Time | 1139256.18 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 946440.79 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 946440.79 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 598709.61 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 598709.61 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45290 warnings generated when compiling for host. Suppressed 45327 warnings (45280 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.