57_conv_transposed_2D__square_input__square_kernel
• block_tuned_conv_transpose2d_base_base
import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
stride: int,
padding: int,
output_padding: int,
groups: int,
) -> torch.Tensor:
"""
Performs a transposed 2D convolution with square input and square kernel.
Args:
x (torch.Tensor): Input tensor.
weight (torch.Tensor): Weight tensor.
bias (torch.Tensor): Bias tensor.
stride (int): Stride for the convolution.
padding (int): Padding for the convolution.
output_padding (int): Additional size added to one side of the output shape.
groups (int): Number of groups for the convolution.
Returns:
torch.Tensor: Output tensor after convolution.
"""
return F.conv_transpose2d(
x,
weight,
bias,
stride=stride,
padding=padding,
output_padding=output_padding,
groups=groups,
)
class Model(nn.Module):
"""
Performs a transposed 2D convolution with square input and square kernel.
Args:
in_channels (int): Number of channels in the input tensor.
out_channels (int): Number of channels produced by the convolution.
kernel_size (int): Size of the square convolution kernel.
stride (int): Stride of the convolution.
padding (int): Padding applied to the input.
output_padding (int): Additional size added to one side of the output shape.
groups (int): Number of blocked connections from input channels to output channels.
bias (bool): If `True`, adds a learnable bias to the output.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int,
padding: int,
output_padding: int,
groups: int,
bias: bool,
):
super(Model, self).__init__()
conv = nn.ConvTranspose2d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
output_padding=output_padding,
groups=groups,
bias=bias,
)
# Copy the initialized parameters
self.weight = nn.Parameter(conv.weight.clone())
self.bias = nn.Parameter(conv.bias.clone()) if bias else None
self.stride = stride
self.padding = padding
self.groups = groups
self.output_padding = output_padding
def forward(
self,
x: torch.Tensor,
fn=module_fn,
) -> torch.Tensor:
"""
Performs the transposed 2D convolution.
Args:
x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
Returns:
torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
"""
return fn(
x,
self.weight,
self.bias,
self.stride,
self.padding,
self.output_padding,
self.groups,
)
# Constants
batch_size = 16
in_channels = 32
out_channels = 64
kernel_size = 3
width = 128
height = 128
stride = 1
padding = 0
output_padding = 0
groups = 1
bias = False
def get_inputs():
x = torch.randn(batch_size, in_channels, height, width)
return [x]
def get_init_inputs():
return [
in_channels,
out_channels,
kernel_size,
stride,
padding,
output_padding,
groups,
bias,
]
import torch
import torch.nn as nn
class Model(nn.Module):
"""
Performs a transposed 2D convolution with square input and square kernel.
Args:
in_channels (int): Number of channels in the input tensor.
out_channels (int): Number of channels produced by the convolution.
kernel_size (int): Size of the square convolution kernel.
stride (int, optional): Stride of the convolution. Defaults to 1.
padding (int, optional): Padding applied to the input. Defaults to 0.
output_padding (int, optional): Additional size added to one side of the output shape. Defaults to 0.
groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int = 1,
padding: int = 0,
output_padding: int = 0,
groups: int = 1,
bias: bool = False,
):
super(Model, self).__init__()
self.conv_transpose2d = nn.ConvTranspose2d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
output_padding=output_padding,
groups=groups,
bias=bias,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Performs the transposed 2D convolution.
Args:
x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
Returns:
torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
"""
return self.conv_transpose2d(x)
# Test code
batch_size = 16
in_channels = 32
out_channels = 64
kernel_size = 3
width = 128
height = 128
stride = 1
padding = 0
output_padding = 0
groups = 1
bias = False
def get_inputs():
x = torch.randn(batch_size, in_channels, height, width)
return [x]
def get_init_inputs():
return [
in_channels,
out_channels,
kernel_size,
stride,
padding,
output_padding,
groups,
bias,
]
#include <torch/extension.h>
// Optimized kernel with tuned block size for H100 GPU
__global__ void add_bias_kernel_block_tuned(
float* output,
const float* bias,
int total,
int C_out,
int H_out,
int W_out) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= total) return;
// Coalesced memory access pattern
int hw = H_out * W_out;
int oc = (index / hw) % C_out;
output[index] += bias[oc];
}
// Forward function definition
torch::Tensor conv_transpose2d_forward(
torch::Tensor x,
torch::Tensor weight,
torch::optional<torch::Tensor> bias,
int64_t stride,
int64_t padding,
int64_t output_padding,
int64_t groups) {
// Ensure inputs are on CUDA and contiguous
TORCH_CHECK(x.is_cuda(), "Input tensor must be on CUDA");
TORCH_CHECK(weight.is_cuda(), "Weight tensor must be on CUDA");
TORCH_CHECK(x.is_contiguous(), "Input tensor must be contiguous");
TORCH_CHECK(weight.is_contiguous(), "Weight tensor must be contiguous");
if (bias.has_value()) {
TORCH_CHECK(bias.value().is_cuda(), "Bias tensor must be on CUDA");
TORCH_CHECK(bias.value().is_contiguous(), "Bias tensor must be contiguous");
}
// Use the built-in conv_transpose2d function for the main computation
auto output = at::conv_transpose2d(
x,
weight,
bias,
{stride, stride}, // stride
{padding, padding}, // padding
{output_padding, output_padding}, // output_padding
groups
);
// If bias is provided, add it using the optimized kernel
if (bias.has_value()) {
int N = x.size(0);
int C_out = weight.size(1);
int H_out = output.size(2);
int W_out = output.size(3);
int total_output = N * C_out * H_out * W_out;
// Optimized block size of 128 threads
const int block_size = 128;
int grid_size = (total_output + block_size - 1) / block_size;
add_bias_kernel_block_tuned<<<grid_size, block_size>>>(
output.data_ptr<float>(),
bias.value().data_ptr<float>(),
total_output, C_out, H_out, W_out
);
cudaDeviceSynchronize();
}
return output;
}
// Pybind11 module definition
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &conv_transpose2d_forward, "ConvTranspose2d forward (CUDA) - block tuned");
}
Metric | Value | Unit | Variance | Samples |
---|
Rule | Description |
---|
Operation / Metric | Value | Unit |
---|---|---|
aten::conv_transpose2d | ||
CPU Time | 883121.41 | μs |
Device Time | 578750.54 | μs |
Self CPU Time | 5582.31 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::convolution | ||
CPU Time | 877539.10 | μs |
Device Time | 578750.54 | μs |
Self CPU Time | 7370.45 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_convolution | ||
CPU Time | 870168.66 | μs |
Device Time | 578750.54 | μs |
Self CPU Time | 8705.85 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::cudnn_convolution_transpose | ||
CPU Time | 861462.81 | μs |
Device Time | 578750.54 | μs |
Self CPU Time | 135788.75 | μs |
Self Device Time | 578750.54 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 482477.22 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 482477.22 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 47199.45 | μs |
Device Time | 305035.08 | μs |
Self CPU Time | 7429.15 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45288 warnings generated when compiling for host. Suppressed 45327 warnings (45280 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.