65_conv_transposed_2D__square_input__asymmetric_kernel
• gather_fused_unroll_base
import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
stride: int,
padding: int,
output_padding: int,
groups: int,
) -> torch.Tensor:
"""
Performs a transposed 2D convolution with a square input and an asymmetric kernel.
Args:
x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
weight (torch.Tensor): Weight tensor of shape (out_channels, in_channels // groups, kernel_height, kernel_width).
bias (torch.Tensor): Bias tensor of shape (out_channels).
stride (int): Stride of the convolution.
padding (int): Padding applied to the input.
output_padding (int): Additional size added to one side of the output shape.
groups (int): Number of blocked connections from input channels to output channels.
Returns:
torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
"""
return F.conv_transpose2d(
x,
weight,
bias,
stride=stride,
padding=padding,
output_padding=output_padding,
groups=groups,
)
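# For reference (dilation = 1), the output spatial size of the call above is
#   height_out = (height - 1) * stride - 2 * padding + kernel_height + output_padding
#   width_out  = (width  - 1) * stride - 2 * padding + kernel_width  + output_padding
# e.g. the test configuration below (128x128 input, (3, 5) kernel, stride 1,
# padding 0, output_padding 0) produces a 130x132 output.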
class Model(nn.Module):
"""
Performs a transposed 2D convolution with a square input and an asymmetric kernel.
Args:
in_channels (int): Number of channels in the input tensor.
out_channels (int): Number of channels produced by the convolution.
kernel_size (tuple): Size of the convolution kernel (height, width).
stride (int): Stride of the convolution.
padding (int or tuple): Padding applied to the input.
output_padding (int or tuple): Additional size added to one side of the output shape.
groups (int): Number of blocked connections from input channels to output channels.
bias (bool): If `True`, adds a learnable bias to the output.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: tuple,
stride: int,
padding: int,
output_padding: int,
groups: int,
bias: bool,
):
super(Model, self).__init__()
conv = nn.ConvTranspose2d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
output_padding=output_padding,
groups=groups,
bias=bias,
)
# Copy the initialized parameters
self.weight = nn.Parameter(conv.weight.clone())
self.bias = nn.Parameter(conv.bias.clone()) if bias else None
self.stride = stride
self.padding = padding
self.groups = groups
self.output_padding = output_padding
def forward(
self,
x: torch.Tensor,
fn=module_fn,
) -> torch.Tensor:
"""
Performs the transposed 2D convolution.
Args:
x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
Returns:
torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
"""
return fn(
x,
self.weight,
self.bias,
self.stride,
self.padding,
self.output_padding,
self.groups,
)
# Constants for default arguments
stride = 1
padding = 0
output_padding = 0
groups = 1
bias = False
# Test code
batch_size = 16
in_channels = 32
out_channels = 64
kernel_size = (3, 5) # Asymmetric kernel
width = 128
height = 128
def get_inputs():
x = torch.randn(batch_size, in_channels, height, width)
return [x]
def get_init_inputs():
return [
in_channels,
out_channels,
kernel_size,
stride,
padding,
output_padding,
groups,
bias,
]
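As a quick sanity check, the copied-parameter Model above can be compared against a stock nn.ConvTranspose2d that shares its weights. This is a minimal sketch, assuming CPU execution and the constants defined above:
torch.manual_seed(0)
model = Model(in_channels, out_channels, kernel_size,
              stride, padding, output_padding, groups, bias)
conv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size,
                          stride=stride, padding=padding,
                          output_padding=output_padding, groups=groups, bias=bias)
with torch.no_grad():
    conv.weight.copy_(model.weight)  # bias=False here, so only the weight is shared
x = get_inputs()[0]
print(model(x).shape)                     # torch.Size([16, 64, 130, 132])
print(torch.allclose(model(x), conv(x)))  # True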
import torch
import torch.nn as nn
class Model(nn.Module):
"""
Performs a transposed 2D convolution with a square input and an asymmetric kernel.
Args:
in_channels (int): Number of channels in the input tensor.
out_channels (int): Number of channels produced by the convolution.
kernel_size (tuple): Size of the convolution kernel (height, width).
stride (int, optional): Stride of the convolution. Defaults to 1.
padding (int or tuple, optional): Padding applied to the input. Defaults to 0.
output_padding (int or tuple, optional): Additional size added to one side of the output shape. Defaults to 0.
groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: tuple,
stride: int = 1,
padding: int = 0,
output_padding: int = 0,
groups: int = 1,
bias: bool = False,
):
super(Model, self).__init__()
self.conv_transpose2d = nn.ConvTranspose2d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
output_padding=output_padding,
groups=groups,
bias=bias,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Performs the transposed 2D convolution.
Args:
x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
Returns:
torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
"""
return self.conv_transpose2d(x)
# Constants for default arguments
stride = 1
padding = 0
output_padding = 0
groups = 1
bias = False
# Test code
batch_size = 16
in_channels = 32
out_channels = 64
kernel_size = (3, 5) # Asymmetric kernel
width = 128
height = 128
def get_inputs():
x = torch.randn(batch_size, in_channels, height, width)
return [x]
def get_init_inputs():
return [
in_channels,
out_channels,
kernel_size,
stride,
padding,
output_padding,
groups,
bias,
]
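The CUDA kernel below uses a gather formulation: each thread owns one output element and pulls in every input element that contributes to it, rather than scattering each input element's contributions across the output. A minimal pure-PyTorch sketch of the same indexing (assuming groups=1 and dilation=1; the helper name conv_transpose2d_gather is illustrative) makes the inversion explicit:
import torch
import torch.nn.functional as F

def conv_transpose2d_gather(x, w, stride=1, padding=0, output_padding=0):
    # x: (B, C_in, H, W); w: (C_in, C_out, kH, kW) -- groups=1, dilation=1
    B, C_in, H, W = x.shape
    _, C_out, kH, kW = w.shape
    H_out = (H - 1) * stride - 2 * padding + (kH - 1) + output_padding + 1
    W_out = (W - 1) * stride - 2 * padding + (kW - 1) + output_padding + 1
    out = torch.zeros(B, C_out, H_out, W_out, dtype=x.dtype)
    for oh in range(H_out):
        for ow in range(W_out):
            for kh in range(kH):
                # invert the scatter relation oh = h_in * stride - padding + kh
                h_off = oh + padding - kh
                if h_off < 0 or h_off % stride != 0:
                    continue
                h_in = h_off // stride
                if h_in >= H:
                    continue
                for kw in range(kW):
                    w_off = ow + padding - kw
                    if w_off < 0 or w_off % stride != 0:
                        continue
                    w_in = w_off // stride
                    if w_in >= W:
                        continue
                    # contract over input channels for all batches at once:
                    # (B, C_in) @ (C_in, C_out) -> (B, C_out)
                    out[:, :, oh, ow] += x[:, :, h_in, w_in] @ w[:, :, kh, kw]
    return out

x = torch.randn(2, 3, 6, 6)
w = torch.randn(3, 4, 3, 5)  # asymmetric (3, 5) kernel
print(torch.allclose(conv_transpose2d_gather(x, w, stride=2, padding=1),
                     F.conv_transpose2d(x, w, stride=2, padding=1), atol=1e-5))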
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
// This kernel computes each output element for transposed convolution
// and manually unrolls loops to reduce loop overhead. It assumes the kernel dimensions
// (kernel_h, kernel_w) are small so that unrolling them is beneficial, and it unrolls the inner
// loop over input channels in groups of 4 when possible.
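// Because each thread owns exactly one output element (a gather), no atomic
// additions are needed, unlike a scatter formulation that would distribute each
// input element across all the output positions it touches.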
template <typename scalar_t>
__global__ void conv_transpose2d_gather_fused_unroll_kernel(
const scalar_t* __restrict__ input,
const scalar_t* __restrict__ weight,
const scalar_t* __restrict__ bias, // may be nullptr
scalar_t* __restrict__ output,
const int batch_size,
const int in_channels,
const int in_height,
const int in_width,
const int out_channels,
const int kernel_h,
const int kernel_w,
const int stride,
const int padding,
    const int output_padding, // unused in the kernel body; the output size is computed on the host
const int groups,
const int dilation,
const int out_height,
const int out_width
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int total_elements = batch_size * out_channels * out_height * out_width;
if (idx >= total_elements) return;
// Unravel the flat index into 4D indices: (b, oc, oh, ow)
int tmp = idx;
int ow = tmp % out_width;
tmp /= out_width;
int oh = tmp % out_height;
tmp /= out_height;
int oc = tmp % out_channels;
int b = tmp / out_channels;
// Determine group related indices
int out_channels_per_group = out_channels / groups;
int g = oc / out_channels_per_group;
int oc_group = oc % out_channels_per_group;
int in_channels_per_group = in_channels / groups;
int in_channel_start = g * in_channels_per_group;
// Initialize the accumulation with bias if provided
scalar_t sum = (bias != nullptr) ? bias[oc] : static_cast<scalar_t>(0);
// Pre-calculate strides
int in_image_size = in_height * in_width;
// Loop over kernel height and width with unrolling
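    // An output element (oh, ow) receives input (h_in, w_in) through tap (kh, kw)
    // when oh = h_in * stride - padding + kh * dilation; inverting, a tap
    // contributes only if (oh + padding - kh * dilation) is a non-negative
    // multiple of stride whose quotient lies inside the input (likewise for width).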
#pragma unroll
for (int kh = 0; kh < kernel_h; ++kh) {
int h_offset = oh + padding - kh * dilation;
if (h_offset < 0 || h_offset % stride != 0) continue;
int h_in = h_offset / stride;
if (h_in < 0 || h_in >= in_height) continue;
#pragma unroll
for (int kw = 0; kw < kernel_w; ++kw) {
int w_offset = ow + padding - kw * dilation;
if (w_offset < 0 || w_offset % stride != 0) continue;
int w_in = w_offset / stride;
if (w_in < 0 || w_in >= in_width) continue;
int input_base = b * (in_channels * in_image_size);
// Manual unrolling for the inner loop over input channels in groups of 4
int unroll_end = (in_channels_per_group / 4) * 4;
for (int ic = 0; ic < unroll_end; ic += 4) {
int cur_ic0 = in_channel_start + ic;
int cur_ic1 = cur_ic0 + 1;
int cur_ic2 = cur_ic0 + 2;
int cur_ic3 = cur_ic0 + 3;
int input_idx0 = input_base + cur_ic0 * in_image_size + h_in * in_width + w_in;
int input_idx1 = input_base + cur_ic1 * in_image_size + h_in * in_width + w_in;
int input_idx2 = input_base + cur_ic2 * in_image_size + h_in * in_width + w_in;
int input_idx3 = input_base + cur_ic3 * in_image_size + h_in * in_width + w_in;
// Weight indexing: [in_channels, out_channels_per_group, kernel_h, kernel_w]
int weight_idx0 = cur_ic0 * (out_channels_per_group * kernel_h * kernel_w) +
oc_group * (kernel_h * kernel_w) + kh * kernel_w + kw;
int weight_idx1 = cur_ic1 * (out_channels_per_group * kernel_h * kernel_w) +
oc_group * (kernel_h * kernel_w) + kh * kernel_w + kw;
int weight_idx2 = cur_ic2 * (out_channels_per_group * kernel_h * kernel_w) +
oc_group * (kernel_h * kernel_w) + kh * kernel_w + kw;
int weight_idx3 = cur_ic3 * (out_channels_per_group * kernel_h * kernel_w) +
oc_group * (kernel_h * kernel_w) + kh * kernel_w + kw;
sum += input[input_idx0] * weight[weight_idx0] +
input[input_idx1] * weight[weight_idx1] +
input[input_idx2] * weight[weight_idx2] +
input[input_idx3] * weight[weight_idx3];
}
// Process remaining channels if any
for (int ic = unroll_end; ic < in_channels_per_group; ++ic) {
int cur_ic = in_channel_start + ic;
int input_idx = input_base + cur_ic * in_image_size + h_in * in_width + w_in;
int weight_idx = cur_ic * (out_channels_per_group * kernel_h * kernel_w) +
oc_group * (kernel_h * kernel_w) + kh * kernel_w + kw;
sum += input[input_idx] * weight[weight_idx];
}
}
}
int output_idx = b * (out_channels * out_height * out_width) +
oc * (out_height * out_width) +
oh * out_width + ow;
output[output_idx] = sum;
}
// Forward function: sets up output tensor and launches the unrolled kernel
torch::Tensor forward(
torch::Tensor input,
torch::Tensor weight,
torch::optional<torch::Tensor> bias,
int stride,
int padding,
int output_padding,
int groups,
int dilation = 1
) {
TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
TORCH_CHECK(weight.is_cuda(), "weight must be a CUDA tensor");
TORCH_CHECK(input.dim() == 4, "Input must be 4D");
TORCH_CHECK(weight.dim() == 4, "Weight must be 4D");
const int batch_size = input.size(0);
const int in_channels = input.size(1);
const int in_height = input.size(2);
const int in_width = input.size(3);
// Weight shape assumed: [in_channels, out_channels/groups, kernel_h, kernel_w]
const int out_channels = weight.size(1) * groups; // weight.size(1) is out_channels per group
const int kernel_h = weight.size(2);
const int kernel_w = weight.size(3);
// Calculate output dimensions using transposed convolution formula
const int out_height = (in_height - 1) * stride - 2 * padding + dilation * (kernel_h - 1) + output_padding + 1;
const int out_width = (in_width - 1) * stride - 2 * padding + dilation * (kernel_w - 1) + output_padding + 1;
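    // Note: the kernel writes every output element unconditionally, so
    // torch::empty would also be safe here; zeros is used for robustness.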
auto output = torch::zeros({batch_size, out_channels, out_height, out_width}, input.options());
const int total_elements = batch_size * out_channels * out_height * out_width;
const int threads = 256;
const int blocks = (total_elements + threads - 1) / threads;
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "conv_transpose2d_gather_fused_unroll_cuda", ([&] {
const scalar_t* bias_data = (bias.has_value() && bias->defined()) ? bias->data_ptr<scalar_t>() : nullptr;
conv_transpose2d_gather_fused_unroll_kernel<scalar_t><<<blocks, threads>>>(
input.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
bias_data,
output.data_ptr<scalar_t>(),
batch_size,
in_channels,
in_height,
in_width,
out_channels,
kernel_h,
kernel_w,
stride,
padding,
output_padding,
groups,
dilation,
out_height,
out_width
);
}));
return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "Fused Transposed 2D Convolution with Loop Unrolling (CUDA)",
py::arg("input"),
py::arg("weight"),
py::arg("bias") = py::none(),
py::arg("stride"),
py::arg("padding"),
py::arg("output_padding"),
py::arg("groups"),
py::arg("dilation") = 1);
}
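To exercise the extension end to end, the file can be JIT-compiled and checked against F.conv_transpose2d. This is a sketch; the source file name conv_transpose2d_unroll.cu and the extension name are assumptions:
import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

ext = load(name="conv_transpose2d_gather_fused_unroll",
           sources=["conv_transpose2d_unroll.cu"])  # hypothetical file name

x = torch.randn(16, 32, 128, 128, device="cuda")
w = torch.randn(32, 64, 3, 5, device="cuda")  # [in_channels, out_channels/groups, kH, kW]
out = ext.forward(x, w, None, 1, 0, 0, 1)     # stride, padding, output_padding, groups
ref = F.conv_transpose2d(x, w, stride=1)
print(out.shape)                              # torch.Size([16, 64, 130, 132])
print(torch.allclose(out, ref, atol=1e-4))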
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 2.380 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 2.380 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 59.614 | % | 0.000 | 5 |
Issued Ipc Active | 2.380 | inst/cycle | 0.000 | 5 |
SM Busy | 63.264 | % | 0.000 | 5 |
Memory Throughput | 18230151333.582 | byte/second | 1847833836980172.000 | 5 |
Mem Busy | 54.918 | % | 0.000 | 5 |
Max Bandwidth | 54.684 | % | 0.000 | 5 |
L1/TEX Hit Rate | 85.310 | % | 0.002 | 5 |
L2 Hit Rate | 99.006 | % | 0.174 | 5 |
Mem Pipes Busy | 52.074 | % | 0.000 | 5 |
Warp Cycles Per Issued Instruction | 12.740 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 12.740 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 31.130 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 29.900 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 4.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 32.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 50.000 | % | 0.000 | 5 |
Achieved Occupancy | 47.486 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 30.390 | warp | 0.000 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | FMA is the highest-utilized pipeline (38.4%) based on active cycles, taking into account the rates of its different instructions. It executes 32-bit floating point (FADD, FMUL, FMAD, ...) and integer (IMUL, IMAD) operations. It is well-utilized, but should not be a bottleneck. |
WRN Occupancy | This kernel's theoretical occupancy (50.0%) is limited by the number of required registers. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 337305.13 | μs |
Device Time | 3394.09 | μs |
Self CPU Time | 58.15 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zeros | ||
CPU Time | 156016.49 | μs |
Device Time | 57799.01 | μs |
Self CPU Time | 5264.93 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 9069209.75 | μs |
Device Time | 254083.28 | μs |
Self CPU Time | 10727.72 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 9058484.26 | μs |
Device Time | 254083.28 | μs |
Self CPU Time | 15288.30 | μs |
Self Device Time | 254083.28 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 9052712.87 | μs |
Device Time | 9230.08 | μs |
Self CPU Time | 9052712.87 | μs |
Self Device Time | 9230.08 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void conv_transpose2d_gather_fused_unroll_kernel<float>(float const*, float const*, float const*, float*, int, int, int, int, int, int, int, int, int, int, int, int, int, int) | ||
CPU Time | 0.00 | μs |
Device Time | 9745325.03 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 9745325.03 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceSynchronize | ||
CPU Time | 834785.79 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 834785.79 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 196284.27 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 196284.27 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |