63_conv_standard_2D__square_input__square_kernel
• conv2d_grid_stride_base
import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
stride: int,
padding: int,
dilation: int,
groups: int,
) -> torch.Tensor:
"""
Performs a standard 2D convolution operation with a square input and square kernel.
Args:
x (torch.Tensor): Input tensor.
weight (torch.Tensor): Weight tensor.
bias (torch.Tensor): Bias tensor.
stride (int): Stride of the convolution.
padding (int): Padding applied to the input.
dilation (int): Dilation of the convolution.
groups (int): Number of blocked connections from input channels to output channels.
Returns:
torch.Tensor: Output tensor.
"""
return F.conv2d(
x,
weight,
bias,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
)
class Model(nn.Module):
"""
Performs a standard 2D convolution operation with a square input and square kernel.
Args:
in_channels (int): Number of channels in the input tensor.
out_channels (int): Number of channels produced by the convolution.
kernel_size (int): Size of the square convolution kernel.
stride (int): Stride of the convolution.
padding (int): Padding applied to the input.
dilation (int): Spacing between kernel elements.
groups (int): Number of blocked connections from input channels to output channels.
bias (bool): If `True`, adds a learnable bias to the output.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int,
padding: int,
dilation: int,
groups: int,
bias: bool,
):
super(Model, self).__init__()
# Create a Conv2d layer to get the same initialization
conv = nn.Conv2d(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
)
# Copy the initialized parameters
self.weight = nn.Parameter(conv.weight.clone())
self.bias = nn.Parameter(conv.bias.clone()) if bias else None
self.stride = stride
self.padding = padding
self.dilation = dilation
self.groups = groups
def forward(
self,
x: torch.Tensor,
fn=module_fn,
) -> torch.Tensor:
"""
Performs the 2D convolution.
Args:
x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
Returns:
torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
"""
return fn(
x,
self.weight,
self.bias,
self.stride,
self.padding,
self.dilation,
self.groups,
)
# Constants
batch_size = 16
in_channels = 3
out_channels = 64
kernel_size = 3
width = 256
height = 256
stride = 1
padding = 0
dilation = 1
groups = 1
bias = False
def get_inputs():
x = torch.randn(batch_size, in_channels, height, width)
return [x]
def get_init_inputs():
return [
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
]
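The functional variant above keeps the parameters on the Module but routes the computation through module_fn, so a custom kernel can later be swapped in via the fn argument. A minimal usage sketch under the constants above (illustrative only, not part of the benchmark files):
# Sketch: run the functional Model and check the expected output shape.
# Assumes the definitions above (Model, get_inputs, get_init_inputs, constants) are in scope.
model = Model(*get_init_inputs())
(x,) = get_inputs()
out = model(x)
# With kernel_size=3, stride=1, padding=0, dilation=1:
# H_out = (256 + 2*0 - 1*(3 - 1) - 1) // 1 + 1 = 254
assert out.shape == (batch_size, out_channels, 254, 254)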
import torch
import torch.nn as nn
class Model(nn.Module):
"""
Performs a standard 2D convolution operation with a square input and square kernel.
Args:
in_channels (int): Number of channels in the input tensor.
out_channels (int): Number of channels produced by the convolution.
kernel_size (int): Size of the square convolution kernel.
stride (int, optional): Stride of the convolution. Defaults to 1.
padding (int, optional): Padding applied to the input. Defaults to 0.
dilation (int, optional): Spacing between kernel elements. Defaults to 1.
groups (int, optional): Number of blocked connections from input channels to output channels. Defaults to 1.
bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int = 1,
padding: int = 0,
dilation: int = 1,
groups: int = 1,
bias: bool = False,
):
super(Model, self).__init__()
self.conv2d = nn.Conv2d(
in_channels,
out_channels,
(kernel_size, kernel_size),
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Performs the 2D convolution.
Args:
x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
Returns:
torch.Tensor: Output tensor of shape (batch_size, out_channels, height_out, width_out).
"""
return self.conv2d(x)
# Test code
batch_size = 16
in_channels = 3
out_channels = 64
kernel_size = 3
width = 256
height = 256
stride = 1
padding = 0
dilation = 1
groups = 1
bias = False
def get_inputs():
x = torch.randn(batch_size, in_channels, height, width)
return [x]
def get_init_inputs():
return [
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
] # Provide all constructor arguments (in_channels through bias) for initialization
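For reference, the CUDA host function below computes the output spatial size as (H + 2*padding - kernel_size) / stride + 1, which agrees with PyTorch's general formula because dilation is fixed at 1. A small, illustrative check (the shapes here are hypothetical and not part of the benchmark harness):
import torch
import torch.nn.functional as F

H = W = 256
k, s, p = 3, 1, 0
h_out = (H + 2 * p - k) // s + 1  # 254 for the constants used here
x = torch.randn(1, 3, H, W)
w = torch.randn(8, 3, k, k)  # 8 output channels, chosen arbitrarily for the check
y = F.conv2d(x, w, stride=s, padding=p)
assert y.shape[-2:] == (h_out, h_out)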
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
// This kernel uses grid-stride loops to handle workloads larger than the number of active threads.
// Each thread processes multiple output elements by iterating through a flattened output index space.
__global__ void conv2d_kernel(
const float* __restrict__ input,
const float* __restrict__ weight,
float* __restrict__ output,
const int batch_size,
const int in_channels,
const int out_channels,
const int input_height,
const int input_width,
const int kernel_size,
const int output_height,
const int output_width,
const int stride,
const int padding) {
// Total number of output elements in 4D tensor: (N, out_channels, output_height, output_width)
int total = batch_size * out_channels * output_height * output_width;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int step = blockDim.x * gridDim.x;
// Grid-stride loop: Each thread processes multiple elements if necessary
for (int index = idx; index < total; index += step) {
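// Decompose the flat index into (n, oc, oy, ox); ox varies fastest (NCHW layout)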
int temp = index;
int ox = temp % output_width;
temp /= output_width;
int oy = temp % output_height;
temp /= output_height;
int oc = temp % out_channels;
int n = temp / out_channels;
float sum = 0.0f;
// Convolution accumulation
for (int ic = 0; ic < in_channels; ++ic) {
for (int kh = 0; kh < kernel_size; ++kh) {
int in_y = oy * stride - padding + kh;
if (in_y < 0 || in_y >= input_height)
continue;
for (int kw = 0; kw < kernel_size; ++kw) {
int in_x = ox * stride - padding + kw;
if (in_x < 0 || in_x >= input_width)
continue;
int input_idx = ((n * in_channels + ic) * input_height + in_y) * input_width + in_x;
int weight_idx = ((oc * in_channels + ic) * kernel_size + kh) * kernel_size + kw;
sum += input[input_idx] * weight[weight_idx];
}
}
}
int output_idx = ((n * out_channels + oc) * output_height + oy) * output_width + ox;
output[output_idx] = sum;
}
}
// Host function to launch the kernel
// This implementation assumes a square kernel and supports only dilation == 1 and groups == 1.
torch::Tensor forward(
torch::Tensor x,
torch::Tensor weight,
torch::optional<torch::Tensor> bias,
int stride,
int padding,
int dilation,
int groups) {
TORCH_CHECK(x.is_cuda(), "Input must be a CUDA tensor");
TORCH_CHECK(weight.is_cuda(), "Weight must be a CUDA tensor");
// The kernel indexes raw pointers assuming contiguous NCHW layout, so enforce it here.
TORCH_CHECK(x.is_contiguous(), "Input must be contiguous");
TORCH_CHECK(weight.is_contiguous(), "Weight must be contiguous");
TORCH_CHECK(!bias.has_value() || bias.value().is_cuda(), "Bias must be a CUDA tensor");
TORCH_CHECK(dilation == 1, "Dilation other than 1 is not supported in this kernel");
TORCH_CHECK(groups == 1, "Groups other than 1 are not supported in this kernel");
int batch_size = x.size(0);
int in_channels = x.size(1);
int input_height = x.size(2);
int input_width = x.size(3);
int out_channels = weight.size(0);
int kernel_size = weight.size(2); // assuming square kernel
int output_height = (input_height + 2 * padding - kernel_size) / stride + 1;
int output_width = (input_width + 2 * padding - kernel_size) / stride + 1;
auto output = torch::empty({batch_size, out_channels, output_height, output_width}, x.options());
int total_elements = batch_size * out_channels * output_height * output_width;
int threads = 256;
int blocks = (total_elements + threads - 1) / threads;
conv2d_kernel<<<blocks, threads>>>(
x.data_ptr<float>(),
weight.data_ptr<float>(),
output.data_ptr<float>(),
batch_size,
in_channels,
out_channels,
input_height,
input_width,
kernel_size,
output_height,
output_width,
stride,
padding);
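// Note: this synchronization blocks the host until the kernel finishes, which is why the
// profile below attributes the kernel's wall time to cudaDeviceSynchronize.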
cudaDeviceSynchronize();
if (bias.has_value()) {
output.add_(bias.value().view({1, -1, 1, 1}));
}
return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "CUDA forward function for 2D convolution using grid-stride loops");
}
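One way to exercise the extension and compare it against PyTorch's reference convolution (a sketch: it assumes the CUDA source above is saved as conv2d_grid_stride.cu and a CUDA device is available; the file and module names are illustrative, and the helper used is the standard torch.utils.cpp_extension.load):
import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# Build and load the extension; the PYBIND11_MODULE above exposes `forward`.
ext = load(name="conv2d_grid_stride", sources=["conv2d_grid_stride.cu"])

x = torch.randn(16, 3, 256, 256, device="cuda")
w = torch.randn(64, 3, 3, 3, device="cuda")
out_ref = F.conv2d(x, w, stride=1, padding=0)
out_cuda = ext.forward(x, w, None, 1, 0, 1, 1)  # bias=None, stride, padding, dilation, groups
torch.testing.assert_close(out_cuda, out_ref, rtol=1e-4, atol=1e-4)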
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 3.440 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 3.430 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 85.994 | % | 0.000 | 5 |
Issued Ipc Active | 3.440 | inst/cycle | 0.000 | 5 |
SM Busy | 85.994 | % | 0.000 | 5 |
Memory Throughput | 147319453999.938 | byte/second | 28328722267525996.000 | 5 |
Mem Busy | 35.312 | % | 0.000 | 5 |
Max Bandwidth | 32.734 | % | 0.000 | 5 |
L1/TEX Hit Rate | 79.764 | % | 0.000 | 5 |
L2 Hit Rate | 98.802 | % | 0.007 | 5 |
Mem Pipes Busy | 31.828 | % | 0.000 | 5 |
Warp Cycles Per Issued Instruction | 17.580 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 17.580 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 29.350 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 94.656 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 60.582 | warp | 0.000 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (57.5%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
INF Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. |
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 256527.46 | μs |
Device Time | 1244.82 | μs |
Self CPU Time | 53.96 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 256473.50 | μs |
Device Time | 1244.82 | μs |
Self CPU Time | 99.73 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::empty_strided | ||
CPU Time | 254881.32 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 104.82 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceGetStreamPriorityRange | ||
CPU Time | 254409.86 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 254409.86 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 36866.71 | μs |
Device Time | 27635.71 | μs |
Self CPU Time | 36866.71 | μs |
Self Device Time | 27635.71 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
conv2d_kernel(float const*, float const*, float*, int, int, int, int, int, int, int, int, int, int) | ||
CPU Time | 0.00 | μs |
Device Time | 5276812.23 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 5276812.23 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceSynchronize | ||
CPU Time | 5526421.09 | μs |
Device Time | 392.16 | μs |
Self CPU Time | 5526421.09 | μs |
Self Device Time | 392.16 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 41044.36 | μs |
Device Time | 290806.80 | μs |
Self CPU Time | 8119.63 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 32926.67 | μs |
Device Time | 290806.80 | μs |
Self CPU Time | 10891.61 | μs |
Self Device Time | 290806.80 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 290806.80 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 290806.80 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |