76_conv_standard_1D_dilated_strided__
• conv1d_grid_stride_base
import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    stride: int,
    dilation: int,
) -> torch.Tensor:
    """
    Performs a standard 1D convolution on an asymmetric input, with optional dilation and stride.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_channels, length).
        weight (torch.Tensor): Weight tensor of shape (out_channels, in_channels, kernel_size).
        bias (torch.Tensor): Bias tensor of shape (out_channels,), or None.
        stride (int): Stride of the convolution.
        dilation (int): Dilation of the convolution.

    Returns:
        torch.Tensor: Output tensor of shape (batch_size, out_channels, length_out).
    """
    return F.conv1d(x, weight, bias=bias, stride=stride, dilation=dilation)
class Model(nn.Module):
    """
    Performs a standard 1D convolution on an asymmetric input, with optional dilation and stride.

    Args:
        in_channels (int): Number of channels in the input tensor.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int): Size of the convolution kernel.
        stride (int): Stride of the convolution.
        dilation (int): Spacing between kernel elements.
        bias (bool): If `True`, adds a learnable bias to the output.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        dilation: int,
        bias: bool,
    ):
        super(Model, self).__init__()
        conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            dilation=dilation,
            bias=bias,
        )
        # Copy the initialized parameters
        self.weight = nn.Parameter(conv.weight.clone())
        self.bias = nn.Parameter(conv.bias.clone()) if bias else None
        self.stride = stride
        self.dilation = dilation

    def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
        """
        Performs the 1D convolution.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, length).

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, out_channels, length_out).
        """
        return fn(x, self.weight, self.bias, self.stride, self.dilation)
# Constants
batch_size = 16
in_channels = 3
out_channels = 64
kernel_size = 3
length = 256
stride = 3
dilation = 4
bias = False
def get_inputs():
    x = torch.randn(batch_size, in_channels, length)
    return [x]


def get_init_inputs():
    return [in_channels, out_channels, kernel_size, stride, dilation, bias]
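

# Quick sanity-check sketch (not part of the benchmark harness; the helper name
# is made up for illustration): verify the expected output length for the
# constants above against PyTorch's own conv1d. For an unpadded, dilated,
# strided 1D convolution:
#   length_out = floor((length - dilation * (kernel_size - 1) - 1) / stride) + 1
# which for length=256, kernel_size=3, stride=3, dilation=4 gives 83.
def _check_output_length():
    x = torch.randn(batch_size, in_channels, length)
    w = torch.randn(out_channels, in_channels, kernel_size)
    out = F.conv1d(x, w, stride=stride, dilation=dilation)
    expected = (length - dilation * (kernel_size - 1) - 1) // stride + 1
    assert out.shape == (batch_size, out_channels, expected)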
import torch
import torch.nn as nn
class Model(nn.Module):
    """
    Performs a standard 1D convolution on an asymmetric input, with optional dilation and stride.

    Args:
        in_channels (int): Number of channels in the input tensor.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int): Size of the convolution kernel.
        stride (int, optional): Stride of the convolution. Defaults to 1.
        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
        bias (bool, optional): If `True`, adds a learnable bias to the output. Defaults to `False`.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        bias: bool = False,
    ):
        super(Model, self).__init__()
        self.conv1d = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            dilation=dilation,
            bias=bias,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Performs the 1D convolution.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, length).

        Returns:
            torch.Tensor: Output tensor of shape (batch_size, out_channels, length_out).
        """
        return self.conv1d(x)
# Constants
batch_size = 16
in_channels = 3
out_channels = 64
kernel_size = 3
length = 256
stride = 3
dilation = 4
bias = False
def get_inputs():
    x = torch.randn(batch_size, in_channels, length)
    return [x]


def get_init_inputs():
    return [in_channels, out_channels, kernel_size, stride, dilation, bias]
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
// CUDA kernel using grid-stride loops to handle workloads larger than the number of available threads
__global__ void conv1d_kernel(
    const float* __restrict__ x,
    const float* __restrict__ weight,
    const float* __restrict__ bias,
    float* __restrict__ output,
    int B,
    int in_channels,
    int in_size,
    int out_channels,
    int kernel_size,
    int out_size,
    int stride,
    int dilation
) {
    int total_elements = B * out_channels * out_size;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int grid_stride = blockDim.x * gridDim.x;

    // Grid-stride loop: each thread processes multiple output elements if needed
    for (; idx < total_elements; idx += grid_stride) {
        // Decompose the flat index into (batch, output channel, output position)
        int o = idx % out_size;
        int tmp = idx / out_size;
        int oc = tmp % out_channels;
        int b = tmp / out_channels;

        float sum = 0.0f;
        // Loop over all input channels and the convolution kernel
        for (int ic = 0; ic < in_channels; ++ic) {
            for (int k = 0; k < kernel_size; ++k) {
                int input_pos = o * stride + k * dilation;
                // With out_size derived from the output-size formula in the host
                // code, this bound always holds; it is kept as a cheap safety guard.
                if (input_pos < in_size) {
                    int x_idx = b * in_channels * in_size + ic * in_size + input_pos;
                    int w_idx = oc * in_channels * kernel_size + ic * kernel_size + k;
                    sum += x[x_idx] * weight[w_idx];
                }
            }
        }
        if (bias != nullptr) {
            sum += bias[oc];
        }
        int out_idx = b * out_channels * out_size + oc * out_size + o;
        output[out_idx] = sum;
    }
}
// Forward function exposed via pybind11
torch::Tensor forward(
    torch::Tensor x,
    torch::Tensor weight,
    torch::optional<torch::Tensor> bias,
    int stride,
    int dilation
) {
    TORCH_CHECK(x.device().is_cuda(), "x must be a CUDA tensor");
    TORCH_CHECK(weight.device().is_cuda(), "weight must be a CUDA tensor");
    TORCH_CHECK(x.is_contiguous(), "x must be contiguous");
    TORCH_CHECK(weight.is_contiguous(), "weight must be contiguous");
    TORCH_CHECK(x.dim() == 3, "x must be 3D");
    TORCH_CHECK(weight.dim() == 3, "weight must be 3D");
    TORCH_CHECK(weight.size(1) == x.size(1), "Input channels mismatch");
    if (bias.has_value()) {
        TORCH_CHECK(bias->device().is_cuda(), "bias must be a CUDA tensor");
        TORCH_CHECK(bias->is_contiguous(), "bias must be contiguous");
        TORCH_CHECK(bias->dim() == 1, "bias must be 1D");
        TORCH_CHECK(bias->size(0) == weight.size(0), "Bias size mismatch");
    }

    int B = x.size(0);
    int in_channels = x.size(1);
    int in_size = x.size(2);
    int out_channels = weight.size(0);
    int kernel_size = weight.size(2);

    // Output length for an unpadded, dilated, strided 1D convolution:
    // out_size = floor((in_size - dilation * (kernel_size - 1) - 1) / stride) + 1
    int out_size = (in_size - dilation * (kernel_size - 1) - 1) / stride + 1;
    TORCH_CHECK(out_size > 0, "Invalid output size");

    auto output = torch::empty({B, out_channels, out_size}, x.options());
    if (output.numel() == 0) return output;

    const float* x_data = x.data_ptr<float>();
    const float* weight_data = weight.data_ptr<float>();
    const float* bias_data = bias.has_value() ? bias.value().data_ptr<float>() : nullptr;
    float* output_data = output.data_ptr<float>();

    int total_elements = B * out_channels * out_size;
    int threads = 256;
    // One thread per output element; if the grid were capped below this count,
    // the kernel's grid-stride loop would still cover every element.
    int blocks = (total_elements + threads - 1) / threads;

    conv1d_kernel<<<blocks, threads>>>(
        x_data,
        weight_data,
        bias_data,
        output_data,
        B,
        in_channels,
        in_size,
        out_channels,
        kernel_size,
        out_size,
        stride,
        dilation
    );

    cudaError_t err = cudaGetLastError();
    TORCH_CHECK(err == cudaSuccess, "Kernel launch error: ", cudaGetErrorString(err));
    return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "1D convolution forward (CUDA) with grid-stride loop");
}
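
To run the extension against the PyTorch reference, one option is to JIT-compile it with torch.utils.cpp_extension.load. The sketch below is illustrative only: the module name and source filename are assumptions, not part of the original harness.

# Hypothetical build-and-check sketch; "conv1d_grid_stride" and the .cu filename
# are assumed names, and the comparison mirrors the constants used above.
import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

conv1d_ext = load(
    name="conv1d_grid_stride",          # assumed module name
    sources=["conv1d_grid_stride.cu"],  # assumed path to the CUDA source above
)

x = torch.randn(16, 3, 256, device="cuda")
w = torch.randn(64, 3, 3, device="cuda")
out = conv1d_ext.forward(x, w, None, 3, 4)   # stride=3, dilation=4
ref = F.conv1d(x, w, stride=3, dilation=4)
print(torch.allclose(out, ref, atol=1e-5))   # expected: True
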
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 1.062 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.642 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 27.524 | % | 0.218 | 5 |
Issued Ipc Active | 1.102 | inst/cycle | 0.000 | 5 |
SM Busy | 27.524 | % | 0.218 | 5 |
Memory Throughput | 10850510415.646 | byte/second | 8923051982430478.000 | 5 |
Mem Busy | 7.406 | % | 0.006 | 5 |
Max Bandwidth | 5.406 | % | 0.003 | 5 |
L1/TEX Hit Rate | 87.020 | % | 0.000 | 5 |
L2 Hit Rate | 99.422 | % | 0.109 | 5 |
Mem Pipes Busy | 13.672 | % | 0.016 | 5 |
Warp Cycles Per Issued Instruction | 16.960 | cycle | 0.091 | 5 |
Warp Cycles Per Executed Instruction | 17.606 | cycle | 0.098 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 28.050 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 29.092 | % | 0.120 | 5 |
Achieved Active Warps Per SM | 18.616 | warp | 0.050 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (29.4%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
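
The operator timings below come from the PyTorch profiler. As a complementary check, device time for the custom kernel alone can be measured with CUDA events; the sketch assumes the hypothetical conv1d_ext module from the build sketch above.

# Minimal CUDA-event timing sketch (assumes the hypothetical conv1d_ext module).
import torch

x = torch.randn(16, 3, 256, device="cuda")
w = torch.randn(64, 3, 3, device="cuda")

conv1d_ext.forward(x, w, None, 3, 4)  # warm-up launch
torch.cuda.synchronize()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(100):
    conv1d_ext.forward(x, w, None, 3, 4)
end.record()
torch.cuda.synchronize()
print(start.elapsed_time(end) / 100, "ms per call")  # elapsed_time returns milliseconds
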
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 667166.28 | μs |
Device Time | 6.40 | μs |
Self CPU Time | 51.81 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 667114.46 | μs |
Device Time | 6.40 | μs |
Self CPU Time | 94.38 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::empty_strided | ||
CPU Time | 666898.97 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 109.84 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceGetStreamPriorityRange | ||
CPU Time | 666465.20 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 666465.20 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 522395.17 | μs |
Device Time | 708.73 | μs |
Self CPU Time | 522395.17 | μs |
Self Device Time | 708.73 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
conv1d_kernel(float const*, float const*, float const*, float*, int, int, int, int, int, int, int, int) | ||
CPU Time | 0.00 | μs |
Device Time | 44799.73 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 44799.73 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaEventRecord | ||
CPU Time | 19044.83 | μs |
Device Time | 126673.53 | μs |
Self CPU Time | 19044.83 | μs |
Self Device Time | 126673.53 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 67081.31 | μs |
Device Time | 652655.57 | μs |
Self CPU Time | 13563.24 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 53519.87 | μs |
Device Time | 652655.57 | μs |
Self CPU Time | 18540.10 | μs |
Self Device Time | 652655.57 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 652655.57 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 652655.57 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |