54_Conv2d_Multiply_LeakyReLU_GELU
• dynamic_block_size_54conv_base
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    conv_weight: torch.Tensor,
    conv_bias: torch.Tensor,
    multiplier: torch.Tensor,
) -> torch.Tensor:
    """
    Applies convolution, scalar multiplication, LeakyReLU and GELU.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width)
        conv_weight (torch.Tensor): Convolution weights of shape (out_channels, in_channels, kernel_size, kernel_size)
        conv_bias (torch.Tensor): Convolution bias of shape (out_channels)
        multiplier (torch.Tensor): Learnable scalar of shape (out_channels, 1, 1)

    Returns:
        torch.Tensor: Output tensor after applying convolution, multiplication, LeakyReLU and GELU
    """
    x = F.conv2d(x, conv_weight, bias=conv_bias)
    x = x * multiplier
    x = F.leaky_relu(x)
    x = F.gelu(x)
    return x


class Model(nn.Module):
    """
    Model that performs a convolution, multiplies by a learnable scalar, applies LeakyReLU, and then GELU.
    """

    def __init__(self, in_channels, out_channels, kernel_size, multiplier_shape):
        super(Model, self).__init__()
        conv = nn.Conv2d(in_channels, out_channels, kernel_size)
        self.conv_weight = nn.Parameter(conv.weight)
        self.conv_bias = nn.Parameter(conv.bias)
        self.multiplier = nn.Parameter(torch.randn(multiplier_shape) * 0.02)

    def forward(self, x, fn=module_fn):
        return fn(x, self.conv_weight, self.conv_bias, self.multiplier)


batch_size = 128
in_channels = 3
out_channels = 16
height, width = 32, 32
kernel_size = 3
multiplier_shape = (out_channels, 1, 1)


def get_inputs():
    return [torch.randn(batch_size, in_channels, height, width)]


def get_init_inputs():
    return [in_channels, out_channels, kernel_size, multiplier_shape]
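A quick way to exercise the functional variant above is to build the Model from get_init_inputs() and compare its forward pass against calling module_fn directly. This is a minimal sanity-check sketch, assuming the definitions above are in scope; it is not part of the benchmark harness.

```python
import torch

# Minimal sanity check (assumes the definitions above are in scope).
model = Model(*get_init_inputs())
(x,) = get_inputs()

with torch.no_grad():
    out_module = model(x)  # forward() delegates to module_fn by default
    out_fn = module_fn(x, model.conv_weight, model.conv_bias, model.multiplier)

print(out_module.shape)                    # torch.Size([128, 16, 30, 30]): 3x3 valid conv on 32x32
print(torch.allclose(out_module, out_fn))  # True: both paths run the same computation
```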
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    Model that performs a convolution, multiplies by a learnable scalar, applies LeakyReLU, and then GELU.
    """

    def __init__(self, in_channels, out_channels, kernel_size, multiplier_shape):
        super(Model, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
        self.multiplier = nn.Parameter(torch.randn(multiplier_shape) * 0.02)
        self.leaky_relu = nn.LeakyReLU()

    def forward(self, x):
        x = self.conv(x)
        x = x * self.multiplier
        x = self.leaky_relu(x)
        x = torch.nn.functional.gelu(x)
        return x


batch_size = 128
in_channels = 3
out_channels = 16
height, width = 32, 32
kernel_size = 3
multiplier_shape = (out_channels, 1, 1)


def get_inputs():
    return [torch.randn(batch_size, in_channels, height, width)]


def get_init_inputs():
    return [in_channels, out_channels, kernel_size, multiplier_shape]
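The two Model definitions compute the same function once their parameters agree: the first takes the convolution weights explicitly through module_fn, the second owns an nn.Conv2d. A minimal equivalence sketch follows; the names FunctionalModel and ReferenceModel are placeholders for the first and second definitions above, since both are literally called Model.

```python
import torch

# Hypothetical names: FunctionalModel = first definition (explicit parameters),
# ReferenceModel = second definition (nn.Conv2d member).
ref = ReferenceModel(in_channels, out_channels, kernel_size, multiplier_shape)
func = FunctionalModel(in_channels, out_channels, kernel_size, multiplier_shape)

# Copy the reference parameters into the functional variant so the two
# models describe the same function.
with torch.no_grad():
    func.conv_weight.copy_(ref.conv.weight)
    func.conv_bias.copy_(ref.conv.bias)
    func.multiplier.copy_(ref.multiplier)

(x,) = get_inputs()
with torch.no_grad():
    print(torch.allclose(ref(x), func(x)))  # True: conv -> scale -> LeakyReLU(0.01) -> GELU in both
```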
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <cmath>
#include <stdio.h>
// Device function: GELU approximation
__device__ __forceinline__ float gelu(float x) {
    const float k0 = 0.7978845608028654f; // sqrt(2/pi)
    return 0.5f * x * (1.0f + tanhf(k0 * (x + 0.044715f * x * x * x)));
}
// CUDA kernel that performs convolution, scalar multiplication, LeakyReLU and GELU.
// - input: [batch_size, in_channels, input_h, input_w]
// - weight: [out_channels, in_channels, kernel_size, kernel_size]
// - bias: [out_channels]
// - multiplier: [out_channels, 1, 1], i.e. one scale per output channel
//   (indexed linearly as multiplier[oc] and broadcast over the spatial dims)
// - output: [batch_size, out_channels, output_h, output_w]
__global__ void conv_forward_kernel(
    const float* __restrict__ input,
    const float* __restrict__ weight,
    const float* __restrict__ bias,
    const float* __restrict__ multiplier,
    float* __restrict__ output,
    int batch_size,
    int in_channels,
    int input_h,
    int input_w,
    int out_channels,
    int kernel_size,
    int output_h,
    int output_w
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total = batch_size * out_channels * output_h * output_w;
    int stride = blockDim.x * gridDim.x;

    // Grid-stride loop to cover all output elements.
    for (int index = idx; index < total; index += stride) {
        // Decompose the linear index into output coordinates (n, oc, oh, ow).
        int ow = index % output_w;
        int tmp = index / output_w;
        int oh = tmp % output_h;
        tmp = tmp / output_h;
        int oc = tmp % out_channels;
        int n = tmp / out_channels;

        // Start with the bias for output channel oc.
        float sum = bias[oc];

        // Convolution: iterate over input channels and the kernel window.
        for (int ic = 0; ic < in_channels; ic++) {
            for (int i = 0; i < kernel_size; i++) {
                for (int j = 0; j < kernel_size; j++) {
                    int in_h = oh + i; // stride = 1, no padding
                    int in_w = ow + j;
                    int input_index = ((n * in_channels + ic) * input_h + in_h) * input_w + in_w;
                    int weight_index = ((oc * in_channels + ic) * kernel_size + i) * kernel_size + j;
                    sum += input[input_index] * weight[weight_index];
                }
            }
        }

        // Multiply by the channel-specific multiplier.
        sum *= multiplier[oc];

        // Apply LeakyReLU (negative slope = 0.01, matching the PyTorch default).
        sum = (sum > 0.0f) ? sum : 0.01f * sum;

        // Apply GELU activation.
        output[index] = gelu(sum);
    }
}
// C++ interface (to be called from Python).
// The extra 'block_size' parameter allows experimentation with different block sizes (e.g., 32, 64, 128, 256, 512).
torch::Tensor forward_cuda(
    torch::Tensor input,
    torch::Tensor conv_weight,
    torch::Tensor conv_bias,
    torch::Tensor multiplier,
    int block_size = 256 // Default block size
) {
    // The kernel assumes contiguous float32 CUDA tensors.
    TORCH_CHECK(input.is_cuda() && conv_weight.is_cuda() && conv_bias.is_cuda() && multiplier.is_cuda(),
                "all tensors must be CUDA tensors");
    TORCH_CHECK(input.is_contiguous() && conv_weight.is_contiguous() &&
                conv_bias.is_contiguous() && multiplier.is_contiguous(),
                "all tensors must be contiguous");

    // Get input dimensions
    const int batch_size = input.size(0);
    const int in_channels = input.size(1);
    const int input_h = input.size(2);
    const int input_w = input.size(3);

    // Get convolution parameters (stride 1, no padding -> "valid" output size)
    const int out_channels = conv_weight.size(0);
    const int kernel_size = conv_weight.size(2);
    const int output_h = input_h - kernel_size + 1;
    const int output_w = input_w - kernel_size + 1;

    // Allocate output tensor
    auto output = torch::empty({batch_size, out_channels, output_h, output_w}, input.options());

    // Calculate the total number of elements to process
    const int total_elements = batch_size * out_channels * output_h * output_w;
    const int blocks = (total_elements + block_size - 1) / block_size;

    // Launch the CUDA kernel with the requested block size
    conv_forward_kernel<<<blocks, block_size>>>(
        input.data_ptr<float>(),
        conv_weight.data_ptr<float>(),
        conv_bias.data_ptr<float>(),
        multiplier.data_ptr<float>(),
        output.data_ptr<float>(),
        batch_size,
        in_channels,
        input_h,
        input_w,
        out_channels,
        kernel_size,
        output_h,
        output_w
    );

    // Surface kernel launch errors instead of returning silently wrong results.
    cudaError_t err = cudaGetLastError();
    TORCH_CHECK(err == cudaSuccess, "CUDA kernel launch failed: ", cudaGetErrorString(err));

    return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward_cuda,
          "Convolution, scalar multiplication, LeakyReLU and GELU (CUDA) with dynamic block size",
          py::arg("input"),
          py::arg("conv_weight"),
          py::arg("conv_bias"),
          py::arg("multiplier"),
          py::arg("block_size") = 256);
}
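One way to try the extension is to JIT-compile the source with torch.utils.cpp_extension.load, validate it against the eager reference model, and sweep the exposed block_size argument. This is a sketch under assumptions: the CUDA source above is saved as dynamic_block_size_54conv_base.cu (an assumed file name), a CUDA toolchain is available, and Model refers to the second (nn.Conv2d-based) definition above. The kernel uses the tanh GELU approximation while F.gelu defaults to the exact formulation, so the comparison needs a small tolerance.

```python
import time

import torch
from torch.utils.cpp_extension import load

# Assumed file name for the CUDA source listed above.
ext = load(name="conv54_dynamic_block", sources=["dynamic_block_size_54conv_base.cu"])

model = Model(*get_init_inputs()).cuda()  # reference nn.Module (second definition above)
x = get_inputs()[0].cuda()

with torch.no_grad():
    ref = model(x)
    out = ext.forward(x, model.conv.weight, model.conv.bias, model.multiplier)
# Tolerance covers the tanh GELU approximation and float accumulation order.
print(torch.allclose(ref, out, atol=1e-3))

# Rough timing sweep over the block sizes the interface is meant to explore.
for block_size in (32, 64, 128, 256, 512):
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(100):
        ext.forward(x, model.conv.weight, model.conv.bias, model.multiplier, block_size=block_size)
    torch.cuda.synchronize()
    print(block_size, (time.perf_counter() - t0) / 100)
```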
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 3.174 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 2.948 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 79.524 | % | 0.011 | 5 |
Issued Ipc Active | 3.182 | inst/cycle | 0.000 | 5 |
SM Busy | 79.524 | % | 0.011 | 5 |
Memory Throughput | 35465307694.476 | byte/second | 18267120055705660.000 | 5 |
Mem Busy | 53.260 | % | 0.051 | 5 |
Max Bandwidth | 36.386 | % | 0.024 | 5 |
L1/TEX Hit Rate | 87.932 | % | 0.000 | 5 |
L2 Hit Rate | 92.528 | % | 0.058 | 5 |
Mem Pipes Busy | 35.864 | % | 0.023 | 5 |
Warp Cycles Per Issued Instruction | 16.718 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 16.758 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 29.300 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 83.396 | % | 0.007 | 5 |
Achieved Active Warps Per SM | 53.372 | warp | 0.003 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (45.3%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (83.3%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 273281.13 | μs |
Device Time | 81.41 | μs |
Self CPU Time | 60.83 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 273220.31 | μs |
Device Time | 81.41 | μs |
Self CPU Time | 115.35 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::empty_strided | ||
CPU Time | 272733.99 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 135.75 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceGetStreamPriorityRange | ||
CPU Time | 272196.69 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 272196.69 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 634183.26 | μs |
Device Time | 14806.58 | μs |
Self CPU Time | 634183.26 | μs |
Self Device Time | 14806.58 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
conv_forward_kernel(float const*, float const*, float const*, float const*, float*, int, int, int, int, int, int, int, int) | ||
CPU Time | 0.00 | μs |
Device Time | 259177.32 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 259177.32 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaEventRecord | ||
CPU Time | 19900.72 | μs |
Device Time | 28907.56 | μs |
Self CPU Time | 19900.72 | μs |
Self Device Time | 28907.56 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 219910.49 | μs |
Device Time | 556300.08 | μs |
Self CPU Time | 12885.56 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 207026.92 | μs |
Device Time | 556300.08 | μs |
Self CPU Time | 15366.92 | μs |
Self Device Time | 556300.08 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 556300.08 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 556300.08 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |