43_Conv3d_Max_LogSumExp_ReLU
• warp_uniform_kernel_base_base
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    stride: int,
    padding: int,
    conv_weight: torch.Tensor,
    conv_bias: torch.Tensor,
) -> torch.Tensor:
    """
    Applies 3D convolution, max pooling, log sum exp, and ReLU activation.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width)
        stride (int): Stride of the convolution
        padding (int): Padding of the convolution
        conv_weight (torch.Tensor): Convolution weight tensor
        conv_bias (torch.Tensor): Convolution bias tensor

    Returns:
        torch.Tensor: Output tensor after applying convolution, max pooling, logsumexp and ReLU
    """
    x = F.conv3d(x, conv_weight, bias=conv_bias, stride=stride, padding=padding)
    x = F.max_pool3d(x, kernel_size=2, stride=2)
    x = torch.logsumexp(x, dim=1, keepdim=True)
    x = F.relu(x)
    return x


class Model(nn.Module):
    """
    Model that performs a 3D convolution, max pooling, log sum exp, and ReLU activation.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(Model, self).__init__()
        conv = nn.Conv3d(
            in_channels, out_channels, kernel_size, stride=stride, padding=padding
        )
        self.conv_weight = nn.Parameter(conv.weight)
        self.conv_bias = nn.Parameter(
            conv.bias
            + torch.randn(
                conv.bias.shape, device=conv.bias.device, dtype=conv.bias.dtype
            )
            * 0.02
        )

    def forward(self, x, stride, padding, fn=module_fn):
        return fn(x, stride, padding, self.conv_weight, self.conv_bias)


batch_size = 128
in_channels = 3
out_channels = 16
depth, height, width = 16, 32, 32
kernel_size = 3
stride = 1
padding = 1


def get_inputs():
    return [torch.randn(batch_size, in_channels, depth, height, width), stride, padding]


def get_init_inputs():
    return [in_channels, out_channels, kernel_size, stride, padding]
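A minimal sketch of how the functional harness above can be exercised end to end. The device selection is an assumption on top of the original listing; the printed shape follows from kernel_size=3, stride=1, padding=1 and the 2x2x2 max pool, with logsumexp collapsing the channel dimension to 1.

# Sketch: run the functional harness once and check the output shape.
# The device selection is an assumption, not part of the original harness.
if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Model(*get_init_inputs()).to(device)
    x, s, p = get_inputs()
    out = model(x.to(device), s, p)
    print(out.shape)  # torch.Size([128, 1, 8, 16, 16])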
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    Model that performs a 3D convolution, max pooling, log sum exp, and ReLU activation.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(Model, self).__init__()
        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, padding=padding)
        self.conv.bias = nn.Parameter(self.conv.bias + torch.randn(self.conv.bias.shape, device=self.conv.bias.device, dtype=self.conv.bias.dtype) * 0.02)
        self.max_pool = nn.MaxPool3d(kernel_size=2, stride=2)

    def forward(self, x):
        """
        Args:
            x: Input tensor of shape (batch_size, in_channels, depth, height, width)
        Returns:
            Output tensor of shape (batch_size, 1, depth', height', width'); logsumexp
            over dim=1 with keepdim=True reduces the channel dimension to 1
        """
        x = self.conv(x)
        x = self.max_pool(x)
        x = torch.logsumexp(x, dim=1, keepdim=True)
        x = torch.relu(x)
        return x


batch_size = 128
in_channels = 3
out_channels = 16
depth, height, width = 16, 32, 32
kernel_size = 3
stride = 1
padding = 1


def get_inputs():
    return [torch.randn(batch_size, in_channels, depth, height, width)]


def get_init_inputs():
    return [in_channels, out_channels, kernel_size, stride, padding]
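The CUDA extension below fuses the logsumexp/ReLU tail into a single kernel and relies on the standard max-subtraction identity for numerical stability. A small sketch of that identity in plain PyTorch (the tensor shape here is illustrative only):

# Sketch of the stable log-sum-exp identity used by the kernel below:
# logsumexp(x, dim=1) = m + log(sum(exp(x - m))), with m = max over dim=1.
import torch

t = torch.randn(2, 16, 8, 16, 16)  # illustrative shape
m = t.amax(dim=1, keepdim=True)
stable = m + torch.log(torch.exp(t - m).sum(dim=1, keepdim=True))
assert torch.allclose(stable, torch.logsumexp(t, dim=1, keepdim=True), atol=1e-5)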
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cfloat>

template <int WARP_SIZE = 32>
__global__ void warp_uniform_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const int N, const int C, const int D, const int H, const int W) {

    const int tid = threadIdx.x;
    const int wid = tid / WARP_SIZE;
    const int lane = tid % WARP_SIZE;
    const int bid = blockIdx.x;

    const int spatial_size = D * H * W;
    const int total_elements = N * spatial_size;
    const int warps_per_block = blockDim.x / WARP_SIZE;
    const int elements_per_iter = gridDim.x * warps_per_block * WARP_SIZE;

    // Grid-stride loop: consecutive lanes handle consecutive output elements, and every
    // lane runs the same channel loops, so control flow stays uniform across the warp.
    for (int base_idx = bid * warps_per_block * WARP_SIZE + wid * WARP_SIZE + lane;
         base_idx < total_elements;
         base_idx += elements_per_iter) {

        // Per-thread output position (n, d, h, w)
        const int n = base_idx / spatial_size;
        const int pos = base_idx % spatial_size;
        const int d = pos / (H * W);
        const int hw = pos % (H * W);
        const int h = hw / W;
        const int w = hw % W;

        // Offset of channel 0 at this position; channel c is spatial_size elements apart
        const int base_offset = n * (C * spatial_size) + d * (H * W) + h * W + w;

        // Pass 1: maximum over channels (for the numerically stable log-sum-exp)
        float max_val = -FLT_MAX;
        #pragma unroll 4
        for (int c = 0; c < C; c++) {
            const float val = input[base_offset + c * spatial_size];
            max_val = fmaxf(max_val, val);
        }

        // Pass 2: sum of exponentials, shifted by the maximum
        float sum_exp = 0.0f;
        #pragma unroll 4
        for (int c = 0; c < C; c++) {
            const float val = input[base_offset + c * spatial_size];
            sum_exp += __expf(val - max_val);
        }

        // logsumexp followed by ReLU
        const float result = fmaxf(0.0f, max_val + __logf(sum_exp));

        // Coalesced write: adjacent lanes write adjacent addresses
        output[base_idx] = result;
    }
}

torch::Tensor forward(
    torch::Tensor x,
    int64_t stride,
    int64_t padding,
    torch::Tensor conv_weight,
    torch::Tensor conv_bias) {

    x = x.contiguous();
    conv_weight = conv_weight.contiguous();
    conv_bias = conv_bias.contiguous();

    auto conv_result = torch::conv3d(x, conv_weight, conv_bias,
                                     {stride, stride, stride},
                                     {padding, padding, padding});
    auto pool_result = torch::max_pool3d(conv_result, {2, 2, 2}, {2, 2, 2});

    const int N = pool_result.size(0);
    const int C = pool_result.size(1);
    const int D = pool_result.size(2);
    const int H = pool_result.size(3);
    const int W = pool_result.size(4);

    auto output = torch::empty({N, 1, D, H, W}, pool_result.options());

    // One thread per output element; block size is a multiple of the warp size (32)
    const int threads_per_block = 256;
    const int num_blocks = (N * D * H * W + threads_per_block - 1) / threads_per_block;

    warp_uniform_kernel<32><<<num_blocks, threads_per_block>>>(
        pool_result.data_ptr<float>(),
        output.data_ptr<float>(),
        N, C, D, H, W
    );

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Warp uniform kernel implementation");
}
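A hedged sketch of how the extension might be JIT-built and checked against the PyTorch reference above; the source file name, module name, and tolerance are assumptions, not part of the original listing, and module_fn is assumed to be importable from the functional harness.

# Sketch: build the extension with torch.utils.cpp_extension.load and compare it
# against module_fn. The file name "warp_uniform_kernel.cu" and atol are assumptions.
import torch
from torch.utils.cpp_extension import load

ext = load(name="warp_uniform", sources=["warp_uniform_kernel.cu"])
x = torch.randn(128, 3, 16, 32, 32, device="cuda")
weight = torch.randn(16, 3, 3, 3, 3, device="cuda")
bias = torch.randn(16, device="cuda")
ref = module_fn(x, 1, 1, weight, bias)
out = ext.forward(x, 1, 1, weight, bias)
print(torch.allclose(ref, out, atol=1e-4))  # fast-math __expf/__logf may differ slightly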
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 1.378 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 1.002 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 35.124 | % | 0.034 | 5 |
Issued Ipc Active | 1.404 | inst/cycle | 0.000 | 5 |
SM Busy | 35.124 | % | 0.034 | 5 |
Memory Throughput | 1592482846323.352 | byte/second | 74564256921934053376.000 | 5 |
Mem Busy | 30.446 | % | 0.036 | 5 |
Max Bandwidth | 47.640 | % | 0.087 | 5 |
L1/TEX Hit Rate | 42.330 | % | 0.001 | 5 |
L2 Hit Rate | 22.580 | % | 0.001 | 5 |
Mem Pipes Busy | 12.718 | % | 0.005 | 5 |
Warp Cycles Per Issued Instruction | 38.164 | cycle | 0.021 | 5 |
Warp Cycles Per Executed Instruction | 38.888 | cycle | 0.022 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 30.690 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 84.086 | % | 0.013 | 5 |
Achieved Active Warps Per SM | 53.816 | warp | 0.005 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (84.2%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
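The launch-limit figures above (block limit of 8 from registers, 64 theoretical active warps per SM) can be checked against the CUDA occupancy API. A minimal sketch, assuming it is compiled in the same translation unit as warp_uniform_kernel:

// Sketch: query resident blocks per SM for the 256-thread launch used above.
// Assumes warp_uniform_kernel<32> is visible in this translation unit.
#include <cuda_runtime.h>
#include <cstdio>

void report_occupancy() {
    int blocks_per_sm = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &blocks_per_sm, warp_uniform_kernel<32>, /*blockSize=*/256, /*dynamicSMemSize=*/0);
    // 8 blocks * 256 threads = 64 warps per SM, i.e. 100% theoretical occupancy here.
    printf("resident blocks per SM at 256 threads: %d\n", blocks_per_sm);
}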
Operation | CPU Time (μs) | Device Time (μs) | Self CPU Time (μs) | Self Device Time (μs) |
---|---|---|---|---|
aten::conv3d | 4718663.58 | 4793942.82 | 13358.33 | 0.00 |
aten::convolution | 4705305.25 | 4793942.82 | 18193.48 | 0.00 |
aten::_convolution | 4687111.77 | 4793942.82 | 39604.55 | 0.00 |
aten::cudnn_convolution | 3777914.63 | 3895835.35 | 183721.29 | 3895835.35 |
cudaLaunchKernelExC | 3559266.42 | 0.00 | 3559266.42 | 0.00 |
sm80_xmma_fprop_implicit_gemm_f32f32_f32f32_f32_nchwkcrs_nchw_tilesize128x32x8_stage3_warpsize2x2x1_g1_ffma_aligna4_alignc4_execute_kernel__5x_cudnn | 0.00 | 3895832.37 | 0.00 | 3895832.37 |
CPU and device memory usage (self and total) were reported as 0 B for every operation.