8_Conv3d_Divide_Max_GlobalAvgPool_BiasAdd_Sum
• fused_pooling_unroll_edit_1
import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor,
divisor: float,
pool_size: tuple,
sum_dim: int,
conv_weight: torch.Tensor,
conv_bias: torch.Tensor,
bias: torch.Tensor,
) -> torch.Tensor:
"""
Applies 3D convolution, division, max pooling, global average pooling, bias addition and sum.
Args:
x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width)
divisor (float): Constant to divide by
pool_size (tuple): Size for max pooling (depth, height, width)
sum_dim (int): Dimension to sum over
conv_weight (torch.Tensor): 3D convolution weights
conv_bias (torch.Tensor): 3D convolution bias
bias (torch.Tensor): Bias tensor for addition
Returns:
torch.Tensor: Output tensor after applying all operations
"""
x = F.conv3d(x, conv_weight, bias=conv_bias)
x = x / divisor
x = F.max_pool3d(x, pool_size)
x = F.adaptive_avg_pool3d(x, (1, 1, 1))
x = x + bias
x = torch.sum(x, dim=sum_dim)
return x
class Model(nn.Module):
"""
Model that performs a 3D convolution, divides by a constant, applies max pooling,
global average pooling, adds a bias term, and sums along a specific dimension.
"""
def __init__(
self,
in_channels,
out_channels,
kernel_size,
divisor,
pool_size,
bias_shape,
sum_dim,
):
super(Model, self).__init__()
        conv = nn.Conv3d(in_channels, out_channels, kernel_size)
        self.conv_weight = conv.weight
        self.conv_bias = conv.bias
        self.bias = nn.Parameter(torch.randn(bias_shape) * 0.02)
def forward(self, x, fn=module_fn):
return fn(
x, divisor, pool_size, sum_dim, self.conv_weight, self.conv_bias, self.bias
)
batch_size = 128
in_channels = 3
out_channels = 16
depth, height, width = 16, 32, 32
kernel_size = (3, 3, 3)
divisor = 2.0
pool_size = (2, 2, 2)
bias_shape = (out_channels, 1, 1, 1)
sum_dim = 1
def get_inputs():
return [torch.randn(batch_size, in_channels, depth, height, width)]
def get_init_inputs():
return [
in_channels,
out_channels,
kernel_size,
divisor,
pool_size,
bias_shape,
sum_dim,
]
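A minimal smoke test for the functional reference above (illustrative only; the __main__ guard and the shape check are not part of the benchmark harness):
if __name__ == "__main__":
    # Build the model from the harness helpers and run one forward pass on CPU.
    model = Model(*get_init_inputs())
    (x,) = get_inputs()
    out = model(x)
    # Global average pooling collapses the spatial dims to 1, and summing over
    # dim=1 (channels) leaves shape (batch_size, 1, 1, 1).
    print(out.shape)  # expected: torch.Size([128, 1, 1, 1])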
import torch
import torch.nn as nn
class Model(nn.Module):
"""
Model that performs a 3D convolution, divides by a constant, applies max pooling,
global average pooling, adds a bias term, and sums along a specific dimension.
"""
def __init__(self, in_channels, out_channels, kernel_size, divisor, pool_size, bias_shape, sum_dim):
super(Model, self).__init__()
self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
self.divisor = divisor
self.max_pool = nn.MaxPool3d(pool_size)
self.global_avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.bias = nn.Parameter(torch.randn(bias_shape) * 0.02)
self.sum_dim = sum_dim
def forward(self, x):
x = self.conv(x)
x = x / self.divisor
x = self.max_pool(x)
x = self.global_avg_pool(x)
x = x + self.bias
x = torch.sum(x, dim=self.sum_dim)
return x
batch_size = 128
in_channels = 3
out_channels = 16
depth, height, width = 16, 32, 32
kernel_size = (3, 3, 3)
divisor = 2.0
pool_size = (2, 2, 2)
bias_shape = (out_channels, 1, 1, 1)
sum_dim = 1
def get_inputs():
return [torch.randn(batch_size, in_channels, depth, height, width)]
def get_init_inputs():
return [in_channels, out_channels, kernel_size, divisor, pool_size, bias_shape, sum_dim]
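The fused CUDA kernel below folds the division into the pooling loop as a multiply by 1/divisor (valid for a positive divisor, since max(x / d) = max(x) / d) and replaces global average pooling with a plain mean over the per-window maxima. A small sketch checking that rearrangement against the eager pipeline (fused_reference is an illustrative name, not part of the harness):
import torch
import torch.nn.functional as F

def fused_reference(conv_out, divisor, pool_size, bias, sum_dim):
    # Fold the division into the pooled values, then average the per-window
    # maxima instead of calling adaptive_avg_pool3d on the pooled map.
    pooled = F.max_pool3d(conv_out * (1.0 / divisor), pool_size)
    avg = pooled.mean(dim=(2, 3, 4), keepdim=True)  # (N, C, 1, 1, 1)
    return (avg + bias).sum(dim=sum_dim)            # (N, 1, 1, 1) for sum_dim=1

conv_out = torch.randn(2, 4, 8, 8, 8)
bias_t = torch.randn(4, 1, 1, 1) * 0.02
eager = torch.sum(
    F.adaptive_avg_pool3d(F.max_pool3d(conv_out / 2.0, (2, 2, 2)), (1, 1, 1)) + bias_t,
    dim=1,
)
assert torch.allclose(fused_reference(conv_out, 2.0, (2, 2, 2), bias_t, 1), eager, atol=1e-6)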
#include <torch/extension.h>
#include <vector>
#include <cfloat>
#include <cuda_runtime.h>
// Custom fused CUDA kernel that performs division by a constant, max pooling, global average pooling, bias addition, and, when sum_dim == 1, a sum over the channel dimension.
// This kernel minimizes __syncthreads usage by using warp-level primitives for intra-warp reduction.
// Each block handles one (n, c) slice of the 5D tensor (N, C, D, H, W) output from conv3d.
// We assume that the spatial dimensions D, H, and W are exactly divisible by the pooling sizes.
__global__ void fusedPoolingAndReductionKernel(
const float* __restrict__ conv_out, // Input from conv3d, shape: (N, C, D, H, W)
int N,
int C,
int D,
int H,
int W,
int poolD,
int poolH,
int poolW,
    const float* __restrict__ bias, // Bias tensor; only C contiguous values are read, so shape (C) or (C, 1, 1, 1) both work (broadcast along N)
int sum_dim, // If sum_dim==1, we sum over the channel dimension
float* __restrict__ output, // Output tensor: if sum_dim==1, shape: (N); else shape: (N, C)
float divisor // Division factor applied per conv output element
) {
// Determine which (n, c) slice this block is responsible for
int nc = blockIdx.x; // block index in [0, N*C)
int n = nc / C;
int c = nc % C;
// Dimensions after max pooling (using non-overlapping windows with size equal to pool dimensions)
int outD = D / poolD;
int outH = H / poolH;
int outW = W / poolW;
int numPools = outD * outH * outW;
const float invDiv = 1.0f / divisor;
float thread_sum = 0.0f;
// Each thread processes multiple pooling windows in a strided loop
for (int idx = threadIdx.x; idx < numPools; idx += blockDim.x) {
// Calculate the pooling window indices in the output of max pooling
int d = idx / (outH * outW);
int rem = idx % (outH * outW);
int h = rem / outW;
int w = rem % outW;
int startD = d * poolD;
int startH = h * poolH;
int startW = w * poolW;
// Compute max pooling over the window
float max_val = -FLT_MAX;
#pragma unroll
for (int pd = 0; pd < poolD; pd++) {
#pragma unroll
for (int ph = 0; ph < poolH; ph++) {
#pragma unroll
for (int pw = 0; pw < poolW; pw++) {
int curD = startD + pd;
int curH = startH + ph;
int curW = startW + pw;
// Compute linear index for conv_out: shape is (N, C, D, H, W)
int index = (((n * C + c) * D + curD) * H + curH) * W + curW;
                    // Apply the division on the fly by multiplying with the precomputed reciprocal
                    float val = __ldg(&conv_out[index]) * invDiv;
if (val > max_val) {
max_val = val;
}
}
}
}
        // Accumulate the maximum of this pooling window
thread_sum += max_val;
}
// Use warp-level reduction to minimize __syncthreads calls
unsigned int lane = threadIdx.x & 31;
// Intra-warp reduction using shuffle
float sum_val = thread_sum;
for (int offset = 16; offset > 0; offset /= 2) {
sum_val += __shfl_down_sync(0xFFFFFFFF, sum_val, offset);
}
// Allocate shared memory for inter-warp reduction; only one __syncthreads call is used
extern __shared__ float shared[];
if (lane == 0) {
shared[threadIdx.x >> 5] = sum_val;
}
__syncthreads();
// First warp reduces the partial sums from each warp
float block_sum = 0.0f;
int numWarps = (blockDim.x + 31) >> 5;
if (threadIdx.x < numWarps) {
block_sum = shared[threadIdx.x];
}
if (threadIdx.x < 32) { // reduction within first warp
for (int offset = 16; offset > 0; offset /= 2) {
block_sum += __shfl_down_sync(0xFFFFFFFF, block_sum, offset);
}
}
// Thread 0 of the block finalizes the result for this (n,c) slice
if (threadIdx.x == 0) {
// Compute global average from the max pooling results
float avg = block_sum / numPools;
// Add the bias (broadcasted along n)
avg = avg + bias[c];
// Write out the result. If sum_dim == 1, accumulate into output for each n; otherwise, store per (n,c).
if (sum_dim == 1) {
atomicAdd(&output[n], avg);
} else {
output[n * C + c] = avg;
}
}
}
// The unified forward_cuda function that calls conv3d and then the fused pooling/reduction kernel
torch::Tensor forward_cuda(
torch::Tensor x,
double divisor,
std::vector<int64_t> pool_size, // [poolD, poolH, poolW]
int64_t sum_dim,
torch::Tensor conv_weight,
torch::Tensor conv_bias,
torch::Tensor bias
) {
// Check that inputs are CUDA tensors
TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor.");
TORCH_CHECK(conv_weight.is_cuda(), "conv_weight must be a CUDA tensor.");
TORCH_CHECK(conv_bias.is_cuda(), "conv_bias must be a CUDA tensor.");
TORCH_CHECK(bias.is_cuda(), "bias must be a CUDA tensor.");
// 1) 3D convolution (we do not fuse convolution here as cuDNN is highly optimized)
auto conv_out = at::conv3d(x, conv_weight, conv_bias);
// conv_out shape: (N, C, D, H, W)
int N = conv_out.size(0);
int C = conv_out.size(1);
int D = conv_out.size(2);
int H = conv_out.size(3);
int W = conv_out.size(4);
// 2) Launch the fused pooling and reduction kernel
// The kernel will perform: max pooling (with window size = pool_size), global average pooling, bias addition and sum reduction (if requested)
// Each block processes one (n, c) slice
int totalBlocks = N * C;
int threadsPerBlock = 128;
int sharedMemSize = ((threadsPerBlock + 31) / 32) * sizeof(float);
// Prepare output tensor
torch::Tensor output;
if (sum_dim == 1) {
// If summing over channels, output will have shape (N) (one value per sample)
output = torch::zeros({N}, conv_out.options());
} else {
// Otherwise, store the result per channel: shape (N, C)
output = torch::empty({N, C}, conv_out.options());
}
fusedPoolingAndReductionKernel<<<totalBlocks, threadsPerBlock, sharedMemSize>>>(
conv_out.data_ptr<float>(),
N, C, D, H, W,
pool_size[0], pool_size[1], pool_size[2],
bias.data_ptr<float>(),
sum_dim,
output.data_ptr<float>(),
static_cast<float>(divisor)
);
    // Block the host until the fused kernel has finished so that any launch errors surface here
    cudaDeviceSynchronize();
return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward_cuda, "Fused 3D Conv, Pooling, GlobalAvgPool, BiasAdd, and Sum (CUDA) with optimized synchronization");
}
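A build-and-compare sketch for the extension above. It assumes the CUDA source is saved as fused_pooling.cu (the file name, a CUDA device, and an nvcc toolchain are all assumptions) and that the reference Model, the module-level constants, and the input helpers from the listing above are in scope:
import torch
from torch.utils.cpp_extension import load

# JIT-compile the .cu file above into a loadable extension (file name is assumed).
ext = load(name="fused_pooling", sources=["fused_pooling.cu"], verbose=True)

model = Model(*get_init_inputs()).cuda()
(x,) = get_inputs()
x = x.cuda()

fused = ext.forward(
    x, divisor, list(pool_size), sum_dim,
    model.conv.weight, model.conv.bias, model.bias,
)
eager = model(x)  # shape (batch_size, 1, 1, 1); the kernel returns shape (batch_size,)
print(torch.allclose(fused, eager.flatten(), atol=1e-4))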
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 2.400 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 2.146 | inst/cycle | 0.001 | 5 |
Issue Slots Busy | 59.982 | % | 0.031 | 5 |
Issued Ipc Active | 2.400 | inst/cycle | 0.000 | 5 |
SM Busy | 59.982 | % | 0.031 | 5 |
Memory Throughput | 1861442065770.508 | byte/second | 210081970636727812096.000 | 5 |
Mem Busy | 33.868 | % | 0.085 | 5 |
Max Bandwidth | 55.564 | % | 0.189 | 5 |
L1/TEX Hit Rate | 60.884 | % | 0.000 | 5 |
L2 Hit Rate | 13.022 | % | 0.001 | 5 |
Mem Pipes Busy | 21.048 | % | 0.043 | 5 |
Warp Cycles Per Issued Instruction | 23.254 | cycle | 0.006 | 5 |
Warp Cycles Per Executed Instruction | 23.262 | cycle | 0.006 | 5 |
Avg. Active Threads Per Warp | 31.480 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 24.620 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 16.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 28.000 | block | 0.000 | 5 |
Block Limit Warps | 16.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 86.802 | % | 0.019 | 5 |
Achieved Active Warps Per SM | 55.556 | warp | 0.008 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (48.4%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (86.9%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 449457.85 | μs |
Device Time | 3188.98 | μs |
Self CPU Time | 80.01 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::conv3d | ||
CPU Time | 562279.73 | μs |
Device Time | 7638199.84 | μs |
Self CPU Time | 21194.70 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::convolution | ||
CPU Time | 541085.03 | μs |
Device Time | 7638199.84 | μs |
Self CPU Time | 27231.33 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_convolution | ||
CPU Time | 513853.70 | μs |
Device Time | 7638199.84 | μs |
Self CPU Time | 54785.33 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::cudnn_convolution | ||
CPU Time | 341759.34 | μs |
Device Time | 6629446.63 | μs |
Self CPU Time | 245992.42 | μs |
Self Device Time | 6629446.63 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
sm80_xmma_fprop_implicit_gemm_indexed_f32f32_f32f32_f32_nchwkcrs_nchw_tilesize32x32x8_stage3_warpsize1x2x1_g1_ffma_aligna4_alignc4_execute_kernel__5x_cudnn | ||
CPU Time | 0.00 | μs |
Device Time | 6629445.07 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 6629445.07 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceSynchronize | ||
CPU Time | 8479201.93 | μs |
Device Time | 113191.97 | μs |
Self CPU Time | 8479201.93 | μs |
Self Device Time | 113191.97 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |