7_Conv3d_ReLU_LeakyReLU_GELU_Sigmoid_BiasAdd
• optimized_warp_distribution_kernel_base
import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
    x: torch.Tensor,
    conv_weight: torch.Tensor,
    conv_bias: torch.Tensor,
    bias: torch.Tensor,
) -> torch.Tensor:
    """
    Applies 3D convolution followed by ReLU, LeakyReLU, GELU, Sigmoid activations and bias addition.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width)
        conv_weight (torch.Tensor): 3D convolution weight tensor of shape
            (out_channels, in_channels, kernel_size, kernel_size, kernel_size)
        conv_bias (torch.Tensor): Bias tensor for 3D convolution of shape (out_channels)
        bias (torch.Tensor): Bias tensor for addition of shape (out_channels, 1, 1, 1)

    Returns:
        torch.Tensor: Output tensor after applying convolution and activations
    """
    x = F.conv3d(x, conv_weight, bias=conv_bias)
    x = F.relu(x)
    x = F.leaky_relu(x, negative_slope=0.01)
    x = F.gelu(x)
    x = torch.sigmoid(x)
    x = x + bias
    return x
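def _demo_bias_broadcast():
    # Illustrative only (hypothetical small shapes, not part of the benchmark
    # harness): the (out_channels, 1, 1, 1) bias used by module_fn broadcasts
    # over the batch and spatial dimensions of the conv3d output.
    out = torch.zeros(2, 4, 6, 8, 8)   # stand-in for a conv3d result
    extra = torch.zeros(4, 1, 1, 1)    # per-channel additive bias
    assert (out + extra).shape == out.shape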
class Model(nn.Module):
    """
    Model that performs a 3D convolution, applies ReLU, LeakyReLU, GELU, Sigmoid activations, and bias in sequence.
    """

    def __init__(self, in_channels, out_channels, kernel_size, bias_shape):
        super(Model, self).__init__()
        conv = nn.Conv3d(in_channels, out_channels, kernel_size)
        self.conv_weight = nn.Parameter(conv.weight)
        self.conv_bias = nn.Parameter(conv.bias)
        self.bias = nn.Parameter(torch.randn(bias_shape) * 0.02)

    def forward(self, x, fn=module_fn):
        return fn(x, self.conv_weight, self.conv_bias, self.bias)
batch_size = 128
in_channels = 3
out_channels = 16
depth, height, width = 16, 32, 32
kernel_size = 3
bias_shape = (out_channels, 1, 1, 1)
def get_inputs():
    return [torch.randn(batch_size, in_channels, depth, height, width)]

def get_init_inputs():
    return [in_channels, out_channels, kernel_size, bias_shape]
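# Hypothetical smoke test (not part of the harness): wire get_init_inputs and
# get_inputs into the Model above and run the functional path on CPU.
def _demo_run_functional():
    model = Model(*get_init_inputs())
    (x,) = get_inputs()
    out = module_fn(x, model.conv_weight, model.conv_bias, model.bias)
    # A 3x3x3 kernel with no padding shrinks each spatial dimension by 2.
    assert out.shape == (batch_size, out_channels, depth - 2, height - 2, width - 2)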
import torch
import torch.nn as nn
class Model(nn.Module):
    """
    Model that performs a 3D convolution, applies ReLU, LeakyReLU, GELU, Sigmoid activations, and bias in sequence.
    """

    def __init__(self, in_channels, out_channels, kernel_size, bias_shape):
        super(Model, self).__init__()
        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
        self.bias = nn.Parameter(torch.randn(bias_shape) * 0.02)

    def forward(self, x):
        x = self.conv(x)
        x = torch.relu(x)
        x = torch.nn.functional.leaky_relu(x, negative_slope=0.01)
        x = torch.nn.functional.gelu(x)
        x = torch.sigmoid(x)
        x = x + self.bias
        return x
batch_size = 128
in_channels = 3
out_channels = 16
depth, height, width = 16, 32, 32
kernel_size = 3
bias_shape = (out_channels, 1, 1, 1)
def get_inputs():
    return [torch.randn(batch_size, in_channels, depth, height, width)]

def get_init_inputs():
    return [in_channels, out_channels, kernel_size, bias_shape]
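# Hypothetical cross-check (assumes module_fn from the functional listing above is
# in scope): the eager Model here and module_fn should agree when they share the
# same convolution weights and extra bias.
def _demo_compare_eager_vs_functional():
    model = Model(*get_init_inputs())
    (x,) = get_inputs()
    ref = model(x)
    out = module_fn(x, model.conv.weight, model.conv.bias, model.bias)
    assert torch.allclose(ref, out, atol=1e-6)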
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define WARP_SIZE 32
#define BLOCK_SIZE 256
#define ELEMENTS_PER_THREAD 8
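// process_element fuses the post-convolution chain from the PyTorch reference:
// ReLU, LeakyReLU(0.01) (a no-op after ReLU, kept to mirror the reference order),
// the tanh approximation of GELU, sigmoid, and finally the per-channel bias,
// which is read through __ldg to use the read-only data cache.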
__device__ __forceinline__ float process_element(float val, const float* bias, int bias_idx) {
    val = fmaxf(0.0f, val);
    val = fmaxf(0.01f * val, val);
    const float sqrt_2_over_pi = sqrtf(2.0f / M_PI);
    val = 0.5f * val * (1.0f + tanhf(sqrt_2_over_pi * (val + 0.044715f * powf(val, 3.0f))));
    val = 1.0f / (1.0f + expf(-val));
    return val + __ldg(&bias[bias_idx]);
}
__global__ void apply_activations_and_bias_kernel(
    float* __restrict__ output,
    const float* __restrict__ bias,
    int batch_size,
    int out_channels,
    int depth,
    int height,
    int width
) {
    const int tid = threadIdx.x;
    const int warp_id = tid / WARP_SIZE;
    const int lane_id = tid % WARP_SIZE;
    const int global_warp_id = blockIdx.x * (BLOCK_SIZE / WARP_SIZE) + warp_id;

    const int spatial_size = depth * height * width;
    const int total_elements = batch_size * out_channels * spatial_size;

    // Each warp owns a contiguous chunk of WARP_SIZE * ELEMENTS_PER_THREAD elements;
    // lanes step through it with stride WARP_SIZE so accesses stay coalesced.
    int base_idx = global_warp_id * (WARP_SIZE * ELEMENTS_PER_THREAD) + lane_id;

    // Process elements in groups of 4 per thread when a full group fits
    #pragma unroll
    for (int i = 0; i < ELEMENTS_PER_THREAD; i += 4) {
        int idx = base_idx + i * WARP_SIZE;

        if (idx + 3 * WARP_SIZE < total_elements) {
            // Load 4 elements (strided by WARP_SIZE, packed into a float4 register)
            float4 data;
            data.x = output[idx];
            data.y = output[idx + WARP_SIZE];
            data.z = output[idx + 2 * WARP_SIZE];
            data.w = output[idx + 3 * WARP_SIZE];

            // Recover the channel index of each element to pick the right bias entry
            int bias_idx_x = (idx / spatial_size) % out_channels;
            int bias_idx_y = ((idx + WARP_SIZE) / spatial_size) % out_channels;
            int bias_idx_z = ((idx + 2 * WARP_SIZE) / spatial_size) % out_channels;
            int bias_idx_w = ((idx + 3 * WARP_SIZE) / spatial_size) % out_channels;

            // Apply the fused activation chain and bias
            data.x = process_element(data.x, bias, bias_idx_x);
            data.y = process_element(data.y, bias, bias_idx_y);
            data.z = process_element(data.z, bias, bias_idx_z);
            data.w = process_element(data.w, bias, bias_idx_w);

            // Store results
            output[idx] = data.x;
            output[idx + WARP_SIZE] = data.y;
            output[idx + 2 * WARP_SIZE] = data.z;
            output[idx + 3 * WARP_SIZE] = data.w;
        } else {
            // Tail handling: process whatever elements remain one at a time
            for (int j = 0; j < 4; j++) {
                int curr_idx = idx + j * WARP_SIZE;
                if (curr_idx < total_elements) {
                    int bias_idx = (curr_idx / spatial_size) % out_channels;
                    float val = output[curr_idx];
                    output[curr_idx] = process_element(val, bias, bias_idx);
                }
            }
        }
    }
}
torch::Tensor module_fn_cuda(
    torch::Tensor x,
    torch::Tensor conv_weight,
    torch::Tensor conv_bias,
    torch::Tensor bias
) {
    CHECK_INPUT(x);
    CHECK_INPUT(conv_weight);
    CHECK_INPUT(conv_bias);
    CHECK_INPUT(bias);

    // Run the convolution through ATen, then fuse all activations and the bias add
    // into a single elementwise kernel over the convolution output.
    auto output = torch::conv3d(x, conv_weight, conv_bias);

    int batch_size = output.size(0);
    int out_channels = output.size(1);
    int depth = output.size(2);
    int height = output.size(3);
    int width = output.size(4);

    const int total_elements = batch_size * out_channels * depth * height * width;
    const int elements_per_block = BLOCK_SIZE * ELEMENTS_PER_THREAD;
    const int num_blocks = (total_elements + elements_per_block - 1) / elements_per_block;

    apply_activations_and_bias_kernel<<<num_blocks, BLOCK_SIZE>>>(
        output.data_ptr<float>(),
        bias.data_ptr<float>(),
        batch_size,
        out_channels,
        depth,
        height,
        width
    );

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &module_fn_cuda, "CUDA implementation of module_fn");
}
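A rough usage sketch (not part of the listing): the extension name, source file path, input sizes, and tolerance below are assumptions. Assuming the CUDA source above is saved to a .cu file and module_fn from the PyTorch listing is importable, it could be JIT-compiled with torch.utils.cpp_extension.load and checked against the reference; the tolerance is loose because the kernel uses the tanh approximation of GELU while F.gelu defaults to the exact erf form.

import torch
from torch.utils.cpp_extension import load

ext = load(
    name="conv3d_fused_activations",     # assumed extension name
    sources=["fused_activations.cu"],    # assumed path holding the CUDA listing above
    verbose=False,
)

x = torch.randn(2, 3, 8, 16, 16, device="cuda")
w = torch.randn(16, 3, 3, 3, 3, device="cuda")
b = torch.randn(16, device="cuda")
extra = torch.randn(16, 1, 1, 1, device="cuda")

ref = module_fn(x, w, b, extra)          # PyTorch reference path
out = ext.forward(x, w, b, extra)        # fused CUDA path
print(torch.allclose(ref, out, atol=1e-3))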
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 3.180 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 3.084 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 79.582 | % | 0.001 | 5 |
Issued Ipc Active | 3.180 | inst/cycle | 0.000 | 5 |
SM Busy | 79.582 | % | 0.001 | 5 |
Memory Throughput | 2002344245033.276 | byte/second | 197899624505076285440.000 | 5 |
Mem Busy | 33.892 | % | 0.060 | 5 |
Max Bandwidth | 59.756 | % | 0.177 | 5 |
L1/TEX Hit Rate | 50.592 | % | 0.002 | 5 |
L2 Hit Rate | 50.426 | % | 0.000 | 5 |
Mem Pipes Busy | 13.272 | % | 0.003 | 5 |
Warp Cycles Per Issued Instruction | 18.372 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 18.376 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 30.870 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 24.900 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 91.602 | % | 0.014 | 5 |
Achieved Active Warps Per SM | 58.622 | warp | 0.006 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | ALU is the highest-utilized pipeline (63.2%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. The pipeline is well-utilized, but might become a bottleneck if more work is added. Based on the number of executed instructions, the highest utilized pipeline (65.5%) is XU. Comparing the two, the overall pipeline utilization appears to be caused by frequent, low-latency instructions. See the Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-decoder) or hover over the pipeline name to understand the workloads handled by each pipeline. The Instruction Statistics section shows the mix of executed instructions in this kernel. Check the Warp State Statistics section for which reasons cause warps to stall. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
INF Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. |
Operation / Metric | Value | Unit |
---|---|---|
aten::conv3d | ||
CPU Time | 604151.83 | μs |
Device Time | 4334690.91 | μs |
Self CPU Time | 10135.48 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::convolution | ||
CPU Time | 594016.35 | μs |
Device Time | 4334690.91 | μs |
Self CPU Time | 13630.41 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_convolution | ||
CPU Time | 580385.94 | μs |
Device Time | 4334690.91 | μs |
Self CPU Time | 29335.51 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::cudnn_convolution | ||
CPU Time | 485383.46 | μs |
Device Time | 3762166.40 | μs |
Self CPU Time | 149114.13 | μs |
Self Device Time | 3762166.40 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
sm80_xmma_fprop_implicit_gemm_indexed_f32f32_f32f32_f32_nchwkcrs_nchw_tilesize32x32x8_stage3_warpsize1x2x1_g1_ffma_aligna4_alignc4_execute_kernel__5x_cudnn | ||
CPU Time | 0.00 | μs |
Device Time | 3762164.92 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 3762164.92 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 4054128.79 | μs |
Device Time | 82000.49 | μs |
Self CPU Time | 4054128.79 | μs |
Self Device Time | 82000.49 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 494761.60 | μs |
Device Time | 495366.32 | μs |
Self CPU Time | 13969.55 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |