import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor,
params: nn.ParameterDict,
is_training: bool,
) -> torch.Tensor:
"""
Implements the DenseNet121 module.
Args:
x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
params (nn.ParameterDict): Dictionary of parameters
is_training (bool): Whether to use training mode
Returns:
torch.Tensor: Output tensor, shape (batch_size, num_classes)
"""
# Initial features
x = F.conv2d(x, params["features_conv_weight"], bias=None, stride=2, padding=3)
x = F.batch_norm(
x,
params["features_bn_mean"],
params["features_bn_var"],
params["features_bn_weight"],
params["features_bn_bias"],
training=is_training,
)
x = F.relu(x, inplace=True)
x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
def dense_layer_fn(
x, bn_weight, bn_bias, bn_mean, bn_var, conv_weight, is_training
):
"""
Functional version of a single dense layer
"""
x = F.batch_norm(x, bn_mean, bn_var, bn_weight, bn_bias, training=is_training)
x = F.relu(x, inplace=True)
x = F.conv2d(x, conv_weight, bias=None, stride=1, padding=1)
x = F.dropout(x, p=0.0, training=is_training)
return x
def transition_layer_fn(
x, bn_weight, bn_bias, bn_mean, bn_var, conv_weight, is_training
):
"""
Functional version of transition layer
"""
x = F.batch_norm(x, bn_mean, bn_var, bn_weight, bn_bias, training=is_training)
x = F.relu(x, inplace=True)
x = F.conv2d(x, conv_weight, bias=None)
x = F.avg_pool2d(x, kernel_size=2, stride=2)
return x
# Dense blocks and transitions
for i in range(4): # 4 dense blocks
features = [x]
for j in range(params[f"block{i}_num_layers"]): # layers per block
prefix = f"block{i}_layer{j}_"
new_feature = dense_layer_fn(
x,
params[prefix + "bn_weight"],
params[prefix + "bn_bias"],
params[prefix + "bn_mean"],
params[prefix + "bn_var"],
params[prefix + "conv_weight"],
is_training,
)
features.append(new_feature)
x = torch.cat(features, 1)
if i != 3: # Apply transition after all blocks except last
x = transition_layer_fn(
x,
params[f"transition{i}_bn_weight"],
params[f"transition{i}_bn_bias"],
params[f"transition{i}_bn_mean"],
params[f"transition{i}_bn_var"],
params[f"transition{i}_conv_weight"],
is_training,
)
# Final layers
x = F.batch_norm(
x,
params["final_bn_mean"],
params["final_bn_var"],
params["final_bn_weight"],
params["final_bn_bias"],
training=is_training,
)
x = F.relu(x, inplace=True)
x = F.adaptive_avg_pool2d(x, (1, 1)).view(x.size(0), -1)
x = F.linear(x, params["classifier_weight"], params["classifier_bias"])
return x
class Model(nn.Module):
def __init__(self, growth_rate=32, num_classes=1000):
super(Model, self).__init__()
self.params = nn.ParameterDict()
block_layers = [6, 12, 24, 16]
# Initial features parameters
conv = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
bn = nn.BatchNorm2d(64)
self.params["features_conv_weight"] = nn.Parameter(conv.weight.data.clone())
self.params["features_bn_weight"] = nn.Parameter(bn.weight.data.clone())
self.params["features_bn_bias"] = nn.Parameter(bn.bias.data.clone())
self.params["features_bn_mean"] = nn.Parameter(bn.running_mean.data.clone())
self.params["features_bn_var"] = nn.Parameter(bn.running_var.data.clone())
# Dense blocks parameters
num_features = 64
for i, num_layers in enumerate(block_layers):
self.params[f"block{i}_num_layers"] = num_layers
for j in range(num_layers):
in_features = num_features + j * growth_rate
prefix = f"block{i}_layer{j}_"
bn = nn.BatchNorm2d(in_features)
conv = nn.Conv2d(
in_features, growth_rate, kernel_size=3, padding=1, bias=False
)
self.params[prefix + "bn_weight"] = nn.Parameter(bn.weight.data.clone())
self.params[prefix + "bn_bias"] = nn.Parameter(bn.bias.data.clone())
self.params[prefix + "bn_mean"] = nn.Parameter(
bn.running_mean.data.clone()
)
self.params[prefix + "bn_var"] = nn.Parameter(
bn.running_var.data.clone()
)
self.params[prefix + "conv_weight"] = nn.Parameter(
conv.weight.data.clone()
)
num_features = num_features + num_layers * growth_rate
# Transition layers parameters (except after last block)
if i != len(block_layers) - 1:
bn = nn.BatchNorm2d(num_features)
conv = nn.Conv2d(
num_features, num_features // 2, kernel_size=1, bias=False
)
self.params[f"transition{i}_bn_weight"] = nn.Parameter(
bn.weight.data.clone()
)
self.params[f"transition{i}_bn_bias"] = nn.Parameter(
bn.bias.data.clone()
)
self.params[f"transition{i}_bn_mean"] = nn.Parameter(
bn.running_mean.data.clone()
)
self.params[f"transition{i}_bn_var"] = nn.Parameter(
bn.running_var.data.clone()
)
self.params[f"transition{i}_conv_weight"] = nn.Parameter(
conv.weight.data.clone()
)
num_features = num_features // 2
# Final layers parameters
bn = nn.BatchNorm2d(num_features)
self.params["final_bn_weight"] = nn.Parameter(bn.weight.data.clone())
self.params["final_bn_bias"] = nn.Parameter(bn.bias.data.clone())
self.params["final_bn_mean"] = nn.Parameter(bn.running_mean.data.clone())
self.params["final_bn_var"] = nn.Parameter(bn.running_var.data.clone())
linear = nn.Linear(num_features, num_classes)
self.params["classifier_weight"] = nn.Parameter(linear.weight.data.clone())
self.params["classifier_bias"] = nn.Parameter(linear.bias.data.clone())
def forward(self, x, fn=module_fn):
return fn(x, self.params, self.training)
# Test configurations
batch_size = 10
num_classes = 10
height, width = 224, 224
def get_inputs():
return [torch.randn(batch_size, 3, height, width)]
def get_init_inputs():
return [32, num_classes]
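A minimal smoke test for the ParameterDict-based Model and module_fn above, using the test configuration just defined; this is only a sketch and checks the output shape, not numerical correctness.
# Sketch: shape-only smoke test for the functional DenseNet121 listing above.
if __name__ == "__main__":
    model = Model(*get_init_inputs())      # growth_rate=32, num_classes=10
    x = get_inputs()[0]                    # (10, 3, 224, 224)
    model.eval()
    with torch.no_grad():
        out = model(x)                     # forward dispatches to module_fn
    assert out.shape == (batch_size, num_classes)
    print(out.shape)                       # torch.Size([10, 10])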
import torch
import torch.nn as nn
import torch.nn.functional as F
class DenseBlock(nn.Module):
def __init__(self, num_layers: int, num_input_features: int, growth_rate: int):
"""
:param num_layers: The number of layers in the dense block
:param num_input_features: The number of input feature maps
:param growth_rate: The growth rate for the dense block (new features added per layer)
"""
super(DenseBlock, self).__init__()
layers = []
for i in range(num_layers):
layers.append(self._make_layer(num_input_features + i * growth_rate, growth_rate))
self.layers = nn.ModuleList(layers)
def _make_layer(self, in_features: int, growth_rate: int):
"""
Creates a single layer with BatchNorm, ReLU, Conv2D, and Dropout.
"""
return nn.Sequential(
nn.BatchNorm2d(in_features),
nn.ReLU(inplace=True),
nn.Conv2d(in_features, growth_rate, kernel_size=3, padding=1, bias=False),
nn.Dropout(0.0)
)
def forward(self, x):
"""
:param x: Input tensor of shape (batch_size, num_input_features, height, width)
:return: Concatenated output tensor with shape (batch_size, num_output_features, height, width)
"""
features = [x]
for layer in self.layers:
new_feature = layer(x)
features.append(new_feature)
x = torch.cat(features, 1) # Concatenate along channel axis
return x
class TransitionLayer(nn.Module):
def __init__(self, num_input_features: int, num_output_features: int):
"""
:param num_input_features: The number of input feature maps
:param num_output_features: The number of output feature maps
"""
super(TransitionLayer, self).__init__()
self.transition = nn.Sequential(
nn.BatchNorm2d(num_input_features),
nn.ReLU(inplace=True),
nn.Conv2d(num_input_features, num_output_features, kernel_size=1, bias=False),
nn.AvgPool2d(kernel_size=2, stride=2)
)
def forward(self, x):
"""
:param x: Input tensor of shape (batch_size, num_input_features, height, width)
:return: Downsampled tensor with reduced number of feature maps
"""
return self.transition(x)
class Model(nn.Module):
def __init__(self, growth_rate: int = 32, num_classes: int = 1000):
"""
:param growth_rate: The growth rate of the DenseNet (new features added per layer)
:param num_classes: The number of output classes for classification
"""
super(Model, self).__init__()
# Initial convolution and pooling
self.features = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
nn.BatchNorm2d(64),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)
# Each dense block is followed by a transition layer, except the last one
num_features = 64
block_layers = [6, 12, 24, 16] # Corresponding layers in DenseNet121
self.dense_blocks = nn.ModuleList()
self.transition_layers = nn.ModuleList()
for i, num_layers in enumerate(block_layers):
block = DenseBlock(num_layers=num_layers, num_input_features=num_features, growth_rate=growth_rate)
self.dense_blocks.append(block)
num_features = num_features + num_layers * growth_rate
if i != len(block_layers) - 1:
transition = TransitionLayer(num_input_features=num_features, num_output_features=num_features // 2)
self.transition_layers.append(transition)
num_features = num_features // 2
# Final batch norm and classifier
self.final_bn = nn.BatchNorm2d(num_features)
self.classifier = nn.Linear(num_features, num_classes)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
:param x: Input tensor of shape (batch_size, 3, height, width)
:return: Output tensor of shape (batch_size, num_classes)
"""
x = self.features(x)
for i, block in enumerate(self.dense_blocks):
x = block(x)
if i != len(self.dense_blocks) - 1:
x = self.transition_layers[i](x)
x = self.final_bn(x)
x = F.relu(x, inplace=True)
x = F.adaptive_avg_pool2d(x, (1, 1)).view(x.size(0), -1)
x = self.classifier(x)
return x
# Testing the DenseNet121 model
batch_size = 10
num_classes = 10
height, width = 224, 224 # Standard input size for DenseNet
def get_inputs():
return [torch.randn(batch_size, 3, height, width)]
def get_init_inputs():
return [32, num_classes]
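As a worked check of the num_features bookkeeping shared by both implementations, the following sketch reproduces DenseNet-121's channel and spatial progression by hand (assuming 224x224 inputs and growth_rate=32):
# Channel / spatial bookkeeping for DenseNet-121 (sketch; assumes 224x224 inputs).
growth_rate = 32
block_layers = [6, 12, 24, 16]
channels, size = 64, 224 // 4              # stem: stride-2 conv + stride-2 maxpool -> 56x56
for i, n in enumerate(block_layers):
    channels += n * growth_rate            # each dense layer appends growth_rate feature maps
    if i != len(block_layers) - 1:
        channels //= 2                     # 1x1 transition conv halves the channels
        size //= 2                         # 2x2 average pool halves the spatial size
print(channels, size)                      # 1024, 7 -> matches final_bn / classifier width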
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <cuda_runtime.h>
#include <vector>
#include <string>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
namespace py = pybind11;
// Warp-level reduction using __shfl_down_sync
__inline__ __device__ float warpReduceSum(float val) {
for (int offset = warpSize/2; offset > 0; offset /= 2) {
val += __shfl_down_sync(0xffffffff, val, offset);
}
return val;
}
// Optimized CUDA kernel with strided loops and efficient work distribution
__global__ void globalAvgPoolKernel(const float* __restrict__ input,
float* __restrict__ output,
const int N,
const int C,
const int H,
const int W) {
const int HW = H * W;
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid / C;
const int c = bid % C;
// Base offset for this feature map
const int base_idx = (n * C + c) * HW;
    // Stride: blockDim.x threads, each processing 4 consecutive elements per iteration
    const int stride = blockDim.x * 4;
float sum = 0.0f;
// Main loop with stride optimization
#pragma unroll 4
for (int i = tid * 4; i < HW; i += stride) {
// Load 4 elements per iteration if possible
if (i + 3 < HW) {
sum += input[base_idx + i] +
input[base_idx + i + 1] +
input[base_idx + i + 2] +
input[base_idx + i + 3];
} else {
// Handle boundary case
for (int j = 0; j < 4 && (i + j) < HW; j++) {
sum += input[base_idx + i + j];
}
}
}
// Warp-level reduction
sum = warpReduceSum(sum);
    // Block-level reduction using shared memory: after the warp reduction, lane 0 of
    // every warp holds that warp's partial sum and must contribute it to the block total
    __shared__ float shared_sum;
    if (tid == 0) {
        shared_sum = 0.0f;
    }
    __syncthreads();
    if ((tid & (warpSize - 1)) == 0) {
        atomicAdd(&shared_sum, sum);
    }
    __syncthreads();
    // Write final result
    if (tid == 0) {
        output[bid] = shared_sum / static_cast<float>(HW);
    }
}
// Host function for global average pooling
at::Tensor global_avg_pool(at::Tensor x) {
    TORCH_CHECK(x.is_cuda(), "global_avg_pool: input must be a CUDA tensor");
    TORCH_CHECK(x.scalar_type() == at::kFloat, "global_avg_pool: kernel only supports float32");
    auto input = x.contiguous();
const int N = input.size(0);
const int C = input.size(1);
const int H = input.size(2);
const int W = input.size(3);
at::Tensor output = at::empty({N, C}, input.options());
    const int threads = 128; // 4 warps per block; one block per (n, c) feature map
    const int blocks = N * C;
globalAvgPoolKernel<<<blocks, threads>>>(
input.data_ptr<float>(),
output.data_ptr<float>(),
N, C, H, W
);
return output;
}
// Dense layer implementation
at::Tensor dense_layer_fn(
at::Tensor x,
at::Tensor bn_weight,
at::Tensor bn_bias,
at::Tensor bn_mean,
at::Tensor bn_var,
at::Tensor conv_weight,
bool is_training
) {
x = at::batch_norm(
x, bn_weight, bn_bias, bn_mean, bn_var,
is_training, 0.1, 1e-5, true
);
x = at::relu(x);
x = at::conv2d(x, conv_weight, /*bias=*/{}, /*stride=*/{1, 1}, /*padding=*/{1, 1});
    return x; // dropout with p=0.0 is a no-op, so it is omitted here
}
// Transition layer implementation
at::Tensor transition_layer_fn(
at::Tensor x,
at::Tensor bn_weight,
at::Tensor bn_bias,
at::Tensor bn_mean,
at::Tensor bn_var,
at::Tensor conv_weight,
bool is_training
) {
x = at::batch_norm(
x, bn_weight, bn_bias, bn_mean, bn_var,
is_training, 0.1, 1e-5, true
);
x = at::relu(x);
x = at::conv2d(x, conv_weight);
x = at::avg_pool2d(x, /*kernel_size=*/{2, 2}, /*stride=*/{2, 2});
return x;
}
// Main module function
at::Tensor module_fn(
at::Tensor x,
py::object params,
bool is_training
) {
auto get_param = [&](const std::string& key) -> at::Tensor {
return params.attr("__getitem__")(key.c_str()).cast<at::Tensor>();
};
// Initial features
x = at::conv2d(x, get_param("features_conv_weight"),
/*bias=*/{}, /*stride=*/{2, 2}, /*padding=*/{3, 3});
x = at::batch_norm(
x,
get_param("features_bn_weight"),
get_param("features_bn_bias"),
get_param("features_bn_mean"),
get_param("features_bn_var"),
is_training, 0.1, 1e-5, true
);
x = at::relu(x);
x = at::max_pool2d(x, /*kernel_size=*/{3, 3}, /*stride=*/{2, 2}, /*padding=*/{1, 1});
std::vector<int> num_layers = {6, 12, 24, 16};
// Dense blocks and transitions
for (int i = 0; i < 4; ++i) {
std::vector<at::Tensor> features;
features.push_back(x);
for (int j = 0; j < num_layers[i]; ++j) {
std::string prefix = "block" + std::to_string(i) + "_layer" + std::to_string(j) + "_";
at::Tensor new_feature = dense_layer_fn(
x,
get_param(prefix + "bn_weight"),
get_param(prefix + "bn_bias"),
get_param(prefix + "bn_mean"),
get_param(prefix + "bn_var"),
get_param(prefix + "conv_weight"),
is_training
);
features.push_back(new_feature);
x = at::cat(features, 1);
}
if (i != 3) {
std::string prefix = "transition" + std::to_string(i) + "_";
x = transition_layer_fn(
x,
get_param(prefix + "bn_weight"),
get_param(prefix + "bn_bias"),
get_param(prefix + "bn_mean"),
get_param(prefix + "bn_var"),
get_param(prefix + "conv_weight"),
is_training
);
}
}
// Final layers
x = at::batch_norm(
x,
get_param("final_bn_weight"),
get_param("final_bn_bias"),
get_param("final_bn_mean"),
get_param("final_bn_var"),
is_training, 0.1, 1e-5, true
);
x = at::relu(x);
if (x.is_cuda()) {
x = global_avg_pool(x).view({x.size(0), -1});
} else {
x = at::adaptive_avg_pool2d(x, {1, 1}).reshape({x.size(0), -1});
}
x = at::linear(x, get_param("classifier_weight"), get_param("classifier_bias"));
return x;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &module_fn, "DenseNet121 forward with optimized strided loops");
}
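A hedged parity check for the extension against the pure-PyTorch module_fn from the first listing, sharing the same ParameterDict. The file name densenet121_ext.cu is an assumption; point it at wherever the C++/CUDA source above is saved.
# Hypothetical parity check: custom CUDA forward vs. the Python module_fn on identical
# parameters (the source file name below is an assumption, not part of the listing).
import torch
from torch.utils.cpp_extension import load

ext = load(name="densenet121_ext", sources=["densenet121_ext.cu"], verbose=True)

model = Model(growth_rate=32, num_classes=10).cuda().eval()   # ParameterDict-based Model
x = torch.randn(2, 3, 224, 224, device="cuda")

with torch.no_grad():
    ref = module_fn(x, model.params, False)    # pure PyTorch path
    out = ext.forward(x, model.params, False)  # custom global-average-pool kernel path
print(torch.allclose(ref, out, atol=1e-4, rtol=1e-4))  # should agree up to reduction order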
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 1.650 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 1.336 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 42.126 | % | 0.163 | 5 |
Issued Ipc Active | 1.682 | inst/cycle | 0.000 | 5 |
SM Busy | 42.126 | % | 0.163 | 5 |
Memory Throughput | 172821539264.552 | byte/second | 387861072013314368.000 | 5 |
Mem Busy | 20.182 | % | 0.005 | 5 |
Max Bandwidth | 15.542 | % | 0.003 | 5 |
L1/TEX Hit Rate | 71.618 | % | 0.000 | 5 |
L2 Hit Rate | 49.684 | % | 0.022 | 5 |
Mem Pipes Busy | 24.748 | % | 0.007 | 5 |
Warp Cycles Per Issued Instruction | 30.062 | cycle | 0.024 | 5 |
Warp Cycles Per Executed Instruction | 30.700 | cycle | 0.026 | 5 |
Avg. Active Threads Per Warp | 26.220 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 24.160 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 16.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 28.000 | block | 0.000 | 5 |
Block Limit Warps | 16.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 79.892 | % | 0.074 | 5 |
Achieved Active Warps Per SM | 51.130 | warp | 0.030 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (33.5%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (79.5%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
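The occupancy figures above can be reproduced from the 128-thread launch configuration; a small sketch using the per-SM limits reported in the metrics table:
# Occupancy arithmetic for the 128-thread globalAvgPoolKernel launch (values from the tables).
threads_per_block = 128
warps_per_block = threads_per_block // 32                    # 4 warps per block
block_limits = {"sm": 32, "registers": 16, "shared_mem": 28, "warps": 16}
max_warps_per_sm = 64                                        # "Theoretical Active Warps per SM"
resident_blocks = min(block_limits.values())                 # 16 blocks per SM
theoretical = 100.0 * resident_blocks * warps_per_block / max_warps_per_sm  # 100.0 %
achieved = 100.0 * 51.130 / max_warps_per_sm                 # from "Achieved Active Warps Per SM"
print(theoretical, round(achieved, 1))                       # 100.0, 79.9 -> matches the table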
Operation / Metric | Value | Unit |
---|---|---|
aten::conv2d | ||
CPU Time | 3478099.54 | μs |
Device Time | 2993818.40 | μs |
Self CPU Time | 142176.66 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::convolution | ||
CPU Time | 3335922.88 | μs |
Device Time | 2993818.40 | μs |
Self CPU Time | 181859.33 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_convolution | ||
CPU Time | 3154063.55 | μs |
Device Time | 2993818.40 | μs |
Self CPU Time | 206326.64 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::cudnn_convolution | ||
CPU Time | 2947736.90 | μs |
Device Time | 2993818.40 | μs |
Self CPU Time | 1494757.12 | μs |
Self Device Time | 2993818.40 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::batch_norm | ||
CPU Time | 3208731.22 | μs |
Device Time | 1319895.89 | μs |
Self CPU Time | 158840.07 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_batch_norm_impl_index | ||
CPU Time | 3049891.15 | μs |
Device Time | 1319895.89 | μs |
Self CPU Time | 127426.45 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45288 warnings generated when compiling for host. Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.