Kernel Details - 15_densenet121_warp_optimized_base

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    params: nn.ParameterDict,
    is_training: bool,
) -> torch.Tensor:
    """
    Implements the DenseNet121 module.

    Args:
        x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
        params (nn.ParameterDict): Dictionary of parameters
        is_training (bool): Whether to use training mode

    Returns:
        torch.Tensor: Output tensor, shape (batch_size, num_classes)
    """
    # Initial features
    x = F.conv2d(x, params["features_conv_weight"], bias=None, stride=2, padding=3)
    x = F.batch_norm(
        x,
        params["features_bn_mean"],
        params["features_bn_var"],
        params["features_bn_weight"],
        params["features_bn_bias"],
        training=is_training,
    )
    x = F.relu(x, inplace=True)
    x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)

    def dense_layer_fn(
        x, bn_weight, bn_bias, bn_mean, bn_var, conv_weight, is_training
    ):
        """
        Functional version of a single dense layer
        """
        x = F.batch_norm(x, bn_mean, bn_var, bn_weight, bn_bias, training=is_training)
        x = F.relu(x, inplace=True)
        x = F.conv2d(x, conv_weight, bias=None, stride=1, padding=1)
        x = F.dropout(x, p=0.0, training=is_training)
        return x

    def transition_layer_fn(
        x, bn_weight, bn_bias, bn_mean, bn_var, conv_weight, is_training
    ):
        """
        Functional version of transition layer
        """
        x = F.batch_norm(x, bn_mean, bn_var, bn_weight, bn_bias, training=is_training)
        x = F.relu(x, inplace=True)
        x = F.conv2d(x, conv_weight, bias=None)
        x = F.avg_pool2d(x, kernel_size=2, stride=2)
        return x

    # Dense blocks and transitions
    for i in range(4):  # 4 dense blocks
        features = [x]
        for j in range(params[f"block{i}_num_layers"]):  # layers per block
            prefix = f"block{i}_layer{j}_"
            new_feature = dense_layer_fn(
                x,
                params[prefix + "bn_weight"],
                params[prefix + "bn_bias"],
                params[prefix + "bn_mean"],
                params[prefix + "bn_var"],
                params[prefix + "conv_weight"],
                is_training,
            )
            features.append(new_feature)
            x = torch.cat(features, 1)

        if i != 3:  # Apply transition after all blocks except last
            x = transition_layer_fn(
                x,
                params[f"transition{i}_bn_weight"],
                params[f"transition{i}_bn_bias"],
                params[f"transition{i}_bn_mean"],
                params[f"transition{i}_bn_var"],
                params[f"transition{i}_conv_weight"],
                is_training,
            )

    # Final layers
    x = F.batch_norm(
        x,
        params["final_bn_mean"],
        params["final_bn_var"],
        params["final_bn_weight"],
        params["final_bn_bias"],
        training=is_training,
    )
    x = F.relu(x, inplace=True)
    x = F.adaptive_avg_pool2d(x, (1, 1)).view(x.size(0), -1)
    x = F.linear(x, params["classifier_weight"], params["classifier_bias"])
    return x


class Model(nn.Module):
    def __init__(self, growth_rate=32, num_classes=1000):
        super(Model, self).__init__()

        self.params = nn.ParameterDict()
        block_layers = [6, 12, 24, 16]

        # Initial features parameters
        conv = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        bn = nn.BatchNorm2d(64)
        self.params["features_conv_weight"] = nn.Parameter(conv.weight.data.clone())
        self.params["features_bn_weight"] = nn.Parameter(bn.weight.data.clone())
        self.params["features_bn_bias"] = nn.Parameter(bn.bias.data.clone())
        self.params["features_bn_mean"] = nn.Parameter(bn.running_mean.data.clone())
        self.params["features_bn_var"] = nn.Parameter(bn.running_var.data.clone())

        # Dense blocks parameters
        num_features = 64
        for i, num_layers in enumerate(block_layers):
            self.params[f"block{i}_num_layers"] = num_layers
            for j in range(num_layers):
                in_features = num_features + j * growth_rate
                prefix = f"block{i}_layer{j}_"

                bn = nn.BatchNorm2d(in_features)
                conv = nn.Conv2d(
                    in_features, growth_rate, kernel_size=3, padding=1, bias=False
                )

                self.params[prefix + "bn_weight"] = nn.Parameter(bn.weight.data.clone())
                self.params[prefix + "bn_bias"] = nn.Parameter(bn.bias.data.clone())
                self.params[prefix + "bn_mean"] = nn.Parameter(
                    bn.running_mean.data.clone()
                )
                self.params[prefix + "bn_var"] = nn.Parameter(
                    bn.running_var.data.clone()
                )
                self.params[prefix + "conv_weight"] = nn.Parameter(
                    conv.weight.data.clone()
                )

            num_features = num_features + num_layers * growth_rate

            # Transition layers parameters (except after last block)
            if i != len(block_layers) - 1:
                bn = nn.BatchNorm2d(num_features)
                conv = nn.Conv2d(
                    num_features, num_features // 2, kernel_size=1, bias=False
                )

                self.params[f"transition{i}_bn_weight"] = nn.Parameter(
                    bn.weight.data.clone()
                )
                self.params[f"transition{i}_bn_bias"] = nn.Parameter(
                    bn.bias.data.clone()
                )
                self.params[f"transition{i}_bn_mean"] = nn.Parameter(
                    bn.running_mean.data.clone()
                )
                self.params[f"transition{i}_bn_var"] = nn.Parameter(
                    bn.running_var.data.clone()
                )
                self.params[f"transition{i}_conv_weight"] = nn.Parameter(
                    conv.weight.data.clone()
                )

                num_features = num_features // 2

        # Final layers parameters
        bn = nn.BatchNorm2d(num_features)
        self.params["final_bn_weight"] = nn.Parameter(bn.weight.data.clone())
        self.params["final_bn_bias"] = nn.Parameter(bn.bias.data.clone())
        self.params["final_bn_mean"] = nn.Parameter(bn.running_mean.data.clone())
        self.params["final_bn_var"] = nn.Parameter(bn.running_var.data.clone())

        linear = nn.Linear(num_features, num_classes)
        self.params["classifier_weight"] = nn.Parameter(linear.weight.data.clone())
        self.params["classifier_bias"] = nn.Parameter(linear.bias.data.clone())

    def forward(self, x, fn=module_fn):
        return fn(x, self.params, self.training)


# Test configurations
batch_size = 10
num_classes = 10
height, width = 224, 224


def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]


def get_init_inputs():
    return [32, num_classes]

import torch
import torch.nn as nn
import torch.nn.functional as F

class DenseBlock(nn.Module):
    def __init__(self, num_layers: int, num_input_features: int, growth_rate: int):
        """
        :param num_layers: The number of layers in the dense block
        :param num_input_features: The number of input feature maps
        :param growth_rate: The growth rate for the dense block (new features added per layer)
        """
        super(DenseBlock, self).__init__()
        layers = []
        for i in range(num_layers):
            layers.append(self._make_layer(num_input_features + i * growth_rate, growth_rate))
        self.layers = nn.ModuleList(layers)

    def _make_layer(self, in_features: int, growth_rate: int):
        """
        Creates a single layer with BatchNorm, ReLU, Conv2D, and Dropout.
        """
        return nn.Sequential(
            nn.BatchNorm2d(in_features),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_features, growth_rate, kernel_size=3, padding=1, bias=False),
            nn.Dropout(0.0)
        )

    def forward(self, x):
        """
        :param x: Input tensor of shape (batch_size, num_input_features, height, width)
        :return: Concatenated output tensor with shape (batch_size, num_output_features, height, width)
        """
        features = [x]
        for layer in self.layers:
            new_feature = layer(x)
            features.append(new_feature)
            x = torch.cat(features, 1)  # Concatenate along channel axis
        return x

class TransitionLayer(nn.Module):
    def __init__(self, num_input_features: int, num_output_features: int):
        """
        :param num_input_features: The number of input feature maps
        :param num_output_features: The number of output feature maps
        """
        super(TransitionLayer, self).__init__()
        self.transition = nn.Sequential(
            nn.BatchNorm2d(num_input_features),
            nn.ReLU(inplace=True),
            nn.Conv2d(num_input_features, num_output_features, kernel_size=1, bias=False),
            nn.AvgPool2d(kernel_size=2, stride=2)
        )

    def forward(self, x):
        """
        :param x: Input tensor of shape (batch_size, num_input_features, height, width)
        :return: Downsampled tensor with reduced number of feature maps
        """
        return self.transition(x)

class Model(nn.Module):
    def __init__(self, growth_rate: int = 32, num_classes: int = 1000):
        """
        :param growth_rate: The growth rate of the DenseNet (new features added per layer)
        :param num_classes: The number of output classes for classification
        """
        super(Model, self).__init__()

        # Initial convolution and pooling
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        # Each dense block is followed by a transition layer, except the last one
        num_features = 64
        block_layers = [6, 12, 24, 16]  # Corresponding layers in DenseNet121

        self.dense_blocks = nn.ModuleList()
        self.transition_layers = nn.ModuleList()

        for i, num_layers in enumerate(block_layers):
            block = DenseBlock(num_layers=num_layers, num_input_features=num_features, growth_rate=growth_rate)
            self.dense_blocks.append(block)
            num_features = num_features + num_layers * growth_rate

            if i != len(block_layers) - 1:
                transition = TransitionLayer(num_input_features=num_features, num_output_features=num_features // 2)
                self.transition_layers.append(transition)
                num_features = num_features // 2

        # Final batch norm and classifier
        self.final_bn = nn.BatchNorm2d(num_features)
        self.classifier = nn.Linear(num_features, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        :param x: Input tensor of shape (batch_size, 3, height, width)
        :return: Output tensor of shape (batch_size, num_classes)
        """
        x = self.features(x)

        for i, block in enumerate(self.dense_blocks):
            x = block(x)
            if i != len(self.dense_blocks) - 1:
                x = self.transition_layers[i](x)

        x = self.final_bn(x)
        x = F.relu(x, inplace=True)
        x = F.adaptive_avg_pool2d(x, (1, 1)).view(x.size(0), -1)
        x = self.classifier(x)
        return x

# Testing the DenseNet121 model
batch_size = 10
num_classes = 10
height, width = 224, 224  # Standard input size for DenseNet

def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]

def get_init_inputs():
    return [32, num_classes]

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	15_DenseNet121
Level ID	3
Task ID	15
Kernel Name	15_densenet121_warp_optimized_base_base
CUDA Speedup (Native)	1.417x
CUDA Speedup (Compile)	0.901x
CUDA Runtime	4.195 ms
PyTorch Runtime (Native)	5.944 ms
PyTorch Runtime (Compile)	3.782 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0
Temperature	0.00

View Experiment Progress Details

Related Kernels (Level 3, Task 15 • 15_DenseNet121)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	15_DenseNet121	4.19	1.42	0.90
🥇	optimized_dense_net_base	4.19	1.42	0.90
🥉	15_densenet121_warp_optimized_base_base	4.20	1.42	0.90
🥉	15_densenet121_atomic_optimized_base	4.20	1.42	0.90
5	15_densenet121_ldg_optimized_base	4.20	1.42	0.90
5	15_densenet121_fused_bn_relu_base	4.20	1.42	0.90
7	15_DenseNet121_unrolled_base	4.20	1.41	0.90
8	15_densenet121_unroll_batched_base_base	4.20	1.41	0.90
9	15_densenet121_reduced_sync_base_base	4.21	1.41	0.90
10	15_densenet121_mem_coalesce_base	4.21	1.41	0.90
11	15_DenseNet121_manual_unroll_base	4.22	1.41	0.90
12	15_densenet121_atomic_optimization_edit_1	4.22	1.41	0.90
13	15_DenseNet121_unroll_manual_base	4.23	1.41	0.89
13	densenet121_warp_aligned_base_base	4.23	1.41	0.89
15	15_densenet121_strided_base	4.23	1.41	0.89
16	15_densenet121_ldg_fused_relu_base	4.23	1.40	0.89
16	15_DenseNet121_coalesced_edit_1	4.23	1.40	0.89
18	15_densenet121_strided_edit_1	4.24	1.40	0.89
19	15_densenet121_strided_loops_base	4.24	1.40	0.89
20	15_densenet121_atomic_optimization_base	4.24	1.40	0.89

#include <torch/extension.h>
#include <ATen/ATen.h>
#include <vector>
#include <string>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <cuda_runtime.h>

namespace py = pybind11;

__device__ __forceinline__ float warp_reduce_sum(float val) {
    #pragma unroll
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}

__device__ __forceinline__ void warp_reduce_stats(float& mean, float& var) {
    mean = warp_reduce_sum(mean);
    var = warp_reduce_sum(var);
    if (threadIdx.x % 32 == 0) {
        mean /= 32.0f;
        var = var / 32.0f - mean * mean;
    }
    mean = __shfl_sync(0xffffffff, mean, (threadIdx.x / 32) * 32);
    var = __shfl_sync(0xffffffff, var, (threadIdx.x / 32) * 32);
}

at::Tensor optimized_batch_norm_relu(
    at::Tensor x,
    at::Tensor weight,
    at::Tensor bias,
    at::Tensor running_mean,
    at::Tensor running_var,
    bool training,
    double momentum,
    double eps
) {
    x = at::batch_norm(x, weight, bias, running_mean, running_var,
                      training, momentum, eps, true);
    return at::relu(x);
}

template<int WARP_SIZE = 32>
void process_dense_layers_warped(
    std::vector<at::Tensor>& features,
    at::Tensor& x,
    const py::object& params,
    const std::string& block_prefix,
    int layer_idx,
    bool is_training
) {
    auto get_param = [&](const std::string& key) -> at::Tensor {
        return params.attr("__getitem__")(key.c_str()).cast<at::Tensor>();
    };

    std::string prefix = block_prefix + std::to_string(layer_idx) + "_";
    
    auto bn_weight = get_param(prefix + "bn_weight");
    auto bn_bias = get_param(prefix + "bn_bias");
    auto bn_mean = get_param(prefix + "bn_mean");
    auto bn_var = get_param(prefix + "bn_var");
    auto conv_weight = get_param(prefix + "conv_weight");

    x = optimized_batch_norm_relu(x, bn_weight, bn_bias, bn_mean, bn_var,
                                 is_training, 0.1, 1e-5);
    x = at::conv2d(x, conv_weight, /*bias=*/{}, /*stride=*/{1, 1}, /*padding=*/{1, 1});
    x = at::dropout(x, /*p=*/0.0, is_training);
    
    features.push_back(x);
    x = at::cat(features, 1);
}

at::Tensor transition_layer_fn(
    at::Tensor x,
    at::Tensor bn_weight,
    at::Tensor bn_bias,
    at::Tensor bn_mean,
    at::Tensor bn_var,
    at::Tensor conv_weight,
    bool is_training
) {
    x = optimized_batch_norm_relu(x, bn_weight, bn_bias, bn_mean, bn_var,
                                 is_training, 0.1, 1e-5);
    x = at::conv2d(x, conv_weight);
    x = at::avg_pool2d(x, /*kernel_size=*/{2, 2}, /*stride=*/{2, 2});
    return x;
}

at::Tensor module_fn(
    at::Tensor x,
    py::object params,
    bool is_training
) {
    auto get_param = [&](const std::string& key) -> at::Tensor {
        return params.attr("__getitem__")(key.c_str()).cast<at::Tensor>();
    };

    auto features_conv_weight = get_param("features_conv_weight");
    x = at::conv2d(x, features_conv_weight, /*bias=*/{}, /*stride=*/{2, 2}, /*padding=*/{3, 3});

    auto features_bn_mean = get_param("features_bn_mean");
    auto features_bn_var = get_param("features_bn_var");
    auto features_bn_weight = get_param("features_bn_weight");
    auto features_bn_bias = get_param("features_bn_bias");

    x = optimized_batch_norm_relu(x, features_bn_weight, features_bn_bias,
                                 features_bn_mean, features_bn_var,
                                 is_training, 0.1, 1e-5);
    x = at::max_pool2d(x, /*kernel_size=*/{3, 3}, /*stride=*/{2, 2}, /*padding=*/{1, 1});

    std::vector<int> num_layers = {6, 12, 24, 16};

    for (int block = 0; block < 4; ++block) {
        std::vector<at::Tensor> features;
        features.reserve(num_layers[block] + 1);
        features.push_back(x);

        for (int layer = 0; layer < num_layers[block]; ++layer) {
            process_dense_layers_warped<32>(
                features, x, params,
                "block" + std::to_string(block) + "_layer",
                layer, is_training
            );
        }

        if (block != 3) {
            std::string prefix = "transition" + std::to_string(block) + "_";
            auto bn_weight = get_param(prefix + "bn_weight");
            auto bn_bias = get_param(prefix + "bn_bias");
            auto bn_mean = get_param(prefix + "bn_mean");
            auto bn_var = get_param(prefix + "bn_var");
            auto conv_weight = get_param(prefix + "conv_weight");

            x = transition_layer_fn(x, bn_weight, bn_bias, bn_mean,
                                  bn_var, conv_weight, is_training);
        }
    }

    auto final_bn_mean = get_param("final_bn_mean");
    auto final_bn_var = get_param("final_bn_var");
    auto final_bn_weight = get_param("final_bn_weight");
    auto final_bn_bias = get_param("final_bn_bias");

    x = optimized_batch_norm_relu(x, final_bn_weight, final_bn_bias,
                                 final_bn_mean, final_bn_var,
                                 is_training, 0.1, 1e-5);
    x = at::adaptive_avg_pool2d(x, {1, 1}).reshape({x.size(0), -1});

    auto classifier_weight = get_param("classifier_weight");
    auto classifier_bias = get_param("classifier_bias");
    return at::linear(x, classifier_weight, classifier_bias);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &module_fn, "DenseNet121 forward with warp optimizations");
}

Operation / Metric	Value	Unit
aten::conv2d
CPU Time	3479978.59	μs
Device Time	2937156.46	μs
Self CPU Time	142055.75	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::convolution
CPU Time	3337922.85	μs
Device Time	2937156.46	μs
Self CPU Time	174098.86	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_convolution
CPU Time	3163823.98	μs
Device Time	2937156.46	μs
Self CPU Time	201728.41	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::cudnn_convolution
CPU Time	2962095.58	μs
Device Time	2937156.46	μs
Self CPU Time	1510402.19	μs
Self Device Time	2937156.46	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::batch_norm
CPU Time	3146780.66	μs
Device Time	1297839.31	μs
Self CPU Time	152832.31	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_batch_norm_impl_index
CPU Time	2993948.35	μs
Device Time	1297839.31	μs
Self CPU Time	127601.45	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

The AI CUDA Engineer 👷

`15_DenseNet121` • `15_densenet121_warp_optimized_base_base`

Kernel Information

Related Kernels (Level 3, Task 15 • 15_DenseNet121)

The AI CUDA Engineer 👷

15_DenseNet121 • 15_densenet121_warp_optimized_base_base

Kernel Information

Related Kernels (Level 3, Task 15 • 15_DenseNet121)

`15_DenseNet121` • `15_densenet121_warp_optimized_base_base`