Kernel Details - 15_DenseNet121_manual_unroll

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    params: nn.ParameterDict,
    is_training: bool,
) -> torch.Tensor:
    """
    Implements the DenseNet121 module.

    Args:
        x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
        params (nn.ParameterDict): Dictionary of parameters
        is_training (bool): Whether to use training mode

    Returns:
        torch.Tensor: Output tensor, shape (batch_size, num_classes)
    """
    # Initial features
    x = F.conv2d(x, params["features_conv_weight"], bias=None, stride=2, padding=3)
    x = F.batch_norm(
        x,
        params["features_bn_mean"],
        params["features_bn_var"],
        params["features_bn_weight"],
        params["features_bn_bias"],
        training=is_training,
    )
    x = F.relu(x, inplace=True)
    x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)

    def dense_layer_fn(
        x, bn_weight, bn_bias, bn_mean, bn_var, conv_weight, is_training
    ):
        """
        Functional version of a single dense layer
        """
        x = F.batch_norm(x, bn_mean, bn_var, bn_weight, bn_bias, training=is_training)
        x = F.relu(x, inplace=True)
        x = F.conv2d(x, conv_weight, bias=None, stride=1, padding=1)
        x = F.dropout(x, p=0.0, training=is_training)
        return x

    def transition_layer_fn(
        x, bn_weight, bn_bias, bn_mean, bn_var, conv_weight, is_training
    ):
        """
        Functional version of transition layer
        """
        x = F.batch_norm(x, bn_mean, bn_var, bn_weight, bn_bias, training=is_training)
        x = F.relu(x, inplace=True)
        x = F.conv2d(x, conv_weight, bias=None)
        x = F.avg_pool2d(x, kernel_size=2, stride=2)
        return x

    # Dense blocks and transitions
    for i in range(4):  # 4 dense blocks
        features = [x]
        for j in range(params[f"block{i}_num_layers"]):  # layers per block
            prefix = f"block{i}_layer{j}_"
            new_feature = dense_layer_fn(
                x,
                params[prefix + "bn_weight"],
                params[prefix + "bn_bias"],
                params[prefix + "bn_mean"],
                params[prefix + "bn_var"],
                params[prefix + "conv_weight"],
                is_training,
            )
            features.append(new_feature)
            x = torch.cat(features, 1)

        if i != 3:  # Apply transition after all blocks except last
            x = transition_layer_fn(
                x,
                params[f"transition{i}_bn_weight"],
                params[f"transition{i}_bn_bias"],
                params[f"transition{i}_bn_mean"],
                params[f"transition{i}_bn_var"],
                params[f"transition{i}_conv_weight"],
                is_training,
            )

    # Final layers
    x = F.batch_norm(
        x,
        params["final_bn_mean"],
        params["final_bn_var"],
        params["final_bn_weight"],
        params["final_bn_bias"],
        training=is_training,
    )
    x = F.relu(x, inplace=True)
    x = F.adaptive_avg_pool2d(x, (1, 1)).view(x.size(0), -1)
    x = F.linear(x, params["classifier_weight"], params["classifier_bias"])
    return x


class Model(nn.Module):
    def __init__(self, growth_rate=32, num_classes=1000):
        super(Model, self).__init__()

        self.params = nn.ParameterDict()
        block_layers = [6, 12, 24, 16]

        # Initial features parameters
        conv = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        bn = nn.BatchNorm2d(64)
        self.params["features_conv_weight"] = nn.Parameter(conv.weight.data.clone())
        self.params["features_bn_weight"] = nn.Parameter(bn.weight.data.clone())
        self.params["features_bn_bias"] = nn.Parameter(bn.bias.data.clone())
        self.params["features_bn_mean"] = nn.Parameter(bn.running_mean.data.clone())
        self.params["features_bn_var"] = nn.Parameter(bn.running_var.data.clone())

        # Dense blocks parameters
        num_features = 64
        for i, num_layers in enumerate(block_layers):
            self.params[f"block{i}_num_layers"] = num_layers
            for j in range(num_layers):
                in_features = num_features + j * growth_rate
                prefix = f"block{i}_layer{j}_"

                bn = nn.BatchNorm2d(in_features)
                conv = nn.Conv2d(
                    in_features, growth_rate, kernel_size=3, padding=1, bias=False
                )

                self.params[prefix + "bn_weight"] = nn.Parameter(bn.weight.data.clone())
                self.params[prefix + "bn_bias"] = nn.Parameter(bn.bias.data.clone())
                self.params[prefix + "bn_mean"] = nn.Parameter(
                    bn.running_mean.data.clone()
                )
                self.params[prefix + "bn_var"] = nn.Parameter(
                    bn.running_var.data.clone()
                )
                self.params[prefix + "conv_weight"] = nn.Parameter(
                    conv.weight.data.clone()
                )

            num_features = num_features + num_layers * growth_rate

            # Transition layers parameters (except after last block)
            if i != len(block_layers) - 1:
                bn = nn.BatchNorm2d(num_features)
                conv = nn.Conv2d(
                    num_features, num_features // 2, kernel_size=1, bias=False
                )

                self.params[f"transition{i}_bn_weight"] = nn.Parameter(
                    bn.weight.data.clone()
                )
                self.params[f"transition{i}_bn_bias"] = nn.Parameter(
                    bn.bias.data.clone()
                )
                self.params[f"transition{i}_bn_mean"] = nn.Parameter(
                    bn.running_mean.data.clone()
                )
                self.params[f"transition{i}_bn_var"] = nn.Parameter(
                    bn.running_var.data.clone()
                )
                self.params[f"transition{i}_conv_weight"] = nn.Parameter(
                    conv.weight.data.clone()
                )

                num_features = num_features // 2

        # Final layers parameters
        bn = nn.BatchNorm2d(num_features)
        self.params["final_bn_weight"] = nn.Parameter(bn.weight.data.clone())
        self.params["final_bn_bias"] = nn.Parameter(bn.bias.data.clone())
        self.params["final_bn_mean"] = nn.Parameter(bn.running_mean.data.clone())
        self.params["final_bn_var"] = nn.Parameter(bn.running_var.data.clone())

        linear = nn.Linear(num_features, num_classes)
        self.params["classifier_weight"] = nn.Parameter(linear.weight.data.clone())
        self.params["classifier_bias"] = nn.Parameter(linear.bias.data.clone())

    def forward(self, x, fn=module_fn):
        return fn(x, self.params, self.training)


# Test configurations
batch_size = 10
num_classes = 10
height, width = 224, 224


def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]


def get_init_inputs():
    return [32, num_classes]

import torch
import torch.nn as nn
import torch.nn.functional as F

class DenseBlock(nn.Module):
    def __init__(self, num_layers: int, num_input_features: int, growth_rate: int):
        """
        :param num_layers: The number of layers in the dense block
        :param num_input_features: The number of input feature maps
        :param growth_rate: The growth rate for the dense block (new features added per layer)
        """
        super(DenseBlock, self).__init__()
        layers = []
        for i in range(num_layers):
            layers.append(self._make_layer(num_input_features + i * growth_rate, growth_rate))
        self.layers = nn.ModuleList(layers)

    def _make_layer(self, in_features: int, growth_rate: int):
        """
        Creates a single layer with BatchNorm, ReLU, Conv2D, and Dropout.
        """
        return nn.Sequential(
            nn.BatchNorm2d(in_features),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_features, growth_rate, kernel_size=3, padding=1, bias=False),
            nn.Dropout(0.0)
        )

    def forward(self, x):
        """
        :param x: Input tensor of shape (batch_size, num_input_features, height, width)
        :return: Concatenated output tensor with shape (batch_size, num_output_features, height, width)
        """
        features = [x]
        for layer in self.layers:
            new_feature = layer(x)
            features.append(new_feature)
            x = torch.cat(features, 1)  # Concatenate along channel axis
        return x

class TransitionLayer(nn.Module):
    def __init__(self, num_input_features: int, num_output_features: int):
        """
        :param num_input_features: The number of input feature maps
        :param num_output_features: The number of output feature maps
        """
        super(TransitionLayer, self).__init__()
        self.transition = nn.Sequential(
            nn.BatchNorm2d(num_input_features),
            nn.ReLU(inplace=True),
            nn.Conv2d(num_input_features, num_output_features, kernel_size=1, bias=False),
            nn.AvgPool2d(kernel_size=2, stride=2)
        )

    def forward(self, x):
        """
        :param x: Input tensor of shape (batch_size, num_input_features, height, width)
        :return: Downsampled tensor with reduced number of feature maps
        """
        return self.transition(x)

class Model(nn.Module):
    def __init__(self, growth_rate: int = 32, num_classes: int = 1000):
        """
        :param growth_rate: The growth rate of the DenseNet (new features added per layer)
        :param num_classes: The number of output classes for classification
        """
        super(Model, self).__init__()

        # Initial convolution and pooling
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        # Each dense block is followed by a transition layer, except the last one
        num_features = 64
        block_layers = [6, 12, 24, 16]  # Corresponding layers in DenseNet121

        self.dense_blocks = nn.ModuleList()
        self.transition_layers = nn.ModuleList()

        for i, num_layers in enumerate(block_layers):
            block = DenseBlock(num_layers=num_layers, num_input_features=num_features, growth_rate=growth_rate)
            self.dense_blocks.append(block)
            num_features = num_features + num_layers * growth_rate

            if i != len(block_layers) - 1:
                transition = TransitionLayer(num_input_features=num_features, num_output_features=num_features // 2)
                self.transition_layers.append(transition)
                num_features = num_features // 2

        # Final batch norm and classifier
        self.final_bn = nn.BatchNorm2d(num_features)
        self.classifier = nn.Linear(num_features, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        :param x: Input tensor of shape (batch_size, 3, height, width)
        :return: Output tensor of shape (batch_size, num_classes)
        """
        x = self.features(x)

        for i, block in enumerate(self.dense_blocks):
            x = block(x)
            if i != len(self.dense_blocks) - 1:
                x = self.transition_layers[i](x)

        x = self.final_bn(x)
        x = F.relu(x, inplace=True)
        x = F.adaptive_avg_pool2d(x, (1, 1)).view(x.size(0), -1)
        x = self.classifier(x)
        return x

# Testing the DenseNet121 model
batch_size = 10
num_classes = 10
height, width = 224, 224  # Standard input size for DenseNet

def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]

def get_init_inputs():
    return [32, num_classes]

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	15_DenseNet121
Level ID	3
Task ID	15
Kernel Name	15_DenseNet121_manual_unroll_base
CUDA Speedup (Native)	1.410x
CUDA Speedup (Compile)	0.897x
CUDA Runtime	4.216 ms
PyTorch Runtime (Native)	5.944 ms
PyTorch Runtime (Compile)	3.782 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	azure-gpt-4o-2024-08-06
Temperature	1.00

View Experiment Progress Details

Related Kernels (Level 3, Task 15 • 15_DenseNet121)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	15_DenseNet121	4.19	1.42	0.90
🥇	optimized_dense_net_base	4.19	1.42	0.90
🥉	15_densenet121_warp_optimized_base_base	4.20	1.42	0.90
🥉	15_densenet121_atomic_optimized_base	4.20	1.42	0.90
5	15_densenet121_ldg_optimized_base	4.20	1.42	0.90
5	15_densenet121_fused_bn_relu_base	4.20	1.42	0.90
7	15_DenseNet121_unrolled_base	4.20	1.41	0.90
8	15_densenet121_unroll_batched_base_base	4.20	1.41	0.90
9	15_densenet121_reduced_sync_base_base	4.21	1.41	0.90
10	15_densenet121_mem_coalesce_base	4.21	1.41	0.90
11	15_DenseNet121_manual_unroll_base	4.22	1.41	0.90
12	15_densenet121_atomic_optimization_edit_1	4.22	1.41	0.90
13	15_DenseNet121_unroll_manual_base	4.23	1.41	0.89
13	densenet121_warp_aligned_base_base	4.23	1.41	0.89
15	15_densenet121_strided_base	4.23	1.41	0.89
16	15_densenet121_ldg_fused_relu_base	4.23	1.40	0.89
16	15_DenseNet121_coalesced_edit_1	4.23	1.40	0.89
18	15_densenet121_strided_edit_1	4.24	1.40	0.89
19	15_densenet121_strided_loops_base	4.24	1.40	0.89
20	15_densenet121_atomic_optimization_base	4.24	1.40	0.89

#include <torch/extension.h>
#include <ATen/ATen.h>
#include <vector>
#include <string>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

namespace py = pybind11;

at::Tensor dense_layer_fn(
    at::Tensor x,
    at::Tensor bn_weight,
    at::Tensor bn_bias,
    at::Tensor bn_mean,
    at::Tensor bn_var,
    at::Tensor conv_weight,
    bool is_training
) {
    x = at::batch_norm(
        x, bn_weight, bn_bias, bn_mean, bn_var,
        is_training, 0.1, 1e-5, true
    );
    x = at::relu(x);
    x = at::conv2d(x, conv_weight, /*bias=*/{}, /*stride=*/{1, 1}, /*padding=*/{1, 1});
    x = at::dropout(x, /*p=*/0.0, is_training);
    return x;
}

at::Tensor transition_layer_fn(
    at::Tensor x,
    at::Tensor bn_weight,
    at::Tensor bn_bias,
    at::Tensor bn_mean,
    at::Tensor bn_var,
    at::Tensor conv_weight,
    bool is_training
) {
    x = at::batch_norm(
        x, bn_weight, bn_bias, bn_mean, bn_var,
        is_training, 0.1, 1e-5, true
    );
    x = at::relu(x);
    x = at::conv2d(x, conv_weight);
    x = at::avg_pool2d(x, /*kernel_size=*/{2, 2}, /*stride=*/{2, 2});
    return x;
}

at::Tensor module_fn(
    at::Tensor x,
    py::object params,
    bool is_training
) {
    // Helper function to get parameters
    auto get_param = [&](const std::string& key) -> at::Tensor {
        return params.attr("__getitem__")(key.c_str()).cast<at::Tensor>();
    };

    // Initial features
    auto features_conv_weight = get_param("features_conv_weight");
    x = at::conv2d(x, features_conv_weight, /*bias=*/{}, /*stride=*/{2, 2}, /*padding=*/{3, 3});

    auto features_bn_mean = get_param("features_bn_mean");
    auto features_bn_var = get_param("features_bn_var");
    auto features_bn_weight = get_param("features_bn_weight");
    auto features_bn_bias = get_param("features_bn_bias");

    x = at::batch_norm(
        x, features_bn_weight, features_bn_bias, features_bn_mean, features_bn_var,
        is_training, 0.1, 1e-5, true
    );
    x = at::relu(x);
    x = at::max_pool2d(x, /*kernel_size=*/{3, 3}, /*stride=*/{2, 2}, /*padding=*/{1, 1});

    std::vector<int> num_layers = {6, 12, 24, 16};  // Layers per block for DenseNet121

    // Dense blocks and transitions
    for (int i = 0; i < 4; ++i) {
        std::vector<at::Tensor> features;
        features.push_back(x);

        // Manual unrolling
        for (int j = 0; j < num_layers[i]; j += 4) {
            // Process four layers at a time if possible
            for (int k = j; k < std::min(j + 4, num_layers[i]); ++k) {
                std::string prefix = "block" + std::to_string(i) + "_layer" + std::to_string(k) + "_";

                auto bn_weight = get_param(prefix + "bn_weight");
                auto bn_bias = get_param(prefix + "bn_bias");
                auto bn_mean = get_param(prefix + "bn_mean");
                auto bn_var = get_param(prefix + "bn_var");
                auto conv_weight = get_param(prefix + "conv_weight");

                at::Tensor new_feature = dense_layer_fn(
                    x,
                    bn_weight,
                    bn_bias,
                    bn_mean,
                    bn_var,
                    conv_weight,
                    is_training
                );

                features.push_back(new_feature);

                x = at::cat(features, 1);
            }
        }

        if (i != 3) {  // Apply transition after all blocks except the last
            std::string prefix = "transition" + std::to_string(i) + "_";

            auto bn_weight = get_param(prefix + "bn_weight");
            auto bn_bias = get_param(prefix + "bn_bias");
            auto bn_mean = get_param(prefix + "bn_mean");
            auto bn_var = get_param(prefix + "bn_var");
            auto conv_weight = get_param(prefix + "conv_weight");

            x = transition_layer_fn(
                x,
                bn_weight,
                bn_bias,
                bn_mean,
                bn_var,
                conv_weight,
                is_training
            );
        }
    }

    // Final layers
    auto final_bn_mean = get_param("final_bn_mean");
    auto final_bn_var = get_param("final_bn_var");
    auto final_bn_weight = get_param("final_bn_weight");
    auto final_bn_bias = get_param("final_bn_bias");

    x = at::batch_norm(
        x, final_bn_weight, final_bn_bias, final_bn_mean, final_bn_var,
        is_training, 0.1, 1e-5, true
    );
    x = at::relu(x);
    x = at::adaptive_avg_pool2d(x, {1, 1}).reshape({x.size(0), -1});

    auto classifier_weight = get_param("classifier_weight");
    auto classifier_bias = get_param("classifier_bias");
    x = at::linear(x, classifier_weight, classifier_bias);

    return x;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &module_fn, "DenseNet121 forward");
}

Operation / Metric	Value	Unit
aten::conv2d
CPU Time	3378270.56	μs
Device Time	2898044.27	μs
Self CPU Time	139261.56	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::convolution
CPU Time	3239008.99	μs
Device Time	2898044.27	μs
Self CPU Time	175826.68	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_convolution
CPU Time	3063182.31	μs
Device Time	2898044.27	μs
Self CPU Time	203818.65	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::cudnn_convolution
CPU Time	2859363.66	μs
Device Time	2898044.27	μs
Self CPU Time	1467644.92	μs
Self Device Time	2898044.27	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::batch_norm
CPU Time	3103785.54	μs
Device Time	1284686.01	μs
Self CPU Time	152117.85	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_batch_norm_impl_index
CPU Time	2951667.69	μs
Device Time	1284686.01	μs
Self CPU Time	126092.53	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

The AI CUDA Engineer 👷

`15_DenseNet121` • `15_DenseNet121_manual_unroll_base`

Kernel Information

Related Kernels (Level 3, Task 15 • 15_DenseNet121)

The AI CUDA Engineer 👷

15_DenseNet121 • 15_DenseNet121_manual_unroll_base

Kernel Information

Related Kernels (Level 3, Task 15 • 15_DenseNet121)

`15_DenseNet121` • `15_DenseNet121_manual_unroll_base`