← Back to Leaderboard

The AI CUDA Engineer 👷

10_ResNet101resnet101_fused_distr_base

Level 3 • Task 10
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor, params: nn.ParameterDict, is_training: bool
) -> torch.Tensor:
    """
    Implements the ResNet101 module.

    Args:
        x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
        params (nn.ParameterDict): Dictionary of parameters
        is_training (bool): Whether to use training mode

    Returns:
        torch.Tensor: Output tensor, shape (batch_size, num_classes)
    """
    # Initial layers
    x = F.conv2d(x, params["conv1_w"].to(x.device), bias=None, stride=2, padding=3)
    x = F.batch_norm(
        x,
        params["bn1_m"].to(x.device),
        params["bn1_v"].to(x.device),
        params["bn1_w"].to(x.device),
        params["bn1_b"].to(x.device),
        training=is_training,
    )
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)

    def bottleneck_fn(
        x,
        conv1_w,
        conv2_w,
        conv3_w,
        bn1_w,
        bn1_b,
        bn1_m,
        bn1_v,
        bn2_w,
        bn2_b,
        bn2_m,
        bn2_v,
        bn3_w,
        bn3_b,
        bn3_m,
        bn3_v,
        downsample_conv_w=None,
        downsample_bn_w=None,
        downsample_bn_b=None,
        downsample_bn_m=None,
        downsample_bn_v=None,
        stride=1,
        is_training=True,
    ):
        identity = x

        out = F.conv2d(x, conv1_w.to(x.device), bias=None)
        out = F.batch_norm(
            out,
            bn1_m.to(x.device),
            bn1_v.to(x.device),
            bn1_w.to(x.device),
            bn1_b.to(x.device),
            training=is_training,
        )
        out = F.relu(out)

        out = F.conv2d(out, conv2_w.to(x.device), bias=None, stride=stride, padding=1)
        out = F.batch_norm(
            out,
            bn2_m.to(x.device),
            bn2_v.to(x.device),
            bn2_w.to(x.device),
            bn2_b.to(x.device),
            training=is_training,
        )
        out = F.relu(out)

        out = F.conv2d(out, conv3_w.to(x.device), bias=None)
        out = F.batch_norm(
            out,
            bn3_m.to(x.device),
            bn3_v.to(x.device),
            bn3_w.to(x.device),
            bn3_b.to(x.device),
            training=is_training,
        )

        if downsample_conv_w is not None:
            identity = F.conv2d(
                x, downsample_conv_w.to(x.device), bias=None, stride=stride
            )
            identity = F.batch_norm(
                identity,
                downsample_bn_m.to(x.device),
                downsample_bn_v.to(x.device),
                downsample_bn_w.to(x.device),
                downsample_bn_b.to(x.device),
                training=is_training,
            )

        out += identity
        out = F.relu(out)

        return out

    # Layer 1-4
    for layer_idx in range(1, 5):
        blocks = params[f"layer{layer_idx}_blocks"]
        for block_idx in range(len(blocks)):
            block_params = blocks[block_idx]

            downsample_params = None
            if "downsample_conv_w" in block_params:
                downsample_params = [
                    block_params["downsample_conv_w"],
                    block_params["downsample_bn_w"],
                    block_params["downsample_bn_b"],
                    block_params["downsample_bn_m"],
                    block_params["downsample_bn_v"],
                ]

            x = bottleneck_fn(
                x,
                block_params["conv1_w"],
                block_params["conv2_w"],
                block_params["conv3_w"],
                block_params["bn1_w"],
                block_params["bn1_b"],
                block_params["bn1_m"],
                block_params["bn1_v"],
                block_params["bn2_w"],
                block_params["bn2_b"],
                block_params["bn2_m"],
                block_params["bn2_v"],
                block_params["bn3_w"],
                block_params["bn3_b"],
                block_params["bn3_m"],
                block_params["bn3_v"],
                *(downsample_params if downsample_params else [None] * 5),
                stride=2 if block_idx == 0 and layer_idx > 1 else 1,
                is_training=is_training,
            )

    x = F.adaptive_avg_pool2d(x, (1, 1))
    x = torch.flatten(x, 1)
    x = F.linear(x, params["fc_w"].to(x.device), params["fc_b"].to(x.device))

    return x


class Model(nn.Module):
    def __init__(self, layers, num_classes=1000):
        super(Model, self).__init__()
        self.params = nn.ParameterDict()
        in_channels = 64
        expansion = 4

        # Initial layers
        conv1 = nn.Conv2d(
            3, in_channels, kernel_size=7, stride=2, padding=3, bias=False
        )
        bn1 = nn.BatchNorm2d(in_channels)
        self.params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
        self.params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
        self.params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
        self.params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
        self.params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())

        # Layers 1-4
        channels = [64, 128, 256, 512]
        for layer_idx, (out_channels, num_blocks) in enumerate(
            zip(channels, layers), 1
        ):
            layer_blocks = []

            for block_idx in range(num_blocks):
                block_in_channels = (
                    in_channels if block_idx == 0 else out_channels * expansion
                )

                # Create block parameters
                block_params = {}

                # First block may have downsample
                if block_idx == 0 and (
                    layer_idx > 1 or block_in_channels != out_channels * expansion
                ):
                    downsample_conv = nn.Conv2d(
                        block_in_channels,
                        out_channels * expansion,
                        kernel_size=1,
                        stride=2 if layer_idx > 1 else 1,
                        bias=False,
                    )
                    downsample_bn = nn.BatchNorm2d(out_channels * expansion)

                    block_params["downsample_conv_w"] = nn.Parameter(
                        downsample_conv.weight.data.clone()
                    )
                    block_params["downsample_bn_w"] = nn.Parameter(
                        downsample_bn.weight.data.clone()
                    )
                    block_params["downsample_bn_b"] = nn.Parameter(
                        downsample_bn.bias.data.clone()
                    )
                    block_params["downsample_bn_m"] = nn.Parameter(
                        downsample_bn.running_mean.data.clone()
                    )
                    block_params["downsample_bn_v"] = nn.Parameter(
                        downsample_bn.running_var.data.clone()
                    )

                conv1 = nn.Conv2d(
                    block_in_channels, out_channels, kernel_size=1, bias=False
                )
                bn1 = nn.BatchNorm2d(out_channels)
                conv2 = nn.Conv2d(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    stride=2 if block_idx == 0 and layer_idx > 1 else 1,
                    padding=1,
                    bias=False,
                )
                bn2 = nn.BatchNorm2d(out_channels)
                conv3 = nn.Conv2d(
                    out_channels, out_channels * expansion, kernel_size=1, bias=False
                )
                bn3 = nn.BatchNorm2d(out_channels * expansion)

                block_params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
                block_params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
                block_params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
                block_params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
                block_params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())

                block_params["conv2_w"] = nn.Parameter(conv2.weight.data.clone())
                block_params["bn2_w"] = nn.Parameter(bn2.weight.data.clone())
                block_params["bn2_b"] = nn.Parameter(bn2.bias.data.clone())
                block_params["bn2_m"] = nn.Parameter(bn2.running_mean.data.clone())
                block_params["bn2_v"] = nn.Parameter(bn2.running_var.data.clone())

                block_params["conv3_w"] = nn.Parameter(conv3.weight.data.clone())
                block_params["bn3_w"] = nn.Parameter(bn3.weight.data.clone())
                block_params["bn3_b"] = nn.Parameter(bn3.bias.data.clone())
                block_params["bn3_m"] = nn.Parameter(bn3.running_mean.data.clone())
                block_params["bn3_v"] = nn.Parameter(bn3.running_var.data.clone())

                layer_blocks.append(block_params)

            self.params[f"layer{layer_idx}_blocks"] = layer_blocks
            in_channels = out_channels * expansion

        # Final FC layer
        fc = nn.Linear(512 * expansion, num_classes)
        self.params["fc_w"] = nn.Parameter(fc.weight.data.clone())
        self.params["fc_b"] = nn.Parameter(fc.bias.data.clone())

    def forward(self, x, fn=module_fn):
        return fn(x, self.params, self.training)


# Test configurations
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000


def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]


def get_init_inputs():
    return [layers, num_classes]
import torch
import torch.nn as nn
import torch.nn.functional as F

class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        """
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param stride: Stride for the first convolutional layer
        :param downsample: Downsample layer for the shortcut connection
        """
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """
        :param x: Input tensor, shape (batch_size, in_channels, height, width)
        :return: Output tensor, shape (batch_size, out_channels * expansion, height, width)
        """
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

class Model(nn.Module):
    def __init__(self, layers, num_classes=1000):
        """
        :param block: Type of block to use (BasicBlock or Bottleneck)
        :param layers: List of integers specifying the number of blocks in each layer
        :param num_classes: Number of output classes
        """
        super(Model, self).__init__()
        self.in_channels = 64

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        block = Bottleneck

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * block.expansion),
            )

        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, x):
        """
        :param x: Input tensor, shape (batch_size, 3, height, width)
        :return: Output tensor, shape (batch_size, num_classes)
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

# Test code
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000

def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]

def get_init_inputs():
    return [layers, num_classes]

Kernel Information

Related Kernels (Level 3, Task 10 • 10_ResNet101)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 resnet101_modular_functions_base_base 23.20 1.33 1.33
🥈 resnet101_coalesced_memory_access_base 24.25 1.27 1.28
🥉 resnet101_balanced_workload_base 24.36 1.27 1.27
4 resnet101_balanced_workload_base 24.60 1.26 1.26
5 10_ResNet101_mem_opt_base_base 24.62 1.26 1.26
6 resnet101_uniform_flow_base_base 24.84 1.24 1.25
7 resnet101_shared_mem_sync_optimized_base 24.93 1.24 1.24
8 efficient_resnet_base 25.08 1.23 1.23
9 resnet101_optimized_memory_access_base 25.45 1.21 1.22
10 resnet101_unrolled_loops_base_base 25.58 1.21 1.21
11 resnet101_min_sync_relu_base 25.64 1.21 1.21
12 warp_pool_optimized_bottleneck_base 25.79 1.20 1.20
13 unified_resnet_base 26.38 1.17 1.17
14 10_ResNet101_warp_avg_pool_base 26.55 1.16 1.17
15 resnet101_minimal_sync_base_base 26.90 1.15 1.15
16 10_ResNet101 28.04 1.10 1.10
17 resnet101_fused_distr_base 28.10 1.10 1.10
18 10_resnet101_opt_aligned_mem_edit_1 29.45 1.05 1.05
19 10_resnet101_opt_min_sync_edit_1 29.58 1.04 1.05
20 10_resnet101_opt_base 29.83 1.04 1.04
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <cuda_runtime.h>

namespace py = pybind11;

// This kernel fuses the elementwise addition of the identity and the main branch, and applies ReLU
// It evenly distributes the workload across threads and blocks using a grid-stride loop
__global__ void fused_add_relu_kernel(const float* __restrict__ identity, float* __restrict__ out, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (; idx < size; idx += stride) {
        float val = out[idx] + identity[idx];
        out[idx] = (val > 0.0f) ? val : 0.0f;
    }
}

// The bottleneck function implements the main branch and optionally a downsample branch.
// The final residual addition and activation are fused via a custom CUDA kernel to evenly distribute work.
torch::Tensor bottleneck_fn(
    torch::Tensor x,
    const torch::Tensor& conv1_w,
    const torch::Tensor& conv2_w,
    const torch::Tensor& conv3_w,
    const torch::Tensor& bn1_w,
    const torch::Tensor& bn1_b,
    const torch::Tensor& bn1_m,
    const torch::Tensor& bn1_v,
    const torch::Tensor& bn2_w,
    const torch::Tensor& bn2_b,
    const torch::Tensor& bn2_m,
    const torch::Tensor& bn2_v,
    const torch::Tensor& bn3_w,
    const torch::Tensor& bn3_b,
    const torch::Tensor& bn3_m,
    const torch::Tensor& bn3_v,
    const torch::Tensor& downsample_conv_w,
    const torch::Tensor& downsample_bn_w,
    const torch::Tensor& downsample_bn_b,
    const torch::Tensor& downsample_bn_m,
    const torch::Tensor& downsample_bn_v,
    int64_t stride,
    bool is_training
) {
    torch::Tensor identity = x;

    bool has_downsample = downsample_conv_w.defined();
    torch::Tensor downsample_out;
    if (has_downsample) {
        downsample_out = torch::conv2d(x, downsample_conv_w, /*bias=*/torch::Tensor(), stride)
            .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
        downsample_out = torch::batch_norm(
            downsample_out,
            downsample_bn_w,
            downsample_bn_b,
            downsample_bn_m,
            downsample_bn_v,
            is_training,
            /*momentum=*/0.1,
            /*eps=*/1e-5,
            /*cudnn_enabled=*/true
        );
        identity = downsample_out;
    }

    torch::Tensor out = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor())
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    out = torch::batch_norm(
        out,
        bn1_w,
        bn1_b,
        bn1_m,
        bn1_v,
        is_training,
        0.1,
        1e-5,
        true
    );
    out = torch::relu(out);

    out = torch::conv2d(out, conv2_w, /*bias=*/torch::Tensor(), stride, /*padding=*/1)
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    out = torch::batch_norm(
        out,
        bn2_w,
        bn2_b,
        bn2_m,
        bn2_v,
        is_training,
        0.1,
        1e-5,
        true
    );
    out = torch::relu(out);

    out = torch::conv2d(out, conv3_w, /*bias=*/torch::Tensor())
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    out = torch::batch_norm(
        out,
        bn3_w,
        bn3_b,
        bn3_m,
        bn3_v,
        is_training,
        0.1,
        1e-5,
        true
    );

    // Fuse the residual addition and ReLU activation using the custom CUDA kernel
    int64_t num_elements = out.numel();
    const int threads = 256;
    const int blocks = (num_elements + threads - 1) / threads;
    fused_add_relu_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        identity.data_ptr<float>(),
        out.data_ptr<float>(),
        num_elements
    );
    cudaDeviceSynchronize(); // Ensure kernel execution completes before proceeding

    return out;
}

// The forward function processes the ResNet101 architecture layer by layer.
// It prefetches all parameters on the correct device and uses the bottleneck_fn to apply each block.
// The fused kernel ensures that the residual addition and activation are distributed evenly among CUDA threads.
torch::Tensor forward(
    torch::Tensor x,
    py::object params,
    bool is_training
) {
    auto device = x.device();

    // Initial layer parameters
    auto conv1_w = params.attr("get")(py::str("conv1_w")).cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_w = params.attr("get")(py::str("bn1_w")).cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_b = params.attr("get")(py::str("bn1_b")).cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_m = params.attr("get")(py::str("bn1_m")).cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_v = params.attr("get")(py::str("bn1_v")).cast<torch::Tensor>().contiguous().to(device, true);

    x = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor(), /*stride=*/2, /*padding=*/3)
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    x = torch::batch_norm(
        x,
        bn1_w,
        bn1_b,
        bn1_m,
        bn1_v,
        is_training,
        0.1,
        1e-5,
        true
    );
    x = torch::relu(x);
    x = torch::max_pool2d(x, /*kernel_size=*/3, /*stride=*/2, /*padding=*/1);

    // Process each layer's blocks
    for (int layer_idx = 1; layer_idx <= 4; ++layer_idx) {
        std::string layer_key = "layer" + std::to_string(layer_idx) + "_blocks";
        py::list blocks = params.attr("get")(py::str(layer_key)).cast<py::list>();
        for (size_t block_idx = 0; block_idx < blocks.size(); ++block_idx) {
            py::object block_params = blocks[block_idx];

            auto conv1_w = block_params.attr("get")(py::str("conv1_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto conv2_w = block_params.attr("get")(py::str("conv2_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto conv3_w = block_params.attr("get")(py::str("conv3_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_w = block_params.attr("get")(py::str("bn1_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_b = block_params.attr("get")(py::str("bn1_b")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_m = block_params.attr("get")(py::str("bn1_m")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_v = block_params.attr("get")(py::str("bn1_v")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_w = block_params.attr("get")(py::str("bn2_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_b = block_params.attr("get")(py::str("bn2_b")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_m = block_params.attr("get")(py::str("bn2_m")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_v = block_params.attr("get")(py::str("bn2_v")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_w = block_params.attr("get")(py::str("bn3_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_b = block_params.attr("get")(py::str("bn3_b")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_m = block_params.attr("get")(py::str("bn3_m")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_v = block_params.attr("get")(py::str("bn3_v")).cast<torch::Tensor>().contiguous().to(device, true);

            torch::Tensor downsample_conv_w = torch::Tensor();
            torch::Tensor downsample_bn_w = torch::Tensor();
            torch::Tensor downsample_bn_b = torch::Tensor();
            torch::Tensor downsample_bn_m = torch::Tensor();
            torch::Tensor downsample_bn_v = torch::Tensor();
            
            if (py::bool_(block_params.attr("__contains__")(py::str("downsample_conv_w")))) {
                downsample_conv_w = block_params.attr("get")(py::str("downsample_conv_w")).cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_w = block_params.attr("get")(py::str("downsample_bn_w")).cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_b = block_params.attr("get")(py::str("downsample_bn_b")).cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_m = block_params.attr("get")(py::str("downsample_bn_m")).cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_v = block_params.attr("get")(py::str("downsample_bn_v")).cast<torch::Tensor>().contiguous().to(device, true);
            }

            int64_t stride = (block_idx == 0 && layer_idx > 1) ? 2 : 1;
            x = bottleneck_fn(
                x,
                conv1_w,
                conv2_w,
                conv3_w,
                bn1_w,
                bn1_b,
                bn1_m,
                bn1_v,
                bn2_w,
                bn2_b,
                bn2_m,
                bn2_v,
                bn3_w,
                bn3_b,
                bn3_m,
                bn3_v,
                downsample_conv_w,
                downsample_bn_w,
                downsample_bn_b,
                downsample_bn_m,
                downsample_bn_v,
                stride,
                is_training
            );
        }
    }

    x = torch::adaptive_avg_pool2d(x, {1, 1}).contiguous();
    x = x.view({x.size(0), -1});

    auto fc_w = params.attr("get")(py::str("fc_w")).cast<torch::Tensor>().contiguous().to(device, true);
    auto fc_b = params.attr("get")(py::str("fc_b")).cast<torch::Tensor>().contiguous().to(device, true);
    return torch::linear(x, fc_w, fc_b);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "ResNet101 forward function with fused addition and ReLU");
}
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 2.026 inst/cycle 0.002 5
Executed Ipc Elapsed 1.624 inst/cycle 0.007 5
Issue Slots Busy 51.182 % 1.069 5
Issued Ipc Active 2.046 inst/cycle 0.002 5
SM Busy 51.182 % 1.069 5
Memory Throughput 1555965485156.598 byte/second 12865757325163071799296.000 5
Mem Busy 33.272 % 3.706 5
Max Bandwidth 46.618 % 11.119 5
L1/TEX Hit Rate 33.278 % 0.002 5
L2 Hit Rate 36.298 % 0.188 5
Mem Pipes Busy 41.030 % 3.968 5
Warp Cycles Per Issued Instruction 24.240 cycle 0.056 5
Warp Cycles Per Executed Instruction 24.464 cycle 0.082 5
Avg. Active Threads Per Warp 32.000 0.000 5
Avg. Not Predicated Off Threads Per Warp 28.000 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 8.000 block 0.000 5
Block Limit Shared Mem 32.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 78.476 % 0.062 5
Achieved Active Warps Per SM 50.224 warp 0.026 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (32.4%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (78.5%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
Operation / Metric Value Unit
aten::to
CPU Time 5755936.10 μs
Device Time 2899889.06 μs
Self CPU Time 100871.29 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 5655064.81 μs
Device Time 2899889.06 μs
Self CPU Time 306699.69 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::copy_
CPU Time 6391356.89 μs
Device Time 2899889.06 μs
Self CPU Time 1902843.96 μs
Self Device Time 2899889.06 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaMemcpyAsync
CPU Time 4488403.88 μs
Device Time 0.00 μs
Self CPU Time 4488403.88 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Memcpy HtoD (Pageable -> Device)
CPU Time 0.00 μs
Device Time 2899890.95 μs
Self CPU Time 0.00 μs
Self Device Time 2899890.95 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::clone
CPU Time 1661466.33 μs
Device Time 0.00 μs
Self CPU Time 6945.93 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::conv2d
CPU Time 1106288.10 μs
Device Time 604797.11 μs
Self CPU Time 44392.89 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45300 warnings generated when compiling for host.
Suppressed 45337 warnings (45290 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:12:15 bugprone-narrowing-conversions
12 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:13:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
13 | int stride = blockDim.x * gridDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:23:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
23 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:26:5: warning: 2 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]
26 | const torch::Tensor& conv3_w,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | const torch::Tensor& bn1_w,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:26:26: note: the first parameter in the range is 'conv3_w'
26 | const torch::Tensor& conv3_w,
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:27:26: note: the last parameter in the range is 'bn1_w'
27 | const torch::Tensor& bn1_w,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:30:5: warning: 2 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]
30 | const torch::Tensor& bn1_v,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~
31 | const torch::Tensor& bn2_w,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:30:26: note: the first parameter in the range is 'bn1_v'
30 | const torch::Tensor& bn1_v,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:31:26: note: the last parameter in the range is 'bn2_w'
31 | const torch::Tensor& bn2_w,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:34:5: warning: 2 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]
34 | const torch::Tensor& bn2_v,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~
35 | const torch::Tensor& bn3_w,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:34:26: note: the first parameter in the range is 'bn2_v'
34 | const torch::Tensor& bn2_v,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:35:26: note: the last parameter in the range is 'bn3_w'
35 | const torch::Tensor& bn3_w,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:38:5: warning: 3 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]
38 | const torch::Tensor& bn3_v,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~
39 | const torch::Tensor& downsample_conv_w,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
40 | const torch::Tensor& downsample_bn_w,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:38:26: note: the first parameter in the range is 'bn3_v'
38 | const torch::Tensor& bn3_v,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:40:26: note: the last parameter in the range is 'downsample_bn_w'
40 | const torch::Tensor& downsample_bn_w,
| ^~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:115:24: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
115 | const int blocks = (num_elements + threads - 1) / threads;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:119:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
119 | num_elements
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:131:16: warning: the parameter 'params' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
131 | py::object params,
| ^
| const &