Kernel Details - resnet101_fused_distr

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor, params: nn.ParameterDict, is_training: bool
) -> torch.Tensor:
    """
    Implements the ResNet101 module.

    Args:
        x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
        params (nn.ParameterDict): Dictionary of parameters
        is_training (bool): Whether to use training mode

    Returns:
        torch.Tensor: Output tensor, shape (batch_size, num_classes)
    """
    # Initial layers
    x = F.conv2d(x, params["conv1_w"].to(x.device), bias=None, stride=2, padding=3)
    x = F.batch_norm(
        x,
        params["bn1_m"].to(x.device),
        params["bn1_v"].to(x.device),
        params["bn1_w"].to(x.device),
        params["bn1_b"].to(x.device),
        training=is_training,
    )
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)

    def bottleneck_fn(
        x,
        conv1_w,
        conv2_w,
        conv3_w,
        bn1_w,
        bn1_b,
        bn1_m,
        bn1_v,
        bn2_w,
        bn2_b,
        bn2_m,
        bn2_v,
        bn3_w,
        bn3_b,
        bn3_m,
        bn3_v,
        downsample_conv_w=None,
        downsample_bn_w=None,
        downsample_bn_b=None,
        downsample_bn_m=None,
        downsample_bn_v=None,
        stride=1,
        is_training=True,
    ):
        identity = x

        out = F.conv2d(x, conv1_w.to(x.device), bias=None)
        out = F.batch_norm(
            out,
            bn1_m.to(x.device),
            bn1_v.to(x.device),
            bn1_w.to(x.device),
            bn1_b.to(x.device),
            training=is_training,
        )
        out = F.relu(out)

        out = F.conv2d(out, conv2_w.to(x.device), bias=None, stride=stride, padding=1)
        out = F.batch_norm(
            out,
            bn2_m.to(x.device),
            bn2_v.to(x.device),
            bn2_w.to(x.device),
            bn2_b.to(x.device),
            training=is_training,
        )
        out = F.relu(out)

        out = F.conv2d(out, conv3_w.to(x.device), bias=None)
        out = F.batch_norm(
            out,
            bn3_m.to(x.device),
            bn3_v.to(x.device),
            bn3_w.to(x.device),
            bn3_b.to(x.device),
            training=is_training,
        )

        if downsample_conv_w is not None:
            identity = F.conv2d(
                x, downsample_conv_w.to(x.device), bias=None, stride=stride
            )
            identity = F.batch_norm(
                identity,
                downsample_bn_m.to(x.device),
                downsample_bn_v.to(x.device),
                downsample_bn_w.to(x.device),
                downsample_bn_b.to(x.device),
                training=is_training,
            )

        out += identity
        out = F.relu(out)

        return out

    # Layer 1-4
    for layer_idx in range(1, 5):
        blocks = params[f"layer{layer_idx}_blocks"]
        for block_idx in range(len(blocks)):
            block_params = blocks[block_idx]

            downsample_params = None
            if "downsample_conv_w" in block_params:
                downsample_params = [
                    block_params["downsample_conv_w"],
                    block_params["downsample_bn_w"],
                    block_params["downsample_bn_b"],
                    block_params["downsample_bn_m"],
                    block_params["downsample_bn_v"],
                ]

            x = bottleneck_fn(
                x,
                block_params["conv1_w"],
                block_params["conv2_w"],
                block_params["conv3_w"],
                block_params["bn1_w"],
                block_params["bn1_b"],
                block_params["bn1_m"],
                block_params["bn1_v"],
                block_params["bn2_w"],
                block_params["bn2_b"],
                block_params["bn2_m"],
                block_params["bn2_v"],
                block_params["bn3_w"],
                block_params["bn3_b"],
                block_params["bn3_m"],
                block_params["bn3_v"],
                *(downsample_params if downsample_params else [None] * 5),
                stride=2 if block_idx == 0 and layer_idx > 1 else 1,
                is_training=is_training,
            )

    x = F.adaptive_avg_pool2d(x, (1, 1))
    x = torch.flatten(x, 1)
    x = F.linear(x, params["fc_w"].to(x.device), params["fc_b"].to(x.device))

    return x


class Model(nn.Module):
    def __init__(self, layers, num_classes=1000):
        super(Model, self).__init__()
        self.params = nn.ParameterDict()
        in_channels = 64
        expansion = 4

        # Initial layers
        conv1 = nn.Conv2d(
            3, in_channels, kernel_size=7, stride=2, padding=3, bias=False
        )
        bn1 = nn.BatchNorm2d(in_channels)
        self.params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
        self.params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
        self.params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
        self.params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
        self.params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())

        # Layers 1-4
        channels = [64, 128, 256, 512]
        for layer_idx, (out_channels, num_blocks) in enumerate(
            zip(channels, layers), 1
        ):
            layer_blocks = []

            for block_idx in range(num_blocks):
                block_in_channels = (
                    in_channels if block_idx == 0 else out_channels * expansion
                )

                # Create block parameters
                block_params = {}

                # First block may have downsample
                if block_idx == 0 and (
                    layer_idx > 1 or block_in_channels != out_channels * expansion
                ):
                    downsample_conv = nn.Conv2d(
                        block_in_channels,
                        out_channels * expansion,
                        kernel_size=1,
                        stride=2 if layer_idx > 1 else 1,
                        bias=False,
                    )
                    downsample_bn = nn.BatchNorm2d(out_channels * expansion)

                    block_params["downsample_conv_w"] = nn.Parameter(
                        downsample_conv.weight.data.clone()
                    )
                    block_params["downsample_bn_w"] = nn.Parameter(
                        downsample_bn.weight.data.clone()
                    )
                    block_params["downsample_bn_b"] = nn.Parameter(
                        downsample_bn.bias.data.clone()
                    )
                    block_params["downsample_bn_m"] = nn.Parameter(
                        downsample_bn.running_mean.data.clone()
                    )
                    block_params["downsample_bn_v"] = nn.Parameter(
                        downsample_bn.running_var.data.clone()
                    )

                conv1 = nn.Conv2d(
                    block_in_channels, out_channels, kernel_size=1, bias=False
                )
                bn1 = nn.BatchNorm2d(out_channels)
                conv2 = nn.Conv2d(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    stride=2 if block_idx == 0 and layer_idx > 1 else 1,
                    padding=1,
                    bias=False,
                )
                bn2 = nn.BatchNorm2d(out_channels)
                conv3 = nn.Conv2d(
                    out_channels, out_channels * expansion, kernel_size=1, bias=False
                )
                bn3 = nn.BatchNorm2d(out_channels * expansion)

                block_params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
                block_params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
                block_params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
                block_params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
                block_params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())

                block_params["conv2_w"] = nn.Parameter(conv2.weight.data.clone())
                block_params["bn2_w"] = nn.Parameter(bn2.weight.data.clone())
                block_params["bn2_b"] = nn.Parameter(bn2.bias.data.clone())
                block_params["bn2_m"] = nn.Parameter(bn2.running_mean.data.clone())
                block_params["bn2_v"] = nn.Parameter(bn2.running_var.data.clone())

                block_params["conv3_w"] = nn.Parameter(conv3.weight.data.clone())
                block_params["bn3_w"] = nn.Parameter(bn3.weight.data.clone())
                block_params["bn3_b"] = nn.Parameter(bn3.bias.data.clone())
                block_params["bn3_m"] = nn.Parameter(bn3.running_mean.data.clone())
                block_params["bn3_v"] = nn.Parameter(bn3.running_var.data.clone())

                layer_blocks.append(block_params)

            self.params[f"layer{layer_idx}_blocks"] = layer_blocks
            in_channels = out_channels * expansion

        # Final FC layer
        fc = nn.Linear(512 * expansion, num_classes)
        self.params["fc_w"] = nn.Parameter(fc.weight.data.clone())
        self.params["fc_b"] = nn.Parameter(fc.bias.data.clone())

    def forward(self, x, fn=module_fn):
        return fn(x, self.params, self.training)


# Test configurations
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000


def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]


def get_init_inputs():
    return [layers, num_classes]

import torch
import torch.nn as nn
import torch.nn.functional as F

class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        """
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param stride: Stride for the first convolutional layer
        :param downsample: Downsample layer for the shortcut connection
        """
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """
        :param x: Input tensor, shape (batch_size, in_channels, height, width)
        :return: Output tensor, shape (batch_size, out_channels * expansion, height, width)
        """
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

class Model(nn.Module):
    def __init__(self, layers, num_classes=1000):
        """
        :param block: Type of block to use (BasicBlock or Bottleneck)
        :param layers: List of integers specifying the number of blocks in each layer
        :param num_classes: Number of output classes
        """
        super(Model, self).__init__()
        self.in_channels = 64

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        block = Bottleneck

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * block.expansion),
            )

        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, x):
        """
        :param x: Input tensor, shape (batch_size, 3, height, width)
        :return: Output tensor, shape (batch_size, num_classes)
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

# Test code
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000

def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]

def get_init_inputs():
    return [layers, num_classes]

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	10_ResNet101
Level ID	3
Task ID	10
Kernel Name	resnet101_fused_distr_base
CUDA Speedup (Native)	1.100x
CUDA Speedup (Compile)	1.102x
CUDA Runtime	28.099 ms
PyTorch Runtime (Native)	30.902 ms
PyTorch Runtime (Compile)	30.963 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	o3-mini-2025-01-31
Temperature	1.00

View Experiment Progress Details

Related Kernels (Level 3, Task 10 • 10_ResNet101)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	resnet101_modular_functions_base_base	23.20	1.33	1.33
🥈	resnet101_coalesced_memory_access_base	24.25	1.27	1.28
🥉	resnet101_balanced_workload_base	24.36	1.27	1.27
4	resnet101_balanced_workload_base	24.60	1.26	1.26
5	10_ResNet101_mem_opt_base_base	24.62	1.26	1.26
6	resnet101_uniform_flow_base_base	24.84	1.24	1.25
7	resnet101_shared_mem_sync_optimized_base	24.93	1.24	1.24
8	efficient_resnet_base	25.08	1.23	1.23
9	resnet101_optimized_memory_access_base	25.45	1.21	1.22
10	resnet101_unrolled_loops_base_base	25.58	1.21	1.21
11	resnet101_min_sync_relu_base	25.64	1.21	1.21
12	warp_pool_optimized_bottleneck_base	25.79	1.20	1.20
13	unified_resnet_base	26.38	1.17	1.17
14	10_ResNet101_warp_avg_pool_base	26.55	1.16	1.17
15	resnet101_minimal_sync_base_base	26.90	1.15	1.15
16	10_ResNet101	28.04	1.10	1.10
17	resnet101_fused_distr_base	28.10	1.10	1.10
18	10_resnet101_opt_aligned_mem_edit_1	29.45	1.05	1.05
19	10_resnet101_opt_min_sync_edit_1	29.58	1.04	1.05
20	10_resnet101_opt_base	29.83	1.04	1.04

#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <cuda_runtime.h>

namespace py = pybind11;

// This kernel fuses the elementwise addition of the identity and the main branch, and applies ReLU
// It evenly distributes the workload across threads and blocks using a grid-stride loop
__global__ void fused_add_relu_kernel(const float* __restrict__ identity, float* __restrict__ out, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (; idx < size; idx += stride) {
        float val = out[idx] + identity[idx];
        out[idx] = (val > 0.0f) ? val : 0.0f;
    }
}

// The bottleneck function implements the main branch and optionally a downsample branch.
// The final residual addition and activation are fused via a custom CUDA kernel to evenly distribute work.
torch::Tensor bottleneck_fn(
    torch::Tensor x,
    const torch::Tensor& conv1_w,
    const torch::Tensor& conv2_w,
    const torch::Tensor& conv3_w,
    const torch::Tensor& bn1_w,
    const torch::Tensor& bn1_b,
    const torch::Tensor& bn1_m,
    const torch::Tensor& bn1_v,
    const torch::Tensor& bn2_w,
    const torch::Tensor& bn2_b,
    const torch::Tensor& bn2_m,
    const torch::Tensor& bn2_v,
    const torch::Tensor& bn3_w,
    const torch::Tensor& bn3_b,
    const torch::Tensor& bn3_m,
    const torch::Tensor& bn3_v,
    const torch::Tensor& downsample_conv_w,
    const torch::Tensor& downsample_bn_w,
    const torch::Tensor& downsample_bn_b,
    const torch::Tensor& downsample_bn_m,
    const torch::Tensor& downsample_bn_v,
    int64_t stride,
    bool is_training
) {
    torch::Tensor identity = x;

    bool has_downsample = downsample_conv_w.defined();
    torch::Tensor downsample_out;
    if (has_downsample) {
        downsample_out = torch::conv2d(x, downsample_conv_w, /*bias=*/torch::Tensor(), stride)
            .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
        downsample_out = torch::batch_norm(
            downsample_out,
            downsample_bn_w,
            downsample_bn_b,
            downsample_bn_m,
            downsample_bn_v,
            is_training,
            /*momentum=*/0.1,
            /*eps=*/1e-5,
            /*cudnn_enabled=*/true
        );
        identity = downsample_out;
    }

    torch::Tensor out = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor())
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    out = torch::batch_norm(
        out,
        bn1_w,
        bn1_b,
        bn1_m,
        bn1_v,
        is_training,
        0.1,
        1e-5,
        true
    );
    out = torch::relu(out);

    out = torch::conv2d(out, conv2_w, /*bias=*/torch::Tensor(), stride, /*padding=*/1)
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    out = torch::batch_norm(
        out,
        bn2_w,
        bn2_b,
        bn2_m,
        bn2_v,
        is_training,
        0.1,
        1e-5,
        true
    );
    out = torch::relu(out);

    out = torch::conv2d(out, conv3_w, /*bias=*/torch::Tensor())
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    out = torch::batch_norm(
        out,
        bn3_w,
        bn3_b,
        bn3_m,
        bn3_v,
        is_training,
        0.1,
        1e-5,
        true
    );

    // Fuse the residual addition and ReLU activation using the custom CUDA kernel
    int64_t num_elements = out.numel();
    const int threads = 256;
    const int blocks = (num_elements + threads - 1) / threads;
    fused_add_relu_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        identity.data_ptr<float>(),
        out.data_ptr<float>(),
        num_elements
    );
    cudaDeviceSynchronize(); // Ensure kernel execution completes before proceeding

    return out;
}

// The forward function processes the ResNet101 architecture layer by layer.
// It prefetches all parameters on the correct device and uses the bottleneck_fn to apply each block.
// The fused kernel ensures that the residual addition and activation are distributed evenly among CUDA threads.
torch::Tensor forward(
    torch::Tensor x,
    py::object params,
    bool is_training
) {
    auto device = x.device();

    // Initial layer parameters
    auto conv1_w = params.attr("get")(py::str("conv1_w")).cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_w = params.attr("get")(py::str("bn1_w")).cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_b = params.attr("get")(py::str("bn1_b")).cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_m = params.attr("get")(py::str("bn1_m")).cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_v = params.attr("get")(py::str("bn1_v")).cast<torch::Tensor>().contiguous().to(device, true);

    x = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor(), /*stride=*/2, /*padding=*/3)
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    x = torch::batch_norm(
        x,
        bn1_w,
        bn1_b,
        bn1_m,
        bn1_v,
        is_training,
        0.1,
        1e-5,
        true
    );
    x = torch::relu(x);
    x = torch::max_pool2d(x, /*kernel_size=*/3, /*stride=*/2, /*padding=*/1);

    // Process each layer's blocks
    for (int layer_idx = 1; layer_idx <= 4; ++layer_idx) {
        std::string layer_key = "layer" + std::to_string(layer_idx) + "_blocks";
        py::list blocks = params.attr("get")(py::str(layer_key)).cast<py::list>();
        for (size_t block_idx = 0; block_idx < blocks.size(); ++block_idx) {
            py::object block_params = blocks[block_idx];

            auto conv1_w = block_params.attr("get")(py::str("conv1_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto conv2_w = block_params.attr("get")(py::str("conv2_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto conv3_w = block_params.attr("get")(py::str("conv3_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_w = block_params.attr("get")(py::str("bn1_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_b = block_params.attr("get")(py::str("bn1_b")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_m = block_params.attr("get")(py::str("bn1_m")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_v = block_params.attr("get")(py::str("bn1_v")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_w = block_params.attr("get")(py::str("bn2_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_b = block_params.attr("get")(py::str("bn2_b")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_m = block_params.attr("get")(py::str("bn2_m")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_v = block_params.attr("get")(py::str("bn2_v")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_w = block_params.attr("get")(py::str("bn3_w")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_b = block_params.attr("get")(py::str("bn3_b")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_m = block_params.attr("get")(py::str("bn3_m")).cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_v = block_params.attr("get")(py::str("bn3_v")).cast<torch::Tensor>().contiguous().to(device, true);

            torch::Tensor downsample_conv_w = torch::Tensor();
            torch::Tensor downsample_bn_w = torch::Tensor();
            torch::Tensor downsample_bn_b = torch::Tensor();
            torch::Tensor downsample_bn_m = torch::Tensor();
            torch::Tensor downsample_bn_v = torch::Tensor();
            
            if (py::bool_(block_params.attr("__contains__")(py::str("downsample_conv_w")))) {
                downsample_conv_w = block_params.attr("get")(py::str("downsample_conv_w")).cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_w = block_params.attr("get")(py::str("downsample_bn_w")).cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_b = block_params.attr("get")(py::str("downsample_bn_b")).cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_m = block_params.attr("get")(py::str("downsample_bn_m")).cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_v = block_params.attr("get")(py::str("downsample_bn_v")).cast<torch::Tensor>().contiguous().to(device, true);
            }

            int64_t stride = (block_idx == 0 && layer_idx > 1) ? 2 : 1;
            x = bottleneck_fn(
                x,
                conv1_w,
                conv2_w,
                conv3_w,
                bn1_w,
                bn1_b,
                bn1_m,
                bn1_v,
                bn2_w,
                bn2_b,
                bn2_m,
                bn2_v,
                bn3_w,
                bn3_b,
                bn3_m,
                bn3_v,
                downsample_conv_w,
                downsample_bn_w,
                downsample_bn_b,
                downsample_bn_m,
                downsample_bn_v,
                stride,
                is_training
            );
        }
    }

    x = torch::adaptive_avg_pool2d(x, {1, 1}).contiguous();
    x = x.view({x.size(0), -1});

    auto fc_w = params.attr("get")(py::str("fc_w")).cast<torch::Tensor>().contiguous().to(device, true);
    auto fc_b = params.attr("get")(py::str("fc_b")).cast<torch::Tensor>().contiguous().to(device, true);
    return torch::linear(x, fc_w, fc_b);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "ResNet101 forward function with fused addition and ReLU");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples
Executed Ipc Active	2.026	inst/cycle	0.002	5
Executed Ipc Elapsed	1.624	inst/cycle	0.007	5
Issue Slots Busy	51.182	%	1.069	5
Issued Ipc Active	2.046	inst/cycle	0.002	5
SM Busy	51.182	%	1.069	5
Memory Throughput	1555965485156.598	byte/second	12865757325163071799296.000	5
Mem Busy	33.272	%	3.706	5
Max Bandwidth	46.618	%	11.119	5
L1/TEX Hit Rate	33.278	%	0.002	5
L2 Hit Rate	36.298	%	0.188	5
Mem Pipes Busy	41.030	%	3.968	5
Warp Cycles Per Issued Instruction	24.240	cycle	0.056	5
Warp Cycles Per Executed Instruction	24.464	cycle	0.082	5
Avg. Active Threads Per Warp	32.000		0.000	5
Avg. Not Predicated Off Threads Per Warp	28.000		0.000	5
Max Active Clusters	0.000	cluster	0.000	5
Max Cluster Size	8.000	block	0.000	5
Overall GPU Occupancy	0.000	%	0.000	5
Cluster Occupancy	0.000	%	0.000	5
Block Limit SM	32.000	block	0.000	5
Block Limit Registers	8.000	block	0.000	5
Block Limit Shared Mem	32.000	block	0.000	5
Block Limit Warps	8.000	block	0.000	5
Theoretical Active Warps per SM	64.000	warp	0.000	5
Theoretical Occupancy	100.000	%	0.000	5
Achieved Occupancy	78.476	%	0.062	5
Achieved Active Warps Per SM	50.224	warp	0.026	5

Analysis Rules

Rule	Description
INF HighPipeUtilization	ALU is the highest-utilized pipeline (32.4%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF CPIStall	Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy	This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (78.5%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.

Operation / Metric	Value	Unit
aten::to
CPU Time	5755936.10	μs
Device Time	2899889.06	μs
Self CPU Time	100871.29	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_to_copy
CPU Time	5655064.81	μs
Device Time	2899889.06	μs
Self CPU Time	306699.69	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::copy_
CPU Time	6391356.89	μs
Device Time	2899889.06	μs
Self CPU Time	1902843.96	μs
Self Device Time	2899889.06	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaMemcpyAsync
CPU Time	4488403.88	μs
Device Time	0.00	μs
Self CPU Time	4488403.88	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
Memcpy HtoD (Pageable -> Device)
CPU Time	0.00	μs
Device Time	2899890.95	μs
Self CPU Time	0.00	μs
Self Device Time	2899890.95	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::clone
CPU Time	1661466.33	μs
Device Time	0.00	μs
Self CPU Time	6945.93	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::conv2d
CPU Time	1106288.10	μs
Device Time	604797.11	μs
Self CPU Time	44392.89	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45300 warnings generated when compiling for host.
Suppressed 45337 warnings (45290 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:12:15 bugprone-narrowing-conversions

12 | int idx = blockIdx.x * blockDim.x + threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:13:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

13 | int stride = blockDim.x * gridDim.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:23:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

23 | torch::Tensor x,

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:26:5: warning: 2 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]

26 | const torch::Tensor& conv3_w,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~

27 | const torch::Tensor& bn1_w,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:26:26: note: the first parameter in the range is 'conv3_w'

26 | const torch::Tensor& conv3_w,

| ^~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:27:26: note: the last parameter in the range is 'bn1_w'

27 | const torch::Tensor& bn1_w,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:30:5: warning: 2 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]

30 | const torch::Tensor& bn1_v,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~

31 | const torch::Tensor& bn2_w,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:30:26: note: the first parameter in the range is 'bn1_v'

30 | const torch::Tensor& bn1_v,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:31:26: note: the last parameter in the range is 'bn2_w'

31 | const torch::Tensor& bn2_w,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:34:5: warning: 2 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]

34 | const torch::Tensor& bn2_v,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~

35 | const torch::Tensor& bn3_w,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:34:26: note: the first parameter in the range is 'bn2_v'

34 | const torch::Tensor& bn2_v,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:35:26: note: the last parameter in the range is 'bn3_w'

35 | const torch::Tensor& bn3_w,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:38:5: warning: 3 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]

38 | const torch::Tensor& bn3_v,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~

39 | const torch::Tensor& downsample_conv_w,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

40 | const torch::Tensor& downsample_bn_w,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:38:26: note: the first parameter in the range is 'bn3_v'

38 | const torch::Tensor& bn3_v,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:40:26: note: the last parameter in the range is 'downsample_bn_w'

40 | const torch::Tensor& downsample_bn_w,

| ^~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:115:24: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

115 | const int blocks = (num_elements + threads - 1) / threads;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:119:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

119 | num_elements

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b7_s1_resnet101_fused_distr/base/base.cu:131:16: warning: the parameter 'params' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

131 | py::object params,

| ^

| const &

The AI CUDA Engineer 👷

`10_ResNet101` • `resnet101_fused_distr_base`

Kernel Information

Related Kernels (Level 3, Task 10 • 10_ResNet101)

The AI CUDA Engineer 👷

10_ResNet101 • resnet101_fused_distr_base

Kernel Information

Related Kernels (Level 3, Task 10 • 10_ResNet101)

`10_ResNet101` • `resnet101_fused_distr_base`