← Back to Leaderboard

The AI CUDA Engineer 👷

10_ResNet101warp_pool_optimized_bottleneck_base

Level 3 • Task 10
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor, params: nn.ParameterDict, is_training: bool
) -> torch.Tensor:
    """
    Implements the ResNet101 module.

    Args:
        x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
        params (nn.ParameterDict): Dictionary of parameters
        is_training (bool): Whether to use training mode

    Returns:
        torch.Tensor: Output tensor, shape (batch_size, num_classes)
    """
    # Initial layers
    x = F.conv2d(x, params["conv1_w"].to(x.device), bias=None, stride=2, padding=3)
    x = F.batch_norm(
        x,
        params["bn1_m"].to(x.device),
        params["bn1_v"].to(x.device),
        params["bn1_w"].to(x.device),
        params["bn1_b"].to(x.device),
        training=is_training,
    )
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)

    def bottleneck_fn(
        x,
        conv1_w,
        conv2_w,
        conv3_w,
        bn1_w,
        bn1_b,
        bn1_m,
        bn1_v,
        bn2_w,
        bn2_b,
        bn2_m,
        bn2_v,
        bn3_w,
        bn3_b,
        bn3_m,
        bn3_v,
        downsample_conv_w=None,
        downsample_bn_w=None,
        downsample_bn_b=None,
        downsample_bn_m=None,
        downsample_bn_v=None,
        stride=1,
        is_training=True,
    ):
        identity = x

        out = F.conv2d(x, conv1_w.to(x.device), bias=None)
        out = F.batch_norm(
            out,
            bn1_m.to(x.device),
            bn1_v.to(x.device),
            bn1_w.to(x.device),
            bn1_b.to(x.device),
            training=is_training,
        )
        out = F.relu(out)

        out = F.conv2d(out, conv2_w.to(x.device), bias=None, stride=stride, padding=1)
        out = F.batch_norm(
            out,
            bn2_m.to(x.device),
            bn2_v.to(x.device),
            bn2_w.to(x.device),
            bn2_b.to(x.device),
            training=is_training,
        )
        out = F.relu(out)

        out = F.conv2d(out, conv3_w.to(x.device), bias=None)
        out = F.batch_norm(
            out,
            bn3_m.to(x.device),
            bn3_v.to(x.device),
            bn3_w.to(x.device),
            bn3_b.to(x.device),
            training=is_training,
        )

        if downsample_conv_w is not None:
            identity = F.conv2d(
                x, downsample_conv_w.to(x.device), bias=None, stride=stride
            )
            identity = F.batch_norm(
                identity,
                downsample_bn_m.to(x.device),
                downsample_bn_v.to(x.device),
                downsample_bn_w.to(x.device),
                downsample_bn_b.to(x.device),
                training=is_training,
            )

        out += identity
        out = F.relu(out)

        return out

    # Layer 1-4
    for layer_idx in range(1, 5):
        blocks = params[f"layer{layer_idx}_blocks"]
        for block_idx in range(len(blocks)):
            block_params = blocks[block_idx]

            downsample_params = None
            if "downsample_conv_w" in block_params:
                downsample_params = [
                    block_params["downsample_conv_w"],
                    block_params["downsample_bn_w"],
                    block_params["downsample_bn_b"],
                    block_params["downsample_bn_m"],
                    block_params["downsample_bn_v"],
                ]

            x = bottleneck_fn(
                x,
                block_params["conv1_w"],
                block_params["conv2_w"],
                block_params["conv3_w"],
                block_params["bn1_w"],
                block_params["bn1_b"],
                block_params["bn1_m"],
                block_params["bn1_v"],
                block_params["bn2_w"],
                block_params["bn2_b"],
                block_params["bn2_m"],
                block_params["bn2_v"],
                block_params["bn3_w"],
                block_params["bn3_b"],
                block_params["bn3_m"],
                block_params["bn3_v"],
                *(downsample_params if downsample_params else [None] * 5),
                stride=2 if block_idx == 0 and layer_idx > 1 else 1,
                is_training=is_training,
            )

    x = F.adaptive_avg_pool2d(x, (1, 1))
    x = torch.flatten(x, 1)
    x = F.linear(x, params["fc_w"].to(x.device), params["fc_b"].to(x.device))

    return x


class Model(nn.Module):
    def __init__(self, layers, num_classes=1000):
        super(Model, self).__init__()
        self.params = nn.ParameterDict()
        in_channels = 64
        expansion = 4

        # Initial layers
        conv1 = nn.Conv2d(
            3, in_channels, kernel_size=7, stride=2, padding=3, bias=False
        )
        bn1 = nn.BatchNorm2d(in_channels)
        self.params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
        self.params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
        self.params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
        self.params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
        self.params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())

        # Layers 1-4
        channels = [64, 128, 256, 512]
        for layer_idx, (out_channels, num_blocks) in enumerate(
            zip(channels, layers), 1
        ):
            layer_blocks = []

            for block_idx in range(num_blocks):
                block_in_channels = (
                    in_channels if block_idx == 0 else out_channels * expansion
                )

                # Create block parameters
                block_params = {}

                # First block may have downsample
                if block_idx == 0 and (
                    layer_idx > 1 or block_in_channels != out_channels * expansion
                ):
                    downsample_conv = nn.Conv2d(
                        block_in_channels,
                        out_channels * expansion,
                        kernel_size=1,
                        stride=2 if layer_idx > 1 else 1,
                        bias=False,
                    )
                    downsample_bn = nn.BatchNorm2d(out_channels * expansion)

                    block_params["downsample_conv_w"] = nn.Parameter(
                        downsample_conv.weight.data.clone()
                    )
                    block_params["downsample_bn_w"] = nn.Parameter(
                        downsample_bn.weight.data.clone()
                    )
                    block_params["downsample_bn_b"] = nn.Parameter(
                        downsample_bn.bias.data.clone()
                    )
                    block_params["downsample_bn_m"] = nn.Parameter(
                        downsample_bn.running_mean.data.clone()
                    )
                    block_params["downsample_bn_v"] = nn.Parameter(
                        downsample_bn.running_var.data.clone()
                    )

                conv1 = nn.Conv2d(
                    block_in_channels, out_channels, kernel_size=1, bias=False
                )
                bn1 = nn.BatchNorm2d(out_channels)
                conv2 = nn.Conv2d(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    stride=2 if block_idx == 0 and layer_idx > 1 else 1,
                    padding=1,
                    bias=False,
                )
                bn2 = nn.BatchNorm2d(out_channels)
                conv3 = nn.Conv2d(
                    out_channels, out_channels * expansion, kernel_size=1, bias=False
                )
                bn3 = nn.BatchNorm2d(out_channels * expansion)

                block_params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
                block_params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
                block_params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
                block_params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
                block_params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())

                block_params["conv2_w"] = nn.Parameter(conv2.weight.data.clone())
                block_params["bn2_w"] = nn.Parameter(bn2.weight.data.clone())
                block_params["bn2_b"] = nn.Parameter(bn2.bias.data.clone())
                block_params["bn2_m"] = nn.Parameter(bn2.running_mean.data.clone())
                block_params["bn2_v"] = nn.Parameter(bn2.running_var.data.clone())

                block_params["conv3_w"] = nn.Parameter(conv3.weight.data.clone())
                block_params["bn3_w"] = nn.Parameter(bn3.weight.data.clone())
                block_params["bn3_b"] = nn.Parameter(bn3.bias.data.clone())
                block_params["bn3_m"] = nn.Parameter(bn3.running_mean.data.clone())
                block_params["bn3_v"] = nn.Parameter(bn3.running_var.data.clone())

                layer_blocks.append(block_params)

            self.params[f"layer{layer_idx}_blocks"] = layer_blocks
            in_channels = out_channels * expansion

        # Final FC layer
        fc = nn.Linear(512 * expansion, num_classes)
        self.params["fc_w"] = nn.Parameter(fc.weight.data.clone())
        self.params["fc_b"] = nn.Parameter(fc.bias.data.clone())

    def forward(self, x, fn=module_fn):
        return fn(x, self.params, self.training)


# Test configurations
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000


def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]


def get_init_inputs():
    return [layers, num_classes]
import torch
import torch.nn as nn
import torch.nn.functional as F

class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        """
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param stride: Stride for the first convolutional layer
        :param downsample: Downsample layer for the shortcut connection
        """
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """
        :param x: Input tensor, shape (batch_size, in_channels, height, width)
        :return: Output tensor, shape (batch_size, out_channels * expansion, height, width)
        """
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

class Model(nn.Module):
    def __init__(self, layers, num_classes=1000):
        """
        :param block: Type of block to use (BasicBlock or Bottleneck)
        :param layers: List of integers specifying the number of blocks in each layer
        :param num_classes: Number of output classes
        """
        super(Model, self).__init__()
        self.in_channels = 64

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        block = Bottleneck

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * block.expansion),
            )

        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, x):
        """
        :param x: Input tensor, shape (batch_size, 3, height, width)
        :return: Output tensor, shape (batch_size, num_classes)
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

# Test code
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000

def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]

def get_init_inputs():
    return [layers, num_classes]

Kernel Information

Related Kernels (Level 3, Task 10 • 10_ResNet101)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 resnet101_modular_functions_base_base 23.20 1.33 1.33
🥈 resnet101_coalesced_memory_access_base 24.25 1.27 1.28
🥉 resnet101_balanced_workload_base 24.36 1.27 1.27
4 resnet101_balanced_workload_base 24.60 1.26 1.26
5 10_ResNet101_mem_opt_base_base 24.62 1.26 1.26
6 resnet101_uniform_flow_base_base 24.84 1.24 1.25
7 resnet101_shared_mem_sync_optimized_base 24.93 1.24 1.24
8 efficient_resnet_base 25.08 1.23 1.23
9 resnet101_optimized_memory_access_base 25.45 1.21 1.22
10 resnet101_unrolled_loops_base_base 25.58 1.21 1.21
11 resnet101_min_sync_relu_base 25.64 1.21 1.21
12 warp_pool_optimized_bottleneck_base 25.79 1.20 1.20
13 unified_resnet_base 26.38 1.17 1.17
14 10_ResNet101_warp_avg_pool_base 26.55 1.16 1.17
15 resnet101_minimal_sync_base_base 26.90 1.15 1.15
16 10_ResNet101 28.04 1.10 1.10
17 resnet101_fused_distr_base 28.10 1.10 1.10
18 10_resnet101_opt_aligned_mem_edit_1 29.45 1.05 1.05
19 10_resnet101_opt_min_sync_edit_1 29.58 1.04 1.05
20 10_resnet101_opt_base 29.83 1.04 1.04
#include <torch/extension.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <cuda_runtime.h>

namespace py = pybind11;

__global__ void warp_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int spatial, int C, int H, int W) {
    int index = blockIdx.x;
    int n = index / C;
    int c = index % C;
    int total = H * W;
    const float* in_ptr = input + (n * C + c) * total;
    float sum = 0.0f;
    
    for (int i = threadIdx.x; i < total; i += blockDim.x) {
        sum += in_ptr[i];
    }
    
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        sum += __shfl_down_sync(0xffffffff, sum, offset);
    }

    if (threadIdx.x == 0) {
        output[index] = sum / total;
    }
}

torch::Tensor unified_bottleneck(
    torch::Tensor x,
    torch::Tensor conv1_w,
    torch::Tensor conv2_w,
    torch::Tensor conv3_w,
    torch::Tensor bn1_w,
    torch::Tensor bn1_b,
    torch::Tensor bn1_m,
    torch::Tensor bn1_v,
    torch::Tensor bn2_w,
    torch::Tensor bn2_b,
    torch::Tensor bn2_m,
    torch::Tensor bn2_v,
    torch::Tensor bn3_w,
    torch::Tensor bn3_b,
    torch::Tensor bn3_m,
    torch::Tensor bn3_v,
    bool has_downsample,
    torch::Tensor downsample_conv_w,
    torch::Tensor downsample_bn_w,
    torch::Tensor downsample_bn_b,
    torch::Tensor downsample_bn_m,
    torch::Tensor downsample_bn_v,
    int64_t stride,
    bool is_training
) {
    auto out = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor());
    out = torch::batch_norm(out, bn1_w, bn1_b, bn1_m, bn1_v, is_training, 0.1, 1e-5, true);
    out = torch::relu(out);
    
    out = torch::conv2d(out, conv2_w, /*bias=*/torch::Tensor(), stride, 1);
    out = torch::batch_norm(out, bn2_w, bn2_b, bn2_m, bn2_v, is_training, 0.1, 1e-5, true);
    out = torch::relu(out);
    
    out = torch::conv2d(out, conv3_w, /*bias=*/torch::Tensor());
    out = torch::batch_norm(out, bn3_w, bn3_b, bn3_m, bn3_v, is_training, 0.1, 1e-5, true);
    
    torch::Tensor identity;
    if (has_downsample) {
        identity = torch::conv2d(x, downsample_conv_w, /*bias=*/torch::Tensor(), stride);
        identity = torch::batch_norm(identity, downsample_bn_w, downsample_bn_b, downsample_bn_m, downsample_bn_v, is_training, 0.1, 1e-5, true);
    } else {
        identity = x.to(out.dtype());
    }
    
    return torch::relu(out + identity);
}

torch::Tensor forward(torch::Tensor x, py::object params, bool is_training) {
    auto device = x.device();
    
    auto conv1_w = params.attr("get")("conv1_w").cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_w = params.attr("get")("bn1_w").cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_b = params.attr("get")("bn1_b").cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_m = params.attr("get")("bn1_m").cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_v = params.attr("get")("bn1_v").cast<torch::Tensor>().contiguous().to(device, true);
    
    x = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor(), 2, 3);
    x = torch::batch_norm(x, bn1_w, bn1_b, bn1_m, bn1_v, is_training, 0.1, 1e-5, true);
    x = torch::relu(x);
    x = torch::max_pool2d(x, 3, 2, 1);
    
    for (int layer_idx = 1; layer_idx <= 4; ++layer_idx) {
        std::string layer_key = "layer" + std::to_string(layer_idx) + "_blocks";
        py::list blocks = params.attr("get")(py::str(layer_key)).cast<py::list>();
        
        for (size_t block_idx = 0; block_idx < blocks.size(); ++block_idx) {
            py::object block = blocks[block_idx];
            
            auto conv1_w_blk = block.attr("get")("conv1_w").cast<torch::Tensor>().contiguous().to(device, true);
            auto conv2_w_blk = block.attr("get")("conv2_w").cast<torch::Tensor>().contiguous().to(device, true);
            auto conv3_w_blk = block.attr("get")("conv3_w").cast<torch::Tensor>().contiguous().to(device, true);
            
            auto bn1_w_blk = block.attr("get")("bn1_w").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_b_blk = block.attr("get")("bn1_b").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_m_blk = block.attr("get")("bn1_m").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_v_blk = block.attr("get")("bn1_v").cast<torch::Tensor>().contiguous().to(device, true);
            
            auto bn2_w_blk = block.attr("get")("bn2_w").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_b_blk = block.attr("get")("bn2_b").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_m_blk = block.attr("get")("bn2_m").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_v_blk = block.attr("get")("bn2_v").cast<torch::Tensor>().contiguous().to(device, true);
            
            auto bn3_w_blk = block.attr("get")("bn3_w").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_b_blk = block.attr("get")("bn3_b").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_m_blk = block.attr("get")("bn3_m").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_v_blk = block.attr("get")("bn3_v").cast<torch::Tensor>().contiguous().to(device, true);
            
            bool has_downsample = py::bool_(block.attr("__contains__")(py::str("downsample_conv_w")));
            torch::Tensor downsample_conv_w, downsample_bn_w, downsample_bn_b, downsample_bn_m, downsample_bn_v;
            if (has_downsample) {
                downsample_conv_w = block.attr("get")("downsample_conv_w").cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_w = block.attr("get")("downsample_bn_w").cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_b = block.attr("get")("downsample_bn_b").cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_m = block.attr("get")("downsample_bn_m").cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_v = block.attr("get")("downsample_bn_v").cast<torch::Tensor>().contiguous().to(device, true);
            }
            
            int64_t stride = (block_idx == 0 && layer_idx > 1) ? 2 : 1;
            
            x = unified_bottleneck(
                x,
                conv1_w_blk, conv2_w_blk, conv3_w_blk,
                bn1_w_blk, bn1_b_blk, bn1_m_blk, bn1_v_blk,
                bn2_w_blk, bn2_b_blk, bn2_m_blk, bn2_v_blk,
                bn3_w_blk, bn3_b_blk, bn3_m_blk, bn3_v_blk,
                has_downsample,
                downsample_conv_w, downsample_bn_w, downsample_bn_b, downsample_bn_m, downsample_bn_v,
                stride, is_training
            );
        }
    }
    
    auto sizes = x.sizes();
    int N = sizes[0];
    int C = sizes[1];
    int H = sizes[2];
    int W = sizes[3];
    int spatial = H * W;
    auto pooled = torch::empty({N, C, 1, 1}, x.options());
    
    int grid = N * C;
    int threads = 32; 
    warp_avg_pool_kernel<<<grid, threads>>>(x.data_ptr<float>(), pooled.data_ptr<float>(), spatial, C, H, W);
    cudaDeviceSynchronize();
    
    x = pooled.view({N, C});
    
    auto fc_w = params.attr("get")("fc_w").cast<torch::Tensor>().contiguous().to(device, true);
    auto fc_b = params.attr("get")("fc_b").cast<torch::Tensor>().contiguous().to(device, true);
    return torch::linear(x, fc_w, fc_b);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Optimized ResNet101 with warp pooling and streamlined params");
}
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 0.726 inst/cycle 0.000 5
Executed Ipc Elapsed 0.560 inst/cycle 0.000 5
Issue Slots Busy 18.428 % 0.053 5
Issued Ipc Active 0.736 inst/cycle 0.000 5
SM Busy 18.428 % 0.053 5
Memory Throughput 258432560647.092 byte/second 499909539194520384.000 5
Mem Busy 7.828 % 0.001 5
Max Bandwidth 7.730 % 0.000 5
L1/TEX Hit Rate 9.922 % 0.000 5
L2 Hit Rate 40.640 % 0.007 5
Mem Pipes Busy 5.682 % 0.000 5
Warp Cycles Per Issued Instruction 17.286 cycle 0.015 5
Warp Cycles Per Executed Instruction 17.628 cycle 0.020 5
Avg. Active Threads Per Warp 26.080 0.000 5
Avg. Not Predicated Off Threads Per Warp 24.990 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 128.000 block 0.000 5
Block Limit Shared Mem 32.000 block 0.000 5
Block Limit Warps 64.000 block 0.000 5
Theoretical Active Warps per SM 32.000 warp 0.000 5
Theoretical Occupancy 50.000 % 0.000 5
Achieved Occupancy 20.382 % 0.021 5
Achieved Active Warps Per SM 13.044 warp 0.009 5
Analysis Rules
Rule Description
WRN HighPipeUtilization All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared memory. The difference between calculated theoretical (50.0%) and measured achieved occupancy (20.2%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
Operation / Metric Value Unit
aten::to
CPU Time 5043381.72 μs
Device Time 2434767.38 μs
Self CPU Time 86332.07 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 4957049.64 μs
Device Time 2434767.38 μs
Self CPU Time 284941.56 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::copy_
CPU Time 5246972.96 μs
Device Time 2434767.38 μs
Self CPU Time 1318476.41 μs
Self Device Time 2434767.38 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaMemcpyAsync
CPU Time 3928388.91 μs
Device Time 0.00 μs
Self CPU Time 3928388.91 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Memcpy HtoD (Pageable -> Device)
CPU Time 0.00 μs
Device Time 2434769.05 μs
Self CPU Time 0.00 μs
Self Device Time 2434769.05 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::conv2d
CPU Time 1098533.59 μs
Device Time 534840.73 μs
Self CPU Time 40168.28 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45295 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:8:99 bugprone-easily-swappable-parameters
8 | __global__ void warp_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int spatial, int C, int H, int W) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:8:103: note: the first parameter in the range is 'spatial'
8 | __global__ void warp_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int spatial, int C, int H, int W) {
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:8:123: note: the last parameter in the range is 'H'
8 | __global__ void warp_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int spatial, int C, int H, int W) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:9:17: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
9 | int index = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:13:27: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
13 | const float* in_ptr = input + (n * C + c) * total;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:13:35: note: make conversion explicit to silence this warning
5 |
6 | namespace py = pybind11;
7 |
8 | __global__ void warp_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int spatial, int C, int H, int W) {
9 | int index = blockIdx.x;
10 | int n = index / C;
11 | int c = index % C;
12 | int total = H * W;
13 | const float* in_ptr = input + (n * C + c) * total;
| ^~~~~~~~~~~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:13:35: note: perform multiplication in a wider type
13 | const float* in_ptr = input + (n * C + c) * total;
| ^~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:16:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
16 | for (int i = threadIdx.x; i < total; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:16:47: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
16 | for (int i = threadIdx.x; i < total; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:25:31: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
25 | output[index] = sum / total;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:30:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
30 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:31:19: warning: the parameter 'conv1_w' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
31 | torch::Tensor conv1_w,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:32:19: warning: the parameter 'conv2_w' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
32 | torch::Tensor conv2_w,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:33:5: warning: 2 adjacent parameters of 'unified_bottleneck' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
33 | torch::Tensor conv3_w,
| ^~~~~~~~~~~~~~~~~~~~~~
34 | torch::Tensor bn1_w,
| ~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:33:19: note: the first parameter in the range is 'conv3_w'
33 | torch::Tensor conv3_w,
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:34:19: note: the last parameter in the range is 'bn1_w'
34 | torch::Tensor bn1_w,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:33:19: warning: the parameter 'conv3_w' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
33 | torch::Tensor conv3_w,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:37:5: warning: 2 adjacent parameters of 'unified_bottleneck' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
37 | torch::Tensor bn1_v,
| ^~~~~~~~~~~~~~~~~~~~
38 | torch::Tensor bn2_w,
| ~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:37:19: note: the first parameter in the range is 'bn1_v'
37 | torch::Tensor bn1_v,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:38:19: note: the last parameter in the range is 'bn2_w'
38 | torch::Tensor bn2_w,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:41:5: warning: 2 adjacent parameters of 'unified_bottleneck' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
41 | torch::Tensor bn2_v,
| ^~~~~~~~~~~~~~~~~~~~
42 | torch::Tensor bn3_w,
| ~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:41:19: note: the first parameter in the range is 'bn2_v'
41 | torch::Tensor bn2_v,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:42:19: note: the last parameter in the range is 'bn3_w'
42 | torch::Tensor bn3_w,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:47:5: warning: 2 adjacent parameters of 'unified_bottleneck' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
47 | torch::Tensor downsample_conv_w,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
48 | torch::Tensor downsample_bn_w,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:47:19: note: the first parameter in the range is 'downsample_conv_w'
47 | torch::Tensor downsample_conv_w,
| ^~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:48:19: note: the last parameter in the range is 'downsample_bn_w'
48 | torch::Tensor downsample_bn_w,
| ^~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:47:19: warning: the parameter 'downsample_conv_w' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
47 | torch::Tensor downsample_conv_w,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:77:51: warning: the parameter 'params' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
77 | torch::Tensor forward(torch::Tensor x, py::object params, bool is_training) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:143:13: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
143 | int N = sizes[0];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:144:13: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
144 | int C = sizes[1];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:145:13: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
145 | int H = sizes[2];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:146:13: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
146 | int W = sizes[3];
| ^