Kernel Details - warp_pool_optimized_bottleneck

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor, params: nn.ParameterDict, is_training: bool
) -> torch.Tensor:
    """
    Implements the ResNet101 module.

    Args:
        x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
        params (nn.ParameterDict): Dictionary of parameters
        is_training (bool): Whether to use training mode

    Returns:
        torch.Tensor: Output tensor, shape (batch_size, num_classes)
    """
    # Initial layers
    x = F.conv2d(x, params["conv1_w"].to(x.device), bias=None, stride=2, padding=3)
    x = F.batch_norm(
        x,
        params["bn1_m"].to(x.device),
        params["bn1_v"].to(x.device),
        params["bn1_w"].to(x.device),
        params["bn1_b"].to(x.device),
        training=is_training,
    )
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)

    def bottleneck_fn(
        x,
        conv1_w,
        conv2_w,
        conv3_w,
        bn1_w,
        bn1_b,
        bn1_m,
        bn1_v,
        bn2_w,
        bn2_b,
        bn2_m,
        bn2_v,
        bn3_w,
        bn3_b,
        bn3_m,
        bn3_v,
        downsample_conv_w=None,
        downsample_bn_w=None,
        downsample_bn_b=None,
        downsample_bn_m=None,
        downsample_bn_v=None,
        stride=1,
        is_training=True,
    ):
        identity = x

        out = F.conv2d(x, conv1_w.to(x.device), bias=None)
        out = F.batch_norm(
            out,
            bn1_m.to(x.device),
            bn1_v.to(x.device),
            bn1_w.to(x.device),
            bn1_b.to(x.device),
            training=is_training,
        )
        out = F.relu(out)

        out = F.conv2d(out, conv2_w.to(x.device), bias=None, stride=stride, padding=1)
        out = F.batch_norm(
            out,
            bn2_m.to(x.device),
            bn2_v.to(x.device),
            bn2_w.to(x.device),
            bn2_b.to(x.device),
            training=is_training,
        )
        out = F.relu(out)

        out = F.conv2d(out, conv3_w.to(x.device), bias=None)
        out = F.batch_norm(
            out,
            bn3_m.to(x.device),
            bn3_v.to(x.device),
            bn3_w.to(x.device),
            bn3_b.to(x.device),
            training=is_training,
        )

        if downsample_conv_w is not None:
            identity = F.conv2d(
                x, downsample_conv_w.to(x.device), bias=None, stride=stride
            )
            identity = F.batch_norm(
                identity,
                downsample_bn_m.to(x.device),
                downsample_bn_v.to(x.device),
                downsample_bn_w.to(x.device),
                downsample_bn_b.to(x.device),
                training=is_training,
            )

        out += identity
        out = F.relu(out)

        return out

    # Layer 1-4
    for layer_idx in range(1, 5):
        blocks = params[f"layer{layer_idx}_blocks"]
        for block_idx in range(len(blocks)):
            block_params = blocks[block_idx]

            downsample_params = None
            if "downsample_conv_w" in block_params:
                downsample_params = [
                    block_params["downsample_conv_w"],
                    block_params["downsample_bn_w"],
                    block_params["downsample_bn_b"],
                    block_params["downsample_bn_m"],
                    block_params["downsample_bn_v"],
                ]

            x = bottleneck_fn(
                x,
                block_params["conv1_w"],
                block_params["conv2_w"],
                block_params["conv3_w"],
                block_params["bn1_w"],
                block_params["bn1_b"],
                block_params["bn1_m"],
                block_params["bn1_v"],
                block_params["bn2_w"],
                block_params["bn2_b"],
                block_params["bn2_m"],
                block_params["bn2_v"],
                block_params["bn3_w"],
                block_params["bn3_b"],
                block_params["bn3_m"],
                block_params["bn3_v"],
                *(downsample_params if downsample_params else [None] * 5),
                stride=2 if block_idx == 0 and layer_idx > 1 else 1,
                is_training=is_training,
            )

    x = F.adaptive_avg_pool2d(x, (1, 1))
    x = torch.flatten(x, 1)
    x = F.linear(x, params["fc_w"].to(x.device), params["fc_b"].to(x.device))

    return x


class Model(nn.Module):
    def __init__(self, layers, num_classes=1000):
        super(Model, self).__init__()
        self.params = nn.ParameterDict()
        in_channels = 64
        expansion = 4

        # Initial layers
        conv1 = nn.Conv2d(
            3, in_channels, kernel_size=7, stride=2, padding=3, bias=False
        )
        bn1 = nn.BatchNorm2d(in_channels)
        self.params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
        self.params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
        self.params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
        self.params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
        self.params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())

        # Layers 1-4
        channels = [64, 128, 256, 512]
        for layer_idx, (out_channels, num_blocks) in enumerate(
            zip(channels, layers), 1
        ):
            layer_blocks = []

            for block_idx in range(num_blocks):
                block_in_channels = (
                    in_channels if block_idx == 0 else out_channels * expansion
                )

                # Create block parameters
                block_params = {}

                # First block may have downsample
                if block_idx == 0 and (
                    layer_idx > 1 or block_in_channels != out_channels * expansion
                ):
                    downsample_conv = nn.Conv2d(
                        block_in_channels,
                        out_channels * expansion,
                        kernel_size=1,
                        stride=2 if layer_idx > 1 else 1,
                        bias=False,
                    )
                    downsample_bn = nn.BatchNorm2d(out_channels * expansion)

                    block_params["downsample_conv_w"] = nn.Parameter(
                        downsample_conv.weight.data.clone()
                    )
                    block_params["downsample_bn_w"] = nn.Parameter(
                        downsample_bn.weight.data.clone()
                    )
                    block_params["downsample_bn_b"] = nn.Parameter(
                        downsample_bn.bias.data.clone()
                    )
                    block_params["downsample_bn_m"] = nn.Parameter(
                        downsample_bn.running_mean.data.clone()
                    )
                    block_params["downsample_bn_v"] = nn.Parameter(
                        downsample_bn.running_var.data.clone()
                    )

                conv1 = nn.Conv2d(
                    block_in_channels, out_channels, kernel_size=1, bias=False
                )
                bn1 = nn.BatchNorm2d(out_channels)
                conv2 = nn.Conv2d(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    stride=2 if block_idx == 0 and layer_idx > 1 else 1,
                    padding=1,
                    bias=False,
                )
                bn2 = nn.BatchNorm2d(out_channels)
                conv3 = nn.Conv2d(
                    out_channels, out_channels * expansion, kernel_size=1, bias=False
                )
                bn3 = nn.BatchNorm2d(out_channels * expansion)

                block_params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
                block_params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
                block_params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
                block_params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
                block_params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())

                block_params["conv2_w"] = nn.Parameter(conv2.weight.data.clone())
                block_params["bn2_w"] = nn.Parameter(bn2.weight.data.clone())
                block_params["bn2_b"] = nn.Parameter(bn2.bias.data.clone())
                block_params["bn2_m"] = nn.Parameter(bn2.running_mean.data.clone())
                block_params["bn2_v"] = nn.Parameter(bn2.running_var.data.clone())

                block_params["conv3_w"] = nn.Parameter(conv3.weight.data.clone())
                block_params["bn3_w"] = nn.Parameter(bn3.weight.data.clone())
                block_params["bn3_b"] = nn.Parameter(bn3.bias.data.clone())
                block_params["bn3_m"] = nn.Parameter(bn3.running_mean.data.clone())
                block_params["bn3_v"] = nn.Parameter(bn3.running_var.data.clone())

                layer_blocks.append(block_params)

            self.params[f"layer{layer_idx}_blocks"] = layer_blocks
            in_channels = out_channels * expansion

        # Final FC layer
        fc = nn.Linear(512 * expansion, num_classes)
        self.params["fc_w"] = nn.Parameter(fc.weight.data.clone())
        self.params["fc_b"] = nn.Parameter(fc.bias.data.clone())

    def forward(self, x, fn=module_fn):
        return fn(x, self.params, self.training)


# Test configurations
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000


def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]


def get_init_inputs():
    return [layers, num_classes]

import torch
import torch.nn as nn
import torch.nn.functional as F

class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        """
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param stride: Stride for the first convolutional layer
        :param downsample: Downsample layer for the shortcut connection
        """
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """
        :param x: Input tensor, shape (batch_size, in_channels, height, width)
        :return: Output tensor, shape (batch_size, out_channels * expansion, height, width)
        """
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

class Model(nn.Module):
    def __init__(self, layers, num_classes=1000):
        """
        :param block: Type of block to use (BasicBlock or Bottleneck)
        :param layers: List of integers specifying the number of blocks in each layer
        :param num_classes: Number of output classes
        """
        super(Model, self).__init__()
        self.in_channels = 64

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        block = Bottleneck

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * block.expansion),
            )

        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, x):
        """
        :param x: Input tensor, shape (batch_size, 3, height, width)
        :return: Output tensor, shape (batch_size, num_classes)
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

# Test code
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000

def get_inputs():
    return [torch.randn(batch_size, 3, height, width)]

def get_init_inputs():
    return [layers, num_classes]

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	10_ResNet101
Level ID	3
Task ID	10
Kernel Name	warp_pool_optimized_bottleneck_base
CUDA Speedup (Native)	1.198x
CUDA Speedup (Compile)	1.200x
CUDA Runtime	25.792 ms
PyTorch Runtime (Native)	30.902 ms
PyTorch Runtime (Compile)	30.963 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	deepseek-reasoner
Temperature	1.00

View Experiment Progress Details

Related Kernels (Level 3, Task 10 • 10_ResNet101)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	resnet101_modular_functions_base_base	23.20	1.33	1.33
🥈	resnet101_coalesced_memory_access_base	24.25	1.27	1.28
🥉	resnet101_balanced_workload_base	24.36	1.27	1.27
4	resnet101_balanced_workload_base	24.60	1.26	1.26
5	10_ResNet101_mem_opt_base_base	24.62	1.26	1.26
6	resnet101_uniform_flow_base_base	24.84	1.24	1.25
7	resnet101_shared_mem_sync_optimized_base	24.93	1.24	1.24
8	efficient_resnet_base	25.08	1.23	1.23
9	resnet101_optimized_memory_access_base	25.45	1.21	1.22
10	resnet101_unrolled_loops_base_base	25.58	1.21	1.21
11	resnet101_min_sync_relu_base	25.64	1.21	1.21
12	warp_pool_optimized_bottleneck_base	25.79	1.20	1.20
13	unified_resnet_base	26.38	1.17	1.17
14	10_ResNet101_warp_avg_pool_base	26.55	1.16	1.17
15	resnet101_minimal_sync_base_base	26.90	1.15	1.15
16	10_ResNet101	28.04	1.10	1.10
17	resnet101_fused_distr_base	28.10	1.10	1.10
18	10_resnet101_opt_aligned_mem_edit_1	29.45	1.05	1.05
19	10_resnet101_opt_min_sync_edit_1	29.58	1.04	1.05
20	10_resnet101_opt_base	29.83	1.04	1.04

#include <torch/extension.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <cuda_runtime.h>

namespace py = pybind11;

__global__ void warp_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int spatial, int C, int H, int W) {
    int index = blockIdx.x;
    int n = index / C;
    int c = index % C;
    int total = H * W;
    const float* in_ptr = input + (n * C + c) * total;
    float sum = 0.0f;
    
    for (int i = threadIdx.x; i < total; i += blockDim.x) {
        sum += in_ptr[i];
    }
    
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        sum += __shfl_down_sync(0xffffffff, sum, offset);
    }

    if (threadIdx.x == 0) {
        output[index] = sum / total;
    }
}

torch::Tensor unified_bottleneck(
    torch::Tensor x,
    torch::Tensor conv1_w,
    torch::Tensor conv2_w,
    torch::Tensor conv3_w,
    torch::Tensor bn1_w,
    torch::Tensor bn1_b,
    torch::Tensor bn1_m,
    torch::Tensor bn1_v,
    torch::Tensor bn2_w,
    torch::Tensor bn2_b,
    torch::Tensor bn2_m,
    torch::Tensor bn2_v,
    torch::Tensor bn3_w,
    torch::Tensor bn3_b,
    torch::Tensor bn3_m,
    torch::Tensor bn3_v,
    bool has_downsample,
    torch::Tensor downsample_conv_w,
    torch::Tensor downsample_bn_w,
    torch::Tensor downsample_bn_b,
    torch::Tensor downsample_bn_m,
    torch::Tensor downsample_bn_v,
    int64_t stride,
    bool is_training
) {
    auto out = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor());
    out = torch::batch_norm(out, bn1_w, bn1_b, bn1_m, bn1_v, is_training, 0.1, 1e-5, true);
    out = torch::relu(out);
    
    out = torch::conv2d(out, conv2_w, /*bias=*/torch::Tensor(), stride, 1);
    out = torch::batch_norm(out, bn2_w, bn2_b, bn2_m, bn2_v, is_training, 0.1, 1e-5, true);
    out = torch::relu(out);
    
    out = torch::conv2d(out, conv3_w, /*bias=*/torch::Tensor());
    out = torch::batch_norm(out, bn3_w, bn3_b, bn3_m, bn3_v, is_training, 0.1, 1e-5, true);
    
    torch::Tensor identity;
    if (has_downsample) {
        identity = torch::conv2d(x, downsample_conv_w, /*bias=*/torch::Tensor(), stride);
        identity = torch::batch_norm(identity, downsample_bn_w, downsample_bn_b, downsample_bn_m, downsample_bn_v, is_training, 0.1, 1e-5, true);
    } else {
        identity = x.to(out.dtype());
    }
    
    return torch::relu(out + identity);
}

torch::Tensor forward(torch::Tensor x, py::object params, bool is_training) {
    auto device = x.device();
    
    auto conv1_w = params.attr("get")("conv1_w").cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_w = params.attr("get")("bn1_w").cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_b = params.attr("get")("bn1_b").cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_m = params.attr("get")("bn1_m").cast<torch::Tensor>().contiguous().to(device, true);
    auto bn1_v = params.attr("get")("bn1_v").cast<torch::Tensor>().contiguous().to(device, true);
    
    x = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor(), 2, 3);
    x = torch::batch_norm(x, bn1_w, bn1_b, bn1_m, bn1_v, is_training, 0.1, 1e-5, true);
    x = torch::relu(x);
    x = torch::max_pool2d(x, 3, 2, 1);
    
    for (int layer_idx = 1; layer_idx <= 4; ++layer_idx) {
        std::string layer_key = "layer" + std::to_string(layer_idx) + "_blocks";
        py::list blocks = params.attr("get")(py::str(layer_key)).cast<py::list>();
        
        for (size_t block_idx = 0; block_idx < blocks.size(); ++block_idx) {
            py::object block = blocks[block_idx];
            
            auto conv1_w_blk = block.attr("get")("conv1_w").cast<torch::Tensor>().contiguous().to(device, true);
            auto conv2_w_blk = block.attr("get")("conv2_w").cast<torch::Tensor>().contiguous().to(device, true);
            auto conv3_w_blk = block.attr("get")("conv3_w").cast<torch::Tensor>().contiguous().to(device, true);
            
            auto bn1_w_blk = block.attr("get")("bn1_w").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_b_blk = block.attr("get")("bn1_b").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_m_blk = block.attr("get")("bn1_m").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn1_v_blk = block.attr("get")("bn1_v").cast<torch::Tensor>().contiguous().to(device, true);
            
            auto bn2_w_blk = block.attr("get")("bn2_w").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_b_blk = block.attr("get")("bn2_b").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_m_blk = block.attr("get")("bn2_m").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn2_v_blk = block.attr("get")("bn2_v").cast<torch::Tensor>().contiguous().to(device, true);
            
            auto bn3_w_blk = block.attr("get")("bn3_w").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_b_blk = block.attr("get")("bn3_b").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_m_blk = block.attr("get")("bn3_m").cast<torch::Tensor>().contiguous().to(device, true);
            auto bn3_v_blk = block.attr("get")("bn3_v").cast<torch::Tensor>().contiguous().to(device, true);
            
            bool has_downsample = py::bool_(block.attr("__contains__")(py::str("downsample_conv_w")));
            torch::Tensor downsample_conv_w, downsample_bn_w, downsample_bn_b, downsample_bn_m, downsample_bn_v;
            if (has_downsample) {
                downsample_conv_w = block.attr("get")("downsample_conv_w").cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_w = block.attr("get")("downsample_bn_w").cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_b = block.attr("get")("downsample_bn_b").cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_m = block.attr("get")("downsample_bn_m").cast<torch::Tensor>().contiguous().to(device, true);
                downsample_bn_v = block.attr("get")("downsample_bn_v").cast<torch::Tensor>().contiguous().to(device, true);
            }
            
            int64_t stride = (block_idx == 0 && layer_idx > 1) ? 2 : 1;
            
            x = unified_bottleneck(
                x,
                conv1_w_blk, conv2_w_blk, conv3_w_blk,
                bn1_w_blk, bn1_b_blk, bn1_m_blk, bn1_v_blk,
                bn2_w_blk, bn2_b_blk, bn2_m_blk, bn2_v_blk,
                bn3_w_blk, bn3_b_blk, bn3_m_blk, bn3_v_blk,
                has_downsample,
                downsample_conv_w, downsample_bn_w, downsample_bn_b, downsample_bn_m, downsample_bn_v,
                stride, is_training
            );
        }
    }
    
    auto sizes = x.sizes();
    int N = sizes[0];
    int C = sizes[1];
    int H = sizes[2];
    int W = sizes[3];
    int spatial = H * W;
    auto pooled = torch::empty({N, C, 1, 1}, x.options());
    
    int grid = N * C;
    int threads = 32; 
    warp_avg_pool_kernel<<<grid, threads>>>(x.data_ptr<float>(), pooled.data_ptr<float>(), spatial, C, H, W);
    cudaDeviceSynchronize();
    
    x = pooled.view({N, C});
    
    auto fc_w = params.attr("get")("fc_w").cast<torch::Tensor>().contiguous().to(device, true);
    auto fc_b = params.attr("get")("fc_b").cast<torch::Tensor>().contiguous().to(device, true);
    return torch::linear(x, fc_w, fc_b);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Optimized ResNet101 with warp pooling and streamlined params");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples
Executed Ipc Active	0.726	inst/cycle	0.000	5
Executed Ipc Elapsed	0.560	inst/cycle	0.000	5
Issue Slots Busy	18.428	%	0.053	5
Issued Ipc Active	0.736	inst/cycle	0.000	5
SM Busy	18.428	%	0.053	5
Memory Throughput	258432560647.092	byte/second	499909539194520384.000	5
Mem Busy	7.828	%	0.001	5
Max Bandwidth	7.730	%	0.000	5
L1/TEX Hit Rate	9.922	%	0.000	5
L2 Hit Rate	40.640	%	0.007	5
Mem Pipes Busy	5.682	%	0.000	5
Warp Cycles Per Issued Instruction	17.286	cycle	0.015	5
Warp Cycles Per Executed Instruction	17.628	cycle	0.020	5
Avg. Active Threads Per Warp	26.080		0.000	5
Avg. Not Predicated Off Threads Per Warp	24.990		0.000	5
Max Active Clusters	0.000	cluster	0.000	5
Max Cluster Size	8.000	block	0.000	5
Overall GPU Occupancy	0.000	%	0.000	5
Cluster Occupancy	0.000	%	0.000	5
Block Limit SM	32.000	block	0.000	5
Block Limit Registers	128.000	block	0.000	5
Block Limit Shared Mem	32.000	block	0.000	5
Block Limit Warps	64.000	block	0.000	5
Theoretical Active Warps per SM	32.000	warp	0.000	5
Theoretical Occupancy	50.000	%	0.000	5
Achieved Occupancy	20.382	%	0.021	5
Achieved Active Warps Per SM	13.044	warp	0.009	5

Analysis Rules

Rule	Description
WRN HighPipeUtilization	All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.
INF CPIStall	Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy	This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared memory. The difference between calculated theoretical (50.0%) and measured achieved occupancy (20.2%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.

Operation / Metric	Value	Unit
aten::to
CPU Time	5043381.72	μs
Device Time	2434767.38	μs
Self CPU Time	86332.07	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_to_copy
CPU Time	4957049.64	μs
Device Time	2434767.38	μs
Self CPU Time	284941.56	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::copy_
CPU Time	5246972.96	μs
Device Time	2434767.38	μs
Self CPU Time	1318476.41	μs
Self Device Time	2434767.38	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaMemcpyAsync
CPU Time	3928388.91	μs
Device Time	0.00	μs
Self CPU Time	3928388.91	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
Memcpy HtoD (Pageable -> Device)
CPU Time	0.00	μs
Device Time	2434769.05	μs
Self CPU Time	0.00	μs
Self Device Time	2434769.05	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::conv2d
CPU Time	1098533.59	μs
Device Time	534840.73	μs
Self CPU Time	40168.28	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45295 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:8:99 bugprone-easily-swappable-parameters

8 | __global__ void warp_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int spatial, int C, int H, int W) {

| ^~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:8:103: note: the first parameter in the range is 'spatial'

8 | __global__ void warp_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int spatial, int C, int H, int W) {

| ^~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:8:123: note: the last parameter in the range is 'H'

8 | __global__ void warp_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int spatial, int C, int H, int W) {

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:9:17: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

9 | int index = blockIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:13:27: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]

13 | const float* in_ptr = input + (n * C + c) * total;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:13:35: note: make conversion explicit to silence this warning

5 |

6 | namespace py = pybind11;

7 |

8 | __global__ void warp_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int spatial, int C, int H, int W) {

9 | int index = blockIdx.x;

10 | int n = index / C;

11 | int c = index % C;

12 | int total = H * W;

13 | const float* in_ptr = input + (n * C + c) * total;

| ^~~~~~~~~~~~~~~~~~~

| static_cast<ptrdiff_t>( )

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:13:35: note: perform multiplication in a wider type

13 | const float* in_ptr = input + (n * C + c) * total;

| ^~~~~~~~~~

| static_cast<ptrdiff_t>( )

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:16:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

16 | for (int i = threadIdx.x; i < total; i += blockDim.x) {

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:16:47: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

16 | for (int i = threadIdx.x; i < total; i += blockDim.x) {

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:25:31: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]

25 | output[index] = sum / total;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:30:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

30 | torch::Tensor x,

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:31:19: warning: the parameter 'conv1_w' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

31 | torch::Tensor conv1_w,

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:32:19: warning: the parameter 'conv2_w' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

32 | torch::Tensor conv2_w,

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:33:5: warning: 2 adjacent parameters of 'unified_bottleneck' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]

33 | torch::Tensor conv3_w,

| ^~~~~~~~~~~~~~~~~~~~~~

34 | torch::Tensor bn1_w,

| ~~~~~~~~~~~~~~~~~~~

33 | torch::Tensor conv3_w,

| ^~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:34:19: note: the last parameter in the range is 'bn1_w'

34 | torch::Tensor bn1_w,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:33:19: warning: the parameter 'conv3_w' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

33 | torch::Tensor conv3_w,

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:37:5: warning: 2 adjacent parameters of 'unified_bottleneck' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]

37 | torch::Tensor bn1_v,

| ^~~~~~~~~~~~~~~~~~~~

38 | torch::Tensor bn2_w,

| ~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:37:19: note: the first parameter in the range is 'bn1_v'

37 | torch::Tensor bn1_v,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:38:19: note: the last parameter in the range is 'bn2_w'

38 | torch::Tensor bn2_w,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:41:5: warning: 2 adjacent parameters of 'unified_bottleneck' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]

41 | torch::Tensor bn2_v,

| ^~~~~~~~~~~~~~~~~~~~

42 | torch::Tensor bn3_w,

| ~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:41:19: note: the first parameter in the range is 'bn2_v'

41 | torch::Tensor bn2_v,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:42:19: note: the last parameter in the range is 'bn3_w'

42 | torch::Tensor bn3_w,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:47:5: warning: 2 adjacent parameters of 'unified_bottleneck' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]

47 | torch::Tensor downsample_conv_w,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

48 | torch::Tensor downsample_bn_w,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

47 | torch::Tensor downsample_conv_w,

| ^~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:48:19: note: the last parameter in the range is 'downsample_bn_w'

48 | torch::Tensor downsample_bn_w,

| ^~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:47:19: warning: the parameter 'downsample_conv_w' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

47 | torch::Tensor downsample_conv_w,

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:77:51: warning: the parameter 'params' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

77 | torch::Tensor forward(torch::Tensor x, py::object params, bool is_training) {

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:143:13: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

143 | int N = sizes[0];

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:144:13: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

144 | int C = sizes[1];

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:145:13: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

145 | int H = sizes[2];

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b8_s2_warp_pool_optimized_bottleneck/base/base.cu:146:13: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

146 | int W = sizes[3];

| ^

The AI CUDA Engineer 👷

`10_ResNet101` • `warp_pool_optimized_bottleneck_base`

Kernel Information

Related Kernels (Level 3, Task 10 • 10_ResNet101)

The AI CUDA Engineer 👷

10_ResNet101 • warp_pool_optimized_bottleneck_base

Kernel Information

Related Kernels (Level 3, Task 10 • 10_ResNet101)

`10_ResNet101` • `warp_pool_optimized_bottleneck_base`