Kernel Details - resnet101_modular_functions_base

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor, params: nn.ParameterDict, is_training: bool
) -> torch.Tensor:
    """
    Functional forward pass of the ResNet101 network.

    The network is a stem (stride-2 convolution, batch norm, ReLU, max pool),
    four groups of bottleneck blocks read from ``params["layer{1..4}_blocks"]``,
    global average pooling and a final linear layer.

    Args:
        x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
        params (nn.ParameterDict): Mapping holding the stem/FC tensors and, per
            layer, a sequence of per-block mappings
        is_training (bool): Forwarded to every ``F.batch_norm`` call as ``training``

    Returns:
        torch.Tensor: Output tensor, shape (batch_size, num_classes)
    """

    def normalize(inp, mean, var, weight, bias, device):
        # Every statistic/affine tensor is moved to the activation's device
        # right before use, exactly once per call.
        return F.batch_norm(
            inp,
            mean.to(device),
            var.to(device),
            weight.to(device),
            bias.to(device),
            training=is_training,
        )

    def run_bottleneck(inp, blk, stride):
        # 1x1 conv -> 3x3 conv (carries the stride) -> 1x1 conv, plus shortcut.
        device = inp.device

        out = F.conv2d(inp, blk["conv1_w"].to(device), bias=None)
        out = normalize(
            out, blk["bn1_m"], blk["bn1_v"], blk["bn1_w"], blk["bn1_b"], device
        )
        out = F.relu(out)

        out = F.conv2d(
            out, blk["conv2_w"].to(device), bias=None, stride=stride, padding=1
        )
        out = normalize(
            out, blk["bn2_m"], blk["bn2_v"], blk["bn2_w"], blk["bn2_b"], device
        )
        out = F.relu(out)

        out = F.conv2d(out, blk["conv3_w"].to(device), bias=None)
        out = normalize(
            out, blk["bn3_m"], blk["bn3_v"], blk["bn3_w"], blk["bn3_b"], device
        )

        # The shortcut is projected only when the block carries downsample
        # weights; otherwise the block input is added unchanged.
        shortcut = inp
        projection_w = (
            blk["downsample_conv_w"] if "downsample_conv_w" in blk else None
        )
        if projection_w is not None:
            shortcut = F.conv2d(
                inp, projection_w.to(device), bias=None, stride=stride
            )
            shortcut = normalize(
                shortcut,
                blk["downsample_bn_m"],
                blk["downsample_bn_v"],
                blk["downsample_bn_w"],
                blk["downsample_bn_b"],
                device,
            )

        out += shortcut
        return F.relu(out)

    # Stem
    stem_device = x.device
    x = F.conv2d(
        x, params["conv1_w"].to(stem_device), bias=None, stride=2, padding=3
    )
    x = normalize(
        x,
        params["bn1_m"],
        params["bn1_v"],
        params["bn1_w"],
        params["bn1_b"],
        stem_device,
    )
    x = F.max_pool2d(F.relu(x), kernel_size=3, stride=2, padding=1)

    # Layers 1-4: only the first block of layers 2-4 uses stride 2.
    for layer_idx in (1, 2, 3, 4):
        for block_idx, block_params in enumerate(
            params[f"layer{layer_idx}_blocks"]
        ):
            opens_strided_layer = block_idx == 0 and layer_idx > 1
            x = run_bottleneck(
                x, block_params, 2 if opens_strided_layer else 1
            )

    # Head
    x = torch.flatten(F.adaptive_avg_pool2d(x, (1, 1)), 1)
    return F.linear(x, params["fc_w"].to(x.device), params["fc_b"].to(x.device))


class Model(nn.Module):
    """
    Parameter container for the functional ResNet101 forward pass.

    Freshly initialised ``nn.Conv2d`` / ``nn.BatchNorm2d`` / ``nn.Linear``
    layers are created only to obtain their default-initialised tensors; clones
    of those tensors are stored in ``self.params`` under flat string keys.
    Batch-norm running statistics are stored as ``nn.Parameter`` objects too.

    NOTE(review): each ``layer{N}_blocks`` entry is a plain Python list of
    plain dicts, so those tensors are presumably not tracked by
    ``.parameters()`` / ``.to()`` -- confirm before relying on either.
    """

    def __init__(self, layers, num_classes=1000):
        """
        :param layers: Number of bottleneck blocks in each of the four layers
        :param num_classes: Number of output classes of the final linear layer
        """
        super().__init__()
        self.params = nn.ParameterDict()
        expansion = 4
        in_channels = 64

        def clone_param(tensor):
            # Detached copy wrapped as a Parameter.
            return nn.Parameter(tensor.data.clone())

        def bn_params(bn):
            # Order: weight, bias, running mean, running variance.
            return (
                clone_param(bn.weight),
                clone_param(bn.bias),
                clone_param(bn.running_mean),
                clone_param(bn.running_var),
            )

        # Stem
        stem_conv = nn.Conv2d(
            3, in_channels, kernel_size=7, stride=2, padding=3, bias=False
        )
        stem_bn = nn.BatchNorm2d(in_channels)
        self.params["conv1_w"] = clone_param(stem_conv.weight)
        (
            self.params["bn1_w"],
            self.params["bn1_b"],
            self.params["bn1_m"],
            self.params["bn1_v"],
        ) = bn_params(stem_bn)

        # Layers 1-4
        for layer_idx, out_channels, num_blocks in zip(
            range(1, 5), (64, 128, 256, 512), layers
        ):
            expanded = out_channels * expansion
            layer_blocks = []

            for block_idx in range(num_blocks):
                opens_layer = block_idx == 0
                block_in_channels = in_channels if opens_layer else expanded
                block_stride = 2 if opens_layer and layer_idx > 1 else 1

                block_params = {}

                # Only a layer's first block can carry a projection shortcut.
                # It is created before the main-path convolutions.
                if opens_layer and (
                    layer_idx > 1 or block_in_channels != expanded
                ):
                    downsample_conv = nn.Conv2d(
                        block_in_channels,
                        expanded,
                        kernel_size=1,
                        stride=block_stride,
                        bias=False,
                    )
                    downsample_bn = nn.BatchNorm2d(expanded)
                    block_params["downsample_conv_w"] = clone_param(
                        downsample_conv.weight
                    )
                    (
                        block_params["downsample_bn_w"],
                        block_params["downsample_bn_b"],
                        block_params["downsample_bn_m"],
                        block_params["downsample_bn_v"],
                    ) = bn_params(downsample_bn)

                conv1 = nn.Conv2d(
                    block_in_channels, out_channels, kernel_size=1, bias=False
                )
                bn1 = nn.BatchNorm2d(out_channels)
                conv2 = nn.Conv2d(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    stride=block_stride,
                    padding=1,
                    bias=False,
                )
                bn2 = nn.BatchNorm2d(out_channels)
                conv3 = nn.Conv2d(
                    out_channels, expanded, kernel_size=1, bias=False
                )
                bn3 = nn.BatchNorm2d(expanded)

                block_params["conv1_w"] = clone_param(conv1.weight)
                (
                    block_params["bn1_w"],
                    block_params["bn1_b"],
                    block_params["bn1_m"],
                    block_params["bn1_v"],
                ) = bn_params(bn1)

                block_params["conv2_w"] = clone_param(conv2.weight)
                (
                    block_params["bn2_w"],
                    block_params["bn2_b"],
                    block_params["bn2_m"],
                    block_params["bn2_v"],
                ) = bn_params(bn2)

                block_params["conv3_w"] = clone_param(conv3.weight)
                (
                    block_params["bn3_w"],
                    block_params["bn3_b"],
                    block_params["bn3_m"],
                    block_params["bn3_v"],
                ) = bn_params(bn3)

                layer_blocks.append(block_params)

            self.params[f"layer{layer_idx}_blocks"] = layer_blocks
            in_channels = expanded

        # Classifier head
        fc = nn.Linear(512 * expansion, num_classes)
        self.params["fc_w"] = clone_param(fc.weight)
        self.params["fc_b"] = clone_param(fc.bias)

    def forward(self, x, fn=module_fn):
        """Run ``fn`` on ``x`` with this module's parameters and training flag."""
        return fn(x, self.params, self.training)


# Test configurations
# Benchmark configuration shared by the two factory helpers below.
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]  # bottleneck blocks per layer
num_classes = 1000


def get_inputs():
    """Return the forward-pass arguments: one random 3-channel image batch."""
    image_batch = torch.randn(batch_size, 3, height, width)
    return [image_batch]


def get_init_inputs():
    """Return the constructor arguments for ``Model``: block counts and class count."""
    init_args = [layers, num_classes]
    return init_args

import torch
import torch.nn as nn
import torch.nn.functional as F

class Bottleneck(nn.Module):
    """Three-convolution residual block: 1x1 -> 3x3 -> 1x1, plus a shortcut."""

    # Ratio between the block's output channels and ``out_channels``.
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        """
        :param in_channels: Number of input channels
        :param out_channels: Channel width of the two inner convolutions
        :param stride: Stride applied by the 3x3 convolution
        :param downsample: Optional module applied to the input to form the shortcut
        """
        super().__init__()
        expanded_channels = out_channels * self.expansion

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(
            out_channels, expanded_channels, kernel_size=1, bias=False
        )
        self.bn3 = nn.BatchNorm2d(expanded_channels)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """
        :param x: Input tensor, shape (batch_size, in_channels, height, width)
        :return: Output tensor with out_channels * expansion channels
        """
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Without a downsample module the raw input is the shortcut.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

class Model(nn.Module):
    """ResNet built from ``Bottleneck`` blocks: stem, four layers, pooled linear head."""

    def __init__(self, layers, num_classes=1000):
        """
        :param layers: List of integers specifying the number of blocks in each layer
        :param num_classes: Number of output classes
        """
        super().__init__()
        # Running channel count; _make_layer reads and updates it.
        self.in_channels = 64

        self.conv1 = nn.Conv2d(
            3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False
        )
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        block = Bottleneck

        # Only the first layer keeps stride 1; the other three use stride 2.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride=1):
        """
        Build one layer of ``blocks`` consecutive ``block`` modules.

        :param block: Block class to instantiate
        :param out_channels: Inner channel width passed to every block
        :param blocks: Number of blocks in the layer
        :param stride: Stride of the first block (later blocks use the default)
        :return: ``nn.Sequential`` containing the blocks in order
        """
        expanded_channels = out_channels * block.expansion

        # A projection shortcut is needed whenever the first block changes
        # the spatial size or the channel count.
        needs_projection = stride != 1 or self.in_channels != expanded_channels
        downsample = None
        if needs_projection:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.in_channels,
                    expanded_channels,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                nn.BatchNorm2d(expanded_channels),
            )

        stage = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = expanded_channels
        stage.extend(
            block(self.in_channels, out_channels) for _ in range(1, blocks)
        )

        return nn.Sequential(*stage)

    def forward(self, x):
        """
        :param x: Input tensor, shape (batch_size, 3, height, width)
        :return: Output tensor, shape (batch_size, num_classes)
        """
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = torch.flatten(self.avgpool(x), 1)
        return self.fc(x)

# Test code
# Benchmark configuration shared by the two factory helpers below.
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]  # bottleneck blocks per layer
num_classes = 1000

def get_inputs():
    """Return the forward-pass arguments: one random 3-channel image batch."""
    image_batch = torch.randn(batch_size, 3, height, width)
    return [image_batch]

def get_init_inputs():
    """Return the constructor arguments for ``Model``: block counts and class count."""
    init_args = [layers, num_classes]
    return init_args

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	10_ResNet101
Level ID	3
Task ID	10
Kernel Name	resnet101_modular_functions_base_base
CUDA Speedup (Native)	1.332x
CUDA Speedup (Compile)	1.335x
CUDA Runtime	23.202 ms
PyTorch Runtime (Native)	30.902 ms
PyTorch Runtime (Compile)	30.963 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	azure-gpt-4o-2024-08-06
Temperature	1.00

View Experiment Progress Details

Related Kernels (Level 3, Task 10 • 10_ResNet101)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	resnet101_modular_functions_base_base	23.20	1.33	1.33
🥈	resnet101_coalesced_memory_access_base	24.25	1.27	1.28
🥉	resnet101_balanced_workload_base	24.36	1.27	1.27
4	resnet101_balanced_workload_base	24.60	1.26	1.26
5	10_ResNet101_mem_opt_base_base	24.62	1.26	1.26
6	resnet101_uniform_flow_base_base	24.84	1.24	1.25
7	resnet101_shared_mem_sync_optimized_base	24.93	1.24	1.24
8	efficient_resnet_base	25.08	1.23	1.23
9	resnet101_optimized_memory_access_base	25.45	1.21	1.22
10	resnet101_unrolled_loops_base_base	25.58	1.21	1.21
11	resnet101_min_sync_relu_base	25.64	1.21	1.21
12	warp_pool_optimized_bottleneck_base	25.79	1.20	1.20
13	unified_resnet_base	26.38	1.17	1.17
14	10_ResNet101_warp_avg_pool_base	26.55	1.16	1.17
15	resnet101_minimal_sync_base_base	26.90	1.15	1.15
16	10_ResNet101	28.04	1.10	1.10
17	resnet101_fused_distr_base	28.10	1.10	1.10
18	10_resnet101_opt_aligned_mem_edit_1	29.45	1.05	1.05
19	10_resnet101_opt_min_sync_edit_1	29.58	1.04	1.05
20	10_resnet101_opt_base	29.83	1.04	1.04

#include <torch/extension.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

namespace py = pybind11;

// Modularized device code for each operation
// Direct 2D convolution over a single width x height plane; each thread
// computes one output element from a ksize x ksize window of `input`.
//
//   input   : width * height elements, row-major
//   kernel  : ksize * ksize filter taps, row-major
//   output  : written at [row * width + col], i.e. with the INPUT's extents
//   stride / padding : applied to the window origin (see in_row / in_col)
//
// NOTE(review): the tap loops span -ksize/2 .. +ksize/2, which is ksize taps
// only when ksize is odd; for even ksize the filter index reaches `ksize`
// within a row -- confirm ksize is always odd.
// NOTE(review): output is indexed with the input's width/height regardless of
// stride -- confirm the intended output extents for stride > 1.
// NOTE(review): no launch of this kernel appears in the visible part of this
// file.
template <typename scalar_t>
__global__ void conv2d_kernel(const scalar_t* __restrict__ input,
                              const scalar_t* __restrict__ kernel,
                              scalar_t* __restrict__ output,
                              int width, int height, int ksize,
                              int stride, int padding) {
    // Explicit casts make the unsigned -> int conversion visible.
    const int out_col = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
    const int out_row = static_cast<int>(blockIdx.y * blockDim.y + threadIdx.y);

    // Threads outside the plane have nothing to compute.
    if (out_col >= width || out_row >= height) {
        return;
    }

    const int radius = ksize / 2;
    scalar_t acc = 0;
    for (int dy = -radius; dy <= radius; ++dy) {
        const int in_row = out_row * stride + dy - padding;
        if (in_row < 0 || in_row >= height) {
            continue;  // tap falls outside the input: contributes nothing
        }
        for (int dx = -radius; dx <= radius; ++dx) {
            const int in_col = out_col * stride + dx - padding;
            if (in_col < 0 || in_col >= width) {
                continue;
            }
            acc += input[in_row * width + in_col] *
                   kernel[(dy + radius) * ksize + (dx + radius)];
        }
    }
    output[out_row * width + out_col] = acc;
}

// Unified bottleneck function
// Runs one ResNet bottleneck block: conv1 -> bn -> relu -> conv2 (padding 1,
// carries `stride`) -> bn -> relu -> conv3 -> bn, adds the shortcut and
// applies a final relu.
//
//   x                   : block input (only read, hence taken by const reference)
//   convN_w             : convolution weights of the three main-path convolutions
//   bnN_w/_b/_m/_v      : passed to torch::batch_norm in this order, i.e. as
//                         weight, bias, running mean, running variance
//   downsample_*        : shortcut projection; an UNDEFINED downsample_conv_w
//                         means "no projection" and x itself is the shortcut
//   stride              : stride of conv2 and of the shortcut projection
//   is_training         : forwarded to every torch::batch_norm call
//
// Returns relu(main_path + shortcut).
torch::Tensor bottleneck_fn(
    const torch::Tensor& x,
    const torch::Tensor& conv1_w,
    const torch::Tensor& conv2_w,
    const torch::Tensor& conv3_w,
    const torch::Tensor& bn1_w,
    const torch::Tensor& bn1_b,
    const torch::Tensor& bn1_m,
    const torch::Tensor& bn1_v,
    const torch::Tensor& bn2_w,
    const torch::Tensor& bn2_b,
    const torch::Tensor& bn2_m,
    const torch::Tensor& bn2_v,
    const torch::Tensor& bn3_w,
    const torch::Tensor& bn3_b,
    const torch::Tensor& bn3_m,
    const torch::Tensor& bn3_v,
    const torch::Tensor& downsample_conv_w,
    const torch::Tensor& downsample_bn_w,
    const torch::Tensor& downsample_bn_b,
    const torch::Tensor& downsample_bn_m,
    const torch::Tensor& downsample_bn_v,
    int64_t stride,
    bool is_training
) {
    const bool has_downsample = downsample_conv_w.defined();

    // Shortcut projection is evaluated before the main path.
    torch::Tensor identity;
    if (has_downsample) {
        identity = torch::conv2d(x, downsample_conv_w, /*bias=*/torch::Tensor(), stride)
            .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
        identity = torch::batch_norm(identity, downsample_bn_w, downsample_bn_b,
            downsample_bn_m, downsample_bn_v, is_training, 0.1, 1e-5, true);
    }

    // Main path. Each convolution result is converted to x's dtype in
    // contiguous memory format (copy=false: no copy is forced).
    torch::Tensor out = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor())
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    out = torch::batch_norm(out, bn1_w, bn1_b, bn1_m, bn1_v, is_training, 0.1, 1e-5, true);
    out = torch::relu(out);

    out = torch::conv2d(out, conv2_w, /*bias=*/torch::Tensor(), stride, /*padding=*/1)
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    out = torch::batch_norm(out, bn2_w, bn2_b, bn2_m, bn2_v, is_training, 0.1, 1e-5, true);
    out = torch::relu(out);

    out = torch::conv2d(out, conv3_w, /*bias=*/torch::Tensor())
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    out = torch::batch_norm(out, bn3_w, bn3_b, bn3_m, bn3_v, is_training, 0.1, 1e-5, true);

    // Without a projection the raw input is the shortcut, matched to out's dtype.
    if (!has_downsample) {
        identity = x.to(out.dtype());
    }
    out = out + identity;
    return torch::relu(out);
}

// ResNet101 forward pass driven from Python.
//
//   x           : input batch; its device is where every parameter is moved
//   params      : Python mapping queried through its get() method. It must
//                 provide "conv1_w", "bn1_w/_b/_m/_v", "fc_w", "fc_b" and,
//                 for N = 1..4, "layer{N}_blocks": a Python list of per-block
//                 mappings (each supporting get() and __contains__)
//   is_training : forwarded to every batch-norm call
//
// Returns the output of the final linear layer.
torch::Tensor forward(
    torch::Tensor x,
    const py::object& params,
    bool is_training
) {
    const auto device = x.device();

    // Per-block tensor names, in the positional order bottleneck_fn expects.
    static constexpr const char* kBlockParamNames[] = {
        "conv1_w", "conv2_w", "conv3_w",
        "bn1_w", "bn1_b", "bn1_m", "bn1_v",
        "bn2_w", "bn2_b", "bn2_m", "bn2_v",
        "bn3_w", "bn3_b", "bn3_m", "bn3_v"};
    static constexpr const char* kDownsampleParamNames[] = {
        "downsample_conv_w", "downsample_bn_w",
        "downsample_bn_b", "downsample_bn_m", "downsample_bn_v"};
    constexpr size_t kNumBlockParams =
        sizeof(kBlockParamNames) / sizeof(kBlockParamNames[0]);
    constexpr size_t kNumDownsampleParams =
        sizeof(kDownsampleParamNames) / sizeof(kDownsampleParamNames[0]);

    // Looks `name` up in a Python mapping and returns it as a contiguous
    // tensor on the input's device (transfer requested as non-blocking).
    auto fetch = [&device](const py::object& container, const char* name) {
        return container.attr("get")(py::str(name))
            .cast<torch::Tensor>()
            .contiguous()
            .to(device, /*non_blocking=*/true);
    };

    // Stem: conv (stride 2, padding 3) -> batch norm -> relu -> max pool.
    const torch::Tensor stem_conv_w = fetch(params, "conv1_w");
    const torch::Tensor stem_bn_w = fetch(params, "bn1_w");
    const torch::Tensor stem_bn_b = fetch(params, "bn1_b");
    const torch::Tensor stem_bn_m = fetch(params, "bn1_m");
    const torch::Tensor stem_bn_v = fetch(params, "bn1_v");

    x = torch::conv2d(x, stem_conv_w, /*bias=*/torch::Tensor(), 2, 3)
        .to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
    x = torch::batch_norm(x, stem_bn_w, stem_bn_b, stem_bn_m, stem_bn_v,
                        is_training, 0.1, 1e-5, true);
    x = torch::relu(x);
    x = torch::max_pool2d(x, 3, 2, 1);

    // Stands in for the five downsample arguments of blocks without a projection.
    const torch::Tensor undefined;

    for (int layer_idx = 1; layer_idx <= 4; ++layer_idx) {
        const std::string key = "layer" + std::to_string(layer_idx) + "_blocks";
        py::list blocks = params.attr("get")(py::str(key)).cast<py::list>();

        // Gather (and move to the device) every tensor of the layer before
        // running any of its blocks. Vectors are filled in place, so no
        // per-block vector copy is made.
        std::vector<std::vector<torch::Tensor>> layer_params;
        layer_params.reserve(blocks.size());
        for (auto block : blocks) {
            py::object bp = block.cast<py::object>();

            layer_params.emplace_back();
            auto& block_tensors = layer_params.back();
            block_tensors.reserve(kNumBlockParams + kNumDownsampleParams);

            for (const char* name : kBlockParamNames) {
                block_tensors.push_back(fetch(bp, name));
            }

            // Downsample tensors are appended only when the block has them;
            // their presence is later detected from the vector's length.
            if (py::bool_(bp.attr("__contains__")("downsample_conv_w"))) {
                for (const char* ds_name : kDownsampleParamNames) {
                    block_tensors.push_back(fetch(bp, ds_name));
                }
            }
        }

        for (size_t block_idx = 0; block_idx < layer_params.size(); ++block_idx) {
            const auto& t = layer_params[block_idx];
            // Only the first block of layers 2-4 uses stride 2.
            const int64_t stride = (block_idx == 0 && layer_idx > 1) ? 2 : 1;
            const bool has_downsample = t.size() > kNumBlockParams;

            x = bottleneck_fn(x,
                t[0], t[1], t[2],
                t[3], t[4], t[5], t[6],
                t[7], t[8], t[9], t[10],
                t[11], t[12], t[13], t[14],
                has_downsample ? t[15] : undefined,
                has_downsample ? t[16] : undefined,
                has_downsample ? t[17] : undefined,
                has_downsample ? t[18] : undefined,
                has_downsample ? t[19] : undefined,
                stride, is_training
            );
        }
    }

    // Head: global average pool, flatten to (batch, features), linear layer.
    x = torch::adaptive_avg_pool2d(x, {1, 1}).contiguous();
    x = x.view({x.size(0), -1});

    auto fc_w = params.attr("get")("fc_w").cast<torch::Tensor>().contiguous().to(device);
    auto fc_b = params.attr("get")("fc_b").cast<torch::Tensor>().contiguous().to(device);
    return torch::linear(x, fc_w, fc_b);
}

// Python binding: registers the C++ forward() function under the name
// "forward" on the extension module named by TORCH_EXTENSION_NAME, with the
// docstring "ResNet101 forward".
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "ResNet101 forward");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples

Analysis Rules

Rule	Description

Operation / Metric	Value	Unit
aten::to
CPU Time	7097846.22	μs
Device Time	3523638.67	μs
Self CPU Time	135001.07	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_to_copy
CPU Time	6962845.16	μs
Device Time	3523638.67	μs
Self CPU Time	404153.41	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::copy_
CPU Time	6812401.04	μs
Device Time	3523638.67	μs
Self CPU Time	1261776.06	μs
Self Device Time	3523638.67	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaMemcpyAsync
CPU Time	5550512.17	μs
Device Time	0.00	μs
Self CPU Time	5550512.17	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
Memcpy HtoD (Pageable -> Device)
CPU Time	0.00	μs
Device Time	3523638.67	μs
Self CPU Time	0.00	μs
Self Device Time	3523638.67	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::conv2d
CPU Time	1095293.02	μs
Device Time	788936.62	μs
Self CPU Time	47215.33	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45284 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

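/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:12:42: warning: 3 adjacent parameters of 'conv2d_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]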

12 | int width, int height, int ksize,

| ^~~~~~~~~~~~~~~~~~~~~~

13 | int stride, int padding) {

| ~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:12:46: note: the first parameter in the range is 'height'

12 | int width, int height, int ksize,

| ^~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:13:35: note: the last parameter in the range is 'stride'

13 | int stride, int padding) {

| ^~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:15:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

15 | int tidx = blockIdx.x * blockDim.x + threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:16:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

16 | int tidy = blockIdx.y * blockDim.y + threadIdx.y;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:38:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

38 | torch::Tensor x,

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:41:5: warning: 2 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]

41 | const torch::Tensor& conv3_w,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~

42 | const torch::Tensor& bn1_w,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:41:26: note: the first parameter in the range is 'conv3_w'

41 | const torch::Tensor& conv3_w,

| ^~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:42:26: note: the last parameter in the range is 'bn1_w'

42 | const torch::Tensor& bn1_w,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:45:5: warning: 2 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]

45 | const torch::Tensor& bn1_v,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~

46 | const torch::Tensor& bn2_w,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:45:26: note: the first parameter in the range is 'bn1_v'

45 | const torch::Tensor& bn1_v,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:46:26: note: the last parameter in the range is 'bn2_w'

46 | const torch::Tensor& bn2_w,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:49:5: warning: 2 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]

49 | const torch::Tensor& bn2_v,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~

50 | const torch::Tensor& bn3_w,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:49:26: note: the first parameter in the range is 'bn2_v'

49 | const torch::Tensor& bn2_v,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:50:26: note: the last parameter in the range is 'bn3_w'

50 | const torch::Tensor& bn3_w,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:53:5: warning: 3 adjacent parameters of 'bottleneck_fn' of similar type ('const torch::Tensor &') are easily swapped by mistake [bugprone-easily-swappable-parameters]

53 | const torch::Tensor& bn3_v,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~

54 | const torch::Tensor& downsample_conv_w,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

55 | const torch::Tensor& downsample_bn_w,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:53:26: note: the first parameter in the range is 'bn3_v'

53 | const torch::Tensor& bn3_v,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:55:26: note: the last parameter in the range is 'downsample_bn_w'

55 | const torch::Tensor& downsample_bn_w,

| ^~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_10/b9_s2_resnet101_modular_functions_base/base/base.cu:94:16: warning: the parameter 'params' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

94 | py::object params,

| ^

| const &

The AI CUDA Engineer 👷

`10_ResNet101` • `resnet101_modular_functions_base_base`

Kernel Information

Related Kernels (Level 3, Task 10 • 10_ResNet101)

The AI CUDA Engineer 👷

10_ResNet101 • resnet101_modular_functions_base_base

Kernel Information

Related Kernels (Level 3, Task 10 • 10_ResNet101)

`10_ResNet101` • `resnet101_modular_functions_base_base`