Kernel Details - 4_LeNet5_fused_even

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    conv1_weight: nn.Parameter,
    conv1_bias: nn.Parameter,
    conv2_weight: nn.Parameter,
    conv2_bias: nn.Parameter,
    fc1_weight: nn.Parameter,
    fc1_bias: nn.Parameter,
    fc2_weight: nn.Parameter,
    fc2_bias: nn.Parameter,
    fc3_weight: nn.Parameter,
    fc3_bias: nn.Parameter,
) -> torch.Tensor:
    """
    Implements a LeNet-5 architecture with ReLU activation.

    Args:
        x (torch.Tensor): The input tensor, shape (batch_size, 1, 32, 32).
            A single unbatched image of shape (1, 32, 32) is treated as a
            batch of one.
        conv1_weight (nn.Parameter): Weight of the first conv layer
        conv1_bias (nn.Parameter): Bias of the first conv layer
        conv2_weight (nn.Parameter): Weight of the second conv layer
        conv2_bias (nn.Parameter): Bias of the second conv layer
        fc1_weight (nn.Parameter): Weight of the first FC layer
        fc1_bias (nn.Parameter): Bias of the first FC layer
        fc2_weight (nn.Parameter): Weight of the second FC layer
        fc2_bias (nn.Parameter): Bias of the second FC layer
        fc3_weight (nn.Parameter): Weight of the third FC layer
        fc3_bias (nn.Parameter): Bias of the third FC layer

    Returns:
        torch.Tensor: The output tensor, shape (batch_size, num_classes)

    Raises:
        RuntimeError: If the flattened per-sample feature count does not match
            ``fc1_weight`` (e.g. the input is not 32x32).
    """
    # Promote an unbatched image to a batch of one so that the per-sample
    # flatten below always has a leading batch dimension to preserve.
    if x.dim() == 3:
        x = x.unsqueeze(0)

    # First convolutional layer with ReLU activation and max pooling
    x = F.conv2d(x, conv1_weight, conv1_bias, stride=1)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2, stride=2)

    # Second convolutional layer with ReLU activation and max pooling
    x = F.conv2d(x, conv2_weight, conv2_bias, stride=1)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2, stride=2)

    # Flatten each sample on its own. Reshaping to (-1, 16 * 5 * 5) would
    # silently move data between samples whenever the feature count is a
    # different size; keeping the batch dimension fixed turns that into a
    # shape error in the first linear layer instead.
    x = torch.flatten(x, start_dim=1)

    # First fully connected layer with ReLU activation
    x = F.linear(x, fc1_weight, fc1_bias)
    x = F.relu(x)

    # Second fully connected layer with ReLU activation
    x = F.linear(x, fc2_weight, fc2_bias)
    x = F.relu(x)

    # Final fully connected layer
    x = F.linear(x, fc3_weight, fc3_bias)

    return x


class Model(nn.Module):
    def __init__(self, num_classes):
        """
        LeNet-5 whose weights and biases are held as plain parameters.

        Temporary ``nn.Conv2d`` / ``nn.Linear`` layers are created only to
        supply initial values; clones of their tensors are registered on the
        module as ``<layer>_weight`` and ``<layer>_bias``.

        :param num_classes: The number of output classes.
        """
        super().__init__()

        # (attribute prefix, layer that supplies the initial values)
        templates = (
            ("conv1", nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1)),
            ("conv2", nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1)),
            ("fc1", nn.Linear(in_features=16 * 5 * 5, out_features=120)),
            ("fc2", nn.Linear(in_features=120, out_features=84)),
            ("fc3", nn.Linear(in_features=84, out_features=num_classes)),
        )
        for prefix, layer in templates:
            setattr(self, f"{prefix}_weight", nn.Parameter(layer.weight.data.clone()))
            setattr(self, f"{prefix}_bias", nn.Parameter(layer.bias.data.clone()))

    def forward(self, x, fn=module_fn):
        """
        Run ``fn`` on ``x`` with this module's parameters.

        :param x: The input tensor handed to ``fn`` as its first argument.
        :param fn: Callable taking ``x`` followed by the ten parameters in
            layer order (weight then bias for conv1, conv2, fc1, fc2, fc3).
        :return: Whatever ``fn`` returns.
        """
        parameters_in_order = (
            self.conv1_weight,
            self.conv1_bias,
            self.conv2_weight,
            self.conv2_bias,
            self.fc1_weight,
            self.fc1_bias,
            self.fc2_weight,
            self.fc2_bias,
            self.fc3_weight,
            self.fc3_bias,
        )
        return fn(x, *parameters_in_order)


# Test code for the LeNet-5 model
batch_size = 1
num_classes = 10


def get_inputs():
    """Return the forward-pass arguments: one random (batch_size, 1, 32, 32) tensor."""
    image_batch = torch.randn(batch_size, 1, 32, 32)
    return [image_batch]


def get_init_inputs():
    """Return the constructor arguments for ``Model``: the number of classes."""
    init_args = [num_classes]
    return init_args

import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self, num_classes):
        """
        LeNet-5 architecture implementation in PyTorch.

        :param num_classes: The number of output classes.
        """
        super(Model, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1)
        
        # Fully connected layers
        self.fc1 = nn.Linear(in_features=16*5*5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=num_classes)
    
    def forward(self, x):
        """
        Forward pass of the LeNet-5 model.

        :param x: The input tensor, shape (batch_size, 1, 32, 32). A single
            unbatched image of shape (1, 32, 32) is treated as a batch of one.
        :return: The output tensor, shape (batch_size, num_classes)
        :raises RuntimeError: If the flattened per-sample feature count does
            not match ``fc1`` (e.g. the input is not 32x32).
        """
        # Promote an unbatched image to a batch of one so the per-sample
        # flatten below always has a batch dimension to preserve.
        if x.dim() == 3:
            x = x.unsqueeze(0)

        # First convolutional layer with ReLU activation and max pooling
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        
        # Second convolutional layer with ReLU activation and max pooling
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        
        # Flatten each sample on its own. Reshaping to (-1, 16*5*5) would
        # silently move data between samples for other feature counts; with
        # the batch dimension fixed, a mismatch fails in fc1 instead.
        x = torch.flatten(x, start_dim=1)
        
        # First fully connected layer with ReLU activation
        x = F.relu(self.fc1(x))
        
        # Second fully connected layer with ReLU activation
        x = F.relu(self.fc2(x))
        
        # Final fully connected layer
        x = self.fc3(x)
        
        return x

# Test code for the LeNet-5 model
batch_size = 1
num_classes = 10

def get_inputs():
    """Return the forward-pass arguments: one random (batch_size, 1, 32, 32) tensor."""
    image_batch = torch.randn(batch_size, 1, 32, 32)
    return [image_batch]

def get_init_inputs():
    """Return the constructor arguments for ``Model``: the number of classes."""
    init_args = [num_classes]
    return init_args

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	4_LeNet5
Level ID	3
Task ID	4
Kernel Name	4_LeNet5_fused_even_base
CUDA Speedup (Native)	2.384x
CUDA Speedup (Compile)	1.468x
CUDA Runtime	0.049 ms
PyTorch Runtime (Native)	0.117 ms
PyTorch Runtime (Compile)	0.072 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	o3-mini-2025-01-31
Temperature	1.00

View Experiment Progress Details

Related Kernels (Level 3, Task 4 • 4_LeNet5)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	4_LeNet5_fused_even_edit_1	0.05	2.38	1.47
🥇	4_LeNet5_fused_even_base	0.05	2.38	1.47
🥉	warp_uniform_optimized_base	0.05	2.29	1.41
4	modular_device_functions_base_base	0.05	2.25	1.38
4	4_LeNet5	0.05	2.25	1.38
4	optimized_sync_4_lenet5_base	0.05	2.25	1.38
4	4_LeNet5_strided_loops_edit_1	0.05	2.25	1.38
4	4_LeNet5_warp_divergence_edit_1	0.05	2.25	1.38
4	4_LeNet5_atomic_optimization_base	0.05	2.25	1.38
4	4_LeNet5_unroll_loops_base	0.05	2.25	1.38
4	4_lenet5_shared_mem_optimization_base	0.05	2.25	1.38
4	4_LeNet5_strided_loops_base	0.05	2.25	1.38
4	4_LeNet5_shared_memory_reduction_base	0.05	2.25	1.38
14	4_lenet5_workload_balancing_base	0.05	2.20	1.36
14	4_LeNet5_shared_memory_atomic_base	0.05	2.20	1.36
14	4_LeNet5_shared_memory_base	0.05	2.20	1.36
14	4_lenet5_memory_coalescing_edit_1	0.05	2.20	1.36
14	balanced_workload_distribution_base	0.05	2.20	1.36
14	no_divergence_fast_base	0.05	2.20	1.36
14	4_LeNet5_unroll_loops_edit_1	0.05	2.20	1.36

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>
#include <cfloat>
#include <limits>

// Fused kernel: applies ReLU and then 2D max pooling in one pass.
//
// `input` is read as a dense [batch, channels, height, width] float buffer and
// `output` is written as a dense [batch, channels, out_h, out_w] buffer, with
//   out_h = (height - pool_h) / stride + 1
//   out_w = (width  - pool_w) / stride + 1
// Each thread starts at its global thread index and advances by the total
// number of launched threads, so any launch configuration covers every
// output element.
//
// All offsets are computed in 64-bit: the input holds roughly
// stride * stride times as many elements as the output, so a 32-bit input
// offset can overflow even when the output element count still fits in int.
__global__ void fused_relu_pool_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    int batch, int channels,
    int height, int width,
    int pool_h, int pool_w, int stride
) {
    const int out_h = (height - pool_h) / stride + 1;
    const int out_w = (width - pool_w) / stride + 1;
    // Nothing to compute (and the index decomposition below would be
    // meaningless) when a pooled dimension is empty.
    if (out_h <= 0 || out_w <= 0) {
        return;
    }

    const long long total = static_cast<long long>(batch) * channels * out_h * out_w;
    const long long step = static_cast<long long>(blockDim.x) * gridDim.x;

    for (long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < total; idx += step) {
        // Decompose the linear output index into (b, c, h, w).
        long long rest = idx;
        const int w = static_cast<int>(rest % out_w); rest /= out_w;
        const int h = static_cast<int>(rest % out_h); rest /= out_h;
        const int c = static_cast<int>(rest % channels); rest /= channels;
        const long long b = rest;

        // Top-left corner of this output element's pooling window.
        const long long row0 = static_cast<long long>(h) * stride;
        const long long col0 = static_cast<long long>(w) * stride;
        const float* plane = input + (b * channels + c) * height * width;

        // Starting the running maximum at 0 folds the ReLU into the pooling:
        // only strictly positive inputs can raise it, so the result equals
        // max over the window of max(val, 0).
        // NOTE(review): a NaN input never satisfies `val > max_val`, so it is
        // treated like a non-positive value (the original fmaxf(val, 0.0f)
        // form behaved the same way). Confirm this is acceptable relative to
        // the unfused relu + max_pool2d path.
        float max_val = 0.0f;
        for (int i = 0; i < pool_h; i++) {
            const float* row = plane + (row0 + i) * width + col0;
            for (int j = 0; j < pool_w; j++) {
                const float val = row[j];
                if (val > max_val) {
                    max_val = val;
                }
            }
        }
        output[idx] = max_val;
    }
}

// Element-wise copy of `total` floats from `input` to `output`.
// Each thread starts at its global thread index and advances by the total
// number of launched threads. The loop counter is 64-bit so that adding the
// step cannot overflow when `total` is close to INT_MAX.
// NOTE(review): no launch of this kernel appears in the visible source.
__global__ void flatten_kernel(const float* __restrict__ input, float* __restrict__ output, int total) {
    const long long step = static_cast<long long>(blockDim.x) * gridDim.x;
    for (long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < total; idx += step) {
        output[idx] = input[idx];
    }
}

// Runs fused_relu_pool_kernel over a 4-D float32 CUDA tensor and returns the
// pooled tensor of shape [B, C, (H - pool_h) / stride + 1, (W - pool_w) / stride + 1].
static torch::Tensor fused_relu_pool(
    const torch::Tensor& conv_out, int pool_h, int pool_w, int stride
) {
    TORCH_CHECK(conv_out.is_cuda(), "fused_relu_pool: expected a CUDA tensor");
    TORCH_CHECK(conv_out.dim() == 4,
                "fused_relu_pool: expected a 4-D tensor, got ", conv_out.dim(), "-D");
    TORCH_CHECK(conv_out.scalar_type() == torch::kFloat,
                "fused_relu_pool: expected float32, got ", conv_out.scalar_type());
    TORCH_CHECK(pool_h > 0 && pool_w > 0 && stride > 0,
                "fused_relu_pool: pooling window and stride must be positive");

    // The kernel indexes its input as a dense NCHW buffer; a convolution
    // result may come back with other strides (e.g. channels_last), so
    // normalise the layout before handing over the raw pointer.
    const auto input = conv_out.contiguous();

    const int64_t B = input.size(0);
    const int64_t C = input.size(1);
    const int64_t H = input.size(2);
    const int64_t W = input.size(3);
    // A plane smaller than the window would make the kernel read past the
    // end of the plane.
    TORCH_CHECK(H >= pool_h && W >= pool_w,
                "fused_relu_pool: input plane ", H, "x", W,
                " is smaller than the pooling window ", pool_h, "x", pool_w);
    // The kernel takes its dimensions as int.
    TORCH_CHECK(input.numel() <= std::numeric_limits<int>::max(),
                "fused_relu_pool: tensor with ", input.numel(),
                " elements is too large for 32-bit kernel arguments");

    const int64_t out_h = (H - pool_h) / stride + 1;
    const int64_t out_w = (W - pool_w) / stride + 1;
    auto output = torch::empty({B, C, out_h, out_w}, input.options());

    const int64_t total = output.numel();
    if (total == 0) {
        // A launch with zero blocks is an invalid configuration.
        return output;
    }

    constexpr int threads = 256;
    const int blocks = static_cast<int>((total + threads - 1) / threads);
    // Launch on PyTorch's current stream so the kernel is ordered with the
    // surrounding ATen operations.
    fused_relu_pool_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        input.data_ptr<float>(), output.data_ptr<float>(),
        static_cast<int>(B), static_cast<int>(C),
        static_cast<int>(H), static_cast<int>(W),
        pool_h, pool_w, stride);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}

// Forward function for the LeNet-5 network. Convolutions and linear layers go
// through ATen; each ReLU + 2x2/stride-2 max-pool pair after a convolution is
// replaced by a single launch of the fused kernel.
//
// Arguments are the input batch followed by weight and bias for conv1, conv2,
// fc1, fc2 and fc3, in that order. Tensors not already on CUDA are moved
// there. Returns the fc3 output.
torch::Tensor forward(
    torch::Tensor x,
    torch::Tensor conv1_weight, torch::Tensor conv1_bias,
    torch::Tensor conv2_weight, torch::Tensor conv2_bias,
    torch::Tensor fc1_weight, torch::Tensor fc1_bias,
    torch::Tensor fc2_weight, torch::Tensor fc2_bias,
    torch::Tensor fc3_weight, torch::Tensor fc3_bias
) {
    // Move all inputs to CUDA
    x = x.to(torch::kCUDA);
    conv1_weight = conv1_weight.to(torch::kCUDA);
    conv1_bias = conv1_bias.to(torch::kCUDA);
    conv2_weight = conv2_weight.to(torch::kCUDA);
    conv2_bias = conv2_bias.to(torch::kCUDA);
    fc1_weight = fc1_weight.to(torch::kCUDA);
    fc1_bias = fc1_bias.to(torch::kCUDA);
    fc2_weight = fc2_weight.to(torch::kCUDA);
    fc2_bias = fc2_bias.to(torch::kCUDA);
    fc3_weight = fc3_weight.to(torch::kCUDA);
    fc3_bias = fc3_bias.to(torch::kCUDA);

    const int pool_h = 2, pool_w = 2, stride = 2;

    // First convolutional layer, then fused ReLU + max pool
    auto conv1 = torch::conv2d(x, conv1_weight, conv1_bias, {1, 1});
    auto pool1 = fused_relu_pool(conv1, pool_h, pool_w, stride);

    // Second convolutional layer, then fused ReLU + max pool
    auto conv2 = torch::conv2d(pool1, conv2_weight, conv2_bias, {1, 1});
    auto pool2 = fused_relu_pool(conv2, pool_h, pool_w, stride);

    // Flatten everything but the batch dimension
    auto flat = pool2.flatten(1);

    // Fully connected layers are computed with torch::linear
    auto fc1 = torch::linear(flat, fc1_weight, fc1_bias);
    fc1 = torch::relu(fc1);
    auto fc2 = torch::linear(fc1, fc2_weight, fc2_bias);
    fc2 = torch::relu(fc2);
    auto fc3 = torch::linear(fc2, fc3_weight, fc3_bias);

    return fc3;
}

// Python bindings: exposes the C++ `forward` function as `forward` on the
// extension module named by TORCH_EXTENSION_NAME.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "forward",
        &forward,
        "LeNet-5 forward pass with fused ReLU and pooling"
    );
}

Performance Metrics

Metric	Value	Unit	Variance	Samples

Analysis Rules

Rule	Description

Operation / Metric	Value	Unit
aten::conv2d
CPU Time	838096.89	μs
Device Time	228939.11	μs
Self CPU Time	32998.55	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::convolution
CPU Time	805098.34	μs
Device Time	228939.11	μs
Self CPU Time	40980.91	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_convolution
CPU Time	764117.43	μs
Device Time	228939.11	μs
Self CPU Time	84194.36	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::cudnn_convolution
CPU Time	499023.36	μs
Device Time	160191.16	μs
Self CPU Time	338654.91	μs
Self Device Time	160191.16	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::linear
CPU Time	541120.27	μs
Device Time	119774.74	μs
Self CPU Time	48302.27	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::zero_
CPU Time	88580.60	μs
Device Time	765881.89	μs
Self CPU Time	19549.06	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::fill_
CPU Time	69032.82	μs
Device Time	765881.89	μs
Self CPU Time	27380.40	μs
Self Device Time	765881.89	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char, 1>)
CPU Time	0.00	μs
Device Time	765881.89	μs
Self CPU Time	0.00	μs
Self Device Time	765881.89	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45312 warnings generated when compiling for host.
Suppressed 45346 warnings (45299 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

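/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:16:17: warning: 2 adjacent parameters of 'fused_relu_pool_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]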

16 | int height, int width,

| ^~~~~~~~~~

17 | int pool_h, int pool_w, int stride

| ~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:16:21: note: the first parameter in the range is 'width'

16 | int height, int width,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:17:9: note: the last parameter in the range is 'pool_h'

17 | int pool_h, int pool_w, int stride

| ^~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:23:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

23 | for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += blockDim.x * gridDim.x) {

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:23:79: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

23 | for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += blockDim.x * gridDim.x) {

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:53:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

53 | for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += blockDim.x * gridDim.x) {

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:53:79: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

53 | for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += blockDim.x * gridDim.x) {

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:86:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

86 | int B = conv1.size(0);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:87:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

87 | int C = conv1.size(1);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:88:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

88 | int H = conv1.size(2);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:89:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

89 | int W = conv1.size(3);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:103:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

103 | B = conv2.size(0);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:104:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

104 | C = conv2.size(1);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:105:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

105 | H = conv2.size(2);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_4/b5_s2_4_LeNet5_fused_even/base/base.cu:106:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

106 | W = conv2.size(3);

| ^

The AI CUDA Engineer 👷

`4_LeNet5` • `4_LeNet5_fused_even_base`

Kernel Information

Related Kernels (Level 3, Task 4 • 4_LeNet5)

The AI CUDA Engineer 👷

4_LeNet5 • 4_LeNet5_fused_even_base

Kernel Information

Related Kernels (Level 3, Task 4 • 4_LeNet5)

`4_LeNet5` • `4_LeNet5_fused_even_base`