← Back to Leaderboard

The AI CUDA Engineer 👷


Level 3 • Task 4
import torch
import torch.nn as nn
import torch.nn.functional as F

def module_fn(
    x: torch.Tensor,
    conv1_weight: nn.Parameter,
    conv1_bias: nn.Parameter,
    conv2_weight: nn.Parameter,
    conv2_bias: nn.Parameter,
    fc1_weight: nn.Parameter,
    fc1_bias: nn.Parameter,
    fc2_weight: nn.Parameter,
    fc2_bias: nn.Parameter,
    fc3_weight: nn.Parameter,
    fc3_bias: nn.Parameter,
) -> torch.Tensor:
    """
    Implements a LeNet-5 architecture with ReLU activation.

    The network is two (conv -> ReLU -> 2x2 max-pool) stages followed by
    three fully connected layers, with ReLU after the first two of them.

    Args:
        x (torch.Tensor): The input tensor, shape (batch_size, 1, 32, 32)
        conv1_weight (nn.Parameter): Parameters for first conv layer
        conv1_bias (nn.Parameter): Parameters for first conv layer
        conv2_weight (nn.Parameter): Parameters for second conv layer
        conv2_bias (nn.Parameter): Parameters for second conv layer
        fc1_weight (nn.Parameter): Parameters for first FC layer
        fc1_bias (nn.Parameter): Parameters for first FC layer
        fc2_weight (nn.Parameter): Parameters for second FC layer
        fc2_bias (nn.Parameter): Parameters for second FC layer
        fc3_weight (nn.Parameter): Parameters for third FC layer
        fc3_bias (nn.Parameter): Parameters for third FC layer

    Returns:
        torch.Tensor: The output tensor, shape (batch_size, num_classes)
    """
    # First convolutional layer with ReLU activation and max pooling
    x = F.conv2d(x, conv1_weight, conv1_bias, stride=1)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2, stride=2)

    # Second convolutional layer with ReLU activation and max pooling
    x = F.conv2d(x, conv2_weight, conv2_bias, stride=1)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2, stride=2)

    # Flatten the output for the fully connected layers; the pooled feature
    # map is expected to hold 16 * 5 * 5 = 400 values per sample.
    x = x.view(-1, 16 * 5 * 5)

    # First fully connected layer with ReLU activation
    x = F.linear(x, fc1_weight, fc1_bias)
    x = F.relu(x)

    # Second fully connected layer with ReLU activation
    x = F.linear(x, fc2_weight, fc2_bias)
    x = F.relu(x)

    # Final fully connected layer (raw scores, no activation)
    x = F.linear(x, fc3_weight, fc3_bias)

    return x

class Model(nn.Module):
    """LeNet-5 whose weights are held as standalone ``nn.Parameter`` attributes.

    The forward computation itself is delegated to a functional
    implementation (``module_fn`` by default) that receives every parameter
    explicitly.
    """

    def __init__(self, num_classes):
        """
        LeNet-5 architecture implementation in PyTorch.

        :param num_classes: The number of output classes.
        """
        # Zero-argument super() does not look the class up by name, so it
        # keeps working even if another class called ``Model`` is defined later.
        super().__init__()

        # Extract parameters from convolutional layers; the temporary layers
        # exist only to provide the initial weight and bias values.
        conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1)
        self.conv1_weight = nn.Parameter(conv1.weight.data.clone())
        self.conv1_bias = nn.Parameter(conv1.bias.data.clone())

        conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1)
        self.conv2_weight = nn.Parameter(conv2.weight.data.clone())
        self.conv2_bias = nn.Parameter(conv2.bias.data.clone())

        # Extract parameters from fully connected layers
        fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc1_weight = nn.Parameter(fc1.weight.data.clone())
        self.fc1_bias = nn.Parameter(fc1.bias.data.clone())

        fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc2_weight = nn.Parameter(fc2.weight.data.clone())
        self.fc2_bias = nn.Parameter(fc2.bias.data.clone())

        fc3 = nn.Linear(in_features=84, out_features=num_classes)
        self.fc3_weight = nn.Parameter(fc3.weight.data.clone())
        self.fc3_bias = nn.Parameter(fc3.bias.data.clone())

    def forward(self, x, fn=module_fn):
        """
        Run the forward pass through ``fn``.

        :param x: The input tensor, shape (batch_size, 1, 32, 32)
        :param fn: Callable taking ``x`` followed by the ten weight/bias
            tensors in the order of ``module_fn``'s signature.
        :return: Whatever ``fn`` returns; for ``module_fn`` this is a tensor
            of shape (batch_size, num_classes).
        """
        return fn(
            x,
            self.conv1_weight,
            self.conv1_bias,
            self.conv2_weight,
            self.conv2_bias,
            self.fc1_weight,
            self.fc1_bias,
            self.fc2_weight,
            self.fc2_bias,
            self.fc3_weight,
            self.fc3_bias,
        )

# Benchmark harness configuration for the LeNet-5 model
batch_size = 1
num_classes = 10


def get_inputs():
    """Return the positional arguments for one forward call.

    The single entry is a standard-normal tensor of shape
    (batch_size, 1, 32, 32).
    """
    sample = torch.randn(batch_size, 1, 32, 32)
    return [sample]


def get_init_inputs():
    """Return the positional arguments for constructing the model."""
    init_args = [num_classes]
    return init_args
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """LeNet-5 built from standard ``nn.Conv2d`` and ``nn.Linear`` layers."""

    def __init__(self, num_classes):
        """
        LeNet-5 architecture implementation in PyTorch.

        :param num_classes: The number of output classes.
        """
        # Zero-argument super() does not look the class up by name, so it
        # keeps working even if another class called ``Model`` exists.
        super().__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1)
        # Fully connected layers
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=num_classes)

    def forward(self, x):
        """
        Forward pass of the LeNet-5 model.

        :param x: The input tensor, shape (batch_size, 1, 32, 32)
        :return: The output tensor, shape (batch_size, num_classes)
        """
        # First convolutional layer with ReLU activation and max pooling
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        # Second convolutional layer with ReLU activation and max pooling
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        # Flatten the output for the fully connected layers
        x = x.view(-1, 16 * 5 * 5)
        # First fully connected layer with ReLU activation
        x = F.relu(self.fc1(x))
        # Second fully connected layer with ReLU activation
        x = F.relu(self.fc2(x))
        # Final fully connected layer (raw scores, no activation)
        x = self.fc3(x)
        return x

# Benchmark harness configuration for the LeNet-5 model
batch_size = 1
num_classes = 10


def get_inputs():
    """Return the positional arguments for one forward call.

    The single entry is a standard-normal tensor of shape
    (batch_size, 1, 32, 32).
    """
    sample = torch.randn(batch_size, 1, 32, 32)
    return [sample]


def get_init_inputs():
    """Return the positional arguments for constructing the model."""
    init_args = [num_classes]
    return init_args

Kernel Information

Related Kernels (Level 3, Task 4 • 4_LeNet5)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 4_LeNet5_fused_even_edit_1 0.05 2.38 1.47
🥇 4_LeNet5_fused_even_base 0.05 2.38 1.47
🥉 warp_uniform_optimized_base 0.05 2.29 1.41
4 modular_device_functions_base_base 0.05 2.25 1.38
4 4_LeNet5 0.05 2.25 1.38
4 optimized_sync_4_lenet5_base 0.05 2.25 1.38
4 4_LeNet5_strided_loops_edit_1 0.05 2.25 1.38
4 4_LeNet5_warp_divergence_edit_1 0.05 2.25 1.38
4 4_LeNet5_atomic_optimization_base 0.05 2.25 1.38
4 4_LeNet5_unroll_loops_base 0.05 2.25 1.38
4 4_lenet5_shared_mem_optimization_base 0.05 2.25 1.38
4 4_LeNet5_strided_loops_base 0.05 2.25 1.38
4 4_LeNet5_shared_memory_reduction_base 0.05 2.25 1.38
14 4_lenet5_workload_balancing_base 0.05 2.20 1.36
14 4_LeNet5_shared_memory_atomic_base 0.05 2.20 1.36
14 4_LeNet5_shared_memory_base 0.05 2.20 1.36
14 4_lenet5_memory_coalescing_edit_1 0.05 2.20 1.36
14 balanced_workload_distribution_base 0.05 2.20 1.36
14 no_divergence_fast_base 0.05 2.20 1.36
14 4_LeNet5_unroll_loops_edit_1 0.05 2.20 1.36
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>

// In-place ReLU over `size` floats: input[i] = max(0, input[i]).
//
// Threads walk the buffer with a grid-stride loop, so any launch
// configuration covers the whole array. The bulk of the data is processed
// four floats at a time through float4 loads/stores; the last size % 4
// elements are handled one by one.
//
// Precondition: `input` must be 16-byte aligned, because it is accessed
// through a float4 pointer.
__global__ void relu_kernel(float* input, int size) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Vector loading for better memory throughput
    float4* input4 = reinterpret_cast<float4*>(input);
    const int size4 = size / 4;
    for (int i = idx; i < size4; i += stride) {
        float4 val = input4[i];
        val.x = fmaxf(0.0f, val.x);
        val.y = fmaxf(0.0f, val.y);
        val.z = fmaxf(0.0f, val.z);
        val.w = fmaxf(0.0f, val.w);
        input4[i] = val;
    }

    // Handle remaining elements. Thread k starts at the k-th leftover
    // element; starting at idx * 4 instead would skip every leftover
    // element except the first.
    for (int i = size4 * 4 + idx; i < size; i += stride) {
        input[i] = fmaxf(0.0f, input[i]);
    }
}

// 2D max pooling with one thread per output element.
//
// `input` is indexed as a dense (batch, channel, height, width) array and
// `output` as a dense (batch, channel, out_h, out_w) array, where
//   out_h = (height - pool_height) / stride + 1
//   out_w = (width  - pool_width)  / stride + 1
// i.e. no padding is applied and the same stride is used in both directions.
// Threads whose global index is past the last output element do nothing.
// Each thread reads its window straight from global memory; no shared
// memory is used.
__global__ void max_pool2d_kernel(
    const float* __restrict__ input, float* __restrict__ output,
    int batch_size, int channels, int height, int width,
    int pool_height, int pool_width, int stride
) {
    const int out_h = (height - pool_height) / stride + 1;
    const int out_w = (width - pool_width) / stride + 1;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < batch_size * channels * out_h * out_w) {
        // Decompose the flat output index into (b, c, h, w).
        const int b = idx / (channels * out_h * out_w);
        const int c = (idx / (out_h * out_w)) % channels;
        const int h = (idx / out_w) % out_h;
        const int w = idx % out_w;

        // Top-left corner of this output element's window in the input.
        const int in_h_start = h * stride;
        const int in_w_start = w * stride;

        // Scan the pooling window and keep the largest value.
        float max_val = -FLT_MAX;
        for (int i = 0; i < pool_height; ++i) {
            for (int j = 0; j < pool_width; ++j) {
                const float val = input[((b * channels + c) * height + in_h_start + i) * width + in_w_start + j];
                max_val = fmaxf(max_val, val);
            }
        }
        output[idx] = max_val;
    }
}

// Fully connected layer for a single input vector:
//   output[o] = bias[o] + sum_i input[i] * weight[o * in_features + i]
//
// Every block first copies the whole input vector into dynamic shared
// memory, so the kernel must be launched with at least
// in_features * sizeof(float) bytes of dynamic shared memory. One thread then
// produces one output feature.
//
// When in_features is a multiple of 4 the dot product is evaluated four
// elements at a time through float4 loads; otherwise a scalar loop is used,
// because the float4 path would read past the end of the row and from
// misaligned weight addresses.
// NOTE(review): the float4 path assumes `weight` itself is 16-byte aligned —
// confirm for any caller that passes a sliced or offset tensor.
__global__ void linear_kernel(
    const float* __restrict__ input,
    const float* __restrict__ weight,
    const float* __restrict__ bias,
    float* __restrict__ output,
    int in_features, int out_features
) {
    extern __shared__ float shared_input[];

    // Load input into shared memory (all threads of the block cooperate).
    for (int i = threadIdx.x; i < in_features; i += blockDim.x) {
        shared_input[i] = input[i];
    }
    __syncthreads();  // Single sync point for shared memory consistency

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < out_features) {
        const float* weight_row = weight + idx * in_features;
        float val = bias[idx];

        if (in_features % 4 == 0) {
            // Vectorized computation
            for (int i = 0; i < in_features; i += 4) {
                const float4 in_vec = *reinterpret_cast<const float4*>(&shared_input[i]);
                const float4 weight_vec = *reinterpret_cast<const float4*>(&weight_row[i]);
                val += in_vec.x * weight_vec.x + in_vec.y * weight_vec.y +
                       in_vec.z * weight_vec.z + in_vec.w * weight_vec.w;
            }
        } else {
            // Scalar fallback for row lengths that are not a multiple of 4.
            for (int i = 0; i < in_features; ++i) {
                val += shared_input[i] * weight_row[i];
            }
        }
        output[idx] = val;
    }
}

// LeNet-5 forward pass.
//
// Pipeline: conv1 -> ReLU -> 2x2 max-pool -> conv2 -> ReLU -> 2x2 max-pool ->
// flatten -> fc1 -> ReLU -> fc2 -> ReLU -> fc3. Convolution, pooling and the
// linear layers use the corresponding torch:: operators; every ReLU is
// applied in place by relu_kernel.
//
// Returns the fc3 output, one row per input sample.
torch::Tensor forward(
    torch::Tensor x,
    torch::Tensor conv1_weight, torch::Tensor conv1_bias,
    torch::Tensor conv2_weight, torch::Tensor conv2_bias,
    torch::Tensor fc1_weight, torch::Tensor fc1_bias,
    torch::Tensor fc2_weight, torch::Tensor fc2_bias,
    torch::Tensor fc3_weight, torch::Tensor fc3_bias
) {
    // Move tensors to CUDA with contiguous memory layout.
    // NOTE(review): the option chain is assumed from the comment above
    // (CUDA + contiguous) and from the data_ptr<float>() calls below
    // (float32) — confirm against the build that was benchmarked.
    const auto options = torch::TensorOptions()
        .device(torch::kCUDA)
        .dtype(torch::kFloat32)
        .memory_format(torch::MemoryFormat::Contiguous);
    x = x.to(options);
    conv1_weight = conv1_weight.to(options);
    conv1_bias = conv1_bias.to(options);
    conv2_weight = conv2_weight.to(options);
    conv2_bias = conv2_bias.to(options);
    fc1_weight = fc1_weight.to(options);
    fc1_bias = fc1_bias.to(options);
    fc2_weight = fc2_weight.to(options);
    fc2_bias = fc2_bias.to(options);
    fc3_weight = fc3_weight.to(options);
    fc3_bias = fc3_bias.to(options);

    const int block_size = 256;
    const int max_blocks = 32;

    // Applies relu_kernel in place to `t`. The kernel is enqueued on the
    // stream PyTorch is currently using so it is ordered after the operator
    // that produced `t`, and launch failures are reported instead of ignored.
    auto apply_relu = [&](torch::Tensor& t) {
        const int64_t numel = t.numel();
        const int count = static_cast<int>(numel);
        // relu_kernel takes an int element count; reject sizes that do not
        // survive the conversion.
        TORCH_CHECK(static_cast<int64_t>(count) == numel,
                    "relu_kernel: tensor has too many elements for an int count");
        cudaStream_t stream = at::cuda::getCurrentCUDAStream();
        relu_kernel<<<max_blocks, block_size, 0, stream>>>(t.data_ptr<float>(), count);
        const cudaError_t err = cudaGetLastError();
        TORCH_CHECK(err == cudaSuccess,
                    "relu_kernel launch failed: ", cudaGetErrorString(err));
    };

    // Convolution layers
    auto conv1 = torch::conv2d(x, conv1_weight, conv1_bias, {1, 1});
    apply_relu(conv1);
    auto pool1 = torch::max_pool2d(conv1, {2, 2}, {2, 2});

    auto conv2 = torch::conv2d(pool1, conv2_weight, conv2_bias, {1, 1});
    apply_relu(conv2);
    auto pool2 = torch::max_pool2d(conv2, {2, 2}, {2, 2});

    // Collapse everything but the batch dimension.
    auto flat = pool2.view({pool2.size(0), -1});

    // Fully connected layers
    auto fc1 = torch::linear(flat, fc1_weight, fc1_bias);
    apply_relu(fc1);

    auto fc2 = torch::linear(fc1, fc2_weight, fc2_bias);
    apply_relu(fc2);

    auto fc3 = torch::linear(fc2, fc3_weight, fc3_bias);

    return fc3;
}

// Expose forward() to Python as <extension>.forward.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "LeNet-5 forward pass");
}
Performance Metrics
Metric Value Unit Variance Samples
Analysis Rules
Rule Description
Operation / Metric Value Unit
CPU Time 1020937.01 μs
Device Time 262941.57 μs
Self CPU Time 39446.47 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
CPU Time 981490.53 μs
Device Time 262941.57 μs
Self CPU Time 49504.86 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
CPU Time 931985.67 μs
Device Time 262941.57 μs
Self CPU Time 103355.72 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
CPU Time 670049.65 μs
Device Time 37750.41 μs
Self CPU Time 670049.65 μs
Self Device Time 37750.41 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
CPU Time 647694.12 μs
Device Time 141888.64 μs
Self CPU Time 57202.65 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
CPU Time 108389.79 μs
Device Time 921677.43 μs
Self CPU Time 24300.92 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
CPU Time 84090.60 μs
Device Time 921677.43 μs
Self CPU Time 29813.76 μs
Self Device Time 921677.43 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 921677.43 μs
Self CPU Time 0.00 μs
Self Device Time 921677.43 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45313 warnings generated when compiling for host.
Suppressed 45346 warnings (45299 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
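/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:11:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]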
11 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:12:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
12 | int stride = blockDim.x * gridDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:36:47: warning: 2 adjacent parameters of 'max_pool2d_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
36 | int batch_size, int channels, int height, int width,
| ^~~~~~~~~~
37 | int pool_height, int pool_width, int stride
| ~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:36:51: note: the first parameter in the range is 'width'
36 | int batch_size, int channels, int height, int width,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:37:9: note: the last parameter in the range is 'pool_height'
37 | int pool_height, int pool_width, int stride
| ^~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:43:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
43 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:68:5: warning: 3 adjacent parameters of 'linear_kernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
68 | const float* __restrict__ input,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
69 | const float* __restrict__ weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
70 | const float* __restrict__ bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:68:31: note: the first parameter in the range is 'input'
68 | const float* __restrict__ input,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:70:31: note: the last parameter in the range is 'bias'
70 | const float* __restrict__ bias,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:72:5: warning: 2 adjacent parameters of 'linear_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
72 | int in_features, int out_features
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:72:9: note: the first parameter in the range is 'in_features'
72 | int in_features, int out_features
| ^~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:72:26: note: the last parameter in the range is 'out_features'
72 | int in_features, int out_features
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:77:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
77 | for (int i = threadIdx.x; i < in_features; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:77:53: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
77 | for (int i = threadIdx.x; i < in_features; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:82:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
82 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:127:70: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
127 | relu_kernel<<<max_blocks, block_size>>>(conv1.data_ptr<float>(), conv1.numel());
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:131:70: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
131 | relu_kernel<<<max_blocks, block_size>>>(conv2.data_ptr<float>(), conv2.numel());
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:137:12: warning: Value stored to 'shared_mem_size' during its initialization is never read [clang-analyzer-deadcode.DeadStores]
137 | size_t shared_mem_size = flat.size(1) * sizeof(float);
| ^~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:137:12: note: Value stored to 'shared_mem_size' during its initialization is never read
137 | size_t shared_mem_size = flat.size(1) * sizeof(float);
| ^~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:139:68: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
139 | relu_kernel<<<max_blocks, block_size>>>(fc1.data_ptr<float>(), fc1.numel());
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b2_s3_optimized_sync_4_lenet5/base/base.cu:142:68: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
142 | relu_kernel<<<max_blocks, block_size>>>(fc2.data_ptr<float>(), fc2.numel());
| ^