Kernel Details - balanced_workload_distribution

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    conv1_weight: nn.Parameter,
    conv1_bias: nn.Parameter,
    conv2_weight: nn.Parameter,
    conv2_bias: nn.Parameter,
    fc1_weight: nn.Parameter,
    fc1_bias: nn.Parameter,
    fc2_weight: nn.Parameter,
    fc2_bias: nn.Parameter,
    fc3_weight: nn.Parameter,
    fc3_bias: nn.Parameter,
) -> torch.Tensor:
    """
    Implements a LeNet-5 architecture with ReLU activation.

    The network is two (conv -> ReLU -> 2x2 max-pool) stages followed by
    three fully connected layers, with ReLU after the first two.

    Args:
        x (torch.Tensor): The input tensor, shape (batch_size, 1, 32, 32)
        conv1_weight (nn.Parameter): Weight of the first conv layer
        conv1_bias (nn.Parameter): Bias of the first conv layer
        conv2_weight (nn.Parameter): Weight of the second conv layer
        conv2_bias (nn.Parameter): Bias of the second conv layer
        fc1_weight (nn.Parameter): Weight of the first FC layer
        fc1_bias (nn.Parameter): Bias of the first FC layer
        fc2_weight (nn.Parameter): Weight of the second FC layer
        fc2_bias (nn.Parameter): Bias of the second FC layer
        fc3_weight (nn.Parameter): Weight of the third FC layer
        fc3_bias (nn.Parameter): Bias of the third FC layer

    Returns:
        torch.Tensor: The output tensor, shape (batch_size, num_classes)

    Raises:
        RuntimeError: If the number of features per sample after the second
            pooling stage does not match the input width of ``fc1_weight``.
    """
    # First convolutional layer with ReLU activation and max pooling
    x = F.conv2d(x, conv1_weight, conv1_bias, stride=1)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2, stride=2)

    # Second convolutional layer with ReLU activation and max pooling
    x = F.conv2d(x, conv2_weight, conv2_bias, stride=1)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2, stride=2)

    # Flatten the output for the fully connected layers.
    # The flatten width is taken from fc1_weight instead of a hard-coded
    # 16 * 5 * 5 so the function follows whatever parameters it is given.
    fc1_in_features = fc1_weight.shape[1]
    features_per_sample = x.shape[-3] * x.shape[-2] * x.shape[-1]
    if features_per_sample != fc1_in_features:
        # Without this check, a mismatching activation whose total element
        # count happens to be divisible by the flatten width would be
        # silently regrouped into a different number of rows.
        raise RuntimeError(
            f"module_fn: expected {fc1_in_features} features per sample after "
            f"the second pooling stage, got {features_per_sample} "
            f"(activation shape {tuple(x.shape)})"
        )
    # reshape (rather than view) also accepts non-contiguous activations,
    # e.g. those produced from a channels_last input.
    x = x.reshape(-1, fc1_in_features)

    # First fully connected layer with ReLU activation
    x = F.linear(x, fc1_weight, fc1_bias)
    x = F.relu(x)

    # Second fully connected layer with ReLU activation
    x = F.linear(x, fc2_weight, fc2_bias)
    x = F.relu(x)

    # Final fully connected layer
    x = F.linear(x, fc3_weight, fc3_bias)

    return x


class Model(nn.Module):
    """LeNet-5 whose weights live as flat ``nn.Parameter`` attributes."""

    def __init__(self, num_classes):
        """
        LeNet-5 architecture implementation in PyTorch.

        :param num_classes: The number of output classes.
        """
        super().__init__()

        # Reference layers are built in network order (conv1, conv2, fc1,
        # fc2, fc3) purely to obtain their default-initialised tensors; only
        # detached copies of each weight and bias are kept on the module.
        reference_layers = (
            ("conv1", nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1)),
            ("conv2", nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1)),
            ("fc1", nn.Linear(in_features=16 * 5 * 5, out_features=120)),
            ("fc2", nn.Linear(in_features=120, out_features=84)),
            ("fc3", nn.Linear(in_features=84, out_features=num_classes)),
        )
        for prefix, layer in reference_layers:
            setattr(self, f"{prefix}_weight", nn.Parameter(layer.weight.detach().clone()))
            setattr(self, f"{prefix}_bias", nn.Parameter(layer.bias.detach().clone()))

    def forward(self, x, fn=module_fn):
        """
        Call ``fn`` with the input followed by every stored parameter.

        :param x: The input tensor handed to ``fn`` as its first argument.
        :param fn: Callable that receives ``x`` and then the weight/bias pairs
            for conv1, conv2, fc1, fc2 and fc3, in that order.
        :return: Whatever ``fn`` returns.
        """
        ordered_params = (
            self.conv1_weight,
            self.conv1_bias,
            self.conv2_weight,
            self.conv2_bias,
            self.fc1_weight,
            self.fc1_bias,
            self.fc2_weight,
            self.fc2_bias,
            self.fc3_weight,
            self.fc3_bias,
        )
        return fn(x, *ordered_params)


# Harness configuration for the LeNet-5 model: the batch size of the random
# input and the number of classes passed to the model constructor.
batch_size = 1
num_classes = 10


def get_inputs():
    """Return the forward-pass inputs: one random (batch_size, 1, 32, 32) tensor."""
    input_shape = (batch_size, 1, 32, 32)
    return [torch.randn(input_shape)]


def get_init_inputs():
    """Return the constructor arguments for the model: ``[num_classes]``."""
    init_args = [num_classes]
    return init_args

import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self, num_classes):
        """
        Build the LeNet-5 layers.

        :param num_classes: The number of output classes.
        """
        super().__init__()

        # Feature extractor: two 5x5 convolutions (1 -> 6 -> 16 channels).
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1)

        # Classifier head: 400 -> 120 -> 84 -> num_classes.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """
        Run the LeNet-5 forward pass.

        :param x: The input tensor, shape (batch_size, 1, 32, 32)
        :return: The output tensor, shape (batch_size, num_classes)
        """
        # Each convolutional stage is conv -> ReLU -> 2x2 max pooling.
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), kernel_size=2, stride=2)

        # Collapse the pooled feature maps into rows of 16*5*5 values.
        flat_features = 16 * 5 * 5
        x = x.view(-1, flat_features)

        # The two hidden fully connected layers are each followed by ReLU.
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))

        # The final layer returns raw scores, with no activation.
        return self.fc3(x)

# Harness configuration for the LeNet-5 model: the batch size of the random
# input and the number of classes passed to the model constructor.
batch_size = 1
num_classes = 10

def get_inputs():
    """Return the forward-pass inputs: one random (batch_size, 1, 32, 32) tensor."""
    input_shape = (batch_size, 1, 32, 32)
    return [torch.randn(input_shape)]

def get_init_inputs():
    """Return the constructor arguments for the model: ``[num_classes]``."""
    init_args = [num_classes]
    return init_args

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	4_LeNet5
Level ID	3
Task ID	4
Kernel Name	balanced_workload_distribution_base
CUDA Speedup (Native)	2.204x
CUDA Speedup (Compile)	1.357x
CUDA Runtime	0.053 ms
PyTorch Runtime (Native)	0.117 ms
PyTorch Runtime (Compile)	0.072 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	azure-gpt-4o-2024-08-06
Temperature	0.00

View Experiment Progress Details

Related Kernels (Level 3, Task 4 • 4_LeNet5)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	4_LeNet5_fused_even_edit_1	0.05	2.38	1.47
🥇	4_LeNet5_fused_even_base	0.05	2.38	1.47
🥉	warp_uniform_optimized_base	0.05	2.29	1.41
4	modular_device_functions_base_base	0.05	2.25	1.38
4	4_LeNet5	0.05	2.25	1.38
4	optimized_sync_4_lenet5_base	0.05	2.25	1.38
4	4_LeNet5_strided_loops_edit_1	0.05	2.25	1.38
4	4_LeNet5_warp_divergence_edit_1	0.05	2.25	1.38
4	4_LeNet5_atomic_optimization_base	0.05	2.25	1.38
4	4_LeNet5_unroll_loops_base	0.05	2.25	1.38
4	4_lenet5_shared_mem_optimization_base	0.05	2.25	1.38
4	4_LeNet5_strided_loops_base	0.05	2.25	1.38
4	4_LeNet5_shared_memory_reduction_base	0.05	2.25	1.38
14	4_lenet5_workload_balancing_base	0.05	2.20	1.36
14	4_LeNet5_shared_memory_atomic_base	0.05	2.20	1.36
14	4_LeNet5_shared_memory_base	0.05	2.20	1.36
14	4_lenet5_memory_coalescing_edit_1	0.05	2.20	1.36
14	balanced_workload_distribution_base	0.05	2.20	1.36
14	no_divergence_fast_base	0.05	2.20	1.36
14	4_LeNet5_unroll_loops_edit_1	0.05	2.20	1.36

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>

#include <cfloat>
#include <limits>

// In-place ReLU over a flat float buffer.
//
// Each thread starts at its global thread index and advances by the total
// number of launched threads (gridDim.x * blockDim.x) until it passes `size`,
// so any launch with at least one thread covers the whole buffer.
//
// The loop index is held in 64 bits. With a 32-bit `int`, `i += stride` can
// overflow when `size` is close to INT_MAX; the wrapped (negative) index would
// still satisfy `i < size` and be used to write out of bounds.
//
// NOTE: fmaxf returns the non-NaN operand, so NaN elements are replaced by 0.
//
// input: device pointer to `size` floats; each is overwritten with
//        max(0, value).
// size:  number of elements in `input`.
__global__ void relu_kernel(float* input, int size) {
    const long long first = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    const long long step = static_cast<long long>(gridDim.x) * blockDim.x;
    for (long long i = first; i < size; i += step) {
        input[i] = fmaxf(0.0f, input[i]);
    }
}

// 2D max pooling (no padding, no dilation) over a float tensor indexed as
// ((b * channels + c) * height + i) * width + j.
//
// One output element is produced per loop iteration; each thread starts at
// its global thread index and advances by the total number of launched
// threads until every output element has been written.
//
// input:       device pointer to batch_size * channels * height * width floats.
// output:      device pointer to batch_size * channels * out_h * out_w floats,
//              where out_h = (height - pool_height) / stride + 1 and
//              out_w = (width - pool_width) / stride + 1.
// pool_height, pool_width: window size.
// stride:      step between windows, applied to both height and width.
//
// The kernel writes nothing when the geometry is degenerate (non-positive
// stride or window, or a window larger than the input). Without that guard,
// integer division truncating toward zero makes e.g. height=1, pool_height=2,
// stride=2 yield out_h == 1, and the window would then read past the input.
__global__ void max_pool2d_kernel(
    const float* __restrict__ input, float* __restrict__ output,
    int batch_size, int channels, int height, int width,
    int pool_height, int pool_width, int stride
) {
    if (stride <= 0 || pool_height <= 0 || pool_width <= 0 ||
        height < pool_height || width < pool_width) {
        return;
    }

    const int out_h = (height - pool_height) / stride + 1;
    const int out_w = (width - pool_width) / stride + 1;

    // 64-bit counts and indices: the element total and the running index can
    // exceed the range of a 32-bit int for large tensors.
    const long long out_plane = static_cast<long long>(out_h) * out_w;
    const long long in_plane = static_cast<long long>(height) * width;
    const long long total_elements =
        static_cast<long long>(batch_size) * channels * out_plane;
    const long long stride_threads = static_cast<long long>(blockDim.x) * gridDim.x;

    for (long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < total_elements; idx += stride_threads) {
        // Decompose the flat output index; `bc` is the fused (b * channels + c)
        // plane index, which is all the input addressing needs.
        const long long bc = idx / out_plane;
        const int h = static_cast<int>((idx / out_w) % out_h);
        const int w = static_cast<int>(idx % out_w);

        const int in_h_start = h * stride;
        const int in_w_start = w * stride;
        const float* plane = input + bc * in_plane;

        float max_val = -FLT_MAX;
        for (int i = in_h_start; i < in_h_start + pool_height; ++i) {
            for (int j = in_w_start; j < in_w_start + pool_width; ++j) {
                const float val = plane[static_cast<long long>(i) * width + j];
                max_val = fmaxf(max_val, val);
            }
        }
        output[idx] = max_val;
    }
}

// Fully connected layer for one input vector:
//   output[o] = bias[o] + sum_k input[k] * weight[o * in_features + k]
//
// Each thread computes whole output elements, starting at its global thread
// index and advancing by the total number of launched threads until
// `out_features` is reached.
//
// input:  `in_features` floats. The kernel applies no batch offset, so it
//         reads a single vector.
// weight: `out_features` rows of `in_features` floats, stored row after row.
// bias:   `out_features` floats.
// output: `out_features` floats, fully overwritten.
__global__ void linear_kernel(
    const float* __restrict__ input,
    const float* __restrict__ weight,
    const float* __restrict__ bias,
    float* __restrict__ output,
    int in_features, int out_features
) {
    const int step = blockDim.x * gridDim.x;
    int row = blockIdx.x * blockDim.x + threadIdx.x;

    while (row < out_features) {
        // Start from the bias and accumulate the dot product in index order.
        const float* w_row = weight + row * in_features;
        float acc = bias[row];
        for (int k = 0; k < in_features; ++k) {
            acc += input[k] * w_row[k];
        }
        output[row] = acc;
        row += step;
    }
}

// Applies relu_kernel in place to `t` on PyTorch's current CUDA stream.
//
// The grid is sized from `t` itself, an empty tensor is skipped (a zero-block
// grid is an invalid launch configuration), and the launch is checked so a
// failure is reported here instead of surfacing at some later CUDA call.
static void launch_relu_inplace(const torch::Tensor& t) {
    const int64_t numel = t.numel();
    if (numel == 0) {
        return;
    }
    // relu_kernel takes its element count as a 32-bit int.
    TORCH_CHECK(numel <= std::numeric_limits<int>::max(),
                "relu_kernel: tensor has too many elements (", numel, ")");
    // The kernel walks the first numel() floats of the storage, which only
    // touches every element exactly once for a dense, non-overlapping layout.
    TORCH_CHECK(t.is_non_overlapping_and_dense(),
                "relu_kernel: tensor must be non-overlapping and dense");

    constexpr int block_size = 256;
    const int num_blocks = static_cast<int>((numel + block_size - 1) / block_size);
    relu_kernel<<<num_blocks, block_size, 0, at::cuda::getCurrentCUDAStream()>>>(
        t.data_ptr<float>(), static_cast<int>(numel));
    C10_CUDA_KERNEL_LAUNCH_CHECK();
}

// Forward function for the LeNet-5 architecture.
//
// Convolutions, max pooling and linear layers are delegated to the PyTorch
// operators; only the ReLU activations use the custom relu_kernel, applied in
// place to each operator's output.
//
// x:                  input batch; moved to CUDA if it is not already there.
// conv*/fc* tensors:  weight and bias of each layer; also moved to CUDA.
// Returns the output of the third fully connected layer (no activation).
torch::Tensor forward(
    torch::Tensor x,
    torch::Tensor conv1_weight, torch::Tensor conv1_bias,
    torch::Tensor conv2_weight, torch::Tensor conv2_bias,
    torch::Tensor fc1_weight, torch::Tensor fc1_bias,
    torch::Tensor fc2_weight, torch::Tensor fc2_bias,
    torch::Tensor fc3_weight, torch::Tensor fc3_bias
) {
    // Ensure inputs are on CUDA
    x = x.to(torch::kCUDA);
    conv1_weight = conv1_weight.to(torch::kCUDA);
    conv1_bias = conv1_bias.to(torch::kCUDA);
    conv2_weight = conv2_weight.to(torch::kCUDA);
    conv2_bias = conv2_bias.to(torch::kCUDA);
    fc1_weight = fc1_weight.to(torch::kCUDA);
    fc1_bias = fc1_bias.to(torch::kCUDA);
    fc2_weight = fc2_weight.to(torch::kCUDA);
    fc2_bias = fc2_bias.to(torch::kCUDA);
    fc3_weight = fc3_weight.to(torch::kCUDA);
    fc3_bias = fc3_bias.to(torch::kCUDA);

    // Raw kernel launches go to the current device; make that the device
    // that actually holds the activations.
    const c10::cuda::CUDAGuard device_guard(x.device());

    // First convolutional layer
    auto conv1 = torch::conv2d(x, conv1_weight, conv1_bias, {1, 1});
    launch_relu_inplace(conv1);
    auto pool1 = torch::max_pool2d(conv1, {2, 2}, {2, 2});

    // Second convolutional layer
    auto conv2 = torch::conv2d(pool1, conv2_weight, conv2_bias, {1, 1});
    launch_relu_inplace(conv2);
    auto pool2 = torch::max_pool2d(conv2, {2, 2}, {2, 2});

    // Flatten the output
    auto flat = pool2.view({pool2.size(0), -1});

    // First fully connected layer
    auto fc1 = torch::linear(flat, fc1_weight, fc1_bias);
    launch_relu_inplace(fc1);

    // Second fully connected layer
    auto fc2 = torch::linear(fc1, fc2_weight, fc2_bias);
    launch_relu_inplace(fc2);

    // Final fully connected layer
    auto fc3 = torch::linear(fc2, fc3_weight, fc3_bias);

    return fc3;
}

// PyBind11 module definition.
// Registers the C++ `forward` function above under the Python name "forward"
// with the docstring "LeNet-5 forward pass". The module name comes from the
// TORCH_EXTENSION_NAME macro, which this file does not define itself.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "LeNet-5 forward pass");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples

Analysis Rules

Rule	Description

Operation / Metric	Value	Unit
aten::conv2d
CPU Time	818700.67	μs
Device Time	213068.69	μs
Self CPU Time	32225.07	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::convolution
CPU Time	786475.61	μs
Device Time	213068.69	μs
Self CPU Time	40787.30	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_convolution
CPU Time	745688.31	μs
Device Time	213068.69	μs
Self CPU Time	84438.57	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaLaunchKernel
CPU Time	499981.32	μs
Device Time	30330.90	μs
Self CPU Time	499981.32	μs
Self Device Time	30330.90	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::linear
CPU Time	528313.99	μs
Device Time	113215.57	μs
Self CPU Time	47099.50	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::zero_
CPU Time	87098.02	μs
Device Time	744525.00	μs
Self CPU Time	21199.17	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::fill_
CPU Time	65900.47	μs
Device Time	744525.00	μs
Self CPU Time	26001.96	μs
Self Device Time	744525.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char, 1>)
CPU Time	0.00	μs
Device Time	744525.00	μs
Self CPU Time	0.00	μs
Self Device Time	744525.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45313 warnings generated when compiling for host.
Suppressed 45346 warnings (45299 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

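/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:11:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]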

11 | int idx = blockIdx.x * blockDim.x + threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:12:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

12 | int stride = gridDim.x * blockDim.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:21:47: warning: 2 adjacent parameters of 'max_pool2d_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]

21 | int batch_size, int channels, int height, int width,

| ^~~~~~~~~~

22 | int pool_height, int pool_width, int stride

| ~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:21:51: note: the first parameter in the range is 'width'

21 | int batch_size, int channels, int height, int width,

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:22:9: note: the last parameter in the range is 'pool_height'

22 | int pool_height, int pool_width, int stride

| ^~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:27:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

27 | int idx = blockIdx.x * blockDim.x + threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:28:26: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

28 | int stride_threads = blockDim.x * gridDim.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:53:5: warning: 2 adjacent parameters of 'linear_kernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]

53 | const float* __restrict__ weight,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

54 | const float* __restrict__ bias,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:53:31: note: the first parameter in the range is 'weight'

53 | const float* __restrict__ weight,

| ^~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:54:31: note: the last parameter in the range is 'bias'

54 | const float* __restrict__ bias,

| ^~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:56:5: warning: 2 adjacent parameters of 'linear_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]

56 | int in_features, int out_features

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:56:9: note: the first parameter in the range is 'in_features'

56 | int in_features, int out_features

| ^~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:56:26: note: the last parameter in the range is 'out_features'

56 | int in_features, int out_features

| ^~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:58:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

58 | int idx = blockIdx.x * blockDim.x + threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:59:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

59 | int stride = blockDim.x * gridDim.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:93:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

93 | const int num_blocks = (x.numel() + block_size - 1) / block_size;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:97:70: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

97 | relu_kernel<<<num_blocks, block_size>>>(conv1.data_ptr<float>(), conv1.numel());

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:102:70: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

102 | relu_kernel<<<num_blocks, block_size>>>(conv2.data_ptr<float>(), conv2.numel());

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:110:68: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

110 | relu_kernel<<<num_blocks, block_size>>>(fc1.data_ptr<float>(), fc1.numel());

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_3/task_4/b10_s2_balanced_workload_distribution/base/base.cu:114:68: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

114 | relu_kernel<<<num_blocks, block_size>>>(fc2.data_ptr<float>(), fc2.numel());

| ^

The AI CUDA Engineer 👷

`4_LeNet5` • `balanced_workload_distribution_base`

Kernel Information

Related Kernels (Level 3, Task 4 • 4_LeNet5)

The AI CUDA Engineer 👷

4_LeNet5 • balanced_workload_distribution_base

Kernel Information

Related Kernels (Level 3, Task 4 • 4_LeNet5)

`4_LeNet5` • `balanced_workload_distribution_base`