
The AI CUDA Engineer 👷

4_LeNet5_warp_divergence_edit_1

Level 3 • Task 4

Kernel Information

Related Kernels (Level 3, Task 4 • 4_LeNet5)

Rank  Kernel Name                             Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇    4_LeNet5_fused_even_edit_1              0.05          2.38              1.47
🥇    4_LeNet5_fused_even_base                0.05          2.38              1.47
🥉    warp_uniform_optimized_base             0.05          2.29              1.41
4     modular_device_functions_base_base      0.05          2.25              1.38
4     4_LeNet5                                0.05          2.25              1.38
4     optimized_sync_4_lenet5_base            0.05          2.25              1.38
4     4_LeNet5_strided_loops_edit_1           0.05          2.25              1.38
4     4_LeNet5_warp_divergence_edit_1         0.05          2.25              1.38
4     4_LeNet5_atomic_optimization_base       0.05          2.25              1.38
4     4_LeNet5_unroll_loops_base              0.05          2.25              1.38
4     4_lenet5_shared_mem_optimization_base   0.05          2.25              1.38
4     4_LeNet5_strided_loops_base             0.05          2.25              1.38
4     4_LeNet5_shared_memory_reduction_base   0.05          2.25              1.38
14    4_lenet5_workload_balancing_base        0.05          2.20              1.36
14    4_LeNet5_shared_memory_atomic_base      0.05          2.20              1.36
14    4_LeNet5_shared_memory_base             0.05          2.20              1.36
14    4_lenet5_memory_coalescing_edit_1       0.05          2.20              1.36
14    balanced_workload_distribution_base     0.05          2.20              1.36
14    no_divergence_fast_base                 0.05          2.20              1.36
14    4_LeNet5_unroll_loops_edit_1            0.05          2.20              1.36
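
// Note: of the kernels below, only relu_kernel is launched by forward();
// the max-pool, flatten, and linear kernels are defined but left unused,
// as forward() calls the corresponding ATen operators instead.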
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>

// CUDA kernel for ReLU activation
__global__ void relu_kernel(float* input, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        input[idx] = fmaxf(0.0f, input[idx]);
    }
}
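
// For reference, a grid-stride variant (in the spirit of the "strided_loops"
// entries in the table above) decouples grid size from element count. This is
// an illustrative sketch only and is not launched by forward() below.
__global__ void relu_kernel_strided(float* input, int size) {
    int stride = blockDim.x * gridDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) {
        input[idx] = fmaxf(0.0f, input[idx]);
    }
}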

// CUDA kernel for 2D max pooling. Every thread loops over identical
// pool_height x pool_width window bounds, so warps execute uniformly and
// branch divergence inside the loop is minimized.
__global__ void max_pool2d_kernel_min_warp_divergence(
    const float* input, float* output,
    int batch_size, int channels, int height, int width,
    int pool_height, int pool_width, int stride
) {
    int out_h = (height - pool_height) / stride + 1;
    int out_w = (width - pool_width) / stride + 1;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < batch_size * channels * out_h * out_w) {
        int b = idx / (channels * out_h * out_w);
        int c = (idx / (out_h * out_w)) % channels;
        int h = (idx / out_w) % out_h;
        int w = idx % out_w;

        int in_h_start = h * stride;
        int in_w_start = w * stride;
        int in_h_end = in_h_start + pool_height;
        int in_w_end = in_w_start + pool_width;

        float max_val = input[((b * channels + c) * height + in_h_start) * width + in_w_start];
        for (int i = in_h_start; i < in_h_end; ++i) {
            for (int j = in_w_start; j < in_w_end; ++j) {
                float val = input[((b * channels + c) * height + i) * width + j];
                max_val = fmaxf(max_val, val);
            }
        }
        output[idx] = max_val;
    }
}
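
// Worked example of the flat-index decomposition above, assuming LeNet-5's
// first pooling stage (channels = 6, out_h = out_w = 14, so one image spans
// 6 * 14 * 14 = 1176 outputs): idx = 1000 gives b = 1000 / 1176 = 0,
// c = (1000 / 196) % 6 = 5, h = (1000 / 14) % 14 = 1, w = 1000 % 14 = 6.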

// CUDA kernel for flattening
__global__ void flatten_kernel(const float* input, float* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = input[idx];
    }
}
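
// Note: flatten_kernel is an identity copy; forward() below flattens with
// Tensor::view() instead, which reshapes without copying.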

// CUDA kernel for linear layer
__global__ void linear_kernel(
    const float* input, const float* weight, const float* bias,
    float* output, int in_features, int out_features
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < out_features) {
        float val = bias[idx];
        for (int i = 0; i < in_features; ++i) {
            val += input[i] * weight[idx * in_features + i];
        }
        output[idx] = val;
    }
}
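
// Note: linear_kernel computes one output feature per thread for a single
// input sample (no batch dimension). forward() below uses torch::linear
// instead, which handles batched input via optimized GEMM routines.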

// Forward function for the LeNet-5 architecture
torch::Tensor forward(
    torch::Tensor x,
    torch::Tensor conv1_weight, torch::Tensor conv1_bias,
    torch::Tensor conv2_weight, torch::Tensor conv2_bias,
    torch::Tensor fc1_weight, torch::Tensor fc1_bias,
    torch::Tensor fc2_weight, torch::Tensor fc2_bias,
    torch::Tensor fc3_weight, torch::Tensor fc3_bias
) {
    // Ensure inputs are on CUDA
    x = x.to(torch::kCUDA);
    conv1_weight = conv1_weight.to(torch::kCUDA);
    conv1_bias = conv1_bias.to(torch::kCUDA);
    conv2_weight = conv2_weight.to(torch::kCUDA);
    conv2_bias = conv2_bias.to(torch::kCUDA);
    fc1_weight = fc1_weight.to(torch::kCUDA);
    fc1_bias = fc1_bias.to(torch::kCUDA);
    fc2_weight = fc2_weight.to(torch::kCUDA);
    fc2_bias = fc2_bias.to(torch::kCUDA);
    fc3_weight = fc3_weight.to(torch::kCUDA);
    fc3_bias = fc3_bias.to(torch::kCUDA);

    // First convolutional layer
    auto conv1 = torch::conv2d(x, conv1_weight, conv1_bias, {1, 1});
    // Launch on PyTorch's current CUDA stream so the kernel is ordered
    // correctly with the surrounding ATen ops.
    relu_kernel<<<(conv1.numel() + 255) / 256, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
        conv1.data_ptr<float>(), static_cast<int>(conv1.numel()));
    auto pool1 = torch::max_pool2d(conv1, {2, 2}, {2, 2});

    // Second convolutional layer
    auto conv2 = torch::conv2d(pool1, conv2_weight, conv2_bias, {1, 1});
    relu_kernel<<<(conv2.numel() + 255) / 256, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
        conv2.data_ptr<float>(), static_cast<int>(conv2.numel()));
    auto pool2 = torch::max_pool2d(conv2, {2, 2}, {2, 2});

    // Flatten the output
    auto flat = pool2.view({pool2.size(0), -1});

    // First fully connected layer
    auto fc1 = torch::linear(flat, fc1_weight, fc1_bias);
    relu_kernel<<<(fc1.numel() + 255) / 256, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
        fc1.data_ptr<float>(), static_cast<int>(fc1.numel()));

    // Second fully connected layer
    auto fc2 = torch::linear(fc1, fc2_weight, fc2_bias);
    relu_kernel<<<(fc2.numel() + 255) / 256, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
        fc2.data_ptr<float>(), static_cast<int>(fc2.numel()));

    // Final fully connected layer
    auto fc3 = torch::linear(fc2, fc3_weight, fc3_bias);

    return fc3;
}

// PyBind11 module definition
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "LeNet-5 forward pass");
}
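
For reference, a minimal sketch of calling forward directly from C++ with randomly initialized parameters (shapes assume the classic LeNet-5 configuration: 1×32×32 grayscale input, 10 classes; a trained model would supply real weights):

// Hypothetical smoke test; weight shapes follow standard LeNet-5.
torch::Tensor x = torch::randn({4, 1, 32, 32}, torch::kCUDA);
torch::Tensor out = forward(
    x,
    torch::randn({6, 1, 5, 5}, torch::kCUDA),  torch::randn({6}, torch::kCUDA),
    torch::randn({16, 6, 5, 5}, torch::kCUDA), torch::randn({16}, torch::kCUDA),
    torch::randn({120, 400}, torch::kCUDA),    torch::randn({120}, torch::kCUDA),
    torch::randn({84, 120}, torch::kCUDA),     torch::randn({84}, torch::kCUDA),
    torch::randn({10, 84}, torch::kCUDA),      torch::randn({10}, torch::kCUDA));
// out has shape [4, 10]: per-sample class logits.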