
The AI CUDA Engineer 👷

4_LeNet5 • modular_device_functions_base_base

Level 3 • Task 4

Kernel Information

Related Kernels (Level 3, Task 4 • 4_LeNet5)

| Rank | Kernel Name | Runtime (ms) | Speedup (Native) | Speedup (Compile) |
|------|-------------|--------------|------------------|-------------------|
| 🥇 | 4_LeNet5_fused_even_edit_1 | 0.05 | 2.38 | 1.47 |
| 🥇 | 4_LeNet5_fused_even_base | 0.05 | 2.38 | 1.47 |
| 🥉 | warp_uniform_optimized_base | 0.05 | 2.29 | 1.41 |
| 4 | modular_device_functions_base_base | 0.05 | 2.25 | 1.38 |
| 4 | 4_LeNet5 | 0.05 | 2.25 | 1.38 |
| 4 | optimized_sync_4_lenet5_base | 0.05 | 2.25 | 1.38 |
| 4 | 4_LeNet5_strided_loops_edit_1 | 0.05 | 2.25 | 1.38 |
| 4 | 4_LeNet5_warp_divergence_edit_1 | 0.05 | 2.25 | 1.38 |
| 4 | 4_LeNet5_atomic_optimization_base | 0.05 | 2.25 | 1.38 |
| 4 | 4_LeNet5_unroll_loops_base | 0.05 | 2.25 | 1.38 |
| 4 | 4_lenet5_shared_mem_optimization_base | 0.05 | 2.25 | 1.38 |
| 4 | 4_LeNet5_strided_loops_base | 0.05 | 2.25 | 1.38 |
| 4 | 4_LeNet5_shared_memory_reduction_base | 0.05 | 2.25 | 1.38 |
| 14 | 4_lenet5_workload_balancing_base | 0.05 | 2.20 | 1.36 |
| 14 | 4_LeNet5_shared_memory_atomic_base | 0.05 | 2.20 | 1.36 |
| 14 | 4_LeNet5_shared_memory_base | 0.05 | 2.20 | 1.36 |
| 14 | 4_lenet5_memory_coalescing_edit_1 | 0.05 | 2.20 | 1.36 |
| 14 | balanced_workload_distribution_base | 0.05 | 2.20 | 1.36 |
| 14 | no_divergence_fast_base | 0.05 | 2.20 | 1.36 |
| 14 | 4_LeNet5_unroll_loops_edit_1 | 0.05 | 2.20 | 1.36 |
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>
#include <cfloat>  // FLT_MAX, used in max_pool
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>

// Device function for ReLU activation
__device__ float relu(float x) {
    return fmaxf(0.0f, x);
}

// Optimized ReLU kernel using device function
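// The grid-stride loop lets the fixed <max_blocks, block_size> launch used below
// cover a tensor of any size, with each thread handling multiple elements.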
__global__ void relu_kernel(float* input, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = idx; i < size; i += stride) {
        input[i] = relu(input[i]);
    }
}

// Device function for max pooling
__device__ float max_pool(const float* input, int height, int width, int in_h_start, int in_w_start, int pool_height, int pool_width) {
    float max_val = -FLT_MAX;
    for (int i = in_h_start; i < in_h_start + pool_height; ++i) {
        for (int j = in_w_start; j < in_w_start + pool_width; ++j) {
            float val = input[i * width + j];
            max_val = fmaxf(max_val, val);
        }
    }
    return max_val;
}

// Optimized max pooling kernel using device function
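// Note: this kernel is not launched by forward() below, which calls torch::max_pool2d instead.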
__global__ void max_pool2d_kernel(
    const float* __restrict__ input, float* __restrict__ output,
    int batch_size, int channels, int height, int width,
    int pool_height, int pool_width, int stride
) {
    int out_h = (height - pool_height) / stride + 1;
    int out_w = (width - pool_width) / stride + 1;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_threads = blockDim.x * gridDim.x;

    for (; idx < batch_size * channels * out_h * out_w; idx += stride_threads) {
        int b = idx / (channels * out_h * out_w);
        int c = (idx / (out_h * out_w)) % channels;
        int h = (idx / out_w) % out_h;
        int w = idx % out_w;

        int in_h_start = h * stride;
        int in_w_start = w * stride;

        output[idx] = max_pool(&input[((b * channels + c) * height) * width], height, width, in_h_start, in_w_start, pool_height, pool_width);
    }
}

// Device function for linear transformation
__device__ float linear_transform(const float* input, const float* weight, float bias, int in_features) {
    float val = bias;
    for (int i = 0; i < in_features; ++i) {
        val += input[i] * weight[i];
    }
    return val;
}

// Optimized linear layer kernel using device function
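// Note: computes out_features outputs for a single input sample; forward() below
// calls torch::linear rather than launching this kernel.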
__global__ void linear_kernel(
    const float* __restrict__ input,
    const float* __restrict__ weight,
    const float* __restrict__ bias,
    float* __restrict__ output,
    int in_features, int out_features
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (; idx < out_features; idx += stride) {
        output[idx] = linear_transform(input, &weight[idx * in_features], bias[idx], in_features);
    }
}

// Forward function for the LeNet-5 architecture
torch::Tensor forward(
    torch::Tensor x,
    torch::Tensor conv1_weight, torch::Tensor conv1_bias,
    torch::Tensor conv2_weight, torch::Tensor conv2_bias,
    torch::Tensor fc1_weight, torch::Tensor fc1_bias,
    torch::Tensor fc2_weight, torch::Tensor fc2_bias,
    torch::Tensor fc3_weight, torch::Tensor fc3_bias
) {
    // Ensure inputs are on CUDA
    x = x.to(torch::kCUDA);
    conv1_weight = conv1_weight.to(torch::kCUDA);
    conv1_bias = conv1_bias.to(torch::kCUDA);
    conv2_weight = conv2_weight.to(torch::kCUDA);
    conv2_bias = conv2_bias.to(torch::kCUDA);
    fc1_weight = fc1_weight.to(torch::kCUDA);
    fc1_bias = fc1_bias.to(torch::kCUDA);
    fc2_weight = fc2_weight.to(torch::kCUDA);
    fc2_bias = fc2_bias.to(torch::kCUDA);
    fc3_weight = fc3_weight.to(torch::kCUDA);
    fc3_bias = fc3_bias.to(torch::kCUDA);

    const int block_size = 256;
    const int max_blocks = 32;
    // Launch the custom kernels on PyTorch's current CUDA stream so they stay
    // ordered with the ATen ops used below.
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // First convolutional layer
    auto conv1 = torch::conv2d(x, conv1_weight, conv1_bias, {1, 1});
    relu_kernel<<<max_blocks, block_size, 0, stream>>>(conv1.data_ptr<float>(), conv1.numel());
    auto pool1 = torch::max_pool2d(conv1, {2, 2}, {2, 2});

    // Second convolutional layer
    auto conv2 = torch::conv2d(pool1, conv2_weight, conv2_bias, {1, 1});
    relu_kernel<<<max_blocks, block_size, 0, stream>>>(conv2.data_ptr<float>(), conv2.numel());
    auto pool2 = torch::max_pool2d(conv2, {2, 2}, {2, 2});

    // Flatten the output
    auto flat = pool2.view({pool2.size(0), -1});

    // First fully connected layer
    auto fc1 = torch::linear(flat, fc1_weight, fc1_bias);
    relu_kernel<<<max_blocks, block_size, 0, stream>>>(fc1.data_ptr<float>(), fc1.numel());

    // Second fully connected layer
    auto fc2 = torch::linear(fc1, fc2_weight, fc2_bias);
    relu_kernel<<<max_blocks, block_size, 0, stream>>>(fc2.data_ptr<float>(), fc2.numel());

    // Final fully connected layer
    auto fc3 = torch::linear(fc2, fc3_weight, fc3_bias);

    return fc3;
}

// PyBind11 module definition
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "LeNet-5 forward pass");
}
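
Usage note (not part of the kernel listing): the sketch below shows one way the source above could be built and exercised as a PyTorch CUDA extension via torch.utils.cpp_extension.load. The file name lenet5_kernels.cu and the classic LeNet-5 shapes (1×32×32 input, 6 and 16 conv filters, 400→120→84→10 fully connected widths) are assumptions for illustration; they are consistent with what forward() expects but are not stated on this page.

# Usage sketch (assumed file name "lenet5_kernels.cu", classic LeNet-5 shapes).
import torch
from torch.utils.cpp_extension import load

# Compile the .cu listing above into a loadable extension module.
ext = load(name="lenet5_kernels", sources=["lenet5_kernels.cu"])

# Random weights in LeNet-5 shapes, just to exercise the forward pass.
x = torch.randn(8, 1, 32, 32, device="cuda")
conv1_w, conv1_b = torch.randn(6, 1, 5, 5, device="cuda"), torch.randn(6, device="cuda")
conv2_w, conv2_b = torch.randn(16, 6, 5, 5, device="cuda"), torch.randn(16, device="cuda")
fc1_w, fc1_b = torch.randn(120, 400, device="cuda"), torch.randn(120, device="cuda")
fc2_w, fc2_b = torch.randn(84, 120, device="cuda"), torch.randn(84, device="cuda")
fc3_w, fc3_b = torch.randn(10, 84, device="cuda"), torch.randn(10, device="cuda")

out = ext.forward(x, conv1_w, conv1_b, conv2_w, conv2_b,
                  fc1_w, fc1_b, fc2_w, fc2_b, fc3_w, fc3_b)
print(out.shape)  # torch.Size([8, 10])

With these shapes, each 5×5 convolution shrinks the spatial size by 4 and each 2×2 pool halves it (32 → 28 → 14 → 10 → 5), so pool2 flattens to 16×5×5 = 400 features, matching fc1_weight's input width.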