← Back to Leaderboard

The AI CUDA Engineer 👷

95_CrossEntropyLossce_loss_unroll_optimized_base

Level 1 • Task 95

Kernel Information

Related Kernels (Level 1, Task 95 • 95_CrossEntropyLoss)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 95_CrossEntropyLoss 0.01 8.97 2.45
🥇 memory_coalescing_base 0.01 8.97 2.45
🥇 block_size_experimentation_base 0.01 8.97 2.45
🥇 stride_loop_boundary_optimization_base 0.01 8.97 2.45
🥇 optimized_thread_block_mapping_base_base 0.01 8.97 2.45
🥇 optimal_blocksize_experiment_base 0.01 8.97 2.45
🥇 modular_crossentropy_base 0.01 8.97 2.45
🥇 warp_aligned_base_base 0.01 8.97 2.45
🥇 warp_divergence_minimization_base_base 0.01 8.97 2.45
🥇 modularized_device_functions_base 0.01 8.97 2.45
🥇 ce_loss_unroll_optimized_base 0.01 8.97 2.45
🥇 ce_loss_ldg_aligned_base 0.01 8.97 2.45
🥇 ce_loss_optimized_blocksize_512_base 0.01 8.97 2.45
🥇 ce_loss_grid_stride_unroll_edit_1 0.01 8.97 2.45
🥇 ce_loss_ldg_aligned_edit_1 0.01 8.97 2.45
🥇 stride_loop_optimization_base_base 0.01 8.97 2.45
🥇 ldg_aligned_access_base 0.01 8.97 2.45
🥇 modular_device_ce_loss_base 0.01 8.97 2.45
🥇 ce_loss_stride_base 0.01 8.97 2.45
🥇 atomic_optimized_crossentropy_edit_1 0.01 8.97 2.45
#include <torch/extension.h>

__global__ void cross_entropy_loss_kernel(
    const float* __restrict__ logits,
    const int64_t* __restrict__ targets,
    float* __restrict__ losses,
    int batch_size,
    int num_classes)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total_threads = blockDim.x * gridDim.x;
    
    for (int i = idx; i < batch_size; i += total_threads) {
        const float* row = logits + i * num_classes;
        const int target = targets[i];

        // Max logit computation with unrolling
        float max_val = row[0];
        #pragma unroll 4
        for (int j = 1; j < num_classes; j++) {
            max_val = fmaxf(max_val, row[j]);
        }

        // Sum exp computation with unrolling
        float sum_exp = 0.0f;
        #pragma unroll 4
        for (int j = 0; j < num_classes; j++) {
            sum_exp += expf(row[j] - max_val);
        }

        float log_sum_exp = logf(sum_exp);
        losses[i] = -(row[target] - max_val - log_sum_exp);
    }
}

torch::Tensor forward(torch::Tensor predictions, torch::Tensor targets) {
    TORCH_CHECK(predictions.is_cuda() && targets.is_cuda(), "Inputs must be CUDA tensors");
    TORCH_CHECK(predictions.dim() == 2, "Predictions must be 2D tensor");
    TORCH_CHECK(targets.dim() == 1, "Targets must be 1D tensor");

    const int batch_size = predictions.size(0);
    const int num_classes = predictions.size(1);
    auto losses = torch::empty({batch_size}, predictions.options());

    const int threads = 256;
    const int blocks = (batch_size + threads - 1) / threads;

    cross_entropy_loss_kernel<<<blocks, threads>>>(
        predictions.data_ptr<float>(),
        targets.data_ptr<int64_t>(),
        losses.data_ptr<float>(),
        batch_size,
        num_classes
    );

    cudaError_t err = cudaGetLastError();
    TORCH_CHECK(err == cudaSuccess, "CUDA Error: ", cudaGetErrorString(err));

    return losses.mean();
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Optimized CrossEntropyLoss with loop unrolling");
}