← Back to Leaderboard

The AI CUDA Engineer 👷

29_Softplussoftplus_coalesced_base

Level 1 • Task 29

Kernel Information

Related Kernels (Level 1, Task 29 • 29_Softplus)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 softplus_modular_base_base 0.01 1.16 4.88
🥇 warp_and_alignment_optimized_softplus_edit_1 0.01 1.16 4.88
🥇 branchless_softplus_edit_1 0.01 1.16 4.88
🥇 warp_optimized_softplus_base 0.01 1.16 4.88
5 softplus_unrolled_base_base 0.01 0.99 4.18
5 softplus_coalesced_base 0.01 0.99 4.18
5 softplus_2d_block_thread_base 0.01 0.99 4.18
5 optimized_softplus_cuda_base 0.01 0.99 4.18
5 softplus_coalesced_memory_access_base 0.01 0.99 4.18
5 softplus_tuned_indexing_base_base 0.01 0.99 4.18
5 softplus_blockstride_base 0.01 0.99 4.18
5 softplus_loop_unroll_base_base 0.01 0.99 4.18
5 softplus_branchless_base 0.01 0.99 4.18
5 softplus_blocksize_experiment_base 0.01 0.99 4.18
5 softplus_unrolled_base 0.01 0.99 4.18
5 softplus_constant_memory_base_base 0.01 0.99 4.18
5 optimized_softplus_cuda_base 0.01 0.99 4.18
5 29_Softplus 0.01 0.99 4.18
5 softplus_constant_memory_base 0.01 0.99 4.18
5 optimized_softplus_base 0.01 0.99 4.18
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

// CUDA kernel with block-stride loop and __ldg for read-only cache to improve memory coalescing

template <typename scalar_t>
__global__ void softplus_kernel_coalesced(
    const scalar_t* __restrict__ input,
    scalar_t* __restrict__ output,
    const int size) {

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    
    for (; idx < size; idx += stride) {
        // Use __ldg to load from global memory via read-only cache
        scalar_t x = __ldg(&input[idx]);

        if (x > static_cast<scalar_t>(20.0)) {
            output[idx] = x;
        } else if (x < static_cast<scalar_t>(-20.0)) {
            output[idx] = exp(x);
        } else {
            output[idx] = log1p(exp(x));
        }
    }
}


// CUDA forward function

torch::Tensor softplus_cuda_forward(torch::Tensor input) {
    auto output = torch::empty_like(input);
    const int size = input.numel();
    const int threads = 256;
    const int blocks = (size + threads - 1) / threads;

    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "softplus_forward_cuda", ([&] {
        softplus_kernel_coalesced<scalar_t><<<blocks, threads>>>(
            input.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>(),
            size);
    }));

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &softplus_cuda_forward, "Softplus forward (CUDA)");
}