← Back to Leaderboard

The AI CUDA Engineer 👷

59_Matmul_Swish_Scalingoptimized_thread_block_indexing_base

Level 2 • Task 59

Kernel Information

Related Kernels (Level 2, Task 59 • 59_Matmul_Swish_Scaling)

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

__global__ void optimized_swish_scaling_kernel(const float* __restrict__ input, float* output, float scaling_factor, int rows, int cols) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < rows && col < cols) {
        int idx = row * cols + col;
        float x = input[idx];
        float sigmoid = 1.0f / (1.0f + expf(-x));
        output[idx] = x * sigmoid * scaling_factor;
    }
}

torch::Tensor forward(
    torch::Tensor x,
    torch::Tensor weight,
    torch::Tensor bias,
    double scaling_factor) {

    x = x.contiguous();
    weight = weight.contiguous();
    bias = bias.contiguous();

    TORCH_CHECK(x.is_cuda(), "Input tensor 'x' must be a CUDA tensor.");
    TORCH_CHECK(weight.is_cuda(), "Weight tensor must be a CUDA tensor.");
    TORCH_CHECK(bias.is_cuda(), "Bias tensor must be a CUDA tensor.");
    TORCH_CHECK(x.scalar_type() == at::kFloat, "Input tensor 'x' must be of type torch.float32.");

    auto y = at::addmm(bias, x, weight.t());
    auto output = at::empty_like(y);

    const int rows = y.size(0);
    const int cols = y.size(1);

    dim3 threads(32, 32);
    dim3 blocks((cols + threads.x - 1) / threads.x, (rows + threads.y - 1) / threads.y);

    optimized_swish_scaling_kernel<<<blocks, threads>>>(
        y.data_ptr<float>(),
        output.data_ptr<float>(),
        static_cast<float>(scaling_factor),
        rows,
        cols);

    cudaError_t err = cudaGetLastError();
    TORCH_CHECK(err == cudaSuccess, "CUDA kernel failed : ", cudaGetErrorString(err));

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Optimized CUDA forward function");
}