
The AI CUDA Engineer 👷

94_MSELoss • optimized_grid_stride_warp_reduce_base

Level 1 • Task 94
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """
    Computes the Mean Squared Error loss for regression tasks.

    Args:
        predictions (torch.Tensor): Predicted values.
        targets (torch.Tensor): Target values.

    Returns:
        torch.Tensor: Mean Squared Error loss.
    """
    return F.mse_loss(predictions, targets, reduction="mean")


class Model(nn.Module):
    """
    A model that computes the Mean Squared Error loss for regression tasks.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets, fn=module_fn):
        return fn(predictions, targets)


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [
        torch.randn(batch_size, *input_shape),
        torch.randn(batch_size, *input_shape),
    ]


def get_init_inputs():
    return []

import torch
import torch.nn as nn

class Model(nn.Module):
    """
    A model that computes the Mean Squared Error loss for regression tasks.

    Parameters:
        None
    """
    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        return torch.mean((predictions - targets) ** 2)

batch_size = 128
input_shape = (4096, )
dim = 1

def get_inputs():
    return [torch.randn(batch_size, *input_shape), torch.randn(batch_size, *input_shape)]

def get_init_inputs():
    return []
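
Both baselines above compute the same scalar loss; a minimal sanity check (a sketch using only the shapes defined in the harness) confirms that the explicit mean-of-squares matches F.mse_loss:

import torch
import torch.nn.functional as F

# Inputs with the shapes defined above (batch_size=128, input_shape=(4096,)).
preds = torch.randn(128, 4096)
tgts = torch.randn(128, 4096)

manual = torch.mean((preds - tgts) ** 2)             # explicit baseline
builtin = F.mse_loss(preds, tgts, reduction="mean")  # functional baseline
assert torch.allclose(manual, builtin)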

Kernel Information

Related Kernels (Level 1, Task 94 • 94_MSELoss)

Rank | Kernel Name | Runtime (ms) | Speedup (Native) | Speedup (Compile)
🥇 | optimized_thread_indexing_base | 0.02 | 1.03 | 2.04
🥇 | coalesced_shfl_mse_base | 0.02 | 1.03 | 2.04
🥇 | efficient_mse_base | 0.02 | 1.03 | 2.04
🥇 | mse_unrolled_optimized_base | 0.02 | 1.03 | 2.04
🥇 | mse_min_sync_edit_1 | 0.02 | 1.03 | 2.04
🥇 | vectorized_ldg_mse_base | 0.02 | 1.03 | 2.04
🥇 | optimized_grid_stride_warp_reduce_base | 0.02 | 1.03 | 2.04
🥇 | mse_1d_optimized_indexing_base | 0.02 | 1.03 | 2.04
🥇 | mse_unrolled_optimized_edit_1 | 0.02 | 1.03 | 2.04
🥇 | mse_warp_reduction_base | 0.02 | 1.03 | 2.04
🥇 | mse_unroll_pragma_base_base | 0.02 | 1.03 | 2.04
🥇 | mse_blocksize_experiment_base | 0.02 | 1.03 | 2.04
🥇 | mse_ldg_vectorized_edit_edit_1 | 0.02 | 1.03 | 2.04
🥇 | mse_ldg_vectorized_edit_base | 0.02 | 1.03 | 2.04
15 | optimized_block_size_mse_base | 0.02 | 0.97 | 1.92
15 | stride_mse_loss_base | 0.02 | 0.97 | 1.92
15 | warp_uniform_mse_base | 0.02 | 0.97 | 1.92
15 | block_size_experimentation_base_base | 0.02 | 0.97 | 1.92
15 | optimized_mse_forward_base | 0.02 | 0.97 | 1.92
15 | warp_aligned_mse_base_base | 0.02 | 0.97 | 1.92
#include <pybind11/pybind11.h>
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <algorithm>

static const int BLOCK_SIZE = 256;

template <typename scalar_t>
__global__ void optimized_mse_kernel(
    const scalar_t* __restrict__ preds,
    const scalar_t* __restrict__ tgts,
    double* __restrict__ sum_out,
    const int64_t num_elements
) {
    __shared__ double smem[BLOCK_SIZE];
    
    int tid = threadIdx.x;
    double thread_sum = 0.0;
    const int grid_stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + tid;

    // Grid-stride loop: each thread accumulates a partial sum of squared
    // differences in double precision.
    while (idx < num_elements) {
        double diff = static_cast<double>(preds[idx]) - static_cast<double>(tgts[idx]);
        thread_sum += diff * diff;
        idx += grid_stride;
    }

    smem[tid] = thread_sum;
    __syncthreads();

    // Tree reduction in shared memory until only the last warp's worth of
    // partial sums remains.
    for (int s = blockDim.x/2; s > 32; s >>= 1) {
        if (tid < s) {
            smem[tid] += smem[tid + s];
        }
        __syncthreads();
    }

    // Final warp-level reduction via warp-synchronous (volatile) shared-memory
    // accesses. Note: on Volta and newer, this classic pattern strictly needs
    // __syncwarp() between steps (or a __shfl_down_sync-based reduction).
    if (tid < 32) {
        volatile double* vsmem = smem;
        vsmem[tid] += vsmem[tid + 32];
        vsmem[tid] += vsmem[tid + 16];
        vsmem[tid] += vsmem[tid + 8];
        vsmem[tid] += vsmem[tid + 4];
        vsmem[tid] += vsmem[tid + 2];
        vsmem[tid] += vsmem[tid + 1];
    }

    // Thread 0 adds this block's partial sum to the global accumulator;
    // atomicAdd on double requires compute capability 6.0 or newer.
    if (tid == 0) {
        atomicAdd(sum_out, smem[0]);
    }
}

torch::Tensor forward(torch::Tensor predictions, torch::Tensor targets) {
    TORCH_CHECK(predictions.is_cuda(), "predictions must be a CUDA tensor");
    TORCH_CHECK(targets.is_cuda(), "targets must be a CUDA tensor");
    TORCH_CHECK(predictions.numel() == targets.numel(),
                "predictions and targets must have the same number of elements");

    const int64_t num_elements = predictions.numel();
    // Accumulate the sum in double precision for numerical stability,
    // regardless of the input dtype.
    auto accumulator = torch::zeros({1}, predictions.options().dtype(at::kDouble));

    // One thread per element, capped at 1024 blocks; the grid-stride loop
    // covers any remaining elements.
    int grid_size = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    grid_size = std::min(grid_size, 1024);

    AT_DISPATCH_FLOATING_TYPES(predictions.scalar_type(), "optimized_mse_cuda", ([&] {
        optimized_mse_kernel<scalar_t><<<grid_size, BLOCK_SIZE>>>(
            predictions.data_ptr<scalar_t>(),
            targets.data_ptr<scalar_t>(),
            accumulator.data_ptr<double>(),
            num_elements
        );
    }));

    auto result = accumulator.div_(static_cast<double>(num_elements));
    return result.to(predictions.dtype());
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Optimized MSE forward (CUDA)");
}
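
A minimal sketch of how this extension could be JIT-compiled and checked against the PyTorch baseline using torch.utils.cpp_extension.load; the source filename "mse_cuda.cu" and the module name are assumptions for illustration, not part of the listing:

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# JIT-compile the C++/CUDA source above; "mse_cuda.cu" is a hypothetical path.
mse = load(name="optimized_mse", sources=["mse_cuda.cu"], verbose=True)

preds = torch.randn(128, 4096, device="cuda")
tgts = torch.randn(128, 4096, device="cuda")

out = mse.forward(preds, tgts)                 # 1-element tensor
ref = F.mse_loss(preds, tgts, reduction="mean")
assert torch.allclose(out, ref, atol=1e-6)     # kernel accumulates in double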
Performance Metrics
Metric | Value | Unit | Variance | Samples
Executed Ipc Active | 1.352 | inst/cycle | 0.000 | 5
Executed Ipc Elapsed | 0.686 | inst/cycle | 0.000 | 5
Issue Slots Busy | 36.046 | % | 0.041 | 5
Issued Ipc Active | 1.442 | inst/cycle | 0.000 | 5
SM Busy | 36.046 | % | 0.041 | 5
Memory Throughput | 724896736366.736 | byte/second | 44080681346011742208.000 | 5
Mem Busy | 14.084 | % | 0.015 | 5
Max Bandwidth | 21.732 | % | 0.048 | 5
L1/TEX Hit Rate | 0.000 | % | 0.000 | 5
L2 Hit Rate | 18.686 | % | 0.008 | 5
Mem Pipes Busy | 9.850 | % | 0.007 | 5
Warp Cycles Per Issued Instruction | 35.452 | cycle | 0.028 | 5
Warp Cycles Per Executed Instruction | 37.836 | cycle | 0.031 | 5
Avg. Active Threads Per Warp | 31.730 | | 0.000 | 5
Avg. Not Predicated Off Threads Per Warp | 28.440 | | 0.000 | 5
Max Active Clusters | 0.000 | cluster | 0.000 | 5
Max Cluster Size | 8.000 | block | 0.000 | 5
Overall GPU Occupancy | 0.000 | % | 0.000 | 5
Cluster Occupancy | 0.000 | % | 0.000 | 5
Block Limit SM | 32.000 | block | 0.000 | 5
Block Limit Registers | 10.000 | block | 0.000 | 5
Block Limit Shared Mem | 21.000 | block | 0.000 | 5
Block Limit Warps | 8.000 | block | 0.000 | 5
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5
Theoretical Occupancy | 100.000 | % | 0.000 | 5
Achieved Occupancy | 79.834 | % | 0.009 | 5
Achieved Active Warps Per SM | 51.094 | warp | 0.004 | 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (20.2%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (79.7%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
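
As a cross-check, the achieved-occupancy figure in the metrics table follows directly from the warp counts reported there:

\[
\text{Achieved Occupancy} = \frac{\text{Achieved Active Warps Per SM}}{\text{Theoretical Active Warps per SM}} = \frac{51.094}{64} \approx 79.8\%.
\]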
Operation / Metric Value Unit
aten::to
CPU Time 1274560.64 μs
Device Time 225454.44 μs
Self CPU Time 76165.23 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 1198395.41 μs
Device Time 225454.44 μs
Self CPU Time 175706.42 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 4326439.45 μs
Device Time 6695453.63 μs
Self CPU Time 301330.07 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 4025111.13 μs
Device Time 6695453.63 μs
Self CPU Time 351206.85 μs
Self Device Time 6695453.63 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 4651041.03 μs
Device Time 1319.86 μs
Self CPU Time 4651041.03 μs
Self Device Time 1319.86 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void optimized_mse_kernel<float>(float const*, float const*, double*, long)
CPU Time 0.00 μs
Device Time 440105.23 μs
Self CPU Time 0.00 μs
Self Device Time 440105.23 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor<double, double, double, at::native::binary_internal::MulFunctor<double> >, at::detail::Array<char*, 2> >(int, at::native::BUnaryFunctor<double, double, double, at::native::binary_internal::MulFunctor<double> >, at::detail::Array<char*, 2>)
CPU Time 0.00 μs
Device Time 228436.47 μs
Self CPU Time 0.00 μs
Self Device Time 228436.47 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 6495396.64 μs
Self CPU Time 0.00 μs
Self Device Time 6495396.64 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45282 warnings generated when compiling for host.
Suppressed 45321 warnings (45274 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:18:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
18 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:20:29: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
20 | const int grid_stride = blockDim.x * gridDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:21:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | int idx = blockIdx.x * blockDim.x + tid;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:34:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
34 | for (int s = blockDim.x/2; s > 32; s >>= 1) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:66:21: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
66 | int grid_size = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:69:5: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name]
69 | AT_DISPATCH_FLOATING_TYPES(predictions.scalar_type(), "optimized_mse_cuda", ([&] {
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:237:34: note: expanded from macro 'AT_DISPATCH_FLOATING_TYPES'
237 | AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:233:3: note: expanded from macro 'AT_DISPATCH_CASE_FLOATING_TYPES'
233 | AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:74:3: note: expanded from macro 'AT_DISPATCH_CASE'
74 | AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
| ^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:58:7: note: expanded from macro 'AT_PRIVATE_CHECK_SELECTIVE_BUILD'
58 | AT_ERROR( \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:711:32: note: expanded from macro 'AT_ERROR'
711 | C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:536:9: note: expanded from macro 'TORCH_CHECK'
536 | __func__, \
| ^