
The AI CUDA Engineer 👷

94_MSELoss • optimized_grid_stride_warp_reduce_base

Level 1 • Task 94
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """
    Computes the Mean Squared Error loss for regression tasks.

    Args:
        predictions (torch.Tensor): Predicted values.
        targets (torch.Tensor): Target values.

    Returns:
        torch.Tensor: Mean Squared Error loss.
    """
    return F.mse_loss(predictions, targets, reduction="mean")


class Model(nn.Module):
    """
    A model that computes the Mean Squared Error loss for regression tasks.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets, fn=module_fn):
        return fn(predictions, targets)


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [
        torch.randn(batch_size, *input_shape),
        torch.randn(batch_size, *input_shape),
    ]


def get_init_inputs():
    return []

import torch
import torch.nn as nn

class Model(nn.Module):
    """
    A model that computes the Mean Squared Error loss for regression tasks.

    Parameters:
        None
    """
    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        return torch.mean((predictions - targets) ** 2)

batch_size = 128
input_shape = (4096, )
dim = 1

def get_inputs():
    return [torch.randn(batch_size, *input_shape), torch.randn(batch_size, *input_shape)]

def get_init_inputs():
    return []
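
Both baselines above compute the same scalar loss; a minimal sanity check (a sketch using only the shapes defined in the harness) confirms that the explicit mean-of-squares matches F.mse_loss:

import torch
import torch.nn.functional as F

# Inputs with the shapes defined above (batch_size=128, input_shape=(4096,)).
preds = torch.randn(128, 4096)
tgts = torch.randn(128, 4096)

manual = torch.mean((preds - tgts) ** 2)             # explicit baseline
builtin = F.mse_loss(preds, tgts, reduction="mean")  # functional baseline
assert torch.allclose(manual, builtin)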

Kernel Information

Related Kernels (Level 1, Task 94 • 94_MSELoss)

Rank | Kernel Name | Runtime (ms) | Speedup (Native) | Speedup (Compile)
🥇 | optimized_thread_indexing_base | 0.02 | 1.03 | 2.04
🥇 | coalesced_shfl_mse_base | 0.02 | 1.03 | 2.04
🥇 | efficient_mse_base | 0.02 | 1.03 | 2.04
🥇 | mse_unrolled_optimized_base | 0.02 | 1.03 | 2.04
🥇 | mse_min_sync_edit_1 | 0.02 | 1.03 | 2.04
🥇 | vectorized_ldg_mse_base | 0.02 | 1.03 | 2.04
🥇 | optimized_grid_stride_warp_reduce_base | 0.02 | 1.03 | 2.04
🥇 | mse_1d_optimized_indexing_base | 0.02 | 1.03 | 2.04
🥇 | mse_unrolled_optimized_edit_1 | 0.02 | 1.03 | 2.04
🥇 | mse_warp_reduction_base | 0.02 | 1.03 | 2.04
🥇 | mse_unroll_pragma_base_base | 0.02 | 1.03 | 2.04
🥇 | mse_blocksize_experiment_base | 0.02 | 1.03 | 2.04
🥇 | mse_ldg_vectorized_edit_edit_1 | 0.02 | 1.03 | 2.04
🥇 | mse_ldg_vectorized_edit_base | 0.02 | 1.03 | 2.04
15 | optimized_block_size_mse_base | 0.02 | 0.97 | 1.92
15 | stride_mse_loss_base | 0.02 | 0.97 | 1.92
15 | warp_uniform_mse_base | 0.02 | 0.97 | 1.92
15 | block_size_experimentation_base_base | 0.02 | 0.97 | 1.92
15 | optimized_mse_forward_base | 0.02 | 0.97 | 1.92
15 | warp_aligned_mse_base_base | 0.02 | 0.97 | 1.92
#include <pybind11/pybind11.h>
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <algorithm>

static const int BLOCK_SIZE = 256;

template <typename scalar_t>
__global__ void optimized_mse_kernel(
    const scalar_t* __restrict__ preds,
    const scalar_t* __restrict__ tgts,
    double* __restrict__ sum_out,
    const int64_t num_elements
) {
    __shared__ double smem[BLOCK_SIZE];
    
    int tid = threadIdx.x;
    double thread_sum = 0.0;
    const int grid_stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + tid;

    // Grid-stride loop: each thread accumulates a partial sum of squared
    // differences in double precision.
    while (idx < num_elements) {
        double diff = static_cast<double>(preds[idx]) - static_cast<double>(tgts[idx]);
        thread_sum += diff * diff;
        idx += grid_stride;
    }

    smem[tid] = thread_sum;
    __syncthreads();

    // Tree reduction in shared memory until only the last warp's worth of
    // partial sums remains.
    for (int s = blockDim.x/2; s > 32; s >>= 1) {
        if (tid < s) {
            smem[tid] += smem[tid + s];
        }
        __syncthreads();
    }

    // Final warp-level reduction via warp-synchronous (volatile) shared-memory
    // accesses. Note: on Volta and newer, this classic pattern strictly needs
    // __syncwarp() between steps (or a __shfl_down_sync-based reduction).
    if (tid < 32) {
        volatile double* vsmem = smem;
        vsmem[tid] += vsmem[tid + 32];
        vsmem[tid] += vsmem[tid + 16];
        vsmem[tid] += vsmem[tid + 8];
        vsmem[tid] += vsmem[tid + 4];
        vsmem[tid] += vsmem[tid + 2];
        vsmem[tid] += vsmem[tid + 1];
    }

    // Thread 0 adds this block's partial sum to the global accumulator;
    // atomicAdd on double requires compute capability 6.0 or newer.
    if (tid == 0) {
        atomicAdd(sum_out, smem[0]);
    }
}

torch::Tensor forward(torch::Tensor predictions, torch::Tensor targets) {
    TORCH_CHECK(predictions.is_cuda(), "predictions must be a CUDA tensor");
    TORCH_CHECK(targets.is_cuda(), "targets must be a CUDA tensor");
    TORCH_CHECK(predictions.numel() == targets.numel(),
                "predictions and targets must have the same number of elements");

    const int64_t num_elements = predictions.numel();
    // Accumulate the sum in double precision for numerical stability,
    // regardless of the input dtype.
    auto accumulator = torch::zeros({1}, predictions.options().dtype(at::kDouble));

    // One thread per element, capped at 1024 blocks; the grid-stride loop
    // covers any remaining elements.
    int grid_size = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
    grid_size = std::min(grid_size, 1024);

    AT_DISPATCH_FLOATING_TYPES(predictions.scalar_type(), "optimized_mse_cuda", ([&] {
        optimized_mse_kernel<scalar_t><<<grid_size, BLOCK_SIZE>>>(
            predictions.data_ptr<scalar_t>(),
            targets.data_ptr<scalar_t>(),
            accumulator.data_ptr<double>(),
            num_elements
        );
    }));

    auto result = accumulator.div_(static_cast<double>(num_elements));
    return result.to(predictions.dtype());
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Optimized MSE forward (CUDA)");
}
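
A minimal sketch of how this extension could be JIT-compiled and checked against the PyTorch baseline using torch.utils.cpp_extension.load; the source filename "mse_cuda.cu" and the module name are assumptions for illustration, not part of the listing:

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# JIT-compile the C++/CUDA source above; "mse_cuda.cu" is a hypothetical path.
mse = load(name="optimized_mse", sources=["mse_cuda.cu"], verbose=True)

preds = torch.randn(128, 4096, device="cuda")
tgts = torch.randn(128, 4096, device="cuda")

out = mse.forward(preds, tgts)                 # 1-element tensor
ref = F.mse_loss(preds, tgts, reduction="mean")
assert torch.allclose(out, ref, atol=1e-6)     # kernel accumulates in double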
Performance Metrics
Metric | Value | Unit | Variance | Samples
Executed Ipc Active | 1.352 | inst/cycle | 0.000 | 5
Executed Ipc Elapsed | 0.686 | inst/cycle | 0.000 | 5
Issue Slots Busy | 36.046 | % | 0.041 | 5
Issued Ipc Active | 1.442 | inst/cycle | 0.000 | 5
SM Busy | 36.046 | % | 0.041 | 5
Memory Throughput | 724896736366.736 | byte/second | 44080681346011742208.000 | 5
Mem Busy | 14.084 | % | 0.015 | 5
Max Bandwidth | 21.732 | % | 0.048 | 5
L1/TEX Hit Rate | 0.000 | % | 0.000 | 5
L2 Hit Rate | 18.686 | % | 0.008 | 5
Mem Pipes Busy | 9.850 | % | 0.007 | 5
Warp Cycles Per Issued Instruction | 35.452 | cycle | 0.028 | 5
Warp Cycles Per Executed Instruction | 37.836 | cycle | 0.031 | 5
Avg. Active Threads Per Warp | 31.730 | | 0.000 | 5
Avg. Not Predicated Off Threads Per Warp | 28.440 | | 0.000 | 5
Max Active Clusters | 0.000 | cluster | 0.000 | 5
Max Cluster Size | 8.000 | block | 0.000 | 5
Overall GPU Occupancy | 0.000 | % | 0.000 | 5
Cluster Occupancy | 0.000 | % | 0.000 | 5
Block Limit SM | 32.000 | block | 0.000 | 5
Block Limit Registers | 10.000 | block | 0.000 | 5
Block Limit Shared Mem | 21.000 | block | 0.000 | 5
Block Limit Warps | 8.000 | block | 0.000 | 5
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5
Theoretical Occupancy | 100.000 | % | 0.000 | 5
Achieved Occupancy | 79.834 | % | 0.009 | 5
Achieved Active Warps Per SM | 51.094 | warp | 0.004 | 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (20.2%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (79.7%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
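
As a cross-check, the achieved-occupancy figure in the metrics table follows directly from the warp counts reported there:

\[
\text{Achieved Occupancy} = \frac{\text{Achieved Active Warps Per SM}}{\text{Theoretical Active Warps per SM}} = \frac{51.094}{64} \approx 79.8\%.
\]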
Operation / Metric Value Unit
aten::to
CPU Time 1274560.64 μs
Device Time 225454.44 μs
Self CPU Time 76165.23 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 1198395.41 μs
Device Time 225454.44 μs
Self CPU Time 175706.42 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 4326439.45 μs
Device Time 6695453.63 μs
Self CPU Time 301330.07 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 4025111.13 μs
Device Time 6695453.63 μs
Self CPU Time 351206.85 μs
Self Device Time 6695453.63 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 4651041.03 μs
Device Time 1319.86 μs
Self CPU Time 4651041.03 μs
Self Device Time 1319.86 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void optimized_mse_kernel<float>(float const*, float const*, double*, long)
CPU Time 0.00 μs
Device Time 440105.23 μs
Self CPU Time 0.00 μs
Self Device Time 440105.23 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor<double, double, double, at::native::binary_internal::MulFunctor<double> >, at::detail::Array<char*, 2> >(int, at::native::BUnaryFunctor<double, double, double, at::native::binary_internal::MulFunctor<double> >, at::detail::Array<char*, 2>)
CPU Time 0.00 μs
Device Time 228436.47 μs
Self CPU Time 0.00 μs
Self Device Time 228436.47 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 6495396.64 μs
Self CPU Time 0.00 μs
Self Device Time 6495396.64 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45282 warnings generated when compiling for host.
Suppressed 45321 warnings (45274 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:18:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
18 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:20:29: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
20 | const int grid_stride = blockDim.x * gridDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:21:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | int idx = blockIdx.x * blockDim.x + tid;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:34:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
34 | for (int s = blockDim.x/2; s > 32; s >>= 1) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:66:21: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
66 | int grid_size = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_94/b8_s2_optimized_grid_stride_warp_reduce/base/base.cu:69:5: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name]
69 | AT_DISPATCH_FLOATING_TYPES(predictions.scalar_type(), "optimized_mse_cuda", ([&] {
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:237:34: note: expanded from macro 'AT_DISPATCH_FLOATING_TYPES'
237 | AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:233:3: note: expanded from macro 'AT_DISPATCH_CASE_FLOATING_TYPES'
233 | AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:74:3: note: expanded from macro 'AT_DISPATCH_CASE'
74 | AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
| ^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:58:7: note: expanded from macro 'AT_PRIVATE_CHECK_SELECTIVE_BUILD'
58 | AT_ERROR( \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:711:32: note: expanded from macro 'AT_ERROR'
711 | C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:536:9: note: expanded from macro 'TORCH_CHECK'
536 | __func__, \
| ^