import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """
    Computes the Mean Squared Error loss for regression tasks.

    Args:
        predictions (torch.Tensor): Predicted values.
        targets (torch.Tensor): Target values.

    Returns:
        torch.Tensor: Mean Squared Error loss.
    """
    return F.mse_loss(predictions, targets, reduction="mean")


class Model(nn.Module):
    """
    A model that computes the Mean Squared Error loss for regression tasks.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets, fn=module_fn):
        return fn(predictions, targets)


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [
        torch.randn(batch_size, *input_shape),
        torch.randn(batch_size, *input_shape),
    ]


def get_init_inputs():
    return []
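A quick sanity check (a sketch, not part of the benchmark harness) that runs the reference Model on the generated inputs and confirms that F.mse_loss matches the explicit mean of squared differences:

# Hypothetical self-test for the reference implementation above.
if __name__ == "__main__":
    model = Model()
    predictions, targets = get_inputs()
    loss = model(predictions, targets)
    manual = torch.mean((predictions - targets) ** 2)
    assert torch.allclose(loss, manual), "F.mse_loss should equal the explicit MSE"
    print(f"MSE loss: {loss.item():.6f}")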
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    A model that computes the Mean Squared Error loss for regression tasks.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        return torch.mean((predictions - targets) ** 2)


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [
        torch.randn(batch_size, *input_shape),
        torch.randn(batch_size, *input_shape),
    ]


def get_init_inputs():
    return []
#include <pybind11/pybind11.h>
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Define block size and warp size
static const int BLOCK_SIZE = 256;
#define WARP_SIZE 32

// CUDA kernel using warp-level reduction to minimize __syncthreads() usage
template <typename scalar_t>
__global__ void mse_forward_kernel_warp_reduction(
    const scalar_t* __restrict__ preds,
    const scalar_t* __restrict__ tgts,
    double* __restrict__ sum_out,
    const int64_t num_elements
) {
    double local_sum = 0.0;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;

    // Grid-stride loop to accumulate squared differences
    for (; idx < num_elements; idx += stride) {
        double diff = static_cast<double>(preds[idx]) - static_cast<double>(tgts[idx]);
        local_sum += diff * diff;
    }

    // Warp-level reduction using shuffle intrinsics
    unsigned mask = 0xffffffff;
    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
        local_sum += __shfl_down_sync(mask, local_sum, offset);
    }

    // Each warp writes its reduced result to shared memory
    __shared__ double warp_sums[BLOCK_SIZE / WARP_SIZE];
    int lane = threadIdx.x % WARP_SIZE;
    int warpId = threadIdx.x / WARP_SIZE;
    if (lane == 0) {
        warp_sums[warpId] = local_sum;
    }

    // Synchronize once to ensure all warp results are written
    __syncthreads();

    // First thread accumulates results from all warps in the block
    if (threadIdx.x == 0) {
        double block_sum = 0.0;
        int numWarps = blockDim.x / WARP_SIZE;
        for (int i = 0; i < numWarps; i++) {
            block_sum += warp_sums[i];
        }
        atomicAdd(sum_out, block_sum);
    }
}
// Host function that sets up the kernel launch
torch::Tensor forward(torch::Tensor predictions, torch::Tensor targets) {
    TORCH_CHECK(predictions.is_cuda(), "predictions must be a CUDA tensor");
    TORCH_CHECK(targets.is_cuda(), "targets must be a CUDA tensor");
    TORCH_CHECK(predictions.is_contiguous(), "predictions must be contiguous");
    TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous");
    TORCH_CHECK(predictions.numel() == targets.numel(),
                "predictions and targets must have the same number of elements");

    const int64_t num_elements = predictions.numel();

    // Get SM count and set grid size (using 4 blocks per SM)
    int device_id;
    cudaGetDevice(&device_id);
    int sm_count;
    cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_id);
    int grid_size = sm_count * 4;

    auto accumulator = torch::zeros({1}, predictions.options().dtype(at::kDouble));

    AT_DISPATCH_FLOATING_TYPES(predictions.scalar_type(), "mse_forward_cuda_warp", ([&]() {
        mse_forward_kernel_warp_reduction<scalar_t><<<grid_size, BLOCK_SIZE>>>(
            predictions.data_ptr<scalar_t>(),
            targets.data_ptr<scalar_t>(),
            accumulator.data_ptr<double>(),
            num_elements
        );
    }));

    // Final result: mean squared error
    auto result = accumulator.div(static_cast<double>(num_elements));
    return result.to(predictions.dtype());
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Mean Squared Error (MSE) forward using warp-level reduction (CUDA)");
}
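A minimal sketch of how this extension could be compiled and checked against the PyTorch reference; the source filename mse_warp_reduction.cu and the module name are assumptions for illustration:

# Hypothetical build-and-verify script for the CUDA extension above.
import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# Assumes the CUDA source above has been saved as mse_warp_reduction.cu.
mse_ext = load(name="mse_warp_reduction", sources=["mse_warp_reduction.cu"], verbose=True)

predictions = torch.randn(128, 4096, device="cuda")
targets = torch.randn(128, 4096, device="cuda")

reference = F.mse_loss(predictions, targets, reduction="mean")
custom = mse_ext.forward(predictions, targets)

# The kernel accumulates in double precision, so the results should agree to float tolerance.
assert torch.allclose(custom, reference, atol=1e-6)
print("custom:", custom.item(), "reference:", reference.item())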
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 0.836 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.474 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 22.114 | % | 0.010 | 5 |
Issued Ipc Active | 0.884 | inst/cycle | 0.000 | 5 |
SM Busy | 22.114 | % | 0.010 | 5 |
Memory Throughput | 764068404880.190 | byte/second | 182694527171556081664.000 | 5 |
Mem Busy | 13.170 | % | 0.041 | 5 |
Max Bandwidth | 22.926 | % | 0.126 | 5 |
L1/TEX Hit Rate | 0.000 | % | 0.000 | 5 |
L2 Hit Rate | 18.756 | % | 0.002 | 5 |
Mem Pipes Busy | 7.538 | % | 0.014 | 5 |
Warp Cycles Per Issued Instruction | 31.422 | cycle | 0.039 | 5 |
Warp Cycles Per Executed Instruction | 33.220 | cycle | 0.031 | 5 |
Avg. Active Threads Per Warp | 30.670 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 28.470 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 10.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 28.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 43.508 | % | 0.016 | 5 |
Achieved Active Warps Per SM | 27.846 | warp | 0.007 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (43.6%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
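As a cross-check of the occupancy figures reported above (arithmetic on the table values only, no additional profiling):

# Achieved occupancy = achieved active warps per SM / theoretical active warps per SM.
theoretical_warps_per_sm = 64.0   # "Theoretical Active Warps per SM" row
achieved_warps_per_sm = 27.846    # "Achieved Active Warps Per SM" row
print(f"{achieved_warps_per_sm / theoretical_warps_per_sm * 100:.1f}%")  # ~43.5%, matching the "Achieved Occupancy" row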
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 1402214.79 | μs |
Device Time | 243791.90 | μs |
Self CPU Time | 55090.79 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 1347124.00 | μs |
Device Time | 243791.90 | μs |
Self CPU Time | 193506.77 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 4033999.47 | μs |
Device Time | 6719481.68 | μs |
Self CPU Time | 258180.45 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 3775821.14 | μs |
Device Time | 6719481.68 | μs |
Self CPU Time | 346240.50 | μs |
Self Device Time | 6719481.68 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 4434872.85 | μs |
Device Time | 23014.12 | μs |
Self CPU Time | 4434872.85 | μs |
Self Device Time | 23014.12 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void mse_forward_kernel_warp_reduction<float>(float const*, float const*, double*, long) | ||
CPU Time | 0.00 | μs |
Device Time | 444021.36 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 444021.36 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaEventRecord | ||
CPU Time | 248516.71 | μs |
Device Time | 706170.05 | μs |
Self CPU Time | 248516.71 | μs |
Self Device Time | 706170.05 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 6522253.04 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 6522253.04 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |