import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """
    Computes the Smooth L1 (Huber) Loss for regression tasks.

    Args:
        predictions (torch.Tensor): Predicted values.
        targets (torch.Tensor): Target values.

    Returns:
        torch.Tensor: Smooth L1 (Huber) Loss.
    """
    return F.smooth_l1_loss(predictions, targets)


class Model(nn.Module):
    """
    A model that computes Smooth L1 (Huber) Loss for regression tasks.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets, fn=module_fn):
        return fn(predictions, targets)


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [
        torch.randn(batch_size, *input_shape),
        torch.randn(batch_size, *input_shape),
    ]


def get_init_inputs():
    return []
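For reference, with its defaults (beta = 1.0, mean reduction) F.smooth_l1_loss is equivalent to the elementwise formulation sketched below. This is a minimal sanity check, not part of the benchmark harness; the helper name reference_smooth_l1 is introduced here for illustration only.

import torch
import torch.nn.functional as F

def reference_smooth_l1(pred, tgt, beta=1.0):
    # 0.5 * d^2 / beta where |d| < beta, otherwise |d| - 0.5 * beta
    diff = pred - tgt
    abs_diff = diff.abs()
    loss = torch.where(abs_diff < beta, 0.5 * diff * diff / beta, abs_diff - 0.5 * beta)
    return loss.mean()

p, t = torch.randn(128, 4096), torch.randn(128, 4096)
assert torch.allclose(reference_smooth_l1(p, t), F.smooth_l1_loss(p, t), atol=1e-5)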
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    A model that computes Smooth L1 (Huber) Loss for regression tasks.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        return torch.nn.functional.smooth_l1_loss(predictions, targets)


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [torch.randn(batch_size, *input_shape), torch.randn(batch_size, *input_shape)]


def get_init_inputs():
    return []
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

#define UNROLL_FACTOR 4

// Device function for the elementwise Smooth L1 (Huber) loss.
// Branchless selection: each comparison evaluates to 0.0f or 1.0f,
// so exactly one of the two terms survives.
__device__ inline float huber_loss(float diff) {
    float abs_diff = fabsf(diff);
    float quad = 0.5f * diff * diff;
    float linear = abs_diff - 0.5f;
    return quad * (abs_diff <= 1.0f) + linear * (abs_diff > 1.0f);
}
// Device function to perform warp-level reduction using shuffle-down operations
__device__ inline float warpReduceSum(float val) {
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}
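// Worked example of the reduction above (added note): if lane i of a warp holds
// v[i], then after the offset-16 step lane 0 holds v[0] + v[16]; after the
// offset-8 step it holds v[0] + v[16] + v[8] + v[24]; and after the final
// offset-1 step lane 0 holds the sum of all 32 lane values. The remaining lanes
// end up with partial sums, which the callers below ignore (only lane 0's
// result is ever written out).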
// Combined and optimized CUDA kernel leveraging loop unrolling and warp-level reduction
__global__ void smooth_l1_loss_combined_kernel(
    const float* __restrict__ predictions,
    const float* __restrict__ targets,
    float* output,
    int n_elements
) {
    float thread_sum = 0.0f;
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int total_threads = gridDim.x * blockDim.x;

    // Each thread processes UNROLL_FACTOR elements at a time
    int stride = total_threads * UNROLL_FACTOR;

    // Unrolled loop processing multiple elements per iteration with inlined Huber loss computation
    int i = tid * UNROLL_FACTOR;
    for (; i <= n_elements - UNROLL_FACTOR; i += stride) {
        float diff0 = __ldg(&predictions[i]) - __ldg(&targets[i]);
        float abs0 = fabsf(diff0);
        float loss0 = (abs0 <= 1.0f) ? 0.5f * diff0 * diff0 : abs0 - 0.5f;

        float diff1 = __ldg(&predictions[i + 1]) - __ldg(&targets[i + 1]);
        float abs1 = fabsf(diff1);
        float loss1 = (abs1 <= 1.0f) ? 0.5f * diff1 * diff1 : abs1 - 0.5f;

        float diff2 = __ldg(&predictions[i + 2]) - __ldg(&targets[i + 2]);
        float abs2 = fabsf(diff2);
        float loss2 = (abs2 <= 1.0f) ? 0.5f * diff2 * diff2 : abs2 - 0.5f;

        float diff3 = __ldg(&predictions[i + 3]) - __ldg(&targets[i + 3]);
        float abs3 = fabsf(diff3);
        float loss3 = (abs3 <= 1.0f) ? 0.5f * diff3 * diff3 : abs3 - 0.5f;

        thread_sum += loss0 + loss1 + loss2 + loss3;
    }

    // Process any remaining elements individually with inlined Huber loss computation
    for (; i < n_elements; i++) {
        float diff = __ldg(&predictions[i]) - __ldg(&targets[i]);
        float abs_val = fabsf(diff);
        thread_sum += (abs_val <= 1.0f) ? 0.5f * diff * diff : abs_val - 0.5f;
    }

    // Perform warp-level reduction
    thread_sum = warpReduceSum(thread_sum);

    // Shared memory for block-level reduction (one value per warp)
    __shared__ float shared_data[32];  // assuming a maximum of 32 warps per block
    int lane = threadIdx.x % warpSize;
    int warpId = threadIdx.x / warpSize;
    if (lane == 0) {
        shared_data[warpId] = thread_sum;
    }

    // Synchronize only once after writing to shared memory
    __syncthreads();

    // First warp reduces the per-warp sums
    if (warpId == 0) {
        thread_sum = (lane < (blockDim.x + warpSize - 1) / warpSize) ? shared_data[lane] : 0.0f;
        thread_sum = warpReduceSum(thread_sum);
    }

    // Thread 0 atomically adds the block's contribution to the output (averaged over n_elements)
    if (threadIdx.x == 0) {
        atomicAdd(output, thread_sum / n_elements);
    }
}
// Host function to set up and launch the kernel
torch::Tensor smooth_l1_loss_cuda(
    torch::Tensor predictions,
    torch::Tensor targets
) {
    TORCH_CHECK(
        predictions.sizes() == targets.sizes(),
        "Input tensors must have the same shape"
    );
    TORCH_CHECK(
        predictions.is_contiguous() && targets.is_contiguous(),
        "Input tensors must be contiguous"
    );
    TORCH_CHECK(
        predictions.device().is_cuda() && targets.device().is_cuda(),
        "Inputs must be CUDA tensors"
    );

    int n = predictions.numel();
    auto output = torch::zeros({1}, predictions.options());

    const int block_size = 256;
    const int grid_size = (n + block_size * UNROLL_FACTOR - 1) / (block_size * UNROLL_FACTOR);

    smooth_l1_loss_combined_kernel<<<grid_size, block_size>>>(
        predictions.data_ptr<float>(),
        targets.data_ptr<float>(),
        output.data_ptr<float>(),
        n
    );

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &smooth_l1_loss_cuda, "Smooth L1 Loss Combined Kernel (CUDA)");
}
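A hypothetical way to build and exercise this extension from Python is sketched below; the source file name "smooth_l1_loss.cu" and the extension name are assumptions, so adjust them to wherever the kernel above is saved. The tolerance allows for float32 accumulation-order differences between the kernel and the PyTorch reference.

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# JIT-compile the CUDA source above; name and path are placeholders.
ext = load(name="smooth_l1_loss_ext", sources=["smooth_l1_loss.cu"], verbose=True)

preds = torch.randn(128, 4096, device="cuda")
tgts = torch.randn(128, 4096, device="cuda")

out = ext.forward(preds, tgts)        # shape (1,) tensor holding the mean loss
ref = F.smooth_l1_loss(preds, tgts)   # PyTorch reference (scalar)
print(torch.allclose(out.squeeze(), ref, atol=1e-4))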
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 0.924 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.500 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 24.178 | % | 0.003 | 5 |
Issued Ipc Active | 0.966 | inst/cycle | 0.000 | 5 |
SM Busy | 24.178 | % | 0.003 | 5 |
Memory Throughput | 772216850088.956 | byte/second | 117640444873306390528.000 | 5 |
Mem Busy | 14.464 | % | 0.033 | 5 |
Max Bandwidth | 23.150 | % | 0.061 | 5 |
L1/TEX Hit Rate | 74.930 | % | 0.000 | 5 |
L2 Hit Rate | 18.666 | % | 0.000 | 5 |
Mem Pipes Busy | 6.588 | % | 0.006 | 5 |
Warp Cycles Per Issued Instruction | 27.316 | cycle | 0.003 | 5 |
Warp Cycles Per Executed Instruction | 28.602 | cycle | 0.003 | 5 |
Avg. Active Threads Per Warp | 31.830 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 27.930 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 28.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 41.296 | % | 0.022 | 5 |
Achieved Active Warps Per SM | 26.430 | warp | 0.010 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (41.2%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
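For context on the launch statistics behind the occupancy note, the grid size implied by the host code works out as follows. This is a rough sketch that assumes the profiled run used the (128, 4096) inputs defined above.

# Launch-configuration arithmetic implied by the host code for the benchmark shape.
n = 128 * 4096                # 524288 elements
block_size = 256
unroll_factor = 4
grid_size = (n + block_size * unroll_factor - 1) // (block_size * unroll_factor)
print(grid_size)              # 512 blocks of 256 threads, i.e. 8 warps per block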
Operation | CPU Time (μs) | Device Time (μs) | Self CPU Time (μs) | Self Device Time (μs) |
---|---|---|---|---|
aten::to | 363442.74 | 350.66 | 43.70 | 0.00 |
aten::zeros | 5858654.25 | 224591.63 | 145929.03 | 0.00 |
aten::zero_ | 6196262.07 | 7712708.19 | 311488.22 | 0.00 |
aten::fill_ | 5884775.36 | 7712708.19 | 409346.56 | 7712705.73 |
cudaLaunchKernel | 5856819.21 | 2919.52 | 5856819.21 | 2919.52 |
smooth_l1_loss_combined_kernel(float const*, float const*, float*, int) | 0.00 | 548079.50 | 0.00 | 548079.50 |
cudaEventRecord | 268468.06 | 1241940.40 | 268468.06 | 1241940.40 |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | 0.00 | 7488116.56 | 0.00 | 7488116.56 |
CPU and device memory usage (total and self) were reported as 0 B for every operation above.