import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """
    Computes the Smooth L1 (Huber) Loss for regression tasks.

    Args:
        predictions (torch.Tensor): Predicted values.
        targets (torch.Tensor): Target values.

    Returns:
        torch.Tensor: Smooth L1 (Huber) Loss.
    """
    return F.smooth_l1_loss(predictions, targets)


class Model(nn.Module):
    """
    A model that computes Smooth L1 (Huber) Loss for regression tasks.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets, fn=module_fn):
        return fn(predictions, targets)


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [
        torch.randn(batch_size, *input_shape),
        torch.randn(batch_size, *input_shape),
    ]


def get_init_inputs():
    return []
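A minimal usage sketch of the harness above, assuming the usual pattern of instantiating Model with get_init_inputs() and feeding it the tensors from get_inputs(); the driver script itself is illustrative and not part of the benchmark listing:

# Hypothetical driver: run Model.forward on the generated inputs and compare
# against calling F.smooth_l1_loss directly.
model = Model(*get_init_inputs())
predictions, targets = get_inputs()
loss = model(predictions, targets)
assert torch.allclose(loss, F.smooth_l1_loss(predictions, targets))
print(loss.item())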
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    A model that computes Smooth L1 (Huber) Loss for regression tasks.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        return torch.nn.functional.smooth_l1_loss(predictions, targets)


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [torch.randn(batch_size, *input_shape), torch.randn(batch_size, *input_shape)]


def get_init_inputs():
    return []
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

// Constant memory for frequently accessed values
__constant__ float THRESHOLD = 1.0f;
__constant__ float HALF = 0.5f;

__global__ void smooth_l1_loss_const_mem_kernel(
    const float* __restrict__ predictions,
    const float* __restrict__ targets,
    float* output,
    const int n_elements
) {
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    int stride = gridDim.x * blockDim.x;
    float thread_sum = 0.0f;

    // Pre-cast pointers for vectorized processing
    const float4* pred4 = reinterpret_cast<const float4*>(predictions);
    const float4* targ4 = reinterpret_cast<const float4*>(targets);
    const int vec_count = n_elements / 4;

    // Vectorized processing using float4
    for (int i = idx; i < vec_count; i += stride) {
        float4 p = __ldg(pred4 + i);
        float4 t = __ldg(targ4 + i);

        // Process x component
        float diff = p.x - t.x;
        float abs_diff = fabsf(diff);
        thread_sum += (abs_diff < THRESHOLD) ?
            HALF * diff * diff :
            abs_diff - HALF;

        // Process y component
        diff = p.y - t.y;
        abs_diff = fabsf(diff);
        thread_sum += (abs_diff < THRESHOLD) ?
            HALF * diff * diff :
            abs_diff - HALF;

        // Process z component
        diff = p.z - t.z;
        abs_diff = fabsf(diff);
        thread_sum += (abs_diff < THRESHOLD) ?
            HALF * diff * diff :
            abs_diff - HALF;

        // Process w component
        diff = p.w - t.w;
        abs_diff = fabsf(diff);
        thread_sum += (abs_diff < THRESHOLD) ?
            HALF * diff * diff :
            abs_diff - HALF;
    }

    // Handle remaining elements that do not fit a float4
    int scalar_start = vec_count * 4;
    for (int i = scalar_start + idx; i < n_elements; i += stride) {
        float diff = __ldg(predictions + i) - __ldg(targets + i);
        float abs_diff = fabsf(diff);
        thread_sum += (abs_diff < THRESHOLD) ?
            HALF * diff * diff :
            abs_diff - HALF;
    }

    // Block-level reduction using shared memory
    __shared__ float shared_mem[256];
    shared_mem[tid] = thread_sum;
    __syncthreads();

    // Tree reduction down to the last warp
    for (int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) {
            shared_mem[tid] += shared_mem[tid + s];
        }
        __syncthreads();
    }

    // Warp-level reduction (last 32 elements)
    if (tid < 32) {
        volatile float* smem = shared_mem;
        if (blockDim.x >= 64) smem[tid] += smem[tid + 32];
        if (blockDim.x >= 32) smem[tid] += smem[tid + 16];
        if (blockDim.x >= 16) smem[tid] += smem[tid + 8];
        if (blockDim.x >= 8) smem[tid] += smem[tid + 4];
        if (blockDim.x >= 4) smem[tid] += smem[tid + 2];
        if (blockDim.x >= 2) smem[tid] += smem[tid + 1];
    }

    // Final atomic add: each block contributes its partial sum already divided
    // by n_elements, so the accumulated output is the mean loss
    if (tid == 0) {
        atomicAdd(output, shared_mem[0] / n_elements);
    }
}

torch::Tensor smooth_l1_loss_const_mem(
    torch::Tensor predictions,
    torch::Tensor targets
) {
    TORCH_CHECK(predictions.sizes() == targets.sizes(), "Input tensors must have the same shape");
    TORCH_CHECK(predictions.is_contiguous() && targets.is_contiguous(), "Input tensors must be contiguous");
    TORCH_CHECK(predictions.device().is_cuda() && targets.device().is_cuda(), "Inputs must be CUDA tensors");

    int n = predictions.numel();
    auto output = torch::zeros({1}, predictions.options());

    const int block_size = 256;
    int grid_size = (n / 4 + block_size - 1) / block_size;
    grid_size = grid_size > 0 ? grid_size : 1;

    smooth_l1_loss_const_mem_kernel<<<grid_size, block_size>>>(
        predictions.data_ptr<float>(),
        targets.data_ptr<float>(),
        output.data_ptr<float>(),
        n
    );

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &smooth_l1_loss_const_mem, "Smooth L1 Loss with constant memory optimization");
}
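A sketch of how the extension above might be built and checked against the PyTorch reference, using torch.utils.cpp_extension.load for a JIT build; the source file name and module name are assumptions, not part of the listing. Because the kernel accumulates block partial sums with floating-point atomics, the comparison uses a small tolerance.

# Hypothetical build-and-verify script for the CUDA extension above.
import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

ext = load(
    name="smooth_l1_loss_const_mem",          # assumed module name
    sources=["smooth_l1_loss_const_mem.cu"],  # assumed file name for the source above
    verbose=True,
)

predictions = torch.randn(128, 4096, device="cuda")
targets = torch.randn(128, 4096, device="cuda")

# The extension returns a 1-element tensor holding the mean loss.
out = ext.forward(predictions, targets)
ref = F.smooth_l1_loss(predictions, targets)
print(out.item(), ref.item())
assert torch.allclose(out.squeeze(), ref, atol=1e-4)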
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 0.656 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.372 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 18.110 | % | 0.016 | 5 |
Issued Ipc Active | 0.726 | inst/cycle | 0.000 | 5 |
SM Busy | 18.110 | % | 0.016 | 5 |
Memory Throughput | 800255768024.426 | byte/second | 101636694893556432896.000 | 5 |
Mem Busy | 13.940 | % | 0.031 | 5 |
Max Bandwidth | 24.096 | % | 0.110 | 5 |
L1/TEX Hit Rate | 0.000 | % | 0.000 | 5 |
L2 Hit Rate | 18.612 | % | 0.001 | 5 |
Mem Pipes Busy | 8.956 | % | 0.015 | 5 |
Warp Cycles Per Issued Instruction | 38.456 | cycle | 0.819 | 5 |
Warp Cycles Per Executed Instruction | 42.314 | cycle | 0.966 | 5 |
Avg. Active Threads Per Warp | 31.570 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 27.550 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 16.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 42.388 | % | 0.001 | 5 |
Achieved Active Warps Per SM | 27.130 | warp | 0.000 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (42.4%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 467656.40 | μs |
Device Time | 308.26 | μs |
Self CPU Time | 72.99 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zeros | ||
CPU Time | 5432731.74 | μs |
Device Time | 218186.47 | μs |
Self CPU Time | 158458.17 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 5860613.66 | μs |
Device Time | 7405671.45 | μs |
Self CPU Time | 321675.71 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 5538940.20 | μs |
Device Time | 7405671.45 | μs |
Self CPU Time | 391805.90 | μs |
Self Device Time | 7405671.45 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 5635682.94 | μs |
Device Time | 2840.51 | μs |
Self CPU Time | 5635682.94 | μs |
Self Device Time | 2840.51 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
smooth_l1_loss_const_mem_kernel(float const*, float const*, float*, int) | ||
CPU Time | 0.00 | μs |
Device Time | 463940.44 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 463940.44 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaEventRecord | ||
CPU Time | 253607.40 | μs |
Device Time | 1191926.89 | μs |
Self CPU Time | 253607.40 | μs |
Self Device Time | 1191926.89 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 7187484.98 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 7187484.98 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |