```python
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """
    Computes the Kullback-Leibler Divergence for comparing two distributions.

    Args:
        predictions (torch.Tensor): Predicted probability distribution.
        targets (torch.Tensor): Target probability distribution.

    Returns:
        torch.Tensor: Kullback-Leibler Divergence.
    """
    return F.kl_div(torch.log(predictions), targets, reduction="batchmean")


class Model(nn.Module):
    """
    A model that computes Kullback-Leibler Divergence for comparing two distributions.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets, fn=module_fn):
        return fn(predictions, targets)


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [
        torch.randn(batch_size, *input_shape).softmax(dim=-1),
        torch.randn(batch_size, *input_shape).softmax(dim=-1),
    ]


def get_init_inputs():
    return []
```
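A minimal way to exercise this reference implementation is to feed the generated inputs through both the functional and module paths. The snippet below is an illustrative sketch, not part of the original harness; it assumes only the definitions above and runs on CPU.

```python
# Sketch only: evaluate the reference KL divergence on the generated inputs.
if __name__ == "__main__":
    predictions, targets = get_inputs()
    loss_fn = module_fn(predictions, targets)      # functional path
    loss_module = Model()(predictions, targets)    # nn.Module path (delegates to module_fn)
    assert torch.allclose(loss_fn, loss_module)
    print(f"KL divergence (batchmean): {loss_fn.item():.6f}")
```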
```python
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    A model that computes Kullback-Leibler Divergence for comparing two distributions.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        return torch.nn.functional.kl_div(
            torch.log(predictions), targets, reduction='batchmean'
        )


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [
        torch.randn(batch_size, *input_shape).softmax(dim=-1),
        torch.randn(batch_size, *input_shape).softmax(dim=-1),
    ]


def get_init_inputs():
    return []
```
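For reference, with probability-valued inputs and `reduction="batchmean"`, the call above evaluates the KL divergence of `predictions` from `targets`, summed per row and averaged over the batch:

$$
\mathrm{KL}(t \,\|\, p) = \frac{1}{B} \sum_{i=1}^{B} \sum_{j} t_{ij} \left( \log t_{ij} - \log p_{ij} \right)
$$

Here $B$ is `batch_size` and $j$ ranges over the 4096 elements of each row; `torch.log(predictions)` is passed as the first argument because `F.kl_div` expects log-probabilities for the input.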
```cpp
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

__global__ void kl_div_kernel(
    const float* __restrict__ log_predictions,
    const float* __restrict__ targets,
    float* __restrict__ output,
    const int n) {

    const int tid = threadIdx.x;
    const int warp_id = tid / 32;
    const int lane = tid % 32;
    const int global_idx = blockIdx.x * blockDim.x + tid;

    extern __shared__ float warp_sums[];

    float sum = 0.0f;

    // Vectorized processing: each thread reads float4 chunks in a grid-stride loop
    const int n4 = n / 4;
    const float4* logp_vec = reinterpret_cast<const float4*>(log_predictions);
    const float4* targ_vec = reinterpret_cast<const float4*>(targets);

    int vec_idx = global_idx;
    while (vec_idx < n4) {
        float4 logp = __ldg(&logp_vec[vec_idx]);
        float4 targ = __ldg(&targ_vec[vec_idx]);
        sum += expf(logp.x) - targ.x * logp.x
             + expf(logp.y) - targ.y * logp.y
             + expf(logp.z) - targ.z * logp.z
             + expf(logp.w) - targ.w * logp.w;
        vec_idx += gridDim.x * blockDim.x;
    }

    // Process the scalar remainder (elements not covered by float4) with the same stride
    int scalar_idx = n4 * 4 + global_idx;
    while (scalar_idx < n) {
        float log_pred = __ldg(&log_predictions[scalar_idx]);
        float target = __ldg(&targets[scalar_idx]);
        sum += expf(log_pred) - target * log_pred;
        scalar_idx += gridDim.x * blockDim.x;
    }

    // Manually unrolled warp-level reduction via shuffle
    sum += __shfl_down_sync(0xffffffff, sum, 16);
    sum += __shfl_down_sync(0xffffffff, sum, 8);
    sum += __shfl_down_sync(0xffffffff, sum, 4);
    sum += __shfl_down_sync(0xffffffff, sum, 2);
    sum += __shfl_down_sync(0xffffffff, sum, 1);

    // Lane 0 of each warp writes its partial sum to shared memory
    if (lane == 0)
        warp_sums[warp_id] = sum;
    __syncthreads();

    // First warp reduces the per-warp sums, then a single atomicAdd per block
    if (warp_id == 0) {
        float val = (lane < (blockDim.x / 32)) ? warp_sums[lane] : 0.0f;
        val += __shfl_down_sync(0xffffffff, val, 16);
        val += __shfl_down_sync(0xffffffff, val, 8);
        val += __shfl_down_sync(0xffffffff, val, 4);
        val += __shfl_down_sync(0xffffffff, val, 2);
        val += __shfl_down_sync(0xffffffff, val, 1);
        if (lane == 0)
            atomicAdd(output, val);
    }
}

torch::Tensor kl_div_cuda_forward(
    torch::Tensor log_predictions,
    torch::Tensor targets) {

    const int n = log_predictions.numel();
    auto output = torch::zeros({1}, log_predictions.options());

    const int threads = 256;
    const int warps_per_block = threads / 32;
    const int blocks = (n + threads * 4 - 1) / (threads * 4);
    const int shared_mem = warps_per_block * sizeof(float);

    kl_div_kernel<<<blocks, threads, shared_mem>>>(
        log_predictions.data_ptr<float>(),
        targets.data_ptr<float>(),
        output.data_ptr<float>(),
        n
    );

    // Normalize the accumulated sum by the total element count
    return output / static_cast<float>(n);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &kl_div_cuda_forward, "KL divergence forward (CUDA Unrolled Reduction)");
}
```
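To run the extension end to end, one option is to JIT-compile it with `torch.utils.cpp_extension.load` and call the exported `forward`. The sketch below is illustrative: the source file name `kl_div_unrolled.cu` is an assumption, and a CUDA-capable build environment is required.

```python
# Sketch only: JIT-build the extension and compare it with the PyTorch reference.
import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# "kl_div_unrolled.cu" is a hypothetical file name holding the CUDA source above.
kl_ext = load(name="kl_div_unrolled", sources=["kl_div_unrolled.cu"])

preds = torch.randn(128, 4096, device="cuda").softmax(dim=-1)
targs = torch.randn(128, 4096, device="cuda").softmax(dim=-1)

# Like F.kl_div, the extension takes log-probabilities as its first argument.
out_cuda = kl_ext.forward(torch.log(preds), targs)
out_ref = F.kl_div(torch.log(preds), targs, reduction="batchmean")
print(out_cuda.item(), out_ref.item())
```

Note that the kernel accumulates `expf(log_p) - t * log_p` and normalizes by the total element count, whereas `batchmean` divides the summed `t * (log t - log p)` terms by the batch size, so the two printed values are shown side by side rather than asserted equal.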
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 0.554 | inst/cycle | 0.002 | 5 |
Executed Ipc Elapsed | 0.290 | inst/cycle | 0.001 | 5 |
Issue Slots Busy | 14.874 | % | 1.569 | 5 |
Issued Ipc Active | 0.594 | inst/cycle | 0.002 | 5 |
SM Busy | 14.874 | % | 1.569 | 5 |
Memory Throughput | 838034098910.416 | byte/second | 7730130406354232279040.000 | 5 |
Mem Busy | 14.516 | % | 2.337 | 5 |
Max Bandwidth | 25.104 | % | 6.937 | 5 |
L1/TEX Hit Rate | 0.000 | % | 0.000 | 5 |
L2 Hit Rate | 18.602 | % | 0.007 | 5 |
Mem Pipes Busy | 7.498 | % | 0.614 | 5 |
Warp Cycles Per Issued Instruction | 42.602 | cycle | 0.006 | 5 |
Warp Cycles Per Executed Instruction | 45.540 | cycle | 0.007 | 5 |
Avg. Active Threads Per Warp | 31.790 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 28.140 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 28.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 41.518 | % | 0.299 | 5 |
Achieved Active Warps Per SM | 26.570 | warp | 0.122 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (42.0%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
Operation / Metric | Value | Unit |
---|---|---|
aten::zeros | ||
CPU Time | 4976328.92 | μs |
Device Time | 231757.13 | μs |
Self CPU Time | 129130.54 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 5304990.00 | μs |
Device Time | 7369127.03 | μs |
Self CPU Time | 331224.88 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 4973767.65 | μs |
Device Time | 7369127.03 | μs |
Self CPU Time | 378262.15 | μs |
Self Device Time | 7369127.03 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 5307156.76 | μs |
Device Time | 2050.38 | μs |
Self CPU Time | 5307156.76 | μs |
Self Device Time | 2050.38 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<float>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<float>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 231774.89 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 231774.89 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
kl_div_kernel(float const*, float const*, float*, int) | ||
CPU Time | 0.00 | μs |
Device Time | 429457.50 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 429457.50 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::div | ||
CPU Time | 866195.66 | μs |
Device Time | 230261.34 | μs |
Self CPU Time | 500925.62 | μs |
Self Device Time | 230261.34 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 7137369.90 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 7137369.90 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45283 warnings generated when compiling for host. Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.