95_CrossEntropyLoss
• warp_divergence_minimization_base_base
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """
    Computes the Cross Entropy Loss for multi-class classification tasks.

    Args:
        predictions (torch.Tensor): Predicted values.
        targets (torch.Tensor): Target values.

    Returns:
        torch.Tensor: Cross Entropy Loss.
    """
    return F.cross_entropy(predictions, targets)


class Model(nn.Module):
    """
    A model that computes Cross Entropy Loss for multi-class classification tasks.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets, fn=module_fn):
        return fn(predictions, targets)


batch_size = 4096
num_classes = 10
input_shape = (num_classes,)  # Output for each class
dim = 1


def get_inputs():
    return [
        torch.randn(batch_size, *input_shape),
        torch.randint(0, num_classes, (batch_size,)),
    ]


def get_init_inputs():
    return []
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    A model that computes Cross Entropy Loss for multi-class classification tasks.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        return torch.nn.functional.cross_entropy(predictions, targets)


batch_size = 4096
num_classes = 10
input_shape = (num_classes,)  # Output for each class
dim = 1


def get_inputs():
    return [torch.randn(batch_size, *input_shape), torch.randint(0, num_classes, (batch_size,))]


def get_init_inputs():
    return []
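For reference, the per-sample loss that the CUDA kernel below computes can be written out in plain PyTorch. This is only an illustrative sketch of the max-shifted log-sum-exp formulation the kernel uses, not part of the benchmark harness; it should agree with F.cross_entropy up to floating-point rounding.

import torch
import torch.nn.functional as F


def stable_cross_entropy_reference(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    # Shift by the row-wise max for numerical stability, mirroring the kernel.
    max_logit = logits.max(dim=1, keepdim=True).values
    shifted = logits - max_logit
    log_sum_exp = shifted.exp().sum(dim=1).log()
    # loss_i = -((z[i, t_i] - max_i) - log(sum_j exp(z[i, j] - max_i)))
    per_sample = -(shifted.gather(1, targets.unsqueeze(1)).squeeze(1) - log_sum_exp)
    return per_sample.mean()


# Sanity check against the built-in implementation (illustrative only).
logits = torch.randn(4096, 10)
targets = torch.randint(0, 10, (4096,))
assert torch.allclose(stable_cross_entropy_reference(logits, targets),
                      F.cross_entropy(logits, targets), atol=1e-6)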
#include <torch/extension.h>
__global__ void cross_entropy_loss_kernel_warp_divergence(
    const float* __restrict__ logits,
    const int64_t* __restrict__ targets,
    float* __restrict__ losses,
    int batch_size,
    int num_classes
)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < batch_size)
    {
        // Get pointer to logits for sample i
        const float* logits_i = logits + i * num_classes;
        int64_t target = targets[i];

        // Compute max logit for numerical stability
        float max_logit = logits_i[0];
        for (int j = 1; j < num_classes; j++)
        {
            max_logit = fmaxf(max_logit, logits_i[j]);
        }

        // Compute sum of exp(logits - max_logit)
        float sum_exp = 0.0f;
        for (int j = 0; j < num_classes; j++)
        {
            sum_exp += expf(logits_i[j] - max_logit);
        }

        // Compute log_sum_exp
        float log_sum_exp = logf(sum_exp);

        // Compute loss for this sample
        float loss = -(logits_i[target] - max_logit - log_sum_exp);
        losses[i] = loss;
    }
}
torch::Tensor forward(torch::Tensor predictions, torch::Tensor targets)
{
    // Ensure inputs are on CUDA
    TORCH_CHECK(predictions.is_cuda(), "predictions must be a CUDA tensor");
    TORCH_CHECK(targets.is_cuda(), "targets must be a CUDA tensor");

    // Ensure inputs have correct dimensions
    TORCH_CHECK(predictions.dim() == 2, "predictions must be a 2D tensor");
    TORCH_CHECK(targets.dim() == 1, "targets must be a 1D tensor");

    // Ensure data types are correct
    TORCH_CHECK(predictions.dtype() == torch::kFloat32, "predictions must be Float32 tensor");
    TORCH_CHECK(targets.dtype() == torch::kInt64, "targets must be Int64 tensor");

    // The kernel indexes rows as logits + i * num_classes, so the inputs must be contiguous
    TORCH_CHECK(predictions.is_contiguous(), "predictions must be contiguous");
    TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous");

    int batch_size = predictions.size(0);
    int num_classes = predictions.size(1);
    TORCH_CHECK(targets.size(0) == batch_size, "targets must have same batch size as predictions");

    // Output tensor for losses per sample
    auto losses = torch::empty({batch_size}, predictions.options());

    // Launch CUDA kernel: one thread per sample
    int threads = 256;
    int blocks = (batch_size + threads - 1) / threads;
    cross_entropy_loss_kernel_warp_divergence<<<blocks, threads>>>(
        predictions.data_ptr<float>(),
        targets.data_ptr<int64_t>(),
        losses.data_ptr<float>(),
        batch_size,
        num_classes);

    // Check for CUDA errors
    cudaError_t err = cudaGetLastError();
    TORCH_CHECK(err == cudaSuccess, "Error in cross_entropy_loss_kernel_warp_divergence: ", cudaGetErrorString(err));

    // Compute mean loss over batch
    auto loss = losses.mean();
    return loss;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Cross Entropy Loss forward (CUDA)");
}
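One way to exercise this extension is to JIT-compile it with torch.utils.cpp_extension.load and compare its output against F.cross_entropy. The source file name and module name used below are assumptions for illustration; the listing above does not specify how the extension is built, and the snippet requires a CUDA device.

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# JIT-compile the kernel above; the .cu file name is hypothetical.
ext = load(name="cross_entropy_warp_divergence",
           sources=["cross_entropy_warp_divergence.cu"],
           verbose=True)

predictions = torch.randn(4096, 10, device="cuda", dtype=torch.float32)
targets = torch.randint(0, 10, (4096,), device="cuda", dtype=torch.int64)

loss_cuda = ext.forward(predictions, targets)
loss_ref = F.cross_entropy(predictions, targets)
print(loss_cuda.item(), loss_ref.item())
assert torch.allclose(loss_cuda, loss_ref, atol=1e-5)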
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 0.574 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.030 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 14.618 | % | 0.012 | 5 |
Issued Ipc Active | 0.584 | inst/cycle | 0.000 | 5 |
SM Busy | 14.618 | % | 0.012 | 5 |
Memory Throughput | 52918856781.570 | byte/second | 262867735991106368.000 | 5 |
Mem Busy | 8.884 | % | 0.007 | 5 |
Max Bandwidth | 4.816 | % | 0.002 | 5 |
L1/TEX Hit Rate | 92.360 | % | 0.000 | 5 |
L2 Hit Rate | 87.796 | % | 0.005 | 5 |
Mem Pipes Busy | 0.384 | % | 0.000 | 5 |
Warp Cycles Per Issued Instruction | 13.176 | cycle | 0.053 | 5 |
Warp Cycles Per Executed Instruction | 13.434 | cycle | 0.054 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 29.840 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 11.970 | % | 0.001 | 5 |
Achieved Active Warps Per SM | 7.660 | warp | 0.000 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.0%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
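The low achieved occupancy follows directly from the launch configuration: 4096 samples at 256 threads per block gives only 16 blocks, far too few to fill the device. A rough back-of-the-envelope sketch is below; the assumption that the 16 blocks land on 16 different SMs is illustrative, since the profile does not report how blocks were scheduled.

batch_size, threads_per_block, warp_size = 4096, 256, 32

blocks = (batch_size + threads_per_block - 1) // threads_per_block  # 16 blocks
warps_per_block = threads_per_block // warp_size                    # 8 warps per block
total_warps = blocks * warps_per_block                              # 128 warps in the whole grid

# If the 16 blocks occupy 16 different SMs (assumption), each SM holds 8 resident
# warps out of the 64 it could theoretically hold, i.e. about 12%, which is roughly
# the achieved occupancy reported above.
achieved = warps_per_block / 64
print(blocks, total_warps, f"{achieved:.1%}")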
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 560048.26 | μs |
Device Time | 11.26 | μs |
Self CPU Time | 44.17 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 560004.09 | μs |
Device Time | 11.26 | μs |
Self CPU Time | 103.36 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::empty_strided | ||
CPU Time | 559768.17 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 92.67 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceGetStreamPriorityRange | ||
CPU Time | 558831.99 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 558831.99 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 754907.95 | μs |
Device Time | 8098.39 | μs |
Self CPU Time | 754907.95 | μs |
Self Device Time | 8098.39 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::mean | ||
CPU Time | 175654.26 | μs |
Device Time | 53654.13 | μs |
Self CPU Time | 104552.61 | μs |
Self Device Time | 53654.13 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::reduce_kernel<512, 1, at::native::ReduceOp<float, at::native::MeanOps<float, float, float, float>, unsigned int, float, 4> >(at::native::ReduceOp<float, at::native::MeanOps<float, float, float, float>, unsigned int, float, 4>) | ||
CPU Time | 0.00 | μs |
Device Time | 53654.13 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 53654.13 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 103765.50 | μs |
Device Time | 1016099.40 | μs |
Self CPU Time | 23011.42 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 80755.62 | μs |
Device Time | 1016099.40 | μs |
Self CPU Time | 26815.77 | μs |
Self Device Time | 1016099.40 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 1016178.02 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 1016178.02 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45282 warnings generated when compiling for host. Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.