97_CosineSimilarityLoss
• blocksize_tuning_cosine_loss_base
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """
    Computes the Cosine Similarity Loss for comparing vectors.

    Args:
        predictions (torch.Tensor): Predicted values.
        targets (torch.Tensor): Target values.

    Returns:
        torch.Tensor: Cosine Similarity Loss.
    """
    cosine_sim = F.cosine_similarity(predictions, targets, dim=1)
    return torch.mean(1 - cosine_sim)


class Model(nn.Module):
    """
    A model that computes Cosine Similarity Loss for comparing vectors.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets, fn=module_fn):
        return fn(predictions, targets)


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [
        torch.randn(batch_size, *input_shape),
        torch.randn(batch_size, *input_shape),
    ]


def get_init_inputs():
    return []
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    A model that computes Cosine Similarity Loss for comparing vectors.

    Parameters:
        None
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, predictions, targets):
        cosine_sim = torch.nn.functional.cosine_similarity(predictions, targets, dim=1)
        return torch.mean(1 - cosine_sim)


batch_size = 128
input_shape = (4096,)
dim = 1


def get_inputs():
    return [torch.randn(batch_size, *input_shape), torch.randn(batch_size, *input_shape)]


def get_init_inputs():
    return []
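As a quick sanity check (a minimal sketch, assuming CPU execution of the reference code above), the loss on the random inputs from get_inputs() should land near 1.0, since independent high-dimensional Gaussian vectors are nearly orthogonal and their cosine similarity is close to 0:

# Evaluate the reference Model on the benchmark inputs defined above.
model = Model()
predictions, targets = get_inputs()
loss = model(predictions, targets)
print(loss.item())  # expected to be close to 1.0 for 4096-dim random vectors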
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

// Templated kernel that uses different block sizes for tuning performance
template <int BLOCK_SIZE>
__global__ void blocksize_tuning_cosine_similarity_loss_kernel(const float* __restrict__ predictions,
                                                               const float* __restrict__ targets,
                                                               float* output,
                                                               const int N,
                                                               const int D) {
    // Each block processes one row
    const int row = blockIdx.x;
    const int tid = threadIdx.x;

    float sum_dot = 0.0f;
    float sum_pred_sq = 0.0f;
    float sum_target_sq = 0.0f;

    // Iterate over the D dimension in strides of BLOCK_SIZE
    for (int i = tid; i < D; i += BLOCK_SIZE) {
        float p = predictions[row * D + i];
        float t = targets[row * D + i];
        sum_dot += p * t;
        sum_pred_sq += p * p;
        sum_target_sq += t * t;
    }

    // Warp-level reduction using shuffle within each warp (warp size is 32)
    for (int offset = 16; offset > 0; offset /= 2) {
        sum_dot += __shfl_down_sync(0xffffffff, sum_dot, offset);
        sum_pred_sq += __shfl_down_sync(0xffffffff, sum_pred_sq, offset);
        sum_target_sq += __shfl_down_sync(0xffffffff, sum_target_sq, offset);
    }

    // Allocate shared memory for partial results from each warp
    constexpr int NUM_WARPS = BLOCK_SIZE / 32;
    __shared__ float s_dot[NUM_WARPS];
    __shared__ float s_pred_sq[NUM_WARPS];
    __shared__ float s_target_sq[NUM_WARPS];

    int warp_id = tid / 32;
    int lane = tid & 31;  // tid % 32
    if (lane == 0) {
        s_dot[warp_id] = sum_dot;
        s_pred_sq[warp_id] = sum_pred_sq;
        s_target_sq[warp_id] = sum_target_sq;
    }
    __syncthreads();

    // Final reduction: the first NUM_WARPS threads reduce the per-warp partial sums.
    // The shuffle mask names only the participating threads so the intrinsic stays well defined.
    float final_dot = 0.0f;
    float final_pred_sq = 0.0f;
    float final_target_sq = 0.0f;
    if (tid < NUM_WARPS) {
        constexpr unsigned FINAL_MASK = 0xffffffffu >> (32 - NUM_WARPS);
        final_dot = s_dot[tid];
        final_pred_sq = s_pred_sq[tid];
        final_target_sq = s_target_sq[tid];
        // Reduce within the first warp
        for (int offset = NUM_WARPS / 2; offset > 0; offset /= 2) {
            final_dot += __shfl_down_sync(FINAL_MASK, final_dot, offset);
            final_pred_sq += __shfl_down_sync(FINAL_MASK, final_pred_sq, offset);
            final_target_sq += __shfl_down_sync(FINAL_MASK, final_target_sq, offset);
        }
        if (tid == 0) {
            const float eps = 1e-8f;
            float norm_pred = sqrtf(final_pred_sq);
            float norm_target = sqrtf(final_target_sq);
            float denominator = norm_pred * norm_target;
            denominator = fmaxf(denominator, eps);
            float cos_sim = final_dot / denominator;
            // Accumulate loss over rows and average by dividing by N
            atomicAdd(output, (1.0f - cos_sim) / N);
        }
    }
}
// Host binding function with block size dispatching
torch::Tensor blocksize_tuning_cosine_similarity_loss_forward(torch::Tensor predictions, torch::Tensor targets) {
    TORCH_CHECK(predictions.is_cuda() && targets.is_cuda(), "inputs must be CUDA tensors");
    TORCH_CHECK(predictions.is_contiguous() && targets.is_contiguous(), "inputs must be contiguous");
    TORCH_CHECK(predictions.dim() == 2, "predictions must be 2D");
    TORCH_CHECK(targets.dim() == 2, "targets must be 2D");
    TORCH_CHECK(predictions.sizes() == targets.sizes(), "Input tensors must have the same shape");
    TORCH_CHECK(predictions.scalar_type() == torch::kFloat32, "predictions must be float32");
    TORCH_CHECK(targets.scalar_type() == torch::kFloat32, "targets must be float32");

    int N = predictions.size(0);
    int D = predictions.size(1);
    auto output = torch::zeros({1}, predictions.options());

    // Dispatch over a range of block sizes based on the D dimension
    if (D <= 64) {
        blocksize_tuning_cosine_similarity_loss_kernel<32><<<N, 32>>>(
            predictions.data_ptr<float>(),
            targets.data_ptr<float>(),
            output.data_ptr<float>(),
            N, D);
    } else if (D <= 128) {
        blocksize_tuning_cosine_similarity_loss_kernel<64><<<N, 64>>>(
            predictions.data_ptr<float>(),
            targets.data_ptr<float>(),
            output.data_ptr<float>(),
            N, D);
    } else if (D <= 256) {
        blocksize_tuning_cosine_similarity_loss_kernel<128><<<N, 128>>>(
            predictions.data_ptr<float>(),
            targets.data_ptr<float>(),
            output.data_ptr<float>(),
            N, D);
    } else if (D <= 512) {
        blocksize_tuning_cosine_similarity_loss_kernel<256><<<N, 256>>>(
            predictions.data_ptr<float>(),
            targets.data_ptr<float>(),
            output.data_ptr<float>(),
            N, D);
    } else {
        blocksize_tuning_cosine_similarity_loss_kernel<512><<<N, 512>>>(
            predictions.data_ptr<float>(),
            targets.data_ptr<float>(),
            output.data_ptr<float>(),
            N, D);
    }
    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &blocksize_tuning_cosine_similarity_loss_forward, "Blocksize Tuning Cosine Similarity Loss Forward (CUDA)");
}
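A minimal usage sketch for the extension, assuming the CUDA source above is saved as blocksize_tuning_cosine_loss.cu (the file name is illustrative): it builds the module with torch.utils.cpp_extension.load and checks the kernel result against the PyTorch reference.

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# Compile the extension; "blocksize_tuning_cosine_loss.cu" is an assumed file name.
ext = load(name="blocksize_tuning_cosine_loss", sources=["blocksize_tuning_cosine_loss.cu"])

preds = torch.randn(128, 4096, device="cuda")
tgts = torch.randn(128, 4096, device="cuda")

ref = torch.mean(1 - F.cosine_similarity(preds, tgts, dim=1))
out = ext.forward(preds, tgts)  # returns a 1-element tensor

print(torch.allclose(out.squeeze(), ref, atol=1e-5))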
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 0.560 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.294 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 14.424 | % | 0.003 | 5 |
Issued Ipc Active | 0.578 | inst/cycle | 0.000 | 5 |
SM Busy | 14.424 | % | 0.003 | 5 |
Memory Throughput | 833607712359.544 | byte/second | 312386639960832344064.000 | 5 |
Mem Busy | 14.484 | % | 0.093 | 5 |
Max Bandwidth | 25.040 | % | 0.298 | 5 |
L1/TEX Hit Rate | 0.000 | % | 0.000 | 5 |
L2 Hit Rate | 18.636 | % | 0.001 | 5 |
Mem Pipes Busy | 7.032 | % | 0.023 | 5 |
Warp Cycles Per Issued Instruction | 25.692 | cycle | 0.008 | 5 |
Warp Cycles Per Executed Instruction | 26.372 | cycle | 0.008 | 5 |
Avg. Active Threads Per Warp | 28.050 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 27.390 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 4.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 12.000 | block | 0.000 | 5 |
Block Limit Warps | 4.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 23.286 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 14.902 | warp | 0.000 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (23.3%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 588346.83 | μs |
Device Time | 312.09 | μs |
Self CPU Time | 68.37 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zeros | ||
CPU Time | 5658079.52 | μs |
Device Time | 224613.26 | μs |
Self CPU Time | 155619.54 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 5985695.14 | μs |
Device Time | 7420625.95 | μs |
Self CPU Time | 316012.81 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 5669687.50 | μs |
Device Time | 7420625.95 | μs |
Self CPU Time | 401783.50 | μs |
Self Device Time | 7420625.95 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 5646784.67 | μs |
Device Time | 2847.33 | μs |
Self CPU Time | 5646784.67 | μs |
Self Device Time | 2847.33 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void blocksize_tuning_cosine_similarity_loss_kernel<512>(float const*, float const*, float*, int, int) | ||
CPU Time | 0.00 | μs |
Device Time | 435520.27 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 435520.27 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaEventRecord | ||
CPU Time | 254963.05 | μs |
Device Time | 1193306.76 | μs |
Self CPU Time | 254963.05 | μs |
Self Device Time | 1193306.76 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 7196012.70 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 7196012.70 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45284 warnings generated when compiling for host. Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.