47_NetVladNoGhostClusters
• tiled_unroll_block_optimization_base
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch as th
def module_fn(
x: torch.Tensor,
clusters: torch.Tensor,
clusters2: torch.Tensor,
bn_weight: torch.Tensor,
bn_bias: torch.Tensor,
bn_running_mean: torch.Tensor,
bn_running_var: torch.Tensor,
feature_size: int,
cluster_size: int,
is_training: bool,
) -> torch.Tensor:
"""
Functional version of the NetVLAD without ghost clusters
Args:
x: Input tensor of shape (batch_size, num_features, feature_size)
clusters: Weight tensor for cluster assignments
clusters2: Weight tensor for visual words
bn_weight: BatchNorm weight
bn_bias: BatchNorm bias
bn_running_mean: BatchNorm running mean
bn_running_var: BatchNorm running var
feature_size: Size of each feature
cluster_size: Number of clusters (excluding ghost clusters)
is_training: Whether in training mode
Returns:
Output tensor of shape (batch_size, cluster_size * feature_size)
"""
max_sample = x.size()[1]
x = x.view(-1, feature_size) # B x N x D -> BN x D
if x.device != clusters.device:
msg = f"x.device {x.device} != cluster.device {clusters.device}"
raise ValueError(msg)
assignment = th.matmul(x, clusters) # (BN x D) x (D x (K+G)) -> BN x (K+G)
assignment = F.batch_norm(
assignment,
bn_running_mean,
bn_running_var,
bn_weight,
bn_bias,
training=is_training,
)
assignment = F.softmax(assignment, dim=1) # BN x (K+G) -> BN x (K+G)
# remove ghost assignments
assignment = assignment[:, :cluster_size]
assignment = assignment.view(-1, max_sample, cluster_size) # -> B x N x K
a_sum = th.sum(assignment, dim=1, keepdim=True) # B x N x K -> B x 1 x K
a = a_sum * clusters2
assignment = assignment.transpose(1, 2) # B x N x K -> B x K x N
x = x.view(-1, max_sample, feature_size) # BN x D -> B x N x D
vlad = th.matmul(assignment, x) # (B x K x N) x (B x N x D) -> B x K x D
vlad = vlad.transpose(1, 2) # -> B x D x K
vlad = vlad - a
# L2 intra norm
vlad = F.normalize(vlad)
# flattening + L2 norm
vlad = vlad.reshape(-1, cluster_size * feature_size) # -> B x DK
vlad = F.normalize(vlad)
return vlad # B x DK
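# A hedged sanity-check sketch (not part of the original benchmark): the `vlad - a`
# step above is equivalent to forming the classic NetVLAD residuals
# sum_n assignment[b,n,k] * (x[b,n,d] - clusters2[0,d,k]) directly. The einsum below
# materializes a B x N x D x K intermediate, so it is meant only as a small-scale
# reference, not an efficient implementation; the helper name is illustrative.
def reference_vlad_residuals(x, assignment, clusters2):
    # x: B x N x D, assignment: B x N x K, clusters2: 1 x D x K
    residuals = x.unsqueeze(-1) - clusters2.unsqueeze(1)  # B x N x D x K
    return th.einsum("bnk,bndk->bdk", assignment, residuals)  # B x D x K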
class Model(nn.Module):
def __init__(self, cluster_size, feature_size, ghost_clusters):
super(Model, self).__init__()
self.feature_size = feature_size
self.cluster_size = cluster_size
self.ghost_clusters = ghost_clusters
init_sc = 1 / math.sqrt(feature_size)
clusters = cluster_size + ghost_clusters
# The `clusters` weights are the `(w,b)` in the paper
self.clusters = nn.Parameter(init_sc * th.randn(feature_size, clusters))
# Extract batchnorm parameters
bn = nn.BatchNorm1d(clusters)
self.bn_weight = nn.Parameter(bn.weight.data.clone())
self.bn_bias = nn.Parameter(bn.bias.data.clone())
self.bn_running_mean = nn.Parameter(bn.running_mean.data.clone())
self.bn_running_var = nn.Parameter(bn.running_var.data.clone())
# The `clusters2` weights are the visual words `c_k` in the paper
self.clusters2 = nn.Parameter(init_sc * th.randn(1, feature_size, cluster_size))
self.out_dim = self.cluster_size * feature_size
def forward(self, x, fn=module_fn):
return fn(
x,
self.clusters,
self.clusters2,
self.bn_weight,
self.bn_bias,
self.bn_running_mean,
self.bn_running_var,
self.feature_size,
self.cluster_size,
self.training,
)
batch_size = 32
num_features = 100
num_clusters = 32
feature_size = 512
ghost_clusters = 0
def get_inputs():
return [torch.randn(batch_size, num_features, feature_size)]
def get_init_inputs():
return [num_clusters, feature_size, ghost_clusters]
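# Hedged usage sketch (illustration only, not part of the original file): runs the
# functional Model above on the benchmark shapes defined here and checks the output shape.
if __name__ == "__main__":
    model = Model(*get_init_inputs())
    (inp,) = get_inputs()
    out = model(inp)
    assert out.shape == (batch_size, num_clusters * feature_size)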
# Copyright 2018 Antoine Miech All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Code modified from here
https://github.com/albanie/collaborative-experts/blob/master/model/net_vlad.py
"""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch as th
class Model(nn.Module):
def __init__(self, cluster_size, feature_size, ghost_clusters):
super(Model, self).__init__()
self.feature_size = feature_size
self.cluster_size = cluster_size
self.ghost_clusters = ghost_clusters
init_sc = (1 / math.sqrt(feature_size))
clusters = cluster_size + ghost_clusters
# The `clusters` weights are the `(w,b)` in the paper
self.clusters = nn.Parameter(init_sc * th.randn(feature_size, clusters))
self.batch_norm = nn.BatchNorm1d(clusters)
# The `clusters2` weights are the visual words `c_k` in the paper
self.clusters2 = nn.Parameter(init_sc * th.randn(1, feature_size, cluster_size))
self.out_dim = self.cluster_size * feature_size
def forward(self, x, mask=None):
"""Aggregates feature maps into a fixed size representation. In the following
notation, B = batch_size, N = num_features, K = num_clusters, D = feature_size.
Args:
x (th.Tensor): B x N x D
Returns:
(th.Tensor): B x DK
"""
max_sample = x.size()[1]
x = x.view(-1, self.feature_size) # B x N x D -> BN x D
if x.device != self.clusters.device:
msg = f"x.device {x.device} != cluster.device {self.clusters.device}"
raise ValueError(msg)
assignment = th.matmul(x, self.clusters) # (BN x D) x (D x (K+G)) -> BN x (K+G)
assignment = self.batch_norm(assignment)
assignment = F.softmax(assignment, dim=1) # BN x (K+G) -> BN x (K+G)
# remove ghost assignments
assignment = assignment[:, :self.cluster_size]
assignment = assignment.view(-1, max_sample, self.cluster_size) # -> B x N x K
a_sum = th.sum(assignment, dim=1, keepdim=True) # B x N x K -> B x 1 x K
a = a_sum * self.clusters2
assignment = assignment.transpose(1, 2) # B x N x K -> B x K x N
x = x.view(-1, max_sample, self.feature_size) # BN x D -> B x N x D
vlad = th.matmul(assignment, x) # (B x K x N) x (B x N x D) -> B x K x D
vlad = vlad.transpose(1, 2) # -> B x D x K
vlad = vlad - a
# L2 intra norm
vlad = F.normalize(vlad)
# flattening + L2 norm
vlad = vlad.reshape(-1, self.cluster_size * self.feature_size) # -> B x DK
vlad = F.normalize(vlad)
return vlad # B x DK
batch_size = 32
num_features = 100
num_clusters = 32
feature_size = 512
ghost_clusters = 0
def get_inputs():
return [torch.randn(batch_size, num_features, feature_size)]
def get_init_inputs():
return [num_clusters, feature_size, ghost_clusters]
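# Hedged consistency sketch (illustration only): in eval mode this reference Model
# should agree with the functional `module_fn` defined earlier in this listing when
# both are given the same cluster weights and BatchNorm statistics.
if __name__ == "__main__":
    ref = Model(*get_init_inputs()).eval()
    (inp,) = get_inputs()
    with torch.no_grad():
        out_ref = ref(inp)
        out_fn = module_fn(
            inp,
            ref.clusters,
            ref.clusters2,
            ref.batch_norm.weight,
            ref.batch_norm.bias,
            ref.batch_norm.running_mean,
            ref.batch_norm.running_var,
            feature_size,
            num_clusters,
            is_training=False,
        )
    print("max abs diff:", (out_ref - out_fn).abs().max().item())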
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <vector>
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
// Optimized block size for shared memory tiling
#define TILE_DIM 32
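// Each block stages two TILE_DIM x TILE_DIM float tiles in shared memory,
// i.e. 2 * 32 * 32 * 4 bytes = 8 KB per block.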
// Tiled matrix multiplication kernel using shared memory with loop unrolling and optimized block size
// Computes: C = A (MxD) * B (DxN), where A is x (reshaped) and B is clusters
__global__ void tiledMatMulOptimizedBlock(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C,
int M, int D, int N) {
// Each thread computes one element in the output matrix C
// using tiling to reduce global memory accesses
int row = blockIdx.y * TILE_DIM + threadIdx.y;
int col = blockIdx.x * TILE_DIM + threadIdx.x;
__shared__ float tile_A[TILE_DIM][TILE_DIM];
__shared__ float tile_B[TILE_DIM][TILE_DIM];
float sum = 0.0f;
// Loop over tiles in the inner dimension
for (int t = 0; t < (D + TILE_DIM - 1) / TILE_DIM; t++) {
// Load element of A into shared memory if within bounds
int a_col = t * TILE_DIM + threadIdx.x;
if (row < M && a_col < D) {
tile_A[threadIdx.y][threadIdx.x] = A[row * D + a_col];
} else {
tile_A[threadIdx.y][threadIdx.x] = 0.0f;
}
// Load element of B into shared memory if within bounds
int b_row = t * TILE_DIM + threadIdx.y;
if (b_row < D && col < N) {
tile_B[threadIdx.y][threadIdx.x] = B[b_row * N + col];
} else {
tile_B[threadIdx.y][threadIdx.x] = 0.0f;
}
__syncthreads();
// Multiply the two tiles together with loop unrolling
#pragma unroll
for (int i = 0; i < TILE_DIM; i++) {
sum += tile_A[threadIdx.y][i] * tile_B[i][threadIdx.x];
}
__syncthreads();
}
// Write the result to global memory if within bounds
if (row < M && col < N) {
C[row * N + col] = sum;
}
}
// Forward function for NetVLAD (no ghost clusters) using the shared memory optimized kernel
torch::Tensor forward(
torch::Tensor x, // B x N x D
torch::Tensor clusters, // D x (K+G)
torch::Tensor clusters2, // 1 x D x K
torch::Tensor bn_weight, // (K+G)
torch::Tensor bn_bias, // (K+G)
torch::Tensor bn_running_mean, // (K+G)
torch::Tensor bn_running_var, // (K+G)
int64_t feature_size, // D
int64_t cluster_size, // K
bool is_training
) {
CHECK_INPUT(x);
CHECK_INPUT(clusters);
CHECK_INPUT(clusters2);
CHECK_INPUT(bn_weight);
CHECK_INPUT(bn_bias);
CHECK_INPUT(bn_running_mean);
CHECK_INPUT(bn_running_var);
int64_t B = x.size(0);
int64_t N_points = x.size(1); // Number of descriptors/features per sample
int64_t D = feature_size;
int64_t K = cluster_size;
int64_t KplusG = clusters.size(1); // Total clusters including ghost clusters
// Reshape x: B x N x D -> (B*N) x D
auto x_reshaped = x.reshape({B * N_points, D});
// Ensure device consistency
TORCH_CHECK(x_reshaped.device() == clusters.device(), "x.device() != clusters.device()");
// Allocate memory for assignment: (B*N) x (K+G)
auto assignment = torch::empty({B * N_points, KplusG}, x.options());
// Launch tiled matrix multiplication kernel
// x_reshaped: (B*N) x D, clusters: D x (K+G), output: (B*N) x (K+G)
int M = B * N_points; // Number of rows
int inner = D; // Shared inner dimension
int N_dim = KplusG; // Number of columns
dim3 block(TILE_DIM, TILE_DIM);
dim3 grid((N_dim + TILE_DIM - 1) / TILE_DIM, (M + TILE_DIM - 1) / TILE_DIM);
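// For the benchmark shapes used in this listing (B=32, N=100, D=512, K+G=32) this
// gives M = 3200 and N_dim = 32, i.e. grid = (1, 100) with 32x32 = 1024 threads per block.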
tiledMatMulOptimizedBlock<<<grid, block>>>(x_reshaped.data_ptr<float>(), clusters.data_ptr<float>(), assignment.data_ptr<float>(), M, inner, N_dim);
// Apply BatchNorm
assignment = torch::batch_norm(
assignment,
bn_weight,
bn_bias,
bn_running_mean,
bn_running_var,
is_training,
0.1,
1e-5,
true
);
// Apply Softmax along dim=1
assignment = torch::softmax(assignment, 1); // (B*N) x (K+G)
// Remove ghost assignments: keep first K columns
assignment = assignment.narrow(1, 0, K); // (B*N) x K
// Reshape assignment: (B*N) x K -> B x N x K
assignment = assignment.reshape({B, N_points, K});
// Compute a_sum = assignment.sum(dim=1, keepdim=true) // B x 1 x K
auto a_sum = assignment.sum(1, /*keepdim=*/true);
// Expand clusters2 to match batch size: clusters2 is originally 1 x D x K
auto clusters2_exp = clusters2.expand({B, D, K}); // B x D x K
// Compute a = a_sum * clusters2
auto a = clusters2_exp * a_sum; // B x D x K
// Transpose assignment: B x N x K -> B x K x N
assignment = assignment.transpose(1, 2);
// Reshape x back to B x N x D
auto x_orig = x.reshape({B, N_points, D});
// Compute vlad = assignment @ x_orig; assignment: B x K x N, x_orig: B x N x D
auto vlad = torch::bmm(assignment, x_orig); // B x K x D
// Transpose vlad to B x D x K
vlad = vlad.transpose(1, 2);
// Compute vlad = vlad - a
vlad = vlad - a;
// L2 intra-normalization along feature dimension D
vlad = torch::nn::functional::normalize(
vlad, torch::nn::functional::NormalizeFuncOptions().p(2).dim(1));
// Flatten vlad: B x D x K -> B x (D*K)
vlad = vlad.reshape({B, D * K});
// L2 normalization along the flattened dimension
vlad = torch::nn::functional::normalize(
vlad, torch::nn::functional::NormalizeFuncOptions().p(2).dim(1));
return vlad;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "NetVLAD forward with optimized block size and loop unrolling");
}
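A minimal loading sketch (hedged: the source file name "netvlad_tiled.cu" and the JIT-compile workflow below are illustrative assumptions, not part of the original sources). It compiles the extension with torch.utils.cpp_extension.load and feeds it the reference Model's weights and BatchNorm buffers:

import torch
from torch.utils.cpp_extension import load

# Assumed file name; adjust to wherever the CUDA source above is saved.
netvlad_ext = load(name="netvlad_tiled", sources=["netvlad_tiled.cu"])

model = Model(num_clusters, feature_size, ghost_clusters).cuda()
x = torch.randn(batch_size, num_features, feature_size, device="cuda")
out = netvlad_ext.forward(
    x.contiguous(),
    model.clusters.contiguous(),
    model.clusters2.contiguous(),
    model.batch_norm.weight,
    model.batch_norm.bias,
    model.batch_norm.running_mean,
    model.batch_norm.running_var,
    feature_size,
    num_clusters,
    model.training,
)
print(out.shape)  # expected: torch.Size([32, 16384])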
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 1.190 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.820 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 29.856 | % | 0.013 | 5 |
Issued Ipc Active | 1.196 | inst/cycle | 0.000 | 5 |
SM Busy | 29.856 | % | 0.013 | 5 |
Memory Throughput | 228928191355.378 | byte/second | 825393374517274880.000 | 5 |
Mem Busy | 44.408 | % | 0.008 | 5 |
Max Bandwidth | 42.550 | % | 0.007 | 5 |
L1/TEX Hit Rate | 0.000 | % | 0.000 | 5 |
L2 Hit Rate | 35.076 | % | 4.726 | 5 |
Mem Pipes Busy | 37.510 | % | 0.005 | 5 |
Warp Cycles Per Issued Instruction | 26.794 | cycle | 0.030 | 5 |
Warp Cycles Per Executed Instruction | 26.858 | cycle | 0.030 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 31.940 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 2.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 3.000 | block | 0.000 | 5 |
Block Limit Warps | 2.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 49.920 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 31.950 | warp | 0.000 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (49.9%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
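Note on the occupancy warning: the achieved occupancy of 49.9% is simply the 31.95 achieved active warps per SM divided by the 64 theoretical warps per SM; the tightest launch limit (2 blocks per SM from registers, at 32 warps per 1024-thread block) still allows all 64 warps, which is why the theoretical occupancy is reported as 100%.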
Operation / Metric | Value | Unit |
---|---|---|
aten::zero_ | ||
CPU Time | 477481.33 | μs |
Device Time | 2280392.71 | μs |
Self CPU Time | 110246.19 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 367260.72 | μs |
Device Time | 2280392.71 | μs |
Self CPU Time | 141130.08 | μs |
Self Device Time | 2280392.71 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 2022240.85 | μs |
Device Time | 22497.51 | μs |
Self CPU Time | 2022240.85 | μs |
Self Device Time | 22497.51 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
tiledMatMulOptimizedBlock(float const*, float const*, float*, int, int, int) | ||
CPU Time | 0.00 | μs |
Device Time | 736620.39 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 736620.39 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::batch_norm | ||
CPU Time | 1624600.08 | μs |
Device Time | 495819.22 | μs |
Self CPU Time | 52411.75 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_batch_norm_impl_index | ||
CPU Time | 1572188.33 | μs |
Device Time | 495819.22 | μs |
Self CPU Time | 76135.96 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::native_batch_norm | ||
CPU Time | 1424856.37 | μs |
Device Time | 495819.22 | μs |
Self CPU Time | 414805.42 | μs |
Self Device Time | 429495.57 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 2280392.71 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 2280392.71 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::norm | ||
CPU Time | 755282.52 | μs |
Device Time | 312622.19 | μs |
Self CPU Time | 230835.87 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |