← Back to Leaderboard

The AI CUDA Engineer 👷

47_NetVladNoGhostClustersnetvlad_fused_modular_edit_1

Level 3 • Task 47
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch as th


def module_fn(
    x: torch.Tensor,
    clusters: torch.Tensor,
    clusters2: torch.Tensor,
    bn_weight: torch.Tensor,
    bn_bias: torch.Tensor,
    bn_running_mean: torch.Tensor,
    bn_running_var: torch.Tensor,
    feature_size: int,
    cluster_size: int,
    is_training: bool,
) -> torch.Tensor:
    """
    Functional version of the NetVLAD without ghost clusters

    Args:
        x: Input tensor of shape (batch_size, num_features, feature_size)
        clusters: Weight tensor for cluster assignments
        clusters2: Weight tensor for visual words
        bn_weight: BatchNorm weight
        bn_bias: BatchNorm bias
        bn_running_mean: BatchNorm running mean
        bn_running_var: BatchNorm running var
        feature_size: Size of each feature
        cluster_size: Number of clusters (excluding ghost clusters)
        is_training: Whether in training mode

    Returns:
        Output tensor of shape (batch_size, cluster_size * feature_size)
    """
    max_sample = x.size()[1]
    x = x.view(-1, feature_size)  # B x N x D -> BN x D

    if x.device != clusters.device:
        msg = f"x.device {x.device} != cluster.device {clusters.device}"
        raise ValueError(msg)

    assignment = th.matmul(x, clusters)  # (BN x D) x (D x (K+G)) -> BN x (K+G)
    assignment = F.batch_norm(
        assignment,
        bn_running_mean,
        bn_running_var,
        bn_weight,
        bn_bias,
        training=is_training,
    )

    assignment = F.softmax(assignment, dim=1)  # BN x (K+G) -> BN x (K+G)
    # remove ghost assigments
    assignment = assignment[:, :cluster_size]
    assignment = assignment.view(-1, max_sample, cluster_size)  # -> B x N x K
    a_sum = th.sum(assignment, dim=1, keepdim=True)  # B x N x K -> B x 1 x K
    a = a_sum * clusters2

    assignment = assignment.transpose(1, 2)  # B x N x K -> B x K x N

    x = x.view(-1, max_sample, feature_size)  # BN x D -> B x N x D
    vlad = th.matmul(assignment, x)  # (B x K x N) x (B x N x D) -> B x K x D
    vlad = vlad.transpose(1, 2)  # -> B x D x K
    vlad = vlad - a

    # L2 intra norm
    vlad = F.normalize(vlad)

    # flattening + L2 norm
    vlad = vlad.reshape(-1, cluster_size * feature_size)  # -> B x DK
    vlad = F.normalize(vlad)
    return vlad  # B x DK


class Model(nn.Module):
    def __init__(self, cluster_size, feature_size, ghost_clusters):
        super(Model, self).__init__()

        self.feature_size = feature_size
        self.cluster_size = cluster_size
        self.ghost_clusters = ghost_clusters

        init_sc = 1 / math.sqrt(feature_size)
        clusters = cluster_size + ghost_clusters

        # The `clusters` weights are the `(w,b)` in the paper
        self.clusters = nn.Parameter(init_sc * th.randn(feature_size, clusters))

        # Extract batchnorm parameters
        bn = nn.BatchNorm1d(clusters)
        self.bn_weight = nn.Parameter(bn.weight.data.clone())
        self.bn_bias = nn.Parameter(bn.bias.data.clone())
        self.bn_running_mean = nn.Parameter(bn.running_mean.data.clone())
        self.bn_running_var = nn.Parameter(bn.running_var.data.clone())

        # The `clusters2` weights are the visual words `c_k` in the paper
        self.clusters2 = nn.Parameter(init_sc * th.randn(1, feature_size, cluster_size))
        self.out_dim = self.cluster_size * feature_size

    def forward(self, x, fn=module_fn):
        return fn(
            x,
            self.clusters,
            self.clusters2,
            self.bn_weight,
            self.bn_bias,
            self.bn_running_mean,
            self.bn_running_var,
            self.feature_size,
            self.cluster_size,
            self.training,
        )


batch_size = 32
num_features = 100
num_clusters = 32
feature_size = 512
ghost_clusters = 0


def get_inputs():
    return [torch.randn(batch_size, num_features, feature_size)]


def get_init_inputs():
    return [num_clusters, feature_size, ghost_clusters]
# Copyright 2018 Antoine Miech All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Code modified from here
https://github.com/albanie/collaborative-experts/blob/master/model/net_vlad.py
"""


import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch as th


class Model(nn.Module):
    def __init__(self, cluster_size, feature_size, ghost_clusters):
        super(Model, self).__init__()

        self.feature_size = feature_size
        self.cluster_size = cluster_size
        self.ghost_clusters = ghost_clusters

        init_sc = (1 / math.sqrt(feature_size))
        clusters = cluster_size + ghost_clusters

        # The `clusters` weights are the `(w,b)` in the paper
        self.clusters = nn.Parameter(init_sc * th.randn(feature_size, clusters))
        self.batch_norm = nn.BatchNorm1d(clusters)
        # The `clusters2` weights are the visual words `c_k` in the paper
        self.clusters2 = nn.Parameter(init_sc * th.randn(1, feature_size, cluster_size))
        self.out_dim = self.cluster_size * feature_size

    def forward(self, x, mask=None):
        """Aggregates feature maps into a fixed size representation.  In the following
        notation, B = batch_size, N = num_features, K = num_clusters, D = feature_size.

        Args:
            x (th.Tensor): B x N x D

        Returns:
            (th.Tensor): B x DK
        """
        max_sample = x.size()[1]
        x = x.view(-1, self.feature_size)  # B x N x D -> BN x D

        if x.device != self.clusters.device:
            msg = f"x.device {x.device} != cluster.device {self.clusters.device}"
            raise ValueError(msg)

        assignment = th.matmul(x, self.clusters)  # (BN x D) x (D x (K+G)) -> BN x (K+G)
        assignment = self.batch_norm(assignment)

        assignment = F.softmax(assignment, dim=1)  # BN x (K+G) -> BN x (K+G)
        # remove ghost assigments
        assignment = assignment[:, :self.cluster_size]
        assignment = assignment.view(-1, max_sample, self.cluster_size)  # -> B x N x K
        a_sum = th.sum(assignment, dim=1, keepdim=True)  # B x N x K -> B x 1 x K
        a = a_sum * self.clusters2

        assignment = assignment.transpose(1, 2)  # B x N x K -> B x K x N

        x = x.view(-1, max_sample, self.feature_size)  # BN x D -> B x N x D
        vlad = th.matmul(assignment, x)  # (B x K x N) x (B x N x D) -> B x K x D
        vlad = vlad.transpose(1, 2)  # -> B x D x K
        vlad = vlad - a

        # L2 intra norm
        vlad = F.normalize(vlad)

        # flattening + L2 norm
        vlad = vlad.reshape(-1, self.cluster_size * self.feature_size)  # -> B x DK
        vlad = F.normalize(vlad)
        return vlad  # B x DK

batch_size = 32
num_features = 100
num_clusters = 32
feature_size = 512
ghost_clusters = 0

def get_inputs():
  return [torch.randn(batch_size, num_features, feature_size)]

def get_init_inputs():
  return [num_clusters, feature_size, ghost_clusters]

Kernel Information

Related Kernels (Level 3, Task 47 • 47_NetVladNoGhostClusters)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 netvlad_fused_streams_edit_1 0.07 1.79 1.14
🥈 netvlad_fused_assign_edit_1 0.07 1.77 1.12
🥈 netvlad_fused_assign_warp_reduce_edit_1 0.07 1.77 1.12
4 netvlad_fused_assign_base 0.07 1.71 1.09
5 netvlad_stream_overlap_edit_1 0.07 1.69 1.07
6 netvlad_fused_modular_base 0.09 1.39 0.88
7 netvlad_fused_modular_edit_1 0.09 1.31 0.83
8 netvlad_stride_fused_edit_1 0.10 1.23 0.78
9 47_NetVladNoGhostClusters 0.10 1.17 0.74
9 netvlad_stride_fused_base 0.10 1.17 0.74
11 tiled_index_opt_base 0.11 1.11 0.70
11 tiled_unroll_block_optimization_base 0.11 1.11 0.70
13 tiled_matmul_unified_base 0.11 1.10 0.69
13 hybrid_netvlad_matmul_base 0.11 1.10 0.69
13 47_netvlad_noghostclusters_unroll_base_base 0.11 1.10 0.69
16 47_netvlad_noghostclusters_shared_base 0.11 1.09 0.69
16 tiled_reduced_sync_base_base 0.11 1.09 0.69
16 tiled_unroll_min_sync_base 0.11 1.09 0.69
16 tiled_unroll_min_sync_optimized_base 0.11 1.09 0.69
20 optimized_tiled_assignment_base 0.11 1.08 0.68
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

constexpr int TILE_SIZE = 128;

// Device function for computing dot product
__device__ float dot_product(const float* x, const float* cluster, int D) {
    float sum = 0.0f;
    int tid = threadIdx.x;

    for (int i = tid; i < D; i += TILE_SIZE) {
        sum += x[i] * cluster[i];
    }

    return sum;
}

// Device function for applying batch normalization
__device__ float batch_norm(float val, float mean, float var, float weight, float bias) {
    return (val - mean) * weight / sqrtf(var + 1e-5f) + bias;
}

// Device function for applying softmax
__device__ float softmax_reduction(float val, const float* row_cache, int KplusG) {
    float max_val = -INFINITY;
    float sum_exp = 0.0f;

    for (int i = 0; i < KplusG; ++i) {
        max_val = fmaxf(max_val, row_cache[i]);
    }

    __syncthreads();

    val = __expf(val - max_val);

    for (int i = 0; i < KplusG; ++i) {
        sum_exp += __expf(row_cache[i] - max_val);
    }

    __syncthreads();

    return val / sum_exp;
}

__global__ void modular_fused_assignment_kernel(
    const float* __restrict__ x,
    const float* __restrict__ clusters,
    const float* bn_weight,
    const float* bn_bias,
    const float* bn_mean,
    const float* bn_var,
    float* output,
    int64_t BxN,
    int64_t D,
    int64_t KplusG,
    bool is_training) {

    int row = blockIdx.x * blockDim.y + threadIdx.y;
    int col = threadIdx.y;

    __shared__ float smem_cache[TILE_SIZE * TILE_SIZE];

    if (row >= BxN) return;

    extern __shared__ float shared[];
    float* row_cache = shared;

    // Compute dot product
    float dot = dot_product(x + row * D, clusters + col * D, D);
    row_cache[col] = dot;

    __syncthreads();

    // Apply BN, if in inference mode
    if (!is_training) {
        row_cache[col] = batch_norm(row_cache[col], bn_mean[col], bn_var[col], bn_weight[col], bn_bias[col]);
    }

    __syncthreads();

    // Apply softmax
    output[row * KplusG + col] = softmax_reduction(row_cache[col], row_cache, KplusG);
}

void call_modular_fused_assignment_kernel(
    torch::Tensor x,
    torch::Tensor clusters,
    torch::Tensor bn_weight,
    torch::Tensor bn_bias,
    torch::Tensor bn_running_mean,
    torch::Tensor bn_running_var,
    torch::Tensor assignment,
    int64_t BxN,
    int64_t D,
    int64_t KplusG,
    bool is_training) {

    dim3 block(TILE_SIZE, TILE_SIZE);
    dim3 grid((BxN + TILE_SIZE - 1) / TILE_SIZE);
    size_t shared_mem = KplusG * sizeof(float);

    modular_fused_assignment_kernel<<<grid, block, shared_mem>>>(
        x.data_ptr<float>(),
        clusters.data_ptr<float>(),
        bn_weight.data_ptr<float>(),
        bn_bias.data_ptr<float>(),
        bn_running_mean.data_ptr<float>(),
        bn_running_var.data_ptr<float>(),
        assignment.data_ptr<float>(),
        BxN,
        D,
        KplusG,
        is_training);

    cudaDeviceSynchronize();
}

// Forward function
// Refactor code to invoke modular kernel call

torch::Tensor forward(
    torch::Tensor x,
    torch::Tensor clusters,
    torch::Tensor clusters2,
    torch::Tensor bn_weight,
    torch::Tensor bn_bias,
    torch::Tensor bn_running_mean,
    torch::Tensor bn_running_var,
    int64_t feature_size,
    int64_t cluster_size,
    bool is_training) {

    CHECK_INPUT(x);
    CHECK_INPUT(clusters);
    CHECK_INPUT(clusters2);
    CHECK_INPUT(bn_weight);
    CHECK_INPUT(bn_bias);
    CHECK_INPUT(bn_running_mean);
    CHECK_INPUT(bn_running_var);

    int64_t B = x.size(0);
    int64_t N = x.size(1);
    int64_t D = feature_size;
    int64_t K = cluster_size;
    int64_t KplusG = clusters.size(1);
    int64_t BxN = B * N;

    x = x.reshape({-1, D});
    auto assignment = torch::empty({BxN, KplusG}, x.options());

    call_modular_fused_assignment_kernel(
        x,
        clusters,
        bn_weight,
        bn_bias,
        bn_running_mean,
        bn_running_var,
        assignment,
        BxN,
        D,
        KplusG,
        is_training);

    assignment = assignment.narrow(1, 0, K).reshape({B, N, K});

    auto a_sum = assignment.sum(1, true);
    clusters2 = clusters2.expand({B, D, K});
    auto a = clusters2 * a_sum;

    assignment = assignment.transpose(1, 2);
    x = x.reshape({B, N, D});
    auto vlad = torch::bmm(assignment, x).transpose(1, 2) - a;

    vlad = torch::nn::functional::normalize(
        vlad, torch::nn::functional::NormalizeFuncOptions().p(2).dim(1));
    vlad = vlad.reshape({B, D * K});
    vlad = torch::nn::functional::normalize(
        vlad, torch::nn::functional::NormalizeFuncOptions().p(2).dim(1));

    return vlad;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "NetVLAD forward");
}
Performance Metrics
Metric Value Unit Variance Samples
Analysis Rules
Rule Description
Operation / Metric Value Unit
aten::zero_
CPU Time 111272.43 μs
Device Time 1108920.16 μs
Self CPU Time 25278.06 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 86016.39 μs
Device Time 1108920.16 μs
Self CPU Time 30389.83 μs
Self Device Time 1108920.16 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::reshape
CPU Time 282117.36 μs
Device Time 79679.84 μs
Self CPU Time 63354.23 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 675685.51 μs
Device Time 110301.77 μs
Self CPU Time 675685.51 μs
Self Device Time 110301.77 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceSynchronize
CPU Time 922473.80 μs
Device Time 36.77 μs
Self CPU Time 922473.80 μs
Self Device Time 36.77 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::bmm
CPU Time 325179.25 μs
Device Time 145735.34 μs
Self CPU Time 220084.95 μs
Self Device Time 145735.34 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::norm
CPU Time 369100.38 μs
Device Time 160833.40 μs
Self CPU Time 113160.57 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::linalg_vector_norm
CPU Time 255939.82 μs
Device Time 160833.40 μs
Self CPU Time 122552.91 μs
Self Device Time 160833.40 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 1108920.16 μs
Self CPU Time 0.00 μs
Self Device Time 1108920.16 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45304 warnings generated when compiling for host.
Suppressed 45330 warnings (45283 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:7:35 bugprone-macro-parentheses
7 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:8:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
8 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:16:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
16 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:60:5: warning: 3 adjacent parameters of 'modular_fused_assignment_kernel' of similar type ('int64_t') are easily swapped by mistake [bugprone-easily-swappable-parameters]
60 | int64_t BxN,
| ^~~~~~~~~~~~
61 | int64_t D,
| ~~~~~~~~~~
62 | int64_t KplusG,
| ~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:60:13: note: the first parameter in the range is 'BxN'
60 | int64_t BxN,
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:62:13: note: the last parameter in the range is 'KplusG'
62 | int64_t KplusG,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:65:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
65 | int row = blockIdx.x * blockDim.y + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:66:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
66 | int col = threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:76:62: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
76 | float dot = dot_product(x + row * D, clusters + col * D, D);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:89:79: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
89 | output[row * KplusG + col] = softmax_reduction(row_cache[col], row_cache, KplusG);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:93:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
93 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:94:19: warning: the parameter 'clusters' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
94 | torch::Tensor clusters,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:95:19: warning: the parameter 'bn_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
95 | torch::Tensor bn_weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:96:19: warning: the parameter 'bn_bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
96 | torch::Tensor bn_bias,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:97:19: warning: the parameter 'bn_running_mean' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
97 | torch::Tensor bn_running_mean,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:98:19: warning: the parameter 'bn_running_var' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
98 | torch::Tensor bn_running_var,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:99:19: warning: the parameter 'assignment' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
99 | torch::Tensor assignment,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:130:19: warning: the parameter 'clusters' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
130 | torch::Tensor clusters,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:132:19: warning: the parameter 'bn_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
132 | torch::Tensor bn_weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:133:19: warning: the parameter 'bn_bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
133 | torch::Tensor bn_bias,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:134:19: warning: the parameter 'bn_running_mean' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
134 | torch::Tensor bn_running_mean,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:135:19: warning: the parameter 'bn_running_var' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
135 | torch::Tensor bn_running_var,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:136:5: warning: 2 adjacent parameters of 'forward' of similar type ('int64_t') are easily swapped by mistake [bugprone-easily-swappable-parameters]
136 | int64_t feature_size,
| ^~~~~~~~~~~~~~~~~~~~~
137 | int64_t cluster_size,
| ~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:136:13: note: the first parameter in the range is 'feature_size'
136 | int64_t feature_size,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_47/b4_s0_netvlad_fused_modular/edit_1/edit_1.cu:137:13: note: the last parameter in the range is 'cluster_size'
137 | int64_t cluster_size,
| ^~~~~~~~~~~~