← Back to Leaderboard

The AI CUDA Engineer 👷

6_Conv3d_Softmax_MaxPool_MaxPoolblocksize_experiment_maxpool_base

Level 2 • Task 6
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    conv_weight: torch.Tensor,
    conv_bias: torch.Tensor,
) -> torch.Tensor:
    """Applies 3D convolution, softmax activation, and two max pooling operations.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width)
        conv_weight (torch.Tensor): Convolution weight tensor of shape
            (out_channels, in_channels, kernel_size, kernel_size, kernel_size)
        conv_bias (torch.Tensor): Bias tensor for convolution of shape (out_channels)

    Returns:
        torch.Tensor: Output tensor after applying convolution, softmax and max pooling,
            with shape (batch_size, out_channels, depth', height', width') where:
            depth' = ((depth - kernel_size + 1) // 4)
            height' = ((height - kernel_size + 1) // 4)
            width' = ((width - kernel_size + 1) // 4)
            The //4 comes from two max pooling operations with kernel_size=2
    """
    x = F.conv3d(x, conv_weight, conv_bias, stride=1, padding=0)
    x = F.softmax(x, dim=1)
    x = F.max_pool3d(x, kernel_size=2)
    x = F.max_pool3d(x, kernel_size=2)
    return x


class Model(nn.Module):
    """
    Model that performs a 3D convolution, applies Softmax, and performs two max pooling operations.
    """

    def __init__(self, in_channels, out_channels, kernel_size, pool_kernel_size):
        super(Model, self).__init__()
        conv = nn.Conv3d(in_channels, out_channels, kernel_size, padding=1)
        self.conv_weight = nn.Parameter(conv.weight)
        self.conv_bias = nn.Parameter(conv.bias)

    def forward(self, x, fn=module_fn):
        return fn(x, self.conv_weight, self.conv_bias)


batch_size = 128
in_channels = 3
out_channels = 16
depth, height, width = 16, 32, 32
kernel_size = 3
pool_kernel_size = 2


def get_inputs():
    return [torch.randn(batch_size, in_channels, depth, height, width)]


def get_init_inputs():
    return [in_channels, out_channels, kernel_size, pool_kernel_size]
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Model that performs a 3D convolution, applies Softmax, and performs two max pooling operations.
    """
    def __init__(self, in_channels, out_channels, kernel_size, pool_kernel_size):
        super(Model, self).__init__()
        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
        self.pool1 = nn.MaxPool3d(pool_kernel_size)
        self.pool2 = nn.MaxPool3d(pool_kernel_size)
        

    def forward(self, x):
        """
        Args:
            x: Input tensor of shape (batch_size, in_channels, depth, height, width)
        Returns:
            Output tensor of shape (batch_size, out_channels, depth', height', width') where depth', height', width' are the dimensions after pooling.
        """
        x = self.conv(x)
        x = torch.softmax(x, dim=1)
        x = self.pool1(x)
        x = self.pool2(x)
        return x

batch_size = 128
in_channels = 3
out_channels = 16
depth, height, width = 16, 32, 32
kernel_size = 3
pool_kernel_size = 2

def get_inputs():
    return [torch.randn(batch_size, in_channels, depth, height, width)]

def get_init_inputs():
    return [in_channels, out_channels, kernel_size, pool_kernel_size]

Kernel Information

Related Kernels (Level 2, Task 6 • 6_Conv3d_Softmax_MaxPool_MaxPool)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 strided_maxpool_base_base 0.95 1.13 0.90
🥈 stride_loop_maxpool_optimized_base 0.95 1.13 0.89
🥉 coalesced_maxpool_base_base 0.96 1.12 0.89
4 hybrid_maxpool3d_kernel_edit_1 0.96 1.11 0.88
5 hybrid_maxpool3d_kernel_base 0.96 1.11 0.88
5 balanced_thread_block_distribution_base 0.96 1.11 0.88
7 balanced_thread_block_distribution_edit_1 0.96 1.11 0.88
8 fused_max_pool3d_base 0.98 1.10 0.87
9 fused_maxpool_opt_base 0.99 1.08 0.86
10 coalesced_maxpool_edit_1 0.99 1.08 0.86
10 efficient_double_pooling_edit_1 0.99 1.08 0.86
10 efficient_double_pooling_base 0.99 1.08 0.86
13 coalesced_maxpool_base 1.00 1.07 0.85
14 blocksize_experiment_maxpool_base 1.00 1.07 0.85
15 divergence_free_maxpool_base_base 1.03 1.04 0.83
16 stride_loop_maxpool_base_base 1.03 1.04 0.83
16 optimized_shared_maxpool_base_base 1.03 1.04 0.83
18 strided_modular_maxpool_base 1.04 1.03 0.82
19 fused_double_maxpool_base 1.04 1.03 0.82
20 efficient_fused_pooling_base 1.04 1.03 0.82
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cfloat>

// Define block size for experimentation
#define BLOCK_SIZE 256  // Experiment with 32, 64, 128, 256, 512

// Each pooling window corresponds to a 4x4x4 region (64 elements).
// In this kernel, we assign 64 threads for each pooling window.
// Thus, each block processes (BLOCK_SIZE / 64) pooling windows concurrently.

__global__ void blocksize_maxpool_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const int total_windows,  // total number of output pooling windows
    const int N, const int C, const int D, const int H, const int W,
    const int outD, const int outH, const int outW
) {
    // Each block processes several windows: windows_per_block = blockDim.x / 64
    int windows_per_block = blockDim.x / 64;

    // Allocate shared memory to store partial results for each pooling window in this block.
    // Each window requires 2 floats (from its two warps of 32 threads each).
    extern __shared__ float shared_data[];  // size: windows_per_block * 2 floats

    // Determine the current group's index within the block (which pooling window this group handles)
    int group = threadIdx.x / 64;          // group index [0, windows_per_block - 1]
    int lane = threadIdx.x % 64;             // lane within the group [0,63]

    // Map lane to 3D coordinates within the 4x4x4 pooling window
    int local_d = lane / 16;               // ranges 0..3
    int local_h = (lane % 16) / 4;           // ranges 0..3
    int local_w = lane % 4;                  // ranges 0..3

    // Compute the starting global window index for this group in the block
    int base_win_idx = blockIdx.x * windows_per_block + group;

    // Stride over windows: each group processes multiple windows if needed
    for (int win_idx = base_win_idx; win_idx < total_windows; win_idx += gridDim.x * windows_per_block) {
        // Decode the linear window index 'win_idx' into 5D coordinates: [n, c, out_d, out_h, out_w]
        int tmp = win_idx;
        int n = tmp / (C * outD * outH * outW);
        tmp = tmp % (C * outD * outH * outW);
        int c = tmp / (outD * outH * outW);
        tmp = tmp % (outD * outH * outW);
        int out_d = tmp / (outH * outW);
        tmp = tmp % (outH * outW);
        int out_h = tmp / outW;
        int out_w = tmp % outW;

        // Compute the starting index (top-left-front corner) of the corresponding 4x4x4 window in the input tensor
        int d_start = out_d * 4;
        int h_start = out_h * 4;
        int w_start = out_w * 4;

        // Determine global indices for each thread within the pooling window
        int d = d_start + local_d;
        int h = h_start + local_h;
        int w = w_start + local_w;

        // Load input value with boundary check
        float val = -FLT_MAX;
        if (d < D && h < H && w < W) {
            int input_idx = n * (C * D * H * W) + c * (D * H * W) + d * (H * W) + h * W + w;
            val = __ldg(&input[input_idx]);
        }

        // Each pooling window is processed by 64 threads, which are divided into two warps (of 32 threads each).
        // Perform warp-level reduction within each warp (for 32 threads)
        unsigned int mask = 0xffffffff;
        int lane_in_warp = lane % 32;
        for (int offset = 16; offset > 0; offset /= 2) {
            float other = __shfl_down_sync(mask, val, offset);
            val = fmaxf(val, other);
        }

        // Each warp's leader (lane_in_warp == 0) writes its partial reduction into shared memory
        if (lane_in_warp == 0) {
            shared_data[group * 2 + (lane / 32)] = val;  // (lane/32) is 0 for first warp, 1 for second
        }

        __syncthreads();

        // Final reduction: only thread 0 in each group reduces the two partial values
        if (lane == 0) {
            float final_val = fmaxf(shared_data[group * 2], shared_data[group * 2 + 1]);
            output[win_idx] = final_val;
        }
        __syncthreads();
    }
}


torch::Tensor forward(
    torch::Tensor x,
    torch::Tensor conv_weight,
    torch::Tensor conv_bias
) {
    // Ensure contiguous tensors
    x = x.contiguous();
    conv_weight = conv_weight.contiguous();
    conv_bias = conv_bias.contiguous();

    // Apply 3D convolution and softmax activation
    auto conv_output = at::conv3d(x, conv_weight, conv_bias, {1, 1, 1}, {0, 0, 0});
    auto softmax_output = at::softmax(conv_output, /*dim=*/1);

    // Retrieve dimensions (assumed input format: [N, C, D, H, W])
    const int N = softmax_output.size(0);
    const int C = softmax_output.size(1);
    const int D = softmax_output.size(2);
    const int H = softmax_output.size(3);
    const int W = softmax_output.size(4);

    // Determine output dimensions after two fused max pooling operations (each spatial dim reduced by factor of 4)
    int outD = D / 4;
    int outH = H / 4;
    int outW = W / 4;
    int total_windows = N * C * outD * outH * outW;  // total number of pooling windows

    auto options = softmax_output.options();
    auto output = torch::empty({N, C, outD, outH, outW}, options);

    // Experiment with block sizes: here we use BLOCK_SIZE threads per block.
    // Each pooling window uses 64 threads, so each block processes (BLOCK_SIZE / 64) windows.
    int blockSize = BLOCK_SIZE;  
    int windows_per_block = blockSize / 64;
    int gridSize = (total_windows + windows_per_block - 1) / windows_per_block;
    
    // Allocate shared memory: one float for each warp in each window, total = windows_per_block * 2 * sizeof(float)
    int shared_mem = windows_per_block * 2 * sizeof(float);

    blocksize_maxpool_kernel<<<gridSize, blockSize, shared_mem>>>(
        softmax_output.data_ptr<float>(),
        output.data_ptr<float>(),
        total_windows,
        N, C, D, H, W,
        outD, outH, outW
    );

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Optimized CUDA kernel with experiment-based block size configuration");
}
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 3.230 inst/cycle 0.000 5
Executed Ipc Elapsed 3.190 inst/cycle 0.000 5
Issue Slots Busy 80.796 % 0.000 5
Issued Ipc Active 3.230 inst/cycle 0.000 5
SM Busy 80.796 % 0.000 5
Memory Throughput 463155448249.034 byte/second 101581877399382432.000 5
Mem Busy 22.258 % 0.000 5
Max Bandwidth 15.262 % 0.000 5
L1/TEX Hit Rate 40.832 % 0.000 5
L2 Hit Rate 46.718 % 0.006 5
Mem Pipes Busy 33.604 % 0.000 5
Warp Cycles Per Issued Instruction 13.668 cycle 0.000 5
Warp Cycles Per Executed Instruction 13.670 cycle 0.000 5
Avg. Active Threads Per Warp 32.000 0.000 5
Avg. Not Predicated Off Threads Per Warp 27.620 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 6.000 block 0.000 5
Block Limit Shared Mem 14.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 48.000 warp 0.000 5
Theoretical Occupancy 75.000 % 0.000 5
Achieved Occupancy 69.466 % 0.000 5
Achieved Active Warps Per SM 44.462 warp 0.000 5
Analysis Rules
Rule Description
WRN HighPipeUtilization ALU is the highest-utilized pipeline (64.2%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. The pipeline is well-utilized, but might become a bottleneck if more work is added. Based on the number of executed instructions, the highest utilized pipeline (64.2%) is ALU. It executes integer and logic operations. Comparing the two, the overall pipeline utilization appears to be caused by frequent, low-latency instructions. See the Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-decoder) or hover over the pipeline name to understand the workloads handled by each pipeline. The Instruction Statistics section shows the mix of executed instructions in this kernel.
WRN Occupancy This kernel's theoretical occupancy (75.0%) is limited by the number of required registers. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
Operation / Metric Value Unit
aten::conv3d
CPU Time 4564399.14 μs
Device Time 4626190.80 μs
Self CPU Time 11692.76 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::convolution
CPU Time 4552706.38 μs
Device Time 4626190.80 μs
Self CPU Time 16293.17 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_convolution
CPU Time 4536413.21 μs
Device Time 4626190.80 μs
Self CPU Time 35823.75 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::cudnn_convolution
CPU Time 3905584.14 μs
Device Time 4010730.83 μs
Self CPU Time 169998.10 μs
Self Device Time 4010730.83 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernelExC
CPU Time 3703702.75 μs
Device Time 0.00 μs
Self CPU Time 3703702.75 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
sm80_xmma_fprop_implicit_gemm_indexed_f32f32_f32f32_f32_nchwkcrs_nchw_tilesize32x32x8_stage3_warpsize1x2x1_g1_ffma_aligna4_alignc4_execute_kernel__5x_cudnn
CPU Time 0.00 μs
Device Time 4010728.30 μs
Self CPU Time 0.00 μs
Self Device Time 4010728.30 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45291 warnings generated when compiling for host.
Suppressed 45323 warnings (45276 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:16:5 bugprone-easily-swappable-parameters
16 | const int total_windows, // total number of output pooling windows
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
17 | const int N, const int C, const int D, const int H, const int W,
| ~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:16:15: note: the first parameter in the range is 'total_windows'
16 | const int total_windows, // total number of output pooling windows
| ^~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:17:28: note: the last parameter in the range is 'C'
17 | const int N, const int C, const int D, const int H, const int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:17:57: warning: 2 adjacent parameters of 'blocksize_maxpool_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
17 | const int N, const int C, const int D, const int H, const int W,
| ^~~~~~~~~~~~
18 | const int outD, const int outH, const int outW
| ~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:17:67: note: the first parameter in the range is 'W'
17 | const int N, const int C, const int D, const int H, const int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:18:15: note: the last parameter in the range is 'outD'
18 | const int outD, const int outH, const int outW
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:21:29: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | int windows_per_block = blockDim.x / 64;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:28:17: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
28 | int group = threadIdx.x / 64; // group index [0, windows_per_block - 1]
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:29:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
29 | int lane = threadIdx.x % 64; // lane within the group [0,63]
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:37:24: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
37 | int base_win_idx = blockIdx.x * windows_per_block + group;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:40:74: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
40 | for (int win_idx = base_win_idx; win_idx < total_windows; win_idx += gridDim.x * windows_per_block) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:87:37: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
87 | float final_val = fmaxf(shared_data[group * 2], shared_data[group * 2 + 1]);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:87:49: note: make conversion explicit to silence this warning
5 | float final_val = fmaxf(shared_data[group * 2], shared_data[group * 2 + 1]);
| ^~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:87:49: note: perform multiplication in a wider type
87 | float final_val = fmaxf(shared_data[group * 2], shared_data[group * 2 + 1]);
| ^~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:110:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
110 | const int N = softmax_output.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:111:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
111 | const int C = softmax_output.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:112:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
112 | const int D = softmax_output.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:113:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
113 | const int H = softmax_output.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:114:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
114 | const int W = softmax_output.size(4);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:132:22: warning: performing an implicit widening conversion to type 'unsigned long' of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]
132 | int shared_mem = windows_per_block * 2 * sizeof(float);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:132:22: note: make conversion explicit to silence this warning
132 | int shared_mem = windows_per_block * 2 * sizeof(float);
| ^~~~~~~~~~~~~~~~~~~~~
| static_cast<unsigned long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:132:22: note: perform multiplication in a wider type
132 | int shared_mem = windows_per_block * 2 * sizeof(float);
| ^~~~~~~~~~~~~~~~~
| static_cast<long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:132:22: warning: narrowing conversion from 'unsigned long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
132 | int shared_mem = windows_per_block * 2 * sizeof(float);
| ^