
The AI CUDA Engineer 👷

41_Max_Pooling_1D • aligned_memory_access_base

Level 1 • Task 41
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    kernel_size: int,
    stride: int,
    padding: int,
    dilation: int,
    return_indices: bool,
) -> torch.Tensor:
    """
    Functional implementation of Max Pooling 1D.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, num_features, sequence_length).
        kernel_size (int): Size of the window to take a max over.
        stride (int): Stride of the window.
        padding (int): Implicit zero padding to be added on both sides.
        dilation (int): Spacing between kernel elements.
        return_indices (bool): Whether to return the indices of the maximum values.

    Returns:
        torch.Tensor: Output tensor with Max Pooling 1D applied.
    """
    return F.max_pool1d(
        x,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        return_indices=return_indices,
    )


class Model(nn.Module):
    """
    Simple model that performs Max Pooling 1D.
    """

    def __init__(
        self,
        kernel_size: int,
        stride: int,
        padding: int,
        dilation: int,
        return_indices: bool,
    ):
        """
        Initializes the Max Pooling 1D layer.

        Args:
            kernel_size (int): Size of the window to take a max over.
            stride (int): Stride of the window.
            padding (int): Implicit zero padding to be added on both sides.
            dilation (int): Spacing between kernel elements.
            return_indices (bool): Whether to return the indices of the maximum values.
        """
        super(Model, self).__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.return_indices = return_indices

    def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
        """
        Applies Max Pooling 1D to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, num_features, sequence_length).
            fn: Function to apply (defaults to module_fn)

        Returns:
            torch.Tensor: Output tensor with Max Pooling 1D applied.
        """
        return fn(
            x,
            self.kernel_size,
            self.stride,
            self.padding,
            self.dilation,
            self.return_indices,
        )


batch_size = 16
features = 64
sequence_length = 128
kernel_size = 4
stride = 2
padding = 2
dilation = 3
return_indices = False


def get_inputs():
    x = torch.randn(batch_size, features, sequence_length)
    return [x]


def get_init_inputs():
    return [kernel_size, stride, padding, dilation, return_indices]
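
A minimal driver for the functional harness above (a sketch; the call pattern is inferred from the get_init_inputs/get_inputs convention, and the expected shape follows from the standard MaxPool1d length formula):

model = Model(*get_init_inputs())   # kernel_size=4, stride=2, padding=2, dilation=3, return_indices=False
(x,) = get_inputs()                 # random input of shape (16, 64, 128)
out = model(x)                      # dispatches to module_fn, i.e. F.max_pool1d
print(out.shape)                    # torch.Size([16, 64, 62]): ((128 + 2*2 - 3*(4-1) - 1) // 2) + 1 = 62

The original nn.MaxPool1d-based definition of the same task follows.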
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs Max Pooling 1D.
    """
    def __init__(self, kernel_size: int, stride: int = None, padding: int = 0, dilation: int = 1, return_indices: bool = False):
        """
        Initializes the Max Pooling 1D layer.

        Args:
            kernel_size (int): Size of the window to take a max over.
            stride (int, optional): Stride of the window. Defaults to None (same as kernel_size).
            padding (int, optional): Implicit zero padding to be added on both sides. Defaults to 0.
            dilation (int, optional): Spacing between kernel elements. Defaults to 1.
            return_indices (bool, optional): Whether to return the indices of the maximum values. Defaults to False.
        """
        super(Model, self).__init__()
        self.maxpool = nn.MaxPool1d(kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, return_indices=return_indices)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies Max Pooling 1D to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, num_features, sequence_length).

        Returns:
            torch.Tensor: Output tensor with Max Pooling 1D applied, shape (batch_size, num_features, output_sequence_length).
        """
        return self.maxpool(x)

batch_size = 16
features = 64
sequence_length = 128
kernel_size = 4
stride = 2
padding = 2
dilation = 3
return_indices = False

def get_inputs():
    x = torch.randn(batch_size, features, sequence_length)
    return [x]

def get_init_inputs():
    return [kernel_size, stride, padding, dilation, return_indices]
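Since both listings parameterize the same max-pooling operation, a quick equivalence check is straightforward (a sketch; FunctionalModel and ReferenceModel are hypothetical aliases for the two Model classes above, which would otherwise shadow each other):

# Hypothetical cross-check between the functional and nn.MaxPool1d definitions.
x = torch.randn(batch_size, features, sequence_length)
func_out = FunctionalModel(*get_init_inputs())(x)   # module_fn path (F.max_pool1d)
ref_out = ReferenceModel(*get_init_inputs())(x)     # nn.MaxPool1d path
assert torch.allclose(func_out, ref_out)            # same parameters, same result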

Kernel Information

Related Kernels (Level 1, Task 41 • 41_Max_Pooling_1D)

Rank  Kernel Name  Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇 41_Max_Pooling_1D 0.01 1.18 5.01
🥇 max_pool1d_shared_opt_base 0.01 1.18 5.01
🥇 optimized_workload_distributed_pool1d_base 0.01 1.18 5.01
🥇 max_pool1d_optimized_grid_base 0.01 1.18 5.01
🥇 optimized_max_pool1d_kernel_base 0.01 1.18 5.01
🥇 max_pool1d_kernel_combined_base 0.01 1.18 5.01
🥇 max_pool1d_nosync_base 0.01 1.18 5.01
🥇 coalesced_writes_edit_1 0.01 1.18 5.01
🥇 aligned_memory_access_base 0.01 1.18 5.01
🥇 aligned_memory_access_edit_1 0.01 1.18 5.01
🥇 loop_unrolling_base 0.01 1.18 5.01
🥇 balanced_workload_distribution_base 0.01 1.18 5.01
🥇 balanced_max_pool1d_base 0.01 1.18 5.01
🥇 balanced_workload_distribution_edit_1 0.01 1.18 5.01
🥇 max_pool1d_fused_kernel_base 0.01 1.18 5.01
🥇 coalesced_max_pool1d_kernel_base_base 0.01 1.18 5.01
🥇 experimental_block_size_pool1d_base_base 0.01 1.18 5.01
🥇 max_pool1d_tunable_base 0.01 1.18 5.01
🥇 coalesced_aligned_pooling_base 0.01 1.18 5.01
🥇 modular_device_functions_edit_1_base 0.01 1.18 5.01
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

__global__ void max_pool1d_aligned_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    int64_t* __restrict__ indices,
    const int batch_size,
    const int num_channels,
    const int input_length,
    const int kernel_size,
    const int stride,
    const int padding,
    const int dilation,
    const int output_length,
    bool return_indices)
{
    const int elements_per_bc = output_length;
    const int total_elements = batch_size * num_channels * output_length;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (tid >= total_elements) return;
    
    const int bc = tid / elements_per_bc;
    const int i = tid % elements_per_bc;
    const int b = bc / num_channels;
    const int c = bc % num_channels;
    
    if (b >= batch_size || c >= num_channels) return;

    const int input_start = i * stride - padding;
    float max_val = -INFINITY;
    int max_idx = -1;

    // Base input offset for current batch and channel
    const float* input_bc = input + (b * num_channels * input_length + c * input_length);

    // Read the pooling window via __ldg (read-only cache); consecutive threads access nearby addresses, keeping loads aligned where possible
    #pragma unroll
    for (int k = 0; k < kernel_size; ++k) {
        const int pos = input_start + k * dilation;
        if (pos >= 0 && pos < input_length) {
            const float val = __ldg(input_bc + pos);
            if (val > max_val) {
                max_val = val;
                max_idx = pos;
            }
        }
    }

    // Ensure coalesced writes by having consecutive threads write to consecutive memory locations
    const int out_idx = b * num_channels * output_length + c * output_length + i;
    output[out_idx] = max_val;
    if (return_indices) indices[out_idx] = max_idx;
}

torch::Tensor forward(
    torch::Tensor x,
    int64_t kernel_size,
    int64_t stride,
    int64_t padding,
    int64_t dilation,
    bool return_indices)
{
    TORCH_CHECK(x.dim() == 3, "Input must be 3D");
    TORCH_CHECK(x.is_cuda(), "Input must be on CUDA");
    TORCH_CHECK(x.is_contiguous(), "Input must be contiguous");

    const int batch_size = x.size(0);
    const int num_channels = x.size(1);
    const int input_length = x.size(2);

    const int output_length = ((input_length + 2 * padding - dilation * (kernel_size - 1) - 1) / stride) + 1;
    TORCH_CHECK(output_length > 0, "Output length must be positive");

    auto options = torch::TensorOptions().dtype(x.dtype()).device(x.device());
    auto output = torch::empty({batch_size, num_channels, output_length}, options);
    torch::Tensor indices;

    if (return_indices) {
        indices = torch::empty({batch_size, num_channels, output_length}, 
            options.dtype(torch::kInt64));
    }

    const int total_elements = batch_size * num_channels * output_length;
    const int threads_per_block = 256;
    const int num_blocks = (total_elements + threads_per_block - 1) / threads_per_block;

    max_pool1d_aligned_kernel<<<num_blocks, threads_per_block>>>(
        x.data_ptr<float>(),
        output.data_ptr<float>(),
        return_indices ? indices.data_ptr<int64_t>() : nullptr,
        batch_size,
        num_channels,
        input_length,
        kernel_size,
        stride,
        padding,
        dilation,
        output_length,
        return_indices
    );

    return return_indices ? torch::cat({output, indices}, -1) : output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "MaxPool1D forward with aligned memory access (CUDA)");
}
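
One plausible way to exercise this kernel from Python is an inline build with torch.utils.cpp_extension.load followed by a comparison against F.max_pool1d (a sketch; the source file name max_pool1d_aligned.cu and the module name are placeholders, not part of the listing):

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# Hypothetical inline build of the CUDA source above.
ext = load(name="max_pool1d_aligned", sources=["max_pool1d_aligned.cu"])

x = torch.randn(16, 64, 128, device="cuda")
out_cuda = ext.forward(x, 4, 2, 2, 3, False)   # kernel_size, stride, padding, dilation, return_indices
out_ref = F.max_pool1d(x, kernel_size=4, stride=2, padding=2, dilation=3)
print(torch.allclose(out_cuda, out_ref))       # expected to print True

Note that with return_indices=True the host function concatenates the pooled values and the int64 indices along the last dimension, so in that mode the result is not directly comparable to the (values, indices) tuple returned by F.max_pool1d.
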
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 1.004 inst/cycle 0.003 5
Executed Ipc Elapsed 0.402 inst/cycle 0.000 5
Issue Slots Busy 25.944 % 2.141 5
Issued Ipc Active 1.036 inst/cycle 0.003 5
SM Busy 25.944 % 2.141 5
Memory Throughput 141562395474.560 byte/second 11141548443120211968.000 5
Mem Busy 10.686 % 0.074 5
Max Bandwidth 7.030 % 0.028 5
L1/TEX Hit Rate 68.700 % 0.000 5
L2 Hit Rate 72.496 % 0.006 5
Mem Pipes Busy 7.172 % 0.023 5
Warp Cycles Per Issued Instruction 14.132 cycle 1.017 5
Warp Cycles Per Executed Instruction 14.606 cycle 1.078 5
Avg. Active Threads Per Warp 32.000 0.000 5
Avg. Not Predicated Off Threads Per Warp 28.900 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 8.000 block 0.000 5
Block Limit Shared Mem 32.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 22.056 % 0.005 5
Achieved Active Warps Per SM 14.114 warp 0.002 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (24.9%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
WRN Occupancy This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (22.1%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
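The occupancy gap is consistent with how little work this problem size generates; a back-of-the-envelope check of the launch configuration in the host code above (plain arithmetic, not taken from the profiler report):

# Grid size implied by the launch parameters for the benchmark shape (16, 64, 128).
batch_size, num_channels, output_length = 16, 64, 62          # 62 = ((128 + 4 - 9 - 1) // 2) + 1
total_elements = batch_size * num_channels * output_length    # 63,488 output elements, one thread each
threads_per_block = 256
num_blocks = (total_elements + threads_per_block - 1) // threads_per_block   # 248 blocks

A launch of 248 blocks of 256 threads finishes quickly and leaves few resident warps per SM over the measured interval, which keeps achieved occupancy well below the 100% theoretical bound.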
Operation / Metric Value Unit
aten::to
CPU Time 275710.73 μs
Device Time 21.15 μs
Self CPU Time 34.38 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 275676.35 μs
Device Time 21.15 μs
Self CPU Time 78.29 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::empty_strided
CPU Time 275443.21 μs
Device Time 0.00 μs
Self CPU Time 78.18 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceGetStreamPriorityRange
CPU Time 275174.77 μs
Device Time 0.00 μs
Self CPU Time 275174.77 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 520606.50 μs
Device Time 22406.25 μs
Self CPU Time 520606.50 μs
Self Device Time 22406.25 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
max_pool1d_aligned_kernel(float const*, float*, long*, int, int, int, int, int, int, int, int, bool)
CPU Time 0.00 μs
Device Time 31110.90 μs
Self CPU Time 0.00 μs
Self Device Time 31110.90 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaEventRecord
CPU Time 23158.07 μs
Device Time 44564.98 μs
Self CPU Time 23158.07 μs
Self Device Time 44564.98 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 69357.50 μs
Device Time 664671.36 μs
Self CPU Time 14970.60 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 54388.62 μs
Device Time 664671.36 μs
Self CPU Time 17822.99 μs
Self Device Time 664671.36 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 664671.36 μs
Self CPU Time 0.00 μs
Self Device Time 664671.36 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45291 warnings generated when compiling for host.
Suppressed 45326 warnings (45279 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:11:5: warning: 3 adjacent parameters of 'max_pool1d_aligned_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
11 | const int input_length,
| ^~~~~~~~~~~~~~~~~~~~~~~
12 | const int kernel_size,
| ~~~~~~~~~~~~~~~~~~~~~~
13 | const int stride,
| ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:11:15: note: the first parameter in the range is 'input_length'
11 | const int input_length,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:13:15: note: the last parameter in the range is 'stride'
13 | const int stride,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:14:5: warning: 3 adjacent parameters of 'max_pool1d_aligned_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
14 | const int padding,
| ^~~~~~~~~~~~~~~~~~
15 | const int dilation,
| ~~~~~~~~~~~~~~~~~~~
16 | const int output_length,
| ~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:14:15: note: the first parameter in the range is 'padding'
14 | const int padding,
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:16:15: note: the last parameter in the range is 'output_length'
16 | const int output_length,
| ^~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:21:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | const int tid = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:59:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
59 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:70:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
70 | const int batch_size = x.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:71:30: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
71 | const int num_channels = x.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:72:30: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
72 | const int input_length = x.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:74:31: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
74 | const int output_length = ((input_length + 2 * padding - dilation * (kernel_size - 1) - 1) / stride) + 1;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:97:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
97 | kernel_size,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:98:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
98 | stride,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:99:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
99 | padding,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_1/task_41/b3_s0_aligned_memory_access/base/base.cu:100:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
100 | dilation,
| ^