
The AI CUDA Engineer 👷

78_ConvTranspose3d_Max_Max_Sum • optimized_maxpool_kernel_base

Level 2 • Task 78
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    stride: int,
    padding: int,
    conv_transpose: torch.Tensor,
    conv_transpose_bias: torch.Tensor,
) -> torch.Tensor:
    """
    Applies a 3D transposed convolution operation followed by two max pooling layers and a sum operation.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_channels, depth, height, width)
        stride (int): Stride of the transposed convolution
        padding (int): Padding of the transposed convolution
        conv_transpose (torch.Tensor): Transposed convolution weight tensor
        conv_transpose_bias (torch.Tensor): Bias tensor for transposed convolution

    Returns:
        torch.Tensor: Output tensor after applying transposed convolution, max pooling and sum reduction
    """
    x = F.conv_transpose3d(
        x, conv_transpose, bias=conv_transpose_bias, stride=stride, padding=padding
    )
    x = F.max_pool3d(x, kernel_size=2)
    x = F.max_pool3d(x, kernel_size=3)
    x = torch.sum(x, dim=1, keepdim=True)
    return x


class Model(nn.Module):
    """
    Model that performs a 3D transposed convolution, followed by two max pooling layers and a sum operation.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(Model, self).__init__()
        conv = nn.ConvTranspose3d(in_channels, out_channels, kernel_size)
        self.conv_transpose_parameter = nn.Parameter(conv.weight)
        self.conv_transpose_bias = nn.Parameter(conv.bias)

    def forward(self, x, stride, padding, fn=module_fn):
        return fn(
            x, stride, padding, self.conv_transpose_parameter, self.conv_transpose_bias
        )


batch_size = 16
in_channels = 8
out_channels = 16
depth, height, width = 16, 32, 32
kernel_size = 3
stride = 2
padding = 1


def get_inputs():
    return [torch.randn(batch_size, in_channels, depth, height, width), stride, padding]


def get_init_inputs():
    return [in_channels, out_channels, kernel_size, stride, padding]
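
As a quick sanity check, the shapes can be traced by hand: the transposed convolution maps each spatial size S to (S - 1) * stride - 2 * padding + kernel_size, so depth 16 becomes 31 and height/width 32 become 63; the two pools then floor-divide by 2 and by 3. A minimal standalone sketch, mirroring the constants above:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Mirror the harness constants: batch 16, channels 8 -> 16, kernel 3, stride 2, padding 1.
conv = nn.ConvTranspose3d(8, 16, 3)
y = F.conv_transpose3d(
    torch.randn(16, 8, 16, 32, 32), conv.weight, bias=conv.bias, stride=2, padding=1
)
# depth:  (16 - 1) * 2 - 2 * 1 + 3 = 31 -> 31 // 2 = 15 -> 15 // 3 = 5
# h / w:  (32 - 1) * 2 - 2 * 1 + 3 = 63 -> 63 // 2 = 31 -> 31 // 3 = 10
y = torch.sum(F.max_pool3d(F.max_pool3d(y, 2), 3), dim=1, keepdim=True)
assert y.shape == (16, 1, 5, 10, 10), y.shape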
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Model that performs a 3D transposed convolution, followed by two max pooling layers and a sum operation.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(Model, self).__init__()
        self.conv_transpose = nn.ConvTranspose3d(in_channels, out_channels, kernel_size, stride=stride, padding=padding)
        self.max_pool1 = nn.MaxPool3d(kernel_size=2)
        self.max_pool2 = nn.MaxPool3d(kernel_size=3)

    def forward(self, x):
        x = self.conv_transpose(x)
        x = self.max_pool1(x)
        x = self.max_pool2(x)
        x = torch.sum(x, dim=1, keepdim=True) 
        return x

batch_size = 16
in_channels = 8
out_channels = 16
depth, height, width = 16, 32, 32
kernel_size = 3
stride = 2
padding = 1

def get_inputs():
    return [torch.randn(batch_size, in_channels, depth, height, width)]

def get_init_inputs():
    return [in_channels, out_channels, kernel_size, stride, padding]

Kernel Information

Related Kernels (Level 2, Task 78 • 78_ConvTranspose3d_Max_Max_Sum)

Rank  Kernel Name  Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇 optimized_maxpool_kernel_base 0.58 1.05 1.21
🥇 adaptive_blocksize_maxpool_opt_base 0.58 1.05 1.21
🥉 minimized_divergence_maxpool_base_base 0.58 1.05 1.21
4 unrolled_78_convtranspose3d_optimized_base 0.59 1.03 1.19
4 modular_maxpool_kernel_base 0.59 1.03 1.19
6 fully_unrolled_maxpool_base_base 0.59 1.03 1.19
7 balanced_load_distribution_maxpool_base 0.59 1.03 1.19
8 manual_unroll_maxpool_base_base 0.59 1.03 1.19
9 coalesced_maxpool_shared_mem_base 0.60 1.02 1.18
10 unrolled_78_convtranspose3d_base 0.61 1.01 1.16
11 78_ConvTranspose3d_Max_Max_Sum 0.61 1.00 1.16
12 unroll_conv3d_max_sum_base 0.61 1.00 1.15
13 modular_conv3d_max_sum_edit_1 0.61 1.00 1.15
13 modular_conv3d_max_sum_base 0.61 1.00 1.15
13 shared_mem_reduction_max_sum_base 0.61 1.00 1.15
13 unroll_conv3d_max_sum_edit_1 0.61 1.00 1.15
17 optimized_stride_max_pool_base 0.61 1.00 1.15
17 shared_mem_reduction_max_sum_edit_1 0.61 1.00 1.15
19 constant_memory_optimization_base_edit_1 0.62 0.99 1.14
19 balanced_workload_distribution_base 0.62 0.99 1.14
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <cfloat>  // FLT_MAX
#include <cuda.h>
#include <cuda_runtime.h>

__global__ void optimized_maxpool_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    const int N, const int C,
    const int D1, const int H1, const int W1,  // Dimensions after conv_transpose
    const int D3, const int H3, const int W3)  // Final dimensions
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N * C * D3 * H3 * W3) return;

    // Decode output index
    const int w3 = idx % W3;
    const int h3 = (idx / W3) % H3;
    const int d3 = (idx / (W3 * H3)) % D3;
    const int c = (idx / (W3 * H3 * D3)) % C;
    const int n = idx / (W3 * H3 * D3 * C);

    // Calculate starting indices for the 3x3x3 window in the first maxpool output
    const int start_d2 = d3 * 3;
    const int start_h2 = h3 * 3;
    const int start_w2 = w3 * 3;

    float final_max = -FLT_MAX;

    // Use a single loop to minimize divergence
    for (int offset = 0; offset < 27; offset++) {
        int d2_offset = offset / 9;
        int h2_offset = (offset / 3) % 3;
        int w2_offset = offset % 3;

        const int d2 = start_d2 + d2_offset;
        const int h2 = start_h2 + h2_offset;
        const int w2 = start_w2 + w2_offset;

        // Check bounds collectively to minimize divergence
        if (d2 < D1/2 && h2 < H1/2 && w2 < W1/2) {
            // For each position in the 3x3x3 window, compute 2x2x2 maxpool
            float local_max = -FLT_MAX;

            // Starting indices for the 2x2x2 window in the original input
            const int start_d1 = d2 * 2;
            const int start_h1 = h2 * 2;
            const int start_w1 = w2 * 2;

            // Unrolled 2x2x2 maxpool
            for (int sub_offset = 0; sub_offset < 8; sub_offset++) {
                int d1_offset = sub_offset / 4;
                int h1_offset = (sub_offset / 2) % 2;
                int w1_offset = sub_offset % 2;

                const int d1 = start_d1 + d1_offset;
                const int h1 = start_h1 + h1_offset;
                const int w1 = start_w1 + w1_offset;

                // Check bounds collectively
                if (d1 < D1 && h1 < H1 && w1 < W1) {
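                    // Flattened index into the contiguous NCDHW conv_transpose output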
                    const int input_idx = ((n * C + c) * D1 + d1) * H1 * W1 + h1 * W1 + w1;
                    local_max = max(local_max, input[input_idx]);
                }
            }

            final_max = max(final_max, local_max);
        }
    }

    output[idx] = final_max;
}
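
// Why fusing is valid: max is associative, so max_pool3d(k=3) applied after
// max_pool3d(k=2) equals a max over the corresponding 6x6x6 region of the
// conv_transpose output; the nested bounds checks above reproduce the
// floor-division edge behavior of the two separate pools.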

torch::Tensor forward(
    torch::Tensor x,
    int64_t stride,
    int64_t padding,
    torch::Tensor conv_transpose,
    torch::Tensor conv_transpose_bias) {

    x = x.contiguous();
    conv_transpose = conv_transpose.contiguous();
    conv_transpose_bias = conv_transpose_bias.contiguous();

    TORCH_CHECK(x.is_cuda(), "Input x must be a CUDA tensor");
    TORCH_CHECK(conv_transpose.is_cuda(), "conv_transpose must be a CUDA tensor");
    TORCH_CHECK(conv_transpose_bias.is_cuda(), "conv_transpose_bias must be a CUDA tensor");

    // Apply transposed convolution using ATen op
    x = at::conv_transpose3d(
        x,
        conv_transpose,
        conv_transpose_bias,
        {stride, stride, stride},
        {padding, padding, padding}
    );

    // Get dimensions after conv_transpose
    auto sizes = x.sizes();
    const int N = sizes[0];
    const int C = sizes[1];
    const int D1 = sizes[2];
    const int H1 = sizes[3];
    const int W1 = sizes[4];

    // Calculate final dimensions after combined maxpool
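    // floor(floor(D1/2)/3) == floor(D1/6) for non-negative ints,
    // so dividing once by 6 matches the two chained pools exactly.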
    const int D3 = D1 / 6;
    const int H3 = H1 / 6;
    const int W3 = W1 / 6;

    // Allocate output tensor
    auto output = torch::empty({N, C, D3, H3, W3}, x.options());

    // Launch kernel
    const int total_elements = N * C * D3 * H3 * W3;
    const int threads = 256;
    const int blocks = (total_elements + threads - 1) / threads;

    optimized_maxpool_kernel<<<blocks, threads>>>(
        x.data_ptr<float>(),
        output.data_ptr<float>(),
        N, C, D1, H1, W1, D3, H3, W3
    );

    // Sum over channels
    return output.sum(1, /*keepdim=*/true);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Forward pass with optimized max pooling");
}
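
To exercise the extension end to end, one hypothetical route is torch.utils.cpp_extension.load; the file name base.cu and the module name fused_pool_ext below are assumptions for illustration, not part of the submission. The result should match the pure-PyTorch chain:

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# Assumed file/module names -- adjust to wherever the .cu source actually lives.
ext = load(name="fused_pool_ext", sources=["base.cu"])

x = torch.randn(16, 8, 16, 32, 32, device="cuda")
w = torch.randn(8, 16, 3, 3, 3, device="cuda")  # ConvTranspose3d weight layout (in, out, kD, kH, kW)
b = torch.randn(16, device="cuda")

ref = F.conv_transpose3d(x, w, bias=b, stride=2, padding=1)
ref = torch.sum(F.max_pool3d(F.max_pool3d(ref, 2), 3), dim=1, keepdim=True)

out = ext.forward(x, 2, 1, w, b)
torch.testing.assert_close(out, ref, rtol=1e-4, atol=1e-4)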
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 1.160 inst/cycle 0.000 5
Executed Ipc Elapsed 1.010 inst/cycle 0.000 5
Issue Slots Busy 28.992 % 0.081 5
Issued Ipc Active 1.160 inst/cycle 0.000 5
SM Busy 28.992 % 0.081 5
Memory Throughput 2487325928568.716 byte/second 209318512193741946880.000 5
Mem Busy 43.886 % 0.064 5
Max Bandwidth 74.250 % 0.188 5
L1/TEX Hit Rate 80.826 % 0.000 5
L2 Hit Rate 14.690 % 0.001 5
Mem Pipes Busy 8.642 % 0.003 5
Warp Cycles Per Issued Instruction 24.360 cycle 0.052 5
Warp Cycles Per Executed Instruction 24.390 cycle 0.052 5
Avg. Active Threads Per Warp 32.000 0.000 5
Avg. Not Predicated Off Threads Per Warp 31.240 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 8.000 block 0.000 5
Block Limit Shared Mem 32.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 43.940 % 0.002 5
Achieved Active Warps Per SM 28.124 warp 0.001 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (27.7%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (43.9%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
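For context, the achieved-occupancy figure follows directly from the warp counts above: 28.124 achieved active warps per SM against 64 theoretical warps gives 28.124 / 64 ≈ 0.439, i.e. the reported 43.94 %.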
Operation                            CPU Time (μs)   Device Time (μs)   Self CPU Time (μs)   Self Device Time (μs)
aten::conv_transpose3d               5467290.32      5015711.04         17157.36             0.00
aten::convolution                    5450132.97      5015711.04         21979.89             0.00
aten::_convolution                   5428153.07      5015711.04         50155.98             0.00
aten::cudnn_convolution_transpose    4904205.02      3956791.34         235981.43            3956785.67
cudaMemsetAsync                      2262921.43      0.00               2262921.43           0.00
sm90_xmma_dgrad_implicit_gemm_indexed_f32f32_tf32f32_f32_nhwckrsc_nhwc_tilesize256x64x32_warpgroupsize1x1x1_g1_strided_execute_kernel__5x_cudnn
                                     0.00            2514953.76         0.00                 2514953.76

All CPU, device, self-CPU, and self-device memory-usage counters were reported as 0 B for every operation.
Status: Completed
45285 warnings generated when compiling for host.
Suppressed 45325 warnings (45278 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_78/b8_s1_optimized_maxpool_kernel/base/base.cu:10:33: warning: 2 adjacent parameters of 'optimized_maxpool_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
10 | const int D1, const int H1, const int W1, // Dimensions after conv_transpose
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 | const int D3, const int H3, const int W3) // Final dimensions
| ~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_78/b8_s1_optimized_maxpool_kernel/base/base.cu:10:43: note: the first parameter in the range is 'W1'
10 | const int D1, const int H1, const int W1, // Dimensions after conv_transpose
| ^~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_78/b8_s1_optimized_maxpool_kernel/base/base.cu:11:15: note: the last parameter in the range is 'D3'
11 | const int D3, const int H3, const int W3) // Final dimensions
| ^~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_78/b8_s1_optimized_maxpool_kernel/base/base.cu:13:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
13 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_78/b8_s1_optimized_maxpool_kernel/base/base.cu:100:19: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
100 | const int N = sizes[0];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_78/b8_s1_optimized_maxpool_kernel/base/base.cu:101:19: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
101 | const int C = sizes[1];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_78/b8_s1_optimized_maxpool_kernel/base/base.cu:102:20: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
102 | const int D1 = sizes[2];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_78/b8_s1_optimized_maxpool_kernel/base/base.cu:103:20: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
103 | const int H1 = sizes[3];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_78/b8_s1_optimized_maxpool_kernel/base/base.cu:104:20: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
104 | const int W1 = sizes[4];
| ^