
The AI CUDA Engineer 👷

49_Max_reduction_over_a_dimension • aligned_coalesced_max_reduction_base

Level 1 • Task 49
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(x: torch.Tensor, dim: int) -> torch.Tensor:
    """
    Applies Max reduction over the specified dimension to the input tensor.

    Args:
        x (torch.Tensor): Input tensor
        dim (int): The dimension to reduce over

    Returns:
        torch.Tensor: Output tensor after Max reduction over the specified dimension
    """
    return torch.max(x, dim=dim)[0]


class Model(nn.Module):
    """
    Simple model that performs Max reduction over a specific dimension.
    """

    def __init__(self, dim: int):
        """
        Initializes the model with the dimension to reduce over.

        Args:
            dim (int): The dimension to reduce over.
        """
        super(Model, self).__init__()
        self.dim = dim

    def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
        """
        Applies Max reduction over the specified dimension to the input tensor.

        Args:
            x (torch.Tensor): Input tensor
            fn: Function to apply (defaults to module_fn)

        Returns:
            torch.Tensor: Output tensor after Max reduction over the specified dimension
        """
        return fn(x, self.dim)


batch_size = 16
dim1 = 256
dim2 = 256


def get_inputs():
    x = torch.randn(batch_size, dim1, dim2)
    return [x]


def get_init_inputs():
    return [1]  # Example, change to desired dimension
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs Max reduction over a specific dimension.
    """
    def __init__(self, dim: int):
        """
        Initializes the model with the dimension to reduce over.

        Args:
            dim (int): The dimension to reduce over.
        """
        super(Model, self).__init__()
        self.dim = dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies Max reduction over the specified dimension to the input tensor.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after Max reduction over the specified dimension.
        """
        return torch.max(x, dim=self.dim)[0]

batch_size = 16
dim1 = 256
dim2 = 256

def get_inputs():
    x = torch.randn(batch_size, dim1, dim2)
    return [x]

def get_init_inputs():
    return [1] # Example, change to desired dimension
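
A minimal sanity check for the reference model above; this usage sketch is not part of the original harness and assumes the definitions above (Model, get_inputs, batch_size, dim2) are in scope.

import torch

model = Model(dim=1)                     # reduce over the middle dimension
x, = get_inputs()                        # shape [16, 256, 256]
out = model(x)

# Reducing dim 1 collapses [16, 256, 256] to [16, 256].
assert out.shape == (batch_size, dim2)
assert torch.equal(out, torch.max(x, dim=1)[0])
print(out.shape)                         # torch.Size([16, 256])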

Kernel Information

Related Kernels (Level 1, Task 49 • 49_Max_reduction_over_a_dimension)

Rank  Kernel Name  Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇 distributed_max_reduction_base 0.02 1.50 2.04
🥇 adaptive_max_reduce_base 0.02 1.50 2.04
🥇 optimal_blocksize_max_reduce_experiment_base 0.02 1.50 2.04
🥇 optimal_block_size_max_reduce_base 0.02 1.50 2.04
🥇 optimized_max_reduce_base 0.02 1.50 2.04
🥇 adaptive_blocksize_max_reduce_base 0.02 1.50 2.04
🥇 coalesced_global_access_max_reduce_base 0.02 1.50 2.04
8 stride_loop_optimization_base 0.02 1.04 1.42
8 aligned_coalesced_max_reduction_base 0.02 1.04 1.42
8 stride_loop_optimization_edit_1 0.02 1.04 1.42
8 balanced_coalesced_max_reduce_edit_1 0.02 1.04 1.42
8 modular_max_reduce_base 0.02 1.04 1.42
8 optimized_block_size_max_reduce_edit_1 0.02 1.04 1.42
8 optimized_max_reduce_base 0.02 1.04 1.42
8 optimized_block_size_max_reduce_base 0.02 1.04 1.42
8 modular_max_reduce_edit_1 0.02 1.04 1.42
8 warp_divergence_optimization_edit_1 0.02 1.04 1.42
8 warp_divergence_optimization_base 0.02 1.04 1.42
8 coalesced_max_reduce_base 0.02 1.04 1.42
8 balanced_coalesced_max_reduce_base 0.02 1.04 1.42
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

// This kernel assumes the input tensor is conceptually [outer, dim, inner] where:
//  outer_size = product of dimensions before the reduced dimension
//  dim_size   = size of the reduced dimension
//  inner_size = product of dimensions after the reduced dimension
// Each block in the x-dimension handles one outer index, and blocks in the y-dimension tile the inner dimension.
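// Worked example (benchmark shape, assuming a contiguous input): a [16, 256, 256] tensor reduced
// over dim=1 has outer_size=16, dim_size=256, inner_size=256. Element (o, i, j) then lives at
// offset o*256*256 + i*256 + j, so for a fixed i the threads of a warp (consecutive inner_idx j)
// read consecutive addresses.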

template <typename scalar_t>
__global__ void aligned_coalesced_max_reduce_kernel(
    const scalar_t* __restrict__ input,
    scalar_t* __restrict__ output,
    const int64_t inner_size,
    const int64_t dim_size
) {
    // Determine which outer index this block is working on
    int outer_idx = blockIdx.x;
    
    // Determine the tile index in the inner dimension
    int inner_idx = blockIdx.y * blockDim.x + threadIdx.x;
    if (inner_idx >= inner_size) return;

    // Base offset for this outer index
    int64_t base_offset = outer_idx * dim_size * inner_size;

    // Initialize maximum value with the first element in the reduction dimension
    scalar_t max_val = input[base_offset + inner_idx];

    // Loop over the reduction dimension; note that for each i, the memory access
    // is to a contiguous block of memory for threads in the same warp, ensuring coalescing.
    for (int i = 1; i < dim_size; i++) {
        scalar_t val = input[base_offset + i * inner_size + inner_idx];
        max_val = max(max_val, val);
    }

    // Write the result to output. The output tensor is conceptually [outer, inner].
    output[outer_idx * inner_size + inner_idx] = max_val;
}

// This function computes the outer_size and inner_size from the input shape, similar to the reference implementation,
// but then it launches a 2D grid that ensures coalesced memory accesses along the inner dimension.

torch::Tensor aligned_coalesced_max_reduce_cuda_forward(torch::Tensor input, int64_t dim) {
    // Handle negative dimension
    if (dim < 0) dim += input.dim();

    // Calculate outer_size: product of sizes before the 'dim' dimension
    int64_t outer_size = 1;
    for (int i = 0; i < dim; i++) {
        outer_size *= input.size(i);
    }
    
    // Calculate inner_size: product of sizes after the 'dim' dimension
    int64_t inner_size = 1;
    for (int i = dim + 1; i < input.dim(); i++) {
        inner_size *= input.size(i);
    }
    
    // Size along the reduction dimension
    const int64_t dim_size = input.size(dim);

    // Create the output tensor by removing the reduced dimension
    auto output_sizes = input.sizes().vec();
    output_sizes.erase(output_sizes.begin() + dim);
    auto output = torch::empty(output_sizes, input.options());

    // Configure block and grid sizes.
    // Use a 2D grid: grid.x = outer_size; grid.y covers the inner dimension tiled by the block.
    const int threads = 256;
    int blocks_y = (inner_size + threads - 1) / threads;
    dim3 grid(outer_size, blocks_y);
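    // Example (benchmark shape): reducing [16, 256, 256] over dim=1 gives outer_size=16 and
    // inner_size=256, so blocks_y = (256 + 255) / 256 = 1 and the launch is a (16, 1) grid of
    // 256-thread blocks (16 blocks, 128 warps in total).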

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "aligned_coalesced_max_reduce_forward", ([&] {
        aligned_coalesced_max_reduce_kernel<scalar_t><<<grid, threads>>>(
            input.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>(),
            inner_size,
            dim_size
        );
    }));

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &aligned_coalesced_max_reduce_cuda_forward, "Aligned and coalesced Max reduction forward (CUDA)");
}
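
A hedged build-and-verify sketch using torch.utils.cpp_extension.load; the file name aligned_coalesced_max_reduction.cu is hypothetical and assumes the CUDA source above has been saved to disk.

import torch
from torch.utils.cpp_extension import load

# Compile the extension above (hypothetical filename for the source listing).
ext = load(
    name="aligned_coalesced_max_reduce",
    sources=["aligned_coalesced_max_reduction.cu"],
    verbose=True,
)

x = torch.randn(16, 256, 256, device="cuda")
out = ext.forward(x, 1)                     # bound as "forward" in PYBIND11_MODULE above
ref = torch.max(x, dim=1)[0]
print(out.shape, torch.allclose(out, ref))  # expected: torch.Size([16, 256]) True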
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 0.360 inst/cycle 0.000 5
Executed Ipc Elapsed 0.040 inst/cycle 0.000 5
Issue Slots Busy 9.096 % 0.002 5
Issued Ipc Active 0.364 inst/cycle 0.000 5
SM Busy 12.466 % 0.003 5
Memory Throughput 212442109379.020 byte/second 8692007485116255232.000 5
Mem Busy 3.662 % 0.002 5
Max Bandwidth 6.346 % 0.008 5
L1/TEX Hit Rate 0.000 % 0.000 5
L2 Hit Rate 18.736 % 0.007 5
Mem Pipes Busy 0.798 % 0.000 5
Warp Cycles Per Issued Instruction 21.462 cycle 0.000 5
Warp Cycles Per Executed Instruction 21.552 cycle 0.000 5
Avg. Active Threads Per Warp 32.000 0.000 5
Avg. Not Predicated Off Threads Per Warp 31.720 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 8.000 block 0.000 5
Block Limit Shared Mem 32.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 12.248 % 0.000 5
Achieved Active Warps Per SM 7.838 warp 0.000 5
Analysis Rules
Rule Description
WRN HighPipeUtilization All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.3%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
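
A back-of-envelope check on the occupancy warning, assuming the profiled run used the benchmark shape ([16, 256, 256], dim=1) and therefore a (16, 1) grid of 256-thread blocks; with so few blocks the kernel simply cannot keep many warps resident per SM.

threads_per_block = 256
blocks = 16 * 1                                      # grid (outer_size, blocks_y)
warps_per_block = threads_per_block // 32            # 8
total_warps = blocks * warps_per_block               # 128 warps for the whole launch
theoretical_warps_per_sm = 64                        # "Theoretical Active Warps per SM" above
achieved_warps_per_sm = 0.12248 * theoretical_warps_per_sm
print(total_warps, round(achieved_warps_per_sm, 2))  # 128 7.84, matching the profile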
Operation / Metric Value Unit
aten::to
CPU Time 511497.06 μs
Device Time 371.52 μs
Self CPU Time 34.15 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 511462.92 μs
Device Time 371.52 μs
Self CPU Time 86.86 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::empty_strided
CPU Time 510781.29 μs
Device Time 0.00 μs
Self CPU Time 78.52 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceGetStreamPriorityRange
CPU Time 509585.54 μs
Device Time 0.00 μs
Self CPU Time 509585.54 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 635417.45 μs
Device Time 21781.55 μs
Self CPU Time 635417.45 μs
Self Device Time 21781.55 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void aligned_coalesced_max_reduce_kernel<float>(float const*, float*, long, long)
CPU Time 0.00 μs
Device Time 161390.65 μs
Self CPU Time 0.00 μs
Self Device Time 161390.65 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaEventRecord
CPU Time 22777.82 μs
Device Time 43157.34 μs
Self CPU Time 22777.82 μs
Self Device Time 43157.34 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 129246.25 μs
Device Time 645851.04 μs
Self CPU Time 14212.82 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 115035.15 μs
Device Time 645851.04 μs
Self CPU Time 16557.93 μs
Self Device Time 645851.04 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 645929.47 μs
Self CPU Time 0.00 μs
Self Device Time 645929.47 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45283 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250207_optimize_b5_s4_e1_sweep/level_1/task_49/b1_s2_aligned_coalesced_max_reduction/base/base.cu:19:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
19 | int outer_idx = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250207_optimize_b5_s4_e1_sweep/level_1/task_49/b1_s2_aligned_coalesced_max_reduction/base/base.cu:22:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
22 | int inner_idx = blockIdx.y * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250207_optimize_b5_s4_e1_sweep/level_1/task_49/b1_s2_aligned_coalesced_max_reduction/base/base.cu:57:18: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
57 | for (int i = dim + 1; i < input.dim(); i++) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250207_optimize_b5_s4_e1_sweep/level_1/task_49/b1_s2_aligned_coalesced_max_reduction/base/base.cu:72:20: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
72 | int blocks_y = (inner_size + threads - 1) / threads;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250207_optimize_b5_s4_e1_sweep/level_1/task_49/b1_s2_aligned_coalesced_max_reduction/base/base.cu:75:5: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name]
75 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "aligned_coalesced_max_reduce_forward", ([&] {
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:246:19: note: expanded from macro 'AT_DISPATCH_FLOATING_TYPES_AND_HALF'
246 | TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES_AND_HALF(__VA_ARGS__))
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:240:3: note: expanded from macro 'AT_DISPATCH_CASE_FLOATING_TYPES_AND_HALF'
240 | AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:74:3: note: expanded from macro 'AT_DISPATCH_CASE'
74 | AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
| ^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:58:7: note: expanded from macro 'AT_PRIVATE_CHECK_SELECTIVE_BUILD'
58 | AT_ERROR( \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:711:32: note: expanded from macro 'AT_ERROR'
711 | C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:536:9: note: expanded from macro 'TORCH_CHECK'
536 | __func__, \
| ^