
The AI CUDA Engineer 👷

49_Max_reduction_over_a_dimension • aligned_coalesced_max_reduction_base

Level 1 • Task 49
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(x: torch.Tensor, dim: int) -> torch.Tensor:
    """
    Applies Max reduction over the specified dimension to the input tensor.

    Args:
        x (torch.Tensor): Input tensor
        dim (int): The dimension to reduce over

    Returns:
        torch.Tensor: Output tensor after Max reduction over the specified dimension
    """
    return torch.max(x, dim=dim)[0]


class Model(nn.Module):
    """
    Simple model that performs Max reduction over a specific dimension.
    """

    def __init__(self, dim: int):
        """
        Initializes the model with the dimension to reduce over.

        Args:
            dim (int): The dimension to reduce over.
        """
        super(Model, self).__init__()
        self.dim = dim

    def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
        """
        Applies Max reduction over the specified dimension to the input tensor.

        Args:
            x (torch.Tensor): Input tensor
            fn: Function to apply (defaults to module_fn)

        Returns:
            torch.Tensor: Output tensor after Max reduction over the specified dimension
        """
        return fn(x, self.dim)


batch_size = 16
dim1 = 256
dim2 = 256


def get_inputs():
    x = torch.randn(batch_size, dim1, dim2)
    return [x]


def get_init_inputs():
    return [1]  # Example, change to desired dimension
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs Max reduction over a specific dimension.
    """
    def __init__(self, dim: int):
        """
        Initializes the model with the dimension to reduce over.

        Args:
            dim (int): The dimension to reduce over.
        """
        super(Model, self).__init__()
        self.dim = dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies Max reduction over the specified dimension to the input tensor.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after Max reduction over the specified dimension.
        """
        return torch.max(x, dim=self.dim)[0]

batch_size = 16
dim1 = 256
dim2 = 256

def get_inputs():
    x = torch.randn(batch_size, dim1, dim2)
    return [x]

def get_init_inputs():
    return [1] # Example, change to desired dimension
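
A minimal sanity check for the reference model above; this usage sketch is not part of the original harness and assumes the definitions above (Model, get_inputs, batch_size, dim2) are in scope.

import torch

model = Model(dim=1)                     # reduce over the middle dimension
x, = get_inputs()                        # shape [16, 256, 256]
out = model(x)

# Reducing dim 1 collapses [16, 256, 256] to [16, 256].
assert out.shape == (batch_size, dim2)
assert torch.equal(out, torch.max(x, dim=1)[0])
print(out.shape)                         # torch.Size([16, 256])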

Kernel Information

Related Kernels (Level 1, Task 49 • 49_Max_reduction_over_a_dimension)

Rank  Kernel Name  Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇 distributed_max_reduction_base 0.02 1.50 2.04
🥇 adaptive_max_reduce_base 0.02 1.50 2.04
🥇 optimal_blocksize_max_reduce_experiment_base 0.02 1.50 2.04
🥇 optimal_block_size_max_reduce_base 0.02 1.50 2.04
🥇 optimized_max_reduce_base 0.02 1.50 2.04
🥇 adaptive_blocksize_max_reduce_base 0.02 1.50 2.04
🥇 coalesced_global_access_max_reduce_base 0.02 1.50 2.04
8 stride_loop_optimization_base 0.02 1.04 1.42
8 aligned_coalesced_max_reduction_base 0.02 1.04 1.42
8 stride_loop_optimization_edit_1 0.02 1.04 1.42
8 balanced_coalesced_max_reduce_edit_1 0.02 1.04 1.42
8 modular_max_reduce_base 0.02 1.04 1.42
8 optimized_block_size_max_reduce_edit_1 0.02 1.04 1.42
8 optimized_max_reduce_base 0.02 1.04 1.42
8 optimized_block_size_max_reduce_base 0.02 1.04 1.42
8 modular_max_reduce_edit_1 0.02 1.04 1.42
8 warp_divergence_optimization_edit_1 0.02 1.04 1.42
8 warp_divergence_optimization_base 0.02 1.04 1.42
8 coalesced_max_reduce_base 0.02 1.04 1.42
8 balanced_coalesced_max_reduce_base 0.02 1.04 1.42
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

// This kernel assumes the input tensor is conceptually [outer, dim, inner] where:
//  outer_size = product of dimensions before the reduced dimension
//  dim_size   = size of the reduced dimension
//  inner_size = product of dimensions after the reduced dimension
// Each block in the x-dimension handles one outer index, and blocks in the y-dimension tile the inner dimension.
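// Worked example (benchmark shape, assuming a contiguous input): a [16, 256, 256] tensor reduced
// over dim=1 has outer_size=16, dim_size=256, inner_size=256. Element (o, i, j) then lives at
// offset o*256*256 + i*256 + j, so for a fixed i the threads of a warp (consecutive inner_idx j)
// read consecutive addresses.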

template <typename scalar_t>
__global__ void aligned_coalesced_max_reduce_kernel(
    const scalar_t* __restrict__ input,
    scalar_t* __restrict__ output,
    const int64_t inner_size,
    const int64_t dim_size
) {
    // Determine which outer index this block is working on
    int outer_idx = blockIdx.x;
    
    // Determine the tile index in the inner dimension
    int inner_idx = blockIdx.y * blockDim.x + threadIdx.x;
    if (inner_idx >= inner_size) return;

    // Base offset for this outer index
    int64_t base_offset = outer_idx * dim_size * inner_size;

    // Initialize maximum value with the first element in the reduction dimension
    scalar_t max_val = input[base_offset + inner_idx];

    // Loop over the reduction dimension; note that for each i, the memory access
    // is to a contiguous block of memory for threads in the same warp, ensuring coalescing.
    for (int i = 1; i < dim_size; i++) {
        scalar_t val = input[base_offset + i * inner_size + inner_idx];
        max_val = max(max_val, val);
    }

    // Write the result to output. The output tensor is conceptually [outer, inner].
    output[outer_idx * inner_size + inner_idx] = max_val;
}

// This function computes the outer_size and inner_size from the input shape, similar to the reference implementation,
// but then it launches a 2D grid that ensures coalesced memory accesses along the inner dimension.

torch::Tensor aligned_coalesced_max_reduce_cuda_forward(torch::Tensor input, int64_t dim) {
    // Handle negative dimension
    if (dim < 0) dim += input.dim();

    // Calculate outer_size: product of sizes before the 'dim' dimension
    int64_t outer_size = 1;
    for (int i = 0; i < dim; i++) {
        outer_size *= input.size(i);
    }
    
    // Calculate inner_size: product of sizes after the 'dim' dimension
    int64_t inner_size = 1;
    for (int i = dim + 1; i < input.dim(); i++) {
        inner_size *= input.size(i);
    }
    
    // Size along the reduction dimension
    const int64_t dim_size = input.size(dim);

    // Create the output tensor by removing the reduced dimension
    auto output_sizes = input.sizes().vec();
    output_sizes.erase(output_sizes.begin() + dim);
    auto output = torch::empty(output_sizes, input.options());

    // Configure block and grid sizes.
    // Use a 2D grid: grid.x = outer_size; grid.y covers the inner dimension tiled by the block.
    const int threads = 256;
    int blocks_y = (inner_size + threads - 1) / threads;
    dim3 grid(outer_size, blocks_y);
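    // Example (benchmark shape): reducing [16, 256, 256] over dim=1 gives outer_size=16 and
    // inner_size=256, so blocks_y = (256 + 255) / 256 = 1 and the launch is a (16, 1) grid of
    // 256-thread blocks (16 blocks, 128 warps in total).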

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "aligned_coalesced_max_reduce_forward", ([&] {
        aligned_coalesced_max_reduce_kernel<scalar_t><<<grid, threads>>>(
            input.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>(),
            inner_size,
            dim_size
        );
    }));

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &aligned_coalesced_max_reduce_cuda_forward, "Aligned and coalesced Max reduction forward (CUDA)");
}
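
A hedged build-and-verify sketch using torch.utils.cpp_extension.load; the file name aligned_coalesced_max_reduction.cu is hypothetical and assumes the CUDA source above has been saved to disk.

import torch
from torch.utils.cpp_extension import load

# Compile the extension above (hypothetical filename for the source listing).
ext = load(
    name="aligned_coalesced_max_reduce",
    sources=["aligned_coalesced_max_reduction.cu"],
    verbose=True,
)

x = torch.randn(16, 256, 256, device="cuda")
out = ext.forward(x, 1)                     # bound as "forward" in PYBIND11_MODULE above
ref = torch.max(x, dim=1)[0]
print(out.shape, torch.allclose(out, ref))  # expected: torch.Size([16, 256]) True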
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 0.360 inst/cycle 0.000 5
Executed Ipc Elapsed 0.040 inst/cycle 0.000 5
Issue Slots Busy 9.096 % 0.002 5
Issued Ipc Active 0.364 inst/cycle 0.000 5
SM Busy 12.466 % 0.003 5
Memory Throughput 212442109379.020 byte/second 8692007485116255232.000 5
Mem Busy 3.662 % 0.002 5
Max Bandwidth 6.346 % 0.008 5
L1/TEX Hit Rate 0.000 % 0.000 5
L2 Hit Rate 18.736 % 0.007 5
Mem Pipes Busy 0.798 % 0.000 5
Warp Cycles Per Issued Instruction 21.462 cycle 0.000 5
Warp Cycles Per Executed Instruction 21.552 cycle 0.000 5
Avg. Active Threads Per Warp 32.000 0.000 5
Avg. Not Predicated Off Threads Per Warp 31.720 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 8.000 block 0.000 5
Block Limit Shared Mem 32.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 12.248 % 0.000 5
Achieved Active Warps Per SM 7.838 warp 0.000 5
Analysis Rules
Rule Description
WRN HighPipeUtilization All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.3%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
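
A back-of-envelope check on the occupancy warning, assuming the profiled run used the benchmark shape ([16, 256, 256], dim=1) and therefore a (16, 1) grid of 256-thread blocks; with so few blocks the kernel simply cannot keep many warps resident per SM.

threads_per_block = 256
blocks = 16 * 1                                      # grid (outer_size, blocks_y)
warps_per_block = threads_per_block // 32            # 8
total_warps = blocks * warps_per_block               # 128 warps for the whole launch
theoretical_warps_per_sm = 64                        # "Theoretical Active Warps per SM" above
achieved_warps_per_sm = 0.12248 * theoretical_warps_per_sm
print(total_warps, round(achieved_warps_per_sm, 2))  # 128 7.84, matching the profile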
Operation / Metric Value Unit
aten::to
CPU Time 511497.06 μs
Device Time 371.52 μs
Self CPU Time 34.15 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 511462.92 μs
Device Time 371.52 μs
Self CPU Time 86.86 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::empty_strided
CPU Time 510781.29 μs
Device Time 0.00 μs
Self CPU Time 78.52 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceGetStreamPriorityRange
CPU Time 509585.54 μs
Device Time 0.00 μs
Self CPU Time 509585.54 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 635417.45 μs
Device Time 21781.55 μs
Self CPU Time 635417.45 μs
Self Device Time 21781.55 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void aligned_coalesced_max_reduce_kernel<float>(float const*, float*, long, long)
CPU Time 0.00 μs
Device Time 161390.65 μs
Self CPU Time 0.00 μs
Self Device Time 161390.65 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaEventRecord
CPU Time 22777.82 μs
Device Time 43157.34 μs
Self CPU Time 22777.82 μs
Self Device Time 43157.34 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 129246.25 μs
Device Time 645851.04 μs
Self CPU Time 14212.82 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 115035.15 μs
Device Time 645851.04 μs
Self CPU Time 16557.93 μs
Self Device Time 645851.04 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 645929.47 μs
Self CPU Time 0.00 μs
Self Device Time 645929.47 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45283 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250207_optimize_b5_s4_e1_sweep/level_1/task_49/b1_s2_aligned_coalesced_max_reduction/base/base.cu:19:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
19 | int outer_idx = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250207_optimize_b5_s4_e1_sweep/level_1/task_49/b1_s2_aligned_coalesced_max_reduction/base/base.cu:22:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
22 | int inner_idx = blockIdx.y * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250207_optimize_b5_s4_e1_sweep/level_1/task_49/b1_s2_aligned_coalesced_max_reduction/base/base.cu:57:18: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
57 | for (int i = dim + 1; i < input.dim(); i++) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250207_optimize_b5_s4_e1_sweep/level_1/task_49/b1_s2_aligned_coalesced_max_reduction/base/base.cu:72:20: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
72 | int blocks_y = (inner_size + threads - 1) / threads;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250207_optimize_b5_s4_e1_sweep/level_1/task_49/b1_s2_aligned_coalesced_max_reduction/base/base.cu:75:5: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name]
75 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "aligned_coalesced_max_reduce_forward", ([&] {
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:246:19: note: expanded from macro 'AT_DISPATCH_FLOATING_TYPES_AND_HALF'
246 | TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES_AND_HALF(__VA_ARGS__))
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:240:3: note: expanded from macro 'AT_DISPATCH_CASE_FLOATING_TYPES_AND_HALF'
240 | AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:74:3: note: expanded from macro 'AT_DISPATCH_CASE'
74 | AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
| ^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:58:7: note: expanded from macro 'AT_PRIVATE_CHECK_SELECTIVE_BUILD'
58 | AT_ERROR( \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:711:32: note: expanded from macro 'AT_ERROR'
711 | C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:536:9: note: expanded from macro 'TORCH_CHECK'
536 | __func__, \
| ^