
The AI CUDA Engineer 👷

9_Tall_skinny_matrix_multiplication_unrolled_loop_matmul_base

Level 1 • Task 9
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A, B):
    """
    Performs a single matrix multiplication (C = A * B) where one of the matrices is tall and skinny (M >> N or N >> M).

    Args:
        A (torch.Tensor): Input matrix of shape (M, K) or (K, M) where M >> N or N >> M.
        B (torch.Tensor): Input matrix of shape (K, N) or (N, K) where M >> N or N >> M.

    Returns:
        torch.Tensor: Output matrix of shape (M, N) or (N, M)
    """
    return torch.matmul(A, B)


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B) where one of the matrices is tall and skinny (M >> N or N >> M)
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A, B, fn=module_fn):
        return fn(A, B)


M = 16384
N = 16


def get_inputs():
    A = torch.randn(M, N)
    B = torch.randn(N, M)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B) where one of the matrices is tall and skinny (M >> N or N >> M)
    """
    def __init__(self):
        super(Model, self).__init__()
    
    def forward(self, A, B):
        """
        Performs the matrix multiplication.

        Args:
            A (torch.Tensor): Input matrix of shape (M, K) or (K, M) where M >> N or N >> M.
            B (torch.Tensor): Input matrix of shape (K, N) or (N, K) where M >> N or N >> M.

        Returns:
            torch.Tensor: Output matrix of shape (M, N) or (N, M)
        """
        return torch.matmul(A, B)

M = 16384
N = 16

def get_inputs():
    A = torch.randn(M, N)
    B = torch.randn(N, M)
    return [A, B]

def get_init_inputs():
    return []  # No special initialization inputs needed
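
For orientation, here is a minimal, hypothetical usage sketch (not part of the benchmark harness) showing how the pieces above fit together; it assumes a CUDA device is available and that the operands are moved to it before the call:

import torch

model = Model().cuda()
A, B = [t.cuda() for t in get_inputs()]  # A: (16384, 16), B: (16, 16384)
C = model(A, B)                          # C: (16384, 16384)
assert C.shape == (M, M)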

Kernel Information

Related Kernels (Level 1, Task 9 • 9_Tall_skinny_matrix_multiplication_)

Rank | Kernel Name | Runtime (ms) | Speedup (Native) | Speedup (Compile)
🥇 | unrolled_loop_matmul_base | 0.68 | 0.78 | 0.59
🥈 | constant_mem_matmul_base_base | 0.69 | 0.78 | 0.58
🥉 | unrolled_matmul_kernel_base | 0.69 | 0.77 | 0.58
4 | balanced_workload_matmul_base_base | 0.71 | 0.75 | 0.56
4 | multi_tile_mapping_base | 0.71 | 0.75 | 0.56
6 | optimized_tiled_gemm_base | 0.71 | 0.75 | 0.56
6 | optimized_matmul_kernel_base | 0.71 | 0.75 | 0.56
8 | streamed_balanced_matmul_base | 0.75 | 0.71 | 0.53
9 | streamed_balanced_matmul_base | 0.75 | 0.71 | 0.53
9 | streamed_pipelined_matmul_base | 0.75 | 0.71 | 0.53
11 | predicated_tile_loading_unrolled_edit_1 | 1.26 | 0.42 | 0.32
11 | unrolled_loop_optimization_base | 1.26 | 0.42 | 0.32
11 | unrolled_loop_optimization_edit_1 | 1.26 | 0.42 | 0.32
11 | modular_device_functions_edit_1 | 1.26 | 0.42 | 0.32
15 | uniform_flow_matmul_base | 1.26 | 0.42 | 0.32
15 | warp_optimized_reduction_edit_1 | 1.26 | 0.42 | 0.32
17 | predicated_tile_loading_unrolled_base | 1.26 | 0.42 | 0.32
18 | modular_device_functions_base | 1.26 | 0.42 | 0.32
19 | warp_divergence_optimized_base_base | 1.27 | 0.42 | 0.32
20 | coalesced_memory_access_base_base | 1.27 | 0.42 | 0.32
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16
#define ELEMENTS_PER_THREAD 4

// Constant memory for configuration parameters
__constant__ int const_dims[6];  // M, N, K, lda, ldb, ldc
__constant__ bool const_trans[2];  // transA, transB

// Helper to fetch matrix elements considering transpose
__device__ inline float get_element(const float* __restrict__ matrix, int row, int col, int ld, bool transpose) {
    return transpose ? matrix[col * ld + row] : matrix[row * ld + col];
}
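// Example: with transpose == false, element (row, col) of a row-major matrix
// with leading dimension ld lives at matrix[row * ld + col]; with
// transpose == true the row/col roles are swapped, so a transposed operand is
// handled without any physical data movement.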

// Kernel with manual loop unrolling for critical loops
__global__ void unrolled_matmul_kernel(const float* __restrict__ A,
                                        const float* __restrict__ B,
                                        float* __restrict__ C) {
    // Load configuration from constant memory
    const int M = const_dims[0];
    const int N = const_dims[1];
    const int K = const_dims[2];
    const int lda = const_dims[3];
    const int ldb = const_dims[4];
    const int ldc = const_dims[5];
    const bool transA = const_trans[0];
    const bool transB = const_trans[1];

    // Calculate block's starting indices
    int block_row = blockIdx.y * (BLOCK_SIZE * ELEMENTS_PER_THREAD);
    int block_col = blockIdx.x * BLOCK_SIZE;
    int thread_row = threadIdx.y;
    int thread_col = threadIdx.x;

    // Shared memory tiles
    __shared__ float As[ELEMENTS_PER_THREAD][BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
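    // Shared-memory footprint per block: As holds ELEMENTS_PER_THREAD * 16 * 16
    // floats (4 KiB) and Bs holds 16 * 16 floats (1 KiB), i.e. 5 KiB in total.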

    // Accumulators: each thread computes ELEMENTS_PER_THREAD output elements
    float C_values[ELEMENTS_PER_THREAD] = {0.0f};

    int numTiles = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
    
    for (int t = 0; t < numTiles; ++t) {
        int tiledK = t * BLOCK_SIZE;

        // Load tile of B into shared memory with bounds check
        if (tiledK + thread_row < K && block_col + thread_col < N)
            Bs[thread_row][thread_col] = get_element(B, tiledK + thread_row, block_col + thread_col, ldb, transB);
        else
            Bs[thread_row][thread_col] = 0.0f;

        // Load a tile of A into shared memory. Each thread loads ELEMENTS_PER_THREAD elements
        #pragma unroll
        for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
            int row = block_row + e * BLOCK_SIZE + thread_row;
            if (row < M && tiledK + thread_col < K)
                As[e][thread_row][thread_col] = get_element(A, row, tiledK + thread_col, lda, transA);
            else
                As[e][thread_row][thread_col] = 0.0f;
        }

        __syncthreads();

        // Multiply the loaded tiles
        #pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            #pragma unroll
            for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
                C_values[e] += As[e][thread_row][k] * Bs[k][thread_col];
            }
        }

        __syncthreads();
    }

    // Write the computed results back to global memory
    #pragma unroll
    for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
        int row = block_row + e * BLOCK_SIZE + thread_row;
        int col = block_col + thread_col;
        if (row < M && col < N) {
            C[row * ldc + col] = C_values[e];
        }
    }
}

torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
    if (!A.is_cuda() || !B.is_cuda()) {
        throw std::invalid_argument("Input tensors must be on CUDA devices");
    }

    int dims[6];
    dims[0] = A.size(0);   // M
    dims[1] = B.size(1);   // N
    dims[2] = A.size(1);   // K
    dims[3] = A.stride(0); // lda
    dims[4] = B.stride(0); // ldb
    dims[5] = B.size(1);   // ldc: C is allocated contiguous with shape (M, N), so its row stride is N

    bool trans[2] = {false, false};

    // Copy configuration to constant memory
    cudaMemcpyToSymbol(const_dims, dims, sizeof(dims));
    cudaMemcpyToSymbol(const_trans, trans, sizeof(trans));

    auto C = torch::empty({dims[0], dims[1]}, A.options());

    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridDim((dims[1] + BLOCK_SIZE - 1) / BLOCK_SIZE,
                 (dims[0] + (BLOCK_SIZE * ELEMENTS_PER_THREAD) - 1) / (BLOCK_SIZE * ELEMENTS_PER_THREAD));
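    // Grid mapping: x tiles the N columns in BLOCK_SIZE-wide strips; y tiles the
    // M rows in (BLOCK_SIZE * ELEMENTS_PER_THREAD) = 64-row strips, matching the
    // ELEMENTS_PER_THREAD output rows each thread accumulates.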

    unrolled_matmul_kernel<<<gridDim, blockDim>>>(A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>());
    cudaDeviceSynchronize();
    return C;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &matmul_cuda, "Matrix multiplication with unrolled loops optimization (CUDA)");
}
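
To try the extension end to end, here is a hypothetical build-and-run sketch using PyTorch's JIT extension loader; it assumes the CUDA source above is saved as matmul_unrolled.cu (the file name is an assumption) and that nvcc is on the path:

import torch
from torch.utils.cpp_extension import load

# Compile and load the extension at import time.
ext = load(name="unrolled_matmul", sources=["matmul_unrolled.cu"])

A = torch.randn(16384, 16, device="cuda")
B = torch.randn(16, 16384, device="cuda")
C = ext.forward(A, B)
# Cross-check against the cuBLAS-backed torch.matmul.
torch.testing.assert_close(C, A @ B, rtol=1e-4, atol=1e-4)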
Performance Metrics
Metric | Value | Unit | Variance | Samples
Executed Ipc Active | 3.030 | inst/cycle | 0.000 | 5
Executed Ipc Elapsed | 3.022 | inst/cycle | 0.000 | 5
Issue Slots Busy | 75.820 | % | 0.001 | 5
Issued Ipc Active | 3.030 | inst/cycle | 0.000 | 5
SM Busy | 75.820 | % | 0.001 | 5
Memory Throughput | 1361303654974.884 | byte/second | 303751204967985024.000 | 5
Mem Busy | 94.498 | % | 0.002 | 5
Max Bandwidth | 71.080 | % | 0.001 | 5
L1/TEX Hit Rate | 38.470 | % | 0.000 | 5
L2 Hit Rate | 99.088 | % | 0.000 | 5
Mem Pipes Busy | 62.030 | % | 0.001 | 5
Warp Cycles Per Issued Instruction | 14.640 | cycle | 0.000 | 5
Warp Cycles Per Executed Instruction | 14.642 | cycle | 0.000 | 5
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5
Avg. Not Predicated Off Threads Per Warp | 30.910 | | 0.000 | 5
Max Active Clusters | 0.000 | cluster | 0.000 | 5
Max Cluster Size | 8.000 | block | 0.000 | 5
Overall GPU Occupancy | 0.000 | % | 0.000 | 5
Cluster Occupancy | 0.000 | % | 0.000 | 5
Block Limit SM | 32.000 | block | 0.000 | 5
Block Limit Registers | 6.000 | block | 0.000 | 5
Block Limit Shared Mem | 16.000 | block | 0.000 | 5
Block Limit Warps | 8.000 | block | 0.000 | 5
Theoretical Active Warps per SM | 48.000 | warp | 0.000 | 5
Theoretical Occupancy | 75.000 | % | 0.000 | 5
Achieved Occupancy | 69.744 | % | 0.000 | 5
Achieved Active Warps Per SM | 44.636 | warp | 0.000 | 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (34.3%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
WRN Occupancy This kernel's theoretical occupancy (75.0%) is limited by the number of required registers. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
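
The 75% theoretical-occupancy figure can be reproduced from the block-limit rows above; a quick arithmetic sketch (the 64-warps-per-SM capacity is an assumption, but it is implied by 48 / 0.75):

threads_per_block = 16 * 16                     # BLOCK_SIZE x BLOCK_SIZE = 256
warps_per_block = threads_per_block // 32       # 8 warps
blocks_per_sm = min(32, 6, 16, 8)               # SM / register / smem / warp limits -> 6
active_warps = blocks_per_sm * warps_per_block  # 48 warps (matches the table)
occupancy = active_warps / 64                   # 0.75 -> 75% theoretical occupancy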
Operation | CPU Time (μs) | Device Time (μs) | Self CPU Time (μs) | Self Device Time (μs)
aten::to | 386154.96 | 79.84 | 39.39 | 0.00
aten::_to_copy | 386115.57 | 79.84 | 109.27 | 0.00
aten::empty_strided | 385542.14 | 0.00 | 99.16 | 0.00
cudaDeviceGetStreamPriorityRange | 381876.74 | 0.00 | 381876.74 | 0.00
cudaMemcpyToSymbol | 358043.53 | 19263.26 | 358043.53 | 19263.26
unrolled_matmul_kernel(float const*, float const*, float*) | 0.00 | 2746536.17 | 0.00 | 2746536.17
cudaDeviceSynchronize | 2767733.40 | 10477.20 | 2767733.40 | 10477.20
aten::zero_ | 47957.58 | 323599.52 | 8345.93 | 0.00
aten::fill_ | 39612.92 | 323599.52 | 13367.68 | 323599.52
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | 0.00 | 323599.52 | 0.00 | 323599.52
CPU and device memory usage: 0 B for every operation.
Status: Completed
45287 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:32:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
32 | int block_row = blockIdx.y * (BLOCK_SIZE * ELEMENTS_PER_THREAD);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:33:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
33 | int block_col = blockIdx.x * BLOCK_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:34:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
34 | int thread_row = threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:35:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
35 | int thread_col = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:90:41: warning: the parameter 'A' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
90 | torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:90:58: warning: the parameter 'B' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
90 | torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:96:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
96 | dims[0] = A.size(0); // M
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:97:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
97 | dims[1] = B.size(1); // N
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:98:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
98 | dims[2] = A.size(1); // K
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:99:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
99 | dims[3] = A.stride(0); // lda
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:100:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
100 | dims[4] = B.stride(0); // ldb
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:101:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
101 | dims[5] = B.size(1); // ldc
| ^