
The AI CUDA Engineer 👷

9_Tall_skinny_matrix_multiplication_unrolled_loop_matmul_base

Level 1 • Task 9
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A, B):
    """
    Performs a single matrix multiplication (C = A * B) where one of the matrices is tall and skinny (M >> N or N >> M).

    Args:
        A (torch.Tensor): Input matrix of shape (M, K) or (K, M) where M >> N or N >> M.
        B (torch.Tensor): Input matrix of shape (K, N) or (N, K) where M >> N or N >> M.

    Returns:
        torch.Tensor: Output matrix of shape (M, N) or (N, M)
    """
    return torch.matmul(A, B)


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B) where one of the matrices is tall and skinny (M >> N or N >> M)
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A, B, fn=module_fn):
        return fn(A, B)


M = 16384
N = 16


def get_inputs():
    A = torch.randn(M, N)
    B = torch.randn(N, M)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B) where one of the matrices is tall and skinny (M >> N or N >> M)
    """
    def __init__(self):
        super(Model, self).__init__()
    
    def forward(self, A, B):
        """
        Performs the matrix multiplication.

        Args:
            A (torch.Tensor): Input matrix of shape (M, K) or (K, M) where M >> N or N >> M.
            B (torch.Tensor): Input matrix of shape (K, N) or (N, K) where M >> N or N >> M.

        Returns:
            torch.Tensor: Output matrix of shape (M, N) or (N, M)
        """
        return torch.matmul(A, B)

M = 16384
N = 16

def get_inputs():
    A = torch.randn(M, N)
    B = torch.randn(N, M)
    return [A, B]

def get_init_inputs():
    return []  # No special initialization inputs needed
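
For orientation, here is a minimal, hypothetical usage sketch (not part of the benchmark harness) showing how the pieces above fit together; it assumes a CUDA device is available and that the operands are moved to it before the call:

import torch

model = Model().cuda()
A, B = [t.cuda() for t in get_inputs()]  # A: (16384, 16), B: (16, 16384)
C = model(A, B)                          # C: (16384, 16384)
assert C.shape == (M, M)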

Kernel Information

Related Kernels (Level 1, Task 9 • 9_Tall_skinny_matrix_multiplication_)

Rank | Kernel Name | Runtime (ms) | Speedup (Native) | Speedup (Compile)
🥇 | unrolled_loop_matmul_base | 0.68 | 0.78 | 0.59
🥈 | constant_mem_matmul_base_base | 0.69 | 0.78 | 0.58
🥉 | unrolled_matmul_kernel_base | 0.69 | 0.77 | 0.58
4 | balanced_workload_matmul_base_base | 0.71 | 0.75 | 0.56
4 | multi_tile_mapping_base | 0.71 | 0.75 | 0.56
6 | optimized_tiled_gemm_base | 0.71 | 0.75 | 0.56
6 | optimized_matmul_kernel_base | 0.71 | 0.75 | 0.56
8 | streamed_balanced_matmul_base | 0.75 | 0.71 | 0.53
9 | streamed_balanced_matmul_base | 0.75 | 0.71 | 0.53
9 | streamed_pipelined_matmul_base | 0.75 | 0.71 | 0.53
11 | predicated_tile_loading_unrolled_edit_1 | 1.26 | 0.42 | 0.32
11 | unrolled_loop_optimization_base | 1.26 | 0.42 | 0.32
11 | unrolled_loop_optimization_edit_1 | 1.26 | 0.42 | 0.32
11 | modular_device_functions_edit_1 | 1.26 | 0.42 | 0.32
15 | uniform_flow_matmul_base | 1.26 | 0.42 | 0.32
15 | warp_optimized_reduction_edit_1 | 1.26 | 0.42 | 0.32
17 | predicated_tile_loading_unrolled_base | 1.26 | 0.42 | 0.32
18 | modular_device_functions_base | 1.26 | 0.42 | 0.32
19 | warp_divergence_optimized_base_base | 1.27 | 0.42 | 0.32
20 | coalesced_memory_access_base_base | 1.27 | 0.42 | 0.32
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16
#define ELEMENTS_PER_THREAD 4

// Constant memory for configuration parameters
__constant__ int const_dims[6];  // M, N, K, lda, ldb, ldc
__constant__ bool const_trans[2];  // transA, transB

// Helper to fetch matrix elements considering transpose
__device__ inline float get_element(const float* __restrict__ matrix, int row, int col, int ld, bool transpose) {
    return transpose ? matrix[col * ld + row] : matrix[row * ld + col];
}
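// Example: with transpose == false, element (row, col) of a row-major matrix
// with leading dimension ld lives at matrix[row * ld + col]; with
// transpose == true the row/col roles are swapped, so a transposed operand is
// handled without any physical data movement.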

// Kernel with manual loop unrolling for critical loops
__global__ void unrolled_matmul_kernel(const float* __restrict__ A,
                                        const float* __restrict__ B,
                                        float* __restrict__ C) {
    // Load configuration from constant memory
    const int M = const_dims[0];
    const int N = const_dims[1];
    const int K = const_dims[2];
    const int lda = const_dims[3];
    const int ldb = const_dims[4];
    const int ldc = const_dims[5];
    const bool transA = const_trans[0];
    const bool transB = const_trans[1];

    // Calculate block's starting indices
    int block_row = blockIdx.y * (BLOCK_SIZE * ELEMENTS_PER_THREAD);
    int block_col = blockIdx.x * BLOCK_SIZE;
    int thread_row = threadIdx.y;
    int thread_col = threadIdx.x;

    // Shared memory tiles
    __shared__ float As[ELEMENTS_PER_THREAD][BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
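    // Shared-memory footprint per block: As holds ELEMENTS_PER_THREAD * 16 * 16
    // floats (4 KiB) and Bs holds 16 * 16 floats (1 KiB), i.e. 5 KiB in total.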

    // Accumulators: each thread computes ELEMENTS_PER_THREAD output elements
    float C_values[ELEMENTS_PER_THREAD] = {0.0f};

    int numTiles = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
    
    for (int t = 0; t < numTiles; ++t) {
        int tiledK = t * BLOCK_SIZE;

        // Load tile of B into shared memory with bounds check
        if (tiledK + thread_row < K && block_col + thread_col < N)
            Bs[thread_row][thread_col] = get_element(B, tiledK + thread_row, block_col + thread_col, ldb, transB);
        else
            Bs[thread_row][thread_col] = 0.0f;

        // Load a tile of A into shared memory. Each thread loads ELEMENTS_PER_THREAD elements
        #pragma unroll
        for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
            int row = block_row + e * BLOCK_SIZE + thread_row;
            if (row < M && tiledK + thread_col < K)
                As[e][thread_row][thread_col] = get_element(A, row, tiledK + thread_col, lda, transA);
            else
                As[e][thread_row][thread_col] = 0.0f;
        }

        __syncthreads();

        // Multiply the loaded tiles
        #pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            #pragma unroll
            for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
                C_values[e] += As[e][thread_row][k] * Bs[k][thread_col];
            }
        }

        __syncthreads();
    }

    // Write the computed results back to global memory
    #pragma unroll
    for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
        int row = block_row + e * BLOCK_SIZE + thread_row;
        int col = block_col + thread_col;
        if (row < M && col < N) {
            C[row * ldc + col] = C_values[e];
        }
    }
}

torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
    if (!A.is_cuda() || !B.is_cuda()) {
        throw std::invalid_argument("Input tensors must be on CUDA devices");
    }

    int dims[6];
    dims[0] = A.size(0);   // M
    dims[1] = B.size(1);   // N
    dims[2] = A.size(1);   // K
    dims[3] = A.stride(0); // lda
    dims[4] = B.stride(0); // ldb
    dims[5] = B.size(1);   // ldc: C is allocated contiguous with shape (M, N), so its row stride is N

    bool trans[2] = {false, false};

    // Copy configuration to constant memory
    cudaMemcpyToSymbol(const_dims, dims, sizeof(dims));
    cudaMemcpyToSymbol(const_trans, trans, sizeof(trans));

    auto C = torch::empty({dims[0], dims[1]}, A.options());

    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridDim((dims[1] + BLOCK_SIZE - 1) / BLOCK_SIZE,
                 (dims[0] + (BLOCK_SIZE * ELEMENTS_PER_THREAD) - 1) / (BLOCK_SIZE * ELEMENTS_PER_THREAD));
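    // Grid mapping: x tiles the N columns in BLOCK_SIZE-wide strips; y tiles the
    // M rows in (BLOCK_SIZE * ELEMENTS_PER_THREAD) = 64-row strips, matching the
    // ELEMENTS_PER_THREAD output rows each thread accumulates.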

    unrolled_matmul_kernel<<<gridDim, blockDim>>>(A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>());
    cudaDeviceSynchronize();
    return C;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &matmul_cuda, "Matrix multiplication with unrolled loops optimization (CUDA)");
}
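
To try the extension end to end, here is a hypothetical build-and-run sketch using PyTorch's JIT extension loader; it assumes the CUDA source above is saved as matmul_unrolled.cu (the file name is an assumption) and that nvcc is on the path:

import torch
from torch.utils.cpp_extension import load

# Compile and load the extension at import time.
ext = load(name="unrolled_matmul", sources=["matmul_unrolled.cu"])

A = torch.randn(16384, 16, device="cuda")
B = torch.randn(16, 16384, device="cuda")
C = ext.forward(A, B)
# Cross-check against the cuBLAS-backed torch.matmul.
torch.testing.assert_close(C, A @ B, rtol=1e-4, atol=1e-4)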
Performance Metrics
Metric | Value | Unit | Variance | Samples
Executed Ipc Active | 3.030 | inst/cycle | 0.000 | 5
Executed Ipc Elapsed | 3.022 | inst/cycle | 0.000 | 5
Issue Slots Busy | 75.820 | % | 0.001 | 5
Issued Ipc Active | 3.030 | inst/cycle | 0.000 | 5
SM Busy | 75.820 | % | 0.001 | 5
Memory Throughput | 1361303654974.884 | byte/second | 303751204967985024.000 | 5
Mem Busy | 94.498 | % | 0.002 | 5
Max Bandwidth | 71.080 | % | 0.001 | 5
L1/TEX Hit Rate | 38.470 | % | 0.000 | 5
L2 Hit Rate | 99.088 | % | 0.000 | 5
Mem Pipes Busy | 62.030 | % | 0.001 | 5
Warp Cycles Per Issued Instruction | 14.640 | cycle | 0.000 | 5
Warp Cycles Per Executed Instruction | 14.642 | cycle | 0.000 | 5
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5
Avg. Not Predicated Off Threads Per Warp | 30.910 | | 0.000 | 5
Max Active Clusters | 0.000 | cluster | 0.000 | 5
Max Cluster Size | 8.000 | block | 0.000 | 5
Overall GPU Occupancy | 0.000 | % | 0.000 | 5
Cluster Occupancy | 0.000 | % | 0.000 | 5
Block Limit SM | 32.000 | block | 0.000 | 5
Block Limit Registers | 6.000 | block | 0.000 | 5
Block Limit Shared Mem | 16.000 | block | 0.000 | 5
Block Limit Warps | 8.000 | block | 0.000 | 5
Theoretical Active Warps per SM | 48.000 | warp | 0.000 | 5
Theoretical Occupancy | 75.000 | % | 0.000 | 5
Achieved Occupancy | 69.744 | % | 0.000 | 5
Achieved Active Warps Per SM | 44.636 | warp | 0.000 | 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (34.3%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
WRN Occupancy This kernel's theoretical occupancy (75.0%) is limited by the number of required registers. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
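
The 75% theoretical-occupancy figure can be reproduced from the block-limit rows above; a quick arithmetic sketch (the 64-warps-per-SM capacity is an assumption, but it is implied by 48 / 0.75):

threads_per_block = 16 * 16                     # BLOCK_SIZE x BLOCK_SIZE = 256
warps_per_block = threads_per_block // 32       # 8 warps
blocks_per_sm = min(32, 6, 16, 8)               # SM / register / smem / warp limits -> 6
active_warps = blocks_per_sm * warps_per_block  # 48 warps (matches the table)
occupancy = active_warps / 64                   # 0.75 -> 75% theoretical occupancy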
Operation | CPU Time (μs) | Device Time (μs) | Self CPU Time (μs) | Self Device Time (μs)
aten::to | 386154.96 | 79.84 | 39.39 | 0.00
aten::_to_copy | 386115.57 | 79.84 | 109.27 | 0.00
aten::empty_strided | 385542.14 | 0.00 | 99.16 | 0.00
cudaDeviceGetStreamPriorityRange | 381876.74 | 0.00 | 381876.74 | 0.00
cudaMemcpyToSymbol | 358043.53 | 19263.26 | 358043.53 | 19263.26
unrolled_matmul_kernel(float const*, float const*, float*) | 0.00 | 2746536.17 | 0.00 | 2746536.17
cudaDeviceSynchronize | 2767733.40 | 10477.20 | 2767733.40 | 10477.20
aten::zero_ | 47957.58 | 323599.52 | 8345.93 | 0.00
aten::fill_ | 39612.92 | 323599.52 | 13367.68 | 323599.52
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | 0.00 | 323599.52 | 0.00 | 323599.52
CPU and device memory usage: 0 B for every operation.
Status: Completed
45287 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:32:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
32 | int block_row = blockIdx.y * (BLOCK_SIZE * ELEMENTS_PER_THREAD);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:33:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
33 | int block_col = blockIdx.x * BLOCK_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:34:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
34 | int thread_row = threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:35:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
35 | int thread_col = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:90:41: warning: the parameter 'A' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
90 | torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:90:58: warning: the parameter 'B' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
90 | torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:96:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
96 | dims[0] = A.size(0); // M
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:97:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
97 | dims[1] = B.size(1); // N
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:98:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
98 | dims[2] = A.size(1); // K
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:99:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
99 | dims[3] = A.stride(0); // lda
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:100:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
100 | dims[4] = B.stride(0); // ldb
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b10_s1_unrolled_loop_matmul/base/base.cu:101:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
101 | dims[5] = B.size(1); // ldc
| ^