Kernel Details - unrolled_hybrid_matmul

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A, B):
    """
    Compute a single general matrix product (C = A * B).

    Args:
        A: Left operand, a tensor of shape (M, K).
        B: Right operand, a tensor of shape (K, N).

    Returns:
        The product tensor of shape (M, N).
    """
    product = torch.matmul(input=A, other=B)
    return product


class Model(nn.Module):
    """
    Minimal module whose forward pass is one matrix multiplication (C = A * B).

    The actual computation is delegated to a functional implementation so
    that a different kernel can be substituted per call.
    """

    def __init__(self):
        super().__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:
        # Hand both operands, in order, to the supplied implementation.
        result = fn(A, B)
        return result


# Problem size used by get_inputs(): A is (M, K) and B is (K, N).
M, K, N = 1024, 4096, 2048


def get_inputs():
    """Build the random operand pair [A, B] with shapes (M, K) and (K, N)."""
    # A is drawn first, then B, so the RNG consumption order is fixed.
    operand_shapes = ((M, K), (K, N))
    return [torch.randn(*shape) for shape in operand_shapes]


def get_init_inputs():
    """Return the constructor arguments for Model; none are needed."""
    init_args = []  # Model.__init__ takes no parameters besides self
    return init_args

import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Minimal module whose forward pass is one matrix multiplication (C = A * B)
    """
    def __init__(self):
        super().__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
        """
        Multiply A by B.

        Args:
            A: Left operand, a tensor of shape (M, K).
            B: Right operand, a tensor of shape (K, N).

        Returns:
            The product tensor of shape (M, N).
        """
        product = torch.matmul(input=A, other=B)
        return product

# Matrix dimensions consumed by get_inputs(): A is (M, K), B is (K, N).
M, K, N = (1024, 4096, 2048)

def get_inputs():
    """Create the two random operands: A of shape (M, K) and B of shape (K, N)."""
    left = torch.randn(M, K)
    right = torch.randn(K, N)
    return [left, right]

def get_init_inputs():
    """Return the (empty) list of arguments used to construct Model."""
    return list()  # No special initialization inputs needed

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	2_Standard_matrix_multiplication_
Level ID	1
Task ID	2
Kernel Name	unrolled_hybrid_matmul_base
CUDA Speedup (Native)	0.989x
CUDA Speedup (Compile)	1.068x
CUDA Runtime	0.430 ms
PyTorch Runtime (Native)	0.425 ms
PyTorch Runtime (Compile)	0.459 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	azure-gpt-4o-2024-08-06
Temperature	0.50

View Experiment Progress Details

Related Kernels (Level 1, Task 2 • 2_Standard_matrix_multiplication_)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	optimized_tiled_matmul_base	0.43	1.00	1.08
🥇	hybrid_matmul_base	0.43	1.00	1.08
🥇	hybrid_regtiled_base	0.43	1.00	1.08
🥇	double_buffered_matmul_base	0.43	1.00	1.08
5	warp_optimized_matmul_base_base	0.43	1.00	1.08
5	coalesced_hybrid_matmul_base_base	0.43	1.00	1.08
5	strided_tiled_matmul_base	0.43	1.00	1.08
8	hybrid_matmul_base	0.43	0.99	1.07
8	aligned_tiled_matmul_base_base	0.43	0.99	1.07
10	unrolled_hybrid_matmul_base	0.43	0.99	1.07
11	unrolled_hybrid_matmul_base_base	0.43	0.98	1.06
11	dynamic_blocksize_matmul_base	0.43	0.98	1.06
13	doublebuffer_tiled_matmul_base	0.43	0.98	1.06
13	optimized_single_stream_matmul_base	0.43	0.98	1.06
15	hybrid_tiled_cublas_base	0.43	0.98	1.06
16	constant_hybrid_matmul_base_base	0.45	0.96	1.03
17	streamed_pipelined_matmul_base_base	0.45	0.95	1.02
18	tiled_regtile_base	1.26	0.34	0.36
19	optimized_sync_matrix_mult_edit_1	1.92	0.22	0.24
20	divergence_free_matrix_mult_base	1.93	0.22	0.24

#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>

#include <limits>

// Edge length of the square shared-memory tiles; also the thread-block edge
// used when launching unrolled_matmul_kernel.
#define TILE_SIZE 32

// Input validation helpers.
// - The macro argument is parenthesized so an expression argument binds
//   correctly before the member call.
// - CHECK_INPUT is wrapped in do { ... } while (0) so it expands to exactly one
//   statement; the previous two-statement expansion would leave the second
//   check unguarded inside an unbraced if/else.
#define CHECK_CUDA(x) TORCH_CHECK((x).is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK((x).is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x)       \
    do {                     \
        CHECK_CUDA(x);       \
        CHECK_CONTIGUOUS(x); \
    } while (0)

// Static cuBLAS handle to avoid recreation overhead. It is created lazily by
// matrix_multiply_cuda on first use of the cuBLAS path and is not destroyed
// anywhere in this file.
static cublasHandle_t handle = nullptr;

// Tiled matrix multiplication kernel with an unrolled inner product.
// Computes C (M x N) = A (M x K) * B (K x N); all three buffers are indexed
// row-major. Per iteration each thread block stages one TILE_SIZE x TILE_SIZE
// tile of A and one of B in shared memory, and each thread accumulates a single
// element of C.
__global__ void unrolled_matmul_kernel(const float* __restrict__ A,
                                        const float* __restrict__ B,
                                        float* __restrict__ C,
                                        const int M, const int N, const int K) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];

    // Position of this thread inside its block.
    const int tx = static_cast<int>(threadIdx.x);
    const int ty = static_cast<int>(threadIdx.y);

    // Element of C owned by this thread.
    const int row = static_cast<int>(blockIdx.y) * TILE_SIZE + ty;
    const int col = static_cast<int>(blockIdx.x) * TILE_SIZE + tx;

    // Number of tiles needed to cover the K dimension (rounded up).
    const int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;

    float acc = 0.0f;

    for (int t = 0; t < numTiles; ++t) {
        const int aCol = t * TILE_SIZE + tx;  // column of A this thread loads
        const int bRow = t * TILE_SIZE + ty;  // row of B this thread loads

        // Positions outside the matrices are zero-filled so they contribute
        // nothing to the accumulated dot product.
        As[ty][tx] = (row < M && aCol < K) ? A[row * K + aCol] : 0.0f;
        Bs[ty][tx] = (bRow < K && col < N) ? B[bRow * N + col] : 0.0f;

        // Both tiles must be completely written before any thread reads them.
        __syncthreads();

        // Partial dot product over this tile.
        #pragma unroll
        for (int k = 0; k < TILE_SIZE; ++k) {
            acc += As[ty][k] * Bs[k][tx];
        }

        // Every thread must be done reading before the tiles are overwritten.
        __syncthreads();
    }

    // Threads that fall outside C only helped with loading; they write nothing.
    if (row < M && col < N) {
        C[row * N + col] = acc;
    }
}

// Hybrid matrix multiplication: C = A * B for contiguous (row-major) float32
// CUDA matrices. Uses the custom tiled kernel when M, N and K are all <= 128 and
// cuBLAS SGEMM otherwise.
//
// A: (M, K) input, B: (K, N) input, C: (M, N) pre-allocated output.
// Raises (via TORCH_CHECK) if an operand is not a contiguous CUDA tensor, is not
// 2-D float32, if the shapes are inconsistent, or if a CUDA/cuBLAS call fails.
void matrix_multiply_cuda(const torch::Tensor &A, const torch::Tensor &B, torch::Tensor &C) {
    CHECK_INPUT(A);
    CHECK_INPUT(B);
    CHECK_INPUT(C);

    // Validate everything the raw-pointer code below silently relies on; a
    // mismatch here would otherwise turn into out-of-bounds device accesses.
    TORCH_CHECK(A.dim() == 2 && B.dim() == 2 && C.dim() == 2,
                "A, B and C must be 2-D matrices");
    TORCH_CHECK(A.scalar_type() == torch::kFloat32 &&
                B.scalar_type() == torch::kFloat32 &&
                C.scalar_type() == torch::kFloat32,
                "A, B and C must be float32 tensors");
    TORCH_CHECK(A.device() == B.device() && A.device() == C.device(),
                "A, B and C must be on the same device");
    TORCH_CHECK(A.size(1) == B.size(0),
                "Inner dimensions must match: A is (", A.size(0), ", ", A.size(1),
                ") but B is (", B.size(0), ", ", B.size(1), ")");
    TORCH_CHECK(C.size(0) == A.size(0) && C.size(1) == B.size(1),
                "C must have shape (", A.size(0), ", ", B.size(1), ")");

    // The kernel and the cuBLAS API used here take 32-bit int dimensions, so
    // reject sizes that would be truncated by the conversion.
    constexpr int64_t kMaxDim = std::numeric_limits<int>::max();
    TORCH_CHECK(A.size(0) <= kMaxDim && A.size(1) <= kMaxDim && B.size(1) <= kMaxDim,
                "Matrix dimensions must fit in a 32-bit int");

    const int M = static_cast<int>(A.size(0));
    const int K = static_cast<int>(A.size(1));
    const int N = static_cast<int>(B.size(1));

    // Empty output: nothing to compute (and a zero-sized grid is not launchable).
    if (M == 0 || N == 0) {
        return;
    }
    // Empty inner dimension: the product is all zeros. Handled here because the
    // cuBLAS call below would be given a zero leading dimension for A.
    if (K == 0) {
        C.zero_();
        return;
    }

    const float* d_A = A.data_ptr<float>();
    const float* d_B = B.data_ptr<float>();
    float* d_C = C.data_ptr<float>();

    // Issue all work on PyTorch's current stream so it is ordered with the
    // surrounding tensor operations.
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

    // Heuristic: use custom kernel for small matrices, cuBLAS otherwise.
    if (M <= 128 && N <= 128 && K <= 128) {
        // Launch unrolled tiled kernel: one thread per element of C.
        dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
        dim3 numBlocks((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
        unrolled_matmul_kernel<<<numBlocks, threadsPerBlock, 0, stream>>>(d_A, d_B, d_C, M, N, K);

        // Launch-configuration errors are only reported through the runtime's
        // last-error state, so query it explicitly.
        const cudaError_t launchErr = cudaGetLastError();
        TORCH_CHECK(launchErr == cudaSuccess,
                    "unrolled_matmul_kernel launch failed: ", cudaGetErrorString(launchErr));
    } else {
        // Initialize cuBLAS handle if needed
        if (handle == nullptr) {
            const cublasStatus_t createStatus = cublasCreate(&handle);
            if (createStatus != CUBLAS_STATUS_SUCCESS) {
                handle = nullptr;  // make sure a later call retries creation
                TORCH_CHECK(false, "cublasCreate failed with status ",
                            static_cast<int>(createStatus));
            }
            // CUBLAS_DEFAULT_MATH is cuBLAS's default math mode; setting it does
            // not opt this FP32 GEMM in to reduced-precision Tensor Core math.
            const cublasStatus_t mathStatus = cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);
            TORCH_CHECK(mathStatus == CUBLAS_STATUS_SUCCESS,
                        "cublasSetMathMode failed with status ", static_cast<int>(mathStatus));
        }

        // The handle is shared across calls, so bind it to the current stream
        // every time rather than only at creation.
        const cublasStatus_t streamStatus = cublasSetStream(handle, stream);
        TORCH_CHECK(streamStatus == CUBLAS_STATUS_SUCCESS,
                    "cublasSetStream failed with status ", static_cast<int>(streamStatus));

        const float alpha = 1.0f;
        const float beta = 0.0f;

        // cuBLAS assumes column-major storage. A row-major (M x N) C occupies
        // the same memory as a column-major (N x M) C^T, and C^T = B^T * A^T,
        // so B is passed as the first operand and A as the second, with the
        // M and N arguments swapped. No explicit transposes are needed.
        const cublasStatus_t gemmStatus =
            cublasSgemm(handle,
                        CUBLAS_OP_N, CUBLAS_OP_N,
                        N, M, K,
                        &alpha,
                        d_B, N,  // B's leading dimension
                        d_A, K,  // A's leading dimension
                        &beta,
                        d_C, N); // C's leading dimension
        TORCH_CHECK(gemmStatus == CUBLAS_STATUS_SUCCESS,
                    "cublasSgemm failed with status ", static_cast<int>(gemmStatus));
    }
}

// PyTorch forward interface.
// Takes A of shape (M, K) and B of shape (K, N), both contiguous CUDA tensors,
// allocates an (M, N) output with A's dtype on A's device, and fills it with
// A * B via matrix_multiply_cuda. Raises (via TORCH_CHECK) on invalid inputs.
torch::Tensor forward(const torch::Tensor& A, const torch::Tensor& B) {
    CHECK_INPUT(A);
    CHECK_INPUT(B);

    // Reject bad shapes before allocating the output: the multiply routine
    // works on raw pointers and would read out of bounds on a mismatch.
    TORCH_CHECK(A.dim() == 2 && B.dim() == 2, "A and B must be 2-D matrices");
    TORCH_CHECK(A.size(1) == B.size(0),
                "Inner dimensions must match: A is (", A.size(0), ", ", A.size(1),
                ") but B is (", B.size(0), ", ", B.size(1), ")");

    // Keep the sizes as int64_t (the type size() returns) to avoid narrowing.
    const int64_t M = A.size(0);
    const int64_t N = B.size(1);

    const auto options = torch::TensorOptions()
                             .dtype(A.dtype())
                             .device(A.device())
                             .requires_grad(false);

    // Uninitialized storage is fine here: the multiply overwrites every element.
    torch::Tensor C = torch::empty({M, N}, options);
    matrix_multiply_cuda(A, B, C);
    return C;
}

// Python bindings: registers the C++ forward() function on the extension
// module under the name "forward"; the string is its Python-side docstring.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Unrolled hybrid matrix multiplication (CUDA): custom kernel for small matrices and cuBLAS for large matrices");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples

Analysis Rules

Rule	Description

Operation / Metric	Value	Unit
aten::to
CPU Time	713801.39	μs
Device Time	5394.84	μs
Self CPU Time	50.07	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_to_copy
CPU Time	713751.33	μs
Device Time	5394.84	μs
Self CPU Time	152.66	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaStreamGetCaptureInfo
CPU Time	7309.34	μs
Device Time	37141.32	μs
Self CPU Time	7309.34	μs
Self Device Time	37141.32	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
sm80_xmma_gemm_f32f32_f32f32_f32_nn_n_tilesize64x64x8_stage3_warpsize1x4x1_ffma_aligna4_alignc4_execute_kernel__51_cublas
CPU Time	0.00	μs
Device Time	2727113.51	μs
Self CPU Time	0.00	μs
Self Device Time	2727113.51	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::zero_
CPU Time	2563696.81	μs
Device Time	492849.69	μs
Self CPU Time	14653.51	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::fill_
CPU Time	2549044.85	μs
Device Time	492849.69	μs
Self CPU Time	19750.81	μs
Self Device Time	492849.69	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaLaunchKernel
CPU Time	2529294.05	μs
Device Time	0.00	μs
Self CPU Time	2529294.05	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char, 1>)
CPU Time	0.00	μs
Device Time	492849.69	μs
Self CPU Time	0.00	μs
Self Device Time	492849.69	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45293 warnings generated when compiling for host.
Suppressed 45326 warnings (45279 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:8:35: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]

8 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")

| ^

| ()

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:9:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]

9 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")

| ^

| ()

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:16:40: warning: 2 adjacent parameters of 'unrolled_matmul_kernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]

16 | __global__ void unrolled_matmul_kernel(const float* __restrict__ A,

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~

17 | const float* __restrict__ B,

| ~~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:16:66: note: the first parameter in the range is 'A'

16 | __global__ void unrolled_matmul_kernel(const float* __restrict__ A,

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:17:67: note: the last parameter in the range is 'B'

17 | const float* __restrict__ B,

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:24:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

24 | const int bx = blockIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:25:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

25 | const int by = blockIdx.y;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:27:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

27 | const int tx = threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:28:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

28 | const int ty = threadIdx.y;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:75:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

75 | const int M = A.size(0);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:76:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

76 | const int K = A.size(1);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:77:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

77 | const int N = B.size(1);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:114:37: warning: the parameter 'A' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

114 | torch::Tensor forward(torch::Tensor A, torch::Tensor B) {

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:114:54: warning: the parameter 'B' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

114 | torch::Tensor forward(torch::Tensor A, torch::Tensor B) {

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:118:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

118 | const int M = A.size(0);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_2/b6_s1_unrolled_hybrid_matmul/base/base.cu:119:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

119 | const int N = B.size(1);

| ^

The AI CUDA Engineer 👷

`2_Standard_matrix_multiplication_` • `unrolled_hybrid_matmul_base`

Kernel Information

Related Kernels (Level 1, Task 2 • 2_Standard_matrix_multiplication_)

The AI CUDA Engineer 👷

2_Standard_matrix_multiplication_ • unrolled_hybrid_matmul_base

Kernel Information

Related Kernels (Level 1, Task 2 • 2_Standard_matrix_multiplication_)

`2_Standard_matrix_multiplication_` • `unrolled_hybrid_matmul_base`