Kernel Details - streamed_balanced_matmul

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A, B):
    """
    Compute the matrix product of two tensors with ``torch.matmul``.

    In this file's benchmark setup (see ``get_inputs``), A has shape (M, N)
    and B has shape (N, M) with M much larger than N, so A is tall and skinny
    and the product has shape (M, M).

    Args:
        A (torch.Tensor): Left operand of the product.
        B (torch.Tensor): Right operand of the product.

    Returns:
        torch.Tensor: The result of ``torch.matmul(A, B)``.
    """
    product = torch.matmul(A, B)
    return product


class Model(nn.Module):
    """
    Parameter-free module whose forward pass hands both inputs to a pluggable
    function; ``module_fn`` is used unless the caller supplies another one.
    """

    def __init__(self):
        super().__init__()

    def forward(self, A, B, fn=module_fn):
        """Call ``fn(A, B)`` and return whatever it returns."""
        result = fn(A, B)
        return result


# Benchmark dimensions: get_inputs() builds A with shape (M, N) and B with
# shape (N, M).
M = 16384
N = 16


def get_inputs():
    """Return ``[A, B]``: random normal tensors of shape (M, N) and (N, M)."""
    # A is drawn before B, so the random stream is consumed in the same order.
    return [torch.randn(M, N), torch.randn(N, M)]


def get_init_inputs():
    """Return the initialization inputs; none are needed, so the list is empty."""
    init_inputs = []
    return init_inputs

import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Parameter-free module whose forward pass is a single ``torch.matmul`` call.
    """
    def __init__(self):
        super().__init__()

    def forward(self, A, B):
        """
        Multiply the two input tensors.

        Args:
            A (torch.Tensor): Left operand of the product.
            B (torch.Tensor): Right operand of the product.

        Returns:
            torch.Tensor: The result of ``torch.matmul(A, B)``.
        """
        product = torch.matmul(A, B)
        return product

# Benchmark dimensions: get_inputs() builds A with shape (M, N) and B with
# shape (N, M).
M = 16384
N = 16

def get_inputs():
    """Return ``[A, B]``: random normal tensors of shape (M, N) and (N, M)."""
    # A is drawn before B, so the random stream is consumed in the same order.
    return [torch.randn(M, N), torch.randn(N, M)]

def get_init_inputs():
    """Return the initialization inputs; none are needed, so the list is empty."""
    init_inputs = []
    return init_inputs

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	9_Tall_skinny_matrix_multiplication_
Level ID	1
Task ID	9
Kernel Name	streamed_balanced_matmul_base
CUDA Speedup (Native)	0.709x
CUDA Speedup (Compile)	0.535x
CUDA Runtime	0.750 ms
PyTorch Runtime (Native)	0.532 ms
PyTorch Runtime (Compile)	0.401 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	azure-gpt-4o-2024-08-06
Temperature	0.00

View Experiment Progress Details

Related Kernels (Level 1, Task 9 • 9_Tall_skinny_matrix_multiplication_)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	unrolled_loop_matmul_base	0.68	0.78	0.59
🥈	constant_mem_matmul_base_base	0.69	0.78	0.58
🥉	unrolled_matmul_kernel_base	0.69	0.77	0.58
4	balanced_workload_matmul_base_base	0.71	0.75	0.56
4	multi_tile_mapping_base	0.71	0.75	0.56
6	optimized_tiled_gemm_base	0.71	0.75	0.56
6	optimized_matmul_kernel_base	0.71	0.75	0.56
8	streamed_balanced_matmul_base	0.75	0.71	0.53
9	streamed_balanced_matmul_base	0.75	0.71	0.53
9	streamed_pipelined_matmul_base	0.75	0.71	0.53
11	predicated_tile_loading_unrolled_edit_1	1.26	0.42	0.32
11	unrolled_loop_optimization_base	1.26	0.42	0.32
11	unrolled_loop_optimization_edit_1	1.26	0.42	0.32
11	modular_device_functions_edit_1	1.26	0.42	0.32
15	uniform_flow_matmul_base	1.26	0.42	0.32
15	warp_optimized_reduction_edit_1	1.26	0.42	0.32
17	predicated_tile_loading_unrolled_base	1.26	0.42	0.32
18	modular_device_functions_base	1.26	0.42	0.32
19	warp_divergence_optimized_base_base	1.27	0.42	0.32
20	coalesced_memory_access_base_base	1.27	0.42	0.32

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <algorithm>
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <vector>

// Edge length of the square shared-memory tiles; the host wrapper also uses it
// for both dimensions of the thread block.
#define BLOCK_SIZE 16
// Number of output rows, spaced BLOCK_SIZE apart, accumulated by each thread.
#define ELEMENTS_PER_THREAD 4
// Number of CUDA streams the host wrapper creates and splits the rows across.
#define NUM_STREAMS 4

// Reads one element of a matrix stored with leading dimension `ld`.
// Without `transpose` the element is taken from matrix[row * ld + col];
// with `transpose` the roles of row and col are swapped in the address.
__device__ float get_element(const float* __restrict__ matrix, int row, int col, int ld, bool transpose) {
    if (transpose) {
        return matrix[col * ld + row];
    }
    return matrix[row * ld + col];
}

// Tiled matrix multiplication C = A * B.
//
// Each thread block produces a (BLOCK_SIZE * ELEMENTS_PER_THREAD) x BLOCK_SIZE
// tile of C. Every thread accumulates ELEMENTS_PER_THREAD outputs that share
// one column and lie BLOCK_SIZE rows apart.
//
// Parameters:
//   A, B           input matrices, read through get_element()
//   C              output matrix, written at C[row * ldc + col]
//   M, N, K        upper bounds for the row, column and reduction indices
//   lda, ldb, ldc  leading dimensions used in the address computations
//   m_offset       row offset added to the block row of this launch
//   transA, transB forwarded to get_element() to select transposed addressing
//
// threadIdx.x / threadIdx.y index BLOCK_SIZE x BLOCK_SIZE shared arrays
// directly, so the kernel expects a BLOCK_SIZE x BLOCK_SIZE thread block.
// NOTE(review): all index arithmetic here is 32-bit int; callers should keep
// every element offset within int range.
__global__ void matmul_kernel_streamed_balanced(const float* __restrict__ A,
                                                const float* __restrict__ B,
                                                float* __restrict__ C,
                                                int M, int N, int K,
                                                int lda, int ldb, int ldc,
                                                int m_offset,
                                                bool transA, bool transB) {
    // First output row / column covered by this block.
    int block_row = blockIdx.y * (BLOCK_SIZE * ELEMENTS_PER_THREAD) + m_offset;
    int block_col = blockIdx.x * BLOCK_SIZE;
    int thread_row = threadIdx.y;
    int thread_col = threadIdx.x;

    // One A tile per row group handled by a thread, plus a single B tile.
    __shared__ float As[ELEMENTS_PER_THREAD][BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
    
    // Per-thread accumulators, one per row group.
    float C_values[ELEMENTS_PER_THREAD] = {0.0f};

    // Walk the reduction dimension in steps of BLOCK_SIZE.
    for (int t = 0; t < (K + BLOCK_SIZE - 1) / BLOCK_SIZE; ++t) {
        // Load the B tile; out-of-range entries are zero so they add nothing
        // to the dot products below.
        if (t * BLOCK_SIZE + thread_row < K && block_col + thread_col < N) {
            Bs[thread_row][thread_col] = get_element(B, 
                t * BLOCK_SIZE + thread_row,
                block_col + thread_col,
                ldb, transB);
        } else {
            Bs[thread_row][thread_col] = 0.0f;
        }

        // Load ELEMENTS_PER_THREAD A tiles, again zero-filling out-of-range
        // entries.
        #pragma unroll
        for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
            int row = block_row + e * BLOCK_SIZE + thread_row;
            if (row < M && t * BLOCK_SIZE + thread_col < K) {
                As[e][thread_row][thread_col] = get_element(A,
                    row,
                    t * BLOCK_SIZE + thread_col,
                    lda, transA);
            } else {
                As[e][thread_row][thread_col] = 0.0f;
            }
        }

        // All tiles must be completely written before any thread reads them.
        __syncthreads();

        // Accumulate this tile's contribution to each of the thread's outputs.
        #pragma unroll
        for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
            #pragma unroll
            for (int k = 0; k < BLOCK_SIZE; ++k) {
                C_values[e] += As[e][thread_row][k] * Bs[k][thread_col];
            }
        }

        // Every thread must finish reading before the next iteration
        // overwrites the tiles.
        __syncthreads();
    }

    // Store the accumulated results, skipping rows/columns outside C.
    #pragma unroll
    for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
        int row = block_row + e * BLOCK_SIZE + thread_row;
        int col = block_col + thread_col;
        if (row < M && col < N) {
            C[row * ldc + col] = C_values[e];
        }
    }
}

namespace {

// Owns the CUDA streams created for a single matmul_cuda call and destroys
// them when it goes out of scope, so an exception thrown part-way through the
// call cannot leak stream handles.
struct ScopedStreams {
    std::vector<cudaStream_t> handles;

    ScopedStreams() = default;
    ScopedStreams(const ScopedStreams&) = delete;
    ScopedStreams& operator=(const ScopedStreams&) = delete;

    ~ScopedStreams() {
        for (cudaStream_t stream : handles) {
            cudaStreamDestroy(stream);
        }
    }
};

}  // namespace

// Computes C = A * B for 2-D float32 CUDA tensors.
//
// The rows of C are split into up to NUM_STREAMS contiguous bands, each a
// multiple of BLOCK_SIZE * ELEMENTS_PER_THREAD rows, and one kernel per band
// is launched on its own stream. All streams are synchronized before the
// result is returned.
//
// Args:
//   A: (M, K) float32 CUDA tensor.
//   B: (K, N) float32 CUDA tensor on the same device as A.
//
// Returns:
//   A new contiguous (M, N) tensor created with A's options.
//
// Throws:
//   std::invalid_argument if either input is not a CUDA tensor.
//   c10::Error (via TORCH_CHECK) for a shape, dtype, device or size violation,
//   or when a CUDA runtime call fails.
//
// NOTE(review): the kernels run on freshly created streams, not on PyTorch's
// current stream, and the wrapper does not change the current device. This
// matches the previous behavior; confirm it is acceptable for callers that use
// non-default streams or multiple devices.
torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
    if (!A.is_cuda() || !B.is_cuda()) {
        throw std::invalid_argument("Input tensors must be on CUDA devices");
    }
    TORCH_CHECK(A.dim() == 2 && B.dim() == 2,
                "Inputs must be 2-D matrices, got ", A.dim(), "-D and ", B.dim(), "-D tensors");
    TORCH_CHECK(A.scalar_type() == torch::kFloat32 && B.scalar_type() == torch::kFloat32,
                "Inputs must be float32 tensors");
    TORCH_CHECK(A.device() == B.device(), "Inputs must be on the same device");
    TORCH_CHECK(A.size(1) == B.size(0),
                "Inner dimensions must match: A is ", A.size(0), "x", A.size(1),
                ", B is ", B.size(0), "x", B.size(1));

    // The kernel addresses elements as row * ld + col, i.e. it assumes a unit
    // column stride. contiguous() is a no-op for tensors already laid out so.
    A = A.contiguous();
    B = B.contiguous();

    const int64_t M = A.size(0);
    const int64_t K = A.size(1);
    const int64_t N = B.size(1);

    // The kernel indexes with 32-bit ints, so every element offset must fit.
    constexpr int64_t kIntMax = std::numeric_limits<int>::max();
    TORCH_CHECK(M <= kIntMax && N <= kIntMax && K <= kIntMax,
                "Matrix dimensions exceed the 32-bit index range of the kernel");
    TORCH_CHECK(M * K <= kIntMax && K * N <= kIntMax && M * N <= kIntMax,
                "Matrix sizes exceed the 32-bit index range of the kernel");

    auto C = torch::empty({M, N}, A.options());
    if (M == 0 || N == 0) {
        // Nothing to compute; a grid with a zero dimension cannot be launched.
        return C;
    }

    const bool transA = false;
    const bool transB = false;
    const int lda = static_cast<int>(K);
    const int ldb = static_cast<int>(N);
    const int ldc = static_cast<int>(N);

    // Round the per-stream row count up to a whole number of block tiles so
    // that bands handled by different streams never overlap.
    constexpr int64_t kRowsPerBlock = BLOCK_SIZE * ELEMENTS_PER_THREAD;
    int64_t rows_per_stream = (M + NUM_STREAMS - 1) / NUM_STREAMS;
    rows_per_stream = ((rows_per_stream + kRowsPerBlock - 1) / kRowsPerBlock) * kRowsPerBlock;

    // CUDA limits the y-dimension of a grid to 65535 blocks.
    constexpr int64_t kMaxGridY = 65535;
    TORCH_CHECK(rows_per_stream / kRowsPerBlock <= kMaxGridY,
                "Too many rows for a single launch per stream: M = ", M);

    ScopedStreams streams;
    streams.handles.reserve(NUM_STREAMS);
    for (int i = 0; i < NUM_STREAMS; ++i) {
        cudaStream_t stream = nullptr;
        const cudaError_t err = cudaStreamCreate(&stream);
        TORCH_CHECK(err == cudaSuccess, "cudaStreamCreate failed: ", cudaGetErrorString(err));
        streams.handles.push_back(stream);
    }

    const float* A_ptr = A.data_ptr<float>();
    const float* B_ptr = B.data_ptr<float>();
    float* C_ptr = C.data_ptr<float>();

    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const unsigned int grid_x = static_cast<unsigned int>((N + BLOCK_SIZE - 1) / BLOCK_SIZE);

    for (int i = 0; i < NUM_STREAMS; ++i) {
        const int64_t m_offset = i * rows_per_stream;
        if (m_offset >= M) {
            break;  // Remaining streams have no rows to process.
        }
        const int64_t current_M = std::min(rows_per_stream, M - m_offset);
        const unsigned int grid_y =
            static_cast<unsigned int>((current_M + kRowsPerBlock - 1) / kRowsPerBlock);
        const dim3 grid(grid_x, grid_y);

        matmul_kernel_streamed_balanced<<<grid, block, 0, streams.handles[i]>>>(
            A_ptr,
            B_ptr,
            C_ptr,
            static_cast<int>(M), static_cast<int>(N), static_cast<int>(K),
            lda, ldb, ldc,
            static_cast<int>(m_offset),
            transA, transB);

        const cudaError_t launch_err = cudaGetLastError();
        TORCH_CHECK(launch_err == cudaSuccess,
                    "Kernel launch failed: ", cudaGetErrorString(launch_err));
    }

    // Wait for every band so C is complete before it is handed back.
    for (cudaStream_t stream : streams.handles) {
        const cudaError_t err = cudaStreamSynchronize(stream);
        TORCH_CHECK(err == cudaSuccess,
                    "cudaStreamSynchronize failed: ", cudaGetErrorString(err));
    }

    return C;
}

// Registers matmul_cuda with the Python extension module under the name
// "forward".
PYBIND11_MODULE(TORCH_EXTENSION_NAME, module) {
    module.def(
        "forward",
        &matmul_cuda,
        "Matrix multiplication with streamed and balanced workload (CUDA)");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples
Executed Ipc Active	2.912	inst/cycle	0.000	5
Executed Ipc Elapsed	2.880	inst/cycle	0.000	5
Issue Slots Busy	72.790	%	0.003	5
Issued Ipc Active	2.912	inst/cycle	0.000	5
SM Busy	72.790	%	0.003	5
Memory Throughput	1126443256199.028	byte/second	375505697259161664.000	5
Mem Busy	90.800	%	0.004	5
Max Bandwidth	66.870	%	0.002	5
L1/TEX Hit Rate	38.750	%	0.000	5
L2 Hit Rate	98.246	%	0.946	5
Mem Pipes Busy	58.932	%	0.001	5
Warp Cycles Per Issued Instruction	12.602	cycle	0.000	5
Warp Cycles Per Executed Instruction	12.604	cycle	0.000	5
Avg. Active Threads Per Warp	32.000		0.000	5
Avg. Not Predicated Off Threads Per Warp	31.240		0.000	5
Max Active Clusters	0.000	cluster	0.000	5
Max Cluster Size	8.000	block	0.000	5
Overall GPU Occupancy	0.000	%	0.000	5
Cluster Occupancy	0.000	%	0.000	5
Block Limit SM	32.000	block	0.000	5
Block Limit Registers	5.000	block	0.000	5
Block Limit Shared Mem	10.000	block	0.000	5
Block Limit Warps	8.000	block	0.000	5
Theoretical Active Warps per SM	40.000	warp	0.000	5
Theoretical Occupancy	62.500	%	0.000	5
Achieved Occupancy	57.670	%	0.000	5
Achieved Active Warps Per SM	36.910	warp	0.000	5

Analysis Rules

Rule	Description
INF HighPipeUtilization	ALU is the highest-utilized pipeline (39.6%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
WRN Occupancy	This kernel's theoretical occupancy (62.5%) is limited by the number of required registers. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.

Operation / Metric	Value	Unit
aten::to
CPU Time	568384.03	μs
Device Time	79.90	μs
Self CPU Time	49.10	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_to_copy
CPU Time	568334.93	μs
Device Time	79.90	μs
Self CPU Time	105.90	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::empty_strided
CPU Time	567763.10	μs
Device Time	0.00	μs
Self CPU Time	88.43	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaDeviceGetStreamPriorityRange
CPU Time	567420.26	μs
Device Time	0.00	μs
Self CPU Time	567420.26	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaStreamSynchronize
CPU Time	2792430.07	μs
Device Time	24936.26	μs
Self CPU Time	2792430.07	μs
Self Device Time	24936.26	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaLaunchKernel
CPU Time	80726.75	μs
Device Time	25549.28	μs
Self CPU Time	80726.75	μs
Self Device Time	25549.28	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
matmul_kernel_streamed_balanced(float const*, float const*, float*, int, int, int, int, int, int, int, bool, bool)
CPU Time	0.00	μs
Device Time	2653268.28	μs
Self CPU Time	0.00	μs
Self Device Time	2653268.28	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::zero_
CPU Time	41827.18	μs
Device Time	283139.83	μs
Self CPU Time	7966.43	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::fill_
CPU Time	33864.45	μs
Device Time	283139.83	μs
Self CPU Time	9919.83	μs
Self Device Time	283139.83	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char, 1>)
CPU Time	0.00	μs
Device Time	283139.83	μs
Self CPU Time	0.00	μs
Self Device Time	283139.83	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45298 warnings generated when compiling for host.
Suppressed 45330 warnings (45283 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

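/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:17:63: warning: 2 adjacent parameters of 'matmul_kernel_streamed_balanced' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]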

17 | int M, int N, int K,

| ^~~~~~

18 | int lda, int ldb, int ldc,

| ~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:17:67: note: the first parameter in the range is 'K'

17 | int M, int N, int K,

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:18:53: note: the last parameter in the range is 'lda'

18 | int lda, int ldb, int ldc,

| ^~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:18:67: warning: 2 adjacent parameters of 'matmul_kernel_streamed_balanced' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]

18 | int lda, int ldb, int ldc,

| ^~~~~~~~

19 | int m_offset,

| ~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:18:71: note: the first parameter in the range is 'ldc'

18 | int lda, int ldb, int ldc,

| ^~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:19:53: note: the last parameter in the range is 'm_offset'

19 | int m_offset,

| ^~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:21:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

21 | int block_row = blockIdx.y * (BLOCK_SIZE * ELEMENTS_PER_THREAD) + m_offset;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:22:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

22 | int block_col = blockIdx.x * BLOCK_SIZE;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:23:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

23 | int thread_row = threadIdx.y;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:24:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

24 | int thread_col = threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:77:41: warning: the parameter 'A' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

77 | torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:77:58: warning: the parameter 'B' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

77 | torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:87:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

87 | int lda = A.stride(0);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:88:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

88 | int ldb = B.stride(0);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:89:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

89 | int ldc = N;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:98:27: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

98 | int rows_per_stream = (M + NUM_STREAMS - 1) / NUM_STREAMS;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:116:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

116 | M, N, K,

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:116:16: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

116 | M, N, K,

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:116:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

116 | M, N, K,

| ^

The AI CUDA Engineer 👷

`9_Tall_skinny_matrix_multiplication_` • `streamed_balanced_matmul_base`

Kernel Information

Related Kernels (Level 1, Task 9 • 9_Tall_skinny_matrix_multiplication_)

The AI CUDA Engineer 👷

9_Tall_skinny_matrix_multiplication_ • streamed_balanced_matmul_base

Kernel Information

Related Kernels (Level 1, Task 9 • 9_Tall_skinny_matrix_multiplication_)

`9_Tall_skinny_matrix_multiplication_` • `streamed_balanced_matmul_base`