Kernel Details - balanced_workload_matmul_base

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A, B):
    """
    Multiply two matrices (C = A @ B); the benchmark targets the case where
    one operand is tall and skinny (one dimension far larger than the other).

    Args:
        A (torch.Tensor): Left operand of the product.
        B (torch.Tensor): Right operand; must be shape-compatible with A
            under torch.matmul.

    Returns:
        torch.Tensor: The result of torch.matmul(A, B).
    """
    product = torch.matmul(A, B)
    return product


class Model(nn.Module):
    """
    Stateless module whose forward pass is a single matrix multiplication
    (C = A @ B), benchmarked with one tall-and-skinny operand.
    """

    def __init__(self):
        # Zero-argument super() is bound to this class object itself instead of
        # looking up the global name "Model" at call time, so construction keeps
        # working even if that name is later rebound to a different class.
        super().__init__()

    def forward(self, A, B, fn=module_fn):
        """
        Apply ``fn`` to the two operands and return its result.

        Args:
            A (torch.Tensor): Left operand.
            B (torch.Tensor): Right operand.
            fn (callable): Function invoked as ``fn(A, B)``; defaults to
                ``module_fn``.

        Returns:
            Whatever ``fn(A, B)`` returns.
        """
        return fn(A, B)


M = 16384
N = 16


def get_inputs():
    """
    Build the benchmark operands.

    Returns:
        list[torch.Tensor]: ``[A, B]`` where A has shape (M, N) and B has
        shape (N, M), both drawn from a standard normal distribution
        (A is sampled first, then B).
    """
    return [torch.randn(*shape) for shape in ((M, N), (N, M))]


def get_init_inputs():
    """Return the constructor arguments for Model; none are needed, so the list is empty."""
    init_args = []
    return init_args

import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Stateless module whose forward pass is a single matrix multiplication
    (C = A @ B), benchmarked with one tall-and-skinny operand.
    """

    def __init__(self):
        # Zero-argument super() is bound to this class object itself instead of
        # looking up the global name "Model" at call time, so construction keeps
        # working even if that name is later rebound to a different class.
        super().__init__()

    def forward(self, A, B):
        """
        Multiply the two operands.

        Args:
            A (torch.Tensor): Left operand of the product.
            B (torch.Tensor): Right operand; must be shape-compatible with A
                under torch.matmul.

        Returns:
            torch.Tensor: The result of torch.matmul(A, B).
        """
        return torch.matmul(A, B)

M = 16384
N = 16

def get_inputs():
    """
    Build the benchmark operands.

    Returns:
        list[torch.Tensor]: ``[A, B]`` where A has shape (M, N) and B has
        shape (N, M), both drawn from a standard normal distribution
        (A is sampled first, then B).
    """
    tall_operand = torch.randn((M, N))
    wide_operand = torch.randn((N, M))
    return [tall_operand, wide_operand]

def get_init_inputs():
    """Return the constructor arguments for Model; none are needed, so the list is empty."""
    return list()

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	9_Tall_skinny_matrix_multiplication_
Level ID	1
Task ID	9
Kernel Name	balanced_workload_matmul_base_base
CUDA Speedup (Native)	0.749x
CUDA Speedup (Compile)	0.565x
CUDA Runtime	0.710 ms
PyTorch Runtime (Native)	0.532 ms
PyTorch Runtime (Compile)	0.401 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0
Temperature	0.50

View Experiment Progress Details

Related Kernels (Level 1, Task 9 • 9_Tall_skinny_matrix_multiplication_)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	unrolled_loop_matmul_base	0.68	0.78	0.59
🥈	constant_mem_matmul_base_base	0.69	0.78	0.58
🥉	unrolled_matmul_kernel_base	0.69	0.77	0.58
4	balanced_workload_matmul_base_base	0.71	0.75	0.56
4	multi_tile_mapping_base	0.71	0.75	0.56
6	optimized_tiled_gemm_base	0.71	0.75	0.56
6	optimized_matmul_kernel_base	0.71	0.75	0.56
8	streamed_balanced_matmul_base	0.75	0.71	0.53
9	streamed_balanced_matmul_base	0.75	0.71	0.53
9	streamed_pipelined_matmul_base	0.75	0.71	0.53
11	predicated_tile_loading_unrolled_edit_1	1.26	0.42	0.32
11	unrolled_loop_optimization_base	1.26	0.42	0.32
11	unrolled_loop_optimization_edit_1	1.26	0.42	0.32
11	modular_device_functions_edit_1	1.26	0.42	0.32
15	uniform_flow_matmul_base	1.26	0.42	0.32
15	warp_optimized_reduction_edit_1	1.26	0.42	0.32
17	predicated_tile_loading_unrolled_base	1.26	0.42	0.32
18	modular_device_functions_base	1.26	0.42	0.32
19	warp_divergence_optimized_base_base	1.27	0.42	0.32
20	coalesced_memory_access_base_base	1.27	0.42	0.32

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <limits>
#include <stdexcept>

#define BLOCK_SIZE 16
#define ELEMENTS_PER_THREAD 4

// Fetch one element of a matrix addressed with leading dimension `ld`.
// When `transpose` is false the value at matrix[row * ld + col] is returned;
// when it is true the roles of row and col are swapped and
// matrix[col * ld + row] is returned instead.
__device__ float get_element(const float* __restrict__ matrix, int row, int col, int ld, bool transpose) {
    if (transpose) {
        return matrix[col * ld + row];
    }
    return matrix[row * ld + col];
}

// Tiled matrix multiplication C = A * B using shared-memory staging.
//
// Work decomposition:
//   * Each thread block produces a (BLOCK_SIZE * ELEMENTS_PER_THREAD) x BLOCK_SIZE
//     tile of C.
//   * Each thread accumulates ELEMENTS_PER_THREAD outputs that share one column
//     and lie BLOCK_SIZE rows apart.
//
// Parameters:
//   A, B          input matrices, read through get_element() using the leading
//                 dimensions lda / ldb and the transpose flags transA / transB.
//   C             output matrix, written as C[row * ldc + col] (the transpose
//                 flags do not apply to C).
//   M, N, K       logical sizes: C is M x N and K is the reduction length.
//   lda, ldb, ldc leading dimensions used for indexing A, B and C.
//
// The shared tiles are indexed directly by threadIdx.y / threadIdx.x with
// extent BLOCK_SIZE, so the kernel assumes a BLOCK_SIZE x BLOCK_SIZE thread block.
__global__ void matmul_kernel_balanced(const float* __restrict__ A,
                                      const float* __restrict__ B,
                                      float* __restrict__ C,
                                      int M, int N, int K,
                                      int lda, int ldb, int ldc,
                                      bool transA, bool transB) {
    // First C row / column covered by this block, and this thread's offset in it.
    int block_row = blockIdx.y * (BLOCK_SIZE * ELEMENTS_PER_THREAD);
    int block_col = blockIdx.x * BLOCK_SIZE;
    int thread_row = threadIdx.y;
    int thread_col = threadIdx.x;

    // One A tile per row group handled by a thread, plus a single B tile shared
    // by all of them.
    __shared__ float As[ELEMENTS_PER_THREAD][BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
    
    // Per-thread accumulators, one per output row handled by this thread.
    float C_values[ELEMENTS_PER_THREAD] = {0.0f};

    // Walk the K dimension in steps of BLOCK_SIZE (the last step may be partial).
    for (int t = 0; t < (K + BLOCK_SIZE - 1) / BLOCK_SIZE; ++t) {
        // Load B tile into shared memory; positions outside the K x N extent are
        // zero-filled so they add nothing to the dot products.
        if (t * BLOCK_SIZE + thread_row < K && block_col + thread_col < N) {
            Bs[thread_row][thread_col] = get_element(B, 
                t * BLOCK_SIZE + thread_row,
                block_col + thread_col,
                ldb, transB);
        } else {
            Bs[thread_row][thread_col] = 0.0f;
        }

        // Load multiple A tiles into shared memory (one per row group), with the
        // same zero-fill rule for positions outside the M x K extent.
        #pragma unroll
        for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
            int row = block_row + e * BLOCK_SIZE + thread_row;
            if (row < M && t * BLOCK_SIZE + thread_col < K) {
                As[e][thread_row][thread_col] = get_element(A,
                    row,
                    t * BLOCK_SIZE + thread_col,
                    lda, transA);
            } else {
                As[e][thread_row][thread_col] = 0.0f;
            }
        }

        // Barrier: every tile element must be written before any thread reads it.
        __syncthreads();

        // Accumulate this K-step's partial dot product for each of the thread's
        // output rows.
        #pragma unroll
        for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
            #pragma unroll
            for (int k = 0; k < BLOCK_SIZE; ++k) {
                C_values[e] += As[e][thread_row][k] * Bs[k][thread_col];
            }
        }

        // Barrier: all reads of the tiles must finish before the next iteration
        // overwrites them.
        __syncthreads();
    }

    // Store results, skipping rows / columns that fall outside the M x N output.
    #pragma unroll
    for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
        int row = block_row + e * BLOCK_SIZE + thread_row;
        int col = block_col + thread_col;
        if (row < M && col < N) {
            C[row * ldc + col] = C_values[e];
        }
    }
}

// Host entry point: computes C = A * B on the GPU with matmul_kernel_balanced.
//
// Parameters:
//   A  2-D float32 CUDA tensor of shape (M, K).
//   B  2-D float32 CUDA tensor of shape (K, N), on the same device as A.
//
// Returns:
//   A newly allocated (M, N) float32 tensor with A's options.
//
// Throws:
//   std::invalid_argument if either input is not a CUDA tensor (unchanged from
//   the original behaviour); c10::Error (via TORCH_CHECK) for rank, dtype,
//   device or shape mismatches, for problem sizes whose element offsets do not
//   fit the kernel's int indexing, and for kernel launch failures.
//
// NOTE(review): the kernel is launched on the default stream of the current
// device; callers using a non-default stream or device should confirm that this
// is what they want.
torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
    if (!A.is_cuda() || !B.is_cuda()) {
        throw std::invalid_argument("Input tensors must be on CUDA devices");
    }
    TORCH_CHECK(A.dim() == 2 && B.dim() == 2,
                "matmul_cuda expects 2-D tensors, got A.dim()=", A.dim(),
                " and B.dim()=", B.dim());
    TORCH_CHECK(A.scalar_type() == torch::kFloat32 && B.scalar_type() == torch::kFloat32,
                "matmul_cuda expects float32 tensors");
    TORCH_CHECK(A.get_device() == B.get_device(),
                "matmul_cuda expects both tensors on the same CUDA device");
    TORCH_CHECK(A.size(1) == B.size(0),
                "Inner dimensions must match: A is (", A.size(0), ", ", A.size(1),
                ") but B is (", B.size(0), ", ", B.size(1), ")");

    // The kernel addresses elements as base[row * ld + col], i.e. it assumes a
    // column stride of 1. Materialise row-major copies so that transposed or
    // sliced views are not silently misread.
    A = A.contiguous();
    B = B.contiguous();

    const int64_t M = A.size(0);
    const int64_t K = A.size(1);
    const int64_t N = B.size(1);

    // The kernel computes element offsets in 32-bit int arithmetic, so every
    // matrix must have at most INT_MAX elements.
    constexpr int64_t kIntMax = std::numeric_limits<int>::max();
    TORCH_CHECK(M * K <= kIntMax && K * N <= kIntMax && M * N <= kIntMax,
                "matmul_cuda: problem size too large for 32-bit indexing");

    auto C = torch::empty({M, N}, A.options());
    if (M == 0 || N == 0) {
        // Nothing to compute, and a zero-sized grid is an invalid launch.
        return C;
    }

    const bool transA = false;
    const bool transB = false;
    // Leading dimensions of the contiguous row-major operands.
    const int lda = static_cast<int>(K);
    const int ldb = static_cast<int>(N);
    const int ldc = static_cast<int>(N);

    // Each block covers BLOCK_SIZE columns and BLOCK_SIZE * ELEMENTS_PER_THREAD rows.
    constexpr int64_t kColsPerBlock = BLOCK_SIZE;
    constexpr int64_t kRowsPerBlock = static_cast<int64_t>(BLOCK_SIZE) * ELEMENTS_PER_THREAD;
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(static_cast<unsigned int>((N + kColsPerBlock - 1) / kColsPerBlock),
                    static_cast<unsigned int>((M + kRowsPerBlock - 1) / kRowsPerBlock));

    matmul_kernel_balanced<<<grid, block>>>(
        A.data_ptr<float>(),
        B.data_ptr<float>(),
        C.data_ptr<float>(),
        static_cast<int>(M), static_cast<int>(N), static_cast<int>(K),
        lda, ldb, ldc,
        transA, transB);

    // Surface launch-configuration errors (e.g. a grid dimension above the
    // device limit) instead of returning an uninitialised tensor.
    const cudaError_t launch_status = cudaGetLastError();
    TORCH_CHECK(launch_status == cudaSuccess,
                "matmul_kernel_balanced launch failed: ", cudaGetErrorString(launch_status));

    return C;
}

// Python bindings: registers matmul_cuda on the extension module under the
// name "forward", with the given docstring.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &matmul_cuda, "Matrix multiplication with balanced workload (CUDA)");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples
Executed Ipc Active	2.950	inst/cycle	0.000	5
Executed Ipc Elapsed	2.942	inst/cycle	0.000	5
Issue Slots Busy	73.838	%	0.001	5
Issued Ipc Active	2.950	inst/cycle	0.000	5
SM Busy	73.838	%	0.001	5
Memory Throughput	1228360470275.820	byte/second	785553927735018368.000	5
Mem Busy	93.280	%	0.002	5
Max Bandwidth	68.650	%	0.001	5
L1/TEX Hit Rate	38.740	%	0.000	5
L2 Hit Rate	99.092	%	0.001	5
Mem Pipes Busy	60.500	%	0.001	5
Warp Cycles Per Issued Instruction	12.446	cycle	0.000	5
Warp Cycles Per Executed Instruction	12.448	cycle	0.000	5
Avg. Active Threads Per Warp	32.000		0.000	5
Avg. Not Predicated Off Threads Per Warp	31.240		0.000	5
Max Active Clusters	0.000	cluster	0.000	5
Max Cluster Size	8.000	block	0.000	5
Overall GPU Occupancy	0.000	%	0.000	5
Cluster Occupancy	0.000	%	0.000	5
Block Limit SM	32.000	block	0.000	5
Block Limit Registers	5.000	block	0.000	5
Block Limit Shared Mem	10.000	block	0.000	5
Block Limit Warps	8.000	block	0.000	5
Theoretical Active Warps per SM	40.000	warp	0.000	5
Theoretical Occupancy	62.500	%	0.000	5
Achieved Occupancy	57.724	%	0.000	5
Achieved Active Warps Per SM	36.942	warp	0.000	5

Analysis Rules

Rule	Description
INF HighPipeUtilization	ALU is the highest-utilized pipeline (40.2%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
WRN Occupancy	This kernel's theoretical occupancy (62.5%) is limited by the number of required registers. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.

Operation / Metric	Value	Unit
aten::to
CPU Time	313719.20	μs
Device Time	80.29	μs
Self CPU Time	47.97	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_to_copy
CPU Time	313671.23	μs
Device Time	80.29	μs
Self CPU Time	100.62	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaLaunchKernel
CPU Time	2455350.20	μs
Device Time	8752.42	μs
Self CPU Time	2455350.20	μs
Self Device Time	8752.42	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
matmul_kernel_balanced(float const*, float const*, float*, int, int, int, int, int, int, bool, bool)
CPU Time	0.00	μs
Device Time	2469378.01	μs
Self CPU Time	0.00	μs
Self Device Time	2469378.01	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaEventRecord
CPU Time	9557.22	μs
Device Time	17347.94	μs
Self CPU Time	9557.22	μs
Self Device Time	17347.94	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::zero_
CPU Time	2267199.02	μs
Device Time	267797.53	μs
Self CPU Time	7177.32	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::fill_
CPU Time	2260023.41	μs
Device Time	267797.53	μs
Self CPU Time	9202.96	μs
Self Device Time	267797.53	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char, 1>)
CPU Time	0.00	μs
Device Time	267797.53	μs
Self CPU Time	0.00	μs
Self Device Time	267797.53	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45290 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

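/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:15:53: warning: 2 adjacent parameters of 'matmul_kernel_balanced' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]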

15 | int M, int N, int K,

| ^~~~~~

16 | int lda, int ldb, int ldc,

| ~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:15:57: note: the first parameter in the range is 'K'

15 | int M, int N, int K,

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:16:43: note: the last parameter in the range is 'lda'

16 | int lda, int ldb, int ldc,

| ^~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:19:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

19 | int block_row = blockIdx.y * (BLOCK_SIZE * ELEMENTS_PER_THREAD);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:20:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

20 | int block_col = blockIdx.x * BLOCK_SIZE;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:21:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

21 | int thread_row = threadIdx.y;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:22:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

22 | int thread_col = threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:80:41: warning: the parameter 'A' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

80 | torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:80:58: warning: the parameter 'B' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

80 | torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:90:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

90 | int lda = A.stride(0), ldb = B.stride(0), ldc = N;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:90:34: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

90 | int lda = A.stride(0), ldb = B.stride(0), ldc = N;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:90:53: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

90 | int lda = A.stride(0), ldb = B.stride(0), ldc = N;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:97:24: warning: performing an implicit widening conversion to type 'int64_t' (aka 'long') of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]

97 | (M + (BLOCK_SIZE * ELEMENTS_PER_THREAD) - 1) / (BLOCK_SIZE * ELEMENTS_PER_THREAD));

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:5:20: note: expanded from macro 'BLOCK_SIZE'

5 | #define BLOCK_SIZE 16

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:97:24: note: make conversion explicit to silence this warning

97 | (M + (BLOCK_SIZE * ELEMENTS_PER_THREAD) - 1) / (BLOCK_SIZE * ELEMENTS_PER_THREAD));

| ^

| static_cast<int64_t>( )

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:5:20: note: expanded from macro 'BLOCK_SIZE'

5 | #define BLOCK_SIZE 16

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:97:24: note: perform multiplication in a wider type

97 | (M + (BLOCK_SIZE * ELEMENTS_PER_THREAD) - 1) / (BLOCK_SIZE * ELEMENTS_PER_THREAD));

| ^

| static_cast<int64_t>( )

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:5:20: note: expanded from macro 'BLOCK_SIZE'

5 | #define BLOCK_SIZE 16

| ^~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:97:66: warning: performing an implicit widening conversion to type 'int64_t' (aka 'long') of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]

97 | (M + (BLOCK_SIZE * ELEMENTS_PER_THREAD) - 1) / (BLOCK_SIZE * ELEMENTS_PER_THREAD));

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:5:20: note: expanded from macro 'BLOCK_SIZE'

5 | #define BLOCK_SIZE 16

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:97:66: note: make conversion explicit to silence this warning

97 | (M + (BLOCK_SIZE * ELEMENTS_PER_THREAD) - 1) / (BLOCK_SIZE * ELEMENTS_PER_THREAD));

| ^

| static_cast<int64_t>( )

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:5:20: note: expanded from macro 'BLOCK_SIZE'

5 | #define BLOCK_SIZE 16

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:97:66: note: perform multiplication in a wider type

97 | (M + (BLOCK_SIZE * ELEMENTS_PER_THREAD) - 1) / (BLOCK_SIZE * ELEMENTS_PER_THREAD));

| ^

| static_cast<int64_t>( )

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:5:20: note: expanded from macro 'BLOCK_SIZE'

5 | #define BLOCK_SIZE 16

| ^~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:103:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

103 | M, N, K,

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:103:12: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

103 | M, N, K,

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b7_s3_balanced_workload_matmul_base/base/base.cu:103:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

103 | M, N, K,

| ^

The AI CUDA Engineer 👷

`9_Tall_skinny_matrix_multiplication_` • `balanced_workload_matmul_base_base`

Kernel Information

Related Kernels (Level 1, Task 9 • 9_Tall_skinny_matrix_multiplication_)

The AI CUDA Engineer 👷

9_Tall_skinny_matrix_multiplication_ • balanced_workload_matmul_base_base

Kernel Information

Related Kernels (Level 1, Task 9 • 9_Tall_skinny_matrix_multiplication_)

`9_Tall_skinny_matrix_multiplication_` • `balanced_workload_matmul_base_base`