Kernel Details - matmul_transpose_modular

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """
    Multiply the transposes of two matrices: C = A.T @ B.T.

    Args:
        A: Left operand stored transposed, shape (K, M).
        B: Right operand stored transposed, shape (N, K).

    Returns:
        The product of A.T (M, K) and B.T (K, N), shape (M, N).
    """
    lhs = A.T
    rhs = B.T
    return lhs @ rhs


class Model(nn.Module):
    """
    Thin module wrapper: the forward pass hands both operands to a functional
    implementation (``module_fn`` unless the caller supplies another one).
    """

    def __init__(self):
        super().__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:
        # The module holds no parameters or state; all work happens in ``fn``.
        result = fn(A, B)
        return result


# Problem dimensions used by get_inputs: A is (K, M) and B is (N, K).
M = 1024
K = 4096
N = 2048


def get_inputs():
    """Return ``[A, B]``: fresh standard-normal tensors of shape (K, M) and (N, K)."""
    # A is drawn before B, so a seeded generator yields the same pair every time.
    shapes = ((K, M), (N, K))
    return [torch.randn(*shape) for shape in shapes]


def get_init_inputs():
    """Return an empty list: no special initialization inputs are needed."""
    init_args = []
    return init_args

import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that multiplies the transposes of its two inputs (C = A.T @ B.T).
    """

    def __init__(self):
        super().__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
        """
        Computes the matrix product of the transposed inputs.

        Args:
            A: Input tensor of shape (K, M); it is transposed to (M, K) before use.
            B: Input tensor of shape (N, K); it is transposed to (K, N) before use.

        Returns:
            Output tensor of shape (M, N), equal to A.T @ B.T.
        """
        return torch.matmul(A.T, B.T)

# Problem dimensions used by get_inputs: A is (K, M) and B is (N, K).
M = 1024
K = 4096
N = 2048

def get_inputs():
    """Return ``[A, B]``: fresh standard-normal tensors of shape (K, M) and (N, K)."""
    # Sampling order (A, then B) is kept so seeded runs reproduce the same pair.
    first = torch.randn(K, M)
    second = torch.randn(N, K)
    return [first, second]

def get_init_inputs():
    """Return an empty list: no special initialization inputs are needed."""
    return list()

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	18_Matmul_with_transposed_both
Level ID	1
Task ID	18
Kernel Name	matmul_transpose_modular_base
CUDA Speedup (Native)	0.163x
CUDA Speedup (Compile)	0.185x
CUDA Runtime	2.233 ms
PyTorch Runtime (Native)	0.365 ms
PyTorch Runtime (Compile)	0.413 ms
Correct	True
Max Diff (vs. Reference)	0.001000
Model	o3-mini-2025-01-31
Temperature	1.00

View Experiment Progress Details

Related Kernels (Level 1, Task 18 • 18_Matmul_with_transposed_both)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	optimized_matmul_transpose_base	1.87	0.19	0.22
🥈	matmul_transpose_ldg_128bit_align_base	1.90	0.19	0.22
🥉	matmul_transpose_ldg_optimization_base	1.91	0.19	0.22
🥉	optimized_matmul_transpose_base	1.91	0.19	0.22
5	optimized_matmul_transpose_base	1.91	0.19	0.22
6	stride_loop_matmul_transpose_base	1.91	0.19	0.22
6	warp_divergence_optimized_matmul_base	1.91	0.19	0.22
8	optimized_matmul_transpose_base	1.91	0.19	0.22
9	18_matmul_transposed_block32_base	1.96	0.19	0.21
10	combined_matmul_transpose_base	1.97	0.19	0.21
11	optimized_matmul_transpose_base	1.99	0.18	0.21
11	reduced_sync_matmul_transposed_edit_1	1.99	0.18	0.21
13	reduced_sync_matmul_transposed_base	1.99	0.18	0.21
13	optimized_matmul_transpose_edit_1	1.99	0.18	0.21
15	combined_matmul_transpose_edit_1	2.04	0.18	0.20
16	18_Matmul_transposed_coalesced_base	2.15	0.17	0.19
17	modular_matmul_transposed_base	2.15	0.17	0.19
18	18_Matmul_transposed_coalesced_edit_1	2.16	0.17	0.19
19	matmul_transpose_modular_base	2.23	0.16	0.18
20	modular_vectorized_matmul_transposed_base_base	2.24	0.16	0.18

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Edge length of the square tiles. The kernel's shared-memory tiles are
// TILE_SIZE x TILE_SIZE, and the host launcher uses the same value for both
// thread-block dimensions and for sizing the grid.
#define TILE_SIZE 16

// Stages one element of the current A tile into the caller's tile buffer.
// A is laid out as (K x M), so element (k, m) is read from A[k * M + m].
// `row` is the m index handled by this thread and `ty` selects the k offset
// inside tile `tile_idx`. Slots that fall outside the matrix are zero-filled
// so they contribute nothing to the dot product.
template <typename scalar_t>
__device__ inline void load_A_tile(const scalar_t* __restrict__ A,
                                    scalar_t A_tile[TILE_SIZE][TILE_SIZE],
                                    int tile_idx, int row, int M, int K,
                                    int tx, int ty) {
    const int k = tile_idx * TILE_SIZE + ty;
    const bool inside = (row < M) && (k < K);
    A_tile[ty][tx] = inside ? A[k * M + row] : static_cast<scalar_t>(0);
}

// Stages one element of the current B tile into the caller's tile buffer.
// B is laid out as (N x K), so element (n, k) is read from B[n * K + k].
// `col` is the n index handled by this thread (a column of the output C) and
// `tx` selects the k offset inside tile `tile_idx`. Slots that fall outside
// the matrix are zero-filled so they contribute nothing to the dot product.
template <typename scalar_t>
__device__ inline void load_B_tile(const scalar_t* __restrict__ B,
                                    scalar_t B_tile[TILE_SIZE][TILE_SIZE],
                                    int tile_idx, int col, int N, int K,
                                    int tx, int ty) {
    const int k = tile_idx * TILE_SIZE + tx;
    const bool inside = (col < N) && (k < K);
    B_tile[ty][tx] = inside ? B[col * K + k] : static_cast<scalar_t>(0);
}

// Accumulates this thread's contribution from one pair of staged tiles:
// the sum over the first `tile_bound` k offsets of A_tile[k][tx] * B_tile[ty][k].
// The caller adds the returned partial sum to its running total.
template <typename scalar_t>
__device__ inline scalar_t compute_tile_dot(const scalar_t A_tile[TILE_SIZE][TILE_SIZE],
                                               const scalar_t B_tile[TILE_SIZE][TILE_SIZE],
                                               int tx, int ty, int tile_bound) {
    scalar_t partial = 0;
    // Walk k in ascending order so the accumulation order stays fixed.
    for (int kk = 0; kk < tile_bound; ++kk) {
        partial += A_tile[kk][tx] * B_tile[ty][kk];
    }
    return partial;
}

// Tiled kernel in which every thread produces one element of C:
//   C[m, n] = sum over k of A[k, m] * B[n, k]
// i.e. C = A^T * B^T with A given as (K x M) and B given as (N x K).
template <typename scalar_t>
__global__ void matmul_transpose_modular_kernel(
    const scalar_t* __restrict__ A,   // (K x M)
    const scalar_t* __restrict__ B,   // (N x K)
    scalar_t* __restrict__ C,         // (M x N)
    int M, int N, int K) {

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // threadIdx.x walks the m axis (rows of C), threadIdx.y the n axis (columns of C).
    const int m = blockIdx.x * TILE_SIZE + tx;
    const int n = blockIdx.y * TILE_SIZE + ty;

    // Per-block staging buffers for the current k-slice of A and B.
    __shared__ scalar_t A_tile[TILE_SIZE][TILE_SIZE];
    __shared__ scalar_t B_tile[TILE_SIZE][TILE_SIZE];

    scalar_t result = 0;

    // March along k one tile at a time (rounding the tile count up).
    const int tile_count = (K + TILE_SIZE - 1) / TILE_SIZE;
    for (int tile = 0; tile < tile_count; ++tile) {
        load_A_tile<scalar_t>(A, A_tile, tile, m, M, K, tx, ty);
        load_B_tile<scalar_t>(B, B_tile, tile, n, N, K, tx, ty);

        // Every load must land before any thread reads the tiles.
        __syncthreads();

        // Only the final tile can be partial; clamp its k range to what is left of K.
        const int remaining = K - tile * TILE_SIZE;
        const bool is_partial_last = (tile == tile_count - 1) && (remaining < TILE_SIZE);
        const int k_count = is_partial_last ? remaining : TILE_SIZE;

        result += compute_tile_dot<scalar_t>(A_tile, B_tile, tx, ty, k_count);

        // Keep the tiles intact until every thread has finished reading them.
        __syncthreads();
    }

    // Threads mapped past the edge of C took part in the loads but store nothing.
    if (m >= M || n >= N) {
        return;
    }
    C[m * N + n] = result;
}

// PyTorch binding
//
// Host launcher for matmul_transpose_modular_kernel.
//
// Arguments:
//   A: 2D CUDA tensor of shape (K x M).
//   B: 2D CUDA tensor of shape (N x K), same dtype and device as A.
// Returns:
//   A new tensor C of shape (M x N) with C = A^T * B^T.
// Raises (via TORCH_CHECK):
//   if the inputs are not CUDA tensors, are not 2D, differ in dtype or device,
//   disagree on K, are too large for the kernel's 32-bit indexing, or if the
//   kernel launch itself is rejected by the CUDA runtime.
torch::Tensor matmul_transpose_cuda(torch::Tensor A, torch::Tensor B) {
    TORCH_CHECK(A.is_cuda() && B.is_cuda(), "A and B must be CUDA tensors");
    TORCH_CHECK(A.dim() == 2 && B.dim() == 2, "A and B must be 2D tensors");
    TORCH_CHECK(A.scalar_type() == B.scalar_type(), "A and B must have the same dtype");
    TORCH_CHECK(A.device() == B.device(), "A and B must be on the same device");
    TORCH_CHECK(A.size(0) == B.size(1),
                "Inner dimensions must match: A is (K x M) and B is (N x K), got K = ",
                A.size(0), " for A and K = ", B.size(1), " for B");

    // The kernel indexes the raw pointers as dense row-major storage, so any
    // strided view must be materialized first (a no-op for contiguous inputs).
    A = A.contiguous();
    B = B.contiguous();

    // Dimensions:
    // A: (K x M), B: (N x K) => C: (M x N)
    const int64_t K64 = A.size(0);
    const int64_t M64 = A.size(1);
    const int64_t N64 = B.size(0);

    // The kernel computes element offsets with 32-bit ints; refuse sizes whose
    // offsets would overflow instead of silently reading or writing out of bounds.
    constexpr int64_t kMaxInt = 2147483647;  // largest value of a 32-bit signed int
    TORCH_CHECK(K64 * M64 <= kMaxInt && N64 * K64 <= kMaxInt && M64 * N64 <= kMaxInt,
                "Tensors are too large for the 32-bit indexing used by this kernel");

    const int K = static_cast<int>(K64);
    const int M = static_cast<int>(M64);
    const int N = static_cast<int>(N64);

    auto C = torch::empty({M, N}, A.options());

    // An empty output would need a zero-sized grid, which is an invalid launch
    // configuration; there is nothing to compute, so return right away.
    if (M == 0 || N == 0) {
        return C;
    }

    dim3 threads(TILE_SIZE, TILE_SIZE);
    dim3 blocks((M + TILE_SIZE - 1) / TILE_SIZE,
                (N + TILE_SIZE - 1) / TILE_SIZE);

    AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "matmul_transpose_modular_kernel", ([&] {
        matmul_transpose_modular_kernel<scalar_t><<<blocks, threads>>>(
            A.data_ptr<scalar_t>(),
            B.data_ptr<scalar_t>(),
            C.data_ptr<scalar_t>(),
            M, N, K);
    }));

    // Kernel launches do not report failures on their own; surface a rejected
    // launch (e.g. a grid dimension beyond the device limit) as an exception.
    const cudaError_t launch_status = cudaGetLastError();
    TORCH_CHECK(launch_status == cudaSuccess,
                "matmul_transpose_modular_kernel launch failed: ",
                cudaGetErrorString(launch_status));

    return C;
}

// Python module definition: registers matmul_transpose_cuda under the name
// `forward` on the extension module named by TORCH_EXTENSION_NAME.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &matmul_transpose_cuda, "Matrix multiplication with transposed inputs using modular device functions (CUDA)");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples
Executed Ipc Active	2.930	inst/cycle	0.000	5
Executed Ipc Elapsed	2.900	inst/cycle	0.000	5
Issue Slots Busy	73.164	%	0.001	5
Issued Ipc Active	2.930	inst/cycle	0.000	5
SM Busy	73.164	%	0.001	5
Memory Throughput	21648082665.942	byte/second	1792405861919605.250	5
Mem Busy	88.926	%	0.005	5
Max Bandwidth	81.116	%	0.004	5
L1/TEX Hit Rate	1.156	%	0.003	5
L2 Hit Rate	97.278	%	0.110	5
Mem Pipes Busy	81.116	%	0.004	5
Warp Cycles Per Issued Instruction	20.868	cycle	0.000	5
Warp Cycles Per Executed Instruction	20.868	cycle	0.000	5
Avg. Active Threads Per Warp	32.000		0.000	5
Avg. Not Predicated Off Threads Per Warp	30.400		0.000	5
Max Active Clusters	0.000	cluster	0.000	5
Max Cluster Size	8.000	block	0.000	5
Overall GPU Occupancy	0.000	%	0.000	5
Cluster Occupancy	0.000	%	0.000	5
Block Limit SM	32.000	block	0.000	5
Block Limit Registers	8.000	block	0.000	5
Block Limit Shared Mem	21.000	block	0.000	5
Block Limit Warps	8.000	block	0.000	5
Theoretical Active Warps per SM	64.000	warp	0.000	5
Theoretical Occupancy	100.000	%	0.000	5
Achieved Occupancy	95.440	%	0.001	5
Achieved Active Warps Per SM	61.082	warp	0.000	5

Analysis Rules

Rule	Description
INF HighPipeUtilization	ALU is the highest-utilized pipeline (39.5%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF Occupancy	This kernel's theoretical occupancy is not impacted by any block limit.

Operation / Metric	Value	Unit
aten::to
CPU Time	566812.04	μs
Device Time	5273.58	μs
Self CPU Time	42.78	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaLaunchKernel
CPU Time	5461393.68	μs
Device Time	6595.94	μs
Self CPU Time	5461393.68	μs
Self Device Time	6595.94	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void matmul_transpose_modular_kernel<float>(float const*, float const*, float*, int, int, int)
CPU Time	0.00	μs
Device Time	5924897.70	μs
Self CPU Time	0.00	μs
Self Device Time	5924897.70	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaDeviceSynchronize
CPU Time	606099.47	μs
Device Time	77.86	μs
Self CPU Time	606099.47	μs
Self Device Time	77.86	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaEventRecord
CPU Time	9580.54	μs
Device Time	13115.57	μs
Self CPU Time	9580.54	μs
Self Device Time	13115.57	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::zero_
CPU Time	5326245.68	μs
Device Time	205712.53	μs
Self CPU Time	5714.05	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::fill_
CPU Time	5320533.45	μs
Device Time	205712.53	μs
Self CPU Time	7875.57	μs
Self Device Time	205712.53	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char, 1>)
CPU Time	0.00	μs
Device Time	205712.53	μs
Self CPU Time	0.00	μs
Self Device Time	205712.53	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45288 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_18/b6_s1_matmul_transpose_modular/base/base.cu:13:37 bugprone-easily-swappable-parameters