
The AI CUDA Engineer 👷

16_Matmul_with_transposed_A • optimized_tiled_matmul_base

Level 1 • Task 16
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """
    Performs a single matrix multiplication with transposed A (C = A.T * B).

    Args:
        A: Input tensor of shape (K, M).
        B: Input tensor of shape (K, N).

    Returns:
        Output tensor of shape (M, N).
    """
    return torch.matmul(A.T, B)


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication with transposed A (C = A.T * B)
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:
        return fn(A, B)


M = 1024
K = 4096
N = 2048


def get_inputs():
    A = torch.randn(K, M)
    B = torch.randn(K, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication with transposed A (C = A.T * B)
    """
    def __init__(self):
        super(Model, self).__init__()
    
    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
        """
        Performs matrix multiplication.

        Args:
            A: Input tensor of shape (M, K).
            B: Input tensor of shape (K, N).

        Returns:
            Output tensor of shape (M, N).
        """
        return torch.matmul(A.T, B)

M = 1024
K = 4096
N = 2048

def get_inputs():
    A = torch.randn(K, M)
    B = torch.randn(K, N)
    return [A, B]

def get_init_inputs():
    return []  # No special initialization inputs needed
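
To make the index convention above concrete: with A of shape (K, M) and B of shape (K, N), the task computes C[m, n] = sum_k A[k, m] * B[k, n]. The snippet below is a minimal sanity check of that contraction against the reference operation; it is not part of the original benchmark code, and the sizes and tolerance are illustrative.

import torch

def check_reference(K: int = 64, M: int = 32, N: int = 48) -> None:
    A = torch.randn(K, M)
    B = torch.randn(K, N)
    C = torch.matmul(A.T, B)                 # what module_fn / Model.forward compute
    C_ref = torch.einsum("km,kn->mn", A, B)  # explicit C[m, n] = sum_k A[k, m] * B[k, n]
    assert C.shape == (M, N)
    assert torch.allclose(C, C_ref, atol=1e-4)

check_reference()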

Kernel Information

Related Kernels (Level 1, Task 16 • 16_Matmul_with_transposed_A)

Rank  Kernel Name  Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇 tiled_double_output_base 2.29 0.15 0.17
🥈 pipelined_tiled_matmul_base_base 2.70 0.13 0.15
🥉 hybrid_tiled_linear_matmul_base 2.76 0.13 0.14
4 modular_tiled_matmul_base_base 2.77 0.13 0.14
5 unrolled_tiled_matmul_base_base 2.81 0.13 0.14
6 optimized_tiled_matmul_base 2.81 0.13 0.14
6 tiled_shared_ldg_aligned_base 2.81 0.13 0.14
6 optimized_tiled_matmul_base 2.81 0.13 0.14
6 hybrid_tiling_grid_stride_base 2.81 0.13 0.14
10 syncthreads_optimized_tiling_edit_1 3.00 0.12 0.13
10 atomic_operations_optimized_tiling_base 3.00 0.12 0.13
12 streams_partitioned_matmul_edit_1 3.02 0.12 0.13
13 tiled_shared_unroll_base_base 3.02 0.12 0.13
14 streams_partitioned_matmul_base 3.03 0.12 0.13
15 modular_device_functions_tiling_2_base 3.04 0.12 0.13
15 modular_tiled_kernel_edit_1 3.04 0.12 0.13
15 modular_tiled_kernel_base 3.04 0.12 0.13
18 optimized_matmul_combined_kernel_edit_1 3.04 0.12 0.13
18 tiled_shared_const_memory_base 3.04 0.12 0.13
18 optimized_matmul_combined_kernel_base 3.04 0.12 0.13
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdexcept>

// Tile dimensions and chunk size
#define TILE_M 16
#define TILE_N 16
#define BLOCK_K 32

// This CUDA kernel computes C = A.T * B where:
//   A: shape (K, M) [read-only]
//   B: shape (K, N) [read-only]
//   C: shape (M, N) with C[i,j] = sum_{k=0}^{K-1} A[k*M + i] * B[k*N + j]
// It uses shared-memory tiling and loop unrolling, and routes global memory loads
// through the read-only data cache via __ldg(); out-of-bounds tile elements are
// zero-filled so partial tiles at the matrix edges need no special-case code.
__global__ void optimizedTiledMatMulKernel(const float* __restrict__ A,
                                            const float* __restrict__ B,
                                            float* __restrict__ C,
                                            int K, int M, int N) {
    // Compute global output indices
    int row = blockIdx.x * TILE_M + threadIdx.x;   // Corresponds to output row (and A column index)
    int col = blockIdx.y * TILE_N + threadIdx.y;   // Corresponds to output column of C

    float sum = 0.0f;

    // Allocate shared memory for current tile from A and B
    __shared__ float As[BLOCK_K][TILE_M];
    __shared__ float Bs[BLOCK_K][TILE_N];

    // Unique thread index in the block to coordinate global loads
    int tid = threadIdx.y * blockDim.x + threadIdx.x;
    int totalThreads = blockDim.x * blockDim.y;  // Should equal TILE_M * TILE_N

    // Loop over the K dimension in chunks
    for (int k0 = 0; k0 < K; k0 += BLOCK_K) {
        // Load a tile of A into shared memory
        for (int index = tid; index < BLOCK_K * TILE_M; index += totalThreads) {
            int t = index / TILE_M;    // local index in K tile
            int m = index % TILE_M;      // local index in M dimension
            int global_k = k0 + t;
            int global_m = blockIdx.x * TILE_M + m;
            As[t][m] = (global_k < K && global_m < M) ? __ldg(&A[global_k * M + global_m]) : 0.0f;
        }

        // Load a tile of B into shared memory
        for (int index = tid; index < BLOCK_K * TILE_N; index += totalThreads) {
            int t = index / TILE_N;    // local index in K tile
            int n = index % TILE_N;      // local index in N dimension
            int global_k = k0 + t;
            int global_n = blockIdx.y * TILE_N + n;
            Bs[t][n] = (global_k < K && global_n < N) ? __ldg(&B[global_k * N + global_n]) : 0.0f;
        }

        __syncthreads(); // Ensure entire tile is loaded

        // Compute partial dot product for this tile with loop unrolling
        #pragma unroll
        for (int t = 0; t < BLOCK_K; t++) {
            sum += As[t][threadIdx.x] * Bs[t][threadIdx.y];
        }

        __syncthreads(); // Prepare for next tile load
    }

    // Write the computed value to output if within bounds
    if (row < M && col < N) {
        C[row * N + col] = sum;
    }
}

// Forward function exposed to PyTorch via PyBind11
// A: Tensor with shape (K, M) [CUDA, float32]
// B: Tensor with shape (K, N) [CUDA, float32]
// Returns: C with shape (M, N) computed as C = A.T * B
torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
    TORCH_CHECK(A.is_cuda(), "Input A must be a CUDA tensor");
    TORCH_CHECK(B.is_cuda(), "Input B must be a CUDA tensor");
    TORCH_CHECK(A.dtype() == torch::kFloat32, "Input A must be float32");
    TORCH_CHECK(B.dtype() == torch::kFloat32, "Input B must be float32");

    int K = A.size(0);
    int M = A.size(1);
    TORCH_CHECK(B.size(0) == K, "Dimension mismatch: A and B must have the same first dimension (K)");
    int N = B.size(1);

    auto C = torch::zeros({M, N}, torch::device(A.device()).dtype(A.dtype()));

    // Define block and grid dimensions based on tile sizes
    dim3 block(TILE_M, TILE_N);
    dim3 grid((M + TILE_M - 1) / TILE_M, (N + TILE_N - 1) / TILE_N);

    const float* A_ptr = A.data_ptr<float>();
    const float* B_ptr = B.data_ptr<float>();
    float* C_ptr = C.data_ptr<float>();

    optimizedTiledMatMulKernel<<<grid, block>>>(A_ptr, B_ptr, C_ptr, K, M, N);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(err));
    }

    return C;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Compute C = A.T * B using optimized tiled shared memory (CUDA)");
}
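
One way to exercise the extension above from Python is to JIT-compile it with torch.utils.cpp_extension.load and compare its output against the PyTorch reference. This is a sketch, not part of the original report: the filename optimized_tiled_matmul.cu is hypothetical (save the listing above under any name), and it assumes a CUDA-capable GPU plus a PyTorch install with the CUDA build toolchain available.

import torch
from torch.utils.cpp_extension import load

# Hypothetical filename for the CUDA listing above.
ext = load(name="optimized_tiled_matmul", sources=["optimized_tiled_matmul.cu"])

K, M, N = 4096, 1024, 2048
A = torch.randn(K, M, device="cuda", dtype=torch.float32)
B = torch.randn(K, N, device="cuda", dtype=torch.float32)

C_kernel = ext.forward(A, B)   # pybind-exposed forward() from the extension
C_ref = torch.matmul(A.T, B)   # PyTorch reference for C = A.T * B
# Small differences are expected from floating-point accumulation order.
print(torch.max(torch.abs(C_kernel - C_ref)).item())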
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 2.260 inst/cycle 0.000 5
Executed Ipc Elapsed 2.240 inst/cycle 0.000 5
Issue Slots Busy 56.596 % 0.000 5
Issued Ipc Active 2.260 inst/cycle 0.000 5
SM Busy 56.596 % 0.000 5
Memory Throughput 17413933503.286 byte/second 7708177575424083.000 5
Mem Busy 88.988 % 0.002 5
Max Bandwidth 83.492 % 0.002 5
L1/TEX Hit Rate 0.682 % 0.000 5
L2 Hit Rate 97.194 % 0.060 5
Mem Pipes Busy 83.492 % 0.002 5
Warp Cycles Per Issued Instruction 26.890 cycle 0.000 5
Warp Cycles Per Executed Instruction 26.890 cycle 0.000 5
Avg. Active Threads Per Warp 32.000 0.000 5
Avg. Not Predicated Off Threads Per Warp 31.500 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 8.000 block 0.000 5
Block Limit Shared Mem 20.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 95.122 % 0.000 5
Achieved Active Warps Per SM 60.878 warp 0.000 5
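The occupancy figures are consistent with the launch configuration: each block has 16 × 16 = 256 threads (8 warps), the reported warp-based block limit is 8 blocks per SM, which yields the 64 theoretical active warps, and achieved occupancy is the ratio of achieved to theoretical warps. A small check of that arithmetic, with values taken from the table above:

threads_per_block = 16 * 16                                # TILE_M x TILE_N
warps_per_block = threads_per_block // 32                  # 8 warps per block
block_limit_warps = 8                                      # blocks per SM ("Block Limit Warps")
theoretical_warps = warps_per_block * block_limit_warps    # 64 warps per SM
achieved_warps = 60.878                                    # "Achieved Active Warps Per SM"
print(achieved_warps / theoretical_warps)                  # ~0.9512, matching 95.122 % achieved occupancy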
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (32.8%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF Occupancy This kernel's theoretical occupancy is not impacted by any block limit.
Operation / Metric Value Unit
aten::to
CPU Time 444432.13 μs
Device Time 5037.47 μs
Self CPU Time 48.55 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 9252240.49 μs
Device Time 280982.06 μs
Self CPU Time 15355.52 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 9236889.57 μs
Device Time 280982.06 μs
Self CPU Time 19874.01 μs
Self Device Time 280982.06 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 9237767.28 μs
Device Time 7253.91 μs
Self CPU Time 9237767.28 μs
Self Device Time 7253.91 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
optimizedTiledMatMulKernel(float const*, float const*, float*, int, int, int)
CPU Time 0.00 μs
Device Time 9686626.77 μs
Self CPU Time 0.00 μs
Self Device Time 9686626.77 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceSynchronize
CPU Time 609480.90 μs
Device Time 77.06 μs
Self CPU Time 609480.90 μs
Self Device Time 77.06 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaEventRecord
CPU Time 15997.19 μs
Device Time 14666.14 μs
Self CPU Time 15997.19 μs
Self Device Time 14666.14 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 267196.54 μs
Self CPU Time 0.00 μs
Self Device Time 267196.54 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45287 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:18:44 bugprone-easily-swappable-parameters
18 | __global__ void optimizedTiledMatMulKernel(const float* __restrict__ A,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
19 | const float* __restrict__ B,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:18:70: note: the first parameter in the range is 'A'
18 | __global__ void optimizedTiledMatMulKernel(const float* __restrict__ A,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:19:71: note: the last parameter in the range is 'B'
19 | const float* __restrict__ B,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:23:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
23 | int row = blockIdx.x * TILE_M + threadIdx.x; // Corresponds to output row (and A column index)
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:24:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
24 | int col = blockIdx.y * TILE_N + threadIdx.y; // Corresponds to output column of C
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:33:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
33 | int tid = threadIdx.y * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:34:24: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
34 | int totalThreads = blockDim.x * blockDim.y; // Should equal TILE_M * TILE_N
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:43:28: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
43 | int global_m = blockIdx.x * TILE_M + m;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:52:28: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
52 | int global_n = blockIdx.y * TILE_N + n;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:77:37: warning: the parameter 'A' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
77 | torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:77:54: warning: the parameter 'B' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
77 | torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:83:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
83 | int K = A.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:84:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
84 | int M = A.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s3_optimized_tiled_matmul/base/base.cu:86:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
86 | int N = B.size(1);
| ^