← Back to Leaderboard

The AI CUDA Engineer 👷

16_Matmul_with_transposed_Ahybrid_tiling_grid_stride_base

Level 1 • Task 16
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """
    Performs a single matrix multiplication with transposed A (C = A.T * B).

    Args:
        A: Input tensor of shape (K, M).
        B: Input tensor of shape (K, N).

    Returns:
        Output tensor of shape (M, N).
    """
    return torch.matmul(A.T, B)


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B)
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:
        return fn(A, B)


M = 1024
K = 4096
N = 2048


def get_inputs():
    A = torch.randn(K, M)
    B = torch.randn(K, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B)
    """
    def __init__(self):
        super(Model, self).__init__()
    
    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
        """
        Performs matrix multiplication.

        Args:
            A: Input tensor of shape (M, K).
            B: Input tensor of shape (K, N).

        Returns:
            Output tensor of shape (M, N).
        """
        return torch.matmul(A.T, B)

M = 1024
K = 4096
N = 2048

def get_inputs():
    A = torch.randn(K, M)
    B = torch.randn(K, N)
    return [A, B]

def get_init_inputs():
    return []  # No special initialization inputs needed

Kernel Information

Related Kernels (Level 1, Task 16 • 16_Matmul_with_transposed_A)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 tiled_double_output_base 2.29 0.15 0.17
🥈 pipelined_tiled_matmul_base_base 2.70 0.13 0.15
🥉 hybrid_tiled_linear_matmul_base 2.76 0.13 0.14
4 modular_tiled_matmul_base_base 2.77 0.13 0.14
5 unrolled_tiled_matmul_base_base 2.81 0.13 0.14
6 optimized_tiled_matmul_base 2.81 0.13 0.14
6 tiled_shared_ldg_aligned_base 2.81 0.13 0.14
6 optimized_tiled_matmul_base 2.81 0.13 0.14
6 hybrid_tiling_grid_stride_base 2.81 0.13 0.14
10 syncthreads_optimized_tiling_edit_1 3.00 0.12 0.13
10 atomic_operations_optimized_tiling_base 3.00 0.12 0.13
12 streams_partitioned_matmul_edit_1 3.02 0.12 0.13
13 tiled_shared_unroll_base_base 3.02 0.12 0.13
14 streams_partitioned_matmul_base 3.03 0.12 0.13
15 modular_device_functions_tiling_2_base 3.04 0.12 0.13
15 modular_tiled_kernel_edit_1 3.04 0.12 0.13
15 modular_tiled_kernel_base 3.04 0.12 0.13
18 optimized_matmul_combined_kernel_edit_1 3.04 0.12 0.13
18 tiled_shared_const_memory_base 3.04 0.12 0.13
18 optimized_matmul_combined_kernel_base 3.04 0.12 0.13
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdexcept>

#define TILE_M 16
#define TILE_N 16
#define BLOCK_K 32

__global__ void hybridKernel(const float* __restrict__ A,
                            const float* __restrict__ B,
                            float* __restrict__ C,
                            int K, int M, int N) {
    int row = blockIdx.x * TILE_M + threadIdx.x;
    int col = blockIdx.y * TILE_N + threadIdx.y;

    float sum = 0.0f;

    __shared__ float As[BLOCK_K][TILE_M];
    __shared__ float Bs[BLOCK_K][TILE_N];

    int tid = threadIdx.y * blockDim.x + threadIdx.x;
    int totalThreads = blockDim.x * blockDim.y;

    for (int k0 = 0; k0 < K; k0 += BLOCK_K) {
        for (int index = tid; index < BLOCK_K * TILE_M; index += totalThreads) {
            int t = index / TILE_M;
            int m = index % TILE_M;
            int global_k = k0 + t;
            int global_m = blockIdx.x * TILE_M + m;
            As[t][m] = (global_k < K && global_m < M) ? __ldg(&A[global_k * M + global_m]) : 0.0f;
        }

        for (int index = tid; index < BLOCK_K * TILE_N; index += totalThreads) {
            int t = index / TILE_N;
            int n = index % TILE_N;
            int global_k = k0 + t;
            int global_n = blockIdx.y * TILE_N + n;
            Bs[t][n] = (global_k < K && global_n < N) ? __ldg(&B[global_k * N + global_n]) : 0.0f;
        }

        __syncthreads();

        #pragma unroll
        for (int t = 0; t < BLOCK_K; t++) {
            sum += As[t][threadIdx.x] * Bs[t][threadIdx.y];
        }

        __syncthreads();
    }

    if (row < M && col < N) {
        C[row * N + col] = sum;
    }
}

torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
    TORCH_CHECK(A.is_cuda(), "Input A must be a CUDA tensor");
    TORCH_CHECK(B.is_cuda(), "Input B must be a CUDA tensor");
    TORCH_CHECK(A.dtype() == torch::kFloat32, "Input A must be float32");
    TORCH_CHECK(B.dtype() == torch::kFloat32, "Input B must be float32");

    int K = A.size(0);
    int M = A.size(1);
    TORCH_CHECK(B.size(0) == K, "Dimension mismatch: A and B must have the same first dimension (K)");
    int N = B.size(1);

    auto C = torch::zeros({M, N}, torch::device(A.device()).dtype(A.dtype()));

    dim3 block(TILE_M, TILE_N);
    dim3 grid((M + TILE_M - 1) / TILE_M, (N + TILE_N - 1) / TILE_N);

    const float* A_ptr = A.data_ptr<float>();
    const float* B_ptr = B.data_ptr<float>();
    float* C_ptr = C.data_ptr<float>();

    hybridKernel<<<grid, block>>>(A_ptr, B_ptr, C_ptr, K, M, N);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(err));
    }

    return C;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Compute C = A.T * B using a hybrid approach (CUDA)");
}
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 2.260 inst/cycle 0.000 5
Executed Ipc Elapsed 2.240 inst/cycle 0.000 5
Issue Slots Busy 56.586 % 0.000 5
Issued Ipc Active 2.260 inst/cycle 0.000 5
SM Busy 56.586 % 0.000 5
Memory Throughput 17378226883.180 byte/second 2203532370847152.000 5
Mem Busy 88.986 % 0.005 5
Max Bandwidth 83.490 % 0.004 5
L1/TEX Hit Rate 0.690 % 0.000 5
L2 Hit Rate 97.018 % 0.125 5
Mem Pipes Busy 83.490 % 0.004 5
Warp Cycles Per Issued Instruction 26.892 cycle 0.000 5
Warp Cycles Per Executed Instruction 26.892 cycle 0.000 5
Avg. Active Threads Per Warp 32.000 0.000 5
Avg. Not Predicated Off Threads Per Warp 31.500 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 8.000 block 0.000 5
Block Limit Shared Mem 20.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 95.102 % 0.000 5
Achieved Active Warps Per SM 60.864 warp 0.000 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (32.8%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF Occupancy This kernel's theoretical occupancy is not impacted by any block limit.
Operation / Metric Value Unit
aten::to
CPU Time 563853.44 μs
Device Time 5092.35 μs
Self CPU Time 41.17 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 9275463.04 μs
Device Time 280858.04 μs
Self CPU Time 13581.21 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 9261882.85 μs
Device Time 280858.04 μs
Self CPU Time 18306.97 μs
Self Device Time 280858.04 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 9260609.23 μs
Device Time 7313.44 μs
Self CPU Time 9260609.23 μs
Self Device Time 7313.44 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
hybridKernel(float const*, float const*, float*, int, int, int)
CPU Time 0.00 μs
Device Time 9695895.19 μs
Self CPU Time 0.00 μs
Self Device Time 9695895.19 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceSynchronize
CPU Time 609764.28 μs
Device Time 77.67 μs
Self CPU Time 609764.28 μs
Self Device Time 77.67 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaEventRecord
CPU Time 17370.93 μs
Device Time 14643.21 μs
Self CPU Time 17370.93 μs
Self Device Time 14643.21 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 267235.91 μs
Self CPU Time 0.00 μs
Self Device Time 267235.91 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45287 warnings generated when compiling for host.
Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:10:30 bugprone-easily-swappable-parameters
10 | __global__ void hybridKernel(const float* __restrict__ A,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 | const float* __restrict__ B,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:10:56: note: the first parameter in the range is 'A'
10 | __global__ void hybridKernel(const float* __restrict__ A,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:11:55: note: the last parameter in the range is 'B'
11 | const float* __restrict__ B,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:14:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
14 | int row = blockIdx.x * TILE_M + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:15:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
15 | int col = blockIdx.y * TILE_N + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:22:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
22 | int tid = threadIdx.y * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:23:24: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
23 | int totalThreads = blockDim.x * blockDim.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:30:28: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
30 | int global_m = blockIdx.x * TILE_M + m;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:38:28: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
38 | int global_n = blockIdx.y * TILE_N + n;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:57:37: warning: the parameter 'A' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
57 | torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:57:54: warning: the parameter 'B' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
57 | torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:63:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
63 | int K = A.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:64:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
64 | int M = A.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_16/b8_s2_hybrid_tiling_grid_stride/base/base.cu:66:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
66 | int N = B.size(1);
| ^