import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """
    Performs a single matrix multiplication with transposed A (C = A.T * B).

    Args:
        A: Input tensor of shape (K, M).
        B: Input tensor of shape (K, N).

    Returns:
        Output tensor of shape (M, N).
    """
    return torch.matmul(A.T, B)


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication with transposed A (C = A.T * B).
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:
        return fn(A, B)


M = 1024
K = 4096
N = 2048


def get_inputs():
    A = torch.randn(K, M)
    B = torch.randn(K, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication with transposed A (C = A.T * B).
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
        """
        Performs matrix multiplication with transposed A.

        Args:
            A: Input tensor of shape (K, M).
            B: Input tensor of shape (K, N).

        Returns:
            Output tensor of shape (M, N).
        """
        return torch.matmul(A.T, B)


M = 1024
K = 4096
N = 2048


def get_inputs():
    A = torch.randn(K, M)
    B = torch.randn(K, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
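For orientation, here is a minimal sketch of how these two reference listings might be exercised end to end. It assumes both listings live in one importable module (so `module_fn`, `Model`, `get_inputs`, `M`, and `N` are all in scope); the device handling and the final cross-check are illustrative additions, not part of the original harness.

```python
import torch

# Hypothetical driver: run the reference model on the benchmark inputs
# and confirm the output shape matches the documented (M, N).
device = "cuda" if torch.cuda.is_available() else "cpu"
A, B = [t.to(device) for t in get_inputs()]   # A: (K, M), B: (K, N)

model = Model().to(device)
C = model(A, B)                               # computes A.T @ B
assert C.shape == (M, N)

# The functional form from the first listing gives the same result.
torch.testing.assert_close(C, module_fn(A, B))
```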
Field | Value |
---|---|
Operation Name | 16_Matmul_with_transposed_A |
Level ID | 1 |
Task ID | 16 |
Kernel Name | optimized_tiled_matmul_base |
CUDA Speedup (Native) | 0.125x |
CUDA Speedup (Compile) | 0.142x |
CUDA Runtime | 2.808 ms |
PyTorch Runtime (Native) | 0.351 ms |
PyTorch Runtime (Compile) | 0.400 ms |
Correct | True |
Max Diff (vs. Reference) | 0.001000 |
Model | azure-gpt-4o-2024-08-06 |
Temperature | 0.50 |
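The speedup columns are consistent with the baseline runtime divided by the kernel runtime: 0.351 ms / 2.808 ms ≈ 0.125x against eager PyTorch and 0.400 ms / 2.808 ms ≈ 0.142x against the compiled baseline. In other words, the custom kernel is roughly 8x slower than the cuBLAS-backed torch.matmul it replaces, despite producing correct results (max diff 0.001).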
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdexcept>

// Tile dimensions and chunk size
#define TILE_M 16
#define TILE_N 16
#define BLOCK_K 32

// This CUDA kernel computes C = A.T * B where:
//   A: shape (K, M) [read-only]
//   B: shape (K, N) [read-only]
//   C: shape (M, N) with C[i,j] = sum_{k=0}^{K-1} A[k*M + i] * B[k*N + j]
// It uses shared memory tiling and loop unrolling, and routes global loads
// through the read-only data cache via __ldg().
__global__ void optimizedTiledMatMulKernel(const float* __restrict__ A,
                                           const float* __restrict__ B,
                                           float* __restrict__ C,
                                           int K, int M, int N) {
    // Compute global output indices
    int row = blockIdx.x * TILE_M + threadIdx.x;  // Output row of C (and A column index)
    int col = blockIdx.y * TILE_N + threadIdx.y;  // Output column of C
    float sum = 0.0f;

    // Allocate shared memory for the current tiles of A and B
    __shared__ float As[BLOCK_K][TILE_M];
    __shared__ float Bs[BLOCK_K][TILE_N];

    // Unique thread index in the block to coordinate global loads
    int tid = threadIdx.y * blockDim.x + threadIdx.x;
    int totalThreads = blockDim.x * blockDim.y;  // Should equal TILE_M * TILE_N

    // Loop over the K dimension in chunks
    for (int k0 = 0; k0 < K; k0 += BLOCK_K) {
        // Load a tile of A into shared memory
        for (int index = tid; index < BLOCK_K * TILE_M; index += totalThreads) {
            int t = index / TILE_M;  // local index in K tile
            int m = index % TILE_M;  // local index in M dimension
            int global_k = k0 + t;
            int global_m = blockIdx.x * TILE_M + m;
            As[t][m] = (global_k < K && global_m < M) ? __ldg(&A[global_k * M + global_m]) : 0.0f;
        }
        // Load a tile of B into shared memory
        for (int index = tid; index < BLOCK_K * TILE_N; index += totalThreads) {
            int t = index / TILE_N;  // local index in K tile
            int n = index % TILE_N;  // local index in N dimension
            int global_k = k0 + t;
            int global_n = blockIdx.y * TILE_N + n;
            Bs[t][n] = (global_k < K && global_n < N) ? __ldg(&B[global_k * N + global_n]) : 0.0f;
        }
        __syncthreads();  // Ensure the entire tile is loaded

        // Compute the partial dot product for this tile with loop unrolling
        #pragma unroll
        for (int t = 0; t < BLOCK_K; t++) {
            sum += As[t][threadIdx.x] * Bs[t][threadIdx.y];
        }
        __syncthreads();  // Prepare for the next tile load
    }

    // Write the computed value to the output if within bounds
    if (row < M && col < N) {
        C[row * N + col] = sum;
    }
}

// Forward function exposed to PyTorch via PyBind11
//   A: Tensor with shape (K, M) [CUDA, float32]
//   B: Tensor with shape (K, N) [CUDA, float32]
// Returns: C with shape (M, N) computed as C = A.T * B
torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
    TORCH_CHECK(A.is_cuda(), "Input A must be a CUDA tensor");
    TORCH_CHECK(B.is_cuda(), "Input B must be a CUDA tensor");
    TORCH_CHECK(A.dtype() == torch::kFloat32, "Input A must be float32");
    TORCH_CHECK(B.dtype() == torch::kFloat32, "Input B must be float32");

    int K = A.size(0);
    int M = A.size(1);
    TORCH_CHECK(B.size(0) == K, "Dimension mismatch: A and B must have the same first dimension (K)");
    int N = B.size(1);

    auto C = torch::zeros({M, N}, torch::device(A.device()).dtype(A.dtype()));

    // Define block and grid dimensions based on tile sizes
    dim3 block(TILE_M, TILE_N);
    dim3 grid((M + TILE_M - 1) / TILE_M, (N + TILE_N - 1) / TILE_N);

    const float* A_ptr = A.data_ptr<float>();
    const float* B_ptr = B.data_ptr<float>();
    float* C_ptr = C.data_ptr<float>();

    optimizedTiledMatMulKernel<<<grid, block>>>(A_ptr, B_ptr, C_ptr, K, M, N);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(err));
    }
    return C;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Compute C = A.T * B using optimized tiled shared memory (CUDA)");
}
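A minimal sketch of how this extension might be built with PyTorch's JIT extension loader and checked against the reference; the source file name and tolerances are assumptions (the 1e-3 tolerance mirrors the reported max diff of 0.001), not part of the original benchmark harness.

```python
import torch
from torch.utils.cpp_extension import load

# Hypothetical build: assumes the CUDA source above is saved as
# "optimized_tiled_matmul.cu" in the working directory.
ext = load(
    name="optimized_tiled_matmul",
    sources=["optimized_tiled_matmul.cu"],
    verbose=True,
)

K, M, N = 4096, 1024, 2048
A = torch.randn(K, M, device="cuda", dtype=torch.float32)
B = torch.randn(K, N, device="cuda", dtype=torch.float32)

C_kernel = ext.forward(A, B)   # custom tiled kernel
C_ref = torch.matmul(A.T, B)   # cuBLAS-backed reference
torch.testing.assert_close(C_kernel, C_ref, atol=1e-3, rtol=1e-3)
```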
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 2.260 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 2.240 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 56.596 | % | 0.000 | 5 |
Issued Ipc Active | 2.260 | inst/cycle | 0.000 | 5 |
SM Busy | 56.596 | % | 0.000 | 5 |
Memory Throughput | 17413933503.286 | byte/second | 7708177575424083.000 | 5 |
Mem Busy | 88.988 | % | 0.002 | 5 |
Max Bandwidth | 83.492 | % | 0.002 | 5 |
L1/TEX Hit Rate | 0.682 | % | 0.000 | 5 |
L2 Hit Rate | 97.194 | % | 0.060 | 5 |
Mem Pipes Busy | 83.492 | % | 0.002 | 5 |
Warp Cycles Per Issued Instruction | 26.890 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 26.890 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 31.500 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 20.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 95.122 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 60.878 | warp | 0.000 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (32.8%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
INF Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. |
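The block limits above can be related back to the kernel's launch configuration with a rough back-of-the-envelope check; the 64-warps-per-SM figure is taken from the "Theoretical Active Warps per SM" row, and exact per-SM limits depend on the GPU, so this is only a sketch:

```python
TILE_M, TILE_N, BLOCK_K = 16, 16, 32

threads_per_block = TILE_M * TILE_N        # 256 threads
warps_per_block = threads_per_block // 32  # 8 warps

# Shared memory per block: As[BLOCK_K][TILE_M] + Bs[BLOCK_K][TILE_N], float32
smem_per_block = (BLOCK_K * TILE_M + BLOCK_K * TILE_N) * 4  # 4096 bytes

# With 64 warps available per SM, the warp limit allows 64 / 8 = 8 resident
# blocks, matching "Block Limit Warps"; 8 blocks * 8 warps = 64 warps gives
# the reported 100% theoretical occupancy.
blocks_by_warps = 64 // warps_per_block    # 8
print(threads_per_block, smem_per_block, blocks_by_warps)
```

The "Block Limit Shared Mem" value of 20 is likewise consistent with roughly 20 x 4 KiB ≈ 80 KiB of usable shared memory per SM on the profiled device.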
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 444432.13 | μs |
Device Time | 5037.47 | μs |
Self CPU Time | 48.55 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 9252240.49 | μs |
Device Time | 280982.06 | μs |
Self CPU Time | 15355.52 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 9236889.57 | μs |
Device Time | 280982.06 | μs |
Self CPU Time | 19874.01 | μs |
Self Device Time | 280982.06 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 9237767.28 | μs |
Device Time | 7253.91 | μs |
Self CPU Time | 9237767.28 | μs |
Self Device Time | 7253.91 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
optimizedTiledMatMulKernel(float const*, float const*, float*, int, int, int) | ||
CPU Time | 0.00 | μs |
Device Time | 9686626.77 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 9686626.77 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceSynchronize | ||
CPU Time | 609480.90 | μs |
Device Time | 77.06 | μs |
Self CPU Time | 609480.90 | μs |
Self Device Time | 77.06 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaEventRecord | ||
CPU Time | 15997.19 | μs |
Device Time | 14666.14 | μs |
Self CPU Time | 15997.19 | μs |
Self Device Time | 14666.14 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 267196.54 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 267196.54 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
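The operator timing table above resembles PyTorch autograd/Kineto profiler output. A minimal sketch of collecting comparable numbers for the baseline operation follows; the iteration count, sort key, and row limit are illustrative choices rather than the benchmark's actual profiling setup.

```python
import torch
from torch.profiler import profile, ProfilerActivity

A = torch.randn(4096, 1024, device="cuda")  # (K, M)
B = torch.randn(4096, 2048, device="cuda")  # (K, N)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for _ in range(5):
        C = torch.matmul(A.T, B)
    torch.cuda.synchronize()

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
```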
45287 warnings generated when compiling for host. Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.