9_Tall_skinny_matrix_multiplication_
• unrolled_loop_matmul_base
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A, B):
    """
    Performs a single matrix multiplication (C = A * B) where one of the matrices is tall and skinny (M >> N or N >> M).

    Args:
        A (torch.Tensor): Input matrix of shape (M, K) or (K, M) where M >> N or N >> M.
        B (torch.Tensor): Input matrix of shape (K, N) or (N, K) where M >> N or N >> M.

    Returns:
        torch.Tensor: Output matrix of shape (M, N) or (N, M).
    """
    return torch.matmul(A, B)


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B) where one of the matrices is tall and skinny (M >> N or N >> M).
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A, B, fn=module_fn):
        return fn(A, B)


M = 16384
N = 16


def get_inputs():
    A = torch.randn(M, N)
    B = torch.randn(N, M)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B) where one of the matrices is tall and skinny (M >> N or N >> M).
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A, B):
        """
        Performs the matrix multiplication.

        Args:
            A (torch.Tensor): Input matrix of shape (M, K) or (K, M) where M >> N or N >> M.
            B (torch.Tensor): Input matrix of shape (K, N) or (N, K) where M >> N or N >> M.

        Returns:
            torch.Tensor: Output matrix of shape (M, N) or (N, M).
        """
        return torch.matmul(A, B)


M = 16384
N = 16


def get_inputs():
    A = torch.randn(M, N)
    B = torch.randn(N, M)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
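The harness above only fixes M = 16384 and N = 16, so A is a tall-skinny (16384, 16) matrix, B is its short-wide (16, 16384) counterpart, and C = A @ B is a full (16384, 16384) output. Before moving to the CUDA kernel below, a minimal sketch of how the reference model can be exercised, assuming the definitions above are in scope and a CUDA device is available:

# Sketch: run the reference model on the GPU (assumes Model / get_inputs defined above)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Model().to(device)
A, B = [t.to(device) for t in get_inputs()]  # A: (16384, 16), B: (16, 16384)

C = model(A, B)                              # C: (16384, 16384)
print(C.shape)                               # torch.Size([16384, 16384])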
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16
#define ELEMENTS_PER_THREAD 4

// Constant memory for configuration parameters
__constant__ int const_dims[6];    // M, N, K, lda, ldb, ldc
__constant__ bool const_trans[2];  // transA, transB

// Helper to fetch matrix elements, taking an optional transpose into account
__device__ inline float get_element(const float* __restrict__ matrix, int row, int col, int ld, bool transpose) {
    return transpose ? matrix[col * ld + row] : matrix[row * ld + col];
}

// Kernel with manual loop unrolling for the critical loops.
// Each block computes a (BLOCK_SIZE * ELEMENTS_PER_THREAD) x BLOCK_SIZE tile of C,
// with every thread accumulating ELEMENTS_PER_THREAD output elements.
__global__ void unrolled_matmul_kernel(const float* __restrict__ A,
                                       const float* __restrict__ B,
                                       float* __restrict__ C) {
    // Load configuration from constant memory
    const int M = const_dims[0];
    const int N = const_dims[1];
    const int K = const_dims[2];
    const int lda = const_dims[3];
    const int ldb = const_dims[4];
    const int ldc = const_dims[5];
    const bool transA = const_trans[0];
    const bool transB = const_trans[1];

    // Calculate the block's starting indices
    int block_row = blockIdx.y * (BLOCK_SIZE * ELEMENTS_PER_THREAD);
    int block_col = blockIdx.x * BLOCK_SIZE;
    int thread_row = threadIdx.y;
    int thread_col = threadIdx.x;

    // Shared memory tiles
    __shared__ float As[ELEMENTS_PER_THREAD][BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Accumulators: each thread computes ELEMENTS_PER_THREAD output elements
    float C_values[ELEMENTS_PER_THREAD] = {0.0f};

    int numTiles = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
    for (int t = 0; t < numTiles; ++t) {
        int tiledK = t * BLOCK_SIZE;

        // Load a tile of B into shared memory with bounds checking
        if (tiledK + thread_row < K && block_col + thread_col < N)
            Bs[thread_row][thread_col] = get_element(B, tiledK + thread_row, block_col + thread_col, ldb, transB);
        else
            Bs[thread_row][thread_col] = 0.0f;

        // Load a tile of A into shared memory. Each thread loads ELEMENTS_PER_THREAD elements.
        #pragma unroll
        for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
            int row = block_row + e * BLOCK_SIZE + thread_row;
            if (row < M && tiledK + thread_col < K)
                As[e][thread_row][thread_col] = get_element(A, row, tiledK + thread_col, lda, transA);
            else
                As[e][thread_row][thread_col] = 0.0f;
        }
        __syncthreads();

        // Multiply the loaded tiles
        #pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            #pragma unroll
            for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
                C_values[e] += As[e][thread_row][k] * Bs[k][thread_col];
            }
        }
        __syncthreads();
    }

    // Write the computed results back to global memory
    #pragma unroll
    for (int e = 0; e < ELEMENTS_PER_THREAD; ++e) {
        int row = block_row + e * BLOCK_SIZE + thread_row;
        int col = block_col + thread_col;
        if (row < M && col < N) {
            C[row * ldc + col] = C_values[e];
        }
    }
}

torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
    if (!A.is_cuda() || !B.is_cuda()) {
        throw std::invalid_argument("Input tensors must be on CUDA devices");
    }
    TORCH_CHECK(A.dim() == 2 && B.dim() == 2, "Inputs must be 2-D matrices");
    TORCH_CHECK(A.scalar_type() == torch::kFloat32 && B.scalar_type() == torch::kFloat32,
                "Inputs must be float32 tensors");
    TORCH_CHECK(A.size(1) == B.size(0), "Inner dimensions must match: A is (M, K), B is (K, N)");
    TORCH_CHECK(A.stride(1) == 1 && B.stride(1) == 1,
                "Inputs must have unit stride along their last dimension");

    int dims[6];
    dims[0] = A.size(0);    // M
    dims[1] = B.size(1);    // N
    dims[2] = A.size(1);    // K
    dims[3] = A.stride(0);  // lda (row stride of A)
    dims[4] = B.stride(0);  // ldb (row stride of B)
    dims[5] = B.size(1);    // ldc: row stride of the contiguous output C, equal to N

    bool trans[2] = {false, false};

    // Copy configuration to constant memory
    cudaMemcpyToSymbol(const_dims, dims, sizeof(dims));
    cudaMemcpyToSymbol(const_trans, trans, sizeof(trans));

    auto C = torch::empty({dims[0], dims[1]}, A.options());

    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridDim((dims[1] + BLOCK_SIZE - 1) / BLOCK_SIZE,
                 (dims[0] + (BLOCK_SIZE * ELEMENTS_PER_THREAD) - 1) / (BLOCK_SIZE * ELEMENTS_PER_THREAD));

    unrolled_matmul_kernel<<<gridDim, blockDim>>>(A.data_ptr<float>(), B.data_ptr<float>(), C.data_ptr<float>());
    cudaDeviceSynchronize();

    return C;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &matmul_cuda, "Matrix multiplication with unrolled loops optimization (CUDA)");
}
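Before looking at the profiler output, the extension can be built and sanity-checked against the cuBLAS path used by torch.matmul. A minimal sketch, assuming the CUDA source above is saved as unrolled_matmul.cu (the filename is illustrative) and a float32-capable CUDA device is available:

import torch
from torch.utils.cpp_extension import load

# Build the extension; the PYBIND11_MODULE in the .cu file provides the "forward" binding
ext = load(name="unrolled_matmul", sources=["unrolled_matmul.cu"])

M, N = 16384, 16
A = torch.randn(M, N, device="cuda", dtype=torch.float32)
B = torch.randn(N, M, device="cuda", dtype=torch.float32)

C_ref = torch.matmul(A, B)   # cuBLAS reference
C_ext = ext.forward(A, B)    # custom unrolled-tile kernel

torch.testing.assert_close(C_ext, C_ref, rtol=1e-4, atol=1e-4)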
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 3.030 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 3.022 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 75.820 | % | 0.001 | 5 |
Issued Ipc Active | 3.030 | inst/cycle | 0.000 | 5 |
SM Busy | 75.820 | % | 0.001 | 5 |
Memory Throughput | 1361303654974.884 | byte/second | 303751204967985024.000 | 5 |
Mem Busy | 94.498 | % | 0.002 | 5 |
Max Bandwidth | 71.080 | % | 0.001 | 5 |
L1/TEX Hit Rate | 38.470 | % | 0.000 | 5 |
L2 Hit Rate | 99.088 | % | 0.000 | 5 |
Mem Pipes Busy | 62.030 | % | 0.001 | 5 |
Warp Cycles Per Issued Instruction | 14.640 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 14.642 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 30.910 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 6.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 16.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 48.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 75.000 | % | 0.000 | 5 |
Achieved Occupancy | 69.744 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 44.636 | warp | 0.000 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (34.3%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
WRN Occupancy | This kernel's theoretical occupancy (75.0%) is limited by the number of required registers. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
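For reference, the launch configuration implied by the kernel constants can be worked out directly; the figures below follow from BLOCK_SIZE = 16 and ELEMENTS_PER_THREAD = 4 for the benchmark shape M = 16384, N = 16 (the 75% occupancy ceiling itself is register-bound, per the rule above). A small back-of-the-envelope sketch:

# Back-of-the-envelope launch configuration for M = 16384, N = 16
BLOCK_SIZE = 16
ELEMENTS_PER_THREAD = 4
M, N = 16384, 16

threads_per_block = BLOCK_SIZE * BLOCK_SIZE            # 256 threads = 8 warps per block
rows_per_block = BLOCK_SIZE * ELEMENTS_PER_THREAD      # 64 rows of C per block
grid_x = (N + BLOCK_SIZE - 1) // BLOCK_SIZE            # 1
grid_y = (M + rows_per_block - 1) // rows_per_block    # 256

# Static shared memory per block: As (4 x 16 x 16 floats) + Bs (16 x 16 floats)
smem_bytes = (ELEMENTS_PER_THREAD + 1) * BLOCK_SIZE * BLOCK_SIZE * 4  # 5120 bytes

print(grid_x, grid_y, threads_per_block, smem_bytes)   # 1 256 256 5120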
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 386154.96 | μs |
Device Time | 79.84 | μs |
Self CPU Time | 39.39 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 386115.57 | μs |
Device Time | 79.84 | μs |
Self CPU Time | 109.27 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::empty_strided | ||
CPU Time | 385542.14 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 99.16 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceGetStreamPriorityRange | ||
CPU Time | 381876.74 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 381876.74 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaMemcpyToSymbol | ||
CPU Time | 358043.53 | μs |
Device Time | 19263.26 | μs |
Self CPU Time | 358043.53 | μs |
Self Device Time | 19263.26 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
unrolled_matmul_kernel(float const*, float const*, float*) | ||
CPU Time | 0.00 | μs |
Device Time | 2746536.17 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 2746536.17 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceSynchronize | ||
CPU Time | 2767733.40 | μs |
Device Time | 10477.20 | μs |
Self CPU Time | 2767733.40 | μs |
Self Device Time | 10477.20 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 47957.58 | μs |
Device Time | 323599.52 | μs |
Self CPU Time | 8345.93 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 39612.92 | μs |
Device Time | 323599.52 | μs |
Self CPU Time | 13367.68 | μs |
Self Device Time | 323599.52 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 323599.52 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 323599.52 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |