6_Matmul_with_large_K_dimension_
• double_buffered_matmul_base
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A, B):
    """
    Performs a single matrix multiplication (C = A * B) with a large K dimension.

    Args:
        A: Input tensor of shape (M, K)
        B: Input tensor of shape (K, N)

    Returns:
        Output tensor of shape (M, N)
    """
    return torch.matmul(A, B)


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B) with a large K dimension.
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:
        return fn(A, B)


M = 256
N = 256
K = 131072


def get_inputs():
    A = torch.randn(M, K)
    B = torch.randn(K, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B) with a large K dimension.
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
        """
        Performs matrix multiplication of A and B.

        Args:
            A: Input tensor of shape (M, K)
            B: Input tensor of shape (K, N)

        Returns:
            Output tensor of shape (M, N)
        """
        return torch.matmul(A, B)


M = 256
N = 256
K = 131072


def get_inputs():
    A = torch.randn(M, K)
    B = torch.randn(K, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
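Before looking at the kernel, it helps to put the problem size in perspective. The short sketch below, using only the shapes defined above and assuming fp32 inputs, estimates the FLOP count, the minimum data traffic, and the resulting arithmetic intensity; it is a rough roofline-style orientation for the profiler numbers further down, not part of the benchmark definition.

# Rough roofline-style estimate for the problem size above (fp32 inputs assumed).
M, N, K = 256, 256, 131072
bytes_per_elem = 4  # fp32

flops = 2 * M * N * K  # one multiply and one add per (m, n, k) triple
min_bytes = (M * K + K * N + M * N) * bytes_per_elem  # read A and B once, write C once

print(f"FLOPs:                {flops / 1e9:.1f} GFLOP")            # ~17.2 GFLOP
print(f"Minimum traffic:      {min_bytes / 1e6:.1f} MB")           # ~268.7 MB
print(f"Arithmetic intensity: {flops / min_bytes:.1f} FLOP/byte")  # ~64 FLOP/byte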
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define TILE_WIDTH 32

// Tiled matmul with double buffering: two shared-memory buffers per operand are
// alternated so that the next tile is staged while the current tile is consumed.
template <typename scalar_t>
__global__ void matmul_double_buffered(const scalar_t* __restrict__ A, const scalar_t* __restrict__ B,
                                       scalar_t* __restrict__ C, int M, int K, int N) {
    __shared__ scalar_t sA[2][TILE_WIDTH][TILE_WIDTH];
    __shared__ scalar_t sB[2][TILE_WIDTH][TILE_WIDTH];

    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    scalar_t accum = 0;

    // Preload the first tile into buffer 0 (out-of-range elements are zero-padded).
    int load_idx = 0;
    int t = 0;
    if (row < M && t * TILE_WIDTH + threadIdx.x < K)
        sA[load_idx][threadIdx.y][threadIdx.x] = A[row * K + t * TILE_WIDTH + threadIdx.x];
    else
        sA[load_idx][threadIdx.y][threadIdx.x] = 0;
    if (col < N && t * TILE_WIDTH + threadIdx.y < K)
        sB[load_idx][threadIdx.y][threadIdx.x] = B[(t * TILE_WIDTH + threadIdx.y) * N + col];
    else
        sB[load_idx][threadIdx.y][threadIdx.x] = 0;
    __syncthreads();

    for (t = 0; t < (K + TILE_WIDTH - 1) / TILE_WIDTH - 1; ++t) {
        int compute_idx = load_idx;
        load_idx = 1 - load_idx;

        // Stage the next tile in the alternate buffer while the current one is consumed below.
        if (row < M && (t + 1) * TILE_WIDTH + threadIdx.x < K)
            sA[load_idx][threadIdx.y][threadIdx.x] = A[row * K + (t + 1) * TILE_WIDTH + threadIdx.x];
        else
            sA[load_idx][threadIdx.y][threadIdx.x] = 0;
        if (col < N && (t + 1) * TILE_WIDTH + threadIdx.y < K)
            sB[load_idx][threadIdx.y][threadIdx.x] = B[((t + 1) * TILE_WIDTH + threadIdx.y) * N + col];
        else
            sB[load_idx][threadIdx.y][threadIdx.x] = 0;

        // Compute on the current tile.
        for (int i = 0; i < TILE_WIDTH; ++i) {
            accum += sA[compute_idx][threadIdx.y][i] * sB[compute_idx][i][threadIdx.x];
        }
        // Ensure the freshly staged tile is visible, and the consumed one safe to
        // overwrite, before the next iteration.
        __syncthreads();
    }

    // Process the last tile, which sits in the buffer loaded most recently.
    for (int i = 0; i < TILE_WIDTH; ++i) {
        accum += sA[load_idx][threadIdx.y][i] * sB[load_idx][i][threadIdx.x];
    }

    if (row < M && col < N) {
        C[row * N + col] = accum;
    }
}

torch::Tensor module_fn(torch::Tensor A, torch::Tensor B) {
    TORCH_CHECK(A.is_cuda(), "Input tensor A must be a CUDA tensor");
    TORCH_CHECK(B.is_cuda(), "Input tensor B must be a CUDA tensor");

    int64_t M = A.size(0);
    int64_t K = A.size(1);
    int64_t N = B.size(1);
    TORCH_CHECK(K == B.size(0), "Inner dimensions must match");

    auto C = torch::empty({M, N}, A.options());

    dim3 threads(TILE_WIDTH, TILE_WIDTH);
    dim3 blocks((N + TILE_WIDTH - 1) / TILE_WIDTH, (M + TILE_WIDTH - 1) / TILE_WIDTH);

    AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "matmul_double_buffered", ([&] {
        matmul_double_buffered<scalar_t><<<blocks, threads>>>(
            A.data_ptr<scalar_t>(),
            B.data_ptr<scalar_t>(),
            C.data_ptr<scalar_t>(),
            M, K, N);
    }));

    cudaDeviceSynchronize();  // Block until the kernel finishes (also surfaces launch errors).
    return C;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &module_fn, "Double buffered matrix multiplication");
}
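One way to exercise this extension end to end is to JIT-compile it with torch.utils.cpp_extension.load and compare the output against the torch.matmul reference. The sketch below assumes the CUDA source above has been saved as double_buffered_matmul.cu; that file name and the loose tolerance are illustrative choices, not part of the listing.

import torch
from torch.utils.cpp_extension import load

# JIT-compile the CUDA source above (assumed to be saved as double_buffered_matmul.cu).
ext = load(
    name="double_buffered_matmul",
    sources=["double_buffered_matmul.cu"],
    verbose=True,
)

M, N, K = 256, 256, 131072
A = torch.randn(M, K, device="cuda")
B = torch.randn(K, N, device="cuda")

C_ref = torch.matmul(A, B)
C_ext = ext.forward(A, B)

# With 131072 fp32 accumulations per output element, allow a relatively loose tolerance.
print(torch.allclose(C_ref, C_ext, rtol=1e-3, atol=1e-1))
print((C_ref - C_ext).abs().max().item())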
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 1.420 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.690 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 35.554 | % | 0.000 | 5 |
Issued Ipc Active | 1.420 | inst/cycle | 0.000 | 5 |
SM Busy | 35.554 | % | 0.000 | 5 |
Memory Throughput | 44549328713.780 | byte/second | 2169827498075433.500 | 5 |
Mem Busy | 35.624 | % | 0.000 | 5 |
Max Bandwidth | 32.808 | % | 0.000 | 5 |
L1/TEX Hit Rate | 0.000 | % | 0.000 | 5 |
L2 Hit Rate | 68.078 | % | 0.000 | 5 |
Mem Pipes Busy | 30.184 | % | 0.000 | 5 |
Warp Cycles Per Issued Instruction | 22.500 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 22.502 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 31.390 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 2.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 3.000 | block | 0.000 | 5 |
Block Limit Warps | 2.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 50.000 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 32.000 | warp | 0.000 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (50.0%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
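The occupancy rows can be reproduced with simple resource arithmetic: each 32x32 block holds 1024 threads (32 warps) and keeps four TILE_WIDTH x TILE_WIDTH fp32 buffers in shared memory. The sketch below rederives the warp and shared-memory block limits; the 64-warps-per-SM and 48 KiB shared-memory figures are assumptions about the profiled GPU chosen to be consistent with the table, not values taken from it.

# Back-of-the-envelope reproduction of the "Block Limit" and occupancy rows.
MAX_WARPS_PER_SM = 64        # assumption about the profiled GPU
SHARED_PER_SM = 48 * 1024    # bytes of shared memory available to blocks; assumption

TILE_WIDTH = 32
threads_per_block = TILE_WIDTH * TILE_WIDTH            # 1024 threads
warps_per_block = threads_per_block // 32              # 32 warps
smem_per_block = 2 * 2 * TILE_WIDTH * TILE_WIDTH * 4   # double-buffered sA + sB, fp32 -> 16 KiB

block_limit_warps = MAX_WARPS_PER_SM // warps_per_block  # 2, matches "Block Limit Warps"
block_limit_smem = SHARED_PER_SM // smem_per_block       # 3, matches "Block Limit Shared Mem"
block_limit_regs = 2                                     # taken from "Block Limit Registers"

active_blocks = min(block_limit_warps, block_limit_smem, block_limit_regs)
theoretical_warps = active_blocks * warps_per_block      # 64 warps -> 100% theoretical occupancy
print(active_blocks, theoretical_warps)

The achieved 50% (32 active warps per SM) is therefore not a resource limit; it reflects the warp-scheduling overheads and load imbalance that the occupancy rule above points at.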
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 592954.52 | μs |
Device Time | 28378.90 | μs |
Self CPU Time | 43.21 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 592911.31 | μs |
Device Time | 28378.90 | μs |
Self CPU Time | 127.82 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::empty_strided | ||
CPU Time | 563972.60 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 127.80 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceGetStreamPriorityRange | ||
CPU Time | 553680.27 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 553680.27 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceSynchronize | ||
CPU Time | 6544590.02 | μs |
Device Time | 8306.56 | μs |
Self CPU Time | 6544590.02 | μs |
Self Device Time | 8306.56 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void matmul_double_buffered<float>(float const*, float const*, float*, int, int, int) | ||
CPU Time | 0.00 | μs |
Device Time | 6457907.85 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 6457907.85 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 25899.00 | μs |
Device Time | 98157.65 | μs |
Self CPU Time | 2790.66 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 23109.91 | μs |
Device Time | 98157.65 | μs |
Self CPU Time | 3628.12 | μs |
Self Device Time | 98157.65 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 98157.65 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 98157.65 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45281 warnings generated when compiling for host. Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.