6_Matmul_with_large_K_dimension_
• optimized_matmul_kernel_base
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A, B):
    """
    Performs a single matrix multiplication (C = A * B) with a large K dimension.

    Args:
        A: Input tensor of shape (M, K)
        B: Input tensor of shape (K, N)

    Returns:
        Output tensor of shape (M, N)
    """
    return torch.matmul(A, B)


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B) with a large K dimension.
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:
        return fn(A, B)


M = 256
N = 256
K = 131072


def get_inputs():
    A = torch.randn(M, K)
    B = torch.randn(K, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
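For orientation, the harness above is typically consumed along these lines. The driver below is a sketch and not part of the original listing; it only exercises the default `fn=module_fn` path and prints the shapes involved.

# Hypothetical driver for the listing above: build the inputs, run the
# model (which dispatches to module_fn by default), and report shapes.
if __name__ == "__main__":
    model = Model()
    A, B = get_inputs()
    C = model(A, B)  # routes through module_fn -> torch.matmul
    print("A:", tuple(A.shape), "B:", tuple(B.shape), "C:", tuple(C.shape))
    # Expected: A: (256, 131072) B: (131072, 256) C: (256, 256)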
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B) with a large K dimension.
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
        """
        Performs matrix multiplication of A and B.

        Args:
            A: Input tensor of shape (M, K)
            B: Input tensor of shape (K, N)

        Returns:
            Output tensor of shape (M, N)
        """
        return torch.matmul(A, B)


M = 256
N = 256
K = 131072


def get_inputs():
    A = torch.randn(M, K)
    B = torch.randn(K, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define TILE_WIDTH 16

// Optimized CUDA kernel combining techniques from both source kernels:
// shared-memory tiling, __ldg read-only loads, and reduced warp divergence
// for fully interior blocks.
template <typename scalar_t>
__global__ void optimized_matmul_kernel(const scalar_t* __restrict__ A,
                                        const scalar_t* __restrict__ B,
                                        scalar_t* __restrict__ C,
                                        int M, int K, int N) {
    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    scalar_t value = 0;

    // Determine if this block is completely interior (no bounds checks needed)
    bool interior_block = ((blockIdx.x + 1) * TILE_WIDTH <= N) && ((blockIdx.y + 1) * TILE_WIDTH <= M);

    __shared__ scalar_t sA[TILE_WIDTH][TILE_WIDTH];
    __shared__ scalar_t sB[TILE_WIDTH][TILE_WIDTH];

    int num_tiles = (K + TILE_WIDTH - 1) / TILE_WIDTH;
    for (int t = 0; t < num_tiles; ++t) {
        int tiledA_col = t * TILE_WIDTH + threadIdx.x;
        int tiledB_row = t * TILE_WIDTH + threadIdx.y;

        // Load tiles; interior blocks skip the per-element bounds checks
        if (interior_block) {
            sA[threadIdx.y][threadIdx.x] = __ldg(&A[row * K + tiledA_col]);
            sB[threadIdx.y][threadIdx.x] = __ldg(&B[tiledB_row * N + col]);
        } else {
            sA[threadIdx.y][threadIdx.x] = (row < M && tiledA_col < K) ? __ldg(&A[row * K + tiledA_col]) : static_cast<scalar_t>(0);
            sB[threadIdx.y][threadIdx.x] = (col < N && tiledB_row < K) ? __ldg(&B[tiledB_row * N + col]) : static_cast<scalar_t>(0);
        }
        __syncthreads();

        // Accumulate the partial dot product for this tile
        #pragma unroll
        for (int i = 0; i < TILE_WIDTH; ++i) {
            value += sA[threadIdx.y][i] * sB[i][threadIdx.x];
        }
        __syncthreads();
    }

    // Write the result to the output matrix
    if (row < M && col < N) {
        C[row * N + col] = value;
    }
}

// Host function exposed to Python via Pybind11
torch::Tensor module_fn(torch::Tensor A, torch::Tensor B) {
    TORCH_CHECK(A.is_cuda(), "Input tensor A must be a CUDA tensor");
    TORCH_CHECK(B.is_cuda(), "Input tensor B must be a CUDA tensor");

    int64_t M = A.size(0);
    int64_t K = A.size(1);
    int64_t N = B.size(1);
    TORCH_CHECK(K == B.size(0), "Inner dimensions of A and B must match");

    auto C = torch::empty({M, N}, A.options());

    dim3 threads(TILE_WIDTH, TILE_WIDTH);
    dim3 blocks((N + TILE_WIDTH - 1) / TILE_WIDTH, (M + TILE_WIDTH - 1) / TILE_WIDTH);

    AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "optimized_matmul_kernel", ([&] {
        optimized_matmul_kernel<scalar_t><<<blocks, threads>>>(
            A.data_ptr<scalar_t>(),
            B.data_ptr<scalar_t>(),
            C.data_ptr<scalar_t>(),
            M, K, N);
    }));

    cudaDeviceSynchronize();
    return C;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &module_fn, "Optimized matrix multiplication kernel (CUDA)");
}
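The extension above can be compiled with PyTorch's JIT C++ loader and checked against torch.matmul. The snippet below is a sketch, not part of the original listing: the source file name and tolerance values are assumptions, and the extension itself only exposes the `forward` binding defined above.

# Hypothetical build-and-verify sketch for the CUDA source above.
# Assumes the code is saved as "optimized_matmul_kernel.cu".
import torch
from torch.utils.cpp_extension import load

ext = load(name="optimized_matmul", sources=["optimized_matmul_kernel.cu"])

A = torch.randn(256, 131072, device="cuda")
B = torch.randn(131072, 256, device="cuda")

C = ext.forward(A, B)
ref = torch.matmul(A, B)
# With K = 131072 the float32 partial sums accumulate rounding error,
# so the comparison uses a deliberately loose (assumed) tolerance.
print("max abs diff:", (C - ref).abs().max().item())
print("allclose:", torch.allclose(C, ref, rtol=1e-3, atol=1e-1))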
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 0.750 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.740 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 18.760 | % | 0.000 | 5 |
Issued Ipc Active | 0.750 | inst/cycle | 0.000 | 5 |
SM Busy | 18.760 | % | 0.000 | 5 |
Memory Throughput | 46642035924.132 | byte/second | 1493984580046127.250 | 5 |
Mem Busy | 41.996 | % | 0.001 | 5 |
Max Bandwidth | 35.724 | % | 0.001 | 5 |
L1/TEX Hit Rate | 0.208 | % | 0.003 | 5 |
L2 Hit Rate | 83.490 | % | 0.007 | 5 |
Mem Pipes Busy | 32.974 | % | 0.001 | 5 |
Warp Cycles Per Issued Instruction | 20.532 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 20.532 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 32.000 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 21.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 24.078 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 15.410 | warp | 0.000 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (24.1%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
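The 24.1% achieved occupancy flagged above is consistent with the small grid this problem size produces. The back-of-the-envelope check below is a sketch; the SM count is an assumption for an A100-class GPU and should be replaced with the actual device's value.

# Rough launch-configuration arithmetic for M = N = 256, TILE_WIDTH = 16.
TILE_WIDTH = 16
M = N = 256
blocks = (M // TILE_WIDTH) * (N // TILE_WIDTH)     # 16 x 16 = 256 blocks
warps_per_block = (TILE_WIDTH * TILE_WIDTH) // 32  # 256 threads -> 8 warps
total_warps = blocks * warps_per_block             # 2048 warps in the whole grid
num_sms = 108  # assumption: A100-class GPU; substitute your device's SM count
avg_warps_per_sm = total_warps / num_sms           # ~19 resident warps per SM
print(f"{blocks} blocks, {total_warps} warps, "
      f"~{avg_warps_per_sm:.0f} warps/SM vs. 64 theoretical")

Under that assumption the whole grid fits in a single wave and leaves most of each SM's 64-warp budget idle, which lines up with the measured ~15 achieved active warps per SM; the long K loop then keeps those few warps busy for the entire kernel.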
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 360882.64 | μs |
Device Time | 30754.11 | μs |
Self CPU Time | 46.60 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 360836.03 | μs |
Device Time | 30754.11 | μs |
Self CPU Time | 125.03 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::empty_strided | ||
CPU Time | 329571.87 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 100.22 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceGetStreamPriorityRange | ||
CPU Time | 328914.44 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 328914.44 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceSynchronize | ||
CPU Time | 8271866.20 | μs |
Device Time | 10393.55 | μs |
Self CPU Time | 8271866.20 | μs |
Self Device Time | 10393.55 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void optimized_matmul_kernel<float>(float const*, float const*, float*, int, int, int) | ||
CPU Time | 0.00 | μs |
Device Time | 8172912.95 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 8172912.95 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 26391.58 | μs |
Device Time | 118610.46 | μs |
Self CPU Time | 4442.03 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 21954.60 | μs |
Device Time | 118610.46 | μs |
Self CPU Time | 6435.90 | μs |
Self Device Time | 118610.46 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 118610.46 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 118610.46 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |