16_Matmul_with_transposed_A
• tiled_shared_const_memory_base
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """
    Performs a single matrix multiplication with transposed A (C = A.T * B).

    Args:
        A: Input tensor of shape (K, M).
        B: Input tensor of shape (K, N).

    Returns:
        Output tensor of shape (M, N).
    """
    return torch.matmul(A.T, B)


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication with transposed A (C = A.T * B).
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor, fn=module_fn) -> torch.Tensor:
        return fn(A, B)


M = 1024
K = 4096
N = 2048


def get_inputs():
    A = torch.randn(K, M)
    B = torch.randn(K, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
import torch
import torch.nn as nn


class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication with transposed A (C = A.T * B).
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
        """
        Performs matrix multiplication with transposed A.

        Args:
            A: Input tensor of shape (K, M).
            B: Input tensor of shape (K, N).

        Returns:
            Output tensor of shape (M, N).
        """
        return torch.matmul(A.T, B)


M = 1024
K = 4096
N = 2048


def get_inputs():
    A = torch.randn(K, M)
    B = torch.randn(K, N)
    return [A, B]


def get_init_inputs():
    return []  # No special initialization inputs needed
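
The two reference scripts above define the same operator; with the benchmark sizes, A is (K, M) = (4096, 1024), B is (K, N) = (4096, 2048), and C = A.T @ B is (M, N) = (1024, 2048). A minimal shape-check sketch (illustration only, not part of the benchmark files):

# Illustrative sketch: confirm the shapes produced by the reference operator.
import torch

A = torch.randn(4096, 1024)      # (K, M)
B = torch.randn(4096, 2048)      # (K, N)
C = torch.matmul(A.T, B)         # C = A.T @ B
assert C.shape == (1024, 2048)   # (M, N)
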
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdexcept>

#define TILE_M 16
#define TILE_N 16
#define BLOCK_K 32

// Matrix dimensions held in constant memory (set from the host before launch).
__constant__ int cK, cM, cN;

// Computes C = A.T * B, where A is (K, M), B is (K, N), and C is (M, N), all
// row-major. Each block produces a TILE_M x TILE_N tile of C, marching over K
// in BLOCK_K chunks that are staged through shared memory.
__global__ void tiledSharedConstKernel(const float* __restrict__ A,
                                       const float* __restrict__ B,
                                       float* __restrict__ C) {
    int row = blockIdx.x * TILE_M + threadIdx.x;   // row of C (column of A)
    int col = blockIdx.y * TILE_N + threadIdx.y;   // column of C (column of B)
    float sum = 0.0f;

    __shared__ float As[BLOCK_K][TILE_M];
    __shared__ float Bs[BLOCK_K][TILE_N];

    int tid = threadIdx.y * TILE_M + threadIdx.x;
    int totalThreads = TILE_M * TILE_N;

    for (int k0 = 0; k0 < cK; k0 += BLOCK_K) {
        // Cooperatively load a BLOCK_K x TILE_M slab of A (stored as K x M).
        for (int index = tid; index < BLOCK_K * TILE_M; index += totalThreads) {
            int t = index / TILE_M;
            int i = index % TILE_M;
            int global_i = blockIdx.x * TILE_M + i;
            int global_k = k0 + t;
            As[t][i] = (global_i < cM && global_k < cK) ? A[global_k * cM + global_i] : 0.0f;
        }
        // Cooperatively load a BLOCK_K x TILE_N slab of B (stored as K x N).
        for (int index = tid; index < BLOCK_K * TILE_N; index += totalThreads) {
            int t = index / TILE_N;
            int j = index % TILE_N;
            int global_j = blockIdx.y * TILE_N + j;
            int global_k = k0 + t;
            Bs[t][j] = (global_j < cN && global_k < cK) ? B[global_k * cN + global_j] : 0.0f;
        }
        __syncthreads();

        // Accumulate the partial dot product for this K-chunk, unrolled by 4
        // (BLOCK_K is a multiple of 4).
        #pragma unroll
        for (int t = 0; t < BLOCK_K; t += 4) {
            sum += As[t][threadIdx.x] * Bs[t][threadIdx.y]
                 + As[t+1][threadIdx.x] * Bs[t+1][threadIdx.y]
                 + As[t+2][threadIdx.x] * Bs[t+2][threadIdx.y]
                 + As[t+3][threadIdx.x] * Bs[t+3][threadIdx.y];
        }
        __syncthreads();
    }

    if (row < cM && col < cN) {
        C[row * cN + col] = sum;
    }
}

torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
    TORCH_CHECK(A.is_cuda(), "A must be a CUDA tensor");
    TORCH_CHECK(B.is_cuda(), "B must be a CUDA tensor");
    TORCH_CHECK(A.dtype() == torch::kFloat32, "A must be float32");
    TORCH_CHECK(B.dtype() == torch::kFloat32, "B must be float32");

    int K = A.size(0);
    int M = A.size(1);
    TORCH_CHECK(B.size(0) == K, "A and B must have the same K dimension");
    int N = B.size(1);

    // Push the dimensions into constant memory before launching the kernel.
    cudaMemcpyToSymbol(cK, &K, sizeof(int));
    cudaMemcpyToSymbol(cM, &M, sizeof(int));
    cudaMemcpyToSymbol(cN, &N, sizeof(int));

    auto C = torch::zeros({M, N}, A.options());

    dim3 block(TILE_M, TILE_N);
    dim3 grid((M + TILE_M - 1) / TILE_M, (N + TILE_N - 1) / TILE_N);

    const float* A_ptr = A.data_ptr<float>();
    const float* B_ptr = B.data_ptr<float>();
    float* C_ptr = C.data_ptr<float>();

    tiledSharedConstKernel<<<grid, block>>>(A_ptr, B_ptr, C_ptr);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(err));
    }
    return C;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "C = A.T @ B with constant memory optimization");
}
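
For reference, one way to build and sanity-check the extension above against the PyTorch baseline is sketched below. This is a hedged sketch: the file name tiled_shared_const_memory.cu and the extension name are assumptions for illustration, not part of the original sources, and a CUDA device is required.

# Hedged usage sketch; file and module names are assumed, not from the original sources.
import torch
from torch.utils.cpp_extension import load

ext = load(name="matmul_ta_const", sources=["tiled_shared_const_memory.cu"])

A = torch.randn(4096, 1024, device="cuda")  # (K, M)
B = torch.randn(4096, 2048, device="cuda")  # (K, N)

C_kernel = ext.forward(A, B)
C_ref = torch.matmul(A.T, B)
# Exact equality is not expected: the kernel accumulates in float32, so small
# differences from torch.matmul are normal.
print(torch.allclose(C_kernel, C_ref, atol=1e-2))
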
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 2.290 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 2.270 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 57.276 | % | 0.000 | 5 |
Issued Ipc Active | 2.290 | inst/cycle | 0.000 | 5 |
SM Busy | 57.276 | % | 0.000 | 5 |
Memory Throughput | 16441767634.484 | byte/second | 2680067297416843.000 | 5 |
Mem Busy | 87.786 | % | 0.001 | 5 |
Max Bandwidth | 80.950 | % | 0.001 | 5 |
L1/TEX Hit Rate | 1.062 | % | 0.005 | 5 |
L2 Hit Rate | 96.726 | % | 0.267 | 5 |
Mem Pipes Busy | 80.950 | % | 0.001 | 5 |
Warp Cycles Per Issued Instruction | 20.274 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 20.274 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 30.450 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 6.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 12.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 48.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 75.000 | % | 0.000 | 5 |
Achieved Occupancy | 72.606 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 46.468 | warp | 0.000 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
WRN Occupancy | This kernel's theoretical occupancy (75.0%) is limited by the number of required registers. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
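
The 75.0% theoretical occupancy in the warning above is consistent with the launch configuration and the block limits reported in the metrics table; the small sketch below reproduces the arithmetic, assuming a maximum of 64 resident warps per SM (typical of recent NVIDIA GPUs, but not stated in the report).

# Sketch of the occupancy arithmetic; the 64-warp-per-SM limit is an assumption.
warps_per_block = (16 * 16) // 32            # TILE_M * TILE_N threads per block -> 8 warps
block_limit_registers = 6                    # "Block Limit Registers" row above
theoretical_warps = block_limit_registers * warps_per_block  # 48, matching the table
print(100.0 * theoretical_warps / 64)        # 75.0 -> "Theoretical Occupancy"
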
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 458690.18 | μs |
Device Time | 5278.17 | μs |
Self CPU Time | 39.60 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 458650.58 | μs |
Device Time | 5278.17 | μs |
Self CPU Time | 102.24 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::empty_strided | ||
CPU Time | 452878.26 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 92.29 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceGetStreamPriorityRange | ||
CPU Time | 452356.08 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 452356.08 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
Memcpy HtoD (Pageable -> Device) | ||
CPU Time | 0.00 | μs |
Device Time | 20622.75 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 20622.75 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaMemcpyToSymbol | ||
CPU Time | 9823170.30 | μs |
Device Time | 12851.57 | μs |
Self CPU Time | 9823170.30 | μs |
Self Device Time | 12851.57 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 59273.40 | μs |
Device Time | 261248.72 | μs |
Self CPU Time | 11027.85 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 48246.95 | μs |
Device Time | 261248.72 | μs |
Self CPU Time | 15749.01 | μs |
Self Device Time | 261248.72 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
tiledSharedConstKernel(float const*, float const*, float*) | ||
CPU Time | 0.00 | μs |
Device Time | 9585862.11 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 9585862.11 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 245788.38 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 245788.38 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |