98_Matmul_AvgPool_GELU_Scale_Max
• shared_memory_optimized_base_base
import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
    x: torch.Tensor,
    pool_kernel_size: int,
    scale_factor: float,
    weight: torch.Tensor,
    bias: torch.Tensor,
) -> torch.Tensor:
    """
    Implements the Matmul_AvgPool_GELU_Scale_Max pattern using functional operations.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_features)
        pool_kernel_size (int): Kernel size for average pooling
        scale_factor (float): Scale factor to multiply features by
        weight (torch.Tensor): Weight matrix of shape (out_features, in_features) for the linear layer
        bias (torch.Tensor): Bias vector of shape (out_features,) for the linear layer

    Returns:
        torch.Tensor: Output tensor of shape (batch_size,)
    """
    x = F.linear(x, weight, bias)
    x = F.avg_pool1d(x.unsqueeze(1), kernel_size=pool_kernel_size).squeeze(1)
    x = F.gelu(x)
    x = x * scale_factor
    x = torch.max(x, dim=1).values
    return x
class Model(nn.Module):
    """
    A model implementing the pattern "Matmul_AvgPool_GELU_Scale_Max".
    """

    def __init__(self, in_features, out_features, pool_kernel_size, scale_factor):
        super(Model, self).__init__()
        gemm = nn.Linear(in_features, out_features)
        self.weight = gemm.weight
        self.bias = gemm.bias

    def forward(self, x, pool_kernel_size, scale_factor, fn=module_fn):
        return fn(x, pool_kernel_size, scale_factor, self.weight, self.bias)
batch_size = 128
in_features = 512
out_features = 256
pool_kernel_size = 4
scale_factor = 2.0
def get_inputs():
    return [torch.randn(batch_size, in_features), pool_kernel_size, scale_factor]


def get_init_inputs():
    return [in_features, out_features, pool_kernel_size, scale_factor]
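The functional reference can be sanity-checked directly with the helpers above. The snippet below is a minimal sketch (CPU is fine) that borrows the linear parameters from Model and confirms the pipeline collapses each row to a single scalar, i.e. an output of shape (batch_size,):

model = Model(*get_init_inputs())          # provides weight and bias via nn.Linear
x, k, s = get_inputs()                     # x: (128, 512), k = 4, s = 2.0
out = module_fn(x, k, s, model.weight, model.bias)
print(out.shape)                           # torch.Size([128]) -> one scalar per batch row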
import torch
import torch.nn as nn
class Model(nn.Module):
    """
    A model implementing the pattern "Matmul_AvgPool_GELU_Scale_Max".
    """

    def __init__(self, in_features, out_features, pool_kernel_size, scale_factor):
        super(Model, self).__init__()
        self.matmul = nn.Linear(in_features, out_features)
        self.avg_pool = nn.AvgPool1d(kernel_size=pool_kernel_size)
        self.scale_factor = scale_factor

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_features).

        Returns:
            torch.Tensor: Output tensor of shape (batch_size,).
        """
        x = self.matmul(x)
        x = self.avg_pool(x.unsqueeze(1)).squeeze(1)
        x = torch.nn.functional.gelu(x)
        x = x * self.scale_factor
        x = torch.max(x, dim=1).values
        return x
batch_size = 128
in_features = 512
out_features = 256
pool_kernel_size = 4
scale_factor = 2.0
def get_inputs():
    return [torch.randn(batch_size, in_features)]


def get_init_inputs():
    return [in_features, out_features, pool_kernel_size, scale_factor]
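Both listings describe the same computation, so they should agree numerically when driven with the same parameters. The following is a small hedged sketch, assuming module_fn from the functional listing is also in scope; it feeds this Model's nn.Linear parameters into module_fn and compares the two outputs:

torch.manual_seed(0)
ref = Model(in_features, out_features, pool_kernel_size, scale_factor)
x = torch.randn(batch_size, in_features)

out_module = ref(x)  # nn.Module path: Linear -> AvgPool1d -> GELU -> scale -> max
out_functional = module_fn(
    x, pool_kernel_size, scale_factor, ref.matmul.weight, ref.matmul.bias
)
print(torch.allclose(out_module, out_functional))  # expected: True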
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#include <float.h>
#ifndef TILE_SIZE
#define TILE_SIZE 16
#endif
// Tiled GEMM with fused bias add: computes C = A * B^T + bias, where A is (M, K),
// B is the (N, K) nn.Linear weight stored row-major, and C is (M, N).
__global__ void FusedMatMulBiasKernel(const float* __restrict__ A,
                                      const float* __restrict__ B,
                                      const float* __restrict__ bias,
                                      float* __restrict__ C,
                                      int M, int N, int K) {
    __shared__ float Asub[TILE_SIZE][TILE_SIZE];
    __shared__ float Bsub[TILE_SIZE][TILE_SIZE];
    __shared__ float shared_bias[TILE_SIZE];  // Cache bias values for this output tile

    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;

    // Preload bias into shared memory (one value per output column of the tile)
    if (threadIdx.y == 0 && col < N) {
        shared_bias[threadIdx.x] = bias[col];
    }
    __syncthreads();

    float sum = 0.0f;
    for (int t = 0; t < (K + TILE_SIZE - 1) / TILE_SIZE; t++) {
        int tiled_k = t * TILE_SIZE;

        // Coalesced loading of A and B tiles, zero-padding out-of-range elements
        if (row < M && (tiled_k + threadIdx.x) < K)
            Asub[threadIdx.y][threadIdx.x] = A[row * K + tiled_k + threadIdx.x];
        else
            Asub[threadIdx.y][threadIdx.x] = 0.0f;

        if (col < N && (tiled_k + threadIdx.y) < K)
            Bsub[threadIdx.y][threadIdx.x] = B[col * K + tiled_k + threadIdx.y];
        else
            Bsub[threadIdx.y][threadIdx.x] = 0.0f;

        __syncthreads();

        #pragma unroll
        for (int i = 0; i < TILE_SIZE; i++) {
            sum += Asub[threadIdx.y][i] * Bsub[i][threadIdx.x];
        }
        __syncthreads();
    }

    if (row < M && col < N) {
        C[row * N + col] = sum + shared_bias[threadIdx.x];
    }
}
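Functionally, this kernel reproduces the F.linear call in the reference: indexing B as B[col * K + k] is the transposed-weight product. A small PyTorch sketch (my restatement, not part of the extension) of the same index mapping:

import torch

M, K, N = 128, 512, 256
A = torch.randn(M, K)
B = torch.randn(N, K)          # nn.Linear weight layout: (out_features, in_features)
bias = torch.randn(N)

# C[row, col] = sum_k A[row, k] * B[col, k] + bias[col], as in the kernel above
C = A @ B.t() + bias
print(torch.allclose(C, torch.nn.functional.linear(A, B, bias)))  # True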
// One thread block per row of linear_output: fuses average pooling, exact (erf-based)
// GELU, scaling, and the row-wise max reduction into a single pass.
__global__ void FusedPoolActMaxKernel(const float* __restrict__ linear_output,
                                      float* __restrict__ output,
                                      int M, int N,
                                      int pool_kernel_size,
                                      int output_length,
                                      float scale_factor) {
    extern __shared__ float shared_mem[];
    float* pool_buffer = shared_mem;                     // Per-thread staging slot for pooling loads
    float* reduction_buffer = &shared_mem[blockDim.x];   // Used for the max reduction

    int row = blockIdx.x;
    int tid = threadIdx.x;
    float local_max = -FLT_MAX;

    // Process multiple pooling windows per thread
    for (int bin = tid; bin < output_length; bin += blockDim.x) {
        int start = bin * pool_kernel_size;
        float sum = 0.0f;
        int count = 0;

        // Accumulate the pooling window, staging each element through this thread's
        // shared-memory slot before adding it to the running sum
        #pragma unroll
        for (int j = 0; j < pool_kernel_size; j++) {
            int col = start + j;
            if (col < N) {
                pool_buffer[tid] = linear_output[row * N + col];
                sum += pool_buffer[tid];
                count++;
            }
        }

        // Compute average, GELU (0.70710678f = 1/sqrt(2)), and scaling in registers
        float avg = (count > 0) ? sum / count : 0.0f;
        float gelu = 0.5f * avg * (1.0f + erff(avg * 0.70710678f));
        gelu *= scale_factor;
        local_max = fmaxf(local_max, gelu);
    }

    // Store the per-thread max in the reduction buffer
    reduction_buffer[tid] = local_max;
    __syncthreads();

    // Parallel max reduction in shared memory
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) {
            reduction_buffer[tid] = fmaxf(reduction_buffer[tid], reduction_buffer[tid + s]);
        }
        __syncthreads();
    }

    // Warp-level reduction (no explicit sync needed within a warp)
    if (tid < 32) {
        volatile float* smem = reduction_buffer;
        if (blockDim.x >= 64) smem[tid] = fmaxf(smem[tid], smem[tid + 32]);
        if (blockDim.x >= 32) smem[tid] = fmaxf(smem[tid], smem[tid + 16]);
        if (blockDim.x >= 16) smem[tid] = fmaxf(smem[tid], smem[tid + 8]);
        if (blockDim.x >= 8)  smem[tid] = fmaxf(smem[tid], smem[tid + 4]);
        if (blockDim.x >= 4)  smem[tid] = fmaxf(smem[tid], smem[tid + 2]);
        if (blockDim.x >= 2)  smem[tid] = fmaxf(smem[tid], smem[tid + 1]);
    }

    if (tid == 0) {
        output[row] = reduction_buffer[0];
    }
}
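Per row, this kernel computes max over GELU(avg-pooled window) * scale. One detail worth flagging: the host code below sizes output_length with ceil division, so a trailing partial window would be averaged over its actual element count, whereas F.avg_pool1d in the reference drops a partial window entirely; with N = 256 and pool_kernel_size = 4 the two agree because N divides evenly. A short PyTorch sketch of the same per-row semantics (my restatement, not part of the extension):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
M, N, k, scale = 128, 256, 4, 2.0
linear_output = torch.randn(M, N)

# Reference semantics: avg-pool each row in windows of k, GELU, scale, then max per row
pooled = F.avg_pool1d(linear_output.unsqueeze(1), kernel_size=k).squeeze(1)
out = (F.gelu(pooled) * scale).max(dim=1).values
print(out.shape)  # torch.Size([128]) -- one scalar per row, as in the kernel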
torch::Tensor forward(
    torch::Tensor x,
    int pool_kernel_size,
    float scale_factor,
    torch::Tensor weight,
    torch::Tensor bias) {
    TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
    TORCH_CHECK(weight.is_cuda(), "weight must be a CUDA tensor");
    TORCH_CHECK(bias.is_cuda(), "bias must be a CUDA tensor");

    x = x.contiguous();
    weight = weight.contiguous();
    bias = bias.contiguous();

    int M = x.size(0);       // batch size
    int K = x.size(1);       // in_features
    int N = weight.size(0);  // out_features

    auto options = torch::TensorOptions().dtype(x.dtype()).device(x.device());
    auto linear_output = torch::empty({M, N}, options);

    // Launch the fused GEMM + bias kernel
    dim3 blockDim(TILE_SIZE, TILE_SIZE);
    dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
    FusedMatMulBiasKernel<<<gridDim, blockDim>>>(
        x.data_ptr<float>(),
        weight.data_ptr<float>(),
        bias.data_ptr<float>(),
        linear_output.data_ptr<float>(),
        M, N, K);

    // Launch the fused pooling + GELU + scale + max kernel (one block per row)
    int output_length = (N + pool_kernel_size - 1) / pool_kernel_size;
    auto output = torch::empty({M}, options);
    int threads = 256;
    // Shared memory size = space for the pooling buffer + the reduction buffer
    size_t sharedMemSize = threads * sizeof(float) * 2;
    FusedPoolActMaxKernel<<<M, threads, sharedMemSize>>>(
        linear_output.data_ptr<float>(),
        output.data_ptr<float>(),
        M, N, pool_kernel_size, output_length, scale_factor);

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Shared Memory Optimized Forward");
}
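To exercise the extension end to end, one option is to JIT-compile it with torch.utils.cpp_extension.load and compare against the PyTorch reference. The sketch below is illustrative only; the file name fused_kernel.cu is an assumed placeholder for the CUDA source above, and the tolerances are loose to allow for float32 accumulation differences:

import torch
from torch.utils.cpp_extension import load

# Assumes the CUDA/C++ listing above is saved as fused_kernel.cu (placeholder name)
ext = load(name="fused_matmul_pool_gelu_max", sources=["fused_kernel.cu"], verbose=True)

torch.manual_seed(0)
weight = torch.randn(256, 512, device="cuda")
bias = torch.randn(256, device="cuda")
x = torch.randn(128, 512, device="cuda")

# PyTorch reference: Linear -> AvgPool1d -> GELU -> scale -> max
ref = torch.nn.functional.linear(x, weight, bias)
ref = torch.nn.functional.avg_pool1d(ref.unsqueeze(1), kernel_size=4).squeeze(1)
ref = (torch.nn.functional.gelu(ref) * 2.0).max(dim=1).values

out = ext.forward(x, 4, 2.0, weight, bias)
print(torch.allclose(out, ref, atol=1e-4, rtol=1e-4))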
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 0.278 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.104 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 7.394 | % | 0.020 | 5 |
Issued Ipc Active | 0.296 | inst/cycle | 0.000 | 5 |
SM Busy | 7.394 | % | 0.020 | 5 |
Memory Throughput | 37360605581.682 | byte/second | 2253707248179934464.000 | 5 |
Mem Busy | 8.770 | % | 0.007 | 5 |
Max Bandwidth | 4.602 | % | 0.001 | 5 |
L1/TEX Hit Rate | 74.420 | % | 0.000 | 5 |
L2 Hit Rate | 90.478 | % | 0.332 | 5 |
Mem Pipes Busy | 2.396 | % | 0.000 | 5 |
Warp Cycles Per Issued Instruction | 24.054 | cycle | 1.040 | 5 |
Warp Cycles Per Executed Instruction | 25.568 | cycle | 1.175 | 5 |
Avg. Active Threads Per Warp | 31.760 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 27.690 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 10.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 21.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 11.132 | % | 0.001 | 5 |
Achieved Active Warps Per SM | 7.124 | warp | 0.000 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (11.1%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 334095.51 | μs |
Device Time | 34.66 | μs |
Self CPU Time | 51.68 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 334043.82 | μs |
Device Time | 34.66 | μs |
Self CPU Time | 98.81 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::empty_strided | ||
CPU Time | 333697.34 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 111.15 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceGetStreamPriorityRange | ||
CPU Time | 333393.65 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 333393.65 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 543167.85 | μs |
Device Time | 36270.15 | μs |
Self CPU Time | 543167.85 | μs |
Self Device Time | 36270.15 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
FusedMatMulBiasKernel(float const*, float const*, float const*, float*, int, int, int) | ||
CPU Time | 0.00 | μs |
Device Time | 200507.68 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 200507.68 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 78594.93 | μs |
Device Time | 623737.75 | μs |
Self CPU Time | 14160.69 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 64436.46 | μs |
Device Time | 623737.75 | μs |
Self CPU Time | 18810.54 | μs |
Self Device Time | 623737.75 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 623737.75 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 623737.75 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45289 warnings generated when compiling for host. Suppressed 45325 warnings (45278 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.