# Functional VGG16: parameters live in flat ParameterLists and the forward
# pass is a standalone function (module_fn), so forward() can swap in a custom
# implementation via its fn argument.
import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor,
conv_weights: nn.ParameterList,
conv_biases: nn.ParameterList,
fc_weights: nn.ParameterList,
fc_biases: nn.ParameterList,
is_training: bool,
) -> torch.Tensor:
"""
Implements the VGG16 module.
Args:
x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
conv_weights (nn.ParameterList): List of convolutional weights
conv_biases (nn.ParameterList): List of convolutional biases
fc_weights (nn.ParameterList): List of fully connected weights
fc_biases (nn.ParameterList): List of fully connected biases
is_training (bool): Whether in training mode
Returns:
torch.Tensor: Output tensor, shape (batch_size, num_classes)
"""
# Block 1
x = F.conv2d(x, conv_weights[0], conv_biases[0], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[1], conv_biases[1], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 2
x = F.conv2d(x, conv_weights[2], conv_biases[2], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[3], conv_biases[3], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 3
x = F.conv2d(x, conv_weights[4], conv_biases[4], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[5], conv_biases[5], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[6], conv_biases[6], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 4
x = F.conv2d(x, conv_weights[7], conv_biases[7], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[8], conv_biases[8], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[9], conv_biases[9], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 5
x = F.conv2d(x, conv_weights[10], conv_biases[10], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[11], conv_biases[11], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[12], conv_biases[12], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
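    # After five 2x2 max-pools, a 224x224 input is reduced to 7x7, matching the
    # 512 * 7 * 7 features expected by the first linear layer below.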
# Classifier
x = torch.flatten(x, 1)
x = F.linear(x, fc_weights[0], fc_biases[0])
x = F.relu(x)
    x = F.dropout(x, p=0.0, training=is_training)  # p=0.0 makes dropout an identity; kept for parity with the classifier layout
x = F.linear(x, fc_weights[1], fc_biases[1])
x = F.relu(x)
x = F.dropout(x, p=0.0, training=is_training)
x = F.linear(x, fc_weights[2], fc_biases[2])
return x
class Model(nn.Module):
def __init__(self, num_classes=1000):
super(Model, self).__init__()
# Extract convolutional parameters
self.conv_weights = nn.ParameterList()
self.conv_biases = nn.ParameterList()
# Block 1
conv = nn.Conv2d(3, 64, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(64, 64, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 2
conv = nn.Conv2d(64, 128, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(128, 128, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 3
conv = nn.Conv2d(128, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 4
conv = nn.Conv2d(256, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 5
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Extract fully connected parameters
self.fc_weights = nn.ParameterList()
self.fc_biases = nn.ParameterList()
fc = nn.Linear(512 * 7 * 7, 4096)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
fc = nn.Linear(4096, 4096)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
fc = nn.Linear(4096, num_classes)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
def forward(self, x, fn=module_fn):
return fn(
x,
self.conv_weights,
self.conv_biases,
self.fc_weights,
self.fc_biases,
self.training,
)
# Test code
batch_size = 10
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, 224, 224)]
def get_init_inputs():
return [num_classes]
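# Hedged smoke test (illustrative; not part of the benchmark harness): build
# the functional Model and check the output shape on a random batch.
if __name__ == "__main__":
    _model = Model(*get_init_inputs())
    _model.eval()
    with torch.no_grad():
        _out = _model(*get_inputs())
    assert _out.shape == (batch_size, num_classes)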
# Reference PyTorch implementation of the same VGG16 model (eager
# nn.Sequential baseline).
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self, num_classes=1000):
"""
Initialize the VGG16 model.
:param num_classes: The number of output classes (default is 1000 for ImageNet)
"""
super(Model, self).__init__()
# VGG16 architecture: 5 blocks of convolutional layers followed by max pooling
self.features = nn.Sequential(
# Block 1
nn.Conv2d(3, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(64, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 2
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(128, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 3
nn.Conv2d(128, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 4
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 5
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2)
)
# Fully connected layers
self.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(inplace=True),
            nn.Dropout(p=0.0),  # p=0.0 is a no-op; retained to keep the standard VGG16 classifier layout
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Dropout(p=0.0),
nn.Linear(4096, num_classes)
)
def forward(self, x):
"""
Forward pass of the VGG16 model.
:param x: The input tensor, shape (batch_size, 3, 224, 224)
:return: The output tensor, shape (batch_size, num_classes)
"""
x = self.features(x)
x = torch.flatten(x, 1)
x = self.classifier(x)
return x
# Test code
batch_size = 10
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, 224, 224)]
def get_init_inputs():
return [num_classes]
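# Hedged sketch (not in the original files): the reference model's parameters
# can be collected into flat lists in layer order, which is exactly the layout
# module_fn above consumes. The helper name is illustrative.
def _params_for_module_fn(ref_model):
    conv_ws = [m.weight for m in ref_model.features if isinstance(m, nn.Conv2d)]
    conv_bs = [m.bias for m in ref_model.features if isinstance(m, nn.Conv2d)]
    fc_ws = [m.weight for m in ref_model.classifier if isinstance(m, nn.Linear)]
    fc_bs = [m.bias for m in ref_model.classifier if isinstance(m, nn.Linear)]
    return conv_ws, conv_bs, fc_ws, fc_bs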
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <algorithm>
// Tile dimensions for spatial tiling
#define TILE_W 16
#define TILE_H 16
#define KERNEL_SIZE 3 // Assuming a 3x3 kernel for simplicity
//---------------------------------------------------------------------
// Fused convolution and ReLU kernel
// Each thread computes one output pixel for a given sample and output channel.
// Grid organization:
// grid.x: covers output width in tiles
// grid.y: covers output height in tiles
// grid.z: covers (N * K) where N is the number of samples and K the number of output channels
// Within a block, blockDim.x = TILE_W and blockDim.y = TILE_H.
// The output position is computed as:
// out_x = blockIdx.x * TILE_W + threadIdx.x
// out_y = blockIdx.y * TILE_H + threadIdx.y
// Sample and channel indices are recovered from blockIdx.z as:
// n = blockIdx.z / K
// k = blockIdx.z % K
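//   Example: with N = 10 samples and K = 64 output channels, blockIdx.z spans
//   0..639, and blockIdx.z = 70 decodes to n = 70 / 64 = 1 and k = 70 % 64 = 6.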
//---------------------------------------------------------------------
__global__ void fused_conv2d_relu_kernel(const float* __restrict__ input,
const float* __restrict__ weight,
const float* __restrict__ bias,
float* __restrict__ output,
int N, int C, int H, int W,
int K, int pad, int stride) {
    // Recover the sample and output channel indices from blockIdx.z
int n = blockIdx.z / K;
int k = blockIdx.z % K;
// Compute output spatial coordinates
int out_x = blockIdx.x * TILE_W + threadIdx.x;
int out_y = blockIdx.y * TILE_H + threadIdx.y;
    // Bounds check (output spatial dimensions match the input H x W when stride = 1 and pad = 1)
if (out_x < W && out_y < H) {
// Initialize with bias value for this output channel
float sum = bias[k];
// Perform convolution sum over input channels and kernel window
for (int c = 0; c < C; c++) {
for (int i = 0; i < KERNEL_SIZE; i++) {
for (int j = 0; j < KERNEL_SIZE; j++) {
// Compute input coordinates with stride and padding
int in_y = out_y * stride - pad + i;
int in_x = out_x * stride - pad + j;
// Check if within input bounds
if (in_y >= 0 && in_y < H && in_x >= 0 && in_x < W) {
float in_val = input[n * C * H * W + c * H * W + in_y * W + in_x];
float wt = weight[k * C * KERNEL_SIZE * KERNEL_SIZE + c * KERNEL_SIZE * KERNEL_SIZE + i * KERNEL_SIZE + j];
sum += in_val * wt;
}
}
}
}
// Fused ReLU activation
sum = fmaxf(sum, 0.0f);
// Write the result
output[n * K * H * W + k * H * W + out_y * W + out_x] = sum;
}
}
//---------------------------------------------------------------------
// Custom convolution function that invokes the fused kernel for the first conv layer
//---------------------------------------------------------------------
torch::Tensor custom_conv2d_fused(torch::Tensor input, torch::Tensor weight, torch::Tensor bias, int stride = 1, int pad = 1) {
    // The raw-pointer indexing in the kernel assumes contiguous float32 tensors
    input = input.contiguous();
    weight = weight.contiguous();
    bias = bias.contiguous();
const int N = input.size(0);
const int C = input.size(1);
const int H = input.size(2);
const int W = input.size(3);
const int K = weight.size(0);
// Allocate output tensor. Here we assume output spatial dimensions equal H x W (for stride=1, pad=1)
auto output = torch::empty({N, K, H, W}, input.options());
// Define block dimensions
dim3 block(TILE_W, TILE_H, 1);
// Define grid dimensions
int grid_x = (W + TILE_W - 1) / TILE_W; // number of tiles along width
int grid_y = (H + TILE_H - 1) / TILE_H; // number of tiles along height
// Combine N and K into grid.z so that each block processes one (n, k) pair
dim3 grid(grid_x, grid_y, N * K);
fused_conv2d_relu_kernel<<<grid, block>>>(
input.data_ptr<float>(),
weight.data_ptr<float>(),
bias.data_ptr<float>(),
output.data_ptr<float>(),
N, C, H, W, K, pad, stride
);
return output;
}
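//---------------------------------------------------------------------
// Hedged sanity check (a sketch, not exported through pybind): compares the
// fused kernel against cuDNN conv2d + ReLU on the shapes of the first VGG16
// layer. The tolerances are assumptions, not measured error bounds.
//---------------------------------------------------------------------
static void check_fused_conv_against_cudnn() {
    auto x = torch::randn({2, 3, 224, 224}, torch::kCUDA);
    auto w = torch::randn({64, 3, 3, 3}, torch::kCUDA);
    auto b = torch::randn({64}, torch::kCUDA);
    auto fused = custom_conv2d_fused(x, w, b, /*stride=*/1, /*pad=*/1);
    auto ref = torch::relu(torch::conv2d(x, w, b, /*stride=*/1, /*padding=*/1));
    TORCH_CHECK(torch::allclose(fused, ref, /*rtol=*/1e-4, /*atol=*/1e-4),
                "fused conv2d+relu deviates from the cuDNN reference");
}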
//---------------------------------------------------------------------
// VGG16 forward pass using the fused convolution kernel for Block 1,
// and standard torch::conv2d (cuDNN) for subsequent layers
//---------------------------------------------------------------------
torch::Tensor vgg16_forward_cuda(
torch::Tensor x,
std::vector<torch::Tensor> conv_weights,
std::vector<torch::Tensor> conv_biases,
std::vector<torch::Tensor> fc_weights,
std::vector<torch::Tensor> fc_biases,
bool is_training
) {
auto current = x;
// Block 1 - Use fused custom conv2d + ReLU for the first layer
current = custom_conv2d_fused(current, conv_weights[0], conv_biases[0], /*stride=*/1, /*pad=*/1);
// Second convolution in Block 1 using cuDNN optimized conv2d
current = torch::conv2d(current, conv_weights[1], conv_biases[1], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Block 2
current = torch::conv2d(current, conv_weights[2], conv_biases[2], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[3], conv_biases[3], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Block 3
current = torch::conv2d(current, conv_weights[4], conv_biases[4], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[5], conv_biases[5], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[6], conv_biases[6], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Block 4
current = torch::conv2d(current, conv_weights[7], conv_biases[7], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[8], conv_biases[8], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[9], conv_biases[9], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Block 5
current = torch::conv2d(current, conv_weights[10], conv_biases[10], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[11], conv_biases[11], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[12], conv_biases[12], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Classifier
current = current.flatten(1);
current = torch::linear(current, fc_weights[0], fc_biases[0]);
current = torch::relu(current);
    if (is_training) {
        // p=0.0 mirrors the Python reference: dropout is an identity here and
        // is kept only so the call sequence matches the module definitions above
        current = torch::dropout(current, /*p=*/0.0, /*train=*/true);
    }
current = torch::linear(current, fc_weights[1], fc_biases[1]);
current = torch::relu(current);
if (is_training) {
current = torch::dropout(current, /*p=*/0.0, /*train=*/true);
}
current = torch::linear(current, fc_weights[2], fc_biases[2]);
return current;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &vgg16_forward_cuda, "VGG16 forward (Fused Conv2D + ReLU, CUDA)");
}
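A minimal sketch of driving the extension from Python with
torch.utils.cpp_extension.load; the file name vgg16_fused.cu and the variable
names are assumptions:

    from torch.utils.cpp_extension import load
    ext = load(name="vgg16_fused", sources=["vgg16_fused.cu"])
    model = Model(num_classes).cuda()  # functional Model from the first file
    out = ext.forward(
        get_inputs()[0].cuda(),
        list(model.conv_weights), list(model.conv_biases),
        list(model.fc_weights), list(model.fc_biases),
        model.training,
    )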
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 2.520 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 2.500 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 62.940 | % | 0.000 | 5 |
Issued Ipc Active | 2.520 | inst/cycle | 0.000 | 5 |
SM Busy | 62.940 | % | 0.000 | 5 |
Memory Throughput | 238330086350.680 | byte/second | 38098084166915672.000 | 5 |
Mem Busy | 92.764 | % | 0.001 | 5 |
Max Bandwidth | 66.272 | % | 0.000 | 5 |
L1/TEX Hit Rate | 85.336 | % | 0.000 | 5 |
L2 Hit Rate | 98.898 | % | 0.005 | 5 |
Mem Pipes Busy | 57.864 | % | 0.000 | 5 |
Warp Cycles Per Issued Instruction | 15.828 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 15.830 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 31.050 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 6.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 48.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 75.000 | % | 0.000 | 5 |
Achieved Occupancy | 62.566 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 40.040 | warp | 0.000 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (39.1%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy (75.0%) is limited by the number of required registers. The difference between calculated theoretical (75.0%) and measured achieved occupancy (62.6%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
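One hedged response to the register-limited occupancy flagged above is to cap
registers per thread with __launch_bounds__; the minimum-blocks hint below is
an untuned assumption, and the possible cost (register spills) would need
re-profiling:

    __global__ void __launch_bounds__(TILE_W * TILE_H, 8)
    fused_conv2d_relu_kernel_capped(const float* __restrict__ input,
                                    const float* __restrict__ weight,
                                    const float* __restrict__ bias,
                                    float* __restrict__ output,
                                    int N, int C, int H, int W,
                                    int K, int pad, int stride);
    // body identical to fused_conv2d_relu_kernel above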
Operation / Metric | Value | Unit |
---|---|---|
cudaLaunchKernel | ||
CPU Time | 4558909.69 | μs |
Device Time | 10068.93 | μs |
Self CPU Time | 4558909.69 | μs |
Self Device Time | 10068.93 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::conv2d | ||
CPU Time | 5749554.78 | μs |
Device Time | 4740063.40 | μs |
Self CPU Time | 46186.43 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::convolution | ||
CPU Time | 5703368.35 | μs |
Device Time | 4740063.40 | μs |
Self CPU Time | 61122.04 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_convolution | ||
CPU Time | 5642246.30 | μs |
Device Time | 4740063.40 | μs |
Self CPU Time | 123532.61 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::cudnn_convolution | ||
CPU Time | 3897214.09 | μs |
Device Time | 3862758.16 | μs |
Self CPU Time | 557110.23 | μs |
Self Device Time | 3862758.16 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::linear | ||
CPU Time | 821793.10 | μs |
Device Time | 1158770.12 | μs |
Self CPU Time | 15744.83 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |