import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor,
conv_weights: nn.ParameterList,
conv_biases: nn.ParameterList,
fc_weights: nn.ParameterList,
fc_biases: nn.ParameterList,
is_training: bool,
) -> torch.Tensor:
"""
Implements the VGG16 module.
Args:
x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
conv_weights (nn.ParameterList): List of convolutional weights
conv_biases (nn.ParameterList): List of convolutional biases
fc_weights (nn.ParameterList): List of fully connected weights
fc_biases (nn.ParameterList): List of fully connected biases
is_training (bool): Whether in training mode
Returns:
torch.Tensor: Output tensor, shape (batch_size, num_classes)
"""
# Block 1
x = F.conv2d(x, conv_weights[0], conv_biases[0], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[1], conv_biases[1], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 2
x = F.conv2d(x, conv_weights[2], conv_biases[2], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[3], conv_biases[3], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 3
x = F.conv2d(x, conv_weights[4], conv_biases[4], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[5], conv_biases[5], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[6], conv_biases[6], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 4
x = F.conv2d(x, conv_weights[7], conv_biases[7], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[8], conv_biases[8], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[9], conv_biases[9], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 5
x = F.conv2d(x, conv_weights[10], conv_biases[10], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[11], conv_biases[11], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[12], conv_biases[12], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Classifier
x = torch.flatten(x, 1)
x = F.linear(x, fc_weights[0], fc_biases[0])
x = F.relu(x)
x = F.dropout(x, p=0.0, training=is_training)
x = F.linear(x, fc_weights[1], fc_biases[1])
x = F.relu(x)
x = F.dropout(x, p=0.0, training=is_training)
x = F.linear(x, fc_weights[2], fc_biases[2])
return x
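# Sanity sketch (hypothetical, not part of the harness): the padding=1,
# stride=1 convolutions preserve spatial size, so only the five 2x2/stride-2
# pools shrink the map. 224 halves five times down to 7, which is where the
# 512 * 7 * 7 input size of the first linear layer comes from.
_size = 224
for _ in range(5):  # one max_pool2d(kernel_size=2, stride=2) per block
    _size //= 2
assert _size == 7 and 512 * _size * _size == 25088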
class Model(nn.Module):
def __init__(self, num_classes=1000):
super(Model, self).__init__()
# Extract convolutional parameters
self.conv_weights = nn.ParameterList()
self.conv_biases = nn.ParameterList()
# Block 1
conv = nn.Conv2d(3, 64, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(64, 64, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 2
conv = nn.Conv2d(64, 128, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(128, 128, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 3
conv = nn.Conv2d(128, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 4
conv = nn.Conv2d(256, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 5
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Extract fully connected parameters
self.fc_weights = nn.ParameterList()
self.fc_biases = nn.ParameterList()
fc = nn.Linear(512 * 7 * 7, 4096)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
fc = nn.Linear(4096, 4096)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
fc = nn.Linear(4096, num_classes)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
def forward(self, x, fn=module_fn):
return fn(
x,
self.conv_weights,
self.conv_biases,
self.fc_weights,
self.fc_biases,
self.training,
)
# Test code
batch_size = 10
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, 224, 224)]
def get_init_inputs():
return [num_classes]
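A quick smoke test of the harness above needs only the two helper functions; a minimal sketch (CPU, shape check only):

# Hypothetical smoke test for the functional model above.
model = Model(*get_init_inputs())
model.eval()
with torch.no_grad():
    out = model(*get_inputs())
assert out.shape == (batch_size, num_classes)  # torch.Size([10, 1000])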
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self, num_classes=1000):
"""
Initialize the VGG16 model.
:param num_classes: The number of output classes (default is 1000 for ImageNet)
"""
super(Model, self).__init__()
# VGG16 architecture: 5 blocks of convolutional layers followed by max pooling
self.features = nn.Sequential(
# Block 1
nn.Conv2d(3, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(64, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 2
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(128, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 3
nn.Conv2d(128, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 4
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 5
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2)
)
# Fully connected layers
self.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(inplace=True),
nn.Dropout(p=0.0),
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Dropout(p=0.0),
nn.Linear(4096, num_classes)
)
def forward(self, x):
"""
Forward pass of the VGG16 model.
:param x: The input tensor, shape (batch_size, 3, 224, 224)
:return: The output tensor, shape (batch_size, num_classes)
"""
x = self.features(x)
x = torch.flatten(x, 1)
x = self.classifier(x)
return x
# Test code
batch_size = 10
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, 224, 224)]
def get_init_inputs():
return [num_classes]
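Since the ParameterList version and this nn.Sequential version describe the same network, one way to confirm they agree numerically is to feed the Sequential model's weights through module_fn. A hedged sketch, assuming both listings above are importable side by side (here Model refers to the Sequential version):

# Hypothetical equivalence check between the two listings above.
ref = Model(num_classes)
convs = [m for m in ref.features if isinstance(m, nn.Conv2d)]
fcs = [m for m in ref.classifier if isinstance(m, nn.Linear)]
x = torch.randn(2, 3, 224, 224)
with torch.no_grad():
    y_ref = ref(x)
    y_fn = module_fn(
        x,
        [c.weight for c in convs], [c.bias for c in convs],
        [f.weight for f in fcs], [f.bias for f in fcs],
        is_training=False,
    )
assert torch.allclose(y_ref, y_fn, atol=1e-5)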
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
// This kernel is specialized for VGG16's first convolution layer, where the input has 3 channels
// and the filter is 3x3. All loops have constant trip counts, so they are fully unrolled
// (via #pragma unroll) to reduce loop overhead.
__global__ void conv2d_first_layer_kernel(
const float* __restrict__ input, // shape: (N, 3, H, W)
const float* __restrict__ weight, // shape: (K, 3, 3, 3) -> K x 27
const float* __restrict__ bias, // shape: (K)
float* __restrict__ output, // shape: (N, K, H, W)
int N, int H, int W, int K
) {
// Compute spatial coordinates
int w = blockIdx.x * blockDim.x + threadIdx.x;
int h = blockIdx.y * blockDim.y + threadIdx.y;
// Combine batch and output channel in blockIdx.z
int nk = blockIdx.z; // n * K + k
int n = nk / K;
int k = nk % K;
if (h >= H || w >= W) return;
// Padding = 1 assumed
float sum = bias[k];
    // Accumulate the full 3x3 stencil for each of the 3 input channels.
    // (The original unrolled version summed only the top kernel row, using
    // 9 of the 27 weights.) Every loop has a constant trip count, so
    // #pragma unroll flattens the computation into straight-line code.
    int in_base = n * 3 * H * W;  // starting offset for input of batch n
    int weight_base = k * 27;     // each filter has 3*3*3 = 27 elements
#pragma unroll
    for (int c = 0; c < 3; ++c) {
        int offset = in_base + c * H * W;
        int wbase = weight_base + c * 9;  // 9 weights per input channel
#pragma unroll
        for (int i = 0; i < 3; ++i) {
            int r = h - 1 + i;  // input row under the kernel (padding = 1)
#pragma unroll
            for (int j = 0; j < 3; ++j) {
                int cidx = w - 1 + j;  // input column under the kernel
                float v = (r >= 0 && r < H && cidx >= 0 && cidx < W)
                              ? input[offset + r * W + cidx]
                              : 0.0f;
                sum += v * weight[wbase + i * 3 + j];
            }
        }
    }
output[n * K * H * W + k * H * W + h * W + w] = sum;
}
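// For the benchmark shape (N=10, K=64, H=W=224) and the 16x16 blocks chosen
// below, the launch works out to grid = (ceil(224/16), ceil(224/16), 10*64)
// = (14, 14, 640), i.e. 125,440 blocks of 256 threads each.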
// Host function to launch the specialized first-layer convolution kernel
torch::Tensor custom_conv2d_first_layer(torch::Tensor input, torch::Tensor weight, torch::Tensor bias) {
    // The raw-pointer kernel assumes contiguous float32 CUDA tensors with
    // 3 input channels and a 3x3 filter; validate and normalize up front.
    TORCH_CHECK(input.is_cuda() && weight.is_cuda() && bias.is_cuda(), "expected CUDA tensors");
    TORCH_CHECK(input.scalar_type() == torch::kFloat32, "kernel only supports float32");
    TORCH_CHECK(input.size(1) == 3 && weight.size(1) == 3 && weight.size(2) == 3 && weight.size(3) == 3,
                "kernel is specialized for 3 input channels and a 3x3 filter");
    input = input.contiguous();
    weight = weight.contiguous();
    bias = bias.contiguous();
    int N = input.size(0);
    int H = input.size(2);
    int W = input.size(3);
    int K = weight.size(0); // output channels
    auto output = torch::empty({N, K, H, W}, input.options());
dim3 block(16, 16);
dim3 grid((W + block.x - 1) / block.x, (H + block.y - 1) / block.y, N * K);
conv2d_first_layer_kernel<<<grid, block>>>(
input.data_ptr<float>(),
weight.data_ptr<float>(),
bias.data_ptr<float>(),
output.data_ptr<float>(),
N, H, W, K
);
return output;
}
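// NOTE (hypothetical hardening, not in the original): the launcher above does
// not surface launch failures; a debug build could follow the launch with
//   cudaError_t err = cudaGetLastError();
//   TORCH_CHECK(err == cudaSuccess, cudaGetErrorString(err));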
// VGG16 forward pass using the specialized unrolled conv kernel for the first layer
// and standard torch calls for the remaining layers.
torch::Tensor vgg16_forward_cuda(
torch::Tensor x,
std::vector<torch::Tensor> conv_weights,
std::vector<torch::Tensor> conv_biases,
std::vector<torch::Tensor> fc_weights,
std::vector<torch::Tensor> fc_biases,
bool is_training
) {
auto current = x;
// Block 1 - use the custom unrolled convolution for the first layer (assumes input channels == 3 and kernel size == 3, padding=1)
current = custom_conv2d_first_layer(current, conv_weights[0], conv_biases[0]);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[1], conv_biases[1], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Block 2
current = torch::conv2d(current, conv_weights[2], conv_biases[2], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[3], conv_biases[3], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Block 3
current = torch::conv2d(current, conv_weights[4], conv_biases[4], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[5], conv_biases[5], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[6], conv_biases[6], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Block 4
current = torch::conv2d(current, conv_weights[7], conv_biases[7], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[8], conv_biases[8], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[9], conv_biases[9], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Block 5
current = torch::conv2d(current, conv_weights[10], conv_biases[10], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[11], conv_biases[11], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[12], conv_biases[12], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Classifier
current = current.flatten(1);
current = torch::linear(current, fc_weights[0], fc_biases[0]);
current = torch::relu(current);
if (is_training) {
current = torch::dropout(current, /*p=*/0.0, /*train=*/true);
}
current = torch::linear(current, fc_weights[1], fc_biases[1]);
current = torch::relu(current);
if (is_training) {
current = torch::dropout(current, /*p=*/0.0, /*train=*/true);
}
current = torch::linear(current, fc_weights[2], fc_biases[2]);
return current;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &vgg16_forward_cuda, "VGG16 forward (CUDA)");
}
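One way to exercise the extension end to end is an inline build via torch.utils.cpp_extension.load; a hedged sketch (the source file name vgg16_cuda.cu is an assumption, and Model/module_fn refer to the ParameterList-based Python listing):

import torch
from torch.utils.cpp_extension import load

ext = load(name="vgg16_cuda", sources=["vgg16_cuda.cu"], verbose=True)  # assumed file name

model = Model(num_classes).cuda()
x = torch.randn(batch_size, 3, 224, 224, device="cuda")
with torch.no_grad():
    y_ext = ext.forward(
        x,
        list(model.conv_weights), list(model.conv_biases),
        list(model.fc_weights), list(model.fc_biases),
        False,
    )
    y_ref = module_fn(
        x,
        model.conv_weights, model.conv_biases,
        model.fc_weights, model.fc_biases,
        False,
    )
print(torch.allclose(y_ext, y_ref, atol=1e-3))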
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 2.940 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 2.902 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 73.554 | % | 0.000 | 5 |
Issued Ipc Active | 2.940 | inst/cycle | 0.000 | 5 |
SM Busy | 73.554 | % | 0.000 | 5 |
Memory Throughput | 613251473706.144 | byte/second | 1676306043945586688.000 | 5 |
Mem Busy | 97.394 | % | 0.003 | 5 |
Max Bandwidth | 64.868 | % | 0.001 | 5 |
L1/TEX Hit Rate | 61.892 | % | 0.001 | 5 |
L2 Hit Rate | 98.870 | % | 0.011 | 5 |
Mem Pipes Busy | 57.036 | % | 0.001 | 5 |
Warp Cycles Per Issued Instruction | 18.664 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 18.668 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 30.270 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 86.572 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 55.406 | warp | 0.000 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (46.0%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (86.6%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
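The occupancy figures above are internally consistent: achieved occupancy is simply achieved active warps per SM over the theoretical maximum. A quick check of the arithmetic, with values copied from the metric table:

# Values taken from the metric table above.
theoretical_warps = 64.0
achieved_warps = 55.406
print(achieved_warps / theoretical_warps * 100)  # ~86.57 %, matching "Achieved Occupancy"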
Operation / Metric | Value | Unit |
---|---|---|
cudaLaunchKernel | ||
CPU Time | 4222923.65 | μs |
Device Time | 7840.01 | μs |
Self CPU Time | 4222923.65 | μs |
Self Device Time | 7840.01 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::conv2d | ||
CPU Time | 4040363.66 | μs |
Device Time | 3533389.73 | μs |
Self CPU Time | 35324.16 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::convolution | ||
CPU Time | 4005039.50 | μs |
Device Time | 3533389.73 | μs |
Self CPU Time | 45853.29 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_convolution | ||
CPU Time | 3959186.22 | μs |
Device Time | 3533389.73 | μs |
Self CPU Time | 92868.38 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::cudnn_convolution | ||
CPU Time | 3214941.50 | μs |
Device Time | 2878096.49 | μs |
Self CPU Time | 451050.90 | μs |
Self Device Time | 2878096.49 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::linear | ||
CPU Time | 596192.30 | μs |
Device Time | 895083.74 | μs |
Self CPU Time | 10550.92 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |