import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor,
conv_weights: nn.ParameterList,
conv_biases: nn.ParameterList,
fc_weights: nn.ParameterList,
fc_biases: nn.ParameterList,
is_training: bool,
) -> torch.Tensor:
"""
Implements the VGG16 module.
Args:
x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
conv_weights (nn.ParameterList): List of convolutional weights
conv_biases (nn.ParameterList): List of convolutional biases
fc_weights (nn.ParameterList): List of fully connected weights
fc_biases (nn.ParameterList): List of fully connected biases
is_training (bool): Whether in training mode
Returns:
torch.Tensor: Output tensor, shape (batch_size, num_classes)
"""
# Block 1
x = F.conv2d(x, conv_weights[0], conv_biases[0], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[1], conv_biases[1], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 2
x = F.conv2d(x, conv_weights[2], conv_biases[2], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[3], conv_biases[3], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 3
x = F.conv2d(x, conv_weights[4], conv_biases[4], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[5], conv_biases[5], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[6], conv_biases[6], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 4
x = F.conv2d(x, conv_weights[7], conv_biases[7], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[8], conv_biases[8], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[9], conv_biases[9], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 5
x = F.conv2d(x, conv_weights[10], conv_biases[10], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[11], conv_biases[11], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[12], conv_biases[12], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Classifier
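# Shape note (added for clarity, not in the original): with 224x224 inputs, each of the
# five 2x2/stride-2 poolings halves the spatial size (224 -> 112 -> 56 -> 28 -> 14 -> 7),
# so the flatten below yields 512 * 7 * 7 = 25088 features, matching fc_weights[0].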
x = torch.flatten(x, 1)
x = F.linear(x, fc_weights[0], fc_biases[0])
x = F.relu(x)
x = F.dropout(x, p=0.0, training=is_training)
x = F.linear(x, fc_weights[1], fc_biases[1])
x = F.relu(x)
x = F.dropout(x, p=0.0, training=is_training)
x = F.linear(x, fc_weights[2], fc_biases[2])
return x
class Model(nn.Module):
def __init__(self, num_classes=1000):
super(Model, self).__init__()
# Extract convolutional parameters
self.conv_weights = nn.ParameterList()
self.conv_biases = nn.ParameterList()
# Block 1
conv = nn.Conv2d(3, 64, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(64, 64, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 2
conv = nn.Conv2d(64, 128, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(128, 128, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 3
conv = nn.Conv2d(128, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 4
conv = nn.Conv2d(256, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 5
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Extract fully connected parameters
self.fc_weights = nn.ParameterList()
self.fc_biases = nn.ParameterList()
fc = nn.Linear(512 * 7 * 7, 4096)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
fc = nn.Linear(4096, 4096)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
fc = nn.Linear(4096, num_classes)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
def forward(self, x, fn=module_fn):
return fn(
x,
self.conv_weights,
self.conv_biases,
self.fc_weights,
self.fc_biases,
self.training,
)
# Test code
batch_size = 10
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, 224, 224)]
def get_init_inputs():
return [num_classes]
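A minimal way to exercise the harness above (a sketch, not part of the original benchmark) is to build the ParameterList-based Model from get_init_inputs, push the generated input through it, and confirm the classifier output shape:

if __name__ == "__main__":
    # Hypothetical smoke test: one forward pass, shape check only.
    model = Model(*get_init_inputs())
    model.eval()
    with torch.no_grad():
        (x,) = get_inputs()
        out = model(x)
    assert out.shape == (batch_size, num_classes)  # expected (10, 1000)
    print("output shape:", tuple(out.shape))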
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self, num_classes=1000):
"""
Initialize the VGG16 model.
:param num_classes: The number of output classes (default is 1000 for ImageNet)
"""
super(Model, self).__init__()
# VGG16 architecture: 5 blocks of convolutional layers followed by max pooling
self.features = nn.Sequential(
# Block 1
nn.Conv2d(3, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(64, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 2
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(128, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 3
nn.Conv2d(128, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 4
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 5
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2)
)
# Fully connected layers
self.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(inplace=True),
nn.Dropout(p=0.0),
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Dropout(p=0.0),
nn.Linear(4096, num_classes)
)
def forward(self, x):
"""
Forward pass of the VGG16 model.
:param x: The input tensor, shape (batch_size, 3, 224, 224)
:return: The output tensor, shape (batch_size, num_classes)
"""
x = self.features(x)
x = torch.flatten(x, 1)
x = self.classifier(x)
return x
# Test code
batch_size = 10
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, 224, 224)]
def get_init_inputs():
return [num_classes]
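The two Model definitions above are meant to compute the same function once their parameters agree. A hedged parity check (the names FunctionalModel and ReferenceModel are illustrative, assuming the two classes are imported under those names) copies the reference nn.Sequential weights into the ParameterList variant and compares outputs:

import torch
import torch.nn as nn

def check_parity(functional_model, reference_model, atol=1e-5):
    # Conv2d and Linear modules appear in self.features / self.classifier in the
    # same order as the ParameterList entries, so a positional copy suffices.
    convs = [m for m in reference_model.features if isinstance(m, nn.Conv2d)]
    fcs = [m for m in reference_model.classifier if isinstance(m, nn.Linear)]
    with torch.no_grad():
        for i, conv in enumerate(convs):
            functional_model.conv_weights[i].copy_(conv.weight)
            functional_model.conv_biases[i].copy_(conv.bias)
        for i, fc in enumerate(fcs):
            functional_model.fc_weights[i].copy_(fc.weight)
            functional_model.fc_biases[i].copy_(fc.bias)
        functional_model.eval()
        reference_model.eval()
        x = torch.randn(2, 3, 224, 224)
        return torch.allclose(functional_model(x), reference_model(x), atol=atol)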
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
// Allow tuning of the block dimension at compile time. Candidate values: 16, 32, etc.
#ifndef BLOCK_DIM
#define BLOCK_DIM 16
#endif
// Definitions for the first conv layer (Block 1)
#define CONV1_OUT_CHANNELS 64
#define CONV1_IN_CHANNELS 3
#define CONV1_KERNEL_SIZE 3
#define CONV1_WEIGHTS_SIZE (CONV1_OUT_CHANNELS * CONV1_IN_CHANNELS * CONV1_KERNEL_SIZE * CONV1_KERNEL_SIZE)
// Store frequently accessed weights and biases in constant memory
__constant__ float const_conv1_weights[CONV1_WEIGHTS_SIZE];
__constant__ float const_conv1_bias[CONV1_OUT_CHANNELS];
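// Sizing note (illustrative, added for clarity): CONV1_WEIGHTS_SIZE is 64 * 3 * 3 * 3
// = 1728 floats (~6.75 KB) and the bias adds 64 floats (256 B), so both arrays fit
// comfortably within the 64 KB constant memory budget of current NVIDIA GPUs.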
// Custom CUDA kernel for the first convolution layer using constant memory & tunable block sizes
__global__ void conv1_kernel_block_optimized(const float* __restrict__ input,
float* __restrict__ output,
int N, int H, int W) {
// Each thread computes one output pixel for one channel
int oc = blockIdx.z % CONV1_OUT_CHANNELS;
int n = blockIdx.z / CONV1_OUT_CHANNELS;
int x = blockIdx.x * BLOCK_DIM + threadIdx.x;
int y = blockIdx.y * BLOCK_DIM + threadIdx.y;
if (x < W && y < H) {
float sum = 0.0f;
// Loop over input channels and kernel spatial dimensions
for (int c = 0; c < CONV1_IN_CHANNELS; ++c) {
for (int ky = 0; ky < CONV1_KERNEL_SIZE; ++ky) {
for (int kx = 0; kx < CONV1_KERNEL_SIZE; ++kx) {
int in_y = y + ky - 1; // padding = 1
int in_x = x + kx - 1;
if (in_y >= 0 && in_y < H && in_x >= 0 && in_x < W) {
int input_index = n * (CONV1_IN_CHANNELS * H * W) + c * (H * W) + in_y * W + in_x;
int weight_index = oc * (CONV1_IN_CHANNELS * CONV1_KERNEL_SIZE * CONV1_KERNEL_SIZE) +
c * (CONV1_KERNEL_SIZE * CONV1_KERNEL_SIZE) + ky * CONV1_KERNEL_SIZE + kx;
sum += input[input_index] * const_conv1_weights[weight_index];
}
}
}
}
sum += const_conv1_bias[oc];
int output_index = n * (CONV1_OUT_CHANNELS * H * W) + oc * (H * W) + y * W + x;
output[output_index] = sum;
}
}
// Helper function to launch the optimized conv1 kernel with tunable block sizes
torch::Tensor conv1_with_optimized_blocksize(torch::Tensor input, torch::Tensor weight, torch::Tensor bias) {
int N = input.size(0);
int H = input.size(2);
int W = input.size(3);
auto output = torch::empty({N, CONV1_OUT_CHANNELS, H, W}, input.options());
// Copy the first conv layer weights and biases to constant memory
cudaMemcpyToSymbol(const_conv1_weights, weight.data_ptr<float>(), CONV1_WEIGHTS_SIZE * sizeof(float));
cudaMemcpyToSymbol(const_conv1_bias, bias.data_ptr<float>(), CONV1_OUT_CHANNELS * sizeof(float));
// Use a block of BLOCK_DIM x BLOCK_DIM threads; experiment with BLOCK_DIM (e.g., compile with -DBLOCK_DIM=32)
dim3 block(BLOCK_DIM, BLOCK_DIM);
dim3 grid((W + BLOCK_DIM - 1) / BLOCK_DIM,
(H + BLOCK_DIM - 1) / BLOCK_DIM,
N * CONV1_OUT_CHANNELS);
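// Example (added for clarity, using the 224x224 test inputs with batch size 10 and the
// default BLOCK_DIM = 16): the grid is (14, 14, 640) blocks of 16x16 threads, i.e. one
// thread per output pixel per output channel.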
conv1_kernel_block_optimized<<<grid, block>>>(input.data_ptr<float>(), output.data_ptr<float>(), N, H, W);
cudaDeviceSynchronize();
return output;
}
// VGG16 forward pass using a mix of optimized constant memory conv1 with tunable block sizes and torch ops for the rest
torch::Tensor vgg16_forward_cuda(
torch::Tensor x,
std::vector<torch::Tensor> conv_weights,
std::vector<torch::Tensor> conv_biases,
std::vector<torch::Tensor> fc_weights,
std::vector<torch::Tensor> fc_biases,
bool is_training
) {
auto current = x;
// --- Block 1 ---
// Use the optimized conv1 kernel for the first convolution layer (weights in constant memory & block size tuning)
auto conv1_out = conv1_with_optimized_blocksize(current, conv_weights[0], conv_biases[0]);
current = torch::relu(conv1_out);
current = torch::conv2d(current, conv_weights[1], conv_biases[1], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// --- Block 2 ---
current = torch::conv2d(current, conv_weights[2], conv_biases[2], 1, 1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[3], conv_biases[3], 1, 1);
current = torch::relu(current);
current = torch::max_pool2d(current, 2, 2);
// --- Block 3 ---
current = torch::conv2d(current, conv_weights[4], conv_biases[4], 1, 1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[5], conv_biases[5], 1, 1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[6], conv_biases[6], 1, 1);
current = torch::relu(current);
current = torch::max_pool2d(current, 2, 2);
// --- Block 4 ---
current = torch::conv2d(current, conv_weights[7], conv_biases[7], 1, 1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[8], conv_biases[8], 1, 1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[9], conv_biases[9], 1, 1);
current = torch::relu(current);
current = torch::max_pool2d(current, 2, 2);
// --- Block 5 ---
current = torch::conv2d(current, conv_weights[10], conv_biases[10], 1, 1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[11], conv_biases[11], 1, 1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[12], conv_biases[12], 1, 1);
current = torch::relu(current);
current = torch::max_pool2d(current, 2, 2);
// --- Classifier ---
current = current.flatten(1);
current = torch::linear(current, fc_weights[0], fc_biases[0]);
current = torch::relu(current);
if (is_training) {
current = torch::dropout(current, /*p=*/0.0, /*train=*/true);
}
current = torch::linear(current, fc_weights[1], fc_biases[1]);
current = torch::relu(current);
if (is_training) {
current = torch::dropout(current, /*p=*/0.0, /*train=*/true);
}
current = torch::linear(current, fc_weights[2], fc_biases[2]);
return current;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &vgg16_forward_cuda, "VGG16 forward (optimized CUDA with tunable block sizes)");
}
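One way to exercise the extension (a sketch only; the source file name vgg16_opt.cu and module name are assumptions, not specified above) is to JIT-compile it with torch.utils.cpp_extension.load and compare its forward against module_fn using the ParameterList-based Model defined earlier:

import torch
from torch.utils.cpp_extension import load

# Hypothetical file/module names; adjust to the actual layout.
ext = load(name="vgg16_opt", sources=["vgg16_opt.cu"], extra_cuda_cflags=["-DBLOCK_DIM=16"])

model = Model(num_classes=1000).cuda().eval()
x = torch.randn(2, 3, 224, 224, device="cuda")
with torch.no_grad():
    ref = module_fn(x, model.conv_weights, model.conv_biases,
                    model.fc_weights, model.fc_biases, False)
    out = ext.forward(x, list(model.conv_weights), list(model.conv_biases),
                      list(model.fc_weights), list(model.fc_biases), False)
print("max abs diff:", (out - ref).abs().max().item())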
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 2.510 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 2.490 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 62.680 | % | 0.000 | 5 |
Issued Ipc Active | 2.510 | inst/cycle | 0.000 | 5 |
SM Busy | 62.680 | % | 0.000 | 5 |
Memory Throughput | 319684799450.876 | byte/second | 244195700899629984.000 | 5 |
Mem Busy | 90.256 | % | 0.002 | 5 |
Max Bandwidth | 54.216 | % | 0.001 | 5 |
L1/TEX Hit Rate | 82.676 | % | 0.000 | 5 |
L2 Hit Rate | 97.908 | % | 0.004 | 5 |
Mem Pipes Busy | 94.674 | % | 0.002 | 5 |
Warp Cycles Per Issued Instruction | 16.740 | cycle | 0.000 | 5 |
Warp Cycles Per Executed Instruction | 16.740 | cycle | 0.000 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 31.750 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 6.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 48.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 75.000 | % | 0.000 | 5 |
Achieved Occupancy | 65.964 | % | 0.000 | 5 |
Achieved Active Warps Per SM | 42.216 | warp | 0.000 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (38.4%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy (75.0%) is limited by the number of required registers. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
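The register-limit warning is consistent with the launch configuration above: a 16x16 block holds 256 threads (8 warps), and a register-imposed limit of 6 resident blocks per SM caps the kernel at 48 active warps, i.e. 75% of a 64-warp SM (the per-SM warp limit implied by the table; a rough check, not from the original report):

# Rough occupancy arithmetic matching the table above (assumes a 64-warp-per-SM GPU).
warps_per_block = (16 * 16) // 32        # 8 warps for a BLOCK_DIM x BLOCK_DIM block
register_block_limit = 6                 # "Block Limit Registers" row
theoretical_active_warps = warps_per_block * register_block_limit  # 48
print(theoretical_active_warps / 64)     # 0.75 -> 75% theoretical occupancy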
Operation / Metric | Value | Unit |
---|---|---|
cudaDeviceSynchronize | ||
CPU Time | 5741553.13 | μs |
Device Time | 12457.54 | μs |
Self CPU Time | 5741553.13 | μs |
Self Device Time | 12457.54 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::conv2d | ||
CPU Time | 1498191.39 | μs |
Device Time | 4574874.65 | μs |
Self CPU Time | 43253.56 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::convolution | ||
CPU Time | 1454937.84 | μs |
Device Time | 4574874.65 | μs |
Self CPU Time | 53949.47 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_convolution | ||
CPU Time | 1400988.36 | μs |
Device Time | 4574874.65 | μs |
Self CPU Time | 111801.98 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::cudnn_convolution | ||
CPU Time | 1053870.68 | μs |
Device Time | 3723263.30 | μs |
Self CPU Time | 514083.48 | μs |
Self Device Time | 3723263.30 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::linear | ||
CPU Time | 291597.46 | μs |
Device Time | 1163487.39 | μs |
Self CPU Time | 13304.94 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45293 warnings generated when compiling for host. Suppressed 45326 warnings (45279 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.