import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor,
conv_weights: nn.ParameterList,
conv_biases: nn.ParameterList,
fc_weights: nn.ParameterList,
fc_biases: nn.ParameterList,
is_training: bool,
) -> torch.Tensor:
"""
Implements the VGG16 module.
Args:
x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
conv_weights (nn.ParameterList): List of convolutional weights
conv_biases (nn.ParameterList): List of convolutional biases
fc_weights (nn.ParameterList): List of fully connected weights
fc_biases (nn.ParameterList): List of fully connected biases
is_training (bool): Whether in training mode
Returns:
torch.Tensor: Output tensor, shape (batch_size, num_classes)
"""
# Block 1
x = F.conv2d(x, conv_weights[0], conv_biases[0], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[1], conv_biases[1], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 2
x = F.conv2d(x, conv_weights[2], conv_biases[2], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[3], conv_biases[3], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 3
x = F.conv2d(x, conv_weights[4], conv_biases[4], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[5], conv_biases[5], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[6], conv_biases[6], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 4
x = F.conv2d(x, conv_weights[7], conv_biases[7], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[8], conv_biases[8], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[9], conv_biases[9], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Block 5
x = F.conv2d(x, conv_weights[10], conv_biases[10], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[11], conv_biases[11], padding=1)
x = F.relu(x)
x = F.conv2d(x, conv_weights[12], conv_biases[12], padding=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Classifier
x = torch.flatten(x, 1)
x = F.linear(x, fc_weights[0], fc_biases[0])
x = F.relu(x)
x = F.dropout(x, p=0.0, training=is_training)
x = F.linear(x, fc_weights[1], fc_biases[1])
x = F.relu(x)
x = F.dropout(x, p=0.0, training=is_training)
x = F.linear(x, fc_weights[2], fc_biases[2])
return x
class Model(nn.Module):
def __init__(self, num_classes=1000):
super(Model, self).__init__()
# Extract convolutional parameters
self.conv_weights = nn.ParameterList()
self.conv_biases = nn.ParameterList()
# Block 1
conv = nn.Conv2d(3, 64, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(64, 64, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 2
conv = nn.Conv2d(64, 128, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(128, 128, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 3
conv = nn.Conv2d(128, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 4
conv = nn.Conv2d(256, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Block 5
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
conv = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv_weights.append(nn.Parameter(conv.weight.data.clone()))
self.conv_biases.append(nn.Parameter(conv.bias.data.clone()))
# Extract fully connected parameters
self.fc_weights = nn.ParameterList()
self.fc_biases = nn.ParameterList()
fc = nn.Linear(512 * 7 * 7, 4096)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
fc = nn.Linear(4096, 4096)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
fc = nn.Linear(4096, num_classes)
self.fc_weights.append(nn.Parameter(fc.weight.data.clone()))
self.fc_biases.append(nn.Parameter(fc.bias.data.clone()))
def forward(self, x, fn=module_fn):
return fn(
x,
self.conv_weights,
self.conv_biases,
self.fc_weights,
self.fc_biases,
self.training,
)
# Test code
batch_size = 10
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, 224, 224)]
def get_init_inputs():
return [num_classes]
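# Minimal smoke test for the listing above (a sketch, assuming this file is run directly;
# it only checks the classifier output shape, not numerical correctness).
if __name__ == "__main__":
    model = Model(*get_init_inputs())
    model.eval()
    with torch.no_grad():
        out = model(*get_inputs())
    assert out.shape == (batch_size, num_classes)
    print(out.shape)  # torch.Size([10, 1000])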
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self, num_classes=1000):
"""
Initialize the VGG16 model.
:param num_classes: The number of output classes (default is 1000 for ImageNet)
"""
super(Model, self).__init__()
# VGG16 architecture: 5 blocks of convolutional layers followed by max pooling
self.features = nn.Sequential(
# Block 1
nn.Conv2d(3, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(64, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 2
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(128, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 3
nn.Conv2d(128, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 4
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Block 5
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2)
)
# Fully connected layers
self.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(inplace=True),
nn.Dropout(p=0.0),
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Dropout(p=0.0),
nn.Linear(4096, num_classes)
)
def forward(self, x):
"""
Forward pass of the VGG16 model.
:param x: The input tensor, shape (batch_size, 3, 224, 224)
:return: The output tensor, shape (batch_size, num_classes)
"""
x = self.features(x)
x = torch.flatten(x, 1)
x = self.classifier(x)
return x
# Test code
batch_size = 10
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, 224, 224)]
def get_init_inputs():
return [num_classes]
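# Hedged sketch: checking that the parameter-list model from the first listing matches this
# reference nn.Sequential model. It assumes the two classes (both named Model above) are
# imported under the hypothetical names FunctionalModel (first listing) and ReferenceModel
# (this listing); the layer indices below follow the Sequential definitions in this file.
def copy_reference_weights(functional, reference):
    conv_idx = [0, 2, 5, 7, 10, 12, 14, 17, 19, 21, 24, 26, 28]  # Conv2d positions in features
    fc_idx = [0, 3, 6]                                           # Linear positions in classifier
    with torch.no_grad():
        for dst_w, dst_b, i in zip(functional.conv_weights, functional.conv_biases, conv_idx):
            dst_w.copy_(reference.features[i].weight)
            dst_b.copy_(reference.features[i].bias)
        for dst_w, dst_b, i in zip(functional.fc_weights, functional.fc_biases, fc_idx):
            dst_w.copy_(reference.classifier[i].weight)
            dst_b.copy_(reference.classifier[i].bias)

# Usage (hypothetical):
# ref, fun = ReferenceModel(num_classes), FunctionalModel(num_classes)
# copy_reference_weights(fun, ref)
# x = get_inputs()[0]
# assert torch.allclose(fun.eval()(x), ref.eval()(x), atol=1e-5)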
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define TILE_WIDTH 16
// Tiled direct 2D convolution kernel: each thread computes one output element.
// blockIdx.x enumerates spatial tiles, blockIdx.y the output channel, blockIdx.z the batch index.
__global__ void conv2d_optimized_kernel(const float* __restrict__ input, const float* __restrict__ weight, const float* __restrict__ bias,
float* __restrict__ output, int N, int C, int H, int W, int K, int kernel_size, int P) {
    int n = blockIdx.z;
    int k = blockIdx.y;
    // Decompose the flattened spatial tile index into a row tile and a column tile so that
    // every output position (h, w) is covered, not just the first TILE_WIDTH columns.
    int tiles_w = (W + TILE_WIDTH - 1) / TILE_WIDTH;
    int h = (blockIdx.x / tiles_w) * TILE_WIDTH + threadIdx.y;
    int w = (blockIdx.x % tiles_w) * TILE_WIDTH + threadIdx.x;
if (h < H && w < W) {
float sum = bias[k];
for (int c = 0; c < C; ++c) {
for (int kh = 0; kh < kernel_size; ++kh) {
for (int kw = 0; kw < kernel_size; ++kw) {
int ih = h - P + kh;
int iw = w - P + kw;
if (ih >= 0 && ih < H && iw >= 0 && iw < W) {
float in_val = input[n * C * H * W + c * H * W + ih * W + iw];
float weight_val = weight[k * C * kernel_size * kernel_size + c * kernel_size * kernel_size + kh * kernel_size + kw];
sum += in_val * weight_val;
}
}
}
}
output[n * K * H * W + k * H * W + h * W + w] = sum;
}
}
// Custom convolution function using the optimized kernel
torch::Tensor custom_conv2d_optimized(torch::Tensor input, torch::Tensor weight, torch::Tensor bias) {
const int N = input.size(0);
const int C = input.size(1);
const int H = input.size(2);
const int W = input.size(3);
const int K = weight.size(0);
const int kernel_size = weight.size(2);
const int P = kernel_size / 2;
auto output = torch::empty({N, K, H, W}, input.options());
    dim3 block(TILE_WIDTH, TILE_WIDTH);
    const int tiles_h = (H + TILE_WIDTH - 1) / TILE_WIDTH;
    const int tiles_w = (W + TILE_WIDTH - 1) / TILE_WIDTH;
    // One block per spatial tile (flattened into grid.x), per output channel, per batch element.
    dim3 grid(tiles_h * tiles_w, K, N);
conv2d_optimized_kernel<<<grid, block>>>(
input.data_ptr<float>(),
weight.data_ptr<float>(),
bias.data_ptr<float>(),
output.data_ptr<float>(),
N, C, H, W, K, kernel_size, P
);
return output;
}
// VGG16 forward pass using the optimized convolution kernel
torch::Tensor vgg16_forward_cuda(
torch::Tensor x,
std::vector<torch::Tensor> conv_weights,
std::vector<torch::Tensor> conv_biases,
std::vector<torch::Tensor> fc_weights,
std::vector<torch::Tensor> fc_biases,
bool is_training
) {
auto current = x;
// Block 1 - Use optimized conv2d for first layer
current = custom_conv2d_optimized(current, conv_weights[0], conv_biases[0]);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[1], conv_biases[1], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// The remaining layers use ATen's cuDNN-backed conv2d
// Block 2
current = torch::conv2d(current, conv_weights[2], conv_biases[2], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[3], conv_biases[3], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Block 3
current = torch::conv2d(current, conv_weights[4], conv_biases[4], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[5], conv_biases[5], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[6], conv_biases[6], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Block 4
current = torch::conv2d(current, conv_weights[7], conv_biases[7], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[8], conv_biases[8], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[9], conv_biases[9], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Block 5
current = torch::conv2d(current, conv_weights[10], conv_biases[10], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[11], conv_biases[11], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::conv2d(current, conv_weights[12], conv_biases[12], /*stride=*/1, /*padding=*/1);
current = torch::relu(current);
current = torch::max_pool2d(current, /*kernel_size=*/2, /*stride=*/2);
// Classifier
current = current.flatten(1);
current = torch::linear(current, fc_weights[0], fc_biases[0]);
current = torch::relu(current);
if (is_training) {
current = torch::dropout(current, /*p=*/0.0, /*train=*/true);
}
current = torch::linear(current, fc_weights[1], fc_biases[1]);
current = torch::relu(current);
if (is_training) {
current = torch::dropout(current, /*p=*/0.0, /*train=*/true);
}
current = torch::linear(current, fc_weights[2], fc_biases[2]);
return current;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &vgg16_forward_cuda, "VGG16 forward (CUDA)");
}
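# Hedged sketch of how the extension above could be built with torch.utils.cpp_extension and
# checked against the pure-PyTorch module_fn. The source file name "vgg16_cuda.cu" and the reuse
# of Model/get_inputs/num_classes from the first listing are assumptions, not part of the listing.
import torch
from torch.utils.cpp_extension import load

vgg16_ext = load(name="vgg16_cuda", sources=["vgg16_cuda.cu"], verbose=False)

model = Model(num_classes).cuda().eval()   # parameter-list Model from the first listing
x = get_inputs()[0].cuda()
with torch.no_grad():
    ref = model(x)                         # module_fn path
    out = vgg16_ext.forward(
        x,
        list(model.conv_weights),
        list(model.conv_biases),
        list(model.fc_weights),
        list(model.fc_biases),
        False,                             # is_training
    )
torch.testing.assert_close(out, ref, atol=1e-3, rtol=1e-3)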
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 3.250 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 3.056 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 81.298 | % | 0.006 | 5 |
Issued Ipc Active | 3.252 | inst/cycle | 0.000 | 5 |
SM Busy | 81.298 | % | 0.006 | 5 |
Memory Throughput | 32303969295.408 | byte/second | 249016985067787200.000 | 5 |
Mem Busy | 47.970 | % | 0.007 | 5 |
Max Bandwidth | 32.440 | % | 0.004 | 5 |
L1/TEX Hit Rate | 87.178 | % | 0.002 | 5 |
L2 Hit Rate | 97.114 | % | 0.238 | 5 |
Mem Pipes Busy | 53.300 | % | 0.009 | 5 |
Warp Cycles Per Issued Instruction | 16.594 | cycle | 0.001 | 5 |
Warp Cycles Per Executed Instruction | 16.610 | cycle | 0.001 | 5 |
Avg. Active Threads Per Warp | 31.980 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 28.930 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 84.438 | % | 0.004 | 5 |
Achieved Active Warps Per SM | 54.040 | warp | 0.001 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (56.0%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (84.5%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
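# Quick sanity check (a sketch using the numbers from the metrics table above): the reported
# Achieved Occupancy is achieved active warps per SM divided by the theoretical maximum.
theoretical_warps_per_sm = 64.0  # "Theoretical Active Warps per SM"
achieved_active_warps = 54.04    # "Achieved Active Warps Per SM"
print(f"{achieved_active_warps / theoretical_warps_per_sm * 100:.1f}%")  # ~84.4%, matching "Achieved Occupancy"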
Operation / Metric | Value | Unit |
---|---|---|
cudaLaunchKernel | ||
CPU Time | 3980651.83 | μs |
Device Time | 7765.56 | μs |
Self CPU Time | 3980651.83 | μs |
Self Device Time | 7765.56 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::conv2d | ||
CPU Time | 4023994.95 | μs |
Device Time | 3500126.03 | μs |
Self CPU Time | 34588.89 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::convolution | ||
CPU Time | 3989406.06 | μs |
Device Time | 3500126.03 | μs |
Self CPU Time | 45478.69 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_convolution | ||
CPU Time | 3943927.36 | μs |
Device Time | 3500126.03 | μs |
Self CPU Time | 91669.44 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::cudnn_convolution | ||
CPU Time | 3215908.76 | μs |
Device Time | 2850275.83 | μs |
Self CPU Time | 538897.91 | μs |
Self Device Time | 2850275.83 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::linear | ||
CPU Time | 600411.73 | μs |
Device Time | 887851.58 | μs |
Self CPU Time | 10173.67 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |