import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor,
conv1_weight: nn.Parameter,
conv1_bias: nn.Parameter,
conv2_weight: nn.Parameter,
conv2_bias: nn.Parameter,
fc1_weight: nn.Parameter,
fc1_bias: nn.Parameter,
fc2_weight: nn.Parameter,
fc2_bias: nn.Parameter,
fc3_weight: nn.Parameter,
fc3_bias: nn.Parameter,
) -> torch.Tensor:
"""
Implements a LeNet-5 architecture with ReLU activation.
Args:
x (torch.Tensor): The input tensor, shape (batch_size, 1, 32, 32)
        conv1_weight (nn.Parameter): Weight of the first conv layer
        conv1_bias (nn.Parameter): Bias of the first conv layer
        conv2_weight (nn.Parameter): Weight of the second conv layer
        conv2_bias (nn.Parameter): Bias of the second conv layer
        fc1_weight (nn.Parameter): Weight of the first fully connected layer
        fc1_bias (nn.Parameter): Bias of the first fully connected layer
        fc2_weight (nn.Parameter): Weight of the second fully connected layer
        fc2_bias (nn.Parameter): Bias of the second fully connected layer
        fc3_weight (nn.Parameter): Weight of the third fully connected layer
        fc3_bias (nn.Parameter): Bias of the third fully connected layer
Returns:
torch.Tensor: The output tensor, shape (batch_size, num_classes)
"""
# First convolutional layer with ReLU activation and max pooling
x = F.conv2d(x, conv1_weight, conv1_bias, stride=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Second convolutional layer with ReLU activation and max pooling
x = F.conv2d(x, conv2_weight, conv2_bias, stride=1)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Flatten the output for the fully connected layers
x = x.view(-1, 16 * 5 * 5)
# First fully connected layer with ReLU activation
x = F.linear(x, fc1_weight, fc1_bias)
x = F.relu(x)
# Second fully connected layer with ReLU activation
x = F.linear(x, fc2_weight, fc2_bias)
x = F.relu(x)
# Final fully connected layer
x = F.linear(x, fc3_weight, fc3_bias)
return x
class Model(nn.Module):
def __init__(self, num_classes):
"""
LeNet-5 architecture implementation in PyTorch.
:param num_classes: The number of output classes.
"""
super(Model, self).__init__()
# Extract parameters from convolutional layers
conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1)
self.conv1_weight = nn.Parameter(conv1.weight.data.clone())
self.conv1_bias = nn.Parameter(conv1.bias.data.clone())
conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1)
self.conv2_weight = nn.Parameter(conv2.weight.data.clone())
self.conv2_bias = nn.Parameter(conv2.bias.data.clone())
# Extract parameters from fully connected layers
fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
self.fc1_weight = nn.Parameter(fc1.weight.data.clone())
self.fc1_bias = nn.Parameter(fc1.bias.data.clone())
fc2 = nn.Linear(in_features=120, out_features=84)
self.fc2_weight = nn.Parameter(fc2.weight.data.clone())
self.fc2_bias = nn.Parameter(fc2.bias.data.clone())
fc3 = nn.Linear(in_features=84, out_features=num_classes)
self.fc3_weight = nn.Parameter(fc3.weight.data.clone())
self.fc3_bias = nn.Parameter(fc3.bias.data.clone())
def forward(self, x, fn=module_fn):
return fn(
x,
self.conv1_weight,
self.conv1_bias,
self.conv2_weight,
self.conv2_bias,
self.fc1_weight,
self.fc1_bias,
self.fc2_weight,
self.fc2_bias,
self.fc3_weight,
self.fc3_bias,
)
# Test code for the LeNet-5 model
batch_size = 1
num_classes = 10
def get_inputs():
return [torch.randn(batch_size, 1, 32, 32)]
def get_init_inputs():
return [num_classes]
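# Minimal smoke test (an illustrative sketch, not part of the benchmark harness):
# the functional path through Model should yield (batch_size, num_classes) logits.
if __name__ == "__main__":
    model = Model(num_classes)
    out = model(*get_inputs())
    assert out.shape == (batch_size, num_classes)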
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self, num_classes):
"""
LeNet-5 architecture implementation in PyTorch.
:param num_classes: The number of output classes.
"""
super(Model, self).__init__()
# Convolutional layers
self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1)
self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1)
# Fully connected layers
self.fc1 = nn.Linear(in_features=16*5*5, out_features=120)
self.fc2 = nn.Linear(in_features=120, out_features=84)
self.fc3 = nn.Linear(in_features=84, out_features=num_classes)
def forward(self, x):
"""
Forward pass of the LeNet-5 model.
:param x: The input tensor, shape (batch_size, 1, 32, 32)
:return: The output tensor, shape (batch_size, num_classes)
"""
# First convolutional layer with ReLU activation and max pooling
x = F.relu(self.conv1(x))
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Second convolutional layer with ReLU activation and max pooling
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, kernel_size=2, stride=2)
# Flatten the output for the fully connected layers
x = x.view(-1, 16*5*5)
# First fully connected layer with ReLU activation
x = F.relu(self.fc1(x))
# Second fully connected layer with ReLU activation
x = F.relu(self.fc2(x))
# Final fully connected layer
x = self.fc3(x)
return x
# Test code for the LeNet-5 model
batch_size = 1
num_classes = 10
def get_inputs():
return [torch.randn(batch_size, 1, 32, 32)]
def get_init_inputs():
return [num_classes]
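# Illustrative consistency check (a sketch; assumes this listing is loaded together
# with module_fn above): feeding the reference Model's own weights through the
# functional module_fn path should reproduce its outputs exactly.
if __name__ == "__main__":
    ref = Model(num_classes)
    x = get_inputs()[0]
    expected = ref(x)
    actual = module_fn(
        x,
        ref.conv1.weight, ref.conv1.bias,
        ref.conv2.weight, ref.conv2.bias,
        ref.fc1.weight, ref.fc1.bias,
        ref.fc2.weight, ref.fc2.bias,
        ref.fc3.weight, ref.fc3.bias,
    )
    assert torch.allclose(expected, actual)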
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cfloat>  // FLT_MAX
#include <cuda_fp16.h>
#include <cublas_v2.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>
// Optimized ReLU kernel with vectorized loads/stores
__global__ void relu_kernel(float* input, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Vector loading for better memory throughput
float4* input4 = reinterpret_cast<float4*>(input);
int size4 = size / 4;
for (int i = idx; i < size4; i += stride) {
float4 val = input4[i];
val.x = fmaxf(0.0f, val.x);
val.y = fmaxf(0.0f, val.y);
val.z = fmaxf(0.0f, val.z);
val.w = fmaxf(0.0f, val.w);
input4[i] = val;
}
    // Handle any tail elements when size is not a multiple of 4
    for (int i = size4 * 4 + idx; i < size; i += stride) {
        input[i] = fmaxf(0.0f, input[i]);
    }
}
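// Illustrative host-side launcher for relu_kernel (a sketch; forward() below
// launches the kernel directly with a fixed 32-block grid). Because the kernel
// uses a grid-stride loop, any positive grid size covers the whole tensor; the
// cap only bounds launch size. Assumes a contiguous, 16-byte-aligned buffer.
static void launch_relu(float* data, int size) {
    const int block_size = 256;
    int blocks = (size / 4 + block_size - 1) / block_size;
    if (blocks < 1) blocks = 1;
    if (blocks > 32) blocks = 32;
    relu_kernel<<<blocks, block_size>>>(data, size);
}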
// Max pooling kernel: one thread computes one output element
__global__ void max_pool2d_kernel(
const float* __restrict__ input, float* __restrict__ output,
int batch_size, int channels, int height, int width,
int pool_height, int pool_width, int stride
) {
int out_h = (height - pool_height) / stride + 1;
int out_w = (width - pool_width) / stride + 1;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < batch_size * channels * out_h * out_w) {
int b = idx / (channels * out_h * out_w);
int c = (idx / (out_h * out_w)) % channels;
int h = (idx / out_w) % out_h;
int w = idx % out_w;
int in_h_start = h * stride;
int in_w_start = w * stride;
        // Reduce over the pooling window to find the maximum
float max_val = -FLT_MAX;
for (int i = 0; i < pool_height; ++i) {
for (int j = 0; j < pool_width; ++j) {
float val = input[((b * channels + c) * height + in_h_start + i) * width + in_w_start + j];
max_val = fmaxf(max_val, val);
}
}
output[idx] = max_val;
}
}
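// Illustrative launcher for max_pool2d_kernel (a sketch; forward() below uses
// torch::max_pool2d instead). One thread per output element, no shared memory.
static void launch_max_pool2d(
    const float* input, float* output,
    int batch_size, int channels, int height, int width,
    int pool_height, int pool_width, int stride
) {
    const int out_h = (height - pool_height) / stride + 1;
    const int out_w = (width - pool_width) / stride + 1;
    const int total = batch_size * channels * out_h * out_w;
    const int block_size = 256;
    const int blocks = (total + block_size - 1) / block_size;
    max_pool2d_kernel<<<blocks, block_size>>>(
        input, output, batch_size, channels, height, width,
        pool_height, pool_width, stride);
}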
// Linear (fully connected) layer kernel for a single input sample: the input
// row is staged in shared memory and dot products use float4 loads, which
// assumes in_features is a multiple of 4
__global__ void linear_kernel(
const float* __restrict__ input,
const float* __restrict__ weight,
const float* __restrict__ bias,
float* __restrict__ output,
int in_features, int out_features
) {
extern __shared__ float shared_input[];
// Load input into shared memory
for (int i = threadIdx.x; i < in_features; i += blockDim.x) {
shared_input[i] = input[i];
}
__syncthreads(); // Single sync point for shared memory consistency
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < out_features) {
float val = bias[idx];
// Vectorized computation
for (int i = 0; i < in_features; i += 4) {
float4 in_vec = *reinterpret_cast<const float4*>(&shared_input[i]);
float4 weight_vec = *reinterpret_cast<const float4*>(&weight[idx * in_features + i]);
val += in_vec.x * weight_vec.x + in_vec.y * weight_vec.y +
in_vec.z * weight_vec.z + in_vec.w * weight_vec.w;
}
output[idx] = val;
}
}
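// Illustrative launcher for linear_kernel (a sketch; forward() below uses
// torch::linear instead). The kernel processes one sample per launch and
// assumes in_features is a multiple of 4, so it is invoked once per batch row
// with the input row staged in dynamic shared memory.
static void launch_linear(
    const float* input, const float* weight, const float* bias, float* output,
    int batch_size, int in_features, int out_features
) {
    const int block_size = 256;
    const int blocks = (out_features + block_size - 1) / block_size;
    const size_t shared_bytes = in_features * sizeof(float);
    for (int b = 0; b < batch_size; ++b) {
        linear_kernel<<<blocks, block_size, shared_bytes>>>(
            input + b * in_features, weight, bias,
            output + b * out_features, in_features, out_features);
    }
}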
torch::Tensor forward(
torch::Tensor x,
torch::Tensor conv1_weight, torch::Tensor conv1_bias,
torch::Tensor conv2_weight, torch::Tensor conv2_bias,
torch::Tensor fc1_weight, torch::Tensor fc1_bias,
torch::Tensor fc2_weight, torch::Tensor fc2_bias,
torch::Tensor fc3_weight, torch::Tensor fc3_bias
) {
// Move tensors to CUDA with contiguous memory layout
auto options = torch::TensorOptions()
.dtype(torch::kFloat32)
.device(torch::kCUDA)
.memory_format(torch::MemoryFormat::Contiguous);
x = x.to(options);
conv1_weight = conv1_weight.to(options);
conv1_bias = conv1_bias.to(options);
conv2_weight = conv2_weight.to(options);
conv2_bias = conv2_bias.to(options);
fc1_weight = fc1_weight.to(options);
fc1_bias = fc1_bias.to(options);
fc2_weight = fc2_weight.to(options);
fc2_bias = fc2_bias.to(options);
fc3_weight = fc3_weight.to(options);
fc3_bias = fc3_bias.to(options);
const int block_size = 256;
const int max_blocks = 32;
// Convolution layers
auto conv1 = torch::conv2d(x, conv1_weight, conv1_bias, {1, 1});
relu_kernel<<<max_blocks, block_size>>>(conv1.data_ptr<float>(), conv1.numel());
auto pool1 = torch::max_pool2d(conv1, {2, 2}, {2, 2});
auto conv2 = torch::conv2d(pool1, conv2_weight, conv2_bias, {1, 1});
relu_kernel<<<max_blocks, block_size>>>(conv2.data_ptr<float>(), conv2.numel());
auto pool2 = torch::max_pool2d(conv2, {2, 2}, {2, 2});
auto flat = pool2.view({pool2.size(0), -1});
    // Fully connected layers via torch::linear (cuBLAS), with the custom ReLU
    // kernel applied between layers
auto fc1 = torch::linear(flat, fc1_weight, fc1_bias);
relu_kernel<<<max_blocks, block_size>>>(fc1.data_ptr<float>(), fc1.numel());
auto fc2 = torch::linear(fc1, fc2_weight, fc2_bias);
relu_kernel<<<max_blocks, block_size>>>(fc2.data_ptr<float>(), fc2.numel());
auto fc3 = torch::linear(fc2, fc3_weight, fc3_bias);
return fc3;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "LeNet-5 forward pass");
}
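# Illustrative build-and-call sketch for the extension above. This is a sketch
# only: the source file name "lenet5_cuda.cu" and the extension name are
# assumptions, and Model / get_inputs / num_classes refer to the reference
# PyTorch listing earlier in this page.
import torch
from torch.utils.cpp_extension import load

lenet5_cuda = load(name="lenet5_cuda", sources=["lenet5_cuda.cu"])
model = Model(num_classes)  # reference nn.Module version
logits = lenet5_cuda.forward(
    get_inputs()[0],
    model.conv1.weight, model.conv1.bias,
    model.conv2.weight, model.conv2.bias,
    model.fc1.weight, model.fc1.bias,
    model.fc2.weight, model.fc2.bias,
    model.fc3.weight, model.fc3.bias,
)
print(logits.shape)  # torch.Size([batch_size, num_classes])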
Operation | CPU Time (μs) | Device Time (μs) | Self CPU Time (μs) | Self Device Time (μs) |
---|---|---|---|---|
aten::conv2d | 1020937.01 | 262941.57 | 39446.47 | 0.00 |
aten::convolution | 981490.53 | 262941.57 | 49504.86 | 0.00 |
aten::_convolution | 931985.67 | 262941.57 | 103355.72 | 0.00 |
cudaLaunchKernel | 670049.65 | 37750.41 | 670049.65 | 37750.41 |
aten::linear | 647694.12 | 141888.64 | 57202.65 | 0.00 |
aten::zero_ | 108389.79 | 921677.43 | 24300.92 | 0.00 |
aten::fill_ | 84090.60 | 921677.43 | 29813.76 | 921677.43 |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | 0.00 | 921677.43 | 0.00 | 921677.43 |

All CPU and device memory usage metrics (including self usage) were reported as 0 B for every operation above.