import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor, params: nn.ParameterDict, is_training: bool
) -> torch.Tensor:
"""
    Implements the ResNet101 forward pass as a pure function over a parameter dictionary.
Args:
x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
params (nn.ParameterDict): Dictionary of parameters
is_training (bool): Whether to use training mode
Returns:
torch.Tensor: Output tensor, shape (batch_size, num_classes)
"""
# Initial layers
x = F.conv2d(x, params["conv1_w"].to(x.device), bias=None, stride=2, padding=3)
x = F.batch_norm(
x,
params["bn1_m"].to(x.device),
params["bn1_v"].to(x.device),
params["bn1_w"].to(x.device),
params["bn1_b"].to(x.device),
training=is_training,
)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
def bottleneck_fn(
x,
conv1_w,
conv2_w,
conv3_w,
bn1_w,
bn1_b,
bn1_m,
bn1_v,
bn2_w,
bn2_b,
bn2_m,
bn2_v,
bn3_w,
bn3_b,
bn3_m,
bn3_v,
downsample_conv_w=None,
downsample_bn_w=None,
downsample_bn_b=None,
downsample_bn_m=None,
downsample_bn_v=None,
stride=1,
is_training=True,
):
identity = x
out = F.conv2d(x, conv1_w.to(x.device), bias=None)
out = F.batch_norm(
out,
bn1_m.to(x.device),
bn1_v.to(x.device),
bn1_w.to(x.device),
bn1_b.to(x.device),
training=is_training,
)
out = F.relu(out)
out = F.conv2d(out, conv2_w.to(x.device), bias=None, stride=stride, padding=1)
out = F.batch_norm(
out,
bn2_m.to(x.device),
bn2_v.to(x.device),
bn2_w.to(x.device),
bn2_b.to(x.device),
training=is_training,
)
out = F.relu(out)
out = F.conv2d(out, conv3_w.to(x.device), bias=None)
out = F.batch_norm(
out,
bn3_m.to(x.device),
bn3_v.to(x.device),
bn3_w.to(x.device),
bn3_b.to(x.device),
training=is_training,
)
if downsample_conv_w is not None:
identity = F.conv2d(
x, downsample_conv_w.to(x.device), bias=None, stride=stride
)
identity = F.batch_norm(
identity,
downsample_bn_m.to(x.device),
downsample_bn_v.to(x.device),
downsample_bn_w.to(x.device),
downsample_bn_b.to(x.device),
training=is_training,
)
out += identity
out = F.relu(out)
return out
    # Layers 1-4
for layer_idx in range(1, 5):
blocks = params[f"layer{layer_idx}_blocks"]
for block_idx in range(len(blocks)):
block_params = blocks[block_idx]
downsample_params = None
if "downsample_conv_w" in block_params:
downsample_params = [
block_params["downsample_conv_w"],
block_params["downsample_bn_w"],
block_params["downsample_bn_b"],
block_params["downsample_bn_m"],
block_params["downsample_bn_v"],
]
x = bottleneck_fn(
x,
block_params["conv1_w"],
block_params["conv2_w"],
block_params["conv3_w"],
block_params["bn1_w"],
block_params["bn1_b"],
block_params["bn1_m"],
block_params["bn1_v"],
block_params["bn2_w"],
block_params["bn2_b"],
block_params["bn2_m"],
block_params["bn2_v"],
block_params["bn3_w"],
block_params["bn3_b"],
block_params["bn3_m"],
block_params["bn3_v"],
*(downsample_params if downsample_params else [None] * 5),
stride=2 if block_idx == 0 and layer_idx > 1 else 1,
is_training=is_training,
)
x = F.adaptive_avg_pool2d(x, (1, 1))
x = torch.flatten(x, 1)
x = F.linear(x, params["fc_w"].to(x.device), params["fc_b"].to(x.device))
return x
class Model(nn.Module):
def __init__(self, layers, num_classes=1000):
super(Model, self).__init__()
self.params = nn.ParameterDict()
in_channels = 64
expansion = 4
# Initial layers
conv1 = nn.Conv2d(
3, in_channels, kernel_size=7, stride=2, padding=3, bias=False
)
bn1 = nn.BatchNorm2d(in_channels)
self.params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
self.params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
self.params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
self.params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
self.params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())
# Layers 1-4
channels = [64, 128, 256, 512]
for layer_idx, (out_channels, num_blocks) in enumerate(
zip(channels, layers), 1
):
layer_blocks = []
for block_idx in range(num_blocks):
block_in_channels = (
in_channels if block_idx == 0 else out_channels * expansion
)
# Create block parameters
block_params = {}
# First block may have downsample
if block_idx == 0 and (
layer_idx > 1 or block_in_channels != out_channels * expansion
):
downsample_conv = nn.Conv2d(
block_in_channels,
out_channels * expansion,
kernel_size=1,
stride=2 if layer_idx > 1 else 1,
bias=False,
)
downsample_bn = nn.BatchNorm2d(out_channels * expansion)
block_params["downsample_conv_w"] = nn.Parameter(
downsample_conv.weight.data.clone()
)
block_params["downsample_bn_w"] = nn.Parameter(
downsample_bn.weight.data.clone()
)
block_params["downsample_bn_b"] = nn.Parameter(
downsample_bn.bias.data.clone()
)
block_params["downsample_bn_m"] = nn.Parameter(
downsample_bn.running_mean.data.clone()
)
block_params["downsample_bn_v"] = nn.Parameter(
downsample_bn.running_var.data.clone()
)
conv1 = nn.Conv2d(
block_in_channels, out_channels, kernel_size=1, bias=False
)
bn1 = nn.BatchNorm2d(out_channels)
conv2 = nn.Conv2d(
out_channels,
out_channels,
kernel_size=3,
stride=2 if block_idx == 0 and layer_idx > 1 else 1,
padding=1,
bias=False,
)
bn2 = nn.BatchNorm2d(out_channels)
conv3 = nn.Conv2d(
out_channels, out_channels * expansion, kernel_size=1, bias=False
)
bn3 = nn.BatchNorm2d(out_channels * expansion)
block_params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
block_params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
block_params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
block_params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
block_params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())
block_params["conv2_w"] = nn.Parameter(conv2.weight.data.clone())
block_params["bn2_w"] = nn.Parameter(bn2.weight.data.clone())
block_params["bn2_b"] = nn.Parameter(bn2.bias.data.clone())
block_params["bn2_m"] = nn.Parameter(bn2.running_mean.data.clone())
block_params["bn2_v"] = nn.Parameter(bn2.running_var.data.clone())
block_params["conv3_w"] = nn.Parameter(conv3.weight.data.clone())
block_params["bn3_w"] = nn.Parameter(bn3.weight.data.clone())
block_params["bn3_b"] = nn.Parameter(bn3.bias.data.clone())
block_params["bn3_m"] = nn.Parameter(bn3.running_mean.data.clone())
block_params["bn3_v"] = nn.Parameter(bn3.running_var.data.clone())
layer_blocks.append(block_params)
self.params[f"layer{layer_idx}_blocks"] = layer_blocks
in_channels = out_channels * expansion
# Final FC layer
fc = nn.Linear(512 * expansion, num_classes)
self.params["fc_w"] = nn.Parameter(fc.weight.data.clone())
self.params["fc_b"] = nn.Parameter(fc.bias.data.clone())
def forward(self, x, fn=module_fn):
return fn(x, self.params, self.training)
# Test configurations
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, height, width)]
def get_init_inputs():
return [layers, num_classes]
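# A minimal smoke test for the functional path above (a sketch, not part of the
# benchmark harness): build the ParameterDict-based Model, run one batch in eval
# mode, and check the output shape. Eval mode plus no_grad avoids in-place updates
# of the running statistics, which are stored here as nn.Parameter.
if __name__ == "__main__":
    model = Model(*get_init_inputs())
    model.eval()
    with torch.no_grad():
        (x,) = get_inputs()
        out = model(x)  # dispatches to module_fn by default
    assert out.shape == (batch_size, num_classes)
    print("output shape:", tuple(out.shape))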
import torch
import torch.nn as nn
import torch.nn.functional as F
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, in_channels, out_channels, stride=1, downsample=None):
"""
:param in_channels: Number of input channels
:param out_channels: Number of output channels
:param stride: Stride for the first convolutional layer
:param downsample: Downsample layer for the shortcut connection
"""
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(out_channels)
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(out_channels)
self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
"""
:param x: Input tensor, shape (batch_size, in_channels, height, width)
        :return: Output tensor, shape (batch_size, out_channels * expansion, height / stride, width / stride)
"""
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class Model(nn.Module):
def __init__(self, layers, num_classes=1000):
"""
:param block: Type of block to use (BasicBlock or Bottleneck)
:param layers: List of integers specifying the number of blocks in each layer
:param num_classes: Number of output classes
"""
super(Model, self).__init__()
self.in_channels = 64
self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(self.in_channels)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
block = Bottleneck
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
def _make_layer(self, block, out_channels, blocks, stride=1):
downsample = None
if stride != 1 or self.in_channels != out_channels * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.in_channels, out_channels * block.expansion, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(out_channels * block.expansion),
)
layers = []
layers.append(block(self.in_channels, out_channels, stride, downsample))
self.in_channels = out_channels * block.expansion
for _ in range(1, blocks):
layers.append(block(self.in_channels, out_channels))
return nn.Sequential(*layers)
def forward(self, x):
"""
:param x: Input tensor, shape (batch_size, 3, height, width)
:return: Output tensor, shape (batch_size, num_classes)
"""
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.fc(x)
return x
# Test code
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, height, width)]
def get_init_inputs():
return [layers, num_classes]
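# Sanity check for the reference implementation (a sketch, not part of the benchmark):
# with layers = [3, 4, 23, 3] the parameter count should land at roughly 44.5M, the
# expected size of ResNet-101, and a forward pass should produce (batch_size, num_classes).
if __name__ == "__main__":
    ref = Model(layers, num_classes)
    n_params = sum(p.numel() for p in ref.parameters())
    print(f"parameters: {n_params / 1e6:.1f}M")  # expected around 44.5M
    with torch.no_grad():
        out = ref.eval()(*get_inputs())
    assert out.shape == (batch_size, num_classes)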
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <cuda_runtime.h>
namespace py = pybind11;
// This kernel fuses the elementwise addition of the identity and the main branch, and applies ReLU
// It evenly distributes the workload across threads and blocks using a grid-stride loop
__global__ void fused_add_relu_kernel(const float* __restrict__ identity, float* __restrict__ out, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (; idx < size; idx += stride) {
float val = out[idx] + identity[idx];
out[idx] = (val > 0.0f) ? val : 0.0f;
}
}
// The bottleneck function implements the main branch and optionally a downsample branch.
// The final residual addition and activation are fused via a custom CUDA kernel to evenly distribute work.
torch::Tensor bottleneck_fn(
torch::Tensor x,
const torch::Tensor& conv1_w,
const torch::Tensor& conv2_w,
const torch::Tensor& conv3_w,
const torch::Tensor& bn1_w,
const torch::Tensor& bn1_b,
const torch::Tensor& bn1_m,
const torch::Tensor& bn1_v,
const torch::Tensor& bn2_w,
const torch::Tensor& bn2_b,
const torch::Tensor& bn2_m,
const torch::Tensor& bn2_v,
const torch::Tensor& bn3_w,
const torch::Tensor& bn3_b,
const torch::Tensor& bn3_m,
const torch::Tensor& bn3_v,
const torch::Tensor& downsample_conv_w,
const torch::Tensor& downsample_bn_w,
const torch::Tensor& downsample_bn_b,
const torch::Tensor& downsample_bn_m,
const torch::Tensor& downsample_bn_v,
int64_t stride,
bool is_training
) {
torch::Tensor identity = x;
bool has_downsample = downsample_conv_w.defined();
torch::Tensor downsample_out;
if (has_downsample) {
downsample_out = torch::conv2d(x, downsample_conv_w, /*bias=*/torch::Tensor(), stride)
.to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
downsample_out = torch::batch_norm(
downsample_out,
downsample_bn_w,
downsample_bn_b,
downsample_bn_m,
downsample_bn_v,
is_training,
/*momentum=*/0.1,
/*eps=*/1e-5,
/*cudnn_enabled=*/true
);
identity = downsample_out;
}
torch::Tensor out = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor())
.to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
out = torch::batch_norm(
out,
bn1_w,
bn1_b,
bn1_m,
bn1_v,
is_training,
0.1,
1e-5,
true
);
out = torch::relu(out);
out = torch::conv2d(out, conv2_w, /*bias=*/torch::Tensor(), stride, /*padding=*/1)
.to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
out = torch::batch_norm(
out,
bn2_w,
bn2_b,
bn2_m,
bn2_v,
is_training,
0.1,
1e-5,
true
);
out = torch::relu(out);
out = torch::conv2d(out, conv3_w, /*bias=*/torch::Tensor())
.to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
out = torch::batch_norm(
out,
bn3_w,
bn3_b,
bn3_m,
bn3_v,
is_training,
0.1,
1e-5,
true
);
// Fuse the residual addition and ReLU activation using the custom CUDA kernel
int64_t num_elements = out.numel();
const int threads = 256;
const int blocks = (num_elements + threads - 1) / threads;
fused_add_relu_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
identity.data_ptr<float>(),
out.data_ptr<float>(),
num_elements
);
cudaDeviceSynchronize(); // Ensure kernel execution completes before proceeding
return out;
}
// The forward function processes the ResNet101 architecture layer by layer.
// For every block it copies the required parameters to the input's device (non-blocking) and applies bottleneck_fn.
// The fused kernel ensures that the residual addition and activation are distributed evenly among CUDA threads.
torch::Tensor forward(
torch::Tensor x,
py::object params,
bool is_training
) {
auto device = x.device();
// Initial layer parameters
auto conv1_w = params.attr("get")(py::str("conv1_w")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_w = params.attr("get")(py::str("bn1_w")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_b = params.attr("get")(py::str("bn1_b")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_m = params.attr("get")(py::str("bn1_m")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_v = params.attr("get")(py::str("bn1_v")).cast<torch::Tensor>().contiguous().to(device, true);
x = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor(), /*stride=*/2, /*padding=*/3)
.to(x.dtype(), /*non_blocking=*/true, /*copy=*/false, torch::MemoryFormat::Contiguous);
x = torch::batch_norm(
x,
bn1_w,
bn1_b,
bn1_m,
bn1_v,
is_training,
0.1,
1e-5,
true
);
x = torch::relu(x);
x = torch::max_pool2d(x, /*kernel_size=*/3, /*stride=*/2, /*padding=*/1);
// Process each layer's blocks
for (int layer_idx = 1; layer_idx <= 4; ++layer_idx) {
std::string layer_key = "layer" + std::to_string(layer_idx) + "_blocks";
py::list blocks = params.attr("get")(py::str(layer_key)).cast<py::list>();
for (size_t block_idx = 0; block_idx < blocks.size(); ++block_idx) {
py::object block_params = blocks[block_idx];
auto conv1_w = block_params.attr("get")(py::str("conv1_w")).cast<torch::Tensor>().contiguous().to(device, true);
auto conv2_w = block_params.attr("get")(py::str("conv2_w")).cast<torch::Tensor>().contiguous().to(device, true);
auto conv3_w = block_params.attr("get")(py::str("conv3_w")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_w = block_params.attr("get")(py::str("bn1_w")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_b = block_params.attr("get")(py::str("bn1_b")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_m = block_params.attr("get")(py::str("bn1_m")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_v = block_params.attr("get")(py::str("bn1_v")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn2_w = block_params.attr("get")(py::str("bn2_w")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn2_b = block_params.attr("get")(py::str("bn2_b")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn2_m = block_params.attr("get")(py::str("bn2_m")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn2_v = block_params.attr("get")(py::str("bn2_v")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn3_w = block_params.attr("get")(py::str("bn3_w")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn3_b = block_params.attr("get")(py::str("bn3_b")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn3_m = block_params.attr("get")(py::str("bn3_m")).cast<torch::Tensor>().contiguous().to(device, true);
auto bn3_v = block_params.attr("get")(py::str("bn3_v")).cast<torch::Tensor>().contiguous().to(device, true);
torch::Tensor downsample_conv_w = torch::Tensor();
torch::Tensor downsample_bn_w = torch::Tensor();
torch::Tensor downsample_bn_b = torch::Tensor();
torch::Tensor downsample_bn_m = torch::Tensor();
torch::Tensor downsample_bn_v = torch::Tensor();
if (py::bool_(block_params.attr("__contains__")(py::str("downsample_conv_w")))) {
downsample_conv_w = block_params.attr("get")(py::str("downsample_conv_w")).cast<torch::Tensor>().contiguous().to(device, true);
downsample_bn_w = block_params.attr("get")(py::str("downsample_bn_w")).cast<torch::Tensor>().contiguous().to(device, true);
downsample_bn_b = block_params.attr("get")(py::str("downsample_bn_b")).cast<torch::Tensor>().contiguous().to(device, true);
downsample_bn_m = block_params.attr("get")(py::str("downsample_bn_m")).cast<torch::Tensor>().contiguous().to(device, true);
downsample_bn_v = block_params.attr("get")(py::str("downsample_bn_v")).cast<torch::Tensor>().contiguous().to(device, true);
}
int64_t stride = (block_idx == 0 && layer_idx > 1) ? 2 : 1;
x = bottleneck_fn(
x,
conv1_w,
conv2_w,
conv3_w,
bn1_w,
bn1_b,
bn1_m,
bn1_v,
bn2_w,
bn2_b,
bn2_m,
bn2_v,
bn3_w,
bn3_b,
bn3_m,
bn3_v,
downsample_conv_w,
downsample_bn_w,
downsample_bn_b,
downsample_bn_m,
downsample_bn_v,
stride,
is_training
);
}
}
x = torch::adaptive_avg_pool2d(x, {1, 1}).contiguous();
x = x.view({x.size(0), -1});
auto fc_w = params.attr("get")(py::str("fc_w")).cast<torch::Tensor>().contiguous().to(device, true);
auto fc_b = params.attr("get")(py::str("fc_b")).cast<torch::Tensor>().contiguous().to(device, true);
return torch::linear(x, fc_w, fc_b);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "ResNet101 forward function with fused addition and ReLU");
}
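The extension above can be compiled and exercised from Python in much the same way as the pure-PyTorch path. The sketch below is illustrative only: it assumes the C++/CUDA source is saved under the hypothetical file name resnet101_fused.cu, JIT-compiles it with torch.utils.cpp_extension.load, and plugs the resulting forward into the ParameterDict-based Model through its fn hook. Because the extension copies every parameter to the input's device on each call, the sizeable aten::copy_ and Memcpy HtoD entries in the profile below are expected.

import torch
from torch.utils.cpp_extension import load

# JIT-compile the CUDA/C++ source above; the file name is a placeholder.
ext = load(name="resnet101_fused", sources=["resnet101_fused.cu"], verbose=True)

model = Model(layers, num_classes)  # ParameterDict-based Model from the first listing
x = torch.randn(batch_size, 3, height, width, device="cuda")

model.eval()
with torch.no_grad():
    out = model(x, fn=ext.forward)  # matches forward(x, params, is_training)

assert out.shape == (batch_size, num_classes)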
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 2.026 | inst/cycle | 0.002 | 5 |
Executed Ipc Elapsed | 1.624 | inst/cycle | 0.007 | 5 |
Issue Slots Busy | 51.182 | % | 1.069 | 5 |
Issued Ipc Active | 2.046 | inst/cycle | 0.002 | 5 |
SM Busy | 51.182 | % | 1.069 | 5 |
Memory Throughput | 1555965485156.598 | byte/second | 12865757325163071799296.000 | 5 |
Mem Busy | 33.272 | % | 3.706 | 5 |
Max Bandwidth | 46.618 | % | 11.119 | 5 |
L1/TEX Hit Rate | 33.278 | % | 0.002 | 5 |
L2 Hit Rate | 36.298 | % | 0.188 | 5 |
Mem Pipes Busy | 41.030 | % | 3.968 | 5 |
Warp Cycles Per Issued Instruction | 24.240 | cycle | 0.056 | 5 |
Warp Cycles Per Executed Instruction | 24.464 | cycle | 0.082 | 5 |
Avg. Active Threads Per Warp | 32.000 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 28.000 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 78.476 | % | 0.062 | 5 |
Achieved Active Warps Per SM | 50.224 | warp | 0.026 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (32.4%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (78.5%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
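The achieved-occupancy figure flagged by the rule above follows directly from the warp counts in the metrics table: 50.224 achieved active warps per SM against a theoretical 64 gives about 78.5%, matching the reported 78.476%. A one-line check (illustrative only):

achieved_warps, theoretical_warps = 50.224, 64.0
print(f"occupancy ~= {100 * achieved_warps / theoretical_warps:.1f}%")  # prints 78.5%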
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 5755936.10 | μs |
Device Time | 2899889.06 | μs |
Self CPU Time | 100871.29 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 5655064.81 | μs |
Device Time | 2899889.06 | μs |
Self CPU Time | 306699.69 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::copy_ | ||
CPU Time | 6391356.89 | μs |
Device Time | 2899889.06 | μs |
Self CPU Time | 1902843.96 | μs |
Self Device Time | 2899889.06 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaMemcpyAsync | ||
CPU Time | 4488403.88 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 4488403.88 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
Memcpy HtoD (Pageable -> Device) | ||
CPU Time | 0.00 | μs |
Device Time | 2899890.95 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 2899890.95 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::clone | ||
CPU Time | 1661466.33 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 6945.93 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::conv2d | ||
CPU Time | 1106288.10 | μs |
Device Time | 604797.11 | μs |
Self CPU Time | 44392.89 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |