import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor, params: nn.ParameterDict, is_training: bool
) -> torch.Tensor:
"""
Functional implementation of the ResNet-101 forward pass.
Args:
x (torch.Tensor): Input tensor, shape (batch_size, in_channels, height, width)
params (nn.ParameterDict): Dictionary of parameters
is_training (bool): Whether to use training mode
Returns:
torch.Tensor: Output tensor, shape (batch_size, num_classes)
"""
# Initial layers
x = F.conv2d(x, params["conv1_w"].to(x.device), bias=None, stride=2, padding=3)
x = F.batch_norm(
x,
params["bn1_m"].to(x.device),
params["bn1_v"].to(x.device),
params["bn1_w"].to(x.device),
params["bn1_b"].to(x.device),
training=is_training,
)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
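    # Shape note (assuming the 224x224 test inputs defined below): the stride-2
    # 7x7 conv halves the spatial size to 112x112 and the stride-2 max pool
    # halves it again, so x enters layer 1 as (batch_size, 64, 56, 56).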
def bottleneck_fn(
x,
conv1_w,
conv2_w,
conv3_w,
bn1_w,
bn1_b,
bn1_m,
bn1_v,
bn2_w,
bn2_b,
bn2_m,
bn2_v,
bn3_w,
bn3_b,
bn3_m,
bn3_v,
downsample_conv_w=None,
downsample_bn_w=None,
downsample_bn_b=None,
downsample_bn_m=None,
downsample_bn_v=None,
stride=1,
is_training=True,
):
identity = x
out = F.conv2d(x, conv1_w.to(x.device), bias=None)
out = F.batch_norm(
out,
bn1_m.to(x.device),
bn1_v.to(x.device),
bn1_w.to(x.device),
bn1_b.to(x.device),
training=is_training,
)
out = F.relu(out)
out = F.conv2d(out, conv2_w.to(x.device), bias=None, stride=stride, padding=1)
out = F.batch_norm(
out,
bn2_m.to(x.device),
bn2_v.to(x.device),
bn2_w.to(x.device),
bn2_b.to(x.device),
training=is_training,
)
out = F.relu(out)
out = F.conv2d(out, conv3_w.to(x.device), bias=None)
out = F.batch_norm(
out,
bn3_m.to(x.device),
bn3_v.to(x.device),
bn3_w.to(x.device),
bn3_b.to(x.device),
training=is_training,
)
if downsample_conv_w is not None:
identity = F.conv2d(
x, downsample_conv_w.to(x.device), bias=None, stride=stride
)
identity = F.batch_norm(
identity,
downsample_bn_m.to(x.device),
downsample_bn_v.to(x.device),
downsample_bn_w.to(x.device),
downsample_bn_b.to(x.device),
training=is_training,
)
out += identity
out = F.relu(out)
return out
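    # Note: as in torchvision's Bottleneck (the "ResNet v1.5" convention), the
    # stride is applied at the 3x3 conv (conv2), and the optional 1x1 downsample
    # branch uses the same stride so that identity and out match before addition.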
# Layer 1-4
for layer_idx in range(1, 5):
blocks = params[f"layer{layer_idx}_blocks"]
for block_idx in range(len(blocks)):
block_params = blocks[block_idx]
downsample_params = None
if "downsample_conv_w" in block_params:
downsample_params = [
block_params["downsample_conv_w"],
block_params["downsample_bn_w"],
block_params["downsample_bn_b"],
block_params["downsample_bn_m"],
block_params["downsample_bn_v"],
]
x = bottleneck_fn(
x,
block_params["conv1_w"],
block_params["conv2_w"],
block_params["conv3_w"],
block_params["bn1_w"],
block_params["bn1_b"],
block_params["bn1_m"],
block_params["bn1_v"],
block_params["bn2_w"],
block_params["bn2_b"],
block_params["bn2_m"],
block_params["bn2_v"],
block_params["bn3_w"],
block_params["bn3_b"],
block_params["bn3_m"],
block_params["bn3_v"],
*(downsample_params if downsample_params else [None] * 5),
stride=2 if block_idx == 0 and layer_idx > 1 else 1,
is_training=is_training,
)
x = F.adaptive_avg_pool2d(x, (1, 1))
x = torch.flatten(x, 1)
x = F.linear(x, params["fc_w"].to(x.device), params["fc_b"].to(x.device))
return x
class Model(nn.Module):
def __init__(self, layers, num_classes=1000):
super(Model, self).__init__()
self.params = nn.ParameterDict()
in_channels = 64
expansion = 4
# Initial layers
conv1 = nn.Conv2d(
3, in_channels, kernel_size=7, stride=2, padding=3, bias=False
)
bn1 = nn.BatchNorm2d(in_channels)
self.params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
self.params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
self.params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
self.params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
self.params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())
# Layers 1-4
channels = [64, 128, 256, 512]
for layer_idx, (out_channels, num_blocks) in enumerate(
zip(channels, layers), 1
):
layer_blocks = []
for block_idx in range(num_blocks):
block_in_channels = (
in_channels if block_idx == 0 else out_channels * expansion
)
# Create block parameters
block_params = {}
# First block may have downsample
if block_idx == 0 and (
layer_idx > 1 or block_in_channels != out_channels * expansion
):
downsample_conv = nn.Conv2d(
block_in_channels,
out_channels * expansion,
kernel_size=1,
stride=2 if layer_idx > 1 else 1,
bias=False,
)
downsample_bn = nn.BatchNorm2d(out_channels * expansion)
block_params["downsample_conv_w"] = nn.Parameter(
downsample_conv.weight.data.clone()
)
block_params["downsample_bn_w"] = nn.Parameter(
downsample_bn.weight.data.clone()
)
block_params["downsample_bn_b"] = nn.Parameter(
downsample_bn.bias.data.clone()
)
block_params["downsample_bn_m"] = nn.Parameter(
downsample_bn.running_mean.data.clone()
)
block_params["downsample_bn_v"] = nn.Parameter(
downsample_bn.running_var.data.clone()
)
conv1 = nn.Conv2d(
block_in_channels, out_channels, kernel_size=1, bias=False
)
bn1 = nn.BatchNorm2d(out_channels)
conv2 = nn.Conv2d(
out_channels,
out_channels,
kernel_size=3,
stride=2 if block_idx == 0 and layer_idx > 1 else 1,
padding=1,
bias=False,
)
bn2 = nn.BatchNorm2d(out_channels)
conv3 = nn.Conv2d(
out_channels, out_channels * expansion, kernel_size=1, bias=False
)
bn3 = nn.BatchNorm2d(out_channels * expansion)
block_params["conv1_w"] = nn.Parameter(conv1.weight.data.clone())
block_params["bn1_w"] = nn.Parameter(bn1.weight.data.clone())
block_params["bn1_b"] = nn.Parameter(bn1.bias.data.clone())
block_params["bn1_m"] = nn.Parameter(bn1.running_mean.data.clone())
block_params["bn1_v"] = nn.Parameter(bn1.running_var.data.clone())
block_params["conv2_w"] = nn.Parameter(conv2.weight.data.clone())
block_params["bn2_w"] = nn.Parameter(bn2.weight.data.clone())
block_params["bn2_b"] = nn.Parameter(bn2.bias.data.clone())
block_params["bn2_m"] = nn.Parameter(bn2.running_mean.data.clone())
block_params["bn2_v"] = nn.Parameter(bn2.running_var.data.clone())
block_params["conv3_w"] = nn.Parameter(conv3.weight.data.clone())
block_params["bn3_w"] = nn.Parameter(bn3.weight.data.clone())
block_params["bn3_b"] = nn.Parameter(bn3.bias.data.clone())
block_params["bn3_m"] = nn.Parameter(bn3.running_mean.data.clone())
block_params["bn3_v"] = nn.Parameter(bn3.running_var.data.clone())
layer_blocks.append(block_params)
self.params[f"layer{layer_idx}_blocks"] = layer_blocks
in_channels = out_channels * expansion
# Final FC layer
fc = nn.Linear(512 * expansion, num_classes)
self.params["fc_w"] = nn.Parameter(fc.weight.data.clone())
self.params["fc_b"] = nn.Parameter(fc.bias.data.clone())
def forward(self, x, fn=module_fn):
return fn(x, self.params, self.training)
# Test configurations
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, height, width)]
def get_init_inputs():
return [layers, num_classes]
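# A minimal smoke-test sketch (not part of the benchmark harness above; it only
# wires together Model, get_inputs and get_init_inputs from this file and assumes
# a CPU shape check is acceptable -- this is a full ResNet-101 forward pass, so it
# is slow):
if __name__ == "__main__":
    model = Model(*get_init_inputs())
    model.eval()  # batch norm uses the stored running statistics
    with torch.no_grad():
        out = model(*get_inputs())
    assert out.shape == (batch_size, num_classes)
    print("output shape:", tuple(out.shape))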
import torch
import torch.nn as nn
import torch.nn.functional as F
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, in_channels, out_channels, stride=1, downsample=None):
"""
:param in_channels: Number of input channels
:param out_channels: Number of output channels
:param stride: Stride for the first convolutional layer
:param downsample: Downsample layer for the shortcut connection
"""
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(out_channels)
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(out_channels)
self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
"""
:param x: Input tensor, shape (batch_size, in_channels, height, width)
:return: Output tensor, shape (batch_size, out_channels * expansion, height / stride, width / stride)
"""
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class Model(nn.Module):
def __init__(self, layers, num_classes=1000):
"""
:param layers: List of integers specifying the number of blocks in each layer
:param num_classes: Number of output classes
"""
super(Model, self).__init__()
self.in_channels = 64
self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(self.in_channels)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
block = Bottleneck
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
def _make_layer(self, block, out_channels, blocks, stride=1):
downsample = None
if stride != 1 or self.in_channels != out_channels * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.in_channels, out_channels * block.expansion, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(out_channels * block.expansion),
)
layers = []
layers.append(block(self.in_channels, out_channels, stride, downsample))
self.in_channels = out_channels * block.expansion
for _ in range(1, blocks):
layers.append(block(self.in_channels, out_channels))
return nn.Sequential(*layers)
def forward(self, x):
"""
:param x: Input tensor, shape (batch_size, 3, height, width)
:return: Output tensor, shape (batch_size, num_classes)
"""
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.fc(x)
return x
# Test code
batch_size = 10
height = 224
width = 224
layers = [3, 4, 23, 3]
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, height, width)]
def get_init_inputs():
return [layers, num_classes]
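# A small sanity-check sketch (the helper count_weight_layers is hypothetical and
# not part of the original file): the "101" in ResNet-101 counts the stem conv,
# three convs per Bottleneck block, and the final fc layer.
def count_weight_layers(block_counts):
    # 1 stem conv + 3 convs per bottleneck + 1 fc; the 1x1 downsample convs are
    # conventionally not counted in the depth.
    return 1 + 3 * sum(block_counts) + 1
assert count_weight_layers(layers) == 101  # layers == [3, 4, 23, 3]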
#include <torch/extension.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <cuda_runtime.h>
namespace py = pybind11;
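// warp_avg_pool_kernel: global average pooling over one (n, c) HxW plane per
// block. The host launch below uses 32 threads per block (a single warp), which
// this kernel relies on: the __shfl_down_sync reduction only combines lanes
// within one warp, so blockDim.x must not exceed warpSize. Note that the
// `spatial` argument is passed as H * W and is redundant with H and W.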
__global__ void warp_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int spatial, int C, int H, int W) {
int index = blockIdx.x;
int n = index / C;
int c = index % C;
int total = H * W;
const float* in_ptr = input + (n * C + c) * total;
float sum = 0.0f;
for (int i = threadIdx.x; i < total; i += blockDim.x) {
sum += in_ptr[i];
}
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
sum += __shfl_down_sync(0xffffffff, sum, offset);
}
if (threadIdx.x == 0) {
output[index] = sum / total;
}
}
torch::Tensor unified_bottleneck(
torch::Tensor x,
torch::Tensor conv1_w,
torch::Tensor conv2_w,
torch::Tensor conv3_w,
torch::Tensor bn1_w,
torch::Tensor bn1_b,
torch::Tensor bn1_m,
torch::Tensor bn1_v,
torch::Tensor bn2_w,
torch::Tensor bn2_b,
torch::Tensor bn2_m,
torch::Tensor bn2_v,
torch::Tensor bn3_w,
torch::Tensor bn3_b,
torch::Tensor bn3_m,
torch::Tensor bn3_v,
bool has_downsample,
torch::Tensor downsample_conv_w,
torch::Tensor downsample_bn_w,
torch::Tensor downsample_bn_b,
torch::Tensor downsample_bn_m,
torch::Tensor downsample_bn_v,
int64_t stride,
bool is_training
) {
auto out = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor());
out = torch::batch_norm(out, bn1_w, bn1_b, bn1_m, bn1_v, is_training, 0.1, 1e-5, true);
out = torch::relu(out);
out = torch::conv2d(out, conv2_w, /*bias=*/torch::Tensor(), stride, 1);
out = torch::batch_norm(out, bn2_w, bn2_b, bn2_m, bn2_v, is_training, 0.1, 1e-5, true);
out = torch::relu(out);
out = torch::conv2d(out, conv3_w, /*bias=*/torch::Tensor());
out = torch::batch_norm(out, bn3_w, bn3_b, bn3_m, bn3_v, is_training, 0.1, 1e-5, true);
torch::Tensor identity;
if (has_downsample) {
identity = torch::conv2d(x, downsample_conv_w, /*bias=*/torch::Tensor(), stride);
identity = torch::batch_norm(identity, downsample_bn_w, downsample_bn_b, downsample_bn_m, downsample_bn_v, is_training, 0.1, 1e-5, true);
} else {
identity = x.to(out.dtype());
}
return torch::relu(out + identity);
}
torch::Tensor forward(torch::Tensor x, py::object params, bool is_training) {
auto device = x.device();
auto conv1_w = params.attr("get")("conv1_w").cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_w = params.attr("get")("bn1_w").cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_b = params.attr("get")("bn1_b").cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_m = params.attr("get")("bn1_m").cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_v = params.attr("get")("bn1_v").cast<torch::Tensor>().contiguous().to(device, true);
x = torch::conv2d(x, conv1_w, /*bias=*/torch::Tensor(), 2, 3);
x = torch::batch_norm(x, bn1_w, bn1_b, bn1_m, bn1_v, is_training, 0.1, 1e-5, true);
x = torch::relu(x);
x = torch::max_pool2d(x, 3, 2, 1);
for (int layer_idx = 1; layer_idx <= 4; ++layer_idx) {
std::string layer_key = "layer" + std::to_string(layer_idx) + "_blocks";
py::list blocks = params.attr("get")(py::str(layer_key)).cast<py::list>();
for (size_t block_idx = 0; block_idx < blocks.size(); ++block_idx) {
py::object block = blocks[block_idx];
auto conv1_w_blk = block.attr("get")("conv1_w").cast<torch::Tensor>().contiguous().to(device, true);
auto conv2_w_blk = block.attr("get")("conv2_w").cast<torch::Tensor>().contiguous().to(device, true);
auto conv3_w_blk = block.attr("get")("conv3_w").cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_w_blk = block.attr("get")("bn1_w").cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_b_blk = block.attr("get")("bn1_b").cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_m_blk = block.attr("get")("bn1_m").cast<torch::Tensor>().contiguous().to(device, true);
auto bn1_v_blk = block.attr("get")("bn1_v").cast<torch::Tensor>().contiguous().to(device, true);
auto bn2_w_blk = block.attr("get")("bn2_w").cast<torch::Tensor>().contiguous().to(device, true);
auto bn2_b_blk = block.attr("get")("bn2_b").cast<torch::Tensor>().contiguous().to(device, true);
auto bn2_m_blk = block.attr("get")("bn2_m").cast<torch::Tensor>().contiguous().to(device, true);
auto bn2_v_blk = block.attr("get")("bn2_v").cast<torch::Tensor>().contiguous().to(device, true);
auto bn3_w_blk = block.attr("get")("bn3_w").cast<torch::Tensor>().contiguous().to(device, true);
auto bn3_b_blk = block.attr("get")("bn3_b").cast<torch::Tensor>().contiguous().to(device, true);
auto bn3_m_blk = block.attr("get")("bn3_m").cast<torch::Tensor>().contiguous().to(device, true);
auto bn3_v_blk = block.attr("get")("bn3_v").cast<torch::Tensor>().contiguous().to(device, true);
bool has_downsample = py::bool_(block.attr("__contains__")(py::str("downsample_conv_w")));
torch::Tensor downsample_conv_w, downsample_bn_w, downsample_bn_b, downsample_bn_m, downsample_bn_v;
if (has_downsample) {
downsample_conv_w = block.attr("get")("downsample_conv_w").cast<torch::Tensor>().contiguous().to(device, true);
downsample_bn_w = block.attr("get")("downsample_bn_w").cast<torch::Tensor>().contiguous().to(device, true);
downsample_bn_b = block.attr("get")("downsample_bn_b").cast<torch::Tensor>().contiguous().to(device, true);
downsample_bn_m = block.attr("get")("downsample_bn_m").cast<torch::Tensor>().contiguous().to(device, true);
downsample_bn_v = block.attr("get")("downsample_bn_v").cast<torch::Tensor>().contiguous().to(device, true);
}
int64_t stride = (block_idx == 0 && layer_idx > 1) ? 2 : 1;
x = unified_bottleneck(
x,
conv1_w_blk, conv2_w_blk, conv3_w_blk,
bn1_w_blk, bn1_b_blk, bn1_m_blk, bn1_v_blk,
bn2_w_blk, bn2_b_blk, bn2_m_blk, bn2_v_blk,
bn3_w_blk, bn3_b_blk, bn3_m_blk, bn3_v_blk,
has_downsample,
downsample_conv_w, downsample_bn_w, downsample_bn_b, downsample_bn_m, downsample_bn_v,
stride, is_training
);
}
}
auto sizes = x.sizes();
int N = sizes[0];
int C = sizes[1];
int H = sizes[2];
int W = sizes[3];
int spatial = H * W;
auto pooled = torch::empty({N, C, 1, 1}, x.options());
int grid = N * C;
int threads = 32;
warp_avg_pool_kernel<<<grid, threads>>>(x.data_ptr<float>(), pooled.data_ptr<float>(), spatial, C, H, W);
cudaDeviceSynchronize();
x = pooled.view({N, C});
auto fc_w = params.attr("get")("fc_w").cast<torch::Tensor>().contiguous().to(device, true);
auto fc_b = params.attr("get")("fc_b").cast<torch::Tensor>().contiguous().to(device, true);
return torch::linear(x, fc_w, fc_b);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "Optimized ResNet101 with warp pooling and streamlined params");
}
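# A hedged build-and-load sketch for the extension above. The source filename
# "resnet101_warp_pool.cu" and the module name are illustrative assumptions, not
# taken from the original repository; torch.utils.cpp_extension.load JIT-compiles
# the file and exposes the pybind11 "forward" entry point.
from torch.utils.cpp_extension import load

ext = load(
    name="resnet101_warp_pool",          # extension name (assumption)
    sources=["resnet101_warp_pool.cu"],  # where the CUDA/C++ source above is saved (assumption)
    verbose=True,
)
# ext.forward(x, model.params, model.training) then mirrors
# module_fn(x, params, is_training); x must be a float32 CUDA tensor, since the
# custom pooling kernel reads raw float pointers on the device.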
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 0.726 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.560 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 18.428 | % | 0.053 | 5 |
Issued Ipc Active | 0.736 | inst/cycle | 0.000 | 5 |
SM Busy | 18.428 | % | 0.053 | 5 |
Memory Throughput | 258432560647.092 | byte/second | 499909539194520384.000 | 5 |
Mem Busy | 7.828 | % | 0.001 | 5 |
Max Bandwidth | 7.730 | % | 0.000 | 5 |
L1/TEX Hit Rate | 9.922 | % | 0.000 | 5 |
L2 Hit Rate | 40.640 | % | 0.007 | 5 |
Mem Pipes Busy | 5.682 | % | 0.000 | 5 |
Warp Cycles Per Issued Instruction | 17.286 | cycle | 0.015 | 5 |
Warp Cycles Per Executed Instruction | 17.628 | cycle | 0.020 | 5 |
Avg. Active Threads Per Warp | 26.080 | | 0.000 | 5 |
Avg. Not Predicated Off Threads Per Warp | 24.990 | | 0.000 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 128.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 64.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 32.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 50.000 | % | 0.000 | 5 |
Achieved Occupancy | 20.382 | % | 0.021 | 5 |
Achieved Active Warps Per SM | 13.044 | warp | 0.009 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared memory. The difference between calculated theoretical (50.0%) and measured achieved occupancy (20.2%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
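A quick arithmetic check of the occupancy warning above, using the launch configuration from the kernel (32 threads per block) together with the block limits reported in the metrics table; the 64-warps-per-SM ceiling is an assumption about the profiled GPU that is consistent with the "Block Limit Warps" row:

threads_per_block = 32                 # launch config of warp_avg_pool_kernel
warps_per_block = threads_per_block // 32           # = 1
resident_blocks = 32                   # "Block Limit Shared Mem" from the table
max_warps_per_sm = 64                  # assumed hardware limit per SM
active_warps = warps_per_block * resident_blocks    # = 32, matches the table
occupancy = active_warps / max_warps_per_sm          # = 0.5, i.e. the 50% reported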
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 5043381.72 | μs |
Device Time | 2434767.38 | μs |
Self CPU Time | 86332.07 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 4957049.64 | μs |
Device Time | 2434767.38 | μs |
Self CPU Time | 284941.56 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::copy_ | ||
CPU Time | 5246972.96 | μs |
Device Time | 2434767.38 | μs |
Self CPU Time | 1318476.41 | μs |
Self Device Time | 2434767.38 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaMemcpyAsync | ||
CPU Time | 3928388.91 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 3928388.91 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
Memcpy HtoD (Pageable -> Device) | ||
CPU Time | 0.00 | μs |
Device Time | 2434769.05 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 2434769.05 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::conv2d | ||
CPU Time | 1098533.59 | μs |
Device Time | 534840.73 | μs |
Self CPU Time | 40168.28 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45295 warnings generated when compiling for host. Suppressed 45322 warnings (45275 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.