import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(
x: torch.Tensor, params: nn.ParameterDict, is_training: bool
) -> torch.Tensor:
"""
Implementation of EfficientNetB2
Args:
x: Input tensor of shape (batch_size, 3, 224, 224).
params: A nn.ParameterDict containing model parameters.
is_training: Whether the model is in training mode.
Returns:
torch.Tensor: Output tensor of shape (batch_size, 1000).
"""
# Initial conv
x = F.conv2d(x, params["conv1_weight"], None, stride=2, padding=1)
x = F.batch_norm(
x,
params["bn1_mean"],
params["bn1_var"],
params["bn1_weight"],
params["bn1_bias"],
is_training,
)
x = F.relu(x, inplace=True)
def mbconv_block_fn(x, params, stride, expand_ratio, is_training):
"""
Functional implementation of MBConv block
"""
in_channels = x.size(1)
expanded_channels = in_channels * expand_ratio
# Expansion phase
if expand_ratio != 1:
x = F.conv2d(x, params["expand_conv_weight"], None)
x = F.batch_norm(
x,
params["expand_bn_mean"],
params["expand_bn_var"],
params["expand_bn_weight"],
params["expand_bn_bias"],
is_training,
)
x = F.relu(x, inplace=True)
else:
expanded_channels = in_channels
# Depthwise conv
x = F.conv2d(
x,
params["dw_conv_weight"],
None,
stride=stride,
padding=1,
groups=expanded_channels,
)
x = F.batch_norm(
x,
params["dw_bn_mean"],
params["dw_bn_var"],
params["dw_bn_weight"],
params["dw_bn_bias"],
is_training,
)
x = F.relu(x, inplace=True)
# Squeeze and Excitation
se = F.adaptive_avg_pool2d(x, (1, 1))
se = F.conv2d(se, params["se_reduce_weight"], None)
se = F.relu(se, inplace=True)
se = F.conv2d(se, params["se_expand_weight"], None)
se = torch.sigmoid(se)
        # NOTE: the reference assigns the SE output directly here instead of the
        # standard channel-wise rescaling (x = x * se); the CUDA port below
        # intentionally mirrors this behavior.
        x = se
# Output phase
x = F.conv2d(x, params["project_conv_weight"], None)
x = F.batch_norm(
x,
params["project_bn_mean"],
params["project_bn_var"],
params["project_bn_weight"],
params["project_bn_bias"],
is_training,
)
return x
# MBConv blocks
mbconv_configs = [(1, 3), (2, 6), (2, 6), (2, 6), (1, 6)]
for i, (stride, expand_ratio) in enumerate(mbconv_configs, 1):
block_params = {
k.replace(f"mbconv{i}_", ""): v
for k, v in params.items()
if k.startswith(f"mbconv{i}_")
}
x = mbconv_block_fn(x, block_params, stride, expand_ratio, is_training)
# Final layers
x = F.conv2d(x, params["conv_final_weight"], None)
x = F.batch_norm(
x,
params["bn_final_mean"],
params["bn_final_var"],
params["bn_final_weight"],
params["bn_final_bias"],
is_training,
)
x = F.relu(x, inplace=True)
x = F.adaptive_avg_pool2d(x, (1, 1))
x = torch.flatten(x, 1)
x = F.linear(x, params["fc_weight"], params["fc_bias"])
return x
class Model(nn.Module):
def __init__(self, num_classes=1000):
super(Model, self).__init__()
# Create the original model to ensure identical initialization
original_model = nn.Module()
original_model.conv1 = nn.Conv2d(
3, 32, kernel_size=3, stride=2, padding=1, bias=False
)
original_model.bn1 = nn.BatchNorm2d(32)
original_model.relu = nn.ReLU(inplace=True)
# MBConv blocks
configs = [
(32, 96, 1, 3),
(96, 144, 2, 6),
(144, 192, 2, 6),
(192, 288, 2, 6),
(288, 384, 1, 6),
]
for i, (in_c, out_c, stride, expand) in enumerate(configs, 1):
expanded_c = in_c * expand
block = nn.Sequential()
if expand != 1:
block.add_module(
"expand_conv", nn.Conv2d(in_c, expanded_c, 1, bias=False)
)
block.add_module("expand_bn", nn.BatchNorm2d(expanded_c))
block.add_module("expand_relu", nn.ReLU(inplace=True))
block.add_module(
"dw_conv",
nn.Conv2d(
expanded_c,
expanded_c,
3,
stride=stride,
padding=1,
groups=expanded_c,
bias=False,
),
)
block.add_module("dw_bn", nn.BatchNorm2d(expanded_c))
block.add_module("dw_relu", nn.ReLU(inplace=True))
block.add_module("se_pool", nn.AdaptiveAvgPool2d((1, 1)))
block.add_module(
"se_reduce", nn.Conv2d(expanded_c, expanded_c // 4, 1, bias=False)
)
block.add_module("se_reduce_relu", nn.ReLU(inplace=True))
block.add_module(
"se_expand", nn.Conv2d(expanded_c // 4, expanded_c, 1, bias=False)
)
block.add_module("se_sigmoid", nn.Sigmoid())
block.add_module(
"project_conv", nn.Conv2d(expanded_c, out_c, 1, bias=False)
)
block.add_module("project_bn", nn.BatchNorm2d(out_c))
setattr(original_model, f"mbconv{i}", block)
original_model.conv_final = nn.Conv2d(384, 1408, 1, bias=False)
original_model.bn_final = nn.BatchNorm2d(1408)
original_model.avgpool = nn.AdaptiveAvgPool2d((1, 1))
original_model.fc = nn.Linear(1408, num_classes)
# Initialize parameters and buffers
self.params = nn.ParameterDict()
# Copy initial conv parameters
self.params["conv1_weight"] = nn.Parameter(original_model.conv1.weight.data)
self.params["bn1_weight"] = nn.Parameter(original_model.bn1.weight.data)
self.params["bn1_bias"] = nn.Parameter(original_model.bn1.bias.data)
self.register_buffer("bn1_mean", original_model.bn1.running_mean)
self.register_buffer("bn1_var", original_model.bn1.running_var)
# Copy MBConv block parameters
for i in range(1, 6):
block = getattr(original_model, f"mbconv{i}")
prefix = f"mbconv{i}_"
if hasattr(block, "expand_conv"):
self.params[prefix + "expand_conv_weight"] = nn.Parameter(
block.expand_conv.weight.data
)
self.params[prefix + "expand_bn_weight"] = nn.Parameter(
block.expand_bn.weight.data
)
self.params[prefix + "expand_bn_bias"] = nn.Parameter(
block.expand_bn.bias.data
)
self.register_buffer(
prefix + "expand_bn_mean", block.expand_bn.running_mean
)
self.register_buffer(
prefix + "expand_bn_var", block.expand_bn.running_var
)
self.params[prefix + "dw_conv_weight"] = nn.Parameter(
block.dw_conv.weight.data
)
self.params[prefix + "dw_bn_weight"] = nn.Parameter(block.dw_bn.weight.data)
self.params[prefix + "dw_bn_bias"] = nn.Parameter(block.dw_bn.bias.data)
self.register_buffer(prefix + "dw_bn_mean", block.dw_bn.running_mean)
self.register_buffer(prefix + "dw_bn_var", block.dw_bn.running_var)
self.params[prefix + "se_reduce_weight"] = nn.Parameter(
block.se_reduce.weight.data
)
self.params[prefix + "se_expand_weight"] = nn.Parameter(
block.se_expand.weight.data
)
self.params[prefix + "project_conv_weight"] = nn.Parameter(
block.project_conv.weight.data
)
self.params[prefix + "project_bn_weight"] = nn.Parameter(
block.project_bn.weight.data
)
self.params[prefix + "project_bn_bias"] = nn.Parameter(
block.project_bn.bias.data
)
self.register_buffer(
prefix + "project_bn_mean", block.project_bn.running_mean
)
self.register_buffer(
prefix + "project_bn_var", block.project_bn.running_var
)
# Copy final layer parameters
self.params["conv_final_weight"] = nn.Parameter(
original_model.conv_final.weight.data
)
self.params["bn_final_weight"] = nn.Parameter(
original_model.bn_final.weight.data
)
self.params["bn_final_bias"] = nn.Parameter(original_model.bn_final.bias.data)
self.register_buffer("bn_final_mean", original_model.bn_final.running_mean)
self.register_buffer("bn_final_var", original_model.bn_final.running_var)
self.params["fc_weight"] = nn.Parameter(original_model.fc.weight.data)
self.params["fc_bias"] = nn.Parameter(original_model.fc.bias.data)
def forward(self, x, fn=module_fn):
params = {
**dict(self.params),
**{k: v for k, v in self._buffers.items() if v is not None},
}
return fn(x, params, self.training)
batch_size = 2
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, 224, 224)]
def get_init_inputs():
return [num_classes]
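A minimal usage sketch for the parameter-dict variant above (only names defined in this listing are used): it instantiates Model, runs a forward pass that dispatches to module_fn, and checks the output shape.

# Usage sketch (illustrative): run the functional model end to end.
model = Model(*get_init_inputs())
model.eval()  # use the registered running statistics in F.batch_norm
x, = get_inputs()
with torch.no_grad():
    out = model(x)  # Model.forward dispatches to module_fn(x, params, is_training)
print(out.shape)  # expected: torch.Size([2, 1000])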
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self, num_classes=1000):
"""
EfficientNetB2 architecture implementation.
:param num_classes: The number of output classes (default is 1000 for ImageNet).
"""
super(Model, self).__init__()
# Define the EfficientNetB2 architecture components
self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(32)
self.relu = nn.ReLU(inplace=True)
# Define the MBConv blocks
self.mbconv1 = self._make_mbconv_block(32, 96, 1, 3)
self.mbconv2 = self._make_mbconv_block(96, 144, 2, 6)
self.mbconv3 = self._make_mbconv_block(144, 192, 2, 6)
self.mbconv4 = self._make_mbconv_block(192, 288, 2, 6)
self.mbconv5 = self._make_mbconv_block(288, 384, 1, 6)
# Final layers
self.conv_final = nn.Conv2d(384, 1408, kernel_size=1, stride=1, padding=0, bias=False)
self.bn_final = nn.BatchNorm2d(1408)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(1408, num_classes)
def _make_mbconv_block(self, in_channels, out_channels, stride, expand_ratio):
"""
        Helper function to create an MBConv block.
:param in_channels: Number of input channels.
:param out_channels: Number of output channels.
:param stride: Stride for the depthwise convolution.
:param expand_ratio: Expansion ratio for the MBConv block.
:return: A sequential container of layers forming the MBConv block.
"""
layers = []
expanded_channels = in_channels * expand_ratio
# Expansion phase
if expand_ratio != 1:
layers.append(nn.Conv2d(in_channels, expanded_channels, kernel_size=1, stride=1, padding=0, bias=False))
layers.append(nn.BatchNorm2d(expanded_channels))
layers.append(nn.ReLU(inplace=True))
# Depthwise convolution
layers.append(nn.Conv2d(expanded_channels, expanded_channels, kernel_size=3, stride=stride, padding=1, groups=expanded_channels, bias=False))
layers.append(nn.BatchNorm2d(expanded_channels))
layers.append(nn.ReLU(inplace=True))
# Squeeze and Excitation
layers.append(nn.AdaptiveAvgPool2d((1, 1)))
layers.append(nn.Conv2d(expanded_channels, expanded_channels // 4, kernel_size=1, stride=1, padding=0, bias=False))
layers.append(nn.ReLU(inplace=True))
layers.append(nn.Conv2d(expanded_channels // 4, expanded_channels, kernel_size=1, stride=1, padding=0, bias=False))
layers.append(nn.Sigmoid())
# Output phase
layers.append(nn.Conv2d(expanded_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False))
layers.append(nn.BatchNorm2d(out_channels))
return nn.Sequential(*layers)
def forward(self, x):
"""
Forward pass of the EfficientNetB2 model.
:param x: The input tensor, shape (batch_size, 3, 224, 224)
:return: The output tensor, shape (batch_size, num_classes)
"""
x = self.relu(self.bn1(self.conv1(x)))
x = self.mbconv1(x)
x = self.mbconv2(x)
x = self.mbconv3(x)
x = self.mbconv4(x)
x = self.mbconv5(x)
x = self.relu(self.bn_final(self.conv_final(x)))
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.fc(x)
return x
# Test code
batch_size = 2
num_classes = 1000
def get_inputs():
return [torch.randn(batch_size, 3, 224, 224)]
def get_init_inputs():
return [num_classes]
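One behavior worth noting in both implementations: the SE layers sit inline (the functional version assigns x = se, and here they are ordinary members of the nn.Sequential), so the spatial dimensions collapse to 1x1 after the first MBConv block instead of rescaling the feature map. A quick shape probe against the reference Model above makes this visible.

# Shape probe (illustrative): the AdaptiveAvgPool2d inside each MBConv
# Sequential collapses the spatial dimensions, so every block after mbconv1
# sees a 1x1 feature map.
m = Model(num_classes)
m.eval()
with torch.no_grad():
    h = m.relu(m.bn1(m.conv1(torch.randn(batch_size, 3, 224, 224))))
    print(h.shape)  # torch.Size([2, 32, 112, 112])
    h = m.mbconv1(h)
    print(h.shape)  # torch.Size([2, 96, 1, 1])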
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <map>
#include <string>
#include <vector>
using namespace torch;
// Global constants for batch normalization and MBConv configurations
const float BN_MOMENTUM = 0.1f;
const float BN_EPSILON = 1e-5f;
const int MBCONV_CONFIGS[5][2] = { {1, 3}, {2, 6}, {2, 6}, {2, 6}, {1, 6} };
// Custom CUDA kernel for adaptive average pooling in the SE module
// This kernel computes the average over the spatial dimensions for each (n, c) slice.
// It uses a single thread block per (n, c) pair with shared memory reduction, avoiding global atomics.
__global__ void se_adaptive_avg_pool_kernel(const float* __restrict__ input, float* __restrict__ output, int N, int C, int H, int W) {
int n = blockIdx.x; // batch index
int c = blockIdx.y; // channel index
int HW = H * W;
extern __shared__ float sdata[];
int tid = threadIdx.x;
float sum = 0.f;
// Each thread processes multiple elements across the spatial dimension
for (int i = tid; i < HW; i += blockDim.x) {
// Compute the index for input in NCHW layout
int idx = n * (C * H * W) + c * (H * W) + i;
sum += input[idx];
}
sdata[tid] = sum;
__syncthreads();
// In-block reduction using shared memory (no global atomics are used)
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// Write the averaged result to global memory
if (tid == 0) {
output[n * C + c] = sdata[0] / static_cast<float>(HW);
}
}
// Wrapper function to launch the custom adaptive average pooling kernel
// Input tensor shape: [N, C, H, W]. Output tensor shape: [N, C, 1, 1]
Tensor se_adaptive_avg_pool(Tensor input) {
int N = input.size(0);
int C = input.size(1);
int H = input.size(2);
int W = input.size(3);
auto options = input.options();
auto output = at::empty({N, C, 1, 1}, options);
int threads = 256;
int sharedMemSize = threads * sizeof(float);
dim3 grid(N, C); // one block per (n, c) pair
se_adaptive_avg_pool_kernel<<<grid, threads, sharedMemSize, at::cuda::getCurrentCUDAStream()>>>(
input.data_ptr<float>(),
output.data_ptr<float>(),
N, C, H, W
);
return output;
}
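// Sanity-check helper (illustrative sketch, not part of the exposed API):
// the custom kernel above computes a plain spatial mean, so its output
// should match ATen's adaptive_avg_pool2d with a {1, 1} output size.
bool se_adaptive_avg_pool_matches_aten(const Tensor& input) {
    Tensor custom = se_adaptive_avg_pool(input);
    Tensor reference = adaptive_avg_pool2d(input, {1, 1});
    return custom.allclose(reference, /*rtol=*/1e-5, /*atol=*/1e-6);
}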
// Helper function to extract MBConv block-specific parameters from the overall parameter map
std::map<std::string, Tensor> extract_block_params(const std::map<std::string, Tensor>& params, int block_num) {
std::map<std::string, Tensor> block_params;
std::string prefix = "mbconv" + std::to_string(block_num) + "_";
for (const auto& kv : params) {
if (kv.first.rfind(prefix, 0) == 0) {
block_params[kv.first.substr(prefix.size())] = kv.second;
}
}
return block_params;
}
// Fused MBConv block implementation with custom SE adaptive average pooling
Tensor mbconv_block(Tensor x, const std::map<std::string, Tensor>& params, int stride, int expand_ratio, bool is_training) {
int64_t in_channels = x.size(1);
int64_t expanded_channels = in_channels * expand_ratio;
// Expansion phase (if necessary)
if (expand_ratio != 1) {
x = conv2d(x, params.at("expand_conv_weight"), Tensor(), {1}, at::IntArrayRef({0}), {1}, 1);
x = batch_norm(x, params.at("expand_bn_weight"), params.at("expand_bn_bias"),
params.at("expand_bn_mean"), params.at("expand_bn_var"),
is_training, BN_MOMENTUM, BN_EPSILON, true);
x.relu_();
}
// Depthwise convolution
x = conv2d(x, params.at("dw_conv_weight"), Tensor(), {stride}, at::IntArrayRef({1}), {1}, expanded_channels);
x = batch_norm(x, params.at("dw_bn_weight"), params.at("dw_bn_bias"),
params.at("dw_bn_mean"), params.at("dw_bn_var"),
is_training, BN_MOMENTUM, BN_EPSILON, true);
x.relu_();
// Squeeze and Excitation (SE) module using the custom adaptive average pooling kernel
auto se = se_adaptive_avg_pool(x); // Compute average pooling over spatial dims
se = conv2d(se, params.at("se_reduce_weight"), Tensor(), {1}, at::IntArrayRef({0}));
se = relu(se);
se = conv2d(se, params.at("se_expand_weight"), Tensor(), {1}, at::IntArrayRef({0}));
se = sigmoid(se);
// According to the reference, assign the SE output directly
x = se;
// Projection phase
x = conv2d(x, params.at("project_conv_weight"), Tensor(), {1}, at::IntArrayRef({0}), {1}, 1);
x = batch_norm(x, params.at("project_bn_weight"), params.at("project_bn_bias"),
params.at("project_bn_mean"), params.at("project_bn_var"),
is_training, BN_MOMENTUM, BN_EPSILON, true);
return x;
}
// Main forward function: combines the initial convolution, MBConv blocks, and final fully connected layers
Tensor forward(Tensor x, std::map<std::string, Tensor> params, bool is_training) {
// Initial convolution
x = conv2d(x, params.at("conv1_weight"), Tensor(), {2}, at::IntArrayRef({1}));
x = batch_norm(x, params.at("bn1_weight"), params.at("bn1_bias"),
params.at("bn1_mean"), params.at("bn1_var"),
is_training, BN_MOMENTUM, BN_EPSILON, true);
x.relu_();
// Pre-extract parameters for each MBConv block to avoid redundant lookups
std::vector<std::map<std::string, Tensor>> blocks_params;
blocks_params.reserve(5);
for (int i = 1; i <= 5; i++) {
blocks_params.push_back(extract_block_params(params, i));
}
// Execute MBConv blocks with predefined configurations
for (int i = 0; i < 5; i++) {
int stride = MBCONV_CONFIGS[i][0];
int expand_ratio = MBCONV_CONFIGS[i][1];
x = mbconv_block(x, blocks_params[i], stride, expand_ratio, is_training);
}
// Final layers
x = conv2d(x, params.at("conv_final_weight"), Tensor(), {1}, at::IntArrayRef({0}));
x = batch_norm(x, params.at("bn_final_weight"), params.at("bn_final_bias"),
params.at("bn_final_mean"), params.at("bn_final_var"),
is_training, BN_MOMENTUM, BN_EPSILON, true);
x.relu_();
x = adaptive_avg_pool2d(x, {1, 1});
x = x.flatten(1);
x = linear(x, params.at("fc_weight"), params.at("fc_bias"));
return x;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "EfficientNetB2 forward with fused MBConv and optimized SE adaptive pooling");
}
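A hedged sketch of how the extension above might be built and called from Python via torch.utils.cpp_extension.load; the source filename is a placeholder, and the parameter dictionary is assembled the same way as in the Python Model.forward shown earlier.

import torch
from torch.utils.cpp_extension import load

# JIT-compile the extension; "efficientnet_b2_fused.cu" is a placeholder name
# for the CUDA/C++ source listed above.
ext = load(name="efficientnet_b2_fused", sources=["efficientnet_b2_fused.cu"])

model = Model(num_classes)  # the parameter-dict Model defined earlier
model.eval()
params = {
    **dict(model.params),
    **{k: v for k, v in model._buffers.items() if v is not None},
}
params = {k: v.cuda() for k, v in params.items()}

x = torch.randn(batch_size, 3, 224, 224, device="cuda")
out = ext.forward(x, params, model.training)
print(out.shape)  # expected: torch.Size([2, 1000])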
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 1.720 | inst/cycle | 0.635 | 5 |
Executed Ipc Elapsed | 1.104 | inst/cycle | 0.300 | 5 |
Issue Slots Busy | 44.018 | % | 413.769 | 5 |
Issued Ipc Active | 1.758 | inst/cycle | 0.661 | 5 |
SM Busy | 44.018 | % | 413.769 | 5 |
Memory Throughput | 80602855847.298 | byte/second | 24828269885329447583744.000 | 5 |
Mem Busy | 29.638 | % | 178.429 | 5 |
Max Bandwidth | 28.476 | % | 110.084 | 5 |
L1/TEX Hit Rate | 0.820 | % | 0.330 | 5 |
L2 Hit Rate | 82.388 | % | 1344.182 | 5 |
Mem Pipes Busy | 28.258 | % | 211.859 | 5 |
Warp Cycles Per Issued Instruction | 32.524 | cycle | 377.671 | 5 |
Warp Cycles Per Executed Instruction | 33.146 | cycle | 382.932 | 5 |
Avg. Active Threads Per Warp | 31.146 | | 0.138 | 5 |
Avg. Not Predicated Off Threads Per Warp | 21.858 | | 13.075 | 5 |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 16.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 16.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 64.804 | % | 549.130 | 5 |
Achieved Active Warps Per SM | 41.476 | warp | 224.980 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
WRN ThreadDivergence | Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early thread completion, and divergent flow control can significantly lower the number of active threads in a warp per cycle. This kernel achieves an average of 31.0 threads being active per cycle. This is further reduced to 20.1 threads per warp due to predication. The compiler may use predication to avoid an actual branch. Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads execute the instructions. Try to avoid different execution paths within a warp when possible. In addition, ensure your kernel makes use of Independent Thread Scheduling, which allows a warp to reconverge after a data-dependent conditional block by explicitly calling __syncwarp(). |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (18.1%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
Operation / Metric | Value | Unit |
---|---|---|
aten::conv2d | ||
CPU Time | 2791625.26 | μs |
Device Time | 1067735.53 | μs |
Self CPU Time | 159657.09 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::convolution | ||
CPU Time | 2631968.17 | μs |
Device Time | 1067735.53 | μs |
Self CPU Time | 200429.62 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_convolution | ||
CPU Time | 2431538.54 | μs |
Device Time | 1067735.53 | μs |
Self CPU Time | 248383.94 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::cudnn_convolution | ||
CPU Time | 1909897.92 | μs |
Device Time | 895800.35 | μs |
Self CPU Time | 1209665.86 | μs |
Self Device Time | 895800.35 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::batch_norm | ||
CPU Time | 2472212.68 | μs |
Device Time | 807247.03 | μs |
Self CPU Time | 121323.04 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_batch_norm_impl_index | ||
CPU Time | 2350889.65 | μs |
Device Time | 807247.03 | μs |
Self CPU Time | 95909.15 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45315 warnings generated when compiling for host. Suppressed 45349 warnings (45302 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.