import torch
import torch.nn as nn
import torch.nn.functional as F
def module_fn(x: torch.Tensor) -> torch.Tensor:
"""
Applies Sigmoid activation to the input tensor.
Args:
x (torch.Tensor): Input tensor of any shape.
Returns:
torch.Tensor: Output tensor with Sigmoid applied, same shape as input.
"""
return torch.sigmoid(x)
class Model(nn.Module):
"""
Simple model that performs a Sigmoid activation.
"""
def __init__(self):
super(Model, self).__init__()
def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
return fn(x)
batch_size = 16
dim = 16384
def get_inputs():
x = torch.randn(batch_size, dim)
return [x]
def get_init_inputs():
return [] # No special initialization inputs needed
import torch
import torch.nn as nn
class Model(nn.Module):
"""
Simple model that performs a Sigmoid activation.
"""
def __init__(self):
super(Model, self).__init__()
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Applies Sigmoid activation to the input tensor.
Args:
x (torch.Tensor): Input tensor of any shape.
Returns:
torch.Tensor: Output tensor with Sigmoid applied, same shape as input.
"""
return torch.sigmoid(x)
batch_size = 16
dim = 16384
def get_inputs():
x = torch.randn(batch_size, dim)
return [x]
def get_init_inputs():
return [] # No special initialization inputs needed
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <type_traits>
// Define an inline device function for exponentiation, specialized for float and double.
template <typename T>
__device__ inline T myExp(T x);
template <>
__device__ inline float myExp<float>(float x) {
return expf(x);
}
template <>
__device__ inline double myExp<double>(double x) {
return exp(x);
}
// Union to facilitate vectorized load and store operations
// VecT: vector type (e.g., float4 or double2), VecSize: number of scalar elements in VecT
template <typename scalar_t, typename VecT, int VecSize>
union VecUnion {
VecT vec;
scalar_t arr[VecSize];
};
// Vectorized kernel processing multiple elements per thread using 128-bit loads/stores
// It uses __ldg() to optimize read-only global memory accesses.
template <typename scalar_t, typename VecT, int VecSize>
__global__ void sigmoid_vectorized_kernel(const scalar_t* __restrict__ input,
scalar_t* __restrict__ output,
int64_t vec_count) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int stride = blockDim.x * gridDim.x;
for (int idx = tid; idx < vec_count; idx += stride) {
VecUnion<scalar_t, VecT, VecSize> in_union;
VecUnion<scalar_t, VecT, VecSize> out_union;
// Load using __ldg for read-only cache-optimized access
in_union.vec = __ldg(reinterpret_cast<const VecT*>(input) + idx);
#pragma unroll
for (int i = 0; i < VecSize; i++) {
// Fused computation with fewer intermediates
out_union.arr[i] = scalar_t(1) / (scalar_t(1) + myExp(-in_union.arr[i]));
}
// Vectorized store
reinterpret_cast<VecT*>(output)[idx] = out_union.vec;
}
}
// Scalar kernel for processing tail elements that don't fit in a full vectorized load/store
template <typename scalar_t>
__global__ void sigmoid_scalar_kernel(const scalar_t* __restrict__ input,
scalar_t* __restrict__ output,
int64_t start,
int64_t size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x + start;
if (idx < size) {
scalar_t val = __ldg(&input[idx]);
scalar_t exp_val = myExp(-val);
output[idx] = static_cast<scalar_t>(1) / (static_cast<scalar_t>(1) + exp_val);
}
}
// The forward function prepares the output tensor and launches the appropriate kernels
// It handles vectorized processing for 128-bit aligned data and falls back to a scalar kernel for tail elements.
torch::Tensor forward(torch::Tensor input) {
auto output = torch::empty_like(input);
const int64_t size = input.numel();
const int threads = 256;
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "sigmoid_vectorized_combined", ([&] {
const auto* input_data = input.data_ptr<scalar_t>();
auto* output_data = output.data_ptr<scalar_t>();
// Determine the vectorization factor and vector type based on the scalar type
int vecSize = 1;
int64_t vec_elements = 0;
int blocks = 0;
if (std::is_same<scalar_t, float>::value) {
vecSize = 4; // 128-bit: 4 x float
vec_elements = size / vecSize; // number of full vectorized groups
blocks = (vec_elements + threads - 1) / threads;
if (vec_elements > 0) {
sigmoid_vectorized_kernel<scalar_t, float4, 4><<<blocks, threads>>>(input_data, output_data, vec_elements);
}
} else if (std::is_same<scalar_t, double>::value) {
vecSize = 2; // 128-bit: 2 x double
vec_elements = size / vecSize;
blocks = (vec_elements + threads - 1) / threads;
if (vec_elements > 0) {
sigmoid_vectorized_kernel<scalar_t, double2, 2><<<blocks, threads>>>(input_data, output_data, vec_elements);
}
}
// Process any remaining tail elements not covered by vectorized loads/stores
int64_t vec_aligned_size = vec_elements * vecSize;
int64_t tail = size - vec_aligned_size;
if (tail > 0) {
int tail_blocks = (tail + threads - 1) / threads;
sigmoid_scalar_kernel<scalar_t><<<tail_blocks, threads>>>(input_data, output_data, vec_aligned_size, size);
}
}));
return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "Optimized Sigmoid forward (CUDA) with vectorized and scalar loads");
}
Metric | Value | Unit | Variance | Samples |
---|
Rule | Description |
---|
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 240150.78 | μs |
Device Time | 40.19 | μs |
Self CPU Time | 39.31 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_to_copy | ||
CPU Time | 240111.47 | μs |
Device Time | 40.19 | μs |
Self CPU Time | 99.17 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::empty_strided | ||
CPU Time | 259825.16 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 20174.24 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaDeviceGetStreamPriorityRange | ||
CPU Time | 239438.97 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 239438.97 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 512839.66 | μs |
Device Time | 22953.25 | μs |
Self CPU Time | 512839.66 | μs |
Self Device Time | 22953.25 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void sigmoid_vectorized_kernel<float, float4, 4>(float const*, float*, long) | ||
CPU Time | 0.00 | μs |
Device Time | 31617.94 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 31617.94 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaEventRecord | ||
CPU Time | 19941.94 | μs |
Device Time | 44272.72 | μs |
Self CPU Time | 19941.94 | μs |
Self Device Time | 44272.72 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::zero_ | ||
CPU Time | 65421.73 | μs |
Device Time | 655465.59 | μs |
Self CPU Time | 14576.07 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::fill_ | ||
CPU Time | 50849.81 | μs |
Device Time | 655465.59 | μs |
Self CPU Time | 16522.34 | μs |
Self Device Time | 655465.59 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>) | ||
CPU Time | 0.00 | μs |
Device Time | 655465.59 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 655465.59 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45281 warnings generated when compiling for host. Suppressed 45321 warnings (45274 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.