Kernel Details - templated_unroll_singlepass

import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    stride: int,
    padding: int,
    output_padding: int,
    bias_flag: bool,
    conv_transpose: torch.Tensor,
    conv_transpose_bias: torch.Tensor,
) -> torch.Tensor:
    """
    Run a 3D transposed convolution, then softmax over dim 1, then sigmoid.

    Args:
        x (torch.Tensor): Input tensor, documented by the caller as
            (batch_size, in_channels, D, H, W)
        stride (int): Stride passed to the transposed convolution
        padding (int): Padding passed to the transposed convolution
        output_padding (int): Output padding passed to the transposed convolution
        bias_flag (bool): When False, ``conv_transpose_bias`` is ignored and the
            convolution runs without a bias term
        conv_transpose (torch.Tensor): Weight tensor of the transposed convolution
        conv_transpose_bias (torch.Tensor): Bias tensor, used only if ``bias_flag``

    Returns:
        torch.Tensor: ``sigmoid(softmax(conv_transpose3d(x), dim=1))``
    """
    # The bias tensor is only forwarded when the caller explicitly enables it.
    if bias_flag:
        effective_bias = conv_transpose_bias
    else:
        effective_bias = None

    upsampled = F.conv_transpose3d(
        x,
        conv_transpose,
        bias=effective_bias,
        stride=stride,
        padding=padding,
        output_padding=output_padding,
    )
    # Normalise across dim 1 (channels), then squash each probability.
    channel_probs = torch.softmax(upsampled, dim=1)
    return channel_probs.sigmoid()


class Model(nn.Module):
    """
    Parameter holder for the functional ConvTranspose3d -> Softmax -> Sigmoid op.

    The module only owns the weight and (optional) bias; the computation itself
    is delegated to the callable passed to ``forward``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        output_padding,
        bias,
    ):
        super().__init__()
        # A throwaway layer is built purely to obtain default-initialised
        # parameters of the right shape.
        reference_layer = nn.ConvTranspose3d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            bias=bias,
        )
        self.conv_transpose_parameter = nn.Parameter(reference_layer.weight)

        if bias:
            layer_bias = reference_layer.bias
            # Perturb the default bias with small Gaussian noise (std 0.02).
            jitter = torch.randn(
                layer_bias.shape,
                device=layer_bias.device,
                dtype=layer_bias.dtype,
            )
            self.conv_transpose_bias = nn.Parameter(layer_bias + jitter * 0.02)
        else:
            self.conv_transpose_bias = None

    def forward(self, x, stride, padding, output_padding, bias, fn=module_fn):
        """Call ``fn`` with the input, the hyper-parameters and the stored weight/bias."""
        call_args = (
            x,
            stride,
            padding,
            output_padding,
            bias,
            self.conv_transpose_parameter,
            self.conv_transpose_bias,
        )
        return fn(*call_args)


# Benchmark problem configuration shared by get_inputs() / get_init_inputs().
batch_size = 16
in_channels = 32
out_channels = 64
D = 16
H = W = 32
kernel_size = 3
stride = 2
padding = 1
output_padding = 1
bias = True


def get_inputs():
    """Return the positional arguments for ``Model.forward``.

    The list holds a freshly sampled random input tensor followed by the
    module-level stride, padding, output_padding and bias settings.
    """
    sample = torch.randn(batch_size, in_channels, D, H, W)
    return [sample, stride, padding, output_padding, bias]


def get_init_inputs():
    """Return the positional constructor arguments for ``Model``."""
    channel_args = [in_channels, out_channels]
    conv_args = [kernel_size, stride, padding, output_padding, bias]
    return channel_args + conv_args

import torch
import torch.nn as nn

class Model(nn.Module):
    """
    ConvTranspose3d followed by a channel-wise Softmax and an element-wise Sigmoid.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, output_padding, bias=True):
        super().__init__()
        self.conv_transpose = nn.ConvTranspose3d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            bias=bias,
        )
        if bias:
            # Replace the default bias with a copy perturbed by Gaussian noise (std 0.02).
            base_bias = self.conv_transpose.bias
            noise = torch.randn(base_bias.shape, device=base_bias.device, dtype=base_bias.dtype)
            self.conv_transpose.bias = nn.Parameter(base_bias + noise * 0.02)
        else:
            self.conv_transpose.bias = None
        self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, D, H, W).

        Returns:
            torch.Tensor: Tensor of shape (batch_size, out_channels, D_out, H_out, W_out),
                where the spatial sizes are those produced by the transposed convolution.
        """
        return self.sigmoid(self.softmax(self.conv_transpose(x)))

# Benchmark problem configuration shared by get_inputs() / get_init_inputs().
batch_size = 16
in_channels = 32
out_channels = 64
D = 16
H = W = 32
kernel_size = 3
stride = 2
padding = 1
output_padding = 1

def get_inputs():
    """Return the single positional argument for ``Model.forward``: a random input tensor."""
    sample = torch.randn(batch_size, in_channels, D, H, W)
    return [sample]

def get_init_inputs():
    """Return the positional constructor arguments for ``Model`` (bias keeps its default)."""
    channel_args = [in_channels, out_channels]
    conv_args = [kernel_size, stride, padding, output_padding]
    return channel_args + conv_args

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	49_ConvTranspose3d_Softmax_Sigmoid
Level ID	2
Task ID	49
Kernel Name	templated_unroll_singlepass_base
CUDA Speedup (Native)	1.825x
CUDA Speedup (Compile)	0.950x
CUDA Runtime	1.574 ms
PyTorch Runtime (Native)	2.873 ms
PyTorch Runtime (Compile)	1.496 ms
Correct	True
Max Diff (vs. Reference)	0.006000
Model	deepseek-reasoner
Temperature	1.00

View Experiment Progress Details

Related Kernels (Level 2, Task 49 • 49_ConvTranspose3d_Softmax_Sigmoid)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	adaptive_block_softmax_sigmoid_base_base	1.57	1.83	0.95
🥈	atomic_optimized_softmax_sigmoid_base_base	1.57	1.83	0.95
🥉	unrolled_loop_optimized_base	1.57	1.83	0.95
4	templated_unroll_singlepass_base	1.57	1.83	0.95
5	warp_divergence_reduction_base_base	1.58	1.82	0.95
6	unrolled_fused_softmax_sigmoid_base_base	1.58	1.82	0.95
7	constant_mem_optimized_base_base	1.59	1.81	0.94
8	streamed_unrolled_softmax_sigmoid_base_base	1.62	1.78	0.92
9	fused_softmax_sigmoid_base	1.74	1.65	0.86
10	vectorized_work_distribution_base_base	1.74	1.65	0.86
11	constant_memory_optimization_base	1.76	1.63	0.85
12	constant_data_fused_softmax_sigmoid_base	1.77	1.63	0.85
13	stride_loop_fused_softmax_sigmoid_base	1.77	1.62	0.84
14	pipelined_fused_kernel_base	1.82	1.58	0.82
15	balanced_workload_49_convtranspose3d_edit_1	1.88	1.53	0.80
15	balanced_workload_49_convtranspose3d_base	1.88	1.53	0.80
17	49_ConvTranspose3d_Softmax_Sigmoid_optimized_block_edit_1	1.88	1.53	0.80
17	blocksize_tuning_49_convtranspose3d_base	1.88	1.53	0.80
19	49_ConvTranspose3d_Softmax_Sigmoid_unroll_loops_base	1.88	1.53	0.80
19	49_ConvTranspose3d_Softmax_Sigmoid_warp_optimized_base	1.88	1.53	0.80

#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <math.h>

// Threads per block used for every kernel launch in this file.
#define BLOCK_SIZE 256
// Number of channels loaded per chunk by fused_softmax_sigmoid_kernel.
#define UNROLL_FACTOR 4

// Coordinates of one spatial location: the batch index plus its
// (d, h, w) position inside that batch element.
struct PixelInfo {
    int batch_idx;
    int d;
    int h;
    int w;
};

// Split a flat index over (batch, depth, height, width) into its four
// coordinates, with width varying fastest and batch slowest.
__device__ PixelInfo compute_pixel_info(int idx, int depth, int height, int width) {
    const int plane = height * width;
    const int volume = depth * plane;

    const int within_volume = idx % volume;
    const int within_plane = within_volume % plane;

    PixelInfo result;
    result.batch_idx = idx / volume;
    result.d = within_volume / plane;
    result.h = within_plane / width;
    result.w = within_plane % width;
    return result;
}

// Fused channel-wise softmax followed by sigmoid for a compile-time channel
// count. Each thread handles one (batch, d, h, w) location and walks over its
// CHANNELS values, which are addressed as base + c * (depth * height * width),
// i.e. the indexing assumes a densely packed (batch, CHANNELS, d, h, w) buffer.
//
// The normaliser is computed with an online softmax: channels are consumed in
// chunks of UNROLL_FACTOR and, whenever a chunk raises the running maximum,
// the partial sum accumulated so far is rescaled to the new maximum. Without
// that rescale the earlier terms stay relative to a smaller maximum, the
// denominator is overestimated and the softmax no longer sums to one.
//
// Parameters:
//   input  - source values (read twice: once for max/sum, once for the output)
//   output - destination buffer, written at the same offsets as input
//   batch, depth, height, width - extents used to bound and decode the index
template <typename scalar_t, int CHANNELS>
__global__ void fused_softmax_sigmoid_kernel(
    const scalar_t* __restrict__ input,
    scalar_t* __restrict__ output,
    int batch, int depth, int height, int width) {

    // The chunked loops below read vals[0..UNROLL_FACTOR) unconditionally.
    static_assert(CHANNELS % UNROLL_FACTOR == 0,
                  "CHANNELS must be a multiple of UNROLL_FACTOR");

    const int spatial = depth * height * width;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= batch * spatial) return;

    const PixelInfo pi = compute_pixel_info(idx, depth, height, width);
    const int base = (pi.batch_idx * CHANNELS * spatial) + (pi.d * height * width + pi.h * width + pi.w);
    const int stride = spatial;

    scalar_t max_val = -INFINITY;
    scalar_t sum_exp = 0.0f;

    #pragma unroll
    for (int c = 0; c < CHANNELS; c += UNROLL_FACTOR) {
        scalar_t vals[UNROLL_FACTOR];
        scalar_t chunk_max = -INFINITY;
        #pragma unroll
        for (int u = 0; u < UNROLL_FACTOR; ++u) {
            vals[u] = input[base + (c + u) * stride];
            chunk_max = max(chunk_max, vals[u]);
        }

        if (chunk_max > max_val) {
            // Re-express the terms accumulated so far relative to the new
            // maximum. On the first finite chunk max_val is -inf, so the
            // factor is exp(-inf) == 0 and sum_exp stays 0.
            sum_exp *= exp(max_val - chunk_max);
            max_val = chunk_max;
        }

        // While every value seen so far is -inf there is nothing to add, and
        // evaluating exp(-inf - (-inf)) would poison the sum with NaN.
        if (max_val > -INFINITY) {
            #pragma unroll
            for (int u = 0; u < UNROLL_FACTOR; ++u) {
                sum_exp += exp(vals[u] - max_val);
            }
        }
    }

    #pragma unroll
    for (int c = 0; c < CHANNELS; c += UNROLL_FACTOR) {
        #pragma unroll
        for (int u = 0; u < UNROLL_FACTOR; ++u) {
            const int pos = base + (c + u) * stride;
            scalar_t softmax = exp(input[pos] - max_val) / sum_exp;
            output[pos] = 1.0f / (1.0f + exp(-softmax));
        }
    }
}

// Fused channel-wise softmax followed by sigmoid for a runtime channel count.
// Each thread handles one (batch, d, h, w) location and makes three passes
// over its channel values: maximum, sum of shifted exponentials, and the
// final sigmoid(softmax) write. Channel c of a location lives at
// base + c * (depth * height * width).
template <typename scalar_t>
__global__ void dynamic_fused_softmax_sigmoid_kernel(
    const scalar_t* __restrict__ input,
    scalar_t* __restrict__ output,
    int channels, int batch, int depth, int height, int width) {

    const int spatial = depth * height * width;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= batch * spatial) {
        return;  // thread falls past the last location
    }

    const PixelInfo loc = compute_pixel_info(idx, depth, height, width);
    const int in_volume = loc.d * height * width + loc.h * width + loc.w;
    const int base = (loc.batch_idx * channels * spatial) + in_volume;
    const int stride = spatial;

    // Pass 1: channel maximum, used to keep the exponentials bounded.
    scalar_t max_val = -INFINITY;
    for (int c = 0; c < channels; ++c) {
        const scalar_t v = input[base + c * stride];
        max_val = max(max_val, v);
    }

    // Pass 2: softmax denominator.
    scalar_t sum_exp = 0.0f;
    #pragma unroll 4
    for (int c = 0; c < channels; ++c) {
        const scalar_t shifted = input[base + c * stride] - max_val;
        sum_exp += exp(shifted);
    }

    // Pass 3: softmax followed by sigmoid, written in place of each channel.
    #pragma unroll 4
    for (int c = 0; c < channels; ++c) {
        const int pos = base + c * stride;
        const scalar_t softmax = exp(input[pos] - max_val) / sum_exp;
        output[pos] = 1.0f / (1.0f + exp(-softmax));
    }
}

// Host entry point: runs the transposed 3D convolution through ATen, then
// applies the fused softmax(dim=1) + sigmoid kernel to its result.
//
// Parameters:
//   input               - CUDA input tensor for the transposed convolution
//   stride, padding, output_padding - forwarded to torch::conv_transpose3d
//   bias_flag           - when false, conv_transpose_bias is ignored
//   conv_transpose      - transposed-convolution weight
//   conv_transpose_bias - transposed-convolution bias (used only if bias_flag)
// Returns a new tensor with the same shape as the convolution output.
// Raises (via TORCH_CHECK) on non-CUDA input, non-5D convolution output,
// outputs too large for the kernels' 32-bit indexing, or a failed launch.
torch::Tensor forward(torch::Tensor input, int stride, int padding, int output_padding, bool bias_flag, torch::Tensor conv_transpose, torch::Tensor conv_transpose_bias) {
    TORCH_CHECK(input.is_cuda(), "forward: input must be a CUDA tensor");

    // Launch on the input's device even if it is not the current one.
    const c10::cuda::CUDAGuard device_guard(input.device());

    auto x = torch::conv_transpose3d(input, conv_transpose, bias_flag ? conv_transpose_bias : torch::Tensor(), stride, padding, output_padding);
    TORCH_CHECK(x.dim() == 5, "forward: expected a 5D convolution output, got ", x.dim(), "D");

    // The kernels index the buffer as densely packed (N, C, D, H, W).
    x = x.contiguous();

    // The kernels use 32-bit indices; leave BLOCK_SIZE of headroom so the
    // flat thread index of the last block cannot wrap either.
    TORCH_CHECK(x.numel() < (int64_t{1} << 31) - BLOCK_SIZE,
                "forward: output has too many elements for 32-bit indexing");

    const int batch = static_cast<int>(x.size(0));
    const int channels = static_cast<int>(x.size(1));
    const int depth = static_cast<int>(x.size(2));
    const int height = static_cast<int>(x.size(3));
    const int width = static_cast<int>(x.size(4));

    auto output = torch::empty_like(x);
    if (x.numel() == 0) {
        // A zero-block launch is an invalid configuration; nothing to compute.
        return output;
    }

    const int total_pixels = batch * depth * height * width;
    const int blocks = static_cast<int>((static_cast<int64_t>(total_pixels) + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Queue the kernel behind the convolution on PyTorch's current stream
    // instead of the legacy default stream.
    const cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();

    AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "fused_forward", ([&] {
        if (channels == 32) {
            fused_softmax_sigmoid_kernel<scalar_t, 32><<<blocks, BLOCK_SIZE, 0, cuda_stream>>>(
                x.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), batch, depth, height, width);
        } else if (channels == 64) {
            fused_softmax_sigmoid_kernel<scalar_t, 64><<<blocks, BLOCK_SIZE, 0, cuda_stream>>>(
                x.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), batch, depth, height, width);
        } else {
            dynamic_fused_softmax_sigmoid_kernel<scalar_t><<<blocks, BLOCK_SIZE, 0, cuda_stream>>>(
                x.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), channels, batch, depth, height, width);
        }
    }));
    // Surface launch-configuration errors instead of silently returning
    // an uninitialised output tensor.
    C10_CUDA_KERNEL_LAUNCH_CHECK();

    return output;
}

// Python binding: exposes the host function above as `forward` on the
// extension module.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Fused ConvTranspose3D with Optimized Softmax-Sigmoid");
}

Performance Metrics

Metric	Value	Unit	Variance	Samples

Analysis Rules

Rule	Description

Operation / Metric	Value	Unit
aten::conv_transpose3d
CPU Time	1843660.08	μs
Device Time	5011986.07	μs
Self CPU Time	6982.62	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::convolution
CPU Time	1836677.45	μs
Device Time	5011986.07	μs
Self CPU Time	9751.25	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_convolution
CPU Time	1826926.20	μs
Device Time	5011986.07	μs
Self CPU Time	19710.92	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::cudnn_convolution_transpose
CPU Time	539687.80	μs
Device Time	3085396.51	μs
Self CPU Time	147615.52	μs
Self Device Time	3085396.51	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaEventRecord
CPU Time	1827385.61	μs
Device Time	160705.62	μs
Self CPU Time	1827385.61	μs
Self Device Time	160705.62	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaLaunchKernel
CPU Time	4518633.94	μs
Device Time	83320.26	μs
Self CPU Time	4518633.94	μs
Self Device Time	83320.26	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::add_
CPU Time	1261728.90	μs
Device Time	1926589.55	μs
Self CPU Time	17507.83	μs
Self Device Time	1926589.55	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

Status: Completed

45295 warnings generated when compiling for host.
Suppressed 45327 warnings (45280 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.

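/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:30:5: warning: 2 adjacent parameters of 'fused_softmax_sigmoid_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]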

30 | int batch, int depth, int height, int width) {

| ^~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:30:9: note: the first parameter in the range is 'batch'

30 | int batch, int depth, int height, int width) {

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:30:20: note: the last parameter in the range is 'depth'

30 | int batch, int depth, int height, int width) {

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:33:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

33 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:72:5: warning: 3 adjacent parameters of 'dynamic_fused_softmax_sigmoid_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]

72 | int channels, int batch, int depth, int height, int width) {

| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:72:9: note: the first parameter in the range is 'channels'

72 | int channels, int batch, int depth, int height, int width) {

| ^~~~~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:72:34: note: the last parameter in the range is 'depth'

72 | int channels, int batch, int depth, int height, int width) {

| ^~~~~

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:75:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

75 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:101:37: warning: the parameter 'input' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

101 | torch::Tensor forward(torch::Tensor input, int stride, int padding, int output_padding, bool bias_flag, torch::Tensor conv_transpose, torch::Tensor conv_transpose_bias) {

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:101:119: warning: the parameter 'conv_transpose' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]

101 | torch::Tensor forward(torch::Tensor input, int stride, int padding, int output_padding, bool bias_flag, torch::Tensor conv_transpose, torch::Tensor conv_transpose_bias) {

| ^

| const &

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:102:73: warning: parameter 'conv_transpose_bias' is passed by value and only copied once; consider moving it to avoid unnecessary copies [performance-unnecessary-value-param]

102 | auto x = torch::conv_transpose3d(input, conv_transpose, bias_flag ? conv_transpose_bias : torch::Tensor(), stride, padding, output_padding);

| ^

| std::move( )

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:104:23: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

104 | const int batch = x.size(0);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:105:26: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

105 | const int channels = x.size(1);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:106:23: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

106 | const int depth = x.size(2);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:107:24: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

107 | const int height = x.size(3);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:108:23: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]

108 | const int width = x.size(4);

| ^

/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_2/task_49/b8_s3_templated_unroll_singlepass/base/base.cu:114:5: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name]

114 | AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "fused_forward", ([&] {

| ^

/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:237:34: note: expanded from macro 'AT_DISPATCH_FLOATING_TYPES'

237 | AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

| ^

/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:233:3: note: expanded from macro 'AT_DISPATCH_CASE_FLOATING_TYPES'

233 | AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \

| ^

/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:74:3: note: expanded from macro 'AT_DISPATCH_CASE'

74 | AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)

| ^

note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)

/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:58:7: note: expanded from macro 'AT_PRIVATE_CHECK_SELECTIVE_BUILD'

58 | AT_ERROR( \

| ^

/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:711:32: note: expanded from macro 'AT_ERROR'

711 | C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \

| ^

/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:536:9: note: expanded from macro 'TORCH_CHECK'

536 | __func__, \

| ^

The AI CUDA Engineer 👷

`49_ConvTranspose3d_Softmax_Sigmoid` • `templated_unroll_singlepass_base`

Kernel Information

Related Kernels (Level 2, Task 49 • 49_ConvTranspose3d_Softmax_Sigmoid)

The AI CUDA Engineer 👷

49_ConvTranspose3d_Softmax_Sigmoid • templated_unroll_singlepass_base

Kernel Information

Related Kernels (Level 2, Task 49 • 49_ConvTranspose3d_Softmax_Sigmoid)

`49_ConvTranspose3d_Softmax_Sigmoid` • `templated_unroll_singlepass_base`