← Back to Leaderboard

The AI CUDA Engineer 👷

65_Conv2d_AvgPool_Sigmoid_Sumopt_conv_pool_sigmoid_sum_base

Level 2 • Task 65
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    conv_weight: torch.Tensor,
    conv_bias: torch.Tensor,
) -> torch.Tensor:
    """
    Performs convolution, average pooling, applies sigmoid, and sums the result.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width)
        conv_weight (torch.Tensor): Convolution weights of shape (out_channels, in_channels, kernel_size, kernel_size)
        conv_bias (torch.Tensor): Convolution bias of shape (out_channels)

    Returns:
        torch.Tensor: Output tensor of shape (batch_size,) containing summed values
    """
    x = F.conv2d(x, conv_weight, bias=conv_bias)
    x = F.avg_pool2d(x, pool_kernel_size)
    x = torch.sigmoid(x)
    x = torch.sum(x, dim=[1, 2, 3])
    return x


class Model(nn.Module):
    """
    This model performs a convolution, average pooling, applies sigmoid, and sums the result.
    """

    def __init__(self, in_channels, out_channels, kernel_size, pool_kernel_size):
        super(Model, self).__init__()
        conv = nn.Conv2d(in_channels, out_channels, kernel_size)
        self.conv_weight = nn.Parameter(conv.weight)
        self.conv_bias = nn.Parameter(conv.bias)

    def forward(self, x, fn=module_fn):
        return fn(x, self.conv_weight, self.conv_bias)


batch_size = 128
in_channels = 3
out_channels = 16
height, width = 32, 32
kernel_size = 3
pool_kernel_size = 2


def get_inputs():
    return [torch.randn(batch_size, in_channels, height, width)]


def get_init_inputs():
    return [in_channels, out_channels, kernel_size, pool_kernel_size]
import torch
import torch.nn as nn

class Model(nn.Module):
    """
    This model performs a convolution, average pooling, applies sigmoid, and sums the result.
    """
    def __init__(self, in_channels, out_channels, kernel_size, pool_kernel_size):
        super(Model, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
        self.avg_pool = nn.AvgPool2d(pool_kernel_size)

    def forward(self, x):
        x = self.conv(x)
        x = self.avg_pool(x)
        x = torch.sigmoid(x)
        x = torch.sum(x, dim=[1,2,3]) # Sum over all spatial dimensions
        return x

batch_size = 128
in_channels = 3
out_channels = 16
height, width = 32, 32
kernel_size = 3
pool_kernel_size = 2

def get_inputs():
    return [torch.randn(batch_size, in_channels, height, width)]

def get_init_inputs():
    return [in_channels, out_channels, kernel_size, pool_kernel_size]

Kernel Information

Related Kernels (Level 2, Task 65 • 65_Conv2d_AvgPool_Sigmoid_Sum)

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cmath>

// Define constants
#define BLOCK_SIZE 256
#define POOL_SIZE 2
#define BLOCKS_PER_BATCH 4

// Optimized CUDA kernel: performs convolution, average pooling, and sigmoid activation,
// then reduces the results via shared memory and atomic addition. This version combines
// the best practices from previous kernels by using __restrict__ pointers for memory access,
// #pragma unroll for inner loops to minimize divergence, and workload splitting across blocks.
__global__ void optimized_conv_pool_sigmoid_sum_kernel(
    const float* __restrict__ input,
    const float* __restrict__ weight,
    const float* __restrict__ bias,
    float* output,
    const int batch_size,
    const int in_channels,
    const int out_channels,
    const int height,
    const int width,
    const int kernel_size
) {
    // Determine the batch index and the block's offset for work splitting
    int batch_idx = blockIdx.x / BLOCKS_PER_BATCH;
    int block_offset = blockIdx.x % BLOCKS_PER_BATCH;
    if (batch_idx >= batch_size) return;

    // Compute output dimensions after convolution
    int out_height = height - kernel_size + 1;
    int out_width  = width - kernel_size + 1;

    // Compute pooling output dimensions
    int pool_out_height = out_height / POOL_SIZE;
    int pool_out_width  = out_width / POOL_SIZE;

    // Each work item corresponds to one pooling cell in an output channel
    int total_work = out_channels * pool_out_height * pool_out_width;

    // Workload division among blocks assigned to a batch
    int chunk = (total_work + BLOCKS_PER_BATCH - 1) / BLOCKS_PER_BATCH;
    int start = block_offset * chunk;
    int end = start + chunk;
    if (end > total_work) end = total_work;

    float partial_sum = 0.0f;

    // Each thread processes multiple elements in a grid-stride loop
    for (int idx = start + threadIdx.x; idx < end; idx += blockDim.x) {
        int cells_per_channel = pool_out_height * pool_out_width;
        int oc = idx / cells_per_channel;
        int cell = idx % cells_per_channel;
        int pool_h = cell / pool_out_width;
        int pool_w = cell % pool_out_width;

        // Initialize convolution result with bias
        float conv_result = bias[oc];

        // Convolution over all input channels and kernel window
        for (int ic = 0; ic < in_channels; ic++) {
            #pragma unroll
            for (int kh = 0; kh < kernel_size; kh++) {
                #pragma unroll
                for (int kw = 0; kw < kernel_size; kw++) {
                    int h_in = pool_h * POOL_SIZE + kh;
                    int w_in = pool_w * POOL_SIZE + kw;
                    int input_idx = ((batch_idx * in_channels + ic) * height + h_in) * width + w_in;
                    int weight_idx = (((oc * in_channels + ic) * kernel_size) + kh) * kernel_size + kw;
                    conv_result += input[input_idx] * weight[weight_idx];
                }
            }
        }

        // Average pooling (divide by the pooling window size)
        conv_result /= (POOL_SIZE * POOL_SIZE);

        // Sigmoid activation
        conv_result = 1.0f / (1.0f + expf(-conv_result));

        partial_sum += conv_result;
    }

    // Shared memory reduction within the block
    extern __shared__ float sdata[];
    sdata[threadIdx.x] = partial_sum;
    __syncthreads();

    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride) {
            sdata[threadIdx.x] += sdata[threadIdx.x + stride];
        }
        __syncthreads();
    }

    // Use atomicAdd to update the final result for the batch
    if (threadIdx.x == 0) {
        atomicAdd(&output[batch_idx], sdata[0]);
    }
}

// Host function to launch the kernel
torch::Tensor forward(
    torch::Tensor input,
    torch::Tensor weight,
    torch::Tensor bias
) {
    const int batch_size = input.size(0);
    const int in_channels = input.size(1);
    const int height = input.size(2);
    const int width = input.size(3);
    const int out_channels = weight.size(0);
    const int kernel_size = weight.size(2);

    auto output = torch::zeros({batch_size}, input.options());

    int threads = BLOCK_SIZE;
    int blocks = batch_size * BLOCKS_PER_BATCH;
    int shared_mem_size = BLOCK_SIZE * sizeof(float);

    optimized_conv_pool_sigmoid_sum_kernel<<<blocks, threads, shared_mem_size>>>(
        input.data_ptr<float>(),
        weight.data_ptr<float>(),
        bias.data_ptr<float>(),
        output.data_ptr<float>(),
        batch_size,
        in_channels,
        out_channels,
        height,
        width,
        kernel_size
    );

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Optimized convolution + avgpool + sigmoid + sum kernel");
}
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 2.304 inst/cycle 0.000 5
Executed Ipc Elapsed 1.926 inst/cycle 0.000 5
Issue Slots Busy 57.692 % 0.059 5
Issued Ipc Active 2.308 inst/cycle 0.000 5
SM Busy 57.692 % 0.059 5
Memory Throughput 97325109942.310 byte/second 782257882745383808.000 5
Mem Busy 52.742 % 0.176 5
Max Bandwidth 28.446 % 0.051 5
L1/TEX Hit Rate 95.230 % 0.000 5
L2 Hit Rate 55.118 % 2.319 5
Mem Pipes Busy 28.446 % 0.051 5
Warp Cycles Per Issued Instruction 12.410 cycle 0.001 5
Warp Cycles Per Executed Instruction 12.426 cycle 0.001 5
Avg. Active Threads Per Warp 31.100 0.000 5
Avg. Not Predicated Off Threads Per Warp 27.830 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 6.000 block 0.000 5
Block Limit Shared Mem 16.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 48.000 warp 0.000 5
Theoretical Occupancy 75.000 % 0.000 5
Achieved Occupancy 44.772 % 0.002 5
Achieved Active Warps Per SM 28.654 warp 0.001 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (31.8%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy This kernel's theoretical occupancy (75.0%) is limited by the number of required registers. The difference between calculated theoretical (75.0%) and measured achieved occupancy (44.7%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
Operation / Metric Value Unit
aten::zeros
CPU Time 5390974.53 μs
Device Time 120928.85 μs
Self CPU Time 148754.37 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 5681080.84 μs
Device Time 7025104.91 μs
Self CPU Time 293179.19 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 5387903.71 μs
Device Time 7025104.91 μs
Self CPU Time 372003.12 μs
Self Device Time 7025103.67 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 5339736.78 μs
Device Time 306064.69 μs
Self CPU Time 5339736.78 μs
Self Device Time 306064.69 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
optimized_conv_pool_sigmoid_sum_kernel(float const*, float const*, float const*, float*, int, int, int, int, int, int)
CPU Time 0.00 μs
Device Time 1131277.18 μs
Self CPU Time 0.00 μs
Self Device Time 1131277.18 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaEventRecord
CPU Time 821777.39 μs
Device Time 303612.53 μs
Self CPU Time 821777.39 μs
Self Device Time 303612.53 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 6904725.14 μs
Self CPU Time 0.00 μs
Self Device Time 6904725.14 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45291 warnings generated when compiling for host.
Suppressed 45323 warnings (45276 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:17:5 bugprone-easily-swappable-parameters
17 | const float* __restrict__ weight,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
18 | const float* __restrict__ bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:17:31: note: the first parameter in the range is 'weight'
17 | const float* __restrict__ weight,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:18:31: note: the last parameter in the range is 'bias'
18 | const float* __restrict__ bias,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:20:5: warning: 3 adjacent parameters of 'optimized_conv_pool_sigmoid_sum_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
20 | const int batch_size,
| ^~~~~~~~~~~~~~~~~~~~~
21 | const int in_channels,
| ~~~~~~~~~~~~~~~~~~~~~~
22 | const int out_channels,
| ~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:20:15: note: the first parameter in the range is 'batch_size'
20 | const int batch_size,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:22:15: note: the last parameter in the range is 'out_channels'
22 | const int out_channels,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:28:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
28 | int batch_idx = blockIdx.x / BLOCKS_PER_BATCH;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:29:24: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
29 | int block_offset = blockIdx.x % BLOCKS_PER_BATCH;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:52:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
52 | for (int idx = start + threadIdx.x; idx < end; idx += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:52:59: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
52 | for (int idx = start + threadIdx.x; idx < end; idx += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:106:19: warning: the parameter 'input' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
106 | torch::Tensor input,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:107:19: warning: the parameter 'weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
107 | torch::Tensor weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:108:19: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
108 | torch::Tensor bias
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:110:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
110 | const int batch_size = input.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:111:29: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
111 | const int in_channels = input.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:112:24: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
112 | const int height = input.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:113:23: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
113 | const int width = input.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:114:30: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
114 | const int out_channels = weight.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_65/b4_s3_opt_conv_pool_sigmoid_sum/base/base.cu:115:29: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
115 | const int kernel_size = weight.size(2);
| ^