
The AI CUDA Engineer 👷

33_VanillaRNN • fused_rnn_i2h_warp_base

Level 3 • Task 33
import torch
import torch.nn as nn
import torch.nn.functional as F


def module_fn(
    x: torch.Tensor,
    i2h_weight: torch.Tensor,
    i2h_bias: torch.Tensor,
    h2o_weight: torch.Tensor,
    h2o_bias: torch.Tensor,
    hidden: torch.Tensor,
) -> torch.Tensor:
    """
    Vanilla RNN forward pass

    Args:
        x: Input tensor of shape (batch_size, input_size)
        i2h_weight: Weight tensor for input-to-hidden layer
        i2h_bias: Bias tensor for input-to-hidden layer
        h2o_weight: Weight tensor for hidden-to-output layer
        h2o_bias: Bias tensor for hidden-to-output layer
        hidden: Hidden state tensor

    Returns:
        Output tensor of shape (batch_size, output_size)
    """
    hidden = hidden.to(x.device)
    combined = torch.cat((x, hidden), dim=1)
    hidden = torch.tanh(F.linear(combined, i2h_weight, i2h_bias))
    output = F.linear(hidden, h2o_weight, h2o_bias)
    return output


class Model(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        """
        Initialize the Vanilla RNN model.

        :param input_size: The number of input features (int).
        :param hidden_size: The size of the hidden state (int).
        :param output_size: The number of output features (int).
        """
        super(Model, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.hidden = nn.Parameter(torch.randn((batch_size, hidden_size)))

        # Extract parameters from linear layers
        i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2h_weight = nn.Parameter(i2h.weight.data.clone())
        self.i2h_bias = nn.Parameter(i2h.bias.data.clone())

        h2o = nn.Linear(hidden_size, output_size)
        self.h2o_weight = nn.Parameter(h2o.weight.data.clone())
        self.h2o_bias = nn.Parameter(h2o.bias.data.clone())

    def forward(self, x: torch.Tensor, fn=module_fn) -> torch.Tensor:
        return fn(
            x,
            self.i2h_weight,
            self.i2h_bias,
            self.h2o_weight,
            self.h2o_bias,
            self.hidden,
        )


batch_size = 8
input_size = 1024
hidden_size = 256
output_size = 128
sequence_length = 256


def get_inputs():
    return [torch.randn(batch_size, input_size)]


def get_init_inputs():
    return [input_size, hidden_size, output_size]
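
Below is a minimal sketch of how this functional variant can be exercised end to end; the call pattern is an illustration built from the helpers above, not part of the benchmark harness itself.

# Illustrative usage of the functional variant defined above (not part of the benchmark harness).
model = Model(*get_init_inputs())   # input_size=1024, hidden_size=256, output_size=128
(x,) = get_inputs()                 # x has shape (batch_size, input_size) = (8, 1024)
output = model(x)                   # delegates to module_fn via the default fn argument
print(output.shape)                 # torch.Size([8, 128])

An nn.Module-based implementation of the same Vanilla RNN cell follows.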
import torch
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        """
        Initialize the Vanilla RNN model.
        
        :param input_size: The number of input features (int).
        :param hidden_size: The size of the hidden state (int).
        :param output_size: The number of output features (int).
        """
        super(Model, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.hidden = torch.randn((batch_size, hidden_size))
        
        # Define the RNN cell components (input to hidden, hidden to hidden, and hidden to output)
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)  # Input to hidden
        self.h2o = nn.Linear(hidden_size, output_size)  # Hidden to output
        self.tanh = nn.Tanh()  # Activation function for hidden state
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the Vanilla RNN.
        
        :param x: Input tensor of shape (batch_size, input_size).
        :param hidden: Hidden state tensor of shape (batch_size, hidden_size).
        :return: Output tensor of shape (batch_size, output_size), and the new hidden state.
        """
        self.hidden = self.hidden.to(x.device)
        combined = torch.cat((x, self.hidden), dim=1)  # Concatenate input and hidden state
        self.hidden = self.tanh(self.i2h(combined))  # Update hidden state
        output = self.h2o(self.hidden)  # Compute output
        return output

batch_size = 8
input_size = 1024
hidden_size = 256
output_size = 128
sequence_length = 256

def get_inputs():
    return [torch.randn(batch_size, input_size)]

def get_init_inputs():
    return [input_size, hidden_size, output_size]

Kernel Information

Related Kernels (Level 3, Task 33 • 33_VanillaRNN)

Rank  Kernel Name  Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇 fused_rnn_i2h_warp_base 0.02 1.21 2.67
🥈 warp_optimized_rnn_base 0.02 1.15 2.56
🥈 optimized_rnn_reduction_base 0.02 1.15 2.56
4 atomic_rnn_optimized_edit_1 0.02 1.11 2.45
4 modular_warp_rnn_base 0.02 1.11 2.45
6 balanced_load_rnn_base_base 0.03 0.83 1.84
6 optimized_concat_kernel_base 0.03 0.83 1.84
6 optimized_unroll_concat_base 0.03 0.83 1.84
6 shared_memory_optimized_edit_1 0.03 0.83 1.84
6 stride_loops_rnn_base 0.03 0.83 1.84
6 optimal_blocksize_rnn_edit_1 0.03 0.83 1.84
6 modular_vanillarnn_edit_1 0.03 0.83 1.84
6 unroll_optimized_rnn_base_base 0.03 0.83 1.84
6 optimized_concat_base 0.03 0.83 1.84
15 unrolled_rnn_base_base 0.03 0.80 1.78
15 efficient_concat_base 0.03 0.80 1.78
15 sync_optimized_rnn_base_base 0.03 0.80 1.78
15 atomic_optimized_rnn_base 0.03 0.80 1.78
15 warp_aligned_rnn_base 0.03 0.80 1.78
15 optimized_concat_kernel_edit_1 0.03 0.80 1.78
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>

// Fused kernel: concatenates x and hidden and computes the linear transform in one pass.
// Each block (one warp of 32 threads) computes one dot product for one (row, output) pair.

// Kernel parameters:
// x: [batch_size, x_size]
// hidden: [batch_size, hidden_size]
// i2h_weight: [out_size, total_width] where total_width = x_size + hidden_size
// i2h_bias: [out_size]
// hidden_new_out: [batch_size, out_size] output after tanh( i2h_bias + dot )
// batch_size, x_size, hidden_size, out_size are dimensions

__global__ void fused_concat_linear_kernel(
    const float* __restrict__ x,
    const float* __restrict__ hidden,
    const float* __restrict__ i2h_weight,
    const float* __restrict__ i2h_bias,
    float* __restrict__ hidden_new_out,
    const int batch_size,
    const int x_size,
    const int hidden_size,
    const int out_size
) {
    // Combined width is the column dimension of the concatenated tensor
    int total_width = x_size + hidden_size;

    // Each block computes one dot product corresponding to one output neuron of the i2h linear layer for one batch row.
    // Interpret blockIdx.x as a flattened index: row index and output neuron index
    int global_idx = blockIdx.x; // one dot product per block
    int row = global_idx / out_size;
    int out_idx = global_idx % out_size;

    if (row >= batch_size) return;

    float sum = 0.0f;
    // Each thread in the warp computes a partial sum over the concatenated input elements
    int lane = threadIdx.x; // should be in [0, 31]

    // Loop over the concatenated dimension with stride equal to warp size (32)
    for (int k = lane; k < total_width; k += 32) {
        // Load from x if k is in the x part, otherwise from hidden
        float a = (k < x_size) ? x[row * x_size + k] : hidden[row * hidden_size + (k - x_size)];
        // Load weight: i2h_weight is laid out in row-major order with each row of length total_width
        float b = i2h_weight[out_idx * total_width + k];
        sum += a * b;
    }

    // Perform warp-level reduction using __shfl_down_sync
    unsigned int mask = 0xFFFFFFFF;
    for (int offset = 16; offset > 0; offset /= 2) {
        sum += __shfl_down_sync(mask, sum, offset);
    }
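    // After the shuffle tree completes, lane 0 holds the full dot product for this (row, out_idx) pair.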

    // The first lane writes the final result
    if (lane == 0) {
        float result = tanhf(sum + i2h_bias[out_idx]);
        hidden_new_out[row * out_size + out_idx] = result;
    }
}

// Host function
// This fused kernel replaces the separate concatenation and addmm (i2h) operations.
// It computes hidden_new = tanh(i2h_bias + [x, hidden] * i2h_weight^T) in one pass,
// avoiding the allocation and memory traffic of an intermediate concatenated tensor.
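// Equivalent PyTorch expression for the fused i2h step (for reference):
//   hidden_new = torch.tanh(F.linear(torch.cat((x, hidden), dim=1), i2h_weight, i2h_bias))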

torch::Tensor module_fn_cuda(
    torch::Tensor x,
    torch::Tensor i2h_weight,
    torch::Tensor i2h_bias,
    torch::Tensor h2o_weight,
    torch::Tensor h2o_bias,
    torch::Tensor hidden
) {
    // Ensure tensors are contiguous
    x = x.contiguous();
    i2h_weight = i2h_weight.contiguous();
    i2h_bias = i2h_bias.contiguous();
    h2o_weight = h2o_weight.contiguous();
    h2o_bias = h2o_bias.contiguous();
    hidden = hidden.contiguous();

    const int batch_size = x.size(0);
    const int x_size = x.size(1);
    const int hidden_size = hidden.size(1);
    // out_size is the number of neurons in the i2h linear transform (i2h_bias length)
    const int out_size = i2h_bias.size(0);
    int total_width = x_size + hidden_size;

    // Allocate tensor for hidden_new output of fused i2h operation
    auto options = torch::TensorOptions().dtype(x.dtype()).device(x.device());
    torch::Tensor hidden_new = torch::empty({batch_size, out_size}, options);

    // Launch configuration: one warp (32 threads) per dot product
    // Total dot products = batch_size * out_size
    int total_dot_products = batch_size * out_size;
    int threads = 32; // one warp
    int blocks = total_dot_products; // one block (warp) per dot product
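    // With the benchmark configuration on this page (batch_size = 8, out_size = hidden_size = 256),
    // this launches 8 * 256 = 2048 one-warp blocks.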
    
    fused_concat_linear_kernel<<<blocks, threads>>>(
        x.data_ptr<float>(),
        hidden.data_ptr<float>(),
        i2h_weight.data_ptr<float>(),
        i2h_bias.data_ptr<float>(),
        hidden_new.data_ptr<float>(),
        batch_size,
        x_size,
        hidden_size,
        out_size
    );

    // Compute the final output: h2o_bias + hidden_new * h2o_weight^T
    // This step is kept separate and uses optimized torch::addmm
    torch::Tensor output = torch::addmm(h2o_bias, hidden_new, h2o_weight.t());
    
    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &module_fn_cuda, "Fused Module forward (CUDA) using warp-level primitives");
}
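
A sketch of how this extension could be JIT-compiled and checked against the PyTorch reference (the file name, build call, and tolerances are illustrative assumptions; it presumes the Python reference listing earlier on this page is in scope and a CUDA device is available):

# Hedged sketch: build the extension above and compare it with the eager reference.
import torch
from torch.utils.cpp_extension import load

fused = load(name="fused_rnn_i2h_warp", sources=["base.cu"])   # JIT-compile the CUDA source above

model = Model(*get_init_inputs())
x = torch.randn(batch_size, input_size, device="cuda")
args = [t.detach().cuda() for t in (model.i2h_weight, model.i2h_bias,
                                    model.h2o_weight, model.h2o_bias, model.hidden)]

ref = module_fn(x, *args)        # eager PyTorch: cat -> linear -> tanh -> linear
out = fused.forward(x, *args)    # fused CUDA kernel + cuBLAS addmm
print(torch.allclose(ref, out, atol=1e-4, rtol=1e-4))
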
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 0.856 inst/cycle 0.000 5
Executed Ipc Elapsed 0.542 inst/cycle 0.000 5
Issue Slots Busy 21.462 % 0.020 5
Issued Ipc Active 0.856 inst/cycle 0.000 5
SM Busy 21.462 % 0.020 5
Memory Throughput 166181970982.476 byte/second 4335981608600158208.000 5
Mem Busy 11.274 % 0.021 5
Max Bandwidth 12.826 % 0.029 5
L1/TEX Hit Rate 31.200 % 0.000 5
L2 Hit Rate 69.962 % 0.076 5
Mem Pipes Busy 10.566 % 0.016 5
Warp Cycles Per Issued Instruction 16.924 cycle 0.001 5
Warp Cycles Per Executed Instruction 16.960 cycle 0.001 5
Avg. Active Threads Per Warp 30.500 0.000 5
Avg. Not Predicated Off Threads Per Warp 29.710 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 64.000 block 0.000 5
Block Limit Shared Mem 32.000 block 0.000 5
Block Limit Warps 64.000 block 0.000 5
Theoretical Active Warps per SM 32.000 warp 0.000 5
Theoretical Occupancy 50.000 % 0.000 5
Achieved Occupancy 22.836 % 0.007 5
Achieved Active Warps Per SM 14.616 warp 0.003 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (20.3%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared memory. The difference between calculated theoretical (50.0%) and measured achieved occupancy (22.8%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
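
The 50% figure can be reproduced from the launch-statistics rows above. The sketch below assumes the per-SM warp limit implied by the "Block Limit Warps" row (64 one-warp blocks, i.e. 64 resident warps):

# Rough occupancy arithmetic for this launch, using values from the tables above.
warps_per_block  = 32 // 32    # 32-thread blocks -> 1 warp per block
block_limit_sm   = 32          # "Block Limit SM" / "Block Limit Shared Mem"
max_warps_per_sm = 64          # implied by "Block Limit Warps" for one-warp blocks
active_warps = warps_per_block * block_limit_sm      # 32, matches "Theoretical Active Warps per SM"
print(f"theoretical occupancy = {active_warps / max_warps_per_sm:.0%}")   # 50%

The gap down to the achieved 22.8% is attributed by the rule above to warp-scheduling overhead and load imbalance rather than to an additional resource limit.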
Operation / Metric Value Unit
aten::to
CPU Time 291884.54 μs
Device Time 66.43 μs
Self CPU Time 58.45 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_to_copy
CPU Time 291826.09 μs
Device Time 66.43 μs
Self CPU Time 104.38 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::empty_strided
CPU Time 291364.47 μs
Device Time 0.00 μs
Self CPU Time 133.67 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaDeviceGetStreamPriorityRange
CPU Time 290804.25 μs
Device Time 0.00 μs
Self CPU Time 290804.25 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::addmm
CPU Time 222046.99 μs
Device Time 70296.91 μs
Self CPU Time 123291.93 μs
Self Device Time 70296.91 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
sm80_xmma_gemm_f32f32_f32f32_f32_tn_n_tilesize32x32x8_stage3_warpsize1x2x1_ffma_aligna4_alignc4_execute_kernel__51_cublas
CPU Time 0.00 μs
Device Time 70296.91 μs
Self CPU Time 0.00 μs
Self Device Time 70296.91 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::zero_
CPU Time 52407.04 μs
Device Time 476524.74 μs
Self CPU Time 10499.64 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::fill_
CPU Time 41908.93 μs
Device Time 476524.74 μs
Self CPU Time 13402.45 μs
Self Device Time 476524.74 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<int>, at::detail::Array<char*, 1>)
CPU Time 0.00 μs
Device Time 476524.74 μs
Self CPU Time 0.00 μs
Self Device Time 476524.74 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45289 warnings generated when compiling for host.
Suppressed 45326 warnings (45279 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:19:5: warning: 3 adjacent parameters of 'fused_concat_linear_kernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
19 | const float* __restrict__ hidden,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 | const float* __restrict__ i2h_weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
21 | const float* __restrict__ i2h_bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:19:31: note: the first parameter in the range is 'hidden'
19 | const float* __restrict__ hidden,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:21:31: note: the last parameter in the range is 'i2h_bias'
21 | const float* __restrict__ i2h_bias,
| ^~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:23:5: warning: 2 adjacent parameters of 'fused_concat_linear_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
23 | const int batch_size,
| ^~~~~~~~~~~~~~~~~~~~~
24 | const int x_size,
| ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:23:15: note: the first parameter in the range is 'batch_size'
23 | const int batch_size,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:24:15: note: the last parameter in the range is 'x_size'
24 | const int x_size,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:25:5: warning: 2 adjacent parameters of 'fused_concat_linear_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
25 | const int hidden_size,
| ^~~~~~~~~~~~~~~~~~~~~~
26 | const int out_size
| ~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:25:15: note: the first parameter in the range is 'hidden_size'
25 | const int hidden_size,
| ^~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:26:15: note: the last parameter in the range is 'out_size'
26 | const int out_size
| ^~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:33:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
33 | int global_idx = blockIdx.x; // one dot product per block
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:41:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
41 | int lane = threadIdx.x; // should be in [0, 31]
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:86:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
86 | const int batch_size = x.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:87:24: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
87 | const int x_size = x.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:88:29: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
88 | const int hidden_size = hidden.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:90:26: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
90 | const int out_size = i2h_bias.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:91:9: warning: Value stored to 'total_width' during its initialization is never read [clang-analyzer-deadcode.DeadStores]
91 | int total_width = x_size + hidden_size;
| ^~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_3/task_33/b5_s2_fused_rnn_i2h_warp/base/base.cu:91:9: note: Value stored to 'total_width' during its initialization is never read
91 | int total_width = x_size + hidden_size;
| ^~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~