← Back to Leaderboard

The AI CUDA Engineer 👷

39_GRUoptimized_gru_stream_unroll_base

Level 3 • Task 39
from typing import List
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import _VF


def module_fn(
    x: torch.Tensor,
    gru_weights_ih: List[torch.Tensor],
    gru_weights_hh: List[torch.Tensor],
    gru_biases_ih: List[torch.Tensor],
    gru_biases_hh: List[torch.Tensor],
    h0: torch.Tensor,
    is_training: bool,
) -> torch.Tensor:
    """
    Functional implementation of GRU

    Args:
        x: Input tensor of shape (seq_len, batch_size, input_size) if batch_first=False
        gru_weights_ih: List of input-hidden weight tensors for each GRU layer
        gru_weights_hh: List of hidden-hidden weight tensors for each GRU layer
        gru_biases_ih: List of input-hidden bias tensors for each GRU layer
        gru_biases_hh: List of hidden-hidden bias tensors for each GRU layer
        h0: Initial hidden state
        is_training: Whether in training mode

    Returns:
        output tensor of shape (seq_len, batch_size, hidden_size)
    """
    h0 = h0.to(x.device)

    # Run single GRU with all layers at once
    output, _ = _VF.gru(
        x,
        h0,
        [
            w
            for layer in zip(
                gru_weights_ih, gru_weights_hh, gru_biases_ih, gru_biases_hh
            )
            for w in layer
        ],
        True,  # has_biases
        len(gru_weights_ih),  # num_layers
        0.0,  # dropout
        is_training,  # training
        False,  # bidirectional
        False,
    )  # batch_first

    return output


class Model(nn.Module):
    def __init__(
        self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False
    ):
        """
        :param input_size: The number of expected features in the input x
        :param hidden_size: The number of features in the hidden state h
        :param num_layers: Number of recurrent layers (default: 1)
        :param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
        :param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
        """
        super(Model, self).__init__()

        # Create GRU and extract its parameters
        gru = nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            bias=bias,
            batch_first=batch_first,
            dropout=0,
            bidirectional=False,
        )

        # Initialize h0 exactly as in original code
        self.h0 = torch.randn((num_layers, batch_size, hidden_size))

        # Extract and store GRU parameters
        self.gru_weights_ih = nn.ParameterList()
        self.gru_weights_hh = nn.ParameterList()
        self.gru_biases_ih = nn.ParameterList()
        self.gru_biases_hh = nn.ParameterList()

        for i in range(num_layers):
            self.gru_weights_ih.append(getattr(gru, f"weight_ih_l{i}"))
            self.gru_weights_hh.append(getattr(gru, f"weight_hh_l{i}"))
            if bias:
                self.gru_biases_ih.append(getattr(gru, f"bias_ih_l{i}"))
                self.gru_biases_hh.append(getattr(gru, f"bias_hh_l{i}"))
            else:
                self.gru_biases_ih.append(None)
                self.gru_biases_hh.append(None)

    def forward(self, x, fn=module_fn):
        return fn(
            x,
            self.gru_weights_ih,
            self.gru_weights_hh,
            self.gru_biases_ih,
            self.gru_biases_hh,
            self.h0,
            self.training,
        )


# Test code
batch_size = 10
seq_len = 512
input_size = 128
hidden_size = 256
num_layers = 6


def get_inputs():
    return [torch.randn(seq_len, batch_size, input_size)]


def get_init_inputs():
    return [input_size, hidden_size, num_layers]
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False):
        """
        :param input_size: The number of expected features in the input x
        :param hidden_size: The number of features in the hidden state h
        :param num_layers: Number of recurrent layers (default: 1)
        :param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
        :param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
        """
        super(Model, self).__init__()
        
        self.gru = nn.GRU(input_size, hidden_size, num_layers, bias, batch_first, dropout=0, bidirectional=False)
        self.h0 = torch.randn((num_layers, batch_size, hidden_size))
    
    def forward(self, x):
        """
        :param x: The input tensor, shape (seq_len, batch_size, input_size) if batch_first=False, otherwise (batch_size, seq_len, input_size)
        :param h_0: The initial hidden state for the input sequence, shape (num_layers * num_directions, batch_size, hidden_size) (default: None)
        :return: output, h_n
            - output: The output features (h_t) from the last layer of the GRU, for each t, shape (seq_len, batch_size, num_directions * hidden_size) if batch_first=False, otherwise (batch_size, seq_len, num_directions * hidden_size)
            - h_n: The hidden state for t = seq_len, shape (num_layers * num_directions, batch_size, hidden_size)
        """
        self.h0 = self.h0.to(x.device)
        output, h_n = self.gru(x, self.h0)
        return output

# Test code
batch_size = 10
seq_len = 512
input_size = 128
hidden_size = 256
num_layers = 6

def get_inputs():
    return [torch.randn(seq_len, batch_size, input_size)]

def get_init_inputs():
    return [input_size, hidden_size, num_layers]

Kernel Information

Related Kernels (Level 3, Task 39 • 39_GRU)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 39_gru_constant_memory_edit_1 27.75 1.24 1.83
🥈 gru_with_stride_loops_opt_base 28.00 1.23 1.81
🥉 39_gru_coalesced_base 28.18 1.22 1.80
4 gru_with_memory_coalescing_base_base 28.21 1.22 1.80
5 optimized_gru_forward_base 28.36 1.21 1.79
6 39_gru_constant_memory_base 29.09 1.18 1.74
7 gru_with_cuda_streams_base 30.00 1.15 1.69
8 gru_with_memory_access_optimizations_base 30.08 1.14 1.68
9 optimized_copy_base 30.08 1.14 1.68
10 gru_with_optimal_block_size_base 30.13 1.14 1.68
11 gru_3d_indexing_optim_base 30.23 1.14 1.68
12 gru_with_load_balancing_base_base_base 30.23 1.14 1.68
13 gru_with_ldg_and_alignment_base_base_base 30.27 1.14 1.67
14 gru_with_unrolled_loops_base_base 30.33 1.13 1.67
15 gru_pipeline_overlap_base 30.34 1.13 1.67
16 gru_with_uniform_control_flow_base 30.37 1.13 1.67
17 gru_loop_unroll_base 30.38 1.13 1.67
18 gru_with_minimized_warp_divergence_base_base 30.39 1.13 1.67
19 optimized_gru_stream_unroll_base 30.39 1.13 1.67
20 gru_optimized_thread_block_indexing_base_base 30.43 1.13 1.67
#include <torch/extension.h>
#include <torch/torch.h>
#include <vector>
#include <cuda_runtime.h>

__global__ void GRU_forward_kernel_optimized(const float* __restrict__ input, 
                                           float* __restrict__ output,
                                           int total_elements,
                                           int hidden_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    constexpr int UNROLL_FACTOR = 4;
    
    int limit = total_elements - (total_elements % UNROLL_FACTOR);
    
    for (int i = idx; i < limit; i += stride * UNROLL_FACTOR) {
        #pragma unroll
        for (int j = 0; j < UNROLL_FACTOR; j++) {
            output[i + j] = input[i + j];
        }
    }
    
    for (int i = limit + idx; i < total_elements; i += stride) {
        output[i] = input[i];
    }
}

torch::Tensor forward(
    torch::Tensor x,
    std::vector<torch::Tensor> gru_weights_ih,
    std::vector<torch::Tensor> gru_weights_hh,
    std::vector<torch::Tensor> gru_biases_ih,
    std::vector<torch::Tensor> gru_biases_hh,
    torch::Tensor h0,
    bool is_training) {
    
    h0 = h0.to(x.device());
    size_t num_layers = gru_weights_ih.size();
    int64_t input_size = x.size(2);
    int64_t hidden_size = gru_weights_hh[0].size(1);
    
    torch::nn::GRUOptions gru_options(input_size, hidden_size);
    gru_options.num_layers(num_layers)
              .bidirectional(false)
              .batch_first(false);
    
    torch::nn::GRU gru(gru_options);
    gru->to(x.device());
    gru->train(is_training);
    
    auto params = gru->named_parameters();
    
    constexpr int NUM_STREAMS = 4;
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaStreamCreate(&streams[i]);
    }
    
    for (size_t l = 0; l < num_layers; ++l) {
        std::string layer_str = std::to_string(l);
        cudaStream_t& current_stream = streams[l % NUM_STREAMS];
        
        std::string w_ih_key = "weight_ih_l" + layer_str;
        std::string w_hh_key = "weight_hh_l" + layer_str;
        std::string b_ih_key = "bias_ih_l" + layer_str;
        std::string b_hh_key = "bias_hh_l" + layer_str;
        
        cudaMemcpyAsync(params[w_ih_key].data_ptr(), gru_weights_ih[l].data_ptr(),
                       gru_weights_ih[l].numel() * sizeof(float),
                       cudaMemcpyHostToDevice, current_stream);
        cudaMemcpyAsync(params[w_hh_key].data_ptr(), gru_weights_hh[l].data_ptr(),
                       gru_weights_hh[l].numel() * sizeof(float),
                       cudaMemcpyHostToDevice, current_stream);
        cudaMemcpyAsync(params[b_ih_key].data_ptr(), gru_biases_ih[l].data_ptr(),
                       gru_biases_ih[l].numel() * sizeof(float),
                       cudaMemcpyHostToDevice, current_stream);
        cudaMemcpyAsync(params[b_hh_key].data_ptr(), gru_biases_hh[l].data_ptr(),
                       gru_biases_hh[l].numel() * sizeof(float),
                       cudaMemcpyHostToDevice, current_stream);
    }
    
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaStreamSynchronize(streams[i]);
    }
    
    h0 = h0.contiguous().view({static_cast<int64_t>(num_layers), x.size(1), hidden_size});
    auto gru_result = gru->forward(x, h0);
    torch::Tensor result = std::get<0>(gru_result);
    
    auto output = torch::empty_like(result);
    int total_elements = output.numel();
    int threads = 256;
    int blocks = (total_elements + threads - 1) / threads;
    
    GRU_forward_kernel_optimized<<<blocks, threads, 0, streams[0]>>>(
        result.data_ptr<float>(),
        output.data_ptr<float>(),
        total_elements,
        hidden_size
    );
    
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaStreamDestroy(streams[i]);
    }
    
    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Optimized GRU forward with streams and unrolling (CUDA)");
}
Performance Metrics
Metric Value Unit Variance Samples
Executed Ipc Active 2.648 inst/cycle 0.000 5
Executed Ipc Elapsed 1.922 inst/cycle 0.000 5
Issue Slots Busy 66.616 % 0.097 5
Issued Ipc Active 2.666 inst/cycle 0.000 5
SM Busy 66.616 % 0.097 5
Memory Throughput 620851522931.784 byte/second 34475254836632956928.000 5
Mem Busy 28.028 % 0.053 5
Max Bandwidth 37.098 % 0.131 5
L1/TEX Hit Rate 75.172 % 0.002 5
L2 Hit Rate 68.468 % 0.121 5
Mem Pipes Busy 28.786 % 0.066 5
Warp Cycles Per Issued Instruction 18.804 cycle 0.008 5
Warp Cycles Per Executed Instruction 18.924 cycle 0.008 5
Avg. Active Threads Per Warp 32.000 0.000 5
Avg. Not Predicated Off Threads Per Warp 29.270 0.000 5
Max Active Clusters 0.000 cluster 0.000 5
Max Cluster Size 8.000 block 0.000 5
Overall GPU Occupancy 0.000 % 0.000 5
Cluster Occupancy 0.000 % 0.000 5
Block Limit SM 32.000 block 0.000 5
Block Limit Registers 8.000 block 0.000 5
Block Limit Shared Mem 32.000 block 0.000 5
Block Limit Warps 8.000 block 0.000 5
Theoretical Active Warps per SM 64.000 warp 0.000 5
Theoretical Occupancy 100.000 % 0.000 5
Achieved Occupancy 79.106 % 0.069 5
Achieved Active Warps Per SM 50.630 warp 0.028 5
Analysis Rules
Rule Description
INF HighPipeUtilization ALU is the highest-utilized pipeline (45.4%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.
INF CPIStall Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.
WRN Occupancy This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (79.5%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy.
Operation / Metric Value Unit
aten::to
CPU Time 986721.53 μs
Device Time 88908.50 μs
Self CPU Time 4289.25 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::copy_
CPU Time 403861.07 μs
Device Time 104895.29 μs
Self CPU Time 23167.69 μs
Self Device Time 104895.29 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::uniform_
CPU Time 2216616.92 μs
Device Time 0.00 μs
Self CPU Time 2216616.92 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
cudaLaunchKernel
CPU Time 5198230.12 μs
Device Time 0.00 μs
Self CPU Time 5198230.12 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::gru
CPU Time 7414585.03 μs
Device Time 6503308.29 μs
Self CPU Time 6064.06 μs
Self Device Time 0.00 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
aten::_cudnn_rnn
CPU Time 7406454.46 μs
Device Time 6503308.29 μs
Self CPU Time 1737438.96 μs
Self Device Time 6503308.29 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void cutlass::Kernel<cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4>(cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4::Params)
CPU Time 0.00 μs
Device Time 4149771.25 μs
Self CPU Time 0.00 μs
Self Device Time 4149771.25 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
void elemWiseRNNcell<float, float, float, (cudnnRNNMode_t)3, (cudnnRNNBiasMode_t)2>(int, int, int, int, int, bool, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float*, float*, float*, float*, float*, cudnnRNNClipMode_t, cudnnNanPropagation_t, float, float)
CPU Time 0.00 μs
Device Time 2353543.31 μs
Self CPU Time 0.00 μs
Self Device Time 2353543.31 μs
CPU Memory Usage 0 B
Device Memory Usage 0 B
Self CPU Memory Usage 0 B
Self Device Memory Usage 0 B
Status: Completed
45289 warnings generated when compiling for host.
Suppressed 45328 warnings (45281 in non-user code, 47 NOLINT).
Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:8:44 bugprone-easily-swappable-parameters
8 | int total_elements,
| ^~~~~~~~~~~~~~~~~~~
9 | int hidden_size) {
| ~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:8:48: note: the first parameter in the range is 'total_elements'
8 | int total_elements,
| ^~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:9:48: note: the last parameter in the range is 'hidden_size'
9 | int hidden_size) {
| ^~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:10:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
10 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:11:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
11 | int stride = blockDim.x * gridDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:29:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
29 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:30:5: warning: 4 adjacent parameters of 'forward' of similar type ('std::vector<torch::Tensor>') are easily swapped by mistake [bugprone-easily-swappable-parameters]
30 | std::vector<torch::Tensor> gru_weights_ih,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31 | std::vector<torch::Tensor> gru_weights_hh,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
32 | std::vector<torch::Tensor> gru_biases_ih,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
33 | std::vector<torch::Tensor> gru_biases_hh,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:30:32: note: the first parameter in the range is 'gru_weights_ih'
30 | std::vector<torch::Tensor> gru_weights_ih,
| ^~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:33:32: note: the last parameter in the range is 'gru_biases_hh'
33 | std::vector<torch::Tensor> gru_biases_hh,
| ^~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:43:28: warning: narrowing conversion from 'size_t' (aka 'unsigned long') to signed type 'int64_t' (aka 'long') is implementation-defined [bugprone-narrowing-conversions]
43 | gru_options.num_layers(num_layers)
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:91:26: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
91 | int total_elements = output.numel();
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_3/task_39/b4_s1_optimized_gru_stream_unroll/base/base.cu:99:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
99 | hidden_size
| ^