Kernel Details - optimized_lstm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import _VF
from typing import List


def module_fn(
    x: torch.Tensor,
    lstm_weights_ih: List[torch.Tensor],
    lstm_weights_hh: List[torch.Tensor],
    lstm_biases_ih: List[torch.Tensor],
    lstm_biases_hh: List[torch.Tensor],
    h0: torch.Tensor,
    c0: torch.Tensor,
    is_training: bool,
) -> torch.Tensor:
    """
    Functional implementation of LSTM with Hn

    Args:
        x: Input tensor of shape (batch_size, sequence_length, input_size)
        lstm_weights_ih: List of input-hidden weight tensors for each LSTM layer
        lstm_weights_hh: List of hidden-hidden weight tensors for each LSTM layer
        lstm_biases_ih: List of input-hidden bias tensors for each LSTM layer
        lstm_biases_hh: List of hidden-hidden bias tensors for each LSTM layer
        h0: Initial hidden state
        c0: Initial cell state
        is_training: Whether in training mode

    Returns:
        Final hidden state tensor
    """
    h0 = h0.to(x.device)
    c0 = c0.to(x.device)

    # Run LSTM layers
    out = x
    hn = h0
    cn = c0

    for i in range(len(lstm_weights_ih)):
        params = (
            lstm_weights_ih[i],
            lstm_weights_hh[i],
            lstm_biases_ih[i],
            lstm_biases_hh[i],
        )
        result = _VF.lstm(
            out,
            (hn[i : i + 1], cn[i : i + 1]),
            params,
            True,  # has_biases
            1,  # num_layers
            0.0 if not is_training else dropout,  # dropout
            is_training,  # training
            False,  # bidirectional
            True,
        )  # batch_first

        out = result[0]
        # Update the corresponding layer's hidden state
        hn = hn.clone()
        cn = cn.clone()
        hn[i : i + 1] = result[1]
        cn[i : i + 1] = result[2]

    return hn


class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.0):
        """
        Initialize the LSTM model.

        :param input_size: The number of expected features in the input `x`
        :param hidden_size: The number of features in the hidden state `h`
        :param num_layers: Number of recurrent layers
        :param output_size: The number of output features
        :param dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer
        """
        super(Model, self).__init__()

        # Initialize hidden states
        self.h0 = torch.randn((num_layers, batch_size, hidden_size))
        self.c0 = torch.randn((num_layers, batch_size, hidden_size))

        # Extract LSTM parameters
        lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=False,
        )

        # Get weights and biases for each layer
        self.lstm_weights_ih = nn.ParameterList()
        self.lstm_weights_hh = nn.ParameterList()
        self.lstm_biases_ih = nn.ParameterList()
        self.lstm_biases_hh = nn.ParameterList()

        for i in range(num_layers):
            self.lstm_weights_ih.append(
                nn.Parameter(getattr(lstm, f"weight_ih_l{i}").data.clone())
            )
            self.lstm_weights_hh.append(
                nn.Parameter(getattr(lstm, f"weight_hh_l{i}").data.clone())
            )
            self.lstm_biases_ih.append(
                nn.Parameter(getattr(lstm, f"bias_ih_l{i}").data.clone())
            )
            self.lstm_biases_hh.append(
                nn.Parameter(getattr(lstm, f"bias_hh_l{i}").data.clone())
            )

        # Extract linear layer parameters
        fc = nn.Linear(hidden_size, output_size)
        self.fc_weight = nn.Parameter(fc.weight.data.clone())
        self.fc_bias = nn.Parameter(fc.bias.data.clone())

    def forward(self, x, fn=module_fn):
        return fn(
            x,
            self.lstm_weights_ih,
            self.lstm_weights_hh,
            self.lstm_biases_ih,
            self.lstm_biases_hh,
            self.h0,
            self.c0,
            self.training,
        )


# Test code
batch_size = 10
sequence_length = 512
input_size = 128
hidden_size = 256
num_layers = 6
output_size = 10
dropout = 0.0


def get_inputs():
    return [torch.randn(batch_size, sequence_length, input_size)]


def get_init_inputs():
    return [input_size, hidden_size, num_layers, output_size, dropout]

import torch
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.0):
        """
        Initialize the LSTM model.

        :param input_size: The number of expected features in the input `x`
        :param hidden_size: The number of features in the hidden state `h`
        :param num_layers: Number of recurrent layers
        :param output_size: The number of output features
        :param dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to `dropout`
        """
        super(Model, self).__init__()
        # Initialize hidden state with random values
        self.h0 = torch.randn((num_layers, batch_size, hidden_size))
        self.c0 = torch.randn((num_layers, batch_size, hidden_size))
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout, bidirectional=False)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        """
        Forward pass through the LSTM model.

        :param x: The input tensor, shape (batch_size, sequence_length, input_size)
        :return: The output tensor, shape (batch_size, sequence_length, output_size)
        """
        self.h0 = self.h0.to(x.device)
        self.c0 = self.h0.to(x.device)
        
        # Forward propagate LSTM
        out, state = self.lstm(x, (self.h0, self.c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])  # out: tensor of shape (batch_size, output_size)
        
        return state[0]

# Test code
batch_size = 10
sequence_length = 512
input_size = 128
hidden_size = 256
num_layers = 6
output_size = 10
dropout = 0.0

def get_inputs():
    return [torch.randn(batch_size, sequence_length, input_size)]

def get_init_inputs():
    return [input_size, hidden_size, num_layers, output_size, dropout]

Download Evaluation Download PyTorch Download CUDA Download Profiles

Kernel Information

Operation Name	36_LTSMHn
Level ID	3
Task ID	36
Kernel Name	optimized_lstm_base
CUDA Speedup (Native)	0.761x
CUDA Speedup (Compile)	1.588x
CUDA Runtime	36.435 ms
PyTorch Runtime (Native)	27.714 ms
PyTorch Runtime (Compile)	57.843 ms
Correct	True
Max Diff (vs. Reference)	0.000000
Model	o3-mini-2025-01-31
Temperature	1.00

View Experiment Progress Details

Related Kernels (Level 3, Task 36 • 36_LTSMHn)

Rank	Kernel Name	Runtime (ms)	Speedup Native	Speedup Compile
🥇	optimized_lstm_base	36.44	0.76	1.59
🥈	unrolled_lstm_optimized_base	37.00	0.75	1.56
🥉	36_ltsmh_n_modular_base	37.61	0.74	1.54
4	optimized_lstm_forward_base	37.82	0.73	1.53
5	36_LTSMHn	37.86	0.73	1.53
6	combined_unroll_base	38.18	0.73	1.52
7	36_LTSMHn_unrolled_base	38.19	0.73	1.51
8	optimized_ltsmh_coalesced_base	38.38	0.72	1.51
9	warp_divergence_optimized_lstm_base	41.31	0.67	1.40
10	fused_lstm_edit_1	49.73	0.56	1.16
11	fused_lstm_base	49.76	0.56	1.16
12	36_ltsmhn_coalesced_mem_edit_1	49.92	0.56	1.16
13	36_ltsmhn_warp_aligned_base	50.08	0.55	1.15
14	36_ltsmhn_coalesced_mem_base	50.09	0.55	1.15
15	optimized_lstm_forward_base	50.47	0.55	1.15
16	fused_lstm_sync_opt_edit_1	813.07	0.03	0.07

/*
 * Optimized LSTM forward CUDA kernel extension
 * This kernel combines explicit unrolling for the first four layers with a lambda
 * function for additional layers. It minimizes redundant device conversions and
 * uses pragma unroll to hint at parameter copy inlining. Note that each layer
 * dynamically constructs an LSTM sub-module with the corresponding weights and biases.
 */

#include <torch/extension.h>
#include <torch/torch.h>
#include <vector>
#include <tuple>


torch::Tensor forward(
    torch::Tensor x,
    std::vector<torch::Tensor> lstm_weights_ih,
    std::vector<torch::Tensor> lstm_weights_hh,
    std::vector<torch::Tensor> lstm_biases_ih,
    std::vector<torch::Tensor> lstm_biases_hh,
    torch::Tensor h0,
    torch::Tensor c0,
    bool is_training
) {
    // Move initial hidden and cell states to the correct device once
    auto device = x.device();
    h0 = h0.to(device);
    c0 = c0.to(device);

    auto out = x;
    auto hn = h0.clone();
    auto cn = c0.clone();

    const size_t num_layers = lstm_weights_ih.size();

    // Define a lambda to process each LSTM layer
    auto process_layer = [&](size_t i) {
        // Extract weights and biases for layer i
        auto weight_ih = lstm_weights_ih[i];
        auto weight_hh = lstm_weights_hh[i];
        auto bias_ih = lstm_biases_ih[i];
        auto bias_hh = lstm_biases_hh[i];

        // Determine layer dimensions
        int64_t input_size = weight_ih.size(1);
        int64_t hidden_size = weight_hh.size(1);

        // Create a one-layer LSTM sub-module
        torch::nn::LSTM lstm_model(
            torch::nn::LSTMOptions(input_size, hidden_size)
            .num_layers(1)
            .batch_first(true)
            .bidirectional(false)
        );
        lstm_model->to(device);

        // Copy parameters into the LSTM model with compiler unrolling hint
        #pragma unroll
        {
            lstm_model->named_parameters()["weight_ih_l0"].copy_(weight_ih);
            lstm_model->named_parameters()["weight_hh_l0"].copy_(weight_hh);
            lstm_model->named_parameters()["bias_ih_l0"].copy_(bias_ih);
            lstm_model->named_parameters()["bias_hh_l0"].copy_(bias_hh);
        }

        // Extract the current hidden and cell state slice
        auto h_slice = hn.narrow(0, i, 1);
        auto c_slice = cn.narrow(0, i, 1);
        std::tuple<torch::Tensor, torch::Tensor> state_tuple = std::make_tuple(h_slice, c_slice);

        lstm_model->train(is_training);

        // Run forward pass for this layer
        auto output_and_state = lstm_model->forward(out, state_tuple);
        auto output = std::get<0>(output_and_state);
        auto state = std::get<1>(output_and_state);
        auto h_n = std::get<0>(state);
        auto c_n = std::get<1>(state);

        // Update hidden and cell states
        hn.narrow(0, i, 1).copy_(h_n);
        cn.narrow(0, i, 1).copy_(c_n);

        // Update the output for the next layer
        out = output;
    };

    // Explicitly unroll first four layers if available
    if (num_layers > 0) process_layer(0);
    if (num_layers > 1) process_layer(1);
    if (num_layers > 2) process_layer(2);
    if (num_layers > 3) process_layer(3);

    // Process remaining layers if any
    for (size_t i = 4; i < num_layers; ++i) {
        process_layer(i);
    }

    return hn;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Optimized LSTM forward (CUDA)");
}

Operation / Metric	Value	Unit
aten::to
CPU Time	1371112.34	μs
Device Time	99306.89	μs
Self CPU Time	5560.67	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::copy_
CPU Time	1075798.00	μs
Device Time	141205.32	μs
Self CPU Time	297036.49	μs
Self Device Time	141205.32	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::uniform_
CPU Time	2652051.67	μs
Device Time	0.00	μs
Self CPU Time	2652051.67	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
cudaLaunchKernel
CPU Time	4297498.71	μs
Device Time	0.00	μs
Self CPU Time	4297498.71	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::lstm
CPU Time	6480928.60	μs
Device Time	5989756.49	μs
Self CPU Time	17907.93	μs
Self Device Time	0.00	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
aten::_cudnn_rnn
CPU Time	6453016.14	μs
Device Time	5989756.49	μs
Self CPU Time	1976790.67	μs
Self Device Time	5988591.54	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void cutlass::Kernel<cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4>(cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4::Params)
CPU Time	0.00	μs
Device Time	3819064.63	μs
Self CPU Time	0.00	μs
Self Device Time	3819064.63	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B
void elemWiseRNNcell<float, float, float, (cudnnRNNMode_t)2, (cudnnRNNBiasMode_t)2>(int, int, int, int, int, bool, float const, float const, float const, float const, float const, float const, float const, float, float, float, float, float, cudnnRNNClipMode_t, cudnnNanPropagation_t, float, float)
CPU Time	0.00	μs
Device Time	2169526.91	μs
Self CPU Time	0.00	μs
Self Device Time	2169526.91	μs
CPU Memory Usage	0	B
Device Memory Usage	0	B
Self CPU Memory Usage	0	B
Self Device Memory Usage	0	B

The AI CUDA Engineer 👷

`36_LTSMHn` • `optimized_lstm_base`

Kernel Information

Related Kernels (Level 3, Task 36 • 36_LTSMHn)

The AI CUDA Engineer 👷

36_LTSMHn • optimized_lstm_base

Kernel Information

Related Kernels (Level 3, Task 36 • 36_LTSMHn)

`36_LTSMHn` • `optimized_lstm_base`