
The AI CUDA Engineer 👷

36_LTSMHn • 36_ltsmhn_warp_aligned_base

Level 3 • Task 36
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import _VF
from typing import List


def module_fn(
    x: torch.Tensor,
    lstm_weights_ih: List[torch.Tensor],
    lstm_weights_hh: List[torch.Tensor],
    lstm_biases_ih: List[torch.Tensor],
    lstm_biases_hh: List[torch.Tensor],
    h0: torch.Tensor,
    c0: torch.Tensor,
    is_training: bool,
) -> torch.Tensor:
    """
    Functional implementation of LSTM with Hn

    Args:
        x: Input tensor of shape (batch_size, sequence_length, input_size)
        lstm_weights_ih: List of input-hidden weight tensors for each LSTM layer
        lstm_weights_hh: List of hidden-hidden weight tensors for each LSTM layer
        lstm_biases_ih: List of input-hidden bias tensors for each LSTM layer
        lstm_biases_hh: List of hidden-hidden bias tensors for each LSTM layer
        h0: Initial hidden state
        c0: Initial cell state
        is_training: Whether in training mode

    Returns:
        Final hidden state tensor
    """
    h0 = h0.to(x.device)
    c0 = c0.to(x.device)

    # Run LSTM layers
    out = x
    hn = h0
    cn = c0

    for i in range(len(lstm_weights_ih)):
        params = (
            lstm_weights_ih[i],
            lstm_weights_hh[i],
            lstm_biases_ih[i],
            lstm_biases_hh[i],
        )
        result = _VF.lstm(
            out,
            (hn[i : i + 1], cn[i : i + 1]),
            params,
            True,  # has_biases
            1,  # num_layers
            0.0 if not is_training else dropout,  # dropout (module-level constant from the test code)
            is_training,  # training
            False,  # bidirectional
            True,  # batch_first
        )

        out = result[0]
        # Update the corresponding layer's hidden state
        hn = hn.clone()
        cn = cn.clone()
        hn[i : i + 1] = result[1]
        cn[i : i + 1] = result[2]

    return hn


class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.0):
        """
        Initialize the LSTM model.

        :param input_size: The number of expected features in the input `x`
        :param hidden_size: The number of features in the hidden state `h`
        :param num_layers: Number of recurrent layers
        :param output_size: The number of output features
        :param dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer
        """
        super(Model, self).__init__()

        # Initialize hidden states (batch_size is a module-level constant defined in the test code below)
        self.h0 = torch.randn((num_layers, batch_size, hidden_size))
        self.c0 = torch.randn((num_layers, batch_size, hidden_size))

        # Extract LSTM parameters
        lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=False,
        )

        # Get weights and biases for each layer
        self.lstm_weights_ih = nn.ParameterList()
        self.lstm_weights_hh = nn.ParameterList()
        self.lstm_biases_ih = nn.ParameterList()
        self.lstm_biases_hh = nn.ParameterList()

        for i in range(num_layers):
            self.lstm_weights_ih.append(
                nn.Parameter(getattr(lstm, f"weight_ih_l{i}").data.clone())
            )
            self.lstm_weights_hh.append(
                nn.Parameter(getattr(lstm, f"weight_hh_l{i}").data.clone())
            )
            self.lstm_biases_ih.append(
                nn.Parameter(getattr(lstm, f"bias_ih_l{i}").data.clone())
            )
            self.lstm_biases_hh.append(
                nn.Parameter(getattr(lstm, f"bias_hh_l{i}").data.clone())
            )

        # Extract linear layer parameters
        fc = nn.Linear(hidden_size, output_size)
        self.fc_weight = nn.Parameter(fc.weight.data.clone())
        self.fc_bias = nn.Parameter(fc.bias.data.clone())

    def forward(self, x, fn=module_fn):
        return fn(
            x,
            self.lstm_weights_ih,
            self.lstm_weights_hh,
            self.lstm_biases_ih,
            self.lstm_biases_hh,
            self.h0,
            self.c0,
            self.training,
        )


# Test code
batch_size = 10
sequence_length = 512
input_size = 128
hidden_size = 256
num_layers = 6
output_size = 10
dropout = 0.0


def get_inputs():
    return [torch.randn(batch_size, sequence_length, input_size)]


def get_init_inputs():
    return [input_size, hidden_size, num_layers, output_size, dropout]
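
As a sanity check on the functional path above, the sketch below (not part of the benchmark harness; the helper name and tolerance are illustrative assumptions) copies the extracted weights back into a stock multi-layer nn.LSTM and compares the final hidden states.

# Hypothetical sanity check: module_fn's layer-by-layer _VF.lstm loop should
# match a stock multi-layer nn.LSTM built from the same weights (dropout = 0).
def check_module_fn_against_nn_lstm():
    torch.manual_seed(0)
    model = Model(input_size, hidden_size, num_layers, output_size, dropout)
    model.eval()  # is_training=False, so the dropout argument is irrelevant
    x = get_inputs()[0]

    ref = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
    with torch.no_grad():
        for i in range(num_layers):
            getattr(ref, f"weight_ih_l{i}").copy_(model.lstm_weights_ih[i])
            getattr(ref, f"weight_hh_l{i}").copy_(model.lstm_weights_hh[i])
            getattr(ref, f"bias_ih_l{i}").copy_(model.lstm_biases_ih[i])
            getattr(ref, f"bias_hh_l{i}").copy_(model.lstm_biases_hh[i])
        _, (hn_ref, _) = ref(x, (model.h0, model.c0))
        hn_fn = model(x)

    # Expect agreement up to floating-point tolerance
    assert torch.allclose(hn_fn, hn_ref, atol=1e-5)

The listing below restates the original PyTorch reference model for this task.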
import torch
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.0):
        """
        Initialize the LSTM model.

        :param input_size: The number of expected features in the input `x`
        :param hidden_size: The number of features in the hidden state `h`
        :param num_layers: Number of recurrent layers
        :param output_size: The number of output features
        :param dropout: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to `dropout`
        """
        super(Model, self).__init__()
        # Initialize hidden state with random values
        self.h0 = torch.randn((num_layers, batch_size, hidden_size))
        self.c0 = torch.randn((num_layers, batch_size, hidden_size))
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout, bidirectional=False)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        """
        Forward pass through the LSTM model.

        :param x: The input tensor, shape (batch_size, sequence_length, input_size)
        :return: The final hidden state tensor h_n, shape (num_layers, batch_size, hidden_size)
        """
        self.h0 = self.h0.to(x.device)
        self.c0 = self.c0.to(x.device)
        
        # Forward propagate LSTM
        out, state = self.lstm(x, (self.h0, self.c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)
        
        # Decode the hidden state of the last time step (the decoded output is not returned)
        out = self.fc(out[:, -1, :])  # out: tensor of shape (batch_size, output_size)

        return state[0]  # final hidden state h_n, shape (num_layers, batch_size, hidden_size)

# Test code
batch_size = 10
sequence_length = 512
input_size = 128
hidden_size = 256
num_layers = 6
output_size = 10
dropout = 0.0

def get_inputs():
    return [torch.randn(batch_size, sequence_length, input_size)]

def get_init_inputs():
    return [input_size, hidden_size, num_layers, output_size, dropout]
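
A minimal usage sketch for the reference model above (illustrative only; it simply wires the test helpers together and checks the shape of the returned hidden state):

# Illustrative usage of the reference model and test helpers defined above.
model = Model(*get_init_inputs())
x = get_inputs()[0]
hn = model(x)
print(hn.shape)  # torch.Size([num_layers, batch_size, hidden_size]) == torch.Size([6, 10, 256])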

Kernel Information

Related Kernels (Level 3, Task 36 • 36_LTSMHn)

Rank  Kernel Name                                Runtime (ms)  Speedup (Native)  Speedup (Compile)
🥇    optimized_lstm_base                               36.44              0.76               1.59
🥈    unrolled_lstm_optimized_base                      37.00              0.75               1.56
🥉    36_ltsmh_n_modular_base                           37.61              0.74               1.54
4     optimized_lstm_forward_base                       37.82              0.73               1.53
5     36_LTSMHn                                         37.86              0.73               1.53
6     combined_unroll_base                              38.18              0.73               1.52
7     36_LTSMHn_unrolled_base                           38.19              0.73               1.51
8     optimized_ltsmh_coalesced_base                    38.38              0.72               1.51
9     warp_divergence_optimized_lstm_base               41.31              0.67               1.40
10    fused_lstm_edit_1                                 49.73              0.56               1.16
11    fused_lstm_base                                   49.76              0.56               1.16
12    36_ltsmhn_coalesced_mem_edit_1                    49.92              0.56               1.16
13    36_ltsmhn_warp_aligned_base (this kernel)         50.08              0.55               1.15
14    36_ltsmhn_coalesced_mem_base                      50.09              0.55               1.15
15    optimized_lstm_forward_base                       50.47              0.55               1.15
16    fused_lstm_sync_opt_edit_1                       813.07              0.03               0.07
#include <torch/extension.h>
#include <torch/torch.h>
#include <vector>

// Mirrors module_fn above: run the stacked LSTM layer by layer and return the final hidden state h_n.
torch::Tensor forward(
    torch::Tensor x,
    std::vector<torch::Tensor> lstm_weights_ih,
    std::vector<torch::Tensor> lstm_weights_hh,
    std::vector<torch::Tensor> lstm_biases_ih,
    std::vector<torch::Tensor> lstm_biases_hh,
    torch::Tensor h0,
    torch::Tensor c0,
    bool is_training
) {
    h0 = h0.to(x.device());
    c0 = c0.to(x.device());

    auto out = x;
    auto hn = h0.clone();
    auto cn = c0.clone();
    
    const size_t num_layers = lstm_weights_ih.size();
    std::vector<torch::nn::LSTM> lstm_layers;
    lstm_layers.reserve(num_layers);

    // Pre-initialize all LSTM layers to avoid conditional initialization
    for (size_t i = 0; i < num_layers; ++i) {
        int64_t input_size = lstm_weights_ih[i].size(1);
        int64_t hidden_size = lstm_weights_hh[i].size(1);
        
        lstm_layers.emplace_back(
            torch::nn::LSTMOptions(input_size, hidden_size)
            .num_layers(1)
            .batch_first(true)
            .bidirectional(false)
        );
        
        lstm_layers[i]->to(x.device());
        
        // Pre-assign weights and biases
        lstm_layers[i]->named_parameters()["weight_ih_l0"].copy_(lstm_weights_ih[i]);
        lstm_layers[i]->named_parameters()["weight_hh_l0"].copy_(lstm_weights_hh[i]);
        lstm_layers[i]->named_parameters()["bias_ih_l0"].copy_(lstm_biases_ih[i]);
        lstm_layers[i]->named_parameters()["bias_hh_l0"].copy_(lstm_biases_hh[i]);
        lstm_layers[i]->train(is_training);
    }

    // Process all layers with minimal branching
    for (size_t i = 0; i < num_layers; ++i) {
        auto h_slice = hn.narrow(0, i, 1);
        auto c_slice = cn.narrow(0, i, 1);
        
        std::tuple<torch::Tensor, torch::Tensor> state_tuple = std::make_tuple(h_slice, c_slice);
        
        auto output_and_state = lstm_layers[i]->forward(out, state_tuple);
        
        out = std::get<0>(output_and_state);
        auto h_n = std::get<0>(std::get<1>(output_and_state));
        auto c_n = std::get<1>(std::get<1>(output_and_state));
        
        hn.narrow(0, i, 1).copy_(h_n);
        cn.narrow(0, i, 1).copy_(c_n);
    }

    return hn;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "LSTM forward (CUDA)");
}
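
The extension above wraps per-layer torch::nn::LSTM modules rather than a hand-written kernel, so it builds like any other C++ extension. Below is a hedged build-and-call sketch; the file name "lstm_hn.cpp", the extension name, and the torch.no_grad() wrapper (needed because forward() copies into freshly created LSTM parameters) are assumptions, not part of the original page.

# Hedged build/run sketch (assumptions: the C++ source above is saved as
# "lstm_hn.cpp", a CUDA device is available, and the call runs under
# torch.no_grad() so that copying into the new LSTM parameters is allowed).
import torch
from torch.utils.cpp_extension import load

ext = load(name="lstm_hn_ext", sources=["lstm_hn.cpp"], verbose=True)

model = Model(*get_init_inputs())  # Model / helpers from the PyTorch listing above
x = get_inputs()[0].cuda()

with torch.no_grad():
    hn = ext.forward(
        x,
        [w.cuda() for w in model.lstm_weights_ih],
        [w.cuda() for w in model.lstm_weights_hh],
        [b.cuda() for b in model.lstm_biases_ih],
        [b.cuda() for b in model.lstm_biases_hh],
        model.h0.cuda(),
        model.c0.cuda(),
        False,  # is_training
    )
print(hn.shape)  # expected torch.Size([num_layers, batch_size, hidden_size])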
Operation-level profiling metrics (all CPU/device memory-usage counters were reported as 0 B):

Operation                     CPU Time (μs)   Device Time (μs)   Self CPU Time (μs)   Self Device Time (μs)
aten::copy_                      1855060.96          122537.60            217736.83              122537.60
aten::uniform_                   2250739.91               0.00           2250739.91                   0.00
cudaLaunchKernel                 4403648.82               0.00           4403648.82                   0.00
aten::lstm                       6353755.25         5235544.38             17545.45                   0.00
aten::_cudnn_rnn                 6326496.10         5235544.38           1417194.38              5234490.43
void cutlass::Kernel<cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4>(cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4::Params)
                                       0.00         3371294.41                 0.00              3371294.41
void elemWiseRNNcell<float, float, float, (cudnnRNNMode_t)2, (cudnnRNNBiasMode_t)2>(int, int, int, int, int, bool, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float*, float*, float*, float*, float*, cudnnRNNClipMode_t, cudnnNanPropagation_t, float, float)
                                       0.00         1863199.50                 0.00              1863199.50
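
For context, numbers of this kind can be collected with torch.profiler. The snippet below is a sketch under the assumption that a CUDA device is available and reuses the Model and test helpers from the PyTorch listing above; exact operator names and times will differ by hardware and library version.

# Sketch: reproduce an operator-level profile similar to the table above.
import torch
from torch.profiler import profile, ProfilerActivity

model = Model(*get_init_inputs()).cuda()  # moves the extracted LSTM/fc parameters to the GPU
x = get_inputs()[0].cuda()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with torch.no_grad():
        model(x)  # returns the final hidden state h_n

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))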