← Back to Leaderboard

The AI CUDA Engineer 👷

36_LTSMHnunrolled_lstm_optimized_base

Level 3 • Task 36

Kernel Information

Related Kernels (Level 3, Task 36 • 36_LTSMHn)

Rank Kernel Name Runtime (ms) Speedup Native Speedup Compile
🥇 optimized_lstm_base 36.44 0.76 1.59
🥈 unrolled_lstm_optimized_base 37.00 0.75 1.56
🥉 36_ltsmh_n_modular_base 37.61 0.74 1.54
4 optimized_lstm_forward_base 37.82 0.73 1.53
5 36_LTSMHn 37.86 0.73 1.53
6 combined_unroll_base 38.18 0.73 1.52
7 36_LTSMHn_unrolled_base 38.19 0.73 1.51
8 optimized_ltsmh_coalesced_base 38.38 0.72 1.51
9 warp_divergence_optimized_lstm_base 41.31 0.67 1.40
10 fused_lstm_edit_1 49.73 0.56 1.16
11 fused_lstm_base 49.76 0.56 1.16
12 36_ltsmhn_coalesced_mem_edit_1 49.92 0.56 1.16
13 36_ltsmhn_warp_aligned_base 50.08 0.55 1.15
14 36_ltsmhn_coalesced_mem_base 50.09 0.55 1.15
15 optimized_lstm_forward_base 50.47 0.55 1.15
16 fused_lstm_sync_opt_edit_1 813.07 0.03 0.07
#include <torch/extension.h>
#include <torch/torch.h>
#include <vector>
#include <tuple>

// Optimized LSTM forward CUDA kernel with manual loop unrolling

torch::Tensor forward(
    torch::Tensor x,
    std::vector<torch::Tensor> lstm_weights_ih,
    std::vector<torch::Tensor> lstm_weights_hh,
    std::vector<torch::Tensor> lstm_biases_ih,
    std::vector<torch::Tensor> lstm_biases_hh,
    torch::Tensor h0,
    torch::Tensor c0,
    bool is_training
) {
    // Move initial hidden and cell states to the correct device once
    auto device = x.device();
    h0 = h0.to(device);
    c0 = c0.to(device);

    auto out = x;
    auto hn = h0.clone();
    auto cn = c0.clone();

    const size_t num_layers = lstm_weights_ih.size();

    // Define a lambda to process each LSTM layer
    auto process_layer = [&](size_t i) {
        // Extract weights and biases for layer i
        auto weight_ih = lstm_weights_ih[i];
        auto weight_hh = lstm_weights_hh[i];
        auto bias_ih = lstm_biases_ih[i];
        auto bias_hh = lstm_biases_hh[i];

        // Determine layer dimensions
        int64_t input_size = weight_ih.size(1);
        int64_t hidden_size = weight_hh.size(1);

        // Create a one-layer LSTM sub-module
        torch::nn::LSTM lstm_model(
            torch::nn::LSTMOptions(input_size, hidden_size)
            .num_layers(1)
            .batch_first(true)
            .bidirectional(false)
        );
        lstm_model->to(device);

        // Copy parameters into the LSTM model with compiler unrolling hint
        #pragma unroll
        {
            lstm_model->named_parameters()["weight_ih_l0"].copy_(weight_ih);
            lstm_model->named_parameters()["weight_hh_l0"].copy_(weight_hh);
            lstm_model->named_parameters()["bias_ih_l0"].copy_(bias_ih);
            lstm_model->named_parameters()["bias_hh_l0"].copy_(bias_hh);
        }

        // Extract the current hidden and cell state slice
        auto h_slice = hn.narrow(0, i, 1);
        auto c_slice = cn.narrow(0, i, 1);
        std::tuple<torch::Tensor, torch::Tensor> state_tuple = std::make_tuple(h_slice, c_slice);

        lstm_model->train(is_training);

        // Run forward pass for this layer
        auto output_and_state = lstm_model->forward(out, state_tuple);
        auto output = std::get<0>(output_and_state);
        auto state = std::get<1>(output_and_state);
        auto h_n = std::get<0>(state);
        auto c_n = std::get<1>(state);

        // Update hidden and cell states
        hn.narrow(0, i, 1).copy_(h_n);
        cn.narrow(0, i, 1).copy_(c_n);

        // Update the output for the next layer
        out = output;
    };

    // Explicitly unroll first four layers if available
    if (num_layers > 0) process_layer(0);
    if (num_layers > 1) process_layer(1);
    if (num_layers > 2) process_layer(2);
    if (num_layers > 3) process_layer(3);

    // Process remaining layers if any
    for (size_t i = 4; i < num_layers; ++i) {
        process_layer(i);
    }

    return hn;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Optimized LSTM forward (CUDA)");
}