from typing import List
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import _VF
def module_fn(
x: torch.Tensor,
gru_weights_ih: List[torch.Tensor],
gru_weights_hh: List[torch.Tensor],
gru_biases_ih: List[torch.Tensor],
gru_biases_hh: List[torch.Tensor],
h0: torch.Tensor,
is_training: bool,
) -> torch.Tensor:
"""
Functional implementation of GRU
Args:
x: Input tensor of shape (seq_len, batch_size, input_size) if batch_first=False
gru_weights_ih: List of input-hidden weight tensors for each GRU layer
gru_weights_hh: List of hidden-hidden weight tensors for each GRU layer
gru_biases_ih: List of input-hidden bias tensors for each GRU layer
gru_biases_hh: List of hidden-hidden bias tensors for each GRU layer
h0: Initial hidden state
is_training: Whether in training mode
Returns:
output tensor of shape (seq_len, batch_size, hidden_size)
"""
h0 = h0.to(x.device)
# Run single GRU with all layers at once
output, _ = _VF.gru(
x,
h0,
[
w
for layer in zip(
gru_weights_ih, gru_weights_hh, gru_biases_ih, gru_biases_hh
)
for w in layer
],
True, # has_biases
len(gru_weights_ih), # num_layers
0.0, # dropout
is_training, # training
False, # bidirectional
False,
) # batch_first
return output
class Model(nn.Module):
def __init__(
self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False
):
"""
:param input_size: The number of expected features in the input x
:param hidden_size: The number of features in the hidden state h
:param num_layers: Number of recurrent layers (default: 1)
:param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
:param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
"""
super(Model, self).__init__()
# Create GRU and extract its parameters
gru = nn.GRU(
input_size,
hidden_size,
num_layers,
bias=bias,
batch_first=batch_first,
dropout=0,
bidirectional=False,
)
# Initialize h0 exactly as in original code
self.h0 = torch.randn((num_layers, batch_size, hidden_size))
# Extract and store GRU parameters
self.gru_weights_ih = nn.ParameterList()
self.gru_weights_hh = nn.ParameterList()
self.gru_biases_ih = nn.ParameterList()
self.gru_biases_hh = nn.ParameterList()
for i in range(num_layers):
self.gru_weights_ih.append(getattr(gru, f"weight_ih_l{i}"))
self.gru_weights_hh.append(getattr(gru, f"weight_hh_l{i}"))
if bias:
self.gru_biases_ih.append(getattr(gru, f"bias_ih_l{i}"))
self.gru_biases_hh.append(getattr(gru, f"bias_hh_l{i}"))
else:
self.gru_biases_ih.append(None)
self.gru_biases_hh.append(None)
def forward(self, x, fn=module_fn):
return fn(
x,
self.gru_weights_ih,
self.gru_weights_hh,
self.gru_biases_ih,
self.gru_biases_hh,
self.h0,
self.training,
)
# Test code
batch_size = 10
seq_len = 512
input_size = 128
hidden_size = 256
num_layers = 6
def get_inputs():
return [torch.randn(seq_len, batch_size, input_size)]
def get_init_inputs():
return [input_size, hidden_size, num_layers]
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False):
"""
:param input_size: The number of expected features in the input x
:param hidden_size: The number of features in the hidden state h
:param num_layers: Number of recurrent layers (default: 1)
:param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
:param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
"""
super(Model, self).__init__()
self.gru = nn.GRU(input_size, hidden_size, num_layers, bias, batch_first, dropout=0, bidirectional=False)
self.h0 = torch.randn((num_layers, batch_size, hidden_size))
def forward(self, x):
"""
:param x: The input tensor, shape (seq_len, batch_size, input_size) if batch_first=False, otherwise (batch_size, seq_len, input_size)
:param h_0: The initial hidden state for the input sequence, shape (num_layers * num_directions, batch_size, hidden_size) (default: None)
:return: output, h_n
- output: The output features (h_t) from the last layer of the GRU, for each t, shape (seq_len, batch_size, num_directions * hidden_size) if batch_first=False, otherwise (batch_size, seq_len, num_directions * hidden_size)
- h_n: The hidden state for t = seq_len, shape (num_layers * num_directions, batch_size, hidden_size)
"""
self.h0 = self.h0.to(x.device)
output, h_n = self.gru(x, self.h0)
return output
# Test code
batch_size = 10
seq_len = 512
input_size = 128
hidden_size = 256
num_layers = 6
def get_inputs():
return [torch.randn(seq_len, batch_size, input_size)]
def get_init_inputs():
return [input_size, hidden_size, num_layers]
#include <torch/extension.h>
#include <torch/torch.h>
#include <cuda_runtime.h>
#include <vector>
#include <string>
// Kernel that maps the GRU output dimensions (seq_length, batch, hidden) to a 3D grid
__global__ void GRU_forward_kernel_3D(const float* __restrict__ input, float* __restrict__ output, int T, int B, int H) {
// Use 3D grid indexing: x -> hidden, y -> batch, z -> sequence length
int hid = blockIdx.x * blockDim.x + threadIdx.x; // hidden dimension index
int batch = blockIdx.y * blockDim.y + threadIdx.y; // batch dimension index
int seq = blockIdx.z; // sequence index (each block in z corresponds to one sequence element)
if (seq < T && batch < B && hid < H) {
// Compute the linear index assuming tensor layout: [T, B, H]
int index = seq * (B * H) + batch * H + hid;
output[index] = input[index];
}
}
// Forward function implementation using 3D grid indexing
torch::Tensor forward(
torch::Tensor x,
std::vector<torch::Tensor> gru_weights_ih,
std::vector<torch::Tensor> gru_weights_hh,
std::vector<torch::Tensor> gru_biases_ih,
std::vector<torch::Tensor> gru_biases_hh,
torch::Tensor h0,
bool is_training) {
// Ensure the initial hidden state is on the same device as input
h0 = h0.to(x.device());
// Retrieve dimensions and GRU parameters
size_t num_layers = gru_weights_ih.size();
int64_t input_size = x.size(2);
int64_t hidden_size = gru_weights_hh[0].size(1);
bool bidirectional = false;
bool batch_first = false;
// Set up GRU options and instantiate the GRU module
torch::nn::GRUOptions gru_options(input_size, hidden_size);
gru_options.num_layers(num_layers);
gru_options.bidirectional(bidirectional);
gru_options.batch_first(batch_first);
torch::nn::GRU gru(gru_options);
gru->to(x.device());
gru->train(is_training);
// Copy provided weights and biases into the GRU module
auto params = gru->named_parameters();
for (size_t l = 0; l < num_layers; ++l) {
std::string layer_str = std::to_string(l);
std::string w_ih_key = "weight_ih_l" + layer_str;
std::string w_hh_key = "weight_hh_l" + layer_str;
std::string b_ih_key = "bias_ih_l" + layer_str;
std::string b_hh_key = "bias_hh_l" + layer_str;
params[w_ih_key].copy_(gru_weights_ih[l]);
params[w_hh_key].copy_(gru_weights_hh[l]);
params[b_ih_key].copy_(gru_biases_ih[l]);
params[b_hh_key].copy_(gru_biases_hh[l]);
}
// Reshape h0 to match expected dimensions for GRU: (num_layers, batch, hidden)
h0 = h0.contiguous().view({static_cast<int64_t>(num_layers), x.size(1), hidden_size});
// Execute the GRU forward pass (using cuDNN optimized routines) on the default stream
auto gru_result = gru->forward(x, h0);
torch::Tensor result = std::get<0>(gru_result);
// Assume the GRU output tensor 'result' has the shape [T, B, H]
int T = result.size(0);
int B = result.size(1);
int H = result.size(2);
// Allocate output tensor
torch::Tensor output = torch::empty_like(result);
// Define block and grid dimensions for 3D indexing
// Block: 16 threads in hidden dim, 16 threads in batch dim, 1 thread in sequence dimension
dim3 blockDim(16, 16, 1);
// Grid: cover hidden and batch dimensions; sequence dimension is mapped directly to grid z
dim3 gridDim((H + blockDim.x - 1) / blockDim.x, (B + blockDim.y - 1) / blockDim.y, T);
// Launch the kernel
GRU_forward_kernel_3D<<<gridDim, blockDim>>>(result.data_ptr<float>(), output.data_ptr<float>(), T, B, H);
cudaDeviceSynchronize();
return output;
}
// Pybind11 module definition
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "GRU forward with 3D indexing kernel (CUDA)");
}
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 1.080 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 0.804 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 27.258 | % | 0.084 | 5 |
Issued Ipc Active | 1.092 | inst/cycle | 0.000 | 5 |
SM Busy | 27.258 | % | 0.084 | 5 |
Memory Throughput | 600941987173.670 | byte/second | 16183942865592496128.000 | 5 |
Mem Busy | 25.578 | % | 0.033 | 5 |
Max Bandwidth | 22.574 | % | 0.025 | 5 |
L1/TEX Hit Rate | 0.012 | % | 0.000 | 5 |
L2 Hit Rate | 53.230 | % | 0.017 | 5 |
Mem Pipes Busy | 52.896 | % | 0.107 | 5 |
Warp Cycles Per Issued Instruction | 34.862 | cycle | 0.073 | 5 |
Warp Cycles Per Executed Instruction | 35.288 | cycle | 0.072 | 5 |
Avg. Active Threads Per Warp | 32.000 | 0.000 | 5 | |
Avg. Not Predicated Off Threads Per Warp | 31.100 | 0.000 | 5 | |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 16.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 60.902 | % | 0.237 | 5 |
Achieved Active Warps Per SM | 38.978 | warp | 0.097 | 5 |
Rule | Description |
---|---|
WRN HighPipeUtilization | All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (60.7%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 857896.68 | μs |
Device Time | 61159.00 | μs |
Self CPU Time | 3050.37 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::copy_ | ||
CPU Time | 298935.54 | μs |
Device Time | 82823.72 | μs |
Self CPU Time | 23726.85 | μs |
Self Device Time | 82823.72 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::uniform_ | ||
CPU Time | 1537737.54 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 1537737.54 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 3473216.11 | μs |
Device Time | 3.23 | μs |
Self CPU Time | 3473216.11 | μs |
Self Device Time | 3.23 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::gru | ||
CPU Time | 5017658.29 | μs |
Device Time | 4463757.21 | μs |
Self CPU Time | 3848.94 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_cudnn_rnn | ||
CPU Time | 5012470.01 | μs |
Device Time | 4463757.21 | μs |
Self CPU Time | 1190211.53 | μs |
Self Device Time | 4463757.21 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void cutlass::Kernel<cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4>(cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4::Params) | ||
CPU Time | 0.00 | μs |
Device Time | 2876254.49 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 2876254.49 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void elemWiseRNNcell<float, float, float, (cudnnRNNMode_t)3, (cudnnRNNBiasMode_t)2>(int, int, int, int, int, bool, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float*, float*, float*, float*, float*, cudnnRNNClipMode_t, cudnnNanPropagation_t, float, float) | ||
CPU Time | 0.00 | μs |
Device Time | 1587502.72 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 1587502.72 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45290 warnings generated when compiling for host. Suppressed 45328 warnings (45281 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.