from typing import List
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import _VF
def module_fn(
x: torch.Tensor,
gru_weights_ih: List[torch.Tensor],
gru_weights_hh: List[torch.Tensor],
gru_biases_ih: List[torch.Tensor],
gru_biases_hh: List[torch.Tensor],
h0: torch.Tensor,
is_training: bool,
) -> torch.Tensor:
"""
Functional implementation of GRU
Args:
x: Input tensor of shape (seq_len, batch_size, input_size) if batch_first=False
gru_weights_ih: List of input-hidden weight tensors for each GRU layer
gru_weights_hh: List of hidden-hidden weight tensors for each GRU layer
gru_biases_ih: List of input-hidden bias tensors for each GRU layer
gru_biases_hh: List of hidden-hidden bias tensors for each GRU layer
h0: Initial hidden state
is_training: Whether in training mode
Returns:
output tensor of shape (seq_len, batch_size, hidden_size)
"""
h0 = h0.to(x.device)
# Run single GRU with all layers at once
output, _ = _VF.gru(
x,
h0,
[
w
for layer in zip(
gru_weights_ih, gru_weights_hh, gru_biases_ih, gru_biases_hh
)
for w in layer
],
True, # has_biases
len(gru_weights_ih), # num_layers
0.0, # dropout
is_training, # training
False, # bidirectional
False,
) # batch_first
return output
class Model(nn.Module):
def __init__(
self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False
):
"""
:param input_size: The number of expected features in the input x
:param hidden_size: The number of features in the hidden state h
:param num_layers: Number of recurrent layers (default: 1)
:param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
:param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
"""
super(Model, self).__init__()
# Create GRU and extract its parameters
gru = nn.GRU(
input_size,
hidden_size,
num_layers,
bias=bias,
batch_first=batch_first,
dropout=0,
bidirectional=False,
)
# Initialize h0 exactly as in original code
self.h0 = torch.randn((num_layers, batch_size, hidden_size))
# Extract and store GRU parameters
self.gru_weights_ih = nn.ParameterList()
self.gru_weights_hh = nn.ParameterList()
self.gru_biases_ih = nn.ParameterList()
self.gru_biases_hh = nn.ParameterList()
for i in range(num_layers):
self.gru_weights_ih.append(getattr(gru, f"weight_ih_l{i}"))
self.gru_weights_hh.append(getattr(gru, f"weight_hh_l{i}"))
if bias:
self.gru_biases_ih.append(getattr(gru, f"bias_ih_l{i}"))
self.gru_biases_hh.append(getattr(gru, f"bias_hh_l{i}"))
else:
self.gru_biases_ih.append(None)
self.gru_biases_hh.append(None)
def forward(self, x, fn=module_fn):
return fn(
x,
self.gru_weights_ih,
self.gru_weights_hh,
self.gru_biases_ih,
self.gru_biases_hh,
self.h0,
self.training,
)
# Test code
batch_size = 10
seq_len = 512
input_size = 128
hidden_size = 256
num_layers = 6
def get_inputs():
return [torch.randn(seq_len, batch_size, input_size)]
def get_init_inputs():
return [input_size, hidden_size, num_layers]
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False):
"""
:param input_size: The number of expected features in the input x
:param hidden_size: The number of features in the hidden state h
:param num_layers: Number of recurrent layers (default: 1)
:param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
:param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
"""
super(Model, self).__init__()
self.gru = nn.GRU(input_size, hidden_size, num_layers, bias, batch_first, dropout=0, bidirectional=False)
self.h0 = torch.randn((num_layers, batch_size, hidden_size))
def forward(self, x):
"""
:param x: The input tensor, shape (seq_len, batch_size, input_size) if batch_first=False, otherwise (batch_size, seq_len, input_size)
:param h_0: The initial hidden state for the input sequence, shape (num_layers * num_directions, batch_size, hidden_size) (default: None)
:return: output, h_n
- output: The output features (h_t) from the last layer of the GRU, for each t, shape (seq_len, batch_size, num_directions * hidden_size) if batch_first=False, otherwise (batch_size, seq_len, num_directions * hidden_size)
- h_n: The hidden state for t = seq_len, shape (num_layers * num_directions, batch_size, hidden_size)
"""
self.h0 = self.h0.to(x.device)
output, h_n = self.gru(x, self.h0)
return output
# Test code
batch_size = 10
seq_len = 512
input_size = 128
hidden_size = 256
num_layers = 6
def get_inputs():
return [torch.randn(seq_len, batch_size, input_size)]
def get_init_inputs():
return [input_size, hidden_size, num_layers]
#include <torch/extension.h>
#include <torch/torch.h>
#include <vector>
#include <cuda_runtime.h>
__global__ void GRU_forward_kernel_optimized(const float* __restrict__ input,
float* __restrict__ output,
int total_elements,
int hidden_size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
constexpr int UNROLL_FACTOR = 4;
int limit = total_elements - (total_elements % UNROLL_FACTOR);
for (int i = idx; i < limit; i += stride * UNROLL_FACTOR) {
#pragma unroll
for (int j = 0; j < UNROLL_FACTOR; j++) {
output[i + j] = input[i + j];
}
}
for (int i = limit + idx; i < total_elements; i += stride) {
output[i] = input[i];
}
}
torch::Tensor forward(
torch::Tensor x,
std::vector<torch::Tensor> gru_weights_ih,
std::vector<torch::Tensor> gru_weights_hh,
std::vector<torch::Tensor> gru_biases_ih,
std::vector<torch::Tensor> gru_biases_hh,
torch::Tensor h0,
bool is_training) {
h0 = h0.to(x.device());
size_t num_layers = gru_weights_ih.size();
int64_t input_size = x.size(2);
int64_t hidden_size = gru_weights_hh[0].size(1);
torch::nn::GRUOptions gru_options(input_size, hidden_size);
gru_options.num_layers(num_layers)
.bidirectional(false)
.batch_first(false);
torch::nn::GRU gru(gru_options);
gru->to(x.device());
gru->train(is_training);
auto params = gru->named_parameters();
constexpr int NUM_STREAMS = 4;
cudaStream_t streams[NUM_STREAMS];
for (int i = 0; i < NUM_STREAMS; i++) {
cudaStreamCreate(&streams[i]);
}
for (size_t l = 0; l < num_layers; ++l) {
std::string layer_str = std::to_string(l);
cudaStream_t& current_stream = streams[l % NUM_STREAMS];
std::string w_ih_key = "weight_ih_l" + layer_str;
std::string w_hh_key = "weight_hh_l" + layer_str;
std::string b_ih_key = "bias_ih_l" + layer_str;
std::string b_hh_key = "bias_hh_l" + layer_str;
cudaMemcpyAsync(params[w_ih_key].data_ptr(), gru_weights_ih[l].data_ptr(),
gru_weights_ih[l].numel() * sizeof(float),
cudaMemcpyHostToDevice, current_stream);
cudaMemcpyAsync(params[w_hh_key].data_ptr(), gru_weights_hh[l].data_ptr(),
gru_weights_hh[l].numel() * sizeof(float),
cudaMemcpyHostToDevice, current_stream);
cudaMemcpyAsync(params[b_ih_key].data_ptr(), gru_biases_ih[l].data_ptr(),
gru_biases_ih[l].numel() * sizeof(float),
cudaMemcpyHostToDevice, current_stream);
cudaMemcpyAsync(params[b_hh_key].data_ptr(), gru_biases_hh[l].data_ptr(),
gru_biases_hh[l].numel() * sizeof(float),
cudaMemcpyHostToDevice, current_stream);
}
for (int i = 0; i < NUM_STREAMS; i++) {
cudaStreamSynchronize(streams[i]);
}
h0 = h0.contiguous().view({static_cast<int64_t>(num_layers), x.size(1), hidden_size});
auto gru_result = gru->forward(x, h0);
torch::Tensor result = std::get<0>(gru_result);
auto output = torch::empty_like(result);
int total_elements = output.numel();
int threads = 256;
int blocks = (total_elements + threads - 1) / threads;
GRU_forward_kernel_optimized<<<blocks, threads, 0, streams[0]>>>(
result.data_ptr<float>(),
output.data_ptr<float>(),
total_elements,
hidden_size
);
for (int i = 0; i < NUM_STREAMS; i++) {
cudaStreamDestroy(streams[i]);
}
return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "Optimized GRU forward with streams and unrolling (CUDA)");
}
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 2.648 | inst/cycle | 0.000 | 5 |
Executed Ipc Elapsed | 1.922 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 66.616 | % | 0.097 | 5 |
Issued Ipc Active | 2.666 | inst/cycle | 0.000 | 5 |
SM Busy | 66.616 | % | 0.097 | 5 |
Memory Throughput | 620851522931.784 | byte/second | 34475254836632956928.000 | 5 |
Mem Busy | 28.028 | % | 0.053 | 5 |
Max Bandwidth | 37.098 | % | 0.131 | 5 |
L1/TEX Hit Rate | 75.172 | % | 0.002 | 5 |
L2 Hit Rate | 68.468 | % | 0.121 | 5 |
Mem Pipes Busy | 28.786 | % | 0.066 | 5 |
Warp Cycles Per Issued Instruction | 18.804 | cycle | 0.008 | 5 |
Warp Cycles Per Executed Instruction | 18.924 | cycle | 0.008 | 5 |
Avg. Active Threads Per Warp | 32.000 | 0.000 | 5 | |
Avg. Not Predicated Off Threads Per Warp | 29.270 | 0.000 | 5 | |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 8.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 64.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 100.000 | % | 0.000 | 5 |
Achieved Occupancy | 79.106 | % | 0.069 | 5 |
Achieved Active Warps Per SM | 50.630 | warp | 0.028 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (45.4%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated theoretical (100.0%) and measured achieved occupancy (79.5%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 986721.53 | μs |
Device Time | 88908.50 | μs |
Self CPU Time | 4289.25 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::copy_ | ||
CPU Time | 403861.07 | μs |
Device Time | 104895.29 | μs |
Self CPU Time | 23167.69 | μs |
Self Device Time | 104895.29 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::uniform_ | ||
CPU Time | 2216616.92 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 2216616.92 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 5198230.12 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 5198230.12 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::gru | ||
CPU Time | 7414585.03 | μs |
Device Time | 6503308.29 | μs |
Self CPU Time | 6064.06 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_cudnn_rnn | ||
CPU Time | 7406454.46 | μs |
Device Time | 6503308.29 | μs |
Self CPU Time | 1737438.96 | μs |
Self Device Time | 6503308.29 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void cutlass::Kernel<cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4>(cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4::Params) | ||
CPU Time | 0.00 | μs |
Device Time | 4149771.25 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 4149771.25 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void elemWiseRNNcell<float, float, float, (cudnnRNNMode_t)3, (cudnnRNNBiasMode_t)2>(int, int, int, int, int, bool, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float*, float*, float*, float*, float*, cudnnRNNClipMode_t, cudnnNanPropagation_t, float, float) | ||
CPU Time | 0.00 | μs |
Device Time | 2353543.31 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 2353543.31 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45289 warnings generated when compiling for host. Suppressed 45328 warnings (45281 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.