from typing import List
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import _VF
def module_fn(
x: torch.Tensor,
gru_weights_ih: List[torch.Tensor],
gru_weights_hh: List[torch.Tensor],
gru_biases_ih: List[torch.Tensor],
gru_biases_hh: List[torch.Tensor],
h0: torch.Tensor,
is_training: bool,
) -> torch.Tensor:
"""
Functional implementation of GRU
Args:
x: Input tensor of shape (seq_len, batch_size, input_size) if batch_first=False
gru_weights_ih: List of input-hidden weight tensors for each GRU layer
gru_weights_hh: List of hidden-hidden weight tensors for each GRU layer
gru_biases_ih: List of input-hidden bias tensors for each GRU layer
gru_biases_hh: List of hidden-hidden bias tensors for each GRU layer
h0: Initial hidden state
is_training: Whether in training mode
Returns:
output tensor of shape (seq_len, batch_size, hidden_size)
"""
h0 = h0.to(x.device)
# Run single GRU with all layers at once
output, _ = _VF.gru(
x,
h0,
[
w
for layer in zip(
gru_weights_ih, gru_weights_hh, gru_biases_ih, gru_biases_hh
)
for w in layer
],
True, # has_biases
len(gru_weights_ih), # num_layers
0.0, # dropout
is_training, # training
False, # bidirectional
False,
) # batch_first
return output
class Model(nn.Module):
def __init__(
self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False
):
"""
:param input_size: The number of expected features in the input x
:param hidden_size: The number of features in the hidden state h
:param num_layers: Number of recurrent layers (default: 1)
:param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
:param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
"""
super(Model, self).__init__()
# Create GRU and extract its parameters
gru = nn.GRU(
input_size,
hidden_size,
num_layers,
bias=bias,
batch_first=batch_first,
dropout=0,
bidirectional=False,
)
# Initialize h0 exactly as in original code
self.h0 = torch.randn((num_layers, batch_size, hidden_size))
# Extract and store GRU parameters
self.gru_weights_ih = nn.ParameterList()
self.gru_weights_hh = nn.ParameterList()
self.gru_biases_ih = nn.ParameterList()
self.gru_biases_hh = nn.ParameterList()
for i in range(num_layers):
self.gru_weights_ih.append(getattr(gru, f"weight_ih_l{i}"))
self.gru_weights_hh.append(getattr(gru, f"weight_hh_l{i}"))
if bias:
self.gru_biases_ih.append(getattr(gru, f"bias_ih_l{i}"))
self.gru_biases_hh.append(getattr(gru, f"bias_hh_l{i}"))
else:
self.gru_biases_ih.append(None)
self.gru_biases_hh.append(None)
def forward(self, x, fn=module_fn):
return fn(
x,
self.gru_weights_ih,
self.gru_weights_hh,
self.gru_biases_ih,
self.gru_biases_hh,
self.h0,
self.training,
)
# Test code
batch_size = 10
seq_len = 512
input_size = 128
hidden_size = 256
num_layers = 6
def get_inputs():
return [torch.randn(seq_len, batch_size, input_size)]
def get_init_inputs():
return [input_size, hidden_size, num_layers]
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self, input_size, hidden_size, num_layers=3, bias=True, batch_first=False):
"""
:param input_size: The number of expected features in the input x
:param hidden_size: The number of features in the hidden state h
:param num_layers: Number of recurrent layers (default: 1)
:param bias: If False, then the layer does not use bias weights b_ih and b_hh (default: True)
:param batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) (default: False)
"""
super(Model, self).__init__()
self.gru = nn.GRU(input_size, hidden_size, num_layers, bias, batch_first, dropout=0, bidirectional=False)
self.h0 = torch.randn((num_layers, batch_size, hidden_size))
def forward(self, x):
"""
:param x: The input tensor, shape (seq_len, batch_size, input_size) if batch_first=False, otherwise (batch_size, seq_len, input_size)
:param h_0: The initial hidden state for the input sequence, shape (num_layers * num_directions, batch_size, hidden_size) (default: None)
:return: output, h_n
- output: The output features (h_t) from the last layer of the GRU, for each t, shape (seq_len, batch_size, num_directions * hidden_size) if batch_first=False, otherwise (batch_size, seq_len, num_directions * hidden_size)
- h_n: The hidden state for t = seq_len, shape (num_layers * num_directions, batch_size, hidden_size)
"""
self.h0 = self.h0.to(x.device)
output, h_n = self.gru(x, self.h0)
return output
# Test code
batch_size = 10
seq_len = 512
input_size = 128
hidden_size = 256
num_layers = 6
def get_inputs():
return [torch.randn(seq_len, batch_size, input_size)]
def get_init_inputs():
return [input_size, hidden_size, num_layers]
#include <torch/extension.h>
#include <torch/torch.h>
#include <cuda_runtime.h>
#include <vector>
#include <cstdio>
// Optimized kernel that combines even workload distribution with vectorized memory accesses
__global__ void optimized_copy_kernel(const float* __restrict__ in, float* __restrict__ out, int total) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int total_threads = gridDim.x * blockDim.x;
// Evenly split the work among threads (each gets a contiguous chunk)
int base = total / total_threads;
int rem = total % total_threads;
int start, count;
if (tid < rem) {
count = base + 1;
start = tid * count;
} else {
count = base;
start = tid * base + rem;
}
// Check if the starting pointer is 16-byte aligned (needed for float4 vectorized loads)
bool aligned = ((((uintptr_t)in) + start * sizeof(float)) & 0xF) == 0;
if (aligned && count >= 4) {
// Process as many groups of 4 as possible with vectorized copy
int nvec = count / 4; // number of float4 iterations
int tail = count % 4; // remaining elements
// Cast pointers to float4
const float4* in_vec = reinterpret_cast<const float4*>(in);
float4* out_vec = reinterpret_cast<float4*>(out);
// The index for vectorized processing
int vec_index = start / 4;
for (int i = 0; i < nvec; i++) {
out_vec[vec_index + i] = in_vec[vec_index + i];
}
// Process remaining tail elements elementwise
int elem_index = start + nvec * 4;
for (int i = 0; i < tail; i++) {
out[elem_index + i] = in[elem_index + i];
}
} else {
// Fallback to elementwise copy if not aligned or insufficient elements for vectorization
for (int i = start; i < start + count; i++) {
out[i] = in[i];
}
}
}
// Forward function that uses the cuDNN-accelerated GRU module and then applies the optimized copy kernel.
// This function sets up the GRU using the given weights and biases, runs the forward pass, and then
// copies the result using our efficient CUDA kernel.
torch::Tensor forward(
torch::Tensor x,
std::vector<torch::Tensor> gru_weights_ih,
std::vector<torch::Tensor> gru_weights_hh,
std::vector<torch::Tensor> gru_biases_ih,
std::vector<torch::Tensor> gru_biases_hh,
torch::Tensor h0,
bool is_training) {
// Ensure initial hidden state is on the same device
h0 = h0.to(x.device());
size_t num_layers = gru_weights_ih.size();
int64_t input_size = x.size(2);
int64_t hidden_size = gru_weights_hh[0].size(1);
bool bidirectional = false;
bool batch_first = false;
// Create GRU options and initialize the GRU module (cuDNN optimized under the hood)
torch::nn::GRUOptions gru_options(input_size, hidden_size);
gru_options.num_layers(num_layers);
gru_options.bidirectional(bidirectional);
gru_options.batch_first(batch_first);
torch::nn::GRU gru(gru_options);
gru->to(x.device());
gru->train(is_training);
// Copy provided weights and biases into the GRU module's parameters
auto params = gru->named_parameters();
for (size_t l = 0; l < num_layers; ++l) {
std::string layer_str = std::to_string(l);
params["weight_ih_l" + layer_str].copy_(gru_weights_ih[l]);
params["weight_hh_l" + layer_str].copy_(gru_weights_hh[l]);
params["bias_ih_l" + layer_str].copy_(gru_biases_ih[l]);
params["bias_hh_l" + layer_str].copy_(gru_biases_hh[l]);
}
// Reshape h0 to match expected dimensions: (num_layers, batch, hidden_size)
h0 = h0.contiguous().view({static_cast<int64_t>(num_layers), x.size(1), hidden_size});
// Execute the GRU forward pass (cuDNN accelerated)
auto gru_result = gru->forward(x, h0);
torch::Tensor result = std::get<0>(gru_result);
// Allocate output tensor of the same shape
torch::Tensor output = torch::empty_like(result);
int total_elements = output.numel();
// Launch the optimized kernel
int threads = 256;
int blocks = (total_elements + threads - 1) / threads;
optimized_copy_kernel<<<blocks, threads>>>(result.data_ptr<float>(), output.data_ptr<float>(), total_elements);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA kernel launch error: %s\n", cudaGetErrorString(err));
}
cudaDeviceSynchronize();
return output;
}
// Pybind11 module definition
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "GRU forward with optimized copy kernel (CUDA)");
}
Metric | Value | Unit | Variance | Samples |
---|---|---|---|---|
Executed Ipc Active | 2.414 | inst/cycle | 0.001 | 5 |
Executed Ipc Elapsed | 1.856 | inst/cycle | 0.000 | 5 |
Issue Slots Busy | 60.900 | % | 0.431 | 5 |
Issued Ipc Active | 2.434 | inst/cycle | 0.001 | 5 |
SM Busy | 60.900 | % | 0.431 | 5 |
Memory Throughput | 548883498639.302 | byte/second | 13762310464949231616.000 | 5 |
Mem Busy | 14.228 | % | 0.024 | 5 |
Max Bandwidth | 17.446 | % | 0.022 | 5 |
L1/TEX Hit Rate | 0.000 | % | 0.000 | 5 |
L2 Hit Rate | 53.478 | % | 0.141 | 5 |
Mem Pipes Busy | 17.138 | % | 0.017 | 5 |
Warp Cycles Per Issued Instruction | 15.570 | cycle | 0.002 | 5 |
Warp Cycles Per Executed Instruction | 15.718 | cycle | 0.002 | 5 |
Avg. Active Threads Per Warp | 32.000 | 0.000 | 5 | |
Avg. Not Predicated Off Threads Per Warp | 28.800 | 0.000 | 5 | |
Max Active Clusters | 0.000 | cluster | 0.000 | 5 |
Max Cluster Size | 8.000 | block | 0.000 | 5 |
Overall GPU Occupancy | 0.000 | % | 0.000 | 5 |
Cluster Occupancy | 0.000 | % | 0.000 | 5 |
Block Limit SM | 32.000 | block | 0.000 | 5 |
Block Limit Registers | 6.000 | block | 0.000 | 5 |
Block Limit Shared Mem | 32.000 | block | 0.000 | 5 |
Block Limit Warps | 8.000 | block | 0.000 | 5 |
Theoretical Active Warps per SM | 48.000 | warp | 0.000 | 5 |
Theoretical Occupancy | 75.000 | % | 0.000 | 5 |
Achieved Occupancy | 60.294 | % | 0.045 | 5 |
Achieved Active Warps Per SM | 38.590 | warp | 0.019 | 5 |
Rule | Description |
---|---|
INF HighPipeUtilization | ALU is the highest-utilized pipeline (44.5%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck. |
INF CPIStall | Check the Warp Stall Sampling (All Cycles) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason. |
WRN Occupancy | This kernel's theoretical occupancy (75.0%) is limited by the number of required registers. The difference between calculated theoretical (75.0%) and measured achieved occupancy (60.0%) can be the result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on optimizing occupancy. |
Operation / Metric | Value | Unit |
---|---|---|
aten::to | ||
CPU Time | 682996.05 | μs |
Device Time | 91531.57 | μs |
Self CPU Time | 4232.55 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::copy_ | ||
CPU Time | 443832.99 | μs |
Device Time | 124545.30 | μs |
Self CPU Time | 33232.60 | μs |
Self Device Time | 124545.30 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::uniform_ | ||
CPU Time | 2267942.72 | μs |
Device Time | 0.00 | μs |
Self CPU Time | 2267942.72 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
cudaLaunchKernel | ||
CPU Time | 5108691.55 | μs |
Device Time | 4.80 | μs |
Self CPU Time | 5108691.55 | μs |
Self Device Time | 4.80 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::gru | ||
CPU Time | 7342530.61 | μs |
Device Time | 6854505.61 | μs |
Self CPU Time | 5549.21 | μs |
Self Device Time | 0.00 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
aten::_cudnn_rnn | ||
CPU Time | 7335112.23 | μs |
Device Time | 6854505.61 | μs |
Self CPU Time | 1733597.82 | μs |
Self Device Time | 6854505.61 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void cutlass::Kernel<cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4>(cutlass_80_tensorop_s1688gemm_64x64_32x6_tn_align4::Params) | ||
CPU Time | 0.00 | μs |
Device Time | 4418624.09 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 4418624.09 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
void elemWiseRNNcell<float, float, float, (cudnnRNNMode_t)3, (cudnnRNNBiasMode_t)2>(int, int, int, int, int, bool, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float*, float*, float*, float*, float*, cudnnRNNClipMode_t, cudnnNanPropagation_t, float, float) | ||
CPU Time | 0.00 | μs |
Device Time | 2435881.52 | μs |
Self CPU Time | 0.00 | μs |
Self Device Time | 2435881.52 | μs |
CPU Memory Usage | 0 | B |
Device Memory Usage | 0 | B |
Self CPU Memory Usage | 0 | B |
Self Device Memory Usage | 0 | B |
45287 warnings generated when compiling for host. Suppressed 45328 warnings (45281 in non-user code, 47 NOLINT). Use -header-filter=.* to display errors from all non-system headers. Use -system-headers to display errors from system headers as well.