11 | const float* __restrict__ gates,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12 | const float* __restrict__ prev_c,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:11:31: note: the first parameter in the range is 'gates'
11 | const float* __restrict__ gates,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:12:31: note: the last parameter in the range is 'prev_c'
12 | const float* __restrict__ prev_c,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:13:5: warning: 2 adjacent parameters of 'lstm_elementwise_stride' of similar type ('float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
13 | float* __restrict__ h,
| ^~~~~~~~~~~~~~~~~~~~~~
14 | float* __restrict__ c,
| ~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:13:25: note: the first parameter in the range is 'h'
13 | float* __restrict__ h,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:14:25: note: the last parameter in the range is 'c'
14 | float* __restrict__ c,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:19:24: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
19 | const int stride = blockDim.x * gridDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:21:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | for(int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:40:5: warning: 3 adjacent parameters of 'linear_stride_kernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
40 | const float* __restrict__ input,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 | const float* __restrict__ weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
42 | const float* __restrict__ bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:40:31: note: the first parameter in the range is 'input'
40 | const float* __restrict__ input,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:42:31: note: the last parameter in the range is 'bias'
42 | const float* __restrict__ bias,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:44:5: warning: 2 adjacent parameters of 'linear_stride_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
44 | int in_dim,
| ^~~~~~~~~~~
45 | int out_dim,
| ~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:44:9: note: the first parameter in the range is 'in_dim'
44 | int in_dim,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:45:9: note: the last parameter in the range is 'out_dim'
45 | int out_dim,
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:48:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
48 | const int gid = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:49:24: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
49 | const int stride = blockDim.x * gridDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:56:31: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
56 | const float* w_row = &weight[o * in_dim];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:56:38: note: make conversion explicit to silence this warning
56 | const float* w_row = &weight[o * in_dim];
| ^~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:56:38: note: perform multiplication in a wider type
56 | const float* w_row = &weight[o * in_dim];
| ^
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:57:31: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
57 | const float* x_row = &input[b * in_dim];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:57:37: note: make conversion explicit to silence this warning
57 | const float* x_row = &input[b * in_dim];
| ^~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:57:37: note: perform multiplication in a wider type
57 | const float* x_row = &input[b * in_dim];
| ^
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:71:19: warning: the parameter 'input' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
71 | torch::Tensor input,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:72:19: warning: the parameter 'w_ih' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
72 | torch::Tensor w_ih,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:73:19: warning: the parameter 'w_hh' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
73 | torch::Tensor w_hh,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:74:19: warning: the parameter 'b_ih' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
74 | torch::Tensor b_ih,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:75:5: warning: 2 adjacent parameters of 'lstm_forward_stride' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
75 | torch::Tensor b_hh,
| ^~~~~~~~~~~~~~~~~~~
76 | torch::Tensor h0,
| ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:75:19: note: the first parameter in the range is 'b_hh'
75 | torch::Tensor b_hh,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:76:19: note: the last parameter in the range is 'h0'
76 | torch::Tensor h0,
| ^~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:75:19: warning: the parameter 'b_hh' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
75 | torch::Tensor b_hh,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:76:19: warning: the parameter 'h0' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
76 | torch::Tensor h0,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:77:19: warning: the parameter 'c0' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
77 | torch::Tensor c0
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:79:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
79 | const int batch_size = input.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:80:25: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
80 | const int seq_len = input.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:81:29: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
81 | const int hidden_size = h0.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:112:19: warning: the parameter 'input' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
112 | torch::Tensor input,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:113:19: warning: the parameter 'weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
113 | torch::Tensor weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:114:19: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
114 | torch::Tensor bias
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:116:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
116 | const int batch_size = input.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:117:24: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
117 | const int in_dim = input.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:118:25: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
118 | const int out_dim = weight.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:140:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
140 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:146:5: warning: 2 adjacent parameters of 'forward' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
146 | torch::Tensor fc_bias,
| ^~~~~~~~~~~~~~~~~~~~~~
147 | torch::Tensor h0,
| ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:146:19: note: the first parameter in the range is 'fc_bias'
146 | torch::Tensor fc_bias,
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:147:19: note: the last parameter in the range is 'h0'
147 | torch::Tensor h0,
| ^~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:155:24: warning: narrowing conversion from 'size_type' (aka 'unsigned long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
155 | const int layers = lstm_weights_ih.size();
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:169:53: warning: parameter 'fc_weight' is passed by value and only copied once; consider moving it to avoid unnecessary copies [performance-unnecessary-value-param]
169 | return linear_forward_stride(out.select(1, -1), fc_weight, fc_bias);
| ^
| std::move( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s1_35_lstm_grid_stride_base/base/base.cu:169:64: warning: parameter 'fc_bias' is passed by value and only copied once; consider moving it to avoid unnecessary copies [performance-unnecessary-value-param]
169 | return linear_forward_stride(out.select(1, -1), fc_weight, fc_bias);
| ^
| std::move( )