12 | const float* __restrict__ gates,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 | const float* __restrict__ prev_c,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:12:31: note: the first parameter in the range is 'gates'
12 | const float* __restrict__ gates,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:13:31: note: the last parameter in the range is 'prev_c'
13 | const float* __restrict__ prev_c,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:14:5: warning: 2 adjacent parameters of 'lstm_elementwise_forward' of similar type ('float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
14 | float* __restrict__ h,
| ^~~~~~~~~~~~~~~~~~~~~~
15 | float* __restrict__ c,
| ~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:14:25: note: the first parameter in the range is 'h'
14 | float* __restrict__ h,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:15:25: note: the last parameter in the range is 'c'
15 | float* __restrict__ c,
| ^
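One way this bugprone-easily-swappable-parameters finding is commonly handled is to group the two adjacent output pointers so call sites have to name them; the sketch below is hypothetical (the struct name CellOutputs is not in the original source) and only illustrates the idea.

    // Hypothetical wrapper: grouping the two output pointers documents their
    // roles and removes the adjacent float* pair flagged above.
    struct CellOutputs {
        float* h;  // hidden-state output
        float* c;  // cell-state output
    };

If the existing parameter order is intentional, the check can instead be suppressed in place with // NOLINT(bugprone-easily-swappable-parameters) on the flagged parameters.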
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:19:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
19 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:20:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
20 | const int bid = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:21:29: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | const int num_threads = blockDim.x * gridDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:25:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
25 | for (int idx = bid * blockDim.x + tid; idx < total_elements; idx += num_threads) {
| ^
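A minimal sketch of one way to address the narrowing warnings above: make the unsigned-to-int conversions explicit, keeping the indices as int as the rest of the kernel appears to assume.

    const int tid = static_cast<int>(threadIdx.x);
    const int bid = static_cast<int>(blockIdx.x);
    const int num_threads = static_cast<int>(blockDim.x * gridDim.x);

    for (int idx = bid * static_cast<int>(blockDim.x) + tid;
         idx < total_elements;
         idx += num_threads) {
        // ... loop body unchanged ...
    }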
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:47:5: warning: 3 adjacent parameters of 'linear_forward_balanced' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
47 | const float* __restrict__ input,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
48 | const float* __restrict__ weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
49 | const float* __restrict__ bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:47:31: note: the first parameter in the range is 'input'
47 | const float* __restrict__ input,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:49:31: note: the last parameter in the range is 'bias'
49 | const float* __restrict__ bias,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:51:5: warning: 2 adjacent parameters of 'linear_forward_balanced' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
51 | const int batch_size,
| ^~~~~~~~~~~~~~~~~~~~~
52 | const int in_features,
| ~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:51:15: note: the first parameter in the range is 'batch_size'
51 | const int batch_size,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:52:15: note: the last parameter in the range is 'in_features'
52 | const int in_features,
| ^~~~~~~~~~~
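For the adjacent const float* and const int parameters of linear_forward_balanced, the usual options are to suppress the check (when the flat signature is deliberate) or to bundle the dimensions so they cannot be swapped. A hypothetical sketch of the latter (LinearDims is not in the original source):

    // Hypothetical: passing the dimensions as one struct removes the
    // adjacent-int ambiguity flagged at lines 51-52.
    struct LinearDims {
        int batch_size;
        int in_features;
        int out_features;
    };
    // The pointer parameters (input, weight, bias, ...) can stay as-is,
    // optionally with // NOLINT(bugprone-easily-swappable-parameters).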
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:57:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
57 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:58:15: warning: Value stored to 'wid' during its initialization is never read [clang-analyzer-deadcode.DeadStores]
58 | const int wid = tid / 32; // warp ID
| ^~~ ~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:58:15: note: Value stored to 'wid' during its initialization is never read
58 | const int wid = tid / 32; // warp ID
| ^~~ ~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:60:15: warning: Value stored to 'num_warps' during its initialization is never read [clang-analyzer-deadcode.DeadStores]
60 | const int num_warps = blockDim.x / 32;
| ^~~~~~~~~ ~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:60:15: note: Value stored to 'num_warps' during its initialization is never read
60 | const int num_warps = blockDim.x / 32;
| ^~~~~~~~~ ~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:60:27: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
60 | const int num_warps = blockDim.x / 32;
| ^
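The two dead-store findings mean 'wid' and 'num_warps' are computed but never read; the simplest fix is to delete those declarations, which also removes the narrowing warning at 60:27. Sketch:

    const int tid = static_cast<int>(threadIdx.x);
    // const int wid = tid / 32;               // removed: never read
    // const int num_warps = blockDim.x / 32;  // removed: never read
    // (the rest of the kernel body is unchanged)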
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:62:24: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
62 | for (int out_idx = blockIdx.x; out_idx < batch_size * out_features; out_idx += gridDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:62:84: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
62 | for (int out_idx = blockIdx.x; out_idx < batch_size * out_features; out_idx += gridDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:67:31: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
67 | const float* in_row = input + batch * in_features;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:67:39: note: make conversion explicit to silence this warning
   67 |       const float* in_row = input + batch * in_features;
| ^~~~~~~~~~~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:67:39: note: perform multiplication in a wider type
67 | const float* in_row = input + batch * in_features;
| ^~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:68:30: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
68 | const float* w_row = weight + feat * in_features;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:68:39: note: make conversion explicit to silence this warning
68 | const float* w_row = weight + feat * in_features;
| ^~~~~~~~~~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:68:39: note: perform multiplication in a wider type
68 | const float* w_row = weight + feat * in_features;
| ^~~~
| static_cast<ptrdiff_t>( )
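Applying the fix-its suggested above, i.e. performing the offset multiplication in a wider type before it is used as a pointer offset:

    const float* in_row = input  + static_cast<ptrdiff_t>(batch) * in_features;
    const float* w_row  = weight + static_cast<ptrdiff_t>(feat)  * in_features;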
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:93:5: warning: 2 adjacent parameters of 'lstm_forward_cuda' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
93 | torch::Tensor input,
| ^~~~~~~~~~~~~~~~~~~~
94 | torch::Tensor w_ih,
| ~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:93:19: note: the first parameter in the range is 'input'
93 | torch::Tensor input,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:94:19: note: the last parameter in the range is 'w_ih'
94 | torch::Tensor w_ih,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:93:19: warning: the parameter 'input' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
93 | torch::Tensor input,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:94:19: warning: the parameter 'w_ih' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
94 | torch::Tensor w_ih,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:95:5: warning: 4 adjacent parameters of 'lstm_forward_cuda' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
95 | torch::Tensor w_hh,
| ^~~~~~~~~~~~~~~~~~~
96 | torch::Tensor b_ih,
| ~~~~~~~~~~~~~~~~~~~
97 | torch::Tensor b_hh,
| ~~~~~~~~~~~~~~~~~~~
98 | torch::Tensor h0,
| ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:95:19: note: the first parameter in the range is 'w_hh'
95 | torch::Tensor w_hh,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:98:19: note: the last parameter in the range is 'h0'
98 | torch::Tensor h0,
| ^~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:95:19: warning: the parameter 'w_hh' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
95 | torch::Tensor w_hh,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:96:19: warning: the parameter 'b_ih' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
96 | torch::Tensor b_ih,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:97:19: warning: the parameter 'b_hh' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
97 | torch::Tensor b_hh,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:98:19: warning: the parameter 'h0' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
98 | torch::Tensor h0,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:99:19: warning: the parameter 'c0' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
99 | torch::Tensor c0
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:101:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
101 | const int batch_size = input.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:102:25: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
102 | const int seq_len = input.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:103:29: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
103 | const int hidden_size = h0.size(1);
| ^
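A sketch that combines the fix-its for lstm_forward_cuda (the return type is assumed, and only the parameters visible in this log are shown): take the tensors by const reference and make the int64_t-to-int conversions explicit.

    torch::Tensor lstm_forward_cuda(
        const torch::Tensor& input,
        const torch::Tensor& w_ih,
        const torch::Tensor& w_hh,
        const torch::Tensor& b_ih,
        const torch::Tensor& b_hh,
        const torch::Tensor& h0,
        const torch::Tensor& c0
    ) {
        const int batch_size  = static_cast<int>(input.size(0));
        const int seq_len     = static_cast<int>(input.size(1));
        const int hidden_size = static_cast<int>(h0.size(1));
        // ... rest of the function unchanged ...
    }

The adjacent-tensor-parameter findings (input/w_ih and w_hh..h0) remain after this change; they can be suppressed with NOLINT if the signature has to keep this order.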
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:137:19: warning: the parameter 'input' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
137 | torch::Tensor input,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:138:19: warning: the parameter 'weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
138 | torch::Tensor weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:139:19: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
139 | torch::Tensor bias
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:141:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
141 | const int batch_size = input.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:142:29: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
142 | const int in_features = input.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:143:30: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
143 | const int out_features = weight.size(0);
| ^
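The same pattern applies to the function declared at base.cu:137 (apparently linear_forward_cuda, given its input/weight/bias parameters and the call flagged at line 194); the return type is assumed:

    torch::Tensor linear_forward_cuda(
        const torch::Tensor& input,
        const torch::Tensor& weight,
        const torch::Tensor& bias
    ) {
        const int batch_size   = static_cast<int>(input.size(0));
        const int in_features  = static_cast<int>(input.size(1));
        const int out_features = static_cast<int>(weight.size(0));
        // ...
    }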
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:164:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
164 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:165:5: warning: 4 adjacent parameters of 'forward' of similar type ('std::vector<torch::Tensor>') are easily swapped by mistake [bugprone-easily-swappable-parameters]
165 | std::vector<torch::Tensor> lstm_weights_ih,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
166 | std::vector<torch::Tensor> lstm_weights_hh,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
167 | std::vector<torch::Tensor> lstm_biases_ih,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
168 | std::vector<torch::Tensor> lstm_biases_hh,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:165:32: note: the first parameter in the range is 'lstm_weights_ih'
165 | std::vector<torch::Tensor> lstm_weights_ih,
| ^~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:168:32: note: the last parameter in the range is 'lstm_biases_hh'
168 | std::vector<torch::Tensor> lstm_biases_hh,
| ^~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:170:5: warning: 2 adjacent parameters of 'forward' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
170 | torch::Tensor fc_bias,
| ^~~~~~~~~~~~~~~~~~~~~~
171 | torch::Tensor h0,
| ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:170:19: note: the first parameter in the range is 'fc_bias'
170 | torch::Tensor fc_bias,
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:171:19: note: the last parameter in the range is 'h0'
171 | torch::Tensor h0,
| ^~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:179:28: warning: narrowing conversion from 'size_type' (aka 'unsigned long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
179 | const int num_layers = lstm_weights_ih.size();
| ^
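A consolidated sketch for forward(); only the parameters visible in this log are shown, fc_weight's position is inferred from the fix-its at line 194, and any remaining parameters are omitted:

    torch::Tensor forward(
        const torch::Tensor& x,                     // was copied on every call
        std::vector<torch::Tensor> lstm_weights_ih,
        std::vector<torch::Tensor> lstm_weights_hh,
        std::vector<torch::Tensor> lstm_biases_ih,
        std::vector<torch::Tensor> lstm_biases_hh,
        torch::Tensor fc_weight,  // kept by value so it can be moved (line 194)
        torch::Tensor fc_bias,    // kept by value so it can be moved (line 194)
        const torch::Tensor& h0
        /* , remaining parameters unchanged */
    ) {
        const int num_layers = static_cast<int>(lstm_weights_ih.size());
        // ...
    }

Taking the four weight/bias vectors by const reference would additionally avoid copying them, although no copy warning appears for them in this excerpt; the easily-swappable findings for those vectors and for fc_bias/h0 are typically accepted or suppressed with NOLINT when the argument order is fixed by the caller.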
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:194:36: warning: parameter 'fc_weight' is passed by value and only copied once; consider moving it to avoid unnecessary copies [performance-unnecessary-value-param]
  194 |     out = linear_forward_cuda(out, fc_weight, fc_bias);
| ^
| std::move( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_3/task_35/b5_s2_35_lstm_workload_balanced/base/base.cu:194:47: warning: parameter 'fc_bias' is passed by value and only copied once; consider moving it to avoid unnecessary copies [performance-unnecessary-value-param]
194 | out = linear_forward_cuda(out, fc_weight, fc_bias);
| ^
| std::move( )
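Sketch of the change suggested by the two fix-its above (requires <utility> for std::move; fc_weight and fc_bias must remain by-value parameters of forward for the moves to avoid the extra copies):

    out = linear_forward_cuda(out, std::move(fc_weight), std::move(fc_bias));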