16 | const int total_windows, // total number of output pooling windows
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
17 | const int N, const int C, const int D, const int H, const int W,
| ~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:16:15: note: the first parameter in the range is 'total_windows'
16 | const int total_windows, // total number of output pooling windows
| ^~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:17:28: note: the last parameter in the range is 'C'
17 | const int N, const int C, const int D, const int H, const int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:17:57: warning: 2 adjacent parameters of 'blocksize_maxpool_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
17 | const int N, const int C, const int D, const int H, const int W,
| ^~~~~~~~~~~~
18 | const int outD, const int outH, const int outW
| ~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:17:67: note: the first parameter in the range is 'W'
17 | const int N, const int C, const int D, const int H, const int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:18:15: note: the last parameter in the range is 'outD'
18 | const int outD, const int outH, const int outW
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:21:29: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | int windows_per_block = blockDim.x / 64;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:28:17: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
28 | int group = threadIdx.x / 64; // group index [0, windows_per_block - 1]
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:29:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
29 | int lane = threadIdx.x % 64; // lane within the group [0,63]
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:37:24: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
37 | int base_win_idx = blockIdx.x * windows_per_block + group;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:40:74: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
40 | for (int win_idx = base_win_idx; win_idx < total_windows; win_idx += gridDim.x * windows_per_block) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:87:37: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
87 | float final_val = fmaxf(shared_data[group * 2], shared_data[group * 2 + 1]);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:87:49: note: make conversion explicit to silence this warning
87 | float final_val = fmaxf(shared_data[group * 2], shared_data[group * 2 + 1]);
| ^~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:87:49: note: perform multiplication in a wider type
87 | float final_val = fmaxf(shared_data[group * 2], shared_data[group * 2 + 1]);
| ^~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:110:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
110 | const int N = softmax_output.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:111:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
111 | const int C = softmax_output.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:112:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
112 | const int D = softmax_output.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:113:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
113 | const int H = softmax_output.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:114:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
114 | const int W = softmax_output.size(4);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:132:22: warning: performing an implicit widening conversion to type 'unsigned long' of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]
132 | int shared_mem = windows_per_block * 2 * sizeof(float);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:132:22: note: make conversion explicit to silence this warning
132 | int shared_mem = windows_per_block * 2 * sizeof(float);
| ^~~~~~~~~~~~~~~~~~~~~
| static_cast<unsigned long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:132:22: note: perform multiplication in a wider type
132 | int shared_mem = windows_per_block * 2 * sizeof(float);
| ^~~~~~~~~~~~~~~~~
| static_cast<long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_6/b10_s1_blocksize_experiment_maxpool/base/base.cu:132:22: warning: narrowing conversion from 'unsigned long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
132 | int shared_mem = windows_per_block * 2 * sizeof(float);
| ^