24 | const float* __restrict__ input,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 | const float* __restrict__ weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 | const float* __restrict__ bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:24:31: note: the first parameter in the range is 'input'
24 | const float* __restrict__ input,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:26:31: note: the last parameter in the range is 'bias'
26 | const float* __restrict__ bias,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:28:5: warning: 2 adjacent parameters of 'unrolled_fused_conv_gelu_pool_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
28 | const int N,
| ^~~~~~~~~~~~
29 | const int in_channels,
| ~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:28:15: note: the first parameter in the range is 'N'
28 | const int N,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:29:15: note: the last parameter in the range is 'in_channels'
29 | const int in_channels,
| ^~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:31:5: warning: 3 adjacent parameters of 'unrolled_fused_conv_gelu_pool_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
31 | const int in_w,
| ^~~~~~~~~~~~~~~
32 | const int out_channels,
| ~~~~~~~~~~~~~~~~~~~~~~~
33 | const int out_h,
| ~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:31:15: note: the first parameter in the range is 'in_w'
31 | const int in_w,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:33:15: note: the last parameter in the range is 'out_h'
33 | const int out_h,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:40:28: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
40 | float* partial_sums = &shared_mem[in_channels * KERNEL_SIZE * KERNEL_SIZE];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:40:39: note: make conversion explicit to silence this warning
5 | float* partial_sums = &shared_mem[in_channels * KERNEL_SIZE * KERNEL_SIZE];
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:40:39: note: perform multiplication in a wider type
40 | float* partial_sums = &shared_mem[in_channels * KERNEL_SIZE * KERNEL_SIZE];
| ^~~~~~~~~~~~~~~~~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:42:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
42 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:43:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
43 | const int n = blockIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:44:23: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
44 | const int c_out = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:73:39: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
73 | const float* w_ptr = &conv_weights[ic * KERNEL_SIZE * KERNEL_SIZE];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:73:52: note: make conversion explicit to silence this warning
73 | const float* w_ptr = &conv_weights[ic * KERNEL_SIZE * KERNEL_SIZE];
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:73:52: note: perform multiplication in a wider type
73 | const float* w_ptr = &conv_weights[ic * KERNEL_SIZE * KERNEL_SIZE];
| ^~~~~~~~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:116:19: warning: the parameter 'input' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
116 | torch::Tensor input,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:117:19: warning: the parameter 'conv_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
117 | torch::Tensor conv_weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:118:19: warning: the parameter 'conv_bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
118 | torch::Tensor conv_bias
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:124:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
124 | const int N = input.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:125:29: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
125 | const int in_channels = input.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:126:22: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
126 | const int in_h = input.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:127:22: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
127 | const int in_w = input.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_67/b10_s1_unrolled_fused_conv_gelu_pool/base/base.cu:128:30: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
128 | const int out_channels = conv_weight.size(0);
| ^