14 | const float* __restrict__ output,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
15 | const float* __restrict__ scaling_factor,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16 | const float* __restrict__ bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:14:31: note: the first parameter in the range is 'output'
14 | const float* __restrict__ output,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:16:31: note: the last parameter in the range is 'bias'
16 | const float* __restrict__ bias,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:23:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
23 | int warp_in_block = threadIdx.x / 32;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:24:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
24 | int lane = threadIdx.x % 32;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:25:27: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
25 | int warps_per_block = blockDim.x / 32;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:28:26: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
28 | int global_warp_id = blockIdx.x * warps_per_block + warp_in_block;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:44:17: warning: 2nd function call argument is an uninitialized value [clang-analyzer-core.CallAndMessage]
44 | s_val = __shfl_sync(0xffffffff, s_val, 0);
| ^ ~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:33:9: note: Assuming 'global_warp_id' is < 'total_tiles'
33 | if (global_warp_id < total_tiles) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:33:5: note: Taking true branch
33 | if (global_warp_id < total_tiles) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:39:15: note: 's_val' declared without an initial value
39 | float s_val, b_val;
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:40:13: note: Assuming 'lane' is not equal to 0
40 | if (lane == 0) {
| ^~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:40:9: note: Taking false branch
40 | if (lane == 0) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:44:17: note: 2nd function call argument is an uninitialized value
44 | s_val = __shfl_sync(0xffffffff, s_val, 0);
| ^ ~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:66:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
66 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:67:19: warning: the parameter 'conv_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
67 | torch::Tensor conv_weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:68:5: warning: 2 adjacent parameters of 'forward' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
68 | torch::Tensor conv_bias,
| ^~~~~~~~~~~~~~~~~~~~~~~~
69 | torch::Tensor scaling_factor,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:68:19: note: the first parameter in the range is 'conv_bias'
68 | torch::Tensor conv_bias,
| ^~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:69:19: note: the last parameter in the range is 'scaling_factor'
69 | torch::Tensor scaling_factor,
| ^~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:69:19: warning: the parameter 'scaling_factor' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
69 | torch::Tensor scaling_factor,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:70:19: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
70 | torch::Tensor bias) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:75:17: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
75 | int batch = conv_out.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:76:20: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
76 | int channels = conv_out.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:77:17: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
77 | int depth = conv_out.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:78:18: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
78 | int height = conv_out.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250211_optimize_b5_s4_e1_v2/level_2/task_48/b5_s3_block_size_experimentation/base/base.cu:79:17: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
79 | int width = conv_out.size(4);
| ^