20 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:21:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | int bid = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:22:9: warning: Value stored to 'index' during its initialization is never read [clang-analyzer-deadcode.DeadStores]
22 | int index = bid * spatial_size + tid;
| ^~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:22:9: note: Value stored to 'index' during its initialization is never read
22 | int index = bid * spatial_size + tid;
| ^~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:28:49: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
28 | for (int i = tid * 4; i < vector_size; i += blockDim.x * 4) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:34:60: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
34 | for (int i = vector_size + tid; i < spatial_size; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:62:5: warning: 2 adjacent parameters of 'module_fn_cuda' of similar type ('double') are easily swapped by mistake [bugprone-easily-swappable-parameters]
62 | double momentum,
| ^~~~~~~~~~~~~~~~
63 | double scale_factor,
| ~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:62:12: note: the first parameter in the range is 'momentum'
62 | double momentum,
| ^~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:63:12: note: the last parameter in the range is 'scale_factor'
63 | double scale_factor,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:64:19: warning: the parameter 'conv_transpose' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
64 | torch::Tensor conv_transpose,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:65:5: warning: 2 adjacent parameters of 'module_fn_cuda' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
65 | torch::Tensor conv_transpose_bias,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
66 | torch::Tensor bn_weight,
| ~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:65:19: note: the first parameter in the range is 'conv_transpose_bias'
65 | torch::Tensor conv_transpose_bias,
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:66:19: note: the last parameter in the range is 'bn_weight'
66 | torch::Tensor bn_weight,
| ^~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:101:22: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
101 | int batch_size = sizes[0];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:102:20: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
102 | int channels = sizes[1];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:103:24: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
103 | int spatial_size = sizes[2] * sizes[3] * sizes[4];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:105:31: warning: performing an implicit widening conversion to type 'const long' of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]
105 | auto x_reshaped = x.view({batch_size * channels, spatial_size});
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:105:31: note: make conversion explicit to silence this warning
105 | auto x_reshaped = x.view({batch_size * channels, spatial_size});
| ^~~~~~~~~~~~~~~~~~~~~
| static_cast<const long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:105:31: note: perform multiplication in a wider type
105 | auto x_reshaped = x.view({batch_size * channels, spatial_size});
| ^~~~~~~~~~
| static_cast<const long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:106:33: warning: performing an implicit widening conversion to type 'const long' of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]
106 | auto output = torch::empty({batch_size * channels}, x.options());
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:106:33: note: make conversion explicit to silence this warning
106 | auto output = torch::empty({batch_size * channels}, x.options());
| ^~~~~~~~~~~~~~~~~~~~~
| static_cast<const long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:106:33: note: perform multiplication in a wider type
106 | auto output = torch::empty({batch_size * channels}, x.options());
| ^~~~~~~~~~
| static_cast<const long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_77/b4_s1_optimized_global_avg_pool/base/base.cu:110:27: warning: narrowing conversion from 'unsigned long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
110 | int shared_mem_size = ((threads.x + WARP_SIZE - 1) / WARP_SIZE) * sizeof(float);
| ^