16 | const float* __restrict__ conv_output,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
17 | const float* __restrict__ element_bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:16:31: note: the first parameter in the range is 'conv_output'
16 | const float* __restrict__ conv_output,
| ^~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:17:31: note: the last parameter in the range is 'element_bias'
17 | const float* __restrict__ element_bias,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:19:5: warning: 2 adjacent parameters of 'coalesced_atomic_selective_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
19 | int num_elements,
| ^~~~~~~~~~~~~~~~~
20 | int channels,
| ~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:19:9: note: the first parameter in the range is 'num_elements'
19 | int num_elements,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:20:9: note: the last parameter in the range is 'channels'
20 | int channels,
| ^~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:26:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
26 | for (int i = threadIdx.x; i < channels; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:26:50: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
26 | for (int i = threadIdx.x; i < channels; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:31:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
31 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:32:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
32 | int total_threads = gridDim.x * blockDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:64:9: warning: Value stored to 'remainder' during its initialization is never read [clang-analyzer-deadcode.DeadStores]
64 | int remainder = num_elements % 4;
| ^~~~~~~~~ ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:64:9: note: Value stored to 'remainder' during its initialization is never read
64 | int remainder = num_elements % 4;
| ^~~~~~~~~ ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:85:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
85 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:89:19: warning: the parameter 'conv_transpose' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
89 | torch::Tensor conv_transpose,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:90:5: warning: 2 adjacent parameters of 'forward' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
90 | torch::Tensor conv_transpose_bias,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
91 | torch::Tensor bias
| ~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:90:19: note: the first parameter in the range is 'conv_transpose_bias'
90 | torch::Tensor conv_transpose_bias,
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:91:19: note: the last parameter in the range is 'bias'
91 | torch::Tensor bias
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:91:19: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
91 | torch::Tensor bias
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:103:20: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
103 | int channels = sizes[1];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:104:24: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
104 | int spatial_size = sizes[2] * sizes[3] * sizes[4]; // D * H * W
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_20/b5_s1_coalesced_atomic_selective_kernel/base/base.cu:105:24: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
105 | int num_elements = conv_result.numel();
| ^