19 | __device__ __forceinline__ void load_store_vec4(float4* dst, float4* src, int idx) {
| ^~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:19:57: note: the first parameter in the range is 'dst'
19 | __device__ __forceinline__ void load_store_vec4(float4* dst, float4* src, int idx) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:19:70: note: the last parameter in the range is 'src'
19 | __device__ __forceinline__ void load_store_vec4(float4* dst, float4* src, int idx) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:32:5: warning: 3 adjacent parameters of 'fused_relu_groupnorm_minimal_atomic_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
32 | int N, int C, int D, int H, int W,
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:32:9: note: the first parameter in the range is 'N'
32 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:32:23: note: the last parameter in the range is 'D'
32 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:32:33: warning: 3 adjacent parameters of 'fused_relu_groupnorm_minimal_atomic_kernel' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
32 | int N, int C, int D, int H, int W,
| ^~~~~~
33 | int G, float eps)
| ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:32:37: note: the first parameter in the range is 'W'
32 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:33:18: note: the last parameter in the range is 'eps'
33 | int G, float eps)
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:33:12: note: 'int' and 'float' may be implicitly converted
33 | int G, float eps)
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:35:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
35 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:38:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
38 | const int n = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:39:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
39 | const int g = blockIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:113:38: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
113 | float mean = local_sum / group_size;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:114:44: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
114 | float variance = local_sumsq / group_size - mean * mean;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:157:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
157 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:158:5: warning: 2 adjacent parameters of 'forward' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
158 | torch::Tensor conv_transpose,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
159 | torch::Tensor group_norm_weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:158:19: note: the first parameter in the range is 'conv_transpose'
158 | torch::Tensor conv_transpose,
| ^~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:159:19: note: the last parameter in the range is 'group_norm_weight'
159 | torch::Tensor group_norm_weight,
| ^~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:158:19: warning: the parameter 'conv_transpose' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
158 | torch::Tensor conv_transpose,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:159:19: warning: the parameter 'group_norm_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
159 | torch::Tensor group_norm_weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:160:19: warning: the parameter 'group_norm_bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
160 | torch::Tensor group_norm_bias,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:161:5: warning: 2 adjacent parameters of 'forward' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
161 | int64_t groups,
| ^~~~~~~~~~~~~~~
162 | double eps) {
| ~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:161:13: note: the first parameter in the range is 'groups'
161 | int64_t groups,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:162:12: note: the last parameter in the range is 'eps'
162 | double eps) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:161:5: note:
161 | int64_t groups,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:162:5: note: 'int64_t' and 'double' may be implicitly converted: 'int64_t' (as 'long') -> 'double', 'double' -> 'int64_t' (as 'long')
162 | double eps) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:175:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
175 | int N = y.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:176:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
176 | int C = y.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:177:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
177 | int D = y.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:178:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
178 | int H = y.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:179:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
179 | int W = y.size(4);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b9_s3_fused_rg_atomic_min_base/base/base.cu:180:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
180 | int G = groups;
| ^