27 | int N, int C, int H, int W,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:27:9: note: the first parameter in the range is 'N'
27 | int N, int C, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:27:16: note: the last parameter in the range is 'C'
27 | int N, int C, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:27:26: warning: 2 adjacent parameters of 'fused_gelu_group_norm_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
27 | int N, int C, int H, int W,
| ^~~~~~
28 | int num_groups,
| ~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:27:30: note: the first parameter in the range is 'W'
27 | int N, int C, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:28:9: note: the last parameter in the range is 'num_groups'
28 | int num_groups,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:29:5: warning: 2 adjacent parameters of 'fused_gelu_group_norm_kernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
29 | const float* __restrict__ gn_weight,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
30 | const float* __restrict__ gn_bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:29:31: note: the first parameter in the range is 'gn_weight'
29 | const float* __restrict__ gn_weight,
| ^~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:30:31: note: the last parameter in the range is 'gn_bias'
30 | const float* __restrict__ gn_bias,
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:34:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
34 | int group_id = blockIdx.x; // overall group id
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:45:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
45 | for (int idx = threadIdx.x; idx < group_elems; idx += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:45:59: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
45 | for (int idx = threadIdx.x; idx < group_elems; idx += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:60:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
60 | int lane = threadIdx.x & 31;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:61:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
61 | int wid = threadIdx.x >> 5;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:71:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
71 | int num_warps = (blockDim.x + 31) / 32;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:82:28: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
82 | mean = group_sum / group_elems;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:83:30: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
83 | var = group_sum_sq / group_elems - mean * mean;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:92:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
92 | for (int idx = threadIdx.x; idx < group_elems; idx += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:92:59: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
92 | for (int idx = threadIdx.x; idx < group_elems; idx += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:131:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
131 | int N = conv_out.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:132:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
132 | int C = conv_out.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:133:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
133 | int H = conv_out.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:134:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
134 | int W = conv_out.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:136:24: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
136 | int total_groups = N * num_groups; // One block per (sample, group) pair
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_19/b3_s1_modular_convtrans_gelu_gn/base/base.cu:146:40: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
146 | conv_ptr, out_ptr, N, C, H, W, num_groups,
| ^