14 | const float* __restrict__ gamma, // GroupNorm weight, shape: [C]
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
15 | const float* __restrict__ beta, // GroupNorm bias, shape: [C]
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:14:31: note: the first parameter in the range is 'gamma'
14 | const float* __restrict__ gamma, // GroupNorm weight, shape: [C]
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:15:31: note: the last parameter in the range is 'beta'
15 | const float* __restrict__ beta, // GroupNorm bias, shape: [C]
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:16:5: warning: 3 adjacent parameters of 'fused_relu_groupnorm_no_div_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
16 | int N, int C, int D, int H, int W,
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:16:9: note: the first parameter in the range is 'N'
16 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:16:23: note: the last parameter in the range is 'D'
16 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:16:33: warning: 3 adjacent parameters of 'fused_relu_groupnorm_no_div_kernel' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
16 | int N, int C, int D, int H, int W,
| ^~~~~~
17 | int G, float eps) // groups and epsilon
| ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:16:37: note: the first parameter in the range is 'W'
16 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:17:18: note: the last parameter in the range is 'eps'
17 | int G, float eps) // groups and epsilon
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:17:12: note: 'int' and 'float' may be implicitly converted
17 | int G, float eps) // groups and epsilon
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:24:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
24 | int n = blockIdx.x; // sample index
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:25:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
25 | int g = blockIdx.y; // group index
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:43:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
43 | for (int i = threadIdx.x; i < channels_per_group; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:43:60: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
43 | for (int i = threadIdx.x; i < channels_per_group; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:57:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
57 | for (int i = threadIdx.x; i < num_vec; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:57:49: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
57 | for (int i = threadIdx.x; i < num_vec; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:75:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
75 | int num_rem_iters = (rem + blockDim.x - 1) / blockDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:77:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
77 | int idx = start + i * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:89:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
89 | int lane = threadIdx.x & (WARP_SIZE - 1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:96:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
96 | int warp_id = threadIdx.x / WARP_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:107:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
107 | int num_warps = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:112:34: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
112 | float mean = group_sum / group_elems;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:113:35: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
113 | float var = group_sumsq / group_elems - mean * mean;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:124:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
124 | for (int i = threadIdx.x; i < group_elems; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:124:53: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
124 | for (int i = threadIdx.x; i < group_elems; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:136:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
136 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:137:5: warning: 2 adjacent parameters of 'forward' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
137 | torch::Tensor conv_transpose,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
138 | torch::Tensor group_norm_weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:137:19: note: the first parameter in the range is 'conv_transpose'
137 | torch::Tensor conv_transpose,
| ^~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:138:19: note: the last parameter in the range is 'group_norm_weight'
138 | torch::Tensor group_norm_weight,
| ^~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:137:19: warning: the parameter 'conv_transpose' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
137 | torch::Tensor conv_transpose,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:138:19: warning: the parameter 'group_norm_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
138 | torch::Tensor group_norm_weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:139:19: warning: the parameter 'group_norm_bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
139 | torch::Tensor group_norm_bias,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:140:5: warning: 2 adjacent parameters of 'forward' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
140 | int64_t groups,
| ^~~~~~~~~~~~~~~
141 | double eps) {
| ~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:140:13: note: the first parameter in the range is 'groups'
140 | int64_t groups,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:141:12: note: the last parameter in the range is 'eps'
141 | double eps) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:140:5: note:
140 | int64_t groups,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:141:5: note: 'int64_t' and 'double' may be implicitly converted: 'int64_t' (as 'long') -> 'double', 'double' -> 'int64_t' (as 'long')
141 | double eps) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:155:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
155 | int N = y.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:156:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
156 | int C = y.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:157:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
157 | int D = y.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:158:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
158 | int H = y.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:159:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
159 | int W = y.size(4);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:160:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
160 | int G = groups;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:162:9: warning: Value stored to 'spatial_size' during its initialization is never read [clang-analyzer-deadcode.DeadStores]
162 | int spatial_size = D * H * W;
| ^~~~~~~~~~~~ ~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b7_s3_fused_rg_no_divergence/base/base.cu:162:9: note: Value stored to 'spatial_size' during its initialization is never read
162 | int spatial_size = D * H * W;
| ^~~~~~~~~~~~ ~~~~~~~~~