13 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor.")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:14:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
14 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous.")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:21:5: warning: 4 adjacent parameters of 'fused_warp_groupnorm_kernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
21 | const float* __restrict__ bias, // bias for elementwise op (size 1 or C)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
22 | const float* __restrict__ scale,// scale for elementwise op (size 1 or C)
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
23 | const float* __restrict__ gn_weight, // group norm weight (gamma), shape [C]
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 | const float* __restrict__ gn_bias, // group norm bias (beta), shape [C]
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:21:31: note: the first parameter in the range is 'bias'
21 | const float* __restrict__ bias, // bias for elementwise op (size 1 or C)
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:24:31: note: the last parameter in the range is 'gn_bias'
24 | const float* __restrict__ gn_bias, // group norm bias (beta), shape [C]
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:25:5: warning: 2 adjacent parameters of 'fused_warp_groupnorm_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
25 | int N, int C, int H, int W,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:25:9: note: the first parameter in the range is 'N'
25 | int N, int C, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:25:16: note: the last parameter in the range is 'C'
25 | int N, int C, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:25:26: warning: 2 adjacent parameters of 'fused_warp_groupnorm_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
25 | int N, int C, int H, int W,
| ^~~~~~
26 | int num_groups,
| ~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:25:30: note: the first parameter in the range is 'W'
25 | int N, int C, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:26:9: note: the last parameter in the range is 'num_groups'
26 | int num_groups,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:32:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
32 | int sample_idx = blockIdx.x / num_groups;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:33:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
33 | int group_idx = blockIdx.x % num_groups;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:45:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
45 | for (int i = threadIdx.x; i < group_size; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:45:52: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
45 | for (int i = threadIdx.x; i < group_size; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:74:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
74 | int warpId = threadIdx.x / warpSize;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:75:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
75 | int lane = threadIdx.x % warpSize;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:76:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
76 | int num_warps = (blockDim.x + warpSize - 1) / warpSize;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:100:30: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
100 | float mean = group_sum / group_size;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:101:33: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
101 | float var = group_sum_sq / group_size - mean * mean;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:105:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
105 | for (int i = threadIdx.x; i < group_size; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:105:52: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
105 | for (int i = threadIdx.x; i < group_size; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:121:16: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
121 | at::Tensor x, // Input tensor from conv2d
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:122:16: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
122 | at::Tensor bias, // Bias for elementwise op
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:123:16: warning: the parameter 'scale' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
123 | at::Tensor scale, // Scale for elementwise op
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:124:16: warning: the parameter 'y' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
124 | at::Tensor y, // Output tensor
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:125:16: warning: the parameter 'gn_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
125 | at::Tensor gn_weight, // Group normalization weight (gamma)
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:126:16: warning: the parameter 'gn_bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
126 | at::Tensor gn_bias, // Group normalization bias (beta)
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:132:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
132 | int N = x.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:133:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
133 | int C = x.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:134:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
134 | int H = x.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:135:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
135 | int W = x.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:138:24: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
138 | int total_blocks = N * num_groups;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:141:30: warning: performing an implicit widening conversion to type 'unsigned long' of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]
141 | size_t shared_mem_size = num_warps * 2 * sizeof(float);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:141:30: note: make conversion explicit to silence this warning
12 | size_t shared_mem_size = num_warps * 2 * sizeof(float);
| ^~~~~~~~~~~~~
| static_cast<unsigned long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:141:30: note: perform multiplication in a wider type
141 | size_t shared_mem_size = num_warps * 2 * sizeof(float);
| ^~~~~~~~~
| static_cast<long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:151:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
151 | num_groups,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:163:16: warning: the parameter 'conv_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
163 | at::Tensor conv_weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:165:16: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
165 | at::Tensor bias,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:166:16: warning: the parameter 'scale' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
166 | at::Tensor scale,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:167:16: warning: the parameter 'gn_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
167 | at::Tensor gn_weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_21/b4_s2_fused_warp_groupnorm/base/base.cu:168:16: warning: the parameter 'gn_bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
168 | at::Tensor gn_bias,
| ^
| const &