9 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:10:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
10 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:23:5: warning: 2 adjacent parameters of 'fused_vec_gridstride_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
23 | int N, int C, int D, int H, int W,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:23:9: note: the first parameter in the range is 'N'
23 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:23:16: note: the last parameter in the range is 'C'
23 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:23:33: warning: 3 adjacent parameters of 'fused_vec_gridstride_kernel' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
23 | int N, int C, int D, int H, int W,
| ^~~~~~
24 | int groups, float eps
| ~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:23:37: note: the first parameter in the range is 'W'
23 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:24:23: note: the last parameter in the range is 'eps'
24 | int groups, float eps
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:24:17: note: 'int' and 'float' may be implicitly converted
24 | int groups, float eps
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:27:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
27 | int n = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:28:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
28 | int g = blockIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:37:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
37 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:38:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
38 | int blockSize = blockDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:107:30: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
107 | float mean = total_sum / group_elements;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:108:31: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
108 | float var = total_sumsq / group_elements - mean * mean;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:148:5: warning: 2 adjacent parameters of 'forward' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
148 | int padding,
| ^~~~~~~~~~~~
149 | int groups,
| ~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:148:9: note: the first parameter in the range is 'padding'
148 | int padding,
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:149:9: note: the last parameter in the range is 'groups'
149 | int groups,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:151:19: warning: the parameter 'conv_transpose' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
151 | torch::Tensor conv_transpose,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:153:19: warning: the parameter 'group_norm_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
153 | torch::Tensor group_norm_weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:154:19: warning: the parameter 'group_norm_bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
154 | torch::Tensor group_norm_bias
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:166:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
166 | int N = x.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:167:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
167 | int C = x.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:168:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
168 | int D = x.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:169:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
169 | int H = x.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:170:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
170 | int W = x.size(4);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:176:25: warning: performing an implicit widening conversion to type 'unsigned long' of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]
176 | size_t shared_mem = 2 * numWarps * sizeof(float);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:176:25: note: make conversion explicit to silence this warning
176 | size_t shared_mem = 2 * numWarps * sizeof(float);
| ^~~~~~~~~~~~
| static_cast<unsigned long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s3_fused_vec_gridstride/base/base.cu:176:25: note: perform multiplication in a wider type
176 | size_t shared_mem = 2 * numWarps * sizeof(float);
| ^
| static_cast<long>( )