9 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:10:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
10 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:22:5: warning: 2 adjacent parameters of 'efficient_fused_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
22 | int N, int C, int D, int H, int W,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:22:9: note: the first parameter in the range is 'N'
22 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:22:16: note: the last parameter in the range is 'C'
22 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:22:33: warning: 3 adjacent parameters of 'efficient_fused_kernel' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
22 | int N, int C, int D, int H, int W,
| ^~~~~~
23 | int groups, float eps
| ~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:22:37: note: the first parameter in the range is 'W'
22 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:23:23: note: the last parameter in the range is 'eps'
23 | int groups, float eps
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:23:17: note: 'int' and 'float' may be implicitly converted
23 | int groups, float eps
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:25:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
25 | int n = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:26:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
26 | int g = blockIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:32:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
32 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:33:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
33 | int blockSize = blockDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:66:9: warning: Value stored to 'local_sum' is never read [clang-analyzer-deadcode.DeadStores]
66 | local_sum = warp_sums[tid];
| ^ ~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:66:9: note: Value stored to 'local_sum' is never read
66 | local_sum = warp_sums[tid];
| ^ ~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:67:9: warning: Value stored to 'local_sumsq' is never read [clang-analyzer-deadcode.DeadStores]
67 | local_sumsq = warp_sumsq[tid];
| ^ ~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:67:9: note: Value stored to 'local_sumsq' is never read
67 | local_sumsq = warp_sumsq[tid];
| ^ ~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:82:33: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
82 | float mean = warp_sums[0] / group_elements;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:83:33: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
83 | float var = warp_sumsq[0] / group_elements - mean * mean;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:103:5: warning: 2 adjacent parameters of 'forward' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
103 | int padding,
| ^~~~~~~~~~~~
104 | int groups,
| ~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:103:9: note: the first parameter in the range is 'padding'
103 | int padding,
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:104:9: note: the last parameter in the range is 'groups'
104 | int groups,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:106:19: warning: the parameter 'conv_transpose' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
106 | torch::Tensor conv_transpose,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:108:19: warning: the parameter 'group_norm_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
108 | torch::Tensor group_norm_weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:109:19: warning: the parameter 'group_norm_bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
109 | torch::Tensor group_norm_bias
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:120:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
120 | int N = x.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:121:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
121 | int C = x.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:122:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
122 | int D = x.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:123:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
123 | int H = x.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:124:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
124 | int W = x.size(4);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:128:30: warning: performing an implicit widening conversion to type 'unsigned long' of a multiplication performed in type 'int' [bugprone-implicit-widening-of-multiplication-result]
128 | size_t shared_mem_size = 2 * (threads/32) * sizeof(float);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:128:30: note: make conversion explicit to silence this warning
6 | size_t shared_mem_size = 2 * (threads/32) * sizeof(float);
| ^~~~~~~~~~~~~~~~
| static_cast<unsigned long>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_60/b8_s1_efficient_fused_kernel/base/base.cu:128:30: note: perform multiplication in a wider type
128 | size_t shared_mem_size = 2 * (threads/32) * sizeof(float);
| ^
| static_cast<long>( )