23 | const float* __restrict__ gamma,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 | const float* __restrict__ beta,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:23:31: note: the first parameter in the range is 'gamma'
23 | const float* __restrict__ gamma,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:24:31: note: the last parameter in the range is 'beta'
24 | const float* __restrict__ beta,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:25:5: warning: 3 adjacent parameters of 'fused_relu_groupnorm_coalesced_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
25 | int N, int C, int D, int H, int W,
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:25:9: note: the first parameter in the range is 'N'
25 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:25:23: note: the last parameter in the range is 'D'
25 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:25:33: warning: 3 adjacent parameters of 'fused_relu_groupnorm_coalesced_kernel' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
25 | int N, int C, int D, int H, int W,
| ^~~~~~
26 | int G, float eps) {
| ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:25:37: note: the first parameter in the range is 'W'
25 | int N, int C, int D, int H, int W,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:26:18: note: the last parameter in the range is 'eps'
26 | int G, float eps) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:26:12: note: 'int' and 'float' may be implicitly converted
26 | int G, float eps) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:29:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
29 | int n = blockIdx.x; // sample index
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:30:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
30 | int g = blockIdx.y; // group index
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:68:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
68 | for (int i = threadIdx.x; i < total_vectors; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:68:55: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
68 | for (int i = threadIdx.x; i < total_vectors; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:87:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
87 | for (int i = threadIdx.x; i < remainder; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:87:51: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
87 | for (int i = threadIdx.x; i < remainder; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:98:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
98 | int lane = threadIdx.x & (WARP_SIZE - 1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:105:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
105 | int warp_id = threadIdx.x / WARP_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:115:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
115 | int num_warps = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:122:34: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
122 | group_mean = sum_total / group_elems;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:123:40: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
123 | float variance = sumsq_total / group_elems - group_mean * group_mean;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:135:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
135 | for (int i = threadIdx.x; i < total_vectors; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:135:55: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
135 | for (int i = threadIdx.x; i < total_vectors; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:152:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
152 | for (int i = threadIdx.x; i < remainder; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:152:51: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
152 | for (int i = threadIdx.x; i < remainder; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:164:19: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
164 | torch::Tensor x,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:165:5: warning: 2 adjacent parameters of 'forward' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
165 | torch::Tensor conv_transpose,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
166 | torch::Tensor group_norm_weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:165:19: note: the first parameter in the range is 'conv_transpose'
165 | torch::Tensor conv_transpose,
| ^~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:166:19: note: the last parameter in the range is 'group_norm_weight'
166 | torch::Tensor group_norm_weight,
| ^~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:165:19: warning: the parameter 'conv_transpose' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
165 | torch::Tensor conv_transpose,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:166:19: warning: the parameter 'group_norm_weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
166 | torch::Tensor group_norm_weight,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:167:19: warning: the parameter 'group_norm_bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
167 | torch::Tensor group_norm_bias,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:168:5: warning: 2 adjacent parameters of 'forward' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
168 | int64_t groups,
| ^~~~~~~~~~~~~~~
169 | double eps) {
| ~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:168:13: note: the first parameter in the range is 'groups'
168 | int64_t groups,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:169:12: note: the last parameter in the range is 'eps'
169 | double eps) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:168:5: note:
168 | int64_t groups,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:169:5: note: 'int64_t' and 'double' may be implicitly converted: 'int64_t' (as 'long') -> 'double', 'double' -> 'int64_t' (as 'long')
169 | double eps) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:182:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
182 | int N = y.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:183:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
183 | int C = y.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:184:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
184 | int D = y.size(2);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:185:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
185 | int H = y.size(3);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:186:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
186 | int W = y.size(4);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250204_optimize_b10_s4_e0_sweep/level_2/task_61/b6_s1_fused_rg_coalesced/base/base.cu:187:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
187 | int G = groups;
| ^