25 | int batch,
| ^~~~~~~~~~
26 | int group,
| ~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:25:9: note: the first parameter in the range is 'batch'
25 | int batch,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:26:9: note: the last parameter in the range is 'group'
26 | int group,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:27:5: warning: 2 adjacent parameters of 'compute_group_mean_var' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
27 | int channels_per_group,
| ^~~~~~~~~~~~~~~~~~~~~~~
28 | int num_channels,
| ~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:27:9: note: the first parameter in the range is 'channels_per_group'
27 | int channels_per_group,
| ^~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:28:9: note: the last parameter in the range is 'num_channels'
28 | int num_channels,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:29:5: warning: 2 adjacent parameters of 'compute_group_mean_var' of similar type ('scalar_t &') are easily swapped by mistake [bugprone-easily-swappable-parameters]
29 | scalar_t &mean,
| ^~~~~~~~~~~~~~~
30 | scalar_t &var) {
| ~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:29:15: note: the first parameter in the range is 'mean'
29 | scalar_t &mean,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:30:15: note: the last parameter in the range is 'var'
30 | scalar_t &var) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:52:5: warning: 2 adjacent parameters of 'group_norm_normalize' of similar type ('scalar_t') are easily swapped by mistake [bugprone-easily-swappable-parameters]
52 | scalar_t mean,
| ^~~~~~~~~~~~~~
53 | scalar_t var,
| ~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:52:14: note: the first parameter in the range is 'mean'
52 | scalar_t mean,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:53:14: note: the last parameter in the range is 'var'
53 | scalar_t var,
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:54:5: warning: 2 adjacent parameters of 'group_norm_normalize' of similar type ('scalar_t') are easily swapped by mistake [bugprone-easily-swappable-parameters]
54 | scalar_t eps,
| ^~~~~~~~~~~~~
55 | scalar_t gamma,
| ~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:54:14: note: the first parameter in the range is 'eps'
54 | scalar_t eps,
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:55:14: note: the last parameter in the range is 'gamma'
55 | scalar_t gamma,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:64:5: warning: 3 adjacent parameters of 'linear_forward_kernel' of similar type ('const scalar_t *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
64 | const scalar_t* __restrict__ x,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
65 | const scalar_t* __restrict__ weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
66 | const scalar_t* __restrict__ bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:64:34: note: the first parameter in the range is 'x'
64 | const scalar_t* __restrict__ x,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:66:34: note: the last parameter in the range is 'bias'
66 | const scalar_t* __restrict__ bias,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:68:5: warning: 2 adjacent parameters of 'linear_forward_kernel' of similar type ('size_t') are easily swapped by mistake [bugprone-easily-swappable-parameters]
68 | size_t batch_size,
| ^~~~~~~~~~~~~~~~~~
69 | size_t in_features,
| ~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:68:12: note: the first parameter in the range is 'batch_size'
68 | size_t batch_size,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:69:12: note: the last parameter in the range is 'in_features'
69 | size_t in_features,
| ^~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:72:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
72 | int row = blockIdx.y * blockDim.y + threadIdx.y; // batch index
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:73:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
73 | int col = blockIdx.x * blockDim.x + threadIdx.x; // output feature index
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:80:20: warning: narrowing conversion from 'size_t' (aka 'unsigned long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
80 | int numTiles = (in_features + TILE_DIM - 1) / TILE_DIM;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:82:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
82 | int x_idx = t * TILE_DIM + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:83:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
83 | int w_idx = t * TILE_DIM + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:100:5: warning: 2 adjacent parameters of 'group_norm_forward_kernel' of similar type ('const scalar_t *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
100 | const scalar_t* __restrict__ x,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
101 | const scalar_t* __restrict__ gamma,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:100:34: note: the first parameter in the range is 'x'
100 | const scalar_t* __restrict__ x,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:101:34: note: the last parameter in the range is 'gamma'
101 | const scalar_t* __restrict__ gamma,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:104:5: warning: 2 adjacent parameters of 'group_norm_forward_kernel' of similar type ('int64_t') are easily swapped by mistake [bugprone-easily-swappable-parameters]
104 | int64_t batch_size,
| ^~~~~~~~~~~~~~~~~~~
105 | int64_t num_channels,
| ~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:104:13: note: the first parameter in the range is 'batch_size'
104 | int64_t batch_size,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:105:13: note: the last parameter in the range is 'num_channels'
105 | int64_t num_channels,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:106:5: warning: 3 adjacent parameters of 'group_norm_forward_kernel' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
106 | int64_t num_groups,
| ^~~~~~~~~~~~~~~~~~~
107 | int64_t channels_per_group,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~
108 | float eps = 1e-5f) {
| ~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:106:13: note: the first parameter in the range is 'num_groups'
106 | int64_t num_groups,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:108:11: note: the last parameter in the range is 'eps'
108 | float eps = 1e-5f) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:106:5: note:
106 | int64_t num_groups,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:108:5: note: 'int64_t' and 'float' may be implicitly converted: 'int64_t' (as 'long') -> 'float', 'float' -> 'int64_t' (as 'long')
108 | float eps = 1e-5f) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:110:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
110 | int batch = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:111:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
111 | int group = blockIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:118:21: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
118 | int channel = group * channels_per_group + c;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:134:13: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
134 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:158:3: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name]
158 | AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "linear_forward_cuda", ([&] {
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:237:34: note: expanded from macro 'AT_DISPATCH_FLOATING_TYPES'
237 | AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:233:3: note: expanded from macro 'AT_DISPATCH_CASE_FLOATING_TYPES'
233 | AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:74:3: note: expanded from macro 'AT_DISPATCH_CASE'
74 | AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
| ^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:58:7: note: expanded from macro 'AT_PRIVATE_CHECK_SELECTIVE_BUILD'
58 | AT_ERROR( \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:711:32: note: expanded from macro 'AT_ERROR'
711 | C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:536:9: note: expanded from macro 'TORCH_CHECK'
536 | __func__, \
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:187:3: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name]
187 | AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "group_norm_forward_cuda", ([&] {
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:237:34: note: expanded from macro 'AT_DISPATCH_FLOATING_TYPES'
237 | AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:233:3: note: expanded from macro 'AT_DISPATCH_CASE_FLOATING_TYPES'
233 | AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:74:3: note: expanded from macro 'AT_DISPATCH_CASE'
74 | AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
| ^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:58:7: note: expanded from macro 'AT_PRIVATE_CHECK_SELECTIVE_BUILD'
58 | AT_ERROR( \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:711:32: note: expanded from macro 'AT_ERROR'
711 | C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:536:9: note: expanded from macro 'TORCH_CHECK'
536 | __func__, \
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:212:22: warning: narrowing conversion from 'size_t' (aka 'unsigned long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
212 | const int blocks = (total_elements + threads - 1) / threads;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:213:3: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name]
213 | AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "hardtanh_forward_cuda", ([&] {
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:237:34: note: expanded from macro 'AT_DISPATCH_FLOATING_TYPES'
237 | AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:233:3: note: expanded from macro 'AT_DISPATCH_CASE_FLOATING_TYPES'
233 | AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:74:3: note: expanded from macro 'AT_DISPATCH_CASE'
74 | AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
| ^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:58:7: note: expanded from macro 'AT_PRIVATE_CHECK_SELECTIVE_BUILD'
58 | AT_ERROR( \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:711:32: note: expanded from macro 'AT_ERROR'
711 | C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:536:9: note: expanded from macro 'TORCH_CHECK'
536 | __func__, \
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:234:5: warning: 2 adjacent parameters of 'module_fn_cuda_forward' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
234 | int64_t num_groups,
| ^~~~~~~~~~~~~~~~~~~
235 | float hardtanh_min,
| ~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:234:13: note: the first parameter in the range is 'num_groups'
234 | int64_t num_groups,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:235:11: note: the last parameter in the range is 'hardtanh_min'
235 | float hardtanh_min,
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:234:5: note:
234 | int64_t num_groups,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:235:5: note: 'int64_t' and 'float' may be implicitly converted: 'int64_t' (as 'long') -> 'float', 'float' -> 'int64_t' (as 'long')
235 | float hardtanh_min,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:245:11: warning: Value stored to 'in_features' during its initialization is never read [clang-analyzer-deadcode.DeadStores]
245 | int64_t in_features = x.size(1);
| ^~~~~~~~~~~ ~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_30/b5_s0_min_warp_divergence/edit_1/edit_1.cu:245:11: note: Value stored to 'in_features' during its initialization is never read
245 | int64_t in_features = x.size(1);
| ^~~~~~~~~~~ ~~~~~~~~~