25 | __device__ void compute_layernorm_and_store(const float* __restrict__ input, const float* __restrict__ gamma, const float* __restrict__ beta, float* __restrict__ output, int offset, int n2, float mean, float inv_std, int tid) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:25:71: note: the first parameter in the range is 'input'
25 | __device__ void compute_layernorm_and_store(const float* __restrict__ input, const float* __restrict__ gamma, const float* __restrict__ beta, float* __restrict__ output, int offset, int n2, float mean, float inv_std, int tid) {
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:25:104: note: the last parameter in the range is 'gamma'
25 | __device__ void compute_layernorm_and_store(const float* __restrict__ input, const float* __restrict__ gamma, const float* __restrict__ beta, float* __restrict__ output, int offset, int n2, float mean, float inv_std, int tid) {
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:25:171: warning: 3 adjacent parameters of 'compute_layernorm_and_store' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
25 | __device__ void compute_layernorm_and_store(const float* __restrict__ input, const float* __restrict__ gamma, const float* __restrict__ beta, float* __restrict__ output, int offset, int n2, float mean, float inv_std, int tid) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:25:175: note: the first parameter in the range is 'offset'
25 | __device__ void compute_layernorm_and_store(const float* __restrict__ input, const float* __restrict__ gamma, const float* __restrict__ beta, float* __restrict__ output, int offset, int n2, float mean, float inv_std, int tid) {
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:25:197: note: the last parameter in the range is 'mean'
25 | __device__ void compute_layernorm_and_store(const float* __restrict__ input, const float* __restrict__ gamma, const float* __restrict__ beta, float* __restrict__ output, int offset, int n2, float mean, float inv_std, int tid) {
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:25:191: note: 'int' and 'float' may be implicitly converted
25 | __device__ void compute_layernorm_and_store(const float* __restrict__ input, const float* __restrict__ gamma, const float* __restrict__ beta, float* __restrict__ output, int offset, int n2, float mean, float inv_std, int tid) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:25:203: warning: 2 adjacent parameters of 'compute_layernorm_and_store' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
25 | __device__ void compute_layernorm_and_store(const float* __restrict__ input, const float* __restrict__ gamma, const float* __restrict__ beta, float* __restrict__ output, int offset, int n2, float mean, float inv_std, int tid) {
| ^~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:25:209: note: the first parameter in the range is 'inv_std'
25 | __device__ void compute_layernorm_and_store(const float* __restrict__ input, const float* __restrict__ gamma, const float* __restrict__ beta, float* __restrict__ output, int offset, int n2, float mean, float inv_std, int tid) {
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:25:222: note: the last parameter in the range is 'tid'
25 | __device__ void compute_layernorm_and_store(const float* __restrict__ input, const float* __restrict__ gamma, const float* __restrict__ beta, float* __restrict__ output, int offset, int n2, float mean, float inv_std, int tid) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:25:218: note: 'float' and 'int' may be implicitly converted
25 | __device__ void compute_layernorm_and_store(const float* __restrict__ input, const float* __restrict__ gamma, const float* __restrict__ beta, float* __restrict__ output, int offset, int n2, float mean, float inv_std, int tid) {
| ^
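
The swappable-parameter warnings above stem from long runs of pointer and scalar arguments of interchangeable types in compute_layernorm_and_store. A minimal sketch of one possible fix, assuming the usual LayerNorm affine transform for the (not shown) function body, is to bundle the scalars into a named struct so call sites cannot silently transpose them:

    // Hypothetical refactor: group the scalar arguments of
    // compute_layernorm_and_store so adjacent int/float parameters
    // can no longer be swapped silently at the call site.
    struct LayerNormArgs {
        int   offset;   // start of this block's row in the flattened tensor
        int   n2;       // number of elements normalized per row
        float mean;     // precomputed row mean
        float inv_std;  // precomputed reciprocal standard deviation
        int   tid;      // thread index within the block
    };

    __device__ void compute_layernorm_and_store(const float* __restrict__ input,
                                                const float* __restrict__ gamma,
                                                const float* __restrict__ beta,
                                                float* __restrict__ output,
                                                const LayerNormArgs& args) {
        // Strided loop over the row; the body is assumed from the parameter names.
        for (int i = args.tid; i < args.n2; i += blockDim.x) {
            float norm = (input[args.offset + i] - args.mean) * args.inv_std;
            output[args.offset + i] = gamma[i] * norm + beta[i];
        }
    }

Strong typedefs for the individual parameters would work as well; the struct version is shown only because it also shortens the 200-plus-column signature that the diagnostics keep re-printing.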
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:52:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
52 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:53:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
53 | int bid = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:70:27: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
70 | float mean = s_mean / n2;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:71:35: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
71 | float variance = s_variance / n2 - mean * mean;
| ^
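
The narrowing-conversion warnings at lines 52-53 and 70-71 are usually silenced with explicit casts. A sketch, using only the variable names that appear in the log (the surrounding kernel code is assumed):

    // Make the intended conversions explicit so clang-tidy stops flagging them.
    int tid = static_cast<int>(threadIdx.x);   // unsigned int -> int
    int bid = static_cast<int>(blockIdx.x);    // unsigned int -> int

    float mean     = s_mean     / static_cast<float>(n2);               // int -> float
    float variance = s_variance / static_cast<float>(n2) - mean * mean; // int -> float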
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:82:5: warning: 2 adjacent parameters of 'forward' of similar type ('torch::Tensor') are easily swapped by mistake [bugprone-easily-swappable-parameters]
82 | torch::Tensor conv_transpose_bias,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
83 | torch::Tensor sum_weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:82:19: note: the first parameter in the range is 'conv_transpose_bias'
82 | torch::Tensor conv_transpose_bias,
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:83:19: note: the last parameter in the range is 'sum_weight'
83 | torch::Tensor sum_weight,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:86:5: warning: 5 adjacent parameters of 'forward' of similar type ('std::vector<int64_t>') are easily swapped by mistake [bugprone-easily-swappable-parameters]
86 | std::vector<int64_t> stride,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
87 | std::vector<int64_t> padding,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
88 | std::vector<int64_t> output_padding,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
89 | std::vector<int64_t> pool_kernel_size,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
90 | std::vector<int64_t> norm_shape
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:86:26: note: the first parameter in the range is 'stride'
86 | std::vector<int64_t> stride,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:90:26: note: the last parameter in the range is 'norm_shape'
90 | std::vector<int64_t> norm_shape
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:86:26: warning: the parameter 'stride' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
86 | std::vector<int64_t> stride,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:87:26: warning: the parameter 'padding' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
87 | std::vector<int64_t> padding,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:88:26: warning: the parameter 'output_padding' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
88 | std::vector<int64_t> output_padding,
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:89:26: warning: the parameter 'pool_kernel_size' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
89 | std::vector<int64_t> pool_kernel_size,
| ^
| const &
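
The fix-it hints above already spell out the change for the std::vector<int64_t> parameters: take them by const reference instead of by value. A sketch of the adjusted portion of the forward signature (only the parameters named in the log are shown; the tensor parameters and anything else are elided):

    torch::Tensor forward(
        /* ... tensor parameters as before ... */
        const std::vector<int64_t>& stride,
        const std::vector<int64_t>& padding,
        const std::vector<int64_t>& output_padding,
        const std::vector<int64_t>& pool_kernel_size,
        std::vector<int64_t> norm_shape   // not flagged by the value-param check
    );

Note that this does not address the separate warning about the five adjacent vector parameters being easily swapped; that one would need named wrappers or a configuration struct, analogous to the device-function sketch further up.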
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:135:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
135 | n1,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_3/b6_s3_modular_fused_atomic_layernorm/base/base.cu:136:9: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
136 | n2
| ^
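
The last two warnings flag the int64_t extents n1 and n2 being narrowed to the kernel's int parameters at the launch site (lines 135-136). A sketch of the explicit-cast fix; the kernel name and the other launch arguments are assumed, not taken from the file:

    // Hypothetical launch excerpt: cast the int64_t extents to the kernel's int parameters.
    fused_layernorm_kernel<<<grid, block>>>(
        /* ... pointer and scalar arguments as before ... */
        static_cast<int>(n1),
        static_cast<int>(n2));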