43 | const float* __restrict__ x,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
44 | const float* __restrict__ weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:43:31: note: the first parameter in the range is 'x'
43 | const float* __restrict__ x,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:44:31: note: the last parameter in the range is 'weight'
44 | const float* __restrict__ weight,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:53:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
53 | const int tx = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:54:20: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
54 | const int ty = threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:55:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
55 | const int row = blockIdx.y * TILE_DIM + ty;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:56:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
56 | const int col = blockIdx.x * TILE_DIM + tx;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:62:15: warning: Value stored to 'vec_in_features' during its initialization is never read [clang-analyzer-deadcode.DeadStores]
62 | const int vec_in_features = in_features / vec_size;
| ^~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:62:15: note: Value stored to 'vec_in_features' during its initialization is never read
62 | const int vec_in_features = in_features / vec_size;
| ^~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:91:5: warning: 2 adjacent parameters of 'fused_reduce_gelu_residual_kernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
91 | const float* __restrict__ gemm_out,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
92 | const float* __restrict__ original_x,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:91:31: note: the first parameter in the range is 'gemm_out'
91 | const float* __restrict__ gemm_out,
| ^~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:92:31: note: the last parameter in the range is 'original_x'
92 | const float* __restrict__ original_x,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:94:5: warning: 3 adjacent parameters of 'fused_reduce_gelu_residual_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
94 | const int batch_size,
| ^~~~~~~~~~~~~~~~~~~~~
95 | const int out_features,
| ~~~~~~~~~~~~~~~~~~~~~~~
96 | const int in_features
| ~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:94:15: note: the first parameter in the range is 'batch_size'
94 | const int batch_size,
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:96:15: note: the last parameter in the range is 'in_features'
96 | const int in_features
| ^~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:100:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
100 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:101:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
101 | const int bid = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:107:34: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
107 | const int items_per_thread = (out_features + blockDim.x - 1) / blockDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:111:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
111 | const int idx = tid + i * blockDim.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:146:46: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
146 | for (int i = tid; i < vec_elements; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:148:19: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
148 | orig.load(original_x + bid * in_features + i * 4);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:148:52: note: make conversion explicit to silence this warning
148 | orig.load(original_x + bid * in_features + i * 4);
| ^~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:148:52: note: perform multiplication in a wider type
148 | orig.load(original_x + bid * in_features + i * 4);
| ^
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:155:22: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
155 | result.store(out + bid * in_features + i * 4);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:155:48: note: make conversion explicit to silence this warning
155 | result.store(out + bid * in_features + i * 4);
| ^~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:155:48: note: perform multiplication in a wider type
155 | result.store(out + bid * in_features + i * 4);
| ^
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:159:64: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
159 | for (int i = vec_elements * 4 + tid; i < in_features; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:170:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
170 | const int batch_size = x.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:171:29: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
171 | const int in_features = x.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_51/b9_s1_atomic_optimized_pipeline/base/base.cu:172:30: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
172 | const int out_features = weight.size(0);
| ^