21 | __device__ inline void load_A_tile(const float* __restrict__ A, float* A_tile, int row, int t, int M, int K) {
| ^~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:21:84: note: the first parameter in the range is 'row'
21 | __device__ inline void load_A_tile(const float* __restrict__ A, float* A_tile, int row, int t, int M, int K) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:21:93: note: the last parameter in the range is 't'
21 | __device__ inline void load_A_tile(const float* __restrict__ A, float* A_tile, int row, int t, int M, int K) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:22:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
22 | int col = t * TILE_SIZE + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:31:80: warning: 2 adjacent parameters of 'load_B_tile' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
31 | __device__ inline void load_B_tile(const float* __restrict__ B, float* B_tile, int col, int t, int K, int N) {
| ^~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:31:84: note: the first parameter in the range is 'col'
31 | __device__ inline void load_B_tile(const float* __restrict__ B, float* B_tile, int col, int t, int K, int N) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:31:93: note: the last parameter in the range is 't'
31 | __device__ inline void load_B_tile(const float* __restrict__ B, float* B_tile, int col, int t, int K, int N) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:32:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
32 | int rowB = t * TILE_SIZE + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:56:46: warning: 3 adjacent parameters of 'ModularFusedMatMulBiasKernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
56 | __global__ void ModularFusedMatMulBiasKernel(const float* __restrict__ A,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
57 | const float* __restrict__ B,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
58 | const float* __restrict__ bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:56:72: note: the first parameter in the range is 'A'
56 | __global__ void ModularFusedMatMulBiasKernel(const float* __restrict__ A,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:58:73: note: the last parameter in the range is 'bias'
58 | const float* __restrict__ bias,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:65:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
65 | int row = blockIdx.y * TILE_SIZE + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:66:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
66 | int col = blockIdx.x * TILE_SIZE + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:87:84: warning: 4 adjacent parameters of 'compute_pool_activation' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
87 | __device__ inline float compute_pool_activation(const float* __restrict__ row_ptr, int N, int start, int pool_kernel_size, float scale_factor) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:87:88: note: the first parameter in the range is 'N'
87 | __device__ inline float compute_pool_activation(const float* __restrict__ row_ptr, int N, int start, int pool_kernel_size, float scale_factor) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:87:130: note: the last parameter in the range is 'scale_factor'
87 | __device__ inline float compute_pool_activation(const float* __restrict__ row_ptr, int N, int start, int pool_kernel_size, float scale_factor) {
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:87:124: note: 'int' and 'float' may be implicitly converted
87 | __device__ inline float compute_pool_activation(const float* __restrict__ row_ptr, int N, int start, int pool_kernel_size, float scale_factor) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:98:38: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
98 | float avg = (count > 0) ? (sum / count) : 0.0f;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:117:47: warning: 2 adjacent parameters of 'ModularFusedPoolActMaxKernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
117 | int pool_kernel_size,
| ^~~~~~~~~~~~~~~~~~~~~
118 | int output_length,
| ~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:117:51: note: the first parameter in the range is 'pool_kernel_size'
117 | int pool_kernel_size,
| ^~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:118:51: note: the last parameter in the range is 'output_length'
118 | int output_length,
| ^~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:120:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
120 | int row = blockIdx.x; // one block per row
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:121:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
121 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:123:28: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
123 | const float* row_ptr = linear_output + row * N;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:123:44: note: make conversion explicit to silence this warning
6 | const float* row_ptr = linear_output + row * N;
| ^~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:123:44: note: perform multiplication in a wider type
123 | const float* row_ptr = linear_output + row * N;
| ^~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:126:53: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
126 | for (int bin = tid; bin < output_length; bin += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:137:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
137 | int lane = threadIdx.x % warpSize;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:138:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
138 | int warpId = threadIdx.x / warpSize;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:176:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
176 | int M = x.size(0); // Batch size
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:177:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
177 | int K = x.size(1); // Input features
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s1_shared_memory_optimization/base/base.cu:178:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
178 | int N = weight.size(0); // Output features
| ^