24 | __device__ inline void load_A_tile(const float* __restrict__ A, float* A_tile, int row, int t, int M, int K) {
| ^~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:24:84: note: the first parameter in the range is 'row'
24 | __device__ inline void load_A_tile(const float* __restrict__ A, float* A_tile, int row, int t, int M, int K) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:24:93: note: the last parameter in the range is 't'
24 | __device__ inline void load_A_tile(const float* __restrict__ A, float* A_tile, int row, int t, int M, int K) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:25:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
25 | int col = t * TILE_SIZE + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:34:80: warning: 2 adjacent parameters of 'load_B_tile' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
34 | __device__ inline void load_B_tile(const float* __restrict__ B, float* B_tile, int col, int t, int K, int N) {
| ^~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:34:84: note: the first parameter in the range is 'col'
34 | __device__ inline void load_B_tile(const float* __restrict__ B, float* B_tile, int col, int t, int K, int N) {
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:34:93: note: the last parameter in the range is 't'
34 | __device__ inline void load_B_tile(const float* __restrict__ B, float* B_tile, int col, int t, int K, int N) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:35:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
35 | int rowB = t * TILE_SIZE + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:59:46: warning: 2 adjacent parameters of 'ModularFusedMatMulBiasKernel' of similar type ('const float *__restrict') are easily swapped by mistake [bugprone-easily-swappable-parameters]
59 | __global__ void ModularFusedMatMulBiasKernel(const float* __restrict__ A,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
60 | const float* __restrict__ B,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:59:72: note: the first parameter in the range is 'A'
59 | __global__ void ModularFusedMatMulBiasKernel(const float* __restrict__ A,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:60:73: note: the last parameter in the range is 'B'
60 | const float* __restrict__ B,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:67:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
67 | int row = blockIdx.y * TILE_SIZE + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:68:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
68 | int col = blockIdx.x * TILE_SIZE + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:89:84: warning: 4 adjacent parameters of 'compute_pool_activation' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
89 | __device__ inline float compute_pool_activation(const float* __restrict__ row_ptr, int N, int start, int pool_kernel_size, float scale_factor) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:89:88: note: the first parameter in the range is 'N'
89 | __device__ inline float compute_pool_activation(const float* __restrict__ row_ptr, int N, int start, int pool_kernel_size, float scale_factor) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:89:130: note: the last parameter in the range is 'scale_factor'
89 | __device__ inline float compute_pool_activation(const float* __restrict__ row_ptr, int N, int start, int pool_kernel_size, float scale_factor) {
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:89:124: note: 'int' and 'float' may be implicitly converted
89 | __device__ inline float compute_pool_activation(const float* __restrict__ row_ptr, int N, int start, int pool_kernel_size, float scale_factor) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:100:38: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
100 | float avg = (count > 0) ? (sum / count) : 0.0f;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:119:47: warning: 2 adjacent parameters of 'ModularFusedPoolActMaxKernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
119 | int pool_kernel_size,
| ^~~~~~~~~~~~~~~~~~~~~
120 | int output_length,
| ~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:119:51: note: the first parameter in the range is 'pool_kernel_size'
119 | int pool_kernel_size,
| ^~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:120:51: note: the last parameter in the range is 'output_length'
120 | int output_length,
| ^~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:122:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
122 | int row = blockIdx.x; // one block per row
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:123:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
123 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:125:28: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
125 | const float* row_ptr = linear_output + row * N;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:125:44: note: make conversion explicit to silence this warning
6 | const float* row_ptr = linear_output + row * N;
| ^~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:125:44: note: perform multiplication in a wider type
125 | const float* row_ptr = linear_output + row * N;
| ^~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:128:53: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
128 | for (int bin = tid; bin < output_length; bin += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:139:16: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
139 | int lane = threadIdx.x % warpSize;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:140:18: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
140 | int warpId = threadIdx.x / warpSize;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:178:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
178 | int M = x.size(0); // Batch size
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:179:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
179 | int K = x.size(1); // Input features
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_98/b9_s3_constant_memory_fusion/base/base.cu:180:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
180 | int N = weight.size(0); // Output features
| ^