13 | __global__ void tiledDoubleOutputKernel(const float* __restrict__ A,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
14 | const float* __restrict__ B,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:13:67: note: the first parameter in the range is 'A'
13 | __global__ void tiledDoubleOutputKernel(const float* __restrict__ A,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:14:69: note: the last parameter in the range is 'B'
14 | const float* __restrict__ B,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:16:50: warning: 2 adjacent parameters of 'tiledDoubleOutputKernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
16 | int K, int M, int N) {
| ^~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:16:54: note: the first parameter in the range is 'M'
16 | int K, int M, int N) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:16:61: note: the last parameter in the range is 'N'
16 | int K, int M, int N) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:19:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
19 | int row = blockIdx.y * BLOCK_M + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:20:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
20 | int col = blockIdx.x * BLOCK_N + threadIdx.x * 2; // two outputs per thread
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:35:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
35 | int a_k = tileStart + threadIdx.x; // threadIdx.x in [0, TILE-1]
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:42:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
42 | int b_k = tileStart + threadIdx.y; // threadIdx.y in [0, TILE-1]
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:47:17: warning: result of multiplication in type 'unsigned int' is used as a pointer offset after an implicit widening conversion to type 'size_t' [bugprone-implicit-widening-of-multiplication-result]
47 | B_tile[threadIdx.y][threadIdx.x * 2] = B[b_k * N + global_col0];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:47:37: note: make conversion explicit to silence this warning
4 | B_tile[threadIdx.y][threadIdx.x * 2] = B[b_k * N + global_col0];
| ^~~~~~~~~~~~~~~
| static_cast<size_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:47:37: note: perform multiplication in a wider type
47 | B_tile[threadIdx.y][threadIdx.x * 2] = B[b_k * N + global_col0];
| ^~~~~~~~~~~
| static_cast<size_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:49:17: warning: result of multiplication in type 'unsigned int' is used as a pointer offset after an implicit widening conversion to type 'size_t' [bugprone-implicit-widening-of-multiplication-result]
49 | B_tile[threadIdx.y][threadIdx.x * 2] = 0.0f;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:49:37: note: make conversion explicit to silence this warning
49 | B_tile[threadIdx.y][threadIdx.x * 2] = 0.0f;
| ^~~~~~~~~~~~~~~
| static_cast<size_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:49:37: note: perform multiplication in a wider type
49 | B_tile[threadIdx.y][threadIdx.x * 2] = 0.0f;
| ^~~~~~~~~~~
| static_cast<size_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:55:13: warning: result of multiplication in type 'unsigned int' is used as a pointer offset after an implicit widening conversion to type 'size_t' [bugprone-implicit-widening-of-multiplication-result]
55 | B_tile[threadIdx.y][threadIdx.x * 2] = 0.0f;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:55:33: note: make conversion explicit to silence this warning
55 | B_tile[threadIdx.y][threadIdx.x * 2] = 0.0f;
| ^~~~~~~~~~~~~~~
| static_cast<size_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:55:33: note: perform multiplication in a wider type
55 | B_tile[threadIdx.y][threadIdx.x * 2] = 0.0f;
| ^~~~~~~~~~~
| static_cast<size_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:65:29: warning: result of multiplication in type 'unsigned int' is used as a pointer offset after an implicit widening conversion to type 'size_t' [bugprone-implicit-widening-of-multiplication-result]
65 | out0 += a_val * B_tile[s][threadIdx.x * 2];
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:65:39: note: make conversion explicit to silence this warning
65 | out0 += a_val * B_tile[s][threadIdx.x * 2];
| ^~~~~~~~~~~~~~~
| static_cast<size_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:65:39: note: perform multiplication in a wider type
65 | out0 += a_val * B_tile[s][threadIdx.x * 2];
| ^~~~~~~~~~~
| static_cast<size_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:88:37: warning: the parameter 'A' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
88 | torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:88:54: warning: the parameter 'B' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
88 | torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:94:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
94 | int K = A.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:95:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
95 | int M = A.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_16/b5_s2_tiled_double_output/base/base.cu:97:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
97 | int N = B.size(1);
| ^