17 | int M, int N, int K,
| ^~~~~~
18 | int lda, int ldb, int ldc,
| ~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:17:67: note: the first parameter in the range is 'K'
17 | int M, int N, int K,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:18:53: note: the last parameter in the range is 'lda'
18 | int lda, int ldb, int ldc,
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:18:67: warning: 2 adjacent parameters of 'matmul_kernel_streamed_balanced' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
18 | int lda, int ldb, int ldc,
| ^~~~~~~~
19 | int m_offset,
| ~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:18:71: note: the first parameter in the range is 'ldc'
18 | int lda, int ldb, int ldc,
| ^~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:19:53: note: the last parameter in the range is 'm_offset'
19 | int m_offset,
| ^~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:21:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | int block_row = blockIdx.y * (BLOCK_SIZE * ELEMENTS_PER_THREAD) + m_offset;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:22:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
22 | int block_col = blockIdx.x * BLOCK_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:23:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
23 | int thread_row = threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:24:22: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
24 | int thread_col = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:77:41: warning: the parameter 'A' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
77 | torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:77:58: warning: the parameter 'B' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
77 | torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:87:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
87 | int lda = A.stride(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:88:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
88 | int ldb = B.stride(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:89:15: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
89 | int ldc = N;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:98:27: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
98 | int rows_per_stream = (M + NUM_STREAMS - 1) / NUM_STREAMS;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:116:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
116 | M, N, K,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:116:16: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
116 | M, N, K,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_9/b8_s2_streamed_balanced_matmul/base/base.cu:116:19: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
116 | M, N, K,
| ^