24 | __global__ void warp_aligned_gemm_kernel(const float* __restrict__ x,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 | const float* __restrict__ weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:24:68: note: the first parameter in the range is 'x'
24 | __global__ void warp_aligned_gemm_kernel(const float* __restrict__ x,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:25:69: note: the last parameter in the range is 'weight'
25 | const float* __restrict__ weight,
| ^~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:31:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
31 | const int warp_id = threadIdx.x / WARP_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:32:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
32 | const int lane_id = threadIdx.x % WARP_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:34:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
34 | const int row = blockIdx.y * TILE_DIM + warp_id;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:35:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
35 | const int col = blockIdx.x * TILE_DIM + lane_id;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:72:42: warning: 3 adjacent parameters of 'warp_reduce_max_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
72 | int rows, int cols, int reduce_dim) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:72:46: note: the first parameter in the range is 'rows'
72 | int rows, int cols, int reduce_dim) {
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:72:66: note: the last parameter in the range is 'reduce_dim'
72 | int rows, int cols, int reduce_dim) {
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:75:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
75 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:83:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
83 | const int col = blockIdx.x * WARP_SIZE + lane_id;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:93:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
93 | const int row = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:122:47: warning: 2 adjacent parameters of 'warp_fused_mean_gelu_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
122 | int rows, int cols) {
| ^~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:122:51: note: the first parameter in the range is 'rows'
122 | int rows, int cols) {
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:122:61: note: the last parameter in the range is 'cols'
122 | int rows, int cols) {
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:125:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
125 | const int row = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:126:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
126 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:131:44: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
131 | for (int col = tid; col < cols; col += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:150:36: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
150 | warp_sums[0] = total_sum / cols;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:155:44: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
155 | for (int col = tid; col < cols; col += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:162:37: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
162 | torch::Tensor forward(torch::Tensor x, int max_dim, torch::Tensor weight, torch::Tensor bias) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:162:67: warning: the parameter 'weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
162 | torch::Tensor forward(torch::Tensor x, int max_dim, torch::Tensor weight, torch::Tensor bias) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:162:89: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
162 | torch::Tensor forward(torch::Tensor x, int max_dim, torch::Tensor weight, torch::Tensor bias) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:163:23: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
163 | const int batch = x.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:164:29: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
164 | const int in_features = x.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:165:30: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
165 | const int out_features = weight.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:188:15: warning: Value stored to 'rows' during its initialization is never read [clang-analyzer-deadcode.DeadStores]
188 | const int rows = (max_dim == 0) ? batch : 1;
| ^~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:188:15: note: Value stored to 'rows' during its initialization is never read
188 | const int rows = (max_dim == 0) ? batch : 1;
| ^~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:196:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
196 | const int final_rows = max_out.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_2/task_80/b3_s1_warp_aligned_gemm_const_bias/base/base.cu:197:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
197 | const int final_cols = max_out.size(1);
| ^