31 | __global__ void tiled_gemm_kernel(const float* __restrict__ x,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
32 | const float* __restrict__ weight,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
33 | const float* __restrict__ bias,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:31:61: note: the first parameter in the range is 'x'
31 | __global__ void tiled_gemm_kernel(const float* __restrict__ x,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:33:62: note: the last parameter in the range is 'bias'
33 | const float* __restrict__ bias,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:39:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
39 | int row = blockIdx.y * TILE_DIM + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:40:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
40 | int col = blockIdx.x * TILE_DIM + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:44:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
44 | int idx = t * TILE_DIM + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:45:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
45 | int idy = t * TILE_DIM + threadIdx.y;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:66:48: warning: 3 adjacent parameters of 'fused_warp_reduce_gelu_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
66 | int batch,
| ^~~~~~~~~~
67 | int out_features,
| ~~~~~~~~~~~~~~~~~
68 | int max_dim) {
| ~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:66:52: note: the first parameter in the range is 'batch'
66 | int batch,
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:68:52: note: the last parameter in the range is 'max_dim'
68 | int max_dim) {
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:70:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
70 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:73:33: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
73 | const int warps_per_block = blockDim.x / WARP_SIZE;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:80:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
80 | const int col = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:83:43: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
83 | for (int i = tid; i < batch; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:112:43: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
112 | for (int i = tid; i < batch; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:128:39: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
128 | smem[0] = block_sum / batch;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:136:43: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
136 | for (int i = tid; i < batch; i += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:143:25: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
143 | const int row = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:145:50: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
145 | for (int j = tid; j < out_features; j += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:167:50: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
167 | for (int j = tid; j < out_features; j += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:183:39: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
183 | smem[0] = block_sum / out_features;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:190:50: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
190 | for (int j = tid; j < out_features; j += blockDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:198:37: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
198 | torch::Tensor forward(torch::Tensor x, int max_dim, torch::Tensor weight, torch::Tensor bias) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:198:67: warning: the parameter 'weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
198 | torch::Tensor forward(torch::Tensor x, int max_dim, torch::Tensor weight, torch::Tensor bias) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:198:89: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
198 | torch::Tensor forward(torch::Tensor x, int max_dim, torch::Tensor weight, torch::Tensor bias) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:199:17: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
199 | int batch = x.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:200:23: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
200 | int in_features = x.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:201:24: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
201 | int out_features = weight.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250203_optimize_b10_s4_e0_sweep/level_2/task_80/b7_s1_warp_optimized_reduction_base/base/base.cu:221:25: warning: narrowing conversion from 'unsigned long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
221 | int sharedMemSize = (threads / WARP_SIZE) * sizeof(float);
| ^