6 | __global__ void linear_relu_vector_warp_kernel(const float* x, const float* weight, const float* bias, float* out,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:6:61: note: the first parameter in the range is 'x'
6 | __global__ void linear_relu_vector_warp_kernel(const float* x, const float* weight, const float* bias, float* out,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:6:98: note: the last parameter in the range is 'bias'
6 | __global__ void linear_relu_vector_warp_kernel(const float* x, const float* weight, const float* bias, float* out,
| ^~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:7:48: warning: 2 adjacent parameters of 'linear_relu_vector_warp_kernel' of similar type ('int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
7 | int batch_size, int in_features, int out_features) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:7:52: note: the first parameter in the range is 'batch_size'
7 | int batch_size, int in_features, int out_features) {
| ^~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:7:68: note: the last parameter in the range is 'in_features'
7 | int batch_size, int in_features, int out_features) {
| ^~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:9:17: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
9 | int warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:10:17: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
10 | int lane_id = threadIdx.x % 32;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:19:24: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
19 | const float* x_row = x + row * in_features;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:19:28: note: make conversion explicit to silence this warning
5 |
6 | __global__ void linear_relu_vector_warp_kernel(const float* x, const float* weight, const float* bias, float* out,
7 | int batch_size, int in_features, int out_features) {
8 | // Global warp index determines which output element we compute
9 | int warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
10 | int lane_id = threadIdx.x % 32;
11 |
12 | if (warp_id >= batch_size * out_features) return;
13 |
14 | int row = warp_id / out_features;
15 | int col = warp_id % out_features;
16 |
17 | float sum = 0.0f;
18 |
19 | const float* x_row = x + row * in_features;
| ^~~~~~~~~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:19:28: note: perform multiplication in a wider type
19 | const float* x_row = x + row * in_features;
| ^~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:20:24: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
20 | const float* w_row = weight + col * in_features;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:20:33: note: make conversion explicit to silence this warning
20 | const float* w_row = weight + col * in_features;
| ^~~~~~~~~~~~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:20:33: note: perform multiplication in a wider type
20 | const float* w_row = weight + col * in_features;
| ^~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:47:49: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
47 | torch::Tensor linear_relu_forward(torch::Tensor x, torch::Tensor weight, torch::Tensor bias) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:47:66: warning: the parameter 'weight' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
47 | torch::Tensor linear_relu_forward(torch::Tensor x, torch::Tensor weight, torch::Tensor bias) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:47:88: warning: the parameter 'bias' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
47 | torch::Tensor linear_relu_forward(torch::Tensor x, torch::Tensor weight, torch::Tensor bias) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:52:26: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
52 | const int batch_size = x.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:53:27: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
53 | const int in_features = x.size(1);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250212_optimize_b5_s4_e1_v2/level_2/task_76/b3_s3_vectorized_warp_reduction/base/base.cu:54:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
54 | const int out_features = weight.size(0);
| ^