6 | __global__ void block_tuned_cosine_similarity_loss_kernel(const float* __restrict__ predictions,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
7 | const float* __restrict__ targets,
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:6:85: note: the first parameter in the range is 'predictions'
6 | __global__ void block_tuned_cosine_similarity_loss_kernel(const float* __restrict__ predictions,
| ^~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:7:85: note: the last parameter in the range is 'targets'
7 | const float* __restrict__ targets,
| ^~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:9:59: warning: 2 adjacent parameters of 'block_tuned_cosine_similarity_loss_kernel' of similar type ('const int') are easily swapped by mistake [bugprone-easily-swappable-parameters]
9 | const int N,
| ^~~~~~~~~~~~
10 | const int D) {
| ~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:9:69: note: the first parameter in the range is 'N'
9 | const int N,
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:10:69: note: the last parameter in the range is 'D'
10 | const int D) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:12:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
12 | const int row = blockIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:13:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
13 | const int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:17:29: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
17 | const float* pred_row = predictions + row * D;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:17:43: note: make conversion explicit to silence this warning
4 |
5 | template<int BLOCK_SIZE>
6 | __global__ void block_tuned_cosine_similarity_loss_kernel(const float* __restrict__ predictions,
7 | const float* __restrict__ targets,
8 | float* output,
9 | const int N,
10 | const int D) {
11 | constexpr int WARPS_PER_BLOCK = BLOCK_SIZE / 32;
12 | const int row = blockIdx.x;
13 | const int tid = threadIdx.x;
14 | const int warp_id = tid / warpSize;
15 | const int lane_id = tid % warpSize;
16 |
17 | const float* pred_row = predictions + row * D;
| ^~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:17:43: note: perform multiplication in a wider type
17 | const float* pred_row = predictions + row * D;
| ^~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:18:31: warning: result of multiplication in type 'int' is used as a pointer offset after an implicit widening conversion to type 'ptrdiff_t' [bugprone-implicit-widening-of-multiplication-result]
18 | const float* target_row = targets + row * D;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:18:41: note: make conversion explicit to silence this warning
18 | const float* target_row = targets + row * D;
| ^~~~~~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:18:41: note: perform multiplication in a wider type
18 | const float* target_row = targets + row * D;
| ^~~
| static_cast<ptrdiff_t>( )
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:81:50: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
81 | atomicAdd(output, (1.0f - cos_sim) / N);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:86:72: warning: the parameter 'predictions' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
86 | torch::Tensor block_tuned_cosine_similarity_loss_forward(torch::Tensor predictions, torch::Tensor targets) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:86:99: warning: the parameter 'targets' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
86 | torch::Tensor block_tuned_cosine_similarity_loss_forward(torch::Tensor predictions, torch::Tensor targets) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:93:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
93 | int N = predictions.size(0);
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_97/b6_s3_block_tuned_cosine_loss_base/base/base.cu:94:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
94 | int D = predictions.size(1);
| ^