24 | for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_94/b5_s3_mse_min_sync/edit_1/edit_1.cu:26:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
26 | idx += blockDim.x * gridDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_94/b5_s3_mse_min_sync/edit_1/edit_1.cu:27:31: error: no matching function for call to '__ldcg' [clang-diagnostic-error]
27 | float2 pred_vec = __ldcg(&preds2[idx]); // Cache hint for global loads
| ^~~~~~
/usr/local/cuda/include/cuda_bf16.hpp:1662:35: note: candidate function not viable: no known conversion from 'const float2 *' to 'const __nv_bfloat162 *const' for 1st argument
1662 | __CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr)
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda_bf16.hpp:1668:34: note: candidate function not viable: no known conversion from 'const float2 *' to 'const __nv_bfloat16 *const' for 1st argument
1668 | __CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr)
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda_fp16.hpp:1688:28: note: candidate function not viable: no known conversion from 'const float2 *' to 'const __half2 *const' for 1st argument
1688 | __CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr)
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda_fp16.hpp:1694:27: note: candidate function not viable: no known conversion from 'const float2 *' to 'const __half *const' for 1st argument
1694 | __CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr)
| ^ ~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_94/b5_s3_mse_min_sync/edit_1/edit_1.cu:28:30: error: no matching function for call to '__ldcg' [clang-diagnostic-error]
28 | float2 tgt_vec = __ldcg(&tgts2[idx]);
| ^~~~~~
/usr/local/cuda/include/cuda_bf16.hpp:1662:35: note: candidate function not viable: no known conversion from 'const float2 *' to 'const __nv_bfloat162 *const' for 1st argument
1662 | __CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr)
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda_bf16.hpp:1668:34: note: candidate function not viable: no known conversion from 'const float2 *' to 'const __nv_bfloat16 *const' for 1st argument
1668 | __CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr)
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda_fp16.hpp:1688:28: note: candidate function not viable: no known conversion from 'const float2 *' to 'const __half2 *const' for 1st argument
1688 | __CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr)
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda_fp16.hpp:1694:27: note: candidate function not viable: no known conversion from 'const float2 *' to 'const __half *const' for 1st argument
1694 | __CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr)
| ^ ~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_94/b5_s3_mse_min_sync/edit_1/edit_1.cu:37:28: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
37 | int last_idx = num_elements - 1;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_94/b5_s3_mse_min_sync/edit_1/edit_1.cu:43:24: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
43 | for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_94/b5_s3_mse_min_sync/edit_1/edit_1.cu:45:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
45 | idx += blockDim.x * gridDim.x) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_94/b5_s3_mse_min_sync/edit_1/edit_1.cu:61:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
61 | int warp_id = threadIdx.x / 32;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_94/b5_s3_mse_min_sync/edit_1/edit_1.cu:62:19: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
62 | int lane_id = threadIdx.x % 32;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250208_optimize_b5_s4_e1_sweep/level_1/task_94/b5_s3_mse_min_sync/edit_1/edit_1.cu:98:5: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name]
98 | AT_DISPATCH_FLOATING_TYPES(predictions.scalar_type(), "mse_forward_cuda", ([&] {
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:237:34: note: expanded from macro 'AT_DISPATCH_FLOATING_TYPES'
237 | AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:233:3: note: expanded from macro 'AT_DISPATCH_CASE_FLOATING_TYPES'
233 | AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:74:3: note: expanded from macro 'AT_DISPATCH_CASE'
74 | AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
| ^
note: (skipping 1 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/ATen/Dispatch.h:58:7: note: expanded from macro 'AT_PRIVATE_CHECK_SELECTIVE_BUILD'
58 | AT_ERROR( \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:711:32: note: expanded from macro 'AT_ERROR'
711 | C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \
| ^
/home/robert_sakana_ai/miniconda3/envs/llm2cuda/lib/python3.11/site-packages/torch/include/c10/util/Exception.h:536:9: note: expanded from macro 'TORCH_CHECK'
536 | __func__, \
| ^