12 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:13:41: warning: macro argument should be enclosed in parentheses [bugprone-macro-parentheses]
13 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
| ^
| ()
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:69: warning: 2 adjacent parameters of 'elu_kernel_vec_shared' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {
| ^~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:75: note: the first parameter in the range is 'alpha'
18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:86: note: the last parameter in the range is 'n4'
18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {
| ^~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:18:82: note: 'float' and 'int' may be implicitly converted
18 | __global__ void elu_kernel_vec_shared(const float4* x, float4* out, float alpha, int n4) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:20:15: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
20 | int tid = threadIdx.x;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:21:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
21 | int globalIdx = blockIdx.x * blockDim.x + tid;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:61: warning: 3 adjacent parameters of 'elu_kernel_tail' of convertible types are easily swapped by mistake [bugprone-easily-swappable-parameters]
48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:67: note: the first parameter in the range is 'alpha'
48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {
| ^~~~~
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:90: note: the last parameter in the range is 'n'
48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:48:74: note: 'float' and 'int' may be implicitly converted
48 | __global__ void elu_kernel_tail(const float* x, float* out, float alpha, int offset, int n) {
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:49:21: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
49 | int globalIdx = blockIdx.x * blockDim.x + threadIdx.x + offset;
| ^
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:58:47: warning: the parameter 'x' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
58 | torch::Tensor elu_cuda_combined(torch::Tensor x, float alpha) {
| ^
| const &
/home/robert_sakana_ai/llm_cuda/experiments/20250202_optimize_b10_s4_e0_sweep/level_1/task_31/b4_s1_vec_shared_elu/base/base.cu:62:13: warning: narrowing conversion from 'int64_t' (aka 'long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
62 | int n = x.numel();
| ^