Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions internal/cuda/kernels/fused_encoder_bwd.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* kernel_gelu_bwd GELU derivative * upstream gradient
* kernel_softmax_bwd Softmax backward (Jacobian-vector product)
* kernel_bias_grad_reduce Sum rows to compute bias gradients
* kernel_add_elementwise Element-wise addition for residual gradients
* kernel_enc_bwd_add Element-wise addition for residual gradients (out[i] = a[i] + b[i])
* kernel_matmul_grad_accum Accumulate weight gradient: dW += A^T @ B
*
* cuBLAS calls (~14 total per layer):
Expand Down Expand Up @@ -313,7 +313,7 @@ __global__ void kernel_bias_grad_reduce(
/* out[i] = a[i] + b[i] */
/* ------------------------------------------------------------------ */

__global__ void kernel_add(
__global__ void kernel_enc_bwd_add(
const float* __restrict__ a,
const float* __restrict__ b,
float* __restrict__ out,
Expand All @@ -330,7 +330,7 @@ __global__ void kernel_add(
/* out[i] = a[i] + b[i] + c[i] */
/* ------------------------------------------------------------------ */

__global__ void kernel_add3(
__global__ void kernel_enc_bwd_add3(
const float* __restrict__ a,
const float* __restrict__ b,
const float* __restrict__ c,
Expand Down Expand Up @@ -558,7 +558,7 @@ cudaError_t fused_encoder_bwd_f32(
dXRes1, dg_norm2W, dg_norm2B, dModel);

/* Add residual skip: dXRes1 += dOutput */
kernel_add<<<elemGridTrDm, block256, 0, stream>>>(
kernel_enc_bwd_add<<<elemGridTrDm, block256, 0, stream>>>(
dXRes1, dOutput, dXRes1, trDm);

/* ------------------------------------------------------------ */
Expand Down Expand Up @@ -722,7 +722,7 @@ cudaError_t fused_encoder_bwd_f32(
temp, dg_norm1W, dg_norm1B, dModel);

/* dInput = temp + dXRes1 */
kernel_add<<<elemGridTrDm, block256, 0, stream>>>(
kernel_enc_bwd_add<<<elemGridTrDm, block256, 0, stream>>>(
temp, dXRes1, dInput, trDm);

return cudaGetLastError();
Expand Down
Loading