From 91357ddcc577c4449f4cc93c65564f1df3a481f3 Mon Sep 17 00:00:00 2001
From: Developer <user@example.com>
Date: Fri, 22 May 2026 14:18:48 -0500
Subject: [PATCH] fix(cuda): increase argmax topk kernel limit from 32 to 64

The custom topk_f32 CUDA kernel had a hardcoded K <= 32 limit from
fixed-size heap arrays. The Gemma 4 GGUF bakes top_k=64 into its
metadata (general.sampling.top_k), which the DFlash reduced verifier
passes through to the target model's verification logits as K=64.

On CUDA 12.x / pre-CCCL 3.2 builds, the CUB TopK fallback is
unavailable, so K=64 hits the custom path and crashes:

  argmax.cu:557: GGML_ASSERT(K <= 32) failed

Increase the register arrays and assertion to K <= 64. Shared memory
usage stays well within limits (16 KB at K=64, 32 warps). Bump the
CUB auto-threshold to K > 64 so the custom path covers the full
top_k=64 range.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 ggml/src/ggml-cuda/argmax.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu
index cf904605132..ca0f960f46d 100644
--- a/ggml/src/ggml-cuda/argmax.cu
+++ b/ggml/src/ggml-cuda/argmax.cu
@@ -237,8 +237,8 @@ static __global__ void topk_f32(
 
     // Per-thread top-K heap (min-heap: smallest score at index 0)
     // Max K=32, stored in registers
-    float  heap_val[32];
-    int32_t heap_idx[32];
+    float  heap_val[64];
+    int32_t heap_idx[64];
     for (int i = 0; i < K; i++) {
         heap_val[i] = -FLT_MAX;
         heap_idx[i] = -1;
@@ -527,7 +527,7 @@ void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int cub_mode = ggml_cuda_dflash_cub_topk_mode();
     const bool use_cub_topk =
         K > 1 && !output_logprob && seed == 0 &&
-        (K > 32 || cub_mode == 1 || (cub_mode < 0 && nrows > 32));
+        (K > 64 || cub_mode == 1 || (cub_mode < 0 && nrows > 32));
     if (ggml_cuda_dflash_argmax_profile_enabled() && K > 1) {
         GGML_LOG_INFO("%s: dflash argmax profile path=%s K=%d nrows=%" PRId64
                 " vocab=%" PRId64 " temp=%.3f seed=%" PRIu64 " output_logprob=%d cub_mode=%d\n",
@@ -554,7 +554,7 @@ void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     if (K == 1) {
         argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00, nrows, inv_temp, seed, output_logprob);
     } else {
-        GGML_ASSERT(K <= 32);
+        GGML_ASSERT(K <= 64);
         // Shared memory: K * n_warps floats + K * n_warps ints + 2 * n_warps floats (softmax)
         const int n_warps = (int)(num_threads / WARP_SIZE);
         const size_t smem_size = K * n_warps * (sizeof(float) + sizeof(int32_t)) + 2 * n_warps * sizeof(float);