From 91357ddcc577c4449f4cc93c65564f1df3a481f3 Mon Sep 17 00:00:00 2001 From: Developer Date: Fri, 22 May 2026 14:18:48 -0500 Subject: [PATCH] fix(cuda): increase argmax topk kernel limit from 32 to 64 The custom topk_f32 CUDA kernel had a hardcoded K <= 32 limit from fixed-size heap arrays. The Gemma 4 GGUF bakes top_k=64 into its metadata (general.sampling.top_k), which the DFlash reduced verifier passes through to the target model's verification logits as K=64. On CUDA 12.x / pre-CCCL 3.2 builds, the CUB TopK fallback is unavailable, so K=64 hits the custom path and crashes: argmax.cu:557: GGML_ASSERT(K <= 32) failed Increase the register arrays and assertion to K <= 64. Shared memory usage stays well within limits (16 KB at K=64, 32 warps). Bump the CUB auto-threshold to K > 64 so the custom path covers the full top_k=64 range. Co-Authored-By: Claude Opus 4.7 --- ggml/src/ggml-cuda/argmax.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu index cf904605132..ca0f960f46d 100644 --- a/ggml/src/ggml-cuda/argmax.cu +++ b/ggml/src/ggml-cuda/argmax.cu @@ -237,8 +237,8 @@ static __global__ void topk_f32( // Per-thread top-K heap (min-heap: smallest score at index 0) // Max K=32, stored in registers - float heap_val[32]; - int32_t heap_idx[32]; + float heap_val[64]; + int32_t heap_idx[64]; for (int i = 0; i < K; i++) { heap_val[i] = -FLT_MAX; heap_idx[i] = -1; @@ -527,7 +527,7 @@ void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int cub_mode = ggml_cuda_dflash_cub_topk_mode(); const bool use_cub_topk = K > 1 && !output_logprob && seed == 0 && - (K > 32 || cub_mode == 1 || (cub_mode < 0 && nrows > 32)); + (K > 64 || cub_mode == 1 || (cub_mode < 0 && nrows > 32)); if (ggml_cuda_dflash_argmax_profile_enabled() && K > 1) { GGML_LOG_INFO("%s: dflash argmax profile path=%s K=%d nrows=%" PRId64 " vocab=%" PRId64 " temp=%.3f seed=%" PRIu64 " output_logprob=%d cub_mode=%d\n", @@ -554,7 +554,7 @@ void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { if (K == 1) { argmax_f32<<>>(src0_d, dst_d, ne00, nrows, inv_temp, seed, output_logprob); } else { - GGML_ASSERT(K <= 32); + GGML_ASSERT(K <= 64); // Shared memory: K * n_warps floats + K * n_warps ints + 2 * n_warps floats (softmax) const int n_warps = (int)(num_threads / WARP_SIZE); const size_t smem_size = K * n_warps * (sizeof(float) + sizeof(int32_t)) + 2 * n_warps * sizeof(float);