From 1e7343bec0daaf12f76095a4eb28d0f5ba5d97ec Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 14 Apr 2026 17:04:01 -0700 Subject: [PATCH 1/4] Per-weight constant cache for CUDA backend Replace the old update_constants_from_blob + cross-method sharing with a unified per-weight caching approach. The first method to initialize loads its constants from the blob and caches them by FQN. Subsequent methods with matching FQNs reuse cached GPU tensors via update_user_managed_constant_buffer_pairs, skipping blob loading entirely. This eliminates duplicate GPU weight allocations for multi-method models (e.g., prefill/decode), reducing peak GPU memory from ~35 GB to ~17.6 GB for Qwen 3.5 MoE. Also adds GPU peak memory reporting to the Qwen3.5 MoE runner and a CI check (< 20 GB) in test_model_e2e.sh. --- .ci/scripts/test_model_e2e.sh | 21 ++ backends/aoti/aoti_delegate_handle.h | 20 ++ backends/cuda/runtime/cuda_backend.cpp | 416 ++++++++++++++++--------- examples/models/qwen3_5_moe/main.cpp | 10 + 4 files changed, 315 insertions(+), 152 deletions(-) diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index 5cee37b19cf..3a7fe02af0c 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -397,6 +397,27 @@ if [ -n "$EXPECTED_OUTPUT" ]; then else echo "SUCCESS: Runner completed successfully" fi + +# Validate GPU peak memory usage for models with known memory budgets. +# The runner prints "GPU peak memory usage: XXXX.X MiB" at the end. +case "$MODEL_NAME" in + qwen3_5_moe) + MAX_MEMORY_MIB=20480 # 20 GB — must fit on a single GPU (e.g. 4090) + PEAK_MEM=$(echo "$OUTPUT" | grep -oP 'GPU peak memory usage: \K[0-9.]+' || true) + if [ -n "$PEAK_MEM" ]; then + # Compare as integers (truncate decimals) + PEAK_MEM_INT=${PEAK_MEM%%.*} + if [ "$PEAK_MEM_INT" -gt "$MAX_MEMORY_MIB" ]; then + echo "FAIL: GPU peak memory ${PEAK_MEM} MiB exceeds budget ${MAX_MEMORY_MIB} MiB" + exit 1 + else + echo "Success: GPU peak memory ${PEAK_MEM} MiB within budget (max ${MAX_MEMORY_MIB} MiB)" + fi + else + echo "WARNING: GPU peak memory usage not found in output" + fi + ;; +esac echo "::endgroup::" popd diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h index 2d1a3146ae5..89ad64c6dbb 100644 --- a/backends/aoti/aoti_delegate_handle.h +++ b/backends/aoti/aoti_delegate_handle.h @@ -104,6 +104,23 @@ using AOTInductorModelContainerGetConstantOriginalFQNFunc = size_t idx, const char** original_fqn); +// Retrieves a constant's data size in bytes by index. +using AOTInductorModelContainerGetConstantDataSizeFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + size_t idx, + size_t* data_size); + +// Retrieves whether a constant was produced by constant folding. +using AOTInductorModelContainerGetConstantFromFoldedFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + size_t idx, + bool* from_folded); + +// Retrieves the total size of the constants blob. +using AOTInductorModelContainerGetConstantsBlobSizeFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + uint64_t* ret_size); + // Extracts the constants map from the container (active or inactive buffer). // constant_map_handle should point to a // std::unordered_map. @@ -143,6 +160,9 @@ struct AOTIDelegateHandle { AOTInductorModelContainerGetNumConstantsFunc get_num_constants; AOTInductorModelContainerGetConstantNameFunc get_constant_name; AOTInductorModelContainerGetConstantOriginalFQNFunc get_constant_original_fqn; + AOTInductorModelContainerGetConstantDataSizeFunc get_constant_data_size; + AOTInductorModelContainerGetConstantFromFoldedFunc get_constant_from_folded; + AOTInductorModelContainerGetConstantsBlobSizeFunc get_constants_blob_size; AOTInductorModelContainerExtractConstantsMapFunc extract_constants_map; AOTInductorModelContainerUpdateUserManagedConstantBufferPairsFunc update_user_managed_constant_buffer_pairs; diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index eb0a07b8d8f..d5c3011f126 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -80,7 +80,6 @@ namespace { constexpr char kSkipCopyOutputToCpuForMethod[] = "skip_copy_output_to_cpu_for_method"; constexpr char kUseSharedCudaStream[] = "use_shared_cuda_stream"; -constexpr char kShareKvCacheAcrossMethods[] = "share_kv_cache_across_methods"; } // anonymous namespace class ET_EXPERIMENTAL CudaBackend final @@ -225,6 +224,15 @@ class ET_EXPERIMENTAL CudaBackend final LOAD_OPTIONAL_SYMBOL( get_constant_original_fqn, AOTInductorModelContainerGetConstantOriginalFQN); + LOAD_OPTIONAL_SYMBOL( + get_constant_data_size, + AOTInductorModelContainerGetConstantDataSize); + LOAD_OPTIONAL_SYMBOL( + get_constant_from_folded, + AOTInductorModelContainerGetConstantFromFolded); + LOAD_OPTIONAL_SYMBOL( + get_constants_blob_size, + AOTInductorModelContainerGetConstantsBlobSize); LOAD_OPTIONAL_SYMBOL( extract_constants_map, AOTInductorModelContainerExtractConstantsMap); LOAD_OPTIONAL_SYMBOL( @@ -288,17 +296,11 @@ class ET_EXPERIMENTAL CudaBackend final ArrayRef compile_specs // This will be my empty list ) const override { std::string method_name; - bool share_kv_cache = false; for (const CompileSpec& spec : compile_specs) { if (std::strcmp(spec.key, "method_name") == 0) { method_name.assign( static_cast(spec.value.buffer), spec.value.nbytes); // no nullptr guarantee, so pass size - } else if (std::strcmp(spec.key, kShareKvCacheAcrossMethods) == 0) { - if (spec.value.nbytes >= 1) { - share_kv_cache = - static_cast(spec.value.buffer)[0] != 0; - } } } @@ -369,30 +371,11 @@ class ET_EXPERIMENTAL CudaBackend final handle->container_handle = container_handle; - // Look into named data map for constant data - std::string weights_blob_key = - method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; - auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); - if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) { - ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str()); - const void* weights_blob = buffer_res->data(); - // Feed the weights blob into the container. Under the hood it's copying - // weights, so we should free the buffer immediately. - auto update_err = handle->update_constants_from_blob( - handle->container_handle, static_cast(weights_blob)); - if (update_err != Error::Ok) { - ET_LOG(Error, "update_constants_from_blob failed"); - return update_err; - } - // Ensure all weight transfers are complete before execution - cudaDeviceSynchronize(); - buffer_res->Free(); - } else { - ET_LOG( - Info, - "weights_blob '%s' not found or update fn is null", - weights_blob_key.c_str()); - } + // Load constants with per-weight caching. + // This replaces the old update_constants_from_blob + cross-method sharing + // with a unified approach that avoids duplicate GPU allocations. + ET_CHECK_OK_OR_RETURN_ERROR( + load_constants_with_cache(handle, named_data_map, method_name)); // Use shared CUDA stream if enabled via options, otherwise create one. // A shared stream ensures proper ordering across multiple methods @@ -419,119 +402,6 @@ class ET_EXPERIMENTAL CudaBackend final method_name.c_str()); } - // --------------------------------------------------------------- - // Cross-method constant sharing (e.g., KV cache between prefill/decode). - // - // Only enabled when share_kv_cache_across_methods compile spec is set. - // The first container to initialize extracts its constants (keyed by - // original FQN) and stores the AtenTensorHandle's. Subsequent containers - // with matching FQNs are updated to point to the same GPU tensors via - // UpdateUserManagedConstantBufferPairs (user_managed = true → no copy, - // the source container retains ownership). - // --------------------------------------------------------------- - if (share_kv_cache && handle->get_num_constants && - handle->get_constant_name && handle->get_constant_original_fqn && - handle->extract_constants_map && - handle->update_user_managed_constant_buffer_pairs) { - size_t num_constants = 0; - handle->get_num_constants(handle->container_handle, &num_constants); - - if (num_constants > 0) { - // Build FQN → internal_name mapping for this container. - std::unordered_map fqn_to_name; - for (size_t i = 0; i < num_constants; i++) { - const char* name = nullptr; - const char* fqn = nullptr; - handle->get_constant_name(handle->container_handle, i, &name); - handle->get_constant_original_fqn(handle->container_handle, i, &fqn); - if (name && fqn && fqn[0] != '\0') { - fqn_to_name[fqn] = name; - } - } - - std::lock_guard guard(shared_constants_mutex_); - - if (!constants_extracted_) { - // First container: extract its constants and store by FQN. - std::unordered_map extracted_map; - auto extract_err = handle->extract_constants_map( - handle->container_handle, - reinterpret_cast(&extracted_map), - /*use_inactive=*/false); - - if (extract_err == Error::Ok) { - for (const auto& [fqn, internal_name] : fqn_to_name) { - auto it = extracted_map.find(fqn); - if (it != extracted_map.end()) { - shared_constant_tensors_[fqn] = it->second; - } - } - constants_extracted_ = true; - ET_LOG( - Info, - "Extracted %zu shared constants from method '%s'", - shared_constant_tensors_.size(), - method_name.c_str()); - } else { - ET_LOG( - Error, - "Failed to extract constants from '%s'", - method_name.c_str()); - delete handle; - return Error::Internal; - } - } else { - // Subsequent container: share matching constants from the first. - std::vector pairs; - for (const auto& [fqn, internal_name] : fqn_to_name) { - auto it = shared_constant_tensors_.find(fqn); - if (it != shared_constant_tensors_.end()) { - // UpdateUserManagedConstantBufferPairs matches against the - // codegen constant name (underscored), not the original FQN. - pairs.push_back({internal_name.c_str(), it->second}); - } - } - - if (!pairs.empty()) { - auto update_err = handle->update_user_managed_constant_buffer_pairs( - handle->container_handle, - pairs.data(), - pairs.size(), - /*use_inactive=*/false, - /*validate_full_update=*/false); - - if (update_err == Error::Ok) { - ET_LOG( - Info, - "Shared %zu constants into method '%s'", - pairs.size(), - method_name.c_str()); - } else { - ET_LOG( - Error, - "Failed to share constants into '%s'", - method_name.c_str()); - delete handle; - return Error::Internal; - } - } - } - } - } else if (share_kv_cache) { - ET_LOG( - Error, - "share_kv_cache_across_methods requested but constant sharing APIs " - "not available for method '%s'", - method_name.c_str()); - delete handle; - return Error::Internal; - } else { - ET_LOG( - Info, - "Constant sharing not requested for method '%s'", - method_name.c_str()); - } - return (DelegateHandle*)handle; // Return the handle post-processing } @@ -778,11 +648,256 @@ class ET_EXPERIMENTAL CudaBackend final unordered_map> cached_outputs_; - // Cross-method constant sharing state. - // When multiple AOTI containers share mutable buffers (e.g., KV cache), - // the first container's constants are extracted and stored here. Subsequent - // containers with matching FQNs share the same GPU tensors via - // UpdateUserManagedConstantBufferPairs. + // --------------------------------------------------------------- + // Per-weight constant cache. + // + // Maintains a singleton FQN → AtenTensorHandle cache across methods. + // When loading constants for a method, constants already in the cache + // are reused (zero-copy via update_user_managed_constant_buffer_pairs). + // Only constants not in the cache are loaded from the blob and added + // to the cache. This avoids duplicate GPU allocations when multiple + // methods (e.g., prefill/decode) share the same weights. + // + // allocate_constant_on_gpu() is the allocation primitive — kept as a + // separate function so the strategy can be swapped later (e.g., pool + // allocator, unified memory, sub-allocation from a slab). + // --------------------------------------------------------------- + + // Allocate a single constant from the blob onto GPU and return its + // raw GPU pointer. Caller is responsible for lifetime management. + // Returns nullptr on failure or if data_size is 0. + static void* allocate_constant_on_gpu( + const uint8_t* blob_ptr, + size_t blob_offset, + size_t data_size) { + if (data_size == 0) { + return nullptr; + } + void* gpu_ptr = nullptr; + cudaError_t err = cudaMalloc(&gpu_ptr, data_size); + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMalloc failed for constant (%zu bytes): %s", + data_size, + cudaGetErrorString(err)); + return nullptr; + } + err = cudaMemcpy( + gpu_ptr, blob_ptr + blob_offset, data_size, cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMemcpy failed for constant (%zu bytes): %s", + data_size, + cudaGetErrorString(err)); + cudaFree(gpu_ptr); + return nullptr; + } + return gpu_ptr; + } + + // Load constants for a method using per-weight caching. + // Returns Error::Ok on success. + // + // Flow: + // 1. Enumerate this method's constants and their FQNs. + // 2. For each constant: + // - If FQN is in shared_constant_tensors_ → reuse (cache hit). + // - Otherwise → mark as needing loading (cache miss). + // 3. If all constants are cached → skip blob loading entirely. + // Otherwise → call update_constants_from_blob to load all, then + // extract and cache the new constants. + // 4. For cached constants, call update_user_managed_constant_buffer_pairs + // to point the container to the shared GPU tensors. + Error load_constants_with_cache( + cuda::CudaDelegateHandle* handle, + const NamedDataMap* named_data_map, + const std::string& method_name) const { + // Check if the required APIs are available + if (!handle->get_num_constants || !handle->get_constant_name || + !handle->get_constant_original_fqn || + !handle->extract_constants_map || + !handle->update_user_managed_constant_buffer_pairs) { + // Fall back to the legacy path + return load_constants_legacy(handle, named_data_map, method_name); + } + + // Step 1: Enumerate constants and partition into cached/uncached + size_t num_constants = 0; + handle->get_num_constants(handle->container_handle, &num_constants); + if (num_constants == 0) { + ET_LOG(Info, "No constants for method '%s'", method_name.c_str()); + return Error::Ok; + } + + // Build FQN → internal_name mapping and determine cache hits/misses + std::unordered_map fqn_to_name; + std::vector uncached_fqns; + + { + std::lock_guard guard(shared_constants_mutex_); + for (size_t i = 0; i < num_constants; i++) { + const char* name = nullptr; + const char* fqn = nullptr; + handle->get_constant_name(handle->container_handle, i, &name); + handle->get_constant_original_fqn(handle->container_handle, i, &fqn); + if (name && fqn && fqn[0] != '\0') { + fqn_to_name[fqn] = name; + if (shared_constant_tensors_.find(fqn) == + shared_constant_tensors_.end()) { + uncached_fqns.push_back(fqn); + } + } + } + } + + size_t num_cached = fqn_to_name.size() - uncached_fqns.size(); + ET_LOG( + Info, + "Method '%s': %zu constants, %zu cached, %zu uncached", + method_name.c_str(), + fqn_to_name.size(), + num_cached, + uncached_fqns.size()); + + // Step 2: Load uncached constants from blob (if any) + if (!uncached_fqns.empty()) { + // Need to load from blob — use update_constants_from_blob for all, + // then extract the new constants into the cache. + std::string weights_blob_key = + method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; + auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); + + if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) { + ET_LOG( + Info, + "Loading constants from blob '%s' for method '%s'", + weights_blob_key.c_str(), + method_name.c_str()); + const void* weights_blob = buffer_res->data(); + auto update_err = handle->update_constants_from_blob( + handle->container_handle, + static_cast(weights_blob)); + if (update_err != Error::Ok) { + ET_LOG(Error, "update_constants_from_blob failed"); + return update_err; + } + cudaDeviceSynchronize(); + buffer_res->Free(); + } else { + ET_LOG( + Error, + "weights_blob '%s' not found or update fn is null", + weights_blob_key.c_str()); + return Error::NotFound; + } + + // Extract all constants and cache the newly loaded ones + std::unordered_map extracted_map; + auto extract_err = handle->extract_constants_map( + handle->container_handle, + reinterpret_cast(&extracted_map), + /*use_inactive=*/false); + + if (extract_err == Error::Ok) { + std::lock_guard guard(shared_constants_mutex_); + for (const auto& fqn : uncached_fqns) { + auto it_name = fqn_to_name.find(fqn); + if (it_name == fqn_to_name.end()) continue; + // extract_constants_map returns entries keyed by FQN + auto it = extracted_map.find(fqn); + if (it != extracted_map.end()) { + shared_constant_tensors_[fqn] = it->second; + } + } + ET_LOG( + Info, + "Cached %zu new constants from method '%s' (total cache: %zu)", + uncached_fqns.size(), + method_name.c_str(), + shared_constant_tensors_.size()); + } else { + ET_LOG(Error, "Failed to extract constants from '%s'", method_name.c_str()); + return Error::Internal; + } + } else { + // All constants are cached — skip blob loading entirely! + ET_LOG( + Info, + "All %zu constants cached — skipping blob load for method '%s'", + fqn_to_name.size(), + method_name.c_str()); + } + + // Step 3: Point the container to cached tensors via user_managed pairs + if (num_cached > 0 || uncached_fqns.empty()) { + std::vector pairs; + { + std::lock_guard guard(shared_constants_mutex_); + for (const auto& [fqn, internal_name] : fqn_to_name) { + auto it = shared_constant_tensors_.find(fqn); + if (it != shared_constant_tensors_.end()) { + pairs.push_back({internal_name.c_str(), it->second}); + } + } + } + + if (!pairs.empty()) { + auto update_err = handle->update_user_managed_constant_buffer_pairs( + handle->container_handle, + pairs.data(), + pairs.size(), + /*use_inactive=*/false, + /*validate_full_update=*/false); + + if (update_err != Error::Ok) { + ET_LOG( + Error, + "Failed to set cached constants for method '%s'", + method_name.c_str()); + return Error::Internal; + } + ET_LOG( + Info, + "Shared %zu cached constants into method '%s'", + pairs.size(), + method_name.c_str()); + } + } + + return Error::Ok; + } + + // Legacy constant loading: load the entire blob without caching. + // Used as fallback when constant management APIs are unavailable. + Error load_constants_legacy( + cuda::CudaDelegateHandle* handle, + const NamedDataMap* named_data_map, + const std::string& method_name) const { + std::string weights_blob_key = + method_name.empty() ? "weights_blob" : method_name + "_weights_blob"; + auto buffer_res = named_data_map->get_data(weights_blob_key.c_str()); + if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) { + ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str()); + const void* weights_blob = buffer_res->data(); + auto update_err = handle->update_constants_from_blob( + handle->container_handle, + static_cast(weights_blob)); + if (update_err != Error::Ok) { + ET_LOG(Error, "update_constants_from_blob failed"); + return update_err; + } + cudaDeviceSynchronize(); + buffer_res->Free(); + } else { + ET_LOG( + Info, + "weights_blob '%s' not found or update fn is null", + weights_blob_key.c_str()); + } + return Error::Ok; + } mutable std::mutex shared_constants_mutex_; // FQN → AtenTensorHandle from the source (first) container. @@ -790,9 +905,6 @@ class ET_EXPERIMENTAL CudaBackend final // explicitly deleted — see destroy() comment). mutable std::unordered_map shared_constant_tensors_; - - // Whether we've already extracted constants from a source container. - mutable bool constants_extracted_ = false; }; } // namespace executorch::backends::cuda diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp index 7f4e60596be..968113d7a49 100644 --- a/examples/models/qwen3_5_moe/main.cpp +++ b/examples/models/qwen3_5_moe/main.cpp @@ -249,5 +249,15 @@ int main(int argc, char** argv) { num_generated * 1000.0 / decode_ms); printf("Prompt tokens: %ld\n", num_prompt_tokens); + // Print peak GPU memory usage + { + size_t free_bytes = 0, total_bytes = 0; + if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess) { + double used_mb = + static_cast(total_bytes - free_bytes) / (1024.0 * 1024.0); + printf("GPU peak memory usage: %.1f MiB\n", used_mb); + } + } + return 0; } From 0c0562082f764c3d6c67878d9d018ead2f51b5ff Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 14 Apr 2026 19:25:12 -0700 Subject: [PATCH 2/4] lint --- .../cuda/benchmarks/benchmark_model_decode.py | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 backends/cuda/benchmarks/benchmark_model_decode.py diff --git a/backends/cuda/benchmarks/benchmark_model_decode.py b/backends/cuda/benchmarks/benchmark_model_decode.py new file mode 100644 index 00000000000..51330709000 --- /dev/null +++ b/backends/cuda/benchmarks/benchmark_model_decode.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +"""Model-level decode benchmark for Qwen3.5 MoE split-K SDPA. + +Measures prefill tok/s and decode tok/s across different prompt sizes +and decode lengths to evaluate FlashDecoding++ async softmax impact. +""" + +import json +import sys +import time + +import torch + +# Register Triton kernels before model import +import executorch.backends.cuda.triton.kernels # noqa: F401 + +from executorch.examples.models.qwen3_5_moe.export import load_prequantized_model + + +PROMPT_SIZES = [1, 15, 59, 143, 1694] +DECODE_LENGTHS = [16, 64, 256, 1024] +MODEL_PATH = "/home/gasoonjia/models/qwen35_moe_int4_hqq" +NUM_WARMUP = 2 # warmup runs before timing + + +def _move_to_cuda(model, config): + for fqn, buf in list(model.named_buffers()): + parts = fqn.rsplit(".", 1) + parent = model.get_submodule(parts[0]) if len(parts) > 1 else model + if buf.device.type == "meta": + dtype = torch.bfloat16 if buf.dtype != torch.bool else torch.bool + parent.register_buffer( + parts[-1], torch.zeros(buf.shape, dtype=dtype, device="cuda") + ) + else: + parent.register_buffer(parts[-1], buf.to("cuda")) + + for name, p in model.named_parameters(): + parts = name.rsplit(".", 1) + parent = model.get_submodule(parts[0]) if len(parts) > 1 else model + setattr( + parent, + parts[-1], + torch.nn.Parameter(p.data.to("cuda"), requires_grad=False), + ) + + for layer in model.layers: + if hasattr(layer.attn, "rotary_emb"): + rope = layer.attn.rotary_emb + inv_freq = 1.0 / ( + config.rope_theta + ** ( + torch.arange(0, rope.rotary_dim, 2, dtype=torch.float32) + / rope.rotary_dim + ) + ) + rope.inv_freq = inv_freq.to("cuda") + if hasattr(layer.attn, "mask"): + layer.attn.register_buffer( + "mask", + torch.tril( + torch.ones( + config.max_seq_len, + config.max_seq_len, + dtype=torch.bool, + device="cuda", + ) + ), + ) + + +def _reset_state(model): + """Reset all KV caches, conv_state, and recurrent_state to zero.""" + for layer in model.layers: + attn = layer.attn + if hasattr(attn, "kv_cache"): + attn.kv_cache.k_cache.zero_() + attn.kv_cache.v_cache.zero_() + if hasattr(attn, "conv_state"): + attn.conv_state.zero_() + if hasattr(attn, "recurrent_state"): + attn.recurrent_state.zero_() + + +@torch.inference_mode() +def benchmark_prefill(model, prompt_size): + """Prefill prompt_size tokens one at a time, return tok/s.""" + _reset_state(model) + tokens = torch.randint(0, 1000, (1, 1), device="cuda", dtype=torch.long) + + # Warmup + for i in range(min(prompt_size, NUM_WARMUP)): + pos = torch.tensor([i], device="cuda") + model(tokens, pos) + + _reset_state(model) + torch.cuda.synchronize() + t0 = time.perf_counter() + + for i in range(prompt_size): + pos = torch.tensor([i], device="cuda") + model(tokens, pos) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - t0 + return prompt_size / elapsed if elapsed > 0 else 0.0 + + +@torch.inference_mode() +def benchmark_decode(model, prompt_size, decode_length): + """Prefill prompt_size tokens, then decode decode_length tokens. Return decode tok/s.""" + _reset_state(model) + tokens = torch.randint(0, 1000, (1, 1), device="cuda", dtype=torch.long) + + # Prefill + for i in range(prompt_size): + pos = torch.tensor([i], device="cuda") + logits = model(tokens, pos) + + # Get first decode token + next_token = logits[:, -1:, :].argmax(dim=-1) + + # Warmup decode + # (we skip warmup here to avoid polluting KV cache beyond prompt_size + decode_length) + + torch.cuda.synchronize() + t0 = time.perf_counter() + + for i in range(decode_length): + pos = torch.tensor([prompt_size + i], device="cuda") + logits = model(next_token, pos) + next_token = logits[:, -1:, :].argmax(dim=-1) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - t0 + return decode_length / elapsed if elapsed > 0 else 0.0 + + +def main(): + max_seq = max(PROMPT_SIZES) + max(DECODE_LENGTHS) + 16 + print(f"Loading model from {MODEL_PATH} (max_seq_len={max_seq})...") + model, config = load_prequantized_model(MODEL_PATH, max_seq_len=max_seq) + _move_to_cuda(model, config) + model.eval() + + results = {"prefill": {}, "decode": {}} + + # Prefill benchmark + print("\n=== Prefill Benchmark ===") + print(f"{'Prompt Size':>12} | {'tok/s':>10}") + print("-" * 27) + for ps in PROMPT_SIZES: + tps = benchmark_prefill(model, ps) + results["prefill"][ps] = round(tps, 2) + print(f"{ps:>12} | {tps:>10.2f}") + + # Decode benchmark + print("\n=== Decode Benchmark ===") + header = f"{'Prompt Size':>12}" + for dl in DECODE_LENGTHS: + header += f" | {'dec=' + str(dl):>12}" + print(header) + print("-" * len(header)) + + for ps in PROMPT_SIZES: + row = f"{ps:>12}" + results["decode"][ps] = {} + for dl in DECODE_LENGTHS: + tps = benchmark_decode(model, ps, dl) + results["decode"][ps][dl] = round(tps, 2) + row += f" | {tps:>12.2f}" + print(row) + + # Dump JSON for easy comparison + print("\n--- JSON ---") + print(json.dumps(results, indent=2)) + + +if __name__ == "__main__": + main() From 865f1189f611948cd6e68ebe2a7c43cc10e07041 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 14 Apr 2026 19:25:56 -0700 Subject: [PATCH 3/4] remove unnessary benchmark --- .../cuda/benchmarks/benchmark_model_decode.py | 180 ------------------ 1 file changed, 180 deletions(-) delete mode 100644 backends/cuda/benchmarks/benchmark_model_decode.py diff --git a/backends/cuda/benchmarks/benchmark_model_decode.py b/backends/cuda/benchmarks/benchmark_model_decode.py deleted file mode 100644 index 51330709000..00000000000 --- a/backends/cuda/benchmarks/benchmark_model_decode.py +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python3 -"""Model-level decode benchmark for Qwen3.5 MoE split-K SDPA. - -Measures prefill tok/s and decode tok/s across different prompt sizes -and decode lengths to evaluate FlashDecoding++ async softmax impact. -""" - -import json -import sys -import time - -import torch - -# Register Triton kernels before model import -import executorch.backends.cuda.triton.kernels # noqa: F401 - -from executorch.examples.models.qwen3_5_moe.export import load_prequantized_model - - -PROMPT_SIZES = [1, 15, 59, 143, 1694] -DECODE_LENGTHS = [16, 64, 256, 1024] -MODEL_PATH = "/home/gasoonjia/models/qwen35_moe_int4_hqq" -NUM_WARMUP = 2 # warmup runs before timing - - -def _move_to_cuda(model, config): - for fqn, buf in list(model.named_buffers()): - parts = fqn.rsplit(".", 1) - parent = model.get_submodule(parts[0]) if len(parts) > 1 else model - if buf.device.type == "meta": - dtype = torch.bfloat16 if buf.dtype != torch.bool else torch.bool - parent.register_buffer( - parts[-1], torch.zeros(buf.shape, dtype=dtype, device="cuda") - ) - else: - parent.register_buffer(parts[-1], buf.to("cuda")) - - for name, p in model.named_parameters(): - parts = name.rsplit(".", 1) - parent = model.get_submodule(parts[0]) if len(parts) > 1 else model - setattr( - parent, - parts[-1], - torch.nn.Parameter(p.data.to("cuda"), requires_grad=False), - ) - - for layer in model.layers: - if hasattr(layer.attn, "rotary_emb"): - rope = layer.attn.rotary_emb - inv_freq = 1.0 / ( - config.rope_theta - ** ( - torch.arange(0, rope.rotary_dim, 2, dtype=torch.float32) - / rope.rotary_dim - ) - ) - rope.inv_freq = inv_freq.to("cuda") - if hasattr(layer.attn, "mask"): - layer.attn.register_buffer( - "mask", - torch.tril( - torch.ones( - config.max_seq_len, - config.max_seq_len, - dtype=torch.bool, - device="cuda", - ) - ), - ) - - -def _reset_state(model): - """Reset all KV caches, conv_state, and recurrent_state to zero.""" - for layer in model.layers: - attn = layer.attn - if hasattr(attn, "kv_cache"): - attn.kv_cache.k_cache.zero_() - attn.kv_cache.v_cache.zero_() - if hasattr(attn, "conv_state"): - attn.conv_state.zero_() - if hasattr(attn, "recurrent_state"): - attn.recurrent_state.zero_() - - -@torch.inference_mode() -def benchmark_prefill(model, prompt_size): - """Prefill prompt_size tokens one at a time, return tok/s.""" - _reset_state(model) - tokens = torch.randint(0, 1000, (1, 1), device="cuda", dtype=torch.long) - - # Warmup - for i in range(min(prompt_size, NUM_WARMUP)): - pos = torch.tensor([i], device="cuda") - model(tokens, pos) - - _reset_state(model) - torch.cuda.synchronize() - t0 = time.perf_counter() - - for i in range(prompt_size): - pos = torch.tensor([i], device="cuda") - model(tokens, pos) - - torch.cuda.synchronize() - elapsed = time.perf_counter() - t0 - return prompt_size / elapsed if elapsed > 0 else 0.0 - - -@torch.inference_mode() -def benchmark_decode(model, prompt_size, decode_length): - """Prefill prompt_size tokens, then decode decode_length tokens. Return decode tok/s.""" - _reset_state(model) - tokens = torch.randint(0, 1000, (1, 1), device="cuda", dtype=torch.long) - - # Prefill - for i in range(prompt_size): - pos = torch.tensor([i], device="cuda") - logits = model(tokens, pos) - - # Get first decode token - next_token = logits[:, -1:, :].argmax(dim=-1) - - # Warmup decode - # (we skip warmup here to avoid polluting KV cache beyond prompt_size + decode_length) - - torch.cuda.synchronize() - t0 = time.perf_counter() - - for i in range(decode_length): - pos = torch.tensor([prompt_size + i], device="cuda") - logits = model(next_token, pos) - next_token = logits[:, -1:, :].argmax(dim=-1) - - torch.cuda.synchronize() - elapsed = time.perf_counter() - t0 - return decode_length / elapsed if elapsed > 0 else 0.0 - - -def main(): - max_seq = max(PROMPT_SIZES) + max(DECODE_LENGTHS) + 16 - print(f"Loading model from {MODEL_PATH} (max_seq_len={max_seq})...") - model, config = load_prequantized_model(MODEL_PATH, max_seq_len=max_seq) - _move_to_cuda(model, config) - model.eval() - - results = {"prefill": {}, "decode": {}} - - # Prefill benchmark - print("\n=== Prefill Benchmark ===") - print(f"{'Prompt Size':>12} | {'tok/s':>10}") - print("-" * 27) - for ps in PROMPT_SIZES: - tps = benchmark_prefill(model, ps) - results["prefill"][ps] = round(tps, 2) - print(f"{ps:>12} | {tps:>10.2f}") - - # Decode benchmark - print("\n=== Decode Benchmark ===") - header = f"{'Prompt Size':>12}" - for dl in DECODE_LENGTHS: - header += f" | {'dec=' + str(dl):>12}" - print(header) - print("-" * len(header)) - - for ps in PROMPT_SIZES: - row = f"{ps:>12}" - results["decode"][ps] = {} - for dl in DECODE_LENGTHS: - tps = benchmark_decode(model, ps, dl) - results["decode"][ps][dl] = round(tps, 2) - row += f" | {tps:>12.2f}" - print(row) - - # Dump JSON for easy comparison - print("\n--- JSON ---") - print(json.dumps(results, indent=2)) - - -if __name__ == "__main__": - main() From e249c23f7508ae27397a2e7ede27b1efc913f2ac Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 15 Apr 2026 15:45:19 -0700 Subject: [PATCH 4/4] lint --- backends/cuda/runtime/cuda_backend.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 224042dcc07..0e9c5a32473 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -669,8 +669,7 @@ class ET_EXPERIMENTAL CudaBackend final const std::string& method_name) const { // Check if the required APIs are available if (!handle->get_num_constants || !handle->get_constant_name || - !handle->get_constant_original_fqn || - !handle->extract_constants_map || + !handle->get_constant_original_fqn || !handle->extract_constants_map || !handle->update_user_managed_constant_buffer_pairs) { // Fall back to the legacy path return load_constants_legacy(handle, named_data_map, method_name); @@ -757,7 +756,8 @@ class ET_EXPERIMENTAL CudaBackend final std::lock_guard guard(shared_constants_mutex_); for (const auto& fqn : uncached_fqns) { auto it_name = fqn_to_name.find(fqn); - if (it_name == fqn_to_name.end()) continue; + if (it_name == fqn_to_name.end()) + continue; // extract_constants_map returns entries keyed by FQN auto it = extracted_map.find(fqn); if (it != extracted_map.end()) { @@ -771,7 +771,10 @@ class ET_EXPERIMENTAL CudaBackend final method_name.c_str(), shared_constant_tensors_.size()); } else { - ET_LOG(Error, "Failed to extract constants from '%s'", method_name.c_str()); + ET_LOG( + Error, + "Failed to extract constants from '%s'", + method_name.c_str()); return Error::Internal; } } else { @@ -835,8 +838,7 @@ class ET_EXPERIMENTAL CudaBackend final ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str()); const void* weights_blob = buffer_res->data(); auto update_err = handle->update_constants_from_blob( - handle->container_handle, - static_cast(weights_blob)); + handle->container_handle, static_cast(weights_blob)); if (update_err != Error::Ok) { ET_LOG(Error, "update_constants_from_blob failed"); return update_err;