From 1e7343bec0daaf12f76095a4eb28d0f5ba5d97ec Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@fb.com>
Date: Tue, 14 Apr 2026 17:04:01 -0700
Subject: [PATCH 1/4] Per-weight constant cache for CUDA backend

Replace the old update_constants_from_blob + cross-method sharing with a
unified per-weight caching approach. The first method to initialize loads
its constants from the blob and caches them by FQN. Subsequent methods
with matching FQNs reuse cached GPU tensors via
update_user_managed_constant_buffer_pairs, skipping blob loading entirely.

This eliminates duplicate GPU weight allocations for multi-method models
(e.g., prefill/decode), reducing peak GPU memory from ~35 GB to ~17.6 GB
for Qwen 3.5 MoE.

Also adds GPU peak memory reporting to the Qwen3.5 MoE runner and a
CI check (< 20 GB) in test_model_e2e.sh.
---
 .ci/scripts/test_model_e2e.sh          |  21 ++
 backends/aoti/aoti_delegate_handle.h   |  20 ++
 backends/cuda/runtime/cuda_backend.cpp | 416 ++++++++++++++++---------
 examples/models/qwen3_5_moe/main.cpp   |  10 +
 4 files changed, 315 insertions(+), 152 deletions(-)

diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
index 5cee37b19cf..3a7fe02af0c 100755
--- a/.ci/scripts/test_model_e2e.sh
+++ b/.ci/scripts/test_model_e2e.sh
@@ -397,6 +397,27 @@ if [ -n "$EXPECTED_OUTPUT" ]; then
 else
   echo "SUCCESS: Runner completed successfully"
 fi
+
+# Validate GPU peak memory usage for models with known memory budgets.
+# The runner prints "GPU peak memory usage: XXXX.X MiB" at the end.
+case "$MODEL_NAME" in
+  qwen3_5_moe)
+    MAX_MEMORY_MIB=20480  # 20 GB — must fit on a single GPU (e.g. 4090)
+    PEAK_MEM=$(echo "$OUTPUT" | grep -oP 'GPU peak memory usage: \K[0-9.]+' || true)
+    if [ -n "$PEAK_MEM" ]; then
+      # Compare as integers (truncate decimals)
+      PEAK_MEM_INT=${PEAK_MEM%%.*}
+      if [ "$PEAK_MEM_INT" -gt "$MAX_MEMORY_MIB" ]; then
+        echo "FAIL: GPU peak memory ${PEAK_MEM} MiB exceeds budget ${MAX_MEMORY_MIB} MiB"
+        exit 1
+      else
+        echo "Success: GPU peak memory ${PEAK_MEM} MiB within budget (max ${MAX_MEMORY_MIB} MiB)"
+      fi
+    else
+      echo "WARNING: GPU peak memory usage not found in output"
+    fi
+    ;;
+esac
 echo "::endgroup::"
 
 popd
diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h
index 2d1a3146ae5..89ad64c6dbb 100644
--- a/backends/aoti/aoti_delegate_handle.h
+++ b/backends/aoti/aoti_delegate_handle.h
@@ -104,6 +104,23 @@ using AOTInductorModelContainerGetConstantOriginalFQNFunc =
         size_t idx,
         const char** original_fqn);
 
+// Retrieves a constant's data size in bytes by index.
+using AOTInductorModelContainerGetConstantDataSizeFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t idx,
+    size_t* data_size);
+
+// Retrieves whether a constant was produced by constant folding.
+using AOTInductorModelContainerGetConstantFromFoldedFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t idx,
+    bool* from_folded);
+
+// Retrieves the total size of the constants blob.
+using AOTInductorModelContainerGetConstantsBlobSizeFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    uint64_t* ret_size);
+
 // Extracts the constants map from the container (active or inactive buffer).
 // constant_map_handle should point to a
 // std::unordered_map<std::string, AtenTensorHandle>.
@@ -143,6 +160,9 @@ struct AOTIDelegateHandle {
   AOTInductorModelContainerGetNumConstantsFunc get_num_constants;
   AOTInductorModelContainerGetConstantNameFunc get_constant_name;
   AOTInductorModelContainerGetConstantOriginalFQNFunc get_constant_original_fqn;
+  AOTInductorModelContainerGetConstantDataSizeFunc get_constant_data_size;
+  AOTInductorModelContainerGetConstantFromFoldedFunc get_constant_from_folded;
+  AOTInductorModelContainerGetConstantsBlobSizeFunc get_constants_blob_size;
   AOTInductorModelContainerExtractConstantsMapFunc extract_constants_map;
   AOTInductorModelContainerUpdateUserManagedConstantBufferPairsFunc
       update_user_managed_constant_buffer_pairs;
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index eb0a07b8d8f..d5c3011f126 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -80,7 +80,6 @@ namespace {
 constexpr char kSkipCopyOutputToCpuForMethod[] =
     "skip_copy_output_to_cpu_for_method";
 constexpr char kUseSharedCudaStream[] = "use_shared_cuda_stream";
-constexpr char kShareKvCacheAcrossMethods[] = "share_kv_cache_across_methods";
 } // anonymous namespace
 
 class ET_EXPERIMENTAL CudaBackend final
@@ -225,6 +224,15 @@ class ET_EXPERIMENTAL CudaBackend final
     LOAD_OPTIONAL_SYMBOL(
         get_constant_original_fqn,
         AOTInductorModelContainerGetConstantOriginalFQN);
+    LOAD_OPTIONAL_SYMBOL(
+        get_constant_data_size,
+        AOTInductorModelContainerGetConstantDataSize);
+    LOAD_OPTIONAL_SYMBOL(
+        get_constant_from_folded,
+        AOTInductorModelContainerGetConstantFromFolded);
+    LOAD_OPTIONAL_SYMBOL(
+        get_constants_blob_size,
+        AOTInductorModelContainerGetConstantsBlobSize);
     LOAD_OPTIONAL_SYMBOL(
         extract_constants_map, AOTInductorModelContainerExtractConstantsMap);
     LOAD_OPTIONAL_SYMBOL(
@@ -288,17 +296,11 @@ class ET_EXPERIMENTAL CudaBackend final
       ArrayRef<CompileSpec> compile_specs // This will be my empty list
   ) const override {
     std::string method_name;
-    bool share_kv_cache = false;
     for (const CompileSpec& spec : compile_specs) {
       if (std::strcmp(spec.key, "method_name") == 0) {
         method_name.assign(
             static_cast<const char*>(spec.value.buffer),
             spec.value.nbytes); // no nullptr guarantee, so pass size
-      } else if (std::strcmp(spec.key, kShareKvCacheAcrossMethods) == 0) {
-        if (spec.value.nbytes >= 1) {
-          share_kv_cache =
-              static_cast<const uint8_t*>(spec.value.buffer)[0] != 0;
-        }
       }
     }
 
@@ -369,30 +371,11 @@ class ET_EXPERIMENTAL CudaBackend final
 
     handle->container_handle = container_handle;
 
-    // Look into named data map for constant data
-    std::string weights_blob_key =
-        method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
-    auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
-    if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
-      ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
-      const void* weights_blob = buffer_res->data();
-      // Feed the weights blob into the container. Under the hood it's copying
-      // weights, so we should free the buffer immediately.
-      auto update_err = handle->update_constants_from_blob(
-          handle->container_handle, static_cast<const uint8_t*>(weights_blob));
-      if (update_err != Error::Ok) {
-        ET_LOG(Error, "update_constants_from_blob failed");
-        return update_err;
-      }
-      // Ensure all weight transfers are complete before execution
-      cudaDeviceSynchronize();
-      buffer_res->Free();
-    } else {
-      ET_LOG(
-          Info,
-          "weights_blob '%s' not found or update fn is null",
-          weights_blob_key.c_str());
-    }
+    // Load constants with per-weight caching.
+    // This replaces the old update_constants_from_blob + cross-method sharing
+    // with a unified approach that avoids duplicate GPU allocations.
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        load_constants_with_cache(handle, named_data_map, method_name));
 
     // Use shared CUDA stream if enabled via options, otherwise create one.
     // A shared stream ensures proper ordering across multiple methods
@@ -419,119 +402,6 @@ class ET_EXPERIMENTAL CudaBackend final
           method_name.c_str());
     }
 
-    // ---------------------------------------------------------------
-    // Cross-method constant sharing (e.g., KV cache between prefill/decode).
-    //
-    // Only enabled when share_kv_cache_across_methods compile spec is set.
-    // The first container to initialize extracts its constants (keyed by
-    // original FQN) and stores the AtenTensorHandle's. Subsequent containers
-    // with matching FQNs are updated to point to the same GPU tensors via
-    // UpdateUserManagedConstantBufferPairs (user_managed = true → no copy,
-    // the source container retains ownership).
-    // ---------------------------------------------------------------
-    if (share_kv_cache && handle->get_num_constants &&
-        handle->get_constant_name && handle->get_constant_original_fqn &&
-        handle->extract_constants_map &&
-        handle->update_user_managed_constant_buffer_pairs) {
-      size_t num_constants = 0;
-      handle->get_num_constants(handle->container_handle, &num_constants);
-
-      if (num_constants > 0) {
-        // Build FQN → internal_name mapping for this container.
-        std::unordered_map<std::string, std::string> fqn_to_name;
-        for (size_t i = 0; i < num_constants; i++) {
-          const char* name = nullptr;
-          const char* fqn = nullptr;
-          handle->get_constant_name(handle->container_handle, i, &name);
-          handle->get_constant_original_fqn(handle->container_handle, i, &fqn);
-          if (name && fqn && fqn[0] != '\0') {
-            fqn_to_name[fqn] = name;
-          }
-        }
-
-        std::lock_guard<std::mutex> guard(shared_constants_mutex_);
-
-        if (!constants_extracted_) {
-          // First container: extract its constants and store by FQN.
-          std::unordered_map<std::string, AtenTensorHandle> extracted_map;
-          auto extract_err = handle->extract_constants_map(
-              handle->container_handle,
-              reinterpret_cast<AOTInductorConstantMapHandle>(&extracted_map),
-              /*use_inactive=*/false);
-
-          if (extract_err == Error::Ok) {
-            for (const auto& [fqn, internal_name] : fqn_to_name) {
-              auto it = extracted_map.find(fqn);
-              if (it != extracted_map.end()) {
-                shared_constant_tensors_[fqn] = it->second;
-              }
-            }
-            constants_extracted_ = true;
-            ET_LOG(
-                Info,
-                "Extracted %zu shared constants from method '%s'",
-                shared_constant_tensors_.size(),
-                method_name.c_str());
-          } else {
-            ET_LOG(
-                Error,
-                "Failed to extract constants from '%s'",
-                method_name.c_str());
-            delete handle;
-            return Error::Internal;
-          }
-        } else {
-          // Subsequent container: share matching constants from the first.
-          std::vector<AOTInductorConstantMapEntry> pairs;
-          for (const auto& [fqn, internal_name] : fqn_to_name) {
-            auto it = shared_constant_tensors_.find(fqn);
-            if (it != shared_constant_tensors_.end()) {
-              // UpdateUserManagedConstantBufferPairs matches against the
-              // codegen constant name (underscored), not the original FQN.
-              pairs.push_back({internal_name.c_str(), it->second});
-            }
-          }
-
-          if (!pairs.empty()) {
-            auto update_err = handle->update_user_managed_constant_buffer_pairs(
-                handle->container_handle,
-                pairs.data(),
-                pairs.size(),
-                /*use_inactive=*/false,
-                /*validate_full_update=*/false);
-
-            if (update_err == Error::Ok) {
-              ET_LOG(
-                  Info,
-                  "Shared %zu constants into method '%s'",
-                  pairs.size(),
-                  method_name.c_str());
-            } else {
-              ET_LOG(
-                  Error,
-                  "Failed to share constants into '%s'",
-                  method_name.c_str());
-              delete handle;
-              return Error::Internal;
-            }
-          }
-        }
-      }
-    } else if (share_kv_cache) {
-      ET_LOG(
-          Error,
-          "share_kv_cache_across_methods requested but constant sharing APIs "
-          "not available for method '%s'",
-          method_name.c_str());
-      delete handle;
-      return Error::Internal;
-    } else {
-      ET_LOG(
-          Info,
-          "Constant sharing not requested for method '%s'",
-          method_name.c_str());
-    }
-
     return (DelegateHandle*)handle; // Return the handle post-processing
   }
 
@@ -778,11 +648,256 @@ class ET_EXPERIMENTAL CudaBackend final
       unordered_map<cuda::CudaDelegateHandle*, std::vector<SlimTensor*>>
           cached_outputs_;
 
-  // Cross-method constant sharing state.
-  // When multiple AOTI containers share mutable buffers (e.g., KV cache),
-  // the first container's constants are extracted and stored here. Subsequent
-  // containers with matching FQNs share the same GPU tensors via
-  // UpdateUserManagedConstantBufferPairs.
+  // ---------------------------------------------------------------
+  // Per-weight constant cache.
+  //
+  // Maintains a singleton FQN → AtenTensorHandle cache across methods.
+  // When loading constants for a method, constants already in the cache
+  // are reused (zero-copy via update_user_managed_constant_buffer_pairs).
+  // Only constants not in the cache are loaded from the blob and added
+  // to the cache. This avoids duplicate GPU allocations when multiple
+  // methods (e.g., prefill/decode) share the same weights.
+  //
+  // allocate_constant_on_gpu() is the allocation primitive — kept as a
+  // separate function so the strategy can be swapped later (e.g., pool
+  // allocator, unified memory, sub-allocation from a slab).
+  // ---------------------------------------------------------------
+
+  // Allocate a single constant from the blob onto GPU and return its
+  // raw GPU pointer. Caller is responsible for lifetime management.
+  // Returns nullptr on failure or if data_size is 0.
+  static void* allocate_constant_on_gpu(
+      const uint8_t* blob_ptr,
+      size_t blob_offset,
+      size_t data_size) {
+    if (data_size == 0) {
+      return nullptr;
+    }
+    void* gpu_ptr = nullptr;
+    cudaError_t err = cudaMalloc(&gpu_ptr, data_size);
+    if (err != cudaSuccess) {
+      ET_LOG(
+          Error,
+          "cudaMalloc failed for constant (%zu bytes): %s",
+          data_size,
+          cudaGetErrorString(err));
+      return nullptr;
+    }
+    err = cudaMemcpy(
+        gpu_ptr, blob_ptr + blob_offset, data_size, cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) {
+      ET_LOG(
+          Error,
+          "cudaMemcpy failed for constant (%zu bytes): %s",
+          data_size,
+          cudaGetErrorString(err));
+      cudaFree(gpu_ptr);
+      return nullptr;
+    }
+    return gpu_ptr;
+  }
+
+  // Load constants for a method using per-weight caching.
+  // Returns Error::Ok on success.
+  //
+  // Flow:
+  //   1. Enumerate this method's constants and their FQNs.
+  //   2. For each constant:
+  //      - If FQN is in shared_constant_tensors_ → reuse (cache hit).
+  //      - Otherwise → mark as needing loading (cache miss).
+  //   3. If all constants are cached → skip blob loading entirely.
+  //      Otherwise → call update_constants_from_blob to load all, then
+  //      extract and cache the new constants.
+  //   4. For cached constants, call update_user_managed_constant_buffer_pairs
+  //      to point the container to the shared GPU tensors.
+  Error load_constants_with_cache(
+      cuda::CudaDelegateHandle* handle,
+      const NamedDataMap* named_data_map,
+      const std::string& method_name) const {
+    // Check if the required APIs are available
+    if (!handle->get_num_constants || !handle->get_constant_name ||
+        !handle->get_constant_original_fqn ||
+        !handle->extract_constants_map ||
+        !handle->update_user_managed_constant_buffer_pairs) {
+      // Fall back to the legacy path
+      return load_constants_legacy(handle, named_data_map, method_name);
+    }
+
+    // Step 1: Enumerate constants and partition into cached/uncached
+    size_t num_constants = 0;
+    handle->get_num_constants(handle->container_handle, &num_constants);
+    if (num_constants == 0) {
+      ET_LOG(Info, "No constants for method '%s'", method_name.c_str());
+      return Error::Ok;
+    }
+
+    // Build FQN → internal_name mapping and determine cache hits/misses
+    std::unordered_map<std::string, std::string> fqn_to_name;
+    std::vector<std::string> uncached_fqns;
+
+    {
+      std::lock_guard<std::mutex> guard(shared_constants_mutex_);
+      for (size_t i = 0; i < num_constants; i++) {
+        const char* name = nullptr;
+        const char* fqn = nullptr;
+        handle->get_constant_name(handle->container_handle, i, &name);
+        handle->get_constant_original_fqn(handle->container_handle, i, &fqn);
+        if (name && fqn && fqn[0] != '\0') {
+          fqn_to_name[fqn] = name;
+          if (shared_constant_tensors_.find(fqn) ==
+              shared_constant_tensors_.end()) {
+            uncached_fqns.push_back(fqn);
+          }
+        }
+      }
+    }
+
+    size_t num_cached = fqn_to_name.size() - uncached_fqns.size();
+    ET_LOG(
+        Info,
+        "Method '%s': %zu constants, %zu cached, %zu uncached",
+        method_name.c_str(),
+        fqn_to_name.size(),
+        num_cached,
+        uncached_fqns.size());
+
+    // Step 2: Load uncached constants from blob (if any)
+    if (!uncached_fqns.empty()) {
+      // Need to load from blob — use update_constants_from_blob for all,
+      // then extract the new constants into the cache.
+      std::string weights_blob_key =
+          method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
+      auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
+
+      if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
+        ET_LOG(
+            Info,
+            "Loading constants from blob '%s' for method '%s'",
+            weights_blob_key.c_str(),
+            method_name.c_str());
+        const void* weights_blob = buffer_res->data();
+        auto update_err = handle->update_constants_from_blob(
+            handle->container_handle,
+            static_cast<const uint8_t*>(weights_blob));
+        if (update_err != Error::Ok) {
+          ET_LOG(Error, "update_constants_from_blob failed");
+          return update_err;
+        }
+        cudaDeviceSynchronize();
+        buffer_res->Free();
+      } else {
+        ET_LOG(
+            Error,
+            "weights_blob '%s' not found or update fn is null",
+            weights_blob_key.c_str());
+        return Error::NotFound;
+      }
+
+      // Extract all constants and cache the newly loaded ones
+      std::unordered_map<std::string, AtenTensorHandle> extracted_map;
+      auto extract_err = handle->extract_constants_map(
+          handle->container_handle,
+          reinterpret_cast<AOTInductorConstantMapHandle>(&extracted_map),
+          /*use_inactive=*/false);
+
+      if (extract_err == Error::Ok) {
+        std::lock_guard<std::mutex> guard(shared_constants_mutex_);
+        for (const auto& fqn : uncached_fqns) {
+          auto it_name = fqn_to_name.find(fqn);
+          if (it_name == fqn_to_name.end()) continue;
+          // extract_constants_map returns entries keyed by FQN
+          auto it = extracted_map.find(fqn);
+          if (it != extracted_map.end()) {
+            shared_constant_tensors_[fqn] = it->second;
+          }
+        }
+        ET_LOG(
+            Info,
+            "Cached %zu new constants from method '%s' (total cache: %zu)",
+            uncached_fqns.size(),
+            method_name.c_str(),
+            shared_constant_tensors_.size());
+      } else {
+        ET_LOG(Error, "Failed to extract constants from '%s'", method_name.c_str());
+        return Error::Internal;
+      }
+    } else {
+      // All constants are cached — skip blob loading entirely!
+      ET_LOG(
+          Info,
+          "All %zu constants cached — skipping blob load for method '%s'",
+          fqn_to_name.size(),
+          method_name.c_str());
+    }
+
+    // Step 3: Point the container to cached tensors via user_managed pairs
+    if (num_cached > 0 || uncached_fqns.empty()) {
+      std::vector<AOTInductorConstantMapEntry> pairs;
+      {
+        std::lock_guard<std::mutex> guard(shared_constants_mutex_);
+        for (const auto& [fqn, internal_name] : fqn_to_name) {
+          auto it = shared_constant_tensors_.find(fqn);
+          if (it != shared_constant_tensors_.end()) {
+            pairs.push_back({internal_name.c_str(), it->second});
+          }
+        }
+      }
+
+      if (!pairs.empty()) {
+        auto update_err = handle->update_user_managed_constant_buffer_pairs(
+            handle->container_handle,
+            pairs.data(),
+            pairs.size(),
+            /*use_inactive=*/false,
+            /*validate_full_update=*/false);
+
+        if (update_err != Error::Ok) {
+          ET_LOG(
+              Error,
+              "Failed to set cached constants for method '%s'",
+              method_name.c_str());
+          return Error::Internal;
+        }
+        ET_LOG(
+            Info,
+            "Shared %zu cached constants into method '%s'",
+            pairs.size(),
+            method_name.c_str());
+      }
+    }
+
+    return Error::Ok;
+  }
+
+  // Legacy constant loading: load the entire blob without caching.
+  // Used as fallback when constant management APIs are unavailable.
+  Error load_constants_legacy(
+      cuda::CudaDelegateHandle* handle,
+      const NamedDataMap* named_data_map,
+      const std::string& method_name) const {
+    std::string weights_blob_key =
+        method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
+    auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
+    if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
+      ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
+      const void* weights_blob = buffer_res->data();
+      auto update_err = handle->update_constants_from_blob(
+          handle->container_handle,
+          static_cast<const uint8_t*>(weights_blob));
+      if (update_err != Error::Ok) {
+        ET_LOG(Error, "update_constants_from_blob failed");
+        return update_err;
+      }
+      cudaDeviceSynchronize();
+      buffer_res->Free();
+    } else {
+      ET_LOG(
+          Info,
+          "weights_blob '%s' not found or update fn is null",
+          weights_blob_key.c_str());
+    }
+    return Error::Ok;
+  }
   mutable std::mutex shared_constants_mutex_;
 
   // FQN → AtenTensorHandle from the source (first) container.
@@ -790,9 +905,6 @@ class ET_EXPERIMENTAL CudaBackend final
   // explicitly deleted — see destroy() comment).
   mutable std::unordered_map<std::string, AtenTensorHandle>
       shared_constant_tensors_;
-
-  // Whether we've already extracted constants from a source container.
-  mutable bool constants_extracted_ = false;
 };
 
 } // namespace executorch::backends::cuda
diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp
index 7f4e60596be..968113d7a49 100644
--- a/examples/models/qwen3_5_moe/main.cpp
+++ b/examples/models/qwen3_5_moe/main.cpp
@@ -249,5 +249,15 @@ int main(int argc, char** argv) {
       num_generated * 1000.0 / decode_ms);
   printf("Prompt tokens: %ld\n", num_prompt_tokens);
 
+  // Print peak GPU memory usage
+  {
+    size_t free_bytes = 0, total_bytes = 0;
+    if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess) {
+      double used_mb =
+          static_cast<double>(total_bytes - free_bytes) / (1024.0 * 1024.0);
+      printf("GPU peak memory usage: %.1f MiB\n", used_mb);
+    }
+  }
+
   return 0;
 }

From 0c0562082f764c3d6c67878d9d018ead2f51b5ff Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 14 Apr 2026 19:25:12 -0700
Subject: [PATCH 2/4] lint

---
 .../cuda/benchmarks/benchmark_model_decode.py | 180 ++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 backends/cuda/benchmarks/benchmark_model_decode.py

diff --git a/backends/cuda/benchmarks/benchmark_model_decode.py b/backends/cuda/benchmarks/benchmark_model_decode.py
new file mode 100644
index 00000000000..51330709000
--- /dev/null
+++ b/backends/cuda/benchmarks/benchmark_model_decode.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+"""Model-level decode benchmark for Qwen3.5 MoE split-K SDPA.
+
+Measures prefill tok/s and decode tok/s across different prompt sizes
+and decode lengths to evaluate FlashDecoding++ async softmax impact.
+"""
+
+import json
+import sys
+import time
+
+import torch
+
+# Register Triton kernels before model import
+import executorch.backends.cuda.triton.kernels  # noqa: F401
+
+from executorch.examples.models.qwen3_5_moe.export import load_prequantized_model
+
+
+PROMPT_SIZES = [1, 15, 59, 143, 1694]
+DECODE_LENGTHS = [16, 64, 256, 1024]
+MODEL_PATH = "/home/gasoonjia/models/qwen35_moe_int4_hqq"
+NUM_WARMUP = 2  # warmup runs before timing
+
+
+def _move_to_cuda(model, config):
+    for fqn, buf in list(model.named_buffers()):
+        parts = fqn.rsplit(".", 1)
+        parent = model.get_submodule(parts[0]) if len(parts) > 1 else model
+        if buf.device.type == "meta":
+            dtype = torch.bfloat16 if buf.dtype != torch.bool else torch.bool
+            parent.register_buffer(
+                parts[-1], torch.zeros(buf.shape, dtype=dtype, device="cuda")
+            )
+        else:
+            parent.register_buffer(parts[-1], buf.to("cuda"))
+
+    for name, p in model.named_parameters():
+        parts = name.rsplit(".", 1)
+        parent = model.get_submodule(parts[0]) if len(parts) > 1 else model
+        setattr(
+            parent,
+            parts[-1],
+            torch.nn.Parameter(p.data.to("cuda"), requires_grad=False),
+        )
+
+    for layer in model.layers:
+        if hasattr(layer.attn, "rotary_emb"):
+            rope = layer.attn.rotary_emb
+            inv_freq = 1.0 / (
+                config.rope_theta
+                ** (
+                    torch.arange(0, rope.rotary_dim, 2, dtype=torch.float32)
+                    / rope.rotary_dim
+                )
+            )
+            rope.inv_freq = inv_freq.to("cuda")
+        if hasattr(layer.attn, "mask"):
+            layer.attn.register_buffer(
+                "mask",
+                torch.tril(
+                    torch.ones(
+                        config.max_seq_len,
+                        config.max_seq_len,
+                        dtype=torch.bool,
+                        device="cuda",
+                    )
+                ),
+            )
+
+
+def _reset_state(model):
+    """Reset all KV caches, conv_state, and recurrent_state to zero."""
+    for layer in model.layers:
+        attn = layer.attn
+        if hasattr(attn, "kv_cache"):
+            attn.kv_cache.k_cache.zero_()
+            attn.kv_cache.v_cache.zero_()
+        if hasattr(attn, "conv_state"):
+            attn.conv_state.zero_()
+        if hasattr(attn, "recurrent_state"):
+            attn.recurrent_state.zero_()
+
+
+@torch.inference_mode()
+def benchmark_prefill(model, prompt_size):
+    """Prefill prompt_size tokens one at a time, return tok/s."""
+    _reset_state(model)
+    tokens = torch.randint(0, 1000, (1, 1), device="cuda", dtype=torch.long)
+
+    # Warmup
+    for i in range(min(prompt_size, NUM_WARMUP)):
+        pos = torch.tensor([i], device="cuda")
+        model(tokens, pos)
+
+    _reset_state(model)
+    torch.cuda.synchronize()
+    t0 = time.perf_counter()
+
+    for i in range(prompt_size):
+        pos = torch.tensor([i], device="cuda")
+        model(tokens, pos)
+
+    torch.cuda.synchronize()
+    elapsed = time.perf_counter() - t0
+    return prompt_size / elapsed if elapsed > 0 else 0.0
+
+
+@torch.inference_mode()
+def benchmark_decode(model, prompt_size, decode_length):
+    """Prefill prompt_size tokens, then decode decode_length tokens. Return decode tok/s."""
+    _reset_state(model)
+    tokens = torch.randint(0, 1000, (1, 1), device="cuda", dtype=torch.long)
+
+    # Prefill
+    for i in range(prompt_size):
+        pos = torch.tensor([i], device="cuda")
+        logits = model(tokens, pos)
+
+    # Get first decode token
+    next_token = logits[:, -1:, :].argmax(dim=-1)
+
+    # Warmup decode
+    # (we skip warmup here to avoid polluting KV cache beyond prompt_size + decode_length)
+
+    torch.cuda.synchronize()
+    t0 = time.perf_counter()
+
+    for i in range(decode_length):
+        pos = torch.tensor([prompt_size + i], device="cuda")
+        logits = model(next_token, pos)
+        next_token = logits[:, -1:, :].argmax(dim=-1)
+
+    torch.cuda.synchronize()
+    elapsed = time.perf_counter() - t0
+    return decode_length / elapsed if elapsed > 0 else 0.0
+
+
+def main():
+    max_seq = max(PROMPT_SIZES) + max(DECODE_LENGTHS) + 16
+    print(f"Loading model from {MODEL_PATH} (max_seq_len={max_seq})...")
+    model, config = load_prequantized_model(MODEL_PATH, max_seq_len=max_seq)
+    _move_to_cuda(model, config)
+    model.eval()
+
+    results = {"prefill": {}, "decode": {}}
+
+    # Prefill benchmark
+    print("\n=== Prefill Benchmark ===")
+    print(f"{'Prompt Size':>12} | {'tok/s':>10}")
+    print("-" * 27)
+    for ps in PROMPT_SIZES:
+        tps = benchmark_prefill(model, ps)
+        results["prefill"][ps] = round(tps, 2)
+        print(f"{ps:>12} | {tps:>10.2f}")
+
+    # Decode benchmark
+    print("\n=== Decode Benchmark ===")
+    header = f"{'Prompt Size':>12}"
+    for dl in DECODE_LENGTHS:
+        header += f" | {'dec=' + str(dl):>12}"
+    print(header)
+    print("-" * len(header))
+
+    for ps in PROMPT_SIZES:
+        row = f"{ps:>12}"
+        results["decode"][ps] = {}
+        for dl in DECODE_LENGTHS:
+            tps = benchmark_decode(model, ps, dl)
+            results["decode"][ps][dl] = round(tps, 2)
+            row += f" | {tps:>12.2f}"
+        print(row)
+
+    # Dump JSON for easy comparison
+    print("\n--- JSON ---")
+    print(json.dumps(results, indent=2))
+
+
+if __name__ == "__main__":
+    main()

From 865f1189f611948cd6e68ebe2a7c43cc10e07041 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 14 Apr 2026 19:25:56 -0700
Subject: [PATCH 3/4] remove unnessary benchmark

---
 .../cuda/benchmarks/benchmark_model_decode.py | 180 ------------------
 1 file changed, 180 deletions(-)
 delete mode 100644 backends/cuda/benchmarks/benchmark_model_decode.py

diff --git a/backends/cuda/benchmarks/benchmark_model_decode.py b/backends/cuda/benchmarks/benchmark_model_decode.py
deleted file mode 100644
index 51330709000..00000000000
--- a/backends/cuda/benchmarks/benchmark_model_decode.py
+++ /dev/null
@@ -1,180 +0,0 @@
-#!/usr/bin/env python3
-"""Model-level decode benchmark for Qwen3.5 MoE split-K SDPA.
-
-Measures prefill tok/s and decode tok/s across different prompt sizes
-and decode lengths to evaluate FlashDecoding++ async softmax impact.
-"""
-
-import json
-import sys
-import time
-
-import torch
-
-# Register Triton kernels before model import
-import executorch.backends.cuda.triton.kernels  # noqa: F401
-
-from executorch.examples.models.qwen3_5_moe.export import load_prequantized_model
-
-
-PROMPT_SIZES = [1, 15, 59, 143, 1694]
-DECODE_LENGTHS = [16, 64, 256, 1024]
-MODEL_PATH = "/home/gasoonjia/models/qwen35_moe_int4_hqq"
-NUM_WARMUP = 2  # warmup runs before timing
-
-
-def _move_to_cuda(model, config):
-    for fqn, buf in list(model.named_buffers()):
-        parts = fqn.rsplit(".", 1)
-        parent = model.get_submodule(parts[0]) if len(parts) > 1 else model
-        if buf.device.type == "meta":
-            dtype = torch.bfloat16 if buf.dtype != torch.bool else torch.bool
-            parent.register_buffer(
-                parts[-1], torch.zeros(buf.shape, dtype=dtype, device="cuda")
-            )
-        else:
-            parent.register_buffer(parts[-1], buf.to("cuda"))
-
-    for name, p in model.named_parameters():
-        parts = name.rsplit(".", 1)
-        parent = model.get_submodule(parts[0]) if len(parts) > 1 else model
-        setattr(
-            parent,
-            parts[-1],
-            torch.nn.Parameter(p.data.to("cuda"), requires_grad=False),
-        )
-
-    for layer in model.layers:
-        if hasattr(layer.attn, "rotary_emb"):
-            rope = layer.attn.rotary_emb
-            inv_freq = 1.0 / (
-                config.rope_theta
-                ** (
-                    torch.arange(0, rope.rotary_dim, 2, dtype=torch.float32)
-                    / rope.rotary_dim
-                )
-            )
-            rope.inv_freq = inv_freq.to("cuda")
-        if hasattr(layer.attn, "mask"):
-            layer.attn.register_buffer(
-                "mask",
-                torch.tril(
-                    torch.ones(
-                        config.max_seq_len,
-                        config.max_seq_len,
-                        dtype=torch.bool,
-                        device="cuda",
-                    )
-                ),
-            )
-
-
-def _reset_state(model):
-    """Reset all KV caches, conv_state, and recurrent_state to zero."""
-    for layer in model.layers:
-        attn = layer.attn
-        if hasattr(attn, "kv_cache"):
-            attn.kv_cache.k_cache.zero_()
-            attn.kv_cache.v_cache.zero_()
-        if hasattr(attn, "conv_state"):
-            attn.conv_state.zero_()
-        if hasattr(attn, "recurrent_state"):
-            attn.recurrent_state.zero_()
-
-
-@torch.inference_mode()
-def benchmark_prefill(model, prompt_size):
-    """Prefill prompt_size tokens one at a time, return tok/s."""
-    _reset_state(model)
-    tokens = torch.randint(0, 1000, (1, 1), device="cuda", dtype=torch.long)
-
-    # Warmup
-    for i in range(min(prompt_size, NUM_WARMUP)):
-        pos = torch.tensor([i], device="cuda")
-        model(tokens, pos)
-
-    _reset_state(model)
-    torch.cuda.synchronize()
-    t0 = time.perf_counter()
-
-    for i in range(prompt_size):
-        pos = torch.tensor([i], device="cuda")
-        model(tokens, pos)
-
-    torch.cuda.synchronize()
-    elapsed = time.perf_counter() - t0
-    return prompt_size / elapsed if elapsed > 0 else 0.0
-
-
-@torch.inference_mode()
-def benchmark_decode(model, prompt_size, decode_length):
-    """Prefill prompt_size tokens, then decode decode_length tokens. Return decode tok/s."""
-    _reset_state(model)
-    tokens = torch.randint(0, 1000, (1, 1), device="cuda", dtype=torch.long)
-
-    # Prefill
-    for i in range(prompt_size):
-        pos = torch.tensor([i], device="cuda")
-        logits = model(tokens, pos)
-
-    # Get first decode token
-    next_token = logits[:, -1:, :].argmax(dim=-1)
-
-    # Warmup decode
-    # (we skip warmup here to avoid polluting KV cache beyond prompt_size + decode_length)
-
-    torch.cuda.synchronize()
-    t0 = time.perf_counter()
-
-    for i in range(decode_length):
-        pos = torch.tensor([prompt_size + i], device="cuda")
-        logits = model(next_token, pos)
-        next_token = logits[:, -1:, :].argmax(dim=-1)
-
-    torch.cuda.synchronize()
-    elapsed = time.perf_counter() - t0
-    return decode_length / elapsed if elapsed > 0 else 0.0
-
-
-def main():
-    max_seq = max(PROMPT_SIZES) + max(DECODE_LENGTHS) + 16
-    print(f"Loading model from {MODEL_PATH} (max_seq_len={max_seq})...")
-    model, config = load_prequantized_model(MODEL_PATH, max_seq_len=max_seq)
-    _move_to_cuda(model, config)
-    model.eval()
-
-    results = {"prefill": {}, "decode": {}}
-
-    # Prefill benchmark
-    print("\n=== Prefill Benchmark ===")
-    print(f"{'Prompt Size':>12} | {'tok/s':>10}")
-    print("-" * 27)
-    for ps in PROMPT_SIZES:
-        tps = benchmark_prefill(model, ps)
-        results["prefill"][ps] = round(tps, 2)
-        print(f"{ps:>12} | {tps:>10.2f}")
-
-    # Decode benchmark
-    print("\n=== Decode Benchmark ===")
-    header = f"{'Prompt Size':>12}"
-    for dl in DECODE_LENGTHS:
-        header += f" | {'dec=' + str(dl):>12}"
-    print(header)
-    print("-" * len(header))
-
-    for ps in PROMPT_SIZES:
-        row = f"{ps:>12}"
-        results["decode"][ps] = {}
-        for dl in DECODE_LENGTHS:
-            tps = benchmark_decode(model, ps, dl)
-            results["decode"][ps][dl] = round(tps, 2)
-            row += f" | {tps:>12.2f}"
-        print(row)
-
-    # Dump JSON for easy comparison
-    print("\n--- JSON ---")
-    print(json.dumps(results, indent=2))
-
-
-if __name__ == "__main__":
-    main()

From e249c23f7508ae27397a2e7ede27b1efc913f2ac Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 15 Apr 2026 15:45:19 -0700
Subject: [PATCH 4/4] lint

---
 backends/cuda/runtime/cuda_backend.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 224042dcc07..0e9c5a32473 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -669,8 +669,7 @@ class ET_EXPERIMENTAL CudaBackend final
       const std::string& method_name) const {
     // Check if the required APIs are available
     if (!handle->get_num_constants || !handle->get_constant_name ||
-        !handle->get_constant_original_fqn ||
-        !handle->extract_constants_map ||
+        !handle->get_constant_original_fqn || !handle->extract_constants_map ||
         !handle->update_user_managed_constant_buffer_pairs) {
       // Fall back to the legacy path
       return load_constants_legacy(handle, named_data_map, method_name);
@@ -757,7 +756,8 @@ class ET_EXPERIMENTAL CudaBackend final
         std::lock_guard<std::mutex> guard(shared_constants_mutex_);
         for (const auto& fqn : uncached_fqns) {
           auto it_name = fqn_to_name.find(fqn);
-          if (it_name == fqn_to_name.end()) continue;
+          if (it_name == fqn_to_name.end())
+            continue;
           // extract_constants_map returns entries keyed by FQN
           auto it = extracted_map.find(fqn);
           if (it != extracted_map.end()) {
@@ -771,7 +771,10 @@ class ET_EXPERIMENTAL CudaBackend final
             method_name.c_str(),
             shared_constant_tensors_.size());
       } else {
-        ET_LOG(Error, "Failed to extract constants from '%s'", method_name.c_str());
+        ET_LOG(
+            Error,
+            "Failed to extract constants from '%s'",
+            method_name.c_str());
         return Error::Internal;
       }
     } else {
@@ -835,8 +838,7 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
       const void* weights_blob = buffer_res->data();
       auto update_err = handle->update_constants_from_blob(
-          handle->container_handle,
-          static_cast<const uint8_t*>(weights_blob));
+          handle->container_handle, static_cast<const uint8_t*>(weights_blob));
       if (update_err != Error::Ok) {
         ET_LOG(Error, "update_constants_from_blob failed");
         return update_err;