From a9a3cef36d57183890b86a01068358e1fc4a2396 Mon Sep 17 00:00:00 2001
From: jiacao-amd <jiahui.cao@amd.com>
Date: Mon, 4 May 2026 15:25:02 -0500
Subject: [PATCH 1/8] Tune MiniMax MI355X vLLM scheduling thresholds

---
 .../single_node/minimaxm2.5_fp8_mi355x.sh     | 47 ++++++++++++++++++-
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
index 53cffceee..c4ab51188 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
@@ -26,7 +26,49 @@ fi
 
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
+VLLM_BLOCK_SIZE=32
+ASYNC_SCHEDULING_ARGS=""
+
+if [[ "$ISL" == "1024" && "$OSL" == "1024" && "$TP" == "8" && "$EP_SIZE" == "8" ]] && (( CONC == 2 )); then
+    ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 1k1k TP8/EP8 c2."
+elif [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+    VLLM_BLOCK_SIZE=16
+
+    if (( CONC <= 128 )); then
+        ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+        echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 1k1k c${CONC}."
+    else
+        echo "Using shuffle KV cache layout with block size 16 and async scheduling for 1k1k c${CONC}."
+    fi
+elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
+    VLLM_BLOCK_SIZE=32
+    echo "Disabling shuffle KV cache layout and using block size 32 for TP8/EP8."
+elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+    if (( CONC <= 64 )); then
+        ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    fi
+
+    if (( CONC >= 64 )); then
+        export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+        VLLM_BLOCK_SIZE=16
+
+        if [[ -n "$ASYNC_SCHEDULING_ARGS" ]]; then
+            echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 8k1k c${CONC}."
+        else
+            echo "Using shuffle KV cache layout with block size 16 and async scheduling for 8k1k c${CONC}."
+        fi
+    elif [[ -n "$ASYNC_SCHEDULING_ARGS" ]]; then
+        echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 8k1k c${CONC}."
+    else
+        echo "Using baseline block size 32, shuffle disabled, and async scheduling for 8k1k c${CONC}."
+    fi
+else
+    echo "Using baseline block size 32, shuffle disabled, and async scheduling for ISL=${ISL}, OSL=${OSL}, c${CONC}."
+fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -52,9 +94,10 @@ $EP \
 --gpu-memory-utilization 0.95 \
 --max-model-len $MAX_MODEL_LEN \
 --kv-cache-dtype fp8 \
---block-size=32 \
+--block-size=$VLLM_BLOCK_SIZE \
 --no-enable-prefix-caching \
 --attention-backend "ROCM_AITER_FA" \
+$ASYNC_SCHEDULING_ARGS \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

From 4b89f404c7070a9b4175245865c7824dfb8356b8 Mon Sep 17 00:00:00 2001
From: jiacao-amd <jiahui.cao@amd.com>
Date: Mon, 4 May 2026 18:06:36 -0500
Subject: [PATCH 2/8] Clarify MiniMax 8k1k scheduling branches

---
 .../single_node/minimaxm2.5_fp8_mi355x.sh     | 21 +++++++------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
index c4ab51188..a5ecafab3 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
@@ -48,23 +48,18 @@ elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
     VLLM_BLOCK_SIZE=32
     echo "Disabling shuffle KV cache layout and using block size 32 for TP8/EP8."
 elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-    if (( CONC <= 64 )); then
+    if (( CONC < 64 )); then
+        ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+        echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 8k1k c${CONC}."
+    elif (( CONC == 64 )); then
         ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
-    fi
-
-    if (( CONC >= 64 )); then
         export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
         VLLM_BLOCK_SIZE=16
-
-        if [[ -n "$ASYNC_SCHEDULING_ARGS" ]]; then
-            echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 8k1k c${CONC}."
-        else
-            echo "Using shuffle KV cache layout with block size 16 and async scheduling for 8k1k c${CONC}."
-        fi
-    elif [[ -n "$ASYNC_SCHEDULING_ARGS" ]]; then
-        echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 8k1k c${CONC}."
+        echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 8k1k c${CONC}."
     else
-        echo "Using baseline block size 32, shuffle disabled, and async scheduling for 8k1k c${CONC}."
+        export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+        VLLM_BLOCK_SIZE=16
+        echo "Using shuffle KV cache layout with block size 16 and async scheduling for 8k1k c${CONC}."
     fi
 else
     echo "Using baseline block size 32, shuffle disabled, and async scheduling for ISL=${ISL}, OSL=${OSL}, c${CONC}."

From 91595f72d803b286168226275cd20109d8e8d77c Mon Sep 17 00:00:00 2001
From: jiacao-amd <jiahui.cao@amd.com>
Date: Tue, 5 May 2026 11:27:10 -0500
Subject: [PATCH 3/8] Refactor MiniMax MI355X scheduling policy

---
 .../single_node/minimaxm2.5_fp8_mi355x.sh     | 39 ++++++++++---------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
index a5ecafab3..8950f73fb 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
@@ -30,39 +30,40 @@ export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
 VLLM_BLOCK_SIZE=32
 ASYNC_SCHEDULING_ARGS=""
 
-if [[ "$ISL" == "1024" && "$OSL" == "1024" && "$TP" == "8" && "$EP_SIZE" == "8" ]] && (( CONC == 2 )); then
-    ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
-    echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 1k1k TP8/EP8 c2."
-elif [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
-    VLLM_BLOCK_SIZE=16
-
-    if (( CONC <= 128 )); then
+if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+    if (( CONC == 2 )) && [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
         ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
-        echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 1k1k c${CONC}."
+        echo "1k1k TP8/EP8 c2: using block size 32, shuffle disabled, async scheduling disabled."
+    elif (( CONC <= 128 )); then
+        export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+        VLLM_BLOCK_SIZE=16
+        ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+        echo "1k1k c${CONC}: using block size 16, shuffle enabled, async scheduling disabled."
     else
-        echo "Using shuffle KV cache layout with block size 16 and async scheduling for 1k1k c${CONC}."
+        export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+        VLLM_BLOCK_SIZE=16
+        echo "1k1k c${CONC}: using block size 16, shuffle enabled, async scheduling enabled."
     fi
-elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
-    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
-    VLLM_BLOCK_SIZE=32
-    echo "Disabling shuffle KV cache layout and using block size 32 for TP8/EP8."
 elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-    if (( CONC < 64 )); then
+    if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
+        echo "8k1k TP8/EP8: using block size 32, shuffle disabled, async scheduling enabled."
+    elif (( CONC < 64 )); then
         ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
-        echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 8k1k c${CONC}."
+        echo "8k1k c${CONC}: using block size 32, shuffle disabled, async scheduling disabled."
     elif (( CONC == 64 )); then
         ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
         export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
         VLLM_BLOCK_SIZE=16
-        echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 8k1k c${CONC}."
+        echo "8k1k c64: using block size 16, shuffle enabled, async scheduling disabled."
     else
         export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
         VLLM_BLOCK_SIZE=16
-        echo "Using shuffle KV cache layout with block size 16 and async scheduling for 8k1k c${CONC}."
+        echo "8k1k c${CONC}: using block size 16, shuffle enabled, async scheduling enabled."
     fi
+elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
+    echo "TP8/EP8 fallback: using block size 32, shuffle disabled, async scheduling enabled."
 else
-    echo "Using baseline block size 32, shuffle disabled, and async scheduling for ISL=${ISL}, OSL=${OSL}, c${CONC}."
+    echo "Default policy for ISL=${ISL}, OSL=${OSL}, TP=${TP}, EP=${EP_SIZE}, CONC=${CONC}: using block size 32, shuffle disabled, async scheduling enabled."
 fi
 
 SERVER_LOG=/workspace/server.log

From e2524a21ed8300607e2714d5adccdef39962c24f Mon Sep 17 00:00:00 2001
From: jiacao-amd <jiahui.cao@amd.com>
Date: Wed, 6 May 2026 11:39:57 -0500
Subject: [PATCH 4/8] Disable AITER MoE for MiniMax 8k1k TP8/EP8

---
 benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 3 ++-
 perf-changelog.yaml                              | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
index 8950f73fb..a64594e8f 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
@@ -46,7 +46,8 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
     fi
 elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
     if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
-        echo "8k1k TP8/EP8: using block size 32, shuffle disabled, async scheduling enabled."
+        export VLLM_ROCM_USE_AITER_MOE=0
+        echo "8k1k TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling enabled."
     elif (( CONC < 64 )); then
         ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
         echo "8k1k c${CONC}: using block size 32, shuffle disabled, async scheduling disabled."
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index b04ae1947..4fbf74455 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2214,3 +2214,10 @@
     - "Bump --speculative-config num_speculative_tokens from 1 to 2 (`{\"method\":\"mtp\",\"num_speculative_tokens\":2}`)"
     - "Re-test whether H200 MTP kernels accept 2 draft tokens — Blackwell MTP runs at 2 (per @wzhao18's vLLM Blackwell MTP submission); checking if H200 has parity now"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1279
+
+- config-keys:
+    - minimaxm2.5-fp8-mi355x-vllm
+  description:
+    - "Disable AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path"
+    - "Local validation on vllm/vllm-openai-rocm:v0.19.0 completed the 8k1k TP8/EP8 CONC=2 serving benchmark after reproducing the GPU fault without this env override"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1276

From 804eba4940b3be67a135315b889c2fd2be6ebdbf Mon Sep 17 00:00:00 2001
From: jiacao-amd <jiahui.cao@amd.com>
Date: Wed, 6 May 2026 11:54:08 -0500
Subject: [PATCH 5/8] Disable async scheduling for MiniMax 8k1k TP8/EP8

---
 benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 3 ++-
 perf-changelog.yaml                              | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
index a64594e8f..0c6734755 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
@@ -47,7 +47,8 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
 elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
     if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
         export VLLM_ROCM_USE_AITER_MOE=0
-        echo "8k1k TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling enabled."
+        ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+        echo "8k1k TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling disabled."
     elif (( CONC < 64 )); then
         ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
         echo "8k1k c${CONC}: using block size 32, shuffle disabled, async scheduling disabled."
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 4fbf74455..e3d27839b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2218,6 +2218,6 @@
 - config-keys:
     - minimaxm2.5-fp8-mi355x-vllm
   description:
-    - "Disable AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path"
-    - "Local validation on vllm/vllm-openai-rocm:v0.19.0 completed the 8k1k TP8/EP8 CONC=2 serving benchmark after reproducing the GPU fault without this env override"
+    - "Disable AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 and disable async scheduling for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path"
+    - "Local validation on vllm/vllm-openai-rocm:v0.19.0 completed the 8k1k TP8/EP8 CONC=2 serving benchmark; block_size=32 with shuffle disabled outperformed the tested shuffle-KV/block_size=16 variant"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1276

From fbf4738c2dc0389ef3ed60c297c8b2035798cbe4 Mon Sep 17 00:00:00 2001
From: jiacao-amd <jiahui.cao@amd.com>
Date: Wed, 6 May 2026 11:57:38 -0500
Subject: [PATCH 6/8] Apply MiniMax 1k1k TP8/EP8 no-async policy to all concurrencies

---
 benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 4 ++--
 perf-changelog.yaml                              | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
index 0c6734755..b4b4840de 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
@@ -31,9 +31,9 @@ VLLM_BLOCK_SIZE=32
 ASYNC_SCHEDULING_ARGS=""
 
 if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-    if (( CONC == 2 )) && [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
+    if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
         ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
-        echo "1k1k TP8/EP8 c2: using block size 32, shuffle disabled, async scheduling disabled."
+        echo "1k1k TP8/EP8: using block size 32, shuffle disabled, async scheduling disabled."
     elif (( CONC <= 128 )); then
         export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
         VLLM_BLOCK_SIZE=16
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index e3d27839b..299c612cd 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2218,6 +2218,6 @@
 - config-keys:
     - minimaxm2.5-fp8-mi355x-vllm
   description:
-    - "Disable AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 and disable async scheduling for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path"
+    - "Disable async scheduling for the 1k1k TP8/EP8 path; disable both AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 and async scheduling for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path"
     - "Local validation on vllm/vllm-openai-rocm:v0.19.0 completed the 8k1k TP8/EP8 CONC=2 serving benchmark; block_size=32 with shuffle disabled outperformed the tested shuffle-KV/block_size=16 variant"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1276

From 8d8b1e0116dfa6343b1fb517214c3cf066b29178 Mon Sep 17 00:00:00 2001
From: jiacao-amd <jiahui.cao@amd.com>
Date: Wed, 6 May 2026 11:58:26 -0500
Subject: [PATCH 7/8] Remove MiniMax perf changelog entry

---
 perf-changelog.yaml | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 299c612cd..b04ae1947 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2214,10 +2214,3 @@
     - "Bump --speculative-config num_speculative_tokens from 1 to 2 (`{\"method\":\"mtp\",\"num_speculative_tokens\":2}`)"
     - "Re-test whether H200 MTP kernels accept 2 draft tokens — Blackwell MTP runs at 2 (per @wzhao18's vLLM Blackwell MTP submission); checking if H200 has parity now"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1279
-
-- config-keys:
-    - minimaxm2.5-fp8-mi355x-vllm
-  description:
-    - "Disable async scheduling for the 1k1k TP8/EP8 path; disable both AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 and async scheduling for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path"
-    - "Local validation on vllm/vllm-openai-rocm:v0.19.0 completed the 8k1k TP8/EP8 CONC=2 serving benchmark; block_size=32 with shuffle disabled outperformed the tested shuffle-KV/block_size=16 variant"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1276

From 8bbdc8131bf90c49802d259e338269f9030344d3 Mon Sep 17 00:00:00 2001
From: jiacao-amd <jiahui.cao@amd.com>
Date: Wed, 6 May 2026 12:01:02 -0500
Subject: [PATCH 8/8] Remove MiniMax fallback and default policy logs

---
 benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
index b4b4840de..59e0b10f5 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
@@ -62,10 +62,6 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
         VLLM_BLOCK_SIZE=16
         echo "8k1k c${CONC}: using block size 16, shuffle enabled, async scheduling enabled."
     fi
-elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
-    echo "TP8/EP8 fallback: using block size 32, shuffle disabled, async scheduling enabled."
-else
-    echo "Default policy for ISL=${ISL}, OSL=${OSL}, TP=${TP}, EP=${EP_SIZE}, CONC=${CONC}: using block size 32, shuffle disabled, async scheduling enabled."
 fi
 
 SERVER_LOG=/workspace/server.log