From a9a3cef36d57183890b86a01068358e1fc4a2396 Mon Sep 17 00:00:00 2001 From: jiacao-amd Date: Mon, 4 May 2026 15:25:02 -0500 Subject: [PATCH 1/8] Tune MiniMax MI355X vLLM scheduling thresholds --- .../single_node/minimaxm2.5_fp8_mi355x.sh | 47 ++++++++++++++++++- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index 53cffceee..c4ab51188 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -26,7 +26,49 @@ fi export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 -export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 +export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0 +VLLM_BLOCK_SIZE=32 +ASYNC_SCHEDULING_ARGS="" + +if [[ "$ISL" == "1024" && "$OSL" == "1024" && "$TP" == "8" && "$EP_SIZE" == "8" ]] && (( CONC == 2 )); then + ASYNC_SCHEDULING_ARGS="--no-async-scheduling" + echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 1k1k TP8/EP8 c2." +elif [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 + VLLM_BLOCK_SIZE=16 + + if (( CONC <= 128 )); then + ASYNC_SCHEDULING_ARGS="--no-async-scheduling" + echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 1k1k c${CONC}." + else + echo "Using shuffle KV cache layout with block size 16 and async scheduling for 1k1k c${CONC}." + fi +elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then + export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0 + VLLM_BLOCK_SIZE=32 + echo "Disabling shuffle KV cache layout and using block size 32 for TP8/EP8." +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if (( CONC <= 64 )); then + ASYNC_SCHEDULING_ARGS="--no-async-scheduling" + fi + + if (( CONC >= 64 )); then + export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 + VLLM_BLOCK_SIZE=16 + + if [[ -n "$ASYNC_SCHEDULING_ARGS" ]]; then + echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 8k1k c${CONC}." + else + echo "Using shuffle KV cache layout with block size 16 and async scheduling for 8k1k c${CONC}." + fi + elif [[ -n "$ASYNC_SCHEDULING_ARGS" ]]; then + echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 8k1k c${CONC}." + else + echo "Using baseline block size 32, shuffle disabled, and async scheduling for 8k1k c${CONC}." + fi +else + echo "Using baseline block size 32, shuffle disabled, and async scheduling for ISL=${ISL}, OSL=${OSL}, c${CONC}." +fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} @@ -52,9 +94,10 @@ $EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --kv-cache-dtype fp8 \ ---block-size=32 \ +--block-size=$VLLM_BLOCK_SIZE \ --no-enable-prefix-caching \ --attention-backend "ROCM_AITER_FA" \ +$ASYNC_SCHEDULING_ARGS \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! From 4b89f404c7070a9b4175245865c7824dfb8356b8 Mon Sep 17 00:00:00 2001 From: jiacao-amd Date: Mon, 4 May 2026 18:06:36 -0500 Subject: [PATCH 2/8] Clarify MiniMax 8k1k scheduling branches --- .../single_node/minimaxm2.5_fp8_mi355x.sh | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index c4ab51188..a5ecafab3 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -48,23 +48,18 @@ elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then VLLM_BLOCK_SIZE=32 echo "Disabling shuffle KV cache layout and using block size 32 for TP8/EP8." elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if (( CONC <= 64 )); then + if (( CONC < 64 )); then + ASYNC_SCHEDULING_ARGS="--no-async-scheduling" + echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 8k1k c${CONC}." + elif (( CONC == 64 )); then ASYNC_SCHEDULING_ARGS="--no-async-scheduling" - fi - - if (( CONC >= 64 )); then export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 VLLM_BLOCK_SIZE=16 - - if [[ -n "$ASYNC_SCHEDULING_ARGS" ]]; then - echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 8k1k c${CONC}." - else - echo "Using shuffle KV cache layout with block size 16 and async scheduling for 8k1k c${CONC}." - fi - elif [[ -n "$ASYNC_SCHEDULING_ARGS" ]]; then - echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 8k1k c${CONC}." + echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 8k1k c${CONC}." else - echo "Using baseline block size 32, shuffle disabled, and async scheduling for 8k1k c${CONC}." + export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 + VLLM_BLOCK_SIZE=16 + echo "Using shuffle KV cache layout with block size 16 and async scheduling for 8k1k c${CONC}." fi else echo "Using baseline block size 32, shuffle disabled, and async scheduling for ISL=${ISL}, OSL=${OSL}, c${CONC}." From 91595f72d803b286168226275cd20109d8e8d77c Mon Sep 17 00:00:00 2001 From: jiacao-amd Date: Tue, 5 May 2026 11:27:10 -0500 Subject: [PATCH 3/8] Refactor MiniMax MI355X scheduling policy --- .../single_node/minimaxm2.5_fp8_mi355x.sh | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index a5ecafab3..8950f73fb 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -30,39 +30,40 @@ export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0 VLLM_BLOCK_SIZE=32 ASYNC_SCHEDULING_ARGS="" -if [[ "$ISL" == "1024" && "$OSL" == "1024" && "$TP" == "8" && "$EP_SIZE" == "8" ]] && (( CONC == 2 )); then - ASYNC_SCHEDULING_ARGS="--no-async-scheduling" - echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 1k1k TP8/EP8 c2." -elif [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 - VLLM_BLOCK_SIZE=16 - - if (( CONC <= 128 )); then +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + if (( CONC == 2 )) && [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then ASYNC_SCHEDULING_ARGS="--no-async-scheduling" - echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 1k1k c${CONC}." + echo "1k1k TP8/EP8 c2: using block size 32, shuffle disabled, async scheduling disabled." + elif (( CONC <= 128 )); then + export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 + VLLM_BLOCK_SIZE=16 + ASYNC_SCHEDULING_ARGS="--no-async-scheduling" + echo "1k1k c${CONC}: using block size 16, shuffle enabled, async scheduling disabled." else - echo "Using shuffle KV cache layout with block size 16 and async scheduling for 1k1k c${CONC}." + export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 + VLLM_BLOCK_SIZE=16 + echo "1k1k c${CONC}: using block size 16, shuffle enabled, async scheduling enabled." fi -elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then - export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0 - VLLM_BLOCK_SIZE=32 - echo "Disabling shuffle KV cache layout and using block size 32 for TP8/EP8." elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if (( CONC < 64 )); then + if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then + echo "8k1k TP8/EP8: using block size 32, shuffle disabled, async scheduling enabled." + elif (( CONC < 64 )); then ASYNC_SCHEDULING_ARGS="--no-async-scheduling" - echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 8k1k c${CONC}." + echo "8k1k c${CONC}: using block size 32, shuffle disabled, async scheduling disabled." elif (( CONC == 64 )); then ASYNC_SCHEDULING_ARGS="--no-async-scheduling" export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 VLLM_BLOCK_SIZE=16 - echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 8k1k c${CONC}." + echo "8k1k c64: using block size 16, shuffle enabled, async scheduling disabled." else export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 VLLM_BLOCK_SIZE=16 - echo "Using shuffle KV cache layout with block size 16 and async scheduling for 8k1k c${CONC}." + echo "8k1k c${CONC}: using block size 16, shuffle enabled, async scheduling enabled." fi +elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then + echo "TP8/EP8 fallback: using block size 32, shuffle disabled, async scheduling enabled." else - echo "Using baseline block size 32, shuffle disabled, and async scheduling for ISL=${ISL}, OSL=${OSL}, c${CONC}." + echo "Default policy for ISL=${ISL}, OSL=${OSL}, TP=${TP}, EP=${EP_SIZE}, CONC=${CONC}: using block size 32, shuffle disabled, async scheduling enabled." fi SERVER_LOG=/workspace/server.log From e2524a21ed8300607e2714d5adccdef39962c24f Mon Sep 17 00:00:00 2001 From: jiacao-amd Date: Wed, 6 May 2026 11:39:57 -0500 Subject: [PATCH 4/8] Disable AITER MoE for MiniMax 8k1k TP8 --- benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 3 ++- perf-changelog.yaml | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index 8950f73fb..a64594e8f 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -46,7 +46,8 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then - echo "8k1k TP8/EP8: using block size 32, shuffle disabled, async scheduling enabled." + export VLLM_ROCM_USE_AITER_MOE=0 + echo "8k1k TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling enabled." elif (( CONC < 64 )); then ASYNC_SCHEDULING_ARGS="--no-async-scheduling" echo "8k1k c${CONC}: using block size 32, shuffle disabled, async scheduling disabled." diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b04ae1947..4fbf74455 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2214,3 +2214,10 @@ - "Bump --speculative-config num_speculative_tokens from 1 to 2 (`{\"method\":\"mtp\",\"num_speculative_tokens\":2}`)" - "Re-test whether H200 MTP kernels accept 2 draft tokens — Blackwell MTP runs at 2 (per @wzhao18's vLLM Blackwell MTP submission); checking if H200 has parity now" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1279 + +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm + description: + - "Disable AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path" + - "Local validation on vllm/vllm-openai-rocm:v0.19.0 completed the 8k1k TP8/EP8 CONC=2 serving benchmark after reproducing the GPU fault without this env override" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1276 From 804eba4940b3be67a135315b889c2fd2be6ebdbf Mon Sep 17 00:00:00 2001 From: jiacao-amd Date: Wed, 6 May 2026 11:54:08 -0500 Subject: [PATCH 5/8] Disable async for MiniMax 8k1k TP8 --- benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 3 ++- perf-changelog.yaml | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index a64594e8f..0c6734755 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -47,7 +47,8 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then export VLLM_ROCM_USE_AITER_MOE=0 - echo "8k1k TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling enabled." + ASYNC_SCHEDULING_ARGS="--no-async-scheduling" + echo "8k1k TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling disabled." elif (( CONC < 64 )); then ASYNC_SCHEDULING_ARGS="--no-async-scheduling" echo "8k1k c${CONC}: using block size 32, shuffle disabled, async scheduling disabled." diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4fbf74455..e3d27839b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2218,6 +2218,6 @@ - config-keys: - minimaxm2.5-fp8-mi355x-vllm description: - - "Disable AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path" - - "Local validation on vllm/vllm-openai-rocm:v0.19.0 completed the 8k1k TP8/EP8 CONC=2 serving benchmark after reproducing the GPU fault without this env override" + - "Disable AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 and disable async scheduling for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path" + - "Local validation on vllm/vllm-openai-rocm:v0.19.0 completed the 8k1k TP8/EP8 CONC=2 serving benchmark; block_size=32 with shuffle disabled outperformed the tested shuffle-KV/block_size=16 variant" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1276 From fbf4738c2dc0389ef3ed60c297c8b2035798cbe4 Mon Sep 17 00:00:00 2001 From: jiacao-amd Date: Wed, 6 May 2026 11:57:38 -0500 Subject: [PATCH 6/8] Make MiniMax 1k1k TP8 no-async explicit --- benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 4 ++-- perf-changelog.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index 0c6734755..b4b4840de 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -31,9 +31,9 @@ VLLM_BLOCK_SIZE=32 ASYNC_SCHEDULING_ARGS="" if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if (( CONC == 2 )) && [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then + if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then ASYNC_SCHEDULING_ARGS="--no-async-scheduling" - echo "1k1k TP8/EP8 c2: using block size 32, shuffle disabled, async scheduling disabled." + echo "1k1k TP8/EP8: using block size 32, shuffle disabled, async scheduling disabled." elif (( CONC <= 128 )); then export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 VLLM_BLOCK_SIZE=16 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e3d27839b..299c612cd 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2218,6 +2218,6 @@ - config-keys: - minimaxm2.5-fp8-mi355x-vllm description: - - "Disable AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 and disable async scheduling for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path" + - "Disable async scheduling for the 1k1k TP8/EP8 path; disable both AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 and async scheduling for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path" - "Local validation on vllm/vllm-openai-rocm:v0.19.0 completed the 8k1k TP8/EP8 CONC=2 serving benchmark; block_size=32 with shuffle disabled outperformed the tested shuffle-KV/block_size=16 variant" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1276 From 8d8b1e0116dfa6343b1fb517214c3cf066b29178 Mon Sep 17 00:00:00 2001 From: jiacao-amd Date: Wed, 6 May 2026 11:58:26 -0500 Subject: [PATCH 7/8] Remove MiniMax perf changelog entry --- perf-changelog.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 299c612cd..b04ae1947 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2214,10 +2214,3 @@ - "Bump --speculative-config num_speculative_tokens from 1 to 2 (`{\"method\":\"mtp\",\"num_speculative_tokens\":2}`)" - "Re-test whether H200 MTP kernels accept 2 draft tokens — Blackwell MTP runs at 2 (per @wzhao18's vLLM Blackwell MTP submission); checking if H200 has parity now" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1279 - -- config-keys: - - minimaxm2.5-fp8-mi355x-vllm - description: - - "Disable async scheduling for the 1k1k TP8/EP8 path; disable both AITER MoE via VLLM_ROCM_USE_AITER_MOE=0 and async scheduling for the 8k1k TP8/EP8 MiniMax-M2.5 FP8 MI355X vLLM path" - - "Local validation on vllm/vllm-openai-rocm:v0.19.0 completed the 8k1k TP8/EP8 CONC=2 serving benchmark; block_size=32 with shuffle disabled outperformed the tested shuffle-KV/block_size=16 variant" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1276 From 8bbdc8131bf90c49802d259e338269f9030344d3 Mon Sep 17 00:00:00 2001 From: jiacao-amd Date: Wed, 6 May 2026 12:01:02 -0500 Subject: [PATCH 8/8] Remove MiniMax default policy log --- benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index b4b4840de..59e0b10f5 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -62,10 +62,6 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then VLLM_BLOCK_SIZE=16 echo "8k1k c${CONC}: using block size 16, shuffle enabled, async scheduling enabled." fi -elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then - echo "TP8/EP8 fallback: using block size 32, shuffle disabled, async scheduling enabled." -else - echo "Default policy for ISL=${ISL}, OSL=${OSL}, TP=${TP}, EP=${EP_SIZE}, CONC=${CONC}: using block size 32, shuffle disabled, async scheduling enabled." fi SERVER_LOG=/workspace/server.log