diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
index 53cffceee..f079974de 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
@@ -26,7 +26,46 @@ fi
 
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
+VLLM_BLOCK_SIZE=32
+ASYNC_SCHEDULING_ARGS=""
+
+if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
+    VLLM_BLOCK_SIZE=32
+    echo "Disabling shuffle KV cache layout and using block size 32 for TP8/EP8."
+elif [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+    VLLM_BLOCK_SIZE=16
+
+    if (( CONC <= 128 )); then
+        ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+        echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 1k1k c${CONC}."
+    else
+        echo "Using shuffle KV cache layout with block size 16 and async scheduling for 1k1k c${CONC}."
+    fi
+elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+    if (( CONC <= 64 )); then
+        ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
+    fi
+
+    if (( CONC >= 32 )); then
+        export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+        VLLM_BLOCK_SIZE=16
+
+        if [[ -n "$ASYNC_SCHEDULING_ARGS" ]]; then
+            echo "Using shuffle KV cache layout with block size 16 and disabling async scheduling for 8k1k c${CONC}."
+        else
+            echo "Using shuffle KV cache layout with block size 16 and async scheduling for 8k1k c${CONC}."
+        fi
+    elif [[ -n "$ASYNC_SCHEDULING_ARGS" ]]; then
+        echo "Using baseline block size 32, shuffle disabled, and disabling async scheduling for 8k1k c${CONC}."
+    else
+        echo "Using baseline block size 32, shuffle disabled, and async scheduling for 8k1k c${CONC}."
+    fi
+else
+    echo "Using baseline block size 32, shuffle disabled, and async scheduling for ISL=${ISL}, OSL=${OSL}, c${CONC}."
+fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -52,9 +91,10 @@ $EP \
 --gpu-memory-utilization 0.95 \
 --max-model-len $MAX_MODEL_LEN \
 --kv-cache-dtype fp8 \
---block-size=32 \
+--block-size=$VLLM_BLOCK_SIZE \
 --no-enable-prefix-caching \
 --attention-backend "ROCM_AITER_FA" \
+$ASYNC_SCHEDULING_ARGS \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!