Skip to content
41 changes: 39 additions & 2 deletions benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,43 @@ fi

export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
VLLM_BLOCK_SIZE=32
ASYNC_SCHEDULING_ARGS=""

if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
echo "1k1k TP8/EP8: using block size 32, shuffle disabled, async scheduling disabled."
elif (( CONC <= 128 )); then
export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
VLLM_BLOCK_SIZE=16
ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
echo "1k1k c${CONC}: using block size 16, shuffle enabled, async scheduling disabled."
else
export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
VLLM_BLOCK_SIZE=16
echo "1k1k c${CONC}: using block size 16, shuffle enabled, async scheduling enabled."
fi
elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
export VLLM_ROCM_USE_AITER_MOE=0
ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
echo "8k1k TP8/EP8: using block size 32, shuffle disabled, AITER MoE disabled, async scheduling disabled."
elif (( CONC < 64 )); then
ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
echo "8k1k c${CONC}: using block size 32, shuffle disabled, async scheduling disabled."
elif (( CONC == 64 )); then
ASYNC_SCHEDULING_ARGS="--no-async-scheduling"
export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
VLLM_BLOCK_SIZE=16
echo "8k1k c64: using block size 16, shuffle enabled, async scheduling disabled."
else
export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
VLLM_BLOCK_SIZE=16
echo "8k1k c${CONC}: using block size 16, shuffle enabled, async scheduling enabled."
fi
fi

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
Expand All @@ -52,9 +88,10 @@ $EP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--kv-cache-dtype fp8 \
--block-size=32 \
--block-size=$VLLM_BLOCK_SIZE \
--no-enable-prefix-caching \
--attention-backend "ROCM_AITER_FA" \
$ASYNC_SCHEDULING_ARGS \
--trust-remote-code > $SERVER_LOG 2>&1 &

SERVER_PID=$!
Expand Down
Loading