Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2730,7 +2730,7 @@ dsv4-fp4-b300-trt:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }

dsv4-fp4-b300-vllm-mtp:
image: vllm/vllm-openai:v0.20.0-cu130
image: vllm/vllm-openai:v0.20.1
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
Expand All @@ -2744,12 +2744,16 @@ dsv4-fp4-b300-vllm-mtp:
search-space:
- { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 2048, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 8192, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 2048, spec-decoding: mtp }

qwen3.5-fp8-h200-sglang:
image: lmsysorg/sglang:v0.5.9-cu129-amd64
Expand Down
13 changes: 12 additions & 1 deletion benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,17 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then
EP_ARGS=(--enable-expert-parallel)
fi

# Configure MoE backend and prefill chunk size together, since both depend
# on whether data-parallel attention (DEP) is enabled. A single if/else
# avoids evaluating the same DP_ATTENTION condition twice.
MOE_ARGS=()
if [ "${DP_ATTENTION}" = "true" ]; then
  # DEP runs: use the mega-MoE kernel and cap the batched-token budget so
  # per-rank prefill chunks stay small enough for the DP attention path.
  MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
  MAX_NUM_BATCHED_TOKENS=2048
else
  # Non-DEP runs: allow up to two full input sequences per batch.
  MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 ))
fi

BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN

if [ "${EVAL_ONLY}" = "true" ]; then
Expand All @@ -61,6 +71,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
--block-size 256 \
--no-enable-prefix-caching \
"${EP_ARGS[@]}" \
"${MOE_ARGS[@]}" \
--compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
--attention_config.use_fp4_indexer_cache True \
--tokenizer-mode deepseek_v4 \
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2179,3 +2179,10 @@
- "Use vllm/vllm-openai:v0.20.1-ubuntu2404 directly for GB200 MTP2 instead of upgrading vLLM inside the v0.20.0 container"
- "Fix applies to all 7 multinode launch scripts, the benchmark-multinode-tmpl workflow, and process_result.py"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1256

- config-keys:
- dsv4-fp4-b300-vllm-mtp
description:
- "Update image tag to vllm/vllm-openai:v0.20.1"
- "Add DEP configs for B300 vLLM MTP"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1271
Loading