diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index e2fca49f2..6aaf251a5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2730,7 +2730,7 @@ dsv4-fp4-b300-trt:
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
 
 dsv4-fp4-b300-vllm-mtp:
-  image: vllm/vllm-openai:v0.20.0-cu130
+  image: vllm/vllm-openai:v0.20.1-cu130
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
@@ -2744,12 +2744,16 @@ dsv4-fp4-b300-vllm-mtp:
     search-space:
     - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
     - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 2048, spec-decoding: mtp }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 8192, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
    - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
     - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
     - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 2048, spec-decoding: mtp }
 
 qwen3.5-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.9-cu129-amd64
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 0145a7702..aed283daa 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -36,7 +36,16 @@
 if [ "${EP_SIZE:-1}" -gt 1 ]; then
   EP_ARGS=(--enable-expert-parallel)
 fi
-MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 ))
+MOE_ARGS=()
+if [ "${DP_ATTENTION}" = "true" ]; then
+  # With DP attention, select the deep_gemm mega-MoE backend and cap the
+  # batched-token budget at a fixed 2048 instead of scaling it with ISL.
+  MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
+  MAX_NUM_BATCHED_TOKENS=2048
+else
+  MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 ))
+fi
+
 BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -61,6 +70,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
   --block-size 256 \
   --no-enable-prefix-caching \
   "${EP_ARGS[@]}" \
+  "${MOE_ARGS[@]}" \
   --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
   --attention_config.use_fp4_indexer_cache True \
   --tokenizer-mode deepseek_v4 \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 524c91e67..85150737b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2179,3 +2179,11 @@
     - "Use vllm/vllm-openai:v0.20.1-ubuntu2404 directly for GB200 MTP2 instead of upgrading vLLM inside the v0.20.0 container"
     - "Fix applies to all 7 multinode launch scripts, the benchmark-multinode-tmpl workflow, and process_result.py"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1256
+
+- config-keys:
+  - dsv4-fp4-b300-vllm-mtp
+  description:
+    - "Update image tag to vllm/vllm-openai:v0.20.1-cu130"
+    - "Add DP-attention + expert-parallel (DEP) search-space configs for B300 vLLM MTP"
+    - "Use the deep_gemm_mega_moe backend and a fixed max-num-batched-tokens of 2048 when DP attention is enabled"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1271
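
Notes (not part of the patch):

Each new search-space row is expanded by the harness into runs of the
single-node launch script above. A minimal manual invocation might look like
the sketch below; EP_SIZE, DP_ATTENTION, ISL, MODEL, and PORT are read by the
script hunks shown, while TP_SIZE and OSL are assumed names for the remaining
knobs (the harness wiring is not part of this diff).

    # Hedged sketch of a manual run. EP_SIZE, DP_ATTENTION, ISL, MODEL, and
    # PORT appear in the script above; TP_SIZE and OSL are assumptions.
    MODEL=deepseek-ai/DeepSeek-V4-Pro PORT=8000 \
    ISL=8192 OSL=1024 \
    TP_SIZE=8 EP_SIZE=8 DP_ATTENTION=true \
      ./benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh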
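The conc-start/conc-end bounds read like endpoints of a power-of-two
concurrency sweep (256 to 2048, 4096 to 8192, and so on); that stepping is an
assumption about the harness, not something this diff shows. Under that
assumption, the levels for the new tp=4 DEP entry would enumerate as:

    # Assumption: the harness doubles concurrency from conc-start to conc-end.
    conc=256    # conc-start of the new tp=4 DEP entry
    end=2048    # conc-end of the same entry
    while [ "$conc" -le "$end" ]; do
      echo "concurrency level: $conc"    # 256, 512, 1024, 2048
      conc=$(( conc * 2 ))
    done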
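Design note on MOE_ARGS=(): initializing the array empty keeps the vllm serve
command line unchanged when DP attention is off, because an empty bash array
expands to zero words under "${MOE_ARGS[@]}" (safe even with set -u on
bash 4.4+). This mirrors the existing EP_ARGS pattern in the script. A quick
check:

    # Shows that "${MOE_ARGS[@]}" contributes nothing when the array is empty.
    MOE_ARGS=()
    printf '<%s>' demo "${MOE_ARGS[@]}"; echo    # prints: <demo>
    MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
    printf '<%s>' demo "${MOE_ARGS[@]}"; echo    # prints: <demo><--moe-backend><deep_gemm_mega_moe>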