From 72dbddd94dcdd8d1ce788012d60d4a858f4df560 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 4 May 2026 15:27:57 -0700 Subject: [PATCH 1/5] Add DSv4 B200 TRT benchmark --- .github/configs/nvidia-master.yaml | 21 +++ benchmarks/single_node/dsv4_fp4_b200_trt.sh | 160 ++++++++++++++++++++ perf-changelog.yaml | 7 + 3 files changed, 188 insertions(+) create mode 100644 benchmarks/single_node/dsv4_fp4_b200_trt.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 042d9a5f8..daf84b773 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1755,6 +1755,27 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } +dsv4-fp4-b200-trt: + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-4999884 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b200-dsv4 + precision: fp4 + framework: trt + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128 } + # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. dsv4-fp4-b200-vllm-mtp: diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh new file mode 100644 index 000000000..654499f8e --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -0,0 +1,160 @@ +#!/usr/bin/env bash + +# DeepSeek-V4-Pro single-node TRTLLM recipe for B200. The configured image +# already contains a TensorRT-LLM DeepSeek-V4 build; do not build TRTLLM at +# runtime from this benchmark path. + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + DP_ATTENTION \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" + +export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-1}" +export TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV="${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-1}" + +sanitize_slurm_mpi_env_for_trtllm() { + if [[ "${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-0}" != "1" ]]; then + return 0 + fi + + echo "Sanitizing Slurm/PMI environment for TensorRT-LLM launch" + while IFS='=' read -r name _; do + case "$name" in + SLURM_*|PMIX*|PMI*|OMPI_*|ORTE_*) + unset "$name" + ;; + esac + done < <(env) +} + +sanitize_slurm_mpi_env_for_trtllm + +export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" +echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" + +if [[ "$MODEL" != /* ]]; then + hf download "$MODEL" +fi + +nvidia-smi + +SERVER_LOG="$PWD/server.log" +PORT=${PORT:-8888} +EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml" + +MOE_BACKEND="TRTLLM" +MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) +CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" +KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}" + +ATTENTION_DP_CONFIG="" +if [[ "$DP_ATTENTION" == "true" ]]; then + ATTENTION_DP_CONFIG=" +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60" +fi + +cat > "$EXTRA_CONFIG_FILE" << EOF +cuda_graph_config: + enable_padding: true + max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE +enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG +print_iter_log: true +kv_cache_config: + tokens_per_block: 128 + dtype: fp8 + free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION + enable_block_reuse: false +stream_interval: 10 +num_postprocess_workers: 4 +moe_config: + backend: $MOE_BACKEND +EOF + +echo "Generated config file contents:" +cat "$EXTRA_CONFIG_FILE" + +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) +MAX_NUM_TOKENS=$(( ISL + OSL + 256 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + +# DeepSeek-V4-Pro has hidden size 7168. Keep fused HC off with the current +# feat/deepseek_v4 image, matching the B300 TRT recipe. +export TRTLLM_MHC_ENABLE_FUSED_HC="${TRTLLM_MHC_ENABLE_FUSED_HC:-0}" +echo "TRTLLM_MHC_ENABLE_FUSED_HC: $TRTLLM_MHC_ENABLE_FUSED_HC" + +start_gpu_monitor --output "$PWD/gpu_metrics.csv" + +set -x +SERVE_CMD=( + trtllm-serve "$MODEL" \ + --host 0.0.0.0 \ + --port "$PORT" \ + --trust_remote_code \ + --backend pytorch \ + --max_batch_size "$MAX_BATCH_SIZE" \ + --max_seq_len "$MAX_MODEL_LEN" \ + --max_num_tokens "$MAX_NUM_TOKENS" \ + --tp_size "$TP" \ + --ep_size "$EP_SIZE" \ + --custom_tokenizer deepseek_v4 \ + --config "$EXTRA_CONFIG_FILE" +) + +if [[ "${TRTLLM_DSV4_USE_MPIRUN:-1}" == "0" ]]; then + "${SERVE_CMD[@]}" > "$SERVER_LOG" 2>&1 & +else + mpirun -n 1 --oversubscribe --allow-run-as-root \ + "${SERVE_CMD[@]}" \ + > "$SERVER_LOG" 2>&1 & +fi + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai-chat \ + --endpoint /v1/chat/completions \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$(( CONC * 10 ))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir "$PWD/" \ + --trust-remote-code \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 98fa4e8b3..8f4f587bf 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2207,3 +2207,10 @@ - "run_benchmark_serving uses --dsv4 (chat-formatted prompts) per the AGENTS.md MTP rule, since EAGLE-style speculative decoding regresses acceptance on raw random tokens" - "Search space mirrors the non-MTP H200 entry: TP=8, EP=8, DP-attn=true, CONC 4-64 for both 1k1k and 8k1k, with spec-decoding: mtp" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1222 + +- config-keys: + - dsv4-fp4-b200-trt + description: + - "Add B200 TensorRT-LLM DeepSeek-V4-Pro single-node coverage using the feat/deepseek_v4 image" + - "Mirror the B300 TRT launch path with OpenAI chat serving, FP8 KV cache, TRTLLM MoE, NCCL NVLS disabled by default, and fused MHC disabled for hidden size 7168 correctness" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1277 From b76e23dce8c38c04c9c4b80b4a0d3774d062160b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 4 May 2026 15:54:05 -0700 Subject: [PATCH 2/5] Fix B200 DGXC Slurm partition --- perf-changelog.yaml | 1 + runners/launch_b200-dgxc.sh | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8f4f587bf..95b896986 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2213,4 +2213,5 @@ description: - "Add B200 TensorRT-LLM DeepSeek-V4-Pro single-node coverage using the feat/deepseek_v4 image" - "Mirror the B300 TRT launch path with OpenAI chat serving, FP8 KV cache, TRTLLM MoE, NCCL NVLS disabled by default, and fused MHC disabled for hidden size 7168 correctness" + - "Update the B200 DGXC Slurm partition from removed gpu to gpu-2 so single-node B200 jobs can allocate" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1277 diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index de66a0c4b..fb259dd7d 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -1,7 +1,7 @@ #!/usr/bin/bash # System-specific configuration for B200 DGXC Slurm cluster -SLURM_PARTITION="gpu" +SLURM_PARTITION="gpu-2" SLURM_ACCOUNT="benchmark" set -x @@ -284,8 +284,14 @@ else CONTAINER_MOUNT_DIR=/workspace fi + # b200-dgxc was re-partitioned from gpu to gpu-1/gpu-2. Use gpu-2, which + # is the clean GPU-only pool, instead of the removed gpu partition. salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) + if [ -z "$JOB_ID" ]; then + echo "ERROR: salloc failed to allocate a job on partition $SLURM_PARTITION" + exit 1 + fi # Use flock to serialize concurrent imports to the same squash file # Override ENROOT_CACHE_PATH to avoid permission issues with system-wide cache on worker nodes From 1cfb7eeeeab690346985aeb69f72fec2d20596c1 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 5 May 2026 14:33:56 -0700 Subject: [PATCH 3/5] Update nvidia-master.yaml --- .github/configs/nvidia-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index daf84b773..ef02b37fe 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1756,7 +1756,7 @@ dsv4-fp4-b200-vllm: - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } dsv4-fp4-b200-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-4999884 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -1769,12 +1769,12 @@ dsv4-fp4-b200-trt: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. From 289672c406674da3f5b52b198fcee4a94cc5926c Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 5 May 2026 14:36:41 -0700 Subject: [PATCH 4/5] Cleanup: Remove fused HC settings from script Removed comments and environment variable related to fused HC. --- benchmarks/single_node/dsv4_fp4_b200_trt.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index 654499f8e..40669cd15 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -101,11 +101,6 @@ if [ "${EVAL_ONLY}" = "true" ]; then MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" fi -# DeepSeek-V4-Pro has hidden size 7168. Keep fused HC off with the current -# feat/deepseek_v4 image, matching the B300 TRT recipe. -export TRTLLM_MHC_ENABLE_FUSED_HC="${TRTLLM_MHC_ENABLE_FUSED_HC:-0}" -echo "TRTLLM_MHC_ENABLE_FUSED_HC: $TRTLLM_MHC_ENABLE_FUSED_HC" - start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x From 9a25686f305c60fa04163f5435bf38e51c7e33bf Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 5 May 2026 16:04:56 -0700 Subject: [PATCH 5/5] final --- .github/configs/nvidia-master.yaml | 4 ++-- runners/launch_b200-dgxc.sh | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ef02b37fe..94af7fdda 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1769,12 +1769,12 @@ dsv4-fp4-b200-trt: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 2048 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 } # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index fb259dd7d..e2681ccec 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -284,14 +284,8 @@ else CONTAINER_MOUNT_DIR=/workspace fi - # b200-dgxc was re-partitioned from gpu to gpu-1/gpu-2. Use gpu-2, which - # is the clean GPU-only pool, instead of the removed gpu partition. salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) - if [ -z "$JOB_ID" ]; then - echo "ERROR: salloc failed to allocate a job on partition $SLURM_PARTITION" - exit 1 - fi # Use flock to serialize concurrent imports to the same squash file # Override ENROOT_CACHE_PATH to avoid permission issues with system-wide cache on worker nodes