From 72dbddd94dcdd8d1ce788012d60d4a858f4df560 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 4 May 2026 15:27:57 -0700
Subject: [PATCH 1/5] Add DSv4 B200 TRT benchmark

---
 .github/configs/nvidia-master.yaml          |  21 +++
 benchmarks/single_node/dsv4_fp4_b200_trt.sh | 160 ++++++++++++++++++++
 perf-changelog.yaml                         |   7 +
 3 files changed, 188 insertions(+)
 create mode 100644 benchmarks/single_node/dsv4_fp4_b200_trt.sh

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 042d9a5f8..daf84b773 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1755,6 +1755,27 @@ dsv4-fp4-b200-vllm:
       - { tp: 8, conc-start: 1, conc-end: 32 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 }
 
+dsv4-fp4-b200-trt:
+  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-4999884
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b200-dsv4
+  precision: fp4
+  framework: trt
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 32 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 32 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128 }
+
 # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
 dsv4-fp4-b200-vllm-mtp:
diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh
new file mode 100644
index 000000000..654499f8e
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+
+# DeepSeek-V4-Pro single-node TRTLLM recipe for B200. The configured image
+# already contains a TensorRT-LLM DeepSeek-V4 build; do not build TRTLLM at
+# runtime from this benchmark path.
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME \
+    DP_ATTENTION \
+    EP_SIZE
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
+
+export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-1}"
+export TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV="${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-1}"
+
+# Unset exported SLURM_*/PMI*/OMPI_*/ORTE_* vars when TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV=1.
+sanitize_slurm_mpi_env_for_trtllm() {
+    [[ "${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-0}" == "1" ]] || return 0
+
+    echo "Sanitizing Slurm/PMI environment for TensorRT-LLM launch"
+    local name
+    # compgen -e yields exported names only, one per line; parsing `env`
+    # output would misread continuation lines of multi-line values as names.
+    while IFS= read -r name; do
+        case "$name" in
+            SLURM_*|PMI*|OMPI_*|ORTE_*) unset "$name" ;;
+        esac
+    done < <(compgen -e)
+}
+
+sanitize_slurm_mpi_env_for_trtllm
+
+export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
+echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"
+
+if [[ "$MODEL" != /* ]]; then
+    hf download "$MODEL"
+fi
+
+nvidia-smi
+
+SERVER_LOG="$PWD/server.log"
+PORT=${PORT:-8888}
+EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"
+
+MOE_BACKEND="TRTLLM"
+MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
+CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
+KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
+
+ATTENTION_DP_CONFIG=""
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    ATTENTION_DP_CONFIG="
+attention_dp_config:
+    batching_wait_iters: 0
+    enable_balance: true
+    timeout_iters: 60"
+fi
+
+cat > "$EXTRA_CONFIG_FILE" << EOF
+cuda_graph_config:
+    enable_padding: true
+    max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE
+enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG
+print_iter_log: true
+kv_cache_config:
+    tokens_per_block: 128
+    dtype: fp8
+    free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
+    enable_block_reuse: false
+stream_interval: 10
+num_postprocess_workers: 4
+moe_config:
+    backend: $MOE_BACKEND
+EOF
+
+echo "Generated config file contents:"
+cat "$EXTRA_CONFIG_FILE"
+
+MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
+MAX_NUM_TOKENS=$(( ISL + OSL + 256 ))
+MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+    MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
+fi
+
+# DeepSeek-V4-Pro has hidden size 7168. Keep fused HC off with the current
+# feat/deepseek_v4 image, matching the B300 TRT recipe.
+export TRTLLM_MHC_ENABLE_FUSED_HC="${TRTLLM_MHC_ENABLE_FUSED_HC:-0}"
+echo "TRTLLM_MHC_ENABLE_FUSED_HC: $TRTLLM_MHC_ENABLE_FUSED_HC"
+
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+
+set -x
+SERVE_CMD=(
+    trtllm-serve "$MODEL" \
+    --host 0.0.0.0 \
+    --port "$PORT" \
+    --trust_remote_code \
+    --backend pytorch \
+    --max_batch_size "$MAX_BATCH_SIZE" \
+    --max_seq_len "$MAX_MODEL_LEN" \
+    --max_num_tokens "$MAX_NUM_TOKENS" \
+    --tp_size "$TP" \
+    --ep_size "$EP_SIZE" \
+    --custom_tokenizer deepseek_v4 \
+    --config "$EXTRA_CONFIG_FILE"
+)
+
+if [[ "${TRTLLM_DSV4_USE_MPIRUN:-1}" == "0" ]]; then
+    "${SERVE_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+else
+    mpirun -n 1 --oversubscribe --allow-run-as-root \
+        "${SERVE_CMD[@]}" \
+        > "$SERVER_LOG" 2>&1 &
+fi
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend openai-chat \
+    --endpoint /v1/chat/completions \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$(( CONC * 10 ))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir "$PWD/" \
+    --trust-remote-code \
+    --server-pid "$SERVER_PID"
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 98fa4e8b3..8f4f587bf 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2207,3 +2207,10 @@
     - "run_benchmark_serving uses --dsv4 (chat-formatted prompts) per the AGENTS.md MTP rule, since EAGLE-style speculative decoding regresses acceptance on raw random tokens"
     - "Search space mirrors the non-MTP H200 entry: TP=8, EP=8, DP-attn=true, CONC 4-64 for both 1k1k and 8k1k, with spec-decoding: mtp"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1222
+
+- config-keys:
+    - dsv4-fp4-b200-trt
+  description:
+    - "Add B200 TensorRT-LLM DeepSeek-V4-Pro single-node coverage using the feat/deepseek_v4 image"
+    - "Mirror the B300 TRT launch path with OpenAI chat serving, FP8 KV cache, TRTLLM MoE, NCCL NVLS disabled by default, and fused MHC disabled for hidden size 7168 correctness"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1277

From b76e23dce8c38c04c9c4b80b4a0d3774d062160b Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 4 May 2026 15:54:05 -0700
Subject: [PATCH 2/5] Fix B200 DGXC Slurm partition

---
 perf-changelog.yaml         | 1 +
 runners/launch_b200-dgxc.sh | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 8f4f587bf..95b896986 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2213,4 +2213,5 @@
   description:
     - "Add B200 TensorRT-LLM DeepSeek-V4-Pro single-node coverage using the feat/deepseek_v4 image"
     - "Mirror the B300 TRT launch path with OpenAI chat serving, FP8 KV cache, TRTLLM MoE, NCCL NVLS disabled by default, and fused MHC disabled for hidden size 7168 correctness"
+    - "Update the B200 DGXC Slurm partition from the removed gpu partition to gpu-2 so single-node B200 jobs can allocate"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1277
diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index de66a0c4b..fb259dd7d 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/bash
 
 # System-specific configuration for B200 DGXC Slurm cluster
-SLURM_PARTITION="gpu"
+SLURM_PARTITION="gpu-2"
 SLURM_ACCOUNT="benchmark"
 
 set -x
@@ -284,8 +284,14 @@ else
         CONTAINER_MOUNT_DIR=/workspace
     fi
 
+    # b200-dgxc was re-partitioned from gpu to gpu-1/gpu-2. Use gpu-2, which
+    # is the clean GPU-only pool, instead of the removed gpu partition.
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
+    if [ -z "$JOB_ID" ]; then
+        echo "ERROR: salloc failed to allocate a job on partition $SLURM_PARTITION"
+        exit 1
+    fi
 
     # Use flock to serialize concurrent imports to the same squash file
     # Override ENROOT_CACHE_PATH to avoid permission issues with system-wide cache on worker nodes

From 1cfb7eeeeab690346985aeb69f72fec2d20596c1 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 5 May 2026 14:33:56 -0700
Subject: [PATCH 3/5] Bump DSv4 B200 TRT image and raise dp-attn conc-end

---
 .github/configs/nvidia-master.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index daf84b773..ef02b37fe 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1756,7 +1756,7 @@ dsv4-fp4-b200-vllm:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 }
 
 dsv4-fp4-b200-trt:
-  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-4999884
+  image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-dsv4
@@ -1769,12 +1769,12 @@ dsv4-fp4-b200-trt:
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 32 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512 }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 32 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }
 
 # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.

From 289672c406674da3f5b52b198fcee4a94cc5926c Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 5 May 2026 14:36:41 -0700
Subject: [PATCH 4/5] Cleanup: Remove fused HC settings from script

Removed comments and environment variable related to fused HC.
---
 benchmarks/single_node/dsv4_fp4_b200_trt.sh | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh
index 654499f8e..40669cd15 100644
--- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh
@@ -101,11 +101,6 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
 fi
 
-# DeepSeek-V4-Pro has hidden size 7168. Keep fused HC off with the current
-# feat/deepseek_v4 image, matching the B300 TRT recipe.
-export TRTLLM_MHC_ENABLE_FUSED_HC="${TRTLLM_MHC_ENABLE_FUSED_HC:-0}"
-echo "TRTLLM_MHC_ENABLE_FUSED_HC: $TRTLLM_MHC_ENABLE_FUSED_HC"
-
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
 set -x

From 9a25686f305c60fa04163f5435bf38e51c7e33bf Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 5 May 2026 16:04:56 -0700
Subject: [PATCH 5/5] Raise DSv4 B200 TRT dp-attn conc-end; drop salloc JOB_ID check

---
 .github/configs/nvidia-master.yaml | 4 ++--
 runners/launch_b200-dgxc.sh        | 6 ------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ef02b37fe..94af7fdda 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1769,12 +1769,12 @@ dsv4-fp4-b200-trt:
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 32 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 2048 }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 32 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 }
 
 # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index fb259dd7d..e2681ccec 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -284,14 +284,8 @@ else
         CONTAINER_MOUNT_DIR=/workspace
     fi
 
-    # b200-dgxc was re-partitioned from gpu to gpu-1/gpu-2. Use gpu-2, which
-    # is the clean GPU-only pool, instead of the removed gpu partition.
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
-    if [ -z "$JOB_ID" ]; then
-        echo "ERROR: salloc failed to allocate a job on partition $SLURM_PARTITION"
-        exit 1
-    fi
 
     # Use flock to serialize concurrent imports to the same squash file
     # Override ENROOT_CACHE_PATH to avoid permission issues with system-wide cache on worker nodes