Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
75 commits
Select commit Hold shift + click to select a range
0383696
[AMD] add dsr1 mxfp4 v2 sweep points
billishyahao Mar 16, 2026
18e05b1
fix
billishyahao Mar 17, 2026
32b5d3d
Fix tokenizer mismatch between benchmark client and sglang server on …
ZhaiFeiyue Mar 24, 2026
0bd347f
change mtp model to fp8
billishyahao Mar 25, 2026
754e53c
change fp8 image
billishyahao Mar 25, 2026
f29f2d0
bump image to 0327
billishyahao Mar 27, 2026
a44c7eb
remove specv2
billishyahao Mar 27, 2026
2514136
consolidate dsr1 fp4 configs
billishyahao Mar 30, 2026
3b4d4ab
Merge remote-tracking branch 'inf/main' into amd/mi355x-dsfp4-march15
billishyahao Mar 30, 2026
682a4ab
bump fp8 image to 0327
billishyahao Mar 30, 2026
64bf100
fix crash
billishyahao Mar 30, 2026
c44e175
fix env
billishyahao Mar 30, 2026
0a41f89
cleanup
billishyahao Mar 31, 2026
7282748
add perf change log
billishyahao Mar 31, 2026
e6d4b32
add deprecate comments
billishyahao Mar 31, 2026
b7dd65f
add spec v2 env
billishyahao Apr 1, 2026
12a4ba0
bump the docker image
billishyahao Apr 2, 2026
597a458
add stream control to eliminate cpu overhead
billishyahao Apr 9, 2026
f715e47
tune the config
billishyahao Apr 10, 2026
2ea82d5
bump image
billishyahao Apr 11, 2026
16384e7
tune config
billishyahao Apr 11, 2026
4d733e7
add new exp config
billishyahao Apr 13, 2026
83af743
enable log level info
billishyahao Apr 13, 2026
0c3083e
fix mori env
billishyahao Apr 13, 2026
1c61622
bump image
billishyahao Apr 13, 2026
e2d2ac9
fix log
billishyahao Apr 13, 2026
d2a7988
bump the image
billishyahao Apr 14, 2026
b09ae6c
fix
billishyahao Apr 14, 2026
2c3ee04
fix
billishyahao Apr 14, 2026
69102f7
fix
billishyahao Apr 15, 2026
668068c
fix
billishyahao Apr 16, 2026
776fd42
bump image to 0416
billishyahao Apr 16, 2026
2471379
fix
billishyahao Apr 17, 2026
c80997f
set si to 100
billishyahao Apr 17, 2026
616c57d
bump the image
billishyahao Apr 18, 2026
3d62e2c
revert old image
billishyahao Apr 19, 2026
2c4c09d
revert old image
billishyahao Apr 19, 2026
1c9b8d2
increase DISPATCH_TOKENS_PREFILL to 5120
billishyahao Apr 20, 2026
8e6104e
bump image to 0417
billishyahao Apr 20, 2026
7cc5d81
add exp config
billishyahao Apr 21, 2026
a1c05da
add exp config
billishyahao Apr 22, 2026
a915729
add exp config
billishyahao Apr 23, 2026
44d10a1
add exp config
billishyahao Apr 23, 2026
f09820e
add exp configs
billishyahao Apr 24, 2026
5144ca1
add exp configs
billishyahao Apr 24, 2026
d9e2eef
bump image
billishyahao Apr 28, 2026
ee33925
sync arguments
billishyahao Apr 30, 2026
2b1ff6b
fix
billishyahao Apr 30, 2026
0548773
fix config
billishyahao May 1, 2026
724bd61
add exp configs
billishyahao May 1, 2026
f8f0a3a
enable sdma
billishyahao May 1, 2026
feb6c7d
fix
billishyahao May 1, 2026
f501a3e
fix
billishyahao May 1, 2026
217d892
cleanup
billishyahao May 1, 2026
a5a822a
bump image
billishyahao May 2, 2026
91e1396
Merge remote-tracking branch 'inf/main' into amd/mi355x-dsfp4-april14
billishyahao May 2, 2026
2d2dada
fix yaml
billishyahao May 2, 2026
0d84ca0
Merge branch 'main' into amd/mi355x-dsfp4-april14
billishyahao May 2, 2026
26e6979
fix eval
billishyahao May 2, 2026
3974848
fix eval
billishyahao May 2, 2026
ace0e0e
fix eval
billishyahao May 3, 2026
d3a7d1e
add eval only to perflog
billishyahao May 3, 2026
d90995f
Merge remote-tracking branch 'inf/main' into amd/mi355x-dsfp4-april14
billishyahao May 3, 2026
1a007bd
fix eval
billishyahao May 3, 2026
3235860
fix
billishyahao May 3, 2026
8fbd2ab
Merge remote-tracking branch 'inf/main' into amd/mi355x-dsfp4-april14
billishyahao May 3, 2026
3ff0812
fix args diff for eval
billishyahao May 3, 2026
2a32c13
add eval only
billishyahao May 3, 2026
219cf7a
Merge remote-tracking branch 'inf/main' into amd/mi355x-dsfp4-april14
billishyahao May 3, 2026
a73f622
fix
billishyahao May 3, 2026
8039b5f
Merge remote-tracking branch 'inf/main' into amd/mi355x-dsfp4-april14
billishyahao May 3, 2026
25fb9d1
Merge branch 'main' into amd/mi355x-dsfp4-april14
billishyahao May 3, 2026
2c48183
Merge remote-tracking branch 'inf/main' into amd/mi355x-dsfp4-april14
billishyahao May 5, 2026
9be76be
Update server.sh
Oseltamivir May 7, 2026
b8ed4a3
Merge branch 'main' into amd/mi355x-dsfp4-april14
Oseltamivir May 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 22 additions & 21 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1145,10 +1145,9 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=2"


dsr1-fp4-mi355x-sglang-disagg:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
model: amd/DeepSeek-R1-0528-MXFP4
image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
model: amd/DeepSeek-R1-0528-MXFP4-v2
model-prefix: dsr1
runner: mi355x-disagg
precision: fp4
Expand Down Expand Up @@ -1239,7 +1238,7 @@ dsr1-fp4-mi355x-sglang-disagg:

# 1*DEP4 + 1*DEP8
- spec-decoding: "none"
conc-list: [ 1024, 2048 ]
conc-list: [ 1024, 2048, 4096 ]
prefill:
num-worker: 1
tp: 4
Expand Down Expand Up @@ -1336,16 +1335,16 @@ dsr1-fp4-mi355x-sglang-disagg:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# 4*DEP4 + 1*DEP8
# 2*DEP8 + 1*DEP8
- spec-decoding: "none"
conc-list: [ 1024, 2048, 4096 ]
prefill:
num-worker: 4
tp: 4
ep: 4
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=4"
- "PREFILL_NODES=2"
decode:
num-worker: 1
tp: 8
Expand All @@ -1355,9 +1354,10 @@ dsr1-fp4-mi355x-sglang-disagg:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"


dsr1-fp4-mi355x-sglang-disagg-mtp:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
model: amd/DeepSeek-R1-0528-MXFP4
image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
model: amd/DeepSeek-R1-0528-MXFP4-v2
model-prefix: dsr1
runner: mi355x-disagg
precision: fp4
Expand Down Expand Up @@ -1425,7 +1425,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=1"
- "DECODE_MTP_SIZE=2"

# 1P2D TP4
- spec-decoding: "mtp"
Expand All @@ -1444,11 +1444,11 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=1"
- "DECODE_MTP_SIZE=2"

# 1*DEP4 + 1*DEP8
- spec-decoding: "mtp"
conc-list: [ 1024, 2048 ]
conc-list: [ 1024, 2048, 4096 ]
prefill:
num-worker: 1
tp: 4
Expand Down Expand Up @@ -1526,7 +1526,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=1"
- "DECODE_MTP_SIZE=2"

# 1P2D TP4
- spec-decoding: "mtp"
Expand All @@ -1545,18 +1545,18 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=1"
- "DECODE_MTP_SIZE=2"

# 4*DEP4 + 1*DEP8
# 2*DEP8 + 1*DEP8
- spec-decoding: "mtp"
conc-list: [ 1024, 2048, 4096 ]
prefill:
num-worker: 4
tp: 4
ep: 4
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=4"
- "PREFILL_NODES=2"
decode:
num-worker: 1
tp: 8
Expand All @@ -1565,6 +1565,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"


dsv4-fp8-mi355x-sglang:
image: rocm/sgl-dev:deepseek-v4-mi35x
Expand Down
53 changes: 34 additions & 19 deletions benchmarks/multi_node/amd_utils/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,39 +34,47 @@ export IBDEVICES
export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)

set +x

export NCCL_IB_HCA=$IBDEVICES

export SGLANG_USE_AITER=1
export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200
export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200

export SGLANG_MORI_DISPATCH_DTYPE=auto
export SGLANG_MORI_FP8_COMB=true
export SGLANG_MORI_QP_PER_TRANSFER=4
export SGLANG_MORI_NUM_WORKERS=4
export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000

export MORI_IO_QP_MAX_SEND_WR=16384
export MORI_IO_QP_MAX_CQE=32768
export MORI_IO_QP_MAX_SGE=4

export MORI_IO_TC_DISABLE=0

export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600

# Disable allocating memory in one pass
export MORI_SHMEM_MODE=ISOLATION
export SGLANG_MORI_FP8_DISP=True

if [[ "$MODEL_NAME" == *mxfp4* ]]; then
export SGLANG_MORI_FP8_DISP=False
fi
# Enable spec v2
export SGLANG_ENABLE_SPEC_V2=1
export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1

export SGLANG_MORI_FP4_DISP=False
export SGLANG_MORI_FP8_COMB=False
export SGLANG_LOG_MS=true
export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32

# Per-role dispatch token limits (prefill gets the higher limit, decode the lower one)
export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384
if [[ "$MODEL_NAME" == *mxfp4* ]]; then
export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288
fi
export MORI_MAX_DISPATCH_TOKENS_DECODE=160
export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
export MORI_MAX_DISPATCH_TOKENS_DECODE=512

export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768
export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703

# set MTP size=1 when EP16
export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))

export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
export MORI_IO_QP_MAX_SEND_WR=16384
export MORI_IO_QP_MAX_CQE=32768
export MORI_IO_QP_MAX_SGE=4


export MORI_APP_LOG_LEVEL=INFO

Expand All @@ -89,17 +97,21 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
TC=$(( 4 * ND_DSCP ))
export MORI_RDMA_SL=$ND_PRIO
export MORI_IO_SL=$ND_PRIO
export MORI_RDMA_TC=$TC
echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL"
export MORI_IO_TC=$TC
echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL"
else
echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
# Fall back to hostname-based detection
NODENAME=$(hostname -s)
if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
export MORI_RDMA_TC=96
export MORI_IO_TC=96
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
export MORI_IO_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
else
echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
Expand All @@ -110,9 +122,11 @@ else
NODENAME=$(hostname -s)
if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
export MORI_RDMA_TC=96
export MORI_IO_TC=96
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
export MORI_IO_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
else
echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
Expand All @@ -124,3 +138,4 @@ fi
export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}


set +x
34 changes: 34 additions & 0 deletions benchmarks/multi_node/amd_utils/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,37 @@ DeepSeek-R1-0528-MXFP4:
max_running_requests: 128
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-128"

# Settings for the DeepSeek-R1-0528-MXFP4-v2 model: three flag strings followed by
# a prefill section and a decode section.
DeepSeek-R1-0528-MXFP4-v2:
# Flags covering logging, the watchdog timeout, KV-cache dtype, and the attention
# and disaggregation-transfer backends. Presumably passed on every launch of this
# model — confirm how the launcher combines the three flag strings.
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
# Speculative-decoding flags: NEXTN algorithm with the SGLang/DeepSeek-R1-NextN
# draft model. Presumably appended only for MTP runs — confirm.
mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
# Flags that turn on DP attention and select the mori MoE all-to-all backend.
# Presumably appended only for DP-attention runs — confirm.
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 "
# Prefill-side settings.
prefill:
mem_fraction_static: 0.8
disable_radix_cache: true
# Presumably the variant used when DP attention is enabled — TODO confirm.
dp:
max_running_requests: 4096
# Written as an expression string rather than a number; presumably evaluated by
# the launcher from MORI_MAX_DISPATCH_TOKENS_PREFILL and PREFILL_TP_SIZE — confirm.
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
# An explicit batch-size list here; the other sections give a cuda_graph_bs_range instead.
cuda_graph_bs: "1 2 3"
context_length: 9217
max_total_tokens: 131072
enable_two_batch_overlap: true
# Presumably the variant used when DP attention is disabled — TODO confirm.
no_dp:
max_running_requests: 128
chunked_prefill_size: 16384
cuda_graph_bs_range: "1-128"
# Decode-side settings.
decode:
mem_fraction_static: 0.85
prefill_round_robin_balance: true
# Presumably the variant used when DP attention is enabled — TODO confirm.
dp:
max_running_requests: 4096
# Written as an expression string rather than a number; presumably evaluated by
# the launcher from MORI_MAX_DISPATCH_TOKENS_DECODE and DECODE_TP_SIZE — confirm.
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
cuda_graph_bs_range: "1-512"
# NOTE(review): the selection rule for ep_only is not visible here — confirm against the launcher.
ep_only:
max_running_requests: 256
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-256"
# Presumably the variant used when DP attention is disabled — TODO confirm.
no_dp:
max_running_requests: 128
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-128"
Loading
Loading