From 8b29399c51395c4b2cb9025f708a3b18e94f1ab7 Mon Sep 17 00:00:00 2001 From: odashi Date: Wed, 14 Jan 2026 18:09:56 +0900 Subject: [PATCH 1/2] add moe small phase2 config --- pretrain/scripts/v4-8b-phase1/base/params.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pretrain/scripts/v4-8b-phase1/base/params.sh b/pretrain/scripts/v4-8b-phase1/base/params.sh index 57a307d..6b28681 100644 --- a/pretrain/scripts/v4-8b-phase1/base/params.sh +++ b/pretrain/scripts/v4-8b-phase1/base/params.sh @@ -112,4 +112,5 @@ ALL_PARAMS+=( # NOTE(odashi): # https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-guide.html#communication-overlaps-and-tuning export NVTE_FWD_LAYERNORM_SM_MARGIN=16 -export NVTE_BWD_LAYERNORM_SM_MARGIN=16 \ No newline at end of file +export NVTE_BWD_LAYERNORM_SM_MARGIN=16 + From 21f3460483f82c3c64f7e6fef9a691a75b0af5c4 Mon Sep 17 00:00:00 2001 From: odashi Date: Sun, 22 Feb 2026 15:08:52 +0900 Subject: [PATCH 2/2] fix moe installer --- .../scripts/environment.sh | 4 +-- .../src/install_flash_attention_3.sh | 26 +++++++------------ 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/pretrain/installers/v4-moe-upstream-megatron-abci/scripts/environment.sh b/pretrain/installers/v4-moe-upstream-megatron-abci/scripts/environment.sh index bff0350..dded596 100644 --- a/pretrain/installers/v4-moe-upstream-megatron-abci/scripts/environment.sh +++ b/pretrain/installers/v4-moe-upstream-megatron-abci/scripts/environment.sh @@ -16,8 +16,8 @@ export PRETRAIN_NCCL_VERSION_WITH_PATCH=2.25.1-1 export PRETRAIN_PYTHON_VERSION=3.10.4 export PRETRAIN_TORCH_VERSION=2.8.0 export PRETRAIN_APEX_COMMIT=e13873debc4699d39c6861074b9a3b2a02327f92 -export PRETRAIN_FLASH_ATTENTION_VERSION=060c9188beec3a8b62b33a3bfa6d5d2d44975fab -export PRETRAIN_TRANSFORMER_ENGINE_VERSION=2.5.0 +export PRETRAIN_FLASH_ATTENTION_VERSION=2.8.1 +export PRETRAIN_TRANSFORMER_ENGINE_VERSION=2.8.0 # export PRETRAIN_MEGATRON_TAG=v4 export PRETRAIN_MEGATRON_TAG=main diff --git a/pretrain/installers/v4-moe-upstream-megatron-abci/src/install_flash_attention_3.sh b/pretrain/installers/v4-moe-upstream-megatron-abci/src/install_flash_attention_3.sh index 8da9616..5d1227c 100644 --- a/pretrain/installers/v4-moe-upstream-megatron-abci/src/install_flash_attention_3.sh +++ b/pretrain/installers/v4-moe-upstream-megatron-abci/src/install_flash_attention_3.sh @@ -1,29 +1,23 @@ # Installs flash attention 3 (flash attention for NVIDIA Hopper architecture). -# CAUTION(sosuke): -# Installing flash attention v2 and v3 in the same environment may cause problems when used with Megatron-LM. -# We highly recommend only to use flash attention v3 for Hopper architecture. - echo "Installing Flash Attention ${PRETRAIN_FLASH_ATTENTION_VERSION}" source ${TARGET_DIR}/venv/bin/activate pushd ${TARGET_DIR}/src -git clone https://github.com/Dao-AILab/flash-attention.git -pushd flash-attention/ -git checkout ${PRETRAIN_FLASH_ATTENTION_VERSION} - -# Use flash-attention 3 -pushd hopper/ +git clone https://github.com/Dao-AILab/flash-attention.git -b v${PRETRAIN_FLASH_ATTENTION_VERSION} --recursive +pushd flash-attention +# install v2 python setup.py install +pushd hopper +# install v3 +python setup.py install +python_path=$(python -c "import site; print(site.getsitepackages()[0])") +cp ./flash_attn_interface.py ${python_path}/flash_attn_3 +popd +popd -python_path=`python -c "import site; print(site.getsitepackages()[0])"` -mkdir -p $python_path/flash_attn_3 -wget -P $python_path/flash_attn_3 https://raw.githubusercontent.com/Dao-AILab/flash-attention/${PRETRAIN_FLASH_ATTENTION_VERSION}/hopper/flash_attn_interface.py - -popd # hopper/ -popd # flash-attention/ popd # ${TARGET_DIR}/src deactivate