Commit 8a029b0

Megatron multi node V1 (#20)
1 parent: cfa8b7a

2 files changed: +106 additions, -0 deletions
Lines changed: 40 additions & 0 deletions
from truss_train import definitions
from truss.base import truss_config

BASE_IMAGE = "baseten/megatron:0.0.1"
PROJECT_NAME = "Megatron-qwen3-30b-a3b 2nodes"

training_runtime = definitions.Runtime(
    start_commands=["/bin/sh -c 'chmod +x ./run.sh && ./run.sh'"],
    environment_variables={
        "HF_TOKEN": definitions.SecretReference(
            name="hf_access_token"
        ),  # The name of the HF access token secret in your Baseten (B10) account
        "HF_HUB_ENABLE_HF_TRANSFER": "true",
        "WANDB_API_KEY": definitions.SecretReference(
            name="wandb_api_key"
        ),  # Comment this out if you don't want to use wandb
    },
    cache_config=definitions.CacheConfig(
        enabled=False,
    ),
    checkpointing_config=definitions.CheckpointingConfig(
        enabled=True,
    ),
)
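
# Note (assumption, inferred from the run script below rather than truss_train
# docs): with checkpointing enabled, the platform exposes a checkpoint
# directory to the job as $BT_CHECKPOINT_DIR, which run.sh passes to --save.
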
training_compute = definitions.Compute(
    accelerator=truss_config.AcceleratorSpec(
        accelerator=truss_config.Accelerator.H100,
        count=8,
    ),
    node_count=2,
)
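# 2 nodes x 8 H100s = 16 GPUs in total for this job.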

training_job = definitions.TrainingJob(
    image=definitions.Image(base_image=BASE_IMAGE),
    compute=training_compute,
    runtime=training_runtime,
)

training_project = definitions.TrainingProject(name=PROJECT_NAME, job=training_job)
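
Assuming this config is saved as, say, config.py (the diff does not show file names), the project would typically be launched with the truss CLI. A minimal sketch; the exact subcommand may vary by truss version:

    truss train push config.py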
Lines changed: 66 additions & 0 deletions
#!/bin/bash
set -eux

pip install -U wandb

# Clone Megatron-LM separately because in swift, while one subprocess is
# cloning the repo, another starts trying to install it. Cloning it up front
# ensures the repo exists by the time installation is attempted.
mkdir -p /root/.cache/modelscope/hub/_github
cd /root/.cache/modelscope/hub/_github
git clone https://github.com/NVIDIA/Megatron-LM.git Megatron-LM --branch core_r0.13.0

cd /root/
export DATASET="zai-org/LongAlign-10k"
export MODEL_ID="Qwen/Qwen3-30B-A3B-Instruct-2507"

export MCORE_MODEL_DIR="Converted/Qwen3-30B-A3B-Instruct-2507-mcore"
swift export \
    --model $MODEL_ID \
    --to_mcore true \
    --torch_dtype bfloat16 \
    --use_hf \
    --output_dir $MCORE_MODEL_DIR

echo "Done converting ckpt"
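
# The BT_* variables below are assumed to be injected by the Baseten training
# platform at runtime (inferred from their use here; not shown in this diff):
#   BT_NUM_GPUS       - GPUs per node (8 here), passed as NPROC_PER_NODE
#   BT_GROUP_SIZE     - number of nodes (2 here), passed as NNODES
#   BT_NODE_RANK      - rank of this node within the job
#   BT_LEADER_ADDR    - address of the lead node, passed as MASTER_ADDR
#   BT_CHECKPOINT_DIR - checkpoint directory, passed to --save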
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True NPROC_PER_NODE=$BT_NUM_GPUS NNODES=$BT_GROUP_SIZE NODE_RANK=$BT_NODE_RANK MASTER_ADDR=$BT_LEADER_ADDR megatron sft \
    --load $MCORE_MODEL_DIR \
    --dataset $DATASET \
    --no_initialization false \
    --split_dataset_ratio 0.01 \
    --tensor_model_parallel_size 2 \
    --pipeline_model_parallel_size 2 \
    --expert_model_parallel_size 2 \
    --moe_permute_fusion true \
    --moe_grouped_gemm true \
    --moe_shared_expert_overlap true \
    --moe_aux_loss_coeff 1e-3 \
    --micro_batch_size 1 \
    --global_batch_size 8 \
    --packing true \
    --recompute_granularity full \
    --recompute_method uniform \
    --recompute_num_layers 4 \
    --train_iters 200 \
    --eval_iters 40 \
    --finetune true \
    --cross_entropy_loss_fusion true \
    --lr 1e-5 \
    --lr_warmup_fraction 0.05 \
    --min_lr 1e-6 \
    --save $BT_CHECKPOINT_DIR \
    --eval_interval 40 \
    --save_interval 40 \
    --max_length 32000 \
    --num_workers 8 \
    --dataset_num_proc 8 \
    --no_save_optim true \
    --no_save_rng true \
    --sequence_parallel true \
    --attention_backend flash \
    --optimizer_cpu_offload true \
    --use_precision_aware_optimizer true \
    --use_hf 1 \
    --wandb_project qwen3_moe_megatron \
    --wandb_exp_name all_training_b10f
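
A quick sanity check on the parallelism and batch arithmetic implied by the flags above. This is a sketch: the degrees are read directly from the command, and the expert-parallel note follows the usual Megatron-Core convention rather than anything stated in this commit:

    # World size: 2 nodes x 8 GPUs = 16 ranks.
    WORLD_SIZE=$((2 * 8))
    # Each model replica spans tensor(2) x pipeline(2) = 4 GPUs.
    MODEL_PARALLEL=$((2 * 2))
    # Data-parallel degree: 16 / 4 = 4 replicas.
    DP=$((WORLD_SIZE / MODEL_PARALLEL))
    # Gradient accumulation: global_batch / (micro_batch * DP) = 8 / (1 * 4) = 2.
    ACCUM=$((8 / (1 * DP)))
    # expert_model_parallel_size=2 additionally shards the MoE experts across
    # pairs of data-parallel ranks (Megatron-Core convention).
    echo "data_parallel=$DP grad_accum_steps=$ACCUM"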
