#!/bin/bash
set -eux

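# wandb is used below for experiment tracking (see --wandb_project / --wandb_exp_name).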
pip install -U wandb

# Clone Megatron-LM separately because in swift, while one subprocess is
# cloning the repo, another starts trying to install it. Cloning it here
# ensures the repo already exists when the installation is attempted.
mkdir -p /root/.cache/modelscope/hub/_github
cd /root/.cache/modelscope/hub/_github
git clone https://github.com/NVIDIA/Megatron-LM.git Megatron-LM --branch core_r0.13.0

cd /root/
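# Long-context SFT dataset and the MoE model to fine-tune (IDs as on Hugging Face).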
export DATASET="zai-org/LongAlign-10k"
export MODEL_ID="Qwen/Qwen3-30B-A3B-Instruct-2507"

export MCORE_MODEL_DIR="Converted/Qwen3-30B-A3B-Instruct-2507-mcore"
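# Convert the Hugging Face checkpoint to Megatron-Core (mcore) format in bfloat16.
# --use_hf pulls the model from the Hugging Face Hub instead of ModelScope.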
swift export \
    --model "$MODEL_ID" \
    --to_mcore true \
    --torch_dtype bfloat16 \
    --use_hf \
    --output_dir "$MCORE_MODEL_DIR"

echo "Done converting ckpt"

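# Multi-node Megatron SFT. NPROC_PER_NODE / NNODES / NODE_RANK / MASTER_ADDR are
# taken from the launcher-provided BT_* environment variables (assumed to be set
# by the job scheduler). With tensor parallel 2 and pipeline parallel 2, each
# model replica spans 4 GPUs; expert parallel 2 shards the MoE experts across
# ranks. Checkpoints are written to $BT_CHECKPOINT_DIR every 40 iterations,
# without optimizer or RNG state (--no_save_optim / --no_save_rng).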
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True NPROC_PER_NODE=$BT_NUM_GPUS NNODES=$BT_GROUP_SIZE NODE_RANK=$BT_NODE_RANK MASTER_ADDR=$BT_LEADER_ADDR megatron sft \
    --load "$MCORE_MODEL_DIR" \
    --dataset "$DATASET" \
    --no_initialization false \
    --split_dataset_ratio 0.01 \
    --tensor_model_parallel_size 2 \
    --pipeline_model_parallel_size 2 \
    --expert_model_parallel_size 2 \
    --moe_permute_fusion true \
    --moe_grouped_gemm true \
    --moe_shared_expert_overlap true \
    --moe_aux_loss_coeff 1e-3 \
    --micro_batch_size 1 \
    --global_batch_size 8 \
    --packing true \
    --recompute_granularity full \
    --recompute_method uniform \
    --recompute_num_layers 4 \
    --train_iters 200 \
    --eval_iters 40 \
    --finetune true \
    --cross_entropy_loss_fusion true \
    --lr 1e-5 \
    --lr_warmup_fraction 0.05 \
    --min_lr 1e-6 \
    --save "$BT_CHECKPOINT_DIR" \
    --eval_interval 40 \
    --save_interval 40 \
    --max_length 32000 \
    --num_workers 8 \
    --dataset_num_proc 8 \
    --no_save_optim true \
    --no_save_rng true \
    --sequence_parallel true \
    --attention_backend flash \
    --optimizer_cpu_offload true \
    --use_precision_aware_optimizer true \
    --use_hf 1 \
    --wandb_project qwen3_moe_megatron \
    --wandb_exp_name all_training_b10f
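
# After training, the mcore checkpoint in $BT_CHECKPOINT_DIR can be converted back
# to Hugging Face format with `swift export` (ms-swift documents a --to_hf option;
# verify the flag name against your installed ms-swift version).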