diff --git a/dev/ms-swift-megatron/Dockerfile b/dev/ms-swift-megatron/Dockerfile
new file mode 100644
index 00000000..8a359c93
--- /dev/null
+++ b/dev/ms-swift-megatron/Dockerfile
@@ -0,0 +1,42 @@
+# syntax=docker/dockerfile:1
+# SkyPilot-compatible image layered on the ModelScope ms-swift base image.
+FROM modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.6.3-py311-torch2.7.1-vllm0.10.0-modelscope1.28.2-swift3.7.2
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+
+# Core packages SkyPilot expects on the instance, plus distutils for gpustat build
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    openssh-server \
+    rsync \
+    netcat-openbsd \
+    pciutils \
+    libpci3 \
+    fuse3 \
+    libfuse3-3 \
+    libfuse2 \
+    python3.10 \
+    python3-pip \
+    python3.10-venv \
+    python3-distutils \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Ensure SSH server can start (SkyPilot uses SSH inside pods)
+RUN mkdir -p /var/run/sshd && \
+    sed -i 's/#\?PermitRootLogin .*/PermitRootLogin yes/' /etc/ssh/sshd_config || true
+
+# Preinstall Ray and SkyPilot deps into system Python 3.10 so /usr/local/bin/ray works.
+# --no-cache-dir on every pip call keeps the download/wheel cache out of the image layer.
+RUN /usr/bin/python3.10 -m pip install --no-cache-dir --upgrade "pip<25.1" "setuptools<70" && \
+    /usr/bin/python3.10 -m pip install --no-cache-dir "gpustat==1.1.1" "ray[default]==2.9.3" "skypilot==0.10.3"
+
+# Also prepare the SkyPilot runtime venv to skip runtime setup where possible
+RUN python3.10 -m venv /root/skypilot-runtime && \
+    . /root/skypilot-runtime/bin/activate && \
+    pip install --no-cache-dir --upgrade "pip<25.1" "setuptools<70" && \
+    pip install --no-cache-dir "gpustat==1.1.1" "ray[default]==2.9.3" "skypilot==0.10.3"
+
+# Keep base image entrypoint/cmd
diff --git a/dev/ms-swift-megatron/config.yaml b/dev/ms-swift-megatron/config.yaml
new file mode 100644
index 00000000..ea9a5caf
--- /dev/null
+++ b/dev/ms-swift-megatron/config.yaml
@@ -0,0 +1,11 @@
+# config.yaml
+name: ms-swift-megatron
+
+# Single node with 8x H200 on CoreWeave K8s (train.sh launches 8 ranks for TP=4 / EP=8)
+num_nodes: 1
+resources:
+  infra: k8s
+  accelerators: H200:8  # uses the cluster's GPU label catalog
+  cpus: 16+
+  memory: 64+
+  image_id: docker:bradhiltonnw/ms-swift-megatron:skypilot
diff --git a/dev/ms-swift-megatron/to-hf.sh b/dev/ms-swift-megatron/to-hf.sh
new file mode 100755
index 00000000..51bafae3
--- /dev/null
+++ b/dev/ms-swift-megatron/to-hf.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Export the Megatron-core adapters given by --mcore_adapters to Hugging Face format (--to_hf).
+set -euo pipefail
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+swift export \
+    --mcore_adapters megatron_output/Qwen3-235B-A22B-Instruct-2507/vx-xxx \
+    --to_hf true \
+    --torch_dtype bfloat16 \
+    --output_dir megatron_output/Qwen3-235B-A22B-Instruct-2507/vx-xxx-hf \
+    --test_convert_precision true
diff --git a/dev/ms-swift-megatron/to-mcore.sh b/dev/ms-swift-megatron/to-mcore.sh
new file mode 100755
index 00000000..89eeec8e
--- /dev/null
+++ b/dev/ms-swift-megatron/to-mcore.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Export Qwen/Qwen3-235B-A22B-Instruct-2507 to Megatron-core format (--to_mcore).
+set -euo pipefail
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+swift export \
+    --model Qwen/Qwen3-235B-A22B-Instruct-2507 \
+    --to_mcore true \
+    --torch_dtype bfloat16 \
+    --output_dir Qwen3-235B-A22B-Instruct-2507-mcore \
+    --test_convert_precision true
diff --git a/dev/ms-swift-megatron/train.sh b/dev/ms-swift-megatron/train.sh
new file mode 100755
index 00000000..eaf2e4ce
--- /dev/null
+++ b/dev/ms-swift-megatron/train.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# LoRA SFT run (megatron sft) starting from the Qwen3-235B-A22B-Instruct-2507-mcore checkpoint.
+set -euo pipefail
+
+export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
+# The layout below uses tensor parallel 4 and expert parallel 8; Megatron requires the
+# world size to be divisible by both, so launch one rank on each of the 8 GPUs.
+export NPROC_PER_NODE=8
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+megatron sft \
+    --load Qwen3-235B-A22B-Instruct-2507-mcore \
+    --dataset 'swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT#2000' \
+              'swift/self-cognition#1000' \
+    --train_type lora \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --split_dataset_ratio 0.01 \
+    --moe_permute_fusion true \
+    --tensor_model_parallel_size 4 \
+    --expert_tensor_parallel_size 1 \
+    --expert_model_parallel_size 8 \
+    --moe_grouped_gemm true \
+    --moe_shared_expert_overlap true \
+    --moe_aux_loss_coeff 1e-3 \
+    --micro_batch_size 8 \
+    --global_batch_size 16 \
+    --recompute_granularity full \
+    --recompute_method uniform \
+    --recompute_num_layers 1 \
+    --max_epochs 1 \
+    --finetune true \
+    --cross_entropy_loss_fusion true \
+    --lr 1e-4 \
+    --lr_warmup_fraction 0.05 \
+    --min_lr 1e-5 \
+    --save megatron_output/Qwen3-235B-A22B-Instruct-2507 \
+    --eval_interval 200 \
+    --save_interval 200 \
+    --max_length 2048 \
+    --num_workers 8 \
+    --dataset_num_proc 8 \
+    --no_save_optim true \
+    --no_save_rng true \
+    --sequence_parallel true \
+    --attention_backend flash \
+    --model_author swift \
+    --model_name swift-robot