Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Asset definition for the ACPT PyTorch 2.8 / CUDA 12.6 environment.
name: acpt-pytorch-2.8-cuda12.6
version: auto
type: environment

# Companion files referenced by this asset.
spec: spec.yaml
extra_config: environment.yaml

# pytest settings: requirements file and directory holding the tests.
test:
  pytest:
    enabled: true
    pip_requirements: tests/requirements.txt
    tests_dir: tests

categories:
  - "PyTorch"
  - "Training"
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2204-cu126-py310-torch280:{{latest-image-tag:biweekly\.\d{6}\.\d{1}.*}}

# Python packages from requirements.txt; the pip cache is not kept in the layer.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Inference stack: copy the serving assets into /var, install the OS packages,
# then put the rsyslog and nginx configuration found under /var/configuration
# in place (presumably provided by the assets copied above -- confirm).
COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        libcurl4 \
        liblttng-ust1 \
        libunwind8 \
        libxml++2.6-2v5 \
        nginx-light \
        psmisc \
        rsyslog \
        runit \
        unzip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && cp /var/configuration/rsyslog.conf /etc/rsyslog.conf \
    && cp /var/configuration/nginx.conf /etc/nginx/sites-available/app \
    && ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app \
    && rm -f /etc/nginx/sites-enabled/default

# Runtime settings and documented ports for the inference stack.
ENV SVDIR=/var/runit \
    WORKER_TIMEOUT=400
EXPOSE 5001 8883 8888

# Support the DeepSpeed launcher's requirement of passwordless SSH login.
# `apt-get update` and `apt-get install` share one RUN so a cached, stale
# package index layer can never be paired with a fresh install; the package
# lists are removed in the same layer so they do not add to the image size.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        openssh-client \
        openssh-server && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Print the installed Python packages into the build log.
RUN pip list
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# pip requirements installed into the image by the Dockerfile
# (`pip install -r requirements.txt`).
# The `{{latest-pypi-version}}` markers are template placeholders; this file is
# listed under `template_files` in the image spec.
azureml-core=={{latest-pypi-version}}
azureml-dataset-runtime=={{latest-pypi-version}}
azureml-defaults=={{latest-pypi-version}}
azure-ml-component=={{latest-pypi-version}}
azureml-mlflow=={{latest-pypi-version}}
azureml-contrib-services=={{latest-pypi-version}}
# The remaining packages are unpinned unless a specifier is given.
azureml-inference-server-http
inference-schema
MarkupSafe
regex
pybind11
urllib3
requests
pillow
transformers
# NOTE(review): minimum version floor; the reason is not recorded here.
aiohttp>=3.12.14
py-spy
debugpy
ipykernel
tensorboard
psutil
matplotlib
tqdm
py-cpuinfo
torch-tb-profiler
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Image build and publish specification for the curated environment.
image:
  name: azureml/curated/acpt-pytorch-2.8-cuda12.6
  os: linux

  # Build context directory, the Dockerfile inside it, and the files that
  # contain template placeholders.
  context:
    dir: context
    dockerfile: Dockerfile
    template_files: [Dockerfile, requirements.txt]

  # Publishing target and visibility.
  publish:
    location: mcr
    visibility: public
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json

# The description names the PyTorch version and base image this environment is
# built from; keep it in sync with the FROM line of the Dockerfile and with the
# tags below.
description: >-
  Recommended environment for Deep Learning in public preview with PyTorch on Azure containing the Azure ML SDK with the latest compatible versions of Ubuntu, Python, PyTorch, CUDA\RocM, combined with optimizers like ORT Training,+DeepSpeed+MSCCL+ORT MoE and more. The image introduces newly released PyTorch 2.8 for early testing, and preview of new fastcheckpointing capability called Nebula.
  Azure Container Registry:mcr.microsoft.com/aifx/acpt/stable-ubuntu2204-cu126-py310-torch280

# Name and version are filled in from the asset definition.
name: "{{asset.name}}"
version: "{{asset.version}}"

# Build context and Dockerfile paths are filled in from the image spec.
build:
  path: "{{image.context.path}}"
  dockerfile_path: "{{image.dockerfile.path}}"

os_type: linux

tags:
  PyTorch: "2.8"
  GPU: Cuda12
  OS: Ubuntu22.04
  Training: ""
  Preview: ""
  Python: "3.10"
  # NOTE(review): the component versions below could not be checked against the
  # torch280 base image from this file alone -- confirm before publishing.
  DeepSpeed: "0.13.1"
  ONNXRuntime: "1.17.1"
  torch_ORT: "1.17.0"
  Checkpointing:Nebula: "0.16.10"
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Test running a sample job in the pytorch 2.8 environment."""
import os
import time
from pathlib import Path
from azure.ai.ml import command, Output, MLClient, PyTorchDistribution
from azure.ai.ml.entities import Environment, BuildContext, JobResourceConfiguration
from azure.identity import AzureCliCredential
import subprocess

# Docker build context, resolved relative to this test file's directory.
BUILD_CONTEXT = Path("../context")
# Source directory of the sample training job, relative to this test file.
JOB_SOURCE_CODE = "../../acpt-tests/src"
# Environment variables are always strings, so coerce to int; otherwise
# `TIMEOUT_MINUTES * 60` would repeat the string instead of multiplying.
TIMEOUT_MINUTES = int(os.environ.get("timeout_minutes", 60))
# Relative path where the job's standard log is looked up after download.
STD_LOG = Path("artifacts/user_logs/std_log.txt")


def test_pytorch_2_8():
    """Tests a sample job using pytorch 2.8 as the environment.

    Registers an environment built from the local Docker context, submits a
    two-node DeepSpeed BERT pretraining command job, polls the job until it
    reaches a terminal status or the timeout elapses, prints the job's
    standard log when the job did not succeed, and finally asserts that the
    job completed.

    Reads the workspace coordinates (``subscription_id``, ``resource_group``,
    ``workspace``) and the compute target (``gpu_v100_cluster``) from
    environment variables.
    """
    this_dir = Path(__file__).parent

    subscription_id = os.environ.get("subscription_id")
    resource_group = os.environ.get("resource_group")
    workspace_name = os.environ.get("workspace")

    ml_client = MLClient(
        AzureCliCredential(), subscription_id, resource_group, workspace_name
    )

    env_name = "acpt-pytorch-2_8-cuda12_6"

    env_docker_context = Environment(
        build=BuildContext(path=this_dir / BUILD_CONTEXT),
        name=env_name,
        description="Pytorch 2.8 environment created from a Docker context.",
    )
    ml_client.environments.create_or_update(env_docker_context)

    # create the command
    job = command(
        code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
        command="pip install -r requirements.txt"
                " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
                " --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
                " --local_rank $RANK --logging_strategy \"epoch\""
                " --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
                " --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
                " --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
                " --model_checkpoint \"bert-large-uncased\"",
        outputs={
            "output": Output(
                type="uri_folder",
                mode="rw_mount",
                path="azureml://datastores/workspaceblobstore/paths/outputs"
            )
        },
        environment=f"{env_name}@latest",
        compute=os.environ.get("gpu_v100_cluster"),
        display_name="bert-pretrain-GLUE",
        description="Pretrain the BERT model on the GLUE dataset.",
        # Experiment name matches the PyTorch 2.8 environment under test.
        experiment_name="pytorch28_Cuda126_py310_Experiment",
        distribution=PyTorchDistribution(process_count_per_instance=1),
        resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
    )

    returned_job = ml_client.create_or_update(job)
    assert returned_job is not None

    # Statuses after which the job will not change any more. Both spellings of
    # "cancelled" are accepted so a cancelled job stops the polling loop
    # instead of being polled until the timeout.
    terminal_statuses = {"Completed", "Failed", "Canceled", "Cancelled"}
    unsuccessful_statuses = terminal_statuses - {"Completed"}

    # Poll until final status is reached or timed out. The timeout value may
    # arrive as a string from the environment, so it is coerced to int here.
    current_status = None  # stays None only if the loop body never runs
    timeout = time.time() + (int(TIMEOUT_MINUTES) * 60)
    while time.time() <= timeout:
        current_status = ml_client.jobs.get(returned_job.name).status
        if current_status in terminal_statuses:
            break
        time.sleep(30)  # sleep 30 seconds

    # Print a listing of the current working directory to the test output.
    listing = subprocess.run(["ls"], stdout=subprocess.PIPE, check=False)
    print(listing.stdout)
    print(listing.stderr)

    if current_status in unsuccessful_statuses:
        # Download the job's files and dump the standard log, if present.
        ml_client.jobs.download(returned_job.name)
        if STD_LOG.exists():
            print(f"*** BEGIN {STD_LOG} ***")
            with open(STD_LOG, "r") as f:
                print(f.read(), end="")
            print(f"*** END {STD_LOG} ***")
    else:
        ml_client.jobs.stream(returned_job.name)

    assert current_status == "Completed"
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Packages required by the environment's pytest test (Azure ML SDK client and
# Azure credential support), pinned to exact versions.
azure-ai-ml==1.27.1
azure.identity==1.10.0
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
transformers
datasets
evaluate
accelerate
scikit-learn
apache_beam
apache_beam~=2.69.0
evaluate
2 changes: 1 addition & 1 deletion scripts/setup/create_azure_resources_daily.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ else
echo "Checking ${resource_name}"
if ! az ml compute show --name $gpu_v100_cluster --output none >/dev/null 2>&1; then
echo "Creating ${resource_name}"
az ml compute create --name $gpu_v100_cluster --size Standard_NC6s_v3 --min-instances 0 --max-instances 2 --type AmlCompute --idle-time-before-scale-down 120
az ml compute create --name $gpu_v100_cluster --size Standard_NC4as_T4_v3 --min-instances 0 --max-instances 2 --type AmlCompute --idle-time-before-scale-down 120
fi
fi

Expand Down
Loading