Skip to content

Commit 322b3b6

Browse files
authored
Add acpt-pytorch-2.8-cuda12.6 env (#4534)
* add acpt-pytorch-2.7-cuda12.6 env * update torch to 2.8 * fix comments * remove unnecessary apt update * prevent image build failure * update SKU * update packages
1 parent 33229d1 commit 322b3b6

File tree

9 files changed

+205
-3
lines changed

9 files changed

+205
-3
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
name: acpt-pytorch-2.8-cuda12.6
2+
version: auto
3+
type: environment
4+
spec: spec.yaml
5+
extra_config: environment.yaml
6+
test:
7+
pytest:
8+
enabled: true
9+
pip_requirements: tests/requirements.txt
10+
tests_dir: tests
11+
categories: ["PyTorch", "Training"]
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Base image: ACPT stable build (Ubuntu 22.04, CUDA 12.6, Python 3.10, torch 2.8).
# This file is listed under template_files in spec.yaml, so the {{latest-image-tag:...}}
# placeholder is substituted before the build with a tag matching the biweekly pattern.
FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2204-cu126-py310-torch280:{{latest-image-tag:biweekly\.\d{6}\.\d{1}.*}}

# Install pip dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt --no-cache-dir

# Inference requirements
# /artifacts from the o16n python-assets image supplies the rsyslog/nginx configuration
# files copied into place below.
COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libcurl4 \
        liblttng-ust1 \
        libunwind8 \
        libxml++2.6-2v5 \
        nginx-light \
        psmisc \
        rsyslog \
        runit \
        unzip && \
    apt-get clean && rm -rf /var/lib/apt/lists/* && \
    cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \
    cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \
    ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \
    rm -f /etc/nginx/sites-enabled/default
ENV SVDIR=/var/runit
ENV WORKER_TIMEOUT=400
EXPOSE 5001 8883 8888

# support Deepspeed launcher requirement of passwordless ssh login
# The package index must be refreshed here because the previous apt layer removed
# /var/lib/apt/lists. Update and install share one layer so a cached, stale index is
# never paired with a fresh install, and the lists are removed again to keep the layer small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        openssh-client \
        openssh-server && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Print the final package set into the build log for diagnostics.
RUN pip list
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
azureml-core=={{latest-pypi-version}}
2+
azureml-dataset-runtime=={{latest-pypi-version}}
3+
azureml-defaults=={{latest-pypi-version}}
4+
azure-ml-component=={{latest-pypi-version}}
5+
azureml-mlflow=={{latest-pypi-version}}
6+
azureml-contrib-services=={{latest-pypi-version}}
7+
azureml-inference-server-http
8+
inference-schema
9+
MarkupSafe
10+
regex
11+
pybind11
12+
urllib3
13+
requests
14+
pillow
15+
transformers
16+
aiohttp>=3.12.14
17+
py-spy
18+
debugpy
19+
ipykernel
20+
tensorboard
21+
psutil
22+
matplotlib
23+
tqdm
24+
py-cpuinfo
25+
torch-tb-profiler
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
image:
2+
name: azureml/curated/acpt-pytorch-2.8-cuda12.6
3+
os: linux
4+
context:
5+
dir: context
6+
dockerfile: Dockerfile
7+
template_files:
8+
- Dockerfile
9+
- requirements.txt
10+
publish:
11+
location: mcr
12+
visibility: public
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
2+
3+
description: >-
4+
Recommended environment for Deep Learning in public preview with PyTorch on Azure containing the Azure ML SDK with the latest compatible versions of Ubuntu, Python, PyTorch, CUDA/ROCm, combined with optimizers like ORT Training, DeepSpeed, MSCCL, ORT MoE and more. The image introduces PyTorch 2.8 for early testing, and a preview of the fast checkpointing capability called Nebula.
5+
Azure Container Registry:mcr.microsoft.com/azureml/curated/acpt-pytorch-2.8-cuda12.6
6+
7+
name: "{{asset.name}}"
8+
version: "{{asset.version}}"
9+
10+
build:
11+
path: "{{image.context.path}}"
12+
dockerfile_path: "{{image.dockerfile.path}}"
13+
14+
os_type: linux
15+
16+
tags:
17+
PyTorch: "2.8"
18+
GPU: Cuda12
19+
OS: Ubuntu22.04
20+
Training: ""
21+
Preview: ""
22+
Python: "3.10"
23+
DeepSpeed: "0.13.1"
24+
ONNXRuntime: "1.17.1"
25+
torch_ORT: "1.17.0"
26+
Checkpointing:Nebula: "0.16.10"
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""Test running a sample job in the pytorch 2.8 environment."""
5+
import os
6+
import time
7+
from pathlib import Path
8+
from azure.ai.ml import command, Output, MLClient, PyTorchDistribution
9+
from azure.ai.ml.entities import Environment, BuildContext, JobResourceConfiguration
10+
from azure.identity import AzureCliCredential
11+
import subprocess
12+
13+
# Docker build context of the environment under test, relative to this test file.
BUILD_CONTEXT = Path("../context")
# Source directory of the sample training job, relative to this test file.
JOB_SOURCE_CODE = "../../acpt-tests/src"
# Environment variables are always strings, so coerce to int; otherwise an overridden
# value would turn `TIMEOUT_MINUTES * 60` into string repetition and break the deadline math.
TIMEOUT_MINUTES = int(os.environ.get("timeout_minutes", 60))
# Path checked for the job's standard log after job outputs are downloaded.
STD_LOG = Path("artifacts/user_logs/std_log.txt")
17+
18+
19+
def test_pytorch_2_8():
    """Tests a sample job using pytorch 2.8 as the environment.

    Registers an environment built from the Docker context next to this test,
    submits a two-instance DeepSpeed BERT/GLUE training command, and polls the job
    until it reaches a terminal state or the timeout elapses.

    Workspace and compute are taken from the environment variables
    ``subscription_id``, ``resource_group``, ``workspace`` and ``gpu_v100_cluster``.

    Raises:
        AssertionError: if job submission returns nothing or the final observed
            status is not "Completed".
    """
    this_dir = Path(__file__).parent

    subscription_id = os.environ.get("subscription_id")
    resource_group = os.environ.get("resource_group")
    workspace_name = os.environ.get("workspace")

    ml_client = MLClient(
        AzureCliCredential(), subscription_id, resource_group, workspace_name
    )

    env_name = "acpt-pytorch-2_8-cuda12_6"

    env_docker_context = Environment(
        build=BuildContext(path=this_dir / BUILD_CONTEXT),
        name=env_name,
        description="Pytorch 2.8 environment created from a Docker context.",
    )
    ml_client.environments.create_or_update(env_docker_context)

    # create the command
    job = command(
        code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
        command="pip install -r requirements.txt"
                " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
                " --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
                " --local_rank $RANK --logging_strategy \"epoch\""
                " --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
                " --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
                " --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
                " --model_checkpoint \"bert-large-uncased\"",
        outputs={
            "output": Output(
                type="uri_folder",
                mode="rw_mount",
                path="azureml://datastores/workspaceblobstore/paths/outputs"
            )
        },
        environment=f"{env_name}@latest",
        compute=os.environ.get("gpu_v100_cluster"),
        display_name="bert-pretrain-GLUE",
        description="Pretrain the BERT model on the GLUE dataset.",
        # Named after the PyTorch version under test (was a 2.7 copy-paste leftover).
        experiment_name="pytorch28_Cuda126_py310_Experiment",
        distribution=PyTorchDistribution(process_count_per_instance=1),
        resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
    )

    returned_job = ml_client.create_or_update(job)
    assert returned_job is not None

    # Statuses after which the job will not change any more. Both spellings of the
    # cancelled state are accepted so the loop stops regardless of which one the
    # service reports.
    unsuccessful_statuses = ("Failed", "Canceled", "Cancelled")
    terminal_statuses = ("Completed",) + unsuccessful_statuses

    # Poll until final status is reached or timed out.
    # int() keeps the arithmetic valid even if TIMEOUT_MINUTES came from an
    # environment variable as a string.
    current_status = None  # stays None if the deadline passes before the first poll
    timeout = time.time() + (int(TIMEOUT_MINUTES) * 60)
    while time.time() <= timeout:
        current_status = ml_client.jobs.get(returned_job.name).status
        if current_status in terminal_statuses:
            break
        time.sleep(30)  # sleep 30 seconds

    # Print the working-directory listing into the test log (diagnostic aid).
    bashCommand = "ls"
    process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    print(output)
    print(error)

    if current_status in unsuccessful_statuses:
        # Fetch the job outputs and echo the standard log, if present, to help diagnose the failure.
        ml_client.jobs.download(returned_job.name)
        if STD_LOG.exists():
            print(f"*** BEGIN {STD_LOG} ***")
            with open(STD_LOG, "r") as f:
                print(f.read(), end="")
            print(f"*** END {STD_LOG} ***")
    else:
        ml_client.jobs.stream(returned_job.name)

    assert current_status == "Completed"
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
azure-ai-ml==1.27.1
2+
azure.identity==1.10.0
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
transformers
22
datasets
3-
evaluate
43
accelerate
54
scikit-learn
6-
apache_beam
5+
apache_beam~=2.69.0
76
evaluate

scripts/setup/create_azure_resources_daily.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ else
5757
echo "Checking ${resource_name}"
5858
if ! az ml compute show --name $gpu_v100_cluster --output none >/dev/null 2>&1; then
5959
echo "Creating ${resource_name}"
60-
az ml compute create --name $gpu_v100_cluster --size Standard_NC6s_v3 --min-instances 0 --max-instances 2 --type AmlCompute --idle-time-before-scale-down 120
60+
az ml compute create --name $gpu_v100_cluster --size Standard_NC4as_T4_v3 --min-instances 0 --max-instances 2 --type AmlCompute --idle-time-before-scale-down 120
6161
fi
6262
fi
6363

0 commit comments

Comments
 (0)