Skip to content

wip: tunercheck action #14

wip: tunercheck action

wip: tunercheck action #14

Workflow file for this run

# Build Checks workflow.
#
# For every platform in the matrix: install the CUDA toolkit, hwloc and the
# EFA installer on the runner, build the plugin from the checked-out source
# into ./install, then run show-tuner-decisions against the built tuner
# library with OFI_NCCL_FORCE_PRODUCT_NAME set to that platform name.
name: Build Checks
on: [push, pull_request]
permissions:
  contents: read
  pull-requests: read
jobs:
  tuner-decisions-check:
    strategy:
      matrix:
        # Each value is exported as OFI_NCCL_FORCE_PRODUCT_NAME in the
        # "Check Decisions" step below.
        platform:
          - p5en.48xlarge
          - p5.48xlarge
    runs-on: ubuntu-22.04
    steps:
      - name: Install Dependencies
        run: |
          # NOTE(review): 7fa2af80 is presumably the retired NVIDIA repo
          # signing key that cuda-keyring replaces -- confirm.
          sudo apt-key del 7fa2af80
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
          sudo dpkg -i cuda-keyring_1.1-1_all.deb
          sudo apt-get update
          # -y: the runner has no terminal to answer apt's confirmation
          # prompt, so without it the install aborts.
          sudo apt-get install -y cuda-toolkit libhwloc-dev
          pip install uv
      - name: Fetch and Install EFA Installer
        run: |
          # --fail: stop here on an HTTP error instead of saving the error
          # page as the tarball and failing later inside tar.
          curl --fail -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz
          tar -xf aws-efa-installer-*.tar.gz
          pushd aws-efa-installer/
          sudo ./efa_installer.sh -y --skip-kmod
          popd
      - uses: actions/checkout@v4
      - name: Build Plugin
        run: |
          set -x
          # actions/checkout@v4 would drop the plugin source in $PWD,
          # so go ahead and build it.
          ./autogen.sh
          # NOTE(review): the /opt/amazon/* prefixes are presumably created
          # by the EFA installer step above -- confirm.
          ./configure --with-mpi=/opt/amazon/openmpi \
                      --with-libfabric=/opt/amazon/efa \
                      --with-cuda=/usr/local/cuda/ \
                      --enable-platform-aws \
                      --prefix="$PWD/install"
          make -j 2
          make install
      - name: Check Decisions
        # Pass the matrix value through the environment rather than
        # interpolating it into the script text.
        env:
          OFI_NCCL_FORCE_PRODUCT_NAME: ${{ matrix.platform }}
        run: |
          # The library path is relative to contrib/python (uv's --directory),
          # so ../../install is the --prefix used by the build step.
          uv run --directory contrib/python show-tuner-decisions \
            ../../install/lib/libnccl-ofi-tuner.so \
            --min-ranks-per-node 1 --max-ranks-per-node 8 \
            --min-nnodes 2 --max-nnodes 2048