Skip to content
Closed
87 changes: 87 additions & 0 deletions .azure-pipelines/templates/ut-no-ib-env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Template inputs; all three are plain strings supplied by the including pipeline.
parameters:
# Passed as `azureSubscription` to the AzureCLI@2 steps below.
- name: subscription
  type: string
# VM scale set that is started before the tests and deallocated afterwards.
- name: vmssName
  type: string
# Secure-file name handed to the DownloadSecureFile@1 step; the downloaded
# file is later used as the SSH identity (`-i`) for parallel-ssh.
- name: sshKeySecureFile
  type: string

steps:
# Configure with CMake (Release, CUDA enabled, GPU check bypassed) and compile
# inside a fresh ./build directory of the checked-out sources.
- task: Bash@3
  name: Build
  displayName: Build
  inputs:
    targetType: inline
    script: |
      mkdir build && cd build
      cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
      make -j
    workingDirectory: $(System.DefaultWorkingDirectory)

# Download the SSH key from the pipeline's secure files. The step name matters:
# later scripts read the downloaded path from SSHKEYFILE_SECUREFILEPATH.
- task: DownloadSecureFile@1
  name: SshKeyFile
  displayName: Download key file
  inputs:
    secureFile: ${{ parameters.sshKeySecureFile }}

# Install the agent-side tooling: pssh (provides parallel-ssh / parallel-scp)
# and the Azure CLI.
- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: inline
    script: |
      sudo apt-get update -y
      sudo apt-get install pssh -y
      # NOTE(review): pipes a remote installer script straight into a root shell.
      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

# Power on the test VM scale set. The resource group name is hard-coded to
# `mscclpp`; only the scale-set name comes from the template parameters.
- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp

# Run the deployment script with two positional arguments: the test name
# (`single-node-test`) and `false`, which deploy.sh reads as IB_ENVIRONMENT
# and uses to pick the container launch variant without InfiniBand.
- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: test/deploy/deploy.sh
    arguments: single-node-test false
    workingDirectory: $(System.DefaultWorkingDirectory)

# Run the executor pytest (8 MPI ranks) inside the `mscclpp-test` container on
# every host listed in the CI hostfile, streaming the remote output into the
# pipeline log while the command runs.
- task: Bash@3
  name: PyTests
  displayName: Run pytests
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      # parallel-ssh `-o .` writes each host's stdout to a file in the current
      # directory; that file is created empty up front and followed with tail.
      # NOTE(review): the file name below looks like a user@host value that was
      # obfuscated when this page was captured - restore the real name.
      : > [email protected]
      tail -f [email protected] &
      CHILD_PID=$!
      # Stop the log follower on every exit path. With `set -e`, a failing
      # parallel-ssh would otherwise end the script before any cleanup runs.
      trap 'kill "${CHILD_PID}" 2>/dev/null || true' EXIT
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py::test_executor -x"'
    workingDirectory: $(System.DefaultWorkingDirectory)

# Deallocate the VM scale set. `condition: always()` makes this step run even
# when an earlier step failed, so the machines are not left running.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
70 changes: 5 additions & 65 deletions .azure-pipelines/ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,81 +28,21 @@ pr:
- '**/*.md'

jobs:
- job: UnitTestA100
timeoutInMinutes: 40
- job: UnitTestNoIBEnv
displayName: Test No IB Environment
pool:
name: msccl-ci
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4

container:
image: $(containerImage)

steps:
- template: templates/ut.yaml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
name: msccl-ci-h100

- job: UnitTestWithNpKitA100
timeoutInMinutes: 30
pool:
name: msccl-ci
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4

container:
image: $(containerImage)

steps:
- template: templates/ut-npkit.yaml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem

- job: UnitTestH100
timeoutInMinutes: 40
pool:
name: msccl-ci-h100
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4

container:
image: $(containerImage)

steps:
- template: templates/ut.yaml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem

- job: UnitTestWithNpKitH100
timeoutInMinutes: 30
pool:
name: msccl-ci-h100
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4

container:
image: $(containerImage)


steps:
- template: templates/ut-npkit.yaml
- template: templates/ut-no-ib-env.yaml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
Expand Down
4 changes: 2 additions & 2 deletions python/csrc/port_channel_py.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ using namespace mscclpp;

void register_port_channel(nb::module_& m) {
nb::class_<BaseProxyService>(m, "BaseProxyService")
.def("start_proxy", &BaseProxyService::startProxy)
.def("start_proxy", &BaseProxyService::startProxy, nb::arg("blocking") = false)
.def("stop_proxy", &BaseProxyService::stopProxy);

nb::class_<ProxyService, BaseProxyService>(m, "ProxyService")
.def(nb::init<int>(), nb::arg("fifo_size") = DEFAULT_FIFO_SIZE)
.def("start_proxy", &ProxyService::startProxy)
.def("start_proxy", &ProxyService::startProxy, nb::arg("blocking") = false)
.def("stop_proxy", &ProxyService::stopProxy)
.def("build_and_add_semaphore", &ProxyService::buildAndAddSemaphore, nb::arg("comm"), nb::arg("connection"))
.def("add_semaphore", static_cast<SemaphoreId (ProxyService::*)(const Semaphore&)>(&ProxyService::addSemaphore),
Expand Down
21 changes: 14 additions & 7 deletions src/executor/executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,17 @@ struct hash<mscclpp::DeviceExecutionPlanKey> {
} // namespace std

namespace {
auto inSameNode = [](int rank1, int rank2, int nranksPerNode) {
return rank1 / nranksPerNode == rank2 / nranksPerNode;
// Reports whether IB transports may be used at all: true only when the build
// defines USE_IBVERBS and mscclpp::getIBDeviceCount() returns a positive count;
// always false in builds without USE_IBVERBS.
auto hasIBDevices = []() -> bool {
#if defined(USE_IBVERBS)
  const auto ibDeviceCount = mscclpp::getIBDeviceCount();
  return ibDeviceCount > 0;
#else
  return false;
#endif
};

// Decides whether the link between two ranks should use an IB transport.
// Ranks are grouped into nodes by integer division with nranksPerNode; IB is
// chosen only when IB devices are available and the ranks are on different nodes.
auto useIB = [](int rank1, int rank2, int nranksPerNode) {
  const int node1 = rank1 / nranksPerNode;
  const int node2 = rank2 / nranksPerNode;
  const bool crossNode = (node1 != node2);
  return hasIBDevices() && crossNode;
};

static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2,
Expand Down Expand Up @@ -222,7 +231,7 @@ struct Executor::Impl {
if (type == ChannelType::MEMORY) {
flags |= Transport::CudaIpc;
} else if (type == ChannelType::PORT) {
if (!inSameNode(rank, info.accessRank, this->nranksPerNode)) {
if (useIB(rank, info.accessRank, this->nranksPerNode)) {
flags |= IBs[rank % this->nranksPerNode];
} else
flags |= Transport::CudaIpc;
Expand Down Expand Up @@ -273,7 +282,7 @@ struct Executor::Impl {
std::vector<std::shared_future<std::shared_ptr<mscclpp::Connection>>> connectionFutures;
for (int peer : connectedPeers) {
Transport transport =
inSameNode(rank, peer, this->nranksPerNode) ? Transport::CudaIpc : IBs[rank % this->nranksPerNode];
!useIB(rank, peer, this->nranksPerNode) ? Transport::CudaIpc : IBs[rank % this->nranksPerNode];
connectionFutures.push_back(this->comm->connect(transport, peer));
}
for (size_t i = 0; i < connectionFutures.size(); i++) {
Expand All @@ -294,9 +303,7 @@ struct Executor::Impl {
context.localMemoryIdBegin = context.proxyService->nextMemoryId(3);
for (auto& bufferType : {BufferType::INPUT, BufferType::OUTPUT, BufferType::SCRATCH}) {
TransportFlags flags = Transport::CudaIpc;
#if defined(USE_IBVERBS)
flags |= IBs[rank % this->nranksPerNode];
#endif
if (hasIBDevices()) flags |= IBs[rank % this->nranksPerNode];
RegisteredMemory localMemory;
auto bufferInfo = getBufferInfo(bufferType, sendbuff, recvbuff, context.scratchBuffer.get(), sendBufferSize,
recvBufferSize, context.scratchBufferSize);
Expand Down
17 changes: 13 additions & 4 deletions test/deploy/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ set -e

# get parameters: $1 is the test name, $2 is the IB environment flag (defaults to "true")
TEST_NAME=$1
IB_ENVIRONMENT="${2:-true}"

KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
Expand Down Expand Up @@ -37,10 +38,18 @@ parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT
# force to pull the latest image
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker pull ${CONTAINERIMAGE}"
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
--entrypoint /bin/bash ${CONTAINERIMAGE}"
# Start the detached test container (named mscclpp-test) on every host in HOSTFILE.
#   IB_ENVIRONMENT == "true": privileged container that also mounts /opt/microsoft.
#   anything else:            non-privileged container (SYS_ADMIN capability, seccomp
#                             unconfined) with tmpfs mounted over /dev/infiniband,
#                             /sys/class/infiniband and /sys/class/infiniband_verbs -
#                             presumably so no IB devices are visible inside it.
# `=` is used instead of `==` because it is the portable string comparison for `[`.
if [ "${IB_ENVIRONMENT}" = "true" ]; then
  parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
    "sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \
    -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
    --entrypoint /bin/bash ${CONTAINERIMAGE}"
else
  # Every continued line ends with " \" so adjacent options stay separated
  # regardless of how the following line is indented.
  parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
    "sudo docker run --rm -itd --net=host --ipc=host --gpus=all --cap-add=SYS_ADMIN --security-opt seccomp=unconfined --mount type=tmpfs,destination=/dev/infiniband \
    --mount type=tmpfs,destination=/sys/class/infiniband --mount type=tmpfs,destination=/sys/class/infiniband_verbs \
    -w /root -v ${DST_DIR}:/root/mscclpp --ulimit memlock=-1:-1 --name=mscclpp-test \
    --entrypoint /bin/bash ${CONTAINERIMAGE}"
fi
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh'"

Loading