Skip to content

Commit e90c176

Browse files
authored
Enable publishNotReadyAddresses when the runLauncherAsWorker (#703)
Signed-off-by: Yuki Iwai <[email protected]>
1 parent 798a339 commit e90c176

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

pkg/controller/mpi_job_controller.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -885,9 +885,10 @@ func (c *MPIJobController) getOrCreateService(job *kubeflow.MPIJob, newSvc *core
885885
}
886886

887887
// If the Service selector or publishNotReadyAddresses has changed, update the Service.
888-
if !equality.Semantic.DeepEqual(svc.Spec.Selector, newSvc.Spec.Selector) {
888+
if !equality.Semantic.DeepEqual(svc.Spec.Selector, newSvc.Spec.Selector) || svc.Spec.PublishNotReadyAddresses != newSvc.Spec.PublishNotReadyAddresses {
889889
svc = svc.DeepCopy()
890890
svc.Spec.Selector = newSvc.Spec.Selector
891+
svc.Spec.PublishNotReadyAddresses = newSvc.Spec.PublishNotReadyAddresses
891892
return c.kubeClient.CoreV1().Services(svc.Namespace).Update(context.TODO(), svc, metav1.UpdateOptions{})
892893
}
893894

@@ -1343,10 +1344,10 @@ func newJobService(job *kubeflow.MPIJob) *corev1.Service {
13431344
kubeflow.OperatorNameLabel: kubeflow.OperatorName,
13441345
kubeflow.JobNameLabel: job.Name,
13451346
}
1346-
return newService(job, job.Name, labels)
1347+
return newService(job, job.Name, labels, ptr.Deref(job.Spec.RunLauncherAsWorker, false))
13471348
}
13481349

1349-
func newService(job *kubeflow.MPIJob, name string, selector map[string]string) *corev1.Service {
1350+
func newService(job *kubeflow.MPIJob, name string, selector map[string]string, isRunLauncherAsWorker bool) *corev1.Service {
13501351
return &corev1.Service{
13511352
ObjectMeta: metav1.ObjectMeta{
13521353
Name: name,
@@ -1361,6 +1362,9 @@ func newService(job *kubeflow.MPIJob, name string, selector map[string]string) *
13611362
Spec: corev1.ServiceSpec{
13621363
ClusterIP: corev1.ClusterIPNone,
13631364
Selector: selector,
1365+
// PublishNotReadyAddresses must be true only for an MPIJob with runLauncherAsWorker enabled,
1366+
// to avoid a deadlock while waiting for the launcher to become ready.
1367+
PublishNotReadyAddresses: isRunLauncherAsWorker,
13641368
},
13651369
}
13661370
}

0 commit comments

Comments
 (0)