
Commit e02dfe5

Support cluster domain for MPI HostFile (#704)
Signed-off-by: Yuki Iwai <[email protected]>
1 parent e90c176 commit e02dfe5
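
For illustration only (the MPIJob name, namespace, worker pod names, and slot count below are hypothetical; the host format strings come from the diff): with --cluster-domain=cluster.local, an OpenMPI HostFile for a job mpijob-sample in namespace default with two workers and 2 slots each would contain entries like

mpijob-sample-worker-0.mpijob-sample.default.svc.cluster.local slots=2
mpijob-sample-worker-1.mpijob-sample.default.svc.cluster.local slots=2

whereas leaving the flag unset keeps the existing <pod-name>.<mpi-job-name>.<namespace>.svc form.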

File tree

5 files changed (+193 -53 lines)


cmd/mpi-operator/app/options/options.go

Lines changed: 5 additions & 0 deletions
@@ -40,6 +40,7 @@ type ServerOption struct {
 	Burst               int
 	ControllerRateLimit int
 	ControllerBurst     int
+	ClusterDomain       string
 }

 // NewServerOption creates a new CMServer with a default config.
@@ -80,4 +81,8 @@ func (s *ServerOption) AddFlags(fs *flag.FlagSet) {

 	fs.IntVar(&s.ControllerRateLimit, "controller-queue-rate-limit", 10, "Rate limit of the controller events queue .")
 	fs.IntVar(&s.ControllerBurst, "controller-queue-burst", 100, "Maximum burst of the controller events queue.")
+
+	fs.StringVar(&s.ClusterDomain, "cluster-domain", "", `
+The cluster domain is used to construct MPI HostFile. When this is specified, the HostFile is built with "<pod-name>.<mpi-job-name>.<namespace>.svc.<cluster-domain>".
+Otherwise, that is built with <pod-name>.<mpi-job-name>.<namespace>.svc`)
 }
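
As a usage sketch (the binary name and flag values here are illustrative, not part of the commit), the new flag is passed alongside the existing queue flags when starting the operator:

mpi-operator \
  --cluster-domain=cluster.local \
  --controller-queue-rate-limit=10 \
  --controller-queue-burst=100

Leaving --cluster-domain unset preserves the previous HostFile format.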

cmd/mpi-operator/app/server.go

Lines changed: 1 addition & 12 deletions
@@ -23,7 +23,6 @@ import (

 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
-	"golang.org/x/time/rate"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -39,7 +38,6 @@ import (
 	election "k8s.io/client-go/tools/leaderelection"
 	"k8s.io/client-go/tools/leaderelection/resourcelock"
 	"k8s.io/client-go/tools/record"
-	"k8s.io/client-go/util/workqueue"
 	"k8s.io/klog"
 	schedclientset "sigs.k8s.io/scheduler-plugins/pkg/generated/clientset/versioned"
 	volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned"
@@ -69,9 +67,6 @@ var (
 	// allowed for timeout. Checks within the timeout period after the lease
 	// expires will still return healthy.
 	leaderHealthzAdaptorTimeout = time.Second * 20
-	//exponential workqueue rate limiting config
-	workqueueExponentialBaseDelay = 5 * time.Millisecond
-	workqueueExponentialMaxDelay  = 1000 * time.Second
 )

 var (
@@ -146,11 +141,6 @@ func Run(opt *options.ServerOption) error {
 	kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, 0, kubeInformerFactoryOpts...)
 	kubeflowInformerFactory := informers.NewSharedInformerFactoryWithOptions(mpiJobClientSet, 0, kubeflowInformerFactoryOpts...)

-	workqueueRateLimiter := workqueue.NewTypedMaxOfRateLimiter(
-		workqueue.NewTypedItemExponentialFailureRateLimiter[any](workqueueExponentialBaseDelay, workqueueExponentialMaxDelay),
-		&workqueue.TypedBucketRateLimiter[any]{Limiter: rate.NewLimiter(rate.Limit(opt.ControllerRateLimit), opt.ControllerBurst)},
-	)
-
 	controller, err := controllersv1.NewMPIJobController(
 		kubeClient,
 		mpiJobClientSet,
@@ -163,8 +153,7 @@ func Run(opt *options.ServerOption) error {
 		kubeInformerFactory.Core().V1().Pods(),
 		kubeInformerFactory.Scheduling().V1().PriorityClasses(),
 		kubeflowInformerFactory.Kubeflow().V2beta1().MPIJobs(),
-		namespace, opt.GangSchedulingName,
-		workqueueRateLimiter)
+		opt)
 	if err != nil {
 		klog.Fatalf("Failed to setup the controller")
 	}

pkg/controller/mpi_job_controller.go

Lines changed: 34 additions & 20 deletions
@@ -32,6 +32,7 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
 	"golang.org/x/crypto/ssh"
+	"golang.org/x/time/rate"
 	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
@@ -117,7 +118,10 @@ const (
 )

 var (
-	mpiJobsCreatedCount = promauto.NewCounter(prometheus.CounterOpts{
+	//exponential workqueue rate limiting config
+	workqueueExponentialBaseDelay = 5 * time.Millisecond
+	workqueueExponentialMaxDelay  = 1000 * time.Second
+	mpiJobsCreatedCount           = promauto.NewCounter(prometheus.CounterOpts{
 		Name: "mpi_operator_jobs_created_total",
 		Help: "Counts number of MPI jobs created",
 	})
@@ -252,6 +256,9 @@ type MPIJobController struct {
 	// To allow injection of updateStatus for testing.
 	updateStatusHandler func(mpijob *kubeflow.MPIJob) error

+	// clusterDomain is the FQDN for the HostFile.
+	clusterDomain string
+
 	// Clock for internal use of unit-testing
 	clock clock.WithTicker
 }
@@ -269,11 +276,10 @@ func NewMPIJobController(
 	podInformer coreinformers.PodInformer,
 	priorityClassInformer schedulinginformers.PriorityClassInformer,
 	mpiJobInformer informers.MPIJobInformer,
-	namespace, gangSchedulingName string,
-	workqueueRateLimiter workqueue.TypedRateLimiter[any]) (*MPIJobController, error) {
+	opt *options.ServerOption) (*MPIJobController, error) {
 	return NewMPIJobControllerWithClock(kubeClient, kubeflowClient, volcanoClient, schedClient,
 		configMapInformer, secretInformer, serviceInformer, jobInformer, podInformer,
-		priorityClassInformer, mpiJobInformer, &clock.RealClock{}, namespace, gangSchedulingName, workqueueRateLimiter)
+		priorityClassInformer, mpiJobInformer, &clock.RealClock{}, opt)
 }

 // NewMPIJobControllerWithClock returns a new MPIJob controller.
@@ -289,9 +295,7 @@ func NewMPIJobControllerWithClock(
 	podInformer coreinformers.PodInformer,
 	priorityClassInformer schedulinginformers.PriorityClassInformer,
 	mpiJobInformer informers.MPIJobInformer,
-	clock clock.WithTicker,
-	namespace, gangSchedulingName string,
-	workqueueRateLimiter workqueue.TypedRateLimiter[any]) (*MPIJobController, error) {
+	clock clock.WithTicker, opt *options.ServerOption) (*MPIJobController, error) {

 	// Create event broadcaster.
 	klog.V(4).Info("Creating event broadcaster")
@@ -309,11 +313,11 @@ func NewMPIJobControllerWithClock(
 	)
 	priorityClassLister = priorityClassInformer.Lister()
 	priorityClassSynced = priorityClassInformer.Informer().HasSynced
-	if gangSchedulingName == options.GangSchedulerVolcano {
-		podGroupCtrl = NewVolcanoCtrl(volcanoClient, namespace, priorityClassLister)
-	} else if len(gangSchedulingName) != 0 {
+	if opt.GangSchedulingName == options.GangSchedulerVolcano {
+		podGroupCtrl = NewVolcanoCtrl(volcanoClient, opt.Namespace, priorityClassLister)
+	} else if len(opt.GangSchedulingName) != 0 {
 		// Use scheduler-plugins as a default gang-scheduler.
-		podGroupCtrl = NewSchedulerPluginsCtrl(schedClient, namespace, gangSchedulingName, priorityClassLister)
+		podGroupCtrl = NewSchedulerPluginsCtrl(schedClient, opt.Namespace, opt.GangSchedulingName, priorityClassLister)
 	}
 	if podGroupCtrl != nil {
 		podGroupSynced = podGroupCtrl.PodGroupSharedIndexInformer().HasSynced
@@ -338,9 +342,15 @@ func NewMPIJobControllerWithClock(
 		priorityClassSynced: priorityClassSynced,
 		mpiJobLister:        mpiJobInformer.Lister(),
 		mpiJobSynced:        mpiJobInformer.Informer().HasSynced,
-		queue:               workqueue.NewTypedRateLimitingQueueWithConfig(workqueueRateLimiter, workqueue.TypedRateLimitingQueueConfig[any]{Name: "MPIJob"}),
-		recorder:            recorder,
-		clock:               clock,
+		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
+			workqueue.NewTypedMaxOfRateLimiter(
+				workqueue.NewTypedItemExponentialFailureRateLimiter[any](workqueueExponentialBaseDelay, workqueueExponentialMaxDelay),
+				&workqueue.TypedBucketRateLimiter[any]{Limiter: rate.NewLimiter(rate.Limit(opt.ControllerRateLimit), opt.ControllerBurst)},
+			),
+			workqueue.TypedRateLimitingQueueConfig[any]{Name: "MPIJob"},
+		),
+		recorder: recorder,
+		clock:    clock,
 	}

 	controller.updateStatusHandler = controller.doUpdateJobStatus
@@ -833,7 +843,7 @@ func (c *MPIJobController) countReadyWorkerPods(workers []*corev1.Pod) int {
 // getOrCreateConfigMap gets the ConfigMap controlled by this MPIJob, or creates
 // one if it doesn't exist.
 func (c *MPIJobController) getOrCreateConfigMap(mpiJob *kubeflow.MPIJob) (*corev1.ConfigMap, error) {
-	newCM := newConfigMap(mpiJob, workerReplicas(mpiJob))
+	newCM := newConfigMap(mpiJob, workerReplicas(mpiJob), c.clusterDomain)
 	podList, err := c.getRunningWorkerPods(mpiJob)
 	if err != nil {
 		return nil, err
@@ -1272,29 +1282,33 @@ func (c *MPIJobController) doUpdateJobStatus(mpiJob *kubeflow.MPIJob) error {
 // newConfigMap creates a new ConfigMap containing configurations for an MPIJob
 // resource. It also sets the appropriate OwnerReferences on the resource so
 // handleObject can discover the MPIJob resource that 'owns' it.
-func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigMap {
+func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32, clusterDomain string) *corev1.ConfigMap {
 	var buffer bytes.Buffer
 	slots := ptr.Deref(mpiJob.Spec.SlotsPerWorker, 1)
+	domainFormat := "%s.%s.%s.svc"
+	if len(clusterDomain) > 0 {
+		domainFormat += fmt.Sprintf(".%s", clusterDomain)
+	}
 	// note that pod.spec.dnsConfig also affect the svc resolution
 	// ref: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/
 	// launcher can be reach with hostname or service name
 	if runLauncherAsWorker(mpiJob) {
 		name := mpiJob.Name + launcherSuffix
 		switch mpiJob.Spec.MPIImplementation {
 		case kubeflow.MPIImplementationOpenMPI:
-			buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc slots=%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
+			buffer.WriteString(fmt.Sprintf("%s slots=%d\n", fmt.Sprintf(domainFormat, name, mpiJob.Name, mpiJob.Namespace), slots))
 		case kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH:
-			buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc:%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
+			buffer.WriteString(fmt.Sprintf("%s:%d\n", fmt.Sprintf(domainFormat, name, mpiJob.Name, mpiJob.Namespace), slots))
 		}
 	}

 	for i := 0; i < int(workerReplicas); i++ {
 		name := workerName(mpiJob, i)
 		switch mpiJob.Spec.MPIImplementation {
 		case kubeflow.MPIImplementationOpenMPI:
-			buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc slots=%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
+			buffer.WriteString(fmt.Sprintf("%s slots=%d\n", fmt.Sprintf(domainFormat, name, mpiJob.Name, mpiJob.Namespace), slots))
 		case kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH:
-			buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc:%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
+			buffer.WriteString(fmt.Sprintf("%s:%d\n", fmt.Sprintf(domainFormat, name, mpiJob.Name, mpiJob.Namespace), slots))
 		}
 	}
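
A minimal, self-contained sketch (not part of the commit) of the HostFile composition performed by newConfigMap above. The job name, namespace, worker pod naming, replica count, slots, and cluster domain are hypothetical values chosen for illustration:

package main

import (
	"bytes"
	"fmt"
)

// buildHostFile mirrors the OpenMPI branch of newConfigMap: the base host format is
// "<pod-name>.<mpi-job-name>.<namespace>.svc", and the cluster domain is appended
// only when it is non-empty.
func buildHostFile(jobName, namespace, clusterDomain string, workerReplicas, slots int) string {
	var buffer bytes.Buffer
	domainFormat := "%s.%s.%s.svc"
	if len(clusterDomain) > 0 {
		domainFormat += fmt.Sprintf(".%s", clusterDomain)
	}
	for i := 0; i < workerReplicas; i++ {
		// Hypothetical worker pod naming, for illustration only.
		podName := fmt.Sprintf("%s-worker-%d", jobName, i)
		buffer.WriteString(fmt.Sprintf("%s slots=%d\n", fmt.Sprintf(domainFormat, podName, jobName, namespace), slots))
	}
	return buffer.String()
}

func main() {
	// With --cluster-domain=cluster.local:
	//   mpijob-sample-worker-0.mpijob-sample.default.svc.cluster.local slots=2
	//   mpijob-sample-worker-1.mpijob-sample.default.svc.cluster.local slots=2
	fmt.Print(buildHostFile("mpijob-sample", "default", "cluster.local", 2, 2))
	// Without the flag the entries end at ".svc":
	//   mpijob-sample-worker-0.mpijob-sample.default.svc slots=2
	//   mpijob-sample-worker-1.mpijob-sample.default.svc slots=2
	fmt.Print(buildHostFile("mpijob-sample", "default", "", 2, 2))
}

Intel MPI and MPICH use the same host format but emit "<host>:<slots>" entries instead, as shown in the switch statements in the diff above.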
