@@ -125,6 +125,8 @@ func (s *Scheduler) routeHandlers() map[string]http.HandlerFunc {
 	m["POST "+inference.InferencePrefix+"/unload"] = s.Unload
 	m["POST "+inference.InferencePrefix+"/{backend}/_configure"] = s.Configure
 	m["POST "+inference.InferencePrefix+"/_configure"] = s.Configure
+	m["POST "+inference.InferencePrefix+"/{backend}/load"] = s.Load
+	m["POST "+inference.InferencePrefix+"/load"] = s.Load
 	m["GET "+inference.InferencePrefix+"/requests"] = s.openAIRecorder.GetRecordsHandler()
 	return m
 }
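
For reference, a client can warm a model through the new route with a plain POST. The sketch below is an assumption-heavy illustration, not part of the change: the base URL and port, the "/engines" prefix (whatever inference.InferencePrefix actually resolves to), the "model" JSON field name, and the model name are all hypothetical and must match the real LoadRequest definition and server configuration.

	// Hypothetical client sketch for the new load endpoint.
	// Base URL, route prefix, and the "model" field are assumptions.
	package main

	import (
		"bytes"
		"encoding/json"
		"fmt"
		"net/http"
	)

	func main() {
		payload, _ := json.Marshal(map[string]string{"model": "ai/example-model"})
		// POST <prefix>/load targets the default backend;
		// POST <prefix>/{backend}/load targets a specific one.
		resp, err := http.Post("http://localhost:12434/engines/load", "application/json", bytes.NewReader(payload))
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()
		fmt.Println("status:", resp.Status)
	}
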
@@ -431,6 +433,89 @@ func (s *Scheduler) Configure(w http.ResponseWriter, r *http.Request) {
 	w.WriteHeader(http.StatusAccepted)
 }
 
+// Load handles loading a model into memory without performing inference.
+// This is useful for warming up models in detached mode.
+func (s *Scheduler) Load(w http.ResponseWriter, r *http.Request) {
+	// Determine the requested backend and ensure that it's valid.
+	var backend inference.Backend
+	if b := r.PathValue("backend"); b == "" {
+		backend = s.defaultBackend
+	} else {
+		backend = s.backends[b]
+	}
+	if backend == nil {
+		http.Error(w, ErrBackendNotFound.Error(), http.StatusNotFound)
+		return
+	}
+
+	body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize))
+	if err != nil {
+		if _, ok := err.(*http.MaxBytesError); ok {
+			http.Error(w, "request too large", http.StatusBadRequest)
+		} else {
+			http.Error(w, "unknown error", http.StatusInternalServerError)
+		}
+		return
+	}
+
+	var loadRequest LoadRequest
+	if err := json.Unmarshal(body, &loadRequest); err != nil {
+		http.Error(w, "invalid request", http.StatusBadRequest)
+		return
+	}
+
+	if loadRequest.Model == "" {
+		http.Error(w, "model name is required", http.StatusBadRequest)
+		return
+	}
+
+	// Wait for the backend installation to complete
+	if err := s.installer.wait(r.Context(), backend.Name()); err != nil {
+		if errors.Is(err, ErrBackendNotFound) {
+			http.Error(w, err.Error(), http.StatusNotFound)
+		} else if errors.Is(err, errInstallerNotStarted) {
+			http.Error(w, err.Error(), http.StatusServiceUnavailable)
+		} else if errors.Is(err, context.Canceled) {
+			http.Error(w, "service unavailable", http.StatusServiceUnavailable)
+		} else {
+			http.Error(w, fmt.Errorf("backend installation failed: %w", err).Error(), http.StatusServiceUnavailable)
+		}
+		return
+	}
+
+	// Resolve the model ID
+	modelID := s.modelManager.ResolveModelID(loadRequest.Model)
+
+	// Load the model using the loader (default to completion mode)
+	mode := inference.BackendModeCompletion
+	runner, err := s.loader.load(r.Context(), backend.Name(), modelID, loadRequest.Model, mode)
+	if err != nil {
+		s.log.Warnf("Failed to load model %s (%s): %v", loadRequest.Model, modelID, err)
+		if errors.Is(err, errModelTooBig) {
+			http.Error(w, "model too big for available memory", http.StatusInsufficientStorage)
+		} else if errors.Is(err, context.Canceled) {
+			http.Error(w, "request canceled", http.StatusRequestTimeout)
+		} else {
+			http.Error(w, fmt.Sprintf("failed to load model: %v", err), http.StatusInternalServerError)
+		}
+		return
+	}
+
+	// Release the runner when the handler returns; we only want the model loaded, not an active inference slot
+	defer s.loader.release(runner)
+
+	// Return success response
+	response := LoadResponse{
+		Status:  "loaded",
+		Message: fmt.Sprintf("Model %s loaded successfully", loadRequest.Model),
+	}
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusOK)
+	if err := json.NewEncoder(w).Encode(response); err != nil {
+		s.log.Warnf("Failed to encode load response: %v", err)
+	}
+}
+
 // GetAllActiveRunners returns information about all active runners
 func (s *Scheduler) GetAllActiveRunners() []metrics.ActiveRunner {
 	runningBackends := s.getLoaderStatus(context.Background())
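
The handler decodes the body into a LoadRequest and replies with a LoadResponse, neither of which appears in this diff. Below is a plausible sketch of those types; the package name, field names, and JSON tags are assumptions, not the actual definitions.

	// Hypothetical sketch of the request/response types the Load handler uses.
	// The real definitions live elsewhere in the package and may differ.
	package scheduling

	// LoadRequest names the model to load into memory.
	type LoadRequest struct {
		Model string `json:"model"`
	}

	// LoadResponse reports the outcome of a load request.
	type LoadResponse struct {
		Status  string `json:"status"`
		Message string `json:"message"`
	}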