@@ -535,6 +535,122 @@ func (s *Scheduler) handleModels(w http.ResponseWriter, r *http.Request) {
535535 s .modelManager .ServeHTTP (w , r )
536536}
537537
538+ // HandleGenerate handles /api/generate requests
539+ // If prompt is empty, loads the model into memory
540+ // If prompt is empty and keep_alive is 0, unloads the model
541+ func (s * Scheduler ) HandleGenerate (w http.ResponseWriter , r * http.Request ) {
542+ body , err := io .ReadAll (http .MaxBytesReader (w , r .Body , maximumOpenAIInferenceRequestSize ))
543+ if err != nil {
544+ if _ , ok := err .(* http.MaxBytesError ); ok {
545+ http .Error (w , "request too large" , http .StatusBadRequest )
546+ } else {
547+ http .Error (w , "unknown error" , http .StatusInternalServerError )
548+ }
549+ return
550+ }
551+
552+ var request GenerateRequest
553+ if err := json .Unmarshal (body , & request ); err != nil {
554+ http .Error (w , "invalid request" , http .StatusBadRequest )
555+ return
556+ }
557+
558+ if request .Model == "" {
559+ http .Error (w , "model is required" , http .StatusBadRequest )
560+ return
561+ }
562+
563+ // Check if it's a load/unload request (empty prompt)
564+ if request .Prompt == "" {
565+ // Load request - if keep_alive is 0, it's an unload request
566+ if request .KeepAlive != nil && * request .KeepAlive == 0 {
567+ // Unload the model
568+ unloadReq := UnloadRequest {
569+ Models : []string {request .Model },
570+ Backend : "" , // Use default backend
571+ }
572+ _ = UnloadResponse {s .loader .Unload (r .Context (), unloadReq )}
573+
574+ // Return unload response
575+ response := GenerateResponse {
576+ Model : request .Model ,
577+ CreatedAt : time .Now ().UTC (),
578+ Response : "" ,
579+ Done : true ,
580+ DoneReason : "unload" ,
581+ }
582+
583+ w .Header ().Set ("Content-Type" , "application/json" )
584+ json .NewEncoder (w ).Encode (response )
585+ return
586+ } else {
587+ // Load the model by requesting a minimal inference
588+ // This will trigger the loading mechanism in the loader
589+ backend := s .defaultBackend
590+ if backend == nil {
591+ http .Error (w , "no default backend available" , http .StatusInternalServerError )
592+ return
593+ }
594+
595+ modelID := s .modelManager .ResolveModelID (request .Model )
596+
597+ // Request a runner to load the model - we'll do a minimal operation to trigger loading
598+ runner , err := s .loader .load (r .Context (), backend .Name (), modelID , request .Model , inference .BackendModeCompletion )
599+ if err != nil {
600+ http .Error (w , fmt .Errorf ("unable to load runner: %w" , err ).Error (), http .StatusInternalServerError )
601+ return
602+ }
603+ defer s .loader .release (runner )
604+
605+ // Return load response
606+ response := GenerateResponse {
607+ Model : request .Model ,
608+ CreatedAt : time .Now ().UTC (),
609+ Response : "" ,
610+ Done : true ,
611+ }
612+
613+ w .Header ().Set ("Content-Type" , "application/json" )
614+ json .NewEncoder (w ).Encode (response )
615+ return
616+ }
617+ }
618+
619+ // Regular generate request - convert to OpenAI format and reuse existing logic
620+ // Create an OpenAI-compatible request
621+ openAIRequest := map [string ]interface {}{
622+ "model" : request .Model ,
623+ "prompt" : request .Prompt ,
624+ "stream" : request .Stream ,
625+ "system" : request .System ,
626+ "raw" : request .Raw ,
627+ "options" : request .Options ,
628+ }
629+
630+ // Add context if it exists
631+ if request .Context != nil {
632+ openAIRequest ["context" ] = request .Context
633+ }
634+
635+ // Add template if it exists
636+ if request .Template != "" {
637+ openAIRequest ["template" ] = request .Template
638+ }
639+
640+ openAIBody , err := json .Marshal (openAIRequest )
641+ if err != nil {
642+ http .Error (w , "failed to process request" , http .StatusInternalServerError )
643+ return
644+ }
645+
646+ // Create a new request with the OpenAI body for forwarding
647+ upstreamRequest := r .Clone (r .Context ())
648+ upstreamRequest .Body = io .NopCloser (bytes .NewReader (openAIBody ))
649+
650+ // Call the existing OpenAI inference handler
651+ s .handleOpenAIInference (w , upstreamRequest )
652+ }
653+
538654// ServeHTTP implements net/http.Handler.ServeHTTP.
539655func (s * Scheduler ) ServeHTTP (w http.ResponseWriter , r * http.Request ) {
540656 s .lock .RLock ()
0 commit comments