Add /api/generate endpoint for model loading and unloading

ericcurtin · ericcurtin · commit 167b136e2a34 · 2025-10-27T11:02:43.000Z
Adds an /api/generate endpoint and a `docker model stop` command so that models can be loaded and unloaded.

Signed-off-by: Eric Curtin <eric.curtin@docker.com>
diff --git a/cmd/cli/commands/root.go b/cmd/cli/commands/root.go
@@ -113,6 +113,7 @@ func NewRootCmd(cli *command.DockerCli) *cobra.Command {
 		newConfigureCmd(),
 		newPSCmd(),
 		newDFCmd(),
+		newStopCmd(),
 		newUnloadCmd(),
 		newRequestsCmd(),
 	)
diff --git a/cmd/cli/commands/stop.go b/cmd/cli/commands/stop.go
@@ -0,0 +1,48 @@
+package commands
+
+import (
+	"fmt"
+
+	"github.com/docker/model-runner/cmd/cli/commands/completion"
+	"github.com/docker/model-runner/cmd/cli/desktop"
+	"github.com/docker/model-runner/pkg/inference/models"
+	"github.com/spf13/cobra"
+)
+
+// newStopCmd builds the "stop" subcommand. It normalizes the MODEL argument,
+// asks the desktop client to unload that model (optionally restricted to the
+// backend given via --backend) and reports how many runners were unloaded.
+func newStopCmd() *cobra.Command {
+	var backend string
+
+	const cmdArgs = "MODEL"
+	c := &cobra.Command{
+		Use:   "stop " + cmdArgs,
+		Short: "Stop a running model",
+		RunE: func(cmd *cobra.Command, args []string) error {
+			model := models.NormalizeModelName(args[0])
+			unloadResp, err := desktopClient.Unload(desktop.UnloadRequest{Backend: backend, Models: []string{model}})
+			if err != nil {
+				err = handleClientError(err, "Failed to stop model")
+				return handleNotRunningError(err)
+			}
+			unloaded := unloadResp.UnloadedRunners
+			if unloaded == 0 {
+				cmd.Println("No such model running.")
+				return nil
+			}
+			cmd.Printf("Stopped %d model(s).\n", unloaded)
+			return nil
+		},
+		ValidArgsFunction: completion.NoComplete,
+	}
+	// Custom argument validation so that a missing MODEL produces a usage
+	// hint. Extra arguments beyond the first are accepted and ignored.
+	c.Args = func(cmd *cobra.Command, args []string) error {
+		if len(args) < 1 {
+			// The escapes must be real newlines ("\n"); a doubled backslash
+			// would print the two characters '\' and 'n' to the user.
+			return fmt.Errorf(
+				"'docker model stop' requires MODEL.\n\n" +
+					"Usage:  docker model stop " + cmdArgs + "\n\n" +
+					"See 'docker model stop --help' for more information.",
+			)
+		}
+		return nil
+	}
+	c.Flags().StringVar(&backend, "backend", "", "Optional backend to target")
+	return c
+}
diff --git a/main.go b/main.go
@@ -155,6 +155,17 @@ func main() {
 	router.Handle(inference.ModelsPrefix+"/", modelManager)
 	router.Handle(inference.InferencePrefix+"/", scheduler)
 
+	// Add API endpoints by creating a custom handler
+	apiHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/generate":
+			scheduler.HandleGenerate(w, r)
+		default:
+			http.NotFound(w, r)
+		}
+	})
+	router.Handle("/api/generate", apiHandler)
+
 	// Add metrics endpoint if enabled
 	if os.Getenv("DISABLE_METRICS") != "1" {
 		metricsHandler := metrics.NewAggregatedMetricsHandler(
diff --git a/pkg/inference/scheduling/api.go b/pkg/inference/scheduling/api.go
@@ -93,3 +93,33 @@ type ConfigureRequest struct {
 	RuntimeFlags    []string `json:"runtime-flags,omitempty"`
 	RawRuntimeFlags string   `json:"raw-runtime-flags,omitempty"`
 }
+
+// GenerateRequest represents the request structure for the /api/generate
+// endpoint.
+//
+// Model and Prompt are always serialized; every other field is omitted from
+// JSON when it holds its zero value. Stream and KeepAlive are pointers so that
+// an absent field can be told apart from an explicit false / 0. HandleGenerate
+// treats an empty Prompt combined with KeepAlive == 0 as an unload request and
+// an empty Prompt otherwise as a load request.
+//
+// NOTE(review): KeepAlive only decodes JSON numbers; confirm that clients never
+// send it in another form (for example a duration string).
+type GenerateRequest struct {
+	Model     string                 `json:"model"`
+	Prompt    string                 `json:"prompt"`
+	System    string                 `json:"system,omitempty"`
+	Template  string                 `json:"template,omitempty"`
+	Context   []int                  `json:"context,omitempty"`
+	Stream    *bool                  `json:"stream,omitempty"`
+	Raw       bool                   `json:"raw,omitempty"`
+	KeepAlive *int                   `json:"keep_alive,omitempty"`
+	Options   map[string]interface{} `json:"options,omitempty"`
+}
+
+// GenerateResponse represents the response structure for the /api/generate
+// endpoint.
+//
+// Model, CreatedAt, Response and Done are always serialized; the remaining
+// fields are omitted from JSON when zero. HandleGenerate sets DoneReason to
+// "unload" when it answers an unload request.
+//
+// NOTE(review): the unit of the *Duration fields is not established by this
+// package's visible code — confirm before relying on it.
+type GenerateResponse struct {
+	Model              string    `json:"model"`
+	CreatedAt          time.Time `json:"created_at"`
+	Response           string    `json:"response"`
+	Done               bool      `json:"done"`
+	DoneReason         string    `json:"done_reason,omitempty"`
+	Context            []int     `json:"context,omitempty"`
+	TotalDuration      int64     `json:"total_duration,omitempty"`
+	LoadDuration       int64     `json:"load_duration,omitempty"`
+	PromptEvalCount    int       `json:"prompt_eval_count,omitempty"`
+	PromptEvalDuration int64     `json:"prompt_eval_duration,omitempty"`
+	EvalCount          int       `json:"eval_count,omitempty"`
+	EvalDuration       int64     `json:"eval_duration,omitempty"`
+}
+
diff --git a/pkg/inference/scheduling/scheduler.go b/pkg/inference/scheduling/scheduler.go
@@ -535,6 +535,122 @@ func (s *Scheduler) handleModels(w http.ResponseWriter, r *http.Request) {
 	s.modelManager.ServeHTTP(w, r)
 }
 
+// HandleGenerate handles /api/generate requests.
+//
+// The JSON body is decoded into a GenerateRequest and dispatched as follows:
+//   - non-empty prompt: the request is re-encoded and handed to
+//     handleOpenAIInference;
+//   - empty prompt with keep_alive explicitly set to 0: the model is unloaded;
+//   - empty prompt otherwise: the model is loaded through the default backend.
+//
+// A body that exceeds maximumOpenAIInferenceRequestSize, is not valid JSON, or
+// lacks a model name is rejected with 400 Bad Request.
+func (s *Scheduler) HandleGenerate(w http.ResponseWriter, r *http.Request) {
+	body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize))
+	if err != nil {
+		if _, ok := err.(*http.MaxBytesError); ok {
+			http.Error(w, "request too large", http.StatusBadRequest)
+		} else {
+			http.Error(w, "unknown error", http.StatusInternalServerError)
+		}
+		return
+	}
+
+	var request GenerateRequest
+	if err := json.Unmarshal(body, &request); err != nil {
+		http.Error(w, "invalid request", http.StatusBadRequest)
+		return
+	}
+	if request.Model == "" {
+		http.Error(w, "model is required", http.StatusBadRequest)
+		return
+	}
+
+	switch {
+	case request.Prompt != "":
+		s.forwardGenerate(w, r, &request)
+	case request.KeepAlive != nil && *request.KeepAlive == 0:
+		s.unloadForGenerate(w, r, &request)
+	default:
+		s.loadForGenerate(w, r, &request)
+	}
+}
+
+// unloadForGenerate unloads request.Model and replies with a completed
+// GenerateResponse whose DoneReason is "unload".
+func (s *Scheduler) unloadForGenerate(w http.ResponseWriter, r *http.Request, request *GenerateRequest) {
+	// An empty Backend is passed through as-is; the response does not report
+	// how many runners were unloaded, so the result is not needed here.
+	s.loader.Unload(r.Context(), UnloadRequest{
+		Models:  []string{request.Model},
+		Backend: "",
+	})
+
+	writeGenerateResponse(w, GenerateResponse{
+		Model:      request.Model,
+		CreatedAt:  time.Now().UTC(),
+		Response:   "",
+		Done:       true,
+		DoneReason: "unload",
+	})
+}
+
+// loadForGenerate loads request.Model on the default backend in completion
+// mode and replies with a completed, empty GenerateResponse.
+func (s *Scheduler) loadForGenerate(w http.ResponseWriter, r *http.Request, request *GenerateRequest) {
+	backend := s.defaultBackend
+	if backend == nil {
+		http.Error(w, "no default backend available", http.StatusInternalServerError)
+		return
+	}
+
+	modelID := s.modelManager.ResolveModelID(request.Model)
+
+	// Acquiring a runner is what triggers the load; it is released again as
+	// soon as the response has been written.
+	runner, err := s.loader.load(r.Context(), backend.Name(), modelID, request.Model, inference.BackendModeCompletion)
+	if err != nil {
+		http.Error(w, fmt.Sprintf("unable to load runner: %v", err), http.StatusInternalServerError)
+		return
+	}
+	defer s.loader.release(runner)
+
+	writeGenerateResponse(w, GenerateResponse{
+		Model:     request.Model,
+		CreatedAt: time.Now().UTC(),
+		Response:  "",
+		Done:      true,
+	})
+}
+
+// forwardGenerate re-encodes a generate request with a prompt and passes it to
+// handleOpenAIInference.
+func (s *Scheduler) forwardGenerate(w http.ResponseWriter, r *http.Request, request *GenerateRequest) {
+	openAIRequest := map[string]interface{}{
+		"model":  request.Model,
+		"prompt": request.Prompt,
+		"raw":    request.Raw,
+	}
+	// Optional fields are forwarded only when the client supplied them, so the
+	// upstream handler never receives explicit nulls or empty placeholders.
+	if request.Stream != nil {
+		openAIRequest["stream"] = *request.Stream
+	}
+	if request.System != "" {
+		openAIRequest["system"] = request.System
+	}
+	if request.Options != nil {
+		openAIRequest["options"] = request.Options
+	}
+	if request.Context != nil {
+		openAIRequest["context"] = request.Context
+	}
+	if request.Template != "" {
+		openAIRequest["template"] = request.Template
+	}
+
+	openAIBody, err := json.Marshal(openAIRequest)
+	if err != nil {
+		http.Error(w, "failed to process request", http.StatusInternalServerError)
+		return
+	}
+
+	// The clone shares everything with r except the body, which is replaced by
+	// the re-encoded payload; ContentLength is updated to match it.
+	// NOTE(review): the clone still carries the /api/generate URL — confirm
+	// that handleOpenAIInference does not depend on the request path.
+	upstreamRequest := r.Clone(r.Context())
+	upstreamRequest.Body = io.NopCloser(bytes.NewReader(openAIBody))
+	upstreamRequest.ContentLength = int64(len(openAIBody))
+
+	s.handleOpenAIInference(w, upstreamRequest)
+}
+
+// writeGenerateResponse serializes response as JSON and writes it to w. The
+// payload is encoded before any header is set so that an encoding failure can
+// still be reported as 500 Internal Server Error.
+func writeGenerateResponse(w http.ResponseWriter, response GenerateResponse) {
+	payload, err := json.Marshal(response)
+	if err != nil {
+		http.Error(w, "failed to encode response", http.StatusInternalServerError)
+		return
+	}
+	w.Header().Set("Content-Type", "application/json")
+	// The trailing newline keeps the wire format identical to what
+	// json.Encoder.Encode emits. A write error means the client has gone
+	// away, and there is nobody left to report it to.
+	_, _ = w.Write(append(payload, '\n'))
+}
+
 // ServeHTTP implements net/http.Handler.ServeHTTP.
 func (s *Scheduler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	s.lock.RLock()

Original file line number	Diff line number	Diff line change
`@@ -113,6 +113,7 @@ func NewRootCmd(cli *command.DockerCli) *cobra.Command {`
`113`	`113`	`newConfigureCmd(),`
`114`	`114`	`newPSCmd(),`
`115`	`115`	`newDFCmd(),`
	`116`	`+ newStopCmd(),`
`116`	`117`	`newUnloadCmd(),`
`117`	`118`	`newRequestsCmd(),`
`118`	`119`	`)`