
Commit f6278e3

Add load endpoint and update --detach to use it
Update documentation for new load endpoint and --detach mode

Signed-off-by: Eric Curtin <[email protected]>
1 parent 5435e65 commit f6278e3

File tree

README.md
cmd/cli/commands/run.go
cmd/cli/desktop/desktop.go
cmd/cli/desktop/desktop_test.go
pkg/inference/scheduling/api.go
pkg/inference/scheduling/loader.go
pkg/inference/scheduling/scheduler.go
pkg/inference/scheduling/scheduler_test.go

8 files changed: +217 −10 lines changed

README.md

Lines changed: 8 additions & 0 deletions
@@ -93,6 +93,9 @@ MODEL_RUNNER_HOST=http://localhost:13434 ./model-cli list

 # Pull and run a model
 MODEL_RUNNER_HOST=http://localhost:13434 ./model-cli run ai/smollm2 "Hello, how are you?"
+
+# Load a model into memory without interaction (detached mode)
+MODEL_RUNNER_HOST=http://localhost:13434 ./model-cli run --detach ai/smollm2
 ```

 #### Option 2: Using Docker
@@ -195,6 +198,11 @@ curl http://localhost:8080/engines/llama.cpp/v1/chat/completions -X POST -d '{
   ]
 }'

+# Load a model into memory (without inference)
+curl http://localhost:8080/engines/llama.cpp/load -X POST -d '{
+  "model": "ai/smollm2"
+}'
+
 # Delete a model
 curl http://localhost:8080/models/ai/smollm2 -X DELETE
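
The same request can be made without curl. The short Go program below is a minimal sketch of a client for the documented load endpoint, assuming the model runner is reachable at localhost:8080 as in the README examples; the response fields mirror the LoadResponse type added in pkg/inference/scheduling/api.go. It is illustrative only and not part of this commit.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Same payload as the documented curl command.
	payload, _ := json.Marshal(map[string]string{"model": "ai/smollm2"})

	// POST to the new load endpoint for the llama.cpp backend.
	resp, err := http.Post(
		"http://localhost:8080/engines/llama.cpp/load",
		"application/json",
		bytes.NewReader(payload),
	)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The scheduler replies with {"status":"loaded","message":"..."} on success.
	var result struct {
		Status  string `json:"status"`
		Message string `json:"message,omitempty"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		panic(err)
	}
	fmt.Println(resp.StatusCode, result.Status, result.Message)
}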

cmd/cli/commands/run.go

Lines changed: 6 additions & 8 deletions
@@ -208,7 +208,7 @@ func generateInteractiveWithReadline(cmd *cobra.Command, desktopClient *desktop.
 	// Create a cancellable context for the chat request
 	// This allows us to cancel the request if the user presses Ctrl+C during response generation
 	chatCtx, cancelChat := context.WithCancel(cmd.Context())
-
+
 	// Set up signal handler to cancel the context on Ctrl+C
 	sigChan := make(chan os.Signal, 1)
 	signal.Notify(sigChan, syscall.SIGINT)
@@ -222,7 +222,7 @@ func generateInteractiveWithReadline(cmd *cobra.Command, desktopClient *desktop.
 	}()

 	err := chatWithMarkdownContext(chatCtx, cmd, desktopClient, backend, model, userInput, apiKey)
-
+
 	// Clean up signal handler
 	signal.Stop(sigChan)
 	// Do not close sigChan to avoid race condition
@@ -268,7 +268,7 @@ func generateInteractiveBasic(cmd *cobra.Command, desktopClient *desktop.Client,
 	// Create a cancellable context for the chat request
 	// This allows us to cancel the request if the user presses Ctrl+C during response generation
 	chatCtx, cancelChat := context.WithCancel(cmd.Context())
-
+
 	// Set up signal handler to cancel the context on Ctrl+C
 	sigChan := make(chan os.Signal, 1)
 	signal.Notify(sigChan, syscall.SIGINT)
@@ -283,7 +283,7 @@ func generateInteractiveBasic(cmd *cobra.Command, desktopClient *desktop.Client,
 	}()

 	err = chatWithMarkdownContext(chatCtx, cmd, desktopClient, backend, model, userInput, apiKey)
-
+
 	cancelChat()
 	signal.Stop(sigChan)
 	cancelChat()
@@ -615,10 +615,8 @@ func newRunCmd() *cobra.Command {

 	// Handle --detach flag: just load the model without interaction
 	if detach {
-		// Make a minimal request to load the model into memory
-		err := desktopClient.Chat(backend, model, "", apiKey, func(content string) {
-			// Silently discard output in detach mode
-		}, false)
+		// Load the model into memory using the new load endpoint
+		err := desktopClient.WarmupModel(cmd.Context(), backend, model)
 		if err != nil {
 			return handleClientError(err, "Failed to load model")
 		}
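
Several of the hunks above touch the same Ctrl+C handling pattern: a cancellable context wraps the chat call, a SIGINT handler cancels it, and the handler is detached afterwards with signal.Stop while the channel is deliberately left open. The standalone sketch below reproduces that pattern for illustration; doWork is a hypothetical stand-in for chatWithMarkdownContext and is not part of the commit.

package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"syscall"
	"time"
)

// doWork stands in for chatWithMarkdownContext: it blocks until the
// context is cancelled or the (fake) response finishes streaming.
func doWork(ctx context.Context) error {
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-time.After(5 * time.Second):
		return nil
	}
}

func main() {
	// Create a cancellable context for the request, mirroring run.go:
	// Ctrl+C should abort only the in-flight generation.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Route SIGINT to a channel and cancel the context when it fires.
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT)
	go func() {
		<-sigChan
		cancel()
	}()

	err := doWork(ctx)

	// Clean up the signal handler; as in run.go, the channel is not
	// closed, to avoid a send-on-closed-channel race.
	signal.Stop(sigChan)
	fmt.Println("result:", err)
}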

cmd/cli/desktop/desktop.go

Lines changed: 50 additions & 0 deletions
@@ -799,6 +799,56 @@ func (c *Client) handleQueryError(err error, path string) error {
 	return fmt.Errorf("error querying %s: %w", path, err)
 }

+// WarmupModel loads a model into memory without performing inference.
+// This is useful for warming up models in detached mode.
+func (c *Client) WarmupModel(ctx context.Context, backend, model string) error {
+	model = dmrm.NormalizeModelName(model)
+	if !strings.Contains(strings.Trim(model, "/"), "/") {
+		// Do an extra API call to check if the model parameter isn't a model ID.
+		if expanded, err := c.fullModelID(model); err == nil {
+			model = expanded
+		}
+	}
+
+	reqBody := struct {
+		Model string `json:"model"`
+	}{
+		Model: model,
+	}
+
+	jsonData, err := json.Marshal(reqBody)
+	if err != nil {
+		return fmt.Errorf("error marshaling request: %w", err)
+	}
+
+	var loadPath string
+	if backend != "" {
+		loadPath = inference.InferencePrefix + "/" + backend + "/load"
+	} else {
+		loadPath = inference.InferencePrefix + "/load"
+	}
+
+	resp, err := c.doRequestWithAuthContext(
+		ctx,
+		http.MethodPost,
+		loadPath,
+		bytes.NewReader(jsonData),
+		backend,
+		"", // no API key needed for local load
+	)
+	if err != nil {
+		return c.handleQueryError(err, loadPath)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("load failed with status %d: %s", resp.StatusCode, body)
+	}
+
+	return nil
+}
+
 func (c *Client) Tag(source, targetRepo, targetTag string) error {
 	source = dmrm.NormalizeModelName(source)
 	// Check if the source is a model ID, and expand it if necessary
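
For callers, WarmupModel is a single blocking call; run.go above invokes it as desktopClient.WarmupModel(cmd.Context(), backend, model). The sketch below shows that usage in isolation, behind a small interface so it compiles without the package internals; the warmUp helper and its timeout are illustrative assumptions, not code from this commit.

package warmup

import (
	"context"
	"fmt"
	"time"
)

// modelWarmer captures the single method of desktop.Client that the
// --detach path needs (signature as added in this commit).
type modelWarmer interface {
	WarmupModel(ctx context.Context, backend, model string) error
}

// warmUp loads a model and returns once it is resident in the backend;
// no inference is performed. The timeout is an illustrative choice; the
// real CLI simply passes cmd.Context() through.
func warmUp(parent context.Context, client modelWarmer, backend, model string) error {
	ctx, cancel := context.WithTimeout(parent, 2*time.Minute)
	defer cancel()

	if err := client.WarmupModel(ctx, backend, model); err != nil {
		return fmt.Errorf("failed to load model %q: %w", model, err)
	}
	return nil
}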

cmd/cli/desktop/desktop_test.go

Lines changed: 55 additions & 0 deletions
@@ -2,6 +2,7 @@ package desktop

 import (
 	"bytes"
+	"context"
 	"encoding/json"
 	"io"
 	"net/http"
@@ -225,3 +226,57 @@ func TestInspectOpenAIHuggingFaceModel(t *testing.T) {
 	assert.NoError(t, err)
 	assert.Equal(t, expectedLowercase, model.ID)
 }
+
+func TestWarmupModel(t *testing.T) {
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	modelName := "ai/smollm2"
+	expectedModelName := "ai/smollm2:latest" // normalized with tag
+	backend := "llama.cpp"
+
+	mockClient := mockdesktop.NewMockDockerHttpClient(ctrl)
+	mockContext := NewContextForMock(mockClient)
+	client := New(mockContext)
+
+	mockClient.EXPECT().Do(gomock.Any()).Do(func(req *http.Request) {
+		// Verify the request path contains the backend
+		assert.Contains(t, req.URL.Path, backend)
+		assert.Contains(t, req.URL.Path, "/load")
+
+		// Verify the request body contains the model name
+		var reqBody struct {
+			Model string `json:"model"`
+		}
+		err := json.NewDecoder(req.Body).Decode(&reqBody)
+		require.NoError(t, err)
+		assert.Equal(t, expectedModelName, reqBody.Model)
+	}).Return(&http.Response{
+		StatusCode: http.StatusOK,
+		Body:       io.NopCloser(bytes.NewBufferString(`{"status":"loaded","message":"Model ai/smollm2 loaded successfully"}`)),
+	}, nil)
+
+	err := client.WarmupModel(context.Background(), backend, modelName)
+	assert.NoError(t, err)
+}
+
+func TestWarmupModelWithError(t *testing.T) {
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	modelName := "ai/smollm2"
+	backend := "llama.cpp"
+
+	mockClient := mockdesktop.NewMockDockerHttpClient(ctrl)
+	mockContext := NewContextForMock(mockClient)
+	client := New(mockContext)
+
+	mockClient.EXPECT().Do(gomock.Any()).Return(&http.Response{
+		StatusCode: http.StatusInternalServerError,
+		Body:       io.NopCloser(bytes.NewBufferString("failed to load model")),
+	}, nil)
+
+	err := client.WarmupModel(context.Background(), backend, modelName)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "load failed")
+}
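
The two tests above cover the backend-qualified path and the error path. A further case one might add (hypothetical, not part of this commit) is the default-backend route, where WarmupModel is called with an empty backend and should POST to the prefix-only /load path; the sketch assumes the strings package is imported alongside the existing test imports.

func TestWarmupModelDefaultBackend(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()

	mockClient := mockdesktop.NewMockDockerHttpClient(ctrl)
	mockContext := NewContextForMock(mockClient)
	client := New(mockContext)

	mockClient.EXPECT().Do(gomock.Any()).Do(func(req *http.Request) {
		// With no backend, the client should use the prefix-only route.
		assert.True(t, strings.HasSuffix(req.URL.Path, "/load"))
		assert.NotContains(t, req.URL.Path, "llama.cpp")
	}).Return(&http.Response{
		StatusCode: http.StatusOK,
		Body:       io.NopCloser(bytes.NewBufferString(`{"status":"loaded"}`)),
	}, nil)

	err := client.WarmupModel(context.Background(), "", "ai/smollm2")
	assert.NoError(t, err)
}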

pkg/inference/scheduling/api.go

Lines changed: 11 additions & 0 deletions
@@ -91,3 +91,14 @@ type ConfigureRequest struct {
 	RuntimeFlags    []string `json:"runtime-flags,omitempty"`
 	RawRuntimeFlags string   `json:"raw-runtime-flags,omitempty"`
 }
+
+// LoadRequest specifies the model to load into memory.
+type LoadRequest struct {
+	Model string `json:"model"`
+}
+
+// LoadResponse indicates whether the model was loaded successfully.
+type LoadResponse struct {
+	Status  string `json:"status"`
+	Message string `json:"message,omitempty"`
+}
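
These two structs define the wire format of the new endpoint. The short program below, using local copies of the types, shows the JSON they marshal to; it is a sketch for illustration, not code from the repository.

package main

import (
	"encoding/json"
	"fmt"
)

// Local copies of the wire types from pkg/inference/scheduling/api.go,
// reproduced only to show the JSON they encode to.
type LoadRequest struct {
	Model string `json:"model"`
}

type LoadResponse struct {
	Status  string `json:"status"`
	Message string `json:"message,omitempty"`
}

func main() {
	req, _ := json.Marshal(LoadRequest{Model: "ai/smollm2"})
	fmt.Println(string(req)) // {"model":"ai/smollm2"}

	resp, _ := json.Marshal(LoadResponse{Status: "loaded", Message: "Model ai/smollm2 loaded successfully"})
	fmt.Println(string(resp)) // {"status":"loaded","message":"Model ai/smollm2 loaded successfully"}
}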

pkg/inference/scheduling/loader.go

Lines changed: 1 addition & 1 deletion
@@ -469,7 +469,7 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 			return l.slots[existing.slot], nil
 		}
 	}
-
+
 	if runtime.GOOS == "windows" {
 		// On Windows, we can use up to half of the total system RAM as shared GPU memory,
 		// limited by the currently available RAM.

pkg/inference/scheduling/scheduler.go

Lines changed: 85 additions & 0 deletions
@@ -125,6 +125,8 @@ func (s *Scheduler) routeHandlers() map[string]http.HandlerFunc {
 	m["POST "+inference.InferencePrefix+"/unload"] = s.Unload
 	m["POST "+inference.InferencePrefix+"/{backend}/_configure"] = s.Configure
 	m["POST "+inference.InferencePrefix+"/_configure"] = s.Configure
+	m["POST "+inference.InferencePrefix+"/{backend}/load"] = s.Load
+	m["POST "+inference.InferencePrefix+"/load"] = s.Load
 	m["GET "+inference.InferencePrefix+"/requests"] = s.openAIRecorder.GetRecordsHandler()
 	return m
 }
@@ -431,6 +433,89 @@ func (s *Scheduler) Configure(w http.ResponseWriter, r *http.Request) {
 	w.WriteHeader(http.StatusAccepted)
 }

+// Load handles loading a model into memory without performing inference.
+// This is useful for warming up models in detached mode.
+func (s *Scheduler) Load(w http.ResponseWriter, r *http.Request) {
+	// Determine the requested backend and ensure that it's valid.
+	var backend inference.Backend
+	if b := r.PathValue("backend"); b == "" {
+		backend = s.defaultBackend
+	} else {
+		backend = s.backends[b]
+	}
+	if backend == nil {
+		http.Error(w, ErrBackendNotFound.Error(), http.StatusNotFound)
+		return
+	}
+
+	body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize))
+	if err != nil {
+		if _, ok := err.(*http.MaxBytesError); ok {
+			http.Error(w, "request too large", http.StatusBadRequest)
+		} else {
+			http.Error(w, "unknown error", http.StatusInternalServerError)
+		}
+		return
+	}
+
+	var loadRequest LoadRequest
+	if err := json.Unmarshal(body, &loadRequest); err != nil {
+		http.Error(w, "invalid request", http.StatusBadRequest)
+		return
+	}
+
+	if loadRequest.Model == "" {
+		http.Error(w, "model name is required", http.StatusBadRequest)
+		return
+	}
+
+	// Wait for the backend installation to complete
+	if err := s.installer.wait(r.Context(), backend.Name()); err != nil {
+		if errors.Is(err, ErrBackendNotFound) {
+			http.Error(w, err.Error(), http.StatusNotFound)
+		} else if errors.Is(err, errInstallerNotStarted) {
+			http.Error(w, err.Error(), http.StatusServiceUnavailable)
+		} else if errors.Is(err, context.Canceled) {
+			http.Error(w, "service unavailable", http.StatusServiceUnavailable)
+		} else {
+			http.Error(w, fmt.Errorf("backend installation failed: %w", err).Error(), http.StatusServiceUnavailable)
+		}
+		return
+	}
+
+	// Resolve the model ID
+	modelID := s.modelManager.ResolveModelID(loadRequest.Model)
+
+	// Load the model using the loader (default to completion mode)
+	mode := inference.BackendModeCompletion
+	runner, err := s.loader.load(r.Context(), backend.Name(), modelID, loadRequest.Model, mode)
+	if err != nil {
+		s.log.Warnf("Failed to load model %s (%s): %v", loadRequest.Model, modelID, err)
+		if errors.Is(err, errModelTooBig) {
+			http.Error(w, "model too big for available memory", http.StatusInsufficientStorage)
+		} else if errors.Is(err, context.Canceled) {
+			http.Error(w, "request canceled", http.StatusRequestTimeout)
+		} else {
+			http.Error(w, fmt.Sprintf("failed to load model: %v", err), http.StatusInternalServerError)
+		}
+		return
+	}
+
+	// Release the runner immediately since we're just loading it, not using it
+	defer s.loader.release(runner)
+
+	// Return success response
+	response := LoadResponse{
+		Status:  "loaded",
+		Message: fmt.Sprintf("Model %s loaded successfully", loadRequest.Model),
+	}
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusOK)
+	if err := json.NewEncoder(w).Encode(response); err != nil {
+		s.log.Warnf("Failed to encode load response: %v", err)
+	}
+}
+
 // GetAllActiveRunners returns information about all active runners
 func (s *Scheduler) GetAllActiveRunners() []metrics.ActiveRunner {
 	runningBackends := s.getLoaderStatus(context.Background())
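
The new routes registered in routeHandlers() use Go 1.22+ http.ServeMux method-and-wildcard patterns, and Scheduler.Load reads the optional backend via r.PathValue("backend"), falling back to the default backend when it is empty. The sketch below mirrors that routing shape with a stub handler; the "/engines" prefix is taken from the README's curl examples and the port is arbitrary, so treat it as an illustration rather than the scheduler's actual wiring.

package main

import (
	"fmt"
	"log"
	"net/http"
)

func main() {
	mux := http.NewServeMux()

	// Stub standing in for Scheduler.Load.
	load := func(w http.ResponseWriter, r *http.Request) {
		// r.PathValue returns "" on the prefix-only route, which is when
		// the real handler falls back to the default backend.
		backend := r.PathValue("backend")
		if backend == "" {
			backend = "default"
		}
		fmt.Fprintf(w, "would load model on backend %q\n", backend)
	}

	// Same shape as the routes added in routeHandlers(): the literal route
	// and the {backend} wildcard route can coexist because the literal one
	// is more specific.
	mux.HandleFunc("POST /engines/{backend}/load", load)
	mux.HandleFunc("POST /engines/load", load)

	log.Fatal(http.ListenAndServe(":8081", mux))
}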

pkg/inference/scheduling/scheduler_test.go

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ import (
 type systemMemoryInfo struct{}

 func (i systemMemoryInfo) HaveSufficientMemory(req inference.RequiredMemory) (bool, error) {
-	return true, nil
+	return true, nil
 }

 func (i systemMemoryInfo) GetTotalMemory() inference.RequiredMemory {
