Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/cli/commands/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ func NewRootCmd(cli *command.DockerCli) *cobra.Command {
newConfigureCmd(),
newPSCmd(),
newDFCmd(),
newStopCmd(),
newUnloadCmd(),
newRequestsCmd(),
)
Expand Down
14 changes: 13 additions & 1 deletion cmd/cli/commands/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -691,8 +691,12 @@ func newRunCmd() *cobra.Command {
}
}

// Check if a prompt was explicitly provided (even if empty string)
// If args length > 1, then a prompt argument was provided (even if it's "")
explicitPromptProvided := len(args) > 1

// Handle --detach flag: just load the model without interaction
if detach {
if detach || (explicitPromptProvided && prompt == "") {
// Make a minimal request to load the model into memory
err := desktopClient.Chat(model, "", nil, func(content string) {
// Silently discard output in detach mode
Expand All @@ -714,6 +718,14 @@ func newRunCmd() *cobra.Command {
return nil
}

// Preload the model in the background to optimize for the first user interaction
// This makes sure the model is loaded when the user types their first prompt
go func() {
_ = desktopClient.Chat(model, "", nil, func(content string) {
// Silently preload the model - discard output
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The request/response will still be recorded.
Why don't we just load the model without running inference?

}, false)
}()

// Use enhanced readline-based interactive mode when terminal is available
if term.IsTerminal(int(os.Stdin.Fd())) {
return generateInteractiveWithReadline(cmd, desktopClient, model)
Expand Down
48 changes: 48 additions & 0 deletions cmd/cli/commands/stop.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package commands

import (
"fmt"

"github.com/docker/model-runner/cmd/cli/commands/completion"
"github.com/docker/model-runner/cmd/cli/desktop"
"github.com/docker/model-runner/pkg/inference/models"
"github.com/spf13/cobra"
)

func newStopCmd() *cobra.Command {
var backend string

const cmdArgs = "MODEL"
c := &cobra.Command{
Use: "stop " + cmdArgs,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need this and don't use unload?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've found myself using docker model stop in the past, I think its exactly the same behavior than docker model unload, to me docker model stop its more intuitive.
If the goal of the stop command is to unload a model, maybe we could use an alias? so docker model stop and docker model unload do the same thing?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, an alias would be the way to go for this IMO.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree, lets alias, might as well create "docker model load" also

Short: "Stop a running model",
RunE: func(cmd *cobra.Command, args []string) error {
model := models.NormalizeModelName(args[0])
unloadResp, err := desktopClient.Unload(desktop.UnloadRequest{Backend: backend, Models: []string{model}})
if err != nil {
err = handleClientError(err, "Failed to stop model")
return handleNotRunningError(err)
}
unloaded := unloadResp.UnloadedRunners
if unloaded == 0 {
cmd.Println("No such model running.")
} else {
cmd.Printf("Stopped %d model(s).\n", unloaded)
}
return nil
},
ValidArgsFunction: completion.NoComplete,
}
c.Args = func(cmd *cobra.Command, args []string) error {
if len(args) < 1 {
return fmt.Errorf(
"'docker model stop' requires MODEL.\\n\\n" +
"Usage: docker model stop " + cmdArgs + "\\n\\n" +
"See 'docker model stop --help' for more information.",
)
}
return nil
}
c.Flags().StringVar(&backend, "backend", "", "Optional backend to target")
return c
}
8 changes: 8 additions & 0 deletions cmd/cli/commands/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,14 @@ func handleClientError(err error, message string) error {
return errors.Join(err, errors.New(message))
}

// handleNotRunningError checks if the error indicates that the model was not
// running and returns a user-friendly message in that case.
//
// NOTE(review): this is currently a pure pass-through — handleClientError
// already wraps client errors with a message, so this function exists only as
// a future extension point for detecting a specific "model not running" error
// and rewording it. Consider removing it if that never materializes.
func handleNotRunningError(err error) error {
	// For now, just return the error as-is.
	// This function can be expanded to handle specific "model not running"
	// errors in the future.
	return err
}

Comment on lines +42 to +49
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is done in the handleClientError.

Suggested change
// handleNotRunningError checks if the error indicates that the model was not running
// and returns a user-friendly message in that case
func handleNotRunningError(err error) error {
// For now, just return the error as-is
// This function can be expanded to handle specific "model not running" errors in the future
return err
}

// stripDefaultsFromModelName removes the default "ai/" prefix and ":latest" tag for display.
// Examples:
// - "ai/gemma3:latest" -> "gemma3"
Expand Down
2 changes: 2 additions & 0 deletions cmd/cli/docs/reference/docker_model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ cname:
- docker model run
- docker model start-runner
- docker model status
- docker model stop
- docker model stop-runner
- docker model tag
- docker model uninstall-runner
Expand All @@ -44,6 +45,7 @@ clink:
- docker_model_run.yaml
- docker_model_start-runner.yaml
- docker_model_status.yaml
- docker_model_stop.yaml
- docker_model_stop-runner.yaml
- docker_model_tag.yaml
- docker_model_uninstall-runner.yaml
Expand Down
23 changes: 23 additions & 0 deletions cmd/cli/docs/reference/docker_model_stop.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
command: docker model stop
short: Stop a running model
long: Stop a running model
usage: docker model stop MODEL
pname: docker model
plink: docker_model.yaml
options:
- option: backend
value_type: string
description: Optional backend to target
deprecated: false
hidden: false
experimental: false
experimentalcli: false
kubernetes: false
swarm: false
deprecated: false
hidden: false
experimental: false
experimentalcli: false
kubernetes: false
swarm: false

1 change: 1 addition & 0 deletions cmd/cli/docs/reference/model.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Docker Model Runner
| [`run`](model_run.md) | Run a model and interact with it using a submitted prompt or chat mode |
| [`start-runner`](model_start-runner.md) | Start Docker Model Runner (Docker Engine only) |
| [`status`](model_status.md) | Check if the Docker Model Runner is running |
| [`stop`](model_stop.md) | Stop a running model |
| [`stop-runner`](model_stop-runner.md) | Stop Docker Model Runner (Docker Engine only) |
| [`tag`](model_tag.md) | Tag a model |
| [`uninstall-runner`](model_uninstall-runner.md) | Uninstall Docker Model Runner (Docker Engine only) |
Expand Down
14 changes: 14 additions & 0 deletions cmd/cli/docs/reference/model_stop.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# docker model stop

<!---MARKER_GEN_START-->
Stop a running model

### Options

| Name | Type | Default | Description |
|:------------|:---------|:--------|:---------------------------|
| `--backend` | `string` | | Optional backend to target |


<!---MARKER_GEN_END-->

11 changes: 11 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,17 @@ func main() {
// Add /v1 as an alias for /engines/v1
router.Handle("/v1/", &V1AliasHandler{scheduler: scheduler})

// Add API endpoints by creating a custom handler
apiHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/api/generate":
scheduler.HandleGenerate(w, r)
default:
http.NotFound(w, r)
}
})
router.Handle("/api/generate", apiHandler)

// Add metrics endpoint if enabled
if os.Getenv("DISABLE_METRICS") != "1" {
metricsHandler := metrics.NewAggregatedMetricsHandler(
Expand Down
30 changes: 30 additions & 0 deletions pkg/inference/scheduling/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,33 @@ type ConfigureRequest struct {
RawRuntimeFlags string `json:"raw-runtime-flags,omitempty"`
Speculative *inference.SpeculativeDecodingConfig `json:"speculative,omitempty"`
}

// GenerateRequest represents the request structure for the /api/generate
// endpoint. The field names/JSON tags presumably mirror the Ollama generate
// API — TODO confirm against the HandleGenerate implementation.
type GenerateRequest struct {
	Model    string `json:"model"`              // Name of the model to run.
	Prompt   string `json:"prompt"`             // Prompt text to complete.
	System   string `json:"system,omitempty"`   // Optional system prompt.
	Template string `json:"template,omitempty"` // Optional prompt-template override.
	Context  []int  `json:"context,omitempty"`  // Optional context tokens carried over from a prior response.
	Stream   *bool  `json:"stream,omitempty"`   // Whether to stream the response; pointer distinguishes unset from explicit false.
	Raw      bool   `json:"raw,omitempty"`      // If true, presumably bypasses prompt templating — verify in handler.
	KeepAlive *int  `json:"keep_alive,omitempty"` // Optional keep-alive duration; units not shown here — confirm (seconds?).
	Options  map[string]interface{} `json:"options,omitempty"` // Backend-specific generation options (free-form).
}

// GenerateResponse represents the response structure for the /api/generate
// endpoint. Duration fields are int64 counts; their unit is not established
// by this file — presumably nanoseconds as in the Ollama API, TODO confirm.
type GenerateResponse struct {
	Model              string    `json:"model"`                          // Model that produced the response.
	CreatedAt          time.Time `json:"created_at"`                     // Timestamp of response creation.
	Response           string    `json:"response"`                      // Generated text (or chunk, when streaming).
	Done               bool      `json:"done"`                          // True when generation has finished.
	DoneReason         string    `json:"done_reason,omitempty"`         // Why generation stopped, when done.
	Context            []int     `json:"context,omitempty"`             // Context tokens to pass back in a follow-up request.
	TotalDuration      int64     `json:"total_duration,omitempty"`      // Total request duration.
	LoadDuration       int64     `json:"load_duration,omitempty"`       // Time spent loading the model.
	PromptEvalCount    int       `json:"prompt_eval_count,omitempty"`   // Number of prompt tokens evaluated.
	PromptEvalDuration int64     `json:"prompt_eval_duration,omitempty"` // Time spent evaluating the prompt.
	EvalCount          int       `json:"eval_count,omitempty"`          // Number of generated tokens.
	EvalDuration       int64     `json:"eval_duration,omitempty"`       // Time spent generating tokens.
}

17 changes: 9 additions & 8 deletions pkg/inference/scheduling/loader.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/docker/model-runner/pkg/inference"
"github.com/docker/model-runner/pkg/inference/memory"
"github.com/docker/model-runner/pkg/inference/models"
"github.com/docker/model-runner/pkg/internal/utils"
"github.com/docker/model-runner/pkg/logging"
"github.com/docker/model-runner/pkg/metrics"
)
Expand Down Expand Up @@ -229,7 +230,7 @@ func (l *loader) evict(idleOnly bool) int {
}
if unused && (!idleOnly || idle || defunct) {
l.log.Infof("Evicting %s backend runner with model %s (%s) in %s mode",
r.backend, r.modelID, runnerInfo.modelRef, r.mode,
r.backend, r.modelID, utils.SanitizeForLog(runnerInfo.modelRef), r.mode,
)
l.slots[runnerInfo.slot].terminate()
l.slots[runnerInfo.slot] = nil
Expand All @@ -251,7 +252,7 @@ func (l *loader) evictRunner(backend, model string, mode inference.BackendMode)
unused := l.references[runnerInfo.slot] == 0
if unused && (allBackends || r.backend == backend) && r.modelID == model && r.mode == mode {
l.log.Infof("Evicting %s backend runner with model %s (%s) in %s mode",
r.backend, r.modelID, runnerInfo.modelRef, r.mode,
r.backend, r.modelID, utils.SanitizeForLog(runnerInfo.modelRef), r.mode,
)
l.slots[runnerInfo.slot].terminate()
l.slots[runnerInfo.slot] = nil
Expand Down Expand Up @@ -434,15 +435,15 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
// TODO(p1-0tr): For now override memory checks in case model can't be parsed
// e.g. model is too new for gguf-parser-go to know. We should provide a cleaner
// way to bypass these checks.
l.log.Warnf("Could not parse model(%s), memory checks will be ignored for it. Error: %s", modelID, parseErr)
l.log.Warnf("Could not parse model(%s), memory checks will be ignored for it. Error: %s", utils.SanitizeForLog(modelID), parseErr)
memory = inference.RequiredMemory{
RAM: 0,
VRAM: 0,
}
} else if err != nil {
return nil, err
}
l.log.Infof("Loading %s, which will require %d MB RAM and %d MB VRAM on a system with %d MB RAM and %d MB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024, l.totalMemory.RAM/1024/1024, l.totalMemory.VRAM/1024/1024)
l.log.Infof("Loading %s, which will require %d MB RAM and %d MB VRAM on a system with %d MB RAM and %d MB VRAM", utils.SanitizeForLog(modelID), memory.RAM/1024/1024, memory.VRAM/1024/1024, l.totalMemory.RAM/1024/1024, l.totalMemory.VRAM/1024/1024)
if l.totalMemory.RAM == 1 {
l.log.Warnf("RAM size unknown. Assume model will fit, but only one.")
memory.RAM = 1
Expand Down Expand Up @@ -491,7 +492,7 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
if ok {
select {
case <-l.slots[existing.slot].done:
l.log.Warnf("%s runner for %s is defunct. Waiting for it to be evicted.", backendName, existing.modelRef)
l.log.Warnf("%s runner for %s is defunct. Waiting for it to be evicted.", backendName, utils.SanitizeForLog(existing.modelRef))
if l.references[existing.slot] == 0 {
l.evictRunner(backendName, modelID, mode)
} else {
Expand Down Expand Up @@ -534,11 +535,11 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
if slot >= 0 {
// runnerConfig was already retrieved earlier (lines 401-405), no need to look it up again
// Create the runner.
l.log.Infof("Loading %s backend runner with model %s in %s mode", backendName, modelID, mode)
l.log.Infof("Loading %s backend runner with model %s in %s mode", backendName, utils.SanitizeForLog(modelID), mode)
runner, err := run(l.log, backend, modelID, modelRef, mode, slot, runnerConfig, l.openAIRecorder)
if err != nil {
l.log.Warnf("Unable to start %s backend runner with model %s in %s mode: %v",
backendName, modelID, mode, err,
backendName, utils.SanitizeForLog(modelID), mode, err,
)
return nil, fmt.Errorf("unable to start runner: %w", err)
}
Expand All @@ -552,7 +553,7 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
if err := runner.wait(ctx); err != nil {
runner.terminate()
l.log.Warnf("Initialization for %s backend runner with model %s in %s mode failed: %v",
backendName, modelID, mode, err,
backendName, utils.SanitizeForLog(modelID), mode, err,
)
return nil, fmt.Errorf("error waiting for runner to be ready: %w", err)
}
Expand Down
Loading
Loading