7 changes: 6 additions & 1 deletion .github/workflows/promote-to-latest.yml
@@ -5,7 +5,7 @@ on:
workflow_dispatch:
inputs:
version:
description: 'version'
description: "version"
required: true
type: string

@@ -42,6 +42,11 @@ jobs:
echo "Promoting vLLM CUDA images"
crane tag "docker/model-runner:${{ inputs.version }}-vllm-cuda" "latest-vllm-cuda"

- name: Promote SGLang CUDA images
run: |
echo "Promoting SGLang CUDA images"
crane tag "docker/model-runner:${{ inputs.version }}-sglang-cuda" "latest-sglang-cuda"

- name: Promote ROCm images
run: |
echo "Promoting ROCm images"
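For context, the new promotion step mirrors the existing llama.cpp and vLLM steps: crane (from go-containerregistry) applies the new tag registry-side, so nothing is pulled or rebuilt. A sketch of what the command expands to for a hypothetical release version v1.2.3:

# Hypothetical expansion for inputs.version = v1.2.3; crane retags the
# existing manifest in the registry without downloading any layers.
crane tag "docker/model-runner:v1.2.3-sglang-cuda" "latest-sglang-cuda"
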
37 changes: 32 additions & 5 deletions .github/workflows/release.yml
@@ -5,28 +5,33 @@ on:
workflow_dispatch:
inputs:
pushLatest:
description: 'Tag images produced by this job as latest'
description: "Tag images produced by this job as latest"
required: false
type: boolean
default: false
releaseTag:
description: 'Release tag'
description: "Release tag"
required: false
type: string
default: "test"
llamaServerVersion:
description: 'llama-server version'
description: "llama-server version"
required: false
type: string
default: "latest"
vllmVersion:
description: 'vLLM version'
description: "vLLM version"
required: false
type: string
default: "0.12.0"
sglangVersion:
description: "SGLang version"
required: false
type: string
default: "0.4.0"
# This can be removed once we have llama.cpp built for MUSA and CANN.
buildMusaCann:
description: 'Build MUSA and CANN images'
description: "Build MUSA and CANN images"
required: false
type: boolean
default: false
@@ -76,6 +81,12 @@ jobs:
echo "docker/model-runner:latest-vllm-cuda" >> "$GITHUB_OUTPUT"
fi
echo 'EOF' >> "$GITHUB_OUTPUT"
echo "sglang-cuda<<EOF" >> "$GITHUB_OUTPUT"
echo "docker/model-runner:${{ inputs.releaseTag }}-sglang-cuda" >> "$GITHUB_OUTPUT"
if [ "${{ inputs.pushLatest }}" == "true" ]; then
echo "docker/model-runner:latest-sglang-cuda" >> "$GITHUB_OUTPUT"
fi
echo 'EOF' >> "$GITHUB_OUTPUT"
echo "rocm<<EOF" >> "$GITHUB_OUTPUT"
echo "docker/model-runner:${{ inputs.releaseTag }}-rocm" >> "$GITHUB_OUTPUT"
if [ "${{ inputs.pushLatest }}" == "true" ]; then
@@ -155,6 +166,22 @@ jobs:
provenance: mode=max
tags: ${{ steps.tags.outputs.vllm-cuda }}

- name: Build SGLang CUDA image
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83
with:
file: Dockerfile
target: final-sglang
platforms: linux/amd64
build-args: |
"LLAMA_SERVER_VERSION=${{ inputs.llamaServerVersion }}"
"LLAMA_SERVER_VARIANT=cuda"
"BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04"
"SGLANG_VERSION=${{ inputs.sglangVersion }}"
push: true
sbom: true
provenance: mode=max
tags: ${{ steps.tags.outputs.sglang-cuda }}

- name: Build ROCm image
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83
with:
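The sglang-cuda<<EOF block in the tags step uses the same multi-line $GITHUB_OUTPUT heredoc pattern as the existing vLLM and ROCm outputs, so steps.tags.outputs.sglang-cuda expands to a newline-separated tag list that docker/build-push-action accepts in its tags field. A sketch of what the step writes for a hypothetical releaseTag of v1.2.3 with pushLatest enabled:

# Illustrative contents appended to $GITHUB_OUTPUT; both tags are later
# consumed via steps.tags.outputs.sglang-cuda in the build step.
echo "sglang-cuda<<EOF" >> "$GITHUB_OUTPUT"
echo "docker/model-runner:v1.2.3-sglang-cuda" >> "$GITHUB_OUTPUT"
echo "docker/model-runner:latest-sglang-cuda" >> "$GITHUB_OUTPUT"
echo "EOF" >> "$GITHUB_OUTPUT"
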
57 changes: 50 additions & 7 deletions Dockerfile
@@ -33,7 +33,13 @@ COPY --link . .
# Build the Go binary (static build)
RUN --mount=type=cache,target=/go/pkg/mod \
--mount=type=cache,target=/root/.cache/go-build \
CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go
CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o model-runner .

# Build the Go binary for SGLang (without vLLM)
FROM builder AS builder-sglang
RUN --mount=type=cache,target=/go/pkg/mod \
--mount=type=cache,target=/root/.cache/go-build \
CGO_ENABLED=1 GOOS=linux go build -tags=novllm -ldflags="-s -w" -o model-runner .

# --- Get llama.cpp binary ---
FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server
@@ -97,21 +103,58 @@ USER modelrunner

# Install uv and vLLM as modelrunner user
RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
&& ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
&& if [ "$TARGETARCH" = "amd64" ]; then \
WHEEL_ARCH="manylinux_2_31_x86_64"; \
WHEEL_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}%2B${VLLM_CUDA_VERSION}-${VLLM_PYTHON_TAG}-${WHEEL_ARCH}.whl"; \
~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "$WHEEL_URL"; \
&& ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
&& if [ "$TARGETARCH" = "amd64" ]; then \
WHEEL_ARCH="manylinux_2_31_x86_64"; \
WHEEL_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}%2B${VLLM_CUDA_VERSION}-${VLLM_PYTHON_TAG}-${WHEEL_ARCH}.whl"; \
~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "$WHEEL_URL"; \
else \
~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"; \
~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"; \
fi

RUN /opt/vllm-env/bin/python -c "import vllm; print(vllm.__version__)" > /opt/vllm-env/version

# --- SGLang variant ---
FROM llamacpp AS sglang

ARG SGLANG_VERSION=0.5.6

USER root

# Install CUDA toolkit 13 for nvcc (needed for flashinfer JIT compilation)
RUN apt update && apt install -y \
python3 python3-venv python3-dev \
curl ca-certificates build-essential \
libnuma1 libnuma-dev numactl ninja-build \
wget gnupg \
&& wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
&& dpkg -i cuda-keyring_1.1-1_all.deb \
&& apt update && apt install -y cuda-toolkit-13-0 \
&& rm cuda-keyring_1.1-1_all.deb \
&& rm -rf /var/lib/apt/lists/*

RUN mkdir -p /opt/sglang-env && chown -R modelrunner:modelrunner /opt/sglang-env

USER modelrunner

# Set CUDA paths for nvcc (needed during flashinfer compilation)
ENV PATH=/usr/local/cuda-13.0/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda-13.0/lib64:$LD_LIBRARY_PATH

# Install uv and SGLang as modelrunner user
RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
&& ~/.local/bin/uv venv --python /usr/bin/python3 /opt/sglang-env \
&& ~/.local/bin/uv pip install --python /opt/sglang-env/bin/python "sglang==${SGLANG_VERSION}"

RUN /opt/sglang-env/bin/python -c "import sglang; print(sglang.__version__)" > /opt/sglang-env/version
FROM llamacpp AS final-llamacpp
# Copy the built binary from builder
COPY --from=builder /app/model-runner /app/model-runner

FROM vllm AS final-vllm
# Copy the built binary from builder
COPY --from=builder /app/model-runner /app/model-runner

FROM sglang AS final-sglang
# Copy the built binary from builder-sglang (without vLLM)
COPY --from=builder-sglang /app/model-runner /app/model-runner
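
Taken together, the final-sglang target layers the SGLang virtual environment on top of the llama.cpp image and copies in the binary built with -tags=novllm. Outside the release workflow, a local CUDA build might look like the following (the image tag and argument values are illustrative, not fixed by this change):

# Sketch of a local build of the SGLang CUDA variant; the build args match
# the ones the release workflow passes to docker/build-push-action.
docker build \
  --target final-sglang \
  --build-arg LLAMA_SERVER_VERSION=latest \
  --build-arg LLAMA_SERVER_VARIANT=cuda \
  --build-arg BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04 \
  --build-arg SGLANG_VERSION=0.4.0 \
  -t model-runner:local-sglang-cuda \
  .
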
34 changes: 32 additions & 2 deletions Makefile
@@ -7,6 +7,8 @@ BASE_IMAGE := ubuntu:24.04
VLLM_BASE_IMAGE := nvidia/cuda:13.0.2-runtime-ubuntu24.04
DOCKER_IMAGE := docker/model-runner:latest
DOCKER_IMAGE_VLLM := docker/model-runner:latest-vllm-cuda
DOCKER_IMAGE_SGLANG := docker/model-runner:latest-sglang
DOCKER_IMAGE_SGLANG_CUDA := docker/model-runner:latest-sglang-cuda
DOCKER_TARGET ?= final-llamacpp
PORT := 8080
MODELS_PATH := $(shell pwd)/models-store
@@ -31,13 +33,13 @@ LICENSE ?=
BUILD_DMR ?= 1

# Main targets
.PHONY: build run clean test integration-tests test-docker-ce-installation docker-build docker-build-multiplatform docker-run docker-build-vllm docker-run-vllm docker-run-impl help validate lint model-distribution-tool
.PHONY: build run clean test integration-tests test-docker-ce-installation docker-build docker-build-multiplatform docker-run docker-build-vllm docker-run-vllm docker-build-sglang docker-run-sglang docker-build-sglang-cuda docker-run-sglang-cuda docker-run-impl help validate lint model-distribution-tool
# Default target
.DEFAULT_GOAL := build

# Build the Go application
build:
CGO_ENABLED=1 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go
CGO_ENABLED=1 go build -ldflags="-s -w" -o $(APP_NAME) .

# Build model-distribution-tool
model-distribution-tool:
@@ -116,6 +118,30 @@ docker-build-vllm:
docker-run-vllm: docker-build-vllm
@$(MAKE) -s docker-run-impl DOCKER_IMAGE=$(DOCKER_IMAGE_VLLM)

# Build SGLang Docker image (CPU variant)
docker-build-sglang:
@$(MAKE) docker-build \
DOCKER_TARGET=final-sglang \
DOCKER_IMAGE=$(DOCKER_IMAGE_SGLANG) \
LLAMA_SERVER_VARIANT=cpu \
BASE_IMAGE=$(BASE_IMAGE)

# Run SGLang Docker container (CPU variant) with TCP port access and mounted model storage
docker-run-sglang: docker-build-sglang
@$(MAKE) -s docker-run-impl DOCKER_IMAGE=$(DOCKER_IMAGE_SGLANG)

# Build SGLang Docker image (CUDA variant)
docker-build-sglang-cuda:
@$(MAKE) docker-build \
DOCKER_TARGET=final-sglang \
DOCKER_IMAGE=$(DOCKER_IMAGE_SGLANG_CUDA) \
LLAMA_SERVER_VARIANT=cuda \
BASE_IMAGE=$(VLLM_BASE_IMAGE)

# Run SGLang Docker container (CUDA variant) with TCP port access and mounted model storage
docker-run-sglang-cuda: docker-build-sglang-cuda
@$(MAKE) -s docker-run-impl DOCKER_IMAGE=$(DOCKER_IMAGE_SGLANG_CUDA)

# Common implementation for running Docker container
docker-run-impl:
@echo ""
@@ -178,6 +204,10 @@ help:
@echo " docker-run - Run in Docker container with TCP port access and mounted model storage"
@echo " docker-build-vllm - Build vLLM Docker image"
@echo " docker-run-vllm - Run vLLM Docker container"
@echo " docker-build-sglang - Build SGLang Docker image (CPU)"
@echo " docker-run-sglang - Run SGLang Docker container (CPU)"
@echo " docker-build-sglang-cuda - Build SGLang Docker image (CUDA)"
@echo " docker-run-sglang-cuda - Run SGLang Docker container (CUDA)"
@echo " help - Show this help message"
@echo ""
@echo "Model distribution tool targets:"
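With these targets, local usage of the new variants mirrors the existing vLLM targets, for example (the CUDA variant assumes an NVIDIA-capable Docker runtime):

# Build and run the CPU variant locally.
make docker-run-sglang

# Build and run the CUDA variant (nvidia/cuda base image).
make docker-run-sglang-cuda
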
23 changes: 23 additions & 0 deletions backends_vllm.go
@@ -0,0 +1,23 @@
//go:build !novllm

package main

import (
"github.com/docker/model-runner/pkg/inference"
"github.com/docker/model-runner/pkg/inference/backends/vllm"
"github.com/docker/model-runner/pkg/inference/models"
"github.com/sirupsen/logrus"
)

func initVLLMBackend(log *logrus.Logger, modelManager *models.Manager) (inference.Backend, error) {
return vllm.New(
log,
modelManager,
log.WithFields(logrus.Fields{"component": vllm.Name}),
nil,
)
}

func registerVLLMBackend(backends map[string]inference.Backend, backend inference.Backend) {
backends[vllm.Name] = backend
}
17 changes: 17 additions & 0 deletions backends_vllm_stub.go
@@ -0,0 +1,17 @@
//go:build novllm

package main

import (
"github.com/docker/model-runner/pkg/inference"
"github.com/docker/model-runner/pkg/inference/models"
"github.com/sirupsen/logrus"
)

func initVLLMBackend(log *logrus.Logger, modelManager *models.Manager) (inference.Backend, error) {
return nil, nil
}

func registerVLLMBackend(backends map[string]inference.Backend, backend inference.Backend) {
// No-op when vLLM is disabled
}
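
backends_vllm.go and backends_vllm_stub.go are selected at compile time by the novllm build tag: the default build compiles the real vLLM backend, while -tags=novllm (used by the builder-sglang stage above) compiles the no-op stub so the SGLang image ships without vLLM support. The two build invocations as they appear in this change:

# Default build: backends_vllm.go is compiled and the vLLM backend is registered.
CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o model-runner .

# SGLang variant: backends_vllm_stub.go is compiled instead, so the vLLM
# backend never enters the backends map assembled in main.go.
CGO_ENABLED=1 GOOS=linux go build -tags=novllm -ldflags="-s -w" -o model-runner .
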
31 changes: 20 additions & 11 deletions main.go
@@ -15,6 +15,7 @@ import (
"github.com/docker/model-runner/pkg/inference"
"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
"github.com/docker/model-runner/pkg/inference/backends/mlx"
"github.com/docker/model-runner/pkg/inference/backends/sglang"
"github.com/docker/model-runner/pkg/inference/backends/vllm"
"github.com/docker/model-runner/pkg/inference/config"
"github.com/docker/model-runner/pkg/inference/memory"
@@ -124,12 +125,7 @@ func main() {

memEstimator.SetDefaultBackend(llamaCppBackend)

vllmBackend, err := vllm.New(
log,
modelManager,
log.WithFields(logrus.Fields{"component": vllm.Name}),
nil,
)
vllmBackend, err := initVLLMBackend(log, modelManager)
if err != nil {
log.Fatalf("unable to initialize %s backend: %v", vllm.Name, err)
}
@@ -144,13 +140,26 @@
log.Fatalf("unable to initialize %s backend: %v", mlx.Name, err)
}

sglangBackend, err := sglang.New(
log,
modelManager,
log.WithFields(logrus.Fields{"component": sglang.Name}),
nil,
)
if err != nil {
log.Fatalf("unable to initialize %s backend: %v", sglang.Name, err)
}

backends := map[string]inference.Backend{
llamacpp.Name: llamaCppBackend,
mlx.Name: mlxBackend,
sglang.Name: sglangBackend,
}
registerVLLMBackend(backends, vllmBackend)

scheduler := scheduling.NewScheduler(
log,
map[string]inference.Backend{
llamacpp.Name: llamaCppBackend,
vllm.Name: vllmBackend,
mlx.Name: mlxBackend,
},
backends,
llamaCppBackend,
modelManager,
http.DefaultClient,
4 changes: 4 additions & 0 deletions pkg/inference/backend.go
@@ -110,6 +110,10 @@ type Backend interface {
// external model management system and false if the backend uses the shared
// model manager.
UsesExternalModelManagement() bool
// UsesTCP returns true if the backend uses TCP for communication instead
// of Unix sockets. When true, the scheduler will create a TCP transport
// and pass a "host:port" address to Run instead of a Unix socket path.
UsesTCP() bool
// Install ensures that the backend is installed. It should return a nil
// error if installation succeeds or if the backend is already installed.
// The provided HTTP client should be used for any HTTP operations.
5 changes: 5 additions & 0 deletions pkg/inference/backends/llamacpp/llamacpp.go
@@ -89,6 +89,11 @@ func (l *llamaCpp) UsesExternalModelManagement() bool {
return false
}

// UsesTCP implements inference.Backend.UsesTCP.
func (l *llamaCpp) UsesTCP() bool {
return false
}

// Install implements inference.Backend.Install.
func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error {
l.updatedLlamaCpp = false
5 changes: 5 additions & 0 deletions pkg/inference/backends/mlx/mlx.go
@@ -65,6 +65,11 @@ func (m *mlx) UsesExternalModelManagement() bool {
return false
}

// UsesTCP implements inference.Backend.UsesTCP.
func (m *mlx) UsesTCP() bool {
return false
}

// Install implements inference.Backend.Install.
func (m *mlx) Install(ctx context.Context, httpClient *http.Client) error {
if !platform.SupportsMLX() {