Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,11 @@ type BackendAuthConfig struct {
// These are the user-facing values stored in VirtualMCPServer.Status.DiscoveredBackends.
// Use BackendHealthStatus.ToCRDStatus() to convert from internal health status.
const (
BackendStatusReady = "ready"
BackendStatusUnavailable = "unavailable"
BackendStatusDegraded = "degraded"
BackendStatusUnknown = "unknown"
BackendStatusReady = "ready"
BackendStatusUnavailable = "unavailable"
BackendStatusDegraded = "degraded"
BackendStatusUnknown = "unknown"
BackendStatusUnauthenticated = "unauthenticated"
)

// DiscoveredBackend is an alias to the canonical definition in pkg/vmcp/types.go
Expand Down Expand Up @@ -222,8 +223,8 @@ type VirtualMCPServerStatus struct {
// +optional
DiscoveredBackends []DiscoveredBackend `json:"discoveredBackends,omitempty"`

// BackendCount is the number of healthy/ready backends
// (excludes unavailable, degraded, and unknown backends)
// BackendCount is the number of routable backends (ready + unauthenticated).
// Excludes unavailable, degraded, and unknown backends.
// +optional
BackendCount int32 `json:"backendCount,omitempty"`

Expand Down
25 changes: 14 additions & 11 deletions cmd/thv-operator/controllers/virtualmcpserver_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1496,14 +1496,17 @@ type statusDecision struct {
conditionState metav1.ConditionStatus
}

// countBackendHealth counts ready and unhealthy backends
func countBackendHealth(ctx context.Context, backends []mcpv1alpha1.DiscoveredBackend) (ready, unhealthy int) {
// countBackendHealth counts routable and unhealthy backends.
// Unauthenticated backends are routable — they are reachable but require per-request
// user auth (e.g., upstream OAuth). Health probes lack user tokens, but real requests
// with valid OAuth tokens will be served.
func countBackendHealth(ctx context.Context, backends []mcpv1alpha1.DiscoveredBackend) (routable, unhealthy int) {
ctxLogger := log.FromContext(ctx)

for _, backend := range backends {
switch backend.Status {
case mcpv1alpha1.BackendStatusReady:
ready++
case mcpv1alpha1.BackendStatusReady, mcpv1alpha1.BackendStatusUnauthenticated:
routable++
case mcpv1alpha1.BackendStatusUnavailable,
mcpv1alpha1.BackendStatusDegraded,
mcpv1alpha1.BackendStatusUnknown:
Expand All @@ -1514,7 +1517,7 @@ func countBackendHealth(ctx context.Context, backends []mcpv1alpha1.DiscoveredBa
unhealthy++
}
}
return ready, unhealthy
return routable, unhealthy
}

// determineStatusFromBackends evaluates backend health to determine status
Expand All @@ -1524,11 +1527,11 @@ func (*VirtualMCPServerReconciler) determineStatusFromBackends(
) statusDecision {
ctxLogger := log.FromContext(ctx)

ready, unhealthy := countBackendHealth(ctx, vmcp.Status.DiscoveredBackends)
total := ready + unhealthy
routable, unhealthy := countBackendHealth(ctx, vmcp.Status.DiscoveredBackends)
total := routable + unhealthy

// All backends unhealthy
if ready == 0 && unhealthy > 0 {
if routable == 0 && unhealthy > 0 {
return statusDecision{
phase: mcpv1alpha1.VirtualMCPServerPhaseDegraded,
message: fmt.Sprintf("Virtual MCP server is running but all %d backends are unhealthy", unhealthy),
Expand All @@ -1542,15 +1545,15 @@ func (*VirtualMCPServerReconciler) determineStatusFromBackends(
if unhealthy > 0 {
return statusDecision{
phase: mcpv1alpha1.VirtualMCPServerPhaseDegraded,
message: fmt.Sprintf("Virtual MCP server is running with %d/%d backends available", ready, total),
message: fmt.Sprintf("Virtual MCP server is running with %d/%d backends available", routable, total),
reason: "BackendsDegraded",
conditionMsg: "Some backends are unhealthy",
conditionState: metav1.ConditionFalse,
}
}

// All backends ready
if ready > 0 {
// All backends routable
if routable > 0 {
return statusDecision{
phase: mcpv1alpha1.VirtualMCPServerPhaseReady,
message: "Virtual MCP server is running",
Expand Down
12 changes: 7 additions & 5 deletions cmd/thv-operator/pkg/virtualmcpserverstatus/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,14 +211,16 @@ func (s *StatusCollector) UpdateStatus(ctx context.Context, vmcpStatus *mcpv1alp
// Apply discovered backends change
if s.discoveredBackends != nil {
vmcpStatus.DiscoveredBackends = s.discoveredBackends
// BackendCount represents the number of ready backends
var readyCount int32
// BackendCount represents the number of routable backends (ready + unauthenticated).
// Unauthenticated backends are reachable but require per-request user auth.
var routableCount int32
for _, backend := range s.discoveredBackends {
if backend.Status == mcpv1alpha1.BackendStatusReady {
readyCount++
if backend.Status == mcpv1alpha1.BackendStatusReady ||
backend.Status == mcpv1alpha1.BackendStatusUnauthenticated {
routableCount++
}
}
vmcpStatus.BackendCount = readyCount
vmcpStatus.BackendCount = routableCount
}

ctxLogger.V(1).Info("Batched status update applied",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2104,8 +2104,8 @@ spec:
properties:
backendCount:
description: |-
BackendCount is the number of healthy/ready backends
(excludes unavailable, degraded, and unknown backends)
BackendCount is the number of routable backends (ready + unauthenticated).
Excludes unavailable, degraded, and unknown backends.
format: int32
type: integer
conditions:
Expand Down Expand Up @@ -2218,7 +2218,7 @@ spec:
type: string
status:
description: |-
Status is the current status of the backend (ready, degraded, unavailable, unknown).
Status is the current status of the backend (ready, degraded, unavailable, unauthenticated, unknown).
Use BackendHealthStatus.ToCRDStatus() to populate this field.
type: string
url:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2107,8 +2107,8 @@ spec:
properties:
backendCount:
description: |-
BackendCount is the number of healthy/ready backends
(excludes unavailable, degraded, and unknown backends)
BackendCount is the number of routable backends (ready + unauthenticated).
Excludes unavailable, degraded, and unknown backends.
format: int32
type: integer
conditions:
Expand Down Expand Up @@ -2221,7 +2221,7 @@ spec:
type: string
status:
description: |-
Status is the current status of the backend (ready, degraded, unavailable, unknown).
Status is the current status of the backend (ready, degraded, unavailable, unauthenticated, unknown).
Use BackendHealthStatus.ToCRDStatus() to populate this field.
type: string
url:
Expand Down
4 changes: 4 additions & 0 deletions pkg/vmcp/discovery/middleware.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@ func Middleware(
// or degraded. Backends that are unhealthy, unknown, or unauthenticated are excluded
// from capability aggregation to prevent exposing tools from unavailable backends.
//
// TODO(#4824): Unauthenticated backends are treated as routable for phase determination
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The exclusion is intentional and the comment explains why clearly. One suggestion: reference a dedicated follow-up issue here rather than #4824 (which is the bug this PR fixes, not the future work). When the auth server (stacklok-epics#198) can provide service-account tokens for health probes, this is the exact place to revisit — a linked issue would make that easier to track and avoids the TODO pointing at a closed bug.

// but are excluded here because discovery probes cannot carry user tokens. If discovery
// could use user tokens, unauthenticated backends' capabilities could be included.
//
// Health status filtering:
// - healthy: included (fully operational)
// - degraded: included (slow but working)
Expand Down
90 changes: 71 additions & 19 deletions pkg/vmcp/health/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,13 @@ type Summary struct {
Unauthenticated int
}

// Routable returns the number of backends that can serve traffic.
// This includes healthy backends and unauthenticated backends (which are
// reachable but require per-request user auth, e.g., upstream OAuth).
func (s Summary) Routable() int {
return s.Healthy + s.Unauthenticated
}

// String returns a human-readable summary.
func (s Summary) String() string {
return fmt.Sprintf("total=%d healthy=%d degraded=%d unhealthy=%d unknown=%d unauthenticated=%d",
Expand All @@ -557,11 +564,12 @@ func (s Summary) String() string {
// This converts backend health information into the format needed for status reporting
// to the Kubernetes API or CLI output.
//
// Phase determination:
// - Ready: All backends healthy, or no backends configured (cold start)
// Phase determination (unauthenticated backends are routable — they need per-request user auth
// but are reachable and running):
// - Ready: All backends healthy or unauthenticated, or no backends configured (cold start)
// - Pending: Backends configured but no health check data yet (waiting for first check)
// - Degraded: Some backends healthy, some degraded/unhealthy
// - Failed: No healthy backends (and at least one backend exists)
// - Degraded: Some backends routable (healthy/unauthenticated), some degraded/unhealthy
// - Failed: No routable backends (and at least one backend exists)
//
// Returns a Status instance with current health information and discovered backends.
//
Expand Down Expand Up @@ -592,16 +600,18 @@ func (m *Monitor) BuildStatus() *vmcp.Status {
Message: message,
Conditions: conditions,
DiscoveredBackends: discoveredBackends,
BackendCount: int32(summary.Healthy), //nolint:gosec // healthy count is bounded by backend list size
BackendCount: int32(summary.Routable()), //nolint:gosec // routable count is bounded by backend list size
Timestamp: time.Now(),
}
}

// determinePhase determines the overall phase based on backend health.
// Unauthenticated backends are treated as routable — they are reachable and running,
// they just require per-request user auth (e.g., upstream OAuth).
// Takes both the health summary and the count of configured backends to distinguish:
// - No backends configured (configuredCount==0): Ready (cold start)
// - Backends configured but no health data (configuredCount>0 && summary.Total==0): Pending
// - Has health data: Ready/Degraded/Failed based on health status
// - Has health data: Ready/Degraded/Failed based on routable (healthy + unauthenticated) count
func determinePhase(summary Summary, configuredBackendCount int) vmcp.Phase {
if summary.Total == 0 {
// No health data yet - distinguish cold start from waiting for first check
Expand All @@ -610,10 +620,11 @@ func determinePhase(summary Summary, configuredBackendCount int) vmcp.Phase {
}
return vmcp.PhasePending // Backends configured but health checks not complete
}
if summary.Healthy == summary.Total {

if summary.Routable() == summary.Total {
return vmcp.PhaseReady
}
if summary.Healthy == 0 {
if summary.Routable() == 0 {
return vmcp.PhaseFailed
}
return vmcp.PhaseDegraded
Expand All @@ -629,18 +640,27 @@ func formatStatusMessage(summary Summary, phase vmcp.Phase, configuredBackendCou
return fmt.Sprintf("Waiting for initial health checks (%d backends configured)", configuredBackendCount)
}
if phase == vmcp.PhaseReady {
return fmt.Sprintf("All %d backends healthy", summary.Healthy)
if summary.Unauthenticated == 0 {
return fmt.Sprintf("All %d %s healthy", summary.Healthy, pluralBackend(summary.Healthy))
}
if summary.Healthy == 0 {
return fmt.Sprintf("%s %s authentication",
quantifyBackends(summary.Unauthenticated), pluralRequire(summary.Unauthenticated))
}
return fmt.Sprintf("%d %s healthy, %d %s authentication",
summary.Healthy, pluralBackend(summary.Healthy),
summary.Unauthenticated, pluralRequire(summary.Unauthenticated))
}

// Format unhealthy backend counts (shared by Failed and Degraded)
unhealthyDetails := fmt.Sprintf("%d degraded, %d unhealthy, %d unknown, %d unauthenticated",
summary.Degraded, summary.Unhealthy, summary.Unknown, summary.Unauthenticated)
// Format non-routable backend counts (shared by Failed and Degraded)
nonRoutableDetails := fmt.Sprintf("%d degraded, %d unhealthy, %d unknown",
summary.Degraded, summary.Unhealthy, summary.Unknown)

if phase == vmcp.PhaseFailed {
return fmt.Sprintf("No healthy backends (%s)", unhealthyDetails)
return fmt.Sprintf("No routable backends (%s)", nonRoutableDetails)
}
// Degraded
return fmt.Sprintf("%d/%d backends healthy (%s)", summary.Healthy, summary.Total, unhealthyDetails)
return fmt.Sprintf("%d/%d backends routable (%s)", summary.Routable(), summary.Total, nonRoutableDetails)
}

// convertToDiscoveredBackends converts backend health states to DiscoveredBackend format.
Expand Down Expand Up @@ -712,6 +732,30 @@ func extractAuthInfo(backend vmcp.Backend) (authConfigRef, authType string) {
return backend.AuthConfigRef, backend.AuthConfig.Type
}

// pluralBackend returns "backend" or "backends" based on count.
func pluralBackend(n int) string {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] pluralBackend() breaks formatBackendMessage godoc (Consensus: 10/10)

This function is inserted between the doc comment for formatBackendMessage and its function signature. In the resulting file, the three-line comment block ("formatBackendMessage creates a human-readable message...") is now separated from func formatBackendMessage(...) by the entire pluralBackend function body — formatBackendMessage becomes undocumented in godoc.

Move pluralBackend() above the formatBackendMessage doc comment block, or near formatStatusMessage where it's primarily used.

Raised by: go-expert-developer, code-reviewer, kubernetes-expert, toolhive-expert

if n == 1 {
return "backend"
}
return "backends"
}

// pluralRequire returns "requires" or "require" based on count for subject-verb agreement.
func pluralRequire(n int) string {
if n == 1 {
return "requires"
}
return "require"
}

// quantifyBackends returns "All N backends" for plural, "1 backend" for singular.
func quantifyBackends(n int) string {
if n == 1 {
return fmt.Sprintf("%d backend", n)
}
return fmt.Sprintf("All %d backends", n)
}

// formatBackendMessage creates a human-readable message for a backend's health state.
// This returns generic error categories to avoid exposing sensitive error details in status.
// Detailed errors are logged when they occur (in performHealthCheck) for debugging.
Expand Down Expand Up @@ -821,19 +865,27 @@ func buildConditions(summary Summary, phase vmcp.Phase, configuredBackendCount i
switch phase {
case vmcp.PhaseReady:
readyCondition.Status = metav1.ConditionTrue
readyCondition.Reason = "AllBackendsHealthy"
// Distinguish cold start (no backends configured) from having healthy backends
readyCondition.Reason = "AllBackendsRoutable"
// Distinguish cold start (no backends configured) from having routable backends
if summary.Total == 0 && configuredBackendCount == 0 {
readyCondition.Message = "Ready, no backends configured"
} else if summary.Unauthenticated == 0 {
readyCondition.Message = fmt.Sprintf("All %d %s are healthy",
summary.Healthy, pluralBackend(summary.Healthy))
} else if summary.Healthy == 0 {
readyCondition.Message = fmt.Sprintf("%s %s authentication",
quantifyBackends(summary.Unauthenticated), pluralRequire(summary.Unauthenticated))
} else {
readyCondition.Message = fmt.Sprintf("All %d backends are healthy", summary.Healthy)
readyCondition.Message = fmt.Sprintf("%d %s healthy, %d %s authentication",
summary.Healthy, pluralBackend(summary.Healthy),
summary.Unauthenticated, pluralRequire(summary.Unauthenticated))
}
case vmcp.PhaseDegraded:
readyCondition.Reason = "SomeBackendsUnhealthy"
readyCondition.Message = fmt.Sprintf("%d/%d backends healthy", summary.Healthy, summary.Total)
readyCondition.Message = fmt.Sprintf("%d/%d backends routable", summary.Routable(), summary.Total)
case vmcp.PhaseFailed:
readyCondition.Reason = "NoHealthyBackends"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This reason string is now semantically mismatched: the condition message in this branch says "No routable backends available" but the reason is still NoHealthyBackends. The PR description says condition reasons were intentionally left unchanged to avoid breaking tooling — but AllBackendsHealthy was renamed to AllBackendsRoutable a few lines above (line 868), so the freeze isn't consistent. Worth either aligning this to NoRoutableBackends here too, or documenting explicitly why the PhaseFailed reason is frozen while the PhaseReady one was not. At minimum, a follow-up issue to track the rename would prevent the inconsistency from calcifying.

readyCondition.Message = "No healthy backends available"
readyCondition.Message = "No routable backends available"
case vmcp.PhasePending:
readyCondition.Reason = "BackendsPending"
readyCondition.Message = fmt.Sprintf("Waiting for initial health checks (%d backends configured)", configuredBackendCount)
Expand Down
Loading
Loading