-
Notifications
You must be signed in to change notification settings - Fork 42
[FR-126] Rollback #274
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[FR-126] Rollback #274
Changes from all commits
12ed35e
2a60a93
cca2c35
cb0252b
64a1cbd
a5d1e33
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -83,6 +83,22 @@ type WorkerDeploymentSpec struct { | |
| // How to rollout new workflow executions to the target version. | ||
| RolloutStrategy RolloutStrategy `json:"rollout"` | ||
|
|
||
| // How to rollback to a previous version. If not specified, defaults to AllAtOnce strategy. | ||
| // | ||
| // A rollback is triggered automatically when the target version's pod spec is updated and | ||
| // the resulting build ID has previously been set as the default (current) version of the | ||
| // worker deployment. The controller detects this by checking whether Temporal recorded a | ||
| // non-nil LastCurrentTime for that build ID. | ||
| // | ||
| // The rollback strategy controls routing of NEW workflow executions only. Workflows already | ||
| // running are pinned to the version they started on and continue executing there; they are | ||
| // not affected by the rollback. Only new workflow executions will be routed to the rollback | ||
| // target version. | ||
| // | ||
| // Rollback is suppressed when the rollout strategy is Manual. | ||
| // +optional | ||
| RollbackStrategy *RollbackStrategy `json:"rollback,omitempty"` | ||
|
|
||
| // How to manage sunsetting drained versions. | ||
| SunsetStrategy SunsetStrategy `json:"sunset"` | ||
|
|
||
|
|
@@ -338,17 +354,17 @@ type DeprecatedWorkerDeploymentVersion struct { | |
| EligibleForDeletion bool `json:"eligibleForDeletion,omitempty"` | ||
| } | ||
|
|
||
| // DefaultVersionUpdateStrategy describes how to cut over new workflow executions | ||
| // VersionRolloutStrategy describes how to cut over new workflow executions | ||
| // to the target worker deployment version. | ||
| // +kubebuilder:validation:Enum=Manual;AllAtOnce;Progressive | ||
| type DefaultVersionUpdateStrategy string | ||
| type VersionRolloutStrategy string | ||
|
|
||
| const ( | ||
| // UpdateManual scales worker resources up or down, but does not update the current or ramping worker deployment version. | ||
| UpdateManual DefaultVersionUpdateStrategy = "Manual" | ||
| UpdateManual VersionRolloutStrategy = "Manual" | ||
|
|
||
| // UpdateAllAtOnce starts 100% of new workflow executions on the new worker deployment version as soon as it's healthy. | ||
| UpdateAllAtOnce DefaultVersionUpdateStrategy = "AllAtOnce" | ||
| UpdateAllAtOnce VersionRolloutStrategy = "AllAtOnce" | ||
|
|
||
| // UpdateProgressive ramps up the percentage of new workflow executions targeting the new worker deployment version over time. | ||
| // | ||
|
|
@@ -358,7 +374,19 @@ const ( | |
| // Sending a percentage of traffic to a "nil" version means that traffic will be sent to unversioned workers. If | ||
| // there are no unversioned workers, those tasks will get stuck. This behavior ensures that all traffic on the task | ||
| // queues in this worker deployment can be handled by an active poller. | ||
| UpdateProgressive DefaultVersionUpdateStrategy = "Progressive" | ||
| UpdateProgressive VersionRolloutStrategy = "Progressive" | ||
| ) | ||
|
|
||
| // VersionRollbackStrategy describes how to cut over during rollback to a previous version. | ||
| // +kubebuilder:validation:Enum=AllAtOnce;Progressive | ||
| type VersionRollbackStrategy string | ||
|
|
||
| const ( | ||
| // RollbackAllAtOnce immediately switches 100% of traffic back to the previous version. | ||
| RollbackAllAtOnce VersionRollbackStrategy = "AllAtOnce" | ||
|
|
||
| // RollbackProgressive gradually ramps traffic back to the previous version. | ||
| RollbackProgressive VersionRollbackStrategy = "Progressive" | ||
| ) | ||
|
|
||
| type GateWorkflowConfig struct { | ||
|
|
@@ -393,7 +421,7 @@ type RolloutStrategy struct { | |
| // - "Manual" | ||
| // - "AllAtOnce" | ||
| // - "Progressive" | ||
| Strategy DefaultVersionUpdateStrategy `json:"strategy"` | ||
| Strategy VersionRolloutStrategy `json:"strategy"` | ||
|
|
||
| // Gate specifies a workflow type that must run once to completion on the new worker deployment version before | ||
| // any traffic is directed to the new version. | ||
|
|
@@ -405,6 +433,27 @@ type RolloutStrategy struct { | |
| Steps []RolloutStep `json:"steps,omitempty" protobuf:"bytes,3,rep,name=steps"` | ||
| } | ||
|
|
||
| // RollbackStrategy defines strategy to apply when rolling back to a previous version. | ||
| // This is separate from RolloutStrategy because rollbacks have different requirements: | ||
| // - No gate workflow (already trusted version) | ||
| // - No manual mode (rollbacks should be automatic) | ||
| // - Default to AllAtOnce for fast recovery | ||
| type RollbackStrategy struct { | ||
|
eniko-dif marked this conversation as resolved.
|
||
| // Strategy for rollback. Valid values are "AllAtOnce" or "Progressive". | ||
| // Defaults to "AllAtOnce" for fast recovery. | ||
| Strategy VersionRollbackStrategy `json:"strategy"` | ||
|
|
||
| // Steps to execute progressive rollbacks. Only required when strategy is "Progressive". | ||
| // +optional | ||
| Steps []RolloutStep `json:"steps,omitempty"` | ||
|
eniko-dif marked this conversation as resolved.
|
||
|
|
||
| // MaxVersionAge limits which versions are eligible as rollback targets. | ||
| // A version is only considered a rollback target if it was last current within this duration. | ||
| // If nil, there is no age limit. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The default applied in the webhook is 1h though, so this comment doesn't match current behavior. We should also document the meaning of setting to 0. Right now I don't think there is a way to achieve the "no age limit" behavior, beyond setting a very high value? (I think this is fine, but just clarifying since it seems like you can either specify a limit or disable rollback entirely by setting to 0). |
||
| // +optional | ||
| MaxVersionAge *metav1.Duration `json:"maxVersionAge,omitempty"` | ||
| } | ||
|
|
||
| // SunsetStrategy defines strategy to apply when sunsetting k8s deployments of drained versions. | ||
| type SunsetStrategy struct { | ||
| // ScaledownDelay specifies how long to wait after a version is drained before scaling its Deployment to zero. | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -7,6 +7,7 @@ package v1alpha1 | |||||||||||||||||||||||
| import ( | ||||||||||||||||||||||||
| "context" | ||||||||||||||||||||||||
| "fmt" | ||||||||||||||||||||||||
| "time" | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| "github.com/temporalio/temporal-worker-controller/internal/defaults" | ||||||||||||||||||||||||
| apierrors "k8s.io/apimachinery/pkg/api/errors" | ||||||||||||||||||||||||
|
|
@@ -53,6 +54,14 @@ func (s *WorkerDeploymentSpec) Default(ctx context.Context) error { | |||||||||||||||||||||||
| s.SunsetStrategy.DeleteDelay = &v1.Duration{Duration: defaults.DeleteDelay} | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| if s.RollbackStrategy == nil { | ||||||||||||||||||||||||
| s.RollbackStrategy = &RollbackStrategy{Strategy: RollbackAllAtOnce} | ||||||||||||||||||||||||
|
jlegrone marked this conversation as resolved.
|
||||||||||||||||||||||||
| } else if s.RollbackStrategy.Strategy == "" { | ||||||||||||||||||||||||
| s.RollbackStrategy.Strategy = RollbackAllAtOnce | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
Comment on lines
+57
to
+61
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: this is entirely a matter of taste but I think two
Suggested change
|
||||||||||||||||||||||||
| if s.RollbackStrategy.MaxVersionAge == nil { | ||||||||||||||||||||||||
| s.RollbackStrategy.MaxVersionAge = &v1.Duration{Duration: defaults.RollbackMaxVersionAge} | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| return nil | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
@@ -81,10 +90,17 @@ func (r *WorkerDeployment) validateForUpdateOrCreate(ctx context.Context, obj ru | |||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| func validateForUpdateOrCreate(old, new *WorkerDeployment) (admission.Warnings, error) { | ||||||||||||||||||||||||
| allErrs := validateRolloutStrategy(new.Spec.RolloutStrategy) | ||||||||||||||||||||||||
| var allErrs field.ErrorList | ||||||||||||||||||||||||
| allErrs = append(allErrs, validateRolloutStrategy(new.Spec.RolloutStrategy)...) | ||||||||||||||||||||||||
| if new.Spec.RollbackStrategy != nil { | ||||||||||||||||||||||||
| allErrs = append(allErrs, validateRollbackStrategy(*new.Spec.RollbackStrategy)...) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if len(allErrs) > 0 { | ||||||||||||||||||||||||
| return nil, newInvalidErr(new, allErrs) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if new.Spec.RollbackStrategy != nil { | ||||||||||||||||||||||||
| return warnRollbackSlowerThanRollout(new.Spec.RolloutStrategy, *new.Spec.RollbackStrategy), nil | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| return nil, nil | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
@@ -96,15 +112,7 @@ func validateRolloutStrategy(s RolloutStrategy) []*field.Error { | |||||||||||||||||||||||
| var allErrs []*field.Error | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| if s.Strategy == UpdateProgressive { | ||||||||||||||||||||||||
| var lastRamp int | ||||||||||||||||||||||||
| for i, step := range s.Steps { | ||||||||||||||||||||||||
| if step.RampPercentage <= lastRamp { | ||||||||||||||||||||||||
| allErrs = append(allErrs, | ||||||||||||||||||||||||
| field.Invalid(field.NewPath(fmt.Sprintf("spec.rollout.steps[%d].rampPercentage", i)), step.RampPercentage, "rampPercentage must increase between each step"), | ||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| lastRamp = step.RampPercentage | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| allErrs = append(allErrs, validateProgressiveStrategySteps("spec.rollout.steps", s.Steps)...) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| if s.Gate != nil && s.Gate.Input != nil && s.Gate.InputFrom != nil { | ||||||||||||||||||||||||
|
|
@@ -117,6 +125,67 @@ func validateRolloutStrategy(s RolloutStrategy) []*field.Error { | |||||||||||||||||||||||
| return allErrs | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| func validateRollbackStrategy(s RollbackStrategy) []*field.Error { | ||||||||||||||||||||||||
| var allErrs []*field.Error | ||||||||||||||||||||||||
| if s.Strategy == RollbackProgressive { | ||||||||||||||||||||||||
| allErrs = append(allErrs, validateProgressiveStrategySteps("spec.rollback.steps", s.Steps)...) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| return allErrs | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| func warnRollbackSlowerThanRollout(rollout RolloutStrategy, rollback RollbackStrategy) admission.Warnings { | ||||||||||||||||||||||||
| switch rollout.Strategy { | ||||||||||||||||||||||||
| case UpdateAllAtOnce: | ||||||||||||||||||||||||
| if rollback.Strategy != RollbackAllAtOnce { | ||||||||||||||||||||||||
| return admission.Warnings{"rollback strategy is slower than rollout: rollout is AllAtOnce, but rollback is Progressive — is that intended?"} | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| case UpdateProgressive: | ||||||||||||||||||||||||
| if rollback.Strategy == RollbackProgressive { | ||||||||||||||||||||||||
| var rolloutTotal, rollbackTotal time.Duration | ||||||||||||||||||||||||
| for _, s := range rollout.Steps { | ||||||||||||||||||||||||
| rolloutTotal += s.PauseDuration.Duration | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| for _, s := range rollback.Steps { | ||||||||||||||||||||||||
| rollbackTotal += s.PauseDuration.Duration | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if rollbackTotal > rolloutTotal { | ||||||||||||||||||||||||
| return admission.Warnings{fmt.Sprintf( | ||||||||||||||||||||||||
| "rollback strategy is slower than rollout: progressive rollback total duration (%s) exceeds progressive rollout total duration (%s) — is that intended?", | ||||||||||||||||||||||||
| rollbackTotal, rolloutTotal, | ||||||||||||||||||||||||
| )} | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| return nil | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| func validateProgressiveStrategySteps(specName string, steps []RolloutStep) []*field.Error { | ||||||||||||||||||||||||
| var allErrs []*field.Error | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| if len(steps) == 0 { | ||||||||||||||||||||||||
| allErrs = append(allErrs, | ||||||||||||||||||||||||
| field.Invalid(field.NewPath(specName), steps, "steps are required for Progressive strategy"), | ||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| var lastRamp int | ||||||||||||||||||||||||
| for i, step := range steps { | ||||||||||||||||||||||||
| if step.PauseDuration.Duration < 30*time.Second { | ||||||||||||||||||||||||
| allErrs = append(allErrs, | ||||||||||||||||||||||||
| field.Invalid(field.NewPath(fmt.Sprintf("%s[%d].pauseDuration", specName, i)), step.PauseDuration.Duration.String(), "pause duration must be at least 30s"), | ||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if step.RampPercentage <= lastRamp { | ||||||||||||||||||||||||
| allErrs = append(allErrs, | ||||||||||||||||||||||||
| field.Invalid(field.NewPath(fmt.Sprintf("%s[%d].rampPercentage", specName, i)), step.RampPercentage, "rampPercentage must increase between each step"), | ||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| lastRamp = step.RampPercentage | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| return allErrs | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| func newInvalidErr(dep *WorkerDeployment, errs field.ErrorList) *apierrors.StatusError { | ||||||||||||||||||||||||
| return apierrors.NewInvalid(dep.GroupVersionKind().GroupKind(), dep.GetName(), errs) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
Uh oh!
There was an error while loading. Please reload this page.