diff --git a/apps/workspace-engine/oapi/openapi.json b/apps/workspace-engine/oapi/openapi.json index eca9dab51..e985a587f 100644 --- a/apps/workspace-engine/oapi/openapi.json +++ b/apps/workspace-engine/oapi/openapi.json @@ -1313,6 +1313,9 @@ "retry": { "$ref": "#/components/schemas/RetryRule" }, + "rollback": { + "$ref": "#/components/schemas/RollbackRule" + }, "verification": { "$ref": "#/components/schemas/VerificationRule" }, @@ -1851,6 +1854,23 @@ ], "type": "object" }, + "RollbackRule": { + "properties": { + "onJobStatuses": { + "description": "Job statuses that will trigger a rollback", + "items": { + "$ref": "#/components/schemas/JobStatus" + }, + "type": "array" + }, + "onVerificationFailure": { + "default": false, + "description": "If true, a release target will be rolled back if the verification fails", + "type": "boolean" + } + }, + "type": "object" + }, "RuleEvaluation": { "properties": { "actionRequired": { diff --git a/apps/workspace-engine/oapi/spec/schemas/policy.jsonnet b/apps/workspace-engine/oapi/spec/schemas/policy.jsonnet index c6974a5a7..41b4e900d 100644 --- a/apps/workspace-engine/oapi/spec/schemas/policy.jsonnet +++ b/apps/workspace-engine/oapi/spec/schemas/policy.jsonnet @@ -65,6 +65,7 @@ local openapi = import '../lib/openapi.libsonnet'; deploymentWindow: openapi.schemaRef('DeploymentWindowRule'), verification: openapi.schemaRef('VerificationRule'), versionCooldown: openapi.schemaRef('VersionCooldownRule'), + rollback: openapi.schemaRef('RollbackRule'), }, }, @@ -235,6 +236,22 @@ local openapi = import '../lib/openapi.libsonnet'; }, }, + RollbackRule: { + type: 'object', + properties: { + onJobStatuses: { + type: 'array', + items: openapi.schemaRef('JobStatus'), + description: 'Job statuses that will trigger a rollback', + }, + onVerificationFailure: { + type: 'boolean', + default: false, + description: 'If true, a release target will be rolled back if the verification fails', + }, + }, + }, + DeployDecision: { type: 'object', required: ['policyResults'], diff --git a/apps/workspace-engine/pkg/oapi/oapi.gen.go b/apps/workspace-engine/pkg/oapi/oapi.gen.go index 71a6e8fa4..2441e70ca 100644 --- a/apps/workspace-engine/pkg/oapi/oapi.gen.go +++ b/apps/workspace-engine/pkg/oapi/oapi.gen.go @@ -798,6 +798,7 @@ type PolicyRule struct { Id string `json:"id"` PolicyId string `json:"policyId"` Retry *RetryRule `json:"retry,omitempty"` + Rollback *RollbackRule `json:"rollback,omitempty"` Verification *VerificationRule `json:"verification,omitempty"` VersionCooldown *VersionCooldownRule `json:"versionCooldown,omitempty"` VersionSelector *VersionSelectorRule `json:"versionSelector,omitempty"` @@ -996,6 +997,15 @@ type RetryRule struct { // RetryRuleBackoffStrategy Backoff strategy: "linear" uses constant backoffSeconds delay, "exponential" doubles the delay with each retry (backoffSeconds * 2^(attempt-1)). type RetryRuleBackoffStrategy string +// RollbackRule defines model for RollbackRule. +type RollbackRule struct { + // OnJobStatuses Job statuses that will trigger a rollback + OnJobStatuses *[]JobStatus `json:"onJobStatuses,omitempty"` + + // OnVerificationFailure If true, a release target will be rolled back if the verification fails + OnVerificationFailure *bool `json:"onVerificationFailure,omitempty"` +} + // RuleEvaluation defines model for RuleEvaluation. type RuleEvaluation struct { // ActionRequired Whether the rule requires an action (e.g., approval, wait) diff --git a/apps/workspace-engine/pkg/workspace/releasemanager/action/action.go b/apps/workspace-engine/pkg/workspace/releasemanager/action/action.go index 6bdc0e9b4..edf060ad4 100644 --- a/apps/workspace-engine/pkg/workspace/releasemanager/action/action.go +++ b/apps/workspace-engine/pkg/workspace/releasemanager/action/action.go @@ -9,10 +9,11 @@ import ( type ActionTrigger string const ( - TriggerJobCreated ActionTrigger = "job.created" - TriggerJobStarted ActionTrigger = "job.started" - TriggerJobSuccess ActionTrigger = "job.success" - TriggerJobFailure ActionTrigger = "job.failure" + TriggerJobCreated ActionTrigger = "job.created" + TriggerJobStarted ActionTrigger = "job.started" + TriggerJobSuccess ActionTrigger = "job.success" + TriggerJobFailure ActionTrigger = "job.failure" + TriggerJobStatusChange ActionTrigger = "job.statuschange" ) // ActionContext provides context for action execution diff --git a/apps/workspace-engine/pkg/workspace/releasemanager/action/orchestrator.go b/apps/workspace-engine/pkg/workspace/releasemanager/action/orchestrator.go index 0adc4f819..afa250b74 100644 --- a/apps/workspace-engine/pkg/workspace/releasemanager/action/orchestrator.go +++ b/apps/workspace-engine/pkg/workspace/releasemanager/action/orchestrator.go @@ -132,5 +132,9 @@ func determineTrigger( return TriggerJobFailure } + if currentStatus != previousStatus { + return TriggerJobStatusChange + } + return "" } diff --git a/apps/workspace-engine/pkg/workspace/releasemanager/action/rollback/hooks.go b/apps/workspace-engine/pkg/workspace/releasemanager/action/rollback/hooks.go new file mode 100644 index 000000000..31a1bf8c2 --- /dev/null +++ b/apps/workspace-engine/pkg/workspace/releasemanager/action/rollback/hooks.go @@ -0,0 +1,159 @@ +package rollback + +import ( + "context" + "time" + "workspace-engine/pkg/oapi" + "workspace-engine/pkg/workspace/releasemanager/deployment/jobs" + "workspace-engine/pkg/workspace/releasemanager/verification" + "workspace-engine/pkg/workspace/store" + + "github.com/google/uuid" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" +) + +var hookTracer = otel.Tracer("RollbackHooks") + +type RollbackHooks struct { + store *store.Store + dispatcher *jobs.Dispatcher +} + +var _ verification.VerificationHooks = &RollbackHooks{} + +func NewRollbackHooks(store *store.Store, dispatcher *jobs.Dispatcher) *RollbackHooks { + return &RollbackHooks{ + store: store, + dispatcher: dispatcher, + } +} + +func (h *RollbackHooks) OnVerificationStarted(ctx context.Context, verification *oapi.JobVerification) error { + return nil +} + +func (h *RollbackHooks) OnMeasurementTaken(ctx context.Context, verification *oapi.JobVerification, metricIndex int, measurement *oapi.VerificationMeasurement) error { + return nil +} + +func (h *RollbackHooks) OnMetricComplete(ctx context.Context, verification *oapi.JobVerification, metricIndex int) error { + return nil +} + +func (h *RollbackHooks) OnVerificationComplete(ctx context.Context, verificationResult *oapi.JobVerification) error { + ctx, span := hookTracer.Start(ctx, "RollbackHooks.OnVerificationComplete") + defer span.End() + + span.SetAttributes( + attribute.String("verification.id", verificationResult.Id), + attribute.String("verification.job_id", verificationResult.JobId), + ) + + status := verificationResult.Status() + span.SetAttributes(attribute.String("verification.status", string(status))) + + if status != oapi.JobVerificationStatusFailed { + span.SetStatus(codes.Ok, "verification did not fail") + return nil + } + + job, ok := h.store.Jobs.Get(verificationResult.JobId) + if !ok { + span.SetStatus(codes.Error, "job not found") + return nil + } + + release, ok := h.store.Releases.Get(job.ReleaseId) + if !ok { + span.SetStatus(codes.Error, "release not found") + return nil + } + + span.SetAttributes( + attribute.String("release.id", release.ID()), + attribute.String("release_target.key", release.ReleaseTarget.Key()), + ) + + policies, err := h.store.ReleaseTargets.GetPolicies(ctx, &release.ReleaseTarget) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, "failed to get policies") + return nil + } + + if !h.shouldRollbackOnVerificationFailure(policies) { + span.SetAttributes(attribute.Bool("rollback_applicable", false)) + span.SetStatus(codes.Ok, "no applicable rollback policy for verification failure") + return nil + } + + span.SetAttributes(attribute.Bool("rollback_applicable", true)) + + currentRelease, lastSuccessfulJob, err := h.store.ReleaseTargets.GetCurrentRelease(ctx, &release.ReleaseTarget) + if err != nil { + span.AddEvent("No previous release to roll back to") + span.SetStatus(codes.Ok, "no previous release available") + return nil + } + + // Don't rollback to the same release + if currentRelease.ID() == release.ID() { + span.AddEvent("Current release is the same as failed release, no rollback needed") + span.SetStatus(codes.Ok, "already on current release") + return nil + } + + span.SetAttributes( + attribute.String("rollback_to_release.id", currentRelease.ID()), + attribute.String("rollback_to_version.id", currentRelease.Version.Id), + attribute.String("rollback_to_version.tag", currentRelease.Version.Tag), + ) + + now := time.Now() + newJob := oapi.Job{ + Id: uuid.New().String(), + ReleaseId: lastSuccessfulJob.ReleaseId, + JobAgentId: lastSuccessfulJob.JobAgentId, + JobAgentConfig: lastSuccessfulJob.JobAgentConfig, + Status: oapi.JobStatusPending, + CreatedAt: now, + UpdatedAt: now, + } + + h.store.Jobs.Upsert(ctx, &newJob) + + if err := h.dispatcher.DispatchJob(ctx, &newJob); err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, "rollback execution failed") + return err + } + + span.SetStatus(codes.Ok, "rollback executed successfully") + return nil +} + +func (h *RollbackHooks) OnVerificationStopped(ctx context.Context, verification *oapi.JobVerification) error { + return nil +} + +func (h *RollbackHooks) shouldRollbackOnVerificationFailure(policies []*oapi.Policy) bool { + for _, policy := range policies { + if !policy.Enabled { + continue + } + + for _, rule := range policy.Rules { + if rule.Rollback == nil { + continue + } + + if rule.Rollback.OnVerificationFailure != nil && *rule.Rollback.OnVerificationFailure { + return true + } + } + } + + return false +} diff --git a/apps/workspace-engine/pkg/workspace/releasemanager/action/rollback/rollback.go b/apps/workspace-engine/pkg/workspace/releasemanager/action/rollback/rollback.go new file mode 100644 index 000000000..613939ce4 --- /dev/null +++ b/apps/workspace-engine/pkg/workspace/releasemanager/action/rollback/rollback.go @@ -0,0 +1,120 @@ +package rollback + +import ( + "context" + "slices" + "time" + "workspace-engine/pkg/oapi" + "workspace-engine/pkg/workspace/releasemanager/action" + "workspace-engine/pkg/workspace/releasemanager/deployment/jobs" + "workspace-engine/pkg/workspace/store" + + "github.com/google/uuid" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" +) + +var tracer = otel.Tracer("RollbackAction") + +type RollbackAction struct { + store *store.Store + dispatcher *jobs.Dispatcher +} + +func NewRollbackAction(store *store.Store, dispatcher *jobs.Dispatcher) *RollbackAction { + return &RollbackAction{ + store: store, + dispatcher: dispatcher, + } +} + +func (r *RollbackAction) Name() string { + return "rollback" +} + +func (r *RollbackAction) Execute( + ctx context.Context, + trigger action.ActionTrigger, + actx action.ActionContext, +) error { + ctx, span := tracer.Start(ctx, "RollbackAction.Execute") + defer span.End() + + span.SetAttributes( + attribute.String("trigger", string(trigger)), + attribute.String("release.id", actx.Release.ID()), + attribute.String("job.id", actx.Job.Id), + attribute.String("job.status", string(actx.Job.Status)), + ) + + if !r.shouldRollback(actx.Policies, actx.Job.Status) { + span.SetAttributes(attribute.Bool("rollback_applicable", false)) + span.SetStatus(codes.Ok, "no applicable rollback policy") + return nil + } + + span.SetAttributes(attribute.Bool("rollback_applicable", true)) + + currentRelease, lastSuccessfulJob, err := r.store.ReleaseTargets.GetCurrentRelease(ctx, &actx.Release.ReleaseTarget) + if err != nil { + span.AddEvent("No previous release to roll back to") + span.SetStatus(codes.Ok, "no previous release available") + return nil + } + + if currentRelease.ID() == actx.Release.ID() { + span.AddEvent("Current release is the same as failed release, no rollback needed") + span.SetStatus(codes.Ok, "already on current release") + return nil + } + + span.SetAttributes( + attribute.String("rollback_to_release.id", currentRelease.ID()), + attribute.String("rollback_to_version.id", currentRelease.Version.Id), + attribute.String("rollback_to_version.tag", currentRelease.Version.Tag), + ) + + now := time.Now() + newJob := oapi.Job{ + Id: uuid.New().String(), + ReleaseId: lastSuccessfulJob.ReleaseId, + JobAgentId: lastSuccessfulJob.JobAgentId, + JobAgentConfig: lastSuccessfulJob.JobAgentConfig, + Status: oapi.JobStatusPending, + CreatedAt: now, + UpdatedAt: now, + } + + r.store.Jobs.Upsert(ctx, &newJob) + + if err := r.dispatcher.DispatchJob(ctx, &newJob); err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, "rollback execution failed") + return err + } + + span.SetStatus(codes.Ok, "rollback executed successfully") + return nil +} + +func (r *RollbackAction) shouldRollback(policies []*oapi.Policy, jobStatus oapi.JobStatus) bool { + for _, policy := range policies { + if !policy.Enabled { + continue + } + + for _, rule := range policy.Rules { + if rule.Rollback == nil { + continue + } + + if rule.Rollback.OnJobStatuses != nil && + slices.Contains(*rule.Rollback.OnJobStatuses, jobStatus) { + return true + } + } + } + + return false +} diff --git a/apps/workspace-engine/pkg/workspace/releasemanager/manager.go b/apps/workspace-engine/pkg/workspace/releasemanager/manager.go index a4d8d0ab3..5f333db6a 100644 --- a/apps/workspace-engine/pkg/workspace/releasemanager/manager.go +++ b/apps/workspace-engine/pkg/workspace/releasemanager/manager.go @@ -9,7 +9,9 @@ import ( "workspace-engine/pkg/oapi" "workspace-engine/pkg/statechange" "workspace-engine/pkg/workspace/relationships" + "workspace-engine/pkg/workspace/releasemanager/action/rollback" "workspace-engine/pkg/workspace/releasemanager/deployment" + "workspace-engine/pkg/workspace/releasemanager/deployment/jobs" "workspace-engine/pkg/workspace/releasemanager/trace" "workspace-engine/pkg/workspace/releasemanager/verification" "workspace-engine/pkg/workspace/store" @@ -48,7 +50,8 @@ func New(store *store.Store, traceStore PersistenceStore) *Manager { stateCache := NewStateCache(store, deploymentOrch.Planner()) releaseManagerHooks := newReleaseManagerVerificationHooks(store, stateCache) - compositeHooks := verification.NewCompositeHooks(releaseManagerHooks) + rollbackHooks := rollback.NewRollbackHooks(store, jobs.NewDispatcher(store, verificationManager)) + compositeHooks := verification.NewCompositeHooks(releaseManagerHooks, rollbackHooks) verificationManager.SetHooks(compositeHooks) return &Manager{ diff --git a/apps/workspace-engine/pkg/workspace/releasemanager/policy/evaluator/evaulator.go b/apps/workspace-engine/pkg/workspace/releasemanager/policy/evaluator/evaulator.go index e9531fc01..28defd5b0 100644 --- a/apps/workspace-engine/pkg/workspace/releasemanager/policy/evaluator/evaulator.go +++ b/apps/workspace-engine/pkg/workspace/releasemanager/policy/evaluator/evaulator.go @@ -68,6 +68,7 @@ const ( RuleTypeDeployableVersions = "deployableVersions" RuleTypeDeploymentWindow = "deploymentWindow" RuleTypeVersionCooldown = "versionCooldown" + RuleTypeRollback = "rollback" ) // WithMemoization wraps an evaluator with caching based on its declared scope fields. diff --git a/apps/workspace-engine/pkg/workspace/releasemanager/policy/evaluator/rollback/rollback.go b/apps/workspace-engine/pkg/workspace/releasemanager/policy/evaluator/rollback/rollback.go new file mode 100644 index 000000000..4eb289383 --- /dev/null +++ b/apps/workspace-engine/pkg/workspace/releasemanager/policy/evaluator/rollback/rollback.go @@ -0,0 +1,86 @@ +package rollback + +import ( + "context" + "slices" + "sort" + "workspace-engine/pkg/oapi" + "workspace-engine/pkg/workspace/releasemanager/policy/evaluator" + "workspace-engine/pkg/workspace/releasemanager/policy/results" + "workspace-engine/pkg/workspace/store" +) + +type RollbackEvaluator struct { + store *store.Store + ruleId string + rule *oapi.RollbackRule +} + +func NewEvaluator(store *store.Store, rule *oapi.PolicyRule) evaluator.Evaluator { + if rule == nil || rule.Rollback == nil || store == nil { + return nil + } + return evaluator.WithMemoization(&RollbackEvaluator{store: store, ruleId: rule.Id, rule: rule.Rollback}) +} + +func (e *RollbackEvaluator) ScopeFields() evaluator.ScopeFields { + return evaluator.ScopeReleaseTarget +} + +func (e *RollbackEvaluator) RuleType() string { + return evaluator.RuleTypeRollback +} + +func (e *RollbackEvaluator) RuleId() string { + return e.ruleId +} + +func (e *RollbackEvaluator) Complexity() int { + return 4 +} + +func (e *RollbackEvaluator) getLatestJobForReleaseTarget(releaseTarget *oapi.ReleaseTarget) *oapi.Job { + jobs := e.store.Jobs.GetJobsForReleaseTarget(releaseTarget) + jobsSlice := make([]*oapi.Job, 0, len(jobs)) + for _, job := range jobs { + jobsSlice = append(jobsSlice, job) + } + if len(jobsSlice) == 0 { + return nil + } + sort.Slice(jobsSlice, func(i, j int) bool { + return jobsSlice[i].CreatedAt.After(jobsSlice[j].CreatedAt) + }) + return jobsSlice[0] +} + +func (e *RollbackEvaluator) Evaluate(ctx context.Context, scope evaluator.EvaluatorScope) *oapi.RuleEvaluation { + releaseTarget := scope.ReleaseTarget + latestJob := e.getLatestJobForReleaseTarget(releaseTarget) + if latestJob == nil { + return results.NewAllowedResult("No jobs found for release target") + } + + jobStatus := latestJob.Status + if e.rule.OnJobStatuses != nil && slices.Contains(*e.rule.OnJobStatuses, jobStatus) { + return results.NewDeniedResult("Job status is in rollback statuses"). + WithDetail("job", latestJob) + } + + if e.rule.OnVerificationFailure == nil || !*e.rule.OnVerificationFailure { + return results.NewAllowedResult("Job status is not in rollback statuses and on verification failure is not enabled").WithDetail("job", latestJob) + } + + verifications := e.store.JobVerifications.GetByJobId(latestJob.Id) + + for _, verification := range verifications { + verificationStatus := verification.Status() + if verificationStatus == oapi.JobVerificationStatusFailed { + return results.NewDeniedResult("Verification failed"). + WithDetail("verification", verification). + WithDetail("job", latestJob) + } + } + + return results.NewAllowedResult("No verification failures found").WithDetail("job", latestJob) +} diff --git a/apps/workspace-engine/pkg/workspace/workspace.go b/apps/workspace-engine/pkg/workspace/workspace.go index 7e77d1250..d54b9d8eb 100644 --- a/apps/workspace-engine/pkg/workspace/workspace.go +++ b/apps/workspace-engine/pkg/workspace/workspace.go @@ -6,7 +6,9 @@ import ( "workspace-engine/pkg/statechange" "workspace-engine/pkg/workspace/releasemanager" "workspace-engine/pkg/workspace/releasemanager/action" + "workspace-engine/pkg/workspace/releasemanager/action/rollback" verificationaction "workspace-engine/pkg/workspace/releasemanager/action/verification" + "workspace-engine/pkg/workspace/releasemanager/deployment/jobs" "workspace-engine/pkg/workspace/releasemanager/policy/evaluator/deploymentdependency" "workspace-engine/pkg/workspace/releasemanager/trace" "workspace-engine/pkg/workspace/releasemanager/trace/spanstore" @@ -44,6 +46,11 @@ func New(ctx context.Context, id string, options ...WorkspaceOption) *Workspace return ws.releasemanager.ReconcileTarget(ctx, target, releasemanager.WithTrigger(trace.TriggerJobSuccess)) }, ), + ).RegisterAction( + rollback.NewRollbackAction( + s, + jobs.NewDispatcher(s, ws.releasemanager.VerificationManager()), + ), ) return ws diff --git a/apps/workspace-engine/test/e2e/engine_policy_rollback_test.go b/apps/workspace-engine/test/e2e/engine_policy_rollback_test.go new file mode 100644 index 000000000..e0e038afc --- /dev/null +++ b/apps/workspace-engine/test/e2e/engine_policy_rollback_test.go @@ -0,0 +1,983 @@ +package e2e + +import ( + "context" + "testing" + "time" + "workspace-engine/pkg/events/handler" + "workspace-engine/pkg/oapi" + "workspace-engine/test/integration" + c "workspace-engine/test/integration/creators" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestEngine_Rollback_OnJobFailure tests that when a job fails with a status +// configured in the rollback policy, a new job is created for the previous release. +func TestEngine_Rollback_OnJobFailure(t *testing.T) { + jobAgentID := uuid.New().String() + deploymentID := uuid.New().String() + environmentID := uuid.New().String() + resourceID := uuid.New().String() + + engine := integration.NewTestWorkspace(t, + integration.WithJobAgent( + integration.JobAgentID(jobAgentID), + integration.JobAgentName("test-agent"), + ), + integration.WithSystem( + integration.SystemName("test-system"), + integration.WithDeployment( + integration.DeploymentID(deploymentID), + integration.DeploymentName("api-service"), + integration.DeploymentJobAgent(jobAgentID), + integration.DeploymentCelResourceSelector("true"), + ), + integration.WithEnvironment( + integration.EnvironmentID(environmentID), + integration.EnvironmentName("production"), + integration.EnvironmentCelResourceSelector("true"), + ), + ), + integration.WithResource( + integration.ResourceID(resourceID), + integration.ResourceName("server-1"), + ), + integration.WithPolicy( + integration.PolicyName("rollback-policy"), + integration.WithPolicyTargetSelector( + integration.PolicyTargetCelDeploymentSelector("true"), + integration.PolicyTargetCelEnvironmentSelector("true"), + integration.PolicyTargetCelResourceSelector("true"), + ), + integration.WithPolicyRule( + integration.WithRuleRollbackOnJobStatuses(oapi.JobStatusFailure), + ), + ), + ) + + ctx := context.Background() + + // Step 1: Deploy v1.0.0 and mark it successful (establish baseline) + dv1 := c.NewDeploymentVersion() + dv1.DeploymentId = deploymentID + dv1.Tag = "v1.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv1) + + pendingJobs := engine.Workspace().Jobs().GetPending() + require.Len(t, pendingJobs, 1, "expected 1 pending job for v1.0.0") + + job1 := getFirstJob(pendingJobs) + release1, _ := engine.Workspace().Releases().Get(job1.ReleaseId) + assert.Equal(t, "v1.0.0", release1.Version.Tag) + + // Mark job1 as successful + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job1, jobAgentID, oapi.JobStatusSuccessful)) + + // Step 2: Deploy v2.0.0 + dv2 := c.NewDeploymentVersion() + dv2.DeploymentId = deploymentID + dv2.Tag = "v2.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv2) + + pendingJobs = engine.Workspace().Jobs().GetPending() + require.Len(t, pendingJobs, 1, "expected 1 pending job for v2.0.0") + + job2 := getFirstJob(pendingJobs) + release2, _ := engine.Workspace().Releases().Get(job2.ReleaseId) + assert.Equal(t, "v2.0.0", release2.Version.Tag) + + // Step 3: Mark v2.0.0 job as failed - this should trigger rollback + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job2, jobAgentID, oapi.JobStatusFailure)) + + // Step 4: Verify rollback job was created for v1.0.0 + pendingJobs = engine.Workspace().Jobs().GetPending() + require.Len(t, pendingJobs, 1, "expected 1 pending rollback job") + + rollbackJob := getFirstJob(pendingJobs) + assert.NotEqual(t, job1.Id, rollbackJob.Id, "rollback job should have a new ID") + assert.NotEqual(t, job2.Id, rollbackJob.Id, "rollback job should not be the failed job") + + rollbackRelease, _ := engine.Workspace().Releases().Get(rollbackJob.ReleaseId) + assert.Equal(t, "v1.0.0", rollbackRelease.Version.Tag, "rollback should target v1.0.0") +} + +// TestEngine_Rollback_OnJobFailure_NoRollbackForUnmatchedStatus tests that rollback +// does NOT happen when the job fails with a status not in the rollback policy. +func TestEngine_Rollback_OnJobFailure_NoRollbackForUnmatchedStatus(t *testing.T) { + jobAgentID := uuid.New().String() + deploymentID := uuid.New().String() + environmentID := uuid.New().String() + resourceID := uuid.New().String() + + engine := integration.NewTestWorkspace(t, + integration.WithJobAgent( + integration.JobAgentID(jobAgentID), + integration.JobAgentName("test-agent"), + ), + integration.WithSystem( + integration.SystemName("test-system"), + integration.WithDeployment( + integration.DeploymentID(deploymentID), + integration.DeploymentName("api-service"), + integration.DeploymentJobAgent(jobAgentID), + integration.DeploymentCelResourceSelector("true"), + ), + integration.WithEnvironment( + integration.EnvironmentID(environmentID), + integration.EnvironmentName("production"), + integration.EnvironmentCelResourceSelector("true"), + ), + ), + integration.WithResource( + integration.ResourceID(resourceID), + integration.ResourceName("server-1"), + ), + integration.WithPolicy( + integration.PolicyName("rollback-policy"), + integration.WithPolicyTargetSelector( + integration.PolicyTargetCelDeploymentSelector("true"), + integration.PolicyTargetCelEnvironmentSelector("true"), + integration.PolicyTargetCelResourceSelector("true"), + ), + integration.WithPolicyRule( + // Only rollback on JobStatusFailure, NOT JobStatusCancelled + integration.WithRuleRollbackOnJobStatuses(oapi.JobStatusFailure), + ), + ), + ) + + ctx := context.Background() + + // Step 1: Deploy v1.0.0 and mark it successful + dv1 := c.NewDeploymentVersion() + dv1.DeploymentId = deploymentID + dv1.Tag = "v1.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv1) + + pendingJobs := engine.Workspace().Jobs().GetPending() + job1 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job1, jobAgentID, oapi.JobStatusSuccessful)) + + // Step 2: Deploy v2.0.0 + dv2 := c.NewDeploymentVersion() + dv2.DeploymentId = deploymentID + dv2.Tag = "v2.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv2) + + pendingJobs = engine.Workspace().Jobs().GetPending() + job2 := getFirstJob(pendingJobs) + + // Step 3: Mark v2.0.0 as CANCELLED - should NOT trigger rollback + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job2, jobAgentID, oapi.JobStatusCancelled)) + + // Step 4: Verify NO rollback job was created + pendingJobs = engine.Workspace().Jobs().GetPending() + assert.Len(t, pendingJobs, 0, "should not create rollback job for cancelled status") +} + +// TestEngine_Rollback_OnJobFailure_NoPreviousRelease tests that rollback handles +// the case where there's no previous release to roll back to gracefully. +func TestEngine_Rollback_OnJobFailure_NoPreviousRelease(t *testing.T) { + jobAgentID := uuid.New().String() + deploymentID := uuid.New().String() + environmentID := uuid.New().String() + resourceID := uuid.New().String() + + engine := integration.NewTestWorkspace(t, + integration.WithJobAgent( + integration.JobAgentID(jobAgentID), + integration.JobAgentName("test-agent"), + ), + integration.WithSystem( + integration.SystemName("test-system"), + integration.WithDeployment( + integration.DeploymentID(deploymentID), + integration.DeploymentName("api-service"), + integration.DeploymentJobAgent(jobAgentID), + integration.DeploymentCelResourceSelector("true"), + ), + integration.WithEnvironment( + integration.EnvironmentID(environmentID), + integration.EnvironmentName("production"), + integration.EnvironmentCelResourceSelector("true"), + ), + ), + integration.WithResource( + integration.ResourceID(resourceID), + integration.ResourceName("server-1"), + ), + integration.WithPolicy( + integration.PolicyName("rollback-policy"), + integration.WithPolicyTargetSelector( + integration.PolicyTargetCelDeploymentSelector("true"), + integration.PolicyTargetCelEnvironmentSelector("true"), + integration.PolicyTargetCelResourceSelector("true"), + ), + integration.WithPolicyRule( + integration.WithRuleRollbackOnJobStatuses(oapi.JobStatusFailure), + ), + ), + ) + + ctx := context.Background() + + // First ever deployment - deploy v1.0.0 and fail it + dv1 := c.NewDeploymentVersion() + dv1.DeploymentId = deploymentID + dv1.Tag = "v1.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv1) + + pendingJobs := engine.Workspace().Jobs().GetPending() + require.Len(t, pendingJobs, 1) + + job1 := getFirstJob(pendingJobs) + + // Mark first ever job as failed - should NOT crash, just no rollback + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job1, jobAgentID, oapi.JobStatusFailure)) + + // Should NOT create a rollback job (nothing to roll back to) + pendingJobs = engine.Workspace().Jobs().GetPending() + assert.Len(t, pendingJobs, 0, "should not create rollback job when no previous release exists") +} + +// TestEngine_Rollback_OnJobFailure_MultipleStatuses tests rollback with multiple +// configured failure statuses. +func TestEngine_Rollback_OnJobFailure_MultipleStatuses(t *testing.T) { + jobAgentID := uuid.New().String() + deploymentID := uuid.New().String() + environmentID := uuid.New().String() + resourceID := uuid.New().String() + + engine := integration.NewTestWorkspace(t, + integration.WithJobAgent( + integration.JobAgentID(jobAgentID), + integration.JobAgentName("test-agent"), + ), + integration.WithSystem( + integration.SystemName("test-system"), + integration.WithDeployment( + integration.DeploymentID(deploymentID), + integration.DeploymentName("api-service"), + integration.DeploymentJobAgent(jobAgentID), + integration.DeploymentCelResourceSelector("true"), + ), + integration.WithEnvironment( + integration.EnvironmentID(environmentID), + integration.EnvironmentName("production"), + integration.EnvironmentCelResourceSelector("true"), + ), + ), + integration.WithResource( + integration.ResourceID(resourceID), + integration.ResourceName("server-1"), + ), + integration.WithPolicy( + integration.PolicyName("rollback-policy"), + integration.WithPolicyTargetSelector( + integration.PolicyTargetCelDeploymentSelector("true"), + integration.PolicyTargetCelEnvironmentSelector("true"), + integration.PolicyTargetCelResourceSelector("true"), + ), + integration.WithPolicyRule( + // Rollback on both failure and invalidJobAgent + integration.WithRuleRollbackOnJobStatuses(oapi.JobStatusFailure, oapi.JobStatusInvalidJobAgent), + ), + ), + ) + + ctx := context.Background() + + // Deploy v1.0.0 and mark it successful + dv1 := c.NewDeploymentVersion() + dv1.DeploymentId = deploymentID + dv1.Tag = "v1.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv1) + + pendingJobs := engine.Workspace().Jobs().GetPending() + job1 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job1, jobAgentID, oapi.JobStatusSuccessful)) + + // Deploy v2.0.0 and mark it as invalidJobAgent + dv2 := c.NewDeploymentVersion() + dv2.DeploymentId = deploymentID + dv2.Tag = "v2.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv2) + + pendingJobs = engine.Workspace().Jobs().GetPending() + job2 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job2, jobAgentID, oapi.JobStatusInvalidJobAgent)) + + // Verify rollback job was created for v1.0.0 + pendingJobs = engine.Workspace().Jobs().GetPending() + require.Len(t, pendingJobs, 1, "expected rollback job") + + rollbackJob := getFirstJob(pendingJobs) + rollbackRelease, _ := engine.Workspace().Releases().Get(rollbackJob.ReleaseId) + assert.Equal(t, "v1.0.0", rollbackRelease.Version.Tag, "rollback should target v1.0.0") +} + +// TestEngine_Rollback_OnJobFailure_PolicyNotMatching tests that rollback does NOT +// happen when the release target doesn't match the policy. +func TestEngine_Rollback_OnJobFailure_PolicyNotMatching(t *testing.T) { + jobAgentID := uuid.New().String() + deploymentID := uuid.New().String() + environmentID := uuid.New().String() + resourceID := uuid.New().String() + + engine := integration.NewTestWorkspace(t, + integration.WithJobAgent( + integration.JobAgentID(jobAgentID), + integration.JobAgentName("test-agent"), + ), + integration.WithSystem( + integration.SystemName("test-system"), + integration.WithDeployment( + integration.DeploymentID(deploymentID), + integration.DeploymentName("api-service"), + integration.DeploymentJobAgent(jobAgentID), + integration.DeploymentCelResourceSelector("true"), + ), + integration.WithEnvironment( + integration.EnvironmentID(environmentID), + integration.EnvironmentName("production"), + integration.EnvironmentCelResourceSelector("true"), + ), + ), + integration.WithResource( + integration.ResourceID(resourceID), + integration.ResourceName("server-1"), + ), + integration.WithPolicy( + integration.PolicyName("rollback-policy"), + integration.WithPolicyTargetSelector( + // This selector won't match the deployment + integration.PolicyTargetJsonDeploymentSelector(map[string]any{ + "type": "name", + "operator": "equals", + "value": "non-existent-deployment", + }), + integration.PolicyTargetCelEnvironmentSelector("true"), + integration.PolicyTargetCelResourceSelector("true"), + ), + integration.WithPolicyRule( + integration.WithRuleRollbackOnJobStatuses(oapi.JobStatusFailure), + ), + ), + ) + + ctx := context.Background() + + // Deploy v1.0.0 and mark it successful + dv1 := c.NewDeploymentVersion() + dv1.DeploymentId = deploymentID + dv1.Tag = "v1.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv1) + + pendingJobs := engine.Workspace().Jobs().GetPending() + job1 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job1, jobAgentID, oapi.JobStatusSuccessful)) + + // Deploy v2.0.0 and mark it failed + dv2 := c.NewDeploymentVersion() + dv2.DeploymentId = deploymentID + dv2.Tag = "v2.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv2) + + pendingJobs = engine.Workspace().Jobs().GetPending() + job2 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job2, jobAgentID, oapi.JobStatusFailure)) + + // Verify NO rollback job was created (policy doesn't match) + pendingJobs = engine.Workspace().Jobs().GetPending() + assert.Len(t, pendingJobs, 0, "should not create rollback job when policy doesn't match") +} + +// TestEngine_Rollback_OnJobFailure_DisabledPolicy tests that rollback does NOT +// happen when the policy is disabled. +func TestEngine_Rollback_OnJobFailure_DisabledPolicy(t *testing.T) { + jobAgentID := uuid.New().String() + deploymentID := uuid.New().String() + environmentID := uuid.New().String() + resourceID := uuid.New().String() + + engine := integration.NewTestWorkspace(t, + integration.WithJobAgent( + integration.JobAgentID(jobAgentID), + integration.JobAgentName("test-agent"), + ), + integration.WithSystem( + integration.SystemName("test-system"), + integration.WithDeployment( + integration.DeploymentID(deploymentID), + integration.DeploymentName("api-service"), + integration.DeploymentJobAgent(jobAgentID), + integration.DeploymentCelResourceSelector("true"), + ), + integration.WithEnvironment( + integration.EnvironmentID(environmentID), + integration.EnvironmentName("production"), + integration.EnvironmentCelResourceSelector("true"), + ), + ), + integration.WithResource( + integration.ResourceID(resourceID), + integration.ResourceName("server-1"), + ), + ) + + ctx := context.Background() + + // Create a disabled policy manually + policy := c.NewPolicy(engine.Workspace().ID) + policy.Name = "rollback-policy" + policy.Enabled = false + selector := c.NewPolicyTargetSelector() + celSelector := &oapi.Selector{} + _ = celSelector.FromCelSelector(oapi.CelSelector{Cel: "true"}) + selector.DeploymentSelector = celSelector + selector.EnvironmentSelector = celSelector + selector.ResourceSelector = celSelector + policy.Selectors = []oapi.PolicyTargetSelector{*selector} + + rollBackStatuses := []oapi.JobStatus{oapi.JobStatusFailure} + policy.Rules = []oapi.PolicyRule{{ + Id: uuid.New().String(), + PolicyId: policy.Id, + Rollback: &oapi.RollbackRule{ + OnJobStatuses: &rollBackStatuses, + }, + }} + engine.PushEvent(ctx, handler.PolicyCreate, policy) + + // Deploy v1.0.0 and mark it successful + dv1 := c.NewDeploymentVersion() + dv1.DeploymentId = deploymentID + dv1.Tag = "v1.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv1) + + pendingJobs := engine.Workspace().Jobs().GetPending() + job1 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job1, jobAgentID, oapi.JobStatusSuccessful)) + + // Deploy v2.0.0 and mark it failed + dv2 := c.NewDeploymentVersion() + dv2.DeploymentId = deploymentID + dv2.Tag = "v2.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv2) + + pendingJobs = engine.Workspace().Jobs().GetPending() + job2 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job2, jobAgentID, oapi.JobStatusFailure)) + + // Verify NO rollback job was created (policy is disabled) + pendingJobs = engine.Workspace().Jobs().GetPending() + assert.Len(t, pendingJobs, 0, "should not create rollback job when policy is disabled") +} + +// TestEngine_Rollback_OnVerificationFailure tests that when verification fails +// and the policy has onVerificationFailure=true, rollback is triggered. +func TestEngine_Rollback_OnVerificationFailure(t *testing.T) { + jobAgentID := uuid.New().String() + deploymentID := uuid.New().String() + environmentID := uuid.New().String() + resourceID := uuid.New().String() + + engine := integration.NewTestWorkspace(t, + integration.WithJobAgent( + integration.JobAgentID(jobAgentID), + integration.JobAgentName("test-agent"), + ), + integration.WithSystem( + integration.SystemName("test-system"), + integration.WithDeployment( + integration.DeploymentID(deploymentID), + integration.DeploymentName("api-service"), + integration.DeploymentJobAgent(jobAgentID), + integration.DeploymentCelResourceSelector("true"), + ), + integration.WithEnvironment( + integration.EnvironmentID(environmentID), + integration.EnvironmentName("production"), + integration.EnvironmentCelResourceSelector("true"), + ), + ), + integration.WithResource( + integration.ResourceID(resourceID), + integration.ResourceName("server-1"), + ), + integration.WithPolicy( + integration.PolicyName("rollback-policy"), + integration.WithPolicyTargetSelector( + integration.PolicyTargetCelDeploymentSelector("true"), + integration.PolicyTargetCelEnvironmentSelector("true"), + integration.PolicyTargetCelResourceSelector("true"), + ), + integration.WithPolicyRule( + integration.WithRuleRollbackOnVerificationFailure(), + ), + ), + ) + + ctx := context.Background() + + // Step 1: Deploy v1.0.0 and mark successful + dv1 := c.NewDeploymentVersion() + dv1.DeploymentId = deploymentID + dv1.Tag = "v1.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv1) + + pendingJobs := engine.Workspace().Jobs().GetPending() + job1 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job1, jobAgentID, oapi.JobStatusSuccessful)) + + // Step 2: Deploy v2.0.0 and mark successful (job-level) + dv2 := c.NewDeploymentVersion() + dv2.DeploymentId = deploymentID + dv2.Tag = "v2.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv2) + + pendingJobs = engine.Workspace().Jobs().GetPending() + job2 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job2, jobAgentID, oapi.JobStatusSuccessful)) + + // Verify no pending jobs after job2 succeeds + pendingJobs = engine.Workspace().Jobs().GetPending() + assert.Len(t, pendingJobs, 0) + + // Step 3: Start and fail verification for v2.0.0 + metricProvider := oapi.MetricProvider{} + _ = metricProvider.FromSleepMetricProvider(oapi.SleepMetricProvider{ + Type: oapi.Sleep, + DurationSeconds: 0, + }) + + // This condition will always fail (result.ok is undefined for sleep) + failureCondition := "result.ok == true" + metric := oapi.VerificationMetricSpec{ + Name: "always-fail", + IntervalSeconds: 1, + Count: 1, + FailureCondition: &failureCondition, + Provider: metricProvider, + } + + err := engine.Workspace().ReleaseManager().VerificationManager().StartVerification(ctx, job2, []oapi.VerificationMetricSpec{metric}) + require.NoError(t, err) + + // Wait for verification to complete and fail + time.Sleep(3 * time.Second) + + // Step 4: Verify rollback job was created for v1.0.0 + pendingJobs = engine.Workspace().Jobs().GetPending() + require.Len(t, pendingJobs, 1, "expected rollback job after verification failure") + + rollbackJob := getFirstJob(pendingJobs) + rollbackRelease, _ := engine.Workspace().Releases().Get(rollbackJob.ReleaseId) + assert.Equal(t, "v1.0.0", rollbackRelease.Version.Tag, "rollback should target v1.0.0") +} + +// TestEngine_Rollback_OnVerificationFailure_NotConfigured tests that verification failure +// does NOT trigger rollback when onVerificationFailure is not configured. +func TestEngine_Rollback_OnVerificationFailure_NotConfigured(t *testing.T) { + jobAgentID := uuid.New().String() + deploymentID := uuid.New().String() + environmentID := uuid.New().String() + resourceID := uuid.New().String() + + engine := integration.NewTestWorkspace(t, + integration.WithJobAgent( + integration.JobAgentID(jobAgentID), + integration.JobAgentName("test-agent"), + ), + integration.WithSystem( + integration.SystemName("test-system"), + integration.WithDeployment( + integration.DeploymentID(deploymentID), + integration.DeploymentName("api-service"), + integration.DeploymentJobAgent(jobAgentID), + integration.DeploymentCelResourceSelector("true"), + ), + integration.WithEnvironment( + integration.EnvironmentID(environmentID), + integration.EnvironmentName("production"), + integration.EnvironmentCelResourceSelector("true"), + ), + ), + integration.WithResource( + integration.ResourceID(resourceID), + integration.ResourceName("server-1"), + ), + integration.WithPolicy( + integration.PolicyName("rollback-policy"), + integration.WithPolicyTargetSelector( + integration.PolicyTargetCelDeploymentSelector("true"), + integration.PolicyTargetCelEnvironmentSelector("true"), + integration.PolicyTargetCelResourceSelector("true"), + ), + integration.WithPolicyRule( + // Only rollback on job failure, NOT on verification failure + integration.WithRuleRollbackOnJobStatuses(oapi.JobStatusFailure), + ), + ), + ) + + ctx := context.Background() + + // Step 1: Deploy v1.0.0 and mark successful + dv1 := c.NewDeploymentVersion() + dv1.DeploymentId = deploymentID + dv1.Tag = "v1.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv1) + + pendingJobs := engine.Workspace().Jobs().GetPending() + job1 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job1, jobAgentID, oapi.JobStatusSuccessful)) + + // Step 2: Deploy v2.0.0 and mark successful (job-level) + dv2 := c.NewDeploymentVersion() + dv2.DeploymentId = deploymentID + dv2.Tag = "v2.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv2) + + pendingJobs = engine.Workspace().Jobs().GetPending() + job2 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job2, jobAgentID, oapi.JobStatusSuccessful)) + + // Step 3: Start and fail verification for v2.0.0 + metricProvider := oapi.MetricProvider{} + _ = metricProvider.FromSleepMetricProvider(oapi.SleepMetricProvider{ + Type: oapi.Sleep, + DurationSeconds: 0, + }) + + failureCondition := "result.ok == true" + metric := oapi.VerificationMetricSpec{ + Name: "always-fail", + IntervalSeconds: 1, + Count: 1, + SuccessCondition: failureCondition, + Provider: metricProvider, + } + + err := engine.Workspace().ReleaseManager().VerificationManager().StartVerification(ctx, job2, []oapi.VerificationMetricSpec{metric}) + require.NoError(t, err) + + // Wait for verification to complete + time.Sleep(3 * time.Second) + + // Step 4: Verify NO rollback job was created (onVerificationFailure not configured) + pendingJobs = engine.Workspace().Jobs().GetPending() + assert.Len(t, pendingJobs, 0, "should not create rollback job when onVerificationFailure is not configured") +} + +// TestEngine_Rollback_OnVerificationFailure_NoPreviousRelease tests that verification +// failure handles the case where there's no previous release gracefully. +func TestEngine_Rollback_OnVerificationFailure_NoPreviousRelease(t *testing.T) { + jobAgentID := uuid.New().String() + deploymentID := uuid.New().String() + environmentID := uuid.New().String() + resourceID := uuid.New().String() + + engine := integration.NewTestWorkspace(t, + integration.WithJobAgent( + integration.JobAgentID(jobAgentID), + integration.JobAgentName("test-agent"), + ), + integration.WithSystem( + integration.SystemName("test-system"), + integration.WithDeployment( + integration.DeploymentID(deploymentID), + integration.DeploymentName("api-service"), + integration.DeploymentJobAgent(jobAgentID), + integration.DeploymentCelResourceSelector("true"), + ), + integration.WithEnvironment( + integration.EnvironmentID(environmentID), + integration.EnvironmentName("production"), + integration.EnvironmentCelResourceSelector("true"), + ), + ), + integration.WithResource( + integration.ResourceID(resourceID), + integration.ResourceName("server-1"), + ), + integration.WithPolicy( + integration.PolicyName("rollback-policy"), + integration.WithPolicyTargetSelector( + integration.PolicyTargetCelDeploymentSelector("true"), + integration.PolicyTargetCelEnvironmentSelector("true"), + integration.PolicyTargetCelResourceSelector("true"), + ), + integration.WithPolicyRule( + integration.WithRuleRollbackOnVerificationFailure(), + ), + ), + ) + + ctx := context.Background() + + // First ever deployment - deploy v1.0.0 and mark successful + dv1 := c.NewDeploymentVersion() + dv1.DeploymentId = deploymentID + dv1.Tag = "v1.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv1) + + pendingJobs := engine.Workspace().Jobs().GetPending() + job1 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job1, jobAgentID, oapi.JobStatusSuccessful)) + + // Start and fail verification for v1.0.0 + metricProvider := oapi.MetricProvider{} + _ = metricProvider.FromSleepMetricProvider(oapi.SleepMetricProvider{ + Type: oapi.Sleep, + DurationSeconds: 0, + }) + + failureCondition := "result.ok == true" + metric := oapi.VerificationMetricSpec{ + Name: "always-fail", + IntervalSeconds: 1, + Count: 1, + SuccessCondition: failureCondition, + Provider: metricProvider, + } + + err := engine.Workspace().ReleaseManager().VerificationManager().StartVerification(ctx, job1, []oapi.VerificationMetricSpec{metric}) + require.NoError(t, err) + + // Wait for verification to complete + time.Sleep(3 * time.Second) + + // Should NOT create a rollback job (nothing to roll back to) + pendingJobs = engine.Workspace().Jobs().GetPending() + assert.Len(t, pendingJobs, 0, "should not create rollback job when no previous release exists") +} + +// TestEngine_Rollback_BothJobAndVerificationConfigured tests a policy that has both +// job status rollback AND verification failure rollback configured. +func TestEngine_Rollback_BothJobAndVerificationConfigured(t *testing.T) { + jobAgentID := uuid.New().String() + deploymentID := uuid.New().String() + environmentID := uuid.New().String() + resourceID := uuid.New().String() + + engine := integration.NewTestWorkspace(t, + integration.WithJobAgent( + integration.JobAgentID(jobAgentID), + integration.JobAgentName("test-agent"), + ), + integration.WithSystem( + integration.SystemName("test-system"), + integration.WithDeployment( + integration.DeploymentID(deploymentID), + integration.DeploymentName("api-service"), + integration.DeploymentJobAgent(jobAgentID), + integration.DeploymentCelResourceSelector("true"), + ), + integration.WithEnvironment( + integration.EnvironmentID(environmentID), + integration.EnvironmentName("production"), + integration.EnvironmentCelResourceSelector("true"), + ), + ), + integration.WithResource( + integration.ResourceID(resourceID), + integration.ResourceName("server-1"), + ), + integration.WithPolicy( + integration.PolicyName("rollback-policy"), + integration.WithPolicyTargetSelector( + integration.PolicyTargetCelDeploymentSelector("true"), + integration.PolicyTargetCelEnvironmentSelector("true"), + integration.PolicyTargetCelResourceSelector("true"), + ), + integration.WithPolicyRule( + // Both job and verification rollback configured + integration.WithRuleRollback( + []oapi.JobStatus{oapi.JobStatusFailure}, + true, // onVerificationFailure + ), + ), + ), + ) + + ctx := context.Background() + + // Deploy v1.0.0 and mark successful + dv1 := c.NewDeploymentVersion() + dv1.DeploymentId = deploymentID + dv1.Tag = "v1.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv1) + + pendingJobs := engine.Workspace().Jobs().GetPending() + job1 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job1, jobAgentID, oapi.JobStatusSuccessful)) + + // Deploy v2.0.0 and make it fail at job level + dv2 := c.NewDeploymentVersion() + dv2.DeploymentId = deploymentID + dv2.Tag = "v2.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv2) + + pendingJobs = engine.Workspace().Jobs().GetPending() + job2 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job2, jobAgentID, oapi.JobStatusFailure)) + + // Verify rollback happened due to job failure + pendingJobs = engine.Workspace().Jobs().GetPending() + require.Len(t, pendingJobs, 1, "expected rollback job from job failure") + + rollbackJob := getFirstJob(pendingJobs) + rollbackRelease, _ := engine.Workspace().Releases().Get(rollbackJob.ReleaseId) + assert.Equal(t, "v1.0.0", rollbackRelease.Version.Tag) +} + +// TestEngine_Rollback_NoRollbackWhenSameRelease tests that rollback doesn't happen +// when the current release is the same as the failed release. +func TestEngine_Rollback_NoRollbackWhenSameRelease(t *testing.T) { + jobAgentID := uuid.New().String() + deploymentID := uuid.New().String() + environmentID := uuid.New().String() + resourceID := uuid.New().String() + + engine := integration.NewTestWorkspace(t, + integration.WithJobAgent( + integration.JobAgentID(jobAgentID), + integration.JobAgentName("test-agent"), + ), + integration.WithSystem( + integration.SystemName("test-system"), + integration.WithDeployment( + integration.DeploymentID(deploymentID), + integration.DeploymentName("api-service"), + integration.DeploymentJobAgent(jobAgentID), + integration.DeploymentCelResourceSelector("true"), + ), + integration.WithEnvironment( + integration.EnvironmentID(environmentID), + integration.EnvironmentName("production"), + integration.EnvironmentCelResourceSelector("true"), + ), + ), + integration.WithResource( + integration.ResourceID(resourceID), + integration.ResourceName("server-1"), + ), + integration.WithPolicy( + integration.PolicyName("retry-policy"), + integration.WithPolicyTargetSelector( + integration.PolicyTargetCelDeploymentSelector("true"), + integration.PolicyTargetCelEnvironmentSelector("true"), + integration.PolicyTargetCelResourceSelector("true"), + ), + integration.WithPolicyRule( + integration.WithRuleRetry(1, nil), // Allow 1 retry + ), + ), + integration.WithPolicy( + integration.PolicyName("rollback-policy"), + integration.WithPolicyTargetSelector( + integration.PolicyTargetCelDeploymentSelector("true"), + integration.PolicyTargetCelEnvironmentSelector("true"), + integration.PolicyTargetCelResourceSelector("true"), + ), + integration.WithPolicyRule( + integration.WithRuleRollbackOnJobStatuses(oapi.JobStatusFailure), + ), + ), + ) + + ctx := context.Background() + + // Get the resource for triggering reconcile + resources := engine.Workspace().Resources().Items() + var r1 *oapi.Resource + for _, res := range resources { + r1 = res + break + } + + // Deploy v1.0.0 and mark successful + dv1 := c.NewDeploymentVersion() + dv1.DeploymentId = deploymentID + dv1.Tag = "v1.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv1) + + pendingJobs := engine.Workspace().Jobs().GetPending() + job1 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job1, jobAgentID, oapi.JobStatusSuccessful)) + + // Deploy v2.0.0 and mark successful (this becomes "current release") + dv2 := c.NewDeploymentVersion() + dv2.DeploymentId = deploymentID + dv2.Tag = "v2.0.0" + engine.PushEvent(ctx, handler.DeploymentVersionCreate, dv2) + + pendingJobs = engine.Workspace().Jobs().GetPending() + job2 := getFirstJob(pendingJobs) + + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(job2, jobAgentID, oapi.JobStatusSuccessful)) + + // Now trigger a retry on v2.0.0 (same release) + engine.PushEvent(ctx, handler.ResourceUpdate, r1) + + pendingJobs = engine.Workspace().Jobs().GetPending() + // Due to retry policy, we might get a retry job. If a retry job is created and fails, + // it shouldn't rollback to itself + if len(pendingJobs) > 0 { + retryJob := getFirstJob(pendingJobs) + retryRelease, _ := engine.Workspace().Releases().Get(retryJob.ReleaseId) + + // If it's a retry of v2.0.0, fail it + if retryRelease.Version.Tag == "v2.0.0" { + engine.PushEvent(ctx, handler.JobUpdate, createJobUpdateEvent(retryJob, jobAgentID, oapi.JobStatusFailure)) + + // Should NOT rollback to v2.0.0 (the same release) + // Instead, should rollback to v1.0.0 + pendingJobs = engine.Workspace().Jobs().GetPending() + if len(pendingJobs) > 0 { + rollbackJob := getFirstJob(pendingJobs) + rollbackRelease, _ := engine.Workspace().Releases().Get(rollbackJob.ReleaseId) + // Rollback should be to v1.0.0, not v2.0.0 + assert.Equal(t, "v1.0.0", rollbackRelease.Version.Tag, "rollback should target previous successful release, not the same release") + } + } + } +} + +// createJobUpdateEvent creates a JobUpdateEvent for use in PushEvent. +// This creates a proper update event with the required fields. +func createJobUpdateEvent(job *oapi.Job, agentID string, status oapi.JobStatus) oapi.JobUpdateEvent { + completedAt := time.Now() + return oapi.JobUpdateEvent{ + Id: &job.Id, + AgentId: &agentID, + Job: oapi.Job{ + Id: job.Id, + Status: status, + CompletedAt: &completedAt, + }, + FieldsToUpdate: &[]oapi.JobUpdateEventFieldsToUpdate{ + oapi.JobUpdateEventFieldsToUpdateStatus, + oapi.JobUpdateEventFieldsToUpdateCompletedAt, + }, + } +} diff --git a/apps/workspace-engine/test/integration/opts.go b/apps/workspace-engine/test/integration/opts.go index 95243868b..f847c8df1 100644 --- a/apps/workspace-engine/test/integration/opts.go +++ b/apps/workspace-engine/test/integration/opts.go @@ -898,6 +898,46 @@ func WithRuleVersionCooldown(intervalSeconds int32) PolicyRuleOption { } } +// ===== RollbackRule Options ===== + +// WithRuleRollback configures a rollback rule that triggers rollback to the previous +// release when specified conditions are met. +func WithRuleRollback(rollBackJobStatuses []oapi.JobStatus, onVerificationFailure bool) PolicyRuleOption { + return func(_ *TestWorkspace, r *oapi.PolicyRule) error { + r.Rollback = &oapi.RollbackRule{} + if len(rollBackJobStatuses) > 0 { + r.Rollback.OnJobStatuses = &rollBackJobStatuses + } + if onVerificationFailure { + r.Rollback.OnVerificationFailure = &onVerificationFailure + } + return nil + } +} + +// WithRuleRollbackOnJobStatuses configures a rollback rule that triggers rollback +// when a job completes with one of the specified statuses. +func WithRuleRollbackOnJobStatuses(statuses ...oapi.JobStatus) PolicyRuleOption { + return func(_ *TestWorkspace, r *oapi.PolicyRule) error { + r.Rollback = &oapi.RollbackRule{ + OnJobStatuses: &statuses, + } + return nil + } +} + +// WithRuleRollbackOnVerificationFailure configures a rollback rule that triggers +// rollback when verification fails. +func WithRuleRollbackOnVerificationFailure() PolicyRuleOption { + return func(_ *TestWorkspace, r *oapi.PolicyRule) error { + onVerificationFailure := true + r.Rollback = &oapi.RollbackRule{ + OnVerificationFailure: &onVerificationFailure, + } + return nil + } +} + // ===== DeploymentDependencyRule Options ===== type DeploymentDependencyRuleOption func(*TestWorkspace, *oapi.DeploymentDependencyRule) error