Skip to content

Commit b28d750

Browse files
matheuscscpStephen Solka
andcommitted
Add feature gate to cancel health checks on new revisions
Signed-off-by: Matheus Pimenta <[email protected]> Co-authored-by: Stephen Solka <[email protected]>
1 parent bce9a08 commit b28d750

File tree

6 files changed

+246
-32
lines changed

6 files changed

+246
-32
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ require (
2929
github.com/fluxcd/pkg/http/fetch v0.19.0
3030
github.com/fluxcd/pkg/kustomize v1.22.0
3131
github.com/fluxcd/pkg/runtime v0.86.0
32-
github.com/fluxcd/pkg/ssa v0.57.0
32+
github.com/fluxcd/pkg/ssa v0.58.0
3333
github.com/fluxcd/pkg/tar v0.14.0
3434
github.com/fluxcd/pkg/testserver v0.13.0
3535
github.com/fluxcd/source-controller/api v1.7.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,8 +211,8 @@ github.com/fluxcd/pkg/runtime v0.86.0 h1:q7aBSerJwt0N9hpurPVElG+HWpVhZcs6t96bcNQ
211211
github.com/fluxcd/pkg/runtime v0.86.0/go.mod h1:Wt9mUzQgMPQMu2D/wKl5pG4zh5vu/tfF5wq9pPobxOQ=
212212
github.com/fluxcd/pkg/sourceignore v0.14.0 h1:ZiZzbXtXb/Qp7I7JCStsxOlX8ri8rWwCvmvIrJ0UzQQ=
213213
github.com/fluxcd/pkg/sourceignore v0.14.0/go.mod h1:E3zKvyTyB+oQKqm/2I/jS6Rrt3B7fNuig/4bY2vi3bg=
214-
github.com/fluxcd/pkg/ssa v0.57.0 h1:G2cKyeyOtEdOdLeMBWZe0XT+J0rBWSBzy9xln2myTaI=
215-
github.com/fluxcd/pkg/ssa v0.57.0/go.mod h1:iN/QDMqdJaVXKkqwbXqGa4PyWQwtyIy2WkeM2+9kfXA=
214+
github.com/fluxcd/pkg/ssa v0.58.0 h1:W7m2LQFsZxPN9nn3lfGVDwXsZnIgCWWJ/+/K5hpzW+k=
215+
github.com/fluxcd/pkg/ssa v0.58.0/go.mod h1:iN/QDMqdJaVXKkqwbXqGa4PyWQwtyIy2WkeM2+9kfXA=
216216
github.com/fluxcd/pkg/tar v0.14.0 h1:9Gku8FIvPt2bixKldZnzXJ/t+7SloxePlzyVGOK8GVQ=
217217
github.com/fluxcd/pkg/tar v0.14.0/go.mod h1:+rOWYk93qLEJ8WwmkvJOkB8i0dna1mrwJFybE8i9Udo=
218218
github.com/fluxcd/pkg/testserver v0.13.0 h1:xEpBcEYtD7bwvZ+i0ZmChxKkDo/wfQEV3xmnzVybSSg=

internal/controller/kustomization_controller.go

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -114,11 +114,12 @@ type KustomizationReconciler struct {
114114

115115
// Feature gates
116116

117-
AdditiveCELDependencyCheck bool
118-
AllowExternalArtifact bool
119-
FailFast bool
120-
GroupChangeLog bool
121-
StrictSubstitutions bool
117+
AdditiveCELDependencyCheck bool
118+
AllowExternalArtifact bool
119+
CancelHealthCheckOnNewRevision bool
120+
FailFast bool
121+
GroupChangeLog bool
122+
StrictSubstitutions bool
122123
}
123124

124125
func (r *KustomizationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) {
@@ -983,7 +984,39 @@ func (r *KustomizationReconciler) checkHealth(ctx context.Context,
983984
}
984985

985986
// Check the health with a default timeout of 30sec shorter than the reconciliation interval.
986-
if err := manager.WaitForSet(toCheck, ssa.WaitOptions{
987+
healthCtx := ctx
988+
if r.CancelHealthCheckOnNewRevision {
989+
// Create a cancellable context for health checks that monitors for new revisions
990+
var cancel context.CancelFunc
991+
healthCtx, cancel = context.WithCancel(ctx)
992+
defer cancel()
993+
994+
// Start monitoring for new revisions to allow early cancellation
995+
go func() {
996+
ticker := time.NewTicker(5 * time.Second)
997+
defer ticker.Stop()
998+
999+
for {
1000+
select {
1001+
case <-healthCtx.Done():
1002+
return
1003+
case <-ticker.C:
1004+
// Get the latest source artifact
1005+
latestSrc, err := r.getSource(ctx, obj)
1006+
if err == nil && latestSrc.GetArtifact() != nil {
1007+
if newRevision := latestSrc.GetArtifact().Revision; newRevision != revision {
1008+
const msg = "New revision detected during health check, cancelling"
1009+
r.event(obj, revision, originRevision, eventv1.EventSeverityInfo, msg, nil)
1010+
ctrl.LoggerFrom(ctx).Info(msg, "current", revision, "new", newRevision)
1011+
cancel()
1012+
return
1013+
}
1014+
}
1015+
}
1016+
}
1017+
}()
1018+
}
1019+
if err := manager.WaitForSetWithContext(healthCtx, toCheck, ssa.WaitOptions{
9871020
Interval: 5 * time.Second,
9881021
Timeout: obj.GetTimeout(),
9891022
FailFast: r.FailFast,

internal/controller/kustomization_wait_test.go

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -466,3 +466,165 @@ func TestKustomizationReconciler_RESTMapper(t *testing.T) {
466466
g.Expect(err).To(HaveOccurred())
467467
})
468468
}
469+
470+
func TestKustomizationReconciler_CancelHealthCheckOnNewRevision(t *testing.T) {
471+
g := NewWithT(t)
472+
id := "cancel-" + randStringRunes(5)
473+
resultK := &kustomizev1.Kustomization{}
474+
timeout := 60 * time.Second
475+
476+
reconciler.CancelHealthCheckOnNewRevision = true
477+
t.Cleanup(func() { reconciler.CancelHealthCheckOnNewRevision = false })
478+
479+
err := createNamespace(id)
480+
g.Expect(err).NotTo(HaveOccurred(), "failed to create test namespace")
481+
482+
err = createKubeConfigSecret(id)
483+
g.Expect(err).NotTo(HaveOccurred(), "failed to create kubeconfig secret")
484+
485+
// Create initial successful manifests
486+
successManifests := []testserver.File{
487+
{
488+
Name: "configmap.yaml",
489+
Body: fmt.Sprintf(`apiVersion: v1
490+
kind: ConfigMap
491+
metadata:
492+
name: test-config
493+
namespace: %s
494+
data:
495+
foo: bar`, id),
496+
},
497+
}
498+
artifact, err := testServer.ArtifactFromFiles(successManifests)
499+
g.Expect(err).ToNot(HaveOccurred())
500+
501+
repositoryName := types.NamespacedName{
502+
Name: fmt.Sprintf("cancel-%s", randStringRunes(5)),
503+
Namespace: id,
504+
}
505+
506+
err = applyGitRepository(repositoryName, artifact, "main/"+artifact)
507+
g.Expect(err).NotTo(HaveOccurred())
508+
509+
kustomization := &kustomizev1.Kustomization{}
510+
kustomization.Name = id
511+
kustomization.Namespace = id
512+
kustomization.Spec = kustomizev1.KustomizationSpec{
513+
Interval: metav1.Duration{Duration: 10 * time.Minute},
514+
Path: "./",
515+
Wait: true,
516+
Timeout: &metav1.Duration{Duration: 5 * time.Minute},
517+
SourceRef: kustomizev1.CrossNamespaceSourceReference{
518+
Name: repositoryName.Name,
519+
Kind: sourcev1.GitRepositoryKind,
520+
Namespace: id,
521+
},
522+
KubeConfig: &meta.KubeConfigReference{
523+
SecretRef: &meta.SecretKeyReference{
524+
Name: "kubeconfig",
525+
},
526+
},
527+
}
528+
529+
err = k8sClient.Create(context.Background(), kustomization)
530+
g.Expect(err).NotTo(HaveOccurred())
531+
532+
// Wait for initial reconciliation to succeed
533+
g.Eventually(func() bool {
534+
_ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
535+
return conditions.IsReady(resultK)
536+
}, timeout, time.Second).Should(BeTrue())
537+
538+
// Create failing manifests (deployment with bad image that will timeout)
539+
failingManifests := []testserver.File{
540+
{
541+
Name: "deployment.yaml",
542+
Body: fmt.Sprintf(`apiVersion: apps/v1
543+
kind: Deployment
544+
metadata:
545+
name: failing-deployment
546+
namespace: %s
547+
spec:
548+
replicas: 1
549+
selector:
550+
matchLabels:
551+
app: failing-app
552+
template:
553+
metadata:
554+
labels:
555+
app: failing-app
556+
spec:
557+
containers:
558+
- name: app
559+
image: nonexistent.registry/badimage:latest
560+
ports:
561+
- containerPort: 8080`, id),
562+
},
563+
}
564+
565+
// Apply failing revision
566+
failingArtifact, err := testServer.ArtifactFromFiles(failingManifests)
567+
g.Expect(err).ToNot(HaveOccurred())
568+
569+
err = applyGitRepository(repositoryName, failingArtifact, "main/"+failingArtifact)
570+
g.Expect(err).NotTo(HaveOccurred())
571+
572+
// Wait for reconciliation to start on failing revision
573+
g.Eventually(func() bool {
574+
_ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
575+
return resultK.Status.LastAttemptedRevision == "main/"+failingArtifact
576+
}, timeout, time.Second).Should(BeTrue())
577+
578+
// Now quickly apply a fixed revision while health check should be in progress
579+
fixedManifests := []testserver.File{
580+
{
581+
Name: "deployment.yaml",
582+
Body: fmt.Sprintf(`apiVersion: apps/v1
583+
kind: Deployment
584+
metadata:
585+
name: working-deployment
586+
namespace: %s
587+
spec:
588+
replicas: 1
589+
selector:
590+
matchLabels:
591+
app: working-app
592+
template:
593+
metadata:
594+
labels:
595+
app: working-app
596+
spec:
597+
containers:
598+
- name: app
599+
image: nginx:latest
600+
ports:
601+
- containerPort: 80`, id),
602+
},
603+
}
604+
605+
fixedArtifact, err := testServer.ArtifactFromFiles(fixedManifests)
606+
g.Expect(err).ToNot(HaveOccurred())
607+
608+
// Apply the fixed revision shortly after the failing one
609+
time.Sleep(2 * time.Second) // Give some time for health check to start
610+
err = applyGitRepository(repositoryName, fixedArtifact, "main/"+fixedArtifact)
611+
g.Expect(err).NotTo(HaveOccurred())
612+
613+
// The key test: verify that the fixed revision gets attempted
614+
// and that the health check cancellation worked
615+
g.Eventually(func() bool {
616+
_ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
617+
return resultK.Status.LastAttemptedRevision == "main/"+fixedArtifact
618+
}, timeout, time.Second).Should(BeTrue())
619+
620+
// Check cancellation event was emitted
621+
events := getEvents(resultK.GetName(), nil)
622+
var found bool
623+
for _, e := range events {
624+
if e.Message == "New revision detected during health check, cancelling" {
625+
found = true
626+
break
627+
}
628+
}
629+
g.Expect(found).To(BeTrue(), "did not find event for health check cancellation")
630+
}

internal/features/features.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,15 @@ const (
5959

6060
// ExternalArtifact controls whether the ExternalArtifact source type is enabled.
6161
ExternalArtifact = "ExternalArtifact"
62+
63+
// CancelHealthCheckOnNewRevision controls whether ongoing health checks
64+
// should be cancelled when a new source revision becomes available.
65+
//
66+
// When enabled, if a new revision is detected while waiting for resources
67+
// to become ready, the current health check will be cancelled to allow
68+
// immediate processing of the new revision. This can help avoid getting
69+
// stuck on failing deployments when fixes are available.
70+
CancelHealthCheckOnNewRevision = "CancelHealthCheckOnNewRevision"
6271
)
6372

6473
var features = map[string]bool{
@@ -83,6 +92,9 @@ var features = map[string]bool{
8392
// ExternalArtifact
8493
// opt-in from v1.7
8594
ExternalArtifact: false,
95+
// CancelHealthCheckOnNewRevision
96+
// opt-in from v1.7
97+
CancelHealthCheckOnNewRevision: false,
8698
}
8799

88100
func init() {

main.go

Lines changed: 30 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,12 @@ func main() {
293293
os.Exit(1)
294294
}
295295

296+
cancelHealthCheckOnNewRevision, err := features.Enabled(features.CancelHealthCheckOnNewRevision)
297+
if err != nil {
298+
setupLog.Error(err, "unable to check feature gate "+features.CancelHealthCheckOnNewRevision)
299+
os.Exit(1)
300+
}
301+
296302
var tokenCache *pkgcache.TokenCache
297303
if tokenCacheOptions.MaxSize > 0 {
298304
var err error
@@ -307,29 +313,30 @@ func main() {
307313
}
308314

309315
if err = (&controller.KustomizationReconciler{
310-
AdditiveCELDependencyCheck: additiveCELDependencyCheck,
311-
AllowExternalArtifact: allowExternalArtifact,
312-
APIReader: mgr.GetAPIReader(),
313-
ArtifactFetchRetries: httpRetry,
314-
Client: mgr.GetClient(),
315-
ClusterReader: clusterReader,
316-
ConcurrentSSA: concurrentSSA,
317-
ControllerName: controllerName,
318-
DefaultServiceAccount: defaultServiceAccount,
319-
DependencyRequeueInterval: requeueDependency,
320-
DisallowedFieldManagers: disallowedFieldManagers,
321-
EventRecorder: eventRecorder,
322-
FailFast: failFast,
323-
GroupChangeLog: groupChangeLog,
324-
KubeConfigOpts: kubeConfigOpts,
325-
Mapper: restMapper,
326-
Metrics: metricsH,
327-
NoCrossNamespaceRefs: aclOptions.NoCrossNamespaceRefs,
328-
NoRemoteBases: noRemoteBases,
329-
SOPSAgeSecret: sopsAgeSecret,
330-
StatusManager: fmt.Sprintf("gotk-%s", controllerName),
331-
StrictSubstitutions: strictSubstitutions,
332-
TokenCache: tokenCache,
316+
AdditiveCELDependencyCheck: additiveCELDependencyCheck,
317+
AllowExternalArtifact: allowExternalArtifact,
318+
CancelHealthCheckOnNewRevision: cancelHealthCheckOnNewRevision,
319+
APIReader: mgr.GetAPIReader(),
320+
ArtifactFetchRetries: httpRetry,
321+
Client: mgr.GetClient(),
322+
ClusterReader: clusterReader,
323+
ConcurrentSSA: concurrentSSA,
324+
ControllerName: controllerName,
325+
DefaultServiceAccount: defaultServiceAccount,
326+
DependencyRequeueInterval: requeueDependency,
327+
DisallowedFieldManagers: disallowedFieldManagers,
328+
EventRecorder: eventRecorder,
329+
FailFast: failFast,
330+
GroupChangeLog: groupChangeLog,
331+
KubeConfigOpts: kubeConfigOpts,
332+
Mapper: restMapper,
333+
Metrics: metricsH,
334+
NoCrossNamespaceRefs: aclOptions.NoCrossNamespaceRefs,
335+
NoRemoteBases: noRemoteBases,
336+
SOPSAgeSecret: sopsAgeSecret,
337+
StatusManager: fmt.Sprintf("gotk-%s", controllerName),
338+
StrictSubstitutions: strictSubstitutions,
339+
TokenCache: tokenCache,
333340
}).SetupWithManager(ctx, mgr, controller.KustomizationReconcilerOptions{
334341
RateLimiter: runtimeCtrl.GetRateLimiter(rateLimiterOptions),
335342
WatchConfigsPredicate: watchConfigsPredicate,

0 commit comments

Comments
 (0)