Skip to content

Commit dbfd59a

Browse files
authored
Improve bundle and GitRepo status error reporting (#4167)
When a bundle reconcile attempt fails, the reconciler now does one of the following: * log the error and retry with exponential backoff, when dealing with a retryable error such as a failure to perform a CRUD operation on a Kubernetes resource * skip retries and report the error through the bundle's status, setting its `Ready` condition to `False`, when the error is non-retryable (e.g. configuration error). In such cases, the reconciler newly makes use of a `TerminalError` [1], indicating that no retries should take place. Distinguishing retryable errors from non-retryable ones should both optimise the number of reconciles run for a given resource, and make non-retryable errors more visible to users, as they would typically be especially interested in them, since they may well be able to act on them. This happens through new, common logic living in `reconciler/error_handling.go`, encapsulating propagation of errors through status conditions, and distinctions between retryable and non-retryable errors. To illustrate the potential of this new logic, the gitOps controller partly uses this new logic when reconciling GitRepos, although most of the work to improve error reporting in that reconciler remains to be done. [1]: https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile#TypedReconciler
1 parent 2a5b433 commit dbfd59a

File tree

8 files changed

+1299
-197
lines changed

8 files changed

+1299
-197
lines changed

internal/cmd/controller/gitops/reconciler/gitjob_controller.go

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ import (
1212
"github.com/go-logr/logr"
1313
"github.com/reugn/go-quartz/quartz"
1414

15-
fleetutil "github.com/rancher/fleet/internal/cmd/controller/errorutil"
1615
"github.com/rancher/fleet/internal/cmd/controller/finalize"
1716
"github.com/rancher/fleet/internal/cmd/controller/imagescan"
1817
"github.com/rancher/fleet/internal/cmd/controller/reconciler"
@@ -28,7 +27,6 @@ import (
2827

2928
batchv1 "k8s.io/api/batch/v1"
3029
corev1 "k8s.io/api/core/v1"
31-
"k8s.io/apimachinery/pkg/api/equality"
3230
apierrors "k8s.io/apimachinery/pkg/api/errors"
3331
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3432
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
@@ -233,10 +231,10 @@ func (r *GitJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
233231

234232
res, err := r.manageGitJob(ctx, logger, gitrepo, oldCommit, repoPolled)
235233
if err != nil || res.RequeueAfter > 0 {
236-
return res, err
234+
return res, updateErrorStatus(ctx, r.Client, req.NamespacedName, gitrepo.Status, err)
237235
}
238236

239-
setAcceptedCondition(&gitrepo.Status, nil)
237+
reconciler.SetCondition(v1alpha1.GitRepoAcceptedCondition, &gitrepo.Status, nil)
240238

241239
err = updateStatus(ctx, r.Client, req.NamespacedName, gitrepo.Status)
242240
if err != nil {
@@ -262,7 +260,6 @@ func monitorLatestCommit(obj metav1.Object, fetch func() (string, error)) (strin
262260

263261
// manageGitJob is responsible for creating, updating and deleting the GitJob and setting the GitRepo's status accordingly
264262
func (r *GitJobReconciler) manageGitJob(ctx context.Context, logger logr.Logger, gitrepo *v1alpha1.GitRepo, oldCommit string, repoPolled bool) (reconcile.Result, error) {
265-
name := types.NamespacedName{Namespace: gitrepo.Namespace, Name: gitrepo.Name}
266263
var job batchv1.Job
267264
err := r.Get(ctx, types.NamespacedName{
268265
Namespace: gitrepo.Namespace,
@@ -296,7 +293,7 @@ func (r *GitJobReconciler) manageGitJob(ctx context.Context, logger logr.Logger,
296293
r.updateGenerationValuesIfNeeded(gitrepo)
297294
if err := r.validateExternalSecretExist(ctx, gitrepo); err != nil {
298295
r.Recorder.Event(gitrepo, fleetevent.Warning, "FailedValidatingSecret", err.Error())
299-
return r.result(gitrepo), updateErrorStatus(ctx, r.Client, name, gitrepo.Status, err)
296+
return r.result(gitrepo), fmt.Errorf("error validating external secrets: %w", err)
300297
}
301298
if err := r.createJobAndResources(ctx, gitrepo, logger); err != nil {
302299
gitjobsCreatedFailure.Inc(gitrepo)
@@ -319,7 +316,7 @@ func (r *GitJobReconciler) manageGitJob(ctx context.Context, logger logr.Logger,
319316
gitrepo.Status.ObservedGeneration = gitrepo.Generation
320317

321318
if err = setStatusFromGitjob(ctx, r.Client, gitrepo, &job); err != nil {
322-
return r.result(gitrepo), updateErrorStatus(ctx, r.Client, name, gitrepo.Status, err)
319+
return r.result(gitrepo), fmt.Errorf("error setting GitRepo status from git job: %w", err)
323320
}
324321

325322
return reconcile.Result{}, nil
@@ -657,19 +654,10 @@ func setStatusFromGitjob(ctx context.Context, c client.Client, gitRepo *v1alpha1
657654
return nil
658655
}
659656

660-
// setAcceptedCondition sets the condition and updates the timestamp, if the condition changed
661-
func setAcceptedCondition(status *v1alpha1.GitRepoStatus, err error) {
662-
cond := condition.Cond(v1alpha1.GitRepoAcceptedCondition)
663-
origStatus := status.DeepCopy()
664-
cond.SetError(status, "", fleetutil.IgnoreConflict(err))
665-
if !equality.Semantic.DeepEqual(origStatus, status) {
666-
cond.LastUpdated(status, time.Now().UTC().Format(time.RFC3339))
667-
}
668-
}
669-
670657
// updateErrorStatus sets the condition in the status and tries to update the resource
671658
func updateErrorStatus(ctx context.Context, c client.Client, req types.NamespacedName, status v1alpha1.GitRepoStatus, orgErr error) error {
672-
setAcceptedCondition(&status, orgErr)
659+
reconciler.SetCondition(v1alpha1.GitRepoAcceptedCondition, &status, orgErr)
660+
673661
if statusErr := updateStatus(ctx, c, req, status); statusErr != nil {
674662
merr := []error{orgErr, fmt.Errorf("failed to update the status: %w", statusErr)}
675663
return errutil.NewAggregate(merr)

internal/cmd/controller/gitops/reconciler/gitjob_test.go

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"fmt"
99
"os"
1010
"slices"
11+
"strings"
1112
"testing"
1213
"time"
1314

@@ -522,16 +523,18 @@ func TestReconcile_Error_WhenGetGitJobErrors(t *testing.T) {
522523
mockClient := mocks.NewMockK8sClient(mockCtrl)
523524
mockClient.EXPECT().List(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes().Return(nil)
524525

525-
mockClient.EXPECT().Get(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Times(1).DoAndReturn(
526-
func(ctx context.Context, req types.NamespacedName, gitrepo *fleetv1.GitRepo, opts ...interface{}) error {
527-
gitrepo.Name = gitRepo.Name
528-
gitrepo.Namespace = gitRepo.Namespace
529-
gitrepo.Spec.Repo = "repo"
530-
controllerutil.AddFinalizer(gitrepo, finalize.GitRepoFinalizer)
531-
gitrepo.Status.Commit = "dd45c7ad68e10307765104fea4a1f5997643020f"
532-
return nil
533-
},
534-
)
526+
mockClient.EXPECT().Get(gomock.Any(), gomock.Any(), gomock.AssignableToTypeOf(&fleetv1.GitRepo{}), gomock.Any()).
527+
Times(2).
528+
DoAndReturn(
529+
func(ctx context.Context, req types.NamespacedName, gitrepo *fleetv1.GitRepo, opts ...interface{}) error {
530+
gitrepo.Name = gitRepo.Name
531+
gitrepo.Namespace = gitRepo.Namespace
532+
gitrepo.Spec.Repo = "repo"
533+
controllerutil.AddFinalizer(gitrepo, finalize.GitRepoFinalizer)
534+
gitrepo.Status.Commit = "dd45c7ad68e10307765104fea4a1f5997643020f"
535+
return nil
536+
},
537+
)
535538
mockFetcher := gitmocks.NewMockGitFetcher(mockCtrl)
536539
commit := "1883fd54bc5dfd225acf02aecbb6cb8020458e33"
537540
mockFetcher.EXPECT().LatestCommit(gomock.Any(), gomock.Any(), gomock.Any()).Times(1).Return(commit, nil)
@@ -542,6 +545,20 @@ func TestReconcile_Error_WhenGetGitJobErrors(t *testing.T) {
542545
},
543546
)
544547

548+
statusClient := mocks.NewMockSubResourceWriter(mockCtrl)
549+
mockClient.EXPECT().Status().Times(1).Return(statusClient)
550+
statusClient.EXPECT().Update(gomock.Any(), gomock.Any(), gomock.Any()).Do(
551+
func(ctx context.Context, repo *fleetv1.GitRepo, opts ...interface{}) {
552+
c, found := getCondition(repo, fleetv1.GitRepoAcceptedCondition)
553+
if !found {
554+
t.Errorf("expecting to find the %s condition and could not find it.", fleetv1.GitRepoAcceptedCondition)
555+
}
556+
if !strings.Contains(c.Message, "GITJOB ERROR") {
557+
t.Errorf("expecting message containing [GITJOB ERROR] in condition, got [%s]", c.Message)
558+
}
559+
},
560+
)
561+
545562
recorderMock := mocks.NewMockEventRecorder(mockCtrl)
546563
recorderMock.EXPECT().Event(
547564
&gitRepoMatcher{gitRepo},
@@ -606,11 +623,13 @@ func TestReconcile_Error_WhenSecretDoesNotExist(t *testing.T) {
606623
mockFetcher.EXPECT().LatestCommit(gomock.Any(), gomock.Any(), gomock.Any()).Times(1).Return(commit, nil)
607624

608625
// we need to return a NotFound error, so the code tries to create it.
609-
mockClient.EXPECT().Get(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Times(1).DoAndReturn(
610-
func(ctx context.Context, req types.NamespacedName, job *batchv1.Job, opts ...interface{}) error {
611-
return apierrors.NewNotFound(schema.GroupResource{}, "TEST ERROR")
612-
},
613-
)
626+
mockClient.EXPECT().Get(gomock.Any(), gomock.Any(), gomock.AssignableToTypeOf(&batchv1.Job{}), gomock.Any()).
627+
Times(1).
628+
DoAndReturn(
629+
func(ctx context.Context, req types.NamespacedName, job *batchv1.Job, opts ...interface{}) error {
630+
return apierrors.NewNotFound(schema.GroupResource{}, "TEST ERROR")
631+
},
632+
)
614633

615634
mockClient.EXPECT().Get(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Times(1).DoAndReturn(
616635
func(ctx context.Context, req types.NamespacedName, job *corev1.Secret, opts ...interface{}) error {
@@ -640,7 +659,7 @@ func TestReconcile_Error_WhenSecretDoesNotExist(t *testing.T) {
640659
if !found {
641660
t.Errorf("expecting to find the %s condition and could not find it.", fleetv1.GitRepoAcceptedCondition)
642661
}
643-
if c.Message != "failed to look up HelmSecretNameForPaths, error: SECRET ERROR" {
662+
if c.Message != "error validating external secrets: failed to look up HelmSecretNameForPaths, error: SECRET ERROR" {
644663
t.Errorf("expecting message [failed to look up HelmSecretNameForPaths, error: SECRET ERROR] in condition, got [%s]", c.Message)
645664
}
646665
},
@@ -660,7 +679,7 @@ func TestReconcile_Error_WhenSecretDoesNotExist(t *testing.T) {
660679
if err == nil {
661680
t.Errorf("expecting an error, got nil")
662681
}
663-
if err.Error() != "failed to look up HelmSecretNameForPaths, error: SECRET ERROR" {
682+
if err.Error() != "error validating external secrets: failed to look up HelmSecretNameForPaths, error: SECRET ERROR" {
664683
t.Errorf("unexpected error %v", err)
665684
}
666685
}

0 commit comments

Comments
 (0)