Skip to content

Commit ae1d6ff

Browse files
committed
feat: Added Disruption control for Sandbox
feat: added PDB to Sandbox spec updated rbac generated file nit
1 parent 04c055c commit ae1d6ff

File tree

7 files changed

+230
-1
lines changed

7 files changed

+230
-1
lines changed

api/v1alpha1/sandbox_types.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,14 @@ type PersistentVolumeClaimTemplate struct {
9090
Spec corev1.PersistentVolumeClaimSpec `json:"spec" protobuf:"bytes,3,opt,name=spec"`
9191
}
9292

93+
// ResilienceLevel is the string-valued setting that expresses how much
// resilience is requested for a Sandbox.
type ResilienceLevel string

// ResilienceLevelHigh asks for the Sandbox to be protected from voluntary
// disruptions.
const ResilienceLevelHigh ResilienceLevel = "High"
100+
93101
// SandboxSpec defines the desired state of Sandbox
94102
type SandboxSpec struct {
95103
// The following markers will use OpenAPI v3 schema to validate the value
@@ -109,6 +117,12 @@ type SandboxSpec struct {
109117
// If a time in the past is provided, the sandbox will be deleted immediately.
110118
// +kubebuilder:validation:Format="date-time"
111119
ShutdownTime *metav1.Time `json:"shutdownTime,omitempty"`
120+
121+
// Resilience defines the desired level of resilience for the Sandbox Pod.
122+
// When set to "High", a PodDisruptionBudget is created to prevent voluntary
123+
// disruptions and an annotation is added to prevent cluster-autoscaler evictions.
124+
// +optional
125+
Resilience ResilienceLevel `json:"resilience,omitempty"`
112126
}
113127

114128
// SandboxStatus defines the observed state of Sandbox.

codegen.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@
1717
package agentsandbox
1818

1919
// Generate CRDs and RBAC rules
20-
//go:generate go tool -modfile=tools.mod sigs.k8s.io/controller-tools/cmd/controller-gen object crd:maxDescLen=0 paths="./api/..." output:crd:dir=k8s/crds output:rbac:dir=k8s rbac:roleName=agent-sandbox-controller,fileName=rbac.generated.yaml
20+
// The object generator is listed alongside rbac and crd so that deepcopy code
// for the API types is still regenerated by this directive.
//go:generate go tool -modfile=tools.mod sigs.k8s.io/controller-tools/cmd/controller-gen object rbac:roleName=agent-sandbox-controller,fileName=rbac.generated.yaml crd:maxDescLen=0 paths="./..." output:crd:dir=k8s/crds output:rbac:dir=k8s
2121
//go:generate go tool -modfile=tools.mod sigs.k8s.io/controller-tools/cmd/controller-gen object crd:maxDescLen=0 paths="./extensions/..." output:crd:dir=k8s/crds output:rbac:dir=k8s rbac:roleName=agent-sandbox-controller,fileName=rbac.generated.yaml

controllers/sandbox_controller.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,13 @@ import (
2323
"time"
2424

2525
corev1 "k8s.io/api/core/v1"
26+
policyv1 "k8s.io/api/policy/v1"
2627
k8serrors "k8s.io/apimachinery/pkg/api/errors"
2728
"k8s.io/apimachinery/pkg/api/meta"
2829
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2930
"k8s.io/apimachinery/pkg/runtime"
3031
"k8s.io/apimachinery/pkg/types"
32+
"k8s.io/apimachinery/pkg/util/intstr"
3133
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
3234
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
3335
ctrl "sigs.k8s.io/controller-runtime"
@@ -42,6 +44,8 @@ import (
4244

4345
const (
4446
sandboxLabel = "agents.x-k8s.io/sandbox-name-hash"
47+
// safeToEvictAnnotation is the cluster-autoscaler annotation set to "false" on pods that the autoscaler should not evict
48+
safeToEvictAnnotation = "cluster-autoscaler.kubernetes.io/safe-to-evict"
4549
)
4650

4751
var (
@@ -66,6 +70,7 @@ type SandboxReconciler struct {
6670
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
6771
//+kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete
6872
//+kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch;delete
73+
//+kubebuilder:rbac:groups=policy,resources=poddisruptionbudgets,verbs=get;list;watch;create;update;patch;delete
6974

7075
// Reconcile is part of the main kubernetes reconciliation loop which aims to
7176
// move the current state of the cluster closer to the desired state.
@@ -123,6 +128,10 @@ func (r *SandboxReconciler) reconcileChildResources(ctx context.Context, sandbox
123128
err := r.reconcilePVCs(ctx, sandbox)
124129
allErrors = errors.Join(allErrors, err)
125130

131+
// Reconcile PDB
132+
err = r.reconcilePDB(ctx, sandbox, nameHash)
133+
allErrors = errors.Join(allErrors, err)
134+
126135
// Reconcile Pod
127136
pod, err := r.reconcilePod(ctx, sandbox, nameHash)
128137
allErrors = errors.Join(allErrors, err)
@@ -291,6 +300,10 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
291300
annotations[k] = v
292301
}
293302

303+
if sandbox.Spec.Resilience == sandboxv1alpha1.ResilienceLevelHigh {
304+
annotations[safeToEvictAnnotation] = "false"
305+
}
306+
294307
mutatedSpec := sandbox.Spec.PodTemplate.Spec.DeepCopy()
295308

296309
for _, pvcTemplate := range sandbox.Spec.VolumeClaimTemplates {
@@ -325,6 +338,59 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
325338
return pod, nil
326339
}
327340

341+
func (r *SandboxReconciler) reconcilePDB(ctx context.Context, sandbox *sandboxv1alpha1.Sandbox, nameHash string) error {
342+
log := log.FromContext(ctx)
343+
pdb := &policyv1.PodDisruptionBudget{}
344+
pdbName := types.NamespacedName{Name: sandbox.Name, Namespace: sandbox.Namespace}
345+
346+
// If resilience is not "High", ensure the PDB is deleted.
347+
if sandbox.Spec.Resilience != sandboxv1alpha1.ResilienceLevelHigh {
348+
if err := r.Get(ctx, pdbName, pdb); err != nil {
349+
if k8serrors.IsNotFound(err) {
350+
return nil // PDB doesn't exist, which is the desired state.
351+
}
352+
return err
353+
}
354+
log.Info("Deleting PDB as resilience level is not High", "PDB.Name", pdb.Name)
355+
return r.Delete(ctx, pdb)
356+
}
357+
358+
// If resilience is "High", ensure the PDB exists.
359+
if err := r.Get(ctx, pdbName, pdb); err != nil {
360+
if !k8serrors.IsNotFound(err) {
361+
log.Error(err, "Failed to get PDB")
362+
return fmt.Errorf("PDB Get Failed: %w", err)
363+
}
364+
365+
// PDB does not exist, so create it.
366+
log.Info("Creating a new PodDisruptionBudget", "PDB.Namespace", sandbox.Namespace, "PDB.Name", sandbox.Name)
367+
minAvailable := intstr.FromInt(1) // For a single-pod Sandbox, minAvailable=1 is appropriate
368+
newPDB := &policyv1.PodDisruptionBudget{
369+
ObjectMeta: metav1.ObjectMeta{
370+
Name: sandbox.Name,
371+
Namespace: sandbox.Namespace,
372+
},
373+
Spec: policyv1.PodDisruptionBudgetSpec{
374+
MinAvailable: &minAvailable,
375+
Selector: &metav1.LabelSelector{
376+
MatchLabels: map[string]string{
377+
sandboxLabel: nameHash,
378+
},
379+
},
380+
},
381+
}
382+
383+
if err := ctrl.SetControllerReference(sandbox, newPDB, r.Scheme); err != nil {
384+
return fmt.Errorf("SetControllerReference for PDB failed: %w", err)
385+
}
386+
387+
return r.Create(ctx, newPDB)
388+
}
389+
390+
log.Info("Found PDB", "PDB.Name", pdb.Name)
391+
return nil
392+
}
393+
328394
func (r *SandboxReconciler) reconcilePVCs(ctx context.Context, sandbox *sandboxv1alpha1.Sandbox) error {
329395
log := log.FromContext(ctx)
330396
for _, pvcTemplate := range sandbox.Spec.VolumeClaimTemplates {

controllers/sandbox_controller_test.go

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,13 @@ import (
2222
"github.com/google/go-cmp/cmp/cmpopts"
2323
"github.com/stretchr/testify/require"
2424
corev1 "k8s.io/api/core/v1"
25+
policyv1 "k8s.io/api/policy/v1"
26+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
2527
"k8s.io/apimachinery/pkg/api/resource"
2628
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2729
"k8s.io/apimachinery/pkg/runtime"
2830
"k8s.io/apimachinery/pkg/types"
31+
"k8s.io/apimachinery/pkg/util/intstr"
2932
"k8s.io/utils/ptr"
3033
sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
3134
ctrl "sigs.k8s.io/controller-runtime"
@@ -378,6 +381,86 @@ func TestReconcile(t *testing.T) {
378381
},
379382
},
380383
},
384+
{
385+
name: "sandbox with high resilience creates PDB and adds pod annotation",
386+
sandboxSpec: sandboxv1alpha1.SandboxSpec{
387+
Resilience: sandboxv1alpha1.ResilienceLevelHigh,
388+
PodTemplate: sandboxv1alpha1.PodTemplate{
389+
Spec: corev1.PodSpec{
390+
Containers: []corev1.Container{{Name: "test-container"}},
391+
},
392+
},
393+
},
394+
// Verify Sandbox status
395+
wantStatus: sandboxv1alpha1.SandboxStatus{
396+
Service: sandboxName,
397+
ServiceFQDN: "sandbox-name.sandbox-ns.svc.cluster.local",
398+
Conditions: []metav1.Condition{
399+
{
400+
Type: "Ready",
401+
Status: "False",
402+
ObservedGeneration: 1,
403+
Reason: "DependenciesNotReady",
404+
Message: "Pod exists with phase: ; Service Exists",
405+
},
406+
},
407+
},
408+
wantObjs: []client.Object{
409+
// Verify Pod has the new annotation
410+
&corev1.Pod{
411+
ObjectMeta: metav1.ObjectMeta{
412+
Name: sandboxName,
413+
Namespace: sandboxNs,
414+
ResourceVersion: "1",
415+
Labels: map[string]string{
416+
"agents.x-k8s.io/sandbox-name-hash": "ab179450",
417+
},
418+
Annotations: map[string]string{
419+
"cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
420+
},
421+
OwnerReferences: []metav1.OwnerReference{sandboxControllerRef(sandboxName)},
422+
},
423+
Spec: corev1.PodSpec{
424+
Containers: []corev1.Container{{Name: "test-container"}},
425+
},
426+
},
427+
// Verify Service
428+
&corev1.Service{
429+
ObjectMeta: metav1.ObjectMeta{
430+
Name: sandboxName,
431+
Namespace: sandboxNs,
432+
ResourceVersion: "1",
433+
Labels: map[string]string{
434+
"agents.x-k8s.io/sandbox-name-hash": "ab179450",
435+
},
436+
OwnerReferences: []metav1.OwnerReference{sandboxControllerRef(sandboxName)},
437+
},
438+
Spec: corev1.ServiceSpec{
439+
Selector: map[string]string{
440+
"agents.x-k8s.io/sandbox-name-hash": "ab179450",
441+
},
442+
ClusterIP: "None",
443+
},
444+
},
445+
// Verify the new PDB
446+
&policyv1.PodDisruptionBudget{
447+
ObjectMeta: metav1.ObjectMeta{
448+
Name: sandboxName,
449+
Namespace: sandboxNs,
450+
ResourceVersion: "1",
451+
OwnerReferences: []metav1.OwnerReference{sandboxControllerRef(sandboxName)},
452+
},
453+
Spec: policyv1.PodDisruptionBudgetSpec{
454+
MinAvailable: ptr.To(intstr.FromInt(1)),
455+
Selector: &metav1.LabelSelector{
456+
MatchLabels: map[string]string{
457+
"agents.x-k8s.io/sandbox-name-hash": "ab179450",
458+
},
459+
},
460+
},
461+
},
462+
},
463+
},
381464
}
382465

383466
for _, tc := range testCases {
@@ -534,3 +617,54 @@ func TestReconcilePod(t *testing.T) {
534617
})
535618
}
536619
}
620+
621+
// This test simulates updating a Sandbox and ensures the controller correctly deletes the now-unneeded PDB
622+
func TestReconcile_ResilienceCleanup(t *testing.T) {
623+
sandboxName := "sandbox-name"
624+
sandboxNs := "sandbox-ns"
625+
626+
// Initial Sandbox with High Resilience
627+
sb := &sandboxv1alpha1.Sandbox{
628+
ObjectMeta: metav1.ObjectMeta{
629+
Name: sandboxName,
630+
Namespace: sandboxNs,
631+
Generation: 1,
632+
},
633+
Spec: sandboxv1alpha1.SandboxSpec{
634+
Resilience: sandboxv1alpha1.ResilienceLevelHigh,
635+
PodTemplate: sandboxv1alpha1.PodTemplate{
636+
Spec: corev1.PodSpec{
637+
Containers: []corev1.Container{{Name: "test-container"}},
638+
},
639+
},
640+
},
641+
}
642+
643+
r := SandboxReconciler{
644+
Client: newFakeClient(sb),
645+
Scheme: Scheme,
646+
}
647+
req := ctrl.Request{NamespacedName: types.NamespacedName{Name: sandboxName, Namespace: sandboxNs}}
648+
649+
_, err := r.Reconcile(t.Context(), req)
650+
require.NoError(t, err)
651+
652+
// Verify PDB was created
653+
pdb := &policyv1.PodDisruptionBudget{}
654+
require.NoError(t, r.Get(t.Context(), req.NamespacedName, pdb), "PDB should exist after first reconcile")
655+
656+
// Update Sandbox to remove resilience
657+
liveSandbox := &sandboxv1alpha1.Sandbox{}
658+
require.NoError(t, r.Get(t.Context(), req.NamespacedName, liveSandbox))
659+
liveSandbox.Spec.Resilience = "" // Remove resilience
660+
liveSandbox.Generation = 2
661+
require.NoError(t, r.Update(t.Context(), liveSandbox))
662+
663+
// Re-run reconcile
664+
_, err = r.Reconcile(t.Context(), req)
665+
require.NoError(t, err)
666+
667+
// Verify PDB was deleted
668+
err = r.Get(t.Context(), req.NamespacedName, pdb)
669+
require.True(t, k8serrors.IsNotFound(err), "PDB should be deleted after resilience is removed")
670+
}

examples/sandbox.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ metadata:
44
name: sandbox-example
55
namespace: sandbox-ns
66
spec:
7+
resilience: High
78
podTemplate:
89
metadata:
910
labels:

k8s/crds/agents.x-k8s.io_sandboxes.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3814,6 +3814,8 @@ spec:
38143814
required:
38153815
- spec
38163816
type: object
3817+
resilience:
3818+
type: string
38173819
shutdownTime:
38183820
format: date-time
38193821
type: string

k8s/rbac.generated.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,15 @@ rules:
3838
- get
3939
- patch
4040
- update
41+
- apiGroups:
42+
- policy
43+
resources:
44+
- poddisruptionbudgets
45+
verbs:
46+
- create
47+
- delete
48+
- get
49+
- list
50+
- patch
51+
- update
52+
- watch

0 commit comments

Comments
 (0)