Skip to content

Commit a1fad25

Browse files
committed
monitoring stale cni node resources
1 parent 9c8dc66 commit a1fad25

File tree

3 files changed

+63
-2
lines changed

3 files changed

+63
-2
lines changed

controllers/crds/cninode_controller.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,14 +52,22 @@ var (
5252
Help: "The number of requests that failed when controller tried to recreate the CNINode",
5353
},
5454
)
55+
staleCNINodeCount = prometheus.NewCounter(
56+
prometheus.CounterOpts{
57+
Name: "stale_cniNode_count",
58+
Help: "The number of stale CNINode resources with deletion timestamp older than 15 minutes",
59+
},
60+
)
5561
)
5662

5763
func prometheusRegister() {
5864
prometheusRegistered = true
5965

6066
metrics.Registry.MustRegister(
6167
recreateCNINodeCallCount,
62-
recreateCNINodeErrCount)
68+
recreateCNINodeErrCount,
69+
staleCNINodeCount,
70+
)
6371

6472
prometheusRegistered = true
6573
}
@@ -171,6 +179,11 @@ func (r *CNINodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
171179
return ctrl.Result{}, nil
172180

173181
} else { // CNINode is marked for deletion
182+
if time.Since(cniNode.GetDeletionTimestamp().Time).Minutes() > 15 {
183+
// record the stale CNINode resource (log it and bump the metric); nothing is deleted in this branch
184+
r.log.Info("stale CNINode resource", "cniNode", cniNode.Name)
185+
staleCNINodeCount.Inc()
186+
}
174187
if !nodeFound {
175188
// node is also deleted, proceed with running the cleanup routine and remove the finalizer
176189

controllers/crds/cninode_controller_test.go

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package crds
33
import (
44
"context"
55
"testing"
6+
"time"
67

78
"github.com/aws/amazon-vpc-resource-controller-k8s/apis/vpcresources/v1alpha1"
89
mock_api "github.com/aws/amazon-vpc-resource-controller-k8s/mocks/amazon-vcp-resource-controller-k8s/pkg/aws/ec2/api"
@@ -12,6 +13,7 @@ import (
1213
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/ec2/api/cleanup"
1314
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/config"
1415
"github.com/golang/mock/gomock"
16+
"github.com/prometheus/client_golang/prometheus/testutil"
1517
"github.com/stretchr/testify/assert"
1618
corev1 "k8s.io/api/core/v1"
1719
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -24,7 +26,8 @@ import (
2426
)
2527

2628
type CNINodeMock struct {
27-
Reconciler CNINodeReconciler
29+
Reconciler CNINodeReconciler
30+
initialStaleCount float64
2831
}
2932

3033
var (
@@ -163,6 +166,50 @@ func TestCNINodeReconcile(t *testing.T) {
163166
assert.Equal(t, res, reconcile.Result{})
164167
},
165168
},
169+
{
170+
name: "verify stale CNINode metric is incremented when deletion timestamp is older than 15 minutes",
171+
args: args{
172+
mockNode: nil,
173+
mockCNINode: &v1alpha1.CNINode{
174+
ObjectMeta: metav1.ObjectMeta{
175+
Name: mockName,
176+
Labels: map[string]string{
177+
config.NodeLabelOS: config.OSLinux,
178+
},
179+
Finalizers: []string{config.NodeTerminationFinalizer},
180+
DeletionTimestamp: &metav1.Time{Time: metav1.Now().Add(-20 * time.Minute)}, // 20 minutes ago
181+
},
182+
Spec: v1alpha1.CNINodeSpec{
183+
Tags: map[string]string{
184+
config.NetworkInterfaceNodeIDKey: "i-1234567890",
185+
},
186+
},
187+
},
188+
},
189+
prepare: func(f *fields) {
190+
// Get initial metric value
191+
initialValue := testutil.ToFloat64(staleCNINodeCount)
192+
193+
f.mockCNINode.Reconciler.newResourceCleaner = func(nodeID string, eC2Wrapper ec2API.EC2Wrapper, vpcID string) cleanup.ResourceCleaner {
194+
return f.mockResourceCleaner
195+
}
196+
f.mockResourceCleaner.EXPECT().DeleteLeakedResources().Times(1).Return(nil)
197+
f.mockFinalizerManager.EXPECT().
198+
RemoveFinalizers(gomock.Any(), gomock.Any(), config.NodeTerminationFinalizer).
199+
Return(nil)
200+
201+
// Store initial value for comparison in asserts
202+
f.mockCNINode.initialStaleCount = initialValue
203+
},
204+
asserts: func(res reconcile.Result, err error, cniNode *v1alpha1.CNINode) {
205+
assert.NoError(t, err)
206+
assert.Equal(t, res, reconcile.Result{})
207+
208+
// Verify the stale metric is non-zero after reconcile (note: this does not compare against the initial value captured in prepare)
209+
finalValue := testutil.ToFloat64(staleCNINodeCount)
210+
assert.Greater(t, finalValue,float64(0))
211+
},
212+
},
166213
}
167214
for _, tt := range tests {
168215
t.Run(tt.name, func(t *testing.T) {

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ require (
4545
github.com/google/btree v1.1.3 // indirect
4646
github.com/google/gnostic-models v0.6.9 // indirect
4747
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
48+
github.com/kylelemons/godebug v1.1.0 // indirect
4849
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
4950
github.com/x448/float16 v0.8.4 // indirect
5051
golang.org/x/sync v0.13.0 // indirect

0 commit comments

Comments
 (0)