
Commit 10f5d69

refactor: rename generic "instance" to "VM", given their natural ties to VM instances
1 parent 58fc719 commit 10f5d69

File tree

23 files changed: +277 −250 lines changed


cmd/controller/main.go

Lines changed: 2 additions & 2 deletions

@@ -52,7 +52,7 @@ func main() {
 
     aksCloudProvider := cloudprovider.New(
         op.InstanceTypesProvider,
-        op.InstanceProvider,
+        op.VMInstanceProvider,
         op.EventRecorder,
         op.GetClient(),
         op.ImageProvider,
@@ -79,7 +79,7 @@ func main() {
         op.GetClient(),
         op.EventRecorder,
         aksCloudProvider,
-        op.InstanceProvider,
+        op.VMInstanceProvider,
         // TODO: still need to refactor ImageProvider side of things.
         op.KubernetesVersionProvider,
         op.ImageProvider,
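
Note: instance.VMProvider is the renamed provider type being wired in here. Its definition isn't part of this commit page; the following is a minimal sketch reconstructed purely from the call sites in this diff (method set and signatures inferred; the real interface in pkg/providers/instance may carry more methods):

// Sketch only: inferred from c.vmInstanceProvider usages in this commit.
type VMProvider interface {
    // BeginCreate starts the VM create and returns a promise that resolves
    // when the long-running operation completes.
    BeginCreate(ctx context.Context, nodeClass *v1beta1.AKSNodeClass, nodeClaim *karpv1.NodeClaim, instanceTypes []*cloudprovider.InstanceType) (*VirtualMachinePromise, error)
    // Get/List/Delete operate on VMs by name, as used by CloudProvider below.
    Get(ctx context.Context, vmName string) (*armcompute.VirtualMachine, error)
    List(ctx context.Context) ([]*armcompute.VirtualMachine, error)
    Delete(ctx context.Context, vmName string) error
    // GetNic backs the subnet drift check in pkg/cloudprovider/drift.go.
    GetNic(ctx context.Context, resourceGroup, nicName string) (*armnetwork.Interface, error)
}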

cmd/controller/main_ccp.go

Lines changed: 2 additions & 2 deletions

@@ -52,7 +52,7 @@ func main() {
 
     aksCloudProvider := cloudprovider.New(
         op.InstanceTypesProvider,
-        op.InstanceProvider,
+        op.VMInstanceProvider,
         op.EventRecorder,
         op.GetClient(),
         op.ImageProvider,
@@ -79,7 +79,7 @@ func main() {
         op.GetClient(),
         op.EventRecorder,
         aksCloudProvider,
-        op.InstanceProvider,
+        op.VMInstanceProvider,
         // TODO: still need to refactor ImageProvider side of things.
         op.KubernetesVersionProvider,
         op.ImageProvider,

pkg/cloudprovider/cloudprovider.go

Lines changed: 43 additions & 40 deletions
@@ -74,39 +74,39 @@ var _ cloudprovider.CloudProvider = (*CloudProvider)(nil)
 
 type CloudProvider struct {
     instanceTypeProvider instancetype.Provider
-    instanceProvider     instance.Provider
+    vmInstanceProvider   instance.VMProvider
     kubeClient           client.Client
     imageProvider        imagefamily.NodeImageProvider
     recorder             events.Recorder
 }
 
 func New(
     instanceTypeProvider instancetype.Provider,
-    instanceProvider instance.Provider,
+    vmInstanceProvider instance.VMProvider,
     recorder events.Recorder,
     kubeClient client.Client,
     imageProvider imagefamily.NodeImageProvider,
 ) *CloudProvider {
     return &CloudProvider{
         instanceTypeProvider: instanceTypeProvider,
-        instanceProvider:     instanceProvider,
+        vmInstanceProvider:   vmInstanceProvider,
         kubeClient:           kubeClient,
         imageProvider:        imageProvider,
         recorder:             recorder,
     }
 }
 
 // Create a node given the constraints.
-func (c *CloudProvider) handleInstanceCreation(ctx context.Context, instancePromise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim) error {
+func (c *CloudProvider) handleVMInstanceCreation(ctx context.Context, vmPromise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim) error {
     if c.isStandaloneNodeClaim(nodeClaim) {
         // processStandaloneNodeClaimDeletion:
         // Standalone NodeClaims aren't re-queued for reconciliation in the provision_trigger controller,
         // so we delete them synchronously. After marking Launched=true,
         // their status can't be reverted to false once the delete completes, due to how core caches nodeclaims in
         // the launch controller. This ensures we retry continuously until we hit the registration TTL.
-        err := instancePromise.Wait()
+        err := vmPromise.Wait()
         if err != nil {
-            return c.handleNodeClaimCreationError(ctx, err, instancePromise, nodeClaim, false)
+            return c.handleNodeClaimCreationError(ctx, err, vmPromise, nodeClaim, false)
         }
     } else {
         // For NodePool-managed nodeclaims, launch a single goroutine to poll the returned promise.
@@ -115,7 +115,7 @@ func (c *CloudProvider) handleInstanceCreation(ctx context.Context, instanceProm
         // no issue. If the node doesn't come up successfully in that case, the node and the linked claim will
         // be garbage collected after the TTL, but the cause of the node's issue will be lost, as the LRO URL was
         // only held in memory.
-        go c.waitOnPromise(ctx, instancePromise, nodeClaim)
+        go c.waitOnPromise(ctx, vmPromise, nodeClaim)
     }
     return nil
 }
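
The standalone path blocks on vmPromise.Wait() so failures surface before Launched=true sticks, while the NodePool path polls in a single goroutine. A self-contained sketch of the promise shape this implies (hypothetical implementation; the real instance.VirtualMachinePromise may differ):

package instance

import "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" // SDK module version is an assumption

// VirtualMachinePromise pairs the best-effort VM object with a handle on the
// in-flight create. Wait blocks until the long-running operation finishes.
type VirtualMachinePromise struct {
    VM   *armcompute.VirtualMachine
    done chan error // resolved exactly once by the creating goroutine
}

func newVirtualMachinePromise(vm *armcompute.VirtualMachine) *VirtualMachinePromise {
    // Buffered so resolve never blocks, even if nobody ever calls Wait.
    return &VirtualMachinePromise{VM: vm, done: make(chan error, 1)}
}

// resolve is called exactly once when the create LRO ends.
func (p *VirtualMachinePromise) resolve(err error) { p.done <- err }

// Wait blocks until the create completes; intended for a single caller,
// matching the two call sites above (sync standalone wait, or one goroutine).
func (p *VirtualMachinePromise) Wait() error { return <-p.done }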
@@ -169,21 +169,22 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
     if len(instanceTypes) == 0 {
         return nil, cloudprovider.NewInsufficientCapacityError(fmt.Errorf("all requested instance types were unavailable during launch"))
     }
-    instancePromise, err := c.instanceProvider.BeginCreate(ctx, nodeClass, nodeClaim, instanceTypes)
+    vmPromise, err := c.vmInstanceProvider.BeginCreate(ctx, nodeClass, nodeClaim, instanceTypes)
     if err != nil {
         return nil, cloudprovider.NewCreateError(fmt.Errorf("creating instance failed, %w", err), CreateInstanceFailedReason, truncateMessage(err.Error()))
     }
 
-    if err = c.handleInstanceCreation(ctx, instancePromise, nodeClaim); err != nil {
+    if err = c.handleVMInstanceCreation(ctx, vmPromise, nodeClaim); err != nil {
         return nil, err
     }
 
-    instance := instancePromise.VM
+    vm := vmPromise.VM // Best-effort populated by Karpenter for the server-side VM create. Not all fields are guaranteed to be set, especially status fields.
+    // Double-check the code before assuming their presence.
     instanceType, _ := lo.Find(instanceTypes, func(i *cloudprovider.InstanceType) bool {
-        return i.Name == string(lo.FromPtr(instance.Properties.HardwareProfile.VMSize))
+        return i.Name == string(lo.FromPtr(vm.Properties.HardwareProfile.VMSize))
     })
 
-    nc, err := c.instanceToNodeClaim(ctx, instance, instanceType)
+    nc, err := c.vmInstanceToNodeClaim(ctx, vm, instanceType)
     if err != nil {
         return nil, err
     }
@@ -199,15 +200,15 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
     return nc, nil
 }
 
-func (c *CloudProvider) waitOnPromise(ctx context.Context, promise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim) {
+func (c *CloudProvider) waitOnPromise(ctx context.Context, vmPromise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim) {
     defer func() {
         if r := recover(); r != nil {
             err := fmt.Errorf("%v", r)
             log.FromContext(ctx).Error(err, "panic during waitOnPromise")
         }
     }()
 
-    err := promise.Wait()
+    err := vmPromise.Wait()
 
     // Wait until the claim is Launched, to avoid racing with creation.
     // This isn't strictly required, but without this, failure test scenarios are harder
@@ -217,7 +218,7 @@ func (c *CloudProvider) waitOnPromise(ctx context.Context, promise *instance.Vir
     c.waitUntilLaunched(ctx, nodeClaim)
 
     if err != nil {
-        _ = c.handleNodeClaimCreationError(ctx, err, promise, nodeClaim, true)
+        _ = c.handleNodeClaimCreationError(ctx, err, vmPromise, nodeClaim, true)
         return
     }
 }
@@ -246,14 +247,14 @@ func (c *CloudProvider) waitUntilLaunched(ctx context.Context, nodeClaim *karpv1
 }
 
 // handleNodeClaimCreationError handles common error processing for both standalone and async node claim creation failures
-func (c *CloudProvider) handleNodeClaimCreationError(ctx context.Context, err error, promise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim, removeNodeClaim bool) error {
+func (c *CloudProvider) handleNodeClaimCreationError(ctx context.Context, err error, vmPromise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim, removeNodeClaim bool) error {
     c.recorder.Publish(cloudproviderevents.NodeClaimFailedToRegister(nodeClaim, err))
     log.FromContext(ctx).Error(err, "failed launching nodeclaim")
 
     // Clean up the VM
     // TODO: This won't clean up leaked NICs if the VM doesn't exist... intentional?
-    vmName := lo.FromPtr(promise.VM.Name)
-    deleteErr := c.instanceProvider.Delete(ctx, vmName)
+    vmName := lo.FromPtr(vmPromise.VM.Name)
+    deleteErr := c.vmInstanceProvider.Delete(ctx, vmName)
     if cloudprovider.IgnoreNodeClaimNotFoundError(deleteErr) != nil {
         log.FromContext(ctx).Error(deleteErr, "failed to delete VM", "vmName", vmName)
     }
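
Here a VM that is already gone counts as a successful cleanup: karpenter core's cloudprovider.IgnoreNodeClaimNotFoundError returns nil for a NodeClaimNotFoundError and passes every other error through, so only real delete failures get logged. A small illustration (assuming the provider's Delete wraps Azure 404s with cloudprovider.NewNodeClaimNotFoundError from sigs.k8s.io/karpenter/pkg/cloudprovider):

// Already-deleted VM: the wrapped not-found error is swallowed.
deleteErr := cloudprovider.NewNodeClaimNotFoundError(errors.New("vm not found"))
if err := cloudprovider.IgnoreNodeClaimNotFoundError(deleteErr); err != nil {
    // unreachable for not-found; any other delete error would land here
}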
@@ -281,20 +282,20 @@ func (c *CloudProvider) handleNodeClaimCreationError(ctx context.Context, err er
 }
 
 func (c *CloudProvider) List(ctx context.Context) ([]*karpv1.NodeClaim, error) {
-    instances, err := c.instanceProvider.List(ctx)
+    vmInstances, err := c.vmInstanceProvider.List(ctx)
     if err != nil {
-        return nil, fmt.Errorf("listing instances, %w", err)
+        return nil, fmt.Errorf("listing VM instances, %w", err)
     }
 
     var nodeClaims []*karpv1.NodeClaim
-    for _, instance := range instances {
-        instanceType, err := c.resolveInstanceTypeFromInstance(ctx, instance)
+    for _, instance := range vmInstances {
+        instanceType, err := c.resolveInstanceTypeFromVMInstance(ctx, instance)
         if err != nil {
-            return nil, fmt.Errorf("resolving instance type, %w", err)
+            return nil, fmt.Errorf("resolving instance type for VM instance, %w", err)
         }
-        nodeClaim, err := c.instanceToNodeClaim(ctx, instance, instanceType)
+        nodeClaim, err := c.vmInstanceToNodeClaim(ctx, instance, instanceType)
         if err != nil {
-            return nil, fmt.Errorf("converting instance to node claim, %w", err)
+            return nil, fmt.Errorf("converting VM instance to node claim, %w", err)
         }
 
         nodeClaims = append(nodeClaims, nodeClaim)
@@ -308,22 +309,23 @@ func (c *CloudProvider) Get(ctx context.Context, providerID string) (*karpv1.Nod
         return nil, fmt.Errorf("getting vm name, %w", err)
     }
     ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("vmName", vmName))
-    instance, err := c.instanceProvider.Get(ctx, vmName)
+    vm, err := c.vmInstanceProvider.Get(ctx, vmName)
     if err != nil {
-        return nil, fmt.Errorf("getting instance, %w", err)
+        return nil, fmt.Errorf("getting VM instance, %w", err)
     }
-    instanceType, err := c.resolveInstanceTypeFromInstance(ctx, instance)
+    instanceType, err := c.resolveInstanceTypeFromVMInstance(ctx, vm)
     if err != nil {
         return nil, fmt.Errorf("resolving instance type, %w", err)
     }
-    return c.instanceToNodeClaim(ctx, instance, instanceType)
+    return c.vmInstanceToNodeClaim(ctx, vm, instanceType)
 }
 
 func (c *CloudProvider) LivenessProbe(req *http.Request) error {
     return c.instanceTypeProvider.LivenessProbe(req)
 }
 
 // GetInstanceTypes returns all available InstanceTypes
+// May return apimachinery.NotFoundError if NodeClass is not found.
 func (c *CloudProvider) GetInstanceTypes(ctx context.Context, nodePool *karpv1.NodePool) ([]*cloudprovider.InstanceType, error) {
     nodeClass, err := c.resolveNodeClassFromNodePool(ctx, nodePool)
     if err != nil {
@@ -348,7 +350,7 @@ func (c *CloudProvider) Delete(ctx context.Context, nodeClaim *karpv1.NodeClaim)
     if err != nil {
         return fmt.Errorf("getting VM name, %w", err)
     }
-    return c.instanceProvider.Delete(ctx, vmName)
+    return c.vmInstanceProvider.Delete(ctx, vmName)
 }
 
 func (c *CloudProvider) IsDrifted(ctx context.Context, nodeClaim *karpv1.NodeClaim) (cloudprovider.DriftReason, error) {
@@ -404,6 +406,7 @@ func (c *CloudProvider) RepairPolicies() []cloudprovider.RepairPolicy {
     }
 }
 
+// May return apimachinery.NotFoundError if NodePool is not found.
 func (c *CloudProvider) resolveNodeClassFromNodePool(ctx context.Context, nodePool *karpv1.NodePool) (*v1beta1.AKSNodeClass, error) {
     nodeClass := &v1beta1.AKSNodeClass{}
     if err := c.kubeClient.Get(ctx, types.NamespacedName{Name: nodePool.Spec.Template.Spec.NodeClassRef.Name}, nodeClass); err != nil {
@@ -437,8 +440,8 @@ func (c *CloudProvider) resolveInstanceTypes(ctx context.Context, nodeClaim *kar
     // }), nil
 }
 
-func (c *CloudProvider) resolveInstanceTypeFromInstance(ctx context.Context, instance *armcompute.VirtualMachine) (*cloudprovider.InstanceType, error) {
-    nodePool, err := c.resolveNodePoolFromInstance(ctx, instance)
+func (c *CloudProvider) resolveInstanceTypeFromVMInstance(ctx context.Context, vm *armcompute.VirtualMachine) (*cloudprovider.InstanceType, error) {
+    nodePool, err := c.resolveNodePoolFromVMInstance(ctx, vm)
     if err != nil {
         // If we can't resolve the provisioner, we fall back to not getting instance type info
         return nil, client.IgnoreNotFound(fmt.Errorf("resolving node pool, %w", err))
@@ -449,13 +452,13 @@ func (c *CloudProvider) resolveInstanceTypeFromInstance(ctx context.Context, ins
         return nil, client.IgnoreNotFound(fmt.Errorf("resolving node template, %w", err))
     }
     instanceType, _ := lo.Find(instanceTypes, func(i *cloudprovider.InstanceType) bool {
-        return i.Name == string(lo.FromPtr(instance.Properties.HardwareProfile.VMSize))
+        return i.Name == string(lo.FromPtr(vm.Properties.HardwareProfile.VMSize))
     })
     return instanceType, nil
 }
 
-func (c *CloudProvider) resolveNodePoolFromInstance(ctx context.Context, instance *armcompute.VirtualMachine) (*karpv1.NodePool, error) {
-    nodePoolName, ok := instance.Tags[launchtemplate.NodePoolTagKey]
+func (c *CloudProvider) resolveNodePoolFromVMInstance(ctx context.Context, vm *armcompute.VirtualMachine) (*karpv1.NodePool, error) {
+    nodePoolName, ok := vm.Tags[launchtemplate.NodePoolTagKey]
     if ok && *nodePoolName != "" {
         nodePool := &karpv1.NodePool{}
         if err := c.kubeClient.Get(ctx, types.NamespacedName{Name: *nodePoolName}, nodePool); err != nil {
@@ -467,7 +470,7 @@ func (c *CloudProvider) resolveNodePoolFromInstance(ctx context.Context, instanc
     return nil, errors.NewNotFound(schema.GroupResource{Group: coreapis.Group, Resource: "nodepools"}, "")
 }
 
-func (c *CloudProvider) instanceToNodeClaim(ctx context.Context, vm *armcompute.VirtualMachine, instanceType *cloudprovider.InstanceType) (*karpv1.NodeClaim, error) {
+func (c *CloudProvider) vmInstanceToNodeClaim(ctx context.Context, vm *armcompute.VirtualMachine, instanceType *cloudprovider.InstanceType) (*karpv1.NodeClaim, error) {
     nodeClaim := &karpv1.NodeClaim{}
     labels := map[string]string{}
     annotations := map[string]string{}
@@ -484,28 +487,28 @@ func (c *CloudProvider) instanceToNodeClaim(ctx context.Context, vm *armcompute.
         labels[corev1.LabelTopologyZone] = zone
     }
 
-    labels[karpv1.CapacityTypeLabelKey] = instance.GetCapacityType(vm)
+    labels[karpv1.CapacityTypeLabelKey] = instance.GetCapacityTypeFromVM(vm)
 
     if tag, ok := vm.Tags[launchtemplate.NodePoolTagKey]; ok {
         labels[karpv1.NodePoolLabelKey] = *tag
     }
 
-    nodeClaim.Name = GenerateNodeClaimName(*vm.Name)
+    nodeClaim.Name = GetNodeClaimNameFromVMName(*vm.Name)
     nodeClaim.Labels = labels
     nodeClaim.Annotations = annotations
     nodeClaim.CreationTimestamp = metav1.Time{Time: *vm.Properties.TimeCreated}
     // Set the deletionTimestamp to be the current time if the instance is currently terminating
     if utils.IsVMDeleting(*vm) {
         nodeClaim.DeletionTimestamp = &metav1.Time{Time: time.Now()}
     }
-    nodeClaim.Status.ProviderID = utils.ResourceIDToProviderID(ctx, *vm.ID)
+    nodeClaim.Status.ProviderID = utils.VMResourceIDToProviderID(ctx, *vm.ID)
     if vm.Properties != nil && vm.Properties.StorageProfile != nil && vm.Properties.StorageProfile.ImageReference != nil {
         nodeClaim.Status.ImageID = utils.ImageReferenceToString(vm.Properties.StorageProfile.ImageReference)
     }
     return nodeClaim, nil
 }
 
-func GenerateNodeClaimName(vmName string) string {
+func GetNodeClaimNameFromVMName(vmName string) string {
     return strings.TrimLeft("aks-", vmName)
 }
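
One pre-existing detail worth flagging: the unchanged body calls strings.TrimLeft("aks-", vmName), i.e. with "aks-" as the string and vmName as the cutset, which trims characters of the VM name from the literal "aks-" rather than stripping a prefix from the VM name. If the intent is to drop a leading "aks-" (an assumption based on the new function name), the idiomatic form would be:

func GetNodeClaimNameFromVMName(vmName string) string {
    // strings.TrimPrefix removes "aks-" at most once, and only when present.
    return strings.TrimPrefix(vmName, "aks-")
}

Even with the arguments swapped back, strings.TrimLeft(vmName, "aks-") would still be wrong here: TrimLeft strips any leading run of the characters 'a', 'k', 's', '-', so it would mangle names that happen to begin with those letters.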
511514

pkg/cloudprovider/drift.go

Lines changed: 2 additions & 2 deletions

@@ -145,7 +145,7 @@ func (c *CloudProvider) isImageVersionDrifted(
         return "", err
     }
 
-    vm, err := c.instanceProvider.Get(ctx, id)
+    vm, err := c.vmInstanceProvider.Get(ctx, id)
     if err != nil {
         // TODO (charliedmcb): Do we need to handle vm not found here before it's provisioned?
         // I don't think we can get to Drift until after ProviderID is set, so this should be a real issue.
@@ -203,7 +203,7 @@ func (c *CloudProvider) isSubnetDrifted(ctx context.Context, nodeClaim *karpv1.N
     nicName := instance.GenerateResourceName(nodeClaim.Name)
 
     // TODO: Refactor all of AzConfig to be part of options
-    nic, err := c.instanceProvider.GetNic(ctx, options.FromContext(ctx).NodeResourceGroup, nicName)
+    nic, err := c.vmInstanceProvider.GetNic(ctx, options.FromContext(ctx).NodeResourceGroup, nicName)
     if err != nil {
         if sdkerrors.IsNotFoundErr(err) {
             return "", nil

pkg/cloudprovider/suite_test.go

Lines changed: 2 additions & 2 deletions

@@ -82,7 +82,7 @@ var _ = BeforeSuite(func() {
     azureEnv = test.NewEnvironment(ctx, env)
     fakeClock = clock.NewFakeClock(time.Now())
     recorder = events.NewRecorder(&record.FakeRecorder{})
-    cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.InstanceProvider, recorder, env.Client, azureEnv.ImageProvider)
+    cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.VMInstanceProvider, recorder, env.Client, azureEnv.ImageProvider)
     cluster = state.NewCluster(fakeClock, env.Client, cloudProvider)
     prov = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock)
 })
@@ -208,7 +208,7 @@ var _ = Describe("CloudProvider", func() {
             // https://github.com/Azure/karpenter-provider-azure/blob/84e449787ec72268efb0c7af81ec87a6b3ee95fa/pkg/providers/instance/instance.go#L604
             // which has the sub const 12345678-1234-1234-1234-123456789012 passed in here:
             // https://github.com/Azure/karpenter-provider-azure/blob/84e449787ec72268efb0c7af81ec87a6b3ee95fa/pkg/test/environment.go#L152
-            ProviderID: utils.ResourceIDToProviderID(ctx, fmt.Sprintf("/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/%s/providers/Microsoft.Compute/virtualMachines/%s", rg, vmName)),
+            ProviderID: utils.VMResourceIDToProviderID(ctx, fmt.Sprintf("/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/%s/providers/Microsoft.Compute/virtualMachines/%s", rg, vmName)),
         },
         ObjectMeta: metav1.ObjectMeta{
             Labels: map[string]string{
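
For context on the renamed helper: the fixture feeds a full ARM resource ID into utils.VMResourceIDToProviderID, and Kubernetes provider IDs on Azure are that resource ID behind an azure:// scheme. A sketch consistent with this test (an assumption about the helper; the real implementation may also validate or normalize the ID):

// "/subscriptions/.../virtualMachines/vm-0" -> "azure:///subscriptions/.../virtualMachines/vm-0"
func vmResourceIDToProviderID(resourceID string) string {
    return "azure://" + resourceID
}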

pkg/controllers/controllers.go

Lines changed: 3 additions & 3 deletions

@@ -47,7 +47,7 @@ func NewControllers(
     kubeClient client.Client,
     recorder events.Recorder,
     cloudProvider cloudprovider.CloudProvider,
-    instanceProvider instance.Provider,
+    vmInstanceProvider instance.VMProvider,
     kubernetesVersionProvider kubernetesversion.KubernetesVersionProvider,
     nodeImageProvider imagefamily.NodeImageProvider,
     inClusterKubernetesInterface kubernetes.Interface,
@@ -59,10 +59,10 @@ func NewControllers(
         nodeclasstermination.NewController(kubeClient, recorder),
 
         nodeclaimgarbagecollection.NewVirtualMachine(kubeClient, cloudProvider),
-        nodeclaimgarbagecollection.NewNetworkInterface(kubeClient, instanceProvider),
+        nodeclaimgarbagecollection.NewNetworkInterface(kubeClient, vmInstanceProvider),
 
         // TODO: nodeclaim tagging
-        inplaceupdate.NewController(kubeClient, instanceProvider),
+        inplaceupdate.NewController(kubeClient, vmInstanceProvider),
         status.NewController[*v1beta1.AKSNodeClass](kubeClient, mgr.GetEventRecorderFor("karpenter")),
     }
     return controllers
