@@ -74,39 +74,39 @@ var _ cloudprovider.CloudProvider = (*CloudProvider)(nil)
7474
7575type CloudProvider struct {
7676 instanceTypeProvider instancetype.Provider
77- instanceProvider instance.Provider
77+ vmInstanceProvider instance.VMProvider
7878 kubeClient client.Client
7979 imageProvider imagefamily.NodeImageProvider
8080 recorder events.Recorder
8181}
8282
8383func New (
8484 instanceTypeProvider instancetype.Provider ,
85- instanceProvider instance.Provider ,
85+ vmInstanceProvider instance.VMProvider ,
8686 recorder events.Recorder ,
8787 kubeClient client.Client ,
8888 imageProvider imagefamily.NodeImageProvider ,
8989) * CloudProvider {
9090 return & CloudProvider {
9191 instanceTypeProvider : instanceTypeProvider ,
92- instanceProvider : instanceProvider ,
92+ vmInstanceProvider : vmInstanceProvider ,
9393 kubeClient : kubeClient ,
9494 imageProvider : imageProvider ,
9595 recorder : recorder ,
9696 }
9797}
9898
9999// Create a node given the constraints.
100- func (c * CloudProvider ) handleInstanceCreation (ctx context.Context , instancePromise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim ) error {
100+ func (c * CloudProvider ) handleVMInstanceCreation (ctx context.Context , vmPromise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim ) error {
101101 if c .isStandaloneNodeClaim (nodeClaim ) {
102102 // processStandaloneNodeClaimDeletion:
103103 // Standalone NodeClaims aren’t re-queued for reconciliation in the provision_trigger controller,
104104 // so we delete them synchronously. After marking Launched=true,
105105 // their status can’t be reverted to false once the delete completes due to how core caches nodeclaims in
106106 // the lanch controller. This ensures we retry continuously until we hit the registration TTL
107- err := instancePromise .Wait ()
107+ err := vmPromise .Wait ()
108108 if err != nil {
109- return c .handleNodeClaimCreationError (ctx , err , instancePromise , nodeClaim , false )
109+ return c .handleNodeClaimCreationError (ctx , err , vmPromise , nodeClaim , false )
110110 }
111111 } else {
112112 // For NodePool-managed nodeclaims, launch a single goroutine to poll the returned promise.
@@ -115,7 +115,7 @@ func (c *CloudProvider) handleInstanceCreation(ctx context.Context, instanceProm
115115 // no issue. If the node doesn't come up successfully in that case, the node and the linked claim will
116116 // be garbage collected after the TTL, but the cause of the nodes issue will be lost, as the LRO URL was
117117 // only held in memory.
118- go c .waitOnPromise (ctx , instancePromise , nodeClaim )
118+ go c .waitOnPromise (ctx , vmPromise , nodeClaim )
119119 }
120120 return nil
121121}
@@ -169,21 +169,22 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
169169 if len (instanceTypes ) == 0 {
170170 return nil , cloudprovider .NewInsufficientCapacityError (fmt .Errorf ("all requested instance types were unavailable during launch" ))
171171 }
172- instancePromise , err := c .instanceProvider .BeginCreate (ctx , nodeClass , nodeClaim , instanceTypes )
172+ vmPromise , err := c .vmInstanceProvider .BeginCreate (ctx , nodeClass , nodeClaim , instanceTypes )
173173 if err != nil {
174174 return nil , cloudprovider .NewCreateError (fmt .Errorf ("creating instance failed, %w" , err ), CreateInstanceFailedReason , truncateMessage (err .Error ()))
175175 }
176176
177- if err = c .handleInstanceCreation (ctx , instancePromise , nodeClaim ); err != nil {
177+ if err = c .handleVMInstanceCreation (ctx , vmPromise , nodeClaim ); err != nil {
178178 return nil , err
179179 }
180180
181- instance := instancePromise .VM
181+ vm := vmPromise .VM // This is best-effort populated by Karpenter to be used to create the VM server-side. Not all fields are guaranteed to be populated, especially status fields.
182+ // Double-check the code before making assumptions on their presence.
182183 instanceType , _ := lo .Find (instanceTypes , func (i * cloudprovider.InstanceType ) bool {
183- return i .Name == string (lo .FromPtr (instance .Properties .HardwareProfile .VMSize ))
184+ return i .Name == string (lo .FromPtr (vm .Properties .HardwareProfile .VMSize ))
184185 })
185186
186- nc , err := c .instanceToNodeClaim (ctx , instance , instanceType )
187+ nc , err := c .vmInstanceToNodeClaim (ctx , vm , instanceType )
187188 if err != nil {
188189 return nil , err
189190 }
@@ -199,15 +200,15 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
199200 return nc , nil
200201}
201202
202- func (c * CloudProvider ) waitOnPromise (ctx context.Context , promise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim ) {
203+ func (c * CloudProvider ) waitOnPromise (ctx context.Context , vmPromise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim ) {
203204 defer func () {
204205 if r := recover (); r != nil {
205206 err := fmt .Errorf ("%v" , r )
206207 log .FromContext (ctx ).Error (err , "panic during waitOnPromise" )
207208 }
208209 }()
209210
210- err := promise .Wait ()
211+ err := vmPromise .Wait ()
211212
212213 // Wait until the claim is Launched, to avoid racing with creation.
213214 // This isn't strictly required, but without this, failure test scenarios are harder
@@ -217,7 +218,7 @@ func (c *CloudProvider) waitOnPromise(ctx context.Context, promise *instance.Vir
217218 c .waitUntilLaunched (ctx , nodeClaim )
218219
219220 if err != nil {
220- _ = c .handleNodeClaimCreationError (ctx , err , promise , nodeClaim , true )
221+ _ = c .handleNodeClaimCreationError (ctx , err , vmPromise , nodeClaim , true )
221222 return
222223 }
223224}
@@ -246,14 +247,14 @@ func (c *CloudProvider) waitUntilLaunched(ctx context.Context, nodeClaim *karpv1
246247}
247248
248249// handleNodeClaimCreationError handles common error processing for both standalone and async node claim creation failures
249- func (c * CloudProvider ) handleNodeClaimCreationError (ctx context.Context , err error , promise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim , removeNodeClaim bool ) error {
250+ func (c * CloudProvider ) handleNodeClaimCreationError (ctx context.Context , err error , vmPromise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim , removeNodeClaim bool ) error {
250251 c .recorder .Publish (cloudproviderevents .NodeClaimFailedToRegister (nodeClaim , err ))
251252 log .FromContext (ctx ).Error (err , "failed launching nodeclaim" )
252253
253254 // Clean up the VM
254255 // TODO: This won't clean up leaked NICs if the VM doesn't exist... intentional?
255- vmName := lo .FromPtr (promise .VM .Name )
256- deleteErr := c .instanceProvider .Delete (ctx , vmName )
256+ vmName := lo .FromPtr (vmPromise .VM .Name )
257+ deleteErr := c .vmInstanceProvider .Delete (ctx , vmName )
257258 if cloudprovider .IgnoreNodeClaimNotFoundError (deleteErr ) != nil {
258259 log .FromContext (ctx ).Error (deleteErr , "failed to delete VM" , "vmName" , vmName )
259260 }
@@ -281,20 +282,20 @@ func (c *CloudProvider) handleNodeClaimCreationError(ctx context.Context, err er
281282}
282283
283284func (c * CloudProvider ) List (ctx context.Context ) ([]* karpv1.NodeClaim , error ) {
284- instances , err := c .instanceProvider .List (ctx )
285+ vmInstances , err := c .vmInstanceProvider .List (ctx )
285286 if err != nil {
286- return nil , fmt .Errorf ("listing instances, %w" , err )
287+ return nil , fmt .Errorf ("listing VM instances, %w" , err )
287288 }
288289
289290 var nodeClaims []* karpv1.NodeClaim
290- for _ , instance := range instances {
291- instanceType , err := c .resolveInstanceTypeFromInstance (ctx , instance )
291+ for _ , instance := range vmInstances {
292+ instanceType , err := c .resolveInstanceTypeFromVMInstance (ctx , instance )
292293 if err != nil {
293- return nil , fmt .Errorf ("resolving instance type, %w" , err )
294+ return nil , fmt .Errorf ("resolving instance type for VM instance , %w" , err )
294295 }
295- nodeClaim , err := c .instanceToNodeClaim (ctx , instance , instanceType )
296+ nodeClaim , err := c .vmInstanceToNodeClaim (ctx , instance , instanceType )
296297 if err != nil {
297- return nil , fmt .Errorf ("converting instance to node claim, %w" , err )
298+ return nil , fmt .Errorf ("converting VM instance to node claim, %w" , err )
298299 }
299300
300301 nodeClaims = append (nodeClaims , nodeClaim )
@@ -308,22 +309,23 @@ func (c *CloudProvider) Get(ctx context.Context, providerID string) (*karpv1.Nod
308309 return nil , fmt .Errorf ("getting vm name, %w" , err )
309310 }
310311 ctx = log .IntoContext (ctx , log .FromContext (ctx ).WithValues ("vmName" , vmName ))
311- instance , err := c .instanceProvider .Get (ctx , vmName )
312+ vm , err := c .vmInstanceProvider .Get (ctx , vmName )
312313 if err != nil {
313- return nil , fmt .Errorf ("getting instance, %w" , err )
314+ return nil , fmt .Errorf ("getting VM instance, %w" , err )
314315 }
315- instanceType , err := c .resolveInstanceTypeFromInstance (ctx , instance )
316+ instanceType , err := c .resolveInstanceTypeFromVMInstance (ctx , vm )
316317 if err != nil {
317318 return nil , fmt .Errorf ("resolving instance type, %w" , err )
318319 }
319- return c .instanceToNodeClaim (ctx , instance , instanceType )
320+ return c .vmInstanceToNodeClaim (ctx , vm , instanceType )
320321}
321322
322323func (c * CloudProvider ) LivenessProbe (req * http.Request ) error {
323324 return c .instanceTypeProvider .LivenessProbe (req )
324325}
325326
326327// GetInstanceTypes returns all available InstanceTypes
328+ // May return apimachinery.NotFoundError if NodeClass is not found.
327329func (c * CloudProvider ) GetInstanceTypes (ctx context.Context , nodePool * karpv1.NodePool ) ([]* cloudprovider.InstanceType , error ) {
328330 nodeClass , err := c .resolveNodeClassFromNodePool (ctx , nodePool )
329331 if err != nil {
@@ -348,7 +350,7 @@ func (c *CloudProvider) Delete(ctx context.Context, nodeClaim *karpv1.NodeClaim)
348350 if err != nil {
349351 return fmt .Errorf ("getting VM name, %w" , err )
350352 }
351- return c .instanceProvider .Delete (ctx , vmName )
353+ return c .vmInstanceProvider .Delete (ctx , vmName )
352354}
353355
354356func (c * CloudProvider ) IsDrifted (ctx context.Context , nodeClaim * karpv1.NodeClaim ) (cloudprovider.DriftReason , error ) {
@@ -404,6 +406,7 @@ func (c *CloudProvider) RepairPolicies() []cloudprovider.RepairPolicy {
404406 }
405407}
406408
409+ // May return apimachinery.NotFoundError if NodePool is not found.
407410func (c * CloudProvider ) resolveNodeClassFromNodePool (ctx context.Context , nodePool * karpv1.NodePool ) (* v1beta1.AKSNodeClass , error ) {
408411 nodeClass := & v1beta1.AKSNodeClass {}
409412 if err := c .kubeClient .Get (ctx , types.NamespacedName {Name : nodePool .Spec .Template .Spec .NodeClassRef .Name }, nodeClass ); err != nil {
@@ -437,8 +440,8 @@ func (c *CloudProvider) resolveInstanceTypes(ctx context.Context, nodeClaim *kar
437440 // }), nil
438441}
439442
440- func (c * CloudProvider ) resolveInstanceTypeFromInstance (ctx context.Context , instance * armcompute.VirtualMachine ) (* cloudprovider.InstanceType , error ) {
441- nodePool , err := c .resolveNodePoolFromInstance (ctx , instance )
443+ func (c * CloudProvider ) resolveInstanceTypeFromVMInstance (ctx context.Context , vm * armcompute.VirtualMachine ) (* cloudprovider.InstanceType , error ) {
444+ nodePool , err := c .resolveNodePoolFromVMInstance (ctx , vm )
442445 if err != nil {
443446 // If we can't resolve the provisioner, we fallback to not getting instance type info
444447 return nil , client .IgnoreNotFound (fmt .Errorf ("resolving node pool, %w" , err ))
@@ -449,13 +452,13 @@ func (c *CloudProvider) resolveInstanceTypeFromInstance(ctx context.Context, ins
449452 return nil , client .IgnoreNotFound (fmt .Errorf ("resolving node template, %w" , err ))
450453 }
451454 instanceType , _ := lo .Find (instanceTypes , func (i * cloudprovider.InstanceType ) bool {
452- return i .Name == string (lo .FromPtr (instance .Properties .HardwareProfile .VMSize ))
455+ return i .Name == string (lo .FromPtr (vm .Properties .HardwareProfile .VMSize ))
453456 })
454457 return instanceType , nil
455458}
456459
457- func (c * CloudProvider ) resolveNodePoolFromInstance (ctx context.Context , instance * armcompute.VirtualMachine ) (* karpv1.NodePool , error ) {
458- nodePoolName , ok := instance .Tags [launchtemplate .NodePoolTagKey ]
460+ func (c * CloudProvider ) resolveNodePoolFromVMInstance (ctx context.Context , vm * armcompute.VirtualMachine ) (* karpv1.NodePool , error ) {
461+ nodePoolName , ok := vm .Tags [launchtemplate .NodePoolTagKey ]
459462 if ok && * nodePoolName != "" {
460463 nodePool := & karpv1.NodePool {}
461464 if err := c .kubeClient .Get (ctx , types.NamespacedName {Name : * nodePoolName }, nodePool ); err != nil {
@@ -467,7 +470,7 @@ func (c *CloudProvider) resolveNodePoolFromInstance(ctx context.Context, instanc
467470 return nil , errors .NewNotFound (schema.GroupResource {Group : coreapis .Group , Resource : "nodepools" }, "" )
468471}
469472
470- func (c * CloudProvider ) instanceToNodeClaim (ctx context.Context , vm * armcompute.VirtualMachine , instanceType * cloudprovider.InstanceType ) (* karpv1.NodeClaim , error ) {
473+ func (c * CloudProvider ) vmInstanceToNodeClaim (ctx context.Context , vm * armcompute.VirtualMachine , instanceType * cloudprovider.InstanceType ) (* karpv1.NodeClaim , error ) {
471474 nodeClaim := & karpv1.NodeClaim {}
472475 labels := map [string ]string {}
473476 annotations := map [string ]string {}
@@ -484,28 +487,28 @@ func (c *CloudProvider) instanceToNodeClaim(ctx context.Context, vm *armcompute.
484487 labels [corev1 .LabelTopologyZone ] = zone
485488 }
486489
487- labels [karpv1 .CapacityTypeLabelKey ] = instance .GetCapacityType (vm )
490+ labels [karpv1 .CapacityTypeLabelKey ] = instance .GetCapacityTypeFromVM (vm )
488491
489492 if tag , ok := vm .Tags [launchtemplate .NodePoolTagKey ]; ok {
490493 labels [karpv1 .NodePoolLabelKey ] = * tag
491494 }
492495
493- nodeClaim .Name = GenerateNodeClaimName (* vm .Name )
496+ nodeClaim .Name = GetNodeClaimNameFromVMName (* vm .Name )
494497 nodeClaim .Labels = labels
495498 nodeClaim .Annotations = annotations
496499 nodeClaim .CreationTimestamp = metav1.Time {Time : * vm .Properties .TimeCreated }
497500 // Set the deletionTimestamp to be the current time if the instance is currently terminating
498501 if utils .IsVMDeleting (* vm ) {
499502 nodeClaim .DeletionTimestamp = & metav1.Time {Time : time .Now ()}
500503 }
501- nodeClaim .Status .ProviderID = utils .ResourceIDToProviderID (ctx , * vm .ID )
504+ nodeClaim .Status .ProviderID = utils .VMResourceIDToProviderID (ctx , * vm .ID )
502505 if vm .Properties != nil && vm .Properties .StorageProfile != nil && vm .Properties .StorageProfile .ImageReference != nil {
503506 nodeClaim .Status .ImageID = utils .ImageReferenceToString (vm .Properties .StorageProfile .ImageReference )
504507 }
505508 return nodeClaim , nil
506509}
507510
508- func GenerateNodeClaimName (vmName string ) string {
511+ func GetNodeClaimNameFromVMName (vmName string ) string {
509512 return strings .TrimLeft ("aks-" , vmName )
510513}
511514
0 commit comments