Skip to content

Commit ca23f0b

Browse files
authored
Merge pull request #182 from guardian/an/rotate-between-asgs
enable rotating nodes into new ASGs
2 parents 150c5f2 + 00c7b7a commit ca23f0b

File tree

11 files changed

+534
-25
lines changed

11 files changed

+534
-25
lines changed

README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,25 @@ in this project. The frequency of node rotations is passed into the template as
2828
### Running Manually
2929
Sometimes it's useful to rotate an ES node manually (e.g. during an ES upgrade). To do this, you can optionally pass a `targetInstanceId` in the step function input object. It's usually easiest to open an existing execution, click `New Execution`, and then edit the input object.
3030

31+
### Rotating nodes into a new ASG
32+
33+
Very occasionally, it is necessary to migrate a cluster into a new Auto Scaling Group (ASG). To do this with the node rotation step function:
34+
35+
1. Follow the setup steps above.
36+
1. Create the new ASG with DesiredCapacity set to 0.
37+
1. Set the MinimumCapacity of the old ASG to 0.
38+
1. Tag the new ASG with `gu:riffraff:new-asg = True`. (This is the tag that is already used by riff-raff for identifying the newer ASG during migrations.)
39+
1. Run as normal, either manually or letting the schedule rotate the instances.
40+
41+
The step function will detect the new ASG and launch new instances in it, while removing nodes from the old ASG.
42+
43+
> [!WARNING]
44+
> This feature has been developed and tested for Elasticsearch clusters which exist in a single ASG, and where the "new" ASG can be
45+
> matched to the "old" one using Stage/Stack/App tags. If your use case doesn't match this, you'll likely need to do some more testing
46+
> and possibly improve this feature. If you're at all unsure, get in touch in the Elasticsearch chat space and we can figure out
47+
> any potential issues together.
48+
49+
3150
## Implementation
3251

3352
This Step Function triggers a number of TypeScript lambdas, which coordinate the process of replacing a node by:

cloudformation.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ Resources:
114114
Action:
115115
- autoscaling:DetachInstances
116116
- autoscaling:AttachInstances
117+
- autoscaling:SetDesiredCapacity
117118
- autoscaling:TerminateInstanceInAutoScalingGroup
118119
Resource:
119120
- !Sub arn:aws:autoscaling:${AWS::Region}:${AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/*

src/addNode.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import {detachInstance} from './aws/autoscaling';
1+
import {launchNewInstance} from './aws/autoscaling';
22
import {AddNodeResponse, ClusterStatusResponse} from './utils/handlerInputs';
33
import {Elasticsearch} from './elasticsearch/elasticsearch';
44
import {Instance} from './aws/types';
@@ -7,17 +7,17 @@ import {ElasticsearchClusterStatus} from './elasticsearch/types';
77
export async function handler(event: ClusterStatusResponse): Promise<AddNodeResponse> {
88

99
const targetInstance: Instance = event.targetElasticSearchNode.ec2Instance;
10-
const asg: string = event.asgName;
10+
const asg: string = event.destinationAsgName;
1111
const elasticsearchClient = new Elasticsearch(targetInstance.id)
1212

1313
return new Promise<AddNodeResponse>((resolve, reject) => {
1414

1515
elasticsearchClient.updateRebalancingStatus("none")
16-
.then(() => detachInstance(targetInstance, asg))
16+
.then(() => launchNewInstance(targetInstance, asg))
1717
.then(() => elasticsearchClient.getClusterHealth())
1818
.then((clusterStatus: ElasticsearchClusterStatus) => {
1919
const response: AddNodeResponse = {
20-
"asgName": asg,
20+
"destinationAsgName": asg,
2121
"targetElasticSearchNode": event.targetElasticSearchNode,
2222
"expectedClusterSize": clusterStatus.number_of_nodes + 1
2323
};

src/autoScalingGroupCheck.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import {getASG} from "./aws/autoscaling";
33

44
export async function handler(event: AsgInput): Promise<AsgInput> {
55
try {
6-
const asg = await getASG(event.asgName)
6+
const asg = await getASG(event.destinationAsgName)
77

88
if (asg.MaxSize <= asg.Instances.length) {
99
const error = `ASG MaxSize must be greater than Desired Capacity to allow for ReattachTargetInstance step.`

src/aws/autoscaling.ts

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,50 @@ import {
55
AutoScaling, AutoScalingGroup, DescribeAutoScalingGroupsCommand, DescribeAutoScalingGroupsCommandOutput,
66
DetachInstancesCommand,
77
DetachInstancesCommandOutput,
8+
SetDesiredCapacityCommand,
89
TerminateInstanceInAutoScalingGroupCommand,
910
TerminateInstanceInAutoScalingGroupCommandOutput
1011
} from "@aws-sdk/client-auto-scaling";
1112

1213
const awsAutoscaling = new AutoScaling();
1314

14-
export function detachInstance(instance: Instance, asgName: string): Promise<DetachInstancesCommandOutput> {
15-
console.log(`Detaching ${instance.id} from ${asgName}. This should also bring a new instance into the ASG`);
16-
const params = {
17-
InstanceIds: [ instance.id ],
18-
AutoScalingGroupName: asgName,
19-
ShouldDecrementDesiredCapacity: false
20-
};
21-
const req = new DetachInstancesCommand(params);
15+
export async function launchNewInstance(instance: Instance, asgName: string): Promise<DetachInstancesCommandOutput> {
16+
if (instance.autoScalingGroupName === asgName) {
17+
console.log(`Detaching ${instance.id} from ${asgName}. This should also bring a new instance into the ASG`);
18+
const params = {
19+
InstanceIds: [ instance.id ],
20+
AutoScalingGroupName: asgName,
21+
ShouldDecrementDesiredCapacity: false
22+
};
23+
const req = new DetachInstancesCommand(params);
2224

23-
return retry(() => awsAutoscaling.send(req), `detaching instance ${instance.id}`, 5)
25+
return retry(() => awsAutoscaling.send(req), `detaching instance ${instance.id}`, 5)
26+
} else {
27+
console.log(`Launch new instance to new ASG ${asgName}.`);
28+
const asgs = await retry(
29+
() => awsAutoscaling.send(new DescribeAutoScalingGroupsCommand({
30+
AutoScalingGroupNames: [asgName]
31+
})),
32+
`getting current capacity in ${asgName}`,
33+
5,
34+
);
35+
if (!asgs.AutoScalingGroups || asgs.AutoScalingGroups.length === 0) {
36+
throw new Error(`No AutoScalingGroup found with name ${asgName}`);
37+
}
38+
const capacity = asgs.AutoScalingGroups[0].DesiredCapacity;
39+
if (typeof capacity !== 'number' || isNaN(capacity)) {
40+
throw new Error(`DesiredCapacity is not defined or not a number for ASG ${asgName}`);
41+
}
42+
43+
return retry(
44+
() => awsAutoscaling.send(new SetDesiredCapacityCommand({
45+
AutoScalingGroupName: asgName,
46+
DesiredCapacity: capacity + 1,
47+
})),
48+
`launching new instance in ${asgName}`,
49+
5,
50+
);
51+
}
2452
}
2553

2654
export function attachInstance(instance: Instance, asgName: string): Promise<{}> {

src/clusterSizeCheck.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import {getASG} from "./aws/autoscaling";
77

88
export async function handler(event: AddNodeResponse): Promise<TargetAndNewNodeResponse> {
99

10-
const asg = await getASG(event.asgName)
10+
const asg = await getASG(event.destinationAsgName)
1111
const instanceIds = asg.Instances.map(i => i.InstanceId)
1212
const newestInstance = await getSpecificInstance(instanceIds, findNewestInstance)
1313
const elasticsearchClient = new Elasticsearch(event.targetElasticSearchNode.ec2Instance.id)
@@ -27,7 +27,7 @@ export async function handler(event: AddNodeResponse): Promise<TargetAndNewNodeR
2727
})
2828
.then( (newestElasticsearchNode: ElasticsearchNode) => {
2929
const response: TargetAndNewNodeResponse = {
30-
"asgName": event.asgName,
30+
"destinationAsgName": event.destinationAsgName,
3131
"targetElasticSearchNode": event.targetElasticSearchNode,
3232
"newestElasticsearchNode": newestElasticsearchNode
3333
};

src/elasticsearch/types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ export class ElasticsearchNode {
1212
nodeId: string;
1313
isMasterEligible: boolean;
1414

15-
constructor(instance: Instance, nodeId: string, isMasterEligible) {
15+
constructor(instance: Instance, nodeId: string, isMasterEligible: boolean) {
1616
this.ec2Instance = instance;
1717
this.nodeId = nodeId;
1818
this.isMasterEligible = isMasterEligible

src/getTargetNode.ts

Lines changed: 52 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,48 @@ import {getInstancesByTag} from './aws/ec2Instances';
44
import {getASGsByTag} from "./aws/autoscaling";
55
import {Elasticsearch} from "./elasticsearch/elasticsearch";
66
import {Instance} from "./aws/types";
7+
import { type AutoScalingGroup } from '@aws-sdk/client-auto-scaling';
8+
9+
function asgTagsToRecord(asg: AutoScalingGroup): Record<string, string> {
10+
return Object.fromEntries(
11+
(asg.Tags ?? [])
12+
.filter(tag => tag.Key !== undefined && tag.Value !== undefined)
13+
.map(tag => [tag.Key!, tag.Value!])
14+
);
15+
}
16+
17+
/** attempt to find an ASG with the same tagging as the target instance's,
18+
* but identifies itself as a 'new' ASG (using the 'gu:riffraff:new-asg'
19+
* tag) so should be the destination of the node rotation.
20+
* Only respects the Stage/Stack/App tags, when defined on the target
21+
* instance's ASG.
22+
*/
23+
function findNewAsgMatchingInstanceAsg(
24+
eligibleASGs: AutoScalingGroup[],
25+
targetInstance: Instance,
26+
): AutoScalingGroup | undefined {
27+
28+
const targetInstanceAsg = eligibleASGs
29+
.find(a => a.AutoScalingGroupName === targetInstance.autoScalingGroupName);
30+
31+
if (!targetInstanceAsg) {
32+
throw new Error(`Couldn't find target instance's ASG (${targetInstance.autoScalingGroupName}) in the list of eligible ASGs - that shouldn't happen!`);
33+
}
34+
35+
const targetAsgTags = asgTagsToRecord(targetInstanceAsg);
36+
const expectedTags = ['App', 'Stack', 'Stage'].filter(t => t in targetAsgTags);
37+
38+
const newAsg = eligibleASGs.find(a => {
39+
const asgTags = asgTagsToRecord(a);
40+
41+
return a.AutoScalingGroupName !== targetInstanceAsg.AutoScalingGroupName
42+
&& expectedTags.every(t => asgTags[t] === targetAsgTags[t])
43+
&& 'gu:riffraff:new-asg' in asgTags;
44+
});
45+
46+
return newAsg;
47+
48+
}
749

850
export async function handler(event: StateMachineInput): Promise<AsgDiscoveryResponse> {
951
const runningExecutionsPromise = totalRunningExecutions(event.stepFunctionArn)
@@ -14,14 +56,12 @@ export async function handler(event: StateMachineInput): Promise<AsgDiscoveryRes
1456
return { skipRotation: true };
1557
}
1658

17-
const eligibleASGs = (
18-
await getASGsByTag(event.autoScalingGroupDiscoveryTagKey, "true")
19-
).map(asg => asg.AutoScalingGroupName);
59+
const eligibleASGs = await getASGsByTag(event.autoScalingGroupDiscoveryTagKey, "true");
2060

2161
const eligibleInstances = (
2262
// TODO it would be nice to not need the Tags on the instances as well, but currently used in the ElasticsearchAdminSsmPolicy IAM policy in cloudformation.yaml
2363
await getInstancesByTag(event.autoScalingGroupDiscoveryTagKey, "true")
24-
).filter(i => eligibleASGs.includes(i.autoScalingGroupName));
64+
).filter(i => eligibleASGs.some(a => a.AutoScalingGroupName === i.autoScalingGroupName));
2565

2666
// We can manually run rotation against a particular instance if needed
2767
if(event.targetInstanceId) {
@@ -38,7 +78,11 @@ export async function handler(event: StateMachineInput): Promise<AsgDiscoveryRes
3878
const elasticsearchClient = new Elasticsearch(targetInstanceId);
3979
const targetElasticSearchNode = await elasticsearchClient.getElasticsearchNode(targetInstance);
4080
console.log(`Instance ${targetInstanceId} (ASG: ${asgName}) specified as input. Moving on...`);
41-
return { asgName, targetElasticSearchNode, skipRotation: false };
81+
82+
const maybeNewAsg = findNewAsgMatchingInstanceAsg(eligibleASGs, targetInstance);
83+
84+
const destinationAsgName = maybeNewAsg?.AutoScalingGroupName ?? asgName;
85+
return { destinationAsgName, targetElasticSearchNode, skipRotation: false };
4286
}
4387

4488
console.log(`Found ${eligibleInstances.length} instances with tag ${event.autoScalingGroupDiscoveryTagKey}`);
@@ -58,8 +102,10 @@ export async function handler(event: StateMachineInput): Promise<AsgDiscoveryRes
58102
const elasticsearchClient = new Elasticsearch(oldestInstance.id);
59103
const targetElasticSearchNode = await elasticsearchClient.getElasticsearchNode(oldestInstance);
60104
console.log(`Triggering rotation of oldest instance ${oldestInstance.id} (ASG: ${oldestInstance.autoScalingGroupName})`);
105+
106+
const maybeNewAsg = findNewAsgMatchingInstanceAsg(eligibleASGs, oldestInstance);
61107
return {
62-
asgName: oldestInstance.autoScalingGroupName,
108+
destinationAsgName: maybeNewAsg?.AutoScalingGroupName ?? oldestInstance.autoScalingGroupName,
63109
targetElasticSearchNode,
64110
skipRotation: false
65111
}

src/reattachTargetInstance.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,12 @@ import {Instance} from './aws/types';
55
export async function handler(event: TargetAndNewNodeResponse): Promise<TargetAndNewNodeResponse> {
66

77
const targetInstance: Instance = event.targetElasticSearchNode.ec2Instance;
8-
const asg: string = event.asgName;
8+
const asg: string = event.destinationAsgName;
9+
10+
if (targetInstance.autoScalingGroupName !== asg) {
11+
console.log(`New instance launched in different ASG than target instance, so nothing to reattach`);
12+
return event;
13+
}
914

1015
return new Promise<TargetAndNewNodeResponse>((resolve, reject) => {
1116
attachInstance(targetInstance, asg)

src/utils/handlerInputs.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ export interface StateMachineInput {
88
}
99

1010
export interface AsgInput {
11-
asgName: string;
11+
// The ASG a node will be rotated _into_. In certain circumstances this may
12+
// not be the same as the ASG the node was rotated _out of_, which is stored at
13+
// targetElasticSearchNode.ec2Instance.autoScalingGroupName.
14+
destinationAsgName: string;
1215
targetElasticSearchNode: ElasticsearchNode;
1316
}
1417

0 commit comments

Comments
 (0)