Skip to content

Commit c1acf7a

Browse files
tanjinxbantyKGuptaManan100
authored
Vtorc: Recheck primary health before attempting a failure mitigation (vitessio#18234) (#739)
Signed-off-by: Banty Kumar <[email protected]> Signed-off-by: Manan Gupta <[email protected]> Co-authored-by: Banty Kumar <[email protected]> Co-authored-by: Manan Gupta <[email protected]>
1 parent 6301472 commit c1acf7a

File tree

2 files changed

+152
-0
lines changed

2 files changed

+152
-0
lines changed

go/vt/vtorc/logic/topology_recovery.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,20 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
546546
return err
547547
}
548548

549+
// Prioritise primary recovery.
550+
// If we are performing some other action, first ensure that it is not because of primary issues.
551+
// This step is only meant to improve the time taken to detect and fix cluster wide recoveries, it does not impact correctness.
552+
// If a VTOrc detects an issue on a replica like ReplicationStopped, the underlying cause could be a dead primary instead.
553+
// So, we try to reload that primary's information before proceeding with the replication stopped fix. We do this before acquiring the shard lock
554+
// to allow another VTOrc instance to proceed with the dead primary recovery if it is indeed the case and it detects it before us. If however, the primary
555+
// is not dead, then we will proceed with the fix for the replica. Essentially, we are trading off speed in replica recoveries (by doing an additional primary tablet reload)
556+
// for speed in cluster-wide recoveries (by not holding the shard lock before reloading the primary tablet information).
557+
if !isClusterWideRecovery(checkAndRecoverFunctionCode) {
558+
if err = recheckPrimaryHealth(analysisEntry, DiscoverInstance); err != nil {
559+
return err
560+
}
561+
}
562+
549563
// We lock the shard here and then refresh the tablets information
550564
ctx, unlock, err := LockShard(context.Background(), analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard,
551565
getLockAction(analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis),
@@ -667,6 +681,36 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
667681
return err
668682
}
669683

// recheckPrimaryHealth checks the health of the primary node.
// It then checks whether, given the re-discovered primary health, the original
// recovery is still valid. If it is not valid, the current analysis is aborted
// by returning an error.
//
// Parameters:
//   - analysisEntry: the replication analysis that triggered the (replica-side) recovery.
//   - discoveryFunc: function used to re-discover/refresh the primary tablet's
//     state (DiscoverInstance in production; injectable for tests).
func recheckPrimaryHealth(analysisEntry *inst.ReplicationAnalysis, discoveryFunc func(string, bool)) error {
	originalAnalysisEntry := analysisEntry.Analysis
	primaryTabletAlias := analysisEntry.AnalyzedInstancePrimaryAlias

	// Re-check whether any mitigation is required for the primary node.
	// If the current problem is caused by a dead primary, this call will update
	// the stored analysis information.
	discoveryFunc(primaryTabletAlias, true)

	// Check whether the original analysis is still valid even after the primary refresh.
	// NOTE(review): checkIfAlreadyFixed's boolean is used here to signal that the
	// original analysis no longer holds and a primary-side mitigation is needed —
	// confirm this matches its contract, given the `recoveryRequired` naming.
	recoveryRequired, err := checkIfAlreadyFixed(analysisEntry)
	if err != nil {
		log.Infof("recheckPrimaryHealth: Checking if recovery is required returned err: %v", err)
		return err
	}

	// The original analysis for the tablet has changed.
	// This could mean that either the underlying problem has changed or some other
	// VTOrc instance is already performing the mitigation.
	// In either case, the original analysis is stale and can be safely aborted.
	if recoveryRequired {
		log.Infof("recheckPrimaryHealth: Primary recovery is required, Tablet alias: %v", primaryTabletAlias)
		// original analysis is stale, abort.
		return fmt.Errorf("aborting %s, primary mitigation is required", originalAnalysisEntry)
	}

	return nil
}
670714
// checkIfAlreadyFixed checks whether the problem that the analysis entry represents has already been fixed by another agent or not
671715
func checkIfAlreadyFixed(analysisEntry *inst.ReplicationAnalysis) (bool, error) {
672716
// Run a replication analysis again. We will check if the problem persisted

go/vt/vtorc/logic/topology_recovery_test.go

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,15 @@ import (
2424

2525
"github.com/stretchr/testify/require"
2626

27+
"vitess.io/vitess/go/vt/external/golib/sqlutils"
2728
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
2829
"vitess.io/vitess/go/vt/topo/memorytopo"
2930
"vitess.io/vitess/go/vt/topo/topoproto"
31+
"vitess.io/vitess/go/vt/vtctl/reparentutil/policy"
3032
"vitess.io/vitess/go/vt/vtorc/config"
3133
"vitess.io/vitess/go/vt/vtorc/db"
3234
"vitess.io/vitess/go/vt/vtorc/inst"
35+
"vitess.io/vitess/go/vt/vtorc/test"
3336
_ "vitess.io/vitess/go/vt/vttablet/grpctmclient"
3437
)
3538

@@ -311,3 +314,108 @@ func TestGetCheckAndRecoverFunctionCode(t *testing.T) {
311314
})
312315
}
313316
}
317+
318+
func TestRecheckPrimaryHealth(t *testing.T) {
319+
tests := []struct {
320+
name string
321+
info []*test.InfoForRecoveryAnalysis
322+
wantErr string
323+
}{
324+
{
325+
name: "analysis change",
326+
info: []*test.InfoForRecoveryAnalysis{{
327+
TabletInfo: &topodatapb.Tablet{
328+
Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100},
329+
Hostname: "localhost",
330+
Keyspace: "ks",
331+
Shard: "0",
332+
Type: topodatapb.TabletType_PRIMARY,
333+
MysqlHostname: "localhost",
334+
MysqlPort: 6709,
335+
},
336+
DurabilityPolicy: "none",
337+
LastCheckValid: 0,
338+
CountReplicas: 4,
339+
CountValidReplicas: 4,
340+
CountValidReplicatingReplicas: 0,
341+
}},
342+
wantErr: "aborting ReplicationStopped, primary mitigation is required",
343+
},
344+
{
345+
name: "analysis did not change",
346+
info: []*test.InfoForRecoveryAnalysis{{
347+
TabletInfo: &topodatapb.Tablet{
348+
Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101},
349+
Hostname: "localhost",
350+
Keyspace: "ks",
351+
Shard: "0",
352+
Type: topodatapb.TabletType_PRIMARY,
353+
MysqlHostname: "localhost",
354+
MysqlPort: 6708,
355+
},
356+
DurabilityPolicy: policy.DurabilityNone,
357+
LastCheckValid: 1,
358+
CountReplicas: 4,
359+
CountValidReplicas: 4,
360+
CountValidReplicatingReplicas: 3,
361+
CountValidOracleGTIDReplicas: 4,
362+
CountLoggingReplicas: 2,
363+
IsPrimary: 1,
364+
CurrentTabletType: int(topodatapb.TabletType_PRIMARY),
365+
}, {
366+
TabletInfo: &topodatapb.Tablet{
367+
Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100},
368+
Hostname: "localhost",
369+
Keyspace: "ks",
370+
Shard: "0",
371+
Type: topodatapb.TabletType_REPLICA,
372+
MysqlHostname: "localhost",
373+
MysqlPort: 6709,
374+
},
375+
DurabilityPolicy: policy.DurabilityNone,
376+
PrimaryTabletInfo: &topodatapb.Tablet{
377+
Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101},
378+
},
379+
LastCheckValid: 1,
380+
ReadOnly: 1,
381+
ReplicationStopped: 1,
382+
}},
383+
},
384+
}
385+
386+
for _, tt := range tests {
387+
t.Run(tt.name, func(t *testing.T) {
388+
// reset vtorc db after every test
389+
oldDB := db.Db
390+
defer func() {
391+
db.Db = oldDB
392+
}()
393+
394+
var rowMaps []sqlutils.RowMap
395+
for _, analysis := range tt.info {
396+
analysis.SetValuesFromTabletInfo()
397+
rowMaps = append(rowMaps, analysis.ConvertToRowMap())
398+
}
399+
400+
// set replication analysis in Vtorc DB.
401+
db.Db = test.NewTestDB([][]sqlutils.RowMap{rowMaps})
402+
403+
err := recheckPrimaryHealth(&inst.ReplicationAnalysis{
404+
AnalyzedInstanceAlias: "zon1-0000000100",
405+
Analysis: inst.ReplicationStopped,
406+
AnalyzedKeyspace: "ks",
407+
AnalyzedShard: "0",
408+
}, func(s string, b bool) {
409+
// the implementation for DiscoverInstance is not required because we are mocking the db response.
410+
})
411+
412+
if tt.wantErr != "" {
413+
require.EqualError(t, err, tt.wantErr)
414+
return
415+
}
416+
417+
require.NoError(t, err)
418+
})
419+
}
420+
421+
}

0 commit comments

Comments
 (0)