Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 43 additions & 7 deletions go/vt/vttablet/tabletmanager/tm_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,14 @@ const (

var (
// The following flags initialize the tablet record.
tabletHostname string
initKeyspace string
initShard string
initTabletType string
initDbNameOverride string
skipBuildInfoTags = "/.*/"
initTags flagutil.StringMapValue
tabletHostname string
initKeyspace string
initShard string
initTabletType string
initTabletTypeLookup bool
initDbNameOverride string
skipBuildInfoTags = "/.*/"
initTags flagutil.StringMapValue

initTimeout = 1 * time.Minute
mysqlShutdownTimeout = mysqlctl.DefaultShutdownTimeout
Expand All @@ -106,6 +107,7 @@ func registerInitFlags(fs *pflag.FlagSet) {
utils.SetFlagStringVar(fs, &initKeyspace, "init-keyspace", initKeyspace, "(init parameter) keyspace to use for this tablet")
utils.SetFlagStringVar(fs, &initShard, "init-shard", initShard, "(init parameter) shard to use for this tablet")
utils.SetFlagStringVar(fs, &initTabletType, "init-tablet-type", initTabletType, "(init parameter) tablet type to use for this tablet. Valid values are: PRIMARY, REPLICA, SPARE, and RDONLY. The default is REPLICA.")
fs.BoolVar(&initTabletTypeLookup, "init-tablet-type-lookup", initTabletTypeLookup, "(optional, init parameter) if enabled, look up the tablet type from the existing topology record on restart and use that instead of init-tablet-type. This allows tablets to maintain their changed roles (e.g., RDONLY/DRAINED) across restarts. If disabled or if no topology record exists, init-tablet-type will be used.")
utils.SetFlagStringVar(fs, &initDbNameOverride, "init-db-name-override", initDbNameOverride, "(init parameter) override the name of the db used by vttablet. Without this flag, the db name defaults to vt_<keyspacename>")
utils.SetFlagStringVar(fs, &skipBuildInfoTags, "vttablet-skip-buildinfo-tags", skipBuildInfoTags, "comma-separated list of buildinfo tags to skip from merging with --init-tags. each tag is either an exact match or a regular expression of the form '/regexp/'.")
utils.SetFlagVar(fs, &initTags, "init-tags", "(init parameter) comma separated list of key:value pairs used to tag the tablet")
Expand Down Expand Up @@ -373,6 +375,40 @@ func (tm *TabletManager) Start(tablet *topodatapb.Tablet, config *tabletenv.Tabl
tm.DBConfigs.DBName = topoproto.TabletDbName(tablet)
tm.tabletAlias = tablet.Alias
tm.tmc = tmclient.NewTabletManagerClient()

// Check if there's an existing tablet record in topology and use it if flag is enabled
if initTabletTypeLookup {
ctx, cancel := context.WithTimeout(tm.BatchCtx, initTimeout)
defer cancel()
existingTablet, err := tm.TopoServer.GetTablet(ctx, tablet.Alias)
if err != nil && !topo.IsErrType(err, topo.NoNode) {
// Error other than "node doesn't exist" - return it
return vterrors.Wrap(err, "failed to check for existing tablet record")
}

// If we found an existing tablet record, use its tablet type instead of the initial one
if err == nil {
// Skip transient operational types (BACKUP, RESTORE)
// These are temporary states that should not be preserved across restarts
if existingTablet.Type == topodatapb.TabletType_BACKUP || existingTablet.Type == topodatapb.TabletType_RESTORE {
log.Infof("Found existing tablet record with transient type %v, using init-tablet-type %v instead",
existingTablet.Type, tablet.Type)
} else {
log.Infof("Found existing tablet record with --init-tablet-type-lookup enabled, using tablet type %v from topology instead of init-tablet-type %v",
existingTablet.Type, tablet.Type)
tablet.Type = existingTablet.Type
// If it was a PRIMARY, preserve the start time
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to preserve tablet type PRIMARY? Imagine this:

  • vttablet is running as Primary.
  • vttablet crashes.
  • vtorc fails-over.
  • vttablet gets restarted by systemd.
  • vttablet restores its tablet type to Primary, but there is already a primary set by vtorc.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had initially thought that checkPrimaryShip() would handle any conflicts, but it seems it wouldn't revert the tablet back to REPLICA.

Added special handling so that a type previously detected as PRIMARY is set to REPLICA, allowing checkPrimaryShip() to set it back to PRIMARY if appropriate.
f2eb9b8

if existingTablet.Type == topodatapb.TabletType_PRIMARY {
tablet.PrimaryTermStartTime = existingTablet.PrimaryTermStartTime
}
}
} else {
log.Infof("No existing tablet record found, using init-tablet-type: %v", tablet.Type)
}
} else {
log.Infof("Using init-tablet-type %v (--init-tablet-type-lookup is not enabled)", tablet.Type)
}

tm.tmState = newTMState(tm, tablet)
tm.actionSema = semaphore.NewWeighted(1)
tm._waitForGrantsComplete = make(chan struct{})
Expand Down
322 changes: 322 additions & 0 deletions go/vt/vttablet/tabletmanager/tm_init_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -961,3 +961,325 @@ func grantAllPrivilegesToUser(t *testing.T, connParams mysql.ConnParams, testUse
require.NoError(t, err)
conn.Close()
}

// TestInitTabletTypeLookup_PreservesRDONLY verifies that, with
// initTabletTypeLookup enabled, restarting a tablet whose topology record was
// changed to RDONLY leaves the record as RDONLY.
func TestInitTabletTypeLookup_PreservesRDONLY(t *testing.T) {
	// Put back the package-level settings this test mutates.
	prevLookup, prevRetry := initTabletTypeLookup, rebuildKeyspaceRetryInterval
	defer func() {
		rebuildKeyspaceRetryInterval = prevRetry
		initTabletTypeLookup = prevLookup
	}()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	const cell = "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// Phase 1: first startup with the lookup flag off; the topo record is REPLICA.
	initTabletTypeLookup = false
	manager := newTestTM(t, ts, 1, "ks", "0", nil)
	tablet := manager.Tablet()
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")
	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	manager.Stop()

	// Phase 2: rewrite the topo record to RDONLY, as an operator would.
	_, err = ts.UpdateTabletFields(ctx, tabletAlias, func(rec *topodatapb.Tablet) error {
		rec.Type = topodatapb.TabletType_RDONLY
		return nil
	})
	require.NoError(t, err)

	// Phase 3: start again with the lookup flag on; RDONLY must still be recorded.
	initTabletTypeLookup = true
	require.NoError(t, manager.Start(tablet, nil))
	record, err = ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_RDONLY, record.Type)
	manager.Stop()
}

// TestInitTabletTypeLookup_PreservesPrimaryWithTermTime verifies that, with
// initTabletTypeLookup enabled, a topology record of type PRIMARY and its
// primary term start time are both still present after the tablet is restarted.
func TestInitTabletTypeLookup_PreservesPrimaryWithTermTime(t *testing.T) {
	// Put back the package-level settings this test mutates.
	prevLookup, prevRetry := initTabletTypeLookup, rebuildKeyspaceRetryInterval
	defer func() {
		rebuildKeyspaceRetryInterval = prevRetry
		initTabletTypeLookup = prevLookup
	}()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	const cell = "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// Phase 1: first startup with the lookup flag off; the topo record is REPLICA.
	initTabletTypeLookup = false
	manager := newTestTM(t, ts, 1, "ks", "0", nil)
	tablet := manager.Tablet()
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")
	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	manager.Stop()

	// Phase 2: mark the topo record as PRIMARY with a known term start time.
	termStart := time.Now()
	_, err = ts.UpdateTabletFields(ctx, tabletAlias, func(rec *topodatapb.Tablet) error {
		rec.Type = topodatapb.TabletType_PRIMARY
		rec.PrimaryTermStartTime = protoutil.TimeToProto(termStart)
		return nil
	})
	require.NoError(t, err)

	// Phase 3: start again with the lookup flag on; type and term start time
	// (compared at second granularity) must both be retained.
	initTabletTypeLookup = true
	require.NoError(t, manager.Start(tablet, nil))
	record, err = ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_PRIMARY, record.Type)
	assert.Equal(t, termStart.Unix(), record.GetPrimaryTermStartTime().Unix())
	manager.Stop()
}

// TestInitTabletTypeLookup_FallbackWhenNoTopoRecord verifies that enabling
// initTabletTypeLookup on a fresh topology (no prior tablet record) still
// yields a REPLICA record.
func TestInitTabletTypeLookup_FallbackWhenNoTopoRecord(t *testing.T) {
	// Put back the package-level settings this test mutates.
	prevLookup, prevRetry := initTabletTypeLookup, rebuildKeyspaceRetryInterval
	defer func() {
		rebuildKeyspaceRetryInterval = prevRetry
		initTabletTypeLookup = prevLookup
	}()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	const cell = "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// The flag is on, but the topo server was just created and holds no
	// record for this alias yet.
	initTabletTypeLookup = true
	manager := newTestTM(t, ts, 1, "ks", "0", nil)
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")

	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	// With nothing to look up, the init type (REPLICA) is what gets recorded.
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	manager.Stop()
}

// TestInitTabletTypeLookup_DisabledUsesInitType verifies that, with
// initTabletTypeLookup disabled, a restart writes REPLICA back to the topology
// record even though the record had been changed to RDONLY.
func TestInitTabletTypeLookup_DisabledUsesInitType(t *testing.T) {
	// Put back the package-level settings this test mutates.
	prevLookup, prevRetry := initTabletTypeLookup, rebuildKeyspaceRetryInterval
	defer func() {
		rebuildKeyspaceRetryInterval = prevRetry
		initTabletTypeLookup = prevLookup
	}()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	const cell = "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// Phase 1: first startup with the lookup flag off; the topo record is REPLICA.
	initTabletTypeLookup = false
	manager := newTestTM(t, ts, 1, "ks", "0", nil)
	tablet := manager.Tablet()
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")
	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	manager.Stop()

	// Phase 2: rewrite the topo record to RDONLY, as an operator would.
	_, err = ts.UpdateTabletFields(ctx, tabletAlias, func(rec *topodatapb.Tablet) error {
		rec.Type = topodatapb.TabletType_RDONLY
		return nil
	})
	require.NoError(t, err)

	// Phase 3: start again with the flag still off.
	initTabletTypeLookup = false
	require.NoError(t, manager.Start(tablet, nil))
	record, err = ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	// The RDONLY value in topo is expected to be replaced by REPLICA.
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	manager.Stop()
}

// TestInitTabletTypeLookup_SkipsTransientBackupType verifies that, with
// initTabletTypeLookup enabled, a topology record left in the BACKUP type is
// not carried over on restart; the record ends up as REPLICA.
func TestInitTabletTypeLookup_SkipsTransientBackupType(t *testing.T) {
	// Put back the package-level settings this test mutates.
	prevLookup, prevRetry := initTabletTypeLookup, rebuildKeyspaceRetryInterval
	defer func() {
		rebuildKeyspaceRetryInterval = prevRetry
		initTabletTypeLookup = prevLookup
	}()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	const cell = "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// Phase 1: first startup with the lookup flag off; the topo record is REPLICA.
	initTabletTypeLookup = false
	manager := newTestTM(t, ts, 1, "ks", "0", nil)
	tablet := manager.Tablet()
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")
	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	manager.Stop()

	// Phase 2: leave BACKUP in the topo record, mimicking a crash mid-backup.
	_, err = ts.UpdateTabletFields(ctx, tabletAlias, func(rec *topodatapb.Tablet) error {
		rec.Type = topodatapb.TabletType_BACKUP
		return nil
	})
	require.NoError(t, err)

	// Phase 3: start again with the lookup flag on.
	initTabletTypeLookup = true
	require.NoError(t, manager.Start(tablet, nil))
	record, err = ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	// BACKUP must not survive the restart; the init type (REPLICA) is expected.
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	manager.Stop()
}

// TestInitTabletTypeLookup_SkipsTransientRestoreType verifies that, with
// initTabletTypeLookup enabled, a topology record left in the RESTORE type is
// not carried over on restart; the record ends up as REPLICA.
func TestInitTabletTypeLookup_SkipsTransientRestoreType(t *testing.T) {
	// Put back the package-level settings this test mutates.
	prevLookup, prevRetry := initTabletTypeLookup, rebuildKeyspaceRetryInterval
	defer func() {
		rebuildKeyspaceRetryInterval = prevRetry
		initTabletTypeLookup = prevLookup
	}()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	const cell = "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// Phase 1: first startup with the lookup flag off; the topo record is REPLICA.
	initTabletTypeLookup = false
	manager := newTestTM(t, ts, 1, "ks", "0", nil)
	tablet := manager.Tablet()
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")
	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	manager.Stop()

	// Phase 2: leave RESTORE in the topo record, mimicking a crash mid-restore.
	_, err = ts.UpdateTabletFields(ctx, tabletAlias, func(rec *topodatapb.Tablet) error {
		rec.Type = topodatapb.TabletType_RESTORE
		return nil
	})
	require.NoError(t, err)

	// Phase 3: start again with the lookup flag on.
	initTabletTypeLookup = true
	require.NoError(t, manager.Start(tablet, nil))
	record, err = ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	// RESTORE must not survive the restart; the init type (REPLICA) is expected.
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	manager.Stop()
}

// TestInitTabletTypeLookup_PreservesDrained verifies that, with
// initTabletTypeLookup enabled, restarting a tablet whose topology record was
// changed to DRAINED leaves the record as DRAINED.
func TestInitTabletTypeLookup_PreservesDrained(t *testing.T) {
	// Put back the package-level settings this test mutates.
	prevLookup, prevRetry := initTabletTypeLookup, rebuildKeyspaceRetryInterval
	defer func() {
		rebuildKeyspaceRetryInterval = prevRetry
		initTabletTypeLookup = prevLookup
	}()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	const cell = "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// Phase 1: first startup with the lookup flag off; the topo record is REPLICA.
	initTabletTypeLookup = false
	manager := newTestTM(t, ts, 1, "ks", "0", nil)
	tablet := manager.Tablet()
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")
	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	manager.Stop()

	// Phase 2: rewrite the topo record to DRAINED, as an admin would for maintenance.
	_, err = ts.UpdateTabletFields(ctx, tabletAlias, func(rec *topodatapb.Tablet) error {
		rec.Type = topodatapb.TabletType_DRAINED
		return nil
	})
	require.NoError(t, err)

	// Phase 3: start again with the lookup flag on.
	initTabletTypeLookup = true
	require.NoError(t, manager.Start(tablet, nil))
	record, err = ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	// The DRAINED value from topology is expected to be kept.
	assert.Equal(t, topodatapb.TabletType_DRAINED, record.Type)
	manager.Stop()
}

// TestInitTabletTypeLookup_InteractionWithCheckPrimaryShip verifies that, with
// initTabletTypeLookup enabled, a tablet whose alias is recorded as the shard's
// primary ends up with a PRIMARY topology record after restart.
func TestInitTabletTypeLookup_InteractionWithCheckPrimaryShip(t *testing.T) {
	// Put back the package-level settings this test mutates.
	prevLookup, prevRetry := initTabletTypeLookup, rebuildKeyspaceRetryInterval
	defer func() {
		rebuildKeyspaceRetryInterval = prevRetry
		initTabletTypeLookup = prevLookup
	}()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	const cell = "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// Phase 1: first startup with the lookup flag off; the topo record is REPLICA.
	initTabletTypeLookup = false
	manager := newTestTM(t, ts, 1, "ks", "0", nil)
	tablet := manager.Tablet()
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")
	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	manager.Stop()

	// Phase 2: point the shard record's primary at this tablet.
	termStart := time.Now()
	_, err = ts.UpdateShardFields(ctx, "ks", "0", func(shard *topo.ShardInfo) error {
		shard.PrimaryAlias = tabletAlias
		shard.PrimaryTermStartTime = protoutil.TimeToProto(termStart)
		return nil
	})
	require.NoError(t, err)

	// Phase 3: start again with the lookup flag on.
	initTabletTypeLookup = true
	require.NoError(t, manager.Start(tablet, nil))
	record, err = ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	// The shard record names this tablet as primary, so PRIMARY is expected
	// (attributed to the checkPrimaryShip logic).
	assert.Equal(t, topodatapb.TabletType_PRIMARY, record.Type)
	manager.Stop()
}