Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion go/flags/endtoend/vtcombo.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ Flags:
--init-db-name-override string (init parameter) override the name of the db used by vttablet. Without this flag, the db name defaults to vt_<keyspacename>
--init-keyspace string (init parameter) keyspace to use for this tablet
--init-shard string (init parameter) shard to use for this tablet
--init-tablet-type string (init parameter) tablet type to use for this tablet. Valid values are: PRIMARY, REPLICA, SPARE, and RDONLY. The default is REPLICA.
--init-tablet-type string (init parameter) tablet type to use for this tablet. Valid values are: REPLICA, RDONLY, and SPARE. The default is REPLICA.
--init-tablet-type-lookup (Experimental, init parameter) if enabled, uses tablet alias to look up the tablet type from the existing topology record on restart and use that instead of init-tablet-type. This allows tablets to maintain their changed roles (e.g., RDONLY/DRAINED) across restarts. If disabled or if no topology record exists, init-tablet-type will be used.
--init-tags StringMap (init parameter) comma separated list of key:value pairs used to tag the tablet
--init-timeout duration (init parameter) timeout to use for the init phase. (default 1m0s)
--jaeger-agent-host string host and port to send spans to. if empty, no tracing will be done
Expand Down
3 changes: 2 additions & 1 deletion go/flags/endtoend/vttablet.txt
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ Flags:
--init-db-name-override string (init parameter) override the name of the db used by vttablet. Without this flag, the db name defaults to vt_<keyspacename>
--init-keyspace string (init parameter) keyspace to use for this tablet
--init-shard string (init parameter) shard to use for this tablet
--init-tablet-type string (init parameter) tablet type to use for this tablet. Valid values are: PRIMARY, REPLICA, SPARE, and RDONLY. The default is REPLICA.
--init-tablet-type string (init parameter) tablet type to use for this tablet. Valid values are: REPLICA, RDONLY, and SPARE. The default is REPLICA.
--init-tablet-type-lookup (Experimental, init parameter) if enabled, uses tablet alias to look up the tablet type from the existing topology record on restart and use that instead of init-tablet-type. This allows tablets to maintain their changed roles (e.g., RDONLY/DRAINED) across restarts. If disabled or if no topology record exists, init-tablet-type will be used.
--init-tags StringMap (init parameter) comma separated list of key:value pairs used to tag the tablet
--init-timeout duration (init parameter) timeout to use for the init phase. (default 1m0s)
--jaeger-agent-host string host and port to send spans to. if empty, no tracing will be done
Expand Down
54 changes: 46 additions & 8 deletions go/vt/vttablet/tabletmanager/tm_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,14 @@ const (

var (
// The following flags initialize the tablet record.
tabletHostname string
initKeyspace string
initShard string
initTabletType string
initDbNameOverride string
skipBuildInfoTags = "/.*/"
initTags flagutil.StringMapValue
tabletHostname string
initKeyspace string
initShard string
initTabletType string
initTabletTypeLookup bool
initDbNameOverride string
skipBuildInfoTags = "/.*/"
initTags flagutil.StringMapValue

initTimeout = 1 * time.Minute
mysqlShutdownTimeout = mysqlctl.DefaultShutdownTimeout
Expand All @@ -105,7 +106,8 @@ func registerInitFlags(fs *pflag.FlagSet) {
utils.SetFlagStringVar(fs, &tabletHostname, "tablet-hostname", tabletHostname, "if not empty, this hostname will be assumed instead of trying to resolve it")
utils.SetFlagStringVar(fs, &initKeyspace, "init-keyspace", initKeyspace, "(init parameter) keyspace to use for this tablet")
utils.SetFlagStringVar(fs, &initShard, "init-shard", initShard, "(init parameter) shard to use for this tablet")
utils.SetFlagStringVar(fs, &initTabletType, "init-tablet-type", initTabletType, "(init parameter) tablet type to use for this tablet. Valid values are: PRIMARY, REPLICA, SPARE, and RDONLY. The default is REPLICA.")
utils.SetFlagStringVar(fs, &initTabletType, "init-tablet-type", initTabletType, "(init parameter) tablet type to use for this tablet. Valid values are: REPLICA, RDONLY, and SPARE. The default is REPLICA.")
fs.BoolVar(&initTabletTypeLookup, "init-tablet-type-lookup", initTabletTypeLookup, "(Experimental, init parameter) if enabled, uses tablet alias to look up the tablet type from the existing topology record on restart and use that instead of init-tablet-type. This allows tablets to maintain their changed roles (e.g., RDONLY/DRAINED) across restarts. If disabled or if no topology record exists, init-tablet-type will be used.")
utils.SetFlagStringVar(fs, &initDbNameOverride, "init-db-name-override", initDbNameOverride, "(init parameter) override the name of the db used by vttablet. Without this flag, the db name defaults to vt_<keyspacename>")
utils.SetFlagStringVar(fs, &skipBuildInfoTags, "vttablet-skip-buildinfo-tags", skipBuildInfoTags, "comma-separated list of buildinfo tags to skip from merging with --init-tags. each tag is either an exact match or a regular expression of the form '/regexp/'.")
utils.SetFlagVar(fs, &initTags, "init-tags", "(init parameter) comma separated list of key:value pairs used to tag the tablet")
Expand Down Expand Up @@ -373,6 +375,42 @@ func (tm *TabletManager) Start(tablet *topodatapb.Tablet, config *tabletenv.Tabl
tm.DBConfigs.DBName = topoproto.TabletDbName(tablet)
tm.tabletAlias = tablet.Alias
tm.tmc = tmclient.NewTabletManagerClient()

// Check if there's an existing tablet record in topology and use it if flag is enabled
if initTabletTypeLookup {
ctx, cancel := context.WithTimeout(tm.BatchCtx, initTimeout)
defer cancel()
existingTablet, err := tm.TopoServer.GetTablet(ctx, tablet.Alias)
if err != nil && !topo.IsErrType(err, topo.NoNode) {
// Error other than "node doesn't exist" - return it
return vterrors.Wrap(err, "--init-tablet-type-lookup is enabled but failed to get existing tablet record from topology, unable to determine tablet type during startup")
}

// If we found an existing tablet record, determine which type to use
switch {
case err != nil:
// No existing tablet found, use init-tablet-type
log.Infof("No existing tablet record found, using init-tablet-type: %v", tablet.Type)
case existingTablet.Type == topodatapb.TabletType_PRIMARY:
// Don't set to PRIMARY yet - let checkPrimaryShip() validate and decide
// checkPrimaryShip() has the logic to verify shard records and determine if this tablet should really be PRIMARY
log.Infof("Found existing tablet record with PRIMARY type, setting to REPLICA and allowing checkPrimaryShip() to validate")
tablet.Type = topodatapb.TabletType_REPLICA
case existingTablet.Type == topodatapb.TabletType_BACKUP || existingTablet.Type == topodatapb.TabletType_RESTORE:
// Skip transient operational types (BACKUP, RESTORE)
// These are temporary states that should not be preserved across restarts
log.Infof("Found existing tablet record with transient type %v, using init-tablet-type %v instead",
existingTablet.Type, tablet.Type)
default:
// Safe to restore the type for non-PRIMARY, non-transient types
log.Infof("Found existing tablet record with --init-tablet-type-lookup enabled, using tablet type %v from topology instead of init-tablet-type %v",
existingTablet.Type, tablet.Type)
tablet.Type = existingTablet.Type
}
} else {
log.Infof("Using init-tablet-type %v", tablet.Type)
}

tm.tmState = newTMState(tm, tablet)
tm.actionSema = semaphore.NewWeighted(1)
tm._waitForGrantsComplete = make(chan struct{})
Expand Down
277 changes: 277 additions & 0 deletions go/vt/vttablet/tabletmanager/tm_init_test.go
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❤️ These look pretty good. I'll take a closer look at them once we agree on the other notes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated tests based on comments 🙇‍♂️ 9727ef9

Original file line number Diff line number Diff line change
Expand Up @@ -961,3 +961,280 @@ func grantAllPrivilegesToUser(t *testing.T, connParams mysql.ConnParams, testUse
require.NoError(t, err)
conn.Close()
}

// TestInitTabletTypeLookup_PreservesTabletTypes checks that a tablet restarted
// with initTabletTypeLookup enabled keeps the type stored in its topology
// record (RDONLY and DRAINED are exercised) instead of reverting to REPLICA.
func TestInitTabletTypeLookup_PreservesTabletTypes(t *testing.T) {
	// Put the package-level settings touched by this test back when it ends.
	savedLookup := initTabletTypeLookup
	defer func() { initTabletTypeLookup = savedLookup }()
	savedInterval := rebuildKeyspaceRetryInterval
	defer func() { rebuildKeyspaceRetryInterval = savedInterval }()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	cases := []struct {
		name string
		want topodatapb.TabletType
	}{
		{name: "RDONLY", want: topodatapb.TabletType_RDONLY},
		{name: "DRAINED", want: topodatapb.TabletType_DRAINED},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			ctx := t.Context()
			cell := "cell1"
			ts := memorytopo.NewServer(ctx, cell)
			tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

			// Step 1: first startup with the lookup flag off; the topology
			// record is expected to say REPLICA.
			initTabletTypeLookup = false
			tm := newTestTM(t, ts, int(tabletAlias.Uid), "ks", "0", nil)
			tablet := tm.Tablet()
			ensureSrvKeyspace(t, ctx, ts, cell, "ks")
			record, err := ts.GetTablet(ctx, tabletAlias)
			require.NoError(t, err)
			assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
			tm.Stop()

			// Step 2: rewrite the type in topology, as an operator would.
			_, err = ts.UpdateTabletFields(ctx, tabletAlias, func(tb *topodatapb.Tablet) error {
				tb.Type = tc.want
				return nil
			})
			require.NoError(t, err)

			// Step 3: start again with the lookup flag on; the type written in
			// step 2 must still be in the topology record afterwards.
			initTabletTypeLookup = true
			require.NoError(t, tm.Start(tablet, nil))
			defer tm.Stop()
			record, err = ts.GetTablet(ctx, tabletAlias)
			require.NoError(t, err)
			assert.Equal(t, tc.want, record.Type)
		})
	}
}

// TestInitTabletTypeLookup_PreservesPrimaryWithTermTime checks that when both
// the tablet record and the shard record name this tablet as PRIMARY, a restart
// with initTabletTypeLookup enabled ends with the tablet recorded as PRIMARY
// and with the original primary term start time.
func TestInitTabletTypeLookup_PreservesPrimaryWithTermTime(t *testing.T) {
	// Put the package-level settings touched by this test back when it ends.
	savedLookup := initTabletTypeLookup
	defer func() { initTabletTypeLookup = savedLookup }()
	savedInterval := rebuildKeyspaceRetryInterval
	defer func() { rebuildKeyspaceRetryInterval = savedInterval }()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx := t.Context()
	cell := "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// Step 1: first startup with the lookup flag off; expect REPLICA in topology.
	initTabletTypeLookup = false
	tm := newTestTM(t, ts, int(tabletAlias.Uid), "ks", "0", nil)
	tablet := tm.Tablet()
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")
	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	tm.Stop()

	// Step 2: mark the tablet record as PRIMARY with a known term start time.
	termStart := time.Now()
	_, err = ts.UpdateTabletFields(ctx, tabletAlias, func(tb *topodatapb.Tablet) error {
		tb.Type = topodatapb.TabletType_PRIMARY
		tb.PrimaryTermStartTime = protoutil.TimeToProto(termStart)
		return nil
	})
	require.NoError(t, err)

	// Step 3: make the shard record agree, so that checkPrimaryShip promotes
	// this tablet on startup.
	_, err = ts.UpdateShardFields(ctx, "ks", "0", func(shard *topo.ShardInfo) error {
		shard.PrimaryAlias = tabletAlias
		shard.PrimaryTermStartTime = protoutil.TimeToProto(termStart)
		return nil
	})
	require.NoError(t, err)

	// Step 4: restart with the lookup flag on. Start first downgrades the type
	// to REPLICA and leaves the promotion to checkPrimaryShip.
	initTabletTypeLookup = true
	require.NoError(t, tm.Start(tablet, nil))
	defer tm.Stop()
	record, err = ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	// The final record must be PRIMARY and carry the term start time set above.
	assert.Equal(t, topodatapb.TabletType_PRIMARY, record.Type)
	assert.Equal(t, termStart.Unix(), record.GetPrimaryTermStartTime().Unix())
}

// TestInitTabletTypeLookup_FallbackWhenNoTopoRecord checks that a brand-new
// tablet started with initTabletTypeLookup enabled, and with no prior topology
// record, ends up with the init tablet type (REPLICA).
func TestInitTabletTypeLookup_FallbackWhenNoTopoRecord(t *testing.T) {
	// Put the package-level settings touched by this test back when it ends.
	savedLookup := initTabletTypeLookup
	defer func() { initTabletTypeLookup = savedLookup }()
	savedInterval := rebuildKeyspaceRetryInterval
	defer func() { rebuildKeyspaceRetryInterval = savedInterval }()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx := t.Context()
	cell := "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// The flag is on from the very first start; nothing exists in topology yet.
	initTabletTypeLookup = true
	tm := newTestTM(t, ts, int(tabletAlias.Uid), "ks", "0", nil)
	defer tm.Stop()
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")

	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	// With no record to look up, the init tablet type (REPLICA) is expected.
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
}

// TestInitTabletTypeLookup_DisabledUsesInitType checks that with
// initTabletTypeLookup disabled, a restart ignores the type stored in topology
// (RDONLY here) and the record ends up as REPLICA again.
func TestInitTabletTypeLookup_DisabledUsesInitType(t *testing.T) {
	// Put the package-level settings touched by this test back when it ends.
	savedLookup := initTabletTypeLookup
	defer func() { initTabletTypeLookup = savedLookup }()
	savedInterval := rebuildKeyspaceRetryInterval
	defer func() { rebuildKeyspaceRetryInterval = savedInterval }()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx := t.Context()
	cell := "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// Step 1: first startup with the lookup flag off; expect REPLICA in topology.
	initTabletTypeLookup = false
	tm := newTestTM(t, ts, int(tabletAlias.Uid), "ks", "0", nil)
	tablet := tm.Tablet()
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")
	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	tm.Stop()

	// Step 2: rewrite the type in topology to RDONLY, as an operator would.
	_, err = ts.UpdateTabletFields(ctx, tabletAlias, func(tb *topodatapb.Tablet) error {
		tb.Type = topodatapb.TabletType_RDONLY
		return nil
	})
	require.NoError(t, err)

	// Step 3: restart with the flag still off; the init tablet type applies.
	initTabletTypeLookup = false
	require.NoError(t, tm.Start(tablet, nil))
	defer tm.Stop()
	record, err = ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	// The RDONLY value written in step 2 must have been replaced by REPLICA.
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
}

// TestInitTabletTypeLookup_SkipsTransientTypes checks that a BACKUP or RESTORE
// type left behind in the topology record is not carried over on restart, even
// with initTabletTypeLookup enabled; the record ends up as REPLICA instead.
func TestInitTabletTypeLookup_SkipsTransientTypes(t *testing.T) {
	// Put the package-level settings touched by this test back when it ends.
	savedLookup := initTabletTypeLookup
	defer func() { initTabletTypeLookup = savedLookup }()
	savedInterval := rebuildKeyspaceRetryInterval
	defer func() { rebuildKeyspaceRetryInterval = savedInterval }()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	cases := []struct {
		name      string
		transient topodatapb.TabletType
	}{
		{name: "BACKUP", transient: topodatapb.TabletType_BACKUP},
		{name: "RESTORE", transient: topodatapb.TabletType_RESTORE},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			ctx := t.Context()
			cell := "cell1"
			ts := memorytopo.NewServer(ctx, cell)
			tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

			// Step 1: first startup with the lookup flag off; expect REPLICA
			// in topology.
			initTabletTypeLookup = false
			tm := newTestTM(t, ts, int(tabletAlias.Uid), "ks", "0", nil)
			tablet := tm.Tablet()
			ensureSrvKeyspace(t, ctx, ts, cell, "ks")
			record, err := ts.GetTablet(ctx, tabletAlias)
			require.NoError(t, err)
			assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
			tm.Stop()

			// Step 2: leave a transient type in topology, imitating a crash in
			// the middle of a backup or restore.
			_, err = ts.UpdateTabletFields(ctx, tabletAlias, func(tb *topodatapb.Tablet) error {
				tb.Type = tc.transient
				return nil
			})
			require.NoError(t, err)

			// Step 3: restart with the lookup flag on; the transient type must
			// be ignored in favour of the init tablet type.
			initTabletTypeLookup = true
			require.NoError(t, tm.Start(tablet, nil))
			defer tm.Stop()
			record, err = ts.GetTablet(ctx, tabletAlias)
			require.NoError(t, err)
			// REPLICA is expected, not the transient value written in step 2.
			assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
		})
	}
}

// TestInitTabletTypeLookup_InteractionWithCheckPrimaryShip checks that when the
// shard record names this tablet as primary while the tablet record still says
// REPLICA, a restart with initTabletTypeLookup enabled ends with the tablet
// recorded as PRIMARY.
func TestInitTabletTypeLookup_InteractionWithCheckPrimaryShip(t *testing.T) {
	// Put the package-level settings touched by this test back when it ends.
	savedLookup := initTabletTypeLookup
	defer func() { initTabletTypeLookup = savedLookup }()
	savedInterval := rebuildKeyspaceRetryInterval
	defer func() { rebuildKeyspaceRetryInterval = savedInterval }()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	ctx := t.Context()
	cell := "cell1"
	ts := memorytopo.NewServer(ctx, cell)
	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// Step 1: first startup with the lookup flag off; expect REPLICA in topology.
	initTabletTypeLookup = false
	tm := newTestTM(t, ts, int(tabletAlias.Uid), "ks", "0", nil)
	tablet := tm.Tablet()
	ensureSrvKeyspace(t, ctx, ts, cell, "ks")
	record, err := ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_REPLICA, record.Type)
	tm.Stop()

	// Step 2: point the shard's primary alias at this tablet; the tablet
	// record itself is left untouched.
	termStart := time.Now()
	_, err = ts.UpdateShardFields(ctx, "ks", "0", func(shard *topo.ShardInfo) error {
		shard.PrimaryAlias = tabletAlias
		shard.PrimaryTermStartTime = protoutil.TimeToProto(termStart)
		return nil
	})
	require.NoError(t, err)

	// Step 3: restart with the lookup flag on; the promotion driven by the
	// shard record must still take place.
	initTabletTypeLookup = true
	require.NoError(t, tm.Start(tablet, nil))
	defer tm.Stop()
	record, err = ts.GetTablet(ctx, tabletAlias)
	require.NoError(t, err)
	// PRIMARY is expected as a result of the checkPrimaryShip logic.
	assert.Equal(t, topodatapb.TabletType_PRIMARY, record.Type)
}
Loading