Skip to content

Commit 6301472

Browse files
sbaker617 and tanjinx authored
[slack-22.0] restore tablet type during the tabletmanager initialization (#741)
* backport tablet type lookup to v22

  Signed-off-by: Stephen Baker <[email protected]>

* more txt adjustments

---------

Signed-off-by: Stephen Baker <[email protected]>
Co-authored-by: Tanjin Xu <[email protected]>
1 parent 9d6acc8 commit 6301472

File tree

4 files changed

+383
-10
lines changed

4 files changed

+383
-10
lines changed

go/flags/endtoend/vtcombo.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,11 @@ Flags:
173173
--hot_row_protection_concurrent_transactions int Number of concurrent transactions let through to the txpool/MySQL for the same hot row. Should be > 1 to have enough 'ready' transactions in MySQL and benefit from a pipelining effect. (default 5)
174174
--hot_row_protection_max_global_queue_size int Global queue limit across all row (ranges). Useful to prevent that the queue can grow unbounded. (default 1000)
175175
--hot_row_protection_max_queue_size int Maximum number of BeginExecute RPCs which will be queued for the same row (range). (default 20)
176+
--init-tablet-type-lookup (Experimental, init parameter) if enabled, uses tablet alias to look up the tablet type from the existing topology record on restart and use that instead of init_tablet_type. This allows tablets to maintain their changed roles (e.g., RDONLY/DRAINED) across restarts. If disabled or if no topology record exists, init_tablet_type will be used.
176177
--init_db_name_override string (init parameter) override the name of the db used by vttablet. Without this flag, the db name defaults to vt_<keyspacename>
177178
--init_keyspace string (init parameter) keyspace to use for this tablet
178179
--init_shard string (init parameter) shard to use for this tablet
179-
--init_tablet_type string (init parameter) the tablet type to use for this tablet.
180+
--init_tablet_type string (init parameter) the tablet type to use for this tablet. Can be REPLICA, RDONLY, or SPARE. The default is REPLICA.
180181
--init_tags StringMap (init parameter) comma separated list of key:value pairs used to tag the tablet
181182
--init_timeout duration (init parameter) timeout to use for the init phase. (default 1m0s)
182183
--jaeger-agent-host string host and port to send spans to. if empty, no tracing will be done

go/flags/endtoend/vttablet.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,10 +201,11 @@ Flags:
201201
--hot_row_protection_concurrent_transactions int Number of concurrent transactions let through to the txpool/MySQL for the same hot row. Should be > 1 to have enough 'ready' transactions in MySQL and benefit from a pipelining effect. (default 5)
202202
--hot_row_protection_max_global_queue_size int Global queue limit across all row (ranges). Useful to prevent that the queue can grow unbounded. (default 1000)
203203
--hot_row_protection_max_queue_size int Maximum number of BeginExecute RPCs which will be queued for the same row (range). (default 20)
204+
--init-tablet-type-lookup (Experimental, init parameter) if enabled, uses tablet alias to look up the tablet type from the existing topology record on restart and use that instead of init_tablet_type. This allows tablets to maintain their changed roles (e.g., RDONLY/DRAINED) across restarts. If disabled or if no topology record exists, init_tablet_type will be used.
204205
--init_db_name_override string (init parameter) override the name of the db used by vttablet. Without this flag, the db name defaults to vt_<keyspacename>
205206
--init_keyspace string (init parameter) keyspace to use for this tablet
206207
--init_shard string (init parameter) shard to use for this tablet
207-
--init_tablet_type string (init parameter) the tablet type to use for this tablet.
208+
--init_tablet_type string (init parameter) the tablet type to use for this tablet. Can be REPLICA, RDONLY, or SPARE. The default is REPLICA.
208209
--init_tags StringMap (init parameter) comma separated list of key:value pairs used to tag the tablet
209210
--init_timeout duration (init parameter) timeout to use for the init phase. (default 1m0s)
210211
--jaeger-agent-host string host and port to send spans to. if empty, no tracing will be done

go/vt/vttablet/tabletmanager/tm_init.go

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,13 +88,14 @@ const (
8888

8989
var (
9090
// The following flags initialize the tablet record.
91-
tabletHostname string
92-
initKeyspace string
93-
initShard string
94-
initTabletType string
95-
initDbNameOverride string
96-
skipBuildInfoTags = "/.*/"
97-
initTags flagutil.StringMapValue
91+
tabletHostname string
92+
initKeyspace string
93+
initShard string
94+
initTabletType string
95+
initTabletTypeLookup bool
96+
initDbNameOverride string
97+
skipBuildInfoTags = "/.*/"
98+
initTags flagutil.StringMapValue
9899

99100
initTimeout = 1 * time.Minute
100101
mysqlShutdownTimeout = mysqlctl.DefaultShutdownTimeout
@@ -104,7 +105,8 @@ func registerInitFlags(fs *pflag.FlagSet) {
104105
fs.StringVar(&tabletHostname, "tablet_hostname", tabletHostname, "if not empty, this hostname will be assumed instead of trying to resolve it")
105106
fs.StringVar(&initKeyspace, "init_keyspace", initKeyspace, "(init parameter) keyspace to use for this tablet")
106107
fs.StringVar(&initShard, "init_shard", initShard, "(init parameter) shard to use for this tablet")
107-
fs.StringVar(&initTabletType, "init_tablet_type", initTabletType, "(init parameter) the tablet type to use for this tablet.")
108+
fs.StringVar(&initTabletType, "init_tablet_type", initTabletType, "(init parameter) the tablet type to use for this tablet. Can be REPLICA, RDONLY, or SPARE. The default is REPLICA.")
109+
fs.BoolVar(&initTabletTypeLookup, "init-tablet-type-lookup", initTabletTypeLookup, "(Experimental, init parameter) if enabled, uses tablet alias to look up the tablet type from the existing topology record on restart and use that instead of init_tablet_type. This allows tablets to maintain their changed roles (e.g., RDONLY/DRAINED) across restarts. If disabled or if no topology record exists, init_tablet_type will be used.")
108110
fs.StringVar(&initDbNameOverride, "init_db_name_override", initDbNameOverride, "(init parameter) override the name of the db used by vttablet. Without this flag, the db name defaults to vt_<keyspacename>")
109111
fs.StringVar(&skipBuildInfoTags, "vttablet_skip_buildinfo_tags", skipBuildInfoTags, "comma-separated list of buildinfo tags to skip from merging with --init_tags. each tag is either an exact match or a regular expression of the form '/regexp/'.")
110112
fs.Var(&initTags, "init_tags", "(init parameter) comma separated list of key:value pairs used to tag the tablet")
@@ -371,6 +373,45 @@ func (tm *TabletManager) Start(tablet *topodatapb.Tablet, config *tabletenv.Tabl
371373
log.Infof("TabletManager Start")
372374
tm.DBConfigs.DBName = topoproto.TabletDbName(tablet)
373375
tm.tabletAlias = tablet.Alias
376+
377+
// Check if there's an existing tablet record in topology and use it if flag is enabled
378+
if initTabletTypeLookup {
379+
ctx, cancel := context.WithTimeout(tm.BatchCtx, initTimeout)
380+
defer cancel()
381+
existingTablet, err := tm.TopoServer.GetTablet(ctx, tablet.Alias)
382+
if err != nil && !topo.IsErrType(err, topo.NoNode) {
383+
// Error other than "node doesn't exist" - return it
384+
return vterrors.Wrap(err, "failed to get existing tablet record from topology, unable to determine tablet type during startup")
385+
}
386+
387+
// If we found an existing tablet record, determine which type to use
388+
switch {
389+
case err != nil:
390+
// No existing tablet found, use init_tablet_type
391+
log.Infof("No existing tablet record found, using init_tablet_type: %v", tablet.Type)
392+
393+
case existingTablet.Type == topodatapb.TabletType_PRIMARY:
394+
// Don't set to PRIMARY yet - let checkPrimaryShip() validate and decide
395+
// checkPrimaryShip() has the logic to verify shard records and determine if this tablet should really be PRIMARY
396+
log.Infof("Found existing tablet record with PRIMARY type, setting to REPLICA and allowing checkPrimaryShip() to validate")
397+
tablet.Type = topodatapb.TabletType_REPLICA
398+
399+
case existingTablet.Type == topodatapb.TabletType_BACKUP || existingTablet.Type == topodatapb.TabletType_RESTORE:
400+
// Skip transient operational types (BACKUP, RESTORE)
401+
// These are temporary states that should not be preserved across restarts
402+
log.Infof("Found existing tablet record with transient type %v, using init_tablet_type %v instead",
403+
existingTablet.Type, tablet.Type)
404+
405+
default:
406+
// Safe to restore the type for non-PRIMARY, non-transient types
407+
log.Infof("Found existing tablet record, using tablet type %v from topology instead of init_tablet_type %v",
408+
existingTablet.Type, tablet.Type)
409+
tablet.Type = existingTablet.Type
410+
}
411+
} else {
412+
log.Infof("Using init_tablet_type %v", tablet.Type)
413+
}
414+
374415
tm.tmc = tmclient.NewTabletManagerClient()
375416
tm.tmState = newTMState(tm, tablet)
376417
tm.actionSema = semaphore.NewWeighted(1)

0 commit comments

Comments
 (0)