Skip to content

Commit 4397174

Browse files
grpctmclient: validate tablet record on dial
Signed-off-by: Tim Vaillancourt <[email protected]>
1 parent ee18041 commit 4397174

File tree

10 files changed

+287
-45
lines changed

10 files changed

+287
-45
lines changed

go/test/endtoend/reparent/plannedreparent/reparent_test.go

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"google.golang.org/protobuf/encoding/protojson"
3030

3131
"vitess.io/vitess/go/mysql/replication"
32+
"vitess.io/vitess/go/protoutil"
3233
"vitess.io/vitess/go/test/endtoend/cluster"
3334
"vitess.io/vitess/go/test/endtoend/reparent/utils"
3435
"vitess.io/vitess/go/vt/log"
@@ -112,16 +113,43 @@ func TestReparentReplicaOffline(t *testing.T) {
112113
clusterInstance := utils.SetupReparentCluster(t, policy.DurabilitySemiSync)
113114
defer utils.TeardownCluster(clusterInstance)
114115
tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets
116+
killTablet := tablets[3]
115117

116-
// Kill one tablet so we seem offline
117-
utils.StopTablet(t, tablets[3], true)
118+
// TabletShutdownTime is v24+.
119+
if clusterInstance.VtctlMajorVersion >= 24 {
120+
tabletInfo, err := clusterInstance.VtctldClientProcess.GetTablet(killTablet.Alias)
121+
require.NoError(t, err)
122+
require.Nil(t, tabletInfo.TabletShutdownTime)
123+
}
124+
125+
// Gracefully kill one tablet so we seem offline. Use a SIGKILL-fallback delay of 30s, like kube.
126+
startKillTime := time.Now()
127+
killTablet.VttabletProcess.TearDownWithTimeout(30 * time.Second)
128+
129+
// Confirm the tablet shutdown via the topo.
130+
require.EventuallyWithT(t, func(c *assert.CollectT) {
131+
tabletInfo, err := clusterInstance.VtctldClientProcess.GetTablet(killTablet.Alias)
132+
require.NoError(c, err)
133+
134+
// TabletShutdownTime is v24+. Test this is the vttablet and vtctld are v24+.
135+
if clusterInstance.VtctlMajorVersion >= 24 && clusterInstance.VtTabletMajorVersion >= 24 {
136+
require.NotNil(c, tabletInfo.TabletShutdownTime)
137+
shutdownTime := protoutil.TimeFromProto(tabletInfo.TabletShutdownTime)
138+
require.WithinRange(c, shutdownTime, startKillTime, time.Now())
139+
} else {
140+
// TODO: remove this test after v25.
141+
require.Empty(c, tabletInfo.Hostname)
142+
require.Empty(c, tabletInfo.MysqlHostname)
143+
require.Empty(c, tabletInfo.PortMap)
144+
}
145+
}, time.Second, time.Second*31)
118146

119147
// Perform a graceful reparent operation.
120148
out, err := utils.PrsWithTimeout(t, clusterInstance, tablets[1], false, "", "31s")
121149
require.Error(t, err)
122150

123151
// Assert that PRS failed
124-
assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc")
152+
assert.Contains(t, out, "rpc error: code = Unknown desc = tablet is shutdown")
125153
utils.CheckPrimaryTablet(t, clusterInstance, tablets[0])
126154
}
127155

@@ -151,7 +179,7 @@ func TestReparentAvoid(t *testing.T) {
151179
utils.StopTablet(t, tablets[0], true)
152180
out, err := utils.PrsAvoid(t, clusterInstance, tablets[1])
153181
require.Error(t, err)
154-
assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc = latest balancer error")
182+
assert.Contains(t, out, "rpc error: code = Unknown desc = tablet is shutdown")
155183
utils.ValidateTopology(t, clusterInstance, false)
156184
utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])
157185

go/vt/proto/topodata/topodata.pb.go

Lines changed: 55 additions & 40 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go/vt/proto/topodata/topodata_vtproto.pb.go

Lines changed: 53 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go/vt/vtorc/inst/instance_dao.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,11 @@ func ReadTopologyInstanceBufferable(tabletAlias string, latency *stopwatch.Named
191191
goto Cleanup
192192
}
193193

194+
// Don't poll the tablet if we know it is down.
195+
if tablet.TabletShutdownTime != nil {
196+
goto Cleanup
197+
}
198+
194199
fs, err = fullStatus(tablet)
195200
if err != nil {
196201
goto Cleanup

go/vt/vttablet/grpctmclient/client.go

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata"
4646
tabletmanagerservicepb "vitess.io/vitess/go/vt/proto/tabletmanagerservice"
4747
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
48+
vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
4849
)
4950

5051
type DialPoolGroup int
@@ -160,9 +161,39 @@ func NewClient() *Client {
160161
}
161162
}
162163

164+
// validateTablet confirms the tablet record contains the necessary fields
165+
// for talking gRPC.
166+
func validateTablet(tablet *topodatapb.Tablet) error {
167+
// v24+ adds a TabletShutdownTime field that is set to "now" on clean shutdown.
168+
if tablet.TabletShutdownTime != nil {
169+
return vterrors.New(vtrpcpb.Code_UNAVAILABLE, "tablet is shutdown")
170+
}
171+
172+
// <= v23 compatibility to similuate missing TabletShutdownTime field. Remove in v25.
173+
if tablet.Hostname == "" && tablet.MysqlHostname == "" && tablet.PortMap == nil && tablet.Type != topodatapb.TabletType_UNKNOWN {
174+
return vterrors.New(vtrpcpb.Code_UNAVAILABLE, "tablet is shutdown")
175+
}
176+
177+
if tablet.Hostname == "" {
178+
return vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet hostname")
179+
}
180+
if tablet.PortMap == nil {
181+
return vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet port map")
182+
}
183+
grpcPort := int32(tablet.PortMap["grpc"])
184+
if grpcPort <= 0 {
185+
return vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "invalid tablet grpc port")
186+
}
187+
return nil
188+
}
189+
163190
// dial returns a client to use
164191
func (client *grpcClient) dial(ctx context.Context, tablet *topodatapb.Tablet) (tabletmanagerservicepb.TabletManagerClient, io.Closer, error) {
165-
addr := netutil.JoinHostPort(tablet.Hostname, int32(tablet.PortMap["grpc"]))
192+
if err := validateTablet(tablet); err != nil {
193+
return nil, nil, err
194+
}
195+
196+
addr := netutil.JoinHostPort(tablet.Hostname, tablet.PortMap["grpc"])
166197
opt, err := grpcclient.SecureDialOption(cert, key, ca, crl, name)
167198
if err != nil {
168199
return nil, nil, err
@@ -187,6 +218,10 @@ func (client *grpcClient) createTmc(ctx context.Context, addr string, opt grpc.D
187218
}
188219

189220
func (client *grpcClient) dialPool(ctx context.Context, tablet *topodatapb.Tablet) (tabletmanagerservicepb.TabletManagerClient, error) {
221+
if err := validateTablet(tablet); err != nil {
222+
return nil, err
223+
}
224+
190225
addr := netutil.JoinHostPort(tablet.Hostname, int32(tablet.PortMap["grpc"]))
191226
opt, err := grpcclient.SecureDialOption(cert, key, ca, crl, name)
192227
if err != nil {
@@ -227,6 +262,10 @@ func (client *grpcClient) dialPool(ctx context.Context, tablet *topodatapb.Table
227262
}
228263

229264
func (client *grpcClient) dialDedicatedPool(ctx context.Context, dialPoolGroup DialPoolGroup, tablet *topodatapb.Tablet) (tabletmanagerservicepb.TabletManagerClient, invalidatorFunc, error) {
265+
if err := validateTablet(tablet); err != nil {
266+
return nil, nil, err
267+
}
268+
230269
addr := netutil.JoinHostPort(tablet.Hostname, int32(tablet.PortMap["grpc"]))
231270
opt, err := grpcclient.SecureDialOption(cert, key, ca, crl, name)
232271
if err != nil {

0 commit comments

Comments
 (0)