Skip to content

Commit c7fc585

Browse files
haoqixuevan-bradleysrikanthccvtigrannajaryan
authored
Gracefully shutdown of the websocket client (#213)
Resolves #163 Co-authored-by: Evan Bradley <[email protected]> Co-authored-by: Srikanth Chekuri <[email protected]> Co-authored-by: Tigran Najaryan <[email protected]>
1 parent c319cfa commit c7fc585

File tree

4 files changed

+326
-35
lines changed

4 files changed

+326
-35
lines changed

client/internal/wsreceiver.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ type wsReceiver struct {
1717
sender *WSSender
1818
callbacks types.Callbacks
1919
processor receivedProcessor
20+
21+
// Indicates that the receiver has fully stopped.
22+
stopped chan struct{}
2023
}
2124

2225
// NewWSReceiver creates a new Receiver that uses WebSocket to receive
@@ -36,25 +39,40 @@ func NewWSReceiver(
3639
sender: sender,
3740
callbacks: callbacks,
3841
processor: newReceivedProcessor(logger, callbacks, sender, clientSyncedState, packagesStateProvider, capabilities),
42+
stopped: make(chan struct{}),
3943
}
4044

4145
return w
4246
}
4347

44-
// ReceiverLoop runs the receiver loop. To stop the receiver cancel the context.
48+
// Start starts the receiver loop.
49+
func (r *wsReceiver) Start(ctx context.Context) {
50+
go r.ReceiverLoop(ctx)
51+
}
52+
53+
// IsStopped returns a channel that's closed when the receiver is stopped.
54+
func (r *wsReceiver) IsStopped() <-chan struct{} {
55+
return r.stopped
56+
}
57+
58+
// ReceiverLoop runs the receiver loop.
59+
// To stop the receiver cancel the context and close the websocket connection
4560
func (r *wsReceiver) ReceiverLoop(ctx context.Context) {
4661
type receivedMessage struct {
4762
message *protobufs.ServerToAgent
4863
err error
4964
}
5065

66+
defer func() { close(r.stopped) }()
67+
5168
for {
5269
select {
5370
case <-ctx.Done():
5471
return
5572
default:
5673
result := make(chan receivedMessage, 1)
5774

75+
// To stop this goroutine, close the websocket connection
5876
go func() {
5977
var message protobufs.ServerToAgent
6078
err := r.receiveMessage(&message)

client/internal/wssender.go

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package internal
22

33
import (
44
"context"
5+
"time"
56

67
"github.com/gorilla/websocket"
78
"google.golang.org/protobuf/proto"
@@ -11,13 +12,19 @@ import (
1112
"github.com/open-telemetry/opamp-go/protobufs"
1213
)
1314

15+
const (
16+
defaultSendCloseMessageTimeout = 5 * time.Second
17+
)
18+
1419
// WSSender implements the WebSocket client's sending portion of OpAMP protocol.
1520
type WSSender struct {
1621
SenderCommon
1722
conn *websocket.Conn
1823
logger types.Logger
24+
1925
// Indicates that the sender has fully stopped.
2026
stopped chan struct{}
27+
err error
2128
}
2229

2330
// NewSender creates a new Sender that uses WebSocket to send
@@ -37,15 +44,22 @@ func (s *WSSender) Start(ctx context.Context, conn *websocket.Conn) error {
3744

3845
// Run the sender in the background.
3946
s.stopped = make(chan struct{})
47+
s.err = nil
4048
go s.run(ctx)
4149

4250
return err
4351
}
4452

45-
// WaitToStop blocks until the sender is stopped. To stop the sender cancel the context
46-
// that was passed to Start().
47-
func (s *WSSender) WaitToStop() {
48-
<-s.stopped
53+
// IsStopped returns a channel that's closed when the sender is stopped.
54+
func (s *WSSender) IsStopped() <-chan struct{} {
55+
return s.stopped
56+
}
57+
58+
// StoppingErr returns an error if there was a problem with stopping the sender.
59+
// If stopping was successful will return nil.
60+
// StoppingErr() can be called only after IsStopped() is signalled.
61+
func (s *WSSender) StoppingErr() error {
62+
return s.err
4963
}
5064

5165
func (s *WSSender) run(ctx context.Context) {
@@ -56,13 +70,24 @@ out:
5670
s.sendNextMessage(ctx)
5771

5872
case <-ctx.Done():
73+
if err := s.sendCloseMessage(); err != nil && err != websocket.ErrCloseSent {
74+
s.err = err
75+
}
5976
break out
6077
}
6178
}
6279

6380
close(s.stopped)
6481
}
6582

83+
func (s *WSSender) sendCloseMessage() error {
84+
return s.conn.WriteControl(
85+
websocket.CloseMessage,
86+
websocket.FormatCloseMessage(websocket.CloseNormalClosure, "Normal closure"),
87+
time.Now().Add(defaultSendCloseMessageTimeout),
88+
)
89+
}
90+
6691
func (s *WSSender) sendNextMessage(ctx context.Context) error {
6792
msgToSend := s.nextMessage.PopPending()
6893
if msgToSend != nil && !proto.Equal(msgToSend, &protobufs.AgentToServer{}) {

client/wsclient.go

Lines changed: 57 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ import (
1818
"github.com/open-telemetry/opamp-go/protobufs"
1919
)
2020

21+
const (
22+
defaultShutdownTimeout = 5 * time.Second
23+
)
24+
2125
// wsClient is an OpAMP Client implementation for WebSocket transport.
2226
// See specification: https://github.com/open-telemetry/opamp-spec/blob/main/specification.md#websocket-transport
2327
type wsClient struct {
@@ -40,6 +44,10 @@ type wsClient struct {
4044
// last non-nil internal error that was encountered in the conn retry loop,
4145
// currently used only for testing.
4246
lastInternalErr atomic.Pointer[error]
47+
48+
// Network connection timeout used for the WebSocket closing handshake.
49+
// This field is currently only modified during testing.
50+
connShutdownTimeout time.Duration
4351
}
4452

4553
// NewWebSocket creates a new OpAMP Client that uses WebSocket transport.
@@ -50,8 +58,9 @@ func NewWebSocket(logger types.Logger) *wsClient {
5058

5159
sender := internal.NewSender(logger)
5260
w := &wsClient{
53-
common: internal.NewClientCommon(logger, sender),
54-
sender: sender,
61+
common: internal.NewClientCommon(logger, sender),
62+
sender: sender,
63+
connShutdownTimeout: defaultShutdownTimeout,
5564
}
5665
return w
5766
}
@@ -85,15 +94,6 @@ func (c *wsClient) Start(ctx context.Context, settings types.StartSettings) erro
8594
}
8695

8796
func (c *wsClient) Stop(ctx context.Context) error {
88-
// Close connection if any.
89-
c.connMutex.RLock()
90-
conn := c.conn
91-
c.connMutex.RUnlock()
92-
93-
if conn != nil {
94-
_ = conn.Close()
95-
}
96-
9797
return c.common.Stop(ctx)
9898
}
9999

@@ -232,19 +232,25 @@ func (c *wsClient) ensureConnected(ctx context.Context) error {
232232
// runOneCycle performs the following actions:
233233
// 1. connect (try until succeeds).
234234
// 2. send first status report.
235-
// 3. receive and process messages until error happens.
235+
// 3. start the sender to wait for scheduled messages and send them to the server.
236+
// 4. start the receiver to receive and process messages until an error happens.
237+
// 5. wait until both the sender and receiver are stopped.
236238
//
237-
// If it encounters an error it closes the connection and returns.
238-
// Will stop and return if Stop() is called (ctx is cancelled, isStopping is set).
239+
// runOneCycle will close the connection it created before it return.
240+
//
241+
// When Stop() is called (ctx is cancelled, isStopping is set), wsClient will shutdown gracefully:
242+
// 1. sender will be cancelled by the ctx, send the close message to server and return the error via sender.Err().
243+
// 2. runOneCycle will handle that error and wait for the close message from server until timeout.
239244
func (c *wsClient) runOneCycle(ctx context.Context) {
240245
if err := c.ensureConnected(ctx); err != nil {
241246
// Can't connect, so can't move forward. This currently happens when we
242247
// are being stopped.
243248
return
244249
}
250+
// Close the underlying connection.
251+
defer c.conn.Close()
245252

246253
if c.common.IsStopping() {
247-
_ = c.conn.Close()
248254
return
249255
}
250256

@@ -256,15 +262,14 @@ func (c *wsClient) runOneCycle(ctx context.Context) {
256262
}
257263

258264
// Create a cancellable context for background processors.
259-
procCtx, procCancel := context.WithCancel(ctx)
265+
senderCtx, stopSender := context.WithCancel(ctx)
266+
defer stopSender()
260267

261268
// Connected successfully. Start the sender. This will also send the first
262269
// status report.
263-
if err := c.sender.Start(procCtx, c.conn); err != nil {
264-
c.common.Logger.Errorf(procCtx, "Failed to send first status report: %v", err)
270+
if err := c.sender.Start(senderCtx, c.conn); err != nil {
271+
c.common.Logger.Errorf(senderCtx, "Failed to send first status report: %v", err)
265272
// We could not send the report, the only thing we can do is start over.
266-
_ = c.conn.Close()
267-
procCancel()
268273
return
269274
}
270275

@@ -278,19 +283,41 @@ func (c *wsClient) runOneCycle(ctx context.Context) {
278283
c.common.PackagesStateProvider,
279284
c.common.Capabilities,
280285
)
281-
r.ReceiverLoop(ctx)
282-
283-
// Stop the background processors.
284-
procCancel()
285286

286-
// If we exited receiverLoop it means there is a connection error, we cannot
287-
// read messages anymore. We need to start over.
287+
// When the wsclient is closed, the context passed to runOneCycle will be canceled.
288+
// The receiver should keep running and processing messages
289+
// until it received a Close message from the server which means the server has no more messages.
290+
receiverCtx, stopReceiver := context.WithCancel(context.Background())
291+
defer stopReceiver()
292+
r.Start(receiverCtx)
293+
294+
select {
295+
case <-c.sender.IsStopped():
296+
// sender will send close message to initiate the close handshake
297+
if err := c.sender.StoppingErr(); err != nil {
298+
c.common.Logger.Debugf(ctx, "Error stopping the sender: %v", err)
299+
300+
stopReceiver()
301+
<-r.IsStopped()
302+
break
303+
}
288304

289-
// Close the connection to unblock the WSSender as well.
290-
_ = c.conn.Close()
305+
c.common.Logger.Debugf(ctx, "Waiting for receiver to stop.")
306+
select {
307+
case <-r.IsStopped():
308+
c.common.Logger.Debugf(ctx, "Receiver stopped.")
309+
case <-time.After(c.connShutdownTimeout):
310+
c.common.Logger.Debugf(ctx, "Timeout waiting for receiver to stop.")
311+
stopReceiver()
312+
<-r.IsStopped()
313+
}
314+
case <-r.IsStopped():
315+
// If we exited receiverLoop it means there is a connection error, we cannot
316+
// read messages anymore. We need to start over.
291317

292-
// Wait for WSSender to stop.
293-
c.sender.WaitToStop()
318+
stopSender()
319+
<-c.sender.IsStopped()
320+
}
294321
}
295322

296323
func (c *wsClient) runUntilStopped(ctx context.Context) {

0 commit comments

Comments
 (0)