@@ -18,6 +18,10 @@ import (
1818 "github.com/open-telemetry/opamp-go/protobufs"
1919)
2020
// defaultShutdownTimeout is the value NewWebSocket assigns to
// wsClient.connShutdownTimeout, i.e. how long runOneCycle waits for the
// receiver to stop during the WebSocket closing handshake.
21+ const (
22+ defaultShutdownTimeout = 5 * time .Second
23+ )
24+
2125// wsClient is an OpAMP Client implementation for WebSocket transport.
2226// See specification: https://github.com/open-telemetry/opamp-spec/blob/main/specification.md#websocket-transport
2327type wsClient struct {
@@ -40,6 +44,10 @@ type wsClient struct {
4044 // last non-nil internal error that was encountered in the conn retry loop,
4145 // currently used only for testing.
4246 lastInternalErr atomic.Pointer [error ]
47+
48+ // Network connection timeout used for the WebSocket closing handshake.
49+ // This field is currently only modified during testing.
50+ connShutdownTimeout time.Duration
4351}
4452
4553// NewWebSocket creates a new OpAMP Client that uses WebSocket transport.
@@ -50,8 +58,9 @@ func NewWebSocket(logger types.Logger) *wsClient {
5058
5159 sender := internal .NewSender (logger )
5260 w := & wsClient {
53- common : internal .NewClientCommon (logger , sender ),
54- sender : sender ,
61+ common : internal .NewClientCommon (logger , sender ),
62+ sender : sender ,
63+ connShutdownTimeout : defaultShutdownTimeout ,
5564 }
5665 return w
5766}
@@ -85,15 +94,6 @@ func (c *wsClient) Start(ctx context.Context, settings types.StartSettings) erro
8594}
8695
// Stop stops the client by delegating to c.common.Stop and returning its error.
//
// The lines marked "-" below are the code removed by this change: Stop used to
// close c.conn itself. In this revision the connection is closed by
// runOneCycle through a deferred c.conn.Close().
8796func (c * wsClient ) Stop (ctx context.Context ) error {
88- // Close connection if any.
89- c .connMutex .RLock ()
90- conn := c .conn
91- c .connMutex .RUnlock ()
92-
93- if conn != nil {
94- _ = conn .Close ()
95- }
96-
9797 return c .common .Stop (ctx )
9898}
9999
@@ -232,19 +232,25 @@ func (c *wsClient) ensureConnected(ctx context.Context) error {
232232// runOneCycle performs the following actions:
233233// 1. connect (try until succeeds).
234234// 2. send first status report.
235- // 3. receive and process messages until error happens.
235+ // 3. start the sender to wait for scheduled messages and send them to the server.
236+ // 4. start the receiver to receive and process messages until an error happens.
237+ // 5. wait until both the sender and receiver are stopped.
236238//
237- // If it encounters an error it closes the connection and returns.
238- // Will stop and return if Stop() is called (ctx is cancelled, isStopping is set).
239+ // runOneCycle will close the connection it created before it returns.
240+ //
241+ // When Stop() is called (ctx is cancelled, isStopping is set), wsClient will shut down gracefully:
242+ // 1. The sender will be cancelled by the ctx, send the close message to the server, and report any error via sender.StoppingErr().
243+ // 2. runOneCycle will check that error and, if there is none, wait for the close message from the server until connShutdownTimeout elapses.
239244func (c * wsClient ) runOneCycle (ctx context.Context ) {
240245 if err := c .ensureConnected (ctx ); err != nil {
241246 // Can't connect, so can't move forward. This currently happens when we
242247 // are being stopped.
243248 return
244249 }
250+ // Close the underlying connection.
251+ defer c .conn .Close ()
245252
246253 if c .common .IsStopping () {
247- _ = c .conn .Close ()
248254 return
249255 }
250256
@@ -256,15 +262,14 @@ func (c *wsClient) runOneCycle(ctx context.Context) {
256262 }
257263
258264 // Create a cancellable context for background processors.
259- procCtx , procCancel := context .WithCancel (ctx )
265+ senderCtx , stopSender := context .WithCancel (ctx )
266+ defer stopSender ()
260267
261268 // Connected successfully. Start the sender. This will also send the first
262269 // status report.
263- if err := c .sender .Start (procCtx , c .conn ); err != nil {
264- c .common .Logger .Errorf (procCtx , "Failed to send first status report: %v" , err )
270+ if err := c .sender .Start (senderCtx , c .conn ); err != nil {
271+ c .common .Logger .Errorf (senderCtx , "Failed to send first status report: %v" , err )
265272 // We could not send the report, the only thing we can do is start over.
266- _ = c .conn .Close ()
267- procCancel ()
268273 return
269274 }
270275
@@ -278,19 +283,41 @@ func (c *wsClient) runOneCycle(ctx context.Context) {
278283 c .common .PackagesStateProvider ,
279284 c .common .Capabilities ,
280285 )
281- r .ReceiverLoop (ctx )
282-
283- // Stop the background processors.
284- procCancel ()
285286
286- // If we exited receiverLoop it means there is a connection error, we cannot
287- // read messages anymore. We need to start over.
287+ // When the wsClient is closed, the context passed to runOneCycle will be canceled.
288+ // The receiver should keep running and processing messages
289+ // until it receives a Close message from the server, which means the server has no more messages.
290+ receiverCtx , stopReceiver := context .WithCancel (context .Background ())
291+ defer stopReceiver ()
292+ r .Start (receiverCtx )
293+
294+ select {
295+ case <- c .sender .IsStopped ():
296+ // sender will send close message to initiate the close handshake
297+ if err := c .sender .StoppingErr (); err != nil {
298+ c .common .Logger .Debugf (ctx , "Error stopping the sender: %v" , err )
299+
300+ stopReceiver ()
301+ <- r .IsStopped ()
302+ break
303+ }
288304
289- // Close the connection to unblock the WSSender as well.
290- _ = c .conn .Close ()
305+ c .common .Logger .Debugf (ctx , "Waiting for receiver to stop." )
306+ select {
307+ case <- r .IsStopped ():
308+ c .common .Logger .Debugf (ctx , "Receiver stopped." )
309+ case <- time .After (c .connShutdownTimeout ):
310+ c .common .Logger .Debugf (ctx , "Timeout waiting for receiver to stop." )
311+ stopReceiver ()
312+ <- r .IsStopped ()
313+ }
314+ case <- r .IsStopped ():
315+ // If we exited receiverLoop it means there is a connection error, we cannot
316+ // read messages anymore. We need to start over.
291317
292- // Wait for WSSender to stop.
293- c .sender .WaitToStop ()
318+ stopSender ()
319+ <- c .sender .IsStopped ()
320+ }
294321}
295322
296323func (c * wsClient ) runUntilStopped (ctx context.Context ) {
0 commit comments