@@ -6,12 +6,18 @@ use canister_utils::{
66use gateway_state:: { CanisterEntry , CanisterPrincipal , ClientSender , GatewayState , PollerState } ;
77use ic_agent:: { Agent , AgentError } ;
88use std:: { sync:: Arc , time:: Duration } ;
9- use tokio:: sync:: mpsc:: Sender ;
9+ use tokio:: { sync:: mpsc:: Sender , time :: timeout } ;
1010use tracing:: { error, span, trace, warn, Instrument , Level , Span } ;
1111
12- enum PollingStatus {
/// Maximum time, in milliseconds, that `poll_canister` waits for a single
/// `ws_get_messages` call before treating the polling attempt as timed out.
12+ pub ( crate ) const POLLING_TIMEOUT_MS : u64 = 5_000 ;
13+
/// Alias of `Duration` used to build the value passed to `tokio::time::timeout`
/// (via `PollingTimeout::from_millis(POLLING_TIMEOUT_MS)`).
14+ type PollingTimeout = Duration ;
15+
/// Outcome of one `poll_canister` call, consumed by `poll_and_relay` to decide
/// whether to relay messages, poll again immediately, or wait for the next interval.
16+ #[ derive( Debug , PartialEq , Eq ) ]
17+ pub ( crate ) enum PollingStatus {
/// Nothing to relay; `poll_and_relay` takes no action and falls through to its sleep computation.
/// NOTE(review): presumably returned when the canister response holds zero messages — the
/// returning line is not visible in this view, confirm against the full file.
1318 NoMessagesPolled ,
/// The canister returned messages; carries the certified output that `poll_and_relay` relays.
1419 MessagesPolled ( CanisterOutputCertifiedMessages ) ,
/// The `ws_get_messages` call did not complete within `POLLING_TIMEOUT_MS`;
/// `poll_and_relay` returns right away so the next polling iteration can start.
20+ PollerTimedOut ,
1521}
1622
1723/// Poller which periodically queries a canister for new messages and relays them to the client
@@ -59,7 +65,7 @@ impl CanisterPoller {
5965 // initially set to None as the first iteration will not have a previous span
6066 let mut previous_polling_iteration_span: Option < Span > = None ;
6167 loop {
62- let polling_iteration_span = span ! ( Level :: TRACE , "Polling Iteration" , canister_id = %self . canister_id, polling_iteration = self . polling_iteration) ;
68+ let polling_iteration_span = span ! ( Level :: TRACE , "Polling Iteration" , canister_id = %self . canister_id, polling_iteration = self . polling_iteration, cargo_version = env! ( "CARGO_PKG_VERSION" ) ) ;
6369 if let Some ( previous_polling_iteration_span) = previous_polling_iteration_span {
6470 // create a follow from relationship between the current and previous polling iteration
6571 // this makes it possible to crawl polling iterations in reverse chronological order
@@ -97,33 +103,39 @@ impl CanisterPoller {
97103 pub async fn poll_and_relay ( & mut self ) -> Result < ( ) , String > {
98104 let start_polling_instant = tokio:: time:: Instant :: now ( ) ;
99105
100- if let PollingStatus :: MessagesPolled ( certified_canister_output) =
101- self . poll_canister ( ) . await ?
102- {
103- let relay_messages_span =
104- span ! ( parent: & Span :: current( ) , Level :: TRACE , "Relay Canister Messages" ) ;
105- let end_of_queue_reached = {
106- match certified_canister_output. is_end_of_queue {
107- Some ( is_end_of_queue_reached) => is_end_of_queue_reached,
108- // if 'is_end_of_queue' is None, the CDK version is < 0.3.1 and does not have such a field
109- // in this case, assume that the queue is fully drained and therefore will be polled again
110- // after waiting for 'polling_interval_ms'
111- None => true ,
106+ match self . poll_canister ( ) . await ? {
107+ PollingStatus :: MessagesPolled ( certified_canister_output) => {
108+ let relay_messages_span =
109+ span ! ( parent: & Span :: current( ) , Level :: TRACE , "Relay Canister Messages" ) ;
110+ let end_of_queue_reached = {
111+ match certified_canister_output. is_end_of_queue {
112+ Some ( is_end_of_queue_reached) => is_end_of_queue_reached,
113+ // if 'is_end_of_queue' is None, the CDK version is < 0.3.1 and does not have such a field
114+ // in this case, assume that the queue is fully drained and therefore will be polled again
115+ // after waiting for 'polling_interval_ms'
116+ None => true ,
117+ }
118+ } ;
119+ self . update_nonce ( & certified_canister_output) ?;
120+ // relaying of messages cannot be done in a separate task for each polling iteration
121+ // as they might interleave and break the correct ordering of messages
122+ // TODO: create a separate task dedicated to relaying messages which receives the messages from the poller via a queue
123+ // and relays them in FIFO order
124+ self . relay_messages ( certified_canister_output)
125+ . instrument ( relay_messages_span)
126+ . await ;
127+ if !end_of_queue_reached {
128+ // if the queue is not fully drained, return immediately so that the next polling iteration can be started
129+ warn ! ( "Canister queue is not fully drained. Polling immediately" ) ;
130+ return Ok ( ( ) ) ;
112131 }
113- } ;
114- self . update_nonce ( & certified_canister_output) ?;
115- // relaying of messages cannot be done in a separate task for each polling iteration
116- // as they might interleave and break the correct ordering of messages
117- // TODO: create a separate task dedicated to relaying messages which receives the messages from the poller via a queue
118- // and relays them in FIFO order
119- self . relay_messages ( certified_canister_output)
120- . instrument ( relay_messages_span)
121- . await ;
122- if !end_of_queue_reached {
123- // if the queue is not fully drained, return immediately so that the next polling iteration can be started
124- warn ! ( "Canister queue is not fully drained. Polling immediately" ) ;
132+ } ,
133+ PollingStatus :: PollerTimedOut => {
134+ // if the poller timed out, it already waited way too long... return immediately so that the next polling iteration can be started
135+ warn ! ( "Poller timed out. Polling immediately" ) ;
125136 return Ok ( ( ) ) ;
126- }
137+ } ,
138+ PollingStatus :: NoMessagesPolled => ( ) ,
127139 }
128140
129141 // compute the amount of time to sleep for before polling again
@@ -135,20 +147,26 @@ impl CanisterPoller {
135147 }
136148
137149 /// Polls the canister for messages
138- async fn poll_canister ( & mut self ) -> Result < PollingStatus , String > {
150+ pub ( crate ) async fn poll_canister ( & mut self ) -> Result < PollingStatus , String > {
139151 trace ! ( "Started polling iteration" ) ;
140152
141153 // get messages to be relayed to clients from canister (starting from 'message_nonce')
142- match ws_get_messages (
143- & self . agent ,
144- & self . canister_id ,
145- CanisterWsGetMessagesArguments {
146- nonce : self . next_message_nonce ,
147- } ,
154+ // the response timeout of the IC CDK is 2 minutes which implies that the poller would be stuck for that long waiting for a response
155+ // to prevent this, we set a timeout of 5 seconds; if the poller does not receive a response in time, it polls again immediately
156+ // in case of a timeout, the message nonce is not updated so that no messages are lost by polling immediately again
157+ match timeout (
158+ PollingTimeout :: from_millis ( POLLING_TIMEOUT_MS ) ,
159+ ws_get_messages (
160+ & self . agent ,
161+ & self . canister_id ,
162+ CanisterWsGetMessagesArguments {
163+ nonce : self . next_message_nonce ,
164+ } ,
165+ ) ,
148166 )
149167 . await
150168 {
151- Ok ( certified_canister_output) => {
169+ Ok ( Ok ( certified_canister_output) ) => {
152170 let number_of_polled_messages = certified_canister_output. messages . len ( ) ;
153171 if number_of_polled_messages == 0 {
154172 trace ! ( "No messages polled from canister" ) ;
@@ -161,7 +179,7 @@ impl CanisterPoller {
161179 Ok ( PollingStatus :: MessagesPolled ( certified_canister_output) )
162180 }
163181 } ,
164- Err ( IcError :: Agent ( e) ) => {
182+ Ok ( Err ( IcError :: Agent ( e) ) ) => {
165183 if is_recoverable_error ( & e) {
166184 // if the error is due to a replica which is either actively malicious or simply unavailable
167185 // or to a malfunctioning boundary node,
@@ -174,8 +192,12 @@ impl CanisterPoller {
174192 Err ( format ! ( "Unrecoverable agent error: {:?}" , e) )
175193 }
176194 } ,
177- Err ( IcError :: Candid ( e) ) => Err ( format ! ( "Unrecoverable candid error: {:?}" , e) ) ,
178- Err ( IcError :: Cdk ( e) ) => Err ( format ! ( "Unrecoverable CDK error: {:?}" , e) ) ,
195+ Ok ( Err ( IcError :: Candid ( e) ) ) => Err ( format ! ( "Unrecoverable candid error: {:?}" , e) ) ,
196+ Ok ( Err ( IcError :: Cdk ( e) ) ) => Err ( format ! ( "Unrecoverable CDK error: {:?}" , e) ) ,
197+ Err ( e) => {
198+ warn ! ( "Poller took too long to retrieve messages: {:?}" , e) ;
199+ Ok ( PollingStatus :: PollerTimedOut )
200+ } ,
179201 }
180202 }
181203
0 commit comments