Skip to content

Commit e8fc7fe

Browse files
Merge pull request #1769 from roboflow/feat/parametrize-webrtc-worker-watchdog-timeout
Parametrize modal webrtc worker watchdog timeout
2 parents 286d346 + d70f468 commit e8fc7fe

File tree

4 files changed

+18
-5
lines changed

4 files changed

+18
-5
lines changed

inference/core/env.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -722,6 +722,8 @@
722722
# seconds
723723
WEBRTC_MODAL_RESPONSE_TIMEOUT = int(os.getenv("WEBRTC_MODAL_RESPONSE_TIMEOUT", "60"))
724724
# seconds
725+
WEBRTC_MODAL_WATCHDOG_TIMEMOUT = int(os.getenv("WEBRTC_MODAL_WATCHDOG_TIMEMOUT", "60"))
726+
# seconds
725727
WEBRTC_MODAL_FUNCTION_TIME_LIMIT = int(
726728
os.getenv("WEBRTC_MODAL_FUNCTION_TIME_LIMIT", "3600")
727729
)

inference/core/interfaces/webrtc_worker/modal.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
WEBRTC_MODAL_SHUTDOWN_RESERVE,
4444
WEBRTC_MODAL_TOKEN_ID,
4545
WEBRTC_MODAL_TOKEN_SECRET,
46+
WEBRTC_MODAL_WATCHDOG_TIMEMOUT,
4647
WORKFLOWS_CUSTOM_PYTHON_EXECUTION_MODE,
4748
)
4849
from inference.core.exceptions import RoboflowAPIUnsuccessfulRequestError
@@ -184,6 +185,7 @@ def check_nvidia_smi_gpu() -> str:
184185
"WEBRTC_MODAL_RTSP_PLACEHOLDER": WEBRTC_MODAL_RTSP_PLACEHOLDER,
185186
"WEBRTC_MODAL_RTSP_PLACEHOLDER_URL": WEBRTC_MODAL_RTSP_PLACEHOLDER_URL,
186187
"WEBRTC_MODAL_SHUTDOWN_RESERVE": str(WEBRTC_MODAL_SHUTDOWN_RESERVE),
188+
"WEBRTC_MODAL_WATCHDOG_TIMEMOUT": str(WEBRTC_MODAL_WATCHDOG_TIMEMOUT),
187189
},
188190
"volumes": {MODEL_CACHE_DIR: rfcache_volume},
189191
}
@@ -198,7 +200,7 @@ async def run_rtc_peer_connection_with_watchdog(
198200
)
199201

200202
watchdog = Watchdog(
201-
timeout_seconds=30,
203+
timeout_seconds=WEBRTC_MODAL_WATCHDOG_TIMEMOUT,
202204
)
203205

204206
rtc_peer_connection_task = asyncio.create_task(
@@ -211,7 +213,6 @@ async def run_rtc_peer_connection_with_watchdog(
211213
)
212214

213215
def on_timeout():
214-
logger.info("Watchdog timeout reached")
215216
rtc_peer_connection_task.cancel()
216217

217218
watchdog.on_timeout = on_timeout
@@ -266,6 +267,7 @@ def rtc_peer_connection_modal(
266267
logger.info("declared_fps: %s", webrtc_request.declared_fps)
267268
logger.info("rtsp_url: %s", webrtc_request.rtsp_url)
268269
logger.info("processing_timeout: %s", webrtc_request.processing_timeout)
270+
logger.info("watchdog_timeout: %s", WEBRTC_MODAL_WATCHDOG_TIMEMOUT)
269271
logger.info("requested_plan: %s", webrtc_request.requested_plan)
270272
logger.info("requested_region: %s", webrtc_request.requested_region)
271273
logger.info(

inference/core/interfaces/webrtc_worker/watchdog.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def __init__(
2020
self._heartbeats = 0
2121

2222
def start(self):
23+
logger.info("Starting watchdog with timeout %s", self.timeout_seconds)
2324
if not self.on_timeout:
2425
raise ValueError(
2526
"on_timeout callback must be provided before starting the watchdog"
@@ -32,12 +33,13 @@ def stop(self):
3233
self._thread.join()
3334

3435
def _watchdog_thread(self):
36+
logger.info("Watchdog thread started")
3537
while not self._stopping:
3638
if not self.is_alive():
3739
logger.error("Watchdog timeout reached")
3840
self.on_timeout()
3941
break
40-
time.sleep(0.1)
42+
time.sleep(1)
4143
logger.info("Watchdog stopped")
4244

4345
def heartbeat(self):

inference/core/interfaces/webrtc_worker/webrtc.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -634,6 +634,7 @@ async def recv(self):
634634

635635
async def _wait_ice_complete(peer_connection: RTCPeerConnectionWithLoop, timeout=2.0):
636636
if peer_connection.iceGatheringState == "complete":
637+
logger.info("ICE gathering state already complete")
637638
return
638639
fut = asyncio.get_running_loop().create_future()
639640

@@ -662,6 +663,7 @@ async def init_rtc_peer_connection_with_loop(
662663
shutdown_reserve: int = WEBRTC_MODAL_SHUTDOWN_RESERVE,
663664
heartbeat_callback: Optional[Callable[[], None]] = None,
664665
) -> RTCPeerConnectionWithLoop:
666+
logger.info("Initializing RTC peer connection with loop")
665667
# ice._mdns is instantiated on the module level, it has a lock that is bound to the event loop
666668
# avoid RuntimeError: asyncio.locks.Lock is bound to a different event loop
667669
if hasattr(ice, "_mdns"):
@@ -956,10 +958,13 @@ def on_message(message):
956958
answer = await peer_connection.createAnswer()
957959
await peer_connection.setLocalDescription(answer)
958960

959-
logger.debug(f"WebRTC connection status: {peer_connection.connectionState}")
960-
961961
await _wait_ice_complete(peer_connection, timeout=2.0)
962962

963+
logger.info(
964+
"Initialized RTC peer connection with loop (status: %s), sending answer",
965+
peer_connection.connectionState,
966+
)
967+
963968
send_answer(
964969
WebRTCWorkerResult(
965970
answer={
@@ -969,7 +974,9 @@ def on_message(message):
969974
)
970975
)
971976

977+
logger.info("Answer sent, waiting for termination event")
972978
await terminate_event.wait()
979+
logger.info("Termination event received, closing WebRTC connection")
973980
if player:
974981
logger.info("Stopping player")
975982
player.video.stop()

0 commit comments

Comments
 (0)