
Commit d1a3f4e

Merge pull request #1766 from roboflow/fix/watchdog-modal-termination
Specify public stun servers
2 parents c617216 + c4e0bd3 commit d1a3f4e

5 files changed: +126 additions, −50 deletions


inference/core/env.py

Lines changed: 12 additions & 1 deletion
@@ -765,7 +765,18 @@
 WEBRTC_MODAL_MODELS_PRELOAD_API_KEY = os.getenv("WEBRTC_MODAL_MODELS_PRELOAD_API_KEY")
 WEBRTC_MODAL_PRELOAD_MODELS = os.getenv("WEBRTC_MODAL_PRELOAD_MODELS")
 WEBRTC_MODAL_PRELOAD_HF_IDS = os.getenv("WEBRTC_MODAL_PRELOAD_HF_IDS")
-WEBRTC_MODAL_MIN_RAM_MB = int(os.getenv("WEBRTC_MODAL_MIN_RAM_MB", "4096"))
+try:
+    WEBRTC_MODAL_MIN_CPU_CORES = int(os.getenv("WEBRTC_MODAL_MIN_CPU_CORES"))
+except (ValueError, TypeError):
+    WEBRTC_MODAL_MIN_CPU_CORES = None
+try:
+    WEBRTC_MODAL_MIN_RAM_MB = int(os.getenv("WEBRTC_MODAL_MIN_RAM_MB"))
+except (ValueError, TypeError):
+    WEBRTC_MODAL_MIN_RAM_MB = None
+WEBRTC_MODAL_PUBLIC_STUN_SERVERS = os.getenv(
+    "WEBRTC_MODAL_PUBLIC_STUN_SERVERS",
+    "stun:stun.l.google.com:19302,stun:stun1.l.google.com:19302,stun:stun2.l.google.com:19302,stun:stun3.l.google.com:19302,stun:stun4.l.google.com:19302",
+)
 
 HTTP_API_SHARED_WORKFLOWS_THREAD_POOL_ENABLED = str2bool(
     os.getenv("HTTP_API_SHARED_WORKFLOWS_THREAD_POOL_ENABLED", "True")

inference/core/interfaces/webrtc_worker/modal.py

Lines changed: 93 additions & 46 deletions
@@ -3,7 +3,7 @@
 import os
 import subprocess
 from pathlib import Path
-from typing import Dict, Optional
+from typing import Callable, Dict, Optional
 
 from inference.core import logger
 from inference.core.env import (
@@ -31,6 +31,7 @@
     WEBRTC_MODAL_GCP_SECRET_NAME,
     WEBRTC_MODAL_IMAGE_NAME,
     WEBRTC_MODAL_IMAGE_TAG,
+    WEBRTC_MODAL_MIN_CPU_CORES,
     WEBRTC_MODAL_MIN_RAM_MB,
     WEBRTC_MODAL_MODELS_PRELOAD_API_KEY,
     WEBRTC_MODAL_PRELOAD_HF_IDS,
@@ -39,6 +40,7 @@
     WEBRTC_MODAL_ROBOFLOW_INTERNAL_SERVICE_NAME,
     WEBRTC_MODAL_RTSP_PLACEHOLDER,
     WEBRTC_MODAL_RTSP_PLACEHOLDER_URL,
+    WEBRTC_MODAL_SHUTDOWN_RESERVE,
     WEBRTC_MODAL_TOKEN_ID,
     WEBRTC_MODAL_TOKEN_SECRET,
     WORKFLOWS_CUSTOM_PYTHON_EXECUTION_MODE,
@@ -126,6 +128,7 @@ def check_nvidia_smi_gpu() -> str:
     "buffer_containers": WEBRTC_MODAL_FUNCTION_BUFFER_CONTAINERS,
     "scaledown_window": WEBRTC_MODAL_FUNCTION_SCALEDOWN_WINDOW,
     "memory": WEBRTC_MODAL_MIN_RAM_MB,
+    "cpu": WEBRTC_MODAL_MIN_CPU_CORES,
     "timeout": WEBRTC_MODAL_FUNCTION_TIME_LIMIT,
     "enable_memory_snapshot": WEBRTC_MODAL_FUNCTION_ENABLE_MEMORY_SNAPSHOT,
     "max_inputs": WEBRTC_MODAL_FUNCTION_MAX_INPUTS,
@@ -167,17 +170,62 @@ def check_nvidia_smi_gpu() -> str:
         "WEBRTC_MODAL_FUNCTION_TIME_LIMIT": str(WEBRTC_MODAL_FUNCTION_TIME_LIMIT),
         "WEBRTC_MODAL_IMAGE_NAME": WEBRTC_MODAL_IMAGE_NAME,
         "WEBRTC_MODAL_IMAGE_TAG": WEBRTC_MODAL_IMAGE_TAG,
+        "WEBRTC_MODAL_MIN_CPU_CORES": str(
+            WEBRTC_MODAL_MIN_CPU_CORES if WEBRTC_MODAL_MIN_CPU_CORES else ""
+        ),
+        "WEBRTC_MODAL_MIN_RAM_MB": str(
+            WEBRTC_MODAL_MIN_RAM_MB if WEBRTC_MODAL_MIN_RAM_MB else ""
+        ),
         "WEBRTC_MODAL_MODELS_PRELOAD_API_KEY": (
             str(WEBRTC_MODAL_MODELS_PRELOAD_API_KEY)
             if WEBRTC_MODAL_MODELS_PRELOAD_API_KEY
            else ""
         ),
         "WEBRTC_MODAL_RTSP_PLACEHOLDER": WEBRTC_MODAL_RTSP_PLACEHOLDER,
         "WEBRTC_MODAL_RTSP_PLACEHOLDER_URL": WEBRTC_MODAL_RTSP_PLACEHOLDER_URL,
+        "WEBRTC_MODAL_SHUTDOWN_RESERVE": str(WEBRTC_MODAL_SHUTDOWN_RESERVE),
     },
     "volumes": {MODEL_CACHE_DIR: rfcache_volume},
 }
 
+async def run_rtc_peer_connection_with_watchdog(
+    webrtc_request: WebRTCWorkerRequest,
+    send_answer: Callable[[WebRTCWorkerResult], None],
+    model_manager: ModelManager,
+):
+    from inference.core.interfaces.webrtc_worker.webrtc import (
+        init_rtc_peer_connection_with_loop,
+    )
+
+    watchdog = Watchdog(
+        timeout_seconds=30,
+    )
+
+    rtc_peer_connection_task = asyncio.create_task(
+        init_rtc_peer_connection_with_loop(
+            webrtc_request=webrtc_request,
+            send_answer=send_answer,
+            model_manager=model_manager,
+            heartbeat_callback=watchdog.heartbeat,
+        )
+    )
+
+    def on_timeout():
+        logger.info("Watchdog timeout reached")
+        rtc_peer_connection_task.cancel()
+
+    watchdog.on_timeout = on_timeout
+    watchdog.start()
+
+    try:
+        await rtc_peer_connection_task
+    except asyncio.CancelledError as exc:
+        logger.info("WebRTC connection task was cancelled (%s)", exc)
+    except Exception as exc:
+        logger.error(exc)
+    finally:
+        watchdog.stop()
+
 class RTCPeerConnectionModal:
     _model_manager: Optional[ModelManager] = modal.parameter(
         default=None, init=False
@@ -190,10 +238,6 @@ def rtc_peer_connection_modal(
         webrtc_request: WebRTCWorkerRequest,
         q: modal.Queue,
     ):
-        from inference.core.interfaces.webrtc_worker.webrtc import (
-            init_rtc_peer_connection_with_loop,
-        )
-
         logger.info("*** Spawning %s:", self.__class__.__name__)
         logger.info("Running on %s", self._gpu)
         logger.info("Inference tag: %s", docker_tag)
@@ -223,7 +267,6 @@ def rtc_peer_connection_modal(
         logger.info("rtsp_url: %s", webrtc_request.rtsp_url)
         logger.info("processing_timeout: %s", webrtc_request.processing_timeout)
         logger.info("requested_plan: %s", webrtc_request.requested_plan)
-        logger.info("requested_gpu: %s", webrtc_request.requested_gpu)
         logger.info("requested_region: %s", webrtc_request.requested_region)
         logger.info(
             "ICE servers: %s",
@@ -233,42 +276,28 @@ def rtc_peer_connection_modal(
                 else []
             ),
         )
+        logger.info(
+            "WEBRTC_MODAL_MIN_CPU_CORES: %s",
+            WEBRTC_MODAL_MIN_CPU_CORES or "not set",
+        )
+        logger.info(
+            "WEBRTC_MODAL_MIN_RAM_MB: %s", WEBRTC_MODAL_MIN_RAM_MB or "not set"
+        )
         logger.info("MODAL_CLOUD_PROVIDER: %s", MODAL_CLOUD_PROVIDER)
         logger.info("MODAL_IMAGE_ID: %s", MODAL_IMAGE_ID)
         logger.info("MODAL_REGION: %s", MODAL_REGION)
         logger.info("MODAL_TASK_ID: %s", MODAL_TASK_ID)
         logger.info("MODAL_ENVIRONMENT: %s", MODAL_ENVIRONMENT)
         logger.info("MODAL_IDENTITY_TOKEN: %s", MODAL_IDENTITY_TOKEN)
 
-        try:
-            current_loop = asyncio.get_running_loop()
-        except RuntimeError:
-            current_loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(current_loop)
-
-        def on_timeout():
-            def shutdown():
-                for task in asyncio.all_tasks():
-                    task.cancel()
-                current_loop.stop()
-
-            current_loop.call_soon_threadsafe(shutdown)
-
-        watchdog = Watchdog(
-            timeout_seconds=30,
-            on_timeout=on_timeout,
-        )
-
         def send_answer(obj: WebRTCWorkerResult):
             logger.info("Sending webrtc answer")
             q.put(obj)
-        watchdog.start()
 
         if webrtc_request.processing_timeout == 0:
            error_msg = "Processing timeout is 0, skipping processing"
            logger.info(error_msg)
            send_answer(WebRTCWorkerResult(error_message=error_msg))
-            watchdog.stop()
            return
        if (
            not webrtc_request.webrtc_offer
@@ -278,20 +307,15 @@ def send_answer(obj: WebRTCWorkerResult):
            error_msg = "Webrtc offer is missing, skipping processing"
            logger.info(error_msg)
            send_answer(WebRTCWorkerResult(error_message=error_msg))
-            watchdog.stop()
            return
 
-        try:
-            asyncio.run(
-                init_rtc_peer_connection_with_loop(
-                    webrtc_request=webrtc_request,
-                    send_answer=send_answer,
-                    model_manager=self._model_manager,
-                    heartbeat_callback=watchdog.heartbeat,
-                )
+        asyncio.run(
+            run_rtc_peer_connection_with_watchdog(
+                webrtc_request=webrtc_request,
+                send_answer=send_answer,
+                model_manager=self._model_manager,
             )
-        except Exception as exc:
-            logger.error(exc)
+        )
 
         _exec_session_stopped = datetime.datetime.now()
         logger.info(
@@ -315,6 +339,8 @@ def send_answer(obj: WebRTCWorkerResult):
             video_source = "rtsp"
         elif not webrtc_request.webrtc_realtime_processing:
             video_source = "buffered browser stream"
+        else:
+            video_source = "realtime browser stream"
 
         usage_collector.record_usage(
             source=workflow_id,
@@ -330,7 +356,6 @@ def send_answer(obj: WebRTCWorkerResult):
             ).total_seconds(),
         )
         usage_collector.push_usage_payloads()
-        watchdog.stop()
         logger.info("Function completed")
 
     @modal.exit()
@@ -411,6 +436,9 @@ def start(self):
 def spawn_rtc_peer_connection_modal(
     webrtc_request: WebRTCWorkerRequest,
 ) -> WebRTCWorkerResult:
+    requested_gpu: Optional[str] = None
+    requested_ram_mb: Optional[int] = None
+    requested_cpu_cores: Optional[int] = None
     webrtc_plans: Optional[Dict[str, WebRTCPlan]] = (
         usage_collector._plan_details.get_webrtc_plans(
             api_key=webrtc_request.api_key
@@ -421,9 +449,11 @@ def spawn_rtc_peer_connection_modal(
             raise RoboflowAPIUnsuccessfulRequestError(
                 f"Unknown requested plan {webrtc_request.requested_plan}, available plans: {', '.join(webrtc_plans.keys())}"
             )
-        webrtc_request.requested_gpu = webrtc_plans[
-            webrtc_request.requested_plan
-        ].gpu
+        requested_gpu = webrtc_plans[webrtc_request.requested_plan].gpu
+        requested_ram_mb = webrtc_plans[webrtc_request.requested_plan].ram_mb
+        requested_cpu_cores = webrtc_plans[webrtc_request.requested_plan].cpu_cores
+
+    # TODO: requested_gpu is replaced with requested_plan
     if (
         webrtc_plans
         and not webrtc_request.requested_plan
@@ -435,6 +465,7 @@
             f"Requested gpu {webrtc_request.requested_gpu} not associated with any plan, available gpus: {', '.join(gpu_to_plan.keys())}"
         )
         webrtc_request.requested_plan = gpu_to_plan[webrtc_request.requested_gpu]
+        requested_gpu = webrtc_plans[webrtc_request.requested_plan].gpu
 
     # https://modal.com/docs/reference/modal.Client#from_credentials
     client = modal.Client.from_credentials(
@@ -483,7 +514,7 @@
     logger.info("Parametrized preload models: %s", WEBRTC_MODAL_PRELOAD_MODELS)
     preload_models = WEBRTC_MODAL_PRELOAD_MODELS
 
-    if webrtc_request.requested_gpu:
+    if requested_gpu:
         RTCPeerConnectionModal = RTCPeerConnectionModalGPU
     else:
         RTCPeerConnectionModal = RTCPeerConnectionModalCPU
@@ -505,16 +536,16 @@
     cls_with_options = deployed_cls.with_options(
         timeout=webrtc_request.processing_timeout,
     )
-    if webrtc_request.requested_gpu is not None:
+    if requested_gpu is not None:
         logger.info(
             "Spawning webrtc modal function with gpu %s",
-            webrtc_request.requested_gpu,
+            requested_gpu,
         )
         # Specify fallback GPU
         # TODO: with_options does not support gpu fallback
         # https://modal.com/docs/examples/gpu_fallbacks#set-fallback-gpus
         cls_with_options = cls_with_options.with_options(
-            gpu=webrtc_request.requested_gpu,
+            gpu=requested_gpu,
         )
     if webrtc_request.requested_region:
         logger.info(
@@ -524,6 +555,22 @@
         cls_with_options = cls_with_options.with_options(
             region=webrtc_request.requested_region,
         )
+    if requested_ram_mb is not None:
+        logger.info(
+            "Spawning webrtc modal function with ram %s",
+            requested_ram_mb,
+        )
+        cls_with_options = cls_with_options.with_options(
+            ram=requested_ram_mb,
+        )
+    if requested_cpu_cores is not None:
+        logger.info(
+            "Spawning webrtc modal function with cpu cores %s",
+            requested_cpu_cores,
+        )
+        cls_with_options = cls_with_options.with_options(
+            cpu=requested_cpu_cores,
+        )
     rtc_modal_obj: RTCPeerConnectionModal = cls_with_options(
         preload_hf_ids=preload_hf_ids,
         preload_models=preload_models,
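The new run_rtc_peer_connection_with_watchdog helper replaces the old loop-stopping shutdown with plain task cancellation: when heartbeats stop, the watchdog's on_timeout callback cancels the peer-connection task, and the try/except asyncio.CancelledError around the await turns that into a clean exit. A self-contained sketch of the same pattern, using stand-in classes rather than the repo's; note the sketch marshals the cancel onto the loop with call_soon_threadsafe, since the callback fires on the watchdog's own thread:

import asyncio
import threading
import time

class TinyWatchdog:
    # Illustrative stand-in for the repo's Watchdog, not the real class.
    def __init__(self, timeout_seconds: float):
        self.timeout_seconds = timeout_seconds
        self.on_timeout = None  # assigned after construction, as in the diff
        self._last_beat = time.monotonic()
        self._stop = threading.Event()
        self._thread = threading.Thread(target=self._run, daemon=True)

    def heartbeat(self):
        self._last_beat = time.monotonic()

    def start(self):
        if not self.on_timeout:
            raise ValueError("on_timeout must be set before start()")
        self._thread.start()

    def stop(self):
        self._stop.set()

    def _run(self):
        # Poll every 100 ms; fire once if heartbeats go stale.
        while not self._stop.wait(0.1):
            if time.monotonic() - self._last_beat > self.timeout_seconds:
                self.on_timeout()
                return

async def main():
    loop = asyncio.get_running_loop()
    watchdog = TinyWatchdog(timeout_seconds=0.5)
    # A worker that never heartbeats, so the watchdog fires.
    task = asyncio.create_task(asyncio.sleep(10))
    # Cancellation is scheduled onto the loop because the callback
    # runs on the watchdog thread, not the event loop thread.
    watchdog.on_timeout = lambda: loop.call_soon_threadsafe(task.cancel)
    watchdog.start()
    try:
        await task
    except asyncio.CancelledError:
        print("cancelled by watchdog")
    finally:
        watchdog.stop()

asyncio.run(main())

Compared with the removed on_timeout, which cancelled every task and stopped the whole event loop, cancelling the single task lets the code after asyncio.run() (usage recording, the "Function completed" log) still execute.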

inference/core/interfaces/webrtc_worker/watchdog.py

Lines changed: 9 additions & 3 deletions
@@ -1,23 +1,29 @@
 import datetime
 import threading
 import time
-from typing import Callable
+from typing import Callable, Optional
 
 from inference.core.logger import logger
 
 
 class Watchdog:
-    def __init__(self, timeout_seconds: int, on_timeout: Callable[[], None]):
+    def __init__(
+        self, timeout_seconds: int, on_timeout: Optional[Callable[[], None]] = None
+    ):
         self.timeout_seconds = timeout_seconds
         self.last_heartbeat = datetime.datetime.now()
-        self.on_timeout = on_timeout
+        self.on_timeout: Optional[Callable[[], None]] = on_timeout
         self._thread = threading.Thread(target=self._watchdog_thread)
         self._stopping = False
         self._last_log_ts = datetime.datetime.now()
         self._log_interval_seconds = 10
         self._heartbeats = 0
 
     def start(self):
+        if not self.on_timeout:
+            raise ValueError(
+                "on_timeout callback must be provided before starting the watchdog"
+            )
         self._thread.start()
 
     def stop(self):
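Since on_timeout is now optional at construction, the callback can be attached after the watchdog exists, which the new helper in modal.py relies on: its callback closes over a task that is only created after the watchdog. A hypothetical usage sketch of the guard added to start():

# Hypothetical usage of the Watchdog above.
from inference.core.interfaces.webrtc_worker.watchdog import Watchdog

watchdog = Watchdog(timeout_seconds=30)  # on_timeout deliberately deferred

try:
    watchdog.start()  # rejected: callback not attached yet
except ValueError as exc:
    print(exc)

watchdog.on_timeout = lambda: print("no heartbeat for 30s")
watchdog.start()      # accepted now that the callback is set
watchdog.heartbeat()  # refreshes last_heartbeat
watchdog.stop()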

inference/core/interfaces/webrtc_worker/webrtc.py

Lines changed: 10 additions & 0 deletions
@@ -25,6 +25,7 @@
 
 from inference.core import logger
 from inference.core.env import (
+    WEBRTC_MODAL_PUBLIC_STUN_SERVERS,
     WEBRTC_MODAL_RTSP_PLACEHOLDER,
     WEBRTC_MODAL_RTSP_PLACEHOLDER_URL,
     WEBRTC_MODAL_SHUTDOWN_RESERVE,
@@ -814,6 +815,15 @@ async def init_rtc_peer_connection_with_loop(
                     credential=ice_server.credential,
                 )
             )
+        # Always add public stun servers (if specified)
+        if WEBRTC_MODAL_PUBLIC_STUN_SERVERS:
+            for stun_server in WEBRTC_MODAL_PUBLIC_STUN_SERVERS.split(","):
+                try:
+                    ice_servers.append(RTCIceServer(urls=stun_server.strip()))
+                except Exception as e:
+                    logger.warning(
+                        "Failed to add public stun server '%s': %s", stun_server, e
+                    )
     else:
         ice_servers = None
     peer_connection = RTCPeerConnectionWithLoop(
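The env default is a single comma-separated string, so the loop above fans it out into one ICE server per entry. A minimal sketch of that expansion, assuming aiortc's RTCIceServer; the default value below is copied from env.py:

from aiortc import RTCIceServer

WEBRTC_MODAL_PUBLIC_STUN_SERVERS = (
    "stun:stun.l.google.com:19302,stun:stun1.l.google.com:19302,"
    "stun:stun2.l.google.com:19302,stun:stun3.l.google.com:19302,"
    "stun:stun4.l.google.com:19302"
)

# One RTCIceServer per comma-separated entry, whitespace trimmed.
ice_servers = [
    RTCIceServer(urls=server.strip())
    for server in WEBRTC_MODAL_PUBLIC_STUN_SERVERS.split(",")
    if server.strip()
]
print(len(ice_servers))  # 5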

inference/usage_tracking/plan_details.py

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,8 @@
 
 class WebRTCPlan(BaseModel):
     gpu: Optional[str] = None
+    cpu_cores: Optional[int] = None
+    ram_mb: Optional[int] = None
 
 
 class PlanDetails(SQLiteWrapper):
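With cpu_cores and ram_mb on the plan model, spawn_rtc_peer_connection_modal can pass per-plan limits straight into with_options(ram=..., cpu=...). A sketch of the extended model with a hypothetical payload; only the field names come from this diff, the values are invented:

from typing import Optional
from pydantic import BaseModel

class WebRTCPlan(BaseModel):
    gpu: Optional[str] = None
    cpu_cores: Optional[int] = None
    ram_mb: Optional[int] = None

# Hypothetical plan values; real ones come from the Roboflow plans API.
plan = WebRTCPlan(gpu="T4", cpu_cores=4, ram_mb=8192)
print(plan.gpu, plan.cpu_cores, plan.ram_mb)  # T4 4 8192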
