Commit 0fc6cf1

Merge branch 'main' into feature/inference-exp-jp-5

2 parents 3a5e7d5 + d9dfeda

File tree

11 files changed: +2358 −80 lines

inference/core/env.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -765,6 +765,7 @@
 WEBRTC_MODAL_MODELS_PRELOAD_API_KEY = os.getenv("WEBRTC_MODAL_MODELS_PRELOAD_API_KEY")
 WEBRTC_MODAL_PRELOAD_MODELS = os.getenv("WEBRTC_MODAL_PRELOAD_MODELS")
 WEBRTC_MODAL_PRELOAD_HF_IDS = os.getenv("WEBRTC_MODAL_PRELOAD_HF_IDS")
+WEBRTC_MODAL_MIN_RAM_MB = int(os.getenv("WEBRTC_MODAL_MIN_RAM_MB", "4096"))
 
 HTTP_API_SHARED_WORKFLOWS_THREAD_POOL_ENABLED = str2bool(
     os.getenv("HTTP_API_SHARED_WORKFLOWS_THREAD_POOL_ENABLED", "True")
```
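The new setting is parsed with `int()` at import time of `inference.core.env`, so any override must be in the environment before inference modules are imported. A minimal sketch, assuming the default of 4096 MB and an illustrative override value:

```python
# Sketch: overriding WEBRTC_MODAL_MIN_RAM_MB. It is read once at import time,
# so set the environment variable before importing inference.
import os

os.environ["WEBRTC_MODAL_MIN_RAM_MB"] = "8192"  # example value, in MB

from inference.core.env import WEBRTC_MODAL_MIN_RAM_MB

print(WEBRTC_MODAL_MIN_RAM_MB)  # -> 8192
```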

inference/core/interfaces/stream/inference_pipeline.py

Lines changed: 2 additions & 3 deletions

```diff
@@ -596,10 +596,9 @@ def init_with_workflow(
         named_workflow_specified = (workspace_name is not None) and (
             workflow_id is not None
         )
-        if not (named_workflow_specified != (workflow_specification is not None)):
+        if not named_workflow_specified and not workflow_specification:
             raise ValueError(
-                "Parameters (`workspace_name`, `workflow_id`) can be used mutually exclusive with "
-                "`workflow_specification`, but at least one must be set."
+                "Either (`workspace_name`, `workflow_id`) or `workflow_specification` must be provided."
             )
         try:
             from inference.core.interfaces.stream.model_handlers.workflows import (
```
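Note that this hunk changes semantics, not just wording: the old XOR-style guard raised when a named workflow and a specification were supplied together, while the new guard raises only when neither is given. A minimal sketch of the difference:

```python
# Sketch: truth table for the old XOR-style guard vs. the new one.
def old_guard_raises(named: bool, spec: bool) -> bool:
    # raised unless exactly one of the two was provided
    return not (named != spec)

def new_guard_raises(named: bool, spec: bool) -> bool:
    # raises only when neither is provided
    return not named and not spec

for named in (False, True):
    for spec in (False, True):
        print(named, spec, old_guard_raises(named, spec), new_guard_raises(named, spec))
# Only the (True, True) row differs: providing both no longer raises.
```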

inference/core/interfaces/webrtc_worker/modal.py

Lines changed: 48 additions & 11 deletions

```diff
@@ -31,6 +31,7 @@
     WEBRTC_MODAL_GCP_SECRET_NAME,
     WEBRTC_MODAL_IMAGE_NAME,
     WEBRTC_MODAL_IMAGE_TAG,
+    WEBRTC_MODAL_MIN_RAM_MB,
     WEBRTC_MODAL_MODELS_PRELOAD_API_KEY,
     WEBRTC_MODAL_PRELOAD_HF_IDS,
     WEBRTC_MODAL_PRELOAD_MODELS,
@@ -48,12 +49,11 @@
     WebRTCWorkerResult,
 )
 from inference.core.interfaces.webrtc_worker.utils import (
+    warmup_cuda,
     workflow_contains_instant_model,
     workflow_contains_preloaded_model,
 )
-from inference.core.interfaces.webrtc_worker.webrtc import (
-    init_rtc_peer_connection_with_loop,
-)
+from inference.core.interfaces.webrtc_worker.watchdog import Watchdog
 from inference.core.managers.base import ModelManager
 from inference.core.registries.roboflow import RoboflowModelRegistry
 from inference.core.roboflow_api import (
@@ -62,7 +62,7 @@
 )
 from inference.core.version import __version__
 from inference.models.aliases import resolve_roboflow_model_alias
-from inference.models.owlv2.owlv2 import preload_owlv2_model
+from inference.models.owlv2.owlv2 import PRELOADED_HF_MODELS, preload_owlv2_model
 from inference.models.utils import ROBOFLOW_MODEL_TYPES
 from inference.usage_tracking.collector import usage_collector
 from inference.usage_tracking.plan_details import WebRTCPlan
@@ -125,6 +125,7 @@ def check_nvidia_smi_gpu() -> str:
     "min_containers": WEBRTC_MODAL_FUNCTION_MIN_CONTAINERS,
     "buffer_containers": WEBRTC_MODAL_FUNCTION_BUFFER_CONTAINERS,
     "scaledown_window": WEBRTC_MODAL_FUNCTION_SCALEDOWN_WINDOW,
+    "memory": WEBRTC_MODAL_MIN_RAM_MB,
     "timeout": WEBRTC_MODAL_FUNCTION_TIME_LIMIT,
     "enable_memory_snapshot": WEBRTC_MODAL_FUNCTION_ENABLE_MEMORY_SNAPSHOT,
     "max_inputs": WEBRTC_MODAL_FUNCTION_MAX_INPUTS,
@@ -152,6 +153,7 @@ def check_nvidia_smi_gpu() -> str:
     "ROBOFLOW_INTERNAL_SERVICE_SECRET": ROBOFLOW_INTERNAL_SERVICE_SECRET,
     "WORKFLOWS_CUSTOM_PYTHON_EXECUTION_MODE": WORKFLOWS_CUSTOM_PYTHON_EXECUTION_MODE,
     "TELEMETRY_USE_PERSISTENT_QUEUE": "False",
+    "TORCHINDUCTOR_COMPILE_THREADS": "1",
     "WEBRTC_MODAL_FUNCTION_BUFFER_CONTAINERS": str(
         WEBRTC_MODAL_FUNCTION_BUFFER_CONTAINERS
     ),
@@ -188,6 +190,10 @@ def rtc_peer_connection_modal(
         webrtc_request: WebRTCWorkerRequest,
         q: modal.Queue,
     ):
+        from inference.core.interfaces.webrtc_worker.webrtc import (
+            init_rtc_peer_connection_with_loop,
+        )
+
         logger.info("*** Spawning %s:", self.__class__.__name__)
         logger.info("Running on %s", self._gpu)
         logger.info("Inference tag: %s", docker_tag)
@@ -199,6 +205,9 @@ def rtc_peer_connection_modal(
                 else ""
             ),
         )
+        logger.info(
+            "Preloaded hf models: %s", ", ".join(PRELOADED_HF_MODELS.keys())
+        )
         _exec_session_started = datetime.datetime.now()
         webrtc_request.processing_session_started = _exec_session_started
         logger.info(
@@ -231,14 +240,35 @@ def rtc_peer_connection_modal(
         logger.info("MODAL_ENVIRONMENT: %s", MODAL_ENVIRONMENT)
         logger.info("MODAL_IDENTITY_TOKEN: %s", MODAL_IDENTITY_TOKEN)
 
+        try:
+            current_loop = asyncio.get_running_loop()
+        except RuntimeError:
+            current_loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(current_loop)
+
+        def on_timeout():
+            def shutdown():
+                for task in asyncio.all_tasks():
+                    task.cancel()
+                current_loop.stop()
+
+            current_loop.call_soon_threadsafe(shutdown)
+
+        watchdog = Watchdog(
+            timeout_seconds=30,
+            on_timeout=on_timeout,
+        )
+
         def send_answer(obj: WebRTCWorkerResult):
             logger.info("Sending webrtc answer")
             q.put(obj)
+            watchdog.start()
 
         if webrtc_request.processing_timeout == 0:
            error_msg = "Processing timeout is 0, skipping processing"
            logger.info(error_msg)
            send_answer(WebRTCWorkerResult(error_message=error_msg))
+           watchdog.stop()
            return
         if (
             not webrtc_request.webrtc_offer
@@ -248,15 +278,21 @@ def send_answer(obj: WebRTCWorkerResult):
             error_msg = "Webrtc offer is missing, skipping processing"
             logger.info(error_msg)
             send_answer(WebRTCWorkerResult(error_message=error_msg))
+            watchdog.stop()
             return
 
-        asyncio.run(
-            init_rtc_peer_connection_with_loop(
-                webrtc_request=webrtc_request,
-                send_answer=send_answer,
-                model_manager=self._model_manager,
+        try:
+            asyncio.run(
+                init_rtc_peer_connection_with_loop(
+                    webrtc_request=webrtc_request,
+                    send_answer=send_answer,
+                    model_manager=self._model_manager,
+                    heartbeat_callback=watchdog.heartbeat,
+                )
             )
-        )
+        except Exception as exc:
+            logger.error(exc)
 
         _exec_session_stopped = datetime.datetime.now()
         logger.info(
             "WebRTC session stopped at %s",
@@ -294,6 +330,7 @@ def send_answer(obj: WebRTCWorkerResult):
             ).total_seconds(),
         )
         usage_collector.push_usage_payloads()
+        watchdog.stop()
         logger.info("Function completed")
 
     @modal.exit()
@@ -335,6 +372,7 @@ class RTCPeerConnectionModalGPU(RTCPeerConnectionModal):
     # https://modal.com/docs/guide/memory-snapshot#gpu-memory-snapshot
     @modal.enter(snap=True)
     def start(self):
+        warmup_cuda(max_retries=10, retry_delay=0.5)
         self._gpu = check_nvidia_smi_gpu()
         logger.info("Starting GPU container on %s", self._gpu)
         logger.info("Preload hf ids: %s", self.preload_hf_ids)
@@ -423,7 +461,6 @@ def spawn_rtc_peer_connection_modal(
             workflow_id=webrtc_request.workflow_configuration.workflow_id,
         )
     )
-
     tags = {"tag": docker_tag}
     if workspace_id:
         tags["workspace_id"] = workspace_id
```

inference/core/interfaces/webrtc_worker/utils.py

Lines changed: 28 additions & 0 deletions

```diff
@@ -1,5 +1,7 @@
+import ctypes
 import datetime
 import logging
+import time
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import cv2 as cv
@@ -181,3 +183,29 @@ def workflow_contains_preloaded_model(
         if model_id in preload_models or resolved_model_id in preload_models:
             return True
     return False
+
+
+def warmup_cuda(
+    max_retries: int = 10,
+    retry_delay: float = 0.5,
+):
+    cu = ctypes.CDLL("libcuda.so.1")
+
+    for attempt in range(max_retries):
+        rc = cu.cuInit(0)
+
+        if rc == 0:
+            break
+        else:
+            if attempt < max_retries - 1:
+                logger.warning(
+                    "cuInit failed on attempt %s/%s with code %s, retrying...",
+                    attempt + 1,
+                    max_retries,
+                    rc,
+                )
+                time.sleep(retry_delay)
+            else:
+                raise RuntimeError(
+                    f"CUDA initialization failed after {max_retries} attempts"
+                )
+
+    logger.info("CUDA initialization succeeded")
```
inference/core/interfaces/webrtc_worker/watchdog.py (new file)

Lines changed: 50 additions & 0 deletions

```diff
@@ -0,0 +1,50 @@
+import datetime
+import threading
+import time
+from typing import Callable
+
+from inference.core.logger import logger
+
+
+class Watchdog:
+    def __init__(self, timeout_seconds: int, on_timeout: Callable[[], None]):
+        self.timeout_seconds = timeout_seconds
+        self.last_heartbeat = datetime.datetime.now()
+        self.on_timeout = on_timeout
+        self._thread = threading.Thread(target=self._watchdog_thread)
+        self._stopping = False
+        self._last_log_ts = datetime.datetime.now()
+        self._log_interval_seconds = 10
+        self._heartbeats = 0
+
+    def start(self):
+        self._thread.start()
+
+    def stop(self):
+        self._stopping = True
+        if self._thread.is_alive():
+            self._thread.join()
+
+    def _watchdog_thread(self):
+        while not self._stopping:
+            if not self.is_alive():
+                logger.error("Watchdog timeout reached")
+                self.on_timeout()
+                break
+            time.sleep(0.1)
+        logger.info("Watchdog stopped")
+
+    def heartbeat(self):
+        self.last_heartbeat = datetime.datetime.now()
+        self._heartbeats += 1
+        if (
+            datetime.datetime.now() - self._last_log_ts
+        ).total_seconds() > self._log_interval_seconds:
+            logger.info("Watchdog heartbeat (%s since last)", self._heartbeats)
+            self._last_log_ts = datetime.datetime.now()
+            self._heartbeats = 0
+
+    def is_alive(self) -> bool:
+        return (
+            datetime.datetime.now() - self.last_heartbeat
+        ).total_seconds() < self.timeout_seconds
```
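As wired up in modal.py, the watchdog only starts once the WebRTC answer has been sent, and `heartbeat()` is invoked from the frame-processing paths. A minimal standalone usage sketch of the class above (timeout value is illustrative):

```python
# Sketch: standalone Watchdog usage. Heartbeats keep it quiet; once they stop
# for longer than timeout_seconds, on_timeout fires once and the thread exits.
import time

from inference.core.interfaces.webrtc_worker.watchdog import Watchdog

def on_timeout():
    print("no heartbeat for 2s -> trigger shutdown")

watchdog = Watchdog(timeout_seconds=2, on_timeout=on_timeout)
watchdog.start()

for _ in range(10):
    watchdog.heartbeat()  # what the frame loop calls on every iteration
    time.sleep(0.1)

time.sleep(3)    # heartbeats stop; the watchdog fires and breaks out of its loop
watchdog.stop()  # joins the (already finished) watchdog thread
```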

inference/core/interfaces/webrtc_worker/webrtc.py

Lines changed: 26 additions & 2 deletions

```diff
@@ -7,6 +7,7 @@
 
 import cv2
 import numpy as np
+from aioice import ice
 from aiortc import (
     RTCConfiguration,
     RTCDataChannel,
@@ -202,6 +203,7 @@ def __init__(
         termination_date: Optional[datetime.datetime] = None,
         terminate_event: Optional[asyncio.Event] = None,
         use_data_channel_frames: bool = False,
+        heartbeat_callback: Optional[Callable[[], None]] = None,
     ):
         self._loop = asyncio_loop
         self._termination_date = termination_date
@@ -212,6 +214,7 @@ def __init__(
         self._received_frames = 0
         self._declared_fps = declared_fps
         self._stop_processing = False
+        self.heartbeat_callback = heartbeat_callback
         self.use_data_channel_frames = use_data_channel_frames
         self._data_frame_queue: "asyncio.Queue[Optional[VideoFrame]]" = asyncio.Queue()
         self._chunk_reassembler = (
@@ -267,8 +270,8 @@ def _check_termination(self):
         if (
             self._termination_date
             and self._termination_date < datetime.datetime.now()
-            and self._terminate_event
-            and not self._terminate_event.is_set()
+            or self._terminate_event
+            and self._terminate_event.is_set()
         ):
             logger.info("Timeout reached, terminating inference pipeline")
             self._terminate_event.set()
@@ -401,6 +404,8 @@ async def process_frames_data_only(self):
         while not self._stop_processing:
             if self._check_termination():
                 break
+            if self.heartbeat_callback:
+                self.heartbeat_callback()
 
             # Get frame from appropriate source
             if self.use_data_channel_frames:
@@ -547,6 +552,7 @@ def __init__(
         termination_date: Optional[datetime.datetime] = None,
         terminate_event: Optional[asyncio.Event] = None,
         use_data_channel_frames: bool = False,
+        heartbeat_callback: Optional[Callable[[], None]] = None,
         *args,
         **kwargs,
     ):
@@ -564,6 +570,7 @@ def __init__(
             terminate_event=terminate_event,
             use_data_channel_frames=use_data_channel_frames,
             model_manager=model_manager,
+            heartbeat_callback=heartbeat_callback,
         )
 
     async def _auto_detect_stream_output(
@@ -589,6 +596,9 @@ async def recv(self):
             av_logging.set_libav_level(av_logging.ERROR)
             self._av_logging_set = True
 
+        if self.heartbeat_callback:
+            self.heartbeat_callback()
+
         # Check if we should terminate
         if self._check_termination():
             raise MediaStreamError("Processing terminated due to timeout")
@@ -649,7 +659,19 @@ async def init_rtc_peer_connection_with_loop(
     asyncio_loop: Optional[asyncio.AbstractEventLoop] = None,
     model_manager: Optional[ModelManager] = None,
     shutdown_reserve: int = WEBRTC_MODAL_SHUTDOWN_RESERVE,
+    heartbeat_callback: Optional[Callable[[], None]] = None,
 ) -> RTCPeerConnectionWithLoop:
+    # ice._mdns is instantiated on the module level, it has a lock that is bound to the event loop
+    # avoid RuntimeError: asyncio.locks.Lock is bound to a different event loop
+    if hasattr(ice, "_mdns"):
+        if hasattr(ice._mdns, "lock"):
+            logger.info("Removing lock from aioice.ice._mdns")
+            delattr(ice._mdns, "lock")
+    else:
+        logger.warning(
+            "aioice.ice implementation was changed, _mdns attribute is not available"
+        )
+
     termination_date = None
     terminate_event = asyncio.Event()
 
@@ -708,6 +730,7 @@ async def init_rtc_peer_connection_with_loop(
                 termination_date=termination_date,
                 terminate_event=terminate_event,
                 use_data_channel_frames=webrtc_request.use_data_channel_frames,
+                heartbeat_callback=heartbeat_callback,
             )
         else:
             # No video track - use base VideoFrameProcessor
@@ -723,6 +746,7 @@ async def init_rtc_peer_connection_with_loop(
                 termination_date=termination_date,
                 terminate_event=terminate_event,
                 use_data_channel_frames=webrtc_request.use_data_channel_frames,
+                heartbeat_callback=heartbeat_callback,
             )
     except (
         ValidationError,
```
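One behavioral note on the `_check_termination` hunk: since `and` binds tighter than `or` in Python, the rewritten condition groups as `(deadline set and deadline passed) or (event set and event already set)`, so the check now also fires when the terminate event was set externally, not only on deadline expiry. A runnable sketch of the parse (the helper name is illustrative):

```python
# Sketch: the new condition with explicit parentheses; `and` binds tighter
# than `or`, so this is how Python groups the four clauses.
import asyncio
import datetime
from typing import Optional

def should_terminate(
    termination_date: Optional[datetime.datetime],
    terminate_event: Optional[asyncio.Event],
) -> bool:
    return bool(
        (termination_date and termination_date < datetime.datetime.now())
        or (terminate_event and terminate_event.is_set())
    )

event = asyncio.Event()
event.set()
print(should_terminate(None, event))  # True: an externally set event now triggers termination
```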

inference/core/workflows/core_steps/loader.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -164,6 +164,9 @@
 from inference.core.workflows.core_steps.models.foundation.anthropic_claude.v1 import (
     AnthropicClaudeBlockV1,
 )
+from inference.core.workflows.core_steps.models.foundation.anthropic_claude.v2 import (
+    AnthropicClaudeBlockV2,
+)
 from inference.core.workflows.core_steps.models.foundation.clip.v1 import (
     ClipModelBlockV1,
 )
@@ -582,6 +585,7 @@ def load_blocks() -> List[Type[WorkflowBlock]]:
         DimensionCollapseBlockV1,
         FirstNonEmptyOrDefaultBlockV1,
         AnthropicClaudeBlockV1,
+        AnthropicClaudeBlockV2,
         CosineSimilarityBlockV1,
         BackgroundColorVisualizationBlockV1,
         BarcodeDetectorBlockV1,
```
