
Commit 1877720

Add text mode (#321)

* Pretty good spot
* Working draft
* Fix other mode
* Add js to git
* Working
* Add code
* fix
* Fix
* Add code
* Fix submit race condition
* demo
* fix
* Fix
* Fix

1 parent 1179f8e, commit 1877720

69 files changed: +110233 −22961 lines


backend/fastrtc/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -35,6 +35,7 @@
     AdditionalOutputs,
     CloseStream,
     Warning,
+    WebRTCData,
     WebRTCError,
     aggregate_bytes_to_16bit,
     async_aggregate_bytes_to_16bit,
@@ -92,4 +93,5 @@
     "CloseStream",
     "get_current_context",
     "CartesiaTTSOptions",
+    "WebRTCData",
 ]
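
With this change, WebRTCData (imported from fastrtc.utils, as the reply_on_pause.py diff below shows) is re-exported from the package root and listed in __all__. A minimal import sketch:

    # WebRTCData is now importable from the package root alongside the other public helpers
    from fastrtc import ReplyOnPause, WebRTCData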

backend/fastrtc/reply_on_pause.py

Lines changed: 27 additions & 13 deletions

@@ -11,7 +11,7 @@

 from .pause_detection import ModelOptions, PauseDetectionModel, get_silero_model
 from .tracks import EmitType, StreamHandler
-from .utils import AdditionalOutputs, create_message, split_output
+from .utils import AdditionalOutputs, WebRTCData, create_message, split_output

 logger = getLogger(__name__)

@@ -67,6 +67,14 @@ def new(self):
         [tuple[int, NDArray[np.int16]], Any],
         AsyncGenerator[EmitType, None],
     ]
+    | Callable[
+        [WebRTCData],
+        Generator[EmitType, None, None],
+    ]
+    | Callable[
+        [WebRTCData, Any],
+        AsyncGenerator[EmitType, None],
+    ]
 )


@@ -115,6 +123,7 @@ def __init__(
         output_frame_size: int | None = None,  # Deprecated
         input_sample_rate: int = 48000,
         model: PauseDetectionModel | None = None,
+        needs_args: bool = False,
     ):
         """
         Initializes the ReplyOnPause handler.
@@ -132,6 +141,7 @@ def __init__(
             output_frame_size: Deprecated.
             input_sample_rate: The expected sample rate of incoming audio.
             model: An optional pre-initialized VAD model instance.
+            needs_args: Whether the reply function expects additional arguments.
         """
         super().__init__(
             expected_layout,
@@ -152,11 +162,12 @@ def __init__(
         self.model_options = model_options
         self.algo_options = algo_options or AlgoOptions()
         self.startup_fn = startup_fn
+        self.needs_args = needs_args

     @property
     def _needs_additional_inputs(self) -> bool:
         """Checks if the reply function `fn` expects additional arguments."""
-        return len(inspect.signature(self.fn).parameters) > 1
+        return len(inspect.signature(self.fn).parameters) > 1 or self.needs_args

     def start_up(self):
         """
@@ -187,6 +198,7 @@ def copy(self):
             self.output_frame_size,
             self.input_sample_rate,
             self.model,
+            self.needs_args,
         )

     def determine_pause(
@@ -361,19 +373,21 @@ def emit(self):
         else:
             if not self.generator:
                 self.send_message_sync(create_message("log", "pause_detected"))
-                if self._needs_additional_inputs and not self.args_set.is_set():
-                    if not self.phone_mode:
-                        self.wait_for_args_sync()
-                    else:
-                        self.latest_args = [None]
-                        self.args_set.set()
+                if self._needs_additional_inputs and not self.phone_mode:
+                    self.wait_for_args_sync()
+                else:
+                    self.latest_args = [None]
+                    self.args_set.set()
                 logger.debug("Creating generator")
-                audio = cast(np.ndarray, self.state.stream).reshape(1, -1)
-                if self._needs_additional_inputs:
-                    self.latest_args[0] = (self.state.sampling_rate, audio)
-                    self.generator = self.fn(*self.latest_args)  # type: ignore
+                if self.state.stream is not None and self.state.stream.size > 0:
+                    audio = cast(np.ndarray, self.state.stream).reshape(1, -1)
                 else:
-                    self.generator = self.fn((self.state.sampling_rate, audio))  # type: ignore
+                    audio = np.array([[]], dtype=np.int16)
+                if isinstance(self.latest_args[0], WebRTCData):
+                    self.latest_args[0].audio = (self.state.sampling_rate, audio)
+                else:
+                    self.latest_args[0] = (self.state.sampling_rate, audio)
+                self.generator = self.fn(*self.latest_args)  # type: ignore
                 logger.debug("Latest args: %s", self.latest_args)
                 self.state = self.state.new()
                 self.state.responding = True
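
Taken together, these changes let a ReplyOnPause reply function receive a WebRTCData object instead of a bare (sampling_rate, audio) tuple: needs_args forces the handler to wait for frontend-submitted arguments even when the function has a single parameter, emit() now tolerates an empty audio buffer (a text-only submission), and it fills latest_args[0].audio when the first argument is a WebRTCData. A minimal sketch of a reply function for the textbox variant; WebRTCData attribute names other than audio are assumptions, not shown in this diff:

    import numpy as np
    from fastrtc import ReplyOnPause, WebRTCData

    def reply(data: WebRTCData):
        # emit() sets data.audio to a (sampling_rate, ndarray) pair; for a
        # text-only submission the array is empty instead of raising on reshape().
        sampling_rate, audio = data.audio
        if audio.size == 0:
            audio = np.zeros((1, sampling_rate), dtype=np.int16)  # one second of silence as a placeholder
        yield (sampling_rate, audio)  # echo the captured (or placeholder) audio back

    # needs_args=True marks the single-parameter reply as still expecting
    # frontend-submitted arguments; Stream sets this flag automatically for the
    # textbox variant (see the stream.py diff below).
    handler = ReplyOnPause(reply, needs_args=True)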

backend/fastrtc/reply_on_stopwords.py

Lines changed: 4 additions & 0 deletions

@@ -61,6 +61,7 @@ def __init__(
         output_frame_size: int | None = None,  # Deprecated
         input_sample_rate: int = 48000,
         model: PauseDetectionModel | None = None,
+        needs_args: bool = False,
     ):
         """
         Initializes the ReplyOnStopWords handler.
@@ -80,6 +81,7 @@ def __init__(
             output_frame_size: Deprecated.
             input_sample_rate: The expected sample rate of incoming audio.
             model: An optional pre-initialized VAD model instance.
+            needs_args: Whether the reply function expects additional arguments.
         """
         super().__init__(
             fn,
@@ -92,6 +94,7 @@ def __init__(
             output_frame_size=output_frame_size,
             input_sample_rate=input_sample_rate,
             model=model,
+            needs_args=needs_args,
         )
         self.stop_words = stop_words
         self.state = ReplyOnStopWordsState()
@@ -236,4 +239,5 @@ def copy(self):
             self.output_frame_size,
             self.input_sample_rate,
             self.model,
+            self.needs_args,
         )
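
ReplyOnStopWords only threads the new flag through: it accepts needs_args, forwards it to ReplyOnPause.__init__, and preserves it in copy(). A usage sketch; keyword names other than stop_words and needs_args, and the example phrase, are assumptions:

    from fastrtc import ReplyOnStopWords

    # Same WebRTCData-aware reply function as above, now gated on a wake phrase.
    handler = ReplyOnStopWords(
        reply,
        stop_words=["computer"],  # assumed example phrase
        needs_args=True,          # forwarded unchanged to ReplyOnPause
    )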

backend/fastrtc/stream.py

Lines changed: 139 additions & 56 deletions

@@ -141,6 +141,15 @@ def __init__(
         self.modality = modality
         self.rtp_params = rtp_params
         self.event_handler = handler
+        if (
+            ui_args
+            and ui_args.get("variant") == "textbox"
+            and hasattr(handler, "needs_args")
+        ):
+            self.event_handler.needs_args = True  # type: ignore
+        else:
+            self.event_handler.needs_args = False  # type: ignore
+
         self.concurrency_limit = cast(
             (int),
             1 if concurrency_limit in ["default", None] else concurrency_limit,
@@ -574,28 +583,58 @@ def _generate_default_ui(
             </div>
             """
         )
-        with gr.Row():
-            with gr.Column():
-                with gr.Group():
-                    image = WebRTC(
-                        label="Stream",
-                        rtc_configuration=self.rtc_configuration,
-                        track_constraints=self.track_constraints,
-                        mode="send",
-                        modality="audio",
-                        icon=ui_args.get("icon"),
-                        icon_button_color=ui_args.get("icon_button_color"),
-                        pulse_color=ui_args.get("pulse_color"),
-                        icon_radius=ui_args.get("icon_radius"),
-                    )
-                self.webrtc_component = image
-                for component in additional_input_components:
-                    if component not in same_components:
+        if ui_args.get("variant", "textbox"):
+            with gr.Row():
+                if additional_input_components:
+                    with gr.Column():
+                        for component in additional_input_components:
                             component.render()
-                if additional_output_components:
+                diff_output_components = [
+                    component
+                    for component in additional_output_components
+                    if component not in same_components
+                ]
+                if diff_output_components:
+                    with gr.Column():
+                        for component in diff_output_components:
+                            component.render()
+            with gr.Row():
+                image = WebRTC(
+                    label="Stream",
+                    rtc_configuration=self.rtc_configuration,
+                    track_constraints=self.track_constraints,
+                    mode="send",
+                    modality="audio",
+                    icon=ui_args.get("icon"),
+                    icon_button_color=ui_args.get("icon_button_color"),
+                    pulse_color=ui_args.get("pulse_color"),
+                    icon_radius=ui_args.get("icon_radius"),
+                    variant=ui_args.get("variant", "wave"),
+                )
+        else:
+            with gr.Row():
                 with gr.Column():
-                    for component in additional_output_components:
-                        component.render()
+                    with gr.Group():
+                        image = WebRTC(
+                            label="Stream",
+                            rtc_configuration=self.rtc_configuration,
+                            track_constraints=self.track_constraints,
+                            mode="send",
+                            modality="audio",
+                            icon=ui_args.get("icon"),
+                            icon_button_color=ui_args.get("icon_button_color"),
+                            pulse_color=ui_args.get("pulse_color"),
+                            icon_radius=ui_args.get("icon_radius"),
+                            variant=ui_args.get("variant", "wave"),
+                        )
+                    for component in additional_input_components:
+                        if component not in same_components:
+                            component.render()
+                    if additional_output_components:
+                        with gr.Column():
+                            for component in additional_output_components:
+                                component.render()
+        self.webrtc_component = image
         image.stream(
             fn=self.event_handler,
             inputs=[image] + additional_input_components,
@@ -630,45 +669,89 @@ def _generate_default_ui(
             </div>
             """
         )
-        with gr.Row():
-            with gr.Column():
-                with gr.Group():
-                    image = WebRTC(
-                        label="Stream",
-                        rtc_configuration=self.rtc_configuration,
-                        track_constraints=self.track_constraints,
-                        mode="send-receive",
-                        modality="audio",
-                        icon=ui_args.get("icon"),
-                        icon_button_color=ui_args.get("icon_button_color"),
-                        pulse_color=ui_args.get("pulse_color"),
-                        icon_radius=ui_args.get("icon_radius"),
-                    )
-                self.webrtc_component = image
-                for component in additional_input_components:
-                    if component not in same_components:
+        if ui_args.get("variant", "") == "textbox":
+            with gr.Row():
+                if additional_input_components:
+                    with gr.Column():
+                        for component in additional_input_components:
                             component.render()
+                diff_output_components = [
+                    component
+                    for component in additional_output_components
+                    if component not in same_components
+                ]
+                if diff_output_components:
+                    with gr.Column():
+                        for component in diff_output_components:
+                            component.render()
+            with gr.Row():
+                image = WebRTC(
+                    label="Stream",
+                    rtc_configuration=self.rtc_configuration,
+                    track_constraints=self.track_constraints,
+                    mode="send-receive",
+                    modality="audio",
+                    icon=ui_args.get("icon"),
+                    icon_button_color=ui_args.get("icon_button_color"),
+                    pulse_color=ui_args.get("pulse_color"),
+                    icon_radius=ui_args.get("icon_radius"),
+                    variant=ui_args.get("variant", "wave"),
+                )
+        else:
             if additional_output_components:
-                with gr.Column():
-                    for component in additional_output_components:
-                        component.render()
-
-        image.stream(
-            fn=self.event_handler,
-            inputs=[image] + additional_input_components,
-            outputs=[image],
-            time_limit=self.time_limit,
-            concurrency_limit=self.concurrency_limit,  # type: ignore
-            send_input_on=ui_args.get("send_input_on", "change"),
+                with gr.Row():
+                    with gr.Column():
+                        image = WebRTC(
+                            label="Stream",
+                            rtc_configuration=self.rtc_configuration,
+                            track_constraints=self.track_constraints,
+                            mode="send-receive",
+                            modality="audio",
+                            icon=ui_args.get("icon"),
+                            icon_button_color=ui_args.get("icon_button_color"),
+                            pulse_color=ui_args.get("pulse_color"),
+                            icon_radius=ui_args.get("icon_radius"),
+                        )
+                        for component in additional_input_components:
+                            if component not in same_components:
+                                component.render()
+                    with gr.Column():
+                        for component in additional_output_components:
+                            component.render()
+            else:
+                with gr.Row():
+                    with gr.Column():
+                        image = WebRTC(
+                            label="Stream",
+                            rtc_configuration=self.rtc_configuration,
+                            track_constraints=self.track_constraints,
+                            mode="send-receive",
+                            modality="audio",
+                            icon=ui_args.get("icon"),
+                            icon_button_color=ui_args.get("icon_button_color"),
+                            pulse_color=ui_args.get("pulse_color"),
+                            icon_radius=ui_args.get("icon_radius"),
+                        )
+                        for component in additional_input_components:
+                            if component not in same_components:
+                                component.render()
+        self.webrtc_component = image
+        image.stream(
+            fn=self.event_handler,
+            inputs=[image] + additional_input_components,
+            outputs=[image],
+            time_limit=self.time_limit,
+            concurrency_limit=self.concurrency_limit,  # type: ignore
+            send_input_on=ui_args.get("send_input_on", "change"),
+        )
+        if additional_output_components:
+            assert self.additional_outputs_handler
+            image.on_additional_outputs(
+                self.additional_outputs_handler,
+                inputs=additional_output_components,
+                outputs=additional_output_components,
+                concurrency_limit=self.concurrency_limit_gradio,  # type: ignore
             )
-        if additional_output_components:
-            assert self.additional_outputs_handler
-            image.on_additional_outputs(
-                self.additional_outputs_handler,
-                inputs=additional_output_components,
-                outputs=additional_output_components,
-                concurrency_limit=self.concurrency_limit_gradio,  # type: ignore
-            )
     elif self.modality == "audio-video" and self.mode == "send-receive":
         css = """.my-group {max-width: 600px !important; max-height: 600 !important;}
         .my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""