google-gemini
diff --git a/‎agent.py‎
Lines changed: 114 additions & 72 deletions b/‎agent.py‎
Lines changed: 114 additions & 72 deletions
diff --git a/‎apiserver/app.py‎
Lines changed: 2 additions & 1 deletion b/‎apiserver/app.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎computers/playwright/playwright.py‎
Lines changed: 6 additions & 2 deletions b/‎computers/playwright/playwright.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎main.py‎
Lines changed: 1 addition & 1 deletion b/‎main.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎requirements.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎sdk/google_genai-1.14.0-py3-none-any.whl‎
-161 KB b/‎sdk/google_genai-1.14.0-py3-none-any.whl‎
-161 KB
diff --git a/‎sdk/google_genai-1.25.0-py3-none-any.whl‎
204 KB b/‎sdk/google_genai-1.25.0-py3-none-any.whl‎
204 KB
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from typing import Literal, Optional
+from typing import Literal, Optional, Union, Any
 from google import genai
 from google.genai import types
 import termcolor
@@ -33,6 +33,16 @@
 console = Console()
 
 
+# Built-in Computer Use tools will return "EnvState".
+# Custom provided functions will return "dict".
+FunctionResponseT = Union[EnvState, dict]
+
+
+def multiply_numbers(x: float, y: float) -> dict:
+    """Multiplies two numbers."""
+    # NOTE(review): the docstring above is presumably what
+    # FunctionDeclaration.from_callable forwards to the model as the tool
+    # description, so its text is kept verbatim - confirm before rewording.
+    product = x * y
+    # Custom functions hand back a plain dict rather than an EnvState.
+    return {"result": product}
+
+
 class BrowserAgent:
     def __init__(
         self,
@@ -50,8 +60,10 @@ def __init__(
             location=os.environ.get("VERTEXAI_LOCATION"),
             http_options=types.HttpOptions(
                 api_version="v1alpha",
-                base_url="https://generativelanguage.googleapis.com",
-                )
+                base_url=os.environ.get(
+                    "GEMINI_API_SERVER", "https://generativelanguage.googleapis.com"
+                ),
+            ),
         )
         self._contents: list[Content] = [
             Content(
@@ -61,21 +73,36 @@ def __init__(
                 ],
             )
         ]
+
+        # Exclude any predefined functions here.
+        excluded_predefined_functions = []
+
+        # Add your own custom functions here.
+        custom_functions = [
+            # For example:
+            types.FunctionDeclaration.from_callable(
+                client=self._client, callable=multiply_numbers
+            )
+        ]
+
         self._generate_content_config = GenerateContentConfig(
             temperature=1,
             top_p=0.95,
             top_k=40,
             max_output_tokens=8192,
             tools=[
                 types.Tool(
-                    computer_use=types.ComputerUse(
-                        environment=types.Environment.ENVIRONMENT_BROWSER
-                    )
-                )
+                    computer_use=types.ToolComputerUse(
+                        environment=types.Environment.ENVIRONMENT_BROWSER,
+                        excluded_predefined_functions=excluded_predefined_functions,
+                    ),
+                ),
+                types.Tool(function_declarations=custom_functions),
             ],
+            thinking_config=types.ThinkingConfig(include_thoughts=True),
         )
 
-    def handle_action(self, action: types.FunctionCall) -> EnvState:
+    def handle_action(self, action: types.FunctionCall) -> FunctionResponseT:
         """Handles the action and returns the environment state."""
         if action.name == "open_web_browser":
             return self._browser_computer.open_web_browser()
@@ -96,7 +123,7 @@ def handle_action(self, action: types.FunctionCall) -> EnvState:
         elif action.name == "type_text_at":
             x = self.normalize_x(action.args["x"])
             y = self.normalize_y(action.args["y"])
-            press_enter = action.args.get("press_enter", True)
+            press_enter = action.args.get("press_enter", False)
             clear_before_typing = action.args.get("clear_before_typing", True)
             return self._browser_computer.type_text_at(
                 x=x,
@@ -110,7 +137,7 @@ def handle_action(self, action: types.FunctionCall) -> EnvState:
         elif action.name == "scroll_at":
             x = self.normalize_x(action.args["x"])
             y = self.normalize_y(action.args["y"])
-            magnitude = action.args.get("magnitude", 200)
+            magnitude = action.args.get("magnitude", 800)
             direction = action.args["direction"]
 
             if direction in ("up", "down"):
@@ -147,6 +174,9 @@ def handle_action(self, action: types.FunctionCall) -> EnvState:
                 destination_x=destination_x,
                 destination_y=destination_y,
             )
+        # Handle the custom function declarations here.
+        elif action.name == multiply_numbers.__name__:
+            return multiply_numbers(x=action.args["x"], y=action.args["y"])
         else:
             raise ValueError(f"Unsupported function: {action}")
 
@@ -189,12 +219,13 @@ def get_text(self, candidate: Candidate) -> Optional[str]:
                 text.append(part.text)
         return " ".join(text) or None
 
-    def get_function_call(self, candidate: Candidate) -> Optional[types.FunctionCall]:
-        """Extracts the function call from the candidate."""
-        for part in candidate.content.parts:
-            if part.function_call:
-                return part.function_call
-        return None
+    def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCall]:
+        """Extracts every function call from the candidate.
+
+        Args:
+            candidate: The response candidate whose content parts are scanned.
+
+        Returns:
+            The function calls found in the candidate's content parts, in
+            part order; an empty list when no part carries one.
+        """
+        return [
+            part.function_call
+            for part in candidate.content.parts
+            if part.function_call
+        ]
 
     def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
         # Generate a response from the model.
@@ -204,89 +235,100 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
             except Exception as e:
                 return "COMPLETE"
 
+        if not response.candidates:
+            print("Response has no candidates!")
+            print(response)
+            raise ValueError("Empty response")
+
         # Extract the text and function call from the response.
         candidate = response.candidates[0]
-        reasoning = self.get_text(candidate)
-        function_call = self.get_function_call(candidate)
-
-        # Append the model turn.
+        # Append the model turn to conversation history.
         self._contents.append(candidate.content)
 
-        if not function_call or not function_call.name:
+        reasoning = self.get_text(candidate)
+        function_calls = self.extract_function_calls(candidate)
+        if not function_calls:
             print(f"Agent Loop Complete: {reasoning}")
             return "COMPLETE"
 
-        # Print the function call and any reasoning.
-        function_call_str = f"Name: {function_call.name}"
-        if function_call.args:
-            function_call_str += f"\nArgs:"
-            for key, value in function_call.args.items():
-                function_call_str += f"\n  {key}: {value}"
+        function_call_strs = []
+        for function_call in function_calls:
+            # Print the function call and any reasoning.
+            function_call_str = f"Name: {function_call.name}"
+            if function_call.args:
+                function_call_str += f"\nArgs:"
+                for key, value in function_call.args.items():
+                    function_call_str += f"\n  {key}: {value}"
+            function_call_strs.append(function_call_str)
+
         table = Table(expand=True)
         table.add_column("Gemini Reasoning", header_style="magenta", ratio=1)
-        table.add_column("Function Call", header_style="cyan", ratio=1)
-        table.add_row(
-            reasoning,
-            function_call_str,
-        )
+        table.add_column("Function Call(s)", header_style="cyan", ratio=1)
+        table.add_row(reasoning, "\n".join(function_call_strs))
         console.print(table)
         print()
 
-        if safety := function_call.args.get("safety_decision"):
-            if safety["decision"] == "block":
-                termcolor.cprint(
-                    "Terminating loop due to safety block!",
-                    color="yellow",
-                    attrs=["bold"],
+        function_responses = []
+        for function_call in function_calls:
+            if function_call.args and (
+                safety := function_call.args.get("safety_decision")
+            ):
+                decision = self._get_safety_confirmation(safety)
+                if decision == "TERMINATE":
+                    print("Terminating agent loop")
+                    return "COMPLETE"
+            with console.status("Sending command to Computer...", spinner_style=None):
+                fc_result = self.handle_action(function_call)
+            if isinstance(fc_result, EnvState):
+                function_responses.append(
+                    FunctionResponse(
+                        name=function_call.name,
+                        response={
+                            "image": {
+                                "mimetype": "image/png",
+                                "data": base64.b64encode(fc_result.screenshot).decode(
+                                    "utf-8"
+                                ),
+                            },
+                            "url": fc_result.url,
+                        },
+                    )
                 )
-                print(safety["explanation"])
-                return "COMPLETE"
-            elif safety["decision"] == "require_confirmation":
-                termcolor.cprint(
-                    "Safety service requires explicit confirmation!",
-                    color="yellow",
-                    attrs=["bold"],
+            elif isinstance(fc_result, dict):
+                function_responses.append(
+                    FunctionResponse(name=function_call.name, response=fc_result)
                 )
-                print(safety["explanation"])
-                decision = ""
-                while decision.lower() not in ("y", "n", "ye", "yes", "no"):
-                    decision = input("Do you wish to proceed? [Y]es/[n]o\n")
-                if decision.lower() in ("n", "no"):
-                    print("Terminating agent loop.")
-                    return "COMPLETE"
-                print("Proceeding with agent loop.\n")
-
-        with console.status("Sending command to Computer...", spinner_style=None):
-            environment_state = self.handle_action(function_call)
 
         self._contents.append(
             Content(
                 role="user",
-                parts=[
-                    Part(
-                        function_response=FunctionResponse(
-                            name=function_call.name,
-                            response={
-                                "image": {
-                                    "mimetype": "image/png",
-                                    "data": base64.b64encode(
-                                        environment_state.screenshot
-                                    ).decode("utf-8"),
-                                },
-                                "url": environment_state.url,
-                            },
-                        )
-                    )
-                ],
+                parts=[Part(function_response=fr) for fr in function_responses],
             )
         )
         return "CONTINUE"
 
+    def _get_safety_confirmation(
+        self, safety: dict[str, Any]
+    ) -> Literal["CONTINUE", "TERMINATE"]:
+        """Asks the user whether to proceed with a safety-flagged action.
+
+        Args:
+            safety: The "safety_decision" argument of a function call; its
+                "decision" and "explanation" entries are read.
+
+        Returns:
+            "TERMINATE" if the user answers no, otherwise "CONTINUE".
+
+        Raises:
+            ValueError: If the decision is anything other than
+                "require_confirmation".
+        """
+        if safety["decision"] != "require_confirmation":
+            # The placeholder needs braces; without them the message showed
+            # the literal text "safety['decision']" instead of the value.
+            raise ValueError(f"Unknown safety decision: {safety['decision']}")
+        termcolor.cprint(
+            "Safety service requires explicit confirmation!",
+            color="yellow",
+            attrs=["bold"],
+        )
+        print(safety["explanation"])
+        # Keep prompting until the answer is a recognised yes/no spelling.
+        decision = ""
+        while decision.lower() not in ("y", "n", "ye", "yes", "no"):
+            decision = input("Do you wish to proceed? [Y]es/[n]o\n")
+        if decision.lower() in ("n", "no"):
+            return "TERMINATE"
+        return "CONTINUE"
+
     def agent_loop(self):
+        """Calls run_one_iteration until it stops returning "CONTINUE"."""
-        while True:
+        status = "CONTINUE"
+        while status == "CONTINUE":
             status = self.run_one_iteration()
-            if status == "COMPLETE":
-                return
 
     def normalize_x(self, x: int) -> int:
+        # Treats x as a fraction of 1000 and scales it by element [0] of the
+        # browser's screen_size() (presumably the width in pixels - confirm).
         return int(x / 1000 * self._browser_computer.screen_size()[0])
 
@@ -188,7 +188,8 @@ async def delete_session(
 
 
 # Static HTML5 to test the API.
-app.mount("/", StaticFiles(directory="static", html=True), name="static")
+static_dir = os.path.join(os.path.dirname(__file__), "static")
+app.mount("/", StaticFiles(directory=static_dir, html=True), name="static")
 
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", "8000"))
 
@@ -13,6 +13,7 @@
 # limitations under the License.
 import termcolor
 import time
+import sys
 from ..computer import (
     Computer,
     EnvState,
@@ -144,15 +145,18 @@ def type_text_at(
         x: int,
         y: int,
         text: str,
-        press_enter: bool = True,
+        press_enter: bool = False,
         clear_before_typing: bool = True,
     ) -> EnvState:
         self.highlight_mouse(x, y)
         self._page.mouse.click(x, y)
         self._page.wait_for_load_state()
 
         if clear_before_typing:
-            self.key_combination(["Control", "A"])
+            if sys.platform == "darwin":
+                self.key_combination(["Command", "A"])
+            else:
+                self.key_combination(["Control", "A"])
             self.key_combination(["Delete"])
 
         self._page.keyboard.type(text)
 
@@ -19,7 +19,7 @@
 
 
 CLOUD_RUN_SCREEN_SIZE = (1920, 1080)
-PLAYWRIGHT_SCREEN_SIZE = (1440, 810)
+PLAYWRIGHT_SCREEN_SIZE = (1920, 1080)
 
 
 def main() -> int:
 
@@ -1,6 +1,6 @@
 termcolor==3.1.0
 pydantic==2.11.4
-./sdk/google_genai-1.14.0-py3-none-any.whl
+./sdk/google_genai-1.25.0-py3-none-any.whl
 playwright==1.52.0
 browserbase==1.3.0
 rich