google-gemini
diff --git a/‎agent.py‎
Lines changed: 102 additions & 65 deletions b/‎agent.py‎
Lines changed: 102 additions & 65 deletions
diff --git a/‎requirements.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎sdk/google_genai-1.14.0-py3-none-any.whl‎
-161 KB b/‎sdk/google_genai-1.14.0-py3-none-any.whl‎
-161 KB
diff --git a/‎sdk/google_genai-1.25.0-py3-none-any.whl‎
204 KB b/‎sdk/google_genai-1.25.0-py3-none-any.whl‎
204 KB
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from typing import Literal, Optional
+from typing import Literal, Optional, Union
 from google import genai
 from google.genai import types
 import termcolor
@@ -33,6 +33,16 @@
 console = Console()
 
 
+# Built-in Computer Use tools will return "EnvState".
+# Custom provided functions will return "dict".
+FunctionResponseT = Union[EnvState, dict]
+
+
+def multiply_numbers(x: float, y: float) -> dict:
+    """Multiplies two numbers and returns the product under the "result" key."""
+    return {"result": x * y}
+
+
 class BrowserAgent:
     def __init__(
         self,
@@ -50,8 +60,10 @@ def __init__(
             location=os.environ.get("VERTEXAI_LOCATION"),
             http_options=types.HttpOptions(
                 api_version="v1alpha",
-                base_url="https://generativelanguage.googleapis.com",
-                )
+                base_url=os.environ.get(
+                    "GEMINI_API_SERVER", "https://generativelanguage.googleapis.com"
+                ),
+            ),
         )
         self._contents: list[Content] = [
             Content(
@@ -61,21 +73,35 @@ def __init__(
                 ],
             )
         ]
+
+        # Exclude any predefined functions here.
+        excluded_predefined_functions = []
+
+        # Add your own custom functions here.
+        custom_functions = [
+            # For example:
+            types.FunctionDeclaration.from_callable(
+                client=self._client, callable=multiply_numbers
+            )
+        ]
+
         self._generate_content_config = GenerateContentConfig(
             temperature=1,
             top_p=0.95,
             top_k=40,
             max_output_tokens=8192,
             tools=[
                 types.Tool(
-                    computer_use=types.ComputerUse(
-                        environment=types.Environment.ENVIRONMENT_BROWSER
-                    )
-                )
+                    computer_use=types.ToolComputerUse(
+                        environment=types.Environment.ENVIRONMENT_BROWSER,
+                        excluded_predefined_functions=excluded_predefined_functions,
+                    ),
+                ),
+                types.Tool(function_declarations=custom_functions),
             ],
         )
 
-    def handle_action(self, action: types.FunctionCall) -> EnvState:
+    def handle_action(self, action: types.FunctionCall) -> FunctionResponseT:
         """Handles the action and returns the environment state."""
         if action.name == "open_web_browser":
             return self._browser_computer.open_web_browser()
@@ -96,7 +122,7 @@ def handle_action(self, action: types.FunctionCall) -> EnvState:
         elif action.name == "type_text_at":
             x = self.normalize_x(action.args["x"])
             y = self.normalize_y(action.args["y"])
-            press_enter = action.args.get("press_enter", True)
+            press_enter = action.args.get("press_enter", False)
             clear_before_typing = action.args.get("clear_before_typing", True)
             return self._browser_computer.type_text_at(
                 x=x,
@@ -110,7 +136,7 @@ def handle_action(self, action: types.FunctionCall) -> EnvState:
         elif action.name == "scroll_at":
             x = self.normalize_x(action.args["x"])
             y = self.normalize_y(action.args["y"])
-            magnitude = action.args.get("magnitude", 200)
+            magnitude = action.args.get("magnitude", 800)
             direction = action.args["direction"]
 
             if direction in ("up", "down"):
@@ -147,6 +173,9 @@ def handle_action(self, action: types.FunctionCall) -> EnvState:
                 destination_x=destination_x,
                 destination_y=destination_y,
             )
+        # Handle the custom function declarations here.
+        elif action.name == multiply_numbers.__name__:
+            return multiply_numbers(x=action.args["x"], y=action.args["y"])
         else:
             raise ValueError(f"Unsupported function: {action}")
 
@@ -189,12 +218,13 @@ def get_text(self, candidate: Candidate) -> Optional[str]:
                 text.append(part.text)
         return " ".join(text) or None
 
-    def get_function_call(self, candidate: Candidate) -> Optional[types.FunctionCall]:
+    def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCall]:
         """Extracts the function call from the candidate."""
+        ret = []
         for part in candidate.content.parts:
             if part.function_call:
-                return part.function_call
-        return None
+                ret.append(part.function_call)
+        return ret
 
     def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
         # Generate a response from the model.
@@ -204,44 +234,75 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
             except Exception as e:
                 return "COMPLETE"
 
+        if not response.candidates:
+            print("Response has no candidates!")
+            print(response)
+            raise ValueError("Empty response")
+
         # Extract the text and function call from the response.
         candidate = response.candidates[0]
-        reasoning = self.get_text(candidate)
-        function_call = self.get_function_call(candidate)
-
-        # Append the model turn.
+        # Append the model turn to conversation history.
         self._contents.append(candidate.content)
 
-        if not function_call or not function_call.name:
+        reasoning = self.get_text(candidate)
+        function_calls = self.extract_function_calls(candidate)
+        if not function_calls:
             print(f"Agent Loop Complete: {reasoning}")
             return "COMPLETE"
 
-        # Print the function call and any reasoning.
-        function_call_str = f"Name: {function_call.name}"
-        if function_call.args:
-            function_call_str += f"\nArgs:"
-            for key, value in function_call.args.items():
-                function_call_str += f"\n  {key}: {value}"
+        function_call_strs = []
+        for function_call in function_calls:
+            # Format the function call for display next to the reasoning.
+            function_call_str = f"Name: {function_call.name}"
+            if function_call.args:
+                function_call_str += f"\nArgs:"
+                for key, value in function_call.args.items():
+                    function_call_str += f"\n  {key}: {value}"
+            function_call_strs.append(function_call_str)
+
         table = Table(expand=True)
         table.add_column("Gemini Reasoning", header_style="magenta", ratio=1)
-        table.add_column("Function Call", header_style="cyan", ratio=1)
-        table.add_row(
-            reasoning,
-            function_call_str,
-        )
+        table.add_column("Function Call(s)", header_style="cyan", ratio=1)
+        table.add_row(reasoning, "\n".join(function_call_strs))
         console.print(table)
         print()
 
-        if safety := function_call.args.get("safety_decision"):
-            if safety["decision"] == "block":
-                termcolor.cprint(
-                    "Terminating loop due to safety block!",
-                    color="yellow",
-                    attrs=["bold"],
+        function_responses = []
+        for function_call in function_calls:
+            fc_result = self._execute_function_call(function_call)
+            if isinstance(fc_result, EnvState):
+                function_responses.append(
+                    FunctionResponse(
+                        name=function_call.name,
+                        response={
+                            "image": {
+                                "mimetype": "image/png",
+                                "data": base64.b64encode(fc_result.screenshot).decode(
+                                    "utf-8"
+                                ),
+                            },
+                            "url": fc_result.url,
+                        },
+                    )
                 )
-                print(safety["explanation"])
-                return "COMPLETE"
-            elif safety["decision"] == "require_confirmation":
+            elif isinstance(fc_result, dict):
+                function_responses.append(
+                    FunctionResponse(name=function_call.name, response=fc_result)
+                )
+
+        self._contents.append(
+            Content(
+                role="user",
+                parts=[Part(function_response=fr) for fr in function_responses],
+            )
+        )
+        return "CONTINUE"
+
+    def _execute_function_call(
+        self, function_call: types.FunctionCall
+    ) -> FunctionResponseT:
+        if safety := function_call.args.get("safety_decision"):
+            if safety["decision"] == "require_confirmation":
                 termcolor.cprint(
                     "Safety service requires explicit confirmation!",
                     color="yellow",
@@ -257,36 +318,12 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
                 print("Proceeding with agent loop.\n")
 
         with console.status("Sending command to Computer...", spinner_style=None):
-            environment_state = self.handle_action(function_call)
-
-        self._contents.append(
-            Content(
-                role="user",
-                parts=[
-                    Part(
-                        function_response=FunctionResponse(
-                            name=function_call.name,
-                            response={
-                                "image": {
-                                    "mimetype": "image/png",
-                                    "data": base64.b64encode(
-                                        environment_state.screenshot
-                                    ).decode("utf-8"),
-                                },
-                                "url": environment_state.url,
-                            },
-                        )
-                    )
-                ],
-            )
-        )
-        return "CONTINUE"
+            return self.handle_action(function_call)
 
     def agent_loop(self):
-        while True:
+        status = "CONTINUE"
+        while status == "CONTINUE":
             status = self.run_one_iteration()
-            if status == "COMPLETE":
-                return
 
     def normalize_x(self, x: int) -> int:
         return int(x / 1000 * self._browser_computer.screen_size()[0])
 
@@ -1,6 +1,6 @@
 termcolor==3.1.0
 pydantic==2.11.4
-./sdk/google_genai-1.14.0-py3-none-any.whl
+./sdk/google_genai-1.25.0-py3-none-any.whl
 playwright==1.52.0
 browserbase==1.3.0
 rich