google-gemini · ericpts · Aug 19, 2025 · Aug 13, 2025 · Aug 13, 2025 · Aug 13, 2025
diff --git a/README.md b/README.md
@@ -92,6 +92,7 @@ python main.py --query "Go to Google and type 'Hello World' into the search bar"
 - `cloud-run`: Connects to a deployed Cloud Run service (default).
 - `playwright`: Runs the browser locally using Playwright.
 - `browserbase`: Connects to a Browserbase instance.
+- `hud`: Integrates with hud's browser environment.
 
 **Local Playwright**
 
@@ -115,6 +116,14 @@ Runs the agent using Browserbase as the browser backend. Ensure the proper Brows
 python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="browserbase"
 ```
 
+**hud**
+
+Runs the agent using hud's browser environment. This is the same environment used by `hud_eval.py` but can be run directly with `main.py` for individual tasks. Ensure the `HUD_API_KEY` environment variable is set.
+
+```bash
+python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="hud"
+```
+
 **Cloud Run**
 
 Connects to an [API Server](./apiserver/) deployed on Cloud Run for computer use.
@@ -157,6 +166,31 @@ The `main.py` script is the command-line interface (CLI) for running the browser
 | API_SERVER_KEY | The API key for your deployed Cloud Run API server, if it's configured to require one. Can also be provided via the `--api_server_key` argument. | Conditionally (if API server requires it and not passed via CLI) |
 | BROWSERBASE_API_KEY | Your API key for Browserbase. | Yes (when using the browserbase environment) |
 | BROWSERBASE_PROJECT_ID | Your Project ID for Browserbase. | Yes (when using the browserbase environment) |
+| HUD_API_KEY | Your API key for hud. Required for running evaluations with hud_eval.py. | Yes (when using the hud environment or running hud_eval.py) |
+
+## Evaluations
+
+The `hud_eval.py` script allows you to run automated evaluations against hud tasksets:
+
+```bash
+python hud_eval.py --taskset <taskset_id> [--parallel] [--max_concurrent <n>]
+```
+
+**Arguments:**
+- `--taskset`: The HUD taskset ID to evaluate (e.g., 'OSWorld-Verified')
+- `--parallel`: Run tasks in parallel (default: serial execution)
+- `--max_concurrent`: Maximum concurrent tasks when running in parallel (default: 3)
+- `--model`: Model name (default: 'gemini-2.0-flash-exp')
+- `--api_key`: Gemini API key (uses GEMINI_API_KEY env var if not provided)
+
+**Example:**
+```bash
+# Run a taskset serially
+python hud_eval.py --taskset SheetBench-V2
+
+# Run in parallel with 50 concurrent tasks (can support up to 400)
+python hud_eval.py --taskset OSWorld-Verified --parallel --max_concurrent 50
+```
 
 ## Computers
 

diff --git a/agent.py b/agent.py
@@ -32,7 +32,6 @@
 
 console = Console()
 
-
 # Built-in Computer Use tools will return "EnvState".
 # Custom provided functions will return "dict".
 FunctionResponseT = Union[EnvState, dict]
@@ -44,10 +43,12 @@ def multiply_numbers(x: float, y: float) -> dict:
 
 
 class BrowserAgent:
-    def __init__(self, browser_computer: Computer, query: str, model_name: str):
+    def __init__(self, browser_computer: Computer, query: str, model_name: str, verbose: bool = True):
         self._browser_computer = browser_computer
         self._query = query
         self._model_name = model_name
+        self._verbose = verbose
+        self.final_reasoning = None
         self._client = genai.Client(
             api_key=os.environ.get("GEMINI_API_KEY"),
             vertexai=os.environ.get("USE_VERTEXAI", "0").lower() in ["true", "1"],
@@ -208,6 +209,8 @@ def get_model_response(
 
     def get_text(self, candidate: Candidate) -> Optional[str]:
         """Extracts the text from the candidate."""
+        if not candidate.content or not candidate.content.parts:
+            return None
         text = []
         for part in candidate.content.parts:
             if part.text:
@@ -216,6 +219,8 @@ def get_text(self, candidate: Candidate) -> Optional[str]:
 
     def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCall]:
         """Extracts the function call from the candidate."""
+        if not candidate.content or not candidate.content.parts:
+            return []
         ret = []
         for part in candidate.content.parts:
             if part.function_call:
@@ -224,7 +229,13 @@ def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCal
 
     def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
         # Generate a response from the model.
-        with console.status("Generating response from Gemini...", spinner_style=None):
+        if self._verbose:
+            with console.status("Generating response from Gemini...", spinner_style=None):
+                try:
+                    response = self.get_model_response()
+                except Exception as e:
+                    return "COMPLETE"
+        else:
             try:
                 response = self.get_model_response()
             except Exception as e:
@@ -234,16 +245,18 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
             print("Response has no candidates!")
             print(response)
             raise ValueError("Empty response")
-
+        
         # Extract the text and function call from the response.
         candidate = response.candidates[0]
         # Append the model turn to conversation history.
-        self._contents.append(candidate.content)
+        if candidate.content:
+            self._contents.append(candidate.content)
 
         reasoning = self.get_text(candidate)
         function_calls = self.extract_function_calls(candidate)
         if not function_calls:
             print(f"Agent Loop Complete: {reasoning}")
+            self.final_reasoning = reasoning
             return "COMPLETE"
 
         function_call_strs = []
@@ -260,8 +273,9 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
         table.add_column("Gemini Reasoning", header_style="magenta", ratio=1)
         table.add_column("Function Call(s)", header_style="cyan", ratio=1)
         table.add_row(reasoning, "\n".join(function_call_strs))
-        console.print(table)
-        print()
+        if self._verbose:
+            console.print(table)
+            print()
 
         function_responses = []
         for function_call in function_calls:
@@ -272,7 +286,10 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
                 if decision == "TERMINATE":
                     print("Terminating agent loop")
                     return "COMPLETE"
-            with console.status("Sending command to Computer...", spinner_style=None):
+            if self._verbose:
+                with console.status("Sending command to Computer...", spinner_style=None):
+                    fc_result = self.handle_action(function_call)
+            else:
                 fc_result = self.handle_action(function_call)
             if isinstance(fc_result, EnvState):
                 function_responses.append(
@@ -299,6 +316,7 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
                 parts=[Part(function_response=fr) for fr in function_responses],
             )
         )
+
         return "CONTINUE"
 
     def _get_safety_confirmation(

diff --git a/computers/hud/hud.py b/computers/hud/hud.py
@@ -4,7 +4,8 @@
 import termcolor
 
 from ..computer import Computer, EnvState
-
+from hud.job import Job
+from hud.task import Task
 
 class HudComputer(Computer):
     """HUD SDK Computer implementation that uses HUD environments for browser control."""
@@ -15,11 +16,15 @@ def __init__(
         initial_url: str = "https://www.google.com",
         search_engine_url: str = "https://www.google.com",
         task_prompt: Optional[str] = None,
+        task: Optional[Task] = None,  # Optional Task object from HUD SDK
+        job: Optional[Job] = None,
     ):
         self._screen_size = screen_size
         self._initial_url = initial_url
         self._search_engine_url = search_engine_url
         self._task_prompt = task_prompt or "Browse the web"
+        self._task = task
+        self._job = job
         self._env = None
         self._obs = None
         self._done = False
@@ -40,16 +45,20 @@ def __enter__(self):
         except ImportError:
             raise ImportError("HUD SDK not installed. Please install with: pip install hud-python")
 
-        # Create a task for the browser environment
-        task = Task(
-            prompt=self._task_prompt,
-            gym="hud-browser",
-            setup=("goto", self._initial_url),
-            evaluate=("page_contains", "dummy")
-        )
+        # Use provided task or create a default one
+        if self._task:
+            task = self._task
+        else:
+            # Create a default task for the browser environment
+            task = Task(
+                prompt=self._task_prompt,
+                gym="hud-browser",
+                setup=("goto", self._initial_url),
+                evaluate=("page_contains", "dummy")
+            )
 
         # Create the environment
-        self._env = self._loop.run_until_complete(gym.make(task))
+        self._env = self._loop.run_until_complete(gym.make(task, job=self._job))
 
         # Reset the environment to get initial observation
         self._obs, _ = self._loop.run_until_complete(self._env.reset())
@@ -242,6 +251,7 @@ def scroll_at(
         direction: Literal["up", "down", "left", "right"],
         magnitude: int,
     ) -> EnvState:
+        self._execute_action("move", x=x, y=y)
         return self._execute_action(
             "scroll", x=x, y=y, direction=direction, magnitude=magnitude
         )
@@ -280,4 +290,9 @@ def drag_and_drop(
     def current_state(self) -> EnvState:
         screenshot = self._get_screenshot_from_obs()
         url = self._get_url_from_obs()
-        return EnvState(screenshot=screenshot, url=url) 
+        return EnvState(screenshot=screenshot, url=url) 
+
+    def evaluate(self) -> dict:
+        return self._loop.run_until_complete(
+            self._env.evaluate()
+        )