Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ python main.py --query "Go to Google and type 'Hello World' into the search bar"
- `cloud-run`: Connects to a deployed Cloud Run service (default).
- `playwright`: Runs the browser locally using Playwright.
- `browserbase`: Connects to a Browserbase instance.
- `hud`: Integrates with hud's browser environment.

**Local Playwright**

Expand All @@ -115,6 +116,14 @@ Runs the agent using Browserbase as the browser backend. Ensure the proper Brows
python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="browserbase"
```

**hud**

Runs the agent using hud's browser environment. This is the same environment used by `hud_eval.py` but can be run directly with `main.py` for individual tasks. Ensure the `HUD_API_KEY` environment variable is set.

```bash
python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="hud"
```

**Cloud Run**

Connects to an [API Server](./apiserver/) deployed on Cloud Run for computer use.
Expand Down Expand Up @@ -157,6 +166,31 @@ The `main.py` script is the command-line interface (CLI) for running the browser
| API_SERVER_KEY | The API key for your deployed Cloud Run API server, if it's configured to require one. Can also be provided via the `--api_server_key` argument. | Conditionally (if API server requires it and not passed via CLI) |
| BROWSERBASE_API_KEY | Your API key for Browserbase. | Yes (when using the browserbase environment) |
| BROWSERBASE_PROJECT_ID | Your Project ID for Browserbase. | Yes (when using the browserbase environment) |
| HUD_API_KEY | Your API key for hud. Required for running evaluations with `hud_eval.py`. | Yes (when using the hud environment or running `hud_eval.py`) |

## Evaluations

The `hud_eval.py` script allows you to run automated evaluations against hud tasksets:

```bash
python hud_eval.py --taskset <taskset_id> [--parallel] [--max_concurrent <n>] [--model <model_name>] [--api_key <key>]
```

**Arguments:**
- `--taskset`: The hud taskset ID to evaluate (e.g., 'OSWorld-Verified')
- `--parallel`: Run tasks in parallel (default: serial execution)
- `--max_concurrent`: Maximum concurrent tasks when running in parallel (default: 3)
- `--model`: Model name (default: 'gemini-2.0-flash-exp')
- `--api_key`: Gemini API key (uses GEMINI_API_KEY env var if not provided)

**Example:**
```bash
# Run a taskset serially
python hud_eval.py --taskset SheetBench-V2

# Run in parallel with 50 concurrent tasks (can support up to 400)
python hud_eval.py --taskset OSWorld-Verified --parallel --max_concurrent 50
```

## Computers

Expand Down
34 changes: 26 additions & 8 deletions agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@

console = Console()


# Built-in Computer Use tools will return "EnvState".
# Custom provided functions will return "dict".
FunctionResponseT = Union[EnvState, dict]
Expand All @@ -44,10 +43,12 @@ def multiply_numbers(x: float, y: float) -> dict:


class BrowserAgent:
def __init__(self, browser_computer: Computer, query: str, model_name: str):
def __init__(self, browser_computer: Computer, query: str, model_name: str, verbose: bool = True):
self._browser_computer = browser_computer
self._query = query
self._model_name = model_name
self._verbose = verbose
self.final_reasoning = None
self._client = genai.Client(
api_key=os.environ.get("GEMINI_API_KEY"),
vertexai=os.environ.get("USE_VERTEXAI", "0").lower() in ["true", "1"],
Expand Down Expand Up @@ -208,6 +209,8 @@ def get_model_response(

def get_text(self, candidate: Candidate) -> Optional[str]:
"""Extracts the text from the candidate."""
if not candidate.content or not candidate.content.parts:
return None
text = []
for part in candidate.content.parts:
if part.text:
Expand All @@ -216,6 +219,8 @@ def get_text(self, candidate: Candidate) -> Optional[str]:

def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCall]:
"""Extracts the function call from the candidate."""
if not candidate.content or not candidate.content.parts:
return []
ret = []
for part in candidate.content.parts:
if part.function_call:
Expand All @@ -224,7 +229,13 @@ def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCal

def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
# Generate a response from the model.
with console.status("Generating response from Gemini...", spinner_style=None):
if self._verbose:
with console.status("Generating response from Gemini...", spinner_style=None):
try:
response = self.get_model_response()
except Exception as e:
return "COMPLETE"
else:
try:
response = self.get_model_response()
except Exception as e:
Expand All @@ -234,16 +245,18 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
print("Response has no candidates!")
print(response)
raise ValueError("Empty response")

# Extract the text and function call from the response.
candidate = response.candidates[0]
# Append the model turn to conversation history.
self._contents.append(candidate.content)
if candidate.content:
self._contents.append(candidate.content)

reasoning = self.get_text(candidate)
function_calls = self.extract_function_calls(candidate)
if not function_calls:
print(f"Agent Loop Complete: {reasoning}")
self.final_reasoning = reasoning
return "COMPLETE"

function_call_strs = []
Expand All @@ -260,8 +273,9 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
table.add_column("Gemini Reasoning", header_style="magenta", ratio=1)
table.add_column("Function Call(s)", header_style="cyan", ratio=1)
table.add_row(reasoning, "\n".join(function_call_strs))
console.print(table)
print()
if self._verbose:
console.print(table)
print()

function_responses = []
for function_call in function_calls:
Expand All @@ -272,7 +286,10 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
if decision == "TERMINATE":
print("Terminating agent loop")
return "COMPLETE"
with console.status("Sending command to Computer...", spinner_style=None):
if self._verbose:
with console.status("Sending command to Computer...", spinner_style=None):
fc_result = self.handle_action(function_call)
else:
fc_result = self.handle_action(function_call)
if isinstance(fc_result, EnvState):
function_responses.append(
Expand All @@ -299,6 +316,7 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
parts=[Part(function_response=fr) for fr in function_responses],
)
)

return "CONTINUE"

def _get_safety_confirmation(
Expand Down
35 changes: 25 additions & 10 deletions computers/hud/hud.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import termcolor

from ..computer import Computer, EnvState

from hud.job import Job
from hud.task import Task

class HudComputer(Computer):
"""HUD SDK Computer implementation that uses HUD environments for browser control."""
Expand All @@ -15,11 +16,15 @@ def __init__(
initial_url: str = "https://www.google.com",
search_engine_url: str = "https://www.google.com",
task_prompt: Optional[str] = None,
task: Optional[Task] = None, # Optional Task object from HUD SDK
job: Optional[Job] = None,
):
self._screen_size = screen_size
self._initial_url = initial_url
self._search_engine_url = search_engine_url
self._task_prompt = task_prompt or "Browse the web"
self._task = task
self._job = job
self._env = None
self._obs = None
self._done = False
Expand All @@ -40,16 +45,20 @@ def __enter__(self):
except ImportError:
raise ImportError("HUD SDK not installed. Please install with: pip install hud-python")

# Create a task for the browser environment
task = Task(
prompt=self._task_prompt,
gym="hud-browser",
setup=("goto", self._initial_url),
evaluate=("page_contains", "dummy")
)
# Use provided task or create a default one
if self._task:
task = self._task
else:
# Create a default task for the browser environment
task = Task(
prompt=self._task_prompt,
gym="hud-browser",
setup=("goto", self._initial_url),
evaluate=("page_contains", "dummy")
)

# Create the environment
self._env = self._loop.run_until_complete(gym.make(task))
self._env = self._loop.run_until_complete(gym.make(task, job=self._job))

# Reset the environment to get initial observation
self._obs, _ = self._loop.run_until_complete(self._env.reset())
Expand Down Expand Up @@ -242,6 +251,7 @@ def scroll_at(
direction: Literal["up", "down", "left", "right"],
magnitude: int,
) -> EnvState:
self._execute_action("move", x=x, y=y)
return self._execute_action(
"scroll", x=x, y=y, direction=direction, magnitude=magnitude
)
Expand Down Expand Up @@ -280,4 +290,9 @@ def drag_and_drop(
def current_state(self) -> EnvState:
screenshot = self._get_screenshot_from_obs()
url = self._get_url_from_obs()
return EnvState(screenshot=screenshot, url=url)
return EnvState(screenshot=screenshot, url=url)

# Run the environment's own evaluation and return its result.
# Blocks on `self._loop` until the `self._env.evaluate()` awaitable has
# finished, so callers get a plain synchronous return value.
# NOTE(review): `self._env` starts out as None in __init__ and is only
# assigned in __enter__, so this presumably must be called inside the
# `with` block — confirm against callers.
# NOTE(review): the `dict` return annotation is not verifiable from this
# file; confirm the shape of what the environment's evaluate() yields.
def evaluate(self) -> dict:
return self._loop.run_until_complete(
self._env.evaluate()
)
Loading