Skip to content

Commit 1083d1c

Browse files
authored
Merge pull request #32 from lorenss-m/runner-adjustments
hud runner adjustments for full evalsets
2 parents ef51cfa + a72b158 commit 1083d1c

File tree

6 files changed

+266
-19
lines changed

6 files changed

+266
-19
lines changed

README.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ python main.py --query "Go to Google and type 'Hello World' into the search bar"
9292
- `cloud-run`: Connects to a deployed Cloud Run service (default).
9393
- `playwright`: Runs the browser locally using Playwright.
9494
- `browserbase`: Connects to a Browserbase instance.
95+
- `hud`: Integrates with hud's browser environment.
9596

9697
**Local Playwright**
9798

@@ -115,6 +116,14 @@ Runs the agent using Browserbase as the browser backend. Ensure the proper Brows
115116
python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="browserbase"
116117
```
117118

119+
**hud**
120+
121+
Runs the agent using hud's browser environment. This is the same environment used by `hud_eval.py` but can be run directly with `main.py` for individual tasks. Ensure the `HUD_API_KEY` environment variable is set.
122+
123+
```bash
124+
python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="hud"
125+
```
126+
118127
**Cloud Run**
119128

120129
Connects to an [API Server](./apiserver/) deployed on Cloud Run for computer use.
@@ -157,6 +166,31 @@ The `main.py` script is the command-line interface (CLI) for running the browser
157166
| API_SERVER_KEY | The API key for your deployed Cloud Run API server, if it's configured to require one. Can also be provided via the `--api_server_key` argument. | Conditionally (if API server requires it and not passed via CLI) |
158167
| BROWSERBASE_API_KEY | Your API key for Browserbase. | Yes (when using the browserbase environment) |
159168
| BROWSERBASE_PROJECT_ID | Your Project ID for Browserbase. | Yes (when using the browserbase environment) |
169+
| HUD_API_KEY | Your API key for hud. Required for running evaluations with `hud_eval.py`. | Yes (when using the hud environment or running `hud_eval.py`) |
170+
171+
## Evaluations
172+
173+
The `hud_eval.py` script allows you to run automated evaluations against hud tasksets:
174+
175+
```bash
176+
python hud_eval.py --taskset <taskset_id> [--parallel] [--max_concurrent <n>] [--model <model_name>] [--api_key <key>]
177+
```
178+
179+
**Arguments:**
180+
- `--taskset`: The hud taskset ID to evaluate (e.g., 'OSWorld-Verified')
181+
- `--parallel`: Run tasks in parallel (default: serial execution)
182+
- `--max_concurrent`: Maximum concurrent tasks when running in parallel (default: 3)
183+
- `--model`: Model name (default: 'gemini-2.0-flash-exp')
184+
- `--api_key`: Gemini API key (uses GEMINI_API_KEY env var if not provided)
185+
186+
**Example:**
187+
```bash
188+
# Run a taskset serially
189+
python hud_eval.py --taskset SheetBench-V2
190+
191+
# Run in parallel with 50 concurrent tasks (can support up to 400)
192+
python hud_eval.py --taskset OSWorld-Verified --parallel --max_concurrent 50
193+
```
160194

161195
## Computers
162196

agent.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232

3333
console = Console()
3434

35-
3635
# Built-in Computer Use tools will return "EnvState".
3736
# Custom provided functions will return "dict".
3837
FunctionResponseT = Union[EnvState, dict]
@@ -44,10 +43,12 @@ def multiply_numbers(x: float, y: float) -> dict:
4443

4544

4645
class BrowserAgent:
47-
def __init__(self, browser_computer: Computer, query: str, model_name: str):
46+
def __init__(self, browser_computer: Computer, query: str, model_name: str, verbose: bool = True):
4847
self._browser_computer = browser_computer
4948
self._query = query
5049
self._model_name = model_name
50+
self._verbose = verbose
51+
self.final_reasoning = None
5152
self._client = genai.Client(
5253
api_key=os.environ.get("GEMINI_API_KEY"),
5354
vertexai=os.environ.get("USE_VERTEXAI", "0").lower() in ["true", "1"],
@@ -208,6 +209,8 @@ def get_model_response(
208209

209210
def get_text(self, candidate: Candidate) -> Optional[str]:
210211
"""Extracts the text from the candidate."""
212+
if not candidate.content or not candidate.content.parts:
213+
return None
211214
text = []
212215
for part in candidate.content.parts:
213216
if part.text:
@@ -216,6 +219,8 @@ def get_text(self, candidate: Candidate) -> Optional[str]:
216219

217220
def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCall]:
218221
"""Extracts the function call from the candidate."""
222+
if not candidate.content or not candidate.content.parts:
223+
return []
219224
ret = []
220225
for part in candidate.content.parts:
221226
if part.function_call:
@@ -224,7 +229,13 @@ def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCal
224229

225230
def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
226231
# Generate a response from the model.
227-
with console.status("Generating response from Gemini...", spinner_style=None):
232+
if self._verbose:
233+
with console.status("Generating response from Gemini...", spinner_style=None):
234+
try:
235+
response = self.get_model_response()
236+
except Exception as e:
237+
return "COMPLETE"
238+
else:
228239
try:
229240
response = self.get_model_response()
230241
except Exception as e:
@@ -234,16 +245,18 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
234245
print("Response has no candidates!")
235246
print(response)
236247
raise ValueError("Empty response")
237-
248+
238249
# Extract the text and function call from the response.
239250
candidate = response.candidates[0]
240251
# Append the model turn to conversation history.
241-
self._contents.append(candidate.content)
252+
if candidate.content:
253+
self._contents.append(candidate.content)
242254

243255
reasoning = self.get_text(candidate)
244256
function_calls = self.extract_function_calls(candidate)
245257
if not function_calls:
246258
print(f"Agent Loop Complete: {reasoning}")
259+
self.final_reasoning = reasoning
247260
return "COMPLETE"
248261

249262
function_call_strs = []
@@ -260,8 +273,9 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
260273
table.add_column("Gemini Reasoning", header_style="magenta", ratio=1)
261274
table.add_column("Function Call(s)", header_style="cyan", ratio=1)
262275
table.add_row(reasoning, "\n".join(function_call_strs))
263-
console.print(table)
264-
print()
276+
if self._verbose:
277+
console.print(table)
278+
print()
265279

266280
function_responses = []
267281
for function_call in function_calls:
@@ -272,7 +286,10 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
272286
if decision == "TERMINATE":
273287
print("Terminating agent loop")
274288
return "COMPLETE"
275-
with console.status("Sending command to Computer...", spinner_style=None):
289+
if self._verbose:
290+
with console.status("Sending command to Computer...", spinner_style=None):
291+
fc_result = self.handle_action(function_call)
292+
else:
276293
fc_result = self.handle_action(function_call)
277294
if isinstance(fc_result, EnvState):
278295
function_responses.append(
@@ -299,6 +316,7 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
299316
parts=[Part(function_response=fr) for fr in function_responses],
300317
)
301318
)
319+
302320
return "CONTINUE"
303321

304322
def _get_safety_confirmation(

computers/hud/hud.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import termcolor
55

66
from ..computer import Computer, EnvState
7-
7+
from hud.job import Job
8+
from hud.task import Task
89

910
class HudComputer(Computer):
1011
"""HUD SDK Computer implementation that uses HUD environments for browser control."""
@@ -15,11 +16,15 @@ def __init__(
1516
initial_url: str = "https://www.google.com",
1617
search_engine_url: str = "https://www.google.com",
1718
task_prompt: Optional[str] = None,
19+
task: Optional[Task] = None, # Optional Task object from HUD SDK
20+
job: Optional[Job] = None,
1821
):
1922
self._screen_size = screen_size
2023
self._initial_url = initial_url
2124
self._search_engine_url = search_engine_url
2225
self._task_prompt = task_prompt or "Browse the web"
26+
self._task = task
27+
self._job = job
2328
self._env = None
2429
self._obs = None
2530
self._done = False
@@ -40,16 +45,20 @@ def __enter__(self):
4045
except ImportError:
4146
raise ImportError("HUD SDK not installed. Please install with: pip install hud-python")
4247

43-
# Create a task for the browser environment
44-
task = Task(
45-
prompt=self._task_prompt,
46-
gym="hud-browser",
47-
setup=("goto", self._initial_url),
48-
evaluate=("page_contains", "dummy")
49-
)
48+
# Use provided task or create a default one
49+
if self._task:
50+
task = self._task
51+
else:
52+
# Create a default task for the browser environment
53+
task = Task(
54+
prompt=self._task_prompt,
55+
gym="hud-browser",
56+
setup=("goto", self._initial_url),
57+
evaluate=("page_contains", "dummy")
58+
)
5059

5160
# Create the environment
52-
self._env = self._loop.run_until_complete(gym.make(task))
61+
self._env = self._loop.run_until_complete(gym.make(task, job=self._job))
5362

5463
# Reset the environment to get initial observation
5564
self._obs, _ = self._loop.run_until_complete(self._env.reset())
@@ -242,6 +251,7 @@ def scroll_at(
242251
direction: Literal["up", "down", "left", "right"],
243252
magnitude: int,
244253
) -> EnvState:
254+
self._execute_action("move", x=x, y=y)
245255
return self._execute_action(
246256
"scroll", x=x, y=y, direction=direction, magnitude=magnitude
247257
)
@@ -280,4 +290,9 @@ def drag_and_drop(
280290
def current_state(self) -> EnvState:
281291
screenshot = self._get_screenshot_from_obs()
282292
url = self._get_url_from_obs()
283-
return EnvState(screenshot=screenshot, url=url)
293+
return EnvState(screenshot=screenshot, url=url)
294+
295+
def evaluate(self) -> dict:
296+
return self._loop.run_until_complete(
297+
self._env.evaluate()
298+
)

0 commit comments

Comments
 (0)