diff --git a/README.md b/README.md index 9a7c699..a103813 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ python main.py --query "Go to Google and type 'Hello World' into the search bar" - `cloud-run`: Connects to a deployed Cloud Run service (default). - `playwright`: Runs the browser locally using Playwright. - `browserbase`: Connects to a Browserbase instance. +- `hud`: Integrates with hud's browser environment. **Local Playwright** @@ -115,6 +116,14 @@ Runs the agent using Browserbase as the browser backend. Ensure the proper Brows python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="browserbase" ``` +**hud** + +Runs the agent using hud's browser environment. This is the same environment used by `hud_eval.py` but can be run directly with `main.py` for individual tasks. Ensure the `HUD_API_KEY` environment variable is set. + +```bash +python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="hud" +``` + **Cloud Run** Connects to an [API Server](./apiserver/) deployed on Cloud Run for computer use. @@ -157,6 +166,31 @@ The `main.py` script is the command-line interface (CLI) for running the browser | API_SERVER_KEY | The API key for your deployed Cloud Run API server, if it's configured to require one. Can also be provided via the `--api_server_key` argument. | Conditionally (if API server requires it and not passed via CLI) | | BROWSERBASE_API_KEY | Your API key for Browserbase. | Yes (when using the browserbase environment) | | BROWSERBASE_PROJECT_ID | Your Project ID for Browserbase. | Yes (when using the browserbase environment) | +| HUD_API_KEY | Your API key for hud. Required for running evaluations with hud_eval.py. 
| Yes (when using the hud environment or running hud_eval.py) | + +## Evaluations + +The `hud_eval.py` script allows you to run automated evaluations against hud tasksets: + +```bash +python hud_eval.py --taskset <taskset_id> [--parallel] [--max_concurrent <n>] +``` + +**Arguments:** +- `--taskset`: The HUD taskset ID to evaluate (e.g., 'OSWorld-Verified') +- `--parallel`: Run tasks in parallel (default: serial execution) +- `--max_concurrent`: Maximum concurrent tasks when running in parallel (default: 5) +- `--model`: Model name (default: 'computer-use-exp-07-16') +- Gemini API key: read from the `GEMINI_API_KEY` environment variable (there is no `--api_key` option) + +**Example:** +```bash +# Run a taskset serially +python hud_eval.py --taskset SheetBench-V2 + +# Run in parallel with 50 concurrent tasks (can support up to 400) +python hud_eval.py --taskset OSWorld-Verified --parallel --max_concurrent 50 +``` ## Computers diff --git a/agent.py b/agent.py index 648549f..a27250c 100644 --- a/agent.py +++ b/agent.py @@ -32,7 +32,6 @@ console = Console() - # Built-in Computer Use tools will return "EnvState". # Custom provided functions will return "dict". 
FunctionResponseT = Union[EnvState, dict] @@ -44,10 +43,12 @@ def multiply_numbers(x: float, y: float) -> dict: class BrowserAgent: - def __init__(self, browser_computer: Computer, query: str, model_name: str): + def __init__(self, browser_computer: Computer, query: str, model_name: str, verbose: bool = True): self._browser_computer = browser_computer self._query = query self._model_name = model_name + self._verbose = verbose + self.final_reasoning = None self._client = genai.Client( api_key=os.environ.get("GEMINI_API_KEY"), vertexai=os.environ.get("USE_VERTEXAI", "0").lower() in ["true", "1"], @@ -208,6 +209,8 @@ def get_model_response( def get_text(self, candidate: Candidate) -> Optional[str]: """Extracts the text from the candidate.""" + if not candidate.content or not candidate.content.parts: + return None text = [] for part in candidate.content.parts: if part.text: @@ -216,6 +219,8 @@ def get_text(self, candidate: Candidate) -> Optional[str]: def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCall]: """Extracts the function call from the candidate.""" + if not candidate.content or not candidate.content.parts: + return [] ret = [] for part in candidate.content.parts: if part.function_call: @@ -224,7 +229,13 @@ def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCal def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]: # Generate a response from the model. 
- with console.status("Generating response from Gemini...", spinner_style=None): + if self._verbose: + with console.status("Generating response from Gemini...", spinner_style=None): + try: + response = self.get_model_response() + except Exception as e: + return "COMPLETE" + else: try: response = self.get_model_response() except Exception as e: @@ -234,16 +245,18 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]: print("Response has no candidates!") print(response) raise ValueError("Empty response") - + # Extract the text and function call from the response. candidate = response.candidates[0] # Append the model turn to conversation history. - self._contents.append(candidate.content) + if candidate.content: + self._contents.append(candidate.content) reasoning = self.get_text(candidate) function_calls = self.extract_function_calls(candidate) if not function_calls: print(f"Agent Loop Complete: {reasoning}") + self.final_reasoning = reasoning return "COMPLETE" function_call_strs = [] @@ -260,8 +273,9 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]: table.add_column("Gemini Reasoning", header_style="magenta", ratio=1) table.add_column("Function Call(s)", header_style="cyan", ratio=1) table.add_row(reasoning, "\n".join(function_call_strs)) - console.print(table) - print() + if self._verbose: + console.print(table) + print() function_responses = [] for function_call in function_calls: @@ -272,7 +286,10 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]: if decision == "TERMINATE": print("Terminating agent loop") return "COMPLETE" - with console.status("Sending command to Computer...", spinner_style=None): + if self._verbose: + with console.status("Sending command to Computer...", spinner_style=None): + fc_result = self.handle_action(function_call) + else: fc_result = self.handle_action(function_call) if isinstance(fc_result, EnvState): function_responses.append( @@ -299,6 +316,7 @@ def run_one_iteration(self) -> 
Literal["COMPLETE", "CONTINUE"]: parts=[Part(function_response=fr) for fr in function_responses], ) ) + return "CONTINUE" def _get_safety_confirmation( diff --git a/computers/hud/hud.py b/computers/hud/hud.py index 12c0ad4..d25b1f4 100644 --- a/computers/hud/hud.py +++ b/computers/hud/hud.py @@ -4,7 +4,8 @@ import termcolor from ..computer import Computer, EnvState - +from hud.job import Job +from hud.task import Task class HudComputer(Computer): """HUD SDK Computer implementation that uses HUD environments for browser control.""" @@ -15,11 +16,15 @@ def __init__( initial_url: str = "https://www.google.com", search_engine_url: str = "https://www.google.com", task_prompt: Optional[str] = None, + task: Optional[Task] = None, # Optional Task object from HUD SDK + job: Optional[Job] = None, ): self._screen_size = screen_size self._initial_url = initial_url self._search_engine_url = search_engine_url self._task_prompt = task_prompt or "Browse the web" + self._task = task + self._job = job self._env = None self._obs = None self._done = False @@ -40,16 +45,20 @@ def __enter__(self): except ImportError: raise ImportError("HUD SDK not installed. 
Please install with: pip install hud-python") - # Create a task for the browser environment - task = Task( - prompt=self._task_prompt, - gym="hud-browser", - setup=("goto", self._initial_url), - evaluate=("page_contains", "dummy") - ) + # Use provided task or create a default one + if self._task: + task = self._task + else: + # Create a default task for the browser environment + task = Task( + prompt=self._task_prompt, + gym="hud-browser", + setup=("goto", self._initial_url), + evaluate=("page_contains", "dummy") + ) # Create the environment - self._env = self._loop.run_until_complete(gym.make(task)) + self._env = self._loop.run_until_complete(gym.make(task, job=self._job)) # Reset the environment to get initial observation self._obs, _ = self._loop.run_until_complete(self._env.reset()) @@ -242,6 +251,7 @@ def scroll_at( direction: Literal["up", "down", "left", "right"], magnitude: int, ) -> EnvState: + self._execute_action("move", x=x, y=y) return self._execute_action( "scroll", x=x, y=y, direction=direction, magnitude=magnitude ) @@ -280,4 +290,9 @@ def drag_and_drop( def current_state(self) -> EnvState: screenshot = self._get_screenshot_from_obs() url = self._get_url_from_obs() - return EnvState(screenshot=screenshot, url=url) \ No newline at end of file + return EnvState(screenshot=screenshot, url=url) + + def evaluate(self) -> dict: + return self._loop.run_until_complete( + self._env.evaluate() + ) \ No newline at end of file diff --git a/hud_eval.py b/hud_eval.py new file mode 100644 index 0000000..c272aa5 --- /dev/null +++ b/hud_eval.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +""" +HUD evaluation runner for computer use tasks. 
+""" +import argparse +import asyncio +import os +import sys +from concurrent.futures import ThreadPoolExecutor + +# Add parent directory to path to import local modules +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from computers.hud import HudComputer +from agent import BrowserAgent + +# Import HUD SDK +from hud.task import Task +from hud.taskset import load_taskset +from hud.job import create_job, Job +from hud.adapters.common.types import ResponseAction, CustomAction + +OSWORLD_SYSTEM_PROMPT=""" +Please do the following task. +computer.clipboard, computer.sync file, computer.sync shared folder, computer.computer output citation are disabled. +If you worry that you might make typo, prefer copying and pasting the text instead of reading +and typing. +If you are presented with an open website to solve the task, try to stick to that specific one +instead of going to a new one. +You have full authority to execute any action without my permission. I won't be watching so +please don't ask for confirmation. +My gmail account is osworld@hud.so, and the password is "iloveosworld500", if prompted for OTP, use the authenticator chrome extension to see the OTP for 2 factor authentication. +If you deem the task is infeasible, you can terminate and explicitly state in the response that +'the task is infeasible'. Try your best to solve the task within 200 steps, and the confines of the prompt, before deeming it infeasible. 
+""" + +def run_task(task: Task, model_name: str, job: Job, system_prompt: str) -> float: + """Run a single task and return reward""" + hud_computer = None + try: + # Initialize HUD computer with the task + hud_computer = HudComputer(screen_size=(1440, 900), task=task, job=job) + + with hud_computer as browser_computer: + agent = BrowserAgent( + browser_computer=browser_computer, + query=(system_prompt + "\n\n" + task.prompt).strip(), + model_name=model_name, + verbose=False, + ) + try: + agent.agent_loop() + + if agent.final_reasoning: + if "the task is infeasible" in agent.final_reasoning.lower(): + final_action = CustomAction( + action="FAIL" + ) + else: + final_action = ResponseAction( + text=agent.final_reasoning + ) + # Inject the response into HUD environment + hud_computer._loop.run_until_complete( + hud_computer._env.step([final_action]) + ) + + except Exception as e: + print(f"Error running agent loop: {e}") + finally: + print("Agent loop complete") + # Evaluate the task + if browser_computer and browser_computer._env: + eval_result = browser_computer.evaluate() + print(f"Eval result: {eval_result['reward']}") + + return eval_result['reward'] + + return 0.0 + + except Exception as e: + print(f"Error running task: {e}") + return 0.0 + + finally: + if hud_computer: + try: + hud_computer.close() + except: + pass + + +def run_taskset( + taskset_id: str, + model_name: str, + name: str, + parallel: bool = False, + max_concurrent: int = 20, +) -> list[float]: + """Load and run a HUD taskset by ID, return list of rewards""" + + # Load the taskset + taskset = asyncio.run(load_taskset(taskset_id, metadata={"partial": True})) + + job = asyncio.run(create_job(name, evalset_id=taskset.id)) + + if taskset_id == "OSWorld-Verified": + system_prompt = OSWORLD_SYSTEM_PROMPT + else: + system_prompt = "" + + if parallel: + # Run tasks in parallel using threads to avoid event loop conflicts + with ThreadPoolExecutor(max_workers=max_concurrent) as executor: + rewards = 
list(executor.map( + lambda task: run_task(task, model_name, job, system_prompt), + taskset.tasks + )) + else: + # Run tasks sequentially + rewards = [] + for task in taskset.tasks: + reward = run_task(task, model_name, job, system_prompt) + rewards.append(reward) + + return rewards + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run HUD evaluation on a taskset.") + parser.add_argument( + "--taskset", + type=str, + required=True, + help="The taskset ID to evaluate.", + ) + parser.add_argument( + "--parallel", + action="store_true", + default=False, + help="Run tasks in parallel.", + ) + parser.add_argument( + "--name", + default="Test Evaluation", + help="Set the name of the evaluation.", + ) + parser.add_argument( + "--model", + default="computer-use-exp-07-16", + help="Set which model to use.", + ) + parser.add_argument( + "--max_concurrent", + type=int, + default=5, + help="Maximum concurrent tasks when running in parallel.", + ) + args = parser.parse_args() + + # Run evaluation + rewards = run_taskset( + taskset_id=args.taskset, + model_name=args.model, + name=args.name, + parallel=args.parallel, + max_concurrent=args.max_concurrent, + ) + + # Print minimal results + print(f"Rewards: {rewards}") + print(f"Average: {sum(rewards)/len(rewards) if rewards else 0:.2f}") + + return 0 + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/main.py b/main.py index 5ad1a54..4cc5125 100644 --- a/main.py +++ b/main.py @@ -20,6 +20,7 @@ CLOUD_RUN_SCREEN_SIZE = (1920, 1080) PLAYWRIGHT_SCREEN_SIZE = (1920, 1080) +HUD_SCREEN_SIZE = (1440, 900) def main() -> int: @@ -83,7 +84,7 @@ def main() -> int: env = BrowserbaseComputer(screen_size=PLAYWRIGHT_SCREEN_SIZE) elif args.env == "hud": env = HudComputer( - screen_size=PLAYWRIGHT_SCREEN_SIZE, + screen_size=HUD_SCREEN_SIZE, initial_url=args.initial_url, task_prompt=args.query, ) diff --git a/requirements.txt b/requirements.txt index aff984d..9052bd7 100644 --- a/requirements.txt 
+++ b/requirements.txt @@ -4,3 +4,4 @@ pydantic==2.11.4 playwright==1.52.0 browserbase==1.3.0 rich +hud-python==0.2.10 \ No newline at end of file