diff --git a/README.md b/README.md
index 9a7c699..a103813 100644
--- a/README.md
+++ b/README.md
@@ -92,6 +92,7 @@ python main.py --query "Go to Google and type 'Hello World' into the search bar"
 - `cloud-run`: Connects to a deployed Cloud Run service (default).
 - `playwright`: Runs the browser locally using Playwright.
 - `browserbase`: Connects to a Browserbase instance.
+- `hud`: Integrates with hud's browser environment.
 
 **Local Playwright**
 
@@ -115,6 +116,14 @@ Runs the agent using Browserbase as the browser backend. Ensure the proper Brows
 python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="browserbase"
 ```
 
+**hud**
+
+Runs the agent using hud's browser environment. This is the same environment used by `hud_eval.py` but can be run directly with `main.py` for individual tasks. Ensure the `HUD_API_KEY` environment variable is set.
+
+```bash
+python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="hud"
+```
+
 **Cloud Run**
 
 Connects to an [API Server](./apiserver/) deployed on Cloud Run for computer use.
@@ -157,6 +166,31 @@ The `main.py` script is the command-line interface (CLI) for running the browser
 | API_SERVER_KEY | The API key for your deployed Cloud Run API server, if it's configured to require one. Can also be provided via the `--api_server_key` argument. | Conditionally (if API server requires it and not passed via CLI) |
 | BROWSERBASE_API_KEY | Your API key for Browserbase. | Yes (when using the browserbase environment) |
 | BROWSERBASE_PROJECT_ID | Your Project ID for Browserbase. | Yes (when using the browserbase environment) |
+| HUD_API_KEY | Your API key for hud. Required for running evaluations with hud_eval.py. | Yes (when using the hud environment or running hud_eval.py) |
+
+## Evaluations
+
+The `hud_eval.py` script allows you to run automated evaluations against hud tasksets:
+
+```bash
+python hud_eval.py --taskset <taskset_id> [--parallel] [--max_concurrent <n>]
+```
+
+**Arguments:**
+- `--taskset`: The HUD taskset ID to evaluate (e.g., 'OSWorld-Verified')
+- `--parallel`: Run tasks in parallel (default: serial execution)
+- `--max_concurrent`: Maximum concurrent tasks when running in parallel (default: 5)
+- `--model`: Model name (default: 'computer-use-exp-07-16')
+- `--name`: Name of the evaluation job (default: 'Test Evaluation'). The Gemini API key is read from the `GEMINI_API_KEY` environment variable.
+
+**Example:**
+```bash
+# Run a taskset serially
+python hud_eval.py --taskset SheetBench-V2
+
+# Run in parallel with 50 concurrent tasks (can support up to 400)
+python hud_eval.py --taskset OSWorld-Verified --parallel --max_concurrent 50
+```
 
 ## Computers
 
diff --git a/agent.py b/agent.py
index 648549f..a27250c 100644
--- a/agent.py
+++ b/agent.py
@@ -32,7 +32,6 @@
 
 console = Console()
 
-
 # Built-in Computer Use tools will return "EnvState".
 # Custom provided functions will return "dict".
 FunctionResponseT = Union[EnvState, dict]
@@ -44,10 +43,12 @@ def multiply_numbers(x: float, y: float) -> dict:
 
 
 class BrowserAgent:
-    def __init__(self, browser_computer: Computer, query: str, model_name: str):
+    def __init__(self, browser_computer: Computer, query: str, model_name: str, verbose: bool = True):
         self._browser_computer = browser_computer
         self._query = query
         self._model_name = model_name
+        self._verbose = verbose
+        self.final_reasoning = None
         self._client = genai.Client(
             api_key=os.environ.get("GEMINI_API_KEY"),
             vertexai=os.environ.get("USE_VERTEXAI", "0").lower() in ["true", "1"],
@@ -208,6 +209,8 @@ def get_model_response(
 
     def get_text(self, candidate: Candidate) -> Optional[str]:
         """Extracts the text from the candidate."""
+        if not candidate.content or not candidate.content.parts:
+            return None
         text = []
         for part in candidate.content.parts:
             if part.text:
@@ -216,6 +219,8 @@ def get_text(self, candidate: Candidate) -> Optional[str]:
 
     def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCall]:
         """Extracts the function call from the candidate."""
+        if not candidate.content or not candidate.content.parts:
+            return []
         ret = []
         for part in candidate.content.parts:
             if part.function_call:
@@ -224,7 +229,13 @@ def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCal
 
     def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
         # Generate a response from the model.
-        with console.status("Generating response from Gemini...", spinner_style=None):
+        if self._verbose:
+            with console.status("Generating response from Gemini...", spinner_style=None):
+                try:
+                    response = self.get_model_response()
+                except Exception as e:
+                    return "COMPLETE"
+        else:
             try:
                 response = self.get_model_response()
             except Exception as e:
@@ -234,16 +245,18 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
             print("Response has no candidates!")
             print(response)
             raise ValueError("Empty response")
-
+        
         # Extract the text and function call from the response.
         candidate = response.candidates[0]
         # Append the model turn to conversation history.
-        self._contents.append(candidate.content)
+        if candidate.content:
+            self._contents.append(candidate.content)
 
         reasoning = self.get_text(candidate)
         function_calls = self.extract_function_calls(candidate)
         if not function_calls:
             print(f"Agent Loop Complete: {reasoning}")
+            self.final_reasoning = reasoning
             return "COMPLETE"
 
         function_call_strs = []
@@ -260,8 +273,9 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
         table.add_column("Gemini Reasoning", header_style="magenta", ratio=1)
         table.add_column("Function Call(s)", header_style="cyan", ratio=1)
         table.add_row(reasoning, "\n".join(function_call_strs))
-        console.print(table)
-        print()
+        if self._verbose:
+            console.print(table)
+            print()
 
         function_responses = []
         for function_call in function_calls:
@@ -272,7 +286,10 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
                 if decision == "TERMINATE":
                     print("Terminating agent loop")
                     return "COMPLETE"
-            with console.status("Sending command to Computer...", spinner_style=None):
+            if self._verbose:
+                with console.status("Sending command to Computer...", spinner_style=None):
+                    fc_result = self.handle_action(function_call)
+            else:
                 fc_result = self.handle_action(function_call)
             if isinstance(fc_result, EnvState):
                 function_responses.append(
@@ -299,6 +316,7 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
                 parts=[Part(function_response=fr) for fr in function_responses],
             )
         )
+        
         return "CONTINUE"
 
     def _get_safety_confirmation(
diff --git a/computers/hud/hud.py b/computers/hud/hud.py
index 12c0ad4..d25b1f4 100644
--- a/computers/hud/hud.py
+++ b/computers/hud/hud.py
@@ -4,7 +4,8 @@
 import termcolor
 
 from ..computer import Computer, EnvState
-
+from hud.job import Job
+from hud.task import Task
 
 class HudComputer(Computer):
     """HUD SDK Computer implementation that uses HUD environments for browser control."""
@@ -15,11 +16,15 @@ def __init__(
         initial_url: str = "https://www.google.com",
         search_engine_url: str = "https://www.google.com",
         task_prompt: Optional[str] = None,
+        task: Optional[Task] = None,  # Optional Task object from HUD SDK
+        job: Optional[Job] = None,
     ):
         self._screen_size = screen_size
         self._initial_url = initial_url
         self._search_engine_url = search_engine_url
         self._task_prompt = task_prompt or "Browse the web"
+        self._task = task
+        self._job = job
         self._env = None
         self._obs = None
         self._done = False
@@ -40,16 +45,20 @@ def __enter__(self):
         except ImportError:
             raise ImportError("HUD SDK not installed. Please install with: pip install hud-python")
         
-        # Create a task for the browser environment
-        task = Task(
-            prompt=self._task_prompt,
-            gym="hud-browser",
-            setup=("goto", self._initial_url),
-            evaluate=("page_contains", "dummy")
-        )
+        # Use provided task or create a default one
+        if self._task:
+            task = self._task
+        else:
+            # Create a default task for the browser environment
+            task = Task(
+                prompt=self._task_prompt,
+                gym="hud-browser",
+                setup=("goto", self._initial_url),
+                evaluate=("page_contains", "dummy")
+            )
         
         # Create the environment
-        self._env = self._loop.run_until_complete(gym.make(task))
+        self._env = self._loop.run_until_complete(gym.make(task, job=self._job))
         
         # Reset the environment to get initial observation
         self._obs, _ = self._loop.run_until_complete(self._env.reset())
@@ -242,6 +251,7 @@ def scroll_at(
         direction: Literal["up", "down", "left", "right"],
         magnitude: int,
     ) -> EnvState:
+        self._execute_action("move", x=x, y=y)
         return self._execute_action(
             "scroll", x=x, y=y, direction=direction, magnitude=magnitude
         )
@@ -280,4 +290,9 @@ def drag_and_drop(
     def current_state(self) -> EnvState:
         screenshot = self._get_screenshot_from_obs()
         url = self._get_url_from_obs()
-        return EnvState(screenshot=screenshot, url=url) 
\ No newline at end of file
+        return EnvState(screenshot=screenshot, url=url) 
+
+    def evaluate(self) -> dict:
+        """Drive `self._env.evaluate()` to completion on `self._loop` and return its result."""
+        pending_evaluation = self._env.evaluate()
+        return self._loop.run_until_complete(pending_evaluation)
\ No newline at end of file
diff --git a/hud_eval.py b/hud_eval.py
new file mode 100644
index 0000000..c272aa5
--- /dev/null
+++ b/hud_eval.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+"""
+HUD evaluation runner for computer use tasks.
+"""
+import argparse
+import asyncio
+import os
+import sys
+from concurrent.futures import ThreadPoolExecutor
+
+# Add parent directory to path to import local modules
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from computers.hud import HudComputer
+from agent import BrowserAgent
+
+# Import HUD SDK
+from hud.task import Task
+from hud.taskset import load_taskset
+from hud.job import create_job, Job
+from hud.adapters.common.types import ResponseAction, CustomAction
+
+OSWORLD_SYSTEM_PROMPT="""
+Please do the following task.
+computer.clipboard, computer.sync file, computer.sync shared folder, computer.computer output citation are disabled.
+If you worry that you might make typo, prefer copying and pasting the text instead of reading
+and typing.
+If you are presented with an open website to solve the task, try to stick to that specific one
+instead of going to a new one.
+You have full authority to execute any action without my permission. I won't be watching so
+please don't ask for confirmation.
+My gmail account is osworld@hud.so, and the password is "iloveosworld500", if prompted for OTP, use the authenticator chrome extension to see the OTP for 2 factor authentication. 
+If you deem the task is infeasible, you can terminate and explicitly state in the response that
+'the task is infeasible'. Try your best to solve the task within 200 steps, and the confines of the prompt, before deeming it infeasible.
+"""
+
+def run_task(task: Task, model_name: str, job: Job, system_prompt: str) -> float:
+    """Run one HUD task with a BrowserAgent and return its evaluation reward.
+
+    Args:
+        task: HUD task to execute; its prompt is appended to system_prompt.
+        model_name: Model name handed to BrowserAgent.
+        job: HUD job passed through to HudComputer.
+        system_prompt: Text prepended to the task prompt (may be empty).
+
+    Returns:
+        The 'reward' entry of the evaluation result, or 0.0 when the
+        environment is unavailable or any Exception escapes the run.
+    """
+    hud_computer = None
+    try:
+        # Initialize HUD computer with the task
+        hud_computer = HudComputer(screen_size=(1440, 900), task=task, job=job)
+        with hud_computer as browser_computer:
+            agent = BrowserAgent(
+                browser_computer=browser_computer,
+                query=(system_prompt + "\n\n" + task.prompt).strip(),
+                model_name=model_name,
+                verbose=False,
+            )
+            try:
+                agent.agent_loop()
+                if agent.final_reasoning:
+                    if "the task is infeasible" in agent.final_reasoning.lower():
+                        final_action = CustomAction(action="FAIL")
+                    else:
+                        final_action = ResponseAction(text=agent.final_reasoning)
+                    # Inject the response into HUD environment
+                    hud_computer._loop.run_until_complete(
+                        hud_computer._env.step([final_action])
+                    )
+            except Exception as e:
+                print(f"Error running agent loop: {e}")
+            print("Agent loop complete")
+            # Evaluate outside any `finally`: a `return` there would swallow
+            # KeyboardInterrupt/SystemExit raised while the agent was running.
+            if browser_computer and browser_computer._env:
+                eval_result = browser_computer.evaluate()
+                print(f"Eval result: {eval_result['reward']}")
+                return eval_result['reward']
+        return 0.0
+    except Exception as e:
+        print(f"Error running task: {e}")
+        return 0.0
+    finally:
+        if hud_computer:
+            try:
+                hud_computer.close()
+            except Exception:
+                pass  # best-effort cleanup; the `with` block has already exited
+
+
+def run_taskset(
+    taskset_id: str,
+    model_name: str,
+    name: str,
+    parallel: bool = False,
+    max_concurrent: int = 20,
+) -> list[float]:
+    """Fetch the HUD taskset `taskset_id`, run each of its tasks, and collect the rewards.
+
+    A job called `name` is created for the taskset and shared by every task.
+    Only the "OSWorld-Verified" taskset gets OSWORLD_SYSTEM_PROMPT; every
+    other taskset runs with an empty system prompt.
+
+    With `parallel` set, tasks are mapped over a thread pool of
+    `max_concurrent` workers; otherwise they run one after another.
+
+    Returns:
+        One reward per task, in taskset order.
+    """
+    taskset = asyncio.run(load_taskset(taskset_id, metadata={"partial": True}))
+    job = asyncio.run(create_job(name, evalset_id=taskset.id))
+    system_prompt = OSWORLD_SYSTEM_PROMPT if taskset_id == "OSWorld-Verified" else ""
+
+    def reward_for(task):
+        return run_task(task, model_name, job, system_prompt)
+
+    if not parallel:
+        # Serial path: one task at a time, in taskset order.
+        return [reward_for(task) for task in taskset.tasks]
+
+    # Threads are used here to avoid event loop conflicts between tasks.
+    with ThreadPoolExecutor(max_workers=max_concurrent) as pool:
+        rewards = list(pool.map(reward_for, taskset.tasks))
+    return rewards
+
+
+def main() -> int:
+    """Parse the CLI options, evaluate the chosen taskset, and print the rewards.
+
+    Returns:
+        Always 0.
+    """
+    # Flag name -> add_argument() keyword arguments, registered in this order.
+    option_specs = {
+        "--taskset": dict(
+            type=str,
+            required=True,
+            help="The taskset ID to evaluate.",
+        ),
+        "--parallel": dict(
+            action="store_true",
+            default=False,
+            help="Run tasks in parallel.",
+        ),
+        "--name": dict(default="Test Evaluation", help="Set the name of the evaluation."),
+        "--model": dict(
+            default="computer-use-exp-07-16",
+            help="Set which model to use.",
+        ),
+        "--max_concurrent": dict(
+            type=int,
+            default=5,
+            help="Maximum concurrent tasks when running in parallel.",
+        ),
+    }
+    cli = argparse.ArgumentParser(description="Run HUD evaluation on a taskset.")
+    for flag, options in option_specs.items():
+        cli.add_argument(flag, **options)
+    parsed = cli.parse_args()
+
+    rewards = run_taskset(
+        taskset_id=parsed.taskset,
+        model_name=parsed.model,
+        name=parsed.name,
+        parallel=parsed.parallel,
+        max_concurrent=parsed.max_concurrent,
+    )
+    # Print minimal results
+    print(f"Rewards: {rewards}")
+    average = sum(rewards) / len(rewards) if rewards else 0
+    print(f"Average: {average:.2f}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # propagate main()'s int return value as the process exit status
\ No newline at end of file
diff --git a/main.py b/main.py
index 5ad1a54..4cc5125 100644
--- a/main.py
+++ b/main.py
@@ -20,6 +20,7 @@
 
 CLOUD_RUN_SCREEN_SIZE = (1920, 1080)
 PLAYWRIGHT_SCREEN_SIZE = (1920, 1080)
+HUD_SCREEN_SIZE = (1440, 900)
 
 
 def main() -> int:
@@ -83,7 +84,7 @@ def main() -> int:
         env = BrowserbaseComputer(screen_size=PLAYWRIGHT_SCREEN_SIZE)
     elif args.env == "hud":
         env = HudComputer(
-            screen_size=PLAYWRIGHT_SCREEN_SIZE,
+            screen_size=HUD_SCREEN_SIZE,
             initial_url=args.initial_url,
             task_prompt=args.query,
         )
diff --git a/requirements.txt b/requirements.txt
index aff984d..9052bd7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ pydantic==2.11.4
 playwright==1.52.0
 browserbase==1.3.0
 rich
+hud-python==0.2.10
\ No newline at end of file