google-gemini · emilypitler · Sep 30, 2025 · Sep 29, 2025 · Sep 30, 2025
diff --git a/README.md b/README.md
@@ -32,8 +32,9 @@ playwright install chrome
 ```
 
 ### 2. Configuration
+You can get started using either the Gemini Developer API or Vertex AI.
 
-#### Set Gemini API Key (for Gemini Developer API only)
+#### A. If using the Gemini Developer API:
 
 You need a Gemini API key to use the agent:
 
@@ -53,7 +54,7 @@ source .venv/bin/activate
 
 Replace `YOUR_GEMINI_API_KEY` with your actual key.
 
-#### Setup Vertex AI Client (for Vertex AI API only)
+#### B. If using the Vertex AI Client:
 
 You need to explicitly use Vertex AI, then provide project and location to use the agent:
 
@@ -84,11 +85,13 @@ The primary way to use the tool is via the `main.py` script.
 **General Command Structure:**
 
 ```bash
-python main.py --query "Go to Google and type 'Hello World' into the search bar" --env <environment> [options]
+python main.py --query "Go to Google and type 'Hello World' into the search bar"
 ```
 
 **Available Environments:**
 
+You can specify a particular environment with the `--env <environment>` flag. Available options:
+
 - `playwright`: Runs the browser locally using Playwright.
 - `browserbase`: Connects to a Browserbase instance.
 - `hud`: Integrates with hud's browser environment.
@@ -167,4 +170,4 @@ python hud_eval.py --taskset SheetBench-V2
 
 # Run in parallel with 50 concurrent tasks (can support up to 400)
 python hud_eval.py --taskset OSWorld-Verified --parallel --max_concurrent 50
-```
+```
diff --git a/computers/hud/hud.py b/computers/hud/hud.py
@@ -143,6 +143,8 @@ def _create_cla_action(self, action_type: str, **kwargs) -> Dict[str, Any]:
                 enter_after=kwargs.get("enter_after", False)
             )
         elif action_type == "scroll":
+            x = kwargs.get("x", 0)
+            y = kwargs.get("y", 0)
             # Map direction to scroll amounts
             direction = kwargs.get("direction", "down")
             dx, dy = 0, 0
@@ -157,7 +159,7 @@ def _create_cla_action(self, action_type: str, **kwargs) -> Dict[str, Any]:
                 dx = -magnitude
 
             action = ScrollAction(
-                scroll=Point(x=dx, y=dy)
+                scroll=Point(x=x+dx, y=y+dy)
             )
             if "x" in kwargs and "y" in kwargs:
                 action.point = Point(x=kwargs["x"], y=kwargs["y"])
@@ -255,7 +257,14 @@ def type_text_at(
     def scroll_document(
         self, direction: Literal["up", "down", "left", "right"]
     ) -> EnvState:
-        return self._execute_action("scroll", direction=direction)
+        if direction == "down":
+            return self.key_combination(["PageDown"])
+        elif direction == "up":
+            return self.key_combination(["PageUp"])
+        elif direction in ("left", "right"):
+            return self._horizontal_document_scroll(direction)
+        else:
+            raise ValueError(f"Unsupported direction: {direction}")
 
     def scroll_at(
         self,