Skip to content

Commit dbd9a8a

Browse files
authored
Merge pull request #29 from lorenss-m/hud-integration
init hud computer
2 parents 400d26d + 0cc9646 commit dbd9a8a

File tree

4 files changed

+299
-2
lines changed

4 files changed

+299
-2
lines changed

computers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22
from .browserbase.browserbase import BrowserbaseComputer
33
from .playwright.playwright import PlaywrightComputer
44
from .cloud_run.cloud_run import CloudRunComputer
5+
from .hud.hud import HudComputer
56

67
__all__ = [
78
"Computer",
89
"EnvState",
910
"BrowserbaseComputer",
1011
"PlaywrightComputer",
1112
"CloudRunComputer",
13+
"HudComputer",
1214
]

computers/hud/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .hud import HudComputer
2+
3+
__all__ = ["HudComputer"]

computers/hud/hud.py

Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
import asyncio
2+
import io
3+
import base64
4+
from typing import Literal, Optional, Any, Dict
5+
import termcolor
6+
from PIL import Image
7+
import numpy as np
8+
9+
from ..computer import Computer, EnvState
10+
11+
12+
class HudComputer(Computer):
    """HUD SDK Computer implementation that uses HUD environments for browser control.

    The HUD environment exposes an async API; this class owns a private
    event loop and drives every SDK coroutine to completion on it so that
    callers see the synchronous ``Computer`` interface.

    Use it as a context manager: ``__enter__`` creates and resets the
    environment, ``__exit__`` closes the environment and the event loop.
    """

    # Unit (dx, dy) vector for each supported scroll direction. Unknown
    # directions fall back to (0, 0), i.e. a scroll of zero.
    _SCROLL_UNIT_VECTORS = {
        "down": (0, 1),
        "up": (0, -1),
        "right": (1, 0),
        "left": (-1, 0),
    }

    def __init__(
        self,
        screen_size: tuple[int, int],
        initial_url: str = "https://www.google.com",
        search_engine_url: str = "https://www.google.com",
        task_prompt: Optional[str] = None,
    ):
        """Store the session configuration; no HUD resources are created yet.

        Args:
            screen_size: Value reported back by ``screen_size()``.
            initial_url: URL the environment is asked to open during setup.
                Also reported as the current URL until the environment
                reports one.
            search_engine_url: URL that ``search()`` navigates to.
            task_prompt: Prompt for the HUD task. Falls back to
                "Browse the web" when omitted or empty.
        """
        self._screen_size = screen_size
        self._initial_url = initial_url
        self._search_engine_url = search_engine_url
        self._task_prompt = task_prompt or "Browse the web"
        self._env = None
        self._obs = None
        self._done = False
        self._loop = None
        self._current_url = None

    def __enter__(self):
        """Create the HUD environment and fetch the initial observation.

        Returns:
            This instance, ready to execute actions.

        Raises:
            ImportError: If the HUD SDK is not installed.
        """
        print("Creating HUD session...")

        # Import the HUD SDK lazily, and before allocating the event loop,
        # so a missing SDK surfaces as an install hint without leaking a loop.
        try:
            from hud import gym
            from hud.task import Task
        except ImportError as err:
            raise ImportError(
                "HUD SDK not installed. Please install with: pip install hud-python"
            ) from err

        # Dedicated loop used to run the SDK's coroutines synchronously.
        self._loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self._loop)

        try:
            # Create a task for the browser environment.
            # NOTE(review): the evaluate clause looks like a placeholder
            # ("dummy") since this class never evaluates the task - confirm.
            task = Task(
                prompt=self._task_prompt,
                gym="hud-browser",
                setup=("goto", self._initial_url),
                evaluate=("page_contains", "dummy"),
            )

            # Create the environment, then reset it to get the first observation.
            self._env = self._loop.run_until_complete(gym.make(task))
            self._obs, _ = self._loop.run_until_complete(self._env.reset())
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so release the
            # loop (and the environment, if it was created) here.
            self._close_session()
            raise

        termcolor.cprint(
            "HUD browser session started.",
            color="green",
            attrs=["bold"],
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the HUD environment and the event loop."""
        self._close_session()

    def _close_session(self) -> None:
        """Release the environment and event loop; safe to call repeatedly."""
        env, loop = self._env, self._loop
        # Clear the references first so a second call is a no-op.
        self._env = None
        self._loop = None
        try:
            if env is not None and loop is not None:
                loop.run_until_complete(env.close())
        finally:
            # Close the loop even when closing the environment fails.
            if loop is not None:
                loop.close()

    def _get_screenshot_from_obs(self) -> bytes:
        """Extract the screenshot from the latest HUD observation.

        The observation may expose the base64-encoded image either as a
        ``screenshot`` attribute or under a ``'screenshot'`` dict key.

        Returns:
            The decoded image bytes, or ``b""`` when there is no observation
            or it carries no screenshot.
        """
        obs = self._obs
        if obs is None:
            return b""

        if hasattr(obs, "screenshot"):
            encoded = obs.screenshot
        elif isinstance(obs, dict):
            encoded = obs.get("screenshot")
        else:
            encoded = None

        # An observation can exist without a screenshot payload; decoding
        # None would raise, so treat "missing" and "empty" alike.
        if not encoded:
            return b""
        return base64.b64decode(encoded)

    def _get_url_from_obs(self) -> str:
        """Return the last URL reported by the environment.

        Falls back to the initial URL until a step has reported a URL.
        """
        if self._current_url is None:
            return self._initial_url
        return self._current_url

    def _create_cla_action(self, action_type: str, **kwargs) -> Any:
        """Create a CLA action object in the HUD SDK format.

        Args:
            action_type: One of "click", "move", "type", "scroll", "press",
                "wait", "drag", "navigate", "go_back", "go_forward". Any
                other value is forwarded as a ``CustomAction``.
            **kwargs: Action-specific arguments (coordinates, text, keys, ...).

        Returns:
            An action instance from ``hud.adapters.common.types``.
        """
        from hud.adapters.common.types import (
            ClickAction,
            DragAction,
            MoveAction,
            Point,
            PressAction,
            ScrollAction,
            TypeAction,
            WaitAction,
            CustomAction,
        )

        # Map our action types to HUD SDK CLA action types.
        if action_type == "click":
            return ClickAction(
                point=Point(x=kwargs.get("x"), y=kwargs.get("y")),
                button=kwargs.get("button", "left"),
            )

        if action_type == "move":
            return MoveAction(point=Point(x=kwargs.get("x"), y=kwargs.get("y")))

        if action_type == "type":
            return TypeAction(
                text=kwargs.get("text", ""),
                enter_after=kwargs.get("enter_after", False),
            )

        if action_type == "scroll":
            # Turn the direction name into signed scroll amounts.
            magnitude = kwargs.get("magnitude", 100)
            unit_x, unit_y = self._SCROLL_UNIT_VECTORS.get(
                kwargs.get("direction", "down"), (0, 0)
            )
            action = ScrollAction(
                scroll=Point(x=unit_x * magnitude, y=unit_y * magnitude)
            )
            # Anchor the scroll at a position only when one was supplied.
            if "x" in kwargs and "y" in kwargs:
                action.point = Point(x=kwargs["x"], y=kwargs["y"])
            return action

        if action_type == "press":
            return PressAction(keys=kwargs.get("keys", []))

        if action_type == "wait":
            # Convert seconds to milliseconds.
            return WaitAction(time=kwargs.get("seconds", 5) * 1000)

        if action_type == "drag":
            return DragAction(
                path=[
                    Point(x=kwargs.get("start_x"), y=kwargs.get("start_y")),
                    Point(x=kwargs.get("end_x"), y=kwargs.get("end_y")),
                ]
            )

        if action_type == "navigate":
            # Navigation has no dedicated CLA type; use a CustomAction.
            return CustomAction(
                action="navigate",
                args={"url": kwargs.get("url", "")},
            )

        if action_type in ("go_back", "go_forward"):
            return CustomAction(action=action_type)

        # Fallback to CustomAction for any unrecognized action.
        return CustomAction(action=action_type, args=kwargs)

    def _execute_action(self, action_type: str, **kwargs) -> EnvState:
        """Execute one action in the HUD environment and return the new state.

        Once the environment has reported ``done``, further actions are
        skipped and the last known state is returned.

        Raises:
            RuntimeError: If called before ``__enter__`` or after ``__exit__``.
        """
        if self._done:
            return self.current_state()

        if self._env is None or self._loop is None:
            raise RuntimeError(
                "HudComputer session is not active; use it as a context manager."
            )

        # Create CLA action for HUD SDK.
        action = self._create_cla_action(action_type, **kwargs)

        # HUD SDK expects a list of actions.
        self._obs, _reward, self._done, info = self._loop.run_until_complete(
            self._env.step([action])
        )

        # `info` may be empty or None; only update the URL when it is reported.
        if info and "current_url" in info:
            self._current_url = info["current_url"]

        return self.current_state()

    def screen_size(self) -> tuple[int, int]:
        """Return the screen size given at construction time."""
        return self._screen_size

    def open_web_browser(self) -> EnvState:
        """Return the current state; no action is sent to the environment."""
        return self.current_state()

    def click_at(self, x: int, y: int) -> EnvState:
        """Click at the given coordinates."""
        return self._execute_action("click", x=x, y=y)

    def hover_at(self, x: int, y: int) -> EnvState:
        """Move the pointer to the given coordinates."""
        return self._execute_action("move", x=x, y=y)

    def type_text_at(
        self,
        x: int,
        y: int,
        text: str,
        press_enter: bool,
        clear_before_typing: bool,
    ) -> EnvState:
        """Click at (x, y), optionally clear the field, then type ``text``.

        Args:
            x: Horizontal coordinate to click before typing.
            y: Vertical coordinate to click before typing.
            text: Text to type.
            press_enter: Whether to press Enter after typing.
            clear_before_typing: Whether to select all and delete first.
        """
        # First click at the position.
        self._execute_action("click", x=x, y=y)

        # Clear existing text if requested: select all, then delete.
        if clear_before_typing:
            self._execute_action("press", keys=["ctrl", "a"])
            self._execute_action("press", keys=["delete"])

        # Type the text with optional enter.
        self._execute_action("type", text=text, enter_after=press_enter)

        return self.current_state()

    def scroll_document(
        self, direction: Literal["up", "down", "left", "right"]
    ) -> EnvState:
        """Scroll in ``direction`` using the default magnitude."""
        return self._execute_action("scroll", direction=direction)

    def scroll_at(
        self,
        x: int,
        y: int,
        direction: Literal["up", "down", "left", "right"],
        magnitude: int,
    ) -> EnvState:
        """Scroll by ``magnitude`` in ``direction``, anchored at (x, y)."""
        return self._execute_action(
            "scroll", x=x, y=y, direction=direction, magnitude=magnitude
        )

    def wait_5_seconds(self) -> EnvState:
        """Send a five-second wait action to the environment."""
        return self._execute_action("wait", seconds=5)

    def go_back(self) -> EnvState:
        """Send the "go_back" custom action."""
        return self._execute_action("go_back")

    def go_forward(self) -> EnvState:
        """Send the "go_forward" custom action."""
        return self._execute_action("go_forward")

    def search(self) -> EnvState:
        """Navigate to the configured search engine URL."""
        return self.navigate(self._search_engine_url)

    def navigate(self, url: str) -> EnvState:
        """Navigate to ``url``."""
        return self._execute_action("navigate", url=url)

    def key_combination(self, keys: list[str]) -> EnvState:
        """Press the given keys together; names are lowercased for the HUD SDK."""
        mapped_keys = [key.lower() for key in keys]
        return self._execute_action("press", keys=mapped_keys)

    def drag_and_drop(
        self, x: int, y: int, destination_x: int, destination_y: int
    ) -> EnvState:
        """Drag from (x, y) to (destination_x, destination_y)."""
        return self._execute_action(
            "drag",
            start_x=x,
            start_y=y,
            end_x=destination_x,
            end_y=destination_y,
        )

    def current_state(self) -> EnvState:
        """Build an ``EnvState`` from the latest observation and known URL."""
        screenshot = self._get_screenshot_from_obs()
        url = self._get_url_from_obs()
        return EnvState(screenshot=screenshot, url=url)

main.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import os
1616

1717
from agent import BrowserAgent
18-
from computers import CloudRunComputer, BrowserbaseComputer, PlaywrightComputer
18+
from computers import CloudRunComputer, BrowserbaseComputer, PlaywrightComputer, HudComputer
1919

2020

2121
CLOUD_RUN_SCREEN_SIZE = (1920, 1080)
@@ -43,7 +43,7 @@ def main() -> int:
4343
parser.add_argument(
4444
"--env",
4545
type=str,
46-
choices=("cloud-run", "playwright", "browserbase"),
46+
choices=("cloud-run", "playwright", "browserbase", "hud"),
4747
default="cloud-run",
4848
help="The computer use environment to use.",
4949
)
@@ -81,6 +81,12 @@ def main() -> int:
8181
)
8282
elif args.env == "browserbase":
8383
env = BrowserbaseComputer(screen_size=PLAYWRIGHT_SCREEN_SIZE)
84+
elif args.env == "hud":
85+
env = HudComputer(
86+
screen_size=PLAYWRIGHT_SCREEN_SIZE,
87+
initial_url=args.initial_url,
88+
task_prompt=args.query,
89+
)
8490
else:
8591
raise ValueError("Unknown environment: ", args.env)
8692

0 commit comments

Comments
 (0)