1+ import asyncio
2+ import io
3+ import base64
4+ from typing import Literal , Optional , Any , Dict
5+ import termcolor
6+ from PIL import Image
7+ import numpy as np
8+
9+ from ..computer import Computer , EnvState
10+
11+
class HudComputer(Computer):
    """HUD SDK Computer implementation that uses HUD environments for browser control.

    The HUD SDK calls used here are coroutines. This class owns a private
    asyncio event loop and runs each coroutine to completion on it, so callers
    get a blocking interface. Use it as a context manager: ``__enter__``
    creates and resets the environment, ``__exit__`` closes it.
    """

    def __init__(
        self,
        screen_size: tuple[int, int],
        initial_url: str = "https://www.google.com",
        search_engine_url: str = "https://www.google.com",
        task_prompt: Optional[str] = None,
    ):
        """Store configuration; no HUD resources are created until ``__enter__``.

        Args:
            screen_size: Value reported by ``screen_size()``.
                NOTE(review): it is not forwarded to the HUD task in this
                class -- confirm the environment's real viewport matches.
            initial_url: URL the task setup navigates to.
            search_engine_url: URL opened by ``search()``.
            task_prompt: Prompt attached to the HUD task; a falsy value is
                replaced by ``"Browse the web"``.
        """
        self._screen_size = screen_size
        self._initial_url = initial_url
        self._search_engine_url = search_engine_url
        self._task_prompt = task_prompt or "Browse the web"
        self._env = None  # HUD environment, set by __enter__
        self._obs = None  # most recent observation returned by reset()/step()
        self._done = False  # True once step() reports the episode finished
        self._loop = None  # private event loop used to run SDK coroutines
        self._current_url = None  # last URL reported in step() info, if any

    def __enter__(self):
        """Create the HUD environment and fetch the initial observation.

        Returns:
            This instance.

        Raises:
            ImportError: If the HUD SDK cannot be imported.
        """
        print("Creating HUD session...")

        # Create and run the async setup in a new event loop
        self._loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self._loop)

        try:
            self._open_session()
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so release the
            # loop (and any partially created environment) here.
            self._shutdown()
            raise

        termcolor.cprint(
            "HUD browser session started.",
            color="green",
            attrs=["bold"],
        )
        return self

    def _open_session(self) -> None:
        """Import the SDK, build the task, create and reset the environment."""
        # Import HUD SDK here to avoid circular imports
        try:
            from hud import gym
            from hud.task import Task
        except ImportError as err:
            raise ImportError(
                "HUD SDK not installed. Please install with: pip install hud-python"
            ) from err

        # Create a task for the browser environment. The evaluate entry is a
        # placeholder: this class never requests an evaluation and discards
        # the reward returned by step().
        task = Task(
            prompt=self._task_prompt,
            gym="hud-browser",
            setup=("goto", self._initial_url),
            evaluate=("page_contains", "dummy"),
        )

        # Create the environment
        self._env = self._loop.run_until_complete(gym.make(task))

        # Reset the environment to get initial observation
        self._obs, _ = self._loop.run_until_complete(self._env.reset())

        # Start from a clean episode state so the instance can be re-entered.
        self._done = False
        self._current_url = None

    def _shutdown(self) -> None:
        """Close the environment (if any) and always close the event loop."""
        env, loop = self._env, self._loop
        self._env = None
        self._loop = None
        try:
            if env is not None and loop is not None:
                loop.run_until_complete(env.close())
        finally:
            # Close the loop even when env.close() raised.
            if loop is not None:
                loop.close()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the environment and the event loop; exceptions are not suppressed."""
        self._shutdown()

    def _get_screenshot_from_obs(self) -> bytes:
        """Return the decoded screenshot from the latest observation.

        The observation may expose the base64-encoded image either as a
        ``screenshot`` attribute or as a ``'screenshot'`` dict key.

        Returns:
            The decoded image bytes, or ``b""`` when there is no observation
            or it carries no screenshot.
        """
        obs = self._obs
        if obs is None:
            return b""

        if hasattr(obs, "screenshot"):
            encoded = obs.screenshot
        elif isinstance(obs, dict):
            encoded = obs.get("screenshot")
        else:
            encoded = None

        # A missing/None screenshot must not reach b64decode (TypeError).
        if not encoded:
            return b""
        return base64.b64decode(encoded)

    def _get_url_from_obs(self) -> str:
        """Return the last URL reported by the environment.

        Falls back to the initial URL until a step has reported
        ``current_url`` in its info.
        """
        if self._current_url is None:
            return self._initial_url
        return self._current_url

    def _create_cla_action(self, action_type: str, **kwargs) -> Any:
        """Create a CLA action in the HUD SDK format.

        Args:
            action_type: One of ``click``, ``move``, ``type``, ``scroll``,
                ``press``, ``wait``, ``drag``, ``navigate``, ``go_back``,
                ``go_forward``. Any other value is wrapped in a
                ``CustomAction`` with ``kwargs`` as its args.
            **kwargs: Action-specific parameters (see each branch).

        Returns:
            An action object from ``hud.adapters.common.types``.
        """
        from hud.adapters.common.types import (
            ClickAction,
            CustomAction,
            DragAction,
            MoveAction,
            Point,
            PressAction,
            ScrollAction,
            TypeAction,
            WaitAction,
        )

        # Map our action types to HUD SDK CLA action types
        if action_type == "click":
            return ClickAction(
                point=Point(x=kwargs.get("x"), y=kwargs.get("y")),
                button=kwargs.get("button", "left"),
            )

        if action_type == "move":
            return MoveAction(point=Point(x=kwargs.get("x"), y=kwargs.get("y")))

        if action_type == "type":
            return TypeAction(
                text=kwargs.get("text", ""),
                enter_after=kwargs.get("enter_after", False),
            )

        if action_type == "scroll":
            # Map direction to scroll amounts; an unknown direction scrolls by (0, 0).
            magnitude = kwargs.get("magnitude", 100)
            offsets = {
                "down": (0, magnitude),
                "up": (0, -magnitude),
                "right": (magnitude, 0),
                "left": (-magnitude, 0),
            }
            dx, dy = offsets.get(kwargs.get("direction", "down"), (0, 0))

            action = ScrollAction(scroll=Point(x=dx, y=dy))
            # Only anchor the scroll at a position when both coordinates are given.
            if "x" in kwargs and "y" in kwargs:
                action.point = Point(x=kwargs["x"], y=kwargs["y"])
            return action

        if action_type == "press":
            return PressAction(keys=kwargs.get("keys", []))

        if action_type == "wait":
            # Convert seconds to milliseconds
            return WaitAction(time=kwargs.get("seconds", 5) * 1000)

        if action_type == "drag":
            return DragAction(
                path=[
                    Point(x=kwargs.get("start_x"), y=kwargs.get("start_y")),
                    Point(x=kwargs.get("end_x"), y=kwargs.get("end_y")),
                ]
            )

        if action_type == "navigate":
            # Use a CustomAction for navigation
            return CustomAction(
                action="navigate",
                args={"url": kwargs.get("url", "")},
            )

        if action_type == "go_back":
            return CustomAction(action="go_back")

        if action_type == "go_forward":
            return CustomAction(action="go_forward")

        # Fallback to CustomAction for any unrecognized action
        return CustomAction(action=action_type, args=kwargs)

    def _execute_action(self, action_type: str, **kwargs) -> EnvState:
        """Execute an action in the HUD environment.

        Once the environment has reported the episode as done, further
        actions are skipped and the last known state is returned.

        Raises:
            RuntimeError: If called while no session is open (outside the
                ``with`` block).
        """
        if self._done:
            return self.current_state()

        if self._env is None or self._loop is None:
            raise RuntimeError(
                "HUD session is not active; use HudComputer as a context manager."
            )

        # Create CLA action for HUD SDK
        action = self._create_cla_action(action_type, **kwargs)

        # Execute action in HUD environment
        # HUD SDK expects a list of actions
        self._obs, _reward, self._done, info = self._loop.run_until_complete(
            self._env.step([action])
        )

        # info may be empty or None; only track the URL when it is reported.
        if info and "current_url" in info:
            self._current_url = info["current_url"]

        return self.current_state()

    def screen_size(self) -> tuple[int, int]:
        """Return the screen size given at construction."""
        return self._screen_size

    def open_web_browser(self) -> EnvState:
        """Return the current state; no action is sent to the environment."""
        return self.current_state()

    def click_at(self, x: int, y: int) -> EnvState:
        """Click at ``(x, y)`` and return the resulting state."""
        return self._execute_action("click", x=x, y=y)

    def hover_at(self, x: int, y: int) -> EnvState:
        """Move the pointer to ``(x, y)`` and return the resulting state."""
        return self._execute_action("move", x=x, y=y)

    def type_text_at(
        self,
        x: int,
        y: int,
        text: str,
        press_enter: bool,
        clear_before_typing: bool,
    ) -> EnvState:
        """Click at ``(x, y)``, optionally clear the field, then type ``text``.

        Args:
            x: Horizontal click position.
            y: Vertical click position.
            text: Text to type.
            press_enter: Passed to the type action as ``enter_after``.
            clear_before_typing: When true, send ctrl+a followed by delete
                before typing.
        """
        # First click at the position
        self._execute_action("click", x=x, y=y)

        # Clear existing text if requested
        if clear_before_typing:
            # Select all and delete
            self._execute_action("press", keys=["ctrl", "a"])
            self._execute_action("press", keys=["delete"])

        # Type the text with optional enter
        self._execute_action("type", text=text, enter_after=press_enter)

        return self.current_state()

    def scroll_document(
        self, direction: Literal["up", "down", "left", "right"]
    ) -> EnvState:
        """Scroll in ``direction`` using the default magnitude, without a position."""
        return self._execute_action("scroll", direction=direction)

    def scroll_at(
        self,
        x: int,
        y: int,
        direction: Literal["up", "down", "left", "right"],
        magnitude: int,
    ) -> EnvState:
        """Scroll by ``magnitude`` in ``direction`` with the action anchored at ``(x, y)``."""
        return self._execute_action(
            "scroll", x=x, y=y, direction=direction, magnitude=magnitude
        )

    def wait_5_seconds(self) -> EnvState:
        """Send a five-second wait action and return the resulting state."""
        return self._execute_action("wait", seconds=5)

    def go_back(self) -> EnvState:
        """Send the ``go_back`` custom action."""
        return self._execute_action("go_back")

    def go_forward(self) -> EnvState:
        """Send the ``go_forward`` custom action."""
        return self._execute_action("go_forward")

    def search(self) -> EnvState:
        """Navigate to the configured search engine URL."""
        return self.navigate(self._search_engine_url)

    def navigate(self, url: str) -> EnvState:
        """Send the ``navigate`` custom action for ``url``."""
        return self._execute_action("navigate", url=url)

    def key_combination(self, keys: list[str]) -> EnvState:
        """Press ``keys`` together; names are lowercased before being sent."""
        # Map key names to HUD SDK format (lowercase)
        mapped_keys = [key.lower() for key in keys]
        return self._execute_action("press", keys=mapped_keys)

    def drag_and_drop(
        self, x: int, y: int, destination_x: int, destination_y: int
    ) -> EnvState:
        """Drag from ``(x, y)`` to ``(destination_x, destination_y)``."""
        return self._execute_action(
            "drag",
            start_x=x,
            start_y=y,
            end_x=destination_x,
            end_y=destination_y,
        )

    def current_state(self) -> EnvState:
        """Build an ``EnvState`` from the latest screenshot and tracked URL."""
        screenshot = self._get_screenshot_from_obs()
        url = self._get_url_from_obs()
        return EnvState(screenshot=screenshot, url=url)
0 commit comments