Skip to content

Commit e23bc49

Browse files
committed
Implement the v2 actions.
1 parent 7370699 commit e23bc49

File tree

3 files changed

+215
-21
lines changed

3 files changed

+215
-21
lines changed

agent.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ def __init__(
3939
browser_computer: Computer,
4040
query: str,
4141
model_name: Literal[
42-
"computer-use-exp"
43-
] = "computer-use-exp",
42+
"computer-use-exp-6-11"
43+
] = "computer-use-exp-6-11",
4444
):
4545
self._browser_computer = browser_computer
4646
self._query = query
@@ -50,6 +50,10 @@ def __init__(
5050
vertexai=os.environ.get("USE_VERTEXAI", "0").lower() in ["true", "1"],
5151
project=os.environ.get("VERTEXAI_PROJECT"),
5252
location=os.environ.get("VERTEXAI_LOCATION"),
53+
http_options=types.HttpOptions(
54+
api_version="v1alpha",
55+
base_url="https://generativelanguage.googleapis.com",
56+
)
5357
)
5458
self._contents: list[Content] = [
5559
Content(
@@ -94,13 +98,32 @@ def handle_action(self, action: types.FunctionCall) -> EnvState:
9498
elif action.name == "type_text_at":
9599
x = self.normalize_x(action.args["x"])
96100
y = self.normalize_y(action.args["y"])
101+
press_enter = action.args.get("press_enter", True)
102+
clear_before_typing = action.args.get("clear_before_typing", True)
97103
return self._browser_computer.type_text_at(
98104
x=x,
99105
y=y,
100106
text=action.args["text"],
107+
press_enter=press_enter,
108+
clear_before_typing=clear_before_typing,
101109
)
102110
elif action.name == "scroll_document":
103111
return self._browser_computer.scroll_document(action.args["direction"])
112+
elif action.name == "scroll_at":
113+
x = self.normalize_x(action.args["x"])
114+
y = self.normalize_y(action.args["y"])
115+
magnitude = action.args.get("magnitude", 200)
116+
direction = action.args["direction"]
117+
118+
if direction in ("up", "down"):
119+
magnitude = self.normalize_y(magnitude)
120+
elif direction in ("left", "right"):
121+
magnitude = self.normalize_x(magnitude)
122+
else:
123+
raise ValueError("Unknown direction: ", direction)
124+
return self._browser_computer.scroll_at(
125+
x=x, y=y, direction=direction, magnitude=magnitude
126+
)
104127
elif action.name == "wait_5_seconds":
105128
return self._browser_computer.wait_5_seconds()
106129
elif action.name == "go_back":
@@ -115,6 +138,17 @@ def handle_action(self, action: types.FunctionCall) -> EnvState:
115138
return self._browser_computer.key_combination(
116139
action.args["keys"].split("+")
117140
)
141+
elif action.name == "drag_and_drop":
142+
x = self.normalize_x(action.args["x"])
143+
y = self.normalize_y(action.args["y"])
144+
destination_x = self.normalize_x(action.args["destination_x"])
145+
destination_y = self.normalize_y(action.args["destination_y"])
146+
return self._browser_computer.drag_and_drop(
147+
x=x,
148+
y=y,
149+
destination_x=destination_x,
150+
destination_y=destination_y,
151+
)
118152
else:
119153
raise ValueError(f"Unsupported function: {action}")
120154

computers/computer.py

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414
import abc
1515
import pydantic
16+
from typing import Literal
1617

1718

1819
class EnvState(pydantic.BaseModel):
@@ -33,31 +34,54 @@ def open_web_browser(self) -> EnvState:
3334
"""Opens the web browser."""
3435

3536
@abc.abstractmethod
36-
def click_at(self, y: int, x: int) -> EnvState:
37-
"""Clicks at a specific y (0-999), x (0-999) coordinate on the webpage.
37+
def click_at(self, x: int, y: int) -> EnvState:
38+
"""Clicks at a specific x, y coordinate on the webpage.
3839
39-
The 'x' and 'y' values are scaled to the height and width of the screen.
40+
The 'x' and 'y' values are absolute values, scaled to the width and height of the screen, respectively.
4041
"""
4142

4243
@abc.abstractmethod
43-
def hover_at(self, y: int, x: int) -> EnvState:
44-
"""Hovers at a specific y (0-999), x (0-999) coordinate on the webpage.
44+
def hover_at(self, x: int, y: int) -> EnvState:
45+
"""Hovers at a specific x, y coordinate on the webpage.
4546
4647
May be used to explore sub-menus that appear on hover.
47-
The 'x' and 'y' values are scaled to the height and width of the screen.
48+
The 'x' and 'y' values are absolute values, scaled to the width and height of the screen, respectively.
4849
"""
4950

5051
@abc.abstractmethod
51-
def type_text_at(self, y: int, x: int, text: str) -> EnvState:
52-
"""Types text at a specific y (0-999), x (0-999) coordinate.
53-
54-
The system automatically presses ENTER after typing.
55-
The 'x' and 'y' values are scaled to the height and width of the screen.
52+
def type_text_at(
53+
self,
54+
x: int,
55+
y: int,
56+
text: str,
57+
press_enter: bool,
58+
clear_before_typing: bool,
59+
) -> EnvState:
60+
"""Types text at a specific x, y coordinate.
61+
62+
The system automatically presses ENTER after typing. To disable this, set `press_enter` to False.
63+
The system automatically clears any existing content before typing the specified `text`. To disable this, set `clear_before_typing` to False.
64+
The 'x' and 'y' values are absolute values, scaled to the width and height of the screen, respectively.
5665
"""
5766

5867
@abc.abstractmethod
59-
def scroll_document(self, direction: str) -> EnvState:
60-
"""Scrolls the entire webpage "up" or "down" based on direction."""
68+
def scroll_document(
69+
self, direction: Literal["up", "down", "left", "right"]
70+
) -> EnvState:
71+
"""Scrolls the entire webpage "up", "down", "left" or "right" based on direction."""
72+
73+
@abc.abstractmethod
74+
def scroll_at(
75+
self,
76+
x: int,
77+
y: int,
78+
direction: Literal["up", "down", "left", "right"],
79+
magnitude: int,
80+
) -> EnvState:
81+
"""Scrolls up, down, left, or right at an x, y coordinate by magnitude.
82+
83+
The 'x' and 'y' values are absolute values, scaled to the width and height of the screen, respectively.
84+
"""
6185

6286
@abc.abstractmethod
6387
def wait_5_seconds(self) -> EnvState:
@@ -88,6 +112,14 @@ def navigate(self, url: str) -> EnvState:
88112
def key_combination(self, keys: list[str]) -> EnvState:
89113
"""Presses keyboard keys and combinations, such as "control+c" or "enter"."""
90114

115+
@abc.abstractmethod
116+
def drag_and_drop(
117+
self, x: int, y: int, destination_x: int, destination_y: int
118+
) -> EnvState:
119+
"""Drags and drops an element from an x, y coordinate to a destination_x, destination_y coordinate.
120+
The 'x', 'y', 'destination_x' and 'destination_y' values are absolute values; the x values are scaled to the width of the screen and the y values to its height.
121+
"""
122+
91123
@abc.abstractmethod
92124
def current_state(self) -> EnvState:
93125
"""Returns the current state of the current webpage."""

computers/playwright/playwright.py

Lines changed: 134 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,54 @@
1818
EnvState,
1919
)
2020
from playwright.sync_api import sync_playwright
21+
from typing import Literal
22+
23+
# Define a mapping from the user-friendly key names to Playwright's expected key names.
24+
# Playwright is generally good with case-insensitivity for these, but it's best to be canonical.
25+
# See: https://playwright.dev/docs/api/class-keyboard#keyboard-press
26+
# Keys like 'a', 'b', '1', '$' are passed directly.
27+
PLAYWRIGHT_KEY_MAP = {
28+
"backspace": "Backspace",
29+
"tab": "Tab",
30+
"return": "Enter", # Playwright uses 'Enter'
31+
"enter": "Enter",
32+
"shift": "Shift",
33+
"control": "Control", # Or 'ControlOrMeta' for cross-platform Ctrl/Cmd
34+
"alt": "Alt",
35+
"escape": "Escape",
36+
"space": "Space", # Can also just be " "
37+
"pageup": "PageUp",
38+
"pagedown": "PageDown",
39+
"end": "End",
40+
"home": "Home",
41+
"left": "ArrowLeft",
42+
"up": "ArrowUp",
43+
"right": "ArrowRight",
44+
"down": "ArrowDown",
45+
"insert": "Insert",
46+
"delete": "Delete",
47+
"semicolon": ";", # For actual character ';'
48+
"equals": "=", # For actual character '='
49+
"multiply": "Multiply", # NumpadMultiply
50+
"add": "Add", # NumpadAdd
51+
"separator": "Separator", # Numpad specific
52+
"subtract": "Subtract", # NumpadSubtract, or just '-' for character
53+
"decimal": "Decimal", # NumpadDecimal, or just '.' for character
54+
"divide": "Divide", # NumpadDivide, or just '/' for character
55+
"f1": "F1",
56+
"f2": "F2",
57+
"f3": "F3",
58+
"f4": "F4",
59+
"f5": "F5",
60+
"f6": "F6",
61+
"f7": "F7",
62+
"f8": "F8",
63+
"f9": "F9",
64+
"f10": "F10",
65+
"f11": "F11",
66+
"f12": "F12",
67+
"command": "Meta", # 'Meta' is Command on macOS, Windows key on Windows
68+
}
2169

2270

2371
class PlaywrightComputer(Computer):
@@ -91,24 +139,86 @@ def hover_at(self, x: int, y: int):
91139
self._page.wait_for_load_state()
92140
return self.current_state()
93141

94-
def type_text_at(self, x: int, y: int, text: str) -> EnvState:
142+
def type_text_at(
    self,
    x: int,
    y: int,
    text: str,
    press_enter: bool = True,
    clear_before_typing: bool = True,
) -> EnvState:
    """Click at (x, y), type `text` there and return the resulting state.

    Args:
        x: Horizontal coordinate to click before typing.
        y: Vertical coordinate to click before typing.
        text: The text to type with the keyboard.
        press_enter: When True, press Enter after the text has been typed.
        clear_before_typing: When True, select everything in the focused
            element and delete it before typing.

    Returns:
        The state reported by `current_state()` after the interaction.
    """
    import sys

    self.highlight_mouse(x, y)
    self._page.mouse.click(x, y)
    self._page.wait_for_load_state()

    if clear_before_typing:
        # "Select all" is Cmd+A on macOS; Control+A does not select there,
        # so the old content would survive and the new text be appended.
        select_all_modifier = "Meta" if sys.platform == "darwin" else "Control"
        self.key_combination([select_all_modifier, "A"])
        self.key_combination(["Delete"])

    self._page.keyboard.type(text)
    self._page.wait_for_load_state()

    if press_enter:
        self.key_combination(["Enter"])
    self._page.wait_for_load_state()
    return self.current_state()
103165

104-
def scroll_document(self, direction: str) -> EnvState:
105-
if direction.lower() == "down":
166+
def _horizontal_document_scroll(
167+
self, direction: Literal["left", "right"]
168+
) -> EnvState:
169+
# Scroll by 50% of the viewport size.
170+
horizontal_scroll_amount = self.screen_size()[0] // 2
171+
if direction == "left":
172+
sign = "-"
173+
else:
174+
sign = ""
175+
scroll_argument = f"{sign}{horizontal_scroll_amount}"
176+
# Scroll using JS.
177+
self._page.evaluate(f"window.scrollBy({scroll_argument}, 0); ")
178+
self._page.wait_for_load_state()
179+
return self.current_state()
180+
181+
def scroll_document(
    self, direction: Literal["up", "down", "left", "right"]
) -> EnvState:
    """Scroll the whole page one step in the requested direction.

    "up" and "down" are performed with the PageUp / PageDown keys; "left"
    and "right" are delegated to `_horizontal_document_scroll`. The
    direction is matched case-insensitively.

    Args:
        direction: One of "up", "down", "left" or "right".

    Returns:
        The state returned by the helper that performed the scroll.

    Raises:
        ValueError: If `direction` is not one of the supported values.
    """
    # Accept "Down", "UP", ... as the pre-v2 implementation did.
    normalized = direction.lower()
    if normalized == "down":
        return self.key_combination(["PageDown"])
    if normalized == "up":
        return self.key_combination(["PageUp"])
    if normalized in ("left", "right"):
        return self._horizontal_document_scroll(normalized)
    # A single formatted message; passing two positional arguments would
    # render the error as a tuple.
    raise ValueError(f"Unsupported direction: {direction!r}")
192+
193+
def scroll_at(
194+
self,
195+
x: int,
196+
y: int,
197+
direction: Literal["up", "down", "left", "right"],
198+
magnitude: int,
199+
) -> EnvState:
200+
self.highlight_mouse(x, y)
201+
202+
self._page.mouse.move(x, y)
203+
self._page.wait_for_load_state()
204+
205+
dx = 0
206+
dy = 0
207+
if direction == "up":
208+
dy = -magnitude
209+
elif direction == "down":
210+
dy = magnitude
211+
elif direction == "left":
212+
dx = -magnitude
213+
elif direction == "right":
214+
dx = magnitude
109215
else:
110216
raise ValueError("Unsupported direction: ", direction)
111217

218+
self._page.mouse.wheel(dx, dy)
219+
self._page.wait_for_load_state()
220+
return self.current_state()
221+
112222
def wait_5_seconds(self) -> EnvState:
113223
time.sleep(5)
114224
return self.current_state()
@@ -132,6 +242,9 @@ def navigate(self, url: str) -> EnvState:
132242
return self.current_state()
133243

134244
def key_combination(self, keys: list[str]) -> EnvState:
245+
# Normalize all keys to the Playwright compatible version.
246+
keys = [PLAYWRIGHT_KEY_MAP.get(k.lower(), k) for k in keys]
247+
135248
for key in keys[:-1]:
136249
self._page.keyboard.down(key)
137250

@@ -142,6 +255,21 @@ def key_combination(self, keys: list[str]) -> EnvState:
142255

143256
return self.current_state()
144257

258+
def drag_and_drop(
259+
self, x: int, y: int, destination_x: int, destination_y: int
260+
) -> EnvState:
261+
self.highlight_mouse(x, y)
262+
self._page.mouse.move(x, y)
263+
self._page.wait_for_load_state()
264+
self._page.mouse.down()
265+
self._page.wait_for_load_state()
266+
267+
self.highlight_mouse(destination_x, destination_y)
268+
self._page.mouse.move(destination_x, destination_y)
269+
self._page.wait_for_load_state()
270+
self._page.mouse.up()
271+
return self.current_state()
272+
145273
def current_state(self) -> EnvState:
146274
self._page.wait_for_load_state()
147275
# Even if Playwright reports the page as loaded, it may not be so.
@@ -167,7 +295,7 @@ def highlight_mouse(self, x: int, y: int):
167295
div.style.borderRadius = '50%';
168296
div.style.width = '20px';
169297
div.style.height = '20px';
170-
div.style.position = 'absolute';
298+
div.style.position = 'fixed';
171299
div.style.zIndex = '9999';
172300
document.body.appendChild(div);
173301

0 commit comments

Comments
 (0)