Skip to content

Commit f1d5822

Browse files
authored
Merge pull request #22 from google/ericpts/v2
Implement the v2 actions.
2 parents: 7370699 + 318ac59; commit: f1d5822

File tree

11 files changed: +558 additions, -106 deletions (lines changed)

agent.py

Lines changed: 36 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,8 @@ def __init__(
3939
browser_computer: Computer,
4040
query: str,
4141
model_name: Literal[
42-
"computer-use-exp"
43-
] = "computer-use-exp",
42+
"computer-use-exp-6-11"
43+
] = "computer-use-exp-6-11",
4444
):
4545
self._browser_computer = browser_computer
4646
self._query = query
@@ -50,6 +50,10 @@ def __init__(
5050
vertexai=os.environ.get("USE_VERTEXAI", "0").lower() in ["true", "1"],
5151
project=os.environ.get("VERTEXAI_PROJECT"),
5252
location=os.environ.get("VERTEXAI_LOCATION"),
53+
http_options=types.HttpOptions(
54+
api_version="v1alpha",
55+
base_url="https://generativelanguage.googleapis.com",
56+
)
5357
)
5458
self._contents: list[Content] = [
5559
Content(
@@ -94,13 +98,32 @@ def handle_action(self, action: types.FunctionCall) -> EnvState:
9498
elif action.name == "type_text_at":
9599
x = self.normalize_x(action.args["x"])
96100
y = self.normalize_y(action.args["y"])
101+
press_enter = action.args.get("press_enter", True)
102+
clear_before_typing = action.args.get("clear_before_typing", True)
97103
return self._browser_computer.type_text_at(
98104
x=x,
99105
y=y,
100106
text=action.args["text"],
107+
press_enter=press_enter,
108+
clear_before_typing=clear_before_typing,
101109
)
102110
elif action.name == "scroll_document":
103111
return self._browser_computer.scroll_document(action.args["direction"])
112+
elif action.name == "scroll_at":
113+
x = self.normalize_x(action.args["x"])
114+
y = self.normalize_y(action.args["y"])
115+
magnitude = action.args.get("magnitude", 200)
116+
direction = action.args["direction"]
117+
118+
if direction in ("up", "down"):
119+
magnitude = self.normalize_y(magnitude)
120+
elif direction in ("left", "right"):
121+
magnitude = self.normalize_x(magnitude)
122+
else:
123+
raise ValueError("Unknown direction: ", direction)
124+
return self._browser_computer.scroll_at(
125+
x=x, y=y, direction=direction, magnitude=magnitude
126+
)
104127
elif action.name == "wait_5_seconds":
105128
return self._browser_computer.wait_5_seconds()
106129
elif action.name == "go_back":
@@ -115,6 +138,17 @@ def handle_action(self, action: types.FunctionCall) -> EnvState:
115138
return self._browser_computer.key_combination(
116139
action.args["keys"].split("+")
117140
)
141+
elif action.name == "drag_and_drop":
142+
x = self.normalize_x(action.args["x"])
143+
y = self.normalize_y(action.args["y"])
144+
destination_x = self.normalize_x(action.args["destination_x"])
145+
destination_y = self.normalize_y(action.args["destination_y"])
146+
return self._browser_computer.drag_and_drop(
147+
x=x,
148+
y=y,
149+
destination_x=destination_x,
150+
destination_y=destination_y,
151+
)
118152
else:
119153
raise ValueError(f"Unsupported function: {action}")
120154

apiserver/commands.py

Lines changed: 32 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -24,23 +24,39 @@ class NavigateArgs(BaseModel):
2424

2525

2626
class ClickAtArgs(BaseModel):
27-
y: int
2827
x: int
28+
y: int
2929

3030

3131
class HoverAtArgs(BaseModel):
32-
y: int
3332
x: int
33+
y: int
3434

3535

3636
class TypeTextAtArgs(BaseModel):
37-
y: int
3837
x: int
38+
y: int
3939
text: str
40+
press_enter: bool
41+
clear_before_typing: bool
4042

4143

4244
class ScrollDocumentArgs(BaseModel):
43-
direction: Literal["up"] | Literal["down"]
45+
direction: Literal["up", "down", "left", "right"]
46+
47+
48+
class ScrollAtArgs(BaseModel):
49+
x: int
50+
y: int
51+
direction: Literal["up", "down", "left", "right"]
52+
magnitude: int
53+
54+
55+
class DragAndDropArgs(BaseModel):
56+
x: int
57+
y: int
58+
destination_x: int
59+
destination_y: int
4460

4561

4662
class KeyCombinationArgs(BaseModel):
@@ -72,6 +88,16 @@ class ScrollDocument(BaseModel):
7288
args: ScrollDocumentArgs
7389

7490

91+
class ScrollAt(BaseModel):
92+
name: Literal["scroll_at"]
93+
args: ScrollAtArgs
94+
95+
96+
class DragAndDrop(BaseModel):
97+
name: Literal["drag_and_drop"]
98+
args: DragAndDropArgs
99+
100+
75101
class Wait5Seconds(BaseModel):
76102
name: Literal["wait_5_seconds"]
77103
args: Optional[EmptyJson] = Field(None)
@@ -120,6 +146,8 @@ class Shutdown(BaseModel):
120146
HoverAt,
121147
TypeTextAt,
122148
ScrollDocument,
149+
ScrollAt,
150+
DragAndDrop,
123151
GoBack,
124152
GoForward,
125153
Search,

apiserver/test_app.py

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -114,7 +114,12 @@ def test_get_command_timeout_standard_command():
114114

115115
def test_get_command_timeout_type_text_at_command():
116116
text = "hello"
117-
command = TypeTextAt(name="type_text_at", args=TypeTextAtArgs(y=0, x=0, text=text))
117+
command = TypeTextAt(
118+
name="type_text_at",
119+
args=TypeTextAtArgs(
120+
y=0, x=0, text=text, press_enter=False, clear_before_typing=True
121+
),
122+
)
118123
base_timeout = 10
119124
key_delay = 0.1
120125
expected_timeout = base_timeout + (len(text) * key_delay)
@@ -124,7 +129,12 @@ def test_get_command_timeout_type_text_at_command():
124129

125130
def test_get_command_timeout_type_text_at_command_zero_delay():
126131
text = "world"
127-
command = TypeTextAt(name="type_text_at", args=TypeTextAtArgs(y=0, x=0, text=text))
132+
command = TypeTextAt(
133+
name="type_text_at",
134+
args=TypeTextAtArgs(
135+
y=0, x=0, text=text, press_enter=False, clear_before_typing=True
136+
),
137+
)
128138
base_timeout = 20
129139
key_delay = 0.0
130140
expected_timeout = base_timeout

apiserver/test_commands.py

Lines changed: 70 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,8 @@
1919
HoverAt,
2020
TypeTextAt,
2121
ScrollDocument,
22+
ScrollAt,
23+
DragAndDrop,
2224
GoBack,
2325
GoForward,
2426
Search,
@@ -38,21 +40,27 @@ def test_navigate(self):
3840
self.assertEqual(command.model_dump(), function_call)
3941

4042
def test_click_at(self):
41-
function_call = {"name": "click_at", "args": {"y": 1, "x": 2}}
43+
function_call = {"name": "click_at", "args": {"x": 2, "y": 1}}
4244
command = CommandModel.model_validate(function_call)
4345
self.assertIsInstance(command.root, ClickAt)
4446
self.assertEqual(command.model_dump(), function_call)
4547

4648
def test_hover_at(self):
47-
function_call = {"name": "hover_at", "args": {"y": 1, "x": 2}}
49+
function_call = {"name": "hover_at", "args": {"x": 2, "y": 1}}
4850
command = CommandModel.model_validate(function_call)
4951
self.assertIsInstance(command.root, HoverAt)
5052
self.assertEqual(command.model_dump(), function_call)
5153

5254
def test_type_text_at(self):
5355
function_call = {
5456
"name": "type_text_at",
55-
"args": {"y": 1, "x": 2, "text": "one"},
57+
"args": {
58+
"x": 2,
59+
"y": 1,
60+
"text": "one",
61+
"press_enter": True,
62+
"clear_before_typing": False,
63+
},
5664
}
5765
command = CommandModel.model_validate(
5866
function_call,
@@ -62,7 +70,13 @@ def test_type_text_at(self):
6270
command.model_dump(),
6371
{
6472
"name": "type_text_at",
65-
"args": {"y": 1, "x": 2, "text": "one"},
73+
"args": {
74+
"x": 2,
75+
"y": 1,
76+
"text": "one",
77+
"press_enter": True,
78+
"clear_before_typing": False,
79+
},
6680
},
6781
)
6882

@@ -77,6 +91,58 @@ def test_scroll_document(self):
7791
self.assertIsInstance(command.root, ScrollDocument)
7892
self.assertEqual(command.model_dump(), function_call)
7993

94+
function_call = {
95+
"name": "scroll_document",
96+
"args": {"direction": "down"},
97+
}
98+
command = CommandModel.model_validate(
99+
function_call,
100+
)
101+
self.assertIsInstance(command.root, ScrollDocument)
102+
self.assertEqual(command.model_dump(), function_call)
103+
104+
function_call = {
105+
"name": "scroll_document",
106+
"args": {"direction": "left"},
107+
}
108+
command = CommandModel.model_validate(
109+
function_call,
110+
)
111+
self.assertIsInstance(command.root, ScrollDocument)
112+
self.assertEqual(command.model_dump(), function_call)
113+
114+
function_call = {
115+
"name": "scroll_document",
116+
"args": {"direction": "right"},
117+
}
118+
command = CommandModel.model_validate(
119+
function_call,
120+
)
121+
self.assertIsInstance(command.root, ScrollDocument)
122+
self.assertEqual(command.model_dump(), function_call)
123+
124+
def test_scroll_at(self):
125+
function_call = {
126+
"name": "scroll_at",
127+
"args": {"x": 1, "y": 2, "direction": "up", "magnitude": 10},
128+
}
129+
command = CommandModel.model_validate(
130+
function_call,
131+
)
132+
self.assertIsInstance(command.root, ScrollAt)
133+
self.assertEqual(command.model_dump(), function_call)
134+
135+
def test_drag_and_drop(self):
136+
function_call = {
137+
"name": "drag_and_drop",
138+
"args": {"x": 1, "y": 2, "destination_x": 3, "destination_y": 4},
139+
}
140+
command = CommandModel.model_validate(
141+
function_call,
142+
)
143+
self.assertIsInstance(command.root, DragAndDrop)
144+
self.assertEqual(command.model_dump(), function_call)
145+
80146
def test_go_back(self):
81147
function_call = {"name": "go_back", "args": {}}
82148
command = CommandModel.model_validate(function_call)

computers/cloud_run/cloud_run.py

Lines changed: 38 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
import base64
1515
import termcolor
1616
import time
17-
from typing import Any, Optional
17+
from typing import Any, Optional, Literal
1818
import requests
1919
from urllib.parse import urljoin
2020
from ..computer import (
@@ -110,25 +110,46 @@ def _run_command(
110110
def open_web_browser(self) -> EnvState:
111111
return self._run_command("open_web_browser")
112112

113-
def click_at(self, y, x):
113+
def click_at(self, x: int, y: int) -> EnvState:
114114
return self._run_command("click_at", args={"x": x, "y": y})
115115

116-
def hover_at(self, y, x):
116+
def hover_at(self, x: int, y: int) -> EnvState:
117117
return self._run_command("hover_at", args={"x": x, "y": y})
118118

119-
def type_text_at(self, x: int, y: int, text: str) -> EnvState:
119+
def type_text_at(
120+
self,
121+
x: int,
122+
y: int,
123+
text: str,
124+
press_enter: bool,
125+
clear_before_typing: bool,
126+
) -> EnvState:
120127
return self._run_command(
121128
"type_text_at",
122129
args={
123130
"x": x,
124131
"y": y,
125132
"text": text,
133+
"press_enter": press_enter,
134+
"clear_before_typing": clear_before_typing,
126135
},
127136
)
128137

129138
def scroll_document(self, direction: str) -> EnvState:
130139
return self._run_command("scroll_document", args={"direction": direction})
131140

141+
def scroll_at(
142+
self,
143+
x: int,
144+
y: int,
145+
direction: Literal["up", "down", "left", "right"],
146+
magnitude: int,
147+
) -> EnvState:
148+
return self._run_command(
149+
"scroll_at",
150+
args={"x": x, "y": y, "direction": direction, "magnitude": magnitude},
151+
)
152+
132153
def wait_5_seconds(self) -> EnvState:
133154
return self._run_command("wait_5_seconds")
134155

@@ -147,6 +168,19 @@ def navigate(self, url: str) -> EnvState:
147168
def key_combination(self, keys: list[str]) -> EnvState:
148169
return self._run_command("key_combination", args={"keys": "+".join(keys)})
149170

171+
def drag_and_drop(
172+
self, x: int, y: int, destination_x: int, destination_y: int
173+
) -> EnvState:
174+
return self._run_command(
175+
"drag_and_drop",
176+
args={
177+
"x": x,
178+
"y": y,
179+
"destination_x": destination_x,
180+
"destination_y": destination_y,
181+
},
182+
)
183+
150184
def current_state(self) -> EnvState:
151185
return self._run_command("screenshot")
152186

0 commit comments

Comments (0)