3030
3131from computers import EnvState , Computer
3232
33+ MAX_RECENT_TURN_WITH_SCREENSHOTS = 3  # how many of the most recent screenshot-bearing user turns keep their images
34+ PREDEFINED_COMPUTER_USE_FUNCTIONS = [  # function-response names whose attached screenshots are subject to pruning
35+ "open_web_browser" ,
36+ "click_at" ,
37+ "hover_at" ,
38+ "type_text_at" ,
39+ "scroll_document" ,
40+ "scroll_at" ,
41+ "wait_5_seconds" ,
42+ "go_back" ,
43+ "go_forward" ,
44+ "search" ,
45+ "navigate" ,
46+ "key_combination" ,
47+ "drag_and_drop" ,
48+ ]
49+
50+
3351console = Console ()
3452
3553# Built-in Computer Use tools will return "EnvState".
@@ -60,12 +78,6 @@ def __init__(
6078 vertexai = os .environ .get ("USE_VERTEXAI" , "0" ).lower () in ["true" , "1" ],
6179 project = os .environ .get ("VERTEXAI_PROJECT" ),
6280 location = os .environ .get ("VERTEXAI_LOCATION" ),
63- http_options = types .HttpOptions (
64- api_version = "v1alpha" ,
65- base_url = os .environ .get (
66- "GEMINI_API_SERVER" , "https://generativelanguage.googleapis.com"
67- ),
68- ),
6981 )
7082 self ._contents : list [Content ] = [
7183 Content (
@@ -101,7 +113,6 @@ def __init__(
101113 ),
102114 types .Tool (function_declarations = custom_functions ),
103115 ],
104- thinking_config = types .ThinkingConfig (include_thoughts = True ),
105116 )
106117
107118 def handle_action (self , action : types .FunctionCall ) -> FunctionResponseT :
@@ -321,9 +332,9 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
321332 "url" : fc_result .url ,
322333 ** extra_fr_fields ,
323334 },
324- data = [
325- types .Part (
326- inline_data = types .Blob (
335+ parts = [
336+ types .FunctionResponsePart (
337+ inline_data = types .FunctionResponseBlob (
327338 mime_type = "image/png" , data = fc_result .screenshot
328339 )
329340 )
@@ -342,6 +353,35 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
342353 )
343354 )
344355
356+ # Only keep screenshots in the few most recent turns; remove the screenshot images from older turns.
357+ turn_with_screenshots_found = 0
358+ for content in reversed (self ._contents ):
359+ if content .role == "user" and content .parts :
360+ # check if content has screenshot of the predefined computer use functions.
361+ has_screenshot = False
362+ for part in content .parts :
363+ if (
364+ part .function_response
365+ and part .function_response .parts
366+ and part .function_response .name
367+ in PREDEFINED_COMPUTER_USE_FUNCTIONS
368+ ):
369+ has_screenshot = True
370+ break
371+
372+ if has_screenshot :
373+ turn_with_screenshots_found += 1
374+ # Remove the screenshot images once the number of turns with screenshots exceeds the limit.
375+ if turn_with_screenshots_found > MAX_RECENT_TURN_WITH_SCREENSHOTS :
376+ for part in content .parts :
377+ if (
378+ part .function_response
379+ and part .function_response .parts
380+ and part .function_response .name
381+ in PREDEFINED_COMPUTER_USE_FUNCTIONS
382+ ):
383+ part .function_response .parts = None
384+
345385 return "CONTINUE"
346386
347387 def _get_safety_confirmation (
0 commit comments