|
1 | 1 | /** |
2 | | - * Tools for interacting with web pages: click, type, hover, drag, fill forms, etc. |
| 2 | + * Tools for interacting with web pages: click, type, hover, drag, etc. |
3 | 3 | */ |
4 | 4 | import { z } from "zod"; |
5 | 5 | import { zodToJsonSchema } from "zod-to-json-schema"; |
6 | 6 | import { callExtension, waitForBridgeConnection, hasExtensionConnection } from "../utils/bridge.js"; |
7 | 7 | import { captureAriaSnapshot } from "../utils/aria-snapshot.js"; |
8 | | -import type { Tool } from "./tool.js"; |
9 | | - |
10 | | -/** |
11 | | - * Click an element on the page by CSS selector |
12 | | - */ |
13 | | -const ClickSchema = z.object({ |
14 | | - selector: z.string().describe("CSS selector for the element to click (e.g., '#submit-btn', '.nav-link', 'button[type=\"submit\"]')"), |
15 | | - waitForNavigation: z.boolean().optional().describe("Whether to wait for navigation after clicking (default: true)"), |
| 8 | +import type { Tool, ToolResult } from "./tool.js"; |
| 9 | + |
| 10 | +const ElementSchema = z.object({ |
| 11 | + element: z.string().describe("Human-readable element description from the browser snapshot"), |
| 12 | + ref: z.string().describe("Exact target element reference from the browser snapshot"), |
| 13 | + selector: z |
| 14 | + .string() |
| 15 | + .optional() |
| 16 | + .describe("Optional CSS selector fallback (legacy). Use ref from browser_snapshot whenever possible."), |
16 | 17 | }); |
17 | 18 |
|
18 | | -export const click: Tool = { |
| 19 | +const ClickSchema = ElementSchema; |
| 20 | + |
| 21 | +export const browserClick: Tool = { |
19 | 22 | schema: { |
20 | | - name: "click", |
21 | | - description: "Click an element on the currently active tab using a CSS selector. First use navigate_browser to load a page, then use this tool to interact with elements. Returns snapshot of the page after clicking.", |
| 23 | + name: "browser_click", |
| 24 | + description: "Perform click on a web page", |
22 | 25 | inputSchema: zodToJsonSchema(ClickSchema) as any, |
23 | 26 | }, |
24 | 27 | handle: async (params) => { |
25 | 28 | if (!hasExtensionConnection()) { |
26 | 29 | await waitForBridgeConnection(4000); |
27 | 30 | } |
28 | | - const data = await callExtension("click_element", params); |
29 | 31 |
|
30 | | - // Return snapshot after clicking |
31 | | - return captureAriaSnapshot(data.data.finalUrl, `Clicked "${params.selector}"`); |
| 32 | + await callExtension("click_element", params); |
| 33 | + const snapshot = await captureAriaSnapshot(); |
| 34 | + return withActionText(`Clicked "${params.element}"`, snapshot); |
32 | 35 | }, |
33 | 36 | }; |
34 | 37 |
|
35 | | -/** |
36 | | - * Type text into an element |
37 | | - */ |
38 | | -const TypeSchema = z.object({ |
39 | | - selector: z.string().describe("CSS selector for the input element"), |
| 38 | +const TypeSchema = ElementSchema.extend({ |
40 | 39 | text: z.string().describe("Text to type into the element"), |
41 | | - clear: z.boolean().optional().describe("Whether to clear existing text before typing (default: true)"), |
42 | | - pressEnter: z.boolean().optional().describe("Whether to press Enter after typing (useful for submitting forms or sending messages, default: false)"), |
| 40 | + submit: z.boolean().optional().describe("Whether to submit entered text (press Enter after)"), |
43 | 41 | }); |
44 | 42 |
|
45 | | -export const type: Tool = { |
| 43 | +export const browserType: Tool = { |
46 | 44 | schema: { |
47 | | - name: "type", |
48 | | - description: "Type text into a form field or input element on the currently active tab. Supports regular inputs, textareas, and contenteditable elements (like Slack, Discord). First use navigate_browser to load a page, then use this tool to interact with elements. Set pressEnter=true to submit forms or send messages after typing.", |
| 45 | + name: "browser_type", |
| 46 | + description: "Type text into editable element", |
49 | 47 | inputSchema: zodToJsonSchema(TypeSchema) as any, |
50 | 48 | }, |
51 | 49 | handle: async (params) => { |
52 | 50 | if (!hasExtensionConnection()) { |
53 | 51 | await waitForBridgeConnection(4000); |
54 | 52 | } |
55 | | - const data = await callExtension("type_text", params); |
56 | 53 |
|
57 | | - const action = params.pressEnter ? `Typed "${params.text}" and pressed Enter` : `Typed "${params.text}"`; |
58 | | - return captureAriaSnapshot(data.data.url, `${action} into "${params.selector}"`); |
| 54 | + await callExtension("type_text", { ...params, pressEnter: params.submit === true }); |
| 55 | + const action = params.submit ? `Typed "${params.text}" and pressed Enter` : `Typed "${params.text}"`; |
| 56 | + const snapshot = await captureAriaSnapshot(); |
| 57 | + return withActionText(`${action} into "${params.element}"`, snapshot); |
59 | 58 | }, |
60 | 59 | }; |
61 | 60 |
|
62 | | -/** |
63 | | - * Hover over an element |
64 | | - */ |
65 | | -const HoverSchema = z.object({ |
66 | | - selector: z.string().describe("CSS selector for the element to hover over"), |
67 | | -}); |
| 61 | +const HoverSchema = ElementSchema; |
68 | 62 |
|
69 | | -export const hover: Tool = { |
| 63 | +export const browserHover: Tool = { |
70 | 64 | schema: { |
71 | | - name: "hover", |
72 | | - description: "Hover the mouse over an element on the currently active tab to trigger hover effects, tooltips, or dropdowns. First use navigate_browser to load a page, then use this tool to interact with elements.", |
| 65 | + name: "browser_hover", |
| 66 | + description: "Hover over element on page", |
73 | 67 | inputSchema: zodToJsonSchema(HoverSchema) as any, |
74 | 68 | }, |
75 | 69 | handle: async (params) => { |
76 | 70 | if (!hasExtensionConnection()) { |
77 | 71 | await waitForBridgeConnection(4000); |
78 | 72 | } |
79 | | - const data = await callExtension("hover_element", params); |
80 | 73 |
|
81 | | - return captureAriaSnapshot(data.data.url, `Hovered over "${params.selector}"`); |
| 74 | + await callExtension("hover_element", params); |
| 75 | + const snapshot = await captureAriaSnapshot(); |
| 76 | + return withActionText(`Hovered over "${params.element}"`, snapshot); |
82 | 77 | }, |
83 | 78 | }; |
84 | 79 |
|
85 | | -/** |
86 | | - * Select an option from a dropdown |
87 | | - */ |
88 | | -const SelectOptionSchema = z.object({ |
89 | | - selector: z.string().describe("CSS selector for the select element"), |
90 | | - value: z.string().describe("The option value or visible text to select"), |
| 80 | +const SelectOptionSchema = ElementSchema.extend({ |
| 81 | + values: z.array(z.string()).min(1).describe("Array of values to select in the dropdown"), |
91 | 82 | }); |
92 | 83 |
|
93 | | -export const selectOption: Tool = { |
| 84 | +export const browserSelectOption: Tool = { |
94 | 85 | schema: { |
95 | | - name: "select_option", |
96 | | - description: "Select an option from a dropdown/select element on the currently active tab by value or visible text. First use navigate_browser to load a page, then use this tool to interact with elements.", |
| 86 | + name: "browser_select_option", |
| 87 | + description: "Select an option in a dropdown", |
97 | 88 | inputSchema: zodToJsonSchema(SelectOptionSchema) as any, |
98 | 89 | }, |
99 | 90 | handle: async (params) => { |
100 | 91 | if (!hasExtensionConnection()) { |
101 | 92 | await waitForBridgeConnection(4000); |
102 | 93 | } |
103 | | - const data = await callExtension("select_option", params); |
104 | 94 |
|
105 | | - return captureAriaSnapshot(data.data.url, `Selected option "${params.value}" in "${params.selector}"`); |
| 95 | + await callExtension("select_option", params); |
| 96 | + const snapshot = await captureAriaSnapshot(); |
| 97 | + return withActionText(`Selected option in "${params.element}"`, snapshot); |
106 | 98 | }, |
107 | 99 | }; |
108 | 100 |
|
109 | | -/** |
110 | | - * Fill multiple form fields at once |
111 | | - */ |
112 | 101 | const FillFormFieldSchema = z.object({ |
113 | | - selector: z.string().describe("CSS selector for the form field"), |
| 102 | + selector: z |
| 103 | + .string() |
| 104 | + .optional() |
| 105 | + .describe("CSS selector for the form field (legacy fallback, prefer ref)"), |
| 106 | + ref: z.string().optional().describe("Element reference from browser_snapshot"), |
114 | 107 | value: z.string().describe("Value to set (use 'true'/'false' for checkboxes)"), |
115 | 108 | }); |
116 | 109 |
|
117 | 110 | const FillFormSchema = z.object({ |
118 | 111 | fields: z.array(FillFormFieldSchema).min(1).describe("Array of fields to fill"), |
119 | 112 | }); |
120 | 113 |
|
121 | | -export const fillForm: Tool = { |
| 114 | +export const browserFillForm: Tool = { |
122 | 115 | schema: { |
123 | | - name: "fill_form", |
124 | | - description: "Fill multiple form fields at once on the currently active tab. First use navigate_browser to load a page, then use this tool to interact with elements. Supports text inputs, selects, checkboxes, and radio buttons.", |
| 116 | + name: "browser_fill_form", |
| 117 | + description: "Fill multiple form fields (inputs, selects, checkboxes, radios) by selector/value.", |
125 | 118 | inputSchema: zodToJsonSchema(FillFormSchema) as any, |
126 | 119 | }, |
127 | 120 | handle: async (params) => { |
128 | 121 | if (!hasExtensionConnection()) { |
129 | 122 | await waitForBridgeConnection(4000); |
130 | 123 | } |
131 | | - const data = await callExtension("fill_form", params); |
132 | 124 |
|
133 | | - const fieldCount = data.data.successfulFields || 0; |
134 | | - return captureAriaSnapshot(data.data.url, `Filled ${fieldCount} form fields`); |
| 125 | + const data = await callExtension("browser_fill_form", params); |
| 126 | + const fieldCount = data?.data?.successfulFields || params.fields.length; |
| 127 | + const snapshot = await captureAriaSnapshot(); |
| 128 | + return withActionText(`Filled ${fieldCount} form fields`, snapshot); |
135 | 129 | }, |
136 | 130 | }; |
137 | 131 |
|
138 | | -/** |
139 | | - * Execute custom JavaScript on the page |
140 | | - */ |
141 | | -const ExecuteScriptSchema = z.object({ |
142 | | - script: z.string().describe("The JavaScript code to execute. Should be a function body that returns a value."), |
143 | | - args: z.array(z.union([ |
144 | | - z.string(), |
145 | | - z.number(), |
146 | | - z.boolean(), |
147 | | - z.null(), |
148 | | - z.record(z.unknown()), |
149 | | - ])).optional().describe("Optional array of arguments to pass to the script (supports strings, numbers, booleans, null, and objects)"), |
| 132 | +const PressKeySchema = z.object({ |
| 133 | + key: z.string().describe("Name of the key to press or character to generate (e.g., 'Enter', 'ArrowLeft', 'a')"), |
150 | 134 | }); |
151 | 135 |
|
152 | | -export const executeScript: Tool = { |
| 136 | +export const browserPressKey: Tool = { |
153 | 137 | schema: { |
154 | | - name: "execute_script", |
155 | | - description: "Execute custom JavaScript code on the currently active tab and return the result. First use navigate_browser to load a page, then use this tool to execute scripts. Use with caution.", |
156 | | - inputSchema: zodToJsonSchema(ExecuteScriptSchema) as any, |
| 138 | + name: "browser_press_key", |
| 139 | + description: "Press a key on the keyboard", |
| 140 | + inputSchema: zodToJsonSchema(PressKeySchema) as any, |
157 | 141 | }, |
158 | 142 | handle: async (params) => { |
159 | 143 | if (!hasExtensionConnection()) { |
160 | 144 | await waitForBridgeConnection(4000); |
161 | 145 | } |
162 | | - const data = await callExtension("execute_script", params); |
163 | | - |
164 | | - return { |
165 | | - content: [ |
166 | | - { |
167 | | - type: "text", |
168 | | - text: `Script executed successfully on ${data.data.url}. Result:\n\`\`\`json\n${JSON.stringify(data.data.result, null, 2)}\n\`\`\``, |
169 | | - }, |
170 | | - ], |
171 | | - _meta: { urls: [data.data.url] }, |
172 | | - }; |
| 146 | + |
| 147 | + await callExtension("press_key", params); |
| 148 | + const snapshot = await captureAriaSnapshot(); |
| 149 | + return withActionText(`Pressed key ${params.key}`, snapshot); |
| 150 | + }, |
| 151 | +}; |
| 152 | + |
| 153 | +const DragSchema = z.object({ |
| 154 | + startElement: z.string().describe("Human-readable source element description"), |
| 155 | + startRef: z.string().describe("Source element reference from browser_snapshot"), |
| 156 | + startSelector: z.string().optional().describe("Optional CSS selector fallback for the source element"), |
| 157 | + endElement: z.string().describe("Human-readable target element description"), |
| 158 | + endRef: z.string().describe("Target element reference from browser_snapshot"), |
| 159 | + endSelector: z.string().optional().describe("Optional CSS selector fallback for the target element"), |
| 160 | +}); |
| 161 | + |
| 162 | +export const browserDrag: Tool = { |
| 163 | + schema: { |
| 164 | + name: "browser_drag", |
| 165 | + description: "Perform drag and drop between two elements", |
| 166 | + inputSchema: zodToJsonSchema(DragSchema) as any, |
| 167 | + }, |
| 168 | + handle: async (params) => { |
| 169 | + if (!hasExtensionConnection()) { |
| 170 | + await waitForBridgeConnection(4000); |
| 171 | + } |
| 172 | + |
| 173 | + await callExtension("drag_element", params); |
| 174 | + const snapshot = await captureAriaSnapshot(); |
| 175 | + return withActionText(`Dragged "${params.startElement}" to "${params.endElement}"`, snapshot); |
173 | 176 | }, |
174 | 177 | }; |
| 178 | + |
| 179 | +function withActionText(action: string, snapshot: ToolResult): ToolResult { |
| 180 | + const existing = Array.isArray(snapshot.content) ? snapshot.content : []; |
| 181 | + return { |
| 182 | + ...snapshot, |
| 183 | + content: [ |
| 184 | + { |
| 185 | + type: "text", |
| 186 | + text: action, |
| 187 | + }, |
| 188 | + ...existing, |
| 189 | + ], |
| 190 | + }; |
| 191 | +} |
0 commit comments