2 changes: 1 addition & 1 deletion sdk/ai/Azure.AI.Agents/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "net",
"TagPrefix": "net/ai/Azure.AI.Agents",
"Tag": "net/ai/Azure.AI.Agents_24e62d0814"
"Tag": "net/ai/Azure.AI.Agents_b452a3b368"
}
272 changes: 272 additions & 0 deletions sdk/ai/Azure.AI.Agents/samples/Sample10_ComputerUse.md
@@ -0,0 +1,272 @@
# Sample for using an agent with the Computer Use tool in Azure.AI.Agents.

To enable the Computer Use tool for your Agent, add a `ComputerTool` when creating the `PromptAgentDefinition`.
1. First, we need to create an `AgentsClient` and read the environment variables that will be used in the next steps.

```C# Snippet:Sample_CreateAgentClient_ComputerUse
var projectEndpoint = System.Environment.GetEnvironmentVariable("PROJECT_ENDPOINT");
var modelDeploymentName = System.Environment.GetEnvironmentVariable("COMPUTER_USE_DEPLOYMENT_NAME");
AgentsClient client = new(endpoint: new Uri(projectEndpoint), tokenProvider: new DefaultAzureCredential());
OpenAIClient openAIClient = client.GetOpenAIClient();
```
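
`Environment.GetEnvironmentVariable` returns `null` when a variable is not set, so it can help to fail fast before constructing the client. A minimal sketch, assuming the same variable names as above:

```C#
// Sketch: guard against missing configuration before creating the client.
var endpointValue = Environment.GetEnvironmentVariable("PROJECT_ENDPOINT")
    ?? throw new InvalidOperationException("Set the PROJECT_ENDPOINT environment variable.");
```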

2. To use the tool, we need to read image files with the `ReadImageFile` helper method. The `[CallerFilePath]` attribute resolves the file name relative to this source file rather than the process working directory.

Synchronous sample:
```C# Snippet:Sample_ReadImageFile_ComputerUse
private static BinaryData ReadImageFile(string name, [CallerFilePath] string pth = "")
{
    var dirName = Path.GetDirectoryName(pth) ?? "";
    return new BinaryData(File.ReadAllBytes(Path.Combine(dirName, name)));
}
```
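
A corresponding asynchronous helper is a straightforward sketch (hypothetical, not part of the recorded snippets; assumes `File.ReadAllBytesAsync` is available on your target framework):

```C#
// Hypothetical async variant of the ReadImageFile helper above.
private static async Task<BinaryData> ReadImageFileAsync(string name, [CallerFilePath] string pth = "")
{
    var dirName = Path.GetDirectoryName(pth) ?? "";
    return new BinaryData(await File.ReadAllBytesAsync(Path.Combine(dirName, name)));
}
```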

3. In this example we read three toy screenshots and place them into a dictionary.

```C# Snippet:Sample_ReadImageFilesToDictionaries_ComputerUse
Dictionary<string, BinaryData> screenshots = new() {
    { "browser_search", ReadImageFile("Assets/cua_browser_search.png") },
    { "search_typed", ReadImageFile("Assets/cua_search_typed.png") },
    { "search_results", ReadImageFile("Assets/cua_search_results.png") },
};
```

4. Create a `PromptAgentDefinition` with `ComputerTool`.

Synchronous sample:
```C# Snippet:Sample_CreateAgent_ComputerUse_Sync
PromptAgentDefinition agentDefinition = new(model: modelDeploymentName)
{
    Instructions = "You are a computer automation assistant.\n\n" +
        "Be direct and efficient. When you reach the search results page, read and describe the actual search result titles and descriptions you can see.",
    Tools = {
        ResponseTool.CreateComputerTool(
            environment: new ComputerToolEnvironment("windows"),
            displayWidth: 1026,
            displayHeight: 769
        ),
    }
};
AgentVersion agentVersion = client.CreateAgentVersion(
    agentName: "myAgent",
    definition: agentDefinition,
    options: null);
```

Asynchronous sample:
```C# Snippet:Sample_CreateAgent_ComputerUse_Async
PromptAgentDefinition agentDefinition = new(model: modelDeploymentName)
{
    Instructions = "You are a computer automation assistant.\n\n" +
        "Be direct and efficient. When you reach the search results page, read and describe the actual search result titles and descriptions you can see.",
    Tools = {
        ResponseTool.CreateComputerTool(
            environment: new ComputerToolEnvironment("windows"),
            displayWidth: 1026,
            displayHeight: 769
        ),
    }
};
AgentVersion agentVersion = await client.CreateAgentVersionAsync(
    agentName: "myAgent",
    definition: agentDefinition,
    options: null);
```
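
The environment string and the display dimensions describe the surface the model will drive. This sample uses `"windows"`; the underlying computer-use tool also commonly documents `"browser"`, `"mac"`, and `"ubuntu"` environments. A sketch of a browser-driving variant (the environment value and dimensions here are assumptions, not taken from this sample):

```C#
// Hypothetical: the same tool factory pointed at a browser environment.
ResponseTool browserTool = ResponseTool.CreateComputerTool(
    environment: new ComputerToolEnvironment("browser"), // assumed value; this sample only demonstrates "windows"
    displayWidth: 1280,
    displayHeight: 800);
```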

5. Create a helper method to parse the `ComputerTool` outputs and to respond to the Agent's queries with new screenshots. Note that throughout this sample we set the media type for each image; Agents support the `image/jpeg`, `image/png`, `image/gif`, and `image/webp` media types (a small sketch follows the helper below).

```C# Snippet:Sample_ProcessComputerUseCall_ComputerUse
private static string ProcessComputerUseCall(ComputerCallResponseItem item, string oldScreenshot)
{
    // Default: keep the current state unless an action below changes it.
    string currentScreenshot = oldScreenshot;
    switch (item.Action.Kind)
    {
        case ComputerCallActionKind.Type:
            Console.WriteLine($"  Typing text \"{item.Action.TypeText}\" - Simulating keyboard input");
            currentScreenshot = "search_typed";
            break;
        case ComputerCallActionKind.KeyPress:
            HashSet<string> codes = [.. item.Action.KeyPressKeyCodes];
            if (codes.Contains("Return") || codes.Contains("ENTER"))
            {
                // If we have typed the value into the search field, go to the search results.
                if (string.Equals(oldScreenshot, "search_typed"))
                {
                    Console.WriteLine("  -> Detected ENTER key press when the search field was populated, displaying results.");
                    currentScreenshot = "search_results";
                }
                else
                {
                    Console.WriteLine("  -> Detected ENTER key press on results or an unpopulated search field, do nothing.");
                    currentScreenshot = oldScreenshot;
                }
            }
            else
            {
                Console.WriteLine($"  Key press: {string.Join("+", item.Action.KeyPressKeyCodes)} - Simulating key combination");
            }
            break;
        case ComputerCallActionKind.Click:
            Console.WriteLine($"  Click at ({item.Action.ClickCoordinates.Value.X}, {item.Action.ClickCoordinates.Value.Y}) - Simulating click on UI element");
            if (string.Equals(oldScreenshot, "search_typed"))
            {
                Console.WriteLine("  -> Assuming click on the Search button when the search field was populated, displaying results.");
                currentScreenshot = "search_results";
            }
            else
            {
                Console.WriteLine("  -> Assuming click on Search on results or when the search field was not populated, do nothing.");
                currentScreenshot = oldScreenshot;
            }
            break;
        case ComputerCallActionKind.Drag:
            string pathStr = string.Join(" -> ", item.Action.DragPath.Select(p => $"({p.X}, {p.Y})"));
            Console.WriteLine($"  Drag path: {pathStr} - Simulating drag operation");
            break;
        case ComputerCallActionKind.Scroll:
            Console.WriteLine($"  Scroll at ({item.Action.ScrollCoordinates.Value.X}, {item.Action.ScrollCoordinates.Value.Y}) - Simulating scroll action");
            break;
        case ComputerCallActionKind.Screenshot:
            Console.WriteLine("  Taking screenshot - Capturing current screen state");
            break;
        default:
            break;
    }
    Console.WriteLine($"  -> Action processed: {item.Action.Kind}");

    return currentScreenshot;
}
```
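
As noted above, several image media types are accepted. A minimal sketch of answering with a JPEG screenshot instead of a PNG (the `.jpg` asset name is hypothetical; only the media type string changes):

```C#
// Hypothetical: return a JPEG screenshot to the agent.
BinaryData jpegScreenshot = ReadImageFile("Assets/cua_browser_search.jpg"); // hypothetical asset
ComputerCallOutput jpegOutput = ComputerCallOutput.CreateScreenshotOutput(
    screenshotImageBytes: jpegScreenshot,
    screenshotImageBytesMediaType: "image/jpeg");
```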

6. For brevity, create helper methods that poll until the response reaches a terminal status.

Synchronous sample:
```C# Snippet:Sample_WaitForResponse_ComputerUse_Sync
public static OpenAIResponse CreateAndWaitForResponse(OpenAIResponseClient responseClient, IEnumerable<ResponseItem> items, ResponseCreationOptions options)
{
    OpenAIResponse response = responseClient.CreateResponse(
        inputItems: items,
        options: options);
    while (response.Status != ResponseStatus.Incomplete && response.Status != ResponseStatus.Failed && response.Status != ResponseStatus.Completed)
    {
        Thread.Sleep(TimeSpan.FromMilliseconds(500));
        response = responseClient.GetResponse(responseId: response.Id);
    }
    Assert.That(response.Status, Is.EqualTo(ResponseStatus.Completed));
    return response;
}
```

Asynchronous sample:
```C# Snippet:Sample_WaitForResponse_ComputerUse_Async
public static async Task<OpenAIResponse> CreateAndWaitForResponseAsync(OpenAIResponseClient responseClient, IEnumerable<ResponseItem> items, ResponseCreationOptions options)
{
    OpenAIResponse response = await responseClient.CreateResponseAsync(
        inputItems: items,
        options: options);
    while (response.Status != ResponseStatus.Incomplete && response.Status != ResponseStatus.Failed && response.Status != ResponseStatus.Completed)
    {
        await Task.Delay(TimeSpan.FromMilliseconds(500));
        response = await responseClient.GetResponseAsync(responseId: response.Id);
    }
    Assert.That(response.Status, Is.EqualTo(ResponseStatus.Completed));
    return response;
}
```
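
These helpers poll at a fixed 500 ms interval with no upper bound. In application code you would likely cap the wait; a minimal sketch with a deadline (the helper name and the timeout idea are additions, not part of this sample):

```C#
// Sketch: the same polling loop, but bounded by a deadline.
public static async Task<OpenAIResponse> WaitForResponseWithTimeoutAsync(
    OpenAIResponseClient responseClient, OpenAIResponse response, TimeSpan timeout)
{
    DateTimeOffset deadline = DateTimeOffset.UtcNow + timeout;
    while (response.Status != ResponseStatus.Incomplete && response.Status != ResponseStatus.Failed && response.Status != ResponseStatus.Completed)
    {
        if (DateTimeOffset.UtcNow >= deadline)
        {
            throw new TimeoutException($"Response {response.Id} did not reach a terminal status within {timeout}.");
        }
        await Task.Delay(TimeSpan.FromMilliseconds(500));
        response = await responseClient.GetResponseAsync(responseId: response.Id);
    }
    return response;
}
```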

7. Create an `OpenAIResponse` using a `ResponseItem` that contains two `ResponseContentPart` objects: one with the image and one with the text. The loop keeps calling the Agent while it continues to browse the web, echoing each output item back and answering every computer call with the appropriate screenshot; `limitIteration` caps the number of round trips. Finally, print the tool output message.

Synchronous sample:
```C# Snippet:Sample_CreateResponse_ComputerUse_Sync
OpenAIResponseClient responseClient = openAIClient.GetOpenAIResponseClient(modelDeploymentName);
ResponseCreationOptions responseOptions = new();
responseOptions.SetAgentReference(new AgentReference(name: agentVersion.Name));
responseOptions.TruncationMode = ResponseTruncationMode.Auto;
string currentScreenshot = "browser_search";
ResponseItem request = ResponseItem.CreateUserMessageItem(
[
    ResponseContentPart.CreateInputTextPart("I need you to help me search for 'OpenAI news'. Please type 'OpenAI news' and submit the search. Once you see search results, the task is complete."),
    ResponseContentPart.CreateInputImagePart(imageBytes: screenshots["browser_search"], imageBytesMediaType: "image/png", imageDetailLevel: ResponseImageDetailLevel.High)
]);
List<ResponseItem> inputItems = [request];
bool computerUseCalled = false;
int limitIteration = 10;
OpenAIResponse response;
do
{
    response = CreateAndWaitForResponse(
        responseClient,
        inputItems,
        responseOptions);
    computerUseCalled = false;
    inputItems.Clear();
    responseOptions.PreviousResponseId = response.Id;
    foreach (ResponseItem responseItem in response.OutputItems)
    {
        inputItems.Add(responseItem);
        if (responseItem is ComputerCallResponseItem computerCall)
        {
            currentScreenshot = ProcessComputerUseCall(computerCall, currentScreenshot);
            inputItems.Add(ResponseItem.CreateComputerCallOutputItem(callId: computerCall.CallId, output: ComputerCallOutput.CreateScreenshotOutput(screenshotImageBytes: screenshots[currentScreenshot], screenshotImageBytesMediaType: "image/png")));
            computerUseCalled = true;
        }
    }
    limitIteration--;
} while (computerUseCalled && limitIteration > 0);
Console.WriteLine(response.GetOutputText());
```

Asynchronous sample:
```C# Snippet:Sample_CreateResponse_ComputerUse_Async
OpenAIResponseClient responseClient = openAIClient.GetOpenAIResponseClient(modelDeploymentName);
ResponseCreationOptions responseOptions = new();
responseOptions.SetAgentReference(new AgentReference(name: agentVersion.Name));
responseOptions.TruncationMode = ResponseTruncationMode.Auto;
ResponseItem request = ResponseItem.CreateUserMessageItem(
[
    ResponseContentPart.CreateInputTextPart("I need you to help me search for 'OpenAI news'. Please type 'OpenAI news' and submit the search. Once you see search results, the task is complete."),
    ResponseContentPart.CreateInputImagePart(imageBytes: screenshots["browser_search"], imageBytesMediaType: "image/png", imageDetailLevel: ResponseImageDetailLevel.High)
]);
List<ResponseItem> inputItems = [request];
bool computerUseCalled = false;
string currentScreenshot = "browser_search";
int limitIteration = 10;
OpenAIResponse response;
do
{
    response = await CreateAndWaitForResponseAsync(
        responseClient,
        inputItems,
        responseOptions);
    computerUseCalled = false;
    responseOptions.PreviousResponseId = response.Id;
    inputItems.Clear();
    foreach (ResponseItem responseItem in response.OutputItems)
    {
        inputItems.Add(responseItem);
        if (responseItem is ComputerCallResponseItem computerCall)
        {
            currentScreenshot = ProcessComputerUseCall(computerCall, currentScreenshot);
            inputItems.Add(ResponseItem.CreateComputerCallOutputItem(callId: computerCall.CallId, output: ComputerCallOutput.CreateScreenshotOutput(screenshotImageBytes: screenshots[currentScreenshot], screenshotImageBytesMediaType: "image/png")));
            computerUseCalled = true;
        }
    }
    limitIteration--;
} while (computerUseCalled && limitIteration > 0);
Console.WriteLine(response.GetOutputText());
```

8. Clean up resources by deleting the Agent version.

Synchronous sample:
```C# Snippet:Sample_Cleanup_ComputerUse_Sync
client.DeleteAgentVersion(agentName: agentVersion.Name, agentVersion: agentVersion.Version);
```

Asynchronous sample:
```C# Snippet:Sample_Cleanup_ComputerUse_Async
await client.DeleteAgentVersionAsync(agentName: agentVersion.Name, agentVersion: agentVersion.Version);
```
4 changes: 2 additions & 2 deletions sdk/ai/Azure.AI.Agents/samples/Sample8_FileSearch.md
@@ -93,7 +93,7 @@ OpenAIResponseClient responseClient = openAIClient.GetOpenAIResponseClient(model
ResponseCreationOptions responseOptions = new();
responseOptions.SetAgentReference(new AgentReference(name: agentVersion.Name));

ResponseItem request = ResponseItem.CreateUserMessageItem("The word 'apple' uses the code 442345, while the word 'banana' uses the code 673457.");
ResponseItem request = ResponseItem.CreateUserMessageItem("Can you give me the documented codes for 'banana' and 'orange'?");
OpenAIResponse response = responseClient.CreateResponse(
[request],
responseOptions);
@@ -105,7 +105,7 @@ OpenAIResponseClient responseClient = openAIClient.GetOpenAIResponseClient(model
ResponseCreationOptions responseOptions = new();
responseOptions.SetAgentReference(new AgentReference(name: agentVersion.Name));

ResponseItem request = ResponseItem.CreateUserMessageItem("The word 'apple' uses the code 442345, while the word 'banana' uses the code 673457.");
ResponseItem request = ResponseItem.CreateUserMessageItem("Can you give me the documented codes for 'banana' and 'orange'?");
OpenAIResponse response = await responseClient.CreateResponseAsync(
[request],
responseOptions);
3 changes: 3 additions & 0 deletions sdk/ai/Azure.AI.Agents/tests/AIAgentsTestEnvironment.cs
@@ -16,6 +16,8 @@ public class AIAgentsTestEnvironment : TestEnvironment
public string AGENT_NAME => GetRecordedVariable("AZURE_AI_FOUNDRY_AGENT_NAME");
public string MODELDEPLOYMENTNAME => GetRecordedVariable("MODEL_DEPLOYMENT_NAME");

public string COMPUTER_USE_DEPLOYMENT_NAME => GetRecordedVariable("COMPUTER_USE_DEPLOYMENT_NAME");

public override Dictionary<string, string> ParseEnvironmentFile() => new()
{
{ "OPEN-API-KEY", Environment.GetEnvironmentVariable("OPENAI_API_KEY") ?? "api-key" }
@@ -35,5 +37,6 @@ public override Task WaitForEnvironmentAsync()
public string CONTAINER_APP_RESOURCE_ID => GetRecordedVariable("CONTAINER_APP_RESOURCE_ID");
public string INGRESS_SUBDOMAIN_SUFFIX => GetRecordedVariable("INGRESS_SUBDOMAIN_SUFFIX");
public string OPENAI_FILE_ID => GetRecordedVariable("OPENAI_FILE_ID");
public string COMPUTER_SCREENSHOTS => GetRecordedVariable("COMPUTER_SCREENSHOTS");
}
}
17 changes: 14 additions & 3 deletions sdk/ai/Azure.AI.Agents/tests/AgentsTestBase.cs
@@ -6,6 +6,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
@@ -14,13 +15,13 @@
using Azure.AI.Projects;
using Azure.Identity;
using Microsoft.ClientModel.TestFramework;
using Microsoft.Extensions.Primitives;
using NUnit.Framework;
using OpenAI;
using OpenAI.Responses;
using OpenAI.VectorStores;

namespace Azure.AI.Agents.Tests;
#pragma warning disable OPENAICUA001

public class AgentsTestBase : RecordedTestBase<AIAgentsTestEnvironment>
{
@@ -51,6 +52,7 @@ public enum ToolType
{
{ToolType.None, "Hello, tell me a joke."},
{ToolType.FunctionCall, "What is the nickname for Seattle, WA?" },
{ToolType.ComputerUse, "I need you to help me search for 'OpenAI news'. Please type 'OpenAI news' and submit the search. Once you see search results, the task is complete." },
{ToolType.BingGrounding, "How does wikipedia explain Euler's Identity?" },
{ToolType.OpenAPI, "What's the weather in Seattle?"},
{ToolType.DeepResearch, "Research the current state of studies on orca intelligence and orca language, " +
@@ -77,6 +79,8 @@ public enum ToolType
{ToolType.None, "You are a prompt agent."},
{ToolType.BingGrounding, "You are helpful agent."},
{ToolType.FunctionCall, "You are helpful agent. Use the provided functions to help answer questions."},
{ToolType.ComputerUse, "You are a computer automation assistant.\n\n" +
"Be direct and efficient. When you reach the search results page, read and describe the actual search result titles and descriptions you can see." },
{ToolType.OpenAPI, "You are helpful agent."},
{ToolType.DeepResearch, "You are a helpful agent that assists in researching scientific topics."},
{ToolType.AzureAISearch, "You are a helpful agent that can search for information using Azure AI Search."},
@@ -258,6 +262,12 @@ protected void IgnoreSampleMayBe()
}
}

protected static string GetTestFile(string fileName, [CallerFilePath] string pth = "")
{
var dirName = Path.GetDirectoryName(pth) ?? "";
return Path.Combine(new string[] { dirName, "TestData", fileName });
}

#region ToolHelper
private async Task<VectorStore> GetVectorStore(OpenAIClient openAIClient)
{
@@ -283,7 +293,7 @@ private async Task<VectorStore> GetVectorStore(OpenAIClient openAIClient)
/// </summary>
/// <param name="toolType"></param>
/// <returns></returns>
protected async Task<AgentDefinition> GetAgentToolDefinition(ToolType toolType, OpenAIClient oaiClient)
protected async Task<AgentDefinition> GetAgentToolDefinition(ToolType toolType, OpenAIClient oaiClient, string model=default)
{
ResponseTool tool = toolType switch
{
@@ -326,9 +336,10 @@ protected async Task<AgentDefinition> GetAgentToolDefinition(ToolType toolType,
),
strictModeEnabled: false
),
ToolType.ComputerUse => ResponseTool.CreateComputerTool(environment: new ComputerToolEnvironment("windows"), displayWidth: 1026, displayHeight: 769),
_ => throw new InvalidOperationException($"Unknown tool type {toolType}")
};
return new PromptAgentDefinition(TestEnvironment.MODELDEPLOYMENTNAME)
return new PromptAgentDefinition(model ?? TestEnvironment.MODELDEPLOYMENTNAME)
{
Instructions = ToolInstructions[toolType],
Tools = { tool },