diff --git a/assets/evaluators/builtin/tool_output_utilization/asset.yaml b/assets/evaluators/builtin/tool_output_utilization/asset.yaml
new file mode 100644
index 0000000000..845a698b96
--- /dev/null
+++ b/assets/evaluators/builtin/tool_output_utilization/asset.yaml
@@ -0,0 +1,4 @@
+type: evaluator
+spec: spec.yaml
+categories:
+- Evaluator
\ No newline at end of file
diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py
new file mode 100644
index 0000000000..fd0ad71aa9
--- /dev/null
+++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py
@@ -0,0 +1,557 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import os
+import math
+import logging
+from enum import Enum
+from typing import Dict, Union, List
+
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._exceptions import (
+    EvaluationException,
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    ErrorMessage,
+)
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._common.utils import _extract_text_from_content
+from azure.ai.evaluation._common._experimental import experimental
+
+logger = logging.getLogger(__name__)
+
+
+# ``` updated _exceptions.py
+# Extend ErrorTarget enum if needed
+def _create_extended_error_target(ErrorTarget):
+    """Create an extended ErrorTarget enum that includes TOOL_OUTPUT_UTILIZATION_EVALUATOR."""
+    existing_members = {member.name: member.value for member in ErrorTarget}
+    existing_members["TOOL_OUTPUT_UTILIZATION_EVALUATOR"] = "ToolOutputUtilizationEvaluator"
+
+    ErrorTarget = Enum("ExtendedErrorTarget", existing_members)
+    return ErrorTarget
+
+
+ErrorTarget = _create_extended_error_target(ErrorTarget)
+# ```
+
+
+# ``` updated utils.py
+def _filter_to_used_tools(tool_definitions, msgs_lists, logger=None):
+    """Filter the tool definitions to only include those that were actually used in the messages lists."""
+    try:
+        used_tool_names = set()
+        any_tools_used = False
+        for msgs in msgs_lists:
+            for msg in msgs:
+                if msg.get("role") == "assistant" and "content" in msg:
+                    for content in msg.get("content", []):
+                        if content.get("type") == "tool_call":
+                            any_tools_used = True
+                            if "tool_call" in content and "function" in content["tool_call"]:
+                                # The nested format carries the name inside the function object;
+                                # adding the whole dict would be unhashable and defeat the filter.
+                                used_tool_names.add(content["tool_call"]["function"].get("name", ""))
+                            elif "name" in content:
+                                used_tool_names.add(content["name"])
+
+        filtered_tools = [tool for tool in tool_definitions if tool.get("name") in used_tool_names]
+        if any_tools_used and not filtered_tools:
+            if logger:
+                logger.warning("No tool definitions matched the tools used in the messages. Returning original list.")
+            filtered_tools = tool_definitions
+
+        return filtered_tools
+    except Exception as e:
+        if logger:
+            logger.warning(f"Failed to filter tool definitions, returning original list. Error: {e}")
+        return tool_definitions
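+
+
+# Illustrative sketch (not executed, hypothetical inputs): the filter keeps only the
+# definitions whose names appear in tool calls found in the supplied message lists, e.g.
+#
+#     tool_definitions = [{"name": "get_weather", ...}, {"name": "get_time", ...}]
+#     msgs = [{"role": "assistant", "content": [
+#         {"type": "tool_call", "name": "get_weather", "arguments": {"city": "Rome"}}]}]
+#     _filter_to_used_tools(tool_definitions, [msgs])
+#     # -> [{"name": "get_weather", ...}]
+#
+# If nothing matches or parsing fails, the original list is returned unchanged.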
+
+
+def _get_conversation_history(query, include_system_messages=False, include_tool_messages=False):
+    """Parse conversation history from a list of messages into structured format.
+
+    :param query: List of message dictionaries containing the conversation history
+    :type query: List[dict]
+    :param include_system_messages: Whether to include system messages in the output
+    :type include_system_messages: bool
+    :param include_tool_messages: Whether to include tool-related messages in agent responses
+    :type include_tool_messages: bool
+    :return: Dict containing parsed user_queries, agent_responses, and optionally system_message
+    :rtype: Dict[str, Union[List[List[str]], str]]
+    :raises EvaluationException: If conversation history is malformed (mismatched user/agent turns)
+    """
+    all_user_queries, all_agent_responses = [], []
+    cur_user_query, cur_agent_response = [], []
+    system_message = None
+
+    for msg in query:
+        role = msg.get("role")
+        if not role:
+            continue
+        if include_system_messages and role == "system":
+            system_message = msg.get("content", "")
+
+        elif role == "user" and "content" in msg:
+            if cur_agent_response:
+                formatted_agent_response = _get_agent_response(
+                    cur_agent_response, include_tool_messages=include_tool_messages
+                )
+                all_agent_responses.append([formatted_agent_response])
+                cur_agent_response = []
+            text_in_msg = _extract_text_from_content(msg["content"])
+            if text_in_msg:
+                cur_user_query.append(text_in_msg)
+
+        elif role in ("assistant", "tool"):
+            if cur_user_query:
+                all_user_queries.append(cur_user_query)
+                cur_user_query = []
+            cur_agent_response.append(msg)
+
+    if cur_user_query:
+        all_user_queries.append(cur_user_query)
+    if cur_agent_response:
+        formatted_agent_response = _get_agent_response(cur_agent_response, include_tool_messages=include_tool_messages)
+        all_agent_responses.append([formatted_agent_response])
+
+    if len(all_user_queries) != len(all_agent_responses) + 1:
+        raise EvaluationException(
+            message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+            internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+            target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
+    if include_system_messages and system_message:
+        result["system_message"] = system_message
+    return result
+
+
+def _pretty_format_conversation_history(conversation_history):
+    """Format the conversation history for better readability."""
+    formatted_history = ""
+    if conversation_history.get("system_message"):
+        formatted_history += "SYSTEM_PROMPT:\n"
+        formatted_history += " " + conversation_history["system_message"] + "\n\n"
+    for i, (user_query, agent_response) in enumerate(
+        zip(
+            conversation_history["user_queries"],
+            conversation_history["agent_responses"] + [None],
+        )
+    ):
+        formatted_history += f"User turn {i+1}:\n"
+        for msg in user_query:
+            formatted_history += " " + "\n ".join(msg)
+        formatted_history += "\n\n"
+        if agent_response:
+            formatted_history += f"Agent turn {i+1}:\n"
+            for msg in agent_response:
+                formatted_history += " " + "\n ".join(msg)
+            formatted_history += "\n\n"
+    return formatted_history
+
+
+def reformat_conversation_history(query, logger=None, include_system_messages=False, include_tool_messages=False):
+    """Reformats the conversation history to a more compact representation."""
+    try:
+        conversation_history = _get_conversation_history(
+            query,
+            include_system_messages=include_system_messages,
+            include_tool_messages=include_tool_messages,
+        )
+        return _pretty_format_conversation_history(conversation_history)
+    except Exception as e:
+        # If the conversation history cannot be parsed for whatever reason, the original query is returned.
+        # This is a fallback to ensure that the evaluation can still proceed.
+        # However, the accuracy of the evaluation will be affected.
+        # From our tests the negative impact on IntentResolution is:
+        # Higher intra model variance (0.142 vs 0.046)
+        # Higher inter model variance (0.345 vs 0.607)
+        # Lower percentage of mode in Likert scale (73.4% vs 75.4%)
+        # Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
+        if logger:
+            logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}. Error: {e}")
+        return query
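+
+
+# Illustrative sketch (not executed, hypothetical conversation): with
+# include_system_messages=True and include_tool_messages=True, the reformatted
+# history produced above looks roughly like:
+#
+#     SYSTEM_PROMPT:
+#      You are a helpful assistant.
+#
+#     User turn 1:
+#      What's the weather in Rome?
+#
+#     Agent turn 1:
+#      [TOOL_CALL] get_weather(city="Rome")
+#      [TOOL_RESULT] {"temp": 28}
+#      It is 28 degrees and sunny in Rome.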
+
+
+def _get_agent_response(agent_response_msgs, include_tool_messages=False):
+    """Extract formatted agent response including text, and optionally tool calls/results."""
+    agent_response_text = []
+    tool_results = {}
+
+    # First pass: collect tool results
+    if include_tool_messages:
+        for msg in agent_response_msgs:
+            if msg.get("role") == "tool" and "tool_call_id" in msg:
+                for content in msg.get("content", []):
+                    if content.get("type") == "tool_result":
+                        result = content.get("tool_result")
+                        tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
+
+    # Second pass: parse assistant messages and tool calls
+    for msg in agent_response_msgs:
+        if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
+            text = _extract_text_from_content(msg["content"])
+            if text:
+                agent_response_text.extend(text)
+            if include_tool_messages:
+                for content in msg.get("content", []):
+                    # Todo: Verify if this is the correct way to handle tool calls
+                    if content.get("type") == "tool_call":
+                        if "tool_call" in content and "function" in content.get("tool_call", {}):
+                            tc = content.get("tool_call", {})
+                            func_name = tc.get("function", {}).get("name", "")
+                            args = tc.get("function", {}).get("arguments", {})
+                            tool_call_id = tc.get("id")
+                        else:
+                            tool_call_id = content.get("tool_call_id")
+                            func_name = content.get("name", "")
+                            args = content.get("arguments", {})
+                        args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
+                        call_line = f"[TOOL_CALL] {func_name}({args_str})"
+                        agent_response_text.append(call_line)
+                        if tool_call_id in tool_results:
+                            agent_response_text.append(tool_results[tool_call_id])
+
+    return agent_response_text
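+
+
+# Illustrative sketch (not executed, hypothetical assistant turn): for an assistant
+# message that calls one tool, the helper above returns a list of strings such as:
+#
+#     ["Let me check that for you.",
+#      '[TOOL_CALL] get_order_status(order_id="123")',
+#      '[TOOL_RESULT] {"order_id": "123", "status": "shipped"}']
+#
+# reformat_agent_response (below) joins these lines into a single newline-separated string.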
+
+
+def reformat_agent_response(response, logger=None, include_tool_messages=False):
+    """Reformat agent response to a standardized string format.
+
+    :param response: The agent response to reformat, can be None, empty list, or list of messages
+    :type response: Union[None, List[dict], str]
+    :param logger: Optional logger for warning messages
+    :type logger: Optional[logging.Logger]
+    :param include_tool_messages: Whether to include tool call and result information
+    :type include_tool_messages: bool
+    :return: Formatted agent response as a string, or original response if parsing fails
+    :rtype: str
+    """
+    try:
+        if response is None or response == []:
+            return ""
+        agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
+        if agent_response == []:
+            # If no message could be extracted, fall back to the original response
+            if logger:
+                logger.warning(
+                    "Empty agent response extracted, likely due to input schema change. "
+                    f"Falling back to using the original response: {response}"
+                )
+            return response
+        return "\n".join(agent_response)
+    except Exception as e:
+        # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed),
+        # the original response is returned
+        # This is a fallback to ensure that the evaluation can still proceed.
+        # See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(f"Agent response could not be parsed, falling back to original response. Error: {e}")
+        return response
+
+
+def reformat_tool_definitions(tool_definitions, logger=None):
+    """Reformat tool definitions into a human-readable string format.
+
+    :param tool_definitions: List of tool definition dictionaries containing name, description, and parameters
+    :type tool_definitions: List[dict]
+    :param logger: Optional logger for warning messages
+    :type logger: Optional[logging.Logger]
+    :return: Formatted tool definitions as a string, or original definitions if parsing fails
+    :rtype: str
+    """
+    try:
+        output_lines = ["TOOL_DEFINITIONS:"]
+        for tool in tool_definitions:
+            name = tool.get("name", "unnamed_tool")
+            desc = tool.get("description", "").strip()
+            params = tool.get("parameters", {}).get("properties", {})
+            param_names = ", ".join(params.keys()) if params else "no parameters"
+            output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
+        return "\n".join(output_lines)
+    except Exception as e:
+        # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
+        # This is a fallback to ensure that the evaluation can still proceed.
+        # See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(
+                "Tool definitions could not be parsed, falling back to original definitions"
+                f": {tool_definitions}. Error: {e}"
+            )
+        return tool_definitions
+
+
+# ```
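+
+
+# Illustrative sketch (not executed, hypothetical definition): reformat_tool_definitions
+# renders a definition list as:
+#
+#     TOOL_DEFINITIONS:
+#     - get_order_status: Retrieve the status of an order by its ID. (inputs: order_id)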
+
+
+@experimental
+class ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """Evaluate how effectively an AI agent uses tool outputs.
+
+    This evaluator checks whether the agent correctly incorporates information from tools into its responses.
+
+    Scoring is based on two levels:
+    1. Pass - effectively utilizes tool outputs and accurately incorporates the information into its response.
+    2. Fail - fails to properly utilize tool outputs or incorrectly incorporates the information into its response.
+
+    The evaluation includes the score, a brief explanation, and a final pass/fail result.
+
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START tool_output_utilization_evaluator]
+            :end-before: [END tool_output_utilization_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a ToolOutputUtilizationEvaluator with a query and response.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START tool_output_utilization_evaluator]
+            :end-before: [END tool_output_utilization_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ToolOutputUtilizationEvaluator
+                using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+    """
+
+    _PROMPTY_FILE = "tool_output_utilization.prompty"
+    _RESULT_KEY = "tool_output_utilization"
+    _OPTIONAL_PARAMS = ["tool_definitions"]
+
+    _DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE = 3
+
+    id = "azureai://built-in/evaluators/tool_output_utilization"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(
+        self,
+        model_config,
+        *,
+        threshold=_DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE,
+        credential=None,
+        **kwargs,
+    ):
+        """Initialize the Tool Output Utilization Evaluator."""
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        self.threshold = threshold
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: Union[str, List[dict]],
+        response: Union[str, List[dict]],
+        tool_definitions: Union[dict, List[dict]],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate tool output utilization for a given query, response, and optional tool definitions.
+
+        The query and response can be either a string or a list of messages.
+        Example with string inputs and no tools:
+            evaluator = ToolOutputUtilizationEvaluator(model_config)
+            query = "What is the weather today?"
+            response = "The weather is sunny."
+
+            result = evaluator(query=query, response=response)
+
+        Example with list of messages:
+            evaluator = ToolOutputUtilizationEvaluator(model_config)
+            query = [
+                {
+                    "role": "system",
+                    "content": "You are a helpful customer service assistant.",
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Hi, can you check the status of my last order?",
+                        }
+                    ],
+                },
+            ]
+
+            response = [
+                {
+                    "role": "assistant",
+                    "content": [
+                        {"type": "text", "text": "Sure! Let me look that up for you."}
+                    ],
+                },
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "tool_call",
+                            "tool_call": {
+                                "id": "tool_1",
+                                "type": "function",
+                                "function": {
+                                    "name": "get_order_status",
+                                    "arguments": {"order_id": "123"},
+                                },
+                            },
+                        }
+                    ],
+                },
+                {
+                    "role": "tool",
+                    "tool_call_id": "tool_1",
+                    "content": [
+                        {
+                            "type": "tool_result",
+                            "tool_result": '{"order_id": "123", "status": "shipped"}',
+                        }
+                    ],
+                },
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Your order 123 has been shipped and is on its way!",
+                        }
+                    ],
+                },
+            ]
+
+            tool_definitions = [
+                {
+                    "name": "get_order_status",
+                    "description": "Retrieve the status of an order by its ID.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "order_id": {
+                                "type": "string",
+                                "description": "The order ID to check.",
+                            }
+                        },
+                    },
+                }
+            ]
+
+            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+        :keyword query: The query being evaluated, either a string or a list of messages.
+        :paramtype query: Union[str, List[dict]]
+        :keyword response: The response being evaluated, either a string or a list of messages
+            (full agent response potentially including tool calls)
+        :paramtype response: Union[str, List[dict]]
+        :keyword tool_definitions: An optional list of tool definitions the agent is aware of.
+        :paramtype tool_definitions: Union[dict, List[dict]]
+        :return: A dictionary with the tool output utilization evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """Invoke the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do Tool Output Utilization evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow
+            method
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # we override the _do_eval method as we want the output to be a dictionary,
+        # which is a different schema than _base_prompty_eval.py
+        if ("query" not in eval_input) and ("response" not in eval_input) and ("tool_definitions" not in eval_input):
+            raise EvaluationException(
+                message=(
+                    "Query, response, and tool_definitions are required inputs to "
+                    "the Tool Output Utilization evaluator."
+                ),
+                internal_message=(
+                    "Query, response, and tool_definitions are required inputs "
+                    "to the Tool Output Utilization evaluator."
+                ),
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
+            )
+
+        tool_definitions = eval_input["tool_definitions"]
+        filtered_tool_definitions = _filter_to_used_tools(
+            tool_definitions=tool_definitions,
+            msgs_lists=[eval_input["query"], eval_input["response"]],
+            logger=logger,
+        )
+        eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)
+
+        eval_input["query"] = reformat_conversation_history(
+            eval_input["query"],
+            logger,
+            include_system_messages=True,
+            include_tool_messages=True,
+        )
+        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+
+        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        if isinstance(llm_output, dict):
+            output_label = llm_output.get("label", None)
+            if output_label is None:
+                if logger:
+                    logger.warning("LLM output does not contain 'label' key, defaulting the label to 'fail'.")
+                output_label = "fail"
+
+            output_label = output_label.lower()
+            if output_label not in ["pass", "fail"]:
+                if logger:
+                    logger.warning(
+                        (
+                            f"LLM output label is not 'pass' or 'fail' (got '{output_label}'), "
+                            "assigning a score of 0."
+                        )
+                    )
+ ) + ) + + score = 1.0 if output_label == "pass" else 0.0 + score_result = output_label + reason = llm_output.get("reason", "") + + faulty_details = llm_output.get("faulty_details", []) + if faulty_details: + reason += " Issues found: " + "; ".join(faulty_details) + + return { + f"{self._result_key}": score, + f"{self._result_key}_result": score_result, + f"{self._result_key}_reason": reason, + } + if logger: + logger.warning("LLM output is not a dictionary, returning NaN for the score.") + return {self._result_key: math.nan} diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/tool_output_utilization.prompty b/assets/evaluators/builtin/tool_output_utilization/evaluator/tool_output_utilization.prompty new file mode 100644 index 0000000000..5fd406e48d --- /dev/null +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/tool_output_utilization.prompty @@ -0,0 +1,221 @@ +--- +name: Tool Output Utilization Evaluator +description: Binary evaluator that judges whether an agent correctly understands and *uses* the outputs returned by tools it invoked (APIs, search/retrieval, DB queries, etc.). This evaluator focuses ONLY on incorrect, missing, or fabricated uses of tool outputs — whether they are used in the final response to the user or reused as inputs to subsequent tool calls. It does NOT judge tool selection, correctness of new inputs, or general reasoning quality. +model: + api: chat + parameters: + temperature: 0.0 + max_tokens: 1500 + top_p: 1.0 + presence_penalty: 0 + frequency_penalty: 0 + response_format: + type: json_object + +inputs: + query: + type: string + response: + type: string + tool_definitions: + type: string +--- +system: +You are **Tool Output Utilization Judge**, an expert evaluator whose only task is to decide whether the AGENT correctly interpreted and *used* TOOL OUTPUTS whenproducing the RESPONSE. + +Key constraints: + +- **Focus exclusively** on uses of tool outputs. A "use" means any appearance or + incorporation of a prior tool output (from `query`) within the agent's `response` + — either as part of the textual content to the user or as a parameter inside a new tool call. +- Do **not** judge whether the agent chose the right tool, made the right new call, + or used the correct input format. Those are evaluated separately. +- Treat `query` as the authoritative source of all prior conversation. +- Treat `response` as the agent's latest message, which may: + 1. State facts that come from tool outputs. + 2. Contain tool calls that reference or reuse prior tool outputs. +- Use `tool_definitions` for contextual understanding of tool structures (fields, types, units, etc.). +- Conservative rule: if any tool-derived information appears incorrectly used in RESPONSE, omitted when relevant, or fabricated, mark it as a fault. + +INPUT +===== + +CONVERSATION_HISTORY: {{query}} +AGENT_RESPONSE: {{response}} +TOOL_DEFINITIONS: {{tool_definitions}} + +> `CONVERSATION_HISTORY` includes all prior turns and any tool results. +> `AGENT_RESPONSE` is the model's latest message. +> `TOOL_DEFINITIONS` describe the tool schemas used. + +user: +ROLE +==== + +You are Tool Output Utilization Judge. Evaluate whether the RESPONSE correctly: + +- Reflects the factual content of prior tool outputs from `query`, and +- Reuses any of those tool outputs correctly when incorporating them into new tool calls or the textual response. + +TASK +==== + +Produce exactly one JSON object (and nothing else) with these keys in **this exact order**: + +1. 
+1. `faulty_details`: array of strings — list only the faults found (empty array if none).
+   Each entry can follow one of these formats:
+   - "claim -> MISMATCH (expected X, saw Y) mapped to tool_name.field_path"
+   - "claim -> FABRICATED (no supporting tool field)"
+   - "use -> FABRICATED (referenced value not found in prior tool outputs)"
+   - "use -> MISMATCH (expected X, used Y) mapped to tool_name.field_path"
+
+
+2. `reason`: short 1–2 sentence summary of why PASS or FAIL.
+3. `label`: string `"pass"` or `"fail"`.
+
+> Output must be valid JSON, all lowercase keys, no extra text or markdown.
+
+EVALUATION STEPS
+================
+
+1. Identify all **instances** in the RESPONSE where tool outputs are *used*:
+   - Either referenced in text (factual claims to the user), or
+   - Reused as parameters in new tool calls.
+2. For each instance:
+   - Cross-check against the corresponding tool outputs in `query`.
+   - If the usage faithfully matches the tool output (exact or paraphrased) → OK.
+   - If the agent uses wrong values, wrong entities, incorrect transformations, or fabricates data → record as fault.
+3. Populate the JSON object:
+   - `faulty_details`: all detected issues (empty if none).
+   - `reason`: concise rationale.
+   - `label`: `"pass"` or `"fail"`.
+
+SCORING RULES
+=============
+
+- **PASS:** No faulty uses of tool outputs found (empty `faulty_details`) in the RESPONSE.
+- **FAIL:** Any misuse, fabrication, omission, or misinterpretation of a tool output,
+  including when a prior tool output is reused incorrectly in a new tool call in the RESPONSE.
+
+IMPLEMENTATION NOTES
+====================
+
+- Do NOT evaluate:
+  - The correctness of *which tool* was used.
+  - Whether new tool inputs are valid by themselves.
+  - Task success or completeness.
+- Your judgment concerns *only* whether previously returned tool outputs are
+  correctly understood and reused where they appear.
+- If multiple faulty uses exist, list all in `faulty_details`.
+- When uncertain whether a value use is correct, treat it as a fault and explain why.
+- If tool outputs are missing but the response claims to use them, that counts as a fabricated use.
+- If a tool fails, that is outside your scope, unless the response misuses or misreports the failed output.
+
+> [TOOL CALLS] and [TOOL RESULTS] are internal, user does not see them.
+
+EXAMPLES (few-shot — using the new JSON schema and key order)
+
+### Example 1 - PASS
+QUERY:
+User turn 1:
+  Can you transfer $500 from my checking to my savings account?
+
+Agent turn 1:
+  [TOOL_CALL] get_account_balances(user_id="USER456")
+  [TOOL_RESULT] {'accounts': [{'account_id': 'CHK001', 'type': 'checking', 'balance': 1250.75}, {'account_id': 'SAV001', 'type': 'savings', 'balance': 3400.20}]}
+  You have $1,250.75 in checking and $3,400.20 in savings. You have enough for the transfer.
+
+User turn 2:
+  Great, please go ahead.
+
+RESPONSE:
+[TOOL_CALL] transfer_funds(from_account="CHK001", to_account="SAV001", amount=500)
+[TOOL_RESULT] {'transaction_id':'TXN789123','status':'completed','from_account':'CHK001','to_account':'SAV001','amount':500,'new_balances':{'CHK001':750.75,'SAV001':3900.20}}
+Transfer completed successfully. Checking now has $750.75 and savings $3,900.20.
+
+EXPECTED JSON:
+{
+  "faulty_details": [],
+  "reason": "All tool-derived claims and uses in the response match the prior tool outputs correctly.",
+  "label": "pass"
+}
+
+
+### Example 2 - FAIL (unit misinterpretation)
+QUERY:
+User turn 1:
+  What's the current temperature in Rome?
+
+RESPONSE:
+[TOOL_CALL] weather_api(city="Rome")
+[TOOL_RESULT] {"city":"Rome","temp":28,"condition":"Sunny"}
+It’s currently 28°F and sunny in Rome.
+
+TOOL_DEFINITIONS:
+- weather_api: Gets current weather for a city and returns temperature in Celsius (inputs: city) (outputs: city, temp, condition)
+
+EXPECTED JSON:
+{
+  "faulty_details": [
+    "claim -> MISMATCH (expected 28°C, saw 28°F) mapped to weather_api.temp"
+  ],
+  "reason": "Agent incorrectly reported the temperature in Fahrenheit instead of Celsius as provided by the tool output.",
+  "label": "fail"
+}
+
+
+### Example 3 - FAIL (fabricated inventory claim)
+QUERY:
+User turn 1:
+  Can you check if item B123 is available?
+
+RESPONSE:
+[TOOL_CALL] inventory_api(item_id="B123")
+[TOOL_RESULT] {"item_id":"B123","qty":0,"eta":"2025-10-07"}
+The item is in stock till the 7th of October.
+
+EXPECTED JSON:
+{
+  "faulty_details": [
+    "claim -> FABRICATED (no supporting tool field; inventory_api.qty is 0, eta is future date)"
+  ],
+  "reason": "Agent fabricated a claim that the item is in stock, whereas the tool output indicates a quantity of 0 with a future restock ETA, which the agent misread as an in-stock-until date.",
+  "label": "fail"
+}
+
+
+### Example 4 - FAIL (misuse of prior tool output in new tool call)
+QUERY:
+User turn 1:
+  Show my latest account transactions.
+
+Agent turn 1:
+  [TOOL_CALL] get_account_list(user_id="U123")
+  [TOOL_RESULT] {"accounts":[{"id":"CHK100","type":"checking"},{"id":"SAV200","type":"savings"}]}
+  You have two accounts: CHK100 and SAV200. Which one do you want transactions for?
+
+User turn 2:
+  Please get the transaction history for my checking account.
+
+RESPONSE:
+[TOOL_CALL] get_transactions(account_id="SAV200")
+[TOOL_RESULT] ...
+The latest transactions for your checking account are: ...
+
+EXPECTED JSON:
+{
+  "faulty_details": [
+    "use -> MISMATCH (expected CHK100, used SAV200) mapped to get_transactions.account_id"
+  ],
+  "reason": "Agent incorrectly used the savings account ID instead of the requested checking account ID, leading to a mismatch in the transactions reported.",
+  "label": "fail"
+}
+
+--
+END OF EXAMPLES
+
+FINAL NOTES:
+
+- Output must be exactly one JSON object and must follow the key order: `faulty_details`, `reason`, `label`.
+
+# Output
diff --git a/assets/evaluators/builtin/tool_output_utilization/spec.yaml b/assets/evaluators/builtin/tool_output_utilization/spec.yaml
new file mode 100644
index 0000000000..ee1b119342
--- /dev/null
+++ b/assets/evaluators/builtin/tool_output_utilization/spec.yaml
@@ -0,0 +1,49 @@
+type: "evaluator"
+name: "builtin.tool_output_utilization"
+version: 1
+displayName: "Tool-Output-Utilization-Evaluator"
+description: "| | |\n| -- | -- |\n| Score range | Binary [0-1]: 1 indicates correct tool output usage, 0 indicates faulty usage. |\n| What is this metric? | Tool Output Utilization evaluates whether an agent correctly understands and uses the outputs returned by tools it invoked (APIs, search/retrieval, DB queries, etc.). |\n| How does it work? | It focuses exclusively on detecting incorrect, missing, or fabricated uses of tool outputs in the agent's response, checking if tool-derived information is accurately reflected or reused. |\n| When to use it? | Use this metric for agent-based systems that rely on tools, ensuring agents properly interpret and utilize tool results without fabrication or misrepresentation. |\n"
+evaluatorType: "builtin"
+evaluatorSubType: "code"
+categories: ["agents"]
+tags:
+  provider: "Microsoft"
+initParameterSchema:
+  type: "object"
+  properties:
+    deployment_name:
+      type: "string"
+    threshold:
+      type: "number"
+    credential:
+      type: "object"
+  required: ["deployment_name"]
+dataMappingSchema:
+  type: "object"
+  properties:
+    query:
+      anyOf:
+        - type: "string"
+        - type: "array"
+          items:
+            type: "object"
+    response:
+      anyOf:
+        - type: "string"
+        - type: "array"
+          items:
+            type: "object"
+    tool_definitions:
+      anyOf:
+        - type: "object"
+        - type: "array"
+          items:
+            type: "object"
+  required: ["query", "response", "tool_definitions"]
+outputSchema:
+  tool_output_utilization:
+    type: "ordinal"
+    desirable_direction: "increase"
+    min_value: 0
+    max_value: 1
+path: ./evaluator
\ No newline at end of file