4 changes: 4 additions & 0 deletions assets/evaluators/builtin/task_completion/asset.yaml
@@ -0,0 +1,4 @@
type: evaluator
spec: spec.yaml
categories:
- Evaluator
@@ -0,0 +1,174 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import os
import logging
from typing import Dict, Union, List, Optional

from typing_extensions import overload, override

from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
from azure.ai.evaluation._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions
from azure.ai.evaluation._common._experimental import experimental

logger = logging.getLogger(__name__)


@experimental
class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
"""The Task Completion evaluator determines whether an AI agent successfully completed the requested task.
This evaluator assesses task completion based on:
- Final outcome and deliverable of the task
- Completeness of task requirements

This evaluator focuses solely on task completion and success, not on task adherence or intent understanding.

Scoring is binary:
- TRUE: Task fully completed with usable deliverable that meets all user requirements
- FALSE: Task incomplete, partially completed, or deliverable does not meet requirements

The evaluation includes task requirement analysis, outcome assessment, and completion gap identification.


:param model_config: Configuration for the Azure OpenAI model.
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
~azure.ai.evaluation.OpenAIModelConfiguration]

.. admonition:: Example:
.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START task_completion_evaluator]
:end-before: [END task_completion_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call a TaskCompletionEvaluator with a query and response.

.. admonition:: Example using Azure AI Project URL:

.. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
:start-after: [START task_completion_evaluator]
:end-before: [END task_completion_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call TaskCompletionEvaluator using Azure AI Project URL in the following format
https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
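
    .. admonition:: Minimal usage sketch:

        The snippet below is an illustrative sketch rather than one of the shipped samples; the endpoint and
        deployment values are placeholders, and depending on your setup an api_key or credential may also be
        required when building the model configuration.

        .. code-block:: python

            from azure.ai.evaluation import AzureOpenAIModelConfiguration

            model_config = AzureOpenAIModelConfiguration(
                azure_endpoint="https://<your-resource>.openai.azure.com",
                azure_deployment="<your-deployment>",
            )

            evaluator = TaskCompletionEvaluator(model_config)
            result = evaluator(
                query="Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.",
                response="**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais...",
            )

            # Fields produced by this evaluator include:
            #   result["task_completion"]          -> True / False
            #   result["task_completion_result"]   -> "pass" or "fail"
            #   result["task_completion_reason"]   -> short explanation from the judge model
            #   result["task_completion_details"]  -> additional completion-gap details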

"""

    _PROMPTY_FILE = "task_completion.prompty"
    _RESULT_KEY = "task_completion"
    _OPTIONAL_PARAMS = ["tool_definitions"]

    id = "azureai://built-in/evaluators/task_completion"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
    def __init__(self, model_config, *, credential=None, **kwargs):
        """Initialize the TaskCompletionEvaluator.

        :param model_config: Configuration for the Azure OpenAI model.
        :type model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
        :keyword credential: Credential for authentication.
        :type credential: Optional[TokenCredential]
        :keyword kwargs: Additional keyword arguments.
        """
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        super().__init__(
            model_config=model_config,
            prompty_file=prompty_path,
            result_key=self._RESULT_KEY,
            credential=credential,
            **kwargs,
        )

    @overload
    def __call__(
        self,
        *,
        query: Union[str, List[dict]],
        response: Union[str, List[dict]],
        tool_definitions: Optional[Union[dict, List[dict]]] = None,
    ) -> Dict[str, Union[str, bool]]:
        """Evaluate task completion for a given query, response, and optionally tool definitions.

        The query and response can be either a string or a list of messages.

        Example with string inputs and no tools:

            evaluator = TaskCompletionEvaluator(model_config)
            query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine."
            response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..."

            result = evaluator(query=query, response=response)

        Example with a list of messages:

            evaluator = TaskCompletionEvaluator(model_config)
            query = [
                {"role": "system", "content": "You are a helpful travel planning assistant."},
                {
                    "createdAt": 1700000060,
                    "role": "user",
                    "content": [{"type": "text", "text": "Plan a 3-day Paris itinerary with cultural landmarks and cuisine"}],
                },
            ]
            response = [
                {
                    "createdAt": 1700000070,
                    "run_id": "0",
                    "role": "assistant",
                    "content": [{"type": "text", "text": "**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)..."}],
                },
            ]
            tool_definitions = [
                {
                    "name": "get_attractions",
                    "description": "Get tourist attractions for a city.",
                    "parameters": {
                        "type": "object",
                        "properties": {"city": {"type": "string", "description": "The city name."}},
                    },
                },
            ]

            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)

        :keyword query: The query being evaluated, either a string or a list of messages.
        :paramtype query: Union[str, List[dict]]
        :keyword response: The response being evaluated, either a string or a list of messages
            (full agent response potentially including tool calls).
        :paramtype response: Union[str, List[dict]]
        :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
        :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
        :return: A dictionary with the task completion evaluation results.
        :rtype: Dict[str, Union[str, bool]]
        """

    @override
    def __call__(  # pylint: disable=docstring-missing-param
        self,
        *args,
        **kwargs,
    ):
        """
        Invoke the instance using the overloaded __call__ signature.

        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
        """
        return super().__call__(*args, **kwargs)

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]:  # type: ignore[override]
        """Do Task Completion evaluation.

        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed
            for the _flow method.
        :type eval_input: Dict
        :return: The evaluation result.
        :rtype: Dict
        """
        # we override the _do_eval method as we want the output to be a dictionary,
        # which is a different schema than _base_prompty_eval.py
if "query" not in eval_input and "response" not in eval_input:
raise EvaluationException(
message="Both query and response must be provided as input to the Task Completion evaluator.",
internal_message="Both query and response must be provided as input to the Task Completion evaluator.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.MISSING_FIELD,
target=ErrorTarget.TASK_COMPLETION_EVALUATOR,
)
eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)

llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
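        # Based on the keys read below, the prompty output is expected to be a JSON-like dict of roughly this
        # (illustrative, not guaranteed) shape:
        #   {"success": "True" | "False", "explanation": "...", "details": "..."}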
        if isinstance(llm_output, dict):
            success = llm_output.get("success", False)
            if isinstance(success, str):
                success = success.upper() == "TRUE"

            success_result = "pass" if success else "fail"
            reason = llm_output.get("explanation", "")
            return {
                self._result_key: success,
                f"{self._result_key}_result": success_result,
                f"{self._result_key}_reason": reason,
                f"{self._result_key}_details": llm_output.get("details", ""),
            }
        if logger:
            logger.warning("LLM output is not a dictionary, returning False for the success.")
        return {self._result_key: False}