Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 29 additions & 8 deletions loom/engines/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,21 @@ async def evaluate_record(self, record: Record) -> Record:
try:
# Get reference text if configured
reference = None
if self.config.reference_field and self.config.reference_field in record.data:
if (
self.config.reference_field
and self.config.reference_field in record.data
):
reference = str(record.data[self.config.reference_field])

# Prepare evaluator list for Arbiter
evaluator_types = [eval_config.name for eval_config in self.config.evaluators]
evaluator_types = [
eval_config.name for eval_config in self.config.evaluators
]

# Call Arbiter evaluate function
logger.debug(f"Calling Arbiter for record {record.id} with evaluators: {evaluator_types}")
logger.debug(
f"Calling Arbiter for record {record.id} with evaluators: {evaluator_types}"
)
result = await asyncio.wait_for(
arbiter_evaluate(
output=record.transformed_data,
Expand Down Expand Up @@ -99,7 +106,9 @@ async def evaluate_record(self, record: Record) -> Record:
except asyncio.TimeoutError:
record.status = RecordStatus.ERROR
record.error = f"Evaluation exceeded timeout ({self.config.timeout}s)"
logger.error(f"Evaluation timeout for record {record.id}: {self.config.timeout}s")
logger.error(
f"Evaluation timeout for record {record.id}: {self.config.timeout}s"
)
raise EvaluateError(record.error)
except Exception as e:
record.status = RecordStatus.ERROR
Expand Down Expand Up @@ -160,8 +169,12 @@ def _check_any_pass(self, record: Record) -> bool:

Mathematical definition: ∃ evaluator: score ≥ threshold
"""
# Localize lookup for performance
evaluation_scores = record.evaluation_scores
get_score = evaluation_scores.get

for eval_config in self.config.evaluators:
score = record.evaluation_scores.get(eval_config.name)
score = get_score(eval_config.name)
if score is not None and score >= eval_config.threshold:
return True
return False
Expand Down Expand Up @@ -225,13 +238,21 @@ async def evaluate_with_semaphore(record: Record) -> Record:
evaluated_records = []
for result in results:
if isinstance(result, Exception):
logger.error(f"Batch evaluation failed: {type(result).__name__}: {result}")
logger.error(
f"Batch evaluation failed: {type(result).__name__}: {result}"
)
raise EvaluateError(f"Batch evaluation failed: {result}")
evaluated_records.append(result)

passed_count = sum(1 for r in evaluated_records if r.quality_gate_passed)
failed_count = sum(1 for r in evaluated_records if not r.quality_gate_passed and r.status != RecordStatus.ERROR)
error_count = sum(1 for r in evaluated_records if r.status == RecordStatus.ERROR)
failed_count = sum(
1
for r in evaluated_records
if not r.quality_gate_passed and r.status != RecordStatus.ERROR
)
error_count = sum(
1 for r in evaluated_records if r.status == RecordStatus.ERROR
)
logger.info(
f"Batch evaluation complete: {passed_count} passed, "
f"{failed_count} failed quality gate, {error_count} errors out of {len(records)} total"
Expand Down