feat(examples): add basic evaluation example

stefanoamorelli · stefanoamorelli · commit 978bf5f54616 · 2025-11-16T20:18:12.000+02:00
diff --git a/examples/evaluation/basic/README.md b/examples/evaluation/basic/README.md
@@ -0,0 +1,61 @@
+# Basic Evaluation Example
+
+This example demonstrates the core functionality of the ADK evaluation framework with a simple math assistant agent.
+
+## Features Demonstrated
+
+- Creating a simple agent for evaluation
+- Setting up evaluation storage (in-memory)
+- Registering evaluators
+- Creating an eval set with test cases
+- Configuring evaluation criteria
+- Running evaluations and viewing results
+
+## Evaluators Used
+
+1. **RESPONSE_MATCH_SCORE** - Algorithmic comparison using ROUGE-1
+2. **SEMANTIC_RESPONSE_MATCH** - LLM-as-Judge semantic validation
+
+## Running the Example
+
+1. Set your API key:
+```bash
+export GOOGLE_API_KEY=your_api_key_here
+```
+
+2. Run the example:
+```bash
+go run main.go
+```
+
+## What to Expect
+
+The example:
+1. Creates a math assistant agent
+2. Sets up two evaluation cases (addition and multiplication)
+3. Runs both evaluators on each case
+4. Displays detailed results including scores and pass/fail status (exact scores vary between runs because the agent's answers are LLM-generated)
+
+## Sample Output
+
+```text
+Running evaluation...
+===================
+
+Evaluation Complete!
+===================
+Overall Status: PASSED
+Overall Score: 0.85
+
+Case 1: addition-simple
+  Status: PASSED
+  response_match: 0.82 (PASSED)
+  semantic_match: 0.90 (PASSED)
+
+Case 2: multiplication-simple
+  Status: PASSED
+  response_match: 0.78 (PASSED)
+  semantic_match: 0.88 (PASSED)
+
+Evaluation results saved to storage.
+```
diff --git a/examples/evaluation/basic/main.go b/examples/evaluation/basic/main.go
@@ -0,0 +1,160 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"os"
+	"time"
+
+	"google.golang.org/adk/agent/llmagent"
+	"google.golang.org/adk/evaluation"
+	"google.golang.org/adk/evaluation/evaluators"
+	"google.golang.org/adk/evaluation/storage"
+	"google.golang.org/adk/model/gemini"
+	"google.golang.org/adk/runner"
+	"google.golang.org/adk/session"
+	"google.golang.org/genai"
+)
+
+func main() {
+	ctx := context.Background()
+
+	model, err := gemini.NewModel(ctx, "gemini-2.0-flash-exp", &genai.ClientConfig{
+		APIKey: os.Getenv("GOOGLE_API_KEY"),
+	})
+	if err != nil {
+		log.Fatalf("Failed to create model: %v", err)
+	}
+
+	agent, err := llmagent.New(llmagent.Config{
+		Name:        "math_assistant",
+		Model:       model,
+		Description: "A helpful math assistant that answers basic math questions.",
+		Instruction: "You are a math tutor. Answer math questions clearly and concisely.",
+	})
+	if err != nil {
+		log.Fatalf("Failed to create agent: %v", err)
+	}
+
+	sessionService := session.InMemoryService()
+
+	agentRunner, err := runner.New(runner.Config{
+		AppName:        "math-eval-app",
+		Agent:          agent,
+		SessionService: sessionService,
+	})
+	if err != nil {
+		log.Fatalf("Failed to create agent runner: %v", err)
+	}
+
+	if err := evaluation.RegisterDefaultEvaluators(map[evaluation.MetricType]evaluation.EvaluatorFactory{
+		evaluation.MetricResponseMatch:         evaluators.NewResponseMatchEvaluator,
+		evaluation.MetricSemanticResponseMatch: evaluators.NewSemanticResponseMatchEvaluator,
+	}); err != nil {
+		log.Fatalf("Failed to register evaluators: %v", err)
+	}
+
+	judgeLLM, err := gemini.NewModel(ctx, "gemini-2.0-flash-exp", &genai.ClientConfig{
+		APIKey: os.Getenv("GOOGLE_API_KEY"),
+	})
+	if err != nil {
+		log.Fatalf("Failed to create judge LLM: %v", err)
+	}
+
+	evalStorage := storage.NewMemoryStorage()
+
+	evalRunner := evaluation.NewRunner(evaluation.RunnerConfig{
+		AgentRunner:    agentRunner,
+		Storage:        evalStorage,
+		SessionService: sessionService,
+		AppName:        "math-eval-app",
+		RateLimitDelay: 6 * time.Second,
+	})
+
+	evalSet := &evaluation.EvalSet{
+		ID:   "basic-math-eval",
+		Name: "Basic Math Evaluation",
+		EvalCases: []evaluation.EvalCase{
+			{
+				ID: "addition-simple",
+				Conversation: []evaluation.ConversationTurn{
+					{Role: "user", Content: "What is 2 + 2?"},
+				},
+				ExpectedResponse: "2 + 2 = 4",
+			},
+			{
+				ID: "multiplication-simple",
+				Conversation: []evaluation.ConversationTurn{
+					{Role: "user", Content: "What is 5 times 3?"},
+				},
+				ExpectedResponse: "5 times 3 = 15",
+			},
+		},
+	}
+
+	err = evalStorage.SaveEvalSet(ctx, "math-eval-app", evalSet)
+	if err != nil {
+		log.Fatalf("Failed to save eval set: %v", err)
+	}
+
+	config := &evaluation.EvalConfig{
+		JudgeLLM:   judgeLLM,
+		JudgeModel: "gemini-2.0-flash-exp",
+		Criteria: []evaluation.Criterion{
+			&evaluation.Threshold{
+				MinScore:   0.5,
+				MetricType: evaluation.MetricResponseMatch,
+			},
+			&evaluation.LLMAsJudgeCriterion{
+				Threshold: &evaluation.Threshold{
+					MinScore:   0.8,
+					MetricType: evaluation.MetricSemanticResponseMatch,
+				},
+				MetricType: evaluation.MetricSemanticResponseMatch,
+				JudgeModel: "gemini-2.0-flash-exp",
+			},
+		},
+	}
+
+	fmt.Println("Running evaluation...")
+	fmt.Println("===================")
+
+	result, err := evalRunner.RunEvalSet(ctx, evalSet, config)
+	if err != nil {
+		log.Fatalf("Evaluation failed: %v", err)
+	}
+
+	fmt.Printf("\nEvaluation Complete!\n")
+	fmt.Printf("===================\n")
+	fmt.Printf("Overall Status: %s\n", result.Status)
+	fmt.Printf("Overall Score: %.2f\n\n", result.OverallScore)
+
+	for i, caseResult := range result.EvalCaseResults {
+		fmt.Printf("Case %d: %s\n", i+1, caseResult.EvalID)
+		fmt.Printf("  Status: %s\n", caseResult.FinalEvalStatus)
+		for metricName, metric := range caseResult.OverallMetricResults {
+			fmt.Printf("  %s: %.2f (%s)\n", metricName, metric.Score, metric.Status)
+			if metric.ErrorMessage != "" {
+				fmt.Printf("    Error: %s\n", metric.ErrorMessage)
+			}
+		}
+		fmt.Println()
+	}
+
+	fmt.Println("Evaluation results saved to storage.")
+}