Skip to content

Commit 978bf5f

Browse files
feat(examples): add basic evaluation example
1 parent a28312c commit 978bf5f

File tree

2 files changed

+221
-0
lines changed

2 files changed

+221
-0
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Basic Evaluation Example
2+
3+
This example demonstrates the core functionality of the ADK evaluation framework with a simple math assistant agent.
4+
5+
## Features Demonstrated
6+
7+
- Creating a simple agent for evaluation
8+
- Setting up evaluation storage (in-memory)
9+
- Registering evaluators
10+
- Creating an eval set with test cases
11+
- Configuring evaluation criteria
12+
- Running evaluations and viewing results
13+
14+
## Evaluators Used
15+
16+
1. **RESPONSE_MATCH_SCORE** - Algorithmic comparison using ROUGE-1
17+
2. **SEMANTIC_RESPONSE_MATCH** - LLM-as-Judge semantic validation
18+
19+
## Running the Example
20+
21+
1. Set your Gemini API key in the `GOOGLE_API_KEY` environment variable:
22+
```bash
23+
export GOOGLE_API_KEY=your_api_key_here
24+
```
25+
26+
2. Run the example:
27+
```bash
28+
go run main.go
29+
```
30+
31+
## What to Expect
32+
33+
The example:
34+
1. Creates a math assistant agent
35+
2. Sets up two evaluation cases (addition and multiplication)
36+
3. Runs both evaluators on each case
37+
4. Displays detailed results including scores and pass/fail status
38+
39+
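## Sample Output

The scores below are illustrative; actual values vary between runs because both the agent and the judge are live model calls.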
40+
41+
```
42+
Running evaluation...
43+
===================
44+
45+
Evaluation Complete!
46+
===================
47+
Overall Status: PASSED
48+
Overall Score: 0.85
49+
50+
Case 1: addition-simple
51+
Status: PASSED
52+
response_match: 0.82 (PASSED)
53+
semantic_match: 0.90 (PASSED)
54+
55+
Case 2: multiplication-simple
56+
Status: PASSED
57+
response_match: 0.78 (PASSED)
58+
semantic_match: 0.88 (PASSED)
59+
60+
Evaluation results saved to storage.
61+
```

examples/evaluation/basic/main.go

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package main
16+
17+
import (
18+
"context"
19+
"fmt"
20+
"log"
21+
"os"
22+
"time"
23+
24+
"google.golang.org/adk/agent/llmagent"
25+
"google.golang.org/adk/evaluation"
26+
"google.golang.org/adk/evaluation/evaluators"
27+
"google.golang.org/adk/evaluation/storage"
28+
"google.golang.org/adk/model/gemini"
29+
"google.golang.org/adk/runner"
30+
"google.golang.org/adk/session"
31+
"google.golang.org/genai"
32+
)
33+
34+
func main() {
35+
ctx := context.Background()
36+
37+
model, err := gemini.NewModel(ctx, "gemini-2.0-flash-exp", &genai.ClientConfig{
38+
APIKey: os.Getenv("GOOGLE_API_KEY"),
39+
})
40+
if err != nil {
41+
log.Fatalf("Failed to create model: %v", err)
42+
}
43+
44+
agent, err := llmagent.New(llmagent.Config{
45+
Name: "math_assistant",
46+
Model: model,
47+
Description: "A helpful math assistant that answers basic math questions.",
48+
Instruction: "You are a math tutor. Answer math questions clearly and concisely.",
49+
})
50+
if err != nil {
51+
log.Fatalf("Failed to create agent: %v", err)
52+
}
53+
54+
sessionService := session.InMemoryService()
55+
56+
agentRunner, err := runner.New(runner.Config{
57+
AppName: "math-eval-app",
58+
Agent: agent,
59+
SessionService: sessionService,
60+
})
61+
if err != nil {
62+
log.Fatalf("Failed to create agent runner: %v", err)
63+
}
64+
65+
if err := evaluation.RegisterDefaultEvaluators(map[evaluation.MetricType]evaluation.EvaluatorFactory{
66+
evaluation.MetricResponseMatch: evaluators.NewResponseMatchEvaluator,
67+
evaluation.MetricSemanticResponseMatch: evaluators.NewSemanticResponseMatchEvaluator,
68+
}); err != nil {
69+
log.Fatalf("Failed to register evaluators: %v", err)
70+
}
71+
72+
judgeLLM, err := gemini.NewModel(ctx, "gemini-2.0-flash-exp", &genai.ClientConfig{
73+
APIKey: os.Getenv("GOOGLE_API_KEY"),
74+
})
75+
if err != nil {
76+
log.Fatalf("Failed to create judge LLM: %v", err)
77+
}
78+
79+
evalStorage := storage.NewMemoryStorage()
80+
81+
evalRunner := evaluation.NewRunner(evaluation.RunnerConfig{
82+
AgentRunner: agentRunner,
83+
Storage: evalStorage,
84+
SessionService: sessionService,
85+
AppName: "math-eval-app",
86+
RateLimitDelay: 6 * time.Second,
87+
})
88+
89+
evalSet := &evaluation.EvalSet{
90+
ID: "basic-math-eval",
91+
Name: "Basic Math Evaluation",
92+
EvalCases: []evaluation.EvalCase{
93+
{
94+
ID: "addition-simple",
95+
Conversation: []evaluation.ConversationTurn{
96+
{Role: "user", Content: "What is 2 + 2?"},
97+
},
98+
ExpectedResponse: "2 + 2 = 4",
99+
},
100+
{
101+
ID: "multiplication-simple",
102+
Conversation: []evaluation.ConversationTurn{
103+
{Role: "user", Content: "What is 5 times 3?"},
104+
},
105+
ExpectedResponse: "5 times 3 = 15",
106+
},
107+
},
108+
}
109+
110+
err = evalStorage.SaveEvalSet(ctx, "math-eval-app", evalSet)
111+
if err != nil {
112+
log.Fatalf("Failed to save eval set: %v", err)
113+
}
114+
115+
config := &evaluation.EvalConfig{
116+
JudgeLLM: judgeLLM,
117+
JudgeModel: "gemini-2.0-flash-exp",
118+
Criteria: []evaluation.Criterion{
119+
&evaluation.Threshold{
120+
MinScore: 0.5,
121+
MetricType: evaluation.MetricResponseMatch,
122+
},
123+
&evaluation.LLMAsJudgeCriterion{
124+
Threshold: &evaluation.Threshold{
125+
MinScore: 0.8,
126+
MetricType: evaluation.MetricSemanticResponseMatch,
127+
},
128+
MetricType: evaluation.MetricSemanticResponseMatch,
129+
JudgeModel: "gemini-2.0-flash-exp",
130+
},
131+
},
132+
}
133+
134+
fmt.Println("Running evaluation...")
135+
fmt.Println("===================")
136+
137+
result, err := evalRunner.RunEvalSet(ctx, evalSet, config)
138+
if err != nil {
139+
log.Fatalf("Evaluation failed: %v", err)
140+
}
141+
142+
fmt.Printf("\nEvaluation Complete!\n")
143+
fmt.Printf("===================\n")
144+
fmt.Printf("Overall Status: %s\n", result.Status)
145+
fmt.Printf("Overall Score: %.2f\n\n", result.OverallScore)
146+
147+
for i, caseResult := range result.EvalCaseResults {
148+
fmt.Printf("Case %d: %s\n", i+1, caseResult.EvalID)
149+
fmt.Printf(" Status: %s\n", caseResult.FinalEvalStatus)
150+
for metricName, metric := range caseResult.OverallMetricResults {
151+
fmt.Printf(" %s: %.2f (%s)\n", metricName, metric.Score, metric.Status)
152+
if metric.ErrorMessage != "" {
153+
fmt.Printf(" Error: %s\n", metric.ErrorMessage)
154+
}
155+
}
156+
fmt.Println()
157+
}
158+
159+
fmt.Println("Evaluation results saved to storage.")
160+
}

0 commit comments

Comments
 (0)