Skip to content

Commit 5b6cd8b

Browse files
committed
Re-implement evaluation framework
1 parent f9d4050 commit 5b6cd8b

File tree

120 files changed

+9922
-536
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

120 files changed

+9922
-536
lines changed

docs/modules/ROOT/pages/testing.adoc

Lines changed: 731 additions & 85 deletions
Large diffs are not rendered by default.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
target/
2+
*.iml
3+
.idea/
4+
.vscode/
5+
.DS_Store
6+
*.log
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Chatbot Evaluation Testing Sample
2+
3+
This sample demonstrates comprehensive evaluation testing for AI-powered chatbots using the Quarkus LangChain4j evaluation framework.
4+
5+
## Overview
6+
7+
The sample includes a simple customer support chatbot and showcases various evaluation approaches:
8+
9+
- **Imperative Testing** - Traditional programmatic evaluation using `Scorer`
10+
- **Fluent Builder API** - Readable, chainable evaluation definitions
11+
- **Declarative Testing** - Annotation-driven evaluations with `@EvaluationTest`
12+
- **AI as Judge** - Using an LLM to evaluate response quality
13+
- **Semantic Similarity** - Embedding-based similarity comparison
14+
- **Test Suite-Level Reporting** - Aggregated reports across multiple tests
15+

samples/chatbot-evaluation/pom.xml

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<project xmlns="http://maven.apache.org/POM/4.0.0"
3+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5+
<modelVersion>4.0.0</modelVersion>
6+
7+
<parent>
8+
<groupId>io.quarkiverse.langchain4j</groupId>
9+
<artifactId>quarkus-langchain4j-samples-parent</artifactId>
10+
<version>999-SNAPSHOT</version>
11+
<relativePath>../pom.xml</relativePath>
12+
</parent>
13+
14+
<artifactId>quarkus-langchain4j-sample-chatbot-evaluation</artifactId>
15+
<name>Quarkus LangChain4j - Samples - Chatbot Evaluation Testing</name>
16+
<description>Sample demonstrating comprehensive evaluation testing with semantic similarity and AI judge strategies</description>
17+
18+
<dependencies>
19+
<dependency>
20+
<groupId>io.quarkus</groupId>
21+
<artifactId>quarkus-rest</artifactId>
22+
</dependency>
23+
<dependency>
24+
<groupId>io.quarkiverse.langchain4j</groupId>
25+
<artifactId>quarkus-langchain4j-openai</artifactId>
26+
<version>${project.version}</version>
27+
</dependency>
28+
29+
<!-- Evaluation Testing Framework -->
30+
<dependency>
31+
<groupId>io.quarkiverse.langchain4j</groupId>
32+
<artifactId>quarkus-langchain4j-testing-evaluation-junit5</artifactId>
33+
<version>${project.version}</version>
34+
<scope>test</scope>
35+
</dependency>
36+
<dependency>
37+
<groupId>dev.langchain4j</groupId>
38+
<artifactId>langchain4j-embeddings-bge-small-en-v15</artifactId>
39+
<scope>test</scope>
40+
</dependency>
41+
<dependency>
42+
<groupId>io.quarkiverse.langchain4j</groupId>
43+
<artifactId>quarkus-langchain4j-testing-evaluation-semantic-similarity</artifactId>
44+
<version>${project.version}</version>
45+
<scope>test</scope>
46+
</dependency>
47+
<dependency>
48+
<groupId>io.quarkiverse.langchain4j</groupId>
49+
<artifactId>quarkus-langchain4j-testing-evaluation-ai-judge</artifactId>
50+
<version>${project.version}</version>
51+
<scope>test</scope>
52+
</dependency>
53+
54+
<!-- Test dependencies -->
55+
<dependency>
56+
<groupId>io.quarkus</groupId>
57+
<artifactId>quarkus-junit5</artifactId>
58+
<scope>test</scope>
59+
</dependency>
60+
<dependency>
61+
<groupId>io.rest-assured</groupId>
62+
<artifactId>rest-assured</artifactId>
63+
<scope>test</scope>
64+
</dependency>
65+
</dependencies>
66+
67+
<build>
68+
<plugins>
69+
<plugin>
70+
<groupId>io.quarkus</groupId>
71+
<artifactId>quarkus-maven-plugin</artifactId>
72+
</plugin>
73+
</plugins>
74+
</build>
75+
</project>
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package io.quarkiverse.langchain4j.sample.chatbot;
2+
3+
import jakarta.enterprise.context.ApplicationScoped;
4+
5+
import dev.langchain4j.service.SystemMessage;
6+
import dev.langchain4j.service.UserMessage;
7+
import io.quarkiverse.langchain4j.RegisterAiService;
8+
9+
/**
10+
* A simple customer support chatbot that answers questions about
11+
* a fictional e-commerce company.
12+
*/
13+
@RegisterAiService
14+
@ApplicationScoped
15+
public interface CustomerSupportBot {
16+
17+
@SystemMessage("""
18+
You are a helpful customer support assistant for QuarkusShop, an online e-commerce store.
19+
20+
Company Information:
21+
- Business hours: Monday-Friday, 9 AM - 5 PM EST
22+
- Shipping: Free shipping on orders over $50
23+
- Returns: 30-day return policy
24+
- Contact: [email protected] or 1-800-QUARKUS
25+
26+
Provide helpful, concise, and friendly responses to customer questions.
27+
""")
28+
String chat(@UserMessage String message);
29+
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# OpenAI Configuration
2+
quarkus.langchain4j.openai.api-key=${OPENAI_API_KEY}
3+
quarkus.langchain4j.openai.chat-model.model-name=gpt-4o-mini
4+
quarkus.langchain4j.openai.chat-model.temperature=0.0
5+
quarkus.langchain4j.openai.timeout=60s
6+
7+
# Logging
8+
quarkus.log.category."io.quarkiverse.langchain4j".level=DEBUG
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
package io.quarkiverse.langchain4j.sample.chatbot;
2+
3+
import static io.quarkiverse.langchain4j.testing.evaluation.EvaluationAssertions.assertThat;
4+
5+
import jakarta.inject.Inject;
6+
7+
import org.junit.jupiter.api.Test;
8+
9+
import dev.langchain4j.model.chat.ChatModel;
10+
import io.quarkiverse.langchain4j.evaluation.junit5.Evaluate;
11+
import io.quarkiverse.langchain4j.evaluation.junit5.SampleLocation;
12+
import io.quarkiverse.langchain4j.testing.evaluation.EvaluationReport;
13+
import io.quarkiverse.langchain4j.testing.evaluation.Samples;
14+
import io.quarkiverse.langchain4j.testing.evaluation.Scorer;
15+
import io.quarkiverse.langchain4j.testing.evaluation.judge.AiJudgeStrategy;
16+
import io.quarkiverse.langchain4j.testing.evaluation.similarity.SemanticSimilarityStrategy;
17+
import io.quarkus.test.junit.QuarkusTest;
18+
19+
/**
20+
* Demonstrates using AI as a judge to evaluate chatbot responses.
21+
* This approach uses an LLM to judge whether responses are acceptable,
22+
* which is useful for more nuanced evaluation beyond simple similarity.
23+
*/
24+
@QuarkusTest
25+
@Evaluate
26+
public class AIJudgeEvaluationTest {
27+
28+
@Inject
29+
CustomerSupportBot bot;
30+
31+
@Inject
32+
ChatModel model;
33+
34+
@Test
35+
void evaluateWithAIJudge(
36+
Scorer scorer,
37+
@SampleLocation("src/test/resources/customer-support-samples.yaml") Samples<String> samples) {
38+
39+
// Use AI judge strategy for more sophisticated evaluation
40+
EvaluationReport<String> report = scorer.evaluate(
41+
samples,
42+
params -> bot.chat(params.get(0)),
43+
new AiJudgeStrategy(model));
44+
45+
assertThat(report)
46+
.hasScoreGreaterThanOrEqualTo(50.0)
47+
.hasAtLeastPassedEvaluations(3);
48+
49+
// Print detailed results
50+
report.evaluations().forEach(eval -> {
51+
System.out.printf("%s: %s (score: %.2f)%n",
52+
eval.sample().name(),
53+
eval.passed() ? "PASS" : "FAIL",
54+
eval.score() * 100);
55+
if (eval.explanation() != null) {
56+
System.out.printf(" Explanation: %s%n", eval.explanation());
57+
}
58+
});
59+
}
60+
61+
@Test
62+
void compareStrategies(
63+
Scorer scorer,
64+
@SampleLocation("src/test/resources/smoke-tests.yaml") Samples<String> samples) {
65+
66+
// Evaluate with semantic similarity
67+
EvaluationReport<String> semanticReport = scorer.evaluate(
68+
samples,
69+
params -> bot.chat(params.get(0)),
70+
new SemanticSimilarityStrategy(0.85));
71+
72+
// Evaluate with AI judge
73+
EvaluationReport<String> aiJudgeReport = scorer.evaluate(
74+
samples,
75+
params -> bot.chat(params.get(0)),
76+
new AiJudgeStrategy(model));
77+
78+
System.out.printf("Semantic Similarity Score: %.2f%%%n", semanticReport.score());
79+
System.out.printf("AI Judge Score: %.2f%%%n", aiJudgeReport.score());
80+
81+
// Both should have reasonable scores
82+
assertThat(semanticReport).hasScoreGreaterThan(60.0);
83+
assertThat(aiJudgeReport).hasScoreGreaterThan(60.0);
84+
}
85+
86+
@Test
87+
void evaluateWithBothStrategies(
88+
Scorer scorer,
89+
@SampleLocation("src/test/resources/customer-support-samples.yaml") Samples<String> samples) {
90+
91+
// Apply both strategies - sample must pass both to be considered successful
92+
EvaluationReport<String> report = scorer.evaluate(
93+
samples,
94+
params -> bot.chat(params.get(0)),
95+
new SemanticSimilarityStrategy(0.80),
96+
new AiJudgeStrategy(model));
97+
98+
// Since each sample is evaluated by both strategies,
99+
// we'll have 2x the number of evaluations
100+
assertThat(report)
101+
.hasAtLeastPassedEvaluations(8); // At least half should pass
102+
}
103+
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
package io.quarkiverse.langchain4j.sample.chatbot;
2+
3+
import static io.quarkiverse.langchain4j.testing.evaluation.EvaluationAssertions.assertThat;
4+
5+
import java.util.function.Function;
6+
7+
import jakarta.inject.Inject;
8+
9+
import org.junit.jupiter.api.DisplayNameGeneration;
10+
11+
import dev.langchain4j.model.chat.ChatModel;
12+
import io.quarkiverse.langchain4j.evaluation.junit5.Evaluate;
13+
import io.quarkiverse.langchain4j.evaluation.junit5.EvaluationDisplayNameGenerator;
14+
import io.quarkiverse.langchain4j.evaluation.junit5.EvaluationFunction;
15+
import io.quarkiverse.langchain4j.evaluation.junit5.EvaluationTest;
16+
import io.quarkiverse.langchain4j.evaluation.junit5.SampleLocation;
17+
import io.quarkiverse.langchain4j.evaluation.junit5.StrategyTest;
18+
import io.quarkiverse.langchain4j.testing.evaluation.EvaluationStrategy;
19+
import io.quarkiverse.langchain4j.testing.evaluation.Parameters;
20+
import io.quarkiverse.langchain4j.testing.evaluation.Samples;
21+
import io.quarkiverse.langchain4j.testing.evaluation.Scorer;
22+
import io.quarkiverse.langchain4j.testing.evaluation.judge.AiJudgeStrategy;
23+
import io.quarkiverse.langchain4j.testing.evaluation.similarity.SemanticSimilarityStrategy;
24+
import io.quarkus.test.junit.QuarkusTest;
25+
26+
/**
27+
* Demonstrates declarative evaluation testing using annotations.
28+
* This approach is more concise and reduces boilerplate code.
29+
*/
30+
@QuarkusTest
31+
@Evaluate
32+
@DisplayNameGeneration(EvaluationDisplayNameGenerator.class)
33+
public class DeclarativeEvaluationTest {
34+
35+
@Inject
36+
CustomerSupportBot bot;
37+
38+
@Inject // Reuse the chat model for AI judging
39+
ChatModel judgeModel;
40+
41+
/**
42+
* Define a reusable evaluation function.
43+
* This function will be referenced by name in the test annotations.
44+
*/
45+
@EvaluationFunction("chatbot")
46+
public Function<Parameters, String> chatbotFunction() {
47+
return params -> bot.chat(params.get(0));
48+
}
49+
50+
/**
51+
* Declarative test using @EvaluationTest.
52+
* The framework automatically loads samples, evaluates them,
53+
* and asserts the minimum score.
54+
*/
55+
@EvaluationTest(samples = "smoke-tests.yaml", strategy = SemanticSimilarityStrategy.class, function = "chatbot", minScore = 70.0)
56+
void smokeTestsWithSemanticSimilarity() {
57+
// Test body can be empty - evaluation happens automatically
58+
// The test will fail if score is below 70%
59+
}
60+
61+
/**
62+
* Test using multiple strategies with @StrategyTest.
63+
* The test runs once for each strategy.
64+
*/
65+
@StrategyTest(strategies = {
66+
SemanticSimilarityStrategy.class,
67+
AiJudgeStrategy.class,
68+
})
69+
void customerSupportWithMultipleStrategies(
70+
@SampleLocation("src/test/resources/smoke-tests.yaml") Samples<String> samples,
71+
EvaluationStrategy<String> strategy,
72+
Scorer scorer) {
73+
// This test method will execute twice:
74+
// 1. Once with SemanticSimilarityStrategy
75+
// 2. Once with AiJudgeStrategy
76+
// Each execution appears as a separate test in the results
77+
78+
var report = scorer.evaluate(
79+
samples,
80+
params -> bot.chat(params.get(0)),
81+
strategy);
82+
83+
System.out.printf("Strategy %s - Score: %.2f%%%n",
84+
strategy.getClass().getSimpleName(),
85+
report.score());
86+
87+
assertThat(report).hasScoreGreaterThan(60.0);
88+
}
89+
90+
/**
91+
* Another @EvaluationTest with different configuration.
92+
*/
93+
@EvaluationTest(samples = "customer-support-samples.yaml", strategy = AiJudgeStrategy.class, function = "chatbot", minScore = 85.0)
94+
void criticalCustomerSupportEvaluation() {
95+
// Higher threshold for critical evaluations
96+
}
97+
}

0 commit comments

Comments
 (0)