Add public API contract tests #3417

Workflow file for this run
name: Evals

on:
  pull_request:
    types:
      - opened
      - synchronize
      - labeled
      - unlabeled
    paths-ignore:
      - "packages/docs/**"

env:
  EVAL_MODELS: "openai/gpt-4.1,google/gemini-2.0-flash,anthropic/claude-haiku-4-5"
  EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract,agent"
  EVAL_MAX_CONCURRENCY: 25
  EVAL_TRIAL_COUNT: 3

concurrency:
  group: ${{ github.ref }}
  cancel-in-progress: true
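
# The EVAL_* values above are workflow-wide defaults, presumably consumed by
# the evals runner in packages/evals (assumption: this workflow does not show
# the runner itself). Individual jobs may override them; run-agent-evals below
# lowers EVAL_TRIAL_COUNT and EVAL_MAX_CONCURRENCY for its long-running tasks.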
jobs:
  determine-changes:
    runs-on: ubuntu-latest
    outputs:
      core: ${{ steps.filter.outputs.core }}
      evals: ${{ steps.filter.outputs.evals }}
      docs-only: ${{ steps.filter.outputs.docs-only }}
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - uses: dorny/paths-filter@v3
        id: filter
        with:
          filters: |
            core:
              - '.github/workflows/ci.yml'
              - 'packages/core/**'
              - 'package.json'
              - 'pnpm-lock.yaml'
              - 'turbo.json'
            evals:
              - 'packages/evals/**'
              - 'package.json'
              - 'pnpm-lock.yaml'
            docs-only:
              - '**/*.md'
              - 'examples/**'
              - '!packages/**/*.md'
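
  # Sketch of how these filters combine downstream: a PR touching only
  # markdown or examples/** matches docs-only and neither core nor evals, so
  # determine-evals below skips every eval job; a change under packages/core/**
  # flips core to 'true', which gates run-lint and run-build.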
  determine-evals:
    needs: [determine-changes]
    runs-on: ubuntu-latest
    outputs:
      skip-all-evals: ${{ steps.check-labels.outputs.skip-all-evals }}
      run-regression: ${{ steps.check-labels.outputs.run-regression }}
      run-combination: ${{ steps.check-labels.outputs.run-combination }}
      run-extract: ${{ steps.check-labels.outputs.run-extract }}
      run-act: ${{ steps.check-labels.outputs.run-act }}
      run-observe: ${{ steps.check-labels.outputs.run-observe }}
      run-targeted-extract: ${{ steps.check-labels.outputs.run-targeted-extract }}
      run-agent: ${{ steps.check-labels.outputs.run-agent }}
    steps:
      - id: check-labels
        run: |
          # Check if skip-evals label is present
          if [[ "${{ contains(github.event.pull_request.labels.*.name, 'skip-evals') }}" == "true" ]]; then
            echo "skip-evals label found - skipping all evals"
            echo "skip-all-evals=true" >> $GITHUB_OUTPUT
            echo "run-regression=false" >> $GITHUB_OUTPUT
            echo "run-combination=false" >> $GITHUB_OUTPUT
            echo "run-extract=false" >> $GITHUB_OUTPUT
            echo "run-act=false" >> $GITHUB_OUTPUT
            echo "run-observe=false" >> $GITHUB_OUTPUT
            echo "run-targeted-extract=false" >> $GITHUB_OUTPUT
            echo "run-agent=false" >> $GITHUB_OUTPUT
            exit 0
          fi
          # Skip evals if only docs/examples changed (and not on main)
          if [[ "${{ needs.determine-changes.outputs.docs-only }}" == "true" && "${{ needs.determine-changes.outputs.core }}" == "false" && "${{ needs.determine-changes.outputs.evals }}" == "false" && "${{ github.ref }}" != "refs/heads/main" ]]; then
            echo "Only docs/examples changed - skipping evals"
            echo "skip-all-evals=true" >> $GITHUB_OUTPUT
            echo "run-regression=false" >> $GITHUB_OUTPUT
            echo "run-combination=false" >> $GITHUB_OUTPUT
            echo "run-extract=false" >> $GITHUB_OUTPUT
            echo "run-act=false" >> $GITHUB_OUTPUT
            echo "run-observe=false" >> $GITHUB_OUTPUT
            echo "run-targeted-extract=false" >> $GITHUB_OUTPUT
            echo "run-agent=false" >> $GITHUB_OUTPUT
            exit 0
          fi
          # Default to running all tests on main branch
          if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then
            echo "Running all tests for main branch"
            echo "skip-all-evals=false" >> $GITHUB_OUTPUT
            echo "run-regression=true" >> $GITHUB_OUTPUT
            echo "run-combination=true" >> $GITHUB_OUTPUT
            echo "run-extract=true" >> $GITHUB_OUTPUT
            echo "run-act=true" >> $GITHUB_OUTPUT
            echo "run-observe=true" >> $GITHUB_OUTPUT
            echo "run-targeted-extract=true" >> $GITHUB_OUTPUT
            echo "run-agent=true" >> $GITHUB_OUTPUT
            exit 0
          fi
          # Check for skip-regression-evals label
          if [[ "${{ contains(github.event.pull_request.labels.*.name, 'skip-regression-evals') }}" == "true" ]]; then
            echo "skip-regression-evals label found - regression evals will be skipped"
            echo "run-regression=false" >> $GITHUB_OUTPUT
          else
            echo "Regression evals will run by default"
            echo "run-regression=true" >> $GITHUB_OUTPUT
          fi
          # Check for specific labels
          echo "skip-all-evals=false" >> $GITHUB_OUTPUT
          echo "run-combination=${{ contains(github.event.pull_request.labels.*.name, 'combination') }}" >> $GITHUB_OUTPUT
          echo "run-extract=${{ contains(github.event.pull_request.labels.*.name, 'extract') }}" >> $GITHUB_OUTPUT
          echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
          echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
          echo "run-targeted-extract=${{ contains(github.event.pull_request.labels.*.name, 'targeted-extract') }}" >> $GITHUB_OUTPUT
          echo "run-agent=${{ contains(github.event.pull_request.labels.*.name, 'agent') }}" >> $GITHUB_OUTPUT
  run-lint:
    needs: [determine-changes]
    if: needs.determine-changes.outputs.core == 'true' || needs.determine-changes.outputs.evals == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Setup pnpm
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Run Lint
        run: pnpm run lint
  run-build:
    needs: [determine-changes]
    if: needs.determine-changes.outputs.core == 'true' || needs.determine-changes.outputs.evals == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Setup pnpm
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Run Build
        run: pnpm run build
      - name: Upload build artifacts
        uses: actions/upload-artifact@v4
        with:
          name: build-artifacts
          path: |
            packages/*/dist/
            packages/*/lib/
          retention-days: 1
      - name: Run Vitest
        run: pnpm --filter @browserbasehq/stagehand run test:vitest
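
  # The dist/ and lib/ outputs uploaded above are re-downloaded by each eval
  # job below via actions/download-artifact, so every eval runs against the
  # exact build produced here; the two e2e jobs, by contrast, rebuild from
  # source with `pnpm run build`.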
  run-e2e-local-tests:
    needs: [run-lint, run-build]
    runs-on: ubuntu-latest
    timeout-minutes: 50
    env:
      HEADLESS: true
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Setup pnpm
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Build Stagehand
        run: pnpm run build
      - name: Run local E2E Tests (Deterministic Playwright)
        run: pnpm run e2e:local
  run-e2e-bb-tests:
    needs: [run-lint, run-build]
    runs-on: ubuntu-latest
    timeout-minutes: 50
    if: >
      github.event_name == 'push' ||
      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository)
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Setup pnpm
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Build Stagehand
        run: pnpm run build
      - name: Run E2E Tests (browserbase)
        run: pnpm run e2e:bb
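
  # Each eval job below parses eval-summary.json with jq. The shape assumed
  # here is inferred from those queries (the evals runner defines the actual
  # format), roughly:
  #   { "experimentName": "...", "categories": { "regression": 92.5, ... } }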
  run-regression-evals:
    needs: [run-e2e-bb-tests, run-e2e-local-tests, run-build, determine-evals]
    if: needs.determine-evals.outputs.skip-all-evals != 'true' && needs.determine-evals.outputs.run-regression == 'true'
    runs-on: ubuntu-latest
    timeout-minutes: 9
    outputs:
      regression_score: ${{ steps.set-regression-score.outputs.regression_score }}
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Setup pnpm
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run Regression Evals
        run: pnpm run evals category regression trials=2 concurrency=20 env=BROWSERBASE
      - name: Log Regression Evals Performance
        id: set-regression-score
        run: |
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            regression_score=$(jq '.categories.regression' eval-summary.json)
            echo "regression_score=$regression_score" >> $GITHUB_OUTPUT
            echo "Regression category score: $regression_score%"
            if (( $(echo "$regression_score < 90" | bc -l) )); then
              echo "Regression category score is below 90%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for regression category. Failing CI."
            exit 1
          fi
  run-combination-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 40
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'combination' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-combination }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for COMBINATION. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-combination == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-combination == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-combination == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-combination == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run Combination Evals
        if: needs.determine-evals.outputs.run-combination == 'true'
        run: pnpm run evals category combination
      - name: Log Combination Evals Performance
        if: needs.determine-evals.outputs.run-combination == 'true'
        run: |
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            combination_score=$(jq '.categories.combination' eval-summary.json)
            echo "Combination category score: $combination_score%"
            exit 0
          else
            echo "Eval summary not found for combination category. Failing CI."
            exit 1
          fi
  run-act-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 25
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'act' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-act }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for ACT. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-act == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-act == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-act == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-act == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run Act Evals
        if: needs.determine-evals.outputs.run-act == 'true'
        run: pnpm run evals category act
      - name: Log Act Evals Performance
        if: needs.determine-evals.outputs.run-act == 'true'
        run: |
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            act_score=$(jq '.categories.act' eval-summary.json)
            echo "Act category score: $act_score%"
            if (( $(echo "$act_score < 80" | bc -l) )); then
              echo "Act category score is below 80%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for act category. Failing CI."
            exit 1
          fi
  run-extract-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 50
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'extract' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-extract }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for EXTRACT. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-extract == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-extract == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-extract == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      # 1. Run extract category with domExtract
      - name: Run Extract Evals (domExtract)
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: pnpm run evals category extract -- --extract-method=domExtract
      - name: Save Extract Dom Results
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: mv eval-summary.json eval-summary-extract-dom.json
      # 2. Log domExtract performance and enforce the threshold
      - name: Log Extract Evals Performance (domExtract)
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: |
          experimentNameDom=$(jq -r '.experimentName' eval-summary-extract-dom.json)
          dom_score=$(jq '.categories.extract' eval-summary-extract-dom.json)
          echo "DomExtract Extract category score: $dom_score%"
          echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}"
          # Fail CI if the domExtract score is below 80%
          if (( $(echo "$dom_score < 80" | bc -l) )); then
            echo "DomExtract extract category score is below 80%. Failing CI."
            exit 1
          fi
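
  # The arguments after `--` above (here --extract-method=domExtract) are
  # forwarded past pnpm to the evals script itself; the flag name comes from
  # this workflow, and the runner is assumed to parse it. No existence check
  # is needed in the log step because the preceding `mv` fails the job if
  # eval-summary.json was never produced.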
  run-observe-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'observe' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-observe }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for OBSERVE. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-observe == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-observe == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-observe == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-observe == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run Observe Evals
        if: needs.determine-evals.outputs.run-observe == 'true'
        run: pnpm run evals category observe
      - name: Log Observe Evals Performance
        if: needs.determine-evals.outputs.run-observe == 'true'
        run: |
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            observe_score=$(jq '.categories.observe' eval-summary.json)
            echo "Observe category score: $observe_score%"
            if (( $(echo "$observe_score < 80" | bc -l) )); then
              echo "Observe category score is below 80%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for observe category. Failing CI."
            exit 1
          fi
  run-targeted-extract-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'targeted-extract' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-targeted-extract }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for TARGETED-EXTRACT. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run targeted extract Evals
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        run: pnpm run evals category targeted_extract
      - name: Log targeted extract Evals Performance
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        run: |
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            targeted_extract_score=$(jq '.categories.targeted_extract' eval-summary.json)
            echo "Targeted extract category score: $targeted_extract_score%"
            if (( $(echo "$targeted_extract_score < 80" | bc -l) )); then
              echo "Targeted extract score is below 80%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for targeted_extract category. Failing CI."
            exit 1
          fi
  run-agent-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 90 # Agent evals can be long-running
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
      # Use agent models for agent evals in CI
      EVAL_AGENT_MODELS: "computer-use-preview-2025-03-11,claude-3-7-sonnet-latest"
      EVAL_TRIAL_COUNT: 2 # Reduce trials for agent evals
      EVAL_MAX_CONCURRENCY: 10 # Lower concurrency for agent evals
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'agent' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-agent }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for AGENT. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-agent == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-agent == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-agent == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-agent == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run Agent Evals
        if: needs.determine-evals.outputs.run-agent == 'true'
        run: pnpm run evals category agent
      - name: Log Agent Evals Performance
        if: needs.determine-evals.outputs.run-agent == 'true'
        run: |
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            agent_score=$(jq '.categories.agent' eval-summary.json)
            echo "Agent category score: $agent_score%"
            # Lower threshold for agent evals since they're complex
            if (( $(echo "$agent_score < 50" | bc -l) )); then
              echo "Agent category score is below 50%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for agent category. Failing CI."
            exit 1
          fi