Add public API contract tests #3417
name: Evals

on:
  pull_request:
    types:
      - opened
      - synchronize
      - labeled
      - unlabeled
    paths-ignore:
      - "packages/docs/**"

env:
  EVAL_MODELS: "openai/gpt-4.1,google/gemini-2.0-flash,anthropic/claude-haiku-4-5"
  EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract,agent"
  EVAL_MAX_CONCURRENCY: 25
  EVAL_TRIAL_COUNT: 3

concurrency:
  group: ${{ github.ref }}
  cancel-in-progress: true
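# Note: EVAL_MODELS, EVAL_CATEGORIES, EVAL_MAX_CONCURRENCY, and EVAL_TRIAL_COUNT are
# assumed to be read by the evals runner under packages/evals to choose models,
# categories, parallelism, and trial counts; this file only sets them. The concurrency
# group cancels an in-progress run on the same ref when a newer commit is pushed.
# A hedged local sketch (assuming the runner honors these variables):
#   EVAL_TRIAL_COUNT=1 EVAL_MAX_CONCURRENCY=5 pnpm run evals category act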
jobs:
  determine-changes:
    runs-on: ubuntu-latest
    outputs:
      core: ${{ steps.filter.outputs.core }}
      evals: ${{ steps.filter.outputs.evals }}
      docs-only: ${{ steps.filter.outputs.docs-only }}
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - uses: dorny/paths-filter@v3
        id: filter
        with:
          filters: |
            core:
              - '.github/workflows/ci.yml'
              - 'packages/core/**'
              - 'package.json'
              - 'pnpm-lock.yaml'
              - 'turbo.json'
            evals:
              - 'packages/evals/**'
              - 'package.json'
              - 'pnpm-lock.yaml'
            docs-only:
              - '**/*.md'
              - 'examples/**'
              - '!packages/**/*.md'
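  # Note: the core / evals / docs-only outputs above gate the rest of the workflow.
  # As a sketch of the intent (inferred from the filters, not asserted elsewhere in
  # this file): a PR touching only markdown or examples/** yields docs-only=true with
  # core=false and evals=false, letting determine-evals skip every eval job, while a
  # change under packages/core/** sets core=true so the lint/build/e2e/eval chain runs.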
  determine-evals:
    needs: [determine-changes]
    runs-on: ubuntu-latest
    outputs:
      skip-all-evals: ${{ steps.check-labels.outputs.skip-all-evals }}
      run-regression: ${{ steps.check-labels.outputs.run-regression }}
      run-combination: ${{ steps.check-labels.outputs.run-combination }}
      run-extract: ${{ steps.check-labels.outputs.run-extract }}
      run-act: ${{ steps.check-labels.outputs.run-act }}
      run-observe: ${{ steps.check-labels.outputs.run-observe }}
      run-targeted-extract: ${{ steps.check-labels.outputs.run-targeted-extract }}
      run-agent: ${{ steps.check-labels.outputs.run-agent }}
    steps:
      - id: check-labels
        run: |
          # Check if the skip-evals label is present
          if [[ "${{ contains(github.event.pull_request.labels.*.name, 'skip-evals') }}" == "true" ]]; then
            echo "skip-evals label found - skipping all evals"
            echo "skip-all-evals=true" >> $GITHUB_OUTPUT
            echo "run-regression=false" >> $GITHUB_OUTPUT
            echo "run-combination=false" >> $GITHUB_OUTPUT
            echo "run-extract=false" >> $GITHUB_OUTPUT
            echo "run-act=false" >> $GITHUB_OUTPUT
            echo "run-observe=false" >> $GITHUB_OUTPUT
            echo "run-targeted-extract=false" >> $GITHUB_OUTPUT
            echo "run-agent=false" >> $GITHUB_OUTPUT
            exit 0
          fi

          # Skip evals if only docs/examples changed (and not on main)
          if [[ "${{ needs.determine-changes.outputs.docs-only }}" == "true" && "${{ needs.determine-changes.outputs.core }}" == "false" && "${{ needs.determine-changes.outputs.evals }}" == "false" && "${{ github.ref }}" != "refs/heads/main" ]]; then
            echo "Only docs/examples changed - skipping evals"
            echo "skip-all-evals=true" >> $GITHUB_OUTPUT
            echo "run-regression=false" >> $GITHUB_OUTPUT
            echo "run-combination=false" >> $GITHUB_OUTPUT
            echo "run-extract=false" >> $GITHUB_OUTPUT
            echo "run-act=false" >> $GITHUB_OUTPUT
            echo "run-observe=false" >> $GITHUB_OUTPUT
            echo "run-targeted-extract=false" >> $GITHUB_OUTPUT
            echo "run-agent=false" >> $GITHUB_OUTPUT
            exit 0
          fi

          # Default to running all tests on the main branch
          if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then
            echo "Running all tests for main branch"
            echo "skip-all-evals=false" >> $GITHUB_OUTPUT
            echo "run-regression=true" >> $GITHUB_OUTPUT
            echo "run-combination=true" >> $GITHUB_OUTPUT
            echo "run-extract=true" >> $GITHUB_OUTPUT
            echo "run-act=true" >> $GITHUB_OUTPUT
            echo "run-observe=true" >> $GITHUB_OUTPUT
            echo "run-targeted-extract=true" >> $GITHUB_OUTPUT
            echo "run-agent=true" >> $GITHUB_OUTPUT
            exit 0
          fi

          # Check for the skip-regression-evals label
          if [[ "${{ contains(github.event.pull_request.labels.*.name, 'skip-regression-evals') }}" == "true" ]]; then
            echo "skip-regression-evals label found - regression evals will be skipped"
            echo "run-regression=false" >> $GITHUB_OUTPUT
          else
            echo "Regression evals will run by default"
            echo "run-regression=true" >> $GITHUB_OUTPUT
          fi

          # Check for category-specific labels
          echo "skip-all-evals=false" >> $GITHUB_OUTPUT
          echo "run-combination=${{ contains(github.event.pull_request.labels.*.name, 'combination') }}" >> $GITHUB_OUTPUT
          echo "run-extract=${{ contains(github.event.pull_request.labels.*.name, 'extract') }}" >> $GITHUB_OUTPUT
          echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
          echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
          echo "run-targeted-extract=${{ contains(github.event.pull_request.labels.*.name, 'targeted-extract') }}" >> $GITHUB_OUTPUT
          echo "run-agent=${{ contains(github.event.pull_request.labels.*.name, 'agent') }}" >> $GITHUB_OUTPUT
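  # Note: on non-main refs the six category outputs mirror the PR labels directly, so,
  # for example, a PR labeled 'act' and 'observe' produces run-act=true and
  # run-observe=true while combination, extract, targeted-extract, and agent stay
  # false; regression still runs unless the 'skip-regression-evals' label is applied.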
  run-lint:
    needs: [determine-changes]
    if: needs.determine-changes.outputs.core == 'true' || needs.determine-changes.outputs.evals == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Setup pnpm
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Run Lint
        run: pnpm run lint
  run-build:
    needs: [determine-changes]
    if: needs.determine-changes.outputs.core == 'true' || needs.determine-changes.outputs.evals == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Setup pnpm
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Run Build
        run: pnpm run build
      - name: Upload build artifacts
        uses: actions/upload-artifact@v4
        with:
          name: build-artifacts
          path: |
            packages/*/dist/
            packages/*/lib/
          retention-days: 1
      - name: Run Vitest
        run: pnpm --filter @browserbasehq/stagehand run test:vitest
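  # Note: the build-artifacts bundle (packages/*/dist and packages/*/lib, retained for
  # 1 day) is what the eval jobs below download with actions/download-artifact@v4
  # instead of rebuilding; with no explicit `path:` on download, the files are restored
  # into the workspace root, which those jobs appear to rely on.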
  run-e2e-local-tests:
    needs: [run-lint, run-build]
    runs-on: ubuntu-latest
    timeout-minutes: 50
    env:
      HEADLESS: true
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Setup pnpm
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Build Stagehand
        run: pnpm run build
      - name: Run local E2E Tests (Deterministic Playwright)
        run: pnpm run e2e:local
  run-e2e-bb-tests:
    needs: [run-lint, run-build]
    runs-on: ubuntu-latest
    timeout-minutes: 50
    if: >
      github.event_name == 'push' ||
      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository)
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Setup pnpm
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Build Stagehand
        run: pnpm run build
      - name: Run E2E Tests (browserbase)
        run: pnpm run e2e:bb
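  # Note: the `if:` guard above limits the Browserbase E2E job to push events and to
  # pull requests opened from branches in this repository; PRs from forks are skipped,
  # presumably because the secrets referenced in `env:` are not exposed to fork runs.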
  run-regression-evals:
    needs: [run-e2e-bb-tests, run-e2e-local-tests, run-build, determine-evals]
    if: needs.determine-evals.outputs.skip-all-evals != 'true' && needs.determine-evals.outputs.run-regression == 'true'
    runs-on: ubuntu-latest
    timeout-minutes: 9
    outputs:
      regression_score: ${{ steps.set-regression-score.outputs.regression_score }}
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Setup pnpm
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run Regression Evals
        run: pnpm run evals category regression trials=2 concurrency=20 env=BROWSERBASE
      - name: Log Regression Evals Performance
        id: set-regression-score
        run: |
          # Check for the summary before reading it, so a missing file produces the
          # intended error message instead of a jq failure.
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            regression_score=$(jq '.categories.regression' eval-summary.json)
            echo "Regression category score: $regression_score%"
            # Expose the score as a step output; the job-level regression_score output reads it.
            echo "regression_score=$regression_score" >> $GITHUB_OUTPUT
            if (( $(echo "$regression_score < 90" | bc -l) )); then
              echo "Regression category score is below 90%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for regression category. Failing CI."
            exit 1
          fi
  run-combination-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 40
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'combination' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-combination }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for COMBINATION. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-combination == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-combination == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-combination == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-combination == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run Combination Evals
        if: needs.determine-evals.outputs.run-combination == 'true'
        run: pnpm run evals category combination
      - name: Log Combination Evals Performance
        if: needs.determine-evals.outputs.run-combination == 'true'
        run: |
          # Check for the summary before reading it, so a missing file produces the
          # intended error message instead of a jq failure.
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            combination_score=$(jq '.categories.combination' eval-summary.json)
            echo "Combination category score: $combination_score%"
            exit 0
          else
            echo "Eval summary not found for combination category. Failing CI."
            exit 1
          fi
  run-act-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 25
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'act' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-act }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for ACT. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-act == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-act == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-act == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-act == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run Act Evals
        if: needs.determine-evals.outputs.run-act == 'true'
        run: pnpm run evals category act
      - name: Log Act Evals Performance
        if: needs.determine-evals.outputs.run-act == 'true'
        run: |
          # Check for the summary before reading it, so a missing file produces the
          # intended error message instead of a jq failure.
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            act_score=$(jq '.categories.act' eval-summary.json)
            echo "Act category score: $act_score%"
            if (( $(echo "$act_score < 80" | bc -l) )); then
              echo "Act category score is below 80%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for act category. Failing CI."
            exit 1
          fi
  run-extract-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 50
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'extract' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-extract }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for EXTRACT. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-extract == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-extract == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-extract == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      # 1. Run extract category with domExtract
      - name: Run Extract Evals (domExtract)
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: pnpm run evals category extract -- --extract-method=domExtract
      - name: Save Extract Dom Results
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: mv eval-summary.json eval-summary-extract-dom.json
      # 2. Log and Compare Extract Evals Performance
      - name: Log and Compare Extract Evals Performance
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: |
          experimentNameDom=$(jq -r '.experimentName' eval-summary-extract-dom.json)
          dom_score=$(jq '.categories.extract' eval-summary-extract-dom.json)
          echo "DomExtract Extract category score: $dom_score%"
          echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}"
          # If domExtract < 80%, fail CI
          if (( $(echo "$dom_score < 80" | bc -l) )); then
            echo "DomExtract extract category score is below 80%. Failing CI."
            exit 1
          fi
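  # Note: eval-summary.json is renamed to eval-summary-extract-dom.json so the
  # domExtract run's summary is kept under a method-specific name; the numbered
  # comments above suggest this job was laid out to compare extraction methods,
  # with only the domExtract pass currently wired in.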
  run-observe-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'observe' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-observe }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for OBSERVE. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-observe == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-observe == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-observe == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-observe == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run Observe Evals
        if: needs.determine-evals.outputs.run-observe == 'true'
        run: pnpm run evals category observe
      - name: Log Observe Evals Performance
        if: needs.determine-evals.outputs.run-observe == 'true'
        run: |
          # Check for the summary before reading it, so a missing file produces the
          # intended error message instead of a jq failure.
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            observe_score=$(jq '.categories.observe' eval-summary.json)
            echo "Observe category score: $observe_score%"
            if (( $(echo "$observe_score < 80" | bc -l) )); then
              echo "Observe category score is below 80%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for observe category. Failing CI."
            exit 1
          fi
  run-targeted-extract-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'targeted-extract' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-targeted-extract }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for TARGETED-EXTRACT. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run targeted extract Evals
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        run: pnpm run evals category targeted_extract
      - name: Log targeted extract Evals Performance
        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
        run: |
          # Check for the summary before reading it, so a missing file produces the
          # intended error message instead of a jq failure.
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            targeted_extract_score=$(jq '.categories.targeted_extract' eval-summary.json)
            echo "Targeted extract category score: $targeted_extract_score%"
            if (( $(echo "$targeted_extract_score < 80" | bc -l) )); then
              echo "Targeted extract score is below 80%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for targeted_extract category. Failing CI."
            exit 1
          fi
  run-agent-evals:
    needs: [run-regression-evals, run-build, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 90 # Agent evals can be long-running
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
      # Use agent models for agent evals in CI
      EVAL_AGENT_MODELS: "computer-use-preview-2025-03-11,claude-3-7-sonnet-latest"
      EVAL_TRIAL_COUNT: 2 # Reduce trials for agent evals
      EVAL_MAX_CONCURRENCY: 10 # Lower concurrency for agent evals
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Check for 'agent' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-agent }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for AGENT. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi
      - name: Setup pnpm
        if: needs.determine-evals.outputs.run-agent == 'true'
        uses: pnpm/action-setup@v4
      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-agent == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "pnpm"
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-agent == 'true'
        run: pnpm install --frozen-lockfile
      - name: Download build artifacts
        if: needs.determine-evals.outputs.run-agent == 'true'
        uses: actions/download-artifact@v4
        with:
          name: build-artifacts
      - name: Run Agent Evals
        if: needs.determine-evals.outputs.run-agent == 'true'
        run: pnpm run evals category agent
      - name: Log Agent Evals Performance
        if: needs.determine-evals.outputs.run-agent == 'true'
        run: |
          # Check for the summary before reading it, so a missing file produces the
          # intended error message instead of a jq failure.
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            agent_score=$(jq '.categories.agent' eval-summary.json)
            echo "Agent category score: $agent_score%"
            # Lower threshold for agent evals since they're complex
            if (( $(echo "$agent_score < 50" | bc -l) )); then
              echo "Agent category score is below 50%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for agent category. Failing CI."
            exit 1
          fi