Skip to content

Reduce eval spend #337

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 1 addition & 48 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,10 @@ on:
types:
- opened
- synchronize
schedule:
- cron: "0 */6 * * *" # every 6 hours

env:
EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
EXPERIMENTAL_EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest,o1-mini,o1-preview"
EVAL_CATEGORIES: "observe,act,combination,extract,experimental,text_extract"
EVAL_CATEGORIES: "observe,act,combination,extract,text_extract"

concurrency:
group: ${{ github.ref }}
Expand Down Expand Up @@ -337,47 +334,3 @@ jobs:
echo "Eval summary not found for combination category. Failing CI."
exit 1
fi

run-experimental-evals:
runs-on: ubuntu-latest
timeout-minutes: 120
needs: [run-text-extract-evals]
if: github.ref == 'refs/heads/main'
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
HEADLESS: true
EVAL_ENV: browserbase

steps:
- name: Check out repository code
uses: actions/checkout@v4

- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: "20"

- name: Install dependencies
run: npm install --no-frozen-lockfile

- name: Install Playwright browsers
run: npm exec playwright install --with-deps

- name: Run Experimental Evals
run: npm run evals category experimental

- name: Log Experimental Evals Performance
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
experimental_score=$(jq '.categories.experimental' eval-summary.json)
echo "Experimental category score: $experimental_score%"
else
echo "Eval summary not found for experimental category. Failing CI."
exit 1
fi
16 changes: 3 additions & 13 deletions evals/taskConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import fs from "fs";
import path from "path";
import { AvailableModel, AvailableModelSchema } from "../types/model";
import { filterByCategory, filterByEvalName } from "./args";
import { filterByEvalName } from "./args";

// The configuration file `evals.config.json` contains a list of tasks and their associated categories.
const configPath = path.join(__dirname, "evals.config.json");
Expand Down Expand Up @@ -51,26 +51,16 @@ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
? process.env.EVAL_MODELS.split(",")
: ["gpt-4o", "claude-3-5-sonnet-latest"];

const EXPERIMENTAL_EVAL_MODELS = process.env.EXPERIMENTAL_EVAL_MODELS
? process.env.EXPERIMENTAL_EVAL_MODELS.split(",")
: ["o1-mini", "o1-preview"];

/**
* getModelList:
* Returns a list of models to be used for the given category.
* If category is "experimental", it merges DEFAULT_EVAL_MODELS and EXPERIMENTAL_EVAL_MODELS.
* Otherwise, returns DEFAULT_EVAL_MODELS.
*/
const getModelList = (category: string | null): string[] => {
if (category === "experimental") {
// Remove duplicates by creating a Set and converting back to array.
return Array.from(
new Set([...DEFAULT_EVAL_MODELS, ...EXPERIMENTAL_EVAL_MODELS]),
);
}
const getModelList = (): string[] => {
return DEFAULT_EVAL_MODELS;
};
const MODELS: AvailableModel[] = getModelList(filterByCategory).map((model) => {
const MODELS: AvailableModel[] = getModelList().map((model) => {
if (!AvailableModelSchema.safeParse(model).success) {
throw new Error(`Model ${model} is not a supported model`);
}
Expand Down
Loading