browserbase · seanmcguire12 · Dec 26, 2024 · Dec 26, 2024 · Dec 26, 2024 · Dec 26, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -8,13 +8,10 @@ on:
     types:
       - opened
       - synchronize
-  schedule:
-    - cron: "0 */6 * * *" # every 6 hours
 
 env:
   EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
-  EXPERIMENTAL_EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest,o1-mini,o1-preview"
-  EVAL_CATEGORIES: "observe,act,combination,extract,experimental,text_extract"
+  EVAL_CATEGORIES: "observe,act,combination,extract,text_extract"
 
 concurrency:
   group: ${{ github.ref }}
@@ -337,47 +334,3 @@ jobs:
             echo "Eval summary not found for combination category. Failing CI."
             exit 1
           fi
-
-  run-experimental-evals:
-    runs-on: ubuntu-latest
-    timeout-minutes: 120
-    needs: [run-text-extract-evals]
-    if: github.ref == 'refs/heads/main'
-    env:
-      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
-      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
-      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
-      HEADLESS: true
-      EVAL_ENV: browserbase
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "20"
-
-      - name: Install dependencies
-        run: npm install --no-frozen-lockfile
-
-      - name: Install Playwright browsers
-        run: npm exec playwright install --with-deps
-
-      - name: Run Experimental Evals
-        run: npm run evals category experimental
-
-      - name: Log Experimental Evals Performance
-        run: |
-          experimentName=$(jq -r '.experimentName' eval-summary.json)
-          echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
-          if [ -f eval-summary.json ]; then
-            experimental_score=$(jq '.categories.experimental' eval-summary.json)
-            echo "Experimental category score: $experimental_score%"
-          else
-            echo "Eval summary not found for experimental category. Failing CI."
-            exit 1
-          fi
diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts
@@ -13,7 +13,7 @@
 import fs from "fs";
 import path from "path";
 import { AvailableModel, AvailableModelSchema } from "../types/model";
-import { filterByCategory, filterByEvalName } from "./args";
+import { filterByEvalName } from "./args";
 
 // The configuration file `evals.config.json` contains a list of tasks and their associated categories.
 const configPath = path.join(__dirname, "evals.config.json");
@@ -51,26 +51,16 @@ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
   ? process.env.EVAL_MODELS.split(",")
   : ["gpt-4o", "claude-3-5-sonnet-latest"];
 
-const EXPERIMENTAL_EVAL_MODELS = process.env.EXPERIMENTAL_EVAL_MODELS
-  ? process.env.EXPERIMENTAL_EVAL_MODELS.split(",")
-  : ["o1-mini", "o1-preview"];
-
 /**
  * getModelList:
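- * Returns a list of models to be used for the given category.
- * If category is "experimental", it merges DEFAULT_EVAL_MODELS and EXPERIMENTAL_EVAL_MODELS.
- * Otherwise, returns DEFAULT_EVAL_MODELS.
+ * Returns the list of models to run evals against.
+ * This is always DEFAULT_EVAL_MODELS: the EVAL_MODELS environment variable split
+ * on commas, or ["gpt-4o", "claude-3-5-sonnet-latest"] when it is unset or empty.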
  */
-const getModelList = (category: string | null): string[] => {
-  if (category === "experimental") {
-    // Remove duplicates by creating a Set and converting back to array.
-    return Array.from(
-      new Set([...DEFAULT_EVAL_MODELS, ...EXPERIMENTAL_EVAL_MODELS]),
-    );
-  }
+const getModelList = (): string[] => {
   return DEFAULT_EVAL_MODELS;
 };
-const MODELS: AvailableModel[] = getModelList(filterByCategory).map((model) => {
+const MODELS: AvailableModel[] = getModelList().map((model) => {
   if (!AvailableModelSchema.safeParse(model).success) {
     throw new Error(`Model ${model} is not a supported model`);
   }