diff --git a/README.md b/README.md index af13bc9..8fc6ab9 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,15 @@ Claude will use the MCP server to: 1. First get an overview of the requested API 2. Then retrieve specific operation details as needed + + +## Running evals + +The evals package loads an MCP client that then runs the server file passed on the command line (`index.js` in the example below), so there is no need to rebuild between tests. You can set environment variables by prefixing the `npx` command. Full documentation can be found in the [mcp-evals docs](https://www.mcpevals.io/docs). + +```bash +OPENAI_API_KEY=your-key npx mcp-eval evals.ts index.js +``` ## Requirements - Node.js >= 16.17.0 diff --git a/evals.ts b/evals.ts new file mode 100644 index 0000000..72898da --- /dev/null +++ b/evals.ts @@ -0,0 +1,32 @@ +//evals.ts + +import { EvalConfig } from 'mcp-evals'; +import { openai } from "@ai-sdk/openai"; +import { grade, EvalFunction } from "mcp-evals"; + +const getApiOverviewEval: EvalFunction = { + name: "getApiOverviewEval", + description: "Evaluates the getApiOverview tool's ability to provide an overview of an OpenAPI specification", + run: async () => { + const result = await grade(openai("gpt-4"), "Given the ID 'petstore', retrieve an overview of the OpenAPI specification."); + return JSON.parse(result); + } +}; + +const getApiOperationEval: EvalFunction = { + name: "getApiOperation Tool Evaluation", + description: "Evaluates the getApiOperation tool", + run: async () => { + const result = await grade(openai("gpt-4"), "Retrieve the operation details for id 'petstore' and operationIdOrRoute 'addPet' from the OpenAPI specification."); + return JSON.parse(result); + } +}; + +const config: EvalConfig = { + model: openai("gpt-4"), + evals: [getApiOverviewEval, getApiOperationEval] +}; + +export default config; + +export const evals = [getApiOverviewEval, getApiOperationEval]; \ No newline at end of file diff --git a/package.json b/package.json index d13f4de..cbe9c7c 100644 --- a/package.json +++ b/package.json @@ -26,6 
+26,7 @@ "@modelcontextprotocol/sdk": "^0.6.0", "chalk": "^5.3.0", "dotenv": "^16.4.5", + "mcp-evals": "^1.0.18", "undici": "^5.28.4" }, "engines": {