generate_dataset_with_all_agents.py

import concurrent.futures
import json
import os
import random

from dotenv import load_dotenv
from tqdm import tqdm

from agents import BaseDebateAgent
from agents import BoNDebateAgent
from agents import DebateAgent
from agents import run_debate
from models import DebateSample
from models import MMLUProDebateSample

random.seed(314)

load_dotenv()

BLIND_TEMPLATE = """Question:

{question}

A) {first_option}
B) {second_option}

Please answer the question. Say "A" or "B" and NOTHING ELSE."""

TEMPLATE = """Question:

{question}

A) {first_option}
B) {second_option}

Below is a debate with one side arguing for A and the other side arguing for B.

{debate_so_far}

Read the debate carefully and then answer the question. Say "A" or "B" and NOTHING ELSE."""

openai_api_key = os.getenv("OPENAI_API_KEY")
assert openai_api_key is not None, "The OPENAI_API_KEY environment variable is not set."

with open("binarised_mmlu_pro.json") as f:
    samples = [MMLUProDebateSample.model_validate(sample) for sample in json.load(f)]

SAMPLES = 500
samples = samples[:SAMPLES]

pbar = tqdm(total=len(samples))


def agent_total_cost(agent: BaseDebateAgent):
    if isinstance(agent, DebateAgent):
        return agent.llm.total_cost()
    elif isinstance(agent, BoNDebateAgent):
        return agent.llm.total_cost() + agent.reward_model.llm.total_cost()
    else:
        return 0.0


def create_debate_sample(
    mmlu_pro_sample: MMLUProDebateSample,
    agents: list[BaseDebateAgent],
    number_of_turns: int = 6,
):
    agent_a = random.choice(agents)
    agent_b = random.choice(agents)
    reverse_labels = random.choice([True, False])
    if reverse_labels:
        first_option = mmlu_pro_sample.incorrect_answer
        second_option = mmlu_pro_sample.correct_answer
        label = "B"
    else:
        first_option = mmlu_pro_sample.correct_answer
        second_option = mmlu_pro_sample.incorrect_answer
        label = "A"
    blind_judge_prompt = BLIND_TEMPLATE.format(
        question=mmlu_pro_sample.question,
        first_option=first_option,
        second_option=second_option,
    )
    full_debate = run_debate(
        question=mmlu_pro_sample.question,
        position=first_option,
        opposing_position=second_option,
        agent=agent_a,
        opponent_agent=agent_b,
        number_of_turns=number_of_turns,
    )
    speakers = [
        "Person advocating for A: ",
        "Person advocating for B: ",
    ] * number_of_turns
    debate_so_far = "".join(
        f"\n\n{speaker}\n{turn}" for speaker, turn in zip(speakers, full_debate)
    )
    judge_prompt = TEMPLATE.format(
        question=mmlu_pro_sample.question,
        first_option=first_option,
        second_option=second_option,
        debate_so_far=debate_so_far,
    )
    pbar.update(1)

    total_cost = sum([agent_total_cost(agent) for agent in agents])
    print(f"Total cost: {total_cost}")

    return DebateSample(
        question=mmlu_pro_sample.question,
        first_answer=first_option,
        second_answer=second_option,
        label=label,
        blind_judge_prompt=blind_judge_prompt,
        judge_prompt=judge_prompt,
    )


AGENTS = [
    DebateAgent.from_model(
        model="gpt-3.5-turbo-0125",
        api_key=openai_api_key,
        temperature=0.0,
    ),
    BoNDebateAgent.from_model(
        model="gpt-3.5-turbo-0125",
        api_key=openai_api_key,
        temperature=0.8,
        best_of=4,
    ),
    DebateAgent.from_model(
        model="gpt-4o-mini-2024-07-18",
        api_key=openai_api_key,
        temperature=0.0,
    ),
    BoNDebateAgent.from_model(
        model="gpt-4o-mini-2024-07-18",
        api_key=openai_api_key,
        temperature=0.8,
        best_of=4,
    ),
    DebateAgent.from_model(
        model="gpt-4o-2024-05-13",
        api_key=openai_api_key,
        temperature=0.0,
    ),
    BoNDebateAgent.from_model(
        model="gpt-4o-2024-05-13",
        api_key=openai_api_key,
        temperature=0.8,
        best_of=4,
    ),
]

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    debate_samples = list(
        executor.map(lambda sample: create_debate_sample(sample, AGENTS), samples)
    )

pbar.close()

with open("debate_sample_six_agents.json", "w") as f:
    json.dump([sample.model_dump() for sample in debate_samples], f)