
Commit bc2969d

epsilondylan and yufeng zhao authored
[Feature] Add support for BBEH dataset (#1925)
* bbeh
* fix_smallbugs_bbeh
* removeprint
* results

Co-authored-by: yufeng zhao <zhaoyufeng@pjlab.org.cn>
1 parent 59e49ae commit bc2969d

File tree

7 files changed: +296, -0 lines changed


dataset-index.yml (+5 lines)
@@ -234,6 +234,11 @@
     category: Reasoning
     paper: https://arxiv.org/pdf/2210.09261
     configpath: opencompass/configs/datasets/bbh
+- bbeh:
+    name: BIG-Bench Extra Hard
+    category: Reasoning
+    paper: https://arxiv.org/abs/2502.19187
+    configpath: opencompass/configs/datasets/bbeh
 - BoolQ:
     name: SuperGLUE / BoolQ
     category: Knowledge
@@ -0,0 +1,26 @@
# BBEH

```bash
python3 run.py --models hf_internlm2_7b --datasets bbeh_gen --debug
python3 run.py --models hf_meta_llama3_8b_instruct --datasets bbeh_gen --debug
```

## Models

| model | score |
|:-----------------------------------------:|------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 10.93 |

### Details

| model | boolean_expressions | disambiguation_qa | geometric_shapes | hyperbaton | movie_recommendation | nycc | shuffled_objects | boardgame_qa |
|:-----------------------------------------:|--------------------:|------------------:|-----------------:|-----------:|---------------------:|-----:|-----------------:|-------------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 14.00 | 33.33 | 13.50 | 1.00 | 28.00 | 11.00 | 10.00 | 18.50 |

| model | buggy_tables | causal_understanding | dyck_languages | linguini | multistep_arithmetic | object_counting | object_properties | sarc_triples |
|:-----------------------------------------:|-------------:|---------------------:|---------------:|---------:|---------------------:|----------------:|------------------:|-------------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 0.00 | 42.50 | 3.50 | 2.00 | 0.00 | 0.00 | 1.00 | 17.00 |

| model | spatial_reasoning | sportqa | temporal_sequence | time_arithmetic | web_of_lies | word_sorting | zebra_puzzles |
|:-----------------------------------------:|------------------:|--------:|------------------:|----------------:|------------:|-------------:|--------------:|
| Meta-Llama-3-8B-Instruct-LMDeploy-API | 4.00 | 5.00 | 2.00 | 3.00 | 7.50 | 2.00 | 3.50 |
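The README runs BBEH from the command line with `--datasets bbeh_gen`. As a hedged sketch, the same datasets can also be pulled into a user config; the module path below is assumed from the `configpath` added to dataset-index.yml and is not shown explicitly in this commit.

```python
# Hedged sketch: import the BBEH dataset configs into a custom OpenCompass
# config instead of passing --datasets bbeh_gen on the CLI.
from mmengine.config import read_base

with read_base():
    # module path assumed from configpath: opencompass/configs/datasets/bbeh
    from opencompass.configs.datasets.bbeh.bbeh_gen import bbeh_datasets

datasets = [*bbeh_datasets]
```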
@@ -0,0 +1,93 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBEHDataset, BBEHEvaluator, bbeh_mcq_postprocess, BBEHEvaluator_mcq

bbeh_reader_cfg = dict(input_columns=['input'], output_column='target')


bbeh_multiple_choice_sets = [
    'bbeh_boolean_expressions',
    'bbeh_disambiguation_qa',
    'bbeh_geometric_shapes',
    'bbeh_hyperbaton',
    'bbeh_movie_recommendation',
    'bbeh_nycc',
    'bbeh_shuffled_objects',
]

bbeh_free_form_sets = [
    'bbeh_boardgame_qa',
    'bbeh_buggy_tables',
    'bbeh_causal_understanding',
    'bbeh_dyck_languages',
    'bbeh_linguini',
    'bbeh_multistep_arithmetic',
    'bbeh_object_counting',
    'bbeh_object_properties',
    'bbeh_sarc_triples',
    'bbeh_spatial_reasoning',
    'bbeh_sportqa',
    'bbeh_temporal_sequence',
    'bbeh_time_arithmetic',
    'bbeh_web_of_lies',
    'bbeh_word_sorting',
    'bbeh_zebra_puzzles',
]

bbeh_datasets = []
for _name in bbeh_multiple_choice_sets:
    bbeh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192))
    bbeh_eval_cfg = dict(
        evaluator=dict(type=BBEHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbeh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbeh_mcq_postprocess))

    bbeh_datasets.append(
        dict(
            type=BBEHDataset,
            path='opencompass/bbeh',
            name=_name,
            abbr=_name,
            reader_cfg=bbeh_reader_cfg,
            infer_cfg=bbeh_infer_cfg.copy(),
            eval_cfg=bbeh_eval_cfg.copy()))

for _name in bbeh_free_form_sets:
    bbeh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192))
    bbeh_eval_cfg = dict(evaluator=dict(type=BBEHEvaluator), pred_role='BOT', pred_postprocessor=dict(type=bbeh_mcq_postprocess), dataset_postprocessor=dict(type=bbeh_mcq_postprocess))

    bbeh_datasets.append(
        dict(
            type=BBEHDataset,
            path='opencompass/bbeh',
            name=_name,
            abbr=_name,
            reader_cfg=bbeh_reader_cfg,
            infer_cfg=bbeh_infer_cfg.copy(),
            eval_cfg=bbeh_eval_cfg.copy()))
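Taken together, the two loops above should produce one dataset entry per BBEH subset. An illustrative sanity check, assuming it is appended to or run against the config above, might look like this:

```python
# Illustrative check only: 7 multiple-choice subsets + 16 free-form subsets
# should yield 23 BBEH dataset configs, all loading from 'opencompass/bbeh'.
assert len(bbeh_multiple_choice_sets) == 7
assert len(bbeh_free_form_sets) == 16
assert len(bbeh_datasets) == 23
assert all(d['path'] == 'opencompass/bbeh' for d in bbeh_datasets)
assert all(d['infer_cfg']['inferencer']['max_out_len'] == 8192 for d in bbeh_datasets)
```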
@@ -0,0 +1,12 @@
bbeh_summary_groups = []

# bbeh
_bbeh = [
    'bbeh_boolean_expressions', 'bbeh_disambiguation_qa', 'bbeh_geometric_shapes', 'bbeh_hyperbaton',
    'bbeh_movie_recommendation', 'bbeh_nycc', 'bbeh_shuffled_objects', 'bbeh_boardgame_qa',
    'bbeh_buggy_tables', 'bbeh_causal_understanding', 'bbeh_dyck_languages', 'bbeh_linguini',
    'bbeh_multistep_arithmetic', 'bbeh_object_counting', 'bbeh_object_properties', 'bbeh_sarc_triples',
    'bbeh_spatial_reasoning', 'bbeh_sportqa', 'bbeh_temporal_sequence', 'bbeh_time_arithmetic',
    'bbeh_web_of_lies', 'bbeh_word_sorting', 'bbeh_zebra_puzzles'
]
bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh})
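The summary group above aggregates the 23 per-subset scores into a single `bbeh` entry. Below is a hedged sketch of hooking it into a summarizer config; the import path of this groups file is an assumption, since the commit does not show it.

```python
# Hedged sketch: wire the BBEH summary group into an OpenCompass summarizer.
# The module path of the groups file is assumed, not confirmed by this commit.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.summarizers.groups.bbeh import bbeh_summary_groups

summarizer = dict(
    dataset_abbrs=['bbeh'],              # report the aggregated BBEH average
    summary_groups=bbeh_summary_groups,  # defined in the groups file above
)
```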

opencompass/datasets/__init__.py (+1 line)
@@ -9,6 +9,7 @@
 from .arc_prize_public_evaluation import * # noqa: F401, F403
 from .ax import * # noqa: F401, F403
 from .babilong import * # noqa: F401, F403
+from .bbeh import * # noqa: F401, F403
 from .bbh import * # noqa: F401, F403
 from .bigcodebench import * # noqa: F401, F403
 from .boolq import * # noqa: F401, F403

opencompass/datasets/bbeh.py (+149 lines)
@@ -0,0 +1,149 @@
import json
import os.path as osp
import re
from os import environ

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
                                  TEXT_POSTPROCESSORS)
from opencompass.utils import get_data_path

from .base import BaseDataset


@LOAD_DATASET.register_module()
class BBEHDataset(BaseDataset):

    @staticmethod
    def load(path: str, name: str):
        path = get_data_path(path)
        if environ.get('DATASET_SOURCE') == 'ModelScope':
            from modelscope import MsDataset
            dataset = MsDataset.load(path, subset_name=name, split='test')
        else:
            with open(osp.join(path, f'{name}/task.json'), 'r') as f:
                data = json.load(f)['examples']
            dataset = Dataset.from_list(data)
        return dataset


@TEXT_POSTPROCESSORS.register_module('bbeh_freeform')
def bbeh_freeform_postprocess(text: str) -> str:
    # Extract answer using specified prefixes
    prefixes = [
        'The answer is: ', 'The answer is ', 'The final answer is: ',
        'The final answer is '
    ]
    answer = text
    for prefix in prefixes:
        if prefix in text:
            answer = text.split(prefix)[-1]
            break

    # Remove formatting markup
    if '\\boxed' in answer:
        answer = re.sub(r'\\boxed{(.*?)}', r'\1', answer)  # latex box
    if '\\text' in answer:
        answer = re.sub(r'\\text(?:tt)?{(.*?)}', r'\1', answer)  # text/texttt
    if '**' in answer:
        answer = re.sub(r'\*\*(.*?)\*\*', r'\1', answer)  # bold

    # Take first line and clean
    if '\n' in answer:
        answer = answer.split('\n')[0].strip()

    return answer.strip().lower()


@TEXT_POSTPROCESSORS.register_module('bbeh_mcq')
def bbeh_mcq_postprocess(text: str) -> str:
    # Extract answer using specified prefixes
    prefixes = [
        'The answer is: ', 'The answer is ', 'The final answer is: ',
        'The final answer is '
    ]
    answer = text
    for prefix in prefixes:
        if prefix in text:
            answer = text.split(prefix)[-1]
            break

    # Remove parentheses if present
    answer = answer.strip('()')

    # Take first line and clean
    if '\n' in answer:
        answer = answer.split('\n')[0].strip()

    return answer.strip().lower()


@ICL_EVALUATORS.register_module()
class BBEHEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different length'
            }

        processed_preds = [bbeh_freeform_postprocess(p) for p in predictions]
        # References are already in correct format
        processed_refs = [r.lower() for r in references]

        details = []
        correct_count = 0

        for pred, ref in zip(processed_preds, processed_refs):
            correct = False

            # Rule 1: Exact match
            if pred == ref:
                correct = True
            # Rule 2: Match after removing quotes/brackets
            elif pred == ref.strip("'\"()[]"):
                correct = True
            # Rule 3: Comma-separated answers match after normalising spaces
            elif ',' in ref:
                norm_pred = re.sub(r'\s*,\s*', ',', pred)
                norm_ref = re.sub(r'\s*,\s*', ',', ref)
                if norm_pred == norm_ref:
                    correct = True

            details.append({'pred': pred, 'answer': ref, 'correct': correct})
            correct_count += int(correct)

        score = (correct_count / len(predictions)) * 100
        return {'score': score, 'details': details}


@ICL_EVALUATORS.register_module()
class BBEHEvaluator_mcq(BaseEvaluator):

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different length'
            }

        processed_preds = [bbeh_mcq_postprocess(p) for p in predictions]
        # References are already in correct format
        processed_refs = [r.lower().strip('()') for r in references]

        details = []
        correct_count = 0

        for pred, ref in zip(processed_preds, processed_refs):
            correct = False

            # Rule 1: Exact match
            if pred == ref:
                correct = True

            details.append({'pred': pred, 'answer': ref, 'correct': correct})
            correct_count += int(correct)

        score = (correct_count / len(predictions)) * 100
        return {'score': score, 'details': details}
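For orientation, here is a small, hedged usage example of the free-form postprocessor and evaluator defined above; it assumes `bbeh_freeform_postprocess` and `BBEHEvaluator` are re-exported via `opencompass.datasets`, as the `__init__.py` change suggests.

```python
# Illustrative only: exercise the free-form postprocessor and evaluator.
from opencompass.datasets import BBEHEvaluator, bbeh_freeform_postprocess

# Strips the "The answer is:" prefix, removes **bold** markup, lowercases.
print(bbeh_freeform_postprocess('Reasoning...\nThe answer is: **Monday**'))
# -> 'monday'

evaluator = BBEHEvaluator()
result = evaluator.score(
    predictions=['The answer is: 42', 'The answer is: yes, no'],
    references=['42', 'yes , no'],
)
print(result['score'])  # 100.0: exact match plus comma-normalised match
```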

opencompass/utils/datasets_info.py (+10 lines)
@@ -33,6 +33,12 @@
         "hf_id": "opencompass/bbh",
         "local": "./data/BBH/data",
     },
+    # bbeh
+    "opencompass/bbeh": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/bbeh/",
+    },
     # C-Eval
     "opencompass/ceval-exam": {
         "ms_id": "opencompass/ceval-exam",
@@ -691,6 +697,10 @@
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip",
         "md5": "9107597d137e7362eaf7d218ddef7a6d",
     },
+    "/bbeh": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bbeh.zip",
+        "md5": "43a3c2d73aee731ac68ac790bc9a358e",
+    },
     "subjective/judgerbench": {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip",
