-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathgpt4v_eval.py
306 lines (242 loc) · 9.82 KB
/
gpt4v_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
import base64
import requests
from PIL import Image
from io import BytesIO
import argparse
import os
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from tqdm import tqdm
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from torchvision.utils import save_image
from pope_loader import POPEDataSet
from minigpt4.common.dist_utils import get_rank
from minigpt4.models import load_preprocess
from minigpt4.common.config import Config
from minigpt4.common.dist_utils import get_rank
from minigpt4.common.registry import registry
# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *
# from PIL import Image
from torchvision.utils import save_image
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn
import json
# Evaluation YAML for every supported model. All entries follow the same
# "eval_configs/<model>_eval.yaml" naming scheme, so the table is derived
# from the list of model names.
MODEL_EVAL_CONFIG_PATH = {
    model_name: f"eval_configs/{model_name}_eval.yaml"
    for model_name in (
        "minigpt4",
        "instructblip",
        "lrv_instruct",
        "shikra",
        "llava-1.5",
    )
}

# Per-model prompt skeleton. "<question>" is substituted with the actual
# instruction text before the prompt is handed to the model.
INSTRUCTION_TEMPLATE = dict(
    [
        ("minigpt4", "###Human: <Img><ImageHere></Img> <question> ###Assistant:"),
        ("instructblip", "<ImageHere><question>"),
        ("lrv_instruct", "###Human: <Img><ImageHere></Img> <question> ###Assistant:"),
        ("shikra", "USER: <im_start><ImageHere><im_end> <question> ASSISTANT:"),
        ("llava-1.5", "USER: <ImageHere> <question> ASSISTANT:"),
    ]
)
# Prompt sent to the GPT-4V judge together with the image.
# The two "{}" placeholders are filled positionally with str.format: the
# first receives the plain beam-search response ("Assistant 1"), the second
# the OPERA-decoded response ("Assistant 2").
# The "Output format" section matters to the script: the scores are later
# recovered by splitting the judge's answer on "Accuracy: " and
# "Detailedness: " and reading two space-separated values from that line.
# NOTE(review): "hallucinationsshould" and "countas" look like missing-space
# typos. They are left untouched here because the prompt text is runtime
# behaviour and changing it could shift the judge's scores - confirm before
# editing.
GPT_JUDGE_PROMPT = '''
You are required to score the performance of two AI assistants in describing a given image. You should pay extra attention to the hallucination, which refers to the part of descriptions that are inconsistent with the image content, such as claiming the existence of something not present in the image or describing incorrectly in terms of the counts, positions, or colors of objects in the image. Please rate the responses of the assistants on a scale of 1 to 10, where a higher score indicates better performance, according to the following criteria:
1: Accuracy: whether the response is accurate with respect to the image content. Responses with fewer hallucinationsshould be given higher scores.
2: Detailedness: whether the response is rich in necessary details. Note that hallucinated descriptions should not countas necessary details.
Please output the scores for each criterion, containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. Following the scores, please provide an explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.
[Assistant 1]
{}
[End of Assistant 1]
[Assistant 2]
{}
[End of Assistant 2]
Output format:
Accuracy: <Scores of the two answers>
Reason:
Detailedness: <Scores of the two answers>
Reason:
'''
# OpenAI API key, sent as the "Authorization: Bearer ..." header.
# It is read from the OPENAI_API_KEY environment variable so the secret never
# has to be written into (and committed with) this file. The original
# placeholder remains the fallback, so editing the literal still works.
API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")
def setup_seeds(config):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN behaviour.

    The seed is ``config.run_cfg.seed`` offset by ``get_rank()``
    (presumably so each distributed rank draws a different stream).

    Args:
        config: object exposing ``run_cfg.seed``.
    """
    seed = config.run_cfg.seed + get_rank()

    # Same seed for every generator the script relies on.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)

    # Ask cuDNN for deterministic kernels and switch off its autotuner.
    cudnn.deterministic = True
    cudnn.benchmark = False
def call_api(prompt, image_path, model="gpt-4-vision-preview", max_tokens=300, timeout=120):
    """Send ``prompt`` and an image to the OpenAI chat-completions endpoint.

    The image file is read from disk, base64-encoded and embedded in the
    request as a ``data:image/jpeg;base64,...`` URL next to the text prompt.

    Args:
        prompt: text part of the user message.
        image_path: path of the image file to attach.
        model: value of the ``model`` field in the request payload.
        max_tokens: value of the ``max_tokens`` field in the request payload.
        timeout: seconds passed to ``requests.post``. Without a timeout a
            stalled connection would block the whole evaluation forever.

    Returns:
        The decoded JSON body of the HTTP response.

    Raises:
        OSError: if the image file cannot be read.
        requests.exceptions.RequestException: on connection errors or timeout.
        ValueError: if the response body is not valid JSON.
    """
    # Read and base64-encode the image so it can be inlined in the payload.
    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode("utf-8")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}",
    }

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                        },
                    },
                ],
            }
        ],
        "max_tokens": max_tokens,
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    # Decode the body once and reuse it for both the debug print and the
    # return value.
    result = response.json()
    print(result.keys())
    return result
def get_gpt4v_answer(prompt, image_path, max_retries=None, retry_delay=1.0):
    """Query GPT-4V through ``call_api`` and retry until it returns an answer.

    A response counts as successful when it contains a ``"choices"`` entry;
    the text of the first choice is returned. Any exception raised by the
    call, or a response without ``"choices"`` (for example an API error
    payload), triggers a retry.

    Args:
        prompt: text prompt forwarded to ``call_api``.
        image_path: image path forwarded to ``call_api``.
        max_retries: maximum number of retries after the first attempt.
            ``None`` (the default) retries forever, which matches the
            historical behaviour of this function.
        retry_delay: initial pause in seconds between attempts. It doubles
            after every failure, capped at 60 seconds, so a rate-limited or
            misconfigured API is not hammered in a tight loop.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        RuntimeError: if ``max_retries`` is set and exhausted.
    """
    import time  # local import keeps this block self-contained

    attempt = 0
    delay = retry_delay
    while True:
        attempt += 1
        try:
            res = call_api(prompt, image_path)
            if "choices" in res:
                return res["choices"][0]["message"]["content"]
            # No choices: keep the API's own error description when present.
            failure = res.get("error", res)
        except Exception as e:
            failure = e

        if max_retries is not None and attempt > max_retries:
            raise RuntimeError(
                f"GPT-4V request failed after {attempt} attempts: {failure}"
            )

        # Report the reason so a permanent failure (bad key, quota) is visible.
        print(f"retry ({failure})")
        time.sleep(delay)
        delay = min(delay * 2, 60.0)
# Command-line interface of the evaluation script.
parser = argparse.ArgumentParser(description="POPE-Adv evaluation on LVLMs.")

# One (flag, add_argument keyword arguments) pair per option, listed in
# registration order. The last four options are forwarded to model.generate
# for the OPERA decoding run.
_ARGUMENT_SPECS = (
    ("--model", dict(type=str, help="model")),
    ("--gpu-id", dict(type=int, help="specify the gpu to load the model.")),
    (
        "--options",
        dict(
            nargs="+",
            help="override some settings in the used config, the key-value pair "
            "in xxx=yyy format will be merged into config file (deprecate), "
            "change to --cfg-options instead.",
        ),
    ),
    ("--data_path", dict(type=str, default="COCO_2014/val2014/", help="data path")),
    ("--batch_size", dict(type=int, help="batch size")),
    ("--num_workers", dict(type=int, default=2, help="num workers")),
    ("--scale_factor", dict(type=float, default=50)),
    ("--threshold", dict(type=int, default=15)),
    ("--num_attn_candidates", dict(type=int, default=5)),
    ("--penalty_weights", dict(type=float, default=1.0)),
)
for _flag, _spec in _ARGUMENT_SPECS:
    parser.add_argument(_flag, **_spec)

# parse_known_args tolerates unrecognised flags; only the parsed namespace
# is kept.
args, _unrecognised_argv = parser.parse_known_args()
# Fail early with a readable usage error instead of a bare KeyError when
# --model is missing or not one of the supported names.
if args.model not in MODEL_EVAL_CONFIG_PATH:
    parser.error(
        f"--model must be one of {sorted(MODEL_EVAL_CONFIG_PATH)}, got {args.model!r}"
    )

# Restrict CUDA to the requested GPU before any CUDA context is created.
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
args.cfg_path = MODEL_EVAL_CONFIG_PATH[args.model]
cfg = Config(args)
setup_seeds(cfg)
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

# ========================================
#             Model Initialization
# ========================================
print('Initializing Model')

model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to(device)
model.eval()

# Normalisation is switched off in the visual processor and applied
# explicitly through `norm` when the image is passed to model.generate.
processor_cfg = cfg.get_config().preprocess
processor_cfg.vis_processor.eval.do_normalize = False
vis_processors, txt_processors = load_preprocess(processor_cfg)
print(vis_processors["eval"].transform)
print("Done!")

# NOTE(review): these look like the CLIP image mean/std - confirm.
mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)
norm = transforms.Normalize(mean, std)

# os.listdir returns entries in arbitrary order. Sorting first makes the
# seeded shuffle pick the same images on every machine and run.
img_files = sorted(os.listdir(args.data_path))
random.shuffle(img_files)

# Create the per-model output directory, including missing parents, and
# tolerate it already existing.
base_path = "log/gpt4v-eval"
os.makedirs(os.path.join(base_path, args.model), exist_ok=True)
def _parse_score_pair(gpt_answer, criterion):
    """Return the two integer scores the judge reported for ``criterion``.

    Looks at the text after the last ``"<criterion>: "`` marker up to the end
    of that line and expects exactly two whitespace-separated integers
    (Assistant 1 first, Assistant 2 second).

    Raises:
        ValueError: if that line does not hold exactly two integers.
    """
    score_line = gpt_answer.split(f"{criterion}: ")[-1].split("\n")[0]
    first, second = score_line.split()
    return int(first), int(second)


# Number of (shuffled) images used for the comparison.
NUM_EVAL_IMAGES = 50

output_dir = os.path.join(base_path, args.model)

gpt_answer_records = {}
assistant_answer_records = {}

# Running totals of the judge's scores. They are divided by `num_count`
# (the number of successfully parsed answers) after the loop.
total_hal_score_1 = 0
total_hal_score_2 = 0
total_det_score_1 = 0
total_det_score_2 = 0
num_count = 0

# The instruction is identical for every image, so build it once.
template = INSTRUCTION_TEMPLATE[args.model]
qu = template.replace("<question>", "Please describe this image in detail.")

# Slicing (rather than indexing 0..49) also works with fewer than 50 images.
for img in img_files[:NUM_EVAL_IMAGES]:
    image_path = os.path.join(args.data_path, img)
    # convert() returns a new image, so the file handle can be closed here.
    with Image.open(image_path) as opened_image:
        raw_image = opened_image.convert("RGB")
    image = vis_processors["eval"](raw_image).unsqueeze(0)
    image = image.to(device)

    assistant_answer_records[str(img)] = {}

    # Assistant 1: plain beam search.
    with torch.inference_mode():
        out = model.generate(
            {"image": norm(image), "prompt": qu},
            use_nucleus_sampling=False,
            num_beams=5,
            max_new_tokens=512,
        )
    model_response_1 = out[0]
    assistant_answer_records[str(img)]["assistant_1"] = model_response_1
    print("Beam-5 output:")
    print(model_response_1)

    # Assistant 2: same beam search with OPERA decoding enabled.
    with torch.inference_mode():
        out = model.generate(
            {"image": norm(image), "prompt": qu},
            use_nucleus_sampling=False,
            num_beams=5,
            max_new_tokens=512,
            output_attentions=True,
            opera_decoding=True,
            scale_factor=args.scale_factor,
            threshold=args.threshold,
            num_attn_candidates=args.num_attn_candidates,
            penalty_weights=args.penalty_weights,
        )
    model_response_2 = out[0]
    assistant_answer_records[str(img)]["assistant_2"] = model_response_2
    print("OPERA output:")
    print(model_response_2)

    # gpt-4v eval
    prompt = GPT_JUDGE_PROMPT.format(model_response_1, model_response_2)
    gpt_answer = get_gpt4v_answer(prompt, image_path)
    print(gpt_answer)
    gpt_answer_records[str(img)] = gpt_answer

    # Write both record files after every image, before score parsing, so an
    # interrupted run or an unparsable answer never loses collected responses.
    with open(os.path.join(output_dir, 'answers.json'), "w") as f:
        json.dump(assistant_answer_records, f)
    with open(os.path.join(output_dir, 'records.json'), "w") as f:
        json.dump(gpt_answer_records, f)

    # Debug output of the raw tokens found on the "Accuracy: " line.
    accuracy_tokens = gpt_answer.split("Accuracy: ")[-1].split("\n")[0].split(" ")
    print(accuracy_tokens)
    print(len(accuracy_tokens))

    # Parse all four scores before touching the totals, so a malformed
    # answer is skipped as a whole and cannot be half-counted.
    try:
        hal_score_1, hal_score_2 = _parse_score_pair(gpt_answer, "Accuracy")
        det_score_1, det_score_2 = _parse_score_pair(gpt_answer, "Detailedness")
    except ValueError:
        print(f"Could not parse judge scores for {img}; skipping it in the averages.")
        continue

    total_hal_score_1 += hal_score_1
    total_hal_score_2 += hal_score_2
    total_det_score_1 += det_score_1
    total_det_score_2 += det_score_2
    num_count += 1
    print("=========================================")

if num_count == 0:
    # Nothing to average; avoid dividing by zero.
    print("No GPT-4V answer could be parsed; average scores are unavailable.")
else:
    # Report real averages (total / number of parsed answers), not the sums.
    avg_hal_score_1 = float(total_hal_score_1) / num_count
    avg_hal_score_2 = float(total_hal_score_2) / num_count
    avg_det_score_1 = float(total_det_score_1) / num_count
    avg_det_score_2 = float(total_det_score_2) / num_count
    print(f"The avg hal score for Assistant 1 and Assistent 2: {avg_hal_score_1}; {avg_hal_score_2}")
    print(f"The avg det score for Assistant 1 and Assistent 2: {avg_det_score_1}; {avg_det_score_2}")