19
19
]
20
20
21
21
22
- def load_phi3v (question , image_urls : List [str ]):
22
def load_qwenvl_chat(question: str, image_urls: List[str]):
    """Build the engine and prompt for the Qwen/Qwen-VL-Chat model.

    Returns a 5-tuple ``(llm, prompt, stop_token_ids, image_data,
    chat_template)``.  ``image_data`` is ``None`` (the caller is expected to
    fetch the images from ``image_urls`` itself), and the ChatML
    ``chat_template`` is returned so chat-style callers can pass it to
    ``llm.chat`` — this model's tokenizer does not ship one.
    """
    model_name = "Qwen/Qwen-VL-Chat"
    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        max_num_seqs=5,
        # Allow one image input per URL in this prompt.
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    # One numbered "Picture N: <img></img>" placeholder per image, which is
    # the multi-image convention Qwen-VL uses in its prompts.
    placeholders = "".join(f"Picture {i}: <img></img>\n"
                           for i, _ in enumerate(image_urls, start=1))

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501

    # Render the single user turn (image placeholders followed by the
    # question) through the explicit ChatML template above.
    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True,
                                           chat_template=chat_template)

    # ChatML control tokens that should terminate generation.
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    return llm, prompt, stop_token_ids, None, chat_template
52
+
53
+
54
+ def load_phi3v (question : str , image_urls : List [str ]):
23
55
llm = LLM (
24
56
model = "microsoft/Phi-3.5-vision-instruct" ,
25
57
trust_remote_code = True ,
@@ -30,10 +62,10 @@ def load_phi3v(question, image_urls: List[str]):
30
62
for i , _ in enumerate (image_urls , start = 1 ))
31
63
prompt = f"<|user|>\n { placeholders } \n { question } <|end|>\n <|assistant|>\n "
32
64
stop_token_ids = None
33
- return llm , prompt , stop_token_ids , None
65
+ return llm , prompt , stop_token_ids , None , None
34
66
35
67
36
- def load_internvl (question , image_urls : List [str ]):
68
+ def load_internvl (question : str , image_urls : List [str ]):
37
69
model_name = "OpenGVLab/InternVL2-2B"
38
70
39
71
llm = LLM (
@@ -61,7 +93,7 @@ def load_internvl(question, image_urls: List[str]):
61
93
stop_tokens = ["<|endoftext|>" , "<|im_start|>" , "<|im_end|>" , "<|end|>" ]
62
94
stop_token_ids = [tokenizer .convert_tokens_to_ids (i ) for i in stop_tokens ]
63
95
64
- return llm , prompt , stop_token_ids , None
96
+ return llm , prompt , stop_token_ids , None , None
65
97
66
98
67
99
def load_qwen2_vl (question , image_urls : List [str ]):
@@ -111,18 +143,19 @@ def load_qwen2_vl(question, image_urls: List[str]):
111
143
else :
112
144
image_data , _ = process_vision_info (messages )
113
145
114
- return llm , prompt , stop_token_ids , image_data
146
+ return llm , prompt , stop_token_ids , image_data , None
115
147
116
148
117
149
# Registry of supported example models.  Each loader takes
# (question, image_urls) and returns a 5-tuple of
# (llm, prompt, stop_token_ids, image_data, chat_template); see
# run_generate/run_chat, which unpack exactly that shape.
model_example_map = {
    "phi3_v": load_phi3v,
    "internvl_chat": load_internvl,
    "qwen2_vl": load_qwen2_vl,
    "qwen_vl_chat": load_qwenvl_chat,
}
122
155
123
156
124
157
def run_generate (model , question : str , image_urls : List [str ]):
125
- llm , prompt , stop_token_ids , image_data = model_example_map [model ](
158
+ llm , prompt , stop_token_ids , image_data , _ = model_example_map [model ](
126
159
question , image_urls )
127
160
if image_data is None :
128
161
image_data = [fetch_image (url ) for url in image_urls ]
@@ -146,29 +179,32 @@ def run_generate(model, question: str, image_urls: List[str]):
146
179
147
180
148
181
def run_chat (model : str , question : str , image_urls : List [str ]):
149
- llm , _ , stop_token_ids , _ = model_example_map [model ](question , image_urls )
182
+ llm , _ , stop_token_ids , _ , chat_template = model_example_map [model ](
183
+ question , image_urls )
150
184
151
185
sampling_params = SamplingParams (temperature = 0.0 ,
152
186
max_tokens = 128 ,
153
187
stop_token_ids = stop_token_ids )
154
-
155
- outputs = llm .chat ([{
156
- "role" :
157
- "user" ,
158
- "content" : [
159
- {
160
- "type" : "text" ,
161
- "text" : question ,
162
- },
163
- * ({
164
- "type" : "image_url" ,
165
- "image_url" : {
166
- "url" : image_url
188
+ outputs = llm .chat (
189
+ [{
190
+ "role" :
191
+ "user" ,
192
+ "content" : [
193
+ {
194
+ "type" : "text" ,
195
+ "text" : question ,
167
196
},
168
- } for image_url in image_urls ),
169
- ],
170
- }],
171
- sampling_params = sampling_params )
197
+ * ({
198
+ "type" : "image_url" ,
199
+ "image_url" : {
200
+ "url" : image_url
201
+ },
202
+ } for image_url in image_urls ),
203
+ ],
204
+ }],
205
+ sampling_params = sampling_params ,
206
+ chat_template = chat_template ,
207
+ )
172
208
173
209
for o in outputs :
174
210
generated_text = o .outputs [0 ].text
0 commit comments