diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 2e32912421d..ed8ddd3c47d 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -2311,7 +2311,7 @@ def generate( if input_ids is None: start_tokens = [self.config.text_config.bos_token_id] if getattr(self.config, "image_token_index", None) is not None: - start_tokens += [self.config.image_token_index] * self.config.num_query_tokens + start_tokens = [self.config.image_token_index] * self.config.num_query_tokens + start_tokens input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device) input_ids = input_ids.repeat(batch_size, 1) diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index a63393ab1dd..acce24cc42f 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -1593,7 +1593,7 @@ def generate( if input_ids is None: start_tokens = [self.config.text_config.bos_token_id] if getattr(self.config, "image_token_index", None) is not None: - start_tokens += [self.config.image_token_index] * self.config.num_query_tokens + start_tokens = [self.config.image_token_index] * self.config.num_query_tokens + start_tokens input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device) input_ids = input_ids.repeat(batch_size, 1) diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index e922d1e3f26..e91b05bc015 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -1628,7 +1628,7 @@ def generate( if input_ids is None: start_tokens = [self.config.text_config.bos_token_id] if getattr(self.config, "video_token_index", None) is not None: - start_tokens += [self.config.video_token_index] * self.config.num_query_tokens * 4 + start_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4 + start_tokens input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device) input_ids = input_ids.repeat(batch_size, 1) diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 126d81b6d3d..7184955af3a 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -441,7 +441,7 @@ def generate( if input_ids is None: start_tokens = [self.config.text_config.bos_token_id] if getattr(self.config, "video_token_index", None) is not None: - start_tokens += [self.config.video_token_index] * self.config.num_query_tokens * 4 + start_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4 + start_tokens input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device) input_ids = input_ids.repeat(batch_size, 1)