
Commit a0a5427

Dalanke authored and yiyixuxu committed
[SD3] Fix mismatched shape when num_images_per_prompt > 1 without T5 (text_encoder_3=None) (#8558)
* fix shape mismatch when num_images_per_prompt > 1 and text_encoder_3=None
* style
* fix copies

Co-authored-by: YiYi Xu <yixu310@gmail.com>
Co-authored-by: yiyixuxu <yixu310@gmail.com>
1 parent dc74c7e commit a0a5427
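For context, a minimal way to reproduce the original failure (a sketch; the checkpoint id and dtype are illustrative, and dropping T5 by passing text_encoder_3=None / tokenizer_3=None follows the SD3 loading convention):

import torch
from diffusers import StableDiffusion3Pipeline

# Load SD3 without the T5 text encoder -- the configuration this commit fixes.
pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",  # illustrative checkpoint
    text_encoder_3=None,
    tokenizer_3=None,
    torch_dtype=torch.float16,
).to("cuda")

# Before this fix, asking for more than one image per prompt raised a shape
# mismatch: the zero T5 placeholder kept batch size `batch_size` while the
# CLIP embeddings were repeated once per requested image.
images = pipe("a photo of a cat", num_images_per_prompt=2).images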

2 files changed (+10 −2 lines)


src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py (+5 −1)
@@ -217,7 +217,11 @@ def _get_t5_prompt_embeds(
 
         if self.text_encoder_3 is None:
             return torch.zeros(
-                (batch_size, self.tokenizer_max_length, self.transformer.config.joint_attention_dim),
+                (
+                    batch_size * num_images_per_prompt,
+                    self.tokenizer_max_length,
+                    self.transformer.config.joint_attention_dim,
+                ),
                 device=device,
                 dtype=dtype,
             )
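Why batch_size * num_images_per_prompt: the CLIP prompt embeddings are tiled once per requested image before being concatenated with the T5 embeddings along the sequence axis, so the zero placeholder returned when text_encoder_3 is None must match that expanded batch dimension. A minimal sketch of the shape arithmetic (dimensions illustrative, not SD3's real sizes):

import torch

batch_size, num_images_per_prompt = 1, 4
clip_len, t5_len, dim = 77, 77, 64  # illustrative sizes

# CLIP prompt embeddings, already repeated once per requested image.
clip_embeds = torch.randn(batch_size * num_images_per_prompt, clip_len, dim)

# Old placeholder used batch dim `batch_size`, so the concat below failed
# whenever num_images_per_prompt > 1; the fixed placeholder matches the
# expanded batch dimension.
t5_placeholder = torch.zeros(batch_size * num_images_per_prompt, t5_len, dim)

# The pipeline joins the two along the sequence axis (dim=-2); this only
# works when the batch dimensions agree.
prompt_embeds = torch.cat([clip_embeds, t5_placeholder], dim=-2)
print(prompt_embeds.shape)  # torch.Size([4, 154, 64])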

src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py (+5 −1)
@@ -232,7 +232,11 @@ def _get_t5_prompt_embeds(
 
         if self.text_encoder_3 is None:
             return torch.zeros(
-                (batch_size, self.tokenizer_max_length, self.transformer.config.joint_attention_dim),
+                (
+                    batch_size * num_images_per_prompt,
+                    self.tokenizer_max_length,
+                    self.transformer.config.joint_attention_dim,
+                ),
                 device=device,
                 dtype=dtype,
             )
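The img2img hunk is identical to the text-to-image one: in diffusers, _get_t5_prompt_embeds in the img2img pipeline mirrors the text-to-image pipeline via the repository's "# Copied from" convention, so this second change is presumably what the "fix copies" step in the commit message refers to.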
