
Commit a1218a6

sayakpaul and anijain2305 authored and committed

Merge branch 'main' into compile_utils

2 parents 75e665b + f20b83a · commit a1218a6

39 files changed: 164 additions & 85 deletions

src/diffusers/models/modeling_utils.py

Lines changed: 5 additions & 3 deletions
@@ -1421,8 +1421,10 @@ def compile_repeated_blocks(self, *args, **kwargs):
            class name matches will be compiled.

        2. **`_no_split_modules`** – Fallback. If the preferred attribute is
-           missing or empty, we fall back to the legacy Diffusers attribute
-           `_no_split_modules`.
+           missing or empty, we fall back to the Diffusers attribute
+           `_no_split_modules`. The original purpose of this attribute is to
+           prevent splitting when device_map is present, but it can also act as a
+           good proxy for repeated blocks.

        Once discovered, each matching sub-module is compiled by calling
        `submodule.compile(*args, **kwargs)`. Any positional or keyword
@@ -1445,8 +1447,8 @@ class name matches will be compiled.
        has_compiled_region = False
        for submod in self.modules():
            if submod.__class__.__name__ in repeated_blocks:
-               has_compiled_region = True
                submod.compile(*args, **kwargs)
+               has_compiled_region = True

        if not has_compiled_region:
            raise ValueError(
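For context, `compile_repeated_blocks` is the entry point this hunk touches: it walks `self.modules()`, compiles every sub-module whose class name appears in the discovered block list, and raises if nothing matched. A minimal usage sketch (the checkpoint id is illustrative, not part of this commit):

```python
import torch
from diffusers import UNet2DConditionModel

# Load any model that declares `_repeated_blocks` (or that falls back to
# `_no_split_modules`), then compile only those repeated blocks instead of
# the whole graph.
unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # illustrative checkpoint
    subfolder="unet",
    torch_dtype=torch.float16,
)

# Positional/keyword args are forwarded verbatim to each `submodule.compile(...)`.
unet.compile_repeated_blocks(mode="reduce-overhead", fullgraph=True)
```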

src/diffusers/models/transformers/transformer_ltx.py

Lines changed: 3 additions & 0 deletions
@@ -328,6 +328,9 @@ class LTXVideoTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin

    _supports_gradient_checkpointing = True
    _skip_layerwise_casting_patterns = ["norm"]
+   _repeated_blocks = [
+       "LTXVideoTransformerBlock",
+   ]

    @register_to_config
    def __init__(

src/diffusers/models/unets/unet_2d_condition.py

Lines changed: 3 additions & 0 deletions
@@ -167,6 +167,9 @@ class conditioning with `class_embed_type` equal to `None`.

    _supports_gradient_checkpointing = True
    _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D", "CrossAttnUpBlock2D"]
    _skip_layerwise_casting_patterns = ["norm"]
+   _repeated_blocks = [
+       "BasicTransformerBlock",
+   ]

    @register_to_config
    def __init__(
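Both model files in this commit opt in the same way: a class-level `_repeated_blocks` list naming the block classes to compile. A hypothetical third-party model would follow the same recipe (all names below are invented for illustration):

```python
import torch.nn as nn
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.models.modeling_utils import ModelMixin


class MyBlock(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        return self.proj(x)


class MyModel(ModelMixin, ConfigMixin):
    # Preferred discovery attribute: class names of the repeated sub-modules.
    # If this were missing or empty, `_no_split_modules` would be tried next.
    _repeated_blocks = ["MyBlock"]

    @register_to_config
    def __init__(self, dim: int = 64, depth: int = 4):
        super().__init__()
        self.blocks = nn.ModuleList(MyBlock(dim) for _ in range(depth))

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        return x
```

Calling `MyModel(dim=64).compile_repeated_blocks()` would then compile each `MyBlock` instance individually.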

src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py

Lines changed: 2 additions & 4 deletions
@@ -41,7 +41,7 @@
    replace_example_docstring,
 )
 from ...utils.import_utils import is_transformers_version
-from ...utils.torch_utils import randn_tensor
+from ...utils.torch_utils import empty_device_cache, randn_tensor
 from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
 from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel

@@ -267,9 +267,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t

        if self.device.type != "cpu":
            self.to("cpu", silence_dtype_warnings=True)
-           device_mod = getattr(torch, device.type, None)
-           if hasattr(device_mod, "empty_cache") and device_mod.is_available():
-               device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+           empty_device_cache(device.type)

        model_sequence = [
            self.text_encoder.text_model,
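The deleted branch above is essentially what the shared helper now centralizes. A rough sketch of what `empty_device_cache` plausibly does, reconstructed from the removed lines (the real helper in `diffusers.utils.torch_utils` may differ in detail, including how it picks a default backend):

```python
import torch

def empty_device_cache(device_type=None):
    # Resolve the backend module (torch.cuda, torch.xpu, ...) by name and
    # flush its allocator cache only if that backend is actually available.
    if device_type is None:
        # Assumed default: prefer CUDA, else try XPU.
        device_type = "cuda" if torch.cuda.is_available() else "xpu"
    device_mod = getattr(torch, device_type, None)
    if hasattr(device_mod, "empty_cache") and device_mod.is_available():
        device_mod.empty_cache()
```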

src/diffusers/pipelines/consisid/consisid_utils.py

Lines changed: 1 addition & 1 deletion
@@ -294,7 +294,7 @@ def prepare_face_models(model_path, device, dtype):

    Parameters:
    - model_path: Path to the directory containing model files.
-   - device: The device (e.g., 'cuda', 'cpu') where models will be loaded.
+   - device: The device (e.g., 'cuda', 'xpu', 'cpu') where models will be loaded.
    - dtype: Data type (e.g., torch.float32) for model inference.

    Returns:

src/diffusers/pipelines/controlnet/pipeline_controlnet.py

Lines changed: 2 additions & 2 deletions
@@ -37,7 +37,7 @@
    scale_lora_layers,
    unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, is_torch_version, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker

@@ -1339,7 +1339,7 @@ def __call__(
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.unet.to("cpu")
            self.controlnet.to("cpu")
-           torch.cuda.empty_cache()
+           empty_device_cache()

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[

src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@
    scale_lora_layers,
    unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker

@@ -1311,7 +1311,7 @@ def __call__(
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.unet.to("cpu")
            self.controlnet.to("cpu")
-           torch.cuda.empty_cache()
+           empty_device_cache()

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[

src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@
    scale_lora_layers,
    unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker

@@ -1500,7 +1500,7 @@ def __call__(
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.unet.to("cpu")
            self.controlnet.to("cpu")
-           torch.cuda.empty_cache()
+           empty_device_cache()

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[

src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py

Lines changed: 2 additions & 2 deletions
@@ -51,7 +51,7 @@
    scale_lora_layers,
    unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput

@@ -1858,7 +1858,7 @@ def denoising_value_valid(dnv):
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.unet.to("cpu")
            self.controlnet.to("cpu")
-           torch.cuda.empty_cache()
+           empty_device_cache()

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]

src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py

Lines changed: 5 additions & 1 deletion
@@ -1465,7 +1465,11 @@ def __call__(

                # Relevant thread:
                # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
-               if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1:
+               if (
+                   torch.cuda.is_available()
+                   and (is_unet_compiled and is_controlnet_compiled)
+                   and is_torch_higher_equal_2_1
+               ):
                    torch._inductor.cudagraph_mark_step_begin()
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
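`torch._inductor.cudagraph_mark_step_begin()` is CUDA-specific, so the added `torch.cuda.is_available()` guard lets the same denoising loop run unchanged on non-CUDA backends such as XPU or CPU. A standalone sketch of the pattern (the helper name and version check are invented for illustration):

```python
import torch

def maybe_mark_cudagraph_step(models_compiled: bool) -> None:
    # CUDA graphs only exist on CUDA devices; skip the step marker elsewhere
    # so the calling loop stays backend-agnostic.
    torch_ge_2_1 = tuple(int(p) for p in torch.__version__.split(".")[:2]) >= (2, 1)
    if torch.cuda.is_available() and models_compiled and torch_ge_2_1:
        torch._inductor.cudagraph_mark_step_begin()
```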

src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
scale_lora_layers,
5454
unscale_lora_layers,
5555
)
56-
from ...utils.torch_utils import is_compiled_module, randn_tensor
56+
from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
5757
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
5858
from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
5959

@@ -921,7 +921,7 @@ def prepare_latents(
921921
# Offload text encoder if `enable_model_cpu_offload` was enabled
922922
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
923923
self.text_encoder_2.to("cpu")
924-
torch.cuda.empty_cache()
924+
empty_device_cache()
925925

926926
image = image.to(device=device, dtype=dtype)
927927

@@ -1632,7 +1632,7 @@ def __call__(
16321632
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
16331633
self.unet.to("cpu")
16341634
self.controlnet.to("cpu")
1635-
torch.cuda.empty_cache()
1635+
empty_device_cache()
16361636

16371637
if not output_type == "latent":
16381638
# make sure the VAE is in float32 mode, as it overflows in float16

src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py

Lines changed: 2 additions & 2 deletions
@@ -51,7 +51,7 @@
    scale_lora_layers,
    unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput

@@ -1766,7 +1766,7 @@ def denoising_value_valid(dnv):
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.unet.to("cpu")
            self.controlnet.to("cpu")
-           torch.cuda.empty_cache()
+           empty_device_cache()

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]

src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py

Lines changed: 3 additions & 3 deletions
@@ -53,7 +53,7 @@
    scale_lora_layers,
    unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput

@@ -876,7 +876,7 @@ def prepare_latents(
        # Offload text encoder if `enable_model_cpu_offload` was enabled
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.text_encoder_2.to("cpu")
-           torch.cuda.empty_cache()
+           empty_device_cache()

        image = image.to(device=device, dtype=dtype)

@@ -1574,7 +1574,7 @@ def __call__(
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.unet.to("cpu")
            self.controlnet.to("cpu")
-           torch.cuda.empty_cache()
+           empty_device_cache()

        if not output_type == "latent":
            # make sure the VAE is in float32 mode, as it overflows in float16

src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py

Lines changed: 3 additions & 3 deletions
@@ -36,7 +36,7 @@
    scale_lora_layers,
    unscale_lora_layers,
 )
-from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
+from ...utils.torch_utils import empty_device_cache, is_compiled_module, is_torch_version, randn_tensor
 from ..pipeline_utils import DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker

@@ -853,7 +853,7 @@ def __call__(
        for i, t in enumerate(timesteps):
            # Relevant thread:
            # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428
-           if is_controlnet_compiled and is_torch_higher_equal_2_1:
+           if torch.cuda.is_available() and is_controlnet_compiled and is_torch_higher_equal_2_1:
                torch._inductor.cudagraph_mark_step_begin()
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

@@ -902,7 +902,7 @@ def __call__(
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.unet.to("cpu")
            self.controlnet.to("cpu")
-           torch.cuda.empty_cache()
+           empty_device_cache()

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[

src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py

Lines changed: 3 additions & 3 deletions
@@ -193,7 +193,7 @@ def __init__(
    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

-   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
        r"""
        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a

@@ -411,7 +411,7 @@ def __init__(
    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

-   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
        r"""
        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a

@@ -652,7 +652,7 @@ def __init__(
    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

-   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
        r"""
        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a

src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py

Lines changed: 4 additions & 4 deletions
@@ -179,7 +179,7 @@ def __init__(
    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

-   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
        r"""
        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a

@@ -407,7 +407,7 @@ def __init__(
    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

-   def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+   def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
        r"""
        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`

@@ -417,7 +417,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t
        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)

-   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
        r"""
        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a

@@ -656,7 +656,7 @@ def __init__(
    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

-   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+   def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
        r"""
        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
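With the default flipped from `"cuda"` to `None` across these combined pipelines, the offload machinery can resolve the target accelerator at call time instead of assuming CUDA. A hedged usage sketch (the checkpoint id is illustrative, not from this commit):

```python
import torch
from diffusers import AutoPipelineForText2Image

# Illustrative checkpoint; resolves to a Kandinsky 2.2 combined pipeline.
pipe = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)

# device=None (the new default) lets the hook pick the available accelerator
# (CUDA, XPU, ...); passing an explicit device such as "cuda" still works.
pipe.enable_sequential_cpu_offload()
```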

src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@
 from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import is_torch_xla_available, logging, replace_example_docstring
-from ...utils.torch_utils import randn_tensor
+from ...utils.torch_utils import empty_device_cache, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from .pipeline_output import KolorsPipelineOutput
 from .text_encoder import ChatGLMModel

@@ -618,7 +618,7 @@ def prepare_latents(
        # Offload text encoder if `enable_model_cpu_offload` was enabled
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.text_encoder_2.to("cpu")
-           torch.cuda.empty_cache()
+           empty_device_cache()

        image = image.to(device=device, dtype=dtype)

src/diffusers/pipelines/musicldm/pipeline_musicldm.py

Lines changed: 8 additions & 6 deletions
@@ -35,7 +35,7 @@
    logging,
    replace_example_docstring,
 )
-from ...utils.torch_utils import randn_tensor
+from ...utils.torch_utils import empty_device_cache, get_device, randn_tensor
 from ..pipeline_utils import AudioPipelineOutput, DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin


@@ -397,20 +397,22 @@ def prepare_latents(self, batch_size, num_channels_latents, height, dtype, devic
    def enable_model_cpu_offload(self, gpu_id=0):
        r"""
        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-       to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-       method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-       `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+       to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the accelerator when its
+       `forward` method is called, and the model remains on the accelerator until the next model runs. Memory savings
+       are lower than with `enable_sequential_cpu_offload`, but performance is much better due to the iterative
+       execution of the `unet`.
        """
        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
            from accelerate import cpu_offload_with_hook
        else:
            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

-       device = torch.device(f"cuda:{gpu_id}")
+       device_type = get_device()
+       device = torch.device(f"{device_type}:{gpu_id}")

        if self.device.type != "cpu":
            self.to("cpu", silence_dtype_warnings=True)
-           torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+           empty_device_cache()  # otherwise we don't see the memory savings (but they probably exist)

        model_sequence = [
            self.text_encoder.text_model,
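`get_device` and `empty_device_cache` together remove the hard-coded CUDA assumption from this offload path. A rough sketch of the backend probe, under the assumption that it prefers CUDA and then XPU (the real helper in `diffusers.utils.torch_utils` may cover more backends):

```python
import torch

def get_device() -> str:
    # Prefer CUDA, then Intel XPU, falling back to CPU.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"

# As in the hunk above, the probe result is combined with gpu_id to build a
# backend-agnostic device, e.g. "cuda:0" or "xpu:0".
device = torch.device(f"{get_device()}:0")
```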
