@@ -57,10 +57,11 @@ def __repr__(self) -> str:
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
-            kwargs.update({"attn_implementation": "eager"})
+
         if kwargs.get("low_cpu_mem_usage", None):
             logger.warning("Updating low_cpu_mem_usage=False")
-            kwargs.update({"low_cpu_mem_usage": False})
+
+        kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
 
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
         return cls(model, is_tlm=is_tlm)
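Net effect of this hunk: the warnings stay inside their respective if branches, but attn_implementation="eager" and low_cpu_mem_usage=False are now forced unconditionally in a single kwargs.update before the HF loader is called. A minimal sketch of the consolidated logic, assuming only the standard logging module (the helper name is illustrative and not part of the diff):

import logging

logger = logging.getLogger(__name__)

def _force_loading_kwargs(**kwargs):
    # Warn if the caller asked for something other than eager attention or
    # requested low_cpu_mem_usage, then override both flags unconditionally,
    # mirroring the diff's new control flow.
    if kwargs.get("attn_implementation", None) not in {None, "eager"}:
        logger.warning('Updating attn_implementation="eager"')
    if kwargs.get("low_cpu_mem_usage", None):
        logger.warning("Updating low_cpu_mem_usage=False")
    kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
    return kwargs

# Example: user-supplied flags are overridden before loading.
print(_force_loading_kwargs(attn_implementation="sdpa", low_cpu_mem_usage=True))
# -> {'attn_implementation': 'eager', 'low_cpu_mem_usage': False}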
@@ -430,20 +431,16 @@ class QEFFAutoModel(QEFFTransformersBase):
     _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
-    def __init__(self, model: nn.Module, **kwargs):
-        if kwargs.get("block_size", None):
-            constants.BLOCK_SIZE = kwargs.get("block_size")
-            self._pytorch_transforms.append(BlockAttentionTransorm)
-            kwargs.update({"attn_implementation": "custom"})
-            kwargs.pop("block_size")
-
+    def __init__(self, model: nn.Module, block_size: Optional[int] = None, **kwargs):
+        if block_size:
+            BlockAttentionTransorm.apply(model, block_size=block_size)
         super().__init__(model)
         self.model.config.use_cache = True
         self.num_layers = model.config.num_hidden_layers
 
     @classmethod
     @with_replaced_quantizers
-    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, block_size: Optional[int] = None, *args, **kwargs):
         """
         This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel.
         Once the model is initialized, you can use other methods such as export, compile, and generate on the same object.
@@ -470,28 +467,20 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
             # You can now execute the model
             model.generate(inputs)
         """
-        if kwargs.get("block_size", None):
-            constants.BLOCK_SIZE = kwargs.get("block_size")
-            cls._pytorch_transforms.append(BlockAttentionTransorm)
-            kwargs.update({"attn_implementation": "custom"})
-            kwargs.pop("block_size")
-
-        if kwargs.get("attn_implementation", None) not in {None, "eager", "custom"}:
+        if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
-            kwargs.update({"attn_implementation": "eager"})
 
         if kwargs.get("low_cpu_mem_usage", None):
             logger.warning("Updating low_cpu_mem_usage=False")
-            kwargs.update({"low_cpu_mem_usage": False})
 
+        kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False})
         try:
-            kwargs.update({"add_pooling_layer": False})
             model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
             warnings.warn("Removing pooling layer from the model if exist")
         except TypeError:
             kwargs.pop("add_pooling_layer", None)
             model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-        return cls(model)
+        return cls(model, block_size)
 
     @property
     def model_hash(self) -> str:
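Taken together, the QEFFAutoModel changes make block_size an explicit argument: from_pretrained accepts it, loads the HF model, and forwards it to __init__, which applies BlockAttentionTransorm directly to the model instance instead of mutating constants.BLOCK_SIZE and the class-level transform list. A hedged usage sketch, assuming the top-level QEfficient import path and a placeholder checkpoint name:

# Usage sketch for the new block_size parameter (checkpoint name is a placeholder).
from QEfficient import QEFFAutoModel  # import path assumed from the library name

# block_size flows: from_pretrained(..., block_size=...) -> __init__ ->
# BlockAttentionTransorm.apply(model, block_size=block_size)
model = QEFFAutoModel.from_pretrained(
    "sentence-transformers/all-MiniLM-L6-v2",  # placeholder checkpoint
    block_size=128,  # enables the block-attention transform on the loaded model
)

# Without block_size, no transform is applied and loading behaves as before.
baseline = QEFFAutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")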