diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 07a71109f4895..892ad8a58b8f6 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -278,6 +278,7 @@ def __init__( self.vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size, + quant_config=quant_config, ) else: self.embed_tokens = PPMissingLayer() @@ -424,8 +425,9 @@ def forward( attn_metadata, intermediate_tensors) return model_output - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: + def compute_logits( + self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) logits /= self.config.logits_scaling