diff --git a/deepspeed/inference/v2/model_implementations/phi3small/model.py b/deepspeed/inference/v2/model_implementations/phi3small/model.py
index 9d5e6a599365..1f1e853fc167 100644
--- a/deepspeed/inference/v2/model_implementations/phi3small/model.py
+++ b/deepspeed/inference/v2/model_implementations/phi3small/model.py
@@ -108,6 +108,11 @@ def positional_embedding_type(self) -> PositionalEmbeddingType:
     def positional_embedding_config(self) -> Optional[RotateHalfConfig]:
         return RotateHalfConfig(theta_base=self._config.rope_embedding_base)
 
+    @property
+    def mup_embedding_multiplier(self) -> float:
+        return 10.0
+
+
     """
     Forward implementations
     """
@@ -127,6 +132,9 @@ def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor:
         if embed.shape[-1] != self.model_dim:
             raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}")
 
+        if self.mup_embedding_multiplier > 0.0:
+            embed = embed * self.mup_embedding_multiplier
+
         return embed
 
     def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor,