# patch_unsloth.py
# Forked from ostix360/Quiet_STaR.
from math import sqrt as math_sqrt
from typing import List, Optional, Tuple, Union

import torch
from peft import PeftModelForCausalLM
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa
from transformers.modeling_outputs import BaseModelOutputWithPast
from unsloth.kernels import rms_layernorm, fast_rms_layernorm
from unsloth.kernels.rms_layernorm import _rms_layernorm_backward
from unsloth.models.llama import (
    PeftModelForCausalLM_fast_forward,
    LlamaRotaryEmbedding,
    LlamaAttention_fast_forward,
)
from unsloth.models.mistral import MistralAttention_fast_forward
from xformers.ops.fmha import attn_bias

import modeling_mistral
from modeling_mistral import (
    MistralAttention,
    MistralSdpaAttention,
    MistralFlashAttention2,
    MistralDecoderLayer,
    MistralModel,
)
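
# This file monkey-patches the Quiet-STaR fork's `modeling_mistral` module (plus
# the PEFT causal-LM wrapper) with Unsloth's fast attention, decoder-layer, and
# RMSNorm code paths, so the modified Mistral model keeps Unsloth's training
# speedups. `pre_patch`, `pre_patch_llama`, and `backward` below are installed
# onto Unsloth's classes by `patch()` at the bottom of this file.
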
@staticmethod
def pre_patch():
    # Installed onto FastMistralModel in patch() below; Unsloth calls it before
    # loading the model. Route every Mistral attention variant through Unsloth's
    # fast Mistral attention, and swap in the fast decoder-layer / model forwards.
    MistralAttention.forward = MistralAttention_fast_forward
    MistralSdpaAttention.forward = MistralAttention_fast_forward
    MistralFlashAttention2.forward = MistralAttention_fast_forward
    MistralDecoderLayer.forward = LlamaDecoderLayer_fast_forward
    MistralModel.forward = LlamaModel_fast_forward
    PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward
    # Solves https://github.com/unslothai/unsloth/issues/168
    # The static KV cache introduced in transformers 4.38.0 made training much slower.
    # Inference can now be CUDA-graphed, but we retain the old rotary embeddings.
    # https://github.com/huggingface/transformers/pull/27931
    # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
    modeling_mistral.MistralRotaryEmbedding = LlamaRotaryEmbedding
    return
pass
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L590
def LlamaDecoderLayer_fast_forward(
    self,
    hidden_states: torch.Tensor,
    causal_mask: Optional[attn_bias.BlockDiagonalCausalMask] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: Optional[bool] = False,
    use_cache: Optional[bool] = False,
    padding_mask: Optional[torch.LongTensor] = None,
    *args, **kwargs,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
    """
    Args:
        hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
        attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
            `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attention tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
            (see `past_key_values`).
        past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
    """
    # Self-attention block with a pre-norm residual connection.
    residual = hidden_states
    hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states)
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
        hidden_states=hidden_states,
        causal_mask=causal_mask,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=output_attentions,
        use_cache=use_cache,
        padding_mask=padding_mask,
    )
    hidden_states = residual + hidden_states

    # Fully connected (MLP) block, also pre-norm with a residual connection.
    residual = hidden_states
    hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states)
    hidden_states = self.mlp(hidden_states)
    hidden_states = residual + hidden_states

    outputs = (hidden_states,)
    if output_attentions:
        outputs += (self_attn_weights,)
    if use_cache:
        outputs += (present_key_value,)
    return outputs
pass
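
# For reference, `fast_rms_layernorm(module, X)` above is Unsloth's fused Triton
# replacement for the eager RMSNorm. A sketch of the semantics it implements
# (not the kernel itself), matching the Hugging Face RMSNorm modules:
#
#   X32 = X.to(torch.float32)
#   X32 = X32 * torch.rsqrt(X32.pow(2).mean(-1, keepdim=True) + module.variance_epsilon)
#   out = module.weight * X32.to(X.dtype)
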
def LlamaModel_fast_forward(
    self,
    input_ids: torch.LongTensor,
    causal_mask: Optional[attn_bias.BlockDiagonalCausalMask] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    *args, **kwargs,
) -> Union[Tuple, BaseModelOutputWithPast]:
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    assert output_attentions is False
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # Retrieve input_ids and inputs_embeds
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "Unsloth: You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
    elif input_ids is not None:
        batch_size, seq_length = input_ids.shape
    elif inputs_embeds is not None:
        batch_size, seq_length, _ = inputs_embeds.shape
    else:
        raise ValueError("Unsloth: You have to specify either decoder_input_ids or decoder_inputs_embeds")
    seq_length_with_past = seq_length

    # Fix out-of-bounds tokenization by truncating to the model's max sequence length.
    if hasattr(self, "max_seq_length"):
        if seq_length > self.max_seq_length:
            modeling_mistral.logger.warning_once(
                f"Unsloth: Input IDs of length {seq_length} > the model's max sequence length of {self.max_seq_length}.\n"
                "We will truncate them ourselves, but you should fix this issue upstream."
            )
        if input_ids is not None:
            input_ids = input_ids[:, :self.max_seq_length]
        elif inputs_embeds is not None:
            inputs_embeds = inputs_embeds[:, :self.max_seq_length, :]
        pass
    pass
    past_key_values_length = 0
    if past_key_values is not None:
        past_key_values_length = past_key_values[0][0].shape[2]
        seq_length_with_past = seq_length_with_past + past_key_values_length
    pass

    # We already handle KV cache position_ids ourselves, so this branch is
    # disabled and kept only for reference.
    if False:  # (past_key_values_length != 0):
        position_ids = torch.arange(
            past_key_values_length, seq_length + past_key_values_length,
            dtype=torch.int32,
            device="cuda",
        )
        position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
    elif position_ids is not None:
        position_ids = position_ids.view(-1, seq_length).to(torch.int32)  # .long()
    else:
        position_ids = None
    pass

    if position_ids is not None:
        if position_ids.shape[0] != batch_size:
            position_ids = position_ids.repeat((batch_size, 1))
    pass

    # Embed positions
    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)
    inputs_embeds = inputs_embeds.to(self.config.torch_dtype)

    # Gemma normalizes the embeddings by sqrt(hidden_size).
    IS_GEMMA = self.config.model_type == "gemma"
    train_embed_tokens = self.embed_tokens.weight.requires_grad

    if IS_GEMMA:
        # Match Gemma exactly by casting to bfloat16 / float16:
        # inputs_embeds *= math_sqrt(self.config.hidden_size)
        # i.e. 3072**0.5 = 55.5000 in bfloat16, whilst 55.4256 in float32,
        # and  2048**0.5 = 45.2500 in bfloat16, whilst 45.2548 in float32.
        normalizer = torch.tensor(math_sqrt(self.config.hidden_size), dtype=inputs_embeds.dtype)

        if train_embed_tokens:
            # Careful: we must not do an in-place op when the embeddings are trained!
            inputs_embeds = inputs_embeds * normalizer
        else:
            inputs_requires_grad = inputs_embeds.requires_grad
            if not inputs_embeds.is_leaf:
                inputs_embeds = inputs_embeds.detach()
                inputs_requires_grad = True
            elif inputs_requires_grad:
                inputs_embeds.requires_grad_(False)
            pass
            inputs_embeds *= normalizer
            # inputs_embeds *= math_sqrt(self.config.hidden_size)
            if inputs_requires_grad: inputs_embeds.requires_grad_(True)
        pass
    pass
    # Fix up the attention mask by zeroing out padded embeddings.
    # Specifically for DPO.
    if self._has_no_labels and (attention_mask is not None) and (past_key_values is None) and \
            (not train_embed_tokens):
        # Careful: for inference the attention_mask has size (1, kv_seq_len),
        # whilst inputs_embeds has size (1, 1, 4096).
        inputs_requires_grad = inputs_embeds.requires_grad
        if not inputs_embeds.is_leaf:
            inputs_embeds = inputs_embeds.detach()
            inputs_requires_grad = True
        elif inputs_requires_grad:
            inputs_embeds.requires_grad_(False)
        pass
        inputs_embeds *= attention_mask.unsqueeze(0).transpose(0, 1).transpose(1, 2)
        if inputs_requires_grad: inputs_embeds.requires_grad_(True)
    pass

    # Ignore the attention_mask during training; for inference, build a 4D SDPA mask.
    if attention_mask is None:
        padding_mask = None
    elif self.training:
        attention_mask = None
        padding_mask = None
    else:
        # if 0 in attention_mask:
        #     padding_mask = attention_mask
        # else:
        padding_mask = None

        attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
            attention_mask,
            (batch_size, seq_length),
            inputs_embeds,
            past_key_values_length,
            sliding_window=getattr(self.config, "sliding_window", None),
        )
    pass
    hidden_states = inputs_embeds

    # Decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = () if use_cache else None

    # Gradient checkpointing methods (i.e. sqrt); note that `boundaries` is
    # collected here but not used further below.
    if hasattr(self, "_gradient_checkpointing_boundaries"):
        boundaries = self._gradient_checkpointing_boundaries
    else:
        boundaries = None
    pass

    for idx, decoder_layer in enumerate(self.layers):
        if output_hidden_states:
            all_hidden_states += (hidden_states,)
        past_key_value = past_key_values[idx] if past_key_values is not None else None

        if self.gradient_checkpointing and self.training:
            def create_custom_forward(module):
                def custom_forward(*inputs):
                    # past_key_value and output_attentions are captured from the
                    # enclosing scope rather than passed through checkpoint().
                    return module(*inputs, past_key_value, output_attentions, padding_mask=padding_mask)
                return custom_forward

            layer_outputs = torch.utils.checkpoint.checkpoint(
                create_custom_forward(decoder_layer),
                hidden_states,
                causal_mask,
                attention_mask,
                position_ids,
                use_reentrant=True,
                preserve_rng_state=False,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                causal_mask=causal_mask,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                padding_mask=padding_mask,
            )

        hidden_states = layer_outputs[0]
        if use_cache:
            next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
        if output_attentions:
            all_self_attns += (layer_outputs[1],)
    pass

    hidden_states = rms_layernorm.fast_rms_layernorm(self.norm, hidden_states, gemma=IS_GEMMA)

    # Add hidden states from the last decoder layer
    if output_hidden_states:
        all_hidden_states += (hidden_states,)
    next_cache = next_decoder_cache if use_cache else None

    if not return_dict:
        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
    )
pass
@staticmethod
def pre_patch_llama():
    # Same patching as pre_patch(), but routes attention through Unsloth's
    # Llama fast-attention path instead of the Mistral one.
    MistralAttention.forward = LlamaAttention_fast_forward
    MistralSdpaAttention.forward = LlamaAttention_fast_forward
    MistralFlashAttention2.forward = LlamaAttention_fast_forward
    MistralDecoderLayer.forward = LlamaDecoderLayer_fast_forward
    MistralModel.forward = LlamaModel_fast_forward
    PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward
    # Solves https://github.com/unslothai/unsloth/issues/168
    # The static KV cache introduced in transformers 4.38.0 made training much slower.
    # Inference can now be CUDA-graphed, but we retain the old rotary embeddings.
    # https://github.com/huggingface/transformers/pull/27931
    # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
    import transformers.models.llama.modeling_llama
    modeling_mistral.MistralRotaryEmbedding = LlamaRotaryEmbedding
    return
pass
@staticmethod
def backward(ctx, dY):
    # Backward pass for Unsloth's fused RMSNorm: the Triton kernel writes dX
    # into dY in place, and `dW` is handed X purely as a placeholder buffer
    # since no weight gradient is produced here.
    shape = dY.shape
    dim = shape[-1]
    dY = dY.reshape(-1, dim)
    X, W, r = ctx.saved_tensors
    n_rows, n_cols = dY.shape
    dW = X

    _rms_layernorm_backward[(n_rows,)](
        dY, dY.stride(0),
        X, X.stride(0),
        W, W.stride(0),
        r, r.stride(0),
        dW, dW.stride(0),
        n_cols, ctx.eps,
        GEMMA=ctx.GEMMA,
        BLOCK_SIZE=ctx.BLOCK_SIZE,
        num_warps=ctx.num_warps,
    )
    dX = dY.view(*shape)
    return dX, None, None, None
pass
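
# For sanity-checking the Triton backward above, here is a plain-PyTorch
# restatement of the same gradient (an illustrative addition, not part of
# Unsloth; it assumes the non-Gemma case, where the forward computed
# Y = X * r * W with r = 1/sqrt(mean(X**2, dim=-1) + eps) saved by the kernel).
def rms_layernorm_backward_reference(dY, X, W, r):
    # dX = r * (dY * W) - X * r**3 * mean(dY * W * X, dim=-1)
    r = r.view(-1, 1).to(X.dtype)
    dY_W = dY.to(X.dtype) * W
    return r * dY_W - X * r.pow(3) * (dY_W * X).mean(dim=-1, keepdim=True)
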
def patch():
    from unsloth.models import mistral, llama
    from unsloth import kernels

    # Install the patched hooks: Unsloth calls pre_patch() while building the
    # model, and the fused RMSNorm autograd Function gets the backward above.
    mistral.FastMistralModel.pre_patch = pre_patch
    llama.FastLlamaModel.pre_patch = pre_patch_llama
    kernels.rms_layernorm.Fast_RMS_Layernorm.backward = backward
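
# A minimal usage sketch (an illustrative addition, not part of the original
# file): the patches must be installed before Unsloth builds the model, so a
# hypothetical training script might start like this. The model name and
# max_seq_length below are placeholder assumptions.
#
#   import patch_unsloth
#   patch_unsloth.patch()
#
#   from unsloth import FastLanguageModel
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       model_name="mistralai/Mistral-7B-v0.1",
#       max_seq_length=4096,
#       load_in_4bit=True,
#   )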