Formatting multihead_attention.py
Summary: Just formatting, so that the diff above is easier to review.

Reviewed By: myleott

Differential Revision: D18799003

fbshipit-source-id: eb9dd385251edd7c60a285d7f576a7dad888db7b
cndn authored and facebook-github-bot committed Jan 22, 2020
1 parent 6b54873 commit 2535cab
Showing 3 changed files with 248 additions and 158 deletions.
fairseq/modules/multihead_attention.py: 217 changes (140 additions, 77 deletions)
@@ -4,12 +4,12 @@
# LICENSE file in the root directory of this source tree.

import math

import torch
from torch import nn
from torch.nn import Parameter
import torch.nn.functional as F

from fairseq import utils
from torch import nn
from torch.nn import Parameter


class MultiheadAttention(nn.Module):
@@ -18,9 +18,19 @@ class MultiheadAttention(nn.Module):
See "Attention Is All You Need" for more details.
"""

def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=True,
add_bias_kv=False, add_zero_attn=False, self_attention=False,
encoder_decoder_attention=False):
def __init__(
self,
embed_dim,
num_heads,
kdim=None,
vdim=None,
dropout=0.0,
bias=True,
add_bias_kv=False,
add_zero_attn=False,
self_attention=False,
encoder_decoder_attention=False,
):
super().__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
@@ -30,14 +40,17 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
assert (
self.head_dim * num_heads == self.embed_dim
), "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim ** -0.5

self.self_attention = self_attention
self.encoder_decoder_attention = encoder_decoder_attention

assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \
'value to be of the same size'
assert not self.self_attention or self.qkv_same_dim, (
"Self-attention requires query, key and " "value to be of the same size"
)

self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias)
self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias)
@@ -70,9 +83,9 @@ def reset_parameters(self):
if self.qkv_same_dim:
# Empirically observed the convergence to be much better with
# the scaled initialization
nn.init.xavier_uniform_(self.k_proj.weight, gain=1/math.sqrt(2))
nn.init.xavier_uniform_(self.v_proj.weight, gain=1/math.sqrt(2))
nn.init.xavier_uniform_(self.q_proj.weight, gain=1/math.sqrt(2))
nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
else:
nn.init.xavier_uniform_(self.k_proj.weight)
nn.init.xavier_uniform_(self.v_proj.weight)
@@ -88,7 +101,9 @@ def reset_parameters(self):

def forward(
self,
query, key, value,
query,
key,
value,
key_padding_mask=None,
incremental_state=None,
need_weights=True,
@@ -121,23 +136,39 @@ def forward(
assert embed_dim == self.embed_dim
assert list(query.size()) == [tgt_len, bsz, embed_dim]

if self.enable_torch_version and not self.onnx_trace and incremental_state is None and not static_kv:
return F.multi_head_attention_forward(query, key, value,
self.embed_dim, self.num_heads,
torch.empty([0]),
torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
self.bias_k, self.bias_v,
self.add_zero_attn, self.dropout,
self.out_proj.weight, self.out_proj.bias,
self.training, key_padding_mask, need_weights,
attn_mask, use_separate_proj_weight=True,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight)
if (
self.enable_torch_version
and not self.onnx_trace
and incremental_state is None
and not static_kv
):
return F.multi_head_attention_forward(
query,
key,
value,
self.embed_dim,
self.num_heads,
torch.empty([0]),
torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
self.bias_k,
self.bias_v,
self.add_zero_attn,
self.dropout,
self.out_proj.weight,
self.out_proj.bias,
self.training,
key_padding_mask,
need_weights,
attn_mask,
use_separate_proj_weight=True,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
)

if incremental_state is not None:
saved_state = self._get_input_buffer(incremental_state)
if 'prev_key' in saved_state:
if "prev_key" in saved_state:
# previous time steps are cached - no need to recompute
# key and value if they are static
if static_kv:
@@ -171,42 +202,65 @@ def forward(
k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
if attn_mask is not None:
attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
attn_mask = torch.cat(
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1)], dim=1)

q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
[
key_padding_mask,
key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
],
dim=1,
)

q = (
q.contiguous()
.view(tgt_len, bsz * self.num_heads, self.head_dim)
.transpose(0, 1)
)
if k is not None:
k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
k = (
k.contiguous()
.view(-1, bsz * self.num_heads, self.head_dim)
.transpose(0, 1)
)
if v is not None:
v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
v = (
v.contiguous()
.view(-1, bsz * self.num_heads, self.head_dim)
.transpose(0, 1)
)

if saved_state is not None:
# saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
if 'prev_key' in saved_state:
prev_key = saved_state['prev_key'].view(bsz * self.num_heads, -1, self.head_dim)
if "prev_key" in saved_state:
prev_key = saved_state["prev_key"].view(
bsz * self.num_heads, -1, self.head_dim
)
if static_kv:
k = prev_key
else:
k = torch.cat((prev_key, k), dim=1)
if 'prev_value' in saved_state:
prev_value = saved_state['prev_value'].view(bsz * self.num_heads, -1, self.head_dim)
if "prev_value" in saved_state:
prev_value = saved_state["prev_value"].view(
bsz * self.num_heads, -1, self.head_dim
)
if static_kv:
v = prev_value
else:
v = torch.cat((prev_value, v), dim=1)
key_padding_mask = self._append_prev_key_padding_mask(
key_padding_mask=key_padding_mask,
prev_key_padding_mask=saved_state.get('prev_key_padding_mask', None),
prev_key_padding_mask=saved_state.get("prev_key_padding_mask", None),
batch_size=bsz,
src_len=k.size(1),
static_kv=static_kv,
)

saved_state['prev_key'] = k.view(bsz, self.num_heads, -1, self.head_dim)
saved_state['prev_value'] = v.view(bsz, self.num_heads, -1, self.head_dim)
saved_state['prev_key_padding_mask'] = key_padding_mask
saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
saved_state["prev_key_padding_mask"] = key_padding_mask

self._set_input_buffer(incremental_state, saved_state)

@@ -226,10 +280,19 @@ def forward(
k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
if attn_mask is not None:
attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
attn_mask = torch.cat(
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)], dim=1)
[
key_padding_mask,
torch.zeros(key_padding_mask.size(0), 1).type_as(
key_padding_mask
),
],
dim=1,
)

attn_weights = torch.bmm(q, k.transpose(1, 2))
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
@@ -246,21 +309,26 @@ def forward(
# don't attend to padding symbols
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights.masked_fill(
key_padding_mask.unsqueeze(1).unsqueeze(2),
float('-inf'),
key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf")
)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if before_softmax:
return attn_weights, v

attn_weights_float = utils.softmax(attn_weights, dim=-1, onnx_trace=self.onnx_trace)
attn_weights_float = utils.softmax(
attn_weights, dim=-1, onnx_trace=self.onnx_trace
)
attn_weights = attn_weights_float.type_as(attn_weights)
attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training)
attn_probs = F.dropout(
attn_weights_float.type_as(attn_weights),
p=self.dropout,
training=self.training,
)

attn = torch.bmm(attn_probs, v)
assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
if (self.onnx_trace and attn.size(1) == 1):
if self.onnx_trace and attn.size(1) == 1:
# when ONNX tracing a single decoder step (sequence length == 1)
# the transpose is a no-op copy before view, thus unnecessary
attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
Expand All @@ -269,7 +337,9 @@ def forward(
attn = self.out_proj(attn)

if need_weights:
attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
attn_weights = attn_weights_float.view(
bsz, self.num_heads, tgt_len, src_len
).transpose(1, 0)
if not need_head_weights:
# average attention weights over heads
attn_weights = attn_weights.mean(dim=0)
Expand All @@ -280,22 +350,22 @@ def forward(

@staticmethod
def _append_prev_key_padding_mask(
key_padding_mask,
prev_key_padding_mask,
batch_size,
src_len,
static_kv,
key_padding_mask, prev_key_padding_mask, batch_size, src_len, static_kv
):
# saved key padding masks have shape (bsz, seq_len)
if prev_key_padding_mask is not None and static_kv:
key_padding_mask = prev_key_padding_mask
elif prev_key_padding_mask is not None and key_padding_mask is not None:
key_padding_mask = torch.cat((prev_key_padding_mask, key_padding_mask), dim=1)
key_padding_mask = torch.cat(
(prev_key_padding_mask, key_padding_mask), dim=1
)
# During incremental decoding, as the padding token enters and
# leaves the frame, there will be a time when prev or current
# is None
elif prev_key_padding_mask is not None:
filler = torch.zeros(batch_size, src_len - prev_key_padding_mask.size(1)).bool()
filler = torch.zeros(
batch_size, src_len - prev_key_padding_mask.size(1)
).bool()
if prev_key_padding_mask.is_cuda:
filler = filler.cuda()
key_padding_mask = torch.cat((prev_key_padding_mask, filler), dim=1)
@@ -316,45 +386,38 @@ def reorder_incremental_state(self, incremental_state, new_order):
self._set_input_buffer(incremental_state, input_buffer)

def _get_input_buffer(self, incremental_state):
return utils.get_incremental_state(
self,
incremental_state,
'attn_state',
) or {}
return utils.get_incremental_state(self, incremental_state, "attn_state") or {}

def _set_input_buffer(self, incremental_state, buffer):
utils.set_incremental_state(
self,
incremental_state,
'attn_state',
buffer,
)
utils.set_incremental_state(self, incremental_state, "attn_state", buffer)

def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
return attn_weights

def upgrade_state_dict_named(self, state_dict, name):
prefix = name + '.' if name != '' else ''
prefix = name + "." if name != "" else ""
items_to_add = {}
keys_to_remove = []
for k in state_dict.keys():
if k.endswith(prefix + 'in_proj_weight'):
if k.endswith(prefix + "in_proj_weight"):
# in_proj_weight used to be q + k + v with same dimensions
dim = int(state_dict[k].shape[0] / 3)
items_to_add[prefix + 'q_proj.weight'] = state_dict[k][:dim]
items_to_add[prefix + 'k_proj.weight'] = state_dict[k][dim:2*dim]
items_to_add[prefix + 'v_proj.weight'] = state_dict[k][2*dim:]
items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :]

keys_to_remove.append(k)

k_bias = prefix + 'in_proj_bias'
k_bias = prefix + "in_proj_bias"
if k_bias in state_dict.keys():
dim = int(state_dict[k].shape[0] / 3)
items_to_add[prefix + 'q_proj.bias'] = state_dict[k_bias][:dim]
items_to_add[prefix + 'k_proj.bias'] = state_dict[k_bias][dim:2*dim]
items_to_add[prefix + 'v_proj.bias'] = state_dict[k_bias][2*dim:]
items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim]
items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][
dim : 2 * dim
]
items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :]

keys_to_remove.append(prefix + 'in_proj_bias')
keys_to_remove.append(prefix + "in_proj_bias")

for k in keys_to_remove:
del state_dict[k]
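
For orientation, here is a minimal, hypothetical usage sketch of the MultiheadAttention module reformatted above, configured as self-attention. The constructor arguments, the forward() keywords, and the (tgt_len, bsz, embed_dim) input convention are taken from the code in this diff; the concrete sizes, the dropout value, and the assumption that a fairseq checkout at this revision is importable are illustrative, not part of the commit.

# Hypothetical sketch (not part of this commit): exercising the module above as
# self-attention; shapes follow the (tgt_len, bsz, embed_dim) convention that
# forward() asserts on its query input.
import torch

from fairseq.modules.multihead_attention import MultiheadAttention

embed_dim, num_heads = 16, 4
attn = MultiheadAttention(
    embed_dim,
    num_heads,
    dropout=0.1,          # illustrative value
    self_attention=True,  # query, key and value share the same embedding size
)

x = torch.randn(10, 2, embed_dim)  # (tgt_len=10, bsz=2, embed_dim=16)
out, attn_weights = attn(query=x, key=x, value=x, need_weights=True)

print(out.shape)           # torch.Size([10, 2, 16]), i.e. (tgt_len, bsz, embed_dim)
print(attn_weights.shape)  # torch.Size([2, 10, 10]): attention averaged over heads

Either branch of forward() (the F.multi_head_attention_forward fast path or the explicit fairseq implementation) returns the same shapes, so the sketch does not depend on which branch is taken.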
