Formatting multihead_attention.py
Summary: Just formatting, so that the diff above is easier to review.

Reviewed By: myleott

Differential Revision: D18799003

fbshipit-source-id: eb9dd385251edd7c60a285d7f576a7dad888db7b
cndn authored and facebook-github-bot committed Jan 22, 2020
1 parent 6b54873 commit 2535cab
Showing 3 changed files with 248 additions and 158 deletions.
fairseq/modules/multihead_attention.py: 217 changes (140 additions, 77 deletions)
@@ -4,12 +4,12 @@
# LICENSE file in the root directory of this source tree.

import math

import torch
from torch import nn
from torch.nn import Parameter
import torch.nn.functional as F

from fairseq import utils
from torch import nn
from torch.nn import Parameter


class MultiheadAttention(nn.Module):
@@ -18,9 +18,19 @@ class MultiheadAttention(nn.Module):
See "Attention Is All You Need" for more details.
"""

def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=True,
add_bias_kv=False, add_zero_attn=False, self_attention=False,
encoder_decoder_attention=False):
def __init__(
self,
embed_dim,
num_heads,
kdim=None,
vdim=None,
dropout=0.0,
bias=True,
add_bias_kv=False,
add_zero_attn=False,
self_attention=False,
encoder_decoder_attention=False,
):
super().__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
@@ -30,14 +40,17 @@ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
assert (
self.head_dim * num_heads == self.embed_dim
), "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim ** -0.5

self.self_attention = self_attention
self.encoder_decoder_attention = encoder_decoder_attention

assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \
'value to be of the same size'
assert not self.self_attention or self.qkv_same_dim, (
"Self-attention requires query, key and " "value to be of the same size"
)

self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias)
self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias)
@@ -70,9 +83,9 @@ def reset_parameters(self):
if self.qkv_same_dim:
# Empirically observed the convergence to be much better with
# the scaled initialization
nn.init.xavier_uniform_(self.k_proj.weight, gain=1/math.sqrt(2))
nn.init.xavier_uniform_(self.v_proj.weight, gain=1/math.sqrt(2))
nn.init.xavier_uniform_(self.q_proj.weight, gain=1/math.sqrt(2))
nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
else:
nn.init.xavier_uniform_(self.k_proj.weight)
nn.init.xavier_uniform_(self.v_proj.weight)
@@ -88,7 +101,9 @@ def reset_parameters(self):

def forward(
self,
query, key, value,
query,
key,
value,
key_padding_mask=None,
incremental_state=None,
need_weights=True,
@@ -121,23 +136,39 @@ def forward(
assert embed_dim == self.embed_dim
assert list(query.size()) == [tgt_len, bsz, embed_dim]

if self.enable_torch_version and not self.onnx_trace and incremental_state is None and not static_kv:
return F.multi_head_attention_forward(query, key, value,
self.embed_dim, self.num_heads,
torch.empty([0]),
torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
self.bias_k, self.bias_v,
self.add_zero_attn, self.dropout,
self.out_proj.weight, self.out_proj.bias,
self.training, key_padding_mask, need_weights,
attn_mask, use_separate_proj_weight=True,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight)
if (
self.enable_torch_version
and not self.onnx_trace
and incremental_state is None
and not static_kv
):
return F.multi_head_attention_forward(
query,
key,
value,
self.embed_dim,
self.num_heads,
torch.empty([0]),
torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
self.bias_k,
self.bias_v,
self.add_zero_attn,
self.dropout,
self.out_proj.weight,
self.out_proj.bias,
self.training,
key_padding_mask,
need_weights,
attn_mask,
use_separate_proj_weight=True,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
)

if incremental_state is not None:
saved_state = self._get_input_buffer(incremental_state)
if 'prev_key' in saved_state:
if "prev_key" in saved_state:
# previous time steps are cached - no need to recompute
# key and value if they are static
if static_kv:
@@ -171,42 +202,65 @@ def forward(
k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
if attn_mask is not None:
attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
attn_mask = torch.cat(
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1)], dim=1)

q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
[
key_padding_mask,
key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
],
dim=1,
)

q = (
q.contiguous()
.view(tgt_len, bsz * self.num_heads, self.head_dim)
.transpose(0, 1)
)
if k is not None:
k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
k = (
k.contiguous()
.view(-1, bsz * self.num_heads, self.head_dim)
.transpose(0, 1)
)
if v is not None:
v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
v = (
v.contiguous()
.view(-1, bsz * self.num_heads, self.head_dim)
.transpose(0, 1)
)

if saved_state is not None:
# saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
if 'prev_key' in saved_state:
prev_key = saved_state['prev_key'].view(bsz * self.num_heads, -1, self.head_dim)
if "prev_key" in saved_state:
prev_key = saved_state["prev_key"].view(
bsz * self.num_heads, -1, self.head_dim
)
if static_kv:
k = prev_key
else:
k = torch.cat((prev_key, k), dim=1)
if 'prev_value' in saved_state:
prev_value = saved_state['prev_value'].view(bsz * self.num_heads, -1, self.head_dim)
if "prev_value" in saved_state:
prev_value = saved_state["prev_value"].view(
bsz * self.num_heads, -1, self.head_dim
)
if static_kv:
v = prev_value
else:
v = torch.cat((prev_value, v), dim=1)
key_padding_mask = self._append_prev_key_padding_mask(
key_padding_mask=key_padding_mask,
prev_key_padding_mask=saved_state.get('prev_key_padding_mask', None),
prev_key_padding_mask=saved_state.get("prev_key_padding_mask", None),
batch_size=bsz,
src_len=k.size(1),
static_kv=static_kv,
)

saved_state['prev_key'] = k.view(bsz, self.num_heads, -1, self.head_dim)
saved_state['prev_value'] = v.view(bsz, self.num_heads, -1, self.head_dim)
saved_state['prev_key_padding_mask'] = key_padding_mask
saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
saved_state["prev_key_padding_mask"] = key_padding_mask

self._set_input_buffer(incremental_state, saved_state)

@@ -226,10 +280,19 @@ def forward(
k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
if attn_mask is not None:
attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
attn_mask = torch.cat(
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)], dim=1)
[
key_padding_mask,
torch.zeros(key_padding_mask.size(0), 1).type_as(
key_padding_mask
),
],
dim=1,
)

attn_weights = torch.bmm(q, k.transpose(1, 2))
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
@@ -246,21 +309,26 @@ def forward(
# don't attend to padding symbols
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights.masked_fill(
key_padding_mask.unsqueeze(1).unsqueeze(2),
float('-inf'),
key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf")
)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if before_softmax:
return attn_weights, v

attn_weights_float = utils.softmax(attn_weights, dim=-1, onnx_trace=self.onnx_trace)
attn_weights_float = utils.softmax(
attn_weights, dim=-1, onnx_trace=self.onnx_trace
)
attn_weights = attn_weights_float.type_as(attn_weights)
attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training)
attn_probs = F.dropout(
attn_weights_float.type_as(attn_weights),
p=self.dropout,
training=self.training,
)

attn = torch.bmm(attn_probs, v)
assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
if (self.onnx_trace and attn.size(1) == 1):
if self.onnx_trace and attn.size(1) == 1:
# when ONNX tracing a single decoder step (sequence length == 1)
# the transpose is a no-op copy before view, thus unnecessary
attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
Expand All @@ -269,7 +337,9 @@ def forward(
attn = self.out_proj(attn)

if need_weights:
attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
attn_weights = attn_weights_float.view(
bsz, self.num_heads, tgt_len, src_len
).transpose(1, 0)
if not need_head_weights:
# average attention weights over heads
attn_weights = attn_weights.mean(dim=0)
Expand All @@ -280,22 +350,22 @@ def forward(

@staticmethod
def _append_prev_key_padding_mask(
key_padding_mask,
prev_key_padding_mask,
batch_size,
src_len,
static_kv,
key_padding_mask, prev_key_padding_mask, batch_size, src_len, static_kv
):
# saved key padding masks have shape (bsz, seq_len)
if prev_key_padding_mask is not None and static_kv:
key_padding_mask = prev_key_padding_mask
elif prev_key_padding_mask is not None and key_padding_mask is not None:
key_padding_mask = torch.cat((prev_key_padding_mask, key_padding_mask), dim=1)
key_padding_mask = torch.cat(
(prev_key_padding_mask, key_padding_mask), dim=1
)
# During incremental decoding, as the padding token enters and
# leaves the frame, there will be a time when prev or current
# is None
elif prev_key_padding_mask is not None:
filler = torch.zeros(batch_size, src_len - prev_key_padding_mask.size(1)).bool()
filler = torch.zeros(
batch_size, src_len - prev_key_padding_mask.size(1)
).bool()
if prev_key_padding_mask.is_cuda:
filler = filler.cuda()
key_padding_mask = torch.cat((prev_key_padding_mask, filler), dim=1)
@@ -316,45 +386,38 @@ def reorder_incremental_state(self, incremental_state, new_order):
self._set_input_buffer(incremental_state, input_buffer)

def _get_input_buffer(self, incremental_state):
return utils.get_incremental_state(
self,
incremental_state,
'attn_state',
) or {}
return utils.get_incremental_state(self, incremental_state, "attn_state") or {}

def _set_input_buffer(self, incremental_state, buffer):
utils.set_incremental_state(
self,
incremental_state,
'attn_state',
buffer,
)
utils.set_incremental_state(self, incremental_state, "attn_state", buffer)

def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
return attn_weights

def upgrade_state_dict_named(self, state_dict, name):
prefix = name + '.' if name != '' else ''
prefix = name + "." if name != "" else ""
items_to_add = {}
keys_to_remove = []
for k in state_dict.keys():
if k.endswith(prefix + 'in_proj_weight'):
if k.endswith(prefix + "in_proj_weight"):
# in_proj_weight used to be q + k + v with same dimensions
dim = int(state_dict[k].shape[0] / 3)
items_to_add[prefix + 'q_proj.weight'] = state_dict[k][:dim]
items_to_add[prefix + 'k_proj.weight'] = state_dict[k][dim:2*dim]
items_to_add[prefix + 'v_proj.weight'] = state_dict[k][2*dim:]
items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :]

keys_to_remove.append(k)

k_bias = prefix + 'in_proj_bias'
k_bias = prefix + "in_proj_bias"
if k_bias in state_dict.keys():
dim = int(state_dict[k].shape[0] / 3)
items_to_add[prefix + 'q_proj.bias'] = state_dict[k_bias][:dim]
items_to_add[prefix + 'k_proj.bias'] = state_dict[k_bias][dim:2*dim]
items_to_add[prefix + 'v_proj.bias'] = state_dict[k_bias][2*dim:]
items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim]
items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][
dim : 2 * dim
]
items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :]

keys_to_remove.append(prefix + 'in_proj_bias')
keys_to_remove.append(prefix + "in_proj_bias")

for k in keys_to_remove:
del state_dict[k]
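
For orientation, here is a minimal, hypothetical usage sketch of the MultiheadAttention module reformatted above, configured as self-attention. The constructor arguments, the forward() keywords, and the (tgt_len, bsz, embed_dim) input convention are taken from the code in this diff; the concrete sizes, the dropout value, and the assumption that a fairseq checkout at this revision is importable are illustrative, not part of the commit.

# Hypothetical sketch (not part of this commit): exercising the module above as
# self-attention; shapes follow the (tgt_len, bsz, embed_dim) convention that
# forward() asserts on its query input.
import torch

from fairseq.modules.multihead_attention import MultiheadAttention

embed_dim, num_heads = 16, 4
attn = MultiheadAttention(
    embed_dim,
    num_heads,
    dropout=0.1,          # illustrative value
    self_attention=True,  # query, key and value share the same embedding size
)

x = torch.randn(10, 2, embed_dim)  # (tgt_len=10, bsz=2, embed_dim=16)
out, attn_weights = attn(query=x, key=x, value=x, need_weights=True)

print(out.shape)           # torch.Size([10, 2, 16]), i.e. (tgt_len, bsz, embed_dim)
print(attn_weights.shape)  # torch.Size([2, 10, 10]): attention averaged over heads

Either branch of forward() (the F.multi_head_attention_forward fast path or the explicit fairseq implementation) returns the same shapes, so the sketch does not depend on which branch is taken.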
