From 8d61cd684eaa198b15a66184f4c290b568fcf574 Mon Sep 17 00:00:00 2001
From: wlhgtc
Date: Fri, 22 Mar 2019 17:05:43 +0800
Subject: [PATCH 1/2] fix the "masked_fill" bug

masked_fill writes the fill value wherever the mask is 1, not where it is 0.
If it is called with ex_mask as-is (where 0 marks padding), it fills the
non-padding positions (those whose value in ex_mask is 1) with 0 instead of
the padding, which hurts performance.
---
 fastNLP/modules/encoder/star_transformer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fastNLP/modules/encoder/star_transformer.py b/fastNLP/modules/encoder/star_transformer.py
index b28d3d1d..809948bb 100644
--- a/fastNLP/modules/encoder/star_transformer.py
+++ b/fastNLP/modules/encoder/star_transformer.py
@@ -57,6 +57,7 @@ class StarTransformer(nn.Module):
         nodes = embs
         relay = embs.mean(2, keepdim=True)
         ex_mask = mask[:, None, :, None].expand(B, H, L, 1)
+        ex_mask = ex_mask.ne(1) # reverse mask for next masked_fill.
         r_embs = embs.view(B, H, 1, L)
         for i in range(self.iters):
             ax = torch.cat([r_embs, relay.expand(B, H, 1, L)], 2)

From 28d3f500c1bf4b1144690cfa747274059ca24b80 Mon Sep 17 00:00:00 2001
From: wlhgtc
Date: Sun, 31 Mar 2019 00:00:59 +0800
Subject: [PATCH 2/2] Fix bug in MSA2 (mixed k and v)

As the title says: another bug, v was built from k instead of the value
projection.
---
 fastNLP/modules/encoder/star_transformer.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fastNLP/modules/encoder/star_transformer.py b/fastNLP/modules/encoder/star_transformer.py
index 809948bb..1618c8ee 100644
--- a/fastNLP/modules/encoder/star_transformer.py
+++ b/fastNLP/modules/encoder/star_transformer.py
@@ -7,7 +7,6 @@ import numpy as NP
 class StarTransformer(nn.Module):
     """Star-Transformer Encoder part。
     paper: https://arxiv.org/abs/1902.09113
-
     :param hidden_size: int, 输入维度的大小。同时也是输出维度的大小。
     :param num_layers: int, star-transformer的层数
     :param num_head: int，head的数量。
@@ -46,6 +45,7 @@ class StarTransformer(nn.Module):
             return f(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
 
         B, L, H = data.size()
+        mask = (mask == 0) # flip the mask for masked_fill_
         smask = torch.cat([torch.zeros(B, 1, ).byte().to(mask), mask], 1)
 
         embs = data.permute(0, 2, 1)[:,:,:,None] # B H L 1
@@ -57,7 +57,6 @@ class StarTransformer(nn.Module):
         nodes = embs
         relay = embs.mean(2, keepdim=True)
         ex_mask = mask[:, None, :, None].expand(B, H, L, 1)
-        ex_mask = ex_mask.ne(1) # reverse mask for next masked_fill.
         r_embs = embs.view(B, H, 1, L)
         for i in range(self.iters):
             ax = torch.cat([r_embs, relay.expand(B, H, 1, L)], 2)
@@ -137,11 +136,10 @@ class MSA2(nn.Module):
 
         q = q.view(B, nhead, 1, head_dim) # B, H, 1, 1 -> B, N, 1, h
         k = k.view(B, nhead, head_dim, L) # B, H, L, 1 -> B, N, h, L
-        v = k.view(B, nhead, head_dim, L).permute(0, 1, 3, 2) # B, H, L, 1 -> B, N, L, h
+        v = v.view(B, nhead, head_dim, L).permute(0, 1, 3, 2) # B, H, L, 1 -> B, N, L, h
         pre_a = torch.matmul(q, k) / NP.sqrt(head_dim)
         if mask is not None:
             pre_a = pre_a.masked_fill(mask[:, None, None, :], -float('inf'))
         alphas = self.drop(F.softmax(pre_a, 3)) # B, N, 1, L
         att = torch.matmul(alphas, v).view(B, -1, 1, 1) # B, N, 1, h -> B, N*h, 1, 1
         return self.WO(att)
-
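
The first patch hinges on the semantics of masked_fill: the fill value is
written wherever the mask is 1/True, not where it is 0. A minimal sketch of
that convention (toy shapes and tensors, not code from the patched file):

    import torch

    # Toy batch: one sequence of length 4 whose last position is padding.
    # Convention from the patch: mask == 1 marks real tokens, 0 marks padding.
    mask = torch.tensor([[1, 1, 1, 0]])   # B x L
    nodes = torch.randn(1, 4)

    # Passing the raw mask zeroes out the real tokens and leaves the padding
    # untouched, which is the bug described in the commit message.
    wrong = nodes.masked_fill(mask.bool(), 0)

    # Flipping the mask first (mask == 0, or mask.ne(1)) targets the padding,
    # which is what the added ex_mask.ne(1) / (mask == 0) lines do.
    right = nodes.masked_fill(mask.eq(0), 0)
    print(wrong)  # real positions zeroed
    print(right)  # only the padded position zeroed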
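
The second patch fixes MSA2, where the value tensor was reshaped from k, so
attention summed key vectors instead of value vectors. A shape-level sketch of
the corrected single-query attention (illustrative dimensions, not the module
itself):

    import torch

    B, N, h, L = 2, 4, 8, 5          # batch, heads, head dim, sequence length
    q = torch.randn(B, N, 1, h)      # one query per head (the relay node)
    k = torch.randn(B, N, h, L)      # keys, laid out for q @ k
    v = torch.randn(B, N, L, h)      # values from the value projection, not from k

    scores = torch.matmul(q, k) / h ** 0.5   # B, N, 1, L
    alphas = torch.softmax(scores, dim=-1)   # attention weights over the L positions
    att = torch.matmul(alphas, v)            # B, N, 1, h: weighted sum of value vectors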