From bffde7857afa157713ed6a73ad8a1646639820aa Mon Sep 17 00:00:00 2001
From: Yige Xu
Date: Mon, 14 Dec 2020 16:06:44 +0800
Subject: [PATCH 1/4] fix a bug in early exit of bert

---
 fastNLP/modules/encoder/bert.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py
index 28c47eb6..f304073d 100644
--- a/fastNLP/modules/encoder/bert.py
+++ b/fastNLP/modules/encoder/bert.py
@@ -374,20 +374,18 @@ class BertEncoder(nn.Module):
         self.num_output_layer = max(min(num_output_layer, len(self.layer)), 0)
         if self.num_output_layer + 1 < len(self.layer):
             logger.info(f'The transformer encoder will early exit after layer-{self.num_output_layer} '
-                        f'(start from 0)!')
+                        f'(layer 0 means embedding layer)!')

     def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
         all_encoder_layers = []
         for idx, layer_module in enumerate(self.layer):
-            if idx > self.num_output_layer:
+            if idx >= self.num_output_layer:
                 break
             hidden_states = layer_module(hidden_states, attention_mask)
             if output_all_encoded_layers:
                 all_encoder_layers.append(hidden_states)
         if not output_all_encoded_layers:
             all_encoder_layers.append(hidden_states)
-        if len(all_encoder_layers) == 0:
-            all_encoder_layers.append(hidden_states)
         return all_encoder_layers


@@ -445,8 +443,8 @@ class BertModel(nn.Module):
         self.hidden_size = self.config.hidden_size
         self.model_type = 'bert'
         neg_num_output_layer = kwargs.get('neg_num_output_layer', -1)
-        pos_num_output_layer = kwargs.get('pos_num_output_layer', self.config.num_hidden_layers - 1)
-        self.num_output_layer = max(neg_num_output_layer + self.config.num_hidden_layers, pos_num_output_layer)
+        pos_num_output_layer = kwargs.get('pos_num_output_layer', self.config.num_hidden_layers)
+        self.num_output_layer = max(neg_num_output_layer + 1 + self.config.num_hidden_layers, pos_num_output_layer)
         if hasattr(config, 'sinusoidal_pos_embds'):
             self.model_type = 'distilbert'
         elif 'model_type' in kwargs:
@@ -535,6 +533,7 @@ class BertModel(nn.Module):

         encoded_layers = self.encoder(embedding_output, extended_attention_mask,
                                       output_all_encoded_layers=output_all_encoded_layers)
+        encoded_layers.insert(0, embedding_output)
         sequence_output = encoded_layers[-1]
         if self.model_type != 'distilbert':
             pooled_output = self.pooler(sequence_output)
@@ -542,8 +541,6 @@ class BertModel(nn.Module):
             pooled_output = sequence_output[:, 0]
         if not output_all_encoded_layers:
             encoded_layers = encoded_layers[-1]
-        else:
-            encoded_layers.insert(0, embedding_output)
         return encoded_layers, pooled_output

     @classmethod
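(Not part of the patch series: a minimal usage sketch of the fixed early-exit behaviour. The checkpoint path is a placeholder, and BertModel.forward is assumed to accept attention_mask and output_all_encoded_layers keyword arguments, matching the hunks above. The very negative neg_num_output_layer mirrors what the embedding classes pass so that pos_num_output_layer alone decides the exit point.)

    import torch
    from fastNLP.modules.encoder.bert import BertModel

    # 'path/to/bert-base-uncased' stands in for a local BERT checkpoint directory.
    model = BertModel.from_pretrained('path/to/bert-base-uncased',
                                      neg_num_output_layer=-16384,
                                      pos_num_output_layer=4)

    input_ids = torch.LongTensor([[101, 2023, 2003, 1037, 3231, 102]])  # toy ids
    attention_mask = torch.ones_like(input_ids)

    encoded_layers, pooled_output = model(input_ids, attention_mask=attention_mask,
                                          output_all_encoded_layers=True)
    # encoded_layers[0] is now always the embedding output; encoded_layers[4] is the
    # output of the fourth (and last executed) transformer layer, deeper layers are skipped.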
From 84776696cdb3e4e50a99d59ed3f829cf89c8b5b7 Mon Sep 17 00:00:00 2001
From: willqvq
Date: Wed, 16 Dec 2020 10:40:00 +0800
Subject: [PATCH 2/4] Add support for a custom dataset download path in
 _get_dataset_url
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fastNLP/io/file_utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py
index ff33872b..b05a7a24 100644
--- a/fastNLP/io/file_utils.py
+++ b/fastNLP/io/file_utils.py
@@ -312,7 +312,8 @@ def _read_extend_url_file(filename, name)->str:
             return parts[1]
     return None

-def _get_dataset_url(name):
+
+def _get_dataset_url(name, dataset_dir: dict = None):
     r"""
     给定dataset的名称,返回下载url

@@ -323,8 +324,9 @@ def _get_dataset_url(name):
     url = _read_extend_url_file(FASTNLP_EXTEND_DATASET_URL, name)
     if url:
         return url
-
-    filename = DATASET_DIR.get(name, None)
+
+    dataset_dir = DATASET_DIR if dataset_dir is None else dataset_dir
+    filename = dataset_dir.get(name, None)
     if filename:
         url = _get_base_url('dataset') + filename
         return url
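(Not part of the patch series: a short sketch of the new optional parameter. The dataset name and archive name below are made up; when dataset_dir is omitted the function still falls back to the built-in DATASET_DIR mapping.)

    from fastNLP.io.file_utils import _get_dataset_url

    # Hypothetical mapping from dataset name to the file name on the download server.
    my_dataset_dir = {'my-corpus': 'my-corpus.zip'}

    # Resolves against the caller-supplied mapping instead of DATASET_DIR.
    url = _get_dataset_url('my-corpus', dataset_dir=my_dataset_dir)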
From 030e0aa3ee31b1489d3348f42998d4aae14a5f56 Mon Sep 17 00:00:00 2001
From: Yige Xu
Date: Sat, 26 Dec 2020 09:29:17 +0800
Subject: [PATCH 3/4] update some function about bert and roberta

---
 fastNLP/embeddings/bert_embedding.py    | 42 ++++++++++++++-------
 fastNLP/embeddings/roberta_embedding.py | 50 ++++++++++++++++++-------
 fastNLP/modules/encoder/bert.py         |  8 ++--
 3 files changed, 69 insertions(+), 31 deletions(-)

diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py
index 29b17c65..6434cc0d 100644
--- a/fastNLP/embeddings/bert_embedding.py
+++ b/fastNLP/embeddings/bert_embedding.py
@@ -110,11 +110,12 @@ class BertEmbedding(ContextualEmbedding):
         if '[CLS]' in vocab:
             self._word_cls_index = vocab['[CLS]']

-        min_freq = kwargs.get('min_freq', 1)
+        min_freq = kwargs.pop('min_freq', 1)
         self._min_freq = min_freq
         self.model = _BertWordModel(model_dir_or_name=model_dir_or_name, vocab=vocab, layers=layers,
                                     pool_method=pool_method, include_cls_sep=include_cls_sep,
-                                    pooled_cls=pooled_cls, min_freq=min_freq, auto_truncate=auto_truncate)
+                                    pooled_cls=pooled_cls, min_freq=min_freq, auto_truncate=auto_truncate,
+                                    **kwargs)

         self.requires_grad = requires_grad
         self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
@@ -367,32 +368,44 @@ class BertWordPieceEncoder(nn.Module):

 class _BertWordModel(nn.Module):
     def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first',
-                 include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2):
+                 include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2,
+                 **kwargs):
         super().__init__()

         if isinstance(layers, list):
             self.layers = [int(l) for l in layers]
         elif isinstance(layers, str):
-            self.layers = list(map(int, layers.split(',')))
+            if layers.lower() == 'all':
+                self.layers = None
+            else:
+                self.layers = list(map(int, layers.split(',')))
         else:
             raise TypeError("`layers` only supports str or list[int]")
-        assert len(self.layers) > 0, "There is no layer selected!"

         neg_num_output_layer = -16384
         pos_num_output_layer = 0
-        for layer in self.layers:
-            if layer < 0:
-                neg_num_output_layer = max(layer, neg_num_output_layer)
-            else:
-                pos_num_output_layer = max(layer, pos_num_output_layer)
+        if self.layers is None:
+            neg_num_output_layer = -1
+        else:
+            for layer in self.layers:
+                if layer < 0:
+                    neg_num_output_layer = max(layer, neg_num_output_layer)
+                else:
+                    pos_num_output_layer = max(layer, pos_num_output_layer)

         self.tokenzier = BertTokenizer.from_pretrained(model_dir_or_name)
         self.encoder = BertModel.from_pretrained(model_dir_or_name,
                                                  neg_num_output_layer=neg_num_output_layer,
-                                                 pos_num_output_layer=pos_num_output_layer)
+                                                 pos_num_output_layer=pos_num_output_layer,
+                                                 **kwargs)
         self._max_position_embeddings = self.encoder.config.max_position_embeddings
         # 检查encoder_layer_number是否合理
         encoder_layer_number = len(self.encoder.encoder.layer)
+        if self.layers is None:
+            self.layers = [idx for idx in range(encoder_layer_number + 1)]
+            logger.info(f'Bert Model will return {len(self.layers)} layers (layer-0 '
+                        f'is embedding result): {self.layers}')
+        assert len(self.layers) > 0, "There is no layer selected!"
         for layer in self.layers:
             if layer < 0:
                 assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
@@ -417,7 +430,7 @@ class _BertWordModel(nn.Module):
                 word = '[PAD]'
             elif index == vocab.unknown_idx:
                 word = '[UNK]'
-            elif vocab.word_count[word]
diff --git a/fastNLP/embeddings/roberta_embedding.py b/fastNLP/embeddings/roberta_embedding.py
--- a/fastNLP/embeddings/roberta_embedding.py
+++ b/fastNLP/embeddings/roberta_embedding.py
         if '<s>' in vocab:
             self._word_cls_index = vocab['<s>']

-        min_freq = kwargs.get('min_freq', 1)
+        min_freq = kwargs.pop('min_freq', 1)
         self._min_freq = min_freq
         self.model = _RobertaWordModel(model_dir_or_name=model_dir_or_name, vocab=vocab, layers=layers,
                                        pool_method=pool_method, include_cls_sep=include_cls_sep,
-                                       pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=min_freq)
+                                       pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=min_freq,
+                                       **kwargs)
         self.requires_grad = requires_grad
         self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
@@ -193,33 +194,45 @@ class RobertaEmbedding(ContextualEmbedding):

 class _RobertaWordModel(nn.Module):
     def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first',
-                 include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2):
+                 include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2,
+                 **kwargs):
         super().__init__()

         if isinstance(layers, list):
             self.layers = [int(l) for l in layers]
         elif isinstance(layers, str):
-            self.layers = list(map(int, layers.split(',')))
+            if layers.lower() == 'all':
+                self.layers = None
+            else:
+                self.layers = list(map(int, layers.split(',')))
         else:
             raise TypeError("`layers` only supports str or list[int]")
-        assert len(self.layers) > 0, "There is no layer selected!"

         neg_num_output_layer = -16384
         pos_num_output_layer = 0
-        for layer in self.layers:
-            if layer < 0:
-                neg_num_output_layer = max(layer, neg_num_output_layer)
-            else:
-                pos_num_output_layer = max(layer, pos_num_output_layer)
+        if self.layers is None:
+            neg_num_output_layer = -1
+        else:
+            for layer in self.layers:
+                if layer < 0:
+                    neg_num_output_layer = max(layer, neg_num_output_layer)
+                else:
+                    pos_num_output_layer = max(layer, pos_num_output_layer)

         self.tokenizer = RobertaTokenizer.from_pretrained(model_dir_or_name)
         self.encoder = RobertaModel.from_pretrained(model_dir_or_name,
                                                     neg_num_output_layer=neg_num_output_layer,
-                                                    pos_num_output_layer=pos_num_output_layer)
+                                                    pos_num_output_layer=pos_num_output_layer,
+                                                    **kwargs)
         # 由于RobertaEmbedding中设置了padding_idx为1, 且使用了非常神奇的position计算方式,所以-2
         self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2
         # 检查encoder_layer_number是否合理
         encoder_layer_number = len(self.encoder.encoder.layer)
+        if self.layers is None:
+            self.layers = [idx for idx in range(encoder_layer_number + 1)]
+            logger.info(f'RoBERTa Model will return {len(self.layers)} layers (layer-0 '
+                        f'is embedding result): {self.layers}')
+        assert len(self.layers) > 0, "There is no layer selected!"
         for layer in self.layers:
             if layer < 0:
                 assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
@@ -241,7 +254,7 @@ class _RobertaWordModel(nn.Module):
                 word = '<pad>'
             elif index == vocab.unknown_idx:
                 word = '<unk>'
-            elif vocab.word_count[word]
 self._max_position_embeddings:
             if self.auto_truncate:
                 word_pieces_lengths = word_pieces_lengths.masked_fill(
-                    word_pieces_lengths + 2 > self._max_position_embeddings, self._max_position_embeddings - 2)
+                    word_pieces_lengths + 2 > self._max_position_embeddings,
+                    self._max_position_embeddings - 2)
             else:
                 raise RuntimeError(
                     "After split words into word pieces, the lengths of word pieces are longer than the "
@@ -290,6 +305,7 @@ class _RobertaWordModel(nn.Module):
                 word_pieces_i = word_pieces_i[:self._max_position_embeddings - 2]
             word_pieces[i, 1:word_pieces_lengths[i] + 1] = torch.LongTensor(word_pieces_i)
             attn_masks[i, :word_pieces_lengths[i] + 2].fill_(1)
+        # 添加
         word_pieces[:, 0].fill_(self._cls_index)
         batch_indexes = torch.arange(batch_size).to(words)
         word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index
@@ -362,6 +378,12 @@ class _RobertaWordModel(nn.Module):
         return outputs

     def save(self, folder):
+        """
+        给定一个folder保存pytorch_model.bin, config.json, vocab.txt
+
+        :param str folder:
+        :return:
+        """
         self.tokenizer.save_pretrained(folder)
         self.encoder.save_pretrained(folder)

diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py
index f304073d..55e79d63 100644
--- a/fastNLP/modules/encoder/bert.py
+++ b/fastNLP/modules/encoder/bert.py
@@ -184,21 +184,23 @@ class DistilBertEmbeddings(nn.Module):
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

-    def forward(self, input_ids, token_type_ids):
+    def forward(self, input_ids, token_type_ids, position_ids=None):
         r"""
         Parameters
         ----------
         input_ids: torch.tensor(bs, max_seq_length) The token ids to embed.
         token_type_ids: no used.
+        position_ids: no used.

         Outputs
         -------
         embeddings: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type
         embeddings)
         """
         seq_length = input_ids.size(1)
-        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # (max_seq_length)
-        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # (bs, max_seq_length)
+        if position_ids is None:
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # (max_seq_length)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # (bs, max_seq_length)

         word_embeddings = self.word_embeddings(input_ids)  # (bs, max_seq_length, dim)
         position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)
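(Not part of the patch series: an illustrative sketch of the new layers='all' option. The model path is a placeholder and the vocabulary is a toy one; with 'all', layer 0 is the embedding output and every encoder layer is returned, so the embedding size grows accordingly, and extra kwargs such as min_freq are popped before the rest are forwarded to the underlying model.)

    from fastNLP import Vocabulary
    from fastNLP.embeddings import BertEmbedding

    vocab = Vocabulary()
    vocab.add_word_lst("this is a test".split())

    # 'path/to/bert-base-uncased' stands in for a BERT checkpoint directory or model tag.
    embed = BertEmbedding(vocab, model_dir_or_name='path/to/bert-base-uncased',
                          layers='all', min_freq=1)
    # embed.embed_size == (num_encoder_layers + 1) * hidden_size, since layer-0
    # (the embedding output) is included alongside every transformer layer.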
From 22839c215a94c28a7452770861a55f8de589729b Mon Sep 17 00:00:00 2001
From: WillQvQ
Date: Sun, 27 Dec 2020 15:52:39 +0800
Subject: [PATCH 4/4] update .Jenkinsfile.

---
 .Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.Jenkinsfile b/.Jenkinsfile
index 5da7bb62..da1157e9 100644
--- a/.Jenkinsfile
+++ b/.Jenkinsfile
@@ -2,7 +2,7 @@ pipeline {
     agent {
         docker {
             image 'ubuntu_tester'
-            args '-u root:root -v ${HOME}/html/docs:/docs -v ${HOME}/html/_ci:/ci'
+            args '-u root:root -v ${JENKINS_HOME}/html/docs:/docs -v ${JENKINS_HOME}/html/_ci:/ci'
         }
     }
     environment {