Merge remote-tracking branch 'origin/dataset' into dataset

FengZiYjun 2018-11-09 19:53:08 +08:00
commit 12e9a93b52
8 changed files with 518 additions and 1 deletion

fastNLP/api/api.py

@@ -0,0 +1,11 @@
class API:
def __init__(self):
pass
def predict(self):
pass
def load(self):
pass
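
The three stubs above sketch the intended high-level inference interface (load a trained model, then predict). A minimal, hypothetical illustration of how a concrete subclass might fill them in; the subclass name and its behaviour are assumptions, not part of this commit:

class WhitespaceSegAPI(API):
    """Toy example: 'load' just records a path and 'predict' splits on whitespace."""
    def load(self, path=None):
        self.model_path = path  # a real implementation would restore trained model weights here

    def predict(self, sentences=None):
        sentences = sentences or []
        return [s.split() for s in sentences]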


@@ -8,7 +8,6 @@ class Pipeline:
def add_processor(self, processor):
assert isinstance(processor, Processor), "Must be a Processor, not {}.".format(type(processor))
processor_name = type(processor)
self.pipeline.append(processor)
def process(self, dataset):
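
Only a fragment of Pipeline is shown in this diff. A hedged sketch of how it is presumably meant to be used, assuming process applies the registered processors to the DataSet in order; the field names are made up, and the two processors are defined later in this commit:

pipe = Pipeline()
pipe.add_processor(FullSpaceToHalfSpaceProcessor('raw_sentence'))
pipe.add_processor(CWSCharSegProcessor('raw_sentence', 'chars_list'))
dataset = pipe.process(dataset)  # each processor transforms the DataSet in turn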


@@ -0,0 +1,135 @@
from torch import nn
import torch
import torch.nn.functional as F
from fastNLP.modules.decoder.MLP import MLP
from fastNLP.models.base_model import BaseModel
from reproduction.chinese_word_segment.utils import seq_lens_to_mask
class CWSBiLSTMEncoder(BaseModel):
def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None,
hidden_size=200, bidirectional=True, embed_drop_p=None, num_layers=1):
super().__init__()
self.input_size = 0
self.num_bigram_per_char = num_bigram_per_char
self.bidirectional = bidirectional
self.num_layers = num_layers
self.embed_drop_p = embed_drop_p
if self.bidirectional:
self.hidden_size = hidden_size//2
self.num_directions = 2
else:
self.hidden_size = hidden_size
self.num_directions = 1
        if bigram_vocab_num is not None:
            assert num_bigram_per_char is not None, "Specify num_bigram_per_char."
if vocab_num is not None:
self.char_embedding = nn.Embedding(num_embeddings=vocab_num, embedding_dim=embed_dim)
self.input_size += embed_dim
if bigram_vocab_num is not None:
self.bigram_embedding = nn.Embedding(num_embeddings=bigram_vocab_num, embedding_dim=bigram_embed_dim)
self.input_size += self.num_bigram_per_char*bigram_embed_dim
        # NOTE: num_criterion is never assigned in this class; guard the lookup so the optional
        # criterion embeddings are only created when it has been set externally.
        if getattr(self, 'num_criterion', None) is not None:
            if bidirectional:
                self.backward_criterion_embedding = nn.Embedding(num_embeddings=self.num_criterion,
                                                                 embedding_dim=self.hidden_size)
            self.forward_criterion_embedding = nn.Embedding(num_embeddings=self.num_criterion,
                                                            embedding_dim=self.hidden_size)
        if self.embed_drop_p is not None:
self.embedding_drop = nn.Dropout(p=self.embed_drop_p)
self.lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, bidirectional=self.bidirectional,
batch_first=True, num_layers=self.num_layers)
self.reset_parameters()
def reset_parameters(self):
for name, param in self.named_parameters():
if 'bias_hh' in name:
nn.init.constant_(param, 0)
elif 'bias_ih' in name:
nn.init.constant_(param, 1)
else:
nn.init.xavier_uniform_(param)
def init_embedding(self, embedding, embed_name):
if embed_name == 'bigram':
self.bigram_embedding.weight.data = torch.from_numpy(embedding)
elif embed_name == 'char':
self.char_embedding.weight.data = torch.from_numpy(embedding)
def forward(self, chars, bigrams=None, seq_lens=None):
batch_size, max_len = chars.size()
x_tensor = self.char_embedding(chars)
        if bigrams is not None:
bigram_tensor = self.bigram_embedding(bigrams).view(batch_size, max_len, -1)
x_tensor = torch.cat([x_tensor, bigram_tensor], dim=2)
sorted_lens, sorted_indices = torch.sort(seq_lens, descending=True)
packed_x = nn.utils.rnn.pack_padded_sequence(x_tensor[sorted_indices], sorted_lens, batch_first=True)
outputs, _ = self.lstm(packed_x)
outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
_, desorted_indices = torch.sort(sorted_indices, descending=False)
outputs = outputs[desorted_indices]
return outputs
class CWSBiLSTMSegApp(BaseModel):
def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None,
hidden_size=200, bidirectional=True, embed_drop_p=None, num_layers=1, tag_size=2):
super(CWSBiLSTMSegApp, self).__init__()
self.tag_size = tag_size
self.encoder_model = CWSBiLSTMEncoder(vocab_num, embed_dim, bigram_vocab_num, bigram_embed_dim, num_bigram_per_char,
hidden_size, bidirectional, embed_drop_p, num_layers)
size_layer = [hidden_size, 100, tag_size]
self.decoder_model = MLP(size_layer)
def forward(self, **kwargs):
chars = kwargs['chars']
        # the key checked and the key read must both be 'bigrams'
        bigrams = kwargs.get('bigrams', None)
seq_lens = kwargs['seq_lens']
feats = self.encoder_model(chars, bigrams, seq_lens)
probs = self.decoder_model(feats)
pred_dict = {}
pred_dict['seq_lens'] = seq_lens
pred_dict['pred_prob'] = probs
return pred_dict
def loss_fn(self, pred_dict, true_dict):
seq_lens = pred_dict['seq_lens']
masks = seq_lens_to_mask(seq_lens).float()
pred_prob = pred_dict['pred_prob']
true_y = true_dict['tags']
        # TODO: the loss is currently hard-coded here
        # masked token-level cross entropy, reduced to a scalar by averaging over non-padding positions
        loss = (F.cross_entropy(pred_prob.view(-1, self.tag_size),
                                true_y.view(-1), reduction='none') * masks.view(-1)).sum() / torch.sum(masks)
        return loss
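
A hedged usage sketch of the model above with dummy tensors; the vocabulary size and batch shape are made up, and CWSBiLSTMSegApp is assumed to be importable from this module:

import torch

model = CWSBiLSTMSegApp(vocab_num=3000, embed_dim=100, hidden_size=200, tag_size=2)
chars = torch.randint(0, 3000, (4, 20))   # 4 sentences, padded to 20 characters
seq_lens = torch.tensor([20, 18, 15, 9])  # true lengths; sorting is handled inside the encoder
pred = model(chars=chars, seq_lens=seq_lens)
print(pred['pred_prob'].shape)            # expected: torch.Size([4, 20, 2]), one seg/app score pair per character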


@@ -0,0 +1,283 @@
import re
# TextField and TokenListFiled are used by the processors below; they are assumed to live in
# fastNLP.core.field alongside SeqLabelField.
from fastNLP.core.field import SeqLabelField, TextField, TokenListFiled
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.dataset import DataSet
from fastNLP.api.processor import Processor
_SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'
class FullSpaceToHalfSpaceProcessor(Processor):
def __init__(self, field_name, change_alpha=True, change_digit=True, change_punctuation=True,
change_space=True):
super(FullSpaceToHalfSpaceProcessor, self).__init__(field_name, None)
self.change_alpha = change_alpha
self.change_digit = change_digit
self.change_punctuation = change_punctuation
self.change_space = change_space
        FH_SPACE = [(u"　", u" ")]
        FH_NUM = [
            (u"０", u"0"), (u"１", u"1"), (u"２", u"2"), (u"３", u"3"), (u"４", u"4"),
            (u"５", u"5"), (u"６", u"6"), (u"７", u"7"), (u"８", u"8"), (u"９", u"9")]
        FH_ALPHA = [
            (u"ａ", u"a"), (u"ｂ", u"b"), (u"ｃ", u"c"), (u"ｄ", u"d"), (u"ｅ", u"e"),
            (u"ｆ", u"f"), (u"ｇ", u"g"), (u"ｈ", u"h"), (u"ｉ", u"i"), (u"ｊ", u"j"),
            (u"ｋ", u"k"), (u"ｌ", u"l"), (u"ｍ", u"m"), (u"ｎ", u"n"), (u"ｏ", u"o"),
            (u"ｐ", u"p"), (u"ｑ", u"q"), (u"ｒ", u"r"), (u"ｓ", u"s"), (u"ｔ", u"t"),
            (u"ｕ", u"u"), (u"ｖ", u"v"), (u"ｗ", u"w"), (u"ｘ", u"x"), (u"ｙ", u"y"),
            (u"ｚ", u"z"),
            (u"Ａ", u"A"), (u"Ｂ", u"B"), (u"Ｃ", u"C"), (u"Ｄ", u"D"), (u"Ｅ", u"E"),
            (u"Ｆ", u"F"), (u"Ｇ", u"G"), (u"Ｈ", u"H"), (u"Ｉ", u"I"), (u"Ｊ", u"J"),
            (u"Ｋ", u"K"), (u"Ｌ", u"L"), (u"Ｍ", u"M"), (u"Ｎ", u"N"), (u"Ｏ", u"O"),
            (u"Ｐ", u"P"), (u"Ｑ", u"Q"), (u"Ｒ", u"R"), (u"Ｓ", u"S"), (u"Ｔ", u"T"),
            (u"Ｕ", u"U"), (u"Ｖ", u"V"), (u"Ｗ", u"W"), (u"Ｘ", u"X"), (u"Ｙ", u"Y"),
            (u"Ｚ", u"Z")]
        # Use punctuation conversion with caution: e.g. "512特大地震" may come out as "5.12特大地震" after conversion.
        FH_PUNCTUATION = [
            (u'％', u'%'), (u'！', u'!'), (u'＂', u'\"'), (u'＇', u'\''), (u'＃', u'#'),
            (u'＄', u'$'), (u'＆', u'&'), (u'（', u'('), (u'）', u')'), (u'＊', u'*'),
            (u'＋', u'+'), (u'，', u','), (u'－', u'-'), (u'．', u'.'), (u'／', u'/'),
            (u'：', u':'), (u'；', u';'), (u'＜', u'<'), (u'＝', u'='), (u'＞', u'>'),
            (u'？', u'?'), (u'＠', u'@'), (u'［', u'['), (u'］', u']'), (u'＼', u'\\'),
            (u'＾', u'^'), (u'＿', u'_'), (u'｀', u'`'), (u'～', u'~'), (u'｛', u'{'),
            (u'｝', u'}'), (u'｜', u'|')]
FHs = []
if self.change_alpha:
FHs = FH_ALPHA
if self.change_digit:
FHs += FH_NUM
if self.change_punctuation:
FHs += FH_PUNCTUATION
if self.change_space:
FHs += FH_SPACE
self.convert_map = {k: v for k, v in FHs}
def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
sentence = ins[self.field_name].text
new_sentence = [None]*len(sentence)
for idx, char in enumerate(sentence):
if char in self.convert_map:
char = self.convert_map[char]
new_sentence[idx] = char
ins[self.field_name].text = ''.join(new_sentence)
return dataset
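# Illustrative check (not part of the original file): the convert_map built above can be
# exercised directly on a raw string, without going through a DataSet:
#     proc = FullSpaceToHalfSpaceProcessor(field_name='raw_sentence')
#     ''.join(proc.convert_map.get(c, c) for c in u"ＦａｓｔＮＬＰ２０１８")   # -> "FastNLP2018"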
class SpeicalSpanProcessor(Processor):
    # This class replaces "special spans" in a sentence with their converted content.
def __init__(self, field_name, new_added_field_name=None):
super(SpeicalSpanProcessor, self).__init__(field_name, new_added_field_name)
self.span_converters = []
def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
sentence = ins[self.field_name].text
for span_converter in self.span_converters:
sentence = span_converter.find_certain_span_and_replace(sentence)
if self.new_added_field_name!=self.field_name:
new_text_field = TextField(sentence, is_target=False)
ins[self.new_added_field_name] = new_text_field
else:
ins[self.field_name].text = sentence
return dataset
def add_span_converter(self, converter):
assert isinstance(converter, SpanConverterBase), "Only SpanConverterBase is allowed, not {}."\
.format(type(converter))
self.span_converters.append(converter)
class CWSCharSegProcessor(Processor):
def __init__(self, field_name, new_added_field_name):
super(CWSCharSegProcessor, self).__init__(field_name, new_added_field_name)
def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
sentence = ins[self.field_name].text
chars = self._split_sent_into_chars(sentence)
new_token_field = TokenListFiled(chars, is_target=False)
ins[self.new_added_field_name] = new_token_field
return dataset
def _split_sent_into_chars(self, sentence):
sp_tag_match_iter = re.finditer(_SPECIAL_TAG_PATTERN, sentence)
sp_spans = [match_span.span() for match_span in sp_tag_match_iter]
sp_span_idx = 0
in_span_flag = False
chars = []
num_spans = len(sp_spans)
for idx, char in enumerate(sentence):
if sp_span_idx<num_spans and idx == sp_spans[sp_span_idx][0]:
in_span_flag = True
elif in_span_flag and sp_span_idx<num_spans and idx == sp_spans[sp_span_idx][1] - 1:
chars.append(sentence[sp_spans[sp_span_idx]
[0]:sp_spans[sp_span_idx][1]])
in_span_flag = False
sp_span_idx += 1
elif not in_span_flag:
                # TODO: think carefully about how whitespace should be handled here
if char != ' ':
chars.append(char)
else:
pass
return chars
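# Illustrative behaviour of _split_sent_into_chars (not part of the original file): spans matching
# _SPECIAL_TAG_PATTERN are kept as single tokens and spaces are dropped, e.g.
#     "今天 <NUM>度"  ->  ['今', '天', '<NUM>', '度']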
class CWSTagProcessor(Processor):
def __init__(self, field_name, new_added_field_name=None):
super(CWSTagProcessor, self).__init__(field_name, new_added_field_name)
def _generate_tag(self, sentence):
sp_tag_match_iter = re.finditer(_SPECIAL_TAG_PATTERN, sentence)
sp_spans = [match_span.span() for match_span in sp_tag_match_iter]
sp_span_idx = 0
in_span_flag = False
tag_list = []
word_len = 0
num_spans = len(sp_spans)
for idx, char in enumerate(sentence):
if sp_span_idx<num_spans and idx == sp_spans[sp_span_idx][0]:
in_span_flag = True
elif in_span_flag and sp_span_idx<num_spans and idx == sp_spans[sp_span_idx][1] - 1:
word_len += 1
in_span_flag = False
sp_span_idx += 1
elif not in_span_flag:
if char == ' ':
if word_len!=0:
tag_list.extend(self._tags_from_word_len(word_len))
word_len = 0
else:
word_len += 1
else:
pass
if word_len!=0:
tag_list.extend(self._tags_from_word_len(word_len))
return tag_list
def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
sentence = ins[self.field_name].text
tag_list = self._generate_tag(sentence)
new_tag_field = SeqLabelField(tag_list)
ins[self.new_added_field_name] = new_tag_field
return dataset
def _tags_from_word_len(self, word_len):
raise NotImplementedError
class CWSSegAppTagProcessor(CWSTagProcessor):
def __init__(self, field_name, new_added_field_name=None):
super(CWSSegAppTagProcessor, self).__init__(field_name, new_added_field_name)
def _tags_from_word_len(self, word_len):
tag_list = []
for _ in range(word_len-1):
tag_list.append(0)
tag_list.append(1)
return tag_list
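# Worked example (not part of the original file) of the seg-app scheme implemented above: tag 0
# means "the word continues" and tag 1 means "the word ends at this character", so the segmented
# sentence "今天 天气 好" (word lengths 2, 2, 1) yields the tag sequence [0, 1, 0, 1, 1].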
class BigramProcessor(Processor):
    def __init__(self, field_name, new_added_field_name=None):
        super(BigramProcessor, self).__init__(field_name, new_added_field_name)
def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
characters = ins[self.field_name].content
bigrams = self._generate_bigram(characters)
new_token_field = TokenListFiled(bigrams)
ins[self.new_added_field_name] = new_token_field
return dataset
    def _generate_bigram(self, characters):
        raise NotImplementedError
class Pre2Post2BigramProcessor(BigramProcessor):
    def __init__(self, field_name, new_added_field_name=None):
        super(Pre2Post2BigramProcessor, self).__init__(field_name, new_added_field_name)
def _generate_bigram(self, characters):
bigrams = []
characters = ['<SOS>', '<SOS>'] + characters + ['<EOS>', '<EOS>']
for idx in range(2, len(characters)-2):
cur_char = characters[idx]
pre_pre_char = characters[idx-2]
pre_char = characters[idx-1]
post_char = characters[idx+1]
post_post_char = characters[idx+2]
pre_pre_cur_bigram = pre_pre_char + cur_char
pre_cur_bigram = pre_char + cur_char
cur_post_bigram = cur_char + post_char
cur_post_post_bigram = cur_char + post_post_char
bigrams.extend([pre_pre_char, pre_char, post_char, post_post_char,
pre_pre_cur_bigram, pre_cur_bigram,
cur_post_bigram, cur_post_post_bigram])
return bigrams
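# Worked example (not part of the original file): for characters ['天', '气'] the first character
# '天' contributes the eight features
#     ['<SOS>', '<SOS>', '气', '<EOS>', '<SOS>天', '<SOS>天', '天气', '天<EOS>']
# i.e. four context characters followed by four bigrams, which is why num_bigram_per_char in
# CWSBiLSTMEncoder should be set to 8 when this processor is used.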
# A vocabulary needs to be built at this point, but there is a problem:
# (1) if we follow the Processor pattern, what is returned in that case is not a dataset, so the
#     vocabulary-building work is implemented in another way instead of going through Processor.
class IndexProcessor(Processor):
def __init__(self, vocab, field_name):
assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))
super(IndexProcessor, self).__init__(field_name, None)
self.vocab = vocab
def set_vocab(self, vocab):
assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))
self.vocab = vocab
def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
tokens = ins[self.field_name].content
index = [self.vocab.to_index(token) for token in tokens]
ins[self.field_name]._index = index
return dataset
class VocabProcessor(Processor):
def __init__(self, field_name):
super(VocabProcessor, self).__init__(field_name, None)
self.vocab = Vocabulary()
def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
tokens = ins[self.field_name].content
self.vocab.update(tokens)
def get_vocab(self):
self.vocab.build_vocab()
return self.vocab
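
A hedged sketch of the vocabulary workflow that VocabProcessor and IndexProcessor implement, shown directly on token lists because the DataSet construction API is not part of this diff:

from fastNLP.core.vocabulary import Vocabulary

vocab = Vocabulary()
for tokens in [['今', '天'], ['天', '气']]:
    vocab.update(tokens)        # what VocabProcessor.process does for each instance
vocab.build_vocab()             # what VocabProcessor.get_vocab does before returning
indices = [vocab.to_index(t) for t in ['今', '天', '气']]   # what IndexProcessor.process does per instance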


@@ -0,0 +1,3 @@


@@ -0,0 +1,86 @@
import torch
def seq_lens_to_mask(seq_lens):
    # Build a (batch_size, max_len) mask with ones at positions before each sequence's length.
    batch_size = seq_lens.size(0)
max_len = seq_lens.max()
indexes = torch.arange(max_len).view(1, -1).repeat(batch_size, 1).to(seq_lens.device)
masks = indexes.lt(seq_lens.unsqueeze(1))
return masks
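# Quick illustration (not part of the original file):
#     seq_lens_to_mask(torch.tensor([3, 1]))
# returns a (2, 3) mask with ones at positions before each length:
#     [[1, 1, 1],
#      [1, 0, 0]]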
def cut_long_training_sentences(sentences, max_sample_length=200):
    # Split whitespace-segmented sentences whose character count (spaces excluded) exceeds
    # max_sample_length into shorter lines, cutting only at word boundaries.
    cutted_sentence = []
for sent in sentences:
sent_no_space = sent.replace(' ', '')
if len(sent_no_space) > max_sample_length:
parts = sent.strip().split()
new_line = ''
length = 0
for part in parts:
length += len(part)
new_line += part + ' '
if length > max_sample_length:
new_line = new_line[:-1]
cutted_sentence.append(new_line)
length = 0
new_line = ''
if new_line != '':
cutted_sentence.append(new_line[:-1])
else:
cutted_sentence.append(sent)
return cutted_sentence
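# Worked example (not part of the original file), with max_sample_length=5:
#     cut_long_training_sentences(['aa bb cc dd'], max_sample_length=5)  ->  ['aa bb cc', 'dd']
# Note that a piece is only closed after the limit has been exceeded, so a piece may end up
# slightly longer than max_sample_length.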
from torch import nn
import torch.nn.functional as F
class FocalLoss(nn.Module):
r"""
This criterion is a implemenation of Focal Loss, which is proposed in
Focal Loss for Dense Object Detection.
Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class])
The losses are averaged across observations for each minibatch.
Args:
alpha(1D Tensor, Variable) : the scalar factor for this criterion
gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5),
putting more focus on hard, misclassified examples
size_average(bool): size_average(bool): By default, the losses are averaged over observations for each minibatch.
However, if the field size_average is set to False, the losses are
instead summed for each minibatch.
"""
def __init__(self, class_num, gamma=2, size_average=True, reduce=False):
super(FocalLoss, self).__init__()
self.gamma = gamma
self.class_num = class_num
self.size_average = size_average
self.reduce = reduce
def forward(self, inputs, targets):
N = inputs.size(0)
C = inputs.size(1)
P = F.softmax(inputs, dim=-1)
        # one-hot mask over the target classes; it is a constant, so it does not need gradients
        class_mask = inputs.data.new(N, C).fill_(0)
        ids = targets.view(-1, 1)
        class_mask = class_mask.scatter(1, ids, 1.)
probs = (P * class_mask).sum(1).view(-1, 1)
log_p = probs.log()
batch_loss = - (torch.pow((1 - probs), self.gamma)) * log_p
if self.reduce:
if self.size_average:
loss = batch_loss.mean()
else:
loss = batch_loss.sum()
return loss
return batch_loss
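
A hedged usage sketch of FocalLoss with made-up shapes, assuming torch and the class above are in scope:

loss_fn = FocalLoss(class_num=3, gamma=2, size_average=True, reduce=True)
logits = torch.randn(4, 3, requires_grad=True)   # 4 samples, 3 classes
targets = torch.tensor([0, 2, 1, 2])
loss = loss_fn(logits, targets)                  # scalar: mean focal loss over the batch
loss.backward()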