Add cws converter, io

This commit is contained in:
yh 2018-11-09 20:23:05 +08:00
parent 1b9daa1985
commit 38aa207ea2
4 changed files with 409 additions and 0 deletions

View File

@@ -0,0 +1,129 @@
from fastNLP.loader.dataset_loader import DataSetLoader
from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet

def cut_long_sentence(sent, max_sample_length=200):
    """Cut a whitespace-tokenized sentence into chunks whose non-space
    length stays close to max_sample_length."""
    sent_no_space = sent.replace(' ', '')
    cut_sentences = []
    if len(sent_no_space) > max_sample_length:
        parts = sent.strip().split()
        new_line = ''
        length = 0
        for part in parts:
            length += len(part)
            new_line += part + ' '
            if length > max_sample_length:
                new_line = new_line[:-1]
                cut_sentences.append(new_line)
                length = 0
                new_line = ''
        if new_line != '':
            cut_sentences.append(new_line[:-1])
    else:
        cut_sentences.append(sent)
    return cut_sentences
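
A usage sketch (illustrative input; a chunk can overshoot max_sample_length by up to one token, because the length check runs after each token is added):

long_sent = ' '.join(['词'] * 150)
chunks = cut_long_sentence(long_sent, max_sample_length=100)
assert ''.join(c.replace(' ', '') for c in chunks) == long_sent.replace(' ', '')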

class NaiveCWSReader(DataSetLoader):
    """
    This reader assumes the segmentation dataset is already split by spaces,
    in one of the following forms:
        这是 fastNLP , 一个 非常 good 的 包 .
    or, with a pos tag attached to each part:
        也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
    """

    def __init__(self, in_word_splitter=None):
        super().__init__()
        self.in_word_splitter = in_word_splitter
    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        """
        Accepted formats (by default \t or spaces act as the separator):
            这是 fastNLP , 一个 非常 good 的 包 .
            也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
        If the splitter is not None, the second form is assumed: each part is
        split by the splitter and the first piece is kept, e.g. "也/D".split('/')[0].
        :param filepath:
        :param in_word_splitter:
        :param cut_long_sent:
        :return:
        """
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if len(line.replace(' ', '')) == 0:  # empty lines are not accepted
                    continue
                if in_word_splitter is not None:
                    words = []
                    for part in line.split():
                        word = part.split(in_word_splitter)[0]
                        words.append(word)
                    line = ' '.join(words)
                if cut_long_sent:
                    sents = cut_long_sentence(line)
                else:
                    sents = [line]
                for sent in sents:
                    instance = Instance(raw_sentence=sent)
                    dataset.append(instance)
        return dataset
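
A usage sketch (the file paths are hypothetical):

reader = NaiveCWSReader()
tr = reader.load('cws_train.txt', cut_long_sent=True)        # space-separated lines
tagged = reader.load('pos_train.txt', in_word_splitter='/')  # "也/D 在/P ..." lines
# each Instance in the returned DataSet carries a 'raw_sentence' field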

class POSCWSReader(DataSetLoader):
    """
    Reads data where each line holds one token (optionally followed by its
    tag) and a blank line marks the boundary between two sentences, e.g.
        N
        N
        N
        ...
        I-PER
        I-PER

        N
        N
        N
        ...
    :param filepath:
    :return:
    """

    def __init__(self, in_word_splitter=None):
        super().__init__()
        self.in_word_splitter = in_word_splitter

    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r', encoding='utf-8') as f:
            words = []
            for line in f:
                line = line.strip()
                if len(line) == 0:  # blank line: the current sentence ends
                    if len(words) == 0:  # skip consecutive blank lines
                        continue
                    line = ' '.join(words)
                    if cut_long_sent:
                        sents = cut_long_sentence(line)
                    else:
                        sents = [line]
                    for sent in sents:
                        instance = Instance(raw_sentence=sent)
                        dataset.append(instance)
                    words = []
                else:
                    line = line.split()[0]
                    if in_word_splitter is None:
                        words.append(line)
                    else:
                        words.append(line.split(in_word_splitter)[0])
            if len(words) != 0:
                # flush the last sentence when the file does not end with a blank line
                dataset.append(Instance(raw_sentence=' '.join(words)))
        return dataset
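
A usage sketch (hypothetical path; one token per line, blank-line sentence breaks):

pos_reader = POSCWSReader()
pos_ds = pos_reader.load('cws_train.pos', cut_long_sent=True)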

View File

@@ -0,0 +1,185 @@
import re

class SpanConverterBase:
    """Replace every span of a sentence matching `pattern` with the output of
    span_to_special_tag (by default, a fixed replace_tag)."""

    def __init__(self, replace_tag, pattern):
        super(SpanConverterBase, self).__init__()
        self.replace_tag = replace_tag
        self.pattern = pattern

    def find_certain_span_and_replace(self, sentence):
        replaced_sentence = ''
        prev_end = 0
        for match in re.finditer(self.pattern, sentence):
            start, end = match.span()
            span = sentence[start:end]
            replaced_sentence += sentence[prev_end:start] + \
                                 self.span_to_special_tag(span)
            prev_end = end
        replaced_sentence += sentence[prev_end:]
        return replaced_sentence

    def span_to_special_tag(self, span):
        return self.replace_tag

    def find_certain_span(self, sentence):
        spans = []
        for match in re.finditer(self.pattern, sentence):
            spans.append(match.span())
        return spans
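
A minimal sketch of the base behavior (tag and pattern are illustrative):

conv = SpanConverterBase('<X>', 'x+')
assert conv.find_certain_span_and_replace('aaxxbb') == 'aa<X>bb'
assert conv.find_certain_span('aaxxbb') == [(2, 4)]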

class AlphaSpanConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<ALPHA>'
        # Ideally this would handle only pure-letter spans, and it must not
        # touch <[a-zA-Z]+> (those should be treated as special tags).
        pattern = '[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\\-"])'
        super(AlphaSpanConverter, self).__init__(replace_tag, pattern)
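
For instance (illustrative input):

conv = AlphaSpanConverter()
assert conv.find_certain_span_and_replace('这是 fastNLP ,') == '这是 <ALPHA> ,'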

class DigitSpanConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<NUM>'
        pattern = '\\d[\\d.]*(?=[\u4e00-\u9fff ,%.!<-])'
        super(DigitSpanConverter, self).__init__(replace_tag, pattern)

    def span_to_special_tag(self, span):
        if span[0] == '0' and len(span) > 2:
            return '<NUM>'
        decimal_point_count = 0  # one might have more than one decimal point
        for idx, char in enumerate(span):
            if char == '.' or char == '﹒' or char == '·':
                decimal_point_count += 1
        if span[-1] == '.' or span[-1] == '﹒' or span[-1] == '·':
            # a trailing decimal point means this is not a number
            if decimal_point_count == 1:
                return span
            else:
                return '<UNKDGT>'
        if decimal_point_count == 1:
            return '<DEC>'
        elif decimal_point_count > 1:
            return '<UNKDGT>'
        else:
            return '<NUM>'
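
A sketch of the tag decisions (illustrative spans):

conv = DigitSpanConverter()
assert conv.span_to_special_tag('123') == '<NUM>'       # no decimal point
assert conv.span_to_special_tag('1.5') == '<DEC>'       # exactly one decimal point
assert conv.span_to_special_tag('1.2.3') == '<UNKDGT>'  # several decimal points
assert conv.span_to_special_tag('3.') == '3.'           # trailing point: kept verbatim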

class TimeConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<TOC>'
        pattern = '\\d+[:][\\d:]+(?=[\u4e00-\u9fff ,%.!<-])'
        super().__init__(replace_tag, pattern)
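
For instance (illustrative input):

conv = TimeConverter()
assert conv.find_certain_span_and_replace('比赛 12:30 开始') == '比赛 <TOC> 开始'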

class MixNumAlphaConverter(SpanConverterBase):
    """Replace spans that mix digits, letters, link characters ('&-), slashes
    or brackets (e.g. product codes); matching is done by a character scan
    instead of a single regex pattern."""

    def __init__(self):
        replace_tag = '<MIX>'
        pattern = None
        super().__init__(replace_tag, pattern)

    def find_certain_span_and_replace(self, sentence):
        replaced_sentence = ''
        start = 0  # start of the text that has not been emitted yet
        matching_flag = False
        number_flag = False
        alpha_flag = False
        link_flag = False
        slash_flag = False
        bracket_flag = False
        for idx in range(len(sentence)):
            if re.match('[0-9a-zA-Z/\\(\\)\'&\\-]', sentence[idx]):
                if not matching_flag:
                    replaced_sentence += sentence[start:idx]
                    start = idx
                if re.match('[0-9]', sentence[idx]):
                    number_flag = True
                elif re.match('[\'&\\-]', sentence[idx]):
                    link_flag = True
                elif re.match('/', sentence[idx]):
                    slash_flag = True
                elif re.match('[\\(\\)]', sentence[idx]):
                    bracket_flag = True
                else:
                    alpha_flag = True
                matching_flag = True
            elif re.match('[\\.]', sentence[idx]):
                pass
            else:
                if matching_flag:
                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
                        # at least two character classes are mixed: replace the span
                        span = sentence[start:idx]
                        start = idx
                        replaced_sentence += self.span_to_special_tag(span)
                matching_flag = False
                number_flag = False
                alpha_flag = False
                link_flag = False
                slash_flag = False
                bracket_flag = False
        replaced_sentence += sentence[start:]
        return replaced_sentence

    def find_certain_span(self, sentence):
        spans = []
        start = 0
        matching_flag = False
        number_flag = False
        alpha_flag = False
        link_flag = False
        slash_flag = False
        bracket_flag = False
        for idx in range(len(sentence)):
            if re.match('[0-9a-zA-Z/\\(\\)\'&\\-]', sentence[idx]):
                if not matching_flag:
                    start = idx
                if re.match('[0-9]', sentence[idx]):
                    number_flag = True
                elif re.match('[\'&\\-]', sentence[idx]):
                    link_flag = True
                elif re.match('/', sentence[idx]):
                    slash_flag = True
                elif re.match('[\\(\\)]', sentence[idx]):
                    bracket_flag = True
                else:
                    alpha_flag = True
                matching_flag = True
            elif re.match('[\\.]', sentence[idx]):
                pass
            else:
                if matching_flag:
                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
                        spans.append((start, idx))
                        start = idx
                matching_flag = False
                number_flag = False
                alpha_flag = False
                link_flag = False
                slash_flag = False
                bracket_flag = False
        return spans
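
A sketch of both methods on a mixed span (illustrative input):

conv = MixNumAlphaConverter()
assert conv.find_certain_span_and_replace('型号A123 已售罄') == '型号<MIX> 已售罄'
assert conv.find_certain_span('型号A123 已售罄') == [(2, 6)]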

class EmailConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<EML>'
        pattern = '[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\\-"$])'
        super(EmailConverter, self).__init__(replace_tag, pattern)
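
For instance (illustrative input):

conv = EmailConverter()
assert conv.find_certain_span_and_replace('请联系 user01@example.com ,谢谢') == '请联系 <EML> ,谢谢'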

View File

@@ -1,3 +1,98 @@
from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet
from fastNLP.api.pipeline import Pipeline
from reproduction.chinese_word_segment.process.cws_processor import *
from reproduction.chinese_word_segment.utils import cut_long_training_sentences
from reproduction.chinese_word_segment.process.span_converter import *
from reproduction.chinese_word_segment.io import NaiveCWSReader
tr_filename = ''
dev_filename = ''
reader = NaiveCWSReader()
tr_dataset = reader.load(tr_filename, cut_long_sent=True)
dev_dataset = reader.load(dev_filename)
# TODO: how should these be assembled into a DataSet?
def construct_dataset(sentences):
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)
    return dataset

# Alternative path when starting from plain sentence lists instead of the
# reader above (tr_sentences / dev_sentences are not defined in this script):
# tr_dataset = construct_dataset(tr_sentences)
# dev_dataset = construct_dataset(dev_sentences)
# 1. Prepare the processors
fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')
sp_proc = SpeicalSpanProcessor('raw_sentence', 'sentence')
sp_proc.add_span_converter(AlphaSpanConverter())
sp_proc.add_span_converter(DigitSpanConverter())
char_proc = CWSCharSegProcessor('sentence', 'char_list')
tag_proc = CWSSegAppTagProcessor('sentence', 'tag')
bigram_proc = Pre2Post2BigramProcessor('char_list', 'bigram_list')
char_vocab_proc = VocabProcessor('char_list')
bigram_vocab_proc = VocabProcessor('bigram_list')
# 2. Apply the processors to the training set
fs2hs_proc(tr_dataset)
sp_proc(tr_dataset)
char_proc(tr_dataset)
tag_proc(tr_dataset)
bigram_proc(tr_dataset)
char_vocab_proc(tr_dataset)
bigram_vocab_proc(tr_dataset)
char_index_proc = IndexProcessor(char_vocab_proc.get_vocab(), 'char_list')
bigram_index_proc = IndexProcessor(bigram_vocab_proc.get_vocab(), 'bigram_list')
char_index_proc(tr_dataset)
bigram_index_proc(tr_dataset)
# 2.1 Process dev_dataset
fs2hs_proc(dev_dataset)
sp_proc(dev_dataset)
char_proc(dev_dataset)
tag_proc(dev_dataset)
bigram_proc(dev_dataset)
char_index_proc(dev_dataset)
bigram_index_proc(dev_dataset)
# 3. The datasets are now ready for training
# TODO: how are pretrained embeddings handled?
# 4. Assemble the parts that need to be saved
pp = Pipeline()
pp.add_processor(fs2hs_proc)
pp.add_processor(sp_proc)
pp.add_processor(char_proc)
pp.add_processor(bigram_proc)
pp.add_processor(char_index_proc)
pp.add_processor(bigram_index_proc)
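
A sketch of how the assembled pipeline could be reused (assuming fastNLP's Pipeline runs its processors in order when called on a DataSet, as the individual processors are above; the save path is hypothetical):

new_ds = DataSet()
new_ds.append(Instance(raw_sentence='这是 一个 测试'))
pp(new_ds)  # adds 'sentence', 'char_list', 'bigram_list' and their index fields
# torch.save(pp, 'cws_pipeline.pkl')  # persist the pipeline for later inference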