From 38aa207ea21a24361ff089984d257010ba8cefe6 Mon Sep 17 00:00:00 2001
From: yh
Date: Fri, 9 Nov 2018 20:23:05 +0800
Subject: [PATCH] Add cws converter, io
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../chinese_word_segment/io/__init__.py      |   0
 .../chinese_word_segment/io/cws_reader.py    | 129 ++++++++++++
 .../process/span_converter.py                | 185 ++++++++++++++++++
 .../chinese_word_segment/train_context.py    |  95 +++++++++
 4 files changed, 409 insertions(+)
 create mode 100644 reproduction/chinese_word_segment/io/__init__.py
 create mode 100644 reproduction/chinese_word_segment/io/cws_reader.py
 create mode 100644 reproduction/chinese_word_segment/process/span_converter.py

diff --git a/reproduction/chinese_word_segment/io/__init__.py b/reproduction/chinese_word_segment/io/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/reproduction/chinese_word_segment/io/cws_reader.py b/reproduction/chinese_word_segment/io/cws_reader.py
new file mode 100644
index 00000000..23c768c6
--- /dev/null
+++ b/reproduction/chinese_word_segment/io/cws_reader.py
@@ -0,0 +1,129 @@
+
+
+from fastNLP.loader.dataset_loader import DataSetLoader
+from fastNLP.core.instance import Instance
+from fastNLP.core.dataset import DataSet
+
+
+def cut_long_sentence(sent, max_sample_length=200):
+    # Split an already space-segmented sentence into chunks whose non-space
+    # length stays close to max_sample_length, cutting only at word boundaries.
+    sent_no_space = sent.replace(' ', '')
+    cutted_sentence = []
+    if len(sent_no_space) > max_sample_length:
+        parts = sent.strip().split()
+        new_line = ''
+        length = 0
+        for part in parts:
+            length += len(part)
+            new_line += part + ' '
+            if length > max_sample_length:
+                new_line = new_line[:-1]
+                cutted_sentence.append(new_line)
+                length = 0
+                new_line = ''
+        if new_line != '':
+            cutted_sentence.append(new_line[:-1])
+    else:
+        cutted_sentence.append(sent)
+    return cutted_sentence
+
+
+class NaiveCWSReader(DataSetLoader):
+    """
+    This reader assumes the segmentation dataset is already space-delimited, e.g.
+        这是 fastNLP , 一个 非常 good 的 包 .
+    or that every token additionally carries a POS tag, e.g.
+        也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
+    """
+    def __init__(self, in_word_splitter=None):
+        super().__init__()
+        self.in_word_splitter = in_word_splitter
+
+    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
+        """
+        Both of the following formats are accepted (tokens separated by \t or spaces):
+            这是 fastNLP , 一个 非常 good 的 包 .
+        and
+            也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
+        If the splitter is not None, the second format is assumed: each token is split
+        on the splitter and only the first part is kept, e.g. "也/D".split('/')[0].
+        :param filepath: path to the dataset file
+        :param in_word_splitter: separator between word and tag inside a token
+        :param cut_long_sent: whether to cut long sentences with cut_long_sentence
+        :return: a DataSet with one raw_sentence field per instance
+        """
+        if in_word_splitter is None:
+            in_word_splitter = self.in_word_splitter
+        dataset = DataSet()
+        with open(filepath, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if len(line.replace(' ', '')) == 0:  # skip empty lines
+                    continue
+
+                if in_word_splitter is not None:
+                    words = []
+                    for part in line.split():
+                        word = part.split(in_word_splitter)[0]
+                        words.append(word)
+                    line = ' '.join(words)
+                if cut_long_sent:
+                    sents = cut_long_sentence(line)
+                else:
+                    sents = [line]
+                for sent in sents:
+                    instance = Instance(raw_sentence=sent)
+                    dataset.append(instance)
+
+        return dataset
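+
+
+# For illustration, a minimal usage sketch of NaiveCWSReader (the file path is
+# hypothetical, not part of this patch):
+#
+#     reader = NaiveCWSReader()
+#     tr_dataset = reader.load('pku_train.txt', cut_long_sent=True)
+#
+#     # for the "word/TAG" format, pass the in-word splitter instead:
+#     tagged_reader = NaiveCWSReader(in_word_splitter='/')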
+
+
+class POSCWSReader(DataSetLoader):
+    """
+    Reads data in which every line holds one word (optionally followed by a tag)
+    and a blank line marks the boundary between two sentences, e.g.
+        迈 N
+        向 N
+        充 N
+        ...
+        泽 I-PER
+        民 I-PER
+
+        ( N
+        一 N
+        九 N
+        ...
+    """
+    def __init__(self, in_word_splitter=None):
+        super().__init__()
+        self.in_word_splitter = in_word_splitter
+
+    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
+        if in_word_splitter is None:
+            in_word_splitter = self.in_word_splitter
+        dataset = DataSet()
+        with open(filepath, 'r') as f:
+            words = []
+            for line in f:
+                line = line.strip()
+                if len(line) == 0:  # blank line: sentence boundary
+                    if len(words) == 0:  # skip consecutive blank lines
+                        continue
+                    line = ' '.join(words)
+                    if cut_long_sent:
+                        sents = cut_long_sentence(line)
+                    else:
+                        sents = [line]
+                    for sent in sents:
+                        instance = Instance(raw_sentence=sent)
+                        dataset.append(instance)
+                    words = []
+                else:
+                    line = line.split()[0]
+                    if in_word_splitter is None:
+                        words.append(line)
+                    else:
+                        words.append(line.split(in_word_splitter)[0])
+            if len(words) != 0:  # flush the last sentence if the file lacks a trailing blank line
+                line = ' '.join(words)
+                sents = cut_long_sentence(line) if cut_long_sent else [line]
+                for sent in sents:
+                    dataset.append(Instance(raw_sentence=sent))
+        return dataset
+
+
diff --git a/reproduction/chinese_word_segment/process/span_converter.py b/reproduction/chinese_word_segment/process/span_converter.py
new file mode 100644
index 00000000..23e590c4
--- /dev/null
+++ b/reproduction/chinese_word_segment/process/span_converter.py
@@ -0,0 +1,185 @@
+
+import re
+
+
+class SpanConverterBase:
+    # Replaces every substring matching self.pattern with a special tag.
+    def __init__(self, replace_tag, pattern):
+        super(SpanConverterBase, self).__init__()
+
+        self.replace_tag = replace_tag
+        self.pattern = pattern
+
+    def find_certain_span_and_replace(self, sentence):
+        replaced_sentence = ''
+        prev_end = 0
+        for match in re.finditer(self.pattern, sentence):
+            start, end = match.span()
+            span = sentence[start:end]
+            replaced_sentence += sentence[prev_end:start] + \
+                self.span_to_special_tag(span)
+            prev_end = end
+        replaced_sentence += sentence[prev_end:]
+
+        return replaced_sentence
+
+    def span_to_special_tag(self, span):
+        return self.replace_tag
+
+    def find_certain_span(self, sentence):
+        spans = []
+        for match in re.finditer(self.pattern, sentence):
+            spans.append(match.span())
+        return spans
+
+
+class AlphaSpanConverter(SpanConverterBase):
+    def __init__(self):
+        replace_tag = '<ALPHA>'
+        # Ideally this handles only pure-letter spans, but it must not touch
+        # <[a-zA-Z]+> (those are assumed to be special tags already).
+        pattern = r'[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\-"])'
+
+        super(AlphaSpanConverter, self).__init__(replace_tag, pattern)
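+
+
+# For illustration, a sketch of the converter contract (the sentence is made up;
+# the expected values assume the pattern and tag above):
+#
+#     converter = AlphaSpanConverter()
+#     converter.find_certain_span_and_replace('这是 fastNLP 的 demo .')
+#     # -> '这是 <ALPHA> 的 <ALPHA> .'
+#     converter.find_certain_span('这是 fastNLP 的 demo .')
+#     # -> [(3, 10), (13, 17)]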
+
+
+class DigitSpanConverter(SpanConverterBase):
+    def __init__(self):
+        replace_tag = '<NUM>'
+        pattern = r'\d[\d\.]*(?=[\u4e00-\u9fff ,%.!<-])'
+
+        super(DigitSpanConverter, self).__init__(replace_tag, pattern)
+
+    def span_to_special_tag(self, span):
+        if span[0] == '0' and len(span) > 2:
+            return '<NUM>'
+        decimal_point_count = 0  # one span might contain more than one decimal point
+        for char in span:
+            if char == '.' or char == '﹒' or char == '·':
+                decimal_point_count += 1
+        if span[-1] == '.' or span[-1] == '﹒' or span[-1] == '·':
+            # a span ending with a decimal point is not a plain number
+            if decimal_point_count == 1:
+                return span
+            else:
+                return '<UNKDGT>'
+        if decimal_point_count == 1:
+            return '<DEC>'
+        elif decimal_point_count > 1:
+            return '<UNKDGT>'
+        else:
+            return '<NUM>'
+
+
+class TimeConverter(SpanConverterBase):
+    def __init__(self):
+        replace_tag = '<TOC>'
+        pattern = r'\d+[::∶][\d::∶]+(?=[\u4e00-\u9fff ,%.!<-])'
+
+        super().__init__(replace_tag, pattern)
+
+
+class MixNumAlphaConverter(SpanConverterBase):
+    # Handles spans that mix digits, letters, link marks, slashes and brackets.
+    # No single regex is used; spans are found with a small state machine.
+    def __init__(self):
+        replace_tag = '<MIX>'
+        pattern = None
+
+        super().__init__(replace_tag, pattern)
+
+    def find_certain_span_and_replace(self, sentence):
+        replaced_sentence = ''
+        start = 0
+        matching_flag = False
+        number_flag = False
+        alpha_flag = False
+        link_flag = False
+        slash_flag = False
+        bracket_flag = False
+        for idx in range(len(sentence)):
+            if re.match(r'[0-9a-zA-Z/\(\)\'′&\-]', sentence[idx]):
+                if not matching_flag:
+                    replaced_sentence += sentence[start:idx]
+                    start = idx
+                if re.match('[0-9]', sentence[idx]):
+                    number_flag = True
+                elif re.match(r'[\'′&\-]', sentence[idx]):
+                    link_flag = True
+                elif re.match('/', sentence[idx]):
+                    slash_flag = True
+                elif re.match(r'[\(\)]', sentence[idx]):
+                    bracket_flag = True
+                else:
+                    alpha_flag = True
+                matching_flag = True
+            elif re.match(r'[\.]', sentence[idx]):
+                pass
+            else:
+                if matching_flag:
+                    # a span counts as mixed only if it combines two character categories
+                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
+                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
+                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
+                        span = sentence[start:idx]
+                        start = idx
+                        replaced_sentence += self.span_to_special_tag(span)
+                    matching_flag = False
+                    number_flag = False
+                    alpha_flag = False
+                    link_flag = False
+                    slash_flag = False
+                    bracket_flag = False
+
+        replaced_sentence += sentence[start:]
+        return replaced_sentence
+
+    def find_certain_span(self, sentence):
+        spans = []
+        start = 0
+        matching_flag = False
+        number_flag = False
+        alpha_flag = False
+        link_flag = False
+        slash_flag = False
+        bracket_flag = False
+        for idx in range(len(sentence)):
+            if re.match(r'[0-9a-zA-Z/\(\)\'′&\-]', sentence[idx]):
+                if not matching_flag:
+                    start = idx
+                if re.match('[0-9]', sentence[idx]):
+                    number_flag = True
+                elif re.match(r'[\'′&\-]', sentence[idx]):
+                    link_flag = True
+                elif re.match('/', sentence[idx]):
+                    slash_flag = True
+                elif re.match(r'[\(\)]', sentence[idx]):
+                    bracket_flag = True
+                else:
+                    alpha_flag = True
+                matching_flag = True
+            elif re.match(r'[\.]', sentence[idx]):
+                pass
+            else:
+                if matching_flag:
+                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
+                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
+                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
+                        spans.append((start, idx))
+                        start = idx
+                    matching_flag = False
+                    number_flag = False
+                    alpha_flag = False
+                    link_flag = False
+                    slash_flag = False
+                    bracket_flag = False
+
+        return spans
+
+
+class EmailConverter(SpanConverterBase):
+    def __init__(self):
+        replace_tag = '<EML>'
+        pattern = r'[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\-"$])'
+
+        super(EmailConverter, self).__init__(replace_tag, pattern)
\ No newline at end of file
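For illustration (not part of the patch), a minimal sketch of the digit-tagging decisions above; the sample sentence is made up, and the expected outputs assume the reconstructed tag strings ('<NUM>', '<DEC>') used in this file:

    # Sketch only: integers map to <NUM>, single-decimal values to <DEC>.
    converter = DigitSpanConverter()
    print(converter.find_certain_span_and_replace('票价30元, 涨幅3.5%'))
    # expected: '票价<NUM>元, 涨幅<DEC>%'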
diff --git a/reproduction/chinese_word_segment/train_context.py b/reproduction/chinese_word_segment/train_context.py
index b28b04f6..691a97a6 100644
--- a/reproduction/chinese_word_segment/train_context.py
+++ b/reproduction/chinese_word_segment/train_context.py
@@ -1,3 +1,98 @@
+from fastNLP.core.instance import Instance
+from fastNLP.core.dataset import DataSet
+
+from fastNLP.api.pipeline import Pipeline
+from reproduction.chinese_word_segment.process.cws_processor import *
+from reproduction.chinese_word_segment.process.span_converter import *
+from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader
+
+
+tr_filename = ''
+dev_filename = ''
+
+reader = NaiveCWSReader()
+
+tr_dataset = reader.load(tr_filename, cut_long_sent=True)
+dev_dataset = reader.load(dev_filename)
+
+
+# TODO how should raw sentences best be assembled into a DataSet?
+def construct_dataset(sentences):
+    dataset = DataSet()
+    for sentence in sentences:
+        instance = Instance()
+        instance['raw_sentence'] = sentence
+        dataset.append(instance)
+
+    return dataset
+
+
+# 1. prepare the processors
+fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')
+
+sp_proc = SpeicalSpanProcessor('raw_sentence', 'sentence')
+sp_proc.add_span_converter(AlphaSpanConverter())
+sp_proc.add_span_converter(DigitSpanConverter())
+
+char_proc = CWSCharSegProcessor('sentence', 'char_list')
+
+tag_proc = CWSSegAppTagProcessor('sentence', 'tag')
+
+bigram_proc = Pre2Post2BigramProcessor('char_list', 'bigram_list')
+
+char_vocab_proc = VocabProcessor('char_list')
+bigram_vocab_proc = VocabProcessor('bigram_list')
+
+# 2. apply the processors to the training set
+fs2hs_proc(tr_dataset)
+
+sp_proc(tr_dataset)
+
+char_proc(tr_dataset)
+tag_proc(tr_dataset)
+bigram_proc(tr_dataset)
+
+char_vocab_proc(tr_dataset)
+bigram_vocab_proc(tr_dataset)
+
+char_index_proc = IndexProcessor(char_vocab_proc.get_vocab(), 'char_list')
+bigram_index_proc = IndexProcessor(bigram_vocab_proc.get_vocab(), 'bigram_list')
+
+char_index_proc(tr_dataset)
+bigram_index_proc(tr_dataset)
+
+# 2.1 process dev_dataset with the same processors
+fs2hs_proc(dev_dataset)
+
+sp_proc(dev_dataset)
+
+char_proc(dev_dataset)
+tag_proc(dev_dataset)
+bigram_proc(dev_dataset)
+
+char_index_proc(dev_dataset)
+bigram_index_proc(dev_dataset)
+
+
+# 3. the datasets are now ready for training
+# TODO how are pretrained embeddings handled?
+
+
+# 4. assemble the parts that need to be saved for inference
+pp = Pipeline()
+pp.add_processor(fs2hs_proc)
+pp.add_processor(sp_proc)
+pp.add_processor(char_proc)
+pp.add_processor(bigram_proc)
+pp.add_processor(char_index_proc)
+pp.add_processor(bigram_index_proc)
\ No newline at end of file
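For illustration (not part of the patch), a minimal sketch of how the assembled Pipeline might be reused at inference time; 'test.txt' is a hypothetical file, and this assumes fastNLP's Pipeline applies its processors in order when called on a DataSet:

    # Sketch only: run the saved preprocessing pipeline on unseen raw text.
    test_dataset = reader.load('test.txt')  # hypothetical file of raw sentences
    test_dataset = pp(test_dataset)  # adds sentence, char_list, bigram_list and their index fields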