Add cws converter, io

This commit is contained in:
yh 2018-11-09 20:23:05 +08:00
parent 1b9daa1985
commit 38aa207ea2
4 changed files with 409 additions and 0 deletions

View File

@@ -0,0 +1,129 @@
from fastNLP.loader.dataset_loader import DataSetLoader
from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet

def cut_long_sentence(sent, max_sample_length=200):
    """Cut a whitespace-tokenized sentence into chunks whose non-space
    length stays close to max_sample_length."""
    sent_no_space = sent.replace(' ', '')
    cut_sentences = []
    if len(sent_no_space) > max_sample_length:
        parts = sent.strip().split()
        new_line = ''
        length = 0
        for part in parts:
            length += len(part)
            new_line += part + ' '
            if length > max_sample_length:
                new_line = new_line[:-1]
                cut_sentences.append(new_line)
                length = 0
                new_line = ''
        if new_line != '':
            cut_sentences.append(new_line[:-1])
    else:
        cut_sentences.append(sent)
    return cut_sentences
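
A usage sketch (illustrative input; a chunk can overshoot max_sample_length by up to one token, because the length check runs after each token is added):

long_sent = ' '.join(['词'] * 150)
chunks = cut_long_sentence(long_sent, max_sample_length=100)
assert ''.join(c.replace(' ', '') for c in chunks) == long_sent.replace(' ', '')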

class NaiveCWSReader(DataSetLoader):
    """
    This reader assumes the segmentation dataset is already split by spaces,
    in one of the following forms:
        这是 fastNLP , 一个 非常 good 的 包 .
    or, with a pos tag attached to each part:
        也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
    """

    def __init__(self, in_word_splitter=None):
        super().__init__()
        self.in_word_splitter = in_word_splitter
    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        """
        Accepted formats (by default \t or spaces act as the separator):
            这是 fastNLP , 一个 非常 good 的 包 .
            也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
        If the splitter is not None, the second form is assumed: each part is
        split by the splitter and the first piece is kept, e.g. "也/D".split('/')[0].
        :param filepath:
        :param in_word_splitter:
        :param cut_long_sent:
        :return:
        """
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if len(line.replace(' ', '')) == 0:  # empty lines are not accepted
                    continue
                if in_word_splitter is not None:
                    words = []
                    for part in line.split():
                        word = part.split(in_word_splitter)[0]
                        words.append(word)
                    line = ' '.join(words)
                if cut_long_sent:
                    sents = cut_long_sentence(line)
                else:
                    sents = [line]
                for sent in sents:
                    instance = Instance(raw_sentence=sent)
                    dataset.append(instance)
        return dataset
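
A usage sketch (the file paths are hypothetical):

reader = NaiveCWSReader()
tr = reader.load('cws_train.txt', cut_long_sent=True)        # space-separated lines
tagged = reader.load('pos_train.txt', in_word_splitter='/')  # "也/D 在/P ..." lines
# each Instance in the returned DataSet carries a 'raw_sentence' field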

class POSCWSReader(DataSetLoader):
    """
    Reads data where each line holds one token (optionally followed by its
    tag) and a blank line marks the boundary between two sentences, e.g.
        N
        N
        N
        ...
        I-PER
        I-PER

        N
        N
        N
        ...
    :param filepath:
    :return:
    """

    def __init__(self, in_word_splitter=None):
        super().__init__()
        self.in_word_splitter = in_word_splitter

    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r', encoding='utf-8') as f:
            words = []
            for line in f:
                line = line.strip()
                if len(line) == 0:  # blank line: the current sentence ends
                    if len(words) == 0:  # skip consecutive blank lines
                        continue
                    line = ' '.join(words)
                    if cut_long_sent:
                        sents = cut_long_sentence(line)
                    else:
                        sents = [line]
                    for sent in sents:
                        instance = Instance(raw_sentence=sent)
                        dataset.append(instance)
                    words = []
                else:
                    line = line.split()[0]
                    if in_word_splitter is None:
                        words.append(line)
                    else:
                        words.append(line.split(in_word_splitter)[0])
            if len(words) != 0:
                # flush the last sentence when the file does not end with a blank line
                dataset.append(Instance(raw_sentence=' '.join(words)))
        return dataset
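
A usage sketch (hypothetical path; one token per line, blank-line sentence breaks):

pos_reader = POSCWSReader()
pos_ds = pos_reader.load('cws_train.pos', cut_long_sent=True)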

View File

@@ -0,0 +1,185 @@
import re

class SpanConverterBase:
    """Replace every span of a sentence matching `pattern` with the output of
    span_to_special_tag (by default, a fixed replace_tag)."""

    def __init__(self, replace_tag, pattern):
        super(SpanConverterBase, self).__init__()
        self.replace_tag = replace_tag
        self.pattern = pattern

    def find_certain_span_and_replace(self, sentence):
        replaced_sentence = ''
        prev_end = 0
        for match in re.finditer(self.pattern, sentence):
            start, end = match.span()
            span = sentence[start:end]
            replaced_sentence += sentence[prev_end:start] + \
                                 self.span_to_special_tag(span)
            prev_end = end
        replaced_sentence += sentence[prev_end:]
        return replaced_sentence

    def span_to_special_tag(self, span):
        return self.replace_tag

    def find_certain_span(self, sentence):
        spans = []
        for match in re.finditer(self.pattern, sentence):
            spans.append(match.span())
        return spans
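
A minimal sketch of the base behavior (tag and pattern are illustrative):

conv = SpanConverterBase('<X>', 'x+')
assert conv.find_certain_span_and_replace('aaxxbb') == 'aa<X>bb'
assert conv.find_certain_span('aaxxbb') == [(2, 4)]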

class AlphaSpanConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<ALPHA>'
        # Ideally this would handle only pure-letter spans, and it must not
        # touch <[a-zA-Z]+> (those should be treated as special tags).
        pattern = '[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\\-"])'
        super(AlphaSpanConverter, self).__init__(replace_tag, pattern)
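
For instance (illustrative input):

conv = AlphaSpanConverter()
assert conv.find_certain_span_and_replace('这是 fastNLP ,') == '这是 <ALPHA> ,'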

class DigitSpanConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<NUM>'
        pattern = '\\d[\\d.]*(?=[\u4e00-\u9fff ,%.!<-])'
        super(DigitSpanConverter, self).__init__(replace_tag, pattern)

    def span_to_special_tag(self, span):
        if span[0] == '0' and len(span) > 2:
            return '<NUM>'
        decimal_point_count = 0  # one might have more than one decimal point
        for idx, char in enumerate(span):
            if char == '.' or char == '﹒' or char == '·':
                decimal_point_count += 1
        if span[-1] == '.' or span[-1] == '﹒' or span[-1] == '·':
            # a trailing decimal point means this is not a number
            if decimal_point_count == 1:
                return span
            else:
                return '<UNKDGT>'
        if decimal_point_count == 1:
            return '<DEC>'
        elif decimal_point_count > 1:
            return '<UNKDGT>'
        else:
            return '<NUM>'
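
A sketch of the tag decisions (illustrative spans):

conv = DigitSpanConverter()
assert conv.span_to_special_tag('123') == '<NUM>'       # no decimal point
assert conv.span_to_special_tag('1.5') == '<DEC>'       # exactly one decimal point
assert conv.span_to_special_tag('1.2.3') == '<UNKDGT>'  # several decimal points
assert conv.span_to_special_tag('3.') == '3.'           # trailing point: kept verbatim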

class TimeConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<TOC>'
        pattern = '\\d+[:][\\d:]+(?=[\u4e00-\u9fff ,%.!<-])'
        super().__init__(replace_tag, pattern)
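
For instance (illustrative input):

conv = TimeConverter()
assert conv.find_certain_span_and_replace('比赛 12:30 开始') == '比赛 <TOC> 开始'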

class MixNumAlphaConverter(SpanConverterBase):
    """Replace spans that mix digits, letters, link characters ('&-), slashes
    or brackets (e.g. product codes); matching is done by a character scan
    instead of a single regex pattern."""

    def __init__(self):
        replace_tag = '<MIX>'
        pattern = None
        super().__init__(replace_tag, pattern)

    def find_certain_span_and_replace(self, sentence):
        replaced_sentence = ''
        start = 0  # start of the text that has not been emitted yet
        matching_flag = False
        number_flag = False
        alpha_flag = False
        link_flag = False
        slash_flag = False
        bracket_flag = False
        for idx in range(len(sentence)):
            if re.match('[0-9a-zA-Z/\\(\\)\'&\\-]', sentence[idx]):
                if not matching_flag:
                    replaced_sentence += sentence[start:idx]
                    start = idx
                if re.match('[0-9]', sentence[idx]):
                    number_flag = True
                elif re.match('[\'&\\-]', sentence[idx]):
                    link_flag = True
                elif re.match('/', sentence[idx]):
                    slash_flag = True
                elif re.match('[\\(\\)]', sentence[idx]):
                    bracket_flag = True
                else:
                    alpha_flag = True
                matching_flag = True
            elif re.match('[\\.]', sentence[idx]):
                pass
            else:
                if matching_flag:
                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
                        # at least two character classes are mixed: replace the span
                        span = sentence[start:idx]
                        start = idx
                        replaced_sentence += self.span_to_special_tag(span)
                matching_flag = False
                number_flag = False
                alpha_flag = False
                link_flag = False
                slash_flag = False
                bracket_flag = False
        replaced_sentence += sentence[start:]
        return replaced_sentence

    def find_certain_span(self, sentence):
        spans = []
        start = 0
        matching_flag = False
        number_flag = False
        alpha_flag = False
        link_flag = False
        slash_flag = False
        bracket_flag = False
        for idx in range(len(sentence)):
            if re.match('[0-9a-zA-Z/\\(\\)\'&\\-]', sentence[idx]):
                if not matching_flag:
                    start = idx
                if re.match('[0-9]', sentence[idx]):
                    number_flag = True
                elif re.match('[\'&\\-]', sentence[idx]):
                    link_flag = True
                elif re.match('/', sentence[idx]):
                    slash_flag = True
                elif re.match('[\\(\\)]', sentence[idx]):
                    bracket_flag = True
                else:
                    alpha_flag = True
                matching_flag = True
            elif re.match('[\\.]', sentence[idx]):
                pass
            else:
                if matching_flag:
                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
                        spans.append((start, idx))
                        start = idx
                matching_flag = False
                number_flag = False
                alpha_flag = False
                link_flag = False
                slash_flag = False
                bracket_flag = False
        return spans
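
A sketch of both methods on a mixed span (illustrative input):

conv = MixNumAlphaConverter()
assert conv.find_certain_span_and_replace('型号A123 已售罄') == '型号<MIX> 已售罄'
assert conv.find_certain_span('型号A123 已售罄') == [(2, 6)]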

class EmailConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<EML>'
        pattern = '[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\\-"$])'
        super(EmailConverter, self).__init__(replace_tag, pattern)
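
For instance (illustrative input):

conv = EmailConverter()
assert conv.find_certain_span_and_replace('请联系 user01@example.com ,谢谢') == '请联系 <EML> ,谢谢'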

View File

@@ -1,3 +1,98 @@
from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet
from fastNLP.api.pipeline import Pipeline
from reproduction.chinese_word_segment.process.cws_processor import *
from reproduction.chinese_word_segment.utils import cut_long_training_sentences
from reproduction.chinese_word_segment.process.span_converter import *
from reproduction.chinese_word_segment.io import NaiveCWSReader
tr_filename = ''
dev_filename = ''
reader = NaiveCWSReader()
tr_dataset = reader.load(tr_filename, cut_long_sent=True)
dev_dataset = reader.load(dev_filename)
# TODO: how should these be assembled into a DataSet?
def construct_dataset(sentences):
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)
    return dataset

# Alternative path when starting from plain sentence lists instead of the
# reader above (tr_sentences / dev_sentences are not defined in this script):
# tr_dataset = construct_dataset(tr_sentences)
# dev_dataset = construct_dataset(dev_sentences)
# 1. Prepare the processors
fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')
sp_proc = SpeicalSpanProcessor('raw_sentence', 'sentence')
sp_proc.add_span_converter(AlphaSpanConverter())
sp_proc.add_span_converter(DigitSpanConverter())
char_proc = CWSCharSegProcessor('sentence', 'char_list')
tag_proc = CWSSegAppTagProcessor('sentence', 'tag')
bigram_proc = Pre2Post2BigramProcessor('char_list', 'bigram_list')
char_vocab_proc = VocabProcessor('char_list')
bigram_vocab_proc = VocabProcessor('bigram_list')
# 2. Apply the processors to the training set
fs2hs_proc(tr_dataset)
sp_proc(tr_dataset)
char_proc(tr_dataset)
tag_proc(tr_dataset)
bigram_proc(tr_dataset)
char_vocab_proc(tr_dataset)
bigram_vocab_proc(tr_dataset)
char_index_proc = IndexProcessor(char_vocab_proc.get_vocab(), 'char_list')
bigram_index_proc = IndexProcessor(bigram_vocab_proc.get_vocab(), 'bigram_list')
char_index_proc(tr_dataset)
bigram_index_proc(tr_dataset)
# 2.1 Process dev_dataset
fs2hs_proc(dev_dataset)
sp_proc(dev_dataset)
char_proc(dev_dataset)
tag_proc(dev_dataset)
bigram_proc(dev_dataset)
char_index_proc(dev_dataset)
bigram_index_proc(dev_dataset)
# 3. The datasets are now ready for training
# TODO: how are pretrained embeddings handled?
# 4. Assemble the parts that need to be saved
pp = Pipeline()
pp.add_processor(fs2hs_proc)
pp.add_processor(sp_proc)
pp.add_processor(char_proc)
pp.add_processor(bigram_proc)
pp.add_processor(char_index_proc)
pp.add_processor(bigram_index_proc)
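
A sketch of how the assembled pipeline could be reused (assuming fastNLP's Pipeline runs its processors in order when called on a DataSet, as the individual processors are above; the save path is hypothetical):

new_ds = DataSet()
new_ds.append(Instance(raw_sentence='这是 一个 测试'))
pp(new_ds)  # adds 'sentence', 'char_list', 'bigram_list' and their index fields
# torch.save(pp, 'cws_pipeline.pkl')  # persist the pipeline for later inference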