mirror of
https://gitee.com/fastnlp/fastNLP.git
synced 2024-12-03 20:57:37 +08:00
Add CWS converter and io
This commit is contained in:
parent
1b9daa1985
commit
38aa207ea2
0
reproduction/chinese_word_segment/io/__init__.py
Normal file
129
reproduction/chinese_word_segment/io/cws_reader.py
Normal file
@@ -0,0 +1,129 @@
from fastNLP.loader.dataset_loader import DataSetLoader
from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet


def cut_long_sentence(sent, max_sample_length=200):
    # Split a whitespace-segmented sentence into chunks whenever the
    # accumulated character count (spaces excluded) exceeds max_sample_length.
    sent_no_space = sent.replace(' ', '')
    cut_sentences = []
    if len(sent_no_space) > max_sample_length:
        parts = sent.strip().split()
        new_line = ''
        length = 0
        for part in parts:
            length += len(part)
            new_line += part + ' '
            if length > max_sample_length:
                new_line = new_line[:-1]
                cut_sentences.append(new_line)
                length = 0
                new_line = ''
        if new_line != '':
            cut_sentences.append(new_line[:-1])
    else:
        cut_sentences.append(sent)
    return cut_sentences

class NaiveCWSReader(DataSetLoader):
    """
    This reader assumes the segmentation dataset is already split into words
    by spaces, e.g.
        这是 fastNLP , 一个 非常 good 的 包 .
    or that each part additionally carries a POS tag, e.g.
        也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
    """
    def __init__(self, in_word_splitter=None):
        super().__init__()

        self.in_word_splitter = in_word_splitter

    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        """
        Accepts either of the following formats (tokens are separated by \t
        or spaces by default):
            这是 fastNLP , 一个 非常 good 的 包 .
        or
            也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
        If the splitter is not None, the second format is assumed, and each
        token is split on the splitter, keeping only the first part,
        e.g. "也/D".split('/')[0].
        :param filepath:
        :param in_word_splitter:
        :return:
        """
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r') as f:
            for line in f:
                line = line.strip()
                if len(line.replace(' ', '')) == 0:  # skip empty lines
                    continue

                if in_word_splitter is not None:
                    words = []
                    for part in line.split():
                        word = part.split(in_word_splitter)[0]
                        words.append(word)
                    line = ' '.join(words)
                if cut_long_sent:
                    sents = cut_long_sentence(line)
                else:
                    sents = [line]
                for sent in sents:
                    instance = Instance(raw_sentence=sent)
                    dataset.append(instance)

        return dataset

class POSCWSReader(DataSetLoader):
    """
    Reads data in the one-word-per-line format, where a blank line marks the
    boundary between two sentences:

        迈 N
        向 N
        充 N
        ...
        泽 I-PER
        民 I-PER

        ( N
        一 N
        九 N
        ...

    :param filepath:
    :return:
    """
    def __init__(self, in_word_splitter=None):
        super().__init__()
        self.in_word_splitter = in_word_splitter

    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r') as f:
            words = []
            for line in f:
                line = line.strip()
                if len(line) == 0:  # blank line: end of sentence
                    if len(words) == 0:  # skip consecutive blank lines
                        continue
                    line = ' '.join(words)
                    if cut_long_sent:
                        sents = cut_long_sentence(line)
                    else:
                        sents = [line]
                    for sent in sents:
                        instance = Instance(raw_sentence=sent)
                        dataset.append(instance)
                    words = []
                else:
                    line = line.split()[0]
                    if in_word_splitter is None:
                        words.append(line)
                    else:
                        words.append(line.split(in_word_splitter)[0])
        return dataset
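For orientation, a short usage sketch of the helper and loader above. The file path is hypothetical, and the chunk output follows from tracing cut_long_sentence with max_sample_length=5.

# usage sketch (path and max_sample_length are illustrative)
from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader, cut_long_sentence

chunks = cut_long_sentence('这是 fastNLP , 一个 非常 good 的 包 .', max_sample_length=5)
# chunks == ['这是 fastNLP', ', 一个 非常 good', '的 包 .']  # splits fall between words, never inside one

reader = NaiveCWSReader(in_word_splitter='/')           # for the "也/D 在/P ..." format
dataset = reader.load('train.txt', cut_long_sent=True)  # a DataSet of Instance(raw_sentence=...)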
185
reproduction/chinese_word_segment/process/span_converter.py
Normal file
@@ -0,0 +1,185 @@
import re


class SpanConverterBase:
    def __init__(self, replace_tag, pattern):
        super(SpanConverterBase, self).__init__()

        self.replace_tag = replace_tag
        self.pattern = pattern

    def find_certain_span_and_replace(self, sentence):
        replaced_sentence = ''
        prev_end = 0
        for match in re.finditer(self.pattern, sentence):
            start, end = match.span()
            span = sentence[start:end]
            replaced_sentence += sentence[prev_end:start] + \
                                 self.span_to_special_tag(span)
            prev_end = end
        replaced_sentence += sentence[prev_end:]

        return replaced_sentence

    def span_to_special_tag(self, span):

        return self.replace_tag

    def find_certain_span(self, sentence):
        spans = []
        for match in re.finditer(self.pattern, sentence):
            spans.append(match.span())
        return spans

class AlphaSpanConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<ALPHA>'
        # Ideally this would handle only pure-alphabetic spans, but it must
        # not match <[a-zA-Z]+> (those should be special tags).
        pattern = r'[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\-"])'

        super(AlphaSpanConverter, self).__init__(replace_tag, pattern)

class DigitSpanConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<NUM>'
        pattern = r'\d[\d\.]*(?=[\u4e00-\u9fff ,%.!<-])'

        super(DigitSpanConverter, self).__init__(replace_tag, pattern)

    def span_to_special_tag(self, span):
        if span[0] == '0' and len(span) > 2:
            return '<NUM>'
        decimal_point_count = 0  # a span might contain more than one decimal point
        for char in span:
            if char == '.' or char == '﹒' or char == '·':
                decimal_point_count += 1
        if span[-1] == '.' or span[-1] == '﹒' or span[-1] == '·':
            # a trailing decimal point means this is not a number
            if decimal_point_count == 1:
                return span
            else:
                return '<UNKDGT>'
        if decimal_point_count == 1:
            return '<DEC>'
        elif decimal_point_count > 1:
            return '<UNKDGT>'
        else:
            return '<NUM>'

class TimeConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<TOC>'
        pattern = r'\d+[::∶][\d::∶]+(?=[\u4e00-\u9fff ,%.!<-])'

        super().__init__(replace_tag, pattern)

class MixNumAlphaConverter(SpanConverterBase):
    # Detects spans that mix digits, letters, slashes, brackets or linking
    # characters; pattern is None because matching is done with a
    # hand-rolled character scan instead of a single regex.
    def __init__(self):
        replace_tag = '<MIX>'
        pattern = None

        super().__init__(replace_tag, pattern)

    def find_certain_span_and_replace(self, sentence):
        replaced_sentence = ''
        start = 0
        matching_flag = False
        number_flag = False
        alpha_flag = False
        link_flag = False
        slash_flag = False
        bracket_flag = False
        for idx in range(len(sentence)):
            if re.match(r"[0-9a-zA-Z/\(\)'′&\-]", sentence[idx]):
                if not matching_flag:
                    replaced_sentence += sentence[start:idx]
                    start = idx
                if re.match('[0-9]', sentence[idx]):
                    number_flag = True
                elif re.match(r"['′&\-]", sentence[idx]):
                    link_flag = True
                elif re.match('/', sentence[idx]):
                    slash_flag = True
                elif re.match(r'[\(\)]', sentence[idx]):
                    bracket_flag = True
                else:
                    alpha_flag = True
                matching_flag = True
            elif re.match(r'[\.]', sentence[idx]):
                pass
            else:
                if matching_flag:
                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
                        span = sentence[start:idx]
                        start = idx
                        replaced_sentence += self.span_to_special_tag(span)
                    matching_flag = False
                    number_flag = False
                    alpha_flag = False
                    link_flag = False
                    slash_flag = False
                    bracket_flag = False

        replaced_sentence += sentence[start:]
        return replaced_sentence

    def find_certain_span(self, sentence):
        spans = []
        start = 0
        matching_flag = False
        number_flag = False
        alpha_flag = False
        link_flag = False
        slash_flag = False
        bracket_flag = False
        for idx in range(len(sentence)):
            if re.match(r"[0-9a-zA-Z/\(\)'′&\-]", sentence[idx]):
                if not matching_flag:
                    start = idx
                if re.match('[0-9]', sentence[idx]):
                    number_flag = True
                elif re.match(r"['′&\-]", sentence[idx]):
                    link_flag = True
                elif re.match('/', sentence[idx]):
                    slash_flag = True
                elif re.match(r'[\(\)]', sentence[idx]):
                    bracket_flag = True
                else:
                    alpha_flag = True
                matching_flag = True
            elif re.match(r'[\.]', sentence[idx]):
                pass
            else:
                if matching_flag:
                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
                        spans.append((start, idx))
                        start = idx

                    matching_flag = False
                    number_flag = False
                    alpha_flag = False
                    link_flag = False
                    slash_flag = False
                    bracket_flag = False

        return spans

class EmailConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<EML>'
        pattern = r'[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\-"$])'

        super(EmailConverter, self).__init__(replace_tag, pattern)
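A short sketch of the converters in isolation (the sample sentence is invented):

alpha = AlphaSpanConverter()
digit = DigitSpanConverter()

sent = 'fastNLP 发布 3.14 版本 编号 007 。'
sent = alpha.find_certain_span_and_replace(sent)
sent = digit.find_certain_span_and_replace(sent)
# sent == '<ALPHA> 发布 <DEC> 版本 编号 <NUM> 。'

# span_to_special_tag distinguishes digit spans once the pattern has matched:
#   '2018' -> '<NUM>', '3.14' -> '<DEC>', '1.2.3' -> '<UNKDGT>', '007' -> '<NUM>' (leading zero)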
@@ -1,3 +1,98 @@
from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet

from fastNLP.api.pipeline import Pipeline
from reproduction.chinese_word_segment.process.cws_processor import *
from reproduction.chinese_word_segment.utils import cut_long_training_sentences
from reproduction.chinese_word_segment.process.span_converter import *
from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader


tr_filename = ''
dev_filename = ''

reader = NaiveCWSReader()

tr_dataset = reader.load(tr_filename, cut_long_sent=True)
dev_dataset = reader.load(dev_filename)


# TODO how should sentences be assembled into a DataSet?
def construct_dataset(sentences):
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)

    return dataset


tr_dataset = construct_dataset(tr_sentences)
dev_dataset = construct_dataset(dev_sentences)

# 1. prepare the processors
fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')

sp_proc = SpeicalSpanProcessor('raw_sentence', 'sentence')
sp_proc.add_span_converter(AlphaSpanConverter())
sp_proc.add_span_converter(DigitSpanConverter())

char_proc = CWSCharSegProcessor('sentence', 'char_list')

tag_proc = CWSSegAppTagProcessor('sentence', 'tag')

bigram_proc = Pre2Post2BigramProcessor('char_list', 'bigram_list')

char_vocab_proc = VocabProcessor('char_list')
bigram_vocab_proc = VocabProcessor('bigram_list')

# 2. apply the processors
fs2hs_proc(tr_dataset)

sp_proc(tr_dataset)

char_proc(tr_dataset)
tag_proc(tr_dataset)
bigram_proc(tr_dataset)

char_vocab_proc(tr_dataset)
bigram_vocab_proc(tr_dataset)

char_index_proc = IndexProcessor(char_vocab_proc.get_vocab(), 'char_list')
bigram_index_proc = IndexProcessor(bigram_vocab_proc.get_vocab(), 'bigram_list')

char_index_proc(tr_dataset)
bigram_index_proc(tr_dataset)

# 2.1 process dev_dataset the same way
fs2hs_proc(dev_dataset)

sp_proc(dev_dataset)

char_proc(dev_dataset)
tag_proc(dev_dataset)
bigram_proc(dev_dataset)

char_index_proc(dev_dataset)
bigram_index_proc(dev_dataset)


# 3. the datasets are now ready for training
# TODO how are pretrained embeddings handled?


# 4. assemble the parts that need to be saved
pp = Pipeline()
pp.add_processor(fs2hs_proc)
pp.add_processor(sp_proc)
pp.add_processor(char_proc)
pp.add_processor(bigram_proc)
pp.add_processor(char_index_proc)
pp.add_processor(bigram_index_proc)
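The Pipeline bundles only the processors needed at prediction time: tag_proc (the training target) and the vocabulary-building processors stay out, since the train-time vocabularies are reused through the two index processors. A hypothetical sketch of reuse and persistence; te_filename, the save path, torch.save, and a callable Pipeline are all assumptions, not part of this commit:

te_dataset = reader.load(te_filename)  # hypothetical test file
pp(te_dataset)                         # assumed: applies fs2hs -> span -> char/bigram -> index in order

import torch
torch.save(pp, 'cws_pipeline.pkl')     # one possible way to persist the assembled pipeline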