mirror of https://gitee.com/fastnlp/fastNLP.git
synced 2024-12-04 13:17:51 +08:00

Add cws converter and io

This commit is contained in:
parent 1b9daa1985
commit 38aa207ea2
0    reproduction/chinese_word_segment/io/__init__.py    Normal file
129  reproduction/chinese_word_segment/io/cws_reader.py  Normal file
@@ -0,0 +1,129 @@
from fastNLP.loader.dataset_loader import DataSetLoader
from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet

def cut_long_sentence(sent, max_sample_length=200):
    """Split a whitespace-segmented sentence whose non-space length exceeds
    max_sample_length into shorter pieces, cutting only at word boundaries."""
    sent_no_space = sent.replace(' ', '')
    cutted_sentence = []
    if len(sent_no_space) > max_sample_length:
        parts = sent.strip().split()
        new_line = ''
        length = 0
        for part in parts:
            length += len(part)
            new_line += part + ' '
            if length > max_sample_length:
                new_line = new_line[:-1]
                cutted_sentence.append(new_line)
                length = 0
                new_line = ''
        if new_line != '':
            cutted_sentence.append(new_line[:-1])
    else:
        cutted_sentence.append(sent)
    return cutted_sentence
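
For illustration (not part of the commit): cut_long_sentence only cuts at word
boundaries, so each piece ends at the first boundary after the limit is crossed:

    sents = cut_long_sentence('这是 fastNLP 包', max_sample_length=2)
    # -> ['这是 fastNLP', '包']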

class NaiveCWSReader(DataSetLoader):
    """
    This reader assumes the word-segmentation dataset is already delimited by
    spaces, i.e.
        这是 fastNLP , 一个 非常 good 的 包 .
    or that every part additionally carries a POS tag, i.e.
        也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
    """
    def __init__(self, in_word_splitter=None):
        super().__init__()

        self.in_word_splitter = in_word_splitter

    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        """
        Accepted formats (by default \t or space separates the parts):
            这是 fastNLP , 一个 非常 good 的 包 .
        and
            也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
        If the splitter is not None, the second format is assumed: each part
        is split by the splitter and the first piece is kept, e.g.
        "也/D".split('/')[0].
        :param filepath:
        :param in_word_splitter:
        :return:
        """
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r') as f:
            for line in f:
                line = line.strip()
                if len(line.replace(' ', '')) == 0:  # empty lines are not accepted
                    continue

                if in_word_splitter is not None:
                    words = []
                    for part in line.split():
                        word = part.split(in_word_splitter)[0]
                        words.append(word)
                    line = ' '.join(words)
                if cut_long_sent:
                    sents = cut_long_sentence(line)
                else:
                    sents = [line]
                for sent in sents:
                    instance = Instance(raw_sentence=sent)
                    dataset.append(instance)

        return dataset
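
A minimal usage sketch (the file path is a placeholder): with in_word_splitter='/'
the reader handles the tagged format above.

    reader = NaiveCWSReader(in_word_splitter='/')
    ds = reader.load('cws_train.txt', cut_long_sent=True)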


class POSCWSReader(DataSetLoader):
    """
    Reads data in which every line holds a single word (optionally followed
    by a tag) and a blank line marks the boundary between two sentences:
        迈 N
        向 N
        充 N
        ...
        泽 I-PER
        民 I-PER

        ( N
        一 N
        九 N
        ...

    :param filepath:
    :return:
    """
    def __init__(self, in_word_splitter=None):
        super().__init__()
        self.in_word_splitter = in_word_splitter

    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r') as f:
            words = []
            for line in f:
                line = line.strip()
                if len(line) == 0:  # new line
                    if len(words) == 0:  # empty lines are not accepted
                        continue
                    line = ' '.join(words)
                    if cut_long_sent:
                        sents = cut_long_sentence(line)
                    else:
                        sents = [line]
                    for sent in sents:
                        instance = Instance(raw_sentence=sent)
                        dataset.append(instance)
                    words = []
                else:
                    line = line.split()[0]
                    if in_word_splitter is None:
                        words.append(line)
                    else:
                        words.append(line.split(in_word_splitter)[0])
        # note: words are only flushed on a blank line, so tokens after the
        # final blank line are dropped if the file lacks a trailing one
        return dataset
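
A usage sketch for the vertical format (path is a placeholder):

    pos_reader = POSCWSReader()
    ds = pos_reader.load('pos_cws_train.txt')  # one token per line, blank line between sentences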
185  reproduction/chinese_word_segment/process/span_converter.py  Normal file
@@ -0,0 +1,185 @@
import re

class SpanConverterBase:
    def __init__(self, replace_tag, pattern):
        super(SpanConverterBase, self).__init__()

        self.replace_tag = replace_tag
        self.pattern = pattern

    def find_certain_span_and_replace(self, sentence):
        # rebuild the sentence, substituting every span matched by the pattern
        replaced_sentence = ''
        prev_end = 0
        for match in re.finditer(self.pattern, sentence):
            start, end = match.span()
            span = sentence[start:end]
            replaced_sentence += sentence[prev_end:start] + \
                                 self.span_to_special_tag(span)
            prev_end = end
        replaced_sentence += sentence[prev_end:]

        return replaced_sentence

    def span_to_special_tag(self, span):
        # subclasses may override this to pick a tag based on the span's content
        return self.replace_tag

    def find_certain_span(self, sentence):
        spans = []
        for match in re.finditer(self.pattern, sentence):
            spans.append(match.span())
        return spans

class AlphaSpanConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<ALPHA>'
        # Ideally only pure-letter spans are handled, but <[a-zA-Z]+> must be
        # left alone (it should already be a special tag).
        pattern = '[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\\-"])'

        super(AlphaSpanConverter, self).__init__(replace_tag, pattern)
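
For illustration (not part of the commit), the lookahead requires a following
CJK character, space, or listed punctuation mark:

    conv = AlphaSpanConverter()
    conv.find_certain_span_and_replace('用fastNLP 分词')  # -> '用<ALPHA> 分词'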

class DigitSpanConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<NUM>'
        pattern = '\d[\d\\.]*(?=[\u4e00-\u9fff ,%.!<-])'

        super(DigitSpanConverter, self).__init__(replace_tag, pattern)

    def span_to_special_tag(self, span):
        if span[0] == '0' and len(span) > 2:
            return '<NUM>'
        decimal_point_count = 0  # one span may contain more than one decimal point
        for char in span:
            if char == '.' or char == '﹒' or char == '·':
                decimal_point_count += 1
        if span[-1] == '.' or span[-1] == '﹒' or span[-1] == '·':
            # a trailing decimal point means the span is not a plain number
            if decimal_point_count == 1:
                return span
            else:
                return '<UNKDGT>'
        if decimal_point_count == 1:
            return '<DEC>'
        elif decimal_point_count > 1:
            return '<UNKDGT>'
        else:
            return '<NUM>'
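
For illustration (not part of the commit), the override above distinguishes
integers, decimals, and malformed digit runs:

    conv = DigitSpanConverter()
    conv.span_to_special_tag('2018')   # -> '<NUM>'
    conv.span_to_special_tag('3.14')   # -> '<DEC>'
    conv.span_to_special_tag('1.2.3')  # -> '<UNKDGT>'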

class TimeConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<TOC>'
        pattern = '\d+[::∶][\d::∶]+(?=[\u4e00-\u9fff ,%.!<-])'

        super().__init__(replace_tag, pattern)
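
For illustration (not part of the commit):

    conv = TimeConverter()
    conv.find_certain_span_and_replace('会议12:30开始')  # -> '会议<TOC>开始'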

class MixNumAlphaConverter(SpanConverterBase):
    def __init__(self):
        replace_tag = '<MIX>'
        pattern = None  # spans are found by the hand-written scan below, not a regex

        super().__init__(replace_tag, pattern)

    def find_certain_span_and_replace(self, sentence):
        replaced_sentence = ''
        start = 0
        matching_flag = False
        number_flag = False
        alpha_flag = False
        link_flag = False
        slash_flag = False
        bracket_flag = False
        for idx in range(len(sentence)):
            if re.match('[0-9a-zA-Z/\\(\\)\'′&\\-]', sentence[idx]):
                if not matching_flag:
                    replaced_sentence += sentence[start:idx]
                    start = idx
                if re.match('[0-9]', sentence[idx]):
                    number_flag = True
                elif re.match('[\'′&\\-]', sentence[idx]):
                    link_flag = True
                elif re.match('/', sentence[idx]):
                    slash_flag = True
                elif re.match('[\\(\\)]', sentence[idx]):
                    bracket_flag = True
                else:
                    alpha_flag = True
                matching_flag = True
            elif re.match('[\\.]', sentence[idx]):
                pass
            else:
                if matching_flag:
                    # only spans that mix categories are replaced
                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
                        span = sentence[start:idx]
                        start = idx
                        replaced_sentence += self.span_to_special_tag(span)
                    matching_flag = False
                    number_flag = False
                    alpha_flag = False
                    link_flag = False
                    slash_flag = False
                    bracket_flag = False

        replaced_sentence += sentence[start:]
        return replaced_sentence

    def find_certain_span(self, sentence):
        spans = []
        start = 0
        matching_flag = False
        number_flag = False
        alpha_flag = False
        link_flag = False
        slash_flag = False
        bracket_flag = False
        for idx in range(len(sentence)):
            if re.match('[0-9a-zA-Z/\\(\\)\'′&\\-]', sentence[idx]):
                if not matching_flag:
                    start = idx
                if re.match('[0-9]', sentence[idx]):
                    number_flag = True
                elif re.match('[\'′&\\-]', sentence[idx]):
                    link_flag = True
                elif re.match('/', sentence[idx]):
                    slash_flag = True
                elif re.match('[\\(\\)]', sentence[idx]):
                    bracket_flag = True
                else:
                    alpha_flag = True
                matching_flag = True
            elif re.match('[\\.]', sentence[idx]):
                pass
            else:
                if matching_flag:
                    if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
                            or (slash_flag and alpha_flag) or (link_flag and number_flag) \
                            or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
                        spans.append((start, idx))
                        start = idx

                    matching_flag = False
                    number_flag = False
                    alpha_flag = False
                    link_flag = False
                    slash_flag = False
                    bracket_flag = False

        return spans
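
For illustration (not part of the commit), only spans that mix categories
(e.g. digits plus letters) are replaced; a pure digit or letter run is left
for the other converters:

    conv = MixNumAlphaConverter()
    conv.find_certain_span_and_replace('型号A123 已售罄')  # -> '型号<MIX> 已售罄'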

class EmailConverter(SpanConverterBase):
    def __init__(self):
        replaced_tag = "<EML>"
        pattern = '[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\\-"$])'

        super(EmailConverter, self).__init__(replaced_tag, pattern)
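
For illustration (not part of the commit):

    conv = EmailConverter()
    conv.find_certain_span_and_replace('联系 test@abc.com 获取帮助')  # -> '联系 <EML> 获取帮助'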
@@ -1,3 +1,98 @@
from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet

from fastNLP.api.pipeline import Pipeline
from reproduction.chinese_word_segment.process.cws_processor import *
from reproduction.chinese_word_segment.utils import cut_long_training_sentences
from reproduction.chinese_word_segment.process.span_converter import *
from reproduction.chinese_word_segment.io import NaiveCWSReader


tr_filename = ''
dev_filename = ''

reader = NaiveCWSReader()

tr_dataset = reader.load(tr_filename, cut_long_sent=True)
dev_dataset = reader.load(dev_filename)

# TODO how to assemble these into a DataSet
def construct_dataset(sentences):
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)

    return dataset

tr_dataset = construct_dataset(tr_sentences)   # tr_sentences / dev_sentence are placeholders, undefined in this draft
dev_dataset = construct_dataset(dev_sentence)

# 1. Prepare the processors
fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')

sp_proc = SpeicalSpanProcessor('raw_sentence', 'sentence')
sp_proc.add_span_converter(AlphaSpanConverter())
sp_proc.add_span_converter(DigitSpanConverter())

char_proc = CWSCharSegProcessor('sentence', 'char_list')

tag_proc = CWSSegAppTagProcessor('sentence', 'tag')

bigram_proc = Pre2Post2BigramProcessor('char_list', 'bigram_list')

char_vocab_proc = VocabProcessor('char_list')
bigram_vocab_proc = VocabProcessor('bigram_list')

# 2. Apply the processors
fs2hs_proc(tr_dataset)

sp_proc(tr_dataset)

char_proc(tr_dataset)
tag_proc(tr_dataset)
bigram_proc(tr_dataset)

char_vocab_proc(tr_dataset)
bigram_vocab_proc(tr_dataset)

char_index_proc = IndexProcessor(char_vocab_proc.get_vocab(), 'char_list')
bigram_index_proc = IndexProcessor(bigram_vocab_proc.get_vocab(), 'bigram_list')

char_index_proc(tr_dataset)
bigram_index_proc(tr_dataset)

# 2.1 Process dev_dataset
fs2hs_proc(dev_dataset)

sp_proc(dev_dataset)

char_proc(dev_dataset)
tag_proc(dev_dataset)
bigram_proc(dev_dataset)

char_index_proc(dev_dataset)
bigram_index_proc(dev_dataset)


# 3. The datasets are now ready for training
# TODO how are pretrained embeddings handled?


# 4. Assemble the parts that need to be saved
pp = Pipeline()
pp.add_processor(fs2hs_proc)
pp.add_processor(sp_proc)
pp.add_processor(char_proc)
pp.add_processor(bigram_proc)
pp.add_processor(char_index_proc)
pp.add_processor(bigram_index_proc)
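
A sketch of applying the assembled pipeline to new data. Whether Pipeline is
callable like a processor is an assumption about fastNLP's api.pipeline here:

    test_dataset = reader.load(dev_filename)
    pp(test_dataset)  # assumed: applies each added processor in order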