mirror of
https://gitee.com/fastnlp/fastNLP.git
synced 2024-12-03 20:57:37 +08:00
修改processor适配昨天的sao操作
This commit is contained in:
parent
ae0cc9a46b
commit
25a53ac5c9
@ -73,16 +73,16 @@ class FullSpaceToHalfSpaceProcessor(Processor):
|
||||
if char in self.convert_map:
|
||||
char = self.convert_map[char]
|
||||
new_sentence[idx] = char
|
||||
ins[self.field_name].text = ''.join(new_sentence)
|
||||
ins[self.field_name] = ''.join(new_sentence)
|
||||
return dataset
|
||||
|
||||
|
||||
class IndexerProcessor(Processor):
|
||||
def __init__(self, vocab, field_name):
|
||||
def __init__(self, vocab, field_name, new_added_field_name):
|
||||
|
||||
assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))
|
||||
|
||||
super(IndexerProcessor, self).__init__(field_name, None)
|
||||
super(IndexerProcessor, self).__init__(field_name, new_added_field_name)
|
||||
self.vocab = vocab
|
||||
|
||||
def set_vocab(self, vocab):
|
||||
@ -93,9 +93,9 @@ class IndexerProcessor(Processor):
|
||||
def process(self, dataset):
|
||||
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
|
||||
for ins in dataset:
|
||||
tokens = ins[self.field_name].content
|
||||
tokens = ins[self.field_name]
|
||||
index = [self.vocab.to_index(token) for token in tokens]
|
||||
ins[self.field_name]._index = index
|
||||
ins[self.new_added_field_name] = index
|
||||
|
||||
return dataset
|
||||
|
||||
@ -110,7 +110,7 @@ class VocabProcessor(Processor):
|
||||
for dataset in datasets:
|
||||
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
|
||||
for ins in dataset:
|
||||
tokens = ins[self.field_name].content
|
||||
tokens = ins[self.field_name]
|
||||
self.vocab.update(tokens)
|
||||
|
||||
def get_vocab(self):
|
||||
|
@ -5,9 +5,8 @@ import re
|
||||
from fastNLP.core.field import SeqLabelField
|
||||
from fastNLP.core.vocabulary import Vocabulary
|
||||
from fastNLP.core.dataset import DataSet
|
||||
|
||||
from fastNLP.api.processor import Processor
|
||||
from reproduction.chinese_word_segment.process.span_converter import *
|
||||
from reproduction.chinese_word_segment.process.span_converter import SpanConverter
|
||||
|
||||
_SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'
|
||||
|
||||
@ -25,11 +24,7 @@ class SpeicalSpanProcessor(Processor):
|
||||
sentence = ins[self.field_name].text
|
||||
for span_converter in self.span_converters:
|
||||
sentence = span_converter.find_certain_span_and_replace(sentence)
|
||||
if self.new_added_field_name!=self.field_name:
|
||||
new_text_field = TextField(sentence, is_target=False)
|
||||
ins[self.new_added_field_name] = new_text_field
|
||||
else:
|
||||
ins[self.field_name].text = sentence
|
||||
ins[self.new_added_field_name] = sentence
|
||||
|
||||
return dataset
|
||||
|
||||
|
@ -1,13 +1,12 @@
|
||||
|
||||
from fastNLP.core.instance import Instance
|
||||
from fastNLP.core.dataset import DataSet
|
||||
|
||||
|
||||
from fastNLP.api.pipeline import Pipeline
|
||||
from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor
|
||||
|
||||
from reproduction.chinese_word_segment.process.cws_processor import *
|
||||
from reproduction.chinese_word_segment.utils import cut_long_training_sentences
|
||||
from reproduction.chinese_word_segment.process.span_converter import *
|
||||
from reproduction.chinese_word_segment.io import NaiveCWSReader
|
||||
from reproduction.chinese_word_segment.process.span_converter import AlphaSpanConverter, DigitSpanConverter
|
||||
from reproduction.chinese_word_segment.io.cws_reader import NaiveCWSReader
|
||||
|
||||
|
||||
tr_filename = ''
|
||||
@ -15,9 +14,8 @@ dev_filename = ''
|
||||
|
||||
reader = NaiveCWSReader()
|
||||
|
||||
tr_dataset = reader.load(tr_filename, cut=True)
|
||||
de_dataset = reader.load(dev_filename)
|
||||
|
||||
tr_sentences = reader.load(tr_filename, cut_long_sent=True)
|
||||
dev_sentences = reader.load(dev_filename)
|
||||
|
||||
|
||||
# TODO 如何组建成为一个Dataset
|
||||
@ -32,7 +30,7 @@ def construct_dataset(sentences):
|
||||
|
||||
|
||||
tr_dataset = construct_dataset(tr_sentences)
|
||||
dev_dataset = construct_dataset(dev_sentence)
|
||||
dev_dataset = construct_dataset(dev_sentences)
|
||||
|
||||
# 1. 准备processor
|
||||
fs2hs_proc = FullSpaceToHalfSpaceProcessor('raw_sentence')
|
||||
|
Loading…
Reference in New Issue
Block a user