* fix processor.py

* add code comments
* merge *_saver.py & *_loader.py in io/
* (legacy code) rename Loss to LossFromTorch
FengZiYjun 2018-12-06 19:28:27 +08:00
parent 306eee9690
commit 27e9453d19
22 changed files with 349 additions and 386 deletions

View File

@@ -1,5 +1,3 @@
import torch
import hashlib
import os
import re
@@ -7,6 +5,8 @@ import shutil
import sys
import tempfile
import torch
try:
from requests.utils import urlparse
from requests import get as urlopen
@@ -132,7 +132,3 @@ if tqdm is None:
sys.stderr.write('\n')
if __name__ == '__main__':
pipeline = load_url('http://10.141.208.102:5000/file/download/infer_context-4e86fd93.pkl', model_dir='.')
print(type(pipeline))

View File

@@ -1,14 +1,15 @@
import torch
from collections import defaultdict
import re
from collections import defaultdict
import torch
from fastNLP.core.dataset import DataSet
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.batch import Batch
from fastNLP.core.dataset import DataSet
from fastNLP.core.sampler import SequentialSampler
from fastNLP.core.vocabulary import Vocabulary
class Processor:
class Processor(object):
def __init__(self, field_name, new_added_field_name):
self.field_name = field_name
if new_added_field_name is None:
@@ -17,7 +18,7 @@ class Processor:
self.new_added_field_name = new_added_field_name
def process(self, *args, **kwargs):
pass
raise NotImplementedError
def __call__(self, *args, **kwargs):
return self.process(*args, **kwargs)
@@ -132,13 +133,14 @@ class Num2TagProcessor(Processor):
class IndexerProcessor(Processor):
def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False):
def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False, is_input=True):
assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))
super(IndexerProcessor, self).__init__(field_name, new_added_field_name)
self.vocab = vocab
self.delete_old_field = delete_old_field
self.is_input = is_input
def set_vocab(self, vocab):
assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))
@@ -146,13 +148,14 @@ class IndexerProcessor(Processor):
self.vocab = vocab
def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
for ins in dataset:
tokens = ins[self.field_name]
index = [self.vocab.to_index(token) for token in tokens]
ins[self.new_added_field_name] = index
dataset._set_need_tensor(**{self.new_added_field_name: True})
if self.is_input:
dataset.set_input(self.new_added_field_name)
if self.delete_old_field:
dataset.delete_field(self.field_name)
@@ -161,6 +164,9 @@ class IndexerProcessor(Processor):
class VocabProcessor(Processor):
"""Build vocabulary with a field in the data set.
"""
def __init__(self, field_name):
super(VocabProcessor, self).__init__(field_name, None)
self.vocab = Vocabulary()
@@ -178,17 +184,20 @@ class VocabProcessor(Processor):
class SeqLenProcessor(Processor):
def __init__(self, field_name, new_added_field_name='seq_lens'):
def __init__(self, field_name, new_added_field_name='seq_lens', is_input=True):
super(SeqLenProcessor, self).__init__(field_name, new_added_field_name)
self.is_input = is_input
def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
for ins in dataset:
length = len(ins[self.field_name])
ins[self.new_added_field_name] = length
dataset._set_need_tensor(**{self.new_added_field_name: True})
if self.is_input:
dataset.set_input(self.new_added_field_name)
return dataset
class ModelProcessor(Processor):
def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32):
"""
@@ -238,6 +247,7 @@ class ModelProcessor(Processor):
device = torch.device(device)
self.model.to(device)
class Index2WordProcessor(Processor):
def __init__(self, vocab, field_name, new_added_field_name):
super(Index2WordProcessor, self).__init__(field_name, new_added_field_name)
@@ -251,6 +261,7 @@ class Index2WordProcessor(Processor):
class SetTensorProcessor(Processor):
# TODO: remove it. It is strange.
def __init__(self, field_dict, default=False):
super(SetTensorProcessor, self).__init__(None, None)
self.field_dict = field_dict
@@ -264,6 +275,7 @@ class SetTensorProcessor(Processor):
class SetIsTargetProcessor(Processor):
# TODO: remove it.
def __init__(self, field_dict, default=False):
super(SetIsTargetProcessor, self).__init__(None, None)
self.field_dict = field_dict
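
Taken together, the processor changes above mean that IndexerProcessor and SeqLenProcessor now register their output fields as model input by default (is_input=True). A minimal usage sketch; the toy data, field names, and the Vocabulary.update call are illustrative assumptions, not part of this commit:

from fastNLP.api.processor import IndexerProcessor, SeqLenProcessor
from fastNLP.core.dataset import DataSet
from fastNLP.core.vocabulary import Vocabulary

# DataSet accepts a dict of field name -> list of values (see dataset.py below).
ds = DataSet({"words": [["hello", "world"], ["hi", "world"]]})

vocab = Vocabulary()
vocab.update(["hello", "world", "hi"])  # assumed Vocabulary API

# __call__ delegates to process(); "word_ids" is added and, because
# is_input defaults to True, registered via dataset.set_input().
ds = IndexerProcessor(vocab, "words", "word_ids")(ds)

# SeqLenProcessor likewise adds "seq_lens" and marks it as input.
ds = SeqLenProcessor("words")(ds)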

View File

@@ -2,7 +2,7 @@ from .batch import Batch
from .dataset import DataSet
from .fieldarray import FieldArray
from .instance import Instance
from .losses import Loss
from .losses import LossFromTorch
from .optimizer import Optimizer
from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler
from .tester import Tester
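
For downstream code, the rename is import-only; a one-line sketch:

from fastNLP.core import LossFromTorch  # previously: from fastNLP.core import Loss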

View File

@@ -9,32 +9,20 @@ from fastNLP.core.utils import get_func_signature
_READERS = {}
def construct_dataset(sentences):
"""Construct a data set from a list of sentences.
:param sentences: list of list of str
:return dataset: a DataSet object
"""
dataset = DataSet()
for sentence in sentences:
instance = Instance()
instance['raw_sentence'] = sentence
dataset.append(instance)
return dataset
class DataSet(object):
"""DataSet is the collection of examples.
DataSet provides instance-level interface. You can append and access an instance of the DataSet.
However, it stores data in a different way: Field-first, Instance-second.
"""
def __init__(self, data=None):
"""
:param data: a dict or a list. If it is a dict, the key is the name of a field and the value is the field.
All values must be of the same length.
If it is a list, it must be a list of Instance objects.
:param data: a dict or a list.
If `data` is a dict, the key is the name of a FieldArray and the value is the FieldArray. All values
must be of the same length.
If `data` is a list, it must be a list of Instance objects.
"""
self.field_arrays = {}
if data is not None:
@@ -60,6 +48,7 @@ class DataSet(object):
def iter_func():
for idx in range(len(self)):
yield self[idx]
return iter_func()
def _inner_iter(self):
@@ -69,7 +58,8 @@ class DataSet(object):
self.idx = idx
def __getitem__(self, item):
assert item in self.dataset.field_arrays, "no such field:{} in Instance {}".format(item, self.dataset[self.idx])
assert item in self.dataset.field_arrays, "no such field:{} in Instance {}".format(item, self.dataset[
self.idx])
assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx)
return self.dataset.field_arrays[item][self.idx]
@@ -79,6 +69,7 @@ class DataSet(object):
def inner_iter_func():
for idx in range(len(self)):
yield Iter_ptr(self, idx)
return inner_iter_func()
def __getitem__(self, idx):
@@ -217,9 +208,17 @@ class DataSet(object):
raise KeyError("{} is not a valid field name.".format(name))
def get_input_name(self):
"""Get all field names with `is_input` as True.
:return list field_names: a list of str
"""
return [name for name, field in self.field_arrays.items() if field.is_input]
def get_target_name(self):
"""Get all field names with `is_target` as True.
:return list field_names: a list of str
"""
return [name for name, field in self.field_arrays.items() if field.is_target]
@classmethod
@@ -243,7 +242,7 @@ class DataSet(object):
:return results: if new_field_name is not passed, returned values of the function over all instances.
"""
results = [func(ins) for ins in self._inner_iter()]
if len(list(filter(lambda x: x is not None, results)))==0: # all None
if len(list(filter(lambda x: x is not None, results))) == 0: # all None
raise ValueError("{} always return None.".format(get_func_signature(func=func)))
extra_param = {}
@@ -269,6 +268,12 @@ class DataSet(object):
return results
def drop(self, func):
"""Drop instances if a condition holds.
:param func: a function that takes an Instance object as input, and returns bool.
The instance will be dropped if the function returns True.
"""
results = [ins for ins in self._inner_iter() if not func(ins)]
for name, old_field in self.field_arrays.items():
self.field_arrays[name].content = [ins[name] for ins in results]
@@ -338,10 +343,33 @@ class DataSet(object):
return cls(_dict)
def save(self, path):
"""Save the DataSet object as pickle.
:param str path: the path to the pickle
"""
with open(path, 'wb') as f:
pickle.dump(self, f)
@staticmethod
def load(path):
"""Load a DataSet object from pickle.
:param str path: the path to the pickle
:return DataSet data_set:
"""
with open(path, 'rb') as f:
return pickle.load(f)
def construct_dataset(sentences):
"""Construct a data set from a list of sentences.
:param sentences: list of list of str
:return dataset: a DataSet object
"""
dataset = DataSet()
for sentence in sentences:
instance = Instance()
instance['raw_sentence'] = sentence
dataset.append(instance)
return dataset
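
A short round trip over the constructor documented above and the new save()/load() helpers; the file path and toy fields are illustrative:

from fastNLP.core.dataset import DataSet

# Dict form: each key names a field, and all value lists share one length.
ds = DataSet({"raw_sentence": ["hello world", "hi there"], "label": [0, 1]})

# drop() removes every instance for which the predicate returns True.
ds.drop(lambda ins: ins["label"] == 1)

ds.save("demo.pkl")                  # pickles the whole DataSet
restored = DataSet.load("demo.pkl")  # staticmethod; returns a DataSet
assert len(restored) == 1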

View File

@@ -7,14 +7,13 @@ import torch.nn.functional as F
from fastNLP.core.utils import CheckError
from fastNLP.core.utils import CheckRes
from fastNLP.core.utils import _build_args
from fastNLP.core.utils import _check_function_or_method
from fastNLP.core.utils import _check_arg_dict_list
from fastNLP.core.utils import _check_function_or_method
from fastNLP.core.utils import get_func_signature
class LossBase(object):
def __init__(self):
# key: name in target function; value: name in output function
self.param_map = {}
self._checked = False
@@ -159,8 +158,18 @@ class LossBase(object):
return loss
class LossFunc(LossBase):
"""A wrapper of user-provided loss function.
"""
def __init__(self, func, key_map=None, **kwargs):
"""
:param func: a callable object, such as a function.
:param dict key_map:
:param kwargs:
"""
super(LossFunc, self).__init__()
_check_function_or_method(func)
if key_map is not None:
@@ -254,19 +263,19 @@ def _prepare_losser(losser):
def squash(predict, truth, **kwargs):
'''To reshape tensors in order to fit Loss functions in pytorch
"""To reshape tensors in order to fit loss functions in pytorch
:param predict : Tensor, model output
:param truth : Tensor, truth from dataset
:param **kwargs : extra arguments
:return predict , truth: predict & truth after processing
'''
"""
return predict.view(-1, predict.size()[-1]), truth.view(-1, )
def unpad(predict, truth, **kwargs):
'''To process padded sequence output to get true loss
"""To process padded sequence output to get true loss
Using pack_padded_sequence() method
This method contains squash()
@@ -277,7 +286,7 @@ def unpad(predict, truth, **kwargs):
the i-th element is true lengths of i-th sequence
:return predict , truth: predict & truth after processing
'''
"""
if kwargs.get("lens") is None:
return predict, truth
lens = torch.LongTensor(kwargs["lens"])
@@ -288,7 +297,7 @@ def unpad_mask(predict, truth, **kwargs):
def unpad_mask(predict, truth, **kwargs):
'''To process padded sequence output to get true loss
"""To process padded sequence output to get true loss
Using mask() method
This method contains squash()
@@ -299,7 +308,7 @@ def unpad_mask(predict, truth, **kwargs):
the i-th element is true lengths of i-th sequence
:return predict , truth: predict & truth after processing
'''
"""
if kwargs.get("lens") is None:
return predict, truth
mas = make_mask(kwargs["lens"], truth.size()[1])
@@ -307,7 +316,7 @@ def mask(predict, truth, **kwargs):
def mask(predict, truth, **kwargs):
'''To select specific elements from Tensor
"""To select specific elements from Tensor
This method contains squash()
:param predict : Tensor, [batch_size , max_len , tag_size]
@@ -317,7 +326,7 @@ def mask(predict, truth, **kwargs):
the mask Tensor , the position that is 1 will be selected
:return predict , truth: predict & truth after processing
'''
"""
if kwargs.get("mask") is None:
return predict, truth
mask = kwargs["mask"]
@@ -332,14 +341,14 @@ def mask(predict, truth, **kwargs):
def make_mask(lens, tar_len):
'''to generate a mask that select [:lens[i]] for i-th element
"""to generate a mask that select [:lens[i]] for i-th element
embezzle from fastNLP.models.sequence_modeling.seq_mask
:param lens : list or LongTensor, [batch_size]
:param tar_len : int
:return mask : ByteTensor
'''
"""
lens = torch.LongTensor(lens)
mask = [torch.ge(lens, i + 1) for i in range(tar_len)]
mask = torch.stack(mask, 1)
@@ -376,9 +385,11 @@ loss_function_name = {
}
class Loss(object):
"""a Loss object is a callable object represents loss functions
class LossFromTorch(object):
"""a LossFromTorch object is a callable object represents loss functions
This class only helps you with loss functions from PyTorch.
It has nothing to do with Trainer.
"""
def __init__(self, loss_name, pre_pro=[squash], **kwargs):
@@ -408,11 +419,11 @@ class Loss(object):
self.pre_pro = [f if callable(f) else method_dict.get(f) for f in pre_pro]
def add_pre_pro(self, func):
'''add a pre_pro function
"""add a pre_pro function
:param func: a function or str, methods to reform parameters before calculating loss
the strings will be auto translated to pre-defined functions
'''
"""
if not callable(func):
func = method_dict.get(func)
if func is None:
@@ -421,12 +432,12 @@ class Loss(object):
@staticmethod
def _get_loss(loss_name, **kwargs):
'''Get loss function from torch
"""Get loss function from torch
:param loss_name: str, the name of loss function
:param **kwargs: kwargs for torch loss function
:return: A callable loss function object
'''
"""
loss_name = loss_name.strip().lower()
loss_name = "".join(loss_name.split("_"))
@@ -435,19 +446,19 @@ class Loss(object):
return loss_function_name[loss_name](**kwargs)
def get(self):
'''This method exists just for make some existing codes run error-freely
'''
"""This method exists just for make some existing codes run error-freely
"""
return self
def __call__(self, predict, truth, **kwargs):
'''call a loss function
"""Call a loss function
predict and truth will be processed by pre_pro methods in order of addition
:param predict : Tensor, model output
:param truth : Tensor, truth from dataset
:param **kwargs : extra arguments, pass to pre_pro functions
for example, if used unpad_mask() in pre_pro, there should be a kwarg named lens
'''
"""
for f in self.pre_pro:
if f is None:
continue
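
The renamed class keeps its old call semantics, as the updated tests near the end of this commit show. A sketch of the "nll" + "unpad" combination; the tensor shapes and lengths are illustrative:

import torch
import torch.nn.functional as F
from fastNLP.core.losses import LossFromTorch

# Wraps torch.nn.NLLLoss; "unpad" relies on pack_padded_sequence,
# so a `lens` kwarg (sorted descending) is required at call time.
loss_func = LossFromTorch("nll", pre_pro=["unpad"])

predict = F.log_softmax(torch.randn(3, 4, 5), dim=-1)  # [batch, max_len, n_classes]
truth = torch.randint(0, 5, (3, 4))                    # [batch, max_len]
los = loss_func(predict, truth, lens=[4, 2, 1])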

View File

@@ -308,6 +308,13 @@ def _prepare_metrics(metrics):
return _metrics
"""
Attention: the code below is not used in the current fastNLP.
However, it may still be useful.
"""
def _conver_numpy(x):
"""convert input data to numpy array

View File

@@ -11,6 +11,12 @@ class Optimizer(object):
class SGD(Optimizer):
def __init__(self, model_params=None, lr=0.01, momentum=0):
"""
:param model_params: a generator. E.g. model.parameters() for PyTorch models.
:param float lr: learning rate. Default: 0.01
:param float momentum: momentum. Default: 0
"""
super(SGD, self).__init__(model_params, lr=lr, momentum=momentum)
def construct_from_pytorch(self, model_params):
@@ -23,6 +29,12 @@ class SGD(Optimizer):
class Adam(Optimizer):
def __init__(self, model_params=None, lr=0.01, weight_decay=0):
"""
:param model_params: a generator. E.g. model.parameters() for PyTorch models.
:param float lr: learning rate
:param float weight_decay:
"""
super(Adam, self).__init__(model_params, lr=lr, weight_decay=weight_decay)
def construct_from_pytorch(self, model_params):
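
With the defaults documented above, construction looks as follows; the Linear model is illustrative, and construct_from_pytorch is assumed to return the underlying torch.optim object, matching how a trainer would consume it:

import torch
from fastNLP.core.optimizer import SGD, Adam

model = torch.nn.Linear(4, 2)

# Bind parameters up front...
opt = SGD(model_params=model.parameters(), lr=0.01, momentum=0.9)

# ...or construct first and hand the parameters over later.
deferred = Adam(lr=1e-3, weight_decay=1e-4)
torch_opt = deferred.construct_from_pytorch(model.parameters())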

View File

@@ -140,7 +140,6 @@ class Trainer(object):
def train(self):
"""Start Training.
:return:
"""
try:
if torch.cuda.is_available() and self.use_cuda:
@@ -216,14 +215,6 @@ class Trainer(object):
pbar.close()
def _print_train(self):
"""
:param data_iterator:
:param model:
:param epoch:
:param start:
:return:
"""
epoch = 1
start = time.time()
while epoch <= self.n_epochs:

View File

@@ -29,19 +29,3 @@ class BaseLoader(object):
with open(cache_path, 'wb') as f:
pickle.dump(obj, f)
return obj
class ToyLoader0(BaseLoader):
"""
For CharLM
"""
def __init__(self, data_path):
super(ToyLoader0, self).__init__(data_path)
def load(self):
with open(self.data_path, 'r') as f:
corpus = f.read().lower()
import re
corpus = re.sub(r"<unk>", "unk", corpus)
return corpus.split()

View File

@@ -1,6 +1,152 @@
import configparser
import json
import os
from fastNLP.io.config_loader import ConfigSection, ConfigLoader
from fastNLP.io.base_loader import BaseLoader
class ConfigLoader(BaseLoader):
"""loader for configuration files"""
def __init__(self, data_path=None):
super(ConfigLoader, self).__init__()
if data_path is not None:
self.config = self.parse(super(ConfigLoader, self).load(data_path))
@staticmethod
def parse(string):
raise NotImplementedError
@staticmethod
def load_config(file_path, sections):
"""
:param file_path: the path of config file
:param sections: the dict of {section_name(string): Section instance}
Example:
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
:return: nothing, but the values of the attributes are saved into the given sections
"""
assert isinstance(sections, dict)
cfg = configparser.ConfigParser()
if not os.path.exists(file_path):
raise FileNotFoundError("config file {} not found. ".format(file_path))
cfg.read(file_path)
for s in sections:
attr_list = [i for i in sections[s].__dict__.keys() if
not callable(getattr(sections[s], i)) and not i.startswith("__")]
if s not in cfg:
print('section %s not found in config file' % (s))
continue
gen_sec = cfg[s]
for attr in gen_sec.keys():
try:
val = json.loads(gen_sec[attr])
# print(s, attr, val, type(val))
if attr in attr_list:
assert type(val) == type(getattr(sections[s], attr)), \
'type not match, expected %s but got %s' % \
(type(getattr(sections[s], attr)), type(val))
"""
if attr in attr_list then check its type and
update its value.
else add a new attr in sections[s]
"""
setattr(sections[s], attr, val)
except Exception as e:
print("cannot load attribute %s in section %s"
% (attr, s))
pass
class ConfigSection(object):
def __init__(self):
pass
def __getitem__(self, key):
"""
:param key: str, the name of the attribute
:return attr: the value of this attribute
if key not in self.__dict__.keys():
return self[key]
else:
raise AttributeError
"""
if key in self.__dict__.keys():
return getattr(self, key)
raise AttributeError("do NOT have attribute %s" % key)
def __setitem__(self, key, value):
"""
:param key: str, the name of the attribute
:param value: the value of this attribute
if key not in self.__dict__.keys():
self[key] will be added
else:
self[key] will be updated
"""
if key in self.__dict__.keys():
if not isinstance(value, type(getattr(self, key))):
raise AttributeError("attr %s except %s but got %s" %
(key, str(type(getattr(self, key))), str(type(value))))
setattr(self, key, value)
def __contains__(self, item):
"""
:param item: The key of item.
:return: True if the key in self.__dict__.keys() else False.
"""
return item in self.__dict__.keys()
def __eq__(self, other):
"""Overwrite the == operator
:param other: another ConfigSection() object to compare with.
:return: True if value of each key in each ConfigSection() object are equal to the other, else False.
"""
for k in self.__dict__.keys():
if k not in other.__dict__.keys():
return False
if getattr(self, k) != getattr(other, k):
return False
for k in other.__dict__.keys():
if k not in self.__dict__.keys():
return False
if getattr(self, k) != getattr(other, k):
return False
return True
def __ne__(self, other):
"""Overwrite the != operator
:param other:
:return:
"""
return not self.__eq__(other)
@property
def data(self):
return self.__dict__
if __name__ == "__main__":
config = ConfigLoader('there is no data')
section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()}
"""
General and My can be found in config file, so the attr and
value will be updated
A cannot be found in config file, so nothing will be done
"""
config.load_config("../../test/data_for_tests/config", section)
for s in section:
print(s)
for attr in section[s].__dict__.keys():
print(s, attr, getattr(section[s], attr), type(getattr(section[s], attr)))
class ConfigSaver(object):
@@ -125,7 +271,7 @@ class ConfigSaver(object):
# logger = create_logger(__name__, "./config_loader.log")
# logger.warning("section [%s] in config file [%s] has been changed" % (
# section_name, self.file_path
#))
# ))
change_file = True
break
if not change_file:
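
Usage is unchanged by the merge; only the module path is new. A sketch, assuming a config.cfg with a [POS_test] section:

from fastNLP.io.config_io import ConfigLoader, ConfigSection

test_args = ConfigSection()
# load_config fills test_args in place from the [POS_test] section.
ConfigLoader().load_config("config.cfg", {"POS_test": test_args})
if "epochs" in test_args:       # __contains__ checks the parsed keys
    print(test_args["epochs"])  # __getitem__ returns the parsed value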

View File

@@ -1,149 +0,0 @@
import configparser
import json
import os
from fastNLP.io.base_loader import BaseLoader
class ConfigLoader(BaseLoader):
"""loader for configuration files"""
def __init__(self, data_path=None):
super(ConfigLoader, self).__init__()
if data_path is not None:
self.config = self.parse(super(ConfigLoader, self).load(data_path))
@staticmethod
def parse(string):
raise NotImplementedError
@staticmethod
def load_config(file_path, sections):
"""
:param file_path: the path of config file
:param sections: the dict of {section_name(string): Section instance}
Example:
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
:return: return nothing, but the value of attributes are saved in sessions
"""
assert isinstance(sections, dict)
cfg = configparser.ConfigParser()
if not os.path.exists(file_path):
raise FileNotFoundError("config file {} not found. ".format(file_path))
cfg.read(file_path)
for s in sections:
attr_list = [i for i in sections[s].__dict__.keys() if
not callable(getattr(sections[s], i)) and not i.startswith("__")]
if s not in cfg:
print('section %s not found in config file' % (s))
continue
gen_sec = cfg[s]
for attr in gen_sec.keys():
try:
val = json.loads(gen_sec[attr])
# print(s, attr, val, type(val))
if attr in attr_list:
assert type(val) == type(getattr(sections[s], attr)), \
'type not match, except %s but got %s' % \
(type(getattr(sections[s], attr)), type(val))
"""
if attr in attr_list then check its type and
update its value.
else add a new attr in sections[s]
"""
setattr(sections[s], attr, val)
except Exception as e:
print("cannot load attribute %s in section %s"
% (attr, s))
pass
class ConfigSection(object):
def __init__(self):
pass
def __getitem__(self, key):
"""
:param key: str, the name of the attribute
:return attr: the value of this attribute
if key not in self.__dict__.keys():
return self[key]
else:
raise AttributeError
"""
if key in self.__dict__.keys():
return getattr(self, key)
raise AttributeError("do NOT have attribute %s" % key)
def __setitem__(self, key, value):
"""
:param key: str, the name of the attribute
:param value: the value of this attribute
if key not in self.__dict__.keys():
self[key] will be added
else:
self[key] will be updated
"""
if key in self.__dict__.keys():
if not isinstance(value, type(getattr(self, key))):
raise AttributeError("attr %s except %s but got %s" %
(key, str(type(getattr(self, key))), str(type(value))))
setattr(self, key, value)
def __contains__(self, item):
"""
:param item: The key of item.
:return: True if the key in self.__dict__.keys() else False.
"""
return item in self.__dict__.keys()
def __eq__(self, other):
"""Overwrite the == operator
:param other: Another ConfigSection() object which to be compared.
:return: True if value of each key in each ConfigSection() object are equal to the other, else False.
"""
for k in self.__dict__.keys():
if k not in other.__dict__.keys():
return False
if getattr(self, k) != getattr(self, k):
return False
for k in other.__dict__.keys():
if k not in self.__dict__.keys():
return False
if getattr(self, k) != getattr(self, k):
return False
return True
def __ne__(self, other):
"""Overwrite the != operator
:param other:
:return:
"""
return not self.__eq__(other)
@property
def data(self):
return self.__dict__
if __name__ == "__main__":
config = ConfigLoader('there is no data')
section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()}
"""
General and My can be found in config file, so the attr and
value will be updated
A cannot be found in config file, so nothing will be done
"""
config.load_config("../../test/data_for_tests/config", section)
for s in section:
print(s)
for attr in section[s].__dict__.keys():
print(s, attr, getattr(section[s], attr), type(getattr(section[s], attr)))

View File

@@ -1,4 +1,3 @@
#TODO: need fix for current DataSet
import os
from fastNLP.core.dataset import DataSet
@@ -20,8 +19,7 @@ def convert_seq_dataset(data):
"""
dataset = DataSet()
for word_seq in data:
x = TextField(word_seq, is_target=False)
dataset.append(Instance(word_seq=x))
dataset.append(Instance(word_seq=word_seq))
return dataset
@@ -40,11 +38,7 @@ def convert_seq2tag_dataset(data):
"""
dataset = DataSet()
for sample in data:
word_seq, label = sample[0], sample[1]
ins = Instance()
ins.add_field("word_seq", TextField(word_seq, is_target=False)) \
.add_field("label", LabelField(label, is_target=True))
dataset.append(ins)
dataset.append(Instance(word_seq=sample[0], label=sample[1]))
return dataset
@@ -63,11 +57,7 @@ def convert_seq2seq_dataset(data):
"""
dataset = DataSet()
for sample in data:
word_seq, label_seq = sample[0], sample[1]
ins = Instance()
ins.add_field("word_seq", TextField(word_seq, is_target=False)) \
.add_field("label_seq", TextField(label_seq, is_target=True))
dataset.append(ins)
dataset.append(Instance(word_seq=sample[0], label_seq=sample[1]))
return dataset
@@ -273,85 +263,6 @@ class ClassDataSetLoader(DataSetLoader):
return convert_seq2tag_dataset(data)
@DataSet.set_reader('read_conll')
class ConllLoader(DataSetLoader):
"""loader for conll format files"""
def __init__(self):
"""
:param str data_path: the path to the conll data set
"""
super(ConllLoader, self).__init__()
def load(self, data_path):
"""
:return: list lines: all lines in a conll file
"""
with open(data_path, "r", encoding="utf-8") as f:
lines = f.readlines()
data = self.parse(lines)
return self.convert(data)
@staticmethod
def parse(lines):
"""
:param list lines:a list containing all lines in a conll file.
:return: a 3D list
"""
sentences = list()
tokens = list()
for line in lines:
if line[0] == "#":
# skip the comments
continue
if line == "\n":
sentences.append(tokens)
tokens = []
continue
tokens.append(line.split())
return sentences
def convert(self, data):
pass
@DataSet.set_reader('read_lm')
class LMDataSetLoader(DataSetLoader):
"""Language Model Dataset Loader
This loader produces data for language model training in a supervised way.
That means it has X and Y.
"""
def __init__(self):
super(LMDataSetLoader, self).__init__()
def load(self, data_path):
if not os.path.exists(data_path):
raise FileNotFoundError("file {} not found.".format(data_path))
with open(data_path, "r", encoding="utf=8") as f:
text = " ".join(f.readlines())
tokens = text.strip().split()
data = self.sentence_cut(tokens)
return self.convert(data)
def sentence_cut(self, tokens, sentence_length=15):
start_idx = 0
data_set = []
for idx in range(len(tokens) // sentence_length):
x = tokens[start_idx * idx: start_idx * idx + sentence_length]
y = tokens[start_idx * idx + 1: start_idx * idx + sentence_length + 1]
if start_idx * idx + sentence_length + 1 >= len(tokens):
# ad hoc
y.extend(["<unk>"])
data_set.append([x, y])
return data_set
def convert(self, data):
pass
@DataSet.set_reader('read_people_daily')
class PeopleDailyCorpusLoader(DataSetLoader):
"""
@@ -403,10 +314,19 @@ class PeopleDailyCorpusLoader(DataSetLoader):
pos_tag_examples.append([sent_words, sent_pos_tag])
ner_examples.append([sent_words, sent_ner])
# List[List[List[str], List[str]]]
return pos_tag_examples, ner_examples
# ner_examples not used
return self.convert(pos_tag_examples)
def convert(self, data):
pass
data_set = DataSet()
for item in data:
sent_words, sent_pos_tag = item[0], item[1]
data_set.append(Instance(words=sent_words, tags=sent_pos_tag))
data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
data_set.set_target("tags")
data_set.set_input("sent_words")
data_set.set_input("seq_len")
return data_set
class SNLIDataSetLoader(DataSetLoader):
@@ -462,17 +382,13 @@ class SNLIDataSetLoader(DataSetLoader):
for example in data:
p, h, l = example
# list, list, str
x1 = TextField(p, is_target=False)
x2 = TextField(h, is_target=False)
x1_len = TextField([1] * len(p), is_target=False)
x2_len = TextField([1] * len(h), is_target=False)
y = LabelField(l, is_target=True)
instance = Instance()
instance.add_field("premise", x1)
instance.add_field("hypothesis", x2)
instance.add_field("premise_len", x1_len)
instance.add_field("hypothesis_len", x2_len)
instance.add_field("truth", y)
instance.add_field("premise", p)
instance.add_field("hypothesis", h)
instance.add_field("truth", l)
data_set.append(instance)
data_set.apply(lambda ins: len(ins["premise"]), new_field_name="premise_len")
data_set.apply(lambda ins: len(ins["hypothesis"]), new_field_name="hypothesis_len")
data_set.set_input("premise", "hypothesis", "premise_len", "hypothesis_len")
data_set.set_target("truth")
return data_set
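
The loader rewrites above all follow one pattern: append plain Python values as Instance fields, derive lengths with apply(), then declare inputs and targets. A condensed sketch with illustrative tokens:

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance

data_set = DataSet()
data_set.append(Instance(premise=["a", "man", "sleeps"],
                         hypothesis=["he", "rests"],
                         truth="entailment"))
data_set.apply(lambda ins: len(ins["premise"]), new_field_name="premise_len")
data_set.set_input("premise", "hypothesis", "premise_len")
data_set.set_target("truth")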

View File

@@ -1,5 +1,32 @@
import torch
from fastNLP.io.base_loader import BaseLoader
class ModelLoader(BaseLoader):
"""
Loader for models.
"""
def __init__(self):
super(ModelLoader, self).__init__()
@staticmethod
def load_pytorch(empty_model, model_path):
"""
Load model parameters from .pkl files into the empty PyTorch model.
:param empty_model: a PyTorch model with initialized parameters.
:param model_path: str, the path to the saved model.
"""
empty_model.load_state_dict(torch.load(model_path))
@staticmethod
def load_pytorch_model(model_path):
"""Load the entire model.
"""
return torch.load(model_path)
class ModelSaver(object):
"""Save a model
@@ -8,6 +35,7 @@ class ModelSaver(object):
saver.save_pytorch(model)
"""
def __init__(self, save_path):
"""

View File

@@ -1,28 +0,0 @@
import torch
from fastNLP.io.base_loader import BaseLoader
class ModelLoader(BaseLoader):
"""
Loader for models.
"""
def __init__(self):
super(ModelLoader, self).__init__()
@staticmethod
def load_pytorch(empty_model, model_path):
"""
Load model parameters from .pkl files into the empty PyTorch model.
:param empty_model: a PyTorch model with initialized parameters.
:param model_path: str, the path to the saved model.
"""
empty_model.load_state_dict(torch.load(model_path))
@staticmethod
def load_pytorch_model(model_path):
"""Load the entire model.
"""
return torch.load(model_path)

View File

@@ -5,7 +5,7 @@ sys.path.extend(['/home/yfshao/workdir/dev_fastnlp'])
from fastNLP.api.processor import *
from fastNLP.models.biaffine_parser import BiaffineParser
from fastNLP.io.config_loader import ConfigSection, ConfigLoader
from fastNLP.io.config_io import ConfigSection, ConfigLoader
import _pickle as pickle
import torch

View File

@@ -13,11 +13,10 @@ from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.dataset import DataSet
from fastNLP.core.field import TextField, SeqLabelField
from fastNLP.core.tester import Tester
from fastNLP.io.config_loader import ConfigLoader, ConfigSection
from fastNLP.io.model_loader import ModelLoader
from fastNLP.io.config_io import ConfigLoader, ConfigSection
from fastNLP.io.model_io import ModelLoader, ModelSaver
from fastNLP.io.embed_loader import EmbedLoader
from fastNLP.models.biaffine_parser import BiaffineParser
from fastNLP.io.model_saver import ModelSaver
BOS = '<BOS>'
EOS = '<EOS>'

View File

@@ -2,8 +2,8 @@ import torch.nn.functional as F
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.core.utils import ClassPreprocess as Preprocess
from fastNLP.io.config_loader import ConfigLoader
from fastNLP.io.config_loader import ConfigSection
from fastNLP.io.config_io import ConfigLoader
from fastNLP.io.config_io import ConfigSection
from fastNLP.io.dataset_loader import ClassDataSetLoader as Dataset_loader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules.aggregator.self_attention import SelfAttention

View File

@@ -3,12 +3,11 @@ import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from fastNLP.io.config_loader import ConfigLoader, ConfigSection
from fastNLP.io.config_io import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.io.dataset_loader import BaseLoader, TokenizeDataSetLoader
from fastNLP.core.utils import load_pickle
from fastNLP.io.model_saver import ModelSaver
from fastNLP.io.model_loader import ModelLoader
from fastNLP.io.model_io import ModelLoader, ModelSaver
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer

View File

@@ -12,12 +12,12 @@ with open('requirements.txt', encoding='utf-8') as f:
reqs = f.read()
setup(
name='fastNLP',
name='FastNLP',
version='0.1.1',
description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team',
long_description=readme,
license=license,
author='fudanNLP',
author='FudanNLP',
python_requires='>=3.5',
packages=find_packages(),
install_requires=reqs.strip().split('\n'),

View File

@@ -0,0 +1,12 @@
import unittest
from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor
from fastNLP.core.dataset import DataSet
class TestProcessor(unittest.TestCase):
def test_FullSpaceToHalfSpaceProcessor(self):
ds = DataSet({"word": ["0, u, u), (u, u2"]})
proc = FullSpaceToHalfSpaceProcessor("word")
ds = proc(ds)
self.assertEqual(ds.field_arrays["word"].content, ["00, u1, u), (u2, u2"])

View File

@@ -45,7 +45,7 @@ class TestLoss(unittest.TestCase):
# verify the correctness of squash()
log = math.log
loss_func = loss.Loss("nll")
loss_func = loss.LossFromTorch("nll")
y = tc.Tensor(
[
@@ -129,7 +129,7 @@ class TestLoss(unittest.TestCase):
lens = [4, 2, 1]
y = tc.log(y)
loss_func = loss.Loss("nll", pre_pro=["unpad"])
loss_func = loss.LossFromTorch("nll", pre_pro=["unpad"])
los = loss_func(y, gy, lens=lens)
r = -log(.1) - log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1)
@@ -169,7 +169,7 @@ class TestLoss(unittest.TestCase):
lens = [2, 4, 2]
loss_func = loss.Loss("nll", pre_pro=["mask"])
loss_func = loss.LossFromTorch("nll", pre_pro=["mask"])
los = loss_func(y, gy, mask=mask)
los2 = loss_func(y, gy, mask=loss.make_mask(lens, gy.size()[-1]))
@@ -205,7 +205,7 @@ class TestLoss(unittest.TestCase):
y = tc.log(y)
loss_func = loss.Loss("nll", pre_pro=["unpad_mask"])
loss_func = loss.LossFromTorch("nll", pre_pro=["unpad_mask"])
los = loss_func(y, gy, lens=lens)
r = -log(.1) - log(.3) - log(.5) - log(.5) - log(.3) - log(.7) - log(.1)
@@ -235,7 +235,7 @@ class TestLoss(unittest.TestCase):
lens = [4, 2, 1]
y = tc.log(y)
loss_func = loss.Loss("nll", pre_pro=[], weight=tc.Tensor([1, 1, 0]))
loss_func = loss.LossFromTorch("nll", pre_pro=[], weight=tc.Tensor([1, 1, 0]))
loss_func.add_pre_pro("unpad_mask")
los = loss_func(y, gy, lens=lens)

View File

@@ -1,8 +1,7 @@
import os
import unittest
from fastNLP.io.config_loader import ConfigSection, ConfigLoader
from fastNLP.io.config_saver import ConfigSaver
from fastNLP.io.config_io import ConfigSection, ConfigLoader, ConfigSaver
class TestConfigSaver(unittest.TestCase):