Merge pull request #8 from fastnlp/master

update
lyhuang18 2018-09-26 19:59:53 +08:00 committed by GitHub
commit 06b8065471
37 changed files with 713 additions and 931 deletions

View File

@ -5,7 +5,7 @@ from fastNLP.core.preprocess import ClassPreprocess
from fastNLP.core.trainer import ClassificationTrainer from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.dataset_loader import ClassDatasetLoader from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.models.base_model import BaseModel from fastNLP.models.base_model import BaseModel
from fastNLP.modules import aggregation from fastNLP.modules import aggregator
from fastNLP.modules import decoder from fastNLP.modules import decoder
from fastNLP.modules import encoder from fastNLP.modules import encoder
@ -21,7 +21,7 @@ class ClassificationModel(BaseModel):
self.emb = encoder.Embedding(nums=vocab_size, dims=300) self.emb = encoder.Embedding(nums=vocab_size, dims=300)
self.enc = encoder.Conv( self.enc = encoder.Conv(
in_channels=300, out_channels=100, kernel_size=3) in_channels=300, out_channels=100, kernel_size=3)
self.agg = aggregation.MaxPool() self.agg = aggregator.MaxPool()
self.dec = decoder.MLP(size_layer=[100, num_classes]) self.dec = decoder.MLP(size_layer=[100, num_classes])
def forward(self, x): def forward(self, x):

View File

@ -2,10 +2,6 @@ from collections import defaultdict
import torch import torch
from fastNLP.core.dataset import DataSet
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance
class Batch(object): class Batch(object):
"""Batch is an iterable object which iterates over mini-batches. """Batch is an iterable object which iterates over mini-batches.
@ -16,6 +12,14 @@ class Batch(object):
""" """
def __init__(self, dataset, batch_size, sampler, use_cuda): def __init__(self, dataset, batch_size, sampler, use_cuda):
"""
:param dataset: a DataSet object
:param batch_size: int, the size of the batch
:param sampler: a Sampler object
:param use_cuda: bool, whether to use GPU
"""
self.dataset = dataset self.dataset = dataset
self.batch_size = batch_size self.batch_size = batch_size
self.sampler = sampler self.sampler = sampler
@ -81,46 +85,3 @@ class Batch(object):
self.curidx += endidx self.curidx += endidx
return batch_x, batch_y return batch_x, batch_y
if __name__ == "__main__":
"""simple running example
"""
texts = ["i am a cat",
"this is a test of new batch",
"haha"
]
labels = [0, 1, 0]
# prepare vocabulary
vocab = {}
for text in texts:
for tokens in text.split():
if tokens not in vocab:
vocab[tokens] = len(vocab)
print("vocabulary: ", vocab)
# prepare input dataset
data = DataSet()
for text, label in zip(texts, labels):
x = TextField(text.split(), False)
y = LabelField(label, is_target=True)
ins = Instance(text=x, label=y)
data.append(ins)
# use vocabulary to index data
data.index_field("text", vocab)
# define naive sampler for batch class
class SeqSampler:
def __call__(self, dataset):
return list(range(len(dataset)))
# use batch to iterate dataset
data_iterator = Batch(data, 2, SeqSampler(), False)
for epoch in range(1):
for batch_x, batch_y in data_iterator:
print(batch_x)
print(batch_y)
# do stuff
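With this module-level demo removed from batch.py, an equivalent sketch using the sampler module introduced in this commit would look roughly as follows (it assumes the same DataSet/TextField/LabelField/Instance APIs that the deleted example relied on):

    from fastNLP.core.batch import Batch
    from fastNLP.core.dataset import DataSet
    from fastNLP.core.field import TextField, LabelField
    from fastNLP.core.instance import Instance
    from fastNLP.core.sampler import SequentialSampler

    vocab = {"<pad>": 0, "<unk>": 1, "i": 2, "am": 3, "a": 4, "cat": 5, "haha": 6}
    data = DataSet()
    for text, label in zip(["i am a cat", "haha"], [0, 1]):
        ins = Instance(text=TextField(text.split(), False),
                       label=LabelField(label, is_target=True))
        data.append(ins)
    data.index_field("text", vocab)  # map tokens to indices with the vocabulary

    # iterate over mini-batches of two examples on CPU
    for batch_x, batch_y in Batch(data, batch_size=2, sampler=SequentialSampler(), use_cuda=False):
        print(batch_x, batch_y)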

View File

@ -1,10 +1,10 @@
import numpy as np import numpy as np
import torch import torch
from fastNLP.core.action import SequentialSampler
from fastNLP.core.batch import Batch from fastNLP.core.batch import Batch
from fastNLP.core.dataset import create_dataset_from_lists from fastNLP.core.dataset import create_dataset_from_lists
from fastNLP.core.preprocess import load_pickle from fastNLP.core.preprocess import load_pickle
from fastNLP.core.sampler import SequentialSampler
class Predictor(object): class Predictor(object):
@ -27,8 +27,8 @@ class Predictor(object):
self.batch_output = [] self.batch_output = []
self.pickle_path = pickle_path self.pickle_path = pickle_path
self._task = task # one of ("seq_label", "text_classify") self._task = task # one of ("seq_label", "text_classify")
self.index2label = load_pickle(self.pickle_path, "id2class.pkl") self.label_vocab = load_pickle(self.pickle_path, "class2id.pkl")
self.word2index = load_pickle(self.pickle_path, "word2id.pkl") self.word_vocab = load_pickle(self.pickle_path, "word2id.pkl")
def predict(self, network, data): def predict(self, network, data):
"""Perform inference using the trained model. """Perform inference using the trained model.
@ -62,9 +62,13 @@ class Predictor(object):
def data_forward(self, network, x): def data_forward(self, network, x):
"""Forward through network.""" """Forward through network."""
y = network(**x)
if self._task == "seq_label": if self._task == "seq_label":
y = network(x["word_seq"], x["word_seq_origin_len"])
y = network.prediction(y) y = network.prediction(y)
elif self._task == "text_classify":
y = network(x["word_seq"])
else:
raise NotImplementedError("Unknown task type {}.".format(self._task))
return y return y
def prepare_input(self, data): def prepare_input(self, data):
@ -82,7 +86,7 @@ class Predictor(object):
:return data_set: a DataSet instance. :return data_set: a DataSet instance.
""" """
assert isinstance(data, list) assert isinstance(data, list)
return create_dataset_from_lists(data, self.word2index, has_target=False) return create_dataset_from_lists(data, self.word_vocab, has_target=False)
def prepare_output(self, data): def prepare_output(self, data):
"""Transform list of batch outputs into strings.""" """Transform list of batch outputs into strings."""
@ -97,14 +101,14 @@ class Predictor(object):
results = [] results = []
for batch in batch_outputs: for batch in batch_outputs:
for example in np.array(batch): for example in np.array(batch):
results.append([self.index2label[int(x)] for x in example]) results.append([self.label_vocab.to_word(int(x)) for x in example])
return results return results
def _text_classify_prepare_output(self, batch_outputs): def _text_classify_prepare_output(self, batch_outputs):
results = [] results = []
for batch_out in batch_outputs: for batch_out in batch_outputs:
idx = np.argmax(batch_out.detach().numpy(), axis=-1) idx = np.argmax(batch_out.detach().numpy(), axis=-1)
results.extend([self.index2label[i] for i in idx]) results.extend([self.label_vocab.to_word(i) for i in idx])
return results return results

View File

@ -6,16 +6,7 @@ import numpy as np
from fastNLP.core.dataset import DataSet from fastNLP.core.dataset import DataSet
from fastNLP.core.field import TextField, LabelField from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance from fastNLP.core.instance import Instance
from fastNLP.core.vocabulary import Vocabulary
DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>',
'<reserved-3>',
'<reserved-4>'] # dict index = 2~4
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
DEFAULT_RESERVED_LABEL[2]: 4}
# the first vocab in dict with the index = 5 # the first vocab in dict with the index = 5
@ -61,31 +52,36 @@ def pickle_exist(pickle_path, pickle_name):
return False return False
class BasePreprocess(object): class Preprocessor(object):
"""Base class of all preprocessors. """Preprocessors are responsible for converting data of strings into data of indices.
Preprocessors are responsible for converting data of strings into data of indices.
During the pre-processing, the following pickle files will be built: During the pre-processing, the following pickle files will be built:
- "word2id.pkl", a mapping from words(tokens) to indices - "word2id.pkl", a Vocabulary object, mapping words to indices.
- "id2word.pkl", a reversed dictionary - "class2id.pkl", a Vocabulary object, mapping labels to indices.
- "label2id.pkl", a dictionary on labels - "data_train.pkl", a DataSet object for training
- "id2label.pkl", a reversed dictionary on labels - "data_dev.pkl", a DataSet object for validation, if train_dev_split > 0.
- "data_test.pkl", a DataSet object for testing, if test_data is not None.
These four pickle files are expected to be saved in the given pickle directory once they are constructed. These four pickle files are expected to be saved in the given pickle directory once they are constructed.
Preprocessors will check if those files are already in the directory and will reuse them in future calls. Preprocessors will check if those files are already in the directory and will reuse them in future calls.
""" """
def __init__(self): def __init__(self, label_is_seq=False):
self.word2index = None """
self.label2index = None
:param label_is_seq: bool, whether label is a sequence. If True, label vocabulary will preserve
several special tokens for sequence processing.
"""
self.data_vocab = Vocabulary()
self.label_vocab = Vocabulary(need_default=label_is_seq)
@property @property
def vocab_size(self): def vocab_size(self):
return len(self.word2index) return len(self.data_vocab)
@property @property
def num_classes(self): def num_classes(self):
return len(self.label2index) return len(self.label_vocab)
def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10): def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10):
"""Main pre-processing pipeline. """Main pre-processing pipeline.
@ -102,20 +98,14 @@ class BasePreprocess(object):
""" """
if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"): if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
self.word2index = load_pickle(pickle_path, "word2id.pkl") self.data_vocab = load_pickle(pickle_path, "word2id.pkl")
self.label2index = load_pickle(pickle_path, "class2id.pkl") self.label_vocab = load_pickle(pickle_path, "class2id.pkl")
else: else:
self.word2index, self.label2index = self.build_dict(train_dev_data) self.data_vocab, self.label_vocab = self.build_dict(train_dev_data)
save_pickle(self.word2index, pickle_path, "word2id.pkl") save_pickle(self.data_vocab, pickle_path, "word2id.pkl")
save_pickle(self.label2index, pickle_path, "class2id.pkl") save_pickle(self.label_vocab, pickle_path, "class2id.pkl")
if not pickle_exist(pickle_path, "id2word.pkl"): self.build_reverse_dict()
index2word = self.build_reverse_dict(self.word2index)
save_pickle(index2word, pickle_path, "id2word.pkl")
if not pickle_exist(pickle_path, "id2class.pkl"):
index2label = self.build_reverse_dict(self.label2index)
save_pickle(index2label, pickle_path, "id2class.pkl")
train_set = [] train_set = []
dev_set = [] dev_set = []
@ -125,13 +115,13 @@ class BasePreprocess(object):
split = int(len(train_dev_data) * train_dev_split) split = int(len(train_dev_data) * train_dev_split)
data_dev = train_dev_data[: split] data_dev = train_dev_data[: split]
data_train = train_dev_data[split:] data_train = train_dev_data[split:]
train_set = self.convert_to_dataset(data_train, self.word2index, self.label2index) train_set = self.convert_to_dataset(data_train, self.data_vocab, self.label_vocab)
dev_set = self.convert_to_dataset(data_dev, self.word2index, self.label2index) dev_set = self.convert_to_dataset(data_dev, self.data_vocab, self.label_vocab)
save_pickle(dev_set, pickle_path, "data_dev.pkl") save_pickle(dev_set, pickle_path, "data_dev.pkl")
print("{} of the training data is split for validation. ".format(train_dev_split)) print("{} of the training data is split for validation. ".format(train_dev_split))
else: else:
train_set = self.convert_to_dataset(train_dev_data, self.word2index, self.label2index) train_set = self.convert_to_dataset(train_dev_data, self.data_vocab, self.label_vocab)
save_pickle(train_set, pickle_path, "data_train.pkl") save_pickle(train_set, pickle_path, "data_train.pkl")
else: else:
train_set = load_pickle(pickle_path, "data_train.pkl") train_set = load_pickle(pickle_path, "data_train.pkl")
@ -143,8 +133,8 @@ class BasePreprocess(object):
# cross validation # cross validation
data_cv = self.cv_split(train_dev_data, n_fold) data_cv = self.cv_split(train_dev_data, n_fold)
for i, (data_train_cv, data_dev_cv) in enumerate(data_cv): for i, (data_train_cv, data_dev_cv) in enumerate(data_cv):
data_train_cv = self.convert_to_dataset(data_train_cv, self.word2index, self.label2index) data_train_cv = self.convert_to_dataset(data_train_cv, self.data_vocab, self.label_vocab)
data_dev_cv = self.convert_to_dataset(data_dev_cv, self.word2index, self.label2index) data_dev_cv = self.convert_to_dataset(data_dev_cv, self.data_vocab, self.label_vocab)
save_pickle( save_pickle(
data_train_cv, pickle_path, data_train_cv, pickle_path,
"data_train_{}.pkl".format(i)) "data_train_{}.pkl".format(i))
@ -165,7 +155,7 @@ class BasePreprocess(object):
test_set = [] test_set = []
if test_data is not None: if test_data is not None:
if not pickle_exist(pickle_path, "data_test.pkl"): if not pickle_exist(pickle_path, "data_test.pkl"):
test_set = self.convert_to_dataset(test_data, self.word2index, self.label2index) test_set = self.convert_to_dataset(test_data, self.data_vocab, self.label_vocab)
save_pickle(test_set, pickle_path, "data_test.pkl") save_pickle(test_set, pickle_path, "data_test.pkl")
# return preprocessed results # return preprocessed results
@ -180,28 +170,15 @@ class BasePreprocess(object):
return tuple(results) return tuple(results)
def build_dict(self, data): def build_dict(self, data):
label2index = DEFAULT_WORD_TO_INDEX.copy()
word2index = DEFAULT_WORD_TO_INDEX.copy()
for example in data: for example in data:
for word in example[0]: word, label = example
if word not in word2index: self.data_vocab.update(word)
word2index[word] = len(word2index) self.label_vocab.update(label)
label = example[1] return self.data_vocab, self.label_vocab
if isinstance(label, str):
# label is a string
if label not in label2index:
label2index[label] = len(label2index)
elif isinstance(label, list):
# label is a list of strings
for single_label in label:
if single_label not in label2index:
label2index[single_label] = len(label2index)
return word2index, label2index
def build_reverse_dict(self):
def build_reverse_dict(self, word_dict): self.data_vocab.build_reverse_vocab()
id2word = {word_dict[w]: w for w in word_dict} self.label_vocab.build_reverse_vocab()
return id2word
def data_split(self, data, train_dev_split): def data_split(self, data, train_dev_split):
"""Split data into train and dev set.""" """Split data into train and dev set."""
@ -289,20 +266,20 @@ class BasePreprocess(object):
return data_set return data_set
class SeqLabelPreprocess(BasePreprocess): class SeqLabelPreprocess(Preprocessor):
def __init__(self): def __init__(self):
print("[FastNLP warning] SeqLabelPreprocess is about to deprecate. Please use Preprocess directly.")
super(SeqLabelPreprocess, self).__init__() super(SeqLabelPreprocess, self).__init__()
class ClassPreprocess(Preprocessor):
class ClassPreprocess(BasePreprocess):
def __init__(self): def __init__(self):
print("[FastNLP warning] ClassPreprocess is about to deprecate. Please use Preprocess directly.")
super(ClassPreprocess, self).__init__() super(ClassPreprocess, self).__init__()
if __name__ == "__main__": if __name__ == "__main__":
p = BasePreprocess() p = Preprocessor()
train_dev_data = [[["I", "am", "a", "good", "student", "."], "0"], train_dev_data = [[["I", "am", "a", "good", "student", "."], "0"],
[["You", "are", "pretty", "."], "1"] [["You", "are", "pretty", "."], "1"]
] ]
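Building on the __main__ snippet above, a minimal run of the renamed Preprocessor might look like the sketch below; exactly which DataSet objects end up in the returned tuple depends on the train_dev_split, test_data and cross_val arguments.

    from fastNLP.core.preprocess import Preprocessor

    train_dev_data = [[["I", "am", "a", "good", "student", "."], "0"],
                      [["You", "are", "pretty", "."], "1"]]
    p = Preprocessor(label_is_seq=False)
    # writes word2id.pkl / class2id.pkl (Vocabulary objects) and data_train.pkl / data_dev.pkl under ./save/
    results = p.run(train_dev_data, pickle_path="./save/", train_dev_split=0.5)
    print(p.vocab_size, p.num_classes)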

View File

@ -1,5 +1,3 @@
from collections import Counter
import numpy as np import numpy as np
import torch import torch
@ -17,6 +15,56 @@ def convert_to_torch_tensor(data_list, use_cuda):
return data_list return data_list
class BaseSampler(object):
"""The base class of all samplers.
Sub-classes must implement the __call__ method.
__call__ takes a DataSet object and returns a list of int - the sampling indices.
"""
def __call__(self, *args, **kwargs):
raise NotImplementedError
class SequentialSampler(BaseSampler):
"""Sample data in the original order.
"""
def __call__(self, data_set):
return list(range(len(data_set)))
class RandomSampler(BaseSampler):
"""Sample data in random permutation order.
"""
def __call__(self, data_set):
return list(np.random.permutation(len(data_set)))
def simple_sort_bucketing(lengths):
"""
:param lengths: list of int, the lengths of all examples.
:param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length
threshold for each bucket (This is usually None.).
:return data: 2-level list
::
[
[index_11, index_12, ...], # bucket 1
[index_21, index_22, ...], # bucket 2
...
]
"""
lengths_mapping = [(idx, length) for idx, length in enumerate(lengths)]
sorted_lengths = sorted(lengths_mapping, key=lambda x: x[1])
# TODO: need to return buckets
return [idx for idx, _ in sorted_lengths]
def k_means_1d(x, k, max_iter=100): def k_means_1d(x, k, max_iter=100):
"""Perform k-means on 1-D data. """Perform k-means on 1-D data.
@ -46,18 +94,10 @@ def k_means_1d(x, k, max_iter=100):
return np.array(centroids), assign return np.array(centroids), assign
def k_means_bucketing(all_inst, buckets): def k_means_bucketing(lengths, buckets):
"""Assign all instances into possible buckets using k-means, such that instances in the same bucket have similar lengths. """Assign all instances into possible buckets using k-means, such that instances in the same bucket have similar lengths.
:param all_inst: 3-level list :param lengths: list of int, the length of all samples.
E.g. ::
[
[[word_11, word_12, word_13], [label_11. label_12]], # sample 1
[[word_21, word_22, word_23], [label_21. label_22]], # sample 2
...
]
:param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length :param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length
threshold for each bucket (This is usually None.). threshold for each bucket (This is usually None.).
:return data: 2-level list :return data: 2-level list
@ -72,7 +112,6 @@ def k_means_bucketing(all_inst, buckets):
""" """
bucket_data = [[] for _ in buckets] bucket_data = [[] for _ in buckets]
num_buckets = len(buckets) num_buckets = len(buckets)
lengths = np.array([len(inst[0]) for inst in all_inst])
_, assignments = k_means_1d(lengths, num_buckets) _, assignments = k_means_1d(lengths, num_buckets)
for idx, bucket_id in enumerate(assignments): for idx, bucket_id in enumerate(assignments):
@ -81,102 +120,33 @@ def k_means_bucketing(all_inst, buckets):
return bucket_data return bucket_data
class BaseSampler(object): class BucketSampler(BaseSampler):
"""The base class of all samplers.
"""
def __call__(self, *args, **kwargs):
raise NotImplementedError
class SequentialSampler(BaseSampler):
"""Sample data in the original order.
"""
def __call__(self, data_set):
return list(range(len(data_set)))
class RandomSampler(BaseSampler):
"""Sample data in random permutation order.
"""
def __call__(self, data_set):
return list(np.random.permutation(len(data_set)))
class Batchifier(object):
"""Wrap random or sequential sampler to generate a mini-batch.
"""
def __init__(self, sampler, batch_size, drop_last=True):
"""
:param sampler: a Sampler object
:param batch_size: int, the size of the mini-batch
:param drop_last: bool, whether to drop the last examples that are not enough to make a mini-batch.
"""
super(Batchifier, self).__init__()
self.sampler = sampler
self.batch_size = batch_size
self.drop_last = drop_last
def __iter__(self):
batch = []
for example in self.sampler:
batch.append(example)
if len(batch) == self.batch_size:
yield batch
batch = []
if 0 < len(batch) < self.batch_size and self.drop_last is False:
yield batch
class BucketBatchifier(Batchifier):
"""Partition all samples into multiple buckets, each of which contains sentences of approximately the same length. """Partition all samples into multiple buckets, each of which contains sentences of approximately the same length.
In sampling, first random choose a bucket. Then sample data from it. In sampling, first random choose a bucket. Then sample data from it.
The number of buckets is decided dynamically by the variance of sentence lengths. The number of buckets is decided dynamically by the variance of sentence lengths.
TODO: merge it into Batch
""" """
def __init__(self, data_set, batch_size, num_buckets, drop_last=True, sampler=None): def __call__(self, data_set, batch_size, num_buckets):
return self._process(data_set, batch_size, num_buckets)
def _process(self, data_set, batch_size, num_buckets, use_kmeans=False):
""" """
:param data_set: three-level list, shape [num_samples, 2] :param data_set: a DataSet object
:param batch_size: int :param batch_size: int
:param num_buckets: int, number of buckets for grouping these sequences. :param num_buckets: int, number of buckets for grouping these sequences.
:param drop_last: bool, useless currently. :param use_kmeans: bool, whether to use k-means to create buckets.
:param sampler: Sampler, useless currently.
""" """
super(BucketBatchifier, self).__init__(sampler, batch_size, drop_last)
buckets = ([None] * num_buckets) buckets = ([None] * num_buckets)
self.data = data_set if use_kmeans is True:
self.batch_size = batch_size buckets = k_means_bucketing(data_set, buckets)
self.length_freq = dict(Counter([len(example) for example in data_set])) else:
self.buckets = k_means_bucketing(data_set, buckets) buckets = simple_sort_bucketing(data_set)
index_list = []
def __iter__(self): for _ in range(len(data_set) // batch_size):
"""Make a min-batch of data.""" chosen_bucket = buckets[np.random.randint(0, len(buckets))]
for _ in range(len(self.data) // self.batch_size): np.random.shuffle(chosen_bucket)
bucket_samples = self.buckets[np.random.randint(0, len(self.buckets))] index_list += [idx for idx in chosen_bucket[:batch_size]]
np.random.shuffle(bucket_samples) return index_list
yield [self.data[idx] for idx in bucket_samples[:batch_size]]
if __name__ == "__main__":
import random
data = [[[y] * random.randint(0, 50), [y]] for y in range(500)]
batch_size = 8
iterator = iter(BucketBatchifier(data, batch_size, num_buckets=5))
for d in iterator:
print("\nbatch:")
for dd in d:
print(len(dd[0]), end=" ")
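The samplers and the new sorting-based bucketing helper can be exercised on their own; a sketch, assuming they are exported from fastNLP.core.sampler as the updated imports elsewhere in this commit suggest:

    from fastNLP.core.sampler import RandomSampler, SequentialSampler, simple_sort_bucketing

    print(SequentialSampler()(range(5)))   # [0, 1, 2, 3, 4]
    print(RandomSampler()(range(5)))       # a random permutation of the same indices

    lengths = [7, 3, 10, 3, 8]
    print(simple_sort_bucketing(lengths))  # indices ordered by ascending length: [1, 3, 0, 4, 2]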

View File

@ -1,32 +1,32 @@
import numpy as np import numpy as np
import torch import torch
from fastNLP.core.action import RandomSampler
from fastNLP.core.batch import Batch from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler
from fastNLP.saver.logger import create_logger from fastNLP.saver.logger import create_logger
logger = create_logger(__name__, "./train_test.log") logger = create_logger(__name__, "./train_test.log")
class BaseTester(object): class Tester(object):
"""An collection of model inference and evaluation of performance, used over validation/dev set and test set. """ """An collection of model inference and evaluation of performance, used over validation/dev set and test set. """
def __init__(self, **kwargs): def __init__(self, **kwargs):
""" """
:param kwargs: a dict-like object that has __getitem__ method, can be accessed by "test_args["key_str"]" :param kwargs: a dict-like object that has __getitem__ method, can be accessed by "test_args["key_str"]"
""" """
super(BaseTester, self).__init__() super(Tester, self).__init__()
""" """
"default_args" provides default value for important settings. "default_args" provides default value for important settings.
The initialization arguments "kwargs" with the same key (name) will override the default value. The initialization arguments "kwargs" with the same key (name) will override the default value.
"kwargs" must have the same type as "default_args" on corresponding keys. "kwargs" must have the same type as "default_args" on corresponding keys.
Otherwise, error will raise. Otherwise, error will raise.
""" """
default_args = {"save_output": False, # collect outputs of validation set default_args = {"save_output": True, # collect outputs of validation set
"save_loss": False, # collect losses in validation "save_loss": True, # collect losses in validation
"save_best_dev": False, # save best model during validation "save_best_dev": False, # save best model during validation
"batch_size": 8, "batch_size": 8,
"use_cuda": True, "use_cuda": False,
"pickle_path": "./save/", "pickle_path": "./save/",
"model_name": "dev_best_model.pkl", "model_name": "dev_best_model.pkl",
"print_every_step": 1, "print_every_step": 1,
@ -55,7 +55,7 @@ class BaseTester(object):
logger.error(msg) logger.error(msg)
raise ValueError(msg) raise ValueError(msg)
else: else:
# BaseTester doesn't care about extra arguments # Tester doesn't care about extra arguments
pass pass
print(default_args) print(default_args)
@ -208,7 +208,7 @@ class BaseTester(object):
return self.show_metrics() return self.show_metrics()
class SeqLabelTester(BaseTester): class SeqLabelTester(Tester):
def __init__(self, **test_args): def __init__(self, **test_args):
test_args.update({"task": "seq_label"}) test_args.update({"task": "seq_label"})
print( print(
@ -216,9 +216,9 @@ class SeqLabelTester(BaseTester):
super(SeqLabelTester, self).__init__(**test_args) super(SeqLabelTester, self).__init__(**test_args)
class ClassificationTester(BaseTester): class ClassificationTester(Tester):
def __init__(self, **test_args): def __init__(self, **test_args):
test_args.update({"task": "seq_label"}) test_args.update({"task": "text_classify"})
print( print(
"[FastNLP Warning] ClassificationTester will be deprecated. Please use Tester with argument 'task'='text_classify'.") "[FastNLP Warning] ClassificationTester will be deprecated. Please use Tester with argument 'task'='text_classify'.")
super(ClassificationTester, self).__init__(**test_args) super(ClassificationTester, self).__init__(**test_args)
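Note that the Tester defaults change here (save_output and save_loss become True, use_cuda becomes False), so callers that depended on the old defaults should pass them explicitly. A hedged sketch, where model and dev_data stand in for a trained network and a preprocessed DataSet:

    from fastNLP.core.tester import ClassificationTester

    tester = ClassificationTester(batch_size=8, use_cuda=False, pickle_path="./save/",
                                  save_output=True, save_loss=True)
    # metrics = tester.test(model, dev_data)  # model / dev_data are placeholders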

View File

@ -6,10 +6,10 @@ from datetime import timedelta
import torch import torch
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
from fastNLP.core.action import RandomSampler
from fastNLP.core.batch import Batch from fastNLP.core.batch import Batch
from fastNLP.core.loss import Loss from fastNLP.core.loss import Loss
from fastNLP.core.optimizer import Optimizer from fastNLP.core.optimizer import Optimizer
from fastNLP.core.sampler import RandomSampler
from fastNLP.core.tester import SeqLabelTester, ClassificationTester from fastNLP.core.tester import SeqLabelTester, ClassificationTester
from fastNLP.saver.logger import create_logger from fastNLP.saver.logger import create_logger
from fastNLP.saver.model_saver import ModelSaver from fastNLP.saver.model_saver import ModelSaver
@ -17,7 +17,7 @@ from fastNLP.saver.model_saver import ModelSaver
logger = create_logger(__name__, "./train_test.log") logger = create_logger(__name__, "./train_test.log")
class BaseTrainer(object): class Trainer(object):
"""Operations of training a model, including data loading, gradient descent, and validation. """Operations of training a model, including data loading, gradient descent, and validation.
""" """
@ -32,7 +32,7 @@ class BaseTrainer(object):
- batch_size: int - batch_size: int
- pickle_path: str, the path to pickle files for pre-processing - pickle_path: str, the path to pickle files for pre-processing
""" """
super(BaseTrainer, self).__init__() super(Trainer, self).__init__()
""" """
"default_args" provides default value for important settings. "default_args" provides default value for important settings.
@ -40,8 +40,8 @@ class BaseTrainer(object):
"kwargs" must have the same type as "default_args" on corresponding keys. "kwargs" must have the same type as "default_args" on corresponding keys.
Otherwise, error will raise. Otherwise, error will raise.
""" """
default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/", default_args = {"epochs": 1, "batch_size": 2, "validate": False, "use_cuda": False, "pickle_path": "./save/",
"save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1, "save_best_dev": False, "model_name": "default_model_name.pkl", "print_every_step": 1,
"loss": Loss(None), # used to pass type check "loss": Loss(None), # used to pass type check
"optimizer": Optimizer("Adam", lr=0.001, weight_decay=0) "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0)
} }
@ -69,7 +69,7 @@ class BaseTrainer(object):
logger.error(msg) logger.error(msg)
raise ValueError(msg) raise ValueError(msg)
else: else:
# BaseTrainer doesn't care about extra arguments # Trainer doesn't care about extra arguments
pass pass
print(default_args) print(default_args)
@ -136,6 +136,9 @@ class BaseTrainer(object):
# validation # validation
if self.validate: if self.validate:
if dev_data is None:
raise RuntimeError(
"self.validate is True in trainer, but dev_data is None. Please provide the validation data.")
logger.info("validation started") logger.info("validation started")
validator.test(network, dev_data) validator.test(network, dev_data)
@ -314,7 +317,7 @@ class BaseTrainer(object):
raise NotImplementedError raise NotImplementedError
class SeqLabelTrainer(BaseTrainer): class SeqLabelTrainer(Trainer):
"""Trainer for Sequence Labeling """Trainer for Sequence Labeling
""" """
@ -328,7 +331,7 @@ class SeqLabelTrainer(BaseTrainer):
return SeqLabelTester(**valid_args) return SeqLabelTester(**valid_args)
class ClassificationTrainer(BaseTrainer): class ClassificationTrainer(Trainer):
"""Trainer for text classification.""" """Trainer for text classification."""
def __init__(self, **train_args): def __init__(self, **train_args):
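The Trainer defaults also change (epochs 3 -> 1, batch_size 8 -> 2, validate and use_cuda now False), and validation now requires dev data up front. A sketch that restores the old behaviour explicitly:

    from fastNLP.core.trainer import ClassificationTrainer

    trainer = ClassificationTrainer(epochs=3, batch_size=8, validate=True,
                                    use_cuda=False, pickle_path="./save/")
    # with validate=True, training raises a RuntimeError unless validation data is supplied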

fastNLP/core/vocabulary.py (new file, 124 lines)
View File

@ -0,0 +1,124 @@
from copy import deepcopy
DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>',
'<reserved-3>',
'<reserved-4>'] # dict index = 2~4
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
DEFAULT_RESERVED_LABEL[2]: 4}
def isiterable(p_object):
try:
it = iter(p_object)
except TypeError:
return False
return True
class Vocabulary(object):
"""Use for word and index one to one mapping
Example::
vocab = Vocabulary()
word_list = "this is a word list".split()
vocab.update(word_list)
vocab["word"]
vocab.to_word(5)
"""
def __init__(self, need_default=True):
"""
:param bool need_default: set if the Vocabulary has default labels reserved.
"""
if need_default:
self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX)
self.padding_label = DEFAULT_PADDING_LABEL
self.unknown_label = DEFAULT_UNKNOWN_LABEL
else:
self.word2idx = {}
self.padding_label = None
self.unknown_label = None
self.has_default = need_default
self.idx2word = None
def __len__(self):
return len(self.word2idx)
def update(self, word):
"""add word or list of words into Vocabulary
:param word: a list of str or str
"""
if not isinstance(word, str) and isiterable(word):
# it's a nested list
for w in word:
self.update(w)
else:
# it's a word to be added
if word not in self.word2idx:
self.word2idx[word] = len(self)
if self.idx2word is not None:
self.idx2word = None
def __getitem__(self, w):
"""To support usage like::
vocab[w]
"""
if w in self.word2idx:
return self.word2idx[w]
else:
return self.word2idx[DEFAULT_UNKNOWN_LABEL]
def to_index(self, w):
""" like to_index(w) function, turn a word to the index
if w is not in Vocabulary, return the unknown label
:param str w:
"""
return self[w]
def unknown_idx(self):
if self.unknown_label is None:
return None
return self.word2idx[self.unknown_label]
def padding_idx(self):
if self.padding_label is None:
return None
return self.word2idx[self.padding_label]
def build_reverse_vocab(self):
"""build 'index to word' dict based on 'word to index' dict
"""
self.idx2word = {self.word2idx[w] : w for w in self.word2idx}
def to_word(self, idx):
"""given a word's index, return the word itself
:param int idx:
"""
if self.idx2word is None:
self.build_reverse_vocab()
return self.idx2word[idx]
def __getstate__(self):
"""use to prepare data for pickle
"""
state = self.__dict__.copy()
# no need to pickle idx2word as it can be constructed from word2idx
del state['idx2word']
return state
def __setstate__(self, state):
"""use to restore state from pickle
"""
self.__dict__.update(state)
self.idx2word = None
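Expanding on the docstring example, a quick sketch of the new class, including the pickling behaviour implemented by __getstate__/__setstate__:

    import pickle
    from fastNLP.core.vocabulary import Vocabulary

    vocab = Vocabulary()                    # need_default=True reserves <pad>=0, <unk>=1 and three reserved slots
    vocab.update("this is a word list".split())
    print(vocab["word"], vocab.to_index("word"))  # same index via __getitem__ or to_index
    print(vocab.to_word(vocab["word"]))           # reverse lookup; idx2word is built lazily
    print(vocab["never-seen"])                    # out-of-vocabulary words map to the <unk> index

    restored = pickle.loads(pickle.dumps(vocab))  # idx2word is dropped on pickling and rebuilt on demand
    print(restored.to_word(vocab["list"]))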

View File

@ -31,7 +31,7 @@ FastNLP_MODEL_COLLECTION = {
"class": "sequence_modeling.AdvSeqLabel", "class": "sequence_modeling.AdvSeqLabel",
"pickle": "cws_basic_model_v_0.pkl", "pickle": "cws_basic_model_v_0.pkl",
"type": "seq_label", "type": "seq_label",
"config_file_name": "config", "config_file_name": "cws.cfg",
"config_section_name": "text_class_model" "config_section_name": "text_class_model"
}, },
"pos_tag_model": { "pos_tag_model": {
@ -39,7 +39,7 @@ FastNLP_MODEL_COLLECTION = {
"class": "sequence_modeling.AdvSeqLabel", "class": "sequence_modeling.AdvSeqLabel",
"pickle": "pos_tag_model_v_0.pkl", "pickle": "pos_tag_model_v_0.pkl",
"type": "seq_label", "type": "seq_label",
"config_file_name": "pos_tag.config", "config_file_name": "pos_tag.cfg",
"config_section_name": "pos_tag_model" "config_section_name": "pos_tag_model"
}, },
"text_classify_model": { "text_classify_model": {
@ -56,21 +56,22 @@ FastNLP_MODEL_COLLECTION = {
class FastNLP(object): class FastNLP(object):
""" """
High-level interface for direct model inference. High-level interface for direct model inference.
Example Usage: Example Usage
::
fastnlp = FastNLP() fastnlp = FastNLP()
fastnlp.load("zh_pos_tag_model") fastnlp.load("zh_pos_tag_model")
text = "这是最好的基于深度学习的中文分词系统。" text = "这是最好的基于深度学习的中文分词系统。"
result = fastnlp.run(text) result = fastnlp.run(text)
print(result) # ["这", "是", "最好", "的", "基于", "深度学习", "的", "中文", "分词", "系统", "。"] print(result) # ["这", "是", "最好", "的", "基于", "深度学习", "的", "中文", "分词", "系统", "。"]
""" """
def __init__(self, model_dir="./"): def __init__(self, model_dir="./"):
""" """
:param model_dir: this directory should contain the following files: :param model_dir: this directory should contain the following files:
1. a pre-trained model 1. a trained model
2. a config file 2. a config file, which is a fastNLP's configuration.
3. "id2class.pkl" 3. a Vocab file, which is a pickle object of a Vocab instance.
4. "word2id.pkl"
""" """
self.model_dir = model_dir self.model_dir = model_dir
self.model = None self.model = None
@ -99,10 +100,10 @@ class FastNLP(object):
print("Restore model hyper-parameters {}".format(str(model_args.data))) print("Restore model hyper-parameters {}".format(str(model_args.data)))
# fetch dictionary size and number of labels from pickle files # fetch dictionary size and number of labels from pickle files
word2index = load_pickle(self.model_dir, "word2id.pkl") word_vocab = load_pickle(self.model_dir, "word2id.pkl")
model_args["vocab_size"] = len(word2index) model_args["vocab_size"] = len(word_vocab)
index2label = load_pickle(self.model_dir, "id2class.pkl") label_vocab = load_pickle(self.model_dir, "class2id.pkl")
model_args["num_classes"] = len(index2label) model_args["num_classes"] = len(label_vocab)
# Construct the model # Construct the model
model = model_class(model_args) model = model_class(model_args)

View File

@ -172,9 +172,8 @@ class ClassDatasetLoader(DatasetLoader):
class ConllLoader(DatasetLoader): class ConllLoader(DatasetLoader):
"""loader for conll format files""" """loader for conll format files"""
def __int__(self, data_name, data_path): def __int__(self, data_path):
""" """
:param str data_name: the name of the conll data set
:param str data_path: the path to the conll data set :param str data_path: the path to the conll data set
""" """
super(ConllLoader, self).__init__(data_path) super(ConllLoader, self).__init__(data_path)
@ -269,8 +268,3 @@ class PeopleDailyCorpusLoader(DatasetLoader):
ner_examples.append([sent_words, sent_ner]) ner_examples.append([sent_words, sent_ner])
return pos_tag_examples, ner_examples return pos_tag_examples, ner_examples
if __name__ == "__main__":
loader = PeopleDailyCorpusLoader("./")
pos, ner = loader.load()
print(pos[:10])
print(ner[:10])

View File

@ -1,11 +1,11 @@
from . import aggregation from . import aggregator
from . import decoder from . import decoder
from . import encoder from . import encoder
from . import interaction from . import interactor
__version__ = '0.0.0' __version__ = '0.0.0'
__all__ = ['encoder', __all__ = ['encoder',
'decoder', 'decoder',
'aggregation', 'aggregator',
'interaction'] 'interactor']

View File

@ -1,8 +1,7 @@
import torch import torch
import torch.nn as nn import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F import torch.nn.functional as F
from torch.autograd import Variable
from fastNLP.modules.utils import initial_parameter from fastNLP.modules.utils import initial_parameter

View File

@ -1,19 +1,10 @@
"""
This is borrowed from FudanParser. Not stable. Do not use !!!
"""
import numpy
import numpy as np import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
import torch.utils.data import torch.utils.data
from torch import optim
from torch.autograd import Function, Variable
from torch.nn import Parameter from torch.nn import Parameter
from .utils import orthogonal
class GroupNorm(nn.Module): class GroupNorm(nn.Module):
def __init__(self, num_features, num_groups=20, eps=1e-5): def __init__(self, num_features, num_groups=20, eps=1e-5):
@ -59,15 +50,6 @@ class LayerNormalization(nn.Module):
return ln_out return ln_out
class OrthEmbedding(nn.Embedding):
def __init__(self, *args, **kwargs):
super(OrthEmbedding, self).__init__(*args, **kwargs)
def reset_parameters(self):
self.weight = orthogonal(self.weight)
nn.init.constant_(self.bias, 0.)
class BiLinear(nn.Module): class BiLinear(nn.Module):
def __init__(self, n_left, n_right, n_out, bias=True): def __init__(self, n_left, n_right, n_out, bias=True):
""" """
@ -241,250 +223,3 @@ class WordDropout(nn.Module):
drop_mask = drop_mask.long() drop_mask = drop_mask.long()
output = drop_mask * self.drop_to_token + (1 - drop_mask) * word_idx output = drop_mask * self.drop_to_token + (1 - drop_mask) * word_idx
return output return output
class WlossLayer(torch.nn.Module):
def __init__(self, lam=100, sinkhorn_iter=50):
super(WlossLayer, self).__init__()
# cost = matrix M = distance matrix
# lam = lambda of type float > 0
# sinkhorn_iter > 0
# diagonal cost should be 0
self.lam = lam
self.sinkhorn_iter = sinkhorn_iter
# self.register_buffer("K", torch.exp(-self.cost / self.lam).double())
# self.register_buffer("KM", (self.cost * self.K).double())
def forward(self, pred, target, cost):
return WassersteinLossStab.apply(pred, target,
cost, self.lam, self.sinkhorn_iter)
class WassersteinLossStab(Function):
@staticmethod
def forward(ctx, pred, target, cost, lam=1e-3, sinkhorn_iter=4):
"""pred: Batch * K: K = # mass points
target: Batch * L: L = # mass points"""
# import pdb
# pdb.set_trace()
eps = 1e-8
# pred = pred.gather(dim=1, index=)
na = pred.size(1)
nb = target.size(1)
cost = cost.double()
pred = pred.double()
target = target.double()
cost = cost[:na, :nb].double()
K = torch.exp(-cost / lam).double()
KM = (cost * K).double()
batch_size = pred.size(0)
# pdb.set_trace()
log_a, log_b = torch.log(pred + eps), torch.log(target + eps)
log_u = cost.new(batch_size, na).fill_(-numpy.log(na))
log_v = cost.new(batch_size, nb).fill_(-numpy.log(nb))
# import pdb
# pdb.set_trace()
for i in range(int(sinkhorn_iter)):
log_u_max = torch.max(log_u, dim=1)[0]
u_stab = torch.exp(log_u - log_u_max.unsqueeze(1) + eps)
log_v = log_b - torch.log(torch.mm(K.t(), u_stab.t()).t()) - log_u_max.unsqueeze(1)
log_v_max = torch.max(log_v, dim=1)[0]
v_stab = torch.exp(log_v - log_v_max.unsqueeze(1))
tmp = log_u
log_u = log_a - torch.log(torch.mm(K, v_stab.t()).t() + eps) - log_v_max.unsqueeze(1)
# print(log_u.sum())
if torch.norm(tmp - log_u) / torch.norm(log_u) < eps:
break
log_v_max = torch.max(log_v, dim=1)[0]
v_stab = torch.exp(log_v - log_v_max.unsqueeze(1))
logcostpart1 = torch.log(torch.mm(KM, v_stab.t()).t() + eps) + log_v_max.unsqueeze(1)
wnorm = torch.exp(log_u + logcostpart1).mean(0).sum() # sum(1) for per item pair loss...
grad_input = log_u * lam
# print("log_u", log_u)
grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1)
grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1)
grad_input = grad_input / batch_size
ctx.save_for_backward(grad_input)
# print("grad type", type(grad_input))
return pred.new((wnorm,)), grad_input
@staticmethod
def backward(ctx, grad_output, _):
grad_input = ctx.saved_variables
# print(grad)
res = grad_output.clone()
res.data.resize_(grad_input[0].size()).copy_(grad_input[0].data)
res = res.mul_(grad_output[0]).float()
# print("in backward func:\n\n", res)
return res, None, None, None, None, None, None
class Sinkhorn(Function):
def __init__(self):
super(Sinkhorn, self).__init__()
def forward(ctx, a, b, M, reg, tau, warmstart, numItermax, stop):
a = a.double()
b = b.double()
M = M.double()
nbb = b.size(1)
# init data
na = len(a)
nb = len(b)
cpt = 0
# we assume that no distances are null except those of the diagonal of
# distances
if warmstart is None:
alpha, beta = np.zeros(na), np.zeros(nb)
else:
alpha, beta = warmstart
if nbb:
u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb
else:
u, v = np.ones(na) / na, np.ones(nb) / nb
def get_K(alpha, beta):
"""log space computation"""
return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg)
def get_Gamma(alpha, beta, u, v):
"""log space gamma computation"""
return np.exp(
-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg + np.log(u.reshape((na, 1))) + np.log(
v.reshape((1, nb))))
# print(np.min(K))
K = get_K(alpha, beta)
transp = K
cpt = 0
err = 1
while 1:
uprev = u
vprev = v
# sinkhorn update
v = b / (np.dot(K.T, u) + 1e-16)
u = a / (np.dot(K, v) + 1e-16)
# remove numerical problems and store them in K
if np.abs(u).max() > tau or np.abs(v).max() > tau:
if nbb:
alpha, beta = alpha + reg * \
np.max(np.log(u), 1), beta + reg * np.max(np.log(v))
else:
alpha, beta = alpha + reg * np.log(u), beta + reg * np.log(v)
if nbb:
u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb
else:
u, v = np.ones(na) / na, np.ones(nb) / nb
K = get_K(alpha, beta)
if cpt % print_period == 0:
# we can speed up the process by checking for the error only all
# the 10th iterations
if nbb:
err = np.sum((u - uprev) ** 2) / np.sum((u) ** 2) + \
np.sum((v - vprev) ** 2) / np.sum((v) ** 2)
else:
transp = get_Gamma(alpha, beta, u, v)
err = np.linalg.norm((np.sum(transp, axis=0) - b)) ** 2
if log:
log['err'].append(err)
if verbose:
if cpt % (print_period * 20) == 0:
print(
'{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
print('{:5d}|{:8e}|'.format(cpt, err))
if err <= stopThr:
loop = False
if cpt >= numItermax:
loop = False
if np.any(np.isnan(u)) or np.any(np.isnan(v)):
# we have reached the machine precision
# come back to previous solution and quit loop
print('Warning: numerical errors at iteration', cpt)
u = uprev
v = vprev
break
cpt = cpt + 1
# print('err=',err,' cpt=',cpt)
if log:
log['logu'] = alpha / reg + np.log(u)
log['logv'] = beta / reg + np.log(v)
log['alpha'] = alpha + reg * np.log(u)
log['beta'] = beta + reg * np.log(v)
log['warmstart'] = (log['alpha'], log['beta'])
if nbb:
res = np.zeros((nbb))
for i in range(nbb):
res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M)
return res, log
else:
return get_Gamma(alpha, beta, u, v), log
else:
if nbb:
res = np.zeros((nbb))
for i in range(nbb):
res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M)
return res
else:
return get_Gamma(alpha, beta, u, v)
if __name__ == "__main__":
cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))) # .cuda()
mylayer = WlossLayer(cost) # .cuda()
inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True) # .cuda()
ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])) # .cuda()
res, _ = mylayer(inp, ground_true)
# print(inp.requires_grad, res.requires_grad)
# print(res, inp)
mylayer.zero_grad()
res.backward()
print("inp's gradient is good:")
print(inp.grad)
print("convert to gpu:\n", inp.cuda().grad)
print("=============================================="
"\n However, this does not work on pytorch when GPU is enabled")
cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))).cuda()
mylayer = WlossLayer(cost).cuda()
inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True).cuda()
ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])).cuda()
opt = optim.SGD([
{'params': mylayer.parameters()},
], lr=1e-2, momentum=0.9)
res, _ = mylayer(inp, ground_true)
# print(inp.requires_grad, res.requires_grad)
# print(res, inp)
mylayer.zero_grad()
res.backward()
print("input's gradient is None!!!!!!!!!!!!!!!!")
print(inp.grad)

View File

@ -1,9 +1,8 @@
from collections import defaultdict
import numpy as np
import torch import torch
import torch.nn.init as init
import torch.nn as nn import torch.nn as nn
import torch.nn.init as init
def mask_softmax(matrix, mask): def mask_softmax(matrix, mask):
if mask is None: if mask is None:
result = torch.nn.functional.softmax(matrix, dim=-1) result = torch.nn.functional.softmax(matrix, dim=-1)
@ -11,13 +10,28 @@ def mask_softmax(matrix, mask):
raise NotImplementedError raise NotImplementedError
return result return result
def initial_parameter(net ,initial_method =None):
def initial_parameter(net, initial_method=None):
"""A method used to initialize the weights of PyTorch models.
:param net: a PyTorch model
:param initial_method: str, one of the following initializations
- xavier_uniform
- xavier_normal (default)
- kaiming_normal, or msra
- kaiming_uniform
- orthogonal
- sparse
- normal
- uniform
"""
if initial_method == 'xavier_uniform': if initial_method == 'xavier_uniform':
init_method = init.xavier_uniform_ init_method = init.xavier_uniform_
elif initial_method=='xavier_normal': elif initial_method == 'xavier_normal':
init_method = init.xavier_normal_ init_method = init.xavier_normal_
elif initial_method == 'kaiming_normal' or initial_method =='msra': elif initial_method == 'kaiming_normal' or initial_method == 'msra':
init_method = init.kaiming_normal init_method = init.kaiming_normal
elif initial_method == 'kaiming_uniform': elif initial_method == 'kaiming_uniform':
init_method = init.kaiming_normal init_method = init.kaiming_normal
@ -25,263 +39,49 @@ def initial_parameter(net ,initial_method =None):
init_method = init.orthogonal_ init_method = init.orthogonal_
elif initial_method == 'sparse': elif initial_method == 'sparse':
init_method = init.sparse_ init_method = init.sparse_
elif initial_method =='normal': elif initial_method == 'normal':
init_method = init.normal_ init_method = init.normal_
elif initial_method =='uniform': elif initial_method == 'uniform':
initial_method = init.uniform_ initial_method = init.uniform_
else: else:
init_method = init.xavier_normal_ init_method = init.xavier_normal_
def weights_init(m): def weights_init(m):
# classname = m.__class__.__name__ # classname = m.__class__.__name__
if isinstance(m, nn.Conv2d) or isinstance(m,nn.Conv1d) or isinstance(m,nn.Conv3d): # for all the cnn if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d) or isinstance(m, nn.Conv3d): # for all the cnn
if initial_method != None: if initial_method is not None:
init_method(m.weight.data) init_method(m.weight.data)
else: else:
init.xavier_normal_(m.weight.data) init.xavier_normal_(m.weight.data)
init.normal_(m.bias.data) init.normal_(m.bias.data)
elif isinstance(m, nn.LSTM): elif isinstance(m, nn.LSTM):
for w in m.parameters(): for w in m.parameters():
if len(w.data.size())>1: if len(w.data.size()) > 1:
init_method(w.data) # weight init_method(w.data) # weight
else: else:
init.normal_(w.data) # bias init.normal_(w.data) # bias
elif hasattr(m, 'weight') and m.weight.requires_grad: elif hasattr(m, 'weight') and m.weight.requires_grad:
init_method(m.weight.data) init_method(m.weight.data)
else: else:
for w in m.parameters() : for w in m.parameters():
if w.requires_grad: if w.requires_grad:
if len(w.data.size())>1: if len(w.data.size()) > 1:
init_method(w.data) # weight init_method(w.data) # weight
else: else:
init.normal_(w.data) # bias init.normal_(w.data) # bias
# print("init else") # print("init else")
net.apply(weights_init) net.apply(weights_init)
def seq_mask(seq_len, max_len): def seq_mask(seq_len, max_len):
"""Create sequence mask.
:param seq_len: list of int, the lengths of sequences in a batch.
:param max_len: int, the maximum sequence length in a batch.
:return mask: torch.LongTensor, [batch_size, max_len]
"""
mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)] mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)]
mask = torch.stack(mask, 1) mask = torch.stack(mask, 1)
return mask return mask
"""
Codes from FudanParser. Not tested. Do not use !!!
"""
def expand_gt(gt):
"""expand_gt: Expand ground truth to matrix
Arguments:
gt: tensor of (n, l)
Return:
f: ground truth matrix of (n, l), $gt[i][j] = k$ leads to $f[i][j][k] = 1$.
"""
n, l = gt.shape
ret = torch.zeros(n, l, l).long()
for i in range(n):
ret[i][torch.arange(l).long(), gt[i]] = 1
return ret
def greedy_decoding(arc_f):
"""greedy_decoding
Arguments:
arc_f: a tensor in shape of (n, l+1, l+1)
length of the sentence is l and index 0 is <root>
Output:
arc_pred: a tensor in shape of (n, l), indicating the head words
"""
f_arc = arc_f[:, 1:, :] # ignore the root
_, arc_pred = torch.max(f_arc.data, dim=-1, keepdim=False)
return arc_pred
def mst_decoding(arc_f):
batch_size = arc_f.shape[0]
length = arc_f.shape[1]
arc_score = arc_f.data.cpu()
pred_collection = []
for i in range(batch_size):
head = mst(arc_score[i].numpy())
pred_collection.append(head[1:].reshape((1, length - 1)))
arc_pred = torch.LongTensor(np.concatenate(pred_collection, axis=0)).type_as(arc_f).long()
return arc_pred
def outer_product(features):
"""InterProduct: Get inter sequence product of features
Arguments:
features: feature vectors of sequence in the shape of (n, l, h)
Return:
f: product result in (n, l, l, h) shape
"""
n, l, c = features.shape
features = features.contiguous()
x = features.view(n, l, 1, c)
x = x.expand(n, l, l, c)
y = features.view(n, 1, l, c).contiguous()
y = y.expand(n, l, l, c)
return x * y
def outer_concat(features):
"""InterProduct: Get inter sequence concatenation of features
Arguments:
features: feature vectors of sequence in the shape of (n, l, h)
Return:
f: product result in (n, l, l, h) shape
"""
n, l, c = features.shape
x = features.contiguous().view(n, l, 1, c)
x = x.expand(n, l, l, c)
y = features.view(n, 1, l, c)
y = y.expand(n, l, l, c)
return torch.cat((x, y), dim=3)
def mst(scores):
"""
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692 # NOQA
"""
length = scores.shape[0]
min_score = scores.min() - 1
eye = np.eye(length)
scores = scores * (1 - eye) + min_score * eye
heads = np.argmax(scores, axis=1)
heads[0] = 0
tokens = np.arange(1, length)
roots = np.where(heads[tokens] == 0)[0] + 1
if len(roots) < 1:
root_scores = scores[tokens, 0]
head_scores = scores[tokens, heads[tokens]]
new_root = tokens[np.argmax(root_scores / head_scores)]
heads[new_root] = 0
elif len(roots) > 1:
root_scores = scores[roots, 0]
scores[roots, 0] = 0
new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1
new_root = roots[np.argmin(
scores[roots, new_heads] / root_scores)]
heads[roots] = new_heads
heads[new_root] = 0
edges = defaultdict(set)
vertices = set((0,))
for dep, head in enumerate(heads[tokens]):
vertices.add(dep + 1)
edges[head].add(dep + 1)
for cycle in _find_cycle(vertices, edges):
dependents = set()
to_visit = set(cycle)
while len(to_visit) > 0:
node = to_visit.pop()
if node not in dependents:
dependents.add(node)
to_visit.update(edges[node])
cycle = np.array(list(cycle))
old_heads = heads[cycle]
old_scores = scores[cycle, old_heads]
non_heads = np.array(list(dependents))
scores[np.repeat(cycle, len(non_heads)),
np.repeat([non_heads], len(cycle), axis=0).flatten()] = min_score
new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1
new_scores = scores[cycle, new_heads] / old_scores
change = np.argmax(new_scores)
changed_cycle = cycle[change]
old_head = old_heads[change]
new_head = new_heads[change]
heads[changed_cycle] = new_head
edges[new_head].add(changed_cycle)
edges[old_head].remove(changed_cycle)
return heads
def _find_cycle(vertices, edges):
"""
https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm # NOQA
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py # NOQA
"""
_index = 0
_stack = []
_indices = {}
_lowlinks = {}
_onstack = defaultdict(lambda: False)
_SCCs = []
def _strongconnect(v):
nonlocal _index
_indices[v] = _index
_lowlinks[v] = _index
_index += 1
_stack.append(v)
_onstack[v] = True
for w in edges[v]:
if w not in _indices:
_strongconnect(w)
_lowlinks[v] = min(_lowlinks[v], _lowlinks[w])
elif _onstack[w]:
_lowlinks[v] = min(_lowlinks[v], _indices[w])
if _lowlinks[v] == _indices[v]:
SCC = set()
while True:
w = _stack.pop()
_onstack[w] = False
SCC.add(w)
if not (w != v):
break
_SCCs.append(SCC)
for v in vertices:
if v not in _indices:
_strongconnect(v)
return [SCC for SCC in _SCCs if len(SCC) > 1]
# https://github.com/alykhantejani/nninit/blob/master/nninit.py
def orthogonal(tensor, gain=1):
"""Fills the input Tensor or Variable with a (semi) orthogonal matrix. The input tensor must have at least 2 dimensions,
and for tensors with more than 2 dimensions the trailing dimensions are flattened. viewed as 2D representation with
rows equal to the first dimension and columns equal to the product of as a sparse matrix, where the non-zero elements
will be drawn from a normal distribution with mean=0 and std=`std`.
Reference: "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks" - Saxe, A. et al.
Args:
tensor: a n-dimension torch.Tensor, where n >= 2
gain: optional gain to be applied
Examples:
>>> w = torch.Tensor(3, 5)
>>> nninit.orthogonal(w)
"""
if tensor.ndimension() < 2:
raise ValueError("Only tensors with 2 or more dimensions are supported.")
flattened_shape = (tensor.size(0), int(np.prod(tensor.detach().numpy().shape[1:])))
flattened = torch.Tensor(flattened_shape[0], flattened_shape[1]).normal_(0, 1)
u, s, v = np.linalg.svd(flattened.numpy(), full_matrices=False)
if u.shape == flattened.detach().numpy().shape:
tensor.view_as(flattened).copy_(torch.from_numpy(u))
else:
tensor.view_as(flattened).copy_(torch.from_numpy(v))
tensor.mul_(gain)
with torch.no_grad():
return tensor
def generate_step_dropout(masks, hidden_dim, step_dropout, training=False):
# assume batch first
# import pdb
# pdb.set_trace()
batch, length = masks.size()
if not training:
return torch.ones(batch, length, hidden_dim).fill_(1 - step_dropout).cuda(masks.device) * masks.view(batch,
length, 1)
masked = torch.zeros(batch, 1, hidden_dim).fill_(step_dropout)
masked = torch.bernoulli(masked).repeat(1, length, 1)
masked = masked.cuda(masks.device) * masks.view(batch, length, 1)
return masked
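The two helpers that survive this cleanup of modules/utils.py can be exercised as follows; a sketch, where the exact dtype of the mask is whatever torch.ge returns on the installed PyTorch version:

    import torch.nn as nn
    from fastNLP.modules.utils import initial_parameter, seq_mask

    print(seq_mask([3, 1, 2], max_len=3))
    # one row per sequence, 1 at real token positions and 0 at padding:
    # [[1, 1, 1],
    #  [1, 0, 0],
    #  [1, 1, 0]]

    net = nn.Sequential(nn.Conv1d(8, 16, kernel_size=3), nn.Linear(16, 4))
    initial_parameter(net, initial_method="xavier_uniform")  # unrecognised names fall back to xavier_normal_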

View File

@ -2,16 +2,23 @@ import torch
class ModelSaver(object): class ModelSaver(object):
"""Save a models""" """Save a model
Example::
saver = ModelSaver("./save/model_ckpt_100.pkl")
saver.save_pytorch(model)
"""
def __init__(self, save_path): def __init__(self, save_path):
"""
:param save_path: str, the path to the saving directory.
"""
self.save_path = save_path self.save_path = save_path
# TODO: check whether the path exist, if not exist, create it.
def save_pytorch(self, model): def save_pytorch(self, model):
""" """Save a pytorch model into .pkl file.
Save a pytorch model into .pkl file.
:param model: a PyTorch model :param model: a PyTorch model
:return:
""" """
torch.save(model.state_dict(), self.save_path) torch.save(model.state_dict(), self.save_path)

View File

@ -1,23 +1,15 @@
import os
import torch.nn.functional as F import torch.nn.functional as F
from fastNLP.loader.dataset_loader import ClassDatasetLoader as Dataset_loader
from fastNLP.loader.embed_loader import EmbedLoader as EmbedLoader
from fastNLP.loader.config_loader import ConfigSection
from fastNLP.loader.config_loader import ConfigLoader
from fastNLP.models.base_model import BaseModel
from fastNLP.core.preprocess import ClassPreprocess as Preprocess from fastNLP.core.preprocess import ClassPreprocess as Preprocess
from fastNLP.core.trainer import ClassificationTrainer from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.config_loader import ConfigLoader
from fastNLP.loader.config_loader import ConfigSection
from fastNLP.loader.dataset_loader import ClassDatasetLoader as Dataset_loader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules.aggregator.self_attention import SelfAttention
from fastNLP.modules.decoder.MLP import MLP
from fastNLP.modules.encoder.embedding import Embedding as Embedding from fastNLP.modules.encoder.embedding import Embedding as Embedding
from fastNLP.modules.encoder.lstm import Lstm from fastNLP.modules.encoder.lstm import Lstm
from fastNLP.modules.aggregation.self_attention import SelfAttention
from fastNLP.modules.decoder.MLP import MLP
train_data_path = 'small_train_data.txt' train_data_path = 'small_train_data.txt'
dev_data_path = 'small_dev_data.txt' dev_data_path = 'small_dev_data.txt'

View File

@ -32,7 +32,7 @@ def infer():
# fetch dictionary size and number of labels from pickle files # fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl") word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index) test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl") index2label = load_pickle(pickle_path, "class2id.pkl")
test_args["num_classes"] = len(index2label) test_args["num_classes"] = len(index2label)
@ -105,7 +105,7 @@ def test():
# fetch dictionary size and number of labels from pickle files # fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl") word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index) test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl") index2label = load_pickle(pickle_path, "class2id.pkl")
test_args["num_classes"] = len(index2label) test_args["num_classes"] = len(index2label)
# load dev data # load dev data

View File

@ -33,7 +33,7 @@ def infer():
# fetch dictionary size and number of labels from pickle files # fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl") word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index) test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl") index2label = load_pickle(pickle_path, "class2id.pkl")
test_args["num_classes"] = len(index2label) test_args["num_classes"] = len(index2label)
# Define the same model # Define the same model
@ -105,7 +105,7 @@ def test():
# fetch dictionary size and number of labels from pickle files # fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl") word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index) test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl") index2label = load_pickle(pickle_path, "class2id.pkl")
test_args["num_classes"] = len(index2label) test_args["num_classes"] = len(index2label)
# load dev data # load dev data

View File

@ -4,6 +4,7 @@ import unittest
from fastNLP.core.predictor import Predictor from fastNLP.core.predictor import Predictor
from fastNLP.core.preprocess import save_pickle from fastNLP.core.preprocess import save_pickle
from fastNLP.models.sequence_modeling import SeqLabeling from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.core.vocabulary import Vocabulary
class TestPredictor(unittest.TestCase): class TestPredictor(unittest.TestCase):
@ -23,10 +24,14 @@ class TestPredictor(unittest.TestCase):
['a', 'b', 'c', 'd', '$'], ['a', 'b', 'c', 'd', '$'],
['!', 'b', 'c', 'd', 'e'] ['!', 'b', 'c', 'd', 'e']
] ]
vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
vocab = Vocabulary()
vocab.word2idx = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
class_vocab = Vocabulary()
class_vocab.word2idx = {"0":0, "1":1, "2":2, "3":3, "4":4}
os.system("mkdir save") os.system("mkdir save")
save_pickle({0: "0", 1: "1", 2: "2", 3: "3", 4: "4"}, "./save/", "id2class.pkl") save_pickle(class_vocab, "./save/", "class2id.pkl")
save_pickle(vocab, "./save/", "word2id.pkl") save_pickle(vocab, "./save/", "word2id.pkl")
model = SeqLabeling(model_args) model = SeqLabeling(model_args)

30
test/core/test_sampler.py Normal file
View File

@ -0,0 +1,30 @@
import torch
from fastNLP.core.sampler import convert_to_torch_tensor, SequentialSampler, RandomSampler
def test_convert_to_torch_tensor():
data = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [1, 3, 4, 5, 2]]
ans = convert_to_torch_tensor(data, False)
assert isinstance(ans, torch.Tensor)
assert tuple(ans.shape) == (3, 5)
def test_sequential_sampler():
sampler = SequentialSampler()
data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10]
for idx, i in enumerate(sampler(data)):
assert idx == i
def test_random_sampler():
sampler = RandomSampler()
data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10]
ans = [data[i] for i in sampler(data)]
assert len(ans) == len(data)
for d in ans:
assert d in data
if __name__ == "__main__":
test_sequential_sampler()

31
test/core/test_vocab.py Normal file
View File

@ -0,0 +1,31 @@
import unittest
from fastNLP.core.vocabulary import Vocabulary, DEFAULT_WORD_TO_INDEX
class TestVocabulary(unittest.TestCase):
def test_vocab(self):
import _pickle as pickle
import os
vocab = Vocabulary()
filename = 'vocab'
vocab.update(filename)
vocab.update([filename, ['a'], [['b']], ['c']])
idx = vocab[filename]
before_pic = (vocab.to_word(idx), vocab[filename])
with open(filename, 'wb') as f:
pickle.dump(vocab, f)
with open(filename, 'rb') as f:
vocab = pickle.load(f)
os.remove(filename)
vocab.build_reverse_vocab()
after_pic = (vocab.to_word(idx), vocab[filename])
TRUE_DICT = {'vocab': 5, 'a': 6, 'b': 7, 'c': 8}
TRUE_DICT.update(DEFAULT_WORD_TO_INDEX)
TRUE_IDXDICT = {0: '<pad>', 1: '<unk>', 2: '<reserved-2>', 3: '<reserved-3>', 4: '<reserved-4>', 5: 'vocab', 6: 'a', 7: 'b', 8: 'c'}
self.assertEqual(before_pic, after_pic)
self.assertDictEqual(TRUE_DICT, vocab.word2idx)
self.assertDictEqual(TRUE_IDXDICT, vocab.idx2word)
if __name__ == '__main__':
unittest.main()
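A minimal sketch of the Vocabulary calls exercised by the test above (same methods, no additional API assumed):

    vocab = Vocabulary()
    vocab.update(["hello", "world", ["nested", ["lists"]], "are", "flattened"])
    idx = vocab["hello"]           # word -> index; defaults such as <pad>/<unk> occupy 0-4
    vocab.build_reverse_vocab()    # populate idx2word for reverse lookup
    word = vocab.to_word(idx)      # index -> word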

View File

@ -0,0 +1,15 @@
1 I _ PRP PRP _ 2 SUB
2 solved _ VBD VBD _ 0 ROOT
3 the _ DT DT _ 4 NMOD
4 problem _ NN NN _ 2 OBJ
5 with _ IN IN _ 2 VMOD
6 statistics _ NNS NNS _ 5 PMOD
7 . _ . . _ 2 P
1 I _ PRP PRP _ 2 SUB
2 solved _ VBD VBD _ 0 ROOT
3 the _ DT DT _ 4 NMOD
4 problem _ NN NN _ 2 OBJ
5 with _ IN IN _ 2 VMOD
6 statistics _ NNS NNS _ 5 PMOD
7 . _ . . _ 2 P

View File

@ -0,0 +1,27 @@
19980101-01-001-001/m 迈向/v 充满/v 希望/n 的/u 新/a 世纪/n ——/w 一九九八年/t 新年/t 讲话/n /w 附/v 图片/n /m 张/q /w
19980101-01-001-002/m 中共中央/nt 总书记/n 、/w 国家/n 主席/n 江/nr 泽民/nr
19980101-01-001-003/m /w 一九九七年/t 十二月/t 三十一日/t /w
19980101-01-001-004/m 12月/t 31日/t /w 中共中央/nt 总书记/n 、/w 国家/n 主席/n 江/nr 泽民/nr 发表/v 1998年/t 新年/t 讲话/n 《/w 迈向/v 充满/v 希望/n 的/u 新/a 世纪/n 》/w 。/w /w 新华社/nt 记者/n 兰/nr 红光/nr 摄/Vg /w
19980101-01-001-005/m 同胞/n 们/k 、/w 朋友/n 们/k 、/w 女士/n 们/k 、/w 先生/n 们/k /w
19980101-01-001-006/m 在/p 1998年/t 来临/v 之际/f /w 我/r 十分/m 高兴/a 地/u 通过/p [中央/n 人民/n 广播/vn 电台/n]nt 、/w [中国/ns 国际/n 广播/vn 电台/n]nt 和/c [中央/n 电视台/n]nt /w 向/p 全国/n 各族/r 人民/n /w 向/p [香港/ns 特别/a 行政区/n]ns 同胞/n 、/w 澳门/ns 和/c 台湾/ns 同胞/n 、/w 海外/s 侨胞/n /w 向/p 世界/n 各国/r 的/u 朋友/n 们/k /w 致以/v 诚挚/a 的/u 问候/vn 和/c 良好/a 的/u 祝愿/vn /w
19980101-01-001-007/m 1997年/t /w 是/v 中国/ns 发展/vn 历史/n 上/f 非常/d 重要/a 的/u 很/d 不/d 平凡/a 的/u 一/m 年/q 。/w 中国/ns 人民/n 决心/d 继承/v 邓/nr 小平/nr 同志/n 的/u 遗志/n /w 继续/v 把/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 事业/n 推向/v 前进/v 。/w [中国/ns 政府/n]nt 顺利/ad 恢复/v 对/p 香港/ns 行使/v 主权/n /w 并/c 按照/p “/w 一国两制/j ”/w 、/w “/w 港人治港/l ”/w 、/w 高度/d 自治/v 的/u 方针/n 保持/v 香港/ns 的/u 繁荣/an 稳定/an 。/w [中国/ns 共产党/n]nt 成功/a 地/u 召开/v 了/u 第十五/m 次/q 全国/n 代表大会/n /w 高举/v 邓小平理论/n 伟大/a 旗帜/n /w 总结/v 百年/m 历史/n /w 展望/v 新/a 的/u 世纪/n /w 制定/v 了/u 中国/ns 跨/v 世纪/n 发展/v 的/u 行动/vn 纲领/n 。/w
19980101-01-001-008/m 在/p 这/r 一/m 年/q 中/f /w 中国/ns 的/u 改革/vn 开放/vn 和/c 现代化/vn 建设/vn 继续/v 向前/v 迈进/v 。/w 国民经济/n 保持/v 了/u “/w 高/a 增长/vn 、/w 低/a 通胀/j ”/w 的/u 良好/a 发展/vn 态势/n 。/w 农业/n 生产/vn 再次/d 获得/v 好/a 的/u 收成/n /w 企业/n 改革/vn 继续/v 深化/v /w 人民/n 生活/vn 进一步/d 改善/v 。/w 对外/vn 经济/n 技术/n 合作/vn 与/c 交流/vn 不断/d 扩大/v 。/w 民主/a 法制/n 建设/vn 、/w 精神文明/n 建设/vn 和/c 其他/r 各项/r 事业/n 都/d 有/v 新/a 的/u 进展/vn 。/w 我们/r 十分/m 关注/v 最近/t 一个/m 时期/n 一些/m 国家/n 和/c 地区/n 发生/v 的/u 金融/n 风波/n /w 我们/r 相信/v 通过/p 这些/r 国家/n 和/c 地区/n 的/u 努力/an 以及/c 有关/v 的/u 国际/n 合作/vn /w 情况/n 会/v 逐步/d 得到/v 缓解/vn 。/w 总的来说/c /w 中国/ns 改革/v 和/c 发展/v 的/u 全局/n 继续/v 保持/v 了/u 稳定/an 。/w
19980101-01-001-009/m 在/p 这/r 一/m 年/q 中/f /w 中国/ns 的/u 外交/n 工作/vn 取得/v 了/u 重要/a 成果/n 。/w 通过/p 高层/n 互访/v /w 中国/ns 与/p 美国/ns 、/w 俄罗斯/ns 、/w 法国/ns 、/w 日本/ns 等/u 大国/n 确定/v 了/u 双方/n 关系/n 未来/t 发展/v 的/u 目标/n 和/c 指导/vn 方针/n 。/w 中国/ns 与/p 周边/n 国家/n 和/c 广大/b 发展中国家/l 的/u 友好/a 合作/vn 进一步/d 加强/v 。/w 中国/ns 积极/ad 参与/v [亚/j 太/j 经合/j 组织/n]nt 的/u 活动/vn /w 参加/v 了/u 东盟/ns —/w 中/j 日/j 韩/j 和/c 中国/ns —/w 东盟/ns 首脑/n 非正式/b 会晤/vn 。/w 这些/r 外交/n 活动/vn /w 符合/v 和平/n 与/c 发展/v 的/u 时代/n 主题/n /w 顺应/v 世界/n 走向/v 多极化/v 的/u 趋势/n /w 对于/p 促进/v 国际/n 社会/n 的/u 友好/a 合作/vn 和/c 共同/b 发展/vn 作出/v 了/u 积极/a 的/u 贡献/n 。/w
19980101-01-001-010/m 1998年/t /w 中国/ns 人民/n 将/d 满怀信心/l 地/u 开创/v 新/a 的/u 业绩/n 。/w 尽管/c 我们/r 在/p 经济/n 社会/n 发展/v 中/f 还/d 面临/v 不少/m 困难/an /w 但/c 我们/r 有/v 邓小平理论/n 的/u 指引/vn /w 有/v 改革/v 开放/v 近/a /m 年/q 来/f 取得/v 的/u 伟大/a 成就/n 和/c 积累/v 的/u 丰富/a 经验/n /w 还/d 有/v 其他/r 的/u 各种/r 有利/a 条件/n /w 我们/r 一定/d 能够/v 克服/v 这些/r 困难/an /w 继续/v 稳步前进/l 。/w 只要/c 我们/r 进一步/d 解放思想/i /w 实事求是/i /w 抓住/v 机遇/n /w 开拓进取/l /w 建设/v 有/v 中国/ns 特色/n 社会主义/n 的/u 道路/n 就/c 会/v 越/d 走/v 越/d 宽广/a 。/w
19980101-01-001-011/m 实现/v 祖国/n 的/u 完全/a 统一/vn /w 是/v 海内外/s 全体/n 中国/ns 人/n 的/u 共同/b 心愿/n 。/w 通过/p 中/j 葡/j 双方/n 的/u 合作/vn 和/c 努力/an /w 按照/p “/w 一国两制/j ”/w 方针/n 和/c 澳门/ns 《/w 基本法/n 》/w /w 1999年/t 12月/t 澳门/ns 的/u 回归/vn 一定/d 能够/v 顺利/ad 实现/v 。/w
19980101-01-001-012/m 台湾/ns 是/v 中国/ns 领土/n 不可分割/l 的/u 一/m 部分/n 。/w 完成/v 祖国/n 统一/vn /w 是/v 大势所趋/i /w 民心所向/l 。/w 任何/r 企图/v 制造/v “/w 两/m 个/q 中国/ns ”/w 、/w “/w 一中一台/j ”/w 、/w “/w 台湾/ns 独立/v ”/w 的/u 图谋/n /w 都/d 注定/v 要/v 失败/v 。/w 希望/v 台湾/ns 当局/n 以/p 民族/n 大义/n 为重/v /w 拿/v 出/v 诚意/n /w 采取/v 实际/a 的/u 行动/vn /w 推动/v 两岸/n 经济/n 文化/n 交流/vn 和/c 人员/n 往来/vn /w 促进/v 两岸/n 直接/ad 通邮/v 、/w 通航/v 、/w 通商/v 的/u 早日/d 实现/v /w 并/c 尽早/d 回应/v 我们/r 发出/v 的/u 在/p 一个/m 中国/ns 的/u 原则/n 下/f 两岸/n 进行/v 谈判/vn 的/u 郑重/a 呼吁/vn 。/w
19980101-01-001-013/m 环顾/v 全球/n /w 日益/d 密切/a 的/u 世界/n 经济/n 联系/vn /w 日新月异/i 的/u 科技/n 进步/vn /w 正在/d 为/p 各国/r 经济/n 的/u 发展/vn 提供/v 历史/n 机遇/n 。/w 但是/c /w 世界/n 还/d 不/d 安宁/a 。/w 南北/f 之间/f 的/u 贫富/n 差距/n 继续/v 扩大/v /w 局部/n 冲突/vn 时有发生/l /w 不/d 公正/a 不/d 合理/a 的/u 旧/a 的/u 国际/n 政治/n 经济/n 秩序/n 还/d 没有/v 根本/a 改变/vn /w 发展中国家/l 在/p 激烈/a 的/u 国际/n 经济/n 竞争/vn 中/f 仍/d 处于/v 弱势/n 地位/n /w 人类/n 的/u 生存/vn 与/c 发展/vn 还/d 面临/v 种种/q 威胁/vn 和/c 挑战/vn 。/w 和平/n 与/c 发展/vn 的/u 前景/n 是/v 光明/a 的/u /w /m 世纪/n 将/d 是/v 充满/v 希望/n 的/u 世纪/n 。/w 但/c 前进/v 的/u 道路/n 不/d 会/v 也/d 不/d 可能/v 一帆风顺/i /w 关键/n 是/v 世界/n 各国/r 人民/n 要/v 进一步/d 团结/a 起来/v /w 共同/d 推动/v 早日/d 建立/v 公正/a 合理/a 的/u 国际/n 政治/n 经济/n 新/a 秩序/n 。/w
19980101-01-001-014/m [中国/ns 政府/n]nt 将/d 继续/v 坚持/v 奉行/v 独立自主/i 的/u 和平/n 外交/n 政策/n /w 在/p 和平共处/l 五/m 项/q 原则/n 的/u 基础/n 上/f 努力/ad 发展/v 同/p 世界/n 各国/r 的/u 友好/a 关系/n 。/w 中国/ns 愿意/v 加强/v 同/p 联合国/nt 和/c 其他/r 国际/n 组织/n 的/u 协调/vn /w 促进/v 在/p 扩大/v 经贸/j 科技/n 交流/vn 、/w 保护/v 环境/n 、/w 消除/v 贫困/an 、/w 打击/v 国际/n 犯罪/vn 等/u 方面/n 的/u 国际/n 合作/vn 。/w 中国/ns 永远/d 是/v 维护/v 世界/n 和平/n 与/c 稳定/an 的/u 重要/a 力量/n 。/w 中国/ns 人民/n 愿/v 与/p 世界/n 各国/r 人民/n 一道/d /w 为/p 开创/v 持久/a 和平/n 、/w 共同/d 发展/v 的/u 新/a 世纪/n 而/c 不懈努力/l /w
19980101-01-001-015/m 在/p 这/r 辞旧迎新/l 的/u 美好/a 时刻/n /w 我/r 祝/v 大家/r 新年/t 快乐/a /w 家庭/n 幸福/a /w
19980101-01-001-016/m 谢谢/v /w /w 新华社/nt 北京/ns 12月/t 31日/t 电/n /w
19980101-01-002-001/m 在/p 十五大/j 精神/n 指引/vn 下/f 胜利/vd 前进/v ——/w 元旦/t 献辞/n
19980101-01-002-002/m 我们/r 即将/d 以/p 丰收/vn 的/u 喜悦/an 送/v 走/v 牛年/t /w 以/p 昂扬/a 的/u 斗志/n 迎来/v 虎年/t 。/w 我们/r 伟大/a 祖国/n 在/p 新/a 的/u 一/m 年/q /w 将/d 是/v 充满/v 生机/n 、/w 充满/v 希望/n 的/u 一/m 年/q 。/w
19980101-01-002-003/m 刚刚/d 过去/v 的/u 一/m 年/q /w 大气磅礴/i /w 波澜壮阔/i 。/w 在/p 这/r 一/m 年/q /w 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt /w 继承/v 邓/nr 小平/nr 同志/n 的/u 遗志/n /w 高举/v 邓小平理论/n 的/u 伟大/a 旗帜/n /w 领导/v 全党/n 和/c 全国/n 各族/r 人民/n 坚定不移/i 地/u 沿着/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 道路/n 阔步/d 前进/v /w 写/v 下/v 了/u 改革/v 开放/v 和/c 社会主义/n 现代化/vn 建设/vn 的/u 辉煌/a 篇章/n 。/w 顺利/a 地/u 恢复/v 对/p 香港/ns 行使/v 主权/n /w 胜利/v 地/u 召开/v 党/n 的/u 第十五/m 次/q 全国/n 代表大会/n ———/w 两/m 件/q 大事/n 办/v 得/u 圆满/a 成功/a 。/w 国民经济/n 稳中求进/l /w 国家/n 经济/n 实力/n 进一步/d 增强/v /w 人民/n 生活/vn 继续/v 改善/v /w 对外/vn 经济/n 技术/n 交流/vn 日益/d 扩大/v 。/w 在/p 国际/n 金融/n 危机/n 的/u 风浪/n 波及/v 许多/m 国家/n 的/u 情况/n 下/f /w 我国/n 保持/v 了/u 金融/n 形势/n 和/c 整个/b 经济/n 形势/n 的/u 稳定/a 发展/vn 。/w 社会主义/n 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn 取得/v 新/a 的/u 成绩/n /w 各项/r 社会/n 事业/n 全面/ad 进步/v 。/w 外交/n 工作/vn 取得/v 可喜/a 的/u 突破/vn /w 我国/n 的/u 国际/n 地位/n 和/c 国际/n 威望/n 进一步/d 提高/v 。/w 实践/v 使/v 亿万/m 人民/n 对/p 邓小平理论/n 更加/d 信仰/v /w 对/p 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt 更加/d 信赖/v /w 对/p 伟大/a 祖国/n 的/u 光辉/n 前景/n 更加/d 充满/v 信心/n 。/w
19980101-01-002-004/m 1998年/t /w 是/v 全面/ad 贯彻/v 落实/v 党/n 的/u 十五大/j 提出/v 的/u 任务/n 的/u 第一/m 年/q /w 各/r 条/q 战线/n 改革/v 和/c 发展/v 的/u 任务/n 都/d 十分/m 繁重/a /w 有/v 许多/m 深/a 层次/n 的/u 矛盾/an 和/c 问题/n 有待/v 克服/v 和/c 解决/v /w 特别/d 是/v 国有/vn 企业/n 改革/vn 已经/d 进入/v 攻坚/vn 阶段/n 。/w 我们/r 必须/d 进一步/d 深入/ad 学习/v 和/c 掌握/v 党/n 的/u 十五大/j 精神/n /w 统揽全局/l /w 精心/ad 部署/v /w 狠抓/v 落实/v /w 团结/a 一致/a /w 艰苦奋斗/i /w 开拓/v 前进/v /w 为/p 夺取/v 今年/t 改革/v 开放/v 和/c 社会主义/n 现代化/vn 建设/vn 的/u 新/a 胜利/vn 而/c 奋斗/v 。/w
19980101-01-002-005/m 今年/t 是/v 党/n 的/u 十一/m 届/q 三中全会/j 召开/v /m 周年/q /w 是/v 我们/r 党/n 和/c 国家/n 实现/v 伟大/a 的/u 历史/n 转折/vn 、/w 进入/v 改革/vn 开放/vn 历史/n 新/a 时期/n 的/u /m 周年/q 。/w 在/p 新/a 的/u 一/m 年/q 里/f /w 大力/d 发扬/v 十一/m 届/q 三中全会/j 以来/f 我们/r 党/n 所/u 恢复/v 的/u 优良/z 传统/n 和/c 在/p 新/a 的/u 历史/n 条件/n 下/f 形成/v 的/u 优良/z 作风/n /w 对于/p 完成/v 好/a 今年/t 的/u 各项/r 任务/n 具有/v 十分/m 重要/a 的/u 意义/n 。/w
19980101-01-002-006/m 我们/r 要/v 更/d 好/a 地/u 坚持/v 解放思想/i 、/w 实事求是/i 的/u 思想/n 路线/n 。/w 解放思想/i 、/w 实事求是/i /w 是/v 邓小平理论/n 的/u 精髓/n 。/w 实践/v 证明/v /w 只有/c 解放思想/i 、/w 实事求是/i /w 才/c 能/v 冲破/v 各种/r 不/d 切合/v 实际/n 的/u 或者/c 过时/a 的/u 观念/n 的/u 束缚/vn /w 真正/d 做到/v 尊重/v 、/w 认识/v 和/c 掌握/v 客观/a 规律/n /w 勇于/v 突破/v /w 勇于/v 创新/v /w 不断/d 开创/v 社会主义/n 现代化/vn 建设/vn 的/u 新/a 局面/n 。/w 党/n 的/u 十五大/j 是/v 我们/r 党/n 解放思想/i 、/w 实事求是/i 的/u 新/a 的/u 里程碑/n 。/w 进一步/d 认真/ad 学习/v 和/c 掌握/v 十五大/j 精神/n /w 解放思想/i 、/w 实事求是/i /w 我们/r 的/u 各项/r 事业/n 就/d 能/v 结/v 出/v 更加/d 丰硕/a 的/u 成果/n 。/w
19980101-01-002-007/m 我们/r 要/v 更/d 好/a 地/u 坚持/v 以/p 经济/n 建设/vn 为/v 中心/n 。/w 各项/r 工作/vn 必须/d 以/p 经济/n 建设/vn 为/v 中心/n /w 是/v 邓小平理论/n 的/u 基本/a 观点/n /w 是/v 党/n 的/u 基本/a 路线/n 的/u 核心/n 内容/n /w 近/a /m 年/q 来/f 的/u 实践/vn 证明/v /w 坚持/v 这个/r 中心/n /w 是/v 完全/ad 正确/a 的/u 。/w 今后/t /w 我们/r 能否/v 把/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 伟大/a 事业/n 全面/ad 推向/v /m 世纪/n /w 关键/n 仍然/d 要/v 看/v 能否/v 把/p 经济/n 工作/vn 搞/v 上去/v 。/w 各级/r 领导/n 干部/n 要/v 切实/ad 把/p 精力/n 集中/v 到/v 贯彻/v 落实/v 好/a 中央/n 关于/p 今年/t 经济/n 工作/vn 的/u 总体/n 要求/n 和/c 各项/r 重要/a 任务/n 上/f 来/v /w 不断/d 提高/v 领导/v 经济/n 建设/vn 的/u 能力/n 和/c 水平/n 。/w
19980101-01-002-008/m 我们/r 要/v 更/d 好/a 地/u 坚持/v “/w 两手抓/l 、/w 两手/m 都/d 要/v 硬/a ”/w 的/u 方针/n 。/w 在/p 坚持/v 以/p 经济/n 建设/vn 为/v 中心/n 的/u 同时/n /w 积极/ad 推进/v 社会主义/n 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn /w 是/v 建设/v 富强/a 、/w 民主/a 、/w 文明/a 的/u 社会主义/n 现代化/vn 国家/n 的/u 重要/a 内容/n 。/w 实践/v 证明/v /w 经济/n 建设/vn 的/u 顺利/a 进行/vn /w 离/v 不/d 开/v 精神文明/n 建设/vn 和/c 民主/a 法制/n 建设/vn 的/u 保证/vn 。/w 党/n 的/u 十五大/j 依据/p 邓小平理论/n 和/c 党/n 的/u 基本/a 路线/n 提出/v 的/u 党/n 在/p 社会主义/n 初级/b 阶段/n 经济/n 、/w 政治/n 、/w 文化/n 的/u 基本/a 纲领/n /w 为/p “/w 两手抓/l 、/w 两手/m 都/d 要/v 硬/a ”/w 提供/v 了/u 新/a 的/u 理论/n 根据/n /w 提出/v 了/u 更/d 高/a 要求/n /w 现在/t 的/u 关键/n 是/v 认真/ad 抓好/v 落实/v 。/w
19980101-01-002-009/m 我们/r 要/v 更/d 好/a 地/u 发扬/v 求真务实/l 、/w 密切/ad 联系/v 群众/n 的/u 作风/n 。/w 这/r 是/v 把/p 党/n 的/u 方针/n 、/w 政策/n 落到实处/l /w 使/v 改革/v 和/c 建设/v 取得/v 胜利/vn 的/u 重要/a 保证/vn 。/w 在/p 当前/t 改革/v 进一步/d 深化/v /w 经济/n 不断/d 发展/v /w 同时/c 又/d 出现/v 一些/m 新/a 情况/n 、/w 新/a 问题/n 和/c 新/a 困难/an 的/u 形势/n 下/f /w 更/d 要/v 发扬/v 这样/r 的/u 好/a 作风/n 。/w 要/v 尊重/v 群众/n 的/u 意愿/n /w 重视/v 群众/n 的/u 首创/vn 精神/n /w 关心/v 群众/n 的/u 生活/vn 疾苦/n 。/w 江/nr 泽民/nr 同志/n 最近/t 强调/vd 指出/v /w 要/v 大力/d 倡导/v 说实话/l 、/w 办/v 实事/n 、/w 鼓/v 实劲/n 、/w 讲/v 实效/n 的/u 作风/n /w 坚决/ad 制止/v 追求/v 表面文章/i /w 搞/v 花架子/n 等/u 形式主义/n /w 坚决/ad 杜绝/v 脱离/v 群众/n 、/w 脱离/v 实际/n 、/w 浮躁/a 虚夸/v 等/u 官僚主义/n 。/w 这/r 是/v 非常/d 重要/a 的/u 。/w 因此/c /w 各级/r 领导/n 干部/n 务必/d 牢记/v 全心全意/i 为/p 人民/n 服务/v 的/u 宗旨/n /w 在/p 勤政廉政/l 、/w 艰苦奋斗/i 方面/n 以身作则/i /w 当/v 好/a 表率/n 。/w
19980101-01-002-010/m /m /w 瞩目/v 中华/nz 。/w 新/a 的/u 机遇/n 和/c 挑战/vn /w 催/v 人/n 进取/v /w 新/a 的/u 目标/n 和/c 征途/n /w 催/v 人/n 奋发/v 。/w 英雄/n 的/u 中国/ns 人民/n 在/p 以/p 江/nr 泽民/nr 同志/n 为/v 核心/n 的/u 党中央/nt 坚强/a 领导/vn 和/c 党/n 的/u 十五大/j 精神/n 指引/v 下/f /w 更/d 高/a 地/u 举起/v 邓小平理论/n 的/u 伟大/a 旗帜/n /w 团结/a 一致/a /w 扎实/ad 工作/v /w 奋勇/d 前进/v /w 一定/d 能够/v 创造/v 出/v 更加/d 辉煌/a 的/u 业绩/n /w

View File

@ -4,7 +4,6 @@ import os
import unittest import unittest
from fastNLP.loader.config_loader import ConfigSection, ConfigLoader from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, POSDatasetLoader, LMDatasetLoader
class TestConfigLoader(unittest.TestCase): class TestConfigLoader(unittest.TestCase):
@ -52,21 +51,3 @@ class TestConfigLoader(unittest.TestCase):
print("pass config test!") print("pass config test!")
class TestDatasetLoader(unittest.TestCase):
def test_case_TokenizeDatasetLoader(self):
loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8")
data = loader.load_pku(max_seq_len=32)
print("pass TokenizeDatasetLoader test!")
def test_case_POSDatasetLoader(self):
loader = POSDatasetLoader("./test/data_for_tests/people.txt")
data = loader.load()
datas = loader.load_lines()
print("pass POSDatasetLoader test!")
def test_case_LMDatasetLoader(self):
loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8")
data = loader.load()
datas = loader.load_lines()
print("pass TokenizeDatasetLoader test!")

View File

@ -0,0 +1,42 @@
import unittest
from fastNLP.loader.dataset_loader import POSDatasetLoader, LMDatasetLoader, TokenizeDatasetLoader, \
PeopleDailyCorpusLoader, ConllLoader
class TestDatasetLoader(unittest.TestCase):
def test_case_1(self):
data = """Tom\tT\nand\tF\nJerry\tT\n.\tF\n\nHello\tT\nworld\tF\n!\tF"""
lines = data.split("\n")
answer = POSDatasetLoader.parse(lines)
truth = [[["Tom", "and", "Jerry", "."], ["T", "F", "T", "F"]], [["Hello", "world", "!"], ["T", "F", "F"]]]
self.assertListEqual(answer, truth, "POS Dataset Loader")
def test_case_TokenizeDatasetLoader(self):
loader = TokenizeDatasetLoader("./test/data_for_tests/cws_pku_utf_8")
data = loader.load_pku(max_seq_len=32)
print("pass TokenizeDatasetLoader test!")
def test_case_POSDatasetLoader(self):
loader = POSDatasetLoader("./test/data_for_tests/people.txt")
data = loader.load()
datas = loader.load_lines()
print("pass POSDatasetLoader test!")
def test_case_LMDatasetLoader(self):
loader = LMDatasetLoader("./test/data_for_tests/cws_pku_utf_8")
data = loader.load()
datas = loader.load_lines()
print("pass TokenizeDatasetLoader test!")
def test_PeopleDailyCorpusLoader(self):
loader = PeopleDailyCorpusLoader("./test/data_for_tests/people_daily_raw.txt")
_, _ = loader.load()
def test_ConllLoader(self):
loader = ConllLoader("./test/data_for_tests/conll_example.txt")
_ = loader.load()
if __name__ == '__main__':
unittest.main()

View File

@ -1,24 +0,0 @@
import unittest
from fastNLP.loader.dataset_loader import POSDatasetLoader
class TestPreprocess(unittest.TestCase):
def test_case_1(self):
data = [[["Tom", "and", "Jerry", "."], ["T", "F", "T", "F"]],
["Hello", "world", "!"], ["T", "F", "F"]]
pickle_path = "./data_for_tests/"
# POSPreprocess(data, pickle_path)
class TestDatasetLoader(unittest.TestCase):
def test_case_1(self):
data = """Tom\tT\nand\tF\nJerry\tT\n.\tF\n\nHello\tT\nworld\tF\n!\tF"""
lines = data.split("\n")
answer = POSDatasetLoader.parse(lines)
truth = [[["Tom", "and", "Jerry", "."], ["T", "F", "T", "F"]], [["Hello", "world", "!"], ["T", "F", "F"]]]
self.assertListEqual(answer, truth, "POS Dataset Loader")
if __name__ == '__main__':
unittest.main()

View File

@ -38,7 +38,7 @@ def infer():
# fetch dictionary size and number of labels from pickle files # fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl") word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index) test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl") index2label = load_pickle(pickle_path, "class2id.pkl")
test_args["num_classes"] = len(index2label) test_args["num_classes"] = len(index2label)
# Define the same model # Define the same model

View File

@ -1,74 +1,61 @@
import sys import os
sys.path.append("..")
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.core.predictor import Predictor from fastNLP.core.predictor import Predictor
from fastNLP.core.preprocess import Preprocessor, load_pickle
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.saver.model_saver import ModelSaver
data_name = "pku_training.utf8" data_name = "pku_training.utf8"
# cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8" cws_data_path = "test/data_for_tests/cws_pku_utf_8"
cws_data_path = "data_for_tests/cws_pku_utf_8" pickle_path = "./save/"
pickle_path = "data_for_tests" data_infer_path = "test/data_for_tests/people_infer.txt"
data_infer_path = "data_for_tests/people_infer.txt" config_path = "test/data_for_tests/config"
def infer(): def infer():
# Load infer configuration, the same as test # Load infer configuration, the same as test
test_args = ConfigSection() test_args = ConfigSection()
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) ConfigLoader("config.cfg").load_config(config_path, {"POS_infer": test_args})
# fetch dictionary size and number of labels from pickle files # fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl") word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index) test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "id2class.pkl") index2label = load_pickle(pickle_path, "class2id.pkl")
test_args["num_classes"] = len(index2label) test_args["num_classes"] = len(index2label)
# Define the same model # Define the same model
model = SeqLabeling(test_args) model = SeqLabeling(test_args)
# Dump trained parameters into the model # Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print("model loaded!") print("model loaded!")
# Data Loader # Data Loader
raw_data_loader = BaseLoader(data_infer_path) raw_data_loader = BaseLoader(data_infer_path)
infer_data = raw_data_loader.load_lines() infer_data = raw_data_loader.load_lines()
"""
Transform strings into list of list of strings.
[
[word_11, word_12, ...],
[word_21, word_22, ...],
...
]
In this case, each line in "people_infer.txt" is already a sentence. So load_lines() just splits them.
"""
# Inference interface # Inference interface
infer = Predictor(pickle_path) infer = Predictor(pickle_path, "seq_label")
results = infer.predict(model, infer_data) results = infer.predict(model, infer_data)
print(results) print(results)
print("Inference finished!")
def train_test(): def train_test():
# Config Loader # Config Loader
train_args = ConfigSection() train_args = ConfigSection()
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args}) ConfigLoader("config.cfg").load_config(config_path, {"POS_infer": train_args})
# Data Loader # Data Loader
loader = TokenizeDatasetLoader(cws_data_path) loader = TokenizeDatasetLoader(cws_data_path)
train_data = loader.load_pku() train_data = loader.load_pku()
# Preprocessor # Preprocessor
p = SeqLabelPreprocess() p = Preprocessor(label_is_seq=True)
data_train = p.run(train_data, pickle_path=pickle_path) data_train = p.run(train_data, pickle_path=pickle_path)
train_args["vocab_size"] = p.vocab_size train_args["vocab_size"] = p.vocab_size
train_args["num_classes"] = p.num_classes train_args["num_classes"] = p.num_classes
@ -81,12 +68,10 @@ def train_test():
# Start training # Start training
trainer.train(model, data_train) trainer.train(model, data_train)
print("Training finished!")
# Saver # Saver
saver = ModelSaver("./data_for_tests/saved_model.pkl") saver = ModelSaver("./save/saved_model.pkl")
saver.save_pytorch(model) saver.save_pytorch(model)
print("Model saved!")
del model, trainer, loader del model, trainer, loader
@ -94,12 +79,11 @@ def train_test():
model = SeqLabeling(train_args) model = SeqLabeling(train_args)
# Dump trained parameters into the model # Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl") ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print("model loaded!")
# Load test configuration # Load test configuration
test_args = ConfigSection() test_args = ConfigSection()
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) ConfigLoader("config.cfg").load_config(config_path, {"POS_infer": test_args})
# Tester # Tester
tester = SeqLabelTester(**test_args.data) tester = SeqLabelTester(**test_args.data)
@ -109,7 +93,13 @@ def train_test():
# print test results # print test results
print(tester.show_metrics()) print(tester.show_metrics())
print("model tested!")
def test():
os.makedirs("save", exist_ok=True)
train_test()
infer()
os.system("rm -rf save")
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -1,7 +1,6 @@
import unittest
import torch import torch
import unittest
from fastNLP.modules.other_modules import GroupNorm, LayerNormalization, BiLinear from fastNLP.modules.other_modules import GroupNorm, LayerNormalization, BiLinear

View File

@ -1,18 +1,9 @@
import torch
import numpy as np
import unittest import unittest
import fastNLP.modules.utils as utils
class TestUtils(unittest.TestCase): class TestUtils(unittest.TestCase):
def test_case_1(self): def test_case_1(self):
a = torch.tensor([ pass
[1, 2, 3, 4, 5], [2, 3, 4, 5, 6]
])
utils.orthogonal(a)
def test_case_2(self): def test_case_2(self):
a = np.random.rand(100, 100) pass
utils.mst(a)

View File

@ -1,16 +1,32 @@
import sys # encoding: utf-8
import os
sys.path.append("..") from fastNLP.core.preprocess import save_pickle
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.fastnlp import FastNLP from fastNLP.fastnlp import FastNLP
from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.saver.model_saver import ModelSaver
PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/" PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/" PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/"
PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES = "/home/zyfeng/data/text_classify/" PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES = "/home/zyfeng/data/text_classify/"
def word_seg(): DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0
nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES) DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1
nlp.load("cws_basic_model", config_file="cws.cfg", section_name="POS_test") DEFAULT_RESERVED_LABEL = ['<reserved-2>',
'<reserved-3>',
'<reserved-4>'] # dict index = 2~4
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
DEFAULT_RESERVED_LABEL[2]: 4}
def word_seg(model_dir, config, section):
nlp = FastNLP(model_dir=model_dir)
nlp.load("cws_basic_model", config_file=config, section_name=section)
text = ["这是最好的基于深度学习的中文分词系统。", text = ["这是最好的基于深度学习的中文分词系统。",
"大王叫我来巡山。", "大王叫我来巡山。",
"我党多年来致力于改善人民生活水平。"] "我党多年来致力于改善人民生活水平。"]
@ -24,13 +40,150 @@ def word_seg():
print(interpret_word_seg_results(words, labels)) print(interpret_word_seg_results(words, labels))
def text_class(): def mock_cws():
nlp = FastNLP("./data_for_tests/") os.makedirs("mock", exist_ok=True)
nlp.load("text_class_model") text = ["这是最好的基于深度学习的中文分词系统。",
text = "这是最好的基于深度学习的中文分词系统。" "大王叫我来巡山。",
result = nlp.run(text) "我党多年来致力于改善人民生活水平。"]
print(result)
print("FastNLP finished!") word2id = Vocabulary()
word_list = [ch for ch in "".join(text)]
word2id.update(word_list)
save_pickle(word2id, "./mock/", "word2id.pkl")
class2id = Vocabulary(need_default=False)
label_list = ['B', 'M', 'E', 'S']
class2id.update(label_list)
save_pickle(class2id, "./mock/", "class2id.pkl")
model_args = {"vocab_size": len(word2id), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(class2id)}
config_file = """
[test_section]
vocab_size = {}
word_emb_dim = 50
rnn_hidden_units = 50
num_classes = {}
""".format(len(word2id), len(class2id))
with open("mock/test.cfg", "w", encoding="utf-8") as f:
f.write(config_file)
model = AdvSeqLabel(model_args)
ModelSaver("mock/cws_basic_model_v_0.pkl").save_pytorch(model)
def test_word_seg():
# fake the model and pickles
print("start mocking")
mock_cws()
# run the inference codes
print("start testing")
word_seg("./mock/", "test.cfg", "test_section")
# clean up environments
print("clean up")
os.system("rm -rf mock")
def pos_tag(model_dir, config, section):
nlp = FastNLP(model_dir=model_dir)
nlp.load("pos_tag_model", config_file=config, section_name=section)
text = ["这是最好的基于深度学习的中文分词系统。",
"大王叫我来巡山。",
"我党多年来致力于改善人民生活水平。"]
results = nlp.run(text)
for example in results:
words, labels = [], []
for res in example:
words.append(res[0])
labels.append(res[1])
try:
print(interpret_cws_pos_results(words, labels))
except RuntimeError:
print("inconsistent pos tags. this is for test only.")
def mock_pos_tag():
os.makedirs("mock", exist_ok=True)
text = ["这是最好的基于深度学习的中文分词系统。",
"大王叫我来巡山。",
"我党多年来致力于改善人民生活水平。"]
vocab = Vocabulary()
word_list = [ch for ch in "".join(text)]
vocab.update(word_list)
save_pickle(vocab, "./mock/", "word2id.pkl")
idx2label = Vocabulary(need_default=False)
label_list = ['B-n', 'M-v', 'E-nv', 'S-adj', 'B-v', 'M-vn', 'S-adv']
idx2label.update(label_list)
save_pickle(idx2label, "./mock/", "class2id.pkl")
model_args = {"vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(idx2label)}
config_file = """
[test_section]
vocab_size = {}
word_emb_dim = 50
rnn_hidden_units = 50
num_classes = {}
""".format(len(vocab), len(idx2label))
with open("mock/test.cfg", "w", encoding="utf-8") as f:
f.write(config_file)
model = AdvSeqLabel(model_args)
ModelSaver("mock/pos_tag_model_v_0.pkl").save_pytorch(model)
def test_pos_tag():
mock_pos_tag()
pos_tag("./mock/", "test.cfg", "test_section")
os.system("rm -rf mock")
def text_classify(model_dir, config, section):
nlp = FastNLP(model_dir=model_dir)
nlp.load("text_classify_model", config_file=config, section_name=section)
text = [
"世界物联网大会明日在京召开龙头股启动在即",
"乌鲁木齐市新增一处城市中心旅游目的地",
"朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"]
results = nlp.run(text)
print(results)
def mock_text_classify():
os.makedirs("mock", exist_ok=True)
text = ["世界物联网大会明日在京召开龙头股启动在即",
"乌鲁木齐市新增一处城市中心旅游目的地",
"朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"
]
vocab = Vocabulary()
word_list = [ch for ch in "".join(text)]
vocab.update(word_list)
save_pickle(vocab, "./mock/", "word2id.pkl")
idx2label = Vocabulary(need_default=False)
label_list = ['class_A', 'class_B', 'class_C', 'class_D', 'class_E', 'class_F']
idx2label.update(label_list)
save_pickle(idx2label, "./mock/", "class2id.pkl")
model_args = {"vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(idx2label)}
config_file = """
[test_section]
vocab_size = {}
word_emb_dim = 50
rnn_hidden_units = 50
num_classes = {}
""".format(len(vocab), len(idx2label))
with open("mock/test.cfg", "w", encoding="utf-8") as f:
f.write(config_file)
model = CNNText(model_args)
ModelSaver("mock/text_class_model_v0.pkl").save_pytorch(model)
def test_text_classify():
mock_text_classify()
text_classify("./mock/", "test.cfg", "test_section")
os.system("rm -rf mock")
def test_word_seg_interpret(): def test_word_seg_interpret():
@ -52,34 +205,9 @@ def test_interpret_cws_pos_results():
labels = [x[1] for x in foo[0]] labels = [x[1] for x in foo[0]]
print(interpret_cws_pos_results(chars, labels)) print(interpret_cws_pos_results(chars, labels))
def pos_tag():
nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES)
nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
text = ["这是最好的基于深度学习的中文分词系统。",
"大王叫我来巡山。",
"我党多年来致力于改善人民生活水平。"]
results = nlp.run(text)
for example in results:
words, labels = [], []
for res in example:
words.append(res[0])
labels.append(res[1])
print(interpret_cws_pos_results(words, labels))
def text_classify():
nlp = FastNLP(model_dir=PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES)
nlp.load("text_classify_model", config_file="text_classify.cfg", section_name="model")
text = [
"世界物联网大会明日在京召开龙头股启动在即",
"乌鲁木齐市新增一处城市中心旅游目的地",
"朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"]
results = nlp.run(text)
print(results)
"""
['finance', 'travel', 'history']
"""
if __name__ == "__main__": if __name__ == "__main__":
text_classify() test_word_seg()
test_pos_tag()
test_text_classify()
test_word_seg_interpret()
test_interpret_cws_pos_results()