* delete readme_example.py because it is oooooooout of date.

* rename preprocess.py into utils.py, because nothing about preprocess in it
* anything in loader/ and saver/ is moved directly into io/
* corresponding unit tests are moved to /test/io
* delete fastnlp.py, because we have new and better APIs
* rename Biaffine_parser/run_test.py to Biaffine_parser/main.py; Otherwise, test will fail.
* A looooooooooot of ancient codes to be refined...........
This commit is contained in:
FengZiYjun 2018-11-18 19:30:53 +08:00 committed by yunfan
parent b6a0d33cb1
commit e9d7074ba1
41 changed files with 116 additions and 833 deletions

View File

@ -1,75 +0,0 @@
from fastNLP.core.loss import Loss
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.predictor import ClassificationInfer
from fastNLP.core.preprocess import ClassPreprocess
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.dataset_loader import ClassDataSetLoader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules import aggregator
from fastNLP.modules import decoder
from fastNLP.modules import encoder
class ClassificationModel(BaseModel):
"""
Simple text classification model based on CNN.
"""
def __init__(self, num_classes, vocab_size):
super(ClassificationModel, self).__init__()
self.emb = encoder.Embedding(nums=vocab_size, dims=300)
self.enc = encoder.Conv(
in_channels=300, out_channels=100, kernel_size=3)
self.agg = aggregator.MaxPool()
self.dec = decoder.MLP(size_layer=[100, num_classes])
def forward(self, x):
x = self.emb(x) # [N,L] -> [N,L,C]
x = self.enc(x) # [N,L,C_in] -> [N,L,C_out]
x = self.agg(x) # [N,L,C] -> [N,C]
x = self.dec(x) # [N,C] -> [N, N_class]
return x
data_dir = 'save/' # directory to save data and model
train_path = './data_for_tests/text_classify.txt' # training set file
# load dataset
ds_loader = ClassDataSetLoader()
data = ds_loader.load()
# pre-process dataset
pre = ClassPreprocess()
train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir)
n_classes, vocab_size = pre.num_classes, pre.vocab_size
# construct model
model_args = {
'num_classes': n_classes,
'vocab_size': vocab_size
}
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
# construct trainer
train_args = {
"epochs": 3,
"batch_size": 16,
"pickle_path": data_dir,
"validate": False,
"save_best_dev": False,
"model_saved_path": None,
"use_cuda": True,
"loss": Loss("cross_entropy"),
"optimizer": Optimizer("Adam", lr=0.001)
}
trainer = ClassificationTrainer(**train_args)
# start training
trainer.train(model, train_data=train_set, dev_data=dev_set)
# predict using model
data_infer = [x[0] for x in data]
infer = ClassificationInfer(data_dir)
labels_pred = infer.predict(model.cpu(), data_infer)
print(labels_pred)

View File

@ -1,5 +1,7 @@
import torch
import warnings
import torch
warnings.filterwarnings('ignore')
import os
@ -17,7 +19,6 @@ from fastNLP.api.pipeline import Pipeline
from fastNLP.core.metrics import SeqLabelEvaluator2
from fastNLP.core.tester import Tester
model_urls = {
}
@ -228,7 +229,7 @@ class Parser(API):
elif p.field_name == 'pos_list':
p.field_name = 'gold_pos'
pp(ds)
head_cor, label_cor, total = 0,0,0
head_cor, label_cor, total = 0, 0, 0
for ins in ds:
head_gold = ins['gold_heads']
head_pred = ins['heads']
@ -236,7 +237,7 @@ class Parser(API):
total += length
for i in range(length):
head_cor += 1 if head_pred[i] == head_gold[i] else 0
uas = head_cor/total
uas = head_cor / total
print('uas:{:.2f}'.format(uas))
for p in pp:
@ -247,25 +248,34 @@ class Parser(API):
return uas
if __name__ == "__main__":
# pos_model_path = '../../reproduction/pos_tag_model/pos_crf.pkl'
pos = POS(device='cpu')
s = ['编者按7月12日英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。' ,
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
print(pos.test('../../reproduction/chinese_word_segment/new-clean.txt.conll'))
print(pos.predict(s))
# cws_model_path = '../../reproduction/chinese_word_segment/models/cws_crf.pkl'
cws = CWS(device='cuda:0')
s = ['本品是一个抗酸抗胆汁的胃黏膜保护剂' ,
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
if __name__ == "__main__":
# 以下路径在102
"""
pos_model_path = '/home/hyan/fastNLP_models/upload-demo/upload/pos_crf-5e26d3b0.pkl'
pos = POS(model_path=pos_model_path, device='cpu')
s = ['编者按7月12日英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
print(cws.test('../../reproduction/chinese_word_segment/new-clean.txt.conll'))
#print(pos.test('../../reproduction/chinese_word_segment/new-clean.txt.conll'))
print(pos.predict(s))
"""
"""
cws_model_path = '/home/hyan/fastNLP_models/upload-demo/upload/cws_crf-5a8a3e66.pkl'
cws = CWS(model_path=cws_model_path, device='cuda:0')
s = ['本品是一个抗酸抗胆汁的胃黏膜保护剂',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
#print(cws.test('../../reproduction/chinese_word_segment/new-clean.txt.conll'))
cws.predict(s)
parser = Parser(device='cuda:0')
print(parser.test('../../reproduction/Biaffine_parser/test.conll'))
"""
parser_model_path = "/home/hyan/fastNLP_models/upload-demo/upload/parser-d57cd5fc.pkl"
parser = Parser(model_path=parser_model_path, device='cuda:0')
# print(parser.test('../../reproduction/Biaffine_parser/test.conll'))
s = ['编者按7月12日英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
print(parser.predict(s))

View File

@ -1,5 +1,4 @@
import torch
import numpy as np
class Field(object):
@ -30,6 +29,7 @@ class Field(object):
def __repr__(self):
return self.content.__repr__()
class TextField(Field):
def __init__(self, text, is_target):
"""
@ -43,6 +43,7 @@ class LabelField(Field):
"""The Field representing a single label. Can be a string or integer.
"""
def __init__(self, label, is_target=True):
super(LabelField, self).__init__(label, is_target)

View File

@ -1,6 +1,6 @@
import torch
import numpy as np
class FieldArray(object):
def __init__(self, name, content, padding_val=0, is_target=False, need_tensor=False):
self.name = name
@ -10,7 +10,7 @@ class FieldArray(object):
self.need_tensor = need_tensor
def __repr__(self):
#TODO
# TODO
return '{}: {}'.format(self.name, self.content.__repr__())
def append(self, val):

View File

@ -50,20 +50,6 @@ class Predictor(object):
return y
class SeqLabelInfer(Predictor):
def __init__(self, pickle_path):
print(
"[FastNLP Warning] SeqLabelInfer will be deprecated. Please use Predictor directly.")
super(SeqLabelInfer, self).__init__()
class ClassificationInfer(Predictor):
def __init__(self, pickle_path):
print(
"[FastNLP Warning] ClassificationInfer will be deprecated. Please use Predictor directly.")
super(ClassificationInfer, self).__init__()
def seq_label_post_processor(batch_outputs, label_vocab):
results = []
for batch in batch_outputs:

View File

@ -1,6 +1,8 @@
from itertools import chain
import numpy as np
import torch
from itertools import chain
def convert_to_torch_tensor(data_list, use_cuda):
"""Convert lists into (cuda) Tensors.
@ -43,6 +45,7 @@ class RandomSampler(BaseSampler):
def __call__(self, data_set):
return list(np.random.permutation(len(data_set)))
class BucketSampler(BaseSampler):
def __init__(self, num_buckets=10, batch_size=32, seq_lens_field_name='seq_lens'):
@ -56,14 +59,14 @@ class BucketSampler(BaseSampler):
total_sample_num = len(seq_lens)
bucket_indexes = []
num_sample_per_bucket = total_sample_num//self.num_buckets
num_sample_per_bucket = total_sample_num // self.num_buckets
for i in range(self.num_buckets):
bucket_indexes.append([num_sample_per_bucket*i, num_sample_per_bucket*(i+1)])
bucket_indexes.append([num_sample_per_bucket * i, num_sample_per_bucket * (i + 1)])
bucket_indexes[-1][1] = total_sample_num
sorted_seq_lens = list(sorted([(idx, seq_len) for
idx, seq_len in zip(range(total_sample_num), seq_lens)],
key=lambda x:x[1]))
key=lambda x: x[1]))
batchs = []
@ -73,19 +76,18 @@ class BucketSampler(BaseSampler):
end_idx = bucket_indexes[b_idx][1]
sorted_bucket_seq_lens = sorted_seq_lens[start_idx:end_idx]
left_init_indexes.extend([tup[0] for tup in sorted_bucket_seq_lens])
num_batch_per_bucket = len(left_init_indexes)//self.batch_size
num_batch_per_bucket = len(left_init_indexes) // self.batch_size
np.random.shuffle(left_init_indexes)
for i in range(num_batch_per_bucket):
batchs.append(left_init_indexes[i*self.batch_size:(i+1)*self.batch_size])
left_init_indexes = left_init_indexes[num_batch_per_bucket*self.batch_size:]
if (left_init_indexes)!=0:
batchs.append(left_init_indexes[i * self.batch_size:(i + 1) * self.batch_size])
left_init_indexes = left_init_indexes[num_batch_per_bucket * self.batch_size:]
if (left_init_indexes) != 0:
batchs.append(left_init_indexes)
np.random.shuffle(batchs)
return list(chain(*batchs))
def simple_sort_bucketing(lengths):
"""
@ -105,6 +107,7 @@ def simple_sort_bucketing(lengths):
# TODO: need to return buckets
return [idx for idx, _ in sorted_lengths]
def k_means_1d(x, k, max_iter=100):
"""Perform k-means on 1-D data.
@ -159,4 +162,3 @@ def k_means_bucketing(lengths, buckets):
if buckets[bucket_id] is None or lengths[idx] <= buckets[bucket_id]:
bucket_data[bucket_id].append(idx)
return bucket_data

View File

@ -1,10 +1,11 @@
import torch
from collections import defaultdict
import torch
from fastNLP.core.batch import Batch
from fastNLP.core.metrics import Evaluator
from fastNLP.core.sampler import RandomSampler
from fastNLP.saver.logger import create_logger
from fastNLP.io.logger import create_logger
logger = create_logger(__name__, "./train_test.log")
@ -119,24 +120,3 @@ class Tester(object):
"""
return ", ".join([str(key) + "=" + str(value) for key, value in results.items()])
class SeqLabelTester(Tester):
def __init__(self, **test_args):
print(
"[FastNLP Warning] SeqLabelTester will be deprecated. Please use Tester directly.")
super(SeqLabelTester, self).__init__(**test_args)
class ClassificationTester(Tester):
def __init__(self, **test_args):
print(
"[FastNLP Warning] ClassificationTester will be deprecated. Please use Tester directly.")
super(ClassificationTester, self).__init__(**test_args)
class SNLITester(Tester):
def __init__(self, **test_args):
print(
"[FastNLP Warning] SNLITester will be deprecated. Please use Tester directly.")
super(SNLITester, self).__init__(**test_args)

View File

@ -9,11 +9,10 @@ from fastNLP.core.batch import Batch
from fastNLP.core.loss import Loss
from fastNLP.core.metrics import Evaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.sampler import BucketSampler
from fastNLP.core.tester import SeqLabelTester, ClassificationTester, SNLITester
from fastNLP.core.sampler import RandomSampler
from fastNLP.core.tester import Tester
from fastNLP.saver.logger import create_logger
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.io.logger import create_logger
from fastNLP.io.model_saver import ModelSaver
logger = create_logger(__name__, "./train_test.log")
logger.disabled = True
@ -182,19 +181,10 @@ class Trainer(object):
self._summary_writer.add_scalar("loss", loss.item(), global_step=self.step)
for name, param in self._model.named_parameters():
if param.requires_grad:
<<<<<<< HEAD
# self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=step)
# self._summary_writer.add_scalar(name + "_std", param.std(), global_step=step)
# self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=step)
pass
if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
=======
self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=self.step)
# self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step)
# self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step)
if kwargs["n_print"] > 0 and self.step % kwargs["n_print"] == 0:
>>>>>>> 5924fe0... fix and update tester, trainer, seq_model, add parser pipeline builder
end = time.time()
diff = timedelta(seconds=round(end - kwargs["start"]))
print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format(
@ -339,40 +329,3 @@ class Trainer(object):
def set_validator(self, validor):
self.validator = validor
class SeqLabelTrainer(Trainer):
"""Trainer for Sequence Labeling
"""
def __init__(self, **kwargs):
print(
"[FastNLP Warning] SeqLabelTrainer will be deprecated. Please use Trainer directly.")
super(SeqLabelTrainer, self).__init__(**kwargs)
def _create_validator(self, valid_args):
return SeqLabelTester(**valid_args)
class ClassificationTrainer(Trainer):
"""Trainer for text classification."""
def __init__(self, **train_args):
print(
"[FastNLP Warning] ClassificationTrainer will be deprecated. Please use Trainer directly.")
super(ClassificationTrainer, self).__init__(**train_args)
def _create_validator(self, valid_args):
return ClassificationTester(**valid_args)
class SNLITrainer(Trainer):
"""Trainer for text SNLI."""
def __init__(self, **train_args):
print(
"[FastNLP Warning] SNLITrainer will be deprecated. Please use Trainer directly.")
super(SNLITrainer, self).__init__(**train_args)
def _create_validator(self, valid_args):
return SNLITester(**valid_args)

View File

@ -2,8 +2,6 @@ import _pickle
import os
# the first vocab in dict with the index = 5
def save_pickle(obj, pickle_path, file_name):
"""Save an object into a pickle file.

View File

@ -13,7 +13,7 @@ DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
def isiterable(p_object):
try:
it = iter(p_object)
_ = iter(p_object)
except TypeError:
return False
return True

View File

@ -1,343 +0,0 @@
import os
from fastNLP.core.dataset import DataSet
from fastNLP.loader.dataset_loader import convert_seq_dataset
from fastNLP.core.predictor import SeqLabelInfer, ClassificationInfer
from fastNLP.core.preprocess import load_pickle
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.model_loader import ModelLoader
"""
mapping from model name to [URL, file_name.class_name, model_pickle_name]
Notice that the class of the model should be in "models" directory.
Example:
"seq_label_model": {
"url": "www.fudan.edu.cn",
"class": "sequence_modeling.SeqLabeling", # file_name.class_name in models/
"pickle": "seq_label_model.pkl",
"type": "seq_label",
"config_file_name": "config", # the name of the config file which stores model initialization parameters
"config_section_name": "text_class_model" # the name of the section in the config file which stores model init params
},
"text_class_model": {
"url": "www.fudan.edu.cn",
"class": "cnn_text_classification.CNNText",
"pickle": "text_class_model.pkl",
"type": "text_class"
}
"""
FastNLP_MODEL_COLLECTION = {
"cws_basic_model": {
"url": "",
"class": "sequence_modeling.AdvSeqLabel",
"pickle": "cws_basic_model_v_0.pkl",
"type": "seq_label",
"config_file_name": "cws.cfg",
"config_section_name": "text_class_model"
},
"pos_tag_model": {
"url": "",
"class": "sequence_modeling.AdvSeqLabel",
"pickle": "pos_tag_model_v_0.pkl",
"type": "seq_label",
"config_file_name": "pos_tag.cfg",
"config_section_name": "pos_tag_model"
},
"text_classify_model": {
"url": "",
"class": "cnn_text_classification.CNNText",
"pickle": "text_class_model_v0.pkl",
"type": "text_class",
"config_file_name": "text_classify.cfg",
"config_section_name": "model"
}
}
class FastNLP(object):
"""
High-level interface for direct model inference.
Example Usage
::
fastnlp = FastNLP()
fastnlp.load("zh_pos_tag_model")
text = "这是最好的基于深度学习的中文分词系统。"
result = fastnlp.run(text)
print(result) # ["这", "是", "最好", "的", "基于", "深度学习", "的", "中文", "分词", "系统", "。"]
"""
def __init__(self, model_dir="./"):
"""
:param model_dir: this directory should contain the following files:
1. a trained model
2. a config file, which is a fastNLP's configuration.
3. two Vocab files, which are pickle objects of Vocab instances, representing feature and label vocabs.
"""
self.model_dir = model_dir
self.model = None
self.infer_type = None # "seq_label"/"text_class"
self.word_vocab = None
self.label_vocab = None
def load(self, model_name, config_file="config", section_name="model"):
"""
Load a pre-trained FastNLP model together with additional data.
:param model_name: str, the name of a FastNLP model.
:param config_file: str, the name of the config file which stores the initialization information of the model.
(default: "config")
:param section_name: str, the name of the corresponding section in the config file. (default: model)
"""
assert type(model_name) is str
if model_name not in FastNLP_MODEL_COLLECTION:
raise ValueError("No FastNLP model named {}.".format(model_name))
if not self.model_exist(model_dir=self.model_dir):
self._download(model_name, FastNLP_MODEL_COLLECTION[model_name]["url"])
model_class = self._get_model_class(FastNLP_MODEL_COLLECTION[model_name]["class"])
print("Restore model class {}".format(str(model_class)))
model_args = ConfigSection()
ConfigLoader.load_config(os.path.join(self.model_dir, config_file), {section_name: model_args})
print("Restore model hyper-parameters {}".format(str(model_args.data)))
# fetch dictionary size and number of labels from pickle files
self.word_vocab = load_pickle(self.model_dir, "word2id.pkl")
model_args["vocab_size"] = len(self.word_vocab)
self.label_vocab = load_pickle(self.model_dir, "label2id.pkl")
model_args["num_classes"] = len(self.label_vocab)
# Construct the model
model = model_class(model_args)
print("Model constructed.")
# To do: framework independent
ModelLoader.load_pytorch(model, os.path.join(self.model_dir, FastNLP_MODEL_COLLECTION[model_name]["pickle"]))
print("Model weights loaded.")
self.model = model
self.infer_type = FastNLP_MODEL_COLLECTION[model_name]["type"]
print("Inference ready.")
def run(self, raw_input):
"""
Perform inference over given input using the loaded model.
:param raw_input: list of string. Each list is an input query.
:return results:
"""
infer = self._create_inference(self.model_dir)
# tokenize: list of string ---> 2-D list of string
infer_input = self.tokenize(raw_input, language="zh")
# create DataSet: 2-D list of strings ----> DataSet
infer_data = self._create_data_set(infer_input)
# DataSet ---> 2-D list of tags
results = infer.predict(self.model, infer_data)
# 2-D list of tags ---> list of final answers
outputs = self._make_output(results, infer_input)
return outputs
@staticmethod
def _get_model_class(file_class_name):
"""
Feature the class specified by <file_class_name>
:param file_class_name: str, contains the name of the Python module followed by the name of the class.
Example: "sequence_modeling.SeqLabeling"
:return module: the model class
"""
import_prefix = "fastNLP.models."
parts = (import_prefix + file_class_name).split(".")
from_module = ".".join(parts[:-1])
module = __import__(from_module)
for sub in parts[1:]:
module = getattr(module, sub)
return module
def _create_inference(self, model_dir):
"""Specify which task to perform.
:param model_dir:
:return:
"""
if self.infer_type == "seq_label":
return SeqLabelInfer(model_dir)
elif self.infer_type == "text_class":
return ClassificationInfer(model_dir)
else:
raise ValueError("fail to create inference instance")
def _create_data_set(self, infer_input):
"""Create a DataSet object given the raw inputs.
:param infer_input: 2-D lists of strings
:return data_set: a DataSet object
"""
if self.infer_type in ["seq_label", "text_class"]:
data_set = convert_seq_dataset(infer_input)
data_set.index_field("word_seq", self.word_vocab)
if self.infer_type == "seq_label":
data_set.set_origin_len("word_seq")
return data_set
else:
raise RuntimeError("fail to make outputs with infer type {}".format(self.infer_type))
def _load(self, model_dir, model_name):
return 0
def _download(self, model_name, url):
"""
Download the model weights from <url> and save in <self.model_dir>.
:param model_name:
:param url:
"""
print("Downloading {} from {}".format(model_name, url))
# TODO: download model via url
def model_exist(self, model_dir):
"""
Check whether the desired model is already in the directory.
:param model_dir:
"""
return True
def tokenize(self, text, language):
"""Extract tokens from strings.
For English, extract words separated by space.
For Chinese, extract characters.
TODO: more complex tokenization methods
:param text: list of string
:param language: str, one of ('zh', 'en'), Chinese or English.
:return data: list of list of string, each string is a token.
"""
assert language in ("zh", "en")
data = []
for sent in text:
if language == "en":
tokens = sent.strip().split()
elif language == "zh":
tokens = [char for char in sent]
else:
raise RuntimeError("Unknown language {}".format(language))
data.append(tokens)
return data
def _make_output(self, results, infer_input):
"""Transform the infer output into user-friendly output.
:param results: 1 or 2-D list of strings.
If self.infer_type == "seq_label", it is of shape [num_examples, tag_seq_length]
If self.infer_type == "text_class", it is of shape [num_examples]
:param infer_input: 2-D list of string, the input query before inference.
:return outputs: list. Each entry is a prediction.
"""
if self.infer_type == "seq_label":
outputs = make_seq_label_output(results, infer_input)
elif self.infer_type == "text_class":
outputs = make_class_output(results, infer_input)
else:
raise RuntimeError("fail to make outputs with infer type {}".format(self.infer_type))
return outputs
def make_seq_label_output(result, infer_input):
"""Transform model output into user-friendly contents.
:param result: 2-D list of strings. (model output)
:param infer_input: 2-D list of string (model input)
:return ret: list of list of tuples
[
[(word_11, label_11), (word_12, label_12), ...],
[(word_21, label_21), (word_22, label_22), ...],
...
]
"""
ret = []
for example_x, example_y in zip(infer_input, result):
ret.append([(x, y) for x, y in zip(example_x, example_y)])
return ret
def make_class_output(result, infer_input):
"""Transform model output into user-friendly contents.
:param result: 2-D list of strings. (model output)
:param infer_input: 1-D list of string (model input)
:return ret: the same as result, [label_1, label_2, ...]
"""
return result
def interpret_word_seg_results(char_seq, label_seq):
"""Transform model output into user-friendly contents.
Example: In CWS, convert <BMES> labeling into segmented text.
:param char_seq: list of string,
:param label_seq: list of string, the same length as char_seq
Each entry is one of ('B', 'M', 'E', 'S').
:return output: list of words
"""
words = []
word = ""
for char, label in zip(char_seq, label_seq):
if label[0] == "B":
if word != "":
words.append(word)
word = char
elif label[0] == "M":
word += char
elif label[0] == "E":
word += char
words.append(word)
word = ""
elif label[0] == "S":
if word != "":
words.append(word)
word = ""
words.append(char)
else:
raise ValueError("invalid label {}".format(label[0]))
return words
def interpret_cws_pos_results(char_seq, label_seq):
"""Transform model output into user-friendly contents.
:param char_seq: list of string
:param label_seq: list of string, the same length as char_seq.
:return outputs: list of tuple (words, pos_tag):
"""
def pos_tag_check(seq):
"""check whether all entries are the same """
return len(set(seq)) <= 1
word = []
word_pos = []
outputs = []
for char, label in zip(char_seq, label_seq):
tmp = label.split("-")
cws_label, pos_tag = tmp[0], tmp[1]
if cws_label == "B" or cws_label == "M":
word.append(char)
word_pos.append(pos_tag)
elif cws_label == "E":
word.append(char)
word_pos.append(pos_tag)
if not pos_tag_check(word_pos):
raise RuntimeError("character-wise pos tags inconsistent. ")
outputs.append(("".join(word), word_pos[0]))
word.clear()
word_pos.clear()
elif cws_label == "S":
outputs.append((char, pos_tag))
return outputs

View File

@ -2,7 +2,7 @@ import configparser
import json
import os
from fastNLP.loader.base_loader import BaseLoader
from fastNLP.io.base_loader import BaseLoader
class ConfigLoader(BaseLoader):

View File

@ -1,7 +1,7 @@
import os
from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
from fastNLP.saver.logger import create_logger
from fastNLP.io.config_loader import ConfigSection, ConfigLoader
from fastNLP.io.logger import create_logger
class ConfigSaver(object):

View File

@ -3,7 +3,7 @@ import os
from fastNLP.core.dataset import DataSet
from fastNLP.core.field import *
from fastNLP.core.instance import Instance
from fastNLP.loader.base_loader import BaseLoader
from fastNLP.io.base_loader import BaseLoader
def convert_seq_dataset(data):

View File

@ -1,10 +1,7 @@
import _pickle
import os
import torch
from fastNLP.loader.base_loader import BaseLoader
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.io.base_loader import BaseLoader
class EmbedLoader(BaseLoader):

View File

@ -1,6 +1,6 @@
import torch
from fastNLP.loader.base_loader import BaseLoader
from fastNLP.io.base_loader import BaseLoader
class ModelLoader(BaseLoader):
@ -19,10 +19,10 @@ class ModelLoader(BaseLoader):
:param model_path: str, the path to the saved model.
"""
empty_model.load_state_dict(torch.load(model_path))
@staticmethod
def load_pytorch(model_path):
def load_pytorch_model(model_path):
"""Load the entire model.
"""
return torch.load(model_path)
return torch.load(model_path)

View File

@ -1,13 +1,15 @@
import torch
class TimestepDropout(torch.nn.Dropout):
"""This module accepts a `[batch_size, num_timesteps, embedding_dim)]` and use a single
dropout mask of shape `(batch_size, embedding_dim)` to apply on every time step.
"""
def forward(self, x):
dropout_mask = x.new_ones(x.shape[0], x.shape[-1])
torch.nn.functional.dropout(dropout_mask, self.p, self.training, inplace=True)
dropout_mask = dropout_mask.unsqueeze(1) # [batch_size, 1, embedding_dim]
dropout_mask = dropout_mask.unsqueeze(1) # [batch_size, 1, embedding_dim]
if self.inplace:
x *= dropout_mask
return

View File

@ -1,13 +1,11 @@
import sys
import os
import sys
sys.path.extend(['/home/yfshao/workdir/dev_fastnlp'])
from fastNLP.api.processor import *
from fastNLP.api.pipeline import Pipeline
from fastNLP.core.dataset import DataSet
from fastNLP.models.biaffine_parser import BiaffineParser
from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
from fastNLP.io.config_loader import ConfigSection, ConfigLoader
import _pickle as pickle
import torch

View File

@ -1,11 +1,9 @@
import sys
import os
sys.path.extend(['/home/yfshao/workdir/dev_fastnlp'])
import torch
import argparse
import numpy as np
from reproduction.Biaffine_parser.util import ConllxDataLoader, add_seg_tag
from fastNLP.core.dataset import DataSet

View File

@ -3,8 +3,6 @@ import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from collections import defaultdict
import math
import torch
import re
@ -13,16 +11,13 @@ from fastNLP.core.metrics import Evaluator
from fastNLP.core.instance import Instance
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.dataset import DataSet
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import SequentialSampler
from fastNLP.core.field import TextField, SeqLabelField
from fastNLP.core.preprocess import load_pickle
from fastNLP.core.tester import Tester
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.loader.embed_loader import EmbedLoader
from fastNLP.io.config_loader import ConfigLoader, ConfigSection
from fastNLP.io.model_loader import ModelLoader
from fastNLP.io.embed_loader import EmbedLoader
from fastNLP.models.biaffine_parser import BiaffineParser
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.io.model_saver import ModelSaver
BOS = '<BOS>'
EOS = '<EOS>'

View File

@ -1,10 +1,10 @@
import torch.nn.functional as F
from fastNLP.core.preprocess import ClassPreprocess as Preprocess
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.config_loader import ConfigLoader
from fastNLP.loader.config_loader import ConfigSection
from fastNLP.loader.dataset_loader import ClassDataSetLoader as Dataset_loader
from fastNLP.core.utils import ClassPreprocess as Preprocess
from fastNLP.io.config_loader import ConfigLoader
from fastNLP.io.config_loader import ConfigSection
from fastNLP.io.dataset_loader import ClassDataSetLoader as Dataset_loader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules.aggregator.self_attention import SelfAttention
from fastNLP.modules.decoder.MLP import MLP

View File

@ -1,8 +1,8 @@
from fastNLP.loader.dataset_loader import DataSetLoader
from fastNLP.core.instance import Instance
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.io.dataset_loader import DataSetLoader
def cut_long_sentence(sent, max_sample_length=200):

View File

@ -3,17 +3,16 @@ import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.io.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import BaseLoader, TokenizeDataSetLoader
from fastNLP.core.preprocess import load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.io.dataset_loader import BaseLoader, TokenizeDataSetLoader
from fastNLP.core.utils import load_pickle
from fastNLP.io.model_saver import ModelSaver
from fastNLP.io.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.dataset import DataSet
from fastNLP.core.preprocess import save_pickle
from fastNLP.core.utils import save_pickle
from fastNLP.core.metrics import SeqLabelEvaluator
# not in the file's dir

View File

@ -13,8 +13,8 @@ from fastNLP.core.instance import Instance
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.trainer import Trainer
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader
from fastNLP.io.config_loader import ConfigLoader, ConfigSection
from fastNLP.io.dataset_loader import PeopleDailyCorpusLoader
from fastNLP.models.sequence_modeling import AdvSeqLabel

View File

@ -1,6 +1,6 @@
import unittest
from fastNLP.loader.dataset_loader import convert_seq2seq_dataset, convert_seq_dataset
from fastNLP.io.dataset_loader import convert_seq2seq_dataset, convert_seq_dataset
class TestDataSet(unittest.TestCase):

View File

@ -1,12 +1,10 @@
import os
import unittest
from fastNLP.core.dataset import DataSet
from fastNLP.core.predictor import Predictor
from fastNLP.core.preprocess import save_pickle
from fastNLP.core.utils import save_pickle
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.base_loader import BaseLoader
from fastNLP.loader.dataset_loader import convert_seq_dataset
from fastNLP.io.dataset_loader import convert_seq_dataset
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.models.sequence_modeling import SeqLabeling

View File

@ -3,7 +3,7 @@ import json
import os
import unittest
from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
from fastNLP.io.config_loader import ConfigSection, ConfigLoader
class TestConfigLoader(unittest.TestCase):

View File

@ -1,8 +1,8 @@
import os
import unittest
from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
from fastNLP.saver.config_saver import ConfigSaver
from fastNLP.io.config_loader import ConfigSection, ConfigLoader
from fastNLP.io.config_saver import ConfigSaver
class TestConfigSaver(unittest.TestCase):

View File

@ -1,9 +1,9 @@
import os
import unittest
from fastNLP.loader.dataset_loader import POSDataSetLoader, LMDataSetLoader, TokenizeDataSetLoader, \
PeopleDailyCorpusLoader, ConllLoader
from fastNLP.core.dataset import DataSet
from fastNLP.io.dataset_loader import POSDataSetLoader, LMDataSetLoader, TokenizeDataSetLoader, \
PeopleDailyCorpusLoader, ConllLoader
class TestDatasetLoader(unittest.TestCase):
def test_case_1(self):

View File

@ -1,10 +1,8 @@
import unittest
import os
import unittest
import torch
from fastNLP.loader.embed_loader import EmbedLoader
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.io.embed_loader import EmbedLoader
class TestEmbedLoader(unittest.TestCase):

View File

@ -3,17 +3,17 @@ import sys
sys.path.append("..")
import argparse
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.dataset_loader import BaseLoader
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.io.config_loader import ConfigLoader, ConfigSection
from fastNLP.io.dataset_loader import BaseLoader
from fastNLP.io.model_saver import ModelSaver
from fastNLP.io.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.preprocess import save_pickle, load_pickle
from fastNLP.core.utils import save_pickle, load_pickle
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--save", type=str, default="./seq_label/", help="path to save pickle files")

View File

@ -1,17 +1,16 @@
import os
from fastNLP.core.dataset import DataSet
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.preprocess import save_pickle, load_pickle
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader, BaseLoader, RawDataSetLoader
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.utils import save_pickle, load_pickle
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.io.config_loader import ConfigLoader, ConfigSection
from fastNLP.io.dataset_loader import TokenizeDataSetLoader, RawDataSetLoader
from fastNLP.io.model_loader import ModelLoader
from fastNLP.io.model_saver import ModelSaver
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.saver.model_saver import ModelSaver
data_name = "pku_training.utf8"
cws_data_path = "./test/data_for_tests/cws_pku_utf_8"

View File

@ -2,15 +2,15 @@ import os
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.preprocess import save_pickle
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.core.utils import save_pickle
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.io.config_loader import ConfigLoader, ConfigSection
from fastNLP.io.dataset_loader import TokenizeDataSetLoader
from fastNLP.io.model_loader import ModelLoader
from fastNLP.io.model_saver import ModelSaver
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.saver.model_saver import ModelSaver
pickle_path = "./seq_label/"
model_name = "seq_label_model.pkl"

View File

@ -8,15 +8,15 @@ import sys
sys.path.append("..")
from fastNLP.core.predictor import ClassificationInfer
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.dataset_loader import ClassDataSetLoader
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.io.config_loader import ConfigLoader, ConfigSection
from fastNLP.io.dataset_loader import ClassDataSetLoader
from fastNLP.io.model_loader import ModelLoader
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.io.model_saver import ModelSaver
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.loss import Loss
from fastNLP.core.dataset import TextClassifyDataSet
from fastNLP.core.preprocess import save_pickle, load_pickle
from fastNLP.core.utils import save_pickle, load_pickle
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--save", type=str, default="./test_classification/", help="path to save pickle files")

View File

@ -1,213 +0,0 @@
# encoding: utf-8
import os
from fastNLP.core.preprocess import save_pickle
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.fastnlp import FastNLP
from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.saver.model_saver import ModelSaver
PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/"
PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES = "/home/zyfeng/data/text_classify/"
DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>',
'<reserved-3>',
'<reserved-4>'] # dict index = 2~4
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
DEFAULT_RESERVED_LABEL[2]: 4}
def word_seg(model_dir, config, section):
nlp = FastNLP(model_dir=model_dir)
nlp.load("cws_basic_model", config_file=config, section_name=section)
text = ["这是最好的基于深度学习的中文分词系统。",
"大王叫我来巡山。",
"我党多年来致力于改善人民生活水平。"]
results = nlp.run(text)
print(results)
for example in results:
words, labels = [], []
for res in example:
words.append(res[0])
labels.append(res[1])
print(interpret_word_seg_results(words, labels))
def mock_cws():
os.makedirs("mock", exist_ok=True)
text = ["这是最好的基于深度学习的中文分词系统。",
"大王叫我来巡山。",
"我党多年来致力于改善人民生活水平。"]
word2id = Vocabulary()
word_list = [ch for ch in "".join(text)]
word2id.update(word_list)
save_pickle(word2id, "./mock/", "word2id.pkl")
class2id = Vocabulary(need_default=False)
label_list = ['B', 'M', 'E', 'S']
class2id.update(label_list)
save_pickle(class2id, "./mock/", "label2id.pkl")
model_args = {"vocab_size": len(word2id), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(class2id)}
config_file = """
[test_section]
vocab_size = {}
word_emb_dim = 50
rnn_hidden_units = 50
num_classes = {}
""".format(len(word2id), len(class2id))
with open("mock/test.cfg", "w", encoding="utf-8") as f:
f.write(config_file)
model = AdvSeqLabel(model_args)
ModelSaver("mock/cws_basic_model_v_0.pkl").save_pytorch(model)
def test_word_seg():
# fake the model and pickles
print("start mocking")
mock_cws()
# run the inference codes
print("start testing")
word_seg("./mock/", "test.cfg", "test_section")
# clean up environments
print("clean up")
os.system("rm -rf mock")
def pos_tag(model_dir, config, section):
nlp = FastNLP(model_dir=model_dir)
nlp.load("pos_tag_model", config_file=config, section_name=section)
text = ["这是最好的基于深度学习的中文分词系统。",
"大王叫我来巡山。",
"我党多年来致力于改善人民生活水平。"]
results = nlp.run(text)
for example in results:
words, labels = [], []
for res in example:
words.append(res[0])
labels.append(res[1])
try:
print(interpret_cws_pos_results(words, labels))
except RuntimeError:
print("inconsistent pos tags. this is for test only.")
def mock_pos_tag():
os.makedirs("mock", exist_ok=True)
text = ["这是最好的基于深度学习的中文分词系统。",
"大王叫我来巡山。",
"我党多年来致力于改善人民生活水平。"]
vocab = Vocabulary()
word_list = [ch for ch in "".join(text)]
vocab.update(word_list)
save_pickle(vocab, "./mock/", "word2id.pkl")
idx2label = Vocabulary(need_default=False)
label_list = ['B-n', 'M-v', 'E-nv', 'S-adj', 'B-v', 'M-vn', 'S-adv']
idx2label.update(label_list)
save_pickle(idx2label, "./mock/", "label2id.pkl")
model_args = {"vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(idx2label)}
config_file = """
[test_section]
vocab_size = {}
word_emb_dim = 50
rnn_hidden_units = 50
num_classes = {}
""".format(len(vocab), len(idx2label))
with open("mock/test.cfg", "w", encoding="utf-8") as f:
f.write(config_file)
model = AdvSeqLabel(model_args)
ModelSaver("mock/pos_tag_model_v_0.pkl").save_pytorch(model)
def test_pos_tag():
mock_pos_tag()
pos_tag("./mock/", "test.cfg", "test_section")
os.system("rm -rf mock")
def text_classify(model_dir, config, section):
nlp = FastNLP(model_dir=model_dir)
nlp.load("text_classify_model", config_file=config, section_name=section)
text = [
"世界物联网大会明日在京召开龙头股启动在即",
"乌鲁木齐市新增一处城市中心旅游目的地",
"朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"]
results = nlp.run(text)
print(results)
def mock_text_classify():
os.makedirs("mock", exist_ok=True)
text = ["世界物联网大会明日在京召开龙头股启动在即",
"乌鲁木齐市新增一处城市中心旅游目的地",
"朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"
]
vocab = Vocabulary()
word_list = [ch for ch in "".join(text)]
vocab.update(word_list)
save_pickle(vocab, "./mock/", "word2id.pkl")
idx2label = Vocabulary(need_default=False)
label_list = ['class_A', 'class_B', 'class_C', 'class_D', 'class_E', 'class_F']
idx2label.update(label_list)
save_pickle(idx2label, "./mock/", "label2id.pkl")
model_args = {"vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(idx2label)}
config_file = """
[test_section]
vocab_size = {}
word_emb_dim = 50
rnn_hidden_units = 50
num_classes = {}
""".format(len(vocab), len(idx2label))
with open("mock/test.cfg", "w", encoding="utf-8") as f:
f.write(config_file)
model = CNNText(model_args)
ModelSaver("mock/text_class_model_v0.pkl").save_pytorch(model)
def test_text_classify():
mock_text_classify()
text_classify("./mock/", "test.cfg", "test_section")
os.system("rm -rf mock")
def test_word_seg_interpret():
foo = [[('', 'S'), ('', 'S'), ('', 'S'), ('', 'S'), ('', 'S'), ('', 'B'), ('', 'E'), ('', 'B'), ('', 'E'),
('', 'B'), ('', 'E'), ('', 'S'), ('', 'B'), ('', 'E'), ('', 'B'), ('', 'E'), ('', 'B'), ('', 'E'),
('', 'S')]]
chars = [x[0] for x in foo[0]]
labels = [x[1] for x in foo[0]]
print(interpret_word_seg_results(chars, labels))
def test_interpret_cws_pos_results():
foo = [
[('', 'S-r'), ('', 'S-v'), ('', 'S-d'), ('', 'S-a'), ('', 'S-u'), ('', 'B-p'), ('', 'E-p'), ('', 'B-d'),
('', 'E-d'), ('', 'B-v'), ('', 'E-v'), ('', 'S-u'), ('', 'B-nz'), ('', 'E-nz'), ('', 'B-vn'),
('', 'E-vn'), ('', 'B-n'), ('', 'E-n'), ('', 'S-w')]
]
chars = [x[0] for x in foo[0]]
labels = [x[1] for x in foo[0]]
print(interpret_cws_pos_results(chars, labels))
if __name__ == "__main__":
test_word_seg()
test_pos_tag()
test_text_classify()
test_word_seg_interpret()
test_interpret_cws_pos_results()