Mirror of https://gitee.com/fastnlp/fastNLP.git (synced 2024-12-02 04:07:35 +08:00)
[fix] Drop the unused "data" argument from Tester.make_batch; correct the spelling of "show_metrics" (was "show_matrices").
[add] PeopleDailyCorpusLoader, to parse the People's Daily corpus (Chinese word segmentation, POS tags, NER).
[update] Add a CWS + POS-tag interface to FastNLP; see the example in test_fastNLP.py.
[update] Bring README.md and readme_example.py up to date.
This commit is contained in:
parent
0430067faf
commit
32a036e8e6
54 README.md
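The headline feature is the new CWS + POS-tag interface. A minimal sketch, distilled from test_pos_tag() in the test_fastNLP.py hunk below; the model_dir path is a placeholder:

```python
from fastNLP.fastnlp import FastNLP, interpret_cws_pos_results

# Placeholder path: point this at a directory holding the pos_tag pickles.
nlp = FastNLP(model_dir="/path/to/pos_tag_pickle_files/")
nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")

# nlp.run returns, per sentence, a list of (character, "cws-pos") pairs that
# interpret_cws_pos_results folds into (word, pos_tag) tuples.
for example in nlp.run(["这是最好的基于深度学习的中文分词系统。"]):
    chars = [res[0] for res in example]
    labels = [res[1] for res in example]
    print(interpret_cws_pos_results(chars, labels))
```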
@@ -27,15 +27,16 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fa
 
 A typical fastNLP routine is composed of four phases: loading dataset, pre-processing data, constructing model and training model.
 ```python
-from fastNLP.models.base_model import BaseModel
-from fastNLP.modules import encoder
-from fastNLP.modules import aggregation
-from fastNLP.modules import decoder
-
-from fastNLP.loader.dataset_loader import ClassDatasetLoader
-from fastNLP.loader.preprocess import ClassPreprocess
+from fastNLP.core.preprocess import ClassPreprocess
+from fastNLP.core.predictor import ClassificationInfer
 from fastNLP.core.trainer import ClassificationTrainer
-from fastNLP.core.inference import ClassificationInfer
+from fastNLP.loader.dataset_loader import ClassDatasetLoader
+from fastNLP.models.base_model import BaseModel
+from fastNLP.modules import aggregation
+from fastNLP.modules import encoder
+from fastNLP.modules import decoder
+from fastNLP.core.loss import Loss
+from fastNLP.core.optimizer import Optimizer
 
 
 class ClassificationModel(BaseModel):
@@ -50,7 +51,7 @@ class ClassificationModel(BaseModel):
         self.enc = encoder.Conv(
             in_channels=300, out_channels=100, kernel_size=3)
         self.agg = aggregation.MaxPool()
-        self.dec = decoder.MLP(100, num_classes=num_classes)
+        self.dec = decoder.MLP(size_layer=[100, num_classes])
 
     def forward(self, x):
         x = self.emb(x)  # [N,L] -> [N,L,C]
@@ -60,16 +61,17 @@ class ClassificationModel(BaseModel):
         return x
 
 
-data_dir = 'data'  # directory to save data and model
-train_path = 'test/data_for_tests/text_classify.txt'  # training set file
+data_dir = 'save/'  # directory to save data and model
+train_path = './data_for_tests/text_classify.txt'  # training set file
 
 # load dataset
 ds_loader = ClassDatasetLoader("train", train_path)
 data = ds_loader.load()
 
 # pre-process dataset
-pre = ClassPreprocess(data_dir)
-vocab_size, n_classes = pre.process(data, "data_train.pkl")
+pre = ClassPreprocess()
+train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir)
+n_classes, vocab_size = pre.num_classes, pre.vocab_size
 
 # construct model
 model_args = {
@@ -78,28 +80,36 @@ model_args = {
 }
 model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
 
-# train model
+# construct trainer
 train_args = {
-    "epochs": 20,
-    "batch_size": 50,
+    "epochs": 3,
+    "batch_size": 16,
     "pickle_path": data_dir,
     "validate": False,
     "save_best_dev": False,
     "model_saved_path": None,
     "use_cuda": True,
-    "learn_rate": 1e-3,
-    "momentum": 0.9}
-trainer = ClassificationTrainer(train_args)
-trainer.train(model)
+    "loss": Loss("cross_entropy"),
+    "optimizer": Optimizer("Adam", lr=0.001)
+}
+trainer = ClassificationTrainer(**train_args)
+
+# start training
+trainer.train(model, train_data=train_set, dev_data=dev_set)
 
 # predict using model
-seqs = [x[0] for x in data]
+data_infer = [x[0] for x in data]
 infer = ClassificationInfer(data_dir)
-labels_pred = infer.predict(model, seqs)
+labels_pred = infer.predict(model.cpu(), data_infer)
 print(labels_pred)
 ```
 
 
 ## Installation
 Run the following commands to install fastNLP package.
 ```shell
 pip install fastNLP
 ```
 
 ### Cloning From GitHub
@@ -86,7 +86,7 @@ class BaseTester(object):
         iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
         step = 0
 
-        for batch_x, batch_y in self.make_batch(iterator, dev_data):
+        for batch_x, batch_y in self.make_batch(iterator):
             with torch.no_grad():
                 prediction = self.data_forward(network, batch_x)
             eval_results = self.evaluate(prediction, batch_y)
@@ -123,14 +123,14 @@ class BaseTester(object):
         """Return a list of metrics. """
         raise NotImplementedError
 
-    def show_matrices(self):
+    def show_metrics(self):
         """This is called by Trainer to print evaluation results on dev set during training.
 
         :return print_str: str
         """
         raise NotImplementedError
 
-    def make_batch(self, iterator, data):
+    def make_batch(self, iterator):
         raise NotImplementedError
 
 
@@ -194,7 +194,7 @@ class SeqLabelTester(BaseTester):
         batch_accuracy = np.mean([x[1] for x in self.eval_history])
         return batch_loss, batch_accuracy
 
-    def show_matrices(self):
+    def show_metrics(self):
         """
         This is called by Trainer to print evaluation on dev set.
         :return print_str: str
@@ -202,7 +202,7 @@ class SeqLabelTester(BaseTester):
         loss, accuracy = self.metrics()
         return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy)
 
-    def make_batch(self, iterator, data):
+    def make_batch(self, iterator):
         return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True)
 
 
@@ -216,7 +216,7 @@ class ClassificationTester(BaseTester):
         """
         super(ClassificationTester, self).__init__(**test_args)
 
-    def make_batch(self, iterator, data, max_len=None):
+    def make_batch(self, iterator, max_len=None):
         return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len)
 
     def data_forward(self, network, x):
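The net effect on tester subclasses: make_batch now receives only the batch iterator, since the iterator already wraps the dev data. A hedged sketch of the new contract (MyTester is hypothetical; Action.make_batch is the helper used in the hunks above):

```python
class MyTester(BaseTester):
    def make_batch(self, iterator):
        # The iterator already wraps dev_data via Batchifier(RandomSampler(...)),
        # so the separate `data` argument became redundant and was dropped.
        return Action.make_batch(iterator, use_cuda=self.use_cuda)
```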
@@ -144,7 +144,7 @@ class BaseTrainer(object):
                 print("Saved better model selected by validation.")
                 logger.info("Saved better model selected by validation.")
 
-                valid_results = validator.show_matrices()
+                valid_results = validator.show_metrics()
                 print("[epoch {}] {}".format(epoch, valid_results))
                 logger.info("[epoch {}] {}".format(epoch, valid_results))
 
@@ -31,7 +31,16 @@ FastNLP_MODEL_COLLECTION = {
         "type": "seq_label",
         "config_file_name": "config",
         "config_section_name": "text_class_model"
     },
+    "pos_tag_model": {
+        "url": "",
+        "class": "sequence_modeling.AdvSeqLabel",
+        "pickle": "pos_tag_model_v_0.pkl",
+        "type": "seq_label",
+        "config_file_name": "pos_tag.config",
+        "config_section_name": "pos_tag_model"
+    }
 
 }
@@ -259,3 +268,38 @@ def interpret_word_seg_results(char_seq, label_seq):
     else:
         raise ValueError("invalid label {}".format(label[0]))
     return words
+
+
+def interpret_cws_pos_results(char_seq, label_seq):
+    """Transform model output into user-friendly contents.
+
+    :param char_seq: list of string
+    :param label_seq: list of string, the same length as char_seq.
+    :return outputs: list of tuple (words, pos_tag):
+    """
+
+    def pos_tag_check(seq):
+        """check whether all entries are the same """
+        return len(set(seq)) <= 1
+
+    word = []
+    word_pos = []
+    outputs = []
+    for char, label in zip(char_seq, label_seq):
+        tmp = label.split("-")
+        cws_label, pos_tag = tmp[0], tmp[1]
+
+        if cws_label == "B" or cws_label == "M":
+            word.append(char)
+            word_pos.append(pos_tag)
+        elif cws_label == "E":
+            word.append(char)
+            word_pos.append(pos_tag)
+            if not pos_tag_check(word_pos):
+                raise RuntimeError("character-wise pos tags inconsistent. ")
+            outputs.append(("".join(word), word_pos[0]))
+            word.clear()
+            word_pos.clear()
+        elif cws_label == "S":
+            outputs.append((char, pos_tag))
+    return outputs
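A small worked example of the new helper, assuming the "B/M/E/S-pos" label scheme shown in test_fastNLP.py below and that interpret_cws_pos_results is importable:

```python
chars = ['大', '王', '叫', '我']
labels = ['B-n', 'E-n', 'S-v', 'S-r']
# Characters tagged B..E are merged into one word; S marks a single-char word.
print(interpret_cws_pos_results(chars, labels))
# expected: [('大王', 'n'), ('叫', 'v'), ('我', 'r')]
```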
@@ -220,13 +220,57 @@ class LMDatasetLoader(DatasetLoader):
         return text.strip().split()
 
 
-if __name__ == "__main__":
-    """
-    data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines()
-    for example in data:
-        for w, l in zip(example[0], example[1]):
-            print(w, l)
-    """
-
-    ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku()
-    print(ans)
+class PeopleDailyCorpusLoader(DatasetLoader):
+    """
+    People Daily Corpus: Chinese word segmentation, POS tag, NER
+    """
+
+    def __init__(self, data_path):
+        super(PeopleDailyCorpusLoader, self).__init__("people_daily_corpus", data_path)
+
+    def load(self):
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            sents = f.readlines()
+
+        pos_tag_examples = []
+        ner_examples = []
+        for sent in sents:
+            inside_ne = False
+            sent_pos_tag = []
+            sent_words = []
+            sent_ner = []
+            words = sent.strip().split()[1:]
+            for word in words:
+                if "[" in word and "]" in word:
+                    ner_tag = "U"
+                    print(word)
+                elif "[" in word:
+                    inside_ne = True
+                    ner_tag = "B"
+                    word = word[1:]
+                elif "]" in word:
+                    ner_tag = "L"
+                    word = word[:word.index("]")]
+                    if inside_ne is True:
+                        inside_ne = False
+                    else:
+                        raise RuntimeError("only ] appears!")
+                else:
+                    if inside_ne is True:
+                        ner_tag = "I"
+                    else:
+                        ner_tag = "O"
+                tmp = word.split("/")
+                token, pos = tmp[0], tmp[1]
+                sent_ner.append(ner_tag)
+                sent_pos_tag.append(pos)
+                sent_words.append(token)
+            pos_tag_examples.append([sent_words, sent_pos_tag])
+            ner_examples.append([sent_words, sent_ner])
+        return pos_tag_examples, ner_examples
+
+
+if __name__ == "__main__":
+    loader = PeopleDailyCorpusLoader("/home/zyfeng/data/CWS_POS_TAG_NER_people_daily.txt")
+    pos, ner = loader.load()
+    print(pos[:10])
+    print(ner[:10])
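For reference, a sketch of the input PeopleDailyCorpusLoader expects: one sentence per line, a leading document id (dropped by split()[1:]), word/pos tokens, and bracketed named-entity chunks. The sample line below is hypothetical but follows that convention, and assumes the class above is importable:

```python
import tempfile

sample = "19980101-01-001-001/m [中共中央/nt 总书记/n]nt 发表/v 讲话/n\n"
with tempfile.NamedTemporaryFile("w", suffix=".txt", encoding="utf-8",
                                 delete=False) as f:
    f.write(sample)

loader = PeopleDailyCorpusLoader(f.name)
pos, ner = loader.load()
print(pos[0])  # [['中共中央', '总书记', '发表', '讲话'], ['nt', 'n', 'v', 'n']]
print(ner[0])  # [['中共中央', '总书记', '发表', '讲话'], ['B', 'L', 'O', 'O']]
```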
@@ -1,3 +1,4 @@
 from .CRF import ConditionalRandomField
+from .MLP import MLP
 
-__all__ = ["ConditionalRandomField"]
+__all__ = ["ConditionalRandomField", "MLP"]
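The newly exported MLP is what the updated README example plugs in as its decoder. A minimal sketch of the call shape used in this commit; only the size_layer argument (layer widths, input first) is shown in the diff, so any further options are unknown:

```python
from fastNLP.modules import decoder

# 100-dim aggregated features in, 5 output classes.
mlp = decoder.MLP(size_layer=[100, 5])
```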
@@ -1,114 +0,0 @@
-import sys
-
-sys.path.append("..")
-
-from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
-from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
-from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
-from fastNLP.saver.model_saver import ModelSaver
-from fastNLP.loader.model_loader import ModelLoader
-from fastNLP.core.tester import SeqLabelTester
-from fastNLP.models.sequence_modeling import SeqLabeling
-from fastNLP.core.predictor import Predictor
-
-data_name = "pku_training.utf8"
-cws_data_path = "/home/zyfeng/data/pku_training.utf8"
-pickle_path = "./save/"
-data_infer_path = "/home/zyfeng/data/pku_test.utf8"
-
-
-def infer():
-    # Load infer configuration, the same as test
-    test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
-
-    # fetch dictionary size and number of labels from pickle files
-    word2index = load_pickle(pickle_path, "word2id.pkl")
-    test_args["vocab_size"] = len(word2index)
-    index2label = load_pickle(pickle_path, "id2class.pkl")
-    test_args["num_classes"] = len(index2label)
-
-    # Define the same model
-    model = SeqLabeling(test_args)
-
-    # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
-    print("model loaded!")
-
-    # Data Loader
-    raw_data_loader = BaseLoader(data_name, data_infer_path)
-    infer_data = raw_data_loader.load_lines()
-
-    # Inference interface
-    infer = Predictor(pickle_path)
-    results = infer.predict(model, infer_data)
-
-    print(results)
-    print("Inference finished!")
-
-
-def train_test():
-    # Config Loader
-    train_args = ConfigSection()
-    test_args = ConfigSection()
-    ConfigLoader("good_name", "good_path").load_config("./cws.cfg", {"train": train_args, "test": test_args})
-
-    # Data Loader
-    loader = TokenizeDatasetLoader(data_name, cws_data_path)
-    train_data = loader.load_pku()
-
-    # Preprocessor
-    preprocess = SeqLabelPreprocess()
-    data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
-    train_args["vocab_size"] = preprocess.vocab_size
-    train_args["num_classes"] = preprocess.num_classes
-
-    # Trainer
-    trainer = SeqLabelTrainer(train_args)
-
-    # Model
-    model = SeqLabeling(train_args)
-
-    # Start training
-    trainer.train(model, data_train, data_dev)
-    print("Training finished!")
-
-    # Saver
-    saver = ModelSaver("./save/saved_model.pkl")
-    saver.save_pytorch(model)
-    print("Model saved!")
-
-    # testing with validation set
-    test(data_dev)
-
-
-def test(test_data):
-    # Config Loader
-    train_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
-
-    # Define the same model
-    model = SeqLabeling(train_args)
-
-    # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
-    print("model loaded!")
-
-    # Load test configuration
-    test_args = ConfigSection()
-    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
-
-    # Tester
-    tester = SeqLabelTester(test_args)
-
-    # Start testing
-    tester.test(model, test_data)
-
-    # print test results
-    print(tester.show_matrices())
-    print("model tested!")
-
-
-if __name__ == "__main__":
-    train_test()
@@ -32,3 +32,15 @@ use_crf = true
 use_cuda = true
 rnn_hidden_units = 100
 word_emb_dim = 100
+
+[model]
+save_output = true
+validate_in_training = true
+save_dev_input = false
+save_loss = true
+batch_size = 640
+pickle_path = "./save/"
+use_crf = true
+use_cuda = true
+rnn_hidden_units = 100
+word_emb_dim = 100
@@ -125,7 +125,7 @@ def test():
     tester.test(model, dev_data)
 
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")
 
 
@@ -1,29 +1,35 @@
 [train]
-epochs = 10
-batch_size = 32
+epochs = 30
+batch_size = 64
 pickle_path = "./save/"
 validate = true
 save_best_dev = true
 model_saved_path = "./save/"
 rnn_hidden_units = 100
 rnn_layers = 2
 rnn_bi_direction = true
 word_emb_dim = 100
 dropout = 0.5
 use_crf = true
 use_cuda = true
+print_every_step = 10
 
 [test]
 save_output = true
 validate_in_training = true
 save_dev_input = false
 save_loss = true
-batch_size = 64
+batch_size = 640
 pickle_path = "./save/"
 rnn_hidden_units = 100
 rnn_layers = 1
 rnn_bi_direction = true
 word_emb_dim = 100
 dropout = 0.5
 use_crf = true
 use_cuda = true
+
+
+[POS_test]
+save_output = true
+validate_in_training = true
+save_dev_input = false
+save_loss = true
+batch_size = 640
+pickle_path = "./save/"
+use_crf = true
+use_cuda = true
+rnn_hidden_units = 100
+word_emb_dim = 100
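These named sections are consumed through ConfigLoader/ConfigSection; a sketch of the pattern, taken from train_pos_tag.py below:

```python
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection

test_args = ConfigSection()
# Load the [POS_test] block of the config into test_args; entries such as
# vocab_size and num_classes are filled in later at runtime from the pickles.
ConfigLoader("config", "").load_config("./pos_tag.cfg", {"POS_test": test_args})
print(test_args["batch_size"])  # 640, from the [POS_test] section above
```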
146 reproduction/pos_tag_model/train_pos_tag.py Normal file
@@ -0,0 +1,146 @@
+import os
+import sys
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
+
+from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
+from fastNLP.core.trainer import SeqLabelTrainer
+from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader, BaseLoader
+from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
+from fastNLP.saver.model_saver import ModelSaver
+from fastNLP.loader.model_loader import ModelLoader
+from fastNLP.core.tester import SeqLabelTester
+from fastNLP.models.sequence_modeling import AdvSeqLabel
+from fastNLP.core.predictor import SeqLabelInfer
+
+# not in the file's dir
+if len(os.path.dirname(__file__)) != 0:
+    os.chdir(os.path.dirname(__file__))
+datadir = "/home/zyfeng/data/"
+cfgfile = './pos_tag.cfg'
+data_name = "CWS_POS_TAG_NER_people_daily.txt"
+
+pos_tag_data_path = os.path.join(datadir, data_name)
+pickle_path = "save"
+data_infer_path = os.path.join(datadir, "infer.utf8")
+
+
+def infer():
+    # Config Loader
+    test_args = ConfigSection()
+    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+
+    # fetch dictionary size and number of labels from pickle files
+    word2index = load_pickle(pickle_path, "word2id.pkl")
+    test_args["vocab_size"] = len(word2index)
+    index2label = load_pickle(pickle_path, "id2class.pkl")
+    test_args["num_classes"] = len(index2label)
+
+    # Define the same model
+    model = AdvSeqLabel(test_args)
+
+    try:
+        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+        print('model loaded!')
+    except Exception as e:
+        print('cannot load model!')
+        raise
+
+    # Data Loader
+    raw_data_loader = BaseLoader(data_name, data_infer_path)
+    infer_data = raw_data_loader.load_lines()
+    print('data loaded')
+
+    # Inference interface
+    infer = SeqLabelInfer(pickle_path)
+    results = infer.predict(model, infer_data)
+
+    print(results)
+    print("Inference finished!")
+
+
+def train():
+    # Config Loader
+    train_args = ConfigSection()
+    test_args = ConfigSection()
+    ConfigLoader("good_name", "good_path").load_config(cfgfile, {"train": train_args, "test": test_args})
+
+    # Data Loader
+    loader = PeopleDailyCorpusLoader(pos_tag_data_path)
+    train_data, _ = loader.load()
+
+    # Preprocessor
+    preprocessor = SeqLabelPreprocess()
+    data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
+    train_args["vocab_size"] = preprocessor.vocab_size
+    train_args["num_classes"] = preprocessor.num_classes
+
+    # Trainer
+    trainer = SeqLabelTrainer(**train_args.data)
+
+    # Model
+    model = AdvSeqLabel(train_args)
+    try:
+        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+        print('model parameter loaded!')
+    except Exception as e:
+        print("No saved model. Continue.")
+        pass
+
+    # Start training
+    trainer.train(model, data_train, data_dev)
+    print("Training finished!")
+
+    # Saver
+    saver = ModelSaver("./save/saved_model.pkl")
+    saver.save_pytorch(model)
+    print("Model saved!")
+
+
+def test():
+    # Config Loader
+    test_args = ConfigSection()
+    ConfigLoader("config", "").load_config(cfgfile, {"POS_test": test_args})
+
+    # fetch dictionary size and number of labels from pickle files
+    word2index = load_pickle(pickle_path, "word2id.pkl")
+    test_args["vocab_size"] = len(word2index)
+    index2label = load_pickle(pickle_path, "id2class.pkl")
+    test_args["num_classes"] = len(index2label)
+
+    # load dev data
+    dev_data = load_pickle(pickle_path, "data_dev.pkl")
+
+    # Define the same model
+    model = AdvSeqLabel(test_args)
+
+    # Dump trained parameters into the model
+    ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
+    print("model loaded!")
+
+    # Tester
+    tester = SeqLabelTester(**test_args.data)
+
+    # Start testing
+    tester.test(model, dev_data)
+
+    # print test results
+    print(tester.show_metrics())
+    print("model tested!")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
+    parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer'])
+    args = parser.parse_args()
+    if args.mode == 'train':
+        train()
+    elif args.mode == 'test':
+        test()
+    elif args.mode == 'infer':
+        infer()
+    else:
+        print('no mode specified for model!')
+        parser.print_help()
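How the new script is meant to be driven, as a sketch; the datadir/cfgfile paths inside it are hard-coded to the author's machine and need adapting first:

```python
import subprocess
import sys

# Run each phase of reproduction/pos_tag_model/train_pos_tag.py in turn,
# matching the argparse choices defined at the bottom of the script.
for mode in ("train", "test", "infer"):
    subprocess.run([sys.executable, "train_pos_tag.py", "--mode", mode],
                   check=True)
```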
@@ -68,7 +68,7 @@ class MyNERTester(SeqLabelTester):
     def metrics(self):
         return np.mean(self.eval_history)
 
-    def show_matrices(self):
+    def show_metrics(self):
         return "dev accuracy={:.2f}".format(float(self.metrics()))
 
 
@@ -1,19 +1,13 @@
 # python: 3.5
 # pytorch: 0.4
 
-################
-# Test cross validation.
-################
-
-from fastNLP.loader.preprocess import ClassPreprocess
-
+from fastNLP.core.loss import Loss
+from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.predictor import ClassificationInfer
+from fastNLP.core.preprocess import ClassPreprocess
 from fastNLP.core.trainer import ClassificationTrainer
 from fastNLP.loader.dataset_loader import ClassDatasetLoader
 from fastNLP.models.base_model import BaseModel
 from fastNLP.modules import aggregation
-from fastNLP.modules import encoder
 from fastNLP.modules import decoder
+from fastNLP.modules import encoder
 
 
 class ClassificationModel(BaseModel):
@@ -28,7 +22,7 @@ class ClassificationModel(BaseModel):
         self.enc = encoder.Conv(
             in_channels=300, out_channels=100, kernel_size=3)
         self.agg = aggregation.MaxPool()
-        self.dec = decoder.MLP(100, num_classes=num_classes)
+        self.dec = decoder.MLP(size_layer=[100, num_classes])
 
     def forward(self, x):
         x = self.emb(x)  # [N,L] -> [N,L,C]
@@ -38,18 +32,17 @@ class ClassificationModel(BaseModel):
         return x
 
 
-data_dir = 'data'  # directory to save data and model
-train_path = 'test/data_for_tests/text_classify.txt'  # training set file
+data_dir = 'save/'  # directory to save data and model
+train_path = './data_for_tests/text_classify.txt'  # training set file
 
 # load dataset
 ds_loader = ClassDatasetLoader("train", train_path)
 data = ds_loader.load()
 
 # pre-process dataset
-pre = ClassPreprocess(data, data_dir, cross_val=True, n_fold=5)
-# pre = ClassPreprocess(data, data_dir)
-n_classes = pre.num_classes
-vocab_size = pre.vocab_size
+pre = ClassPreprocess()
+train_set, dev_set = pre.run(data, train_dev_split=0.3, pickle_path=data_dir)
+n_classes, vocab_size = pre.num_classes, pre.vocab_size
 
 # construct model
 model_args = {
@@ -58,22 +51,25 @@ model_args = {
 }
 model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
 
-# train model
+# construct trainer
 train_args = {
-    "epochs": 10,
-    "batch_size": 50,
+    "epochs": 3,
+    "batch_size": 16,
     "pickle_path": data_dir,
     "validate": False,
     "save_best_dev": False,
     "model_saved_path": None,
     "use_cuda": True,
-    "learn_rate": 1e-3,
-    "momentum": 0.9}
-trainer = ClassificationTrainer(train_args)
-# trainer.train(model, ['data_train.pkl', 'data_dev.pkl'])
-trainer.cross_validate(model)
+    "loss": Loss("cross_entropy"),
+    "optimizer": Optimizer("Adam", lr=0.001)
+}
+trainer = ClassificationTrainer(**train_args)
 
+# start training
+trainer.train(model, train_data=train_set, dev_data=dev_set)
 
 # predict using model
 data_infer = [x[0] for x in data]
 infer = ClassificationInfer(data_dir)
-labels_pred = infer.predict(model, data_infer)
+labels_pred = infer.predict(model.cpu(), data_infer)
 print(labels_pred)
@@ -134,7 +134,7 @@ def train_and_test():
     tester.test(model, data_dev)
 
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")
 
 
@@ -108,7 +108,7 @@ def train_test():
     tester.test(model, data_train)
 
     # print test results
-    print(tester.show_matrices())
+    print(tester.show_metrics())
     print("model tested!")
 
 
@@ -1,9 +1,12 @@
 import sys
 
 sys.path.append("..")
 from fastNLP.fastnlp import FastNLP
-from fastNLP.fastnlp import interpret_word_seg_results
+from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_results
 
 PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
+PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/"
 
 
 def word_seg():
     nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES)
@@ -39,5 +42,33 @@ def test_word_seg_interpret():
     print(interpret_word_seg_results(chars, labels))
 
 
+def test_interpret_cws_pos_results():
+    foo = [
+        [('这', 'S-r'), ('是', 'S-v'), ('最', 'S-d'), ('好', 'S-a'), ('的', 'S-u'), ('基', 'B-p'), ('于', 'E-p'), ('深', 'B-d'),
+         ('度', 'E-d'), ('学', 'B-v'), ('习', 'E-v'), ('的', 'S-u'), ('中', 'B-nz'), ('文', 'E-nz'), ('分', 'B-vn'),
+         ('词', 'E-vn'), ('系', 'B-n'), ('统', 'E-n'), ('。', 'S-w')]
+    ]
+    chars = [x[0] for x in foo[0]]
+    labels = [x[1] for x in foo[0]]
+    print(interpret_cws_pos_results(chars, labels))
+
+
+def test_pos_tag():
+    nlp = FastNLP(model_dir=PATH_TO_POS_TAG_PICKLE_FILES)
+    nlp.load("pos_tag_model", config_file="pos_tag.config", section_name="pos_tag_model")
+    text = ["这是最好的基于深度学习的中文分词系统。",
+            "大王叫我来巡山。",
+            "我党多年来致力于改善人民生活水平。"]
+    results = nlp.run(text)
+    for example in results:
+        words, labels = [], []
+        for res in example:
+            words.append(res[0])
+            labels.append(res[1])
+        print(interpret_cws_pos_results(words, labels))
+
+
 if __name__ == "__main__":
     word_seg()
@@ -5,7 +5,6 @@ from fastNLP.loader.dataset_loader import TokenizeDatasetLoader
 from fastNLP.models.sequence_modeling import SeqLabeling
 
 data_name = "pku_training.utf8"
-cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8"
 pickle_path = "data_for_tests"
 
 
@@ -17,7 +16,8 @@ def foo():
     ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
 
     # Preprocessor
-    p = SeqLabelPreprocess(train_data, pickle_path)
+    p = SeqLabelPreprocess()
+    p.run(train_data)
     train_args["vocab_size"] = p.vocab_size
     train_args["num_classes"] = p.num_classes
 
@@ -30,7 +30,7 @@ def foo():
 
     print("start validation.")
     validator.test(model)
-    print(validator.show_matrices())
+    print(validator.show_metrics())
 
 
 if __name__ == "__main__":