Mirror of https://gitee.com/fastnlp/fastNLP.git (synced 2024-11-30 03:07:59 +08:00)

Commit b83ec6ea7e (parent 606714465b): cnn-text by LJY

CNN-sentence_classification/.gitignore (vendored, new file, 110 lines)
@@ -0,0 +1,110 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache

# custom
GoogleNews-vectors-negative300.bin/
GoogleNews-vectors-negative300.bin.gz
models/
*.swp
CNN-sentence_classification/README.md (new file, 77 lines)
@@ -0,0 +1,77 @@
## Introduction

This is a PyTorch implementation of the paper [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882).

* MR dataset, non-static model (word2vec trained by Mikolov et al. (2013) on 100 billion words of Google News)
* It can be run on both CPU and GPU
* The best accuracy is 82.61%, which is better than the 81.5% reported in the paper

(by Jingyuan Liu @Fudan University; Email: fdjingyuan@outlook.com. Discussion is welcome!)


## Requirement

* python 3.6
* pytorch > 0.1
* numpy
* gensim


## Run

STEP 1

Install packages such as gensim (the other required packages are installed the same way):

```
pip install gensim
```


STEP 2

Download the MR dataset and the word2vec resources:

* MR dataset: you can download it from (https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz)
* word2vec: you can download the pre-trained vectors from (https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit)

Since the word2vec file is more than 1.5 GB, it is not included in this folder. If you download it, remember to modify the path in the function `def word_embeddings(path='./GoogleNews-vectors-negative300.bin/')`, as shown below.
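For example, a minimal sketch of loading the embeddings from a custom location (using the `dataset` module from this folder; the path is only a placeholder for wherever you stored the downloaded file):

```
from dataset import MRDataset

data = MRDataset()
# placeholder path -- replace it with the location of your downloaded file
weights = data.word_embeddings(path='./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin')
print(weights.shape)  # (vocabulary size, 300)
```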
STEP 3

Train the model:

```
python train.py
```

You will see progress printed to the screen, like:

```
Epoch [1/20], Iter [100/192] Loss: 0.7008
Test Accuracy: 71.869159 %
Epoch [2/20], Iter [100/192] Loss: 0.5957
Test Accuracy: 75.700935 %
Epoch [3/20], Iter [100/192] Loss: 0.4934
Test Accuracy: 78.130841 %

......
Epoch [20/20], Iter [100/192] Loss: 0.0364
Test Accuracy: 81.495327 %
Best Accuracy: 82.616822 %
Best Model: models/cnn.pkl
```


## Hyperparameters

According to the paper and my experiments, I set:

|Epoch|Kernel Size|Dropout|Learning Rate|Batch Size|
|---|---|---|---|---|
|20|(h, 300, 100)|0.5|0.0001|50|

where h = [3, 4, 5].
If the test accuracy does not improve after an epoch, the learning rate is multiplied by 0.8 (see the sketch below).
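A minimal sketch of that decay rule as applied in train.py (the names `acc`, `best_acc`, `learning_rate`, and `optimizer` are the ones defined in the training script below):

```
if best_acc is None or acc > best_acc:
    best_acc = acc                       # new best model: keep the learning rate
else:
    learning_rate = learning_rate * 0.8  # decay by 0.8
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate
```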
## Result

I only tried one dataset: MR. (The other datasets in the paper include SST-1, SST-2, TREC, CR, and MPQA.)
There are four models in the paper: CNN-rand, CNN-static, CNN-non-static, and CNN-multichannel.
I implemented CNN-non-static: a model initialized with pre-trained vectors from word2vec.
All words, including the unknown ones that are randomly initialized, are fine-tuned for each task together with the pre-trained vectors.
(Among the four models it has almost the best performance and is the most difficult to implement.)
A sketch of this setup follows.
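Concretely, the non-static setup copies the word2vec matrix into the embedding layer and leaves it trainable. This illustrative sketch mirrors what model.py does; the random matrix is only a stand-in for the output of `MRDataset.word_embeddings()`:

```
import numpy as np
import torch
import torch.nn as nn

# stand-in for MRDataset.word_embeddings(): a (vocab_size, 300) word2vec matrix
embedding_weights = np.random.uniform(-0.25, 0.25, (1000, 300))

embedding = nn.Embedding(embedding_weights.shape[0], 300)
embedding.weight.data.copy_(torch.from_numpy(embedding_weights))
# the weights are not frozen, so they are fine-tuned with the rest of the model
```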
|Dataset|Class Size|Best Result|Kim's Paper Result|
|---|---|---|---|
|MR|2|82.617% (CNN-non-static)|81.5% (CNN-non-static)|


## Reference

* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882)
* https://github.com/Shawn1993/cnn-text-classification-pytorch
* https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/utils.py
CNN-sentence_classification/dataset.py (new file, 149 lines)
@@ -0,0 +1,149 @@
import re
import sys
import itertools
import numpy as np
from torch.utils.data import Dataset, DataLoader

import random
import os
import pickle
import codecs
from gensim import corpora
import gensim


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()


def pad_sentences(sentence, padding_word=" <PAD/>"):
    # pad every sentence to a fixed length of 64 tokens
    sequence_length = 64
    sent = sentence.split()
    padded_sentence = sentence + padding_word * (sequence_length - len(sent))
    return padded_sentence


# data loader
class MRDataset(Dataset):
    def __init__(self):
        # load positive and negative sentences from files
        with codecs.open("./rt-polaritydata/rt-polarity.pos", encoding='ISO-8859-1') as f:
            positive_examples = list(f.readlines())
        with codecs.open("./rt-polaritydata/rt-polarity.neg", encoding='ISO-8859-1') as f:
            negative_examples = list(f.readlines())
        # s.strip(): remove "\n"; clean_str(): tokenize; pad_sentences(): pad to fixed length
        positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples]
        negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples]
        self.examples = positive_examples + negative_examples
        self.sentences_texts = [sample.split() for sample in self.examples]

        # word dictionary
        dictionary = corpora.Dictionary(self.sentences_texts)
        self.word2id_dict = dictionary.token2id  # a dict like {"human": 0, "a": 1, ...}

        # set labels: positive is 1, negative is 0
        positive_labels = [1 for _ in positive_examples]
        negative_labels = [0 for _ in negative_examples]
        self.labels = positive_labels + negative_labels
        examples_labels = list(zip(self.examples, self.labels))
        random.shuffle(examples_labels)
        self.MRDataset_frame = examples_labels

        # transform words to ids
        self.MRDataset_wordid = \
            [(
                np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64),
                sent[1]
            ) for sent in self.MRDataset_frame]

    def word_embeddings(self, path='./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'):
        # load the pre-trained Google News word2vec vectors
        print('Please wait ... (it could take a while to load the file : {})'.format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

        word_dict = self.word2id_dict
        # words missing from word2vec keep a random initialization in [-0.25, 0.25]
        embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300))

        for word in word_dict:
            word_id = word_dict[word]
            if word in model.wv.vocab:
                embedding_weights[word_id, :] = model[word]

        return embedding_weights

    def __len__(self):
        return len(self.MRDataset_frame)

    def __getitem__(self, idx):
        sample = self.MRDataset_wordid[idx]
        return sample

    def getsent(self, idx):
        sample = self.MRDataset_wordid[idx][0]
        return sample

    def getlabel(self, idx):
        label = self.MRDataset_wordid[idx][1]
        return label

    def word2id(self):
        return self.word2id_dict

    def id2word(self):
        id2word_dict = dict([val, key] for key, val in self.word2id_dict.items())
        return id2word_dict


class train_set(Dataset):

    def __init__(self, samples):
        self.train_frame = samples

    def __len__(self):
        return len(self.train_frame)

    def __getitem__(self, idx):
        return self.train_frame[idx]


class test_set(Dataset):

    def __init__(self, samples):
        self.test_frame = samples

    def __len__(self):
        return len(self.test_frame)

    def __getitem__(self, idx):
        return self.test_frame[idx]
CNN-sentence_classification/model.py (new file, 61 lines)
@@ -0,0 +1,61 @@
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset

import dataset

"""
# some information
mode = "static"
use_pretrained_embedding = "gensim.word2vec"
print('MODE = {}'.format(mode))
print('EMBEDDING = {}\n'.format(use_pretrained_embedding))

embedding_weights = dataset.word_embedding_300()
embed_num = len(embedding_weights)
embed_dim = 300
class_num = 2
len_sentence = 64

print('embedding size = {}'.format(embed_num))
print('embedding dimension = {}'.format(embed_dim))
print('sentence len n = {}'.format(len_sentence))
print('num of classes = {}'.format(class_num))
"""


class CNN_text(nn.Module):
    def __init__(self, kernel_h=[3, 4, 5], kernel_num=100, embed_num=1000, embed_dim=300, dropout=0.5, L2_constrain=3, batchsize=50, pretrained_embeddings=None):
        super(CNN_text, self).__init__()

        self.embedding = nn.Embedding(embed_num, embed_dim)
        self.dropout = nn.Dropout(dropout)
        if pretrained_embeddings is not None:
            # non-static mode: start from the word2vec weights and fine-tune them
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))

        # the network structure
        # Conv2d: input (N, C, H, W); output (50, 100, 62, 1) for K = 3
        self.conv1 = nn.ModuleList([nn.Conv2d(1, 100, (K, 300)) for K in kernel_h])
        self.fc1 = nn.Linear(300, 2)

    def max_pooling(self, x, conv):
        # helper equivalent to the per-kernel convolution + max-over-time pooling in forward()
        x = F.relu(conv(x)).squeeze(3)             # (N, C, L) = (50, 100, 62)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)  # x.size(2) = 62; squeeze: (50, 100, 1) -> (50, 100)
        return x

    def forward(self, x):
        x = self.embedding(x)   # output: (N, H, W) = (50, 64, 300)
        x = x.unsqueeze(1)      # (N, C, H, W)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv1]  # [(50, 100, 62), (50, 100, 61), (50, 100, 60)]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]   # [(50, 100), (50, 100), (50, 100)]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        x = self.fc1(x)
        return x
CNN-sentence_classification/rt-polaritydata/rt-polarity.neg (new file, 5331 lines)
File diff suppressed because it is too large

CNN-sentence_classification/rt-polaritydata/rt-polarity.pos (new file, 5331 lines)
File diff suppressed because it is too large
CNN-sentence_classification/train.py (new file, 100 lines)
@@ -0,0 +1,100 @@
import os
import torch
import torch.nn as nn
import dataset as dst
from model import CNN_text
from torch.autograd import Variable


# Hyper Parameters
batch_size = 50
learning_rate = 0.0001
num_epochs = 20
cuda = True


# split the dataset into train (90%) and test (10%)
dataset = dst.MRDataset()
length = len(dataset)

train_dataset = dataset[:int(0.9 * length)]
test_dataset = dataset[int(0.9 * length):]

train_dataset = dst.train_set(train_dataset)
test_dataset = dst.test_set(test_dataset)


# Data Loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)


cnn = CNN_text(embed_num=len(dataset.word2id()), pretrained_embeddings=dataset.word_embeddings())
if cuda:
    cnn.cuda()


# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)

best_acc = None

for epoch in range(num_epochs):
    # Train the Model
    cnn.train()
    for i, (sents, labels) in enumerate(train_loader):
        sents = Variable(sents)
        labels = Variable(labels)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        optimizer.zero_grad()
        outputs = cnn(sents)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, loss.data[0]))

    # Test the Model
    cnn.eval()
    correct = 0
    total = 0
    for sents, labels in test_loader:
        sents = Variable(sents)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        outputs = cnn(sents)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    acc = 100. * correct / total
    print('Test Accuracy: %f %%' % (acc))

    if best_acc is None or acc > best_acc:
        best_acc = acc
        if os.path.exists("models") is False:
            os.makedirs("models")
        torch.save(cnn.state_dict(), 'models/cnn.pkl')
    else:
        # decay the learning rate by 0.8 and propagate it to the optimizer
        learning_rate = learning_rate * 0.8
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

print("Best Accuracy: %f %%" % best_acc)
print("Best Model: models/cnn.pkl")