cnn-text by LJY

JingyuanLiu 2018-03-18 21:16:29 +08:00
parent 606714465b
commit b83ec6ea7e
7 changed files with 11159 additions and 0 deletions

CNN-sentence_classification/.gitignore

@@ -0,0 +1,110 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache
#custom
GoogleNews-vectors-negative300.bin/
GoogleNews-vectors-negative300.bin.gz
models/
*.swp

CNN-sentence_classification/README.md

@@ -0,0 +1,77 @@
## Introduction
This is a PyTorch implementation of the paper [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882).
* MR dataset, non-static model (word2vec trained by Mikolov et al. (2013) on 100 billion words of Google News)
* It can be run on both CPU and GPU
* The best accuracy is 82.61%, which is better than the 81.5% reported in the paper
(by Jingyuan Liu @ Fudan University; Email: fdjingyuan@outlook.com. Discussion welcome!)
## Requirements
* python 3.6
* pytorch > 0.1
* numpy
* gensim
## Run
STEP 1
Install the required packages, e.g. gensim (install the other requirements, such as numpy and pytorch, the same way):
```
pip install gensim
```
STEP 2
Download the MR dataset and the word2vec resources
* MR dataset: download it from (https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz)
* word2vec: download the file from (https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit)
Since this file is larger than 1.5 GB, it is not included in the repository. After downloading it, remember to modify the path in the function `def word_embeddings(path = './GoogleNews-vectors-negative300.bin/'):` in `dataset.py`.
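A minimal sketch of checking that the downloaded vectors load (the path below matches the default in `dataset.py`; adjust it to your own location):
```
# quick check that the Google News vectors load from the chosen path
import gensim

path = './GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'
word2vec = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
print(word2vec['movie'].shape)   # expected: (300,)
```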
STEP 3
Train the model:
```
python train.py
```
You will see output like the following:
```
Epoch [1/20], Iter [100/192] Loss: 0.7008
Test Accuracy: 71.869159 %
Epoch [2/20], Iter [100/192] Loss: 0.5957
Test Accuracy: 75.700935 %
Epoch [3/20], Iter [100/192] Loss: 0.4934
Test Accuracy: 78.130841 %
......
Epoch [20/20], Iter [100/192] Loss: 0.0364
Test Accuracy: 81.495327 %
Best Accuracy: 82.616822 %
Best Model: models/cnn.pkl
```
## Hyperparameters
Following the paper and my experiments, I set:

|Epochs|Kernel Size|Dropout|Learning Rate|Batch Size|
|---|---|---|---|---|
|20|(h, 300) × 100|0.5|0.0001|50|

where h = [3, 4, 5], i.e. 100 kernels of size (h, 300) for each h.
If the test accuracy does not improve, the learning rate is multiplied by 0.8 (see the sketch below).
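As a sketch, the decay rule corresponds to the logic below (assuming the Adam optimizer created in `train.py` is named `optimizer`):
```
# learning-rate decay sketch: applied when the test accuracy did not improve
if best_acc is not None and acc <= best_acc:
    learning_rate = learning_rate * 0.8
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate
```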
## Result
I only tried one dataset: MR. (The paper also evaluates six others: SST-1, SST-2, Subj, TREC, CR, and MPQA.)
There are four models in the paper: CNN-rand, CNN-static, CNN-non-static, and CNN-multichannel.
I implemented CNN-non-static: a model initialized with pre-trained word2vec vectors, in which all word vectors (including those of unknown words, which are randomly initialized) are fine-tuned for each task.
Among the four models it has nearly the best performance and is the most difficult to implement; a small sketch of the static vs. non-static distinction follows the table below.

|Dataset|Classes|Best Result|Kim's Paper Result|
|---|---|---|---|
|MR|2|82.617% (CNN-non-static)|81.5% (CNN-non-static)|
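For illustration only (not part of this repository's code), a minimal sketch of the static vs. non-static distinction in PyTorch; `CNN_text` keeps the embedding trainable, i.e. non-static:
```
import torch.nn as nn

embedding = nn.Embedding(1000, 300)     # e.g. vocabulary of 1000 words, 300-d vectors
# non-static (this repo): the pre-trained vectors are fine-tuned by the optimizer
embedding.weight.requires_grad = True   # the default
# static variant from the paper: freeze the pre-trained vectors
# embedding.weight.requires_grad = False
```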
## References
* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882)
* https://github.com/Shawn1993/cnn-text-classification-pytorch
* https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/utils.py

CNN-sentence_classification/dataset.py

@@ -0,0 +1,149 @@
import re
import sys
import itertools
import numpy as np
from torch.utils.data import Dataset, DataLoader
import random
import os
import pickle
import codecs
from gensim import corpora
import gensim
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()


def pad_sentences(sentence, padding_word=" <PAD/>"):
    # pad every sentence to a fixed length of 64 tokens
    sequence_length = 64
    sent = sentence.split()
    padded_sentence = sentence + padding_word * (sequence_length - len(sent))
    return padded_sentence


# data loader
class MRDataset(Dataset):
    def __init__(self):
        # load positive and negative sentences from files
        with codecs.open("./rt-polaritydata/rt-polarity.pos", encoding='ISO-8859-1') as f:
            positive_examples = list(f.readlines())
        with codecs.open("./rt-polaritydata/rt-polarity.neg", encoding='ISO-8859-1') as f:
            negative_examples = list(f.readlines())
        # s.strip(): remove "\n"; then clean_str; then pad to fixed length
        positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples]
        negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples]
        self.examples = positive_examples + negative_examples
        self.sentences_texts = [sample.split() for sample in self.examples]

        # word dictionary
        dictionary = corpora.Dictionary(self.sentences_texts)
        self.word2id_dict = dictionary.token2id  # a dict like {"human": 0, "a": 1, ...}

        # set labels: positive is 1, negative is 0
        positive_labels = [1 for _ in positive_examples]
        negative_labels = [0 for _ in negative_examples]
        self.labels = positive_labels + negative_labels
        examples_labels = list(zip(self.examples, self.labels))
        random.shuffle(examples_labels)
        self.MRDataset_frame = examples_labels

        # transform words to ids
        self.MRDataset_wordid = [
            (
                np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64),
                sent[1],
            )
            for sent in self.MRDataset_frame
        ]

    def word_embeddings(self, path='./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'):
        # build the embedding matrix from the pre-trained Google News vectors
        print('Please wait ... (it could take a while to load the file : {})'.format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
        word_dict = self.word2id_dict
        embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300))
        for word in word_dict:
            word_id = word_dict[word]
            if word in model.wv.vocab:
                embedding_weights[word_id, :] = model[word]
        return embedding_weights

    def __len__(self):
        return len(self.MRDataset_frame)

    def __getitem__(self, idx):
        sample = self.MRDataset_wordid[idx]
        return sample

    def getsent(self, idx):
        sample = self.MRDataset_wordid[idx][0]
        return sample

    def getlabel(self, idx):
        label = self.MRDataset_wordid[idx][1]
        return label

    def word2id(self):
        return self.word2id_dict

    def id2word(self):
        id2word_dict = dict([val, key] for key, val in self.word2id_dict.items())
        return id2word_dict


class train_set(Dataset):
    def __init__(self, samples):
        self.train_frame = samples

    def __len__(self):
        return len(self.train_frame)

    def __getitem__(self, idx):
        return self.train_frame[idx]


class test_set(Dataset):
    def __init__(self, samples):
        self.test_frame = samples

    def __len__(self):
        return len(self.test_frame)

    def __getitem__(self, idx):
        return self.test_frame[idx]
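A rough usage sketch of the classes above (assuming the rt-polaritydata files are in place; this snippet is illustrative, not part of the repository):
```
# hypothetical quick check of MRDataset / train_set / DataLoader
from torch.utils.data import DataLoader
import dataset as dst

mr = dst.MRDataset()
print(len(mr), len(mr.word2id()))   # number of sentences, vocabulary size
sent_ids, label = mr[0]             # padded sentence as word ids, and its 0/1 label
split = int(0.9 * len(mr))
train_loader = DataLoader(dst.train_set(mr[:split]), batch_size=50, shuffle=True)
```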

CNN-sentence_classification/model.py

@@ -0,0 +1,61 @@
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
import dataset
"""
#some information
mode = "static"
use_pretrained_embedding = "gensim.word2vec"
print('MODE = {}'.format(mode))
print('EMBEDDING = {}\n'.format(use_pretrained_embeddings)
embedding_weights = dataset.word_embedding_300()
embed_num = len(embedding_weights)
embed_dim = 300
class_num = 2
len_sentence = 64
print('embedding size = {}'.format(embed_num))
print('embedding dimension = {}'.format(embed_dim))
print('sentence len n = {}'.format(len_sentence))
print('num of classes = {}'.format(class_num))
"""
class CNN_text(nn.Module):
def __init__(self, kernel_h=[3,4,5], kernel_num=100, embed_num=1000, embed_dim=300, dropout=0.5, L2_constrain=3, batchsize=50, pretrained_embeddings=None):
super(CNN_text, self).__init__()
self.embedding = nn.Embedding(embed_num,embed_dim)
self.dropout = nn.Dropout(dropout)
if pretrained_embeddings is not None:
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
#the network structure
#Conv2d: input- N,C,H,W output- (50,100,62,1)
self.conv1 = nn.ModuleList([nn.Conv2d(1, 100, (K, 300)) for K in kernel_h])
self.fc1 = nn.Linear(300,2)
def max_pooling(self, x):
x = F.relu(conv(x)).squeeze(3) #N,C,L - (50,100,62)
x = F.max_pool1d(x, x.size(2)).squeeze(2)
#x.size(2)=62 squeeze: (50,100,1) -> (50,100)
return x
def forward(self, x):
x = self.embedding(x) #output: (N,H,W) = (50,64,300)
x = x.unsqueeze(1) #(N,C,H,W)
x = [F.relu(conv(x)).squeeze(3) for conv in self.conv1] #[N, C, H(50,100,62),(50,100,61),(50,100,60)]
x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[N,C(50,100),(50,100),(50,100)]
x = torch.cat(x,1)
x = self.dropout(x)
x = self.fc1(x)
return x
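A sanity-check sketch of the module above, using random weights and a dummy batch (illustrative only; real training is in `train.py`):
```
# hypothetical forward pass: batch of 50 padded sentences, 64 word ids each
import torch
from torch.autograd import Variable
from model import CNN_text

cnn = CNN_text(embed_num=1000)                               # vocabulary of 1000 words
dummy = Variable(torch.LongTensor(50, 64).random_(0, 1000))  # random word ids
logits = cnn(dummy)
print(logits.size())                                         # expected: (50, 2)
```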

File diff suppressed because it is too large

File diff suppressed because it is too large

CNN-sentence_classification/train.py

@@ -0,0 +1,100 @@
import os
import torch
import torch.nn as nn
import dataset as dst
from model import CNN_text
from torch.autograd import Variable

# Hyper Parameters
batch_size = 50
learning_rate = 0.0001
num_epochs = 20
cuda = True

# split the dataset: 90% train, 10% test
dataset = dst.MRDataset()
length = len(dataset)
train_dataset = dataset[:int(0.9 * length)]
test_dataset = dataset[int(0.9 * length):]
train_dataset = dst.train_set(train_dataset)
test_dataset = dst.test_set(test_dataset)

# Data Loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

cnn = CNN_text(embed_num=len(dataset.word2id()), pretrained_embeddings=dataset.word_embeddings())
if cuda:
    cnn.cuda()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)

best_acc = None
for epoch in range(num_epochs):
    # Train the Model
    cnn.train()
    for i, (sents, labels) in enumerate(train_loader):
        sents = Variable(sents)
        labels = Variable(labels)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        optimizer.zero_grad()
        outputs = cnn(sents)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if (i + 1) % 100 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, loss.data[0]))

    # Test the Model
    cnn.eval()
    correct = 0
    total = 0
    for sents, labels in test_loader:
        sents = Variable(sents)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        outputs = cnn(sents)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    acc = 100. * correct / total
    print('Test Accuracy: %f %%' % (acc))

    # keep the best model; otherwise decay the learning rate by 0.8
    if best_acc is None or acc > best_acc:
        best_acc = acc
        if os.path.exists("models") is False:
            os.makedirs("models")
        torch.save(cnn.state_dict(), 'models/cnn.pkl')
    else:
        learning_rate = learning_rate * 0.8
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

print("Best Accuracy: %f %%" % best_acc)
print("Best Model: models/cnn.pkl")