{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"torch.Size([1, 5, 50])\n"
]
}
],
"source": [
"import torch\n",
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]]) # 将文本转为index\n",
"print(embed(words).size()) # StaticEmbedding的使用和pytorch的nn.Embedding是类似的"
]
},
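{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extra sketch (not in the original tutorial): embeddings accept padded batches,\n",
"# so a shorter sentence can be right-padded with vocab.padding_idx (assumed to\n",
"# be exposed by fastNLP's Vocabulary, analogous to the unknown_idx used below).\n",
"batch = torch.LongTensor([[vocab.to_index(w) for w in \"this is a demo .\".split()],\n",
"                          [vocab.to_index(w) for w in \"this is a\".split()] + [vocab.padding_idx] * 2])\n",
"print(embed(batch).size())  # expected: torch.Size([2, 5, 50])"
]
},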
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 5, 30])\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"from fastNLP.embeddings import ElmoEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 512])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')\n",
"print(embed(words).size()) # 三层输出按照权重element-wise的加起来"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 1536])\n"
]
}
],
"source": [
"# 使用后面两层的输出\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')\n",
"print(embed(words).size()) # 结果将是在最后一维做拼接"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 7, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)\n",
"print(embed(words).size()) # 结果将在序列维度上增加2\n",
"# 取出句子的cls表示\n",
"cls_reps = embed(words)[:, 0] # shape: [batch_size, 768]"
]
},
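{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A hypothetical follow-up (not from the tutorial): besides the [CLS] vector,\n",
"# a fixed-size sentence representation can be built by mean-pooling the token\n",
"# vectors along the sequence dimension with plain PyTorch.\n",
"sent_reps = embed(words).mean(dim=1)  # shape: [batch_size, 768]\n",
"print(sent_reps.size())"
]
},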
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 10 words out of 10.\n",
"torch.Size([1, 9, 768])\n"
]
}
],
"source": [
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo . [SEP] another sentence .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo . [SEP] another sentence .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import CNNCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# character的embedding维度大小为50返回的embedding结果维度大小为64。\n",
"embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import LSTMCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# character的embedding维度大小为50返回的embedding结果维度大小为64。\n",
"embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"50\n",
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"30\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"256\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"512\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"768\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"1536\n",
"80\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"print(static_embed.embedding_dim) # 50\n",
"char_embed = CNNCharEmbedding(vocab, embed_size=30)\n",
"print(char_embed.embedding_dim) # 30\n",
"elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')\n",
"print(elmo_embed_1.embedding_dim) # 256\n",
"elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')\n",
"print(elmo_embed_2.embedding_dim) # 512\n",
"bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_1.embedding_dim) # 768\n",
"bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_2.embedding_dim) # 1536\n",
"stack_embed = StackEmbedding([static_embed, char_embed])\n",
"print(stack_embed.embedding_dim) # 80"
]
},
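{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch (an assumption, not part of the tutorial): any of these\n",
"# embeddings can be dropped into an ordinary PyTorch module, e.g. a toy\n",
"# linear tagger head on top of the StackEmbedding built above.\n",
"import torch\n",
"import torch.nn as nn\n",
"\n",
"class ToyTagger(nn.Module):\n",
"    def __init__(self, embed, num_tags):\n",
"        super().__init__()\n",
"        self.embed = embed  # any fastNLP embedding with an embedding_dim attribute\n",
"        self.fc = nn.Linear(embed.embedding_dim, num_tags)\n",
"\n",
"    def forward(self, words):\n",
"        return self.fc(self.embed(words))  # [batch, seq_len, num_tags]\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(w) for w in \"this is a demo .\".split()]])\n",
"tagger = ToyTagger(stack_embed, num_tags=3)\n",
"print(tagger(words).size())  # expected: torch.Size([1, 5, 3])"
]
},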
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True) # 初始化时设定为需要更新\n",
"embed.requires_grad = False # 修改BertEmbedding的权重为不更新"
]
},
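{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick check (sketch; assumes fastNLP embeddings are ordinary nn.Modules, so\n",
"# .parameters() is available): after freezing, no parameter should require grad.\n",
"print(any(p.requires_grad for p in embed.parameters()))  # expected: False"
]
},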
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ 0.3633, -0.2091, -0.0353, -0.3771, -0.5193]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.0926, -0.4812, -0.7744, 0.4836, -0.5475]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示但与使用预训练词向量时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All word in the vocab have been lowered. There are 6 words, 4 unique lowered words.\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
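{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Follow-up check (sketch): with lower=True, other case variants share vectors\n",
"# too; 'A' and 'a' keep distinct indices but map to the same embedding.\n",
"print(torch.equal(embed(torch.LongTensor([vocab.to_index('a')])),\n",
"                  embed(torch.LongTensor([vocab.to_index('A')]))))  # expected: True"
]
},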
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 out of 4 words have frequency less than 2.\n",
"tensor([[ 0.4724, -0.7277, -0.6350, -0.5258, -0.6063]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a\".split())\n",
"# 下面用随机的StaticEmbedding演示但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
},
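{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (sketch): with min_freq=2, the rare word 'a' is backed by the\n",
"# unknown vector, so these two lookups return identical tensors.\n",
"print(torch.equal(embed(torch.LongTensor([vocab.to_index('a')])),\n",
"                  embed(torch.LongTensor([vocab.unknown_idx]))))  # expected: True"
]
},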
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 out of 5 words have frequency less than 2.\n",
"All word in the vocab have been lowered. There are 5 words, 4 unique lowered words.\n",
"tensor([[ 0.1943, 0.3739, 0.2769, -0.4746, -0.3181]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[-0.1348, -0.2172, -0.0071, 0.5704, -0.2607]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('A')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}