mirror of
https://gitee.com/fastnlp/fastNLP.git
synced 2024-12-04 21:28:01 +08:00
525 lines
17 KiB
Plaintext
525 lines
17 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Found 5 out of 7 words in the pre-training embedding.\n",
|
||
"torch.Size([1, 5, 50])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import torch\n",
|
||
"from fastNLP.embeddings import StaticEmbedding\n",
|
||
"from fastNLP import Vocabulary\n",
|
||
"\n",
|
||
"vocab = Vocabulary()\n",
|
||
"vocab.add_word_lst(\"this is a demo .\".split())\n",
|
||
"\n",
|
||
"embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
|
||
"\n",
|
||
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]]) # 将文本转为index\n",
|
||
"print(embed(words).size()) # StaticEmbedding的使用和pytorch的nn.Embedding是类似的"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"torch.Size([1, 5, 30])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP.embeddings import StaticEmbedding\n",
|
||
"from fastNLP import Vocabulary\n",
|
||
"\n",
|
||
"vocab = Vocabulary()\n",
|
||
"vocab.add_word_lst(\"this is a demo .\".split())\n",
|
||
"\n",
|
||
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)\n",
|
||
"\n",
|
||
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
|
||
"print(embed(words).size())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"22 out of 22 characters were found in pretrained elmo embedding.\n",
|
||
"torch.Size([1, 5, 256])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP.embeddings import ElmoEmbedding\n",
|
||
"from fastNLP import Vocabulary\n",
|
||
"\n",
|
||
"vocab = Vocabulary()\n",
|
||
"vocab.add_word_lst(\"this is a demo .\".split())\n",
|
||
"\n",
|
||
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)\n",
|
||
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
|
||
"print(embed(words).size())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"22 out of 22 characters were found in pretrained elmo embedding.\n",
|
||
"torch.Size([1, 5, 512])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')\n",
|
||
"print(embed(words).size())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"22 out of 22 characters were found in pretrained elmo embedding.\n",
|
||
"torch.Size([1, 5, 256])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')\n",
|
||
"print(embed(words).size()) # 三层输出按照权重element-wise的加起来"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
|
||
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
|
||
"Start to generate word pieces for word.\n",
|
||
"Found(Or segment into word pieces) 7 words out of 7.\n",
|
||
"torch.Size([1, 5, 768])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP.embeddings import BertEmbedding\n",
|
||
"from fastNLP import Vocabulary\n",
|
||
"\n",
|
||
"vocab = Vocabulary()\n",
|
||
"vocab.add_word_lst(\"this is a demo .\".split())\n",
|
||
"\n",
|
||
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')\n",
|
||
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
|
||
"print(embed(words).size())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
|
||
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
|
||
"Start to generate word pieces for word.\n",
|
||
"Found(Or segment into word pieces) 7 words out of 7.\n",
|
||
"torch.Size([1, 5, 1536])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 使用后面两层的输出\n",
|
||
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')\n",
|
||
"print(embed(words).size()) # 结果将是在最后一维做拼接"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
|
||
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
|
||
"Start to generate word pieces for word.\n",
|
||
"Found(Or segment into word pieces) 7 words out of 7.\n",
|
||
"torch.Size([1, 7, 768])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)\n",
|
||
"print(embed(words).size()) # 结果将在序列维度上增加2\n",
|
||
"# 取出句子的cls表示\n",
|
||
"cls_reps = embed(words)[:, 0] # shape: [batch_size, 768]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
|
||
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
|
||
"Start to generate word pieces for word.\n",
|
||
"Found(Or segment into word pieces) 7 words out of 7.\n",
|
||
"torch.Size([1, 5, 768])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
|
||
"print(embed(words).size())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
|
||
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
|
||
"Start to generate word pieces for word.\n",
|
||
"Found(Or segment into word pieces) 10 words out of 10.\n",
|
||
"torch.Size([1, 9, 768])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"vocab = Vocabulary()\n",
|
||
"vocab.add_word_lst(\"this is a demo . [SEP] another sentence .\".split())\n",
|
||
"\n",
|
||
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
|
||
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo . [SEP] another sentence .\".split()]])\n",
|
||
"print(embed(words).size())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Start constructing character vocabulary.\n",
|
||
"In total, there are 8 distinct characters.\n",
|
||
"torch.Size([1, 5, 64])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP.embeddings import CNNCharEmbedding\n",
|
||
"from fastNLP import Vocabulary\n",
|
||
"\n",
|
||
"vocab = Vocabulary()\n",
|
||
"vocab.add_word_lst(\"this is a demo .\".split())\n",
|
||
"\n",
|
||
"# character的embedding维度大小为50,返回的embedding结果维度大小为64。\n",
|
||
"embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
|
||
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
|
||
"print(embed(words).size())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Start constructing character vocabulary.\n",
|
||
"In total, there are 8 distinct characters.\n",
|
||
"torch.Size([1, 5, 64])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP.embeddings import LSTMCharEmbedding\n",
|
||
"from fastNLP import Vocabulary\n",
|
||
"\n",
|
||
"vocab = Vocabulary()\n",
|
||
"vocab.add_word_lst(\"this is a demo .\".split())\n",
|
||
"\n",
|
||
"# character的embedding维度大小为50,返回的embedding结果维度大小为64。\n",
|
||
"embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
|
||
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
|
||
"print(embed(words).size())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Found 5 out of 7 words in the pre-training embedding.\n",
|
||
"50\n",
|
||
"Start constructing character vocabulary.\n",
|
||
"In total, there are 8 distinct characters.\n",
|
||
"30\n",
|
||
"22 out of 22 characters were found in pretrained elmo embedding.\n",
|
||
"256\n",
|
||
"22 out of 22 characters were found in pretrained elmo embedding.\n",
|
||
"512\n",
|
||
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
|
||
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
|
||
"Start to generate word pieces for word.\n",
|
||
"Found(Or segment into word pieces) 7 words out of 7.\n",
|
||
"768\n",
|
||
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
|
||
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
|
||
"Start to generate word pieces for word.\n",
|
||
"Found(Or segment into word pieces) 7 words out of 7.\n",
|
||
"1536\n",
|
||
"80\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP.embeddings import *\n",
|
||
"\n",
|
||
"vocab = Vocabulary()\n",
|
||
"vocab.add_word_lst(\"this is a demo .\".split())\n",
|
||
"\n",
|
||
"static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
|
||
"print(static_embed.embedding_dim) # 50\n",
|
||
"char_embed = CNNCharEmbedding(vocab, embed_size=30)\n",
|
||
"print(char_embed.embedding_dim) # 30\n",
|
||
"elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')\n",
|
||
"print(elmo_embed_1.embedding_dim) # 256\n",
|
||
"elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')\n",
|
||
"print(elmo_embed_2.embedding_dim) # 512\n",
|
||
"bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')\n",
|
||
"print(bert_embed_1.embedding_dim) # 768\n",
|
||
"bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')\n",
|
||
"print(bert_embed_2.embedding_dim) # 1536\n",
|
||
"stack_embed = StackEmbedding([static_embed, char_embed])\n",
|
||
"print(stack_embed.embedding_dim) # 80"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
|
||
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
|
||
"Start to generate word pieces for word.\n",
|
||
"Found(Or segment into word pieces) 7 words out of 7.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP.embeddings import *\n",
|
||
"\n",
|
||
"vocab = Vocabulary()\n",
|
||
"vocab.add_word_lst(\"this is a demo .\".split())\n",
|
||
"\n",
|
||
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True) # 初始化时设定为需要更新\n",
|
||
"embed.requires_grad = False # 修改BertEmbedding的权重为不更新"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"tensor([[ 0.3633, -0.2091, -0.0353, -0.3771, -0.5193]],\n",
|
||
" grad_fn=<EmbeddingBackward>)\n",
|
||
"tensor([[ 0.0926, -0.4812, -0.7744, 0.4836, -0.5475]],\n",
|
||
" grad_fn=<EmbeddingBackward>)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP.embeddings import StaticEmbedding\n",
|
||
"from fastNLP import Vocabulary\n",
|
||
"\n",
|
||
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
|
||
"# 下面用随机的StaticEmbedding演示,但与使用预训练词向量时效果是一致的\n",
|
||
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)\n",
|
||
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
|
||
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"All word in the vocab have been lowered. There are 6 words, 4 unique lowered words.\n",
|
||
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
|
||
" grad_fn=<EmbeddingBackward>)\n",
|
||
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
|
||
" grad_fn=<EmbeddingBackward>)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP.embeddings import StaticEmbedding\n",
|
||
"from fastNLP import Vocabulary\n",
|
||
"\n",
|
||
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
|
||
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
|
||
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)\n",
|
||
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
|
||
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"1 out of 4 words have frequency less than 2.\n",
|
||
"tensor([[ 0.4724, -0.7277, -0.6350, -0.5258, -0.6063]],\n",
|
||
" grad_fn=<EmbeddingBackward>)\n",
|
||
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
|
||
" grad_fn=<EmbeddingBackward>)\n",
|
||
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
|
||
" grad_fn=<EmbeddingBackward>)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP.embeddings import StaticEmbedding\n",
|
||
"from fastNLP import Vocabulary\n",
|
||
"\n",
|
||
"vocab = Vocabulary().add_word_lst(\"the the the a\".split())\n",
|
||
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
|
||
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2)\n",
|
||
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
|
||
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
|
||
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"0 out of 5 words have frequency less than 2.\n",
|
||
"All word in the vocab have been lowered. There are 5 words, 4 unique lowered words.\n",
|
||
"tensor([[ 0.1943, 0.3739, 0.2769, -0.4746, -0.3181]],\n",
|
||
" grad_fn=<EmbeddingBackward>)\n",
|
||
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
|
||
" grad_fn=<EmbeddingBackward>)\n",
|
||
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
|
||
" grad_fn=<EmbeddingBackward>)\n",
|
||
"tensor([[-0.1348, -0.2172, -0.0071, 0.5704, -0.2607]],\n",
|
||
" grad_fn=<EmbeddingBackward>)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP.embeddings import StaticEmbedding\n",
|
||
"from fastNLP import Vocabulary\n",
|
||
"\n",
|
||
"vocab = Vocabulary().add_word_lst(\"the the the a A\".split())\n",
|
||
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
|
||
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2, lower=True)\n",
|
||
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
|
||
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
|
||
"print(embed(torch.LongTensor([vocab.to_index('A')])))\n",
|
||
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python Now",
|
||
"language": "python",
|
||
"name": "now"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.0"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|