PyTorch BERT
#! pip install pytorch-pretrained-bert
Using BERT
Tokenizer
See how the BERT tokenizer works. Tutorial source: Huggingface BERT repo
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.ERROR)
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
len(tokenizer.wordpiece_tokenizer.vocab), tokenizer.convert_tokens_to_ids(['[MASK]','[CLS]','[SEP]'])
(30522, [103, 101, 102])
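The tokenizer chains a basic lower-casing/punctuation splitter with a WordPiece tokenizer, so out-of-vocabulary words are broken into known subword pieces ('##' marks a continuation piece). A quick check; the splits and ids in the comments are what bert-base-uncased is expected to produce (they match the tensors further below):
# 'puppeteer' is not in the vocabulary, so it falls back to subword pieces
print(tokenizer.tokenize('puppeteer'))                        # expected: ['puppet', '##eer']
print(tokenizer.convert_tokens_to_ids(['puppet', '##eer']))   # expected: [13997, 11510]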
text = "[CLS] Who was Jack Wood ? [SEP] Jack Wood was a puppeteer . [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)
# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['[CLS]', 'who', 'was', 'jack', 'wood', '?', '[SEP]', 'jack', '[MASK]', 'was', 'a', 'puppet', '##eer', '.', '[SEP]']
['[CLS]', 'who', 'was', 'jack', 'wood', '?', '[SEP]', 'jack', 'wood', 'was', 'a', 'puppet', '##eer', '.', '[SEP]']
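Hard-coding masked_index = 8 works for this sentence, but once the mask is in place the position can also be recovered from the token list; a one-line check:
# the masked position can be looked up instead of hard-coded
assert tokenized_text.index('[MASK]') == masked_index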
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
segment_tensor = torch.tensor([segments_ids])
tokens_tensor, segment_tensor
(tensor([[ 101, 2040, 2001, 2990, 3536, 1029, 102, 2990, 103, 2001,
1037, 13997, 11510, 1012, 102]]),
tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]))
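The segment ids above are written out by hand; for longer inputs it is easier to derive them from the [SEP] positions. A minimal sketch (the helper name make_segment_ids is mine): everything up to and including the first [SEP] belongs to sentence A (0), the rest to sentence B (1).
def make_segment_ids(tokens):
    # sentence A (0) up to and including the first [SEP], sentence B (1) afterwards
    segment_ids, current = [], 0
    for token in tokens:
        segment_ids.append(current)
        if token == '[SEP]':
            current = 1
    return segment_ids

assert make_segment_ids(tokenized_text) == segments_ids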
Run the base BERT model
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()
print(model)
INFO:pytorch_pretrained_bert.modeling:Model config {
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 30522
}
BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(30522, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): BertLayerNorm()
(dropout): Dropout(p=0.1)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): BertLayerNorm()
(dropout): Dropout(p=0.1)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): BertLayerNorm()
(dropout): Dropout(p=0.1)
)
)
... (layers (1) to (10) omitted; 12 identical BertLayer blocks in total) ...
(11): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): BertLayerNorm()
(dropout): Dropout(p=0.1)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): BertLayerNorm()
(dropout): Dropout(p=0.1)
)
)
)
)
(pooler): BertPooler(
(dense): Linear(in_features=768, out_features=768, bias=True)
(activation): Tanh()
)
)
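As a rough sanity check on the architecture printed above (12 layers, hidden size 768, 12 heads), count the parameters with plain PyTorch; bert-base-uncased should come out at roughly 110M:
# total number of parameters in the loaded BertModel
n_params = sum(p.numel() for p in model.parameters())
print('{:.1f}M parameters'.format(n_params / 1e6))   # expected: roughly 110M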
Move to GPU/CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokens_tensor = tokens_tensor.to(device)
segment_tensor = segment_tensor.to(device)
model = model.to(device)
%%time
with torch.no_grad():
hidden_states, pooled_state = model(tokens_tensor, segment_tensor)
print('Total hidden states = {}, \n Hidden shape 1 (batch * seq_len * hidden_dim) = {}, \n\
Pooled state (layer over [CLS]) shape = {}'
.format(len(hidden_states), hidden_states[0].shape, pooled_state.shape))
Total hidden states = 12,
Hidden shape 1 (batch * seq_len * hidden_dim) = torch.Size([1, 15, 768]),
Pooled state (layer over [CLS]) shape = torch.Size([1, 768])
Wall time: 116 ms
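By default BertModel returns the hidden states of all 12 encoder layers as a list, which is why len(hidden_states) is 12 above. If only the final layer is needed, pytorch_pretrained_bert accepts an output_all_encoded_layers flag, in which case the first return value is a single tensor; a small sketch:
with torch.no_grad():
    last_hidden, pooled = model(tokens_tensor, segment_tensor,
                                output_all_encoded_layers=False)
print(last_hidden.shape)   # expected: torch.Size([1, 15, 768])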
# check how the pooled state relates to the first token ([CLS]) of the last hidden state
print(hidden_states[-1][0][0][:10])
print(pooled_state[0][:10])
tensor([-0.3427, 0.4647, -0.6185, -0.1858, -0.5258, -0.0140, 0.5476, 0.4695,
-0.2122, 0.1675])
tensor([-0.9796, -0.6209, -0.9077, 0.9408, 0.6961, -0.4413, 0.9881, 0.5688,
-0.7937, -1.0000])
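The two vectors differ because the pooled state is not a copy of the [CLS] hidden state: the BertPooler shown in the model print applies a dense layer plus Tanh to it. A quick check that reproduces the pooled output by hand, assuming the pooler does nothing beyond that dense + Tanh:
# reproduce the pooler: dense + tanh over the [CLS] position of the last layer
with torch.no_grad():
    manual_pooled = torch.tanh(model.pooler.dense(hidden_states[-1][:, 0]))
print(torch.allclose(manual_pooled, pooled_state, atol=1e-5))   # expected: True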
BERT for masked LM
BertForMaskedLM loads the same pre-trained bert-base-uncased checkpoint: the encoder weights are shared with the BertModel above, and the pre-trained masked-LM head is added on top; only the next-sentence-prediction head weights are discarded (see the log message below).
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
INFO:pytorch_pretrained_bert.modeling:Model config {
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 30522
}
INFO:pytorch_pretrained_bert.modeling:Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
with torch.no_grad():
predictions = model(tokens_tensor,segment_tensor)
print('prediction shape = ', predictions.shape) # batch, tokens, vocab size
prediction shape = torch.Size([1, 15, 30522])
Check what it predicts for the masked word
token_index = torch.argmax(predictions[0, masked_index]).item()
print(tokenizer.ids_to_tokens[token_index])
wood
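The model recovers 'wood' for the masked position. Instead of only the argmax, it is often informative to inspect the top few candidates for the mask; a small sketch (the ranking beyond 'wood' is not recorded here):
# top 5 candidate tokens for the masked position
_, top_ids = torch.topk(predictions[0, masked_index], k=5)
print(tokenizer.convert_ids_to_tokens(top_ids.tolist()))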