#! pip install pytorch-pretrained-bert

Using BERT

BERT Paper

Tokenizer

See how the BERT tokenizer works. Tutorial source: Hugging Face BERT repo

import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM


# OPTIONAL: configure the logger; set the level to logging.INFO to see more of what's happening
import logging
logging.basicConfig(level=logging.ERROR)
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
len(tokenizer.wordpiece_tokenizer.vocab), tokenizer.convert_tokens_to_ids(['[MASK]','[CLS]','[SEP]'])
(30522, [103, 101, 102])
text = "[CLS] Who was Jack Wood ? [SEP] Jack Wood was a puppeteer . [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['[CLS]', 'who', 'was', 'jack', 'wood', '?', '[SEP]', 'jack', '[MASK]', 'was', 'a', 'puppet', '##eer', '.', '[SEP]']
['[CLS]', 'who', 'was', 'jack', 'wood', '?', '[SEP]', 'jack', 'wood', 'was', 'a', 'puppet', '##eer', '.', '[SEP]']
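For reference, the hard-coded masked_index = 8 can also be derived programmatically. A minimal sketch (illustrative variable names) that re-tokenizes the original text and locates the second 'wood', i.e. the one after the first [SEP]:

fresh_tokens = tokenizer.tokenize(text)
sep_position = fresh_tokens.index('[SEP]')
mask_position = fresh_tokens.index('wood', sep_position + 1)
print(mask_position)  # 8, matching the hard-coded masked_index above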
# Define the sentence A and B indices associated with the 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

tokens_tensor = torch.tensor([indexed_tokens])
segment_tensor = torch.tensor([segments_ids])
tokens_tensor, segment_tensor
(tensor([[  101,  2040,  2001,  2990,  3536,  1029,   102,  2990,   103,  2001,
           1037, 13997, 11510,  1012,   102]]),
 tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]))
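The segment ids above were written out by hand; a minimal sketch that derives them from the position of the first [SEP] (tokens up to and including it belong to sentence A):

first_sep = tokenized_text.index('[SEP]')
derived_segments = [0 if i <= first_sep else 1 for i in range(len(tokenized_text))]
assert derived_segments == segments_ids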

Run the BERT model

model = BertModel.from_pretrained('bert-base-uncased')
model.eval()
print(model)
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): BertLayerNorm()
          (dropout): Dropout(p=0.1)
        )
      )
      ... (layers (1) through (10) omitted; 12 BertLayer blocks in total) ...
      (11): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): BertLayerNorm()
          (dropout): Dropout(p=0.1)
        )
      )
    )
  )
  (pooler): BertPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
  )
)
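A couple of quick sanity checks against the printed architecture (a sketch; the parameter count is approximate):

# 12 encoder layers and roughly 110 million parameters for bert-base-uncased
print(len(model.encoder.layer))                          # 12
print(sum(p.numel() for p in model.parameters()) / 1e6)  # ~109.5 (millions of parameters)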

Move to GPU/CPU

device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokens_tensor = tokens_tensor.to(device)
segment_tensor = segment_tensor.to(device)
model = model.to(device)
%%time
with torch.no_grad():
    hidden_states, pooled_state = model(tokens_tensor, segment_tensor)
    print('Total hidden states = {}, \n Hidden state shape (batch * seq_len * hidden_dim) = {}, \n\
          Pooled state (dense layer over [CLS]) shape = {}'
          .format(len(hidden_states), hidden_states[0].shape, pooled_state.shape))
Total hidden states = 12, 
 Hidden state shape (batch * seq_len * hidden_dim) = torch.Size([1, 15, 768]), 
          Pooled state (dense layer over [CLS]) shape = torch.Size([1, 768])
Wall time: 116 ms
# check that the pooled state is a dense + tanh layer applied to the first token ([CLS]) of the last hidden state
print(hidden_states[-1][0][0][:10])
print(pooled_state[0][:10])
tensor([-0.3427,  0.4647, -0.6185, -0.1858, -0.5258, -0.0140,  0.5476,  0.4695,
        -0.2122,  0.1675])
tensor([-0.9796, -0.6209, -0.9077,  0.9408,  0.6961, -0.4413,  0.9881,  0.5688,
        -0.7937, -1.0000])
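To confirm that, a sketch that reproduces the pooled output manually using the pooler attributes shown in the printed module tree above (dense + Tanh applied to the [CLS] vector of the last hidden layer):

with torch.no_grad():
    cls_vector = hidden_states[-1][:, 0]                       # shape [1, 768]
    manual_pooled = torch.tanh(model.pooler.dense(cls_vector))
print(torch.allclose(manual_pooled, pooled_state, atol=1e-5))  # expected: True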

Bert for masked LM

BertForMaskedLM loads the same pretrained weights (bert-base-uncased) and adds a masked language modeling head; the next-sentence-prediction weights are discarded, as the log below shows.

model = BertForMaskedLM.from_pretrained('bert-base-uncased')
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:pytorch_pretrained_bert.modeling:Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']

model.eval()
model = model.to(device)  # move to the same device as tokens_tensor
with torch.no_grad():
    predictions = model(tokens_tensor, segment_tensor)
print('prediction shape = ', predictions.shape) # batch, seq_len, vocab size
prediction shape =  torch.Size([1, 15, 30522])

Check what it predicts for the masked word


token_index = torch.argmax(predictions[0, masked_index]).item()
print(tokenizer.ids_to_tokens[token_index])
wood
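Beyond the single argmax, a sketch that inspects the top-5 candidate tokens for the masked position:

top_scores, top_ids = torch.topk(predictions[0, masked_index], k=5)
print(tokenizer.convert_ids_to_tokens(top_ids.tolist()))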