Word analogy using GloVe Embeddings
Word Embeddings
Word embeddings are lower-dimensional dense representations of words. They are generally learned in a self-supervised fashion, with an objective such as predicting the next word, predicting a masked word, or scoring the relevance of two given words. Trained on a huge corpus of public data such as books and Wikipedia, word embeddings capture the features of a word in a lower-dimensional space. They are also useful for measuring similarity between two words, since similar words end up with similar feature vectors.
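As a small illustration of that similarity property, cosine similarity between two embedding vectors can be computed with plain NumPy. This is only a sketch; the two toy three-dimensional vectors below are made-up stand-ins for real embedding rows:
import numpy as np

def cosine_similarity(a, b):
    # cos(theta) = (a . b) / (|a| * |b|); values near 1 mean very similar directions
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Made-up toy vectors standing in for rows of a real embedding table
v_cat = np.array([0.2, -0.1, 0.5])
v_dog = np.array([0.25, -0.05, 0.45])
print(cosine_similarity(v_cat, v_dog))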
The example code below loads a word-embedding file into memory and then finds analogies between words using vector arithmetic on the embeddings.
import os
from argparse import Namespace

import numpy as np
from annoy import AnnoyIndex

args = Namespace(
    embedding_folder=r'C:\Users\krkusuk\Downloads\glove.6B',
    embedding_file=r'glove.6B.50d.txt'
)
class PretrainedEmbeddings(object):
    def __init__(self, word_to_index, word_vectors):
        self.word_to_index = word_to_index
        self.word_vectors = word_vectors
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}
        # Approximate nearest-neighbour index over all word vectors
        self.index = AnnoyIndex(len(word_vectors[0]), metric='euclidean')
        for _, i in self.word_to_index.items():
            self.index.add_item(i, self.word_vectors[i])
        # 50 is the number of trees; more trees give higher precision
        self.index.build(50)

    @classmethod
    def from_embedding_file(cls, filepath):
        # Each line of a GloVe file is a word followed by its vector components
        word_to_index = {}
        word_vectors = []
        with open(filepath, encoding='UTF8') as f:
            for line in f:
                cols = line.split(' ')
                word = cols[0]
                embedding = np.array([float(x) for x in cols[1:]])
                word_to_index[word] = len(word_to_index)
                word_vectors.append(embedding)
        return cls(word_to_index, word_vectors)

    def get_word_vector(self, word):
        return self.word_vectors[self.word_to_index[word]]

    def get_nearest_neighbours(self, embedding, n=1):
        # Return the n words whose vectors are closest to the given embedding
        nn_indices = self.index.get_nns_by_vector(embedding, n)
        return [self.index_to_word[i] for i in nn_indices]
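Annoy returns approximate neighbours. For sanity-checking (or for small vocabularies) an exact brute-force search is easy to write with NumPy; this helper is an illustration alongside the class above, not part of the original code:
def exact_nearest_neighbours(emb, query, n=1):
    # Brute-force Euclidean search over every vector: exact, but O(vocabulary size)
    vectors = np.stack(emb.word_vectors)
    distances = np.linalg.norm(vectors - query, axis=1)
    return [emb.index_to_word[i] for i in np.argsort(distances)[:n]]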
Load word embeddings from file
The GloVe embeddings can be downloaded from http://nlp.stanford.edu/data/glove.6B.zip
file = os.path.join(args.embedding_folder, args.embedding_file)
embedding = PretrainedEmbeddings.from_embedding_file(file)
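A quick sanity check after loading: the vector dimensionality should match the file (50 for glove.6B.50d.txt), while the exact neighbours printed depend on the vectors and the approximate index:
print(len(embedding.get_word_vector('cat')))  # expect 50 for the 50d file
print(embedding.get_nearest_neighbours(embedding.get_word_vector('cat'), 3))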
Find word analogies
def find_analogy(w1, w2, w3, n):
    # Solve w1:w2 :: w3:w4, i.e. w4 = w3 + w2 - w1
    e1 = embedding.get_word_vector(w1)
    e2 = embedding.get_word_vector(w2)
    e3 = embedding.get_word_vector(w3)
    e4 = e3 + e2 - e1
    nearest_words = embedding.get_nearest_neighbours(e4, n)
    # Drop the query word itself, which is often its own nearest neighbour
    nearest_words = [word for word in nearest_words if word != w3]
    for i, word in enumerate(nearest_words):
        print('{} > {}:{} :: {}:{}'.format(i + 1, w1, w2, w3, word))
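As a spot check before the batch run below, the classic king/queen analogy can be queried directly. With glove.6B.50d this usually surfaces 'queen' among the top neighbours, though the exact ranking depends on the vectors and the approximate index:
find_analogy('man', 'king', 'woman', 5)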
wordsets = [
    ['cat', 'dog', 'tiger'],
    ['practical', 'impractical', 'freedom'],
    ['love', 'fight', 'marriage'],
    ['usa', 'mexico', 'india'],
]

for wordset in wordsets:
    print()
    find_analogy(wordset[0], wordset[1], wordset[2], 5)
1 > cat:dog :: tiger:hunt
2 > cat:dog :: tiger:hunting
3 > cat:dog :: tiger:woods
4 > cat:dog :: tiger:horse

1 > practical:impractical :: freedom:declaring
2 > practical:impractical :: freedom:proclaimed
3 > practical:impractical :: freedom:traitor
4 > practical:impractical :: freedom:powerless
5 > practical:impractical :: freedom:traitors

1 > love:fight :: marriage:ruled
2 > love:fight :: marriage:rule
3 > love:fight :: marriage:immunity
4 > love:fight :: marriage:abortion
5 > love:fight :: marriage:civil

1 > usa:mexico :: india:province
2 > usa:mexico :: india:provinces
3 > usa:mexico :: india:indonesia
4 > usa:mexico :: india:thailand
5 > usa:mexico :: india:cambodia