PyTorch IsRead Predictor on my email
This is a GRU-based RNN classifier that predicts, from the subject line alone, the probability that I will read an incoming email, trained on three days of my own mail.
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset,DataLoader
import torch.nn as nn
input_file = r'mydata_3days.tsv'
df = pd.read_csv(input_file, sep='\t', header = None)
df.columns = ['subject','isread','time']
df = df[df.subject.notna()]
print(len(df))
df.head(5)
print(df.groupby('isread').count())
df.loc[df.isread].head(2)
# 60/40 train/val split (the 60/20/20 train/val/test variant is left commented out below)
train, validate = np.split(df.sample(frac=1), [int(.6*len(df))])
train['split'] = 'train'
validate['split'] = 'val'
#test['split'] = 'test'
df = pd.concat([train,validate])
print(df.groupby('split')['isread'].count())
print('\nTotal isreads\n')
print(df.groupby('split')['isread'].sum())
df.head()
1218
        subject  time
isread
False      1090  1090
True        128   128

split
train    730
val      488
Name: isread, dtype: int64

Total isreads

split
train    69.0
val      59.0
Name: isread, dtype: float64
|     | subject | isread | time | split |
|-----|---------|--------|------|-------|
| 694 | PR - Updating package InventoryEntities from v... | False | 2019-06-24T16:30:11.0000000Z | train |
| 588 | PR - Updating build version to v16.02.1397.000... | False | 2019-06-24T14:01:54.0000000Z | train |
| 156 | Your scheduled experiment submission ( 09bd68e... | False | 2019-06-23T15:30:44.0000000Z | train |
| 436 | RE: Azure Cognitive Service Form Recognizer | False | 2019-06-24T05:34:50.0000000Z | train |
| 287 | PR - Updating package TorusGriffinSecrets from... | False | 2019-06-25T06:58:06.0000000Z | train |
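The labels are heavily imbalanced (1090 unread vs. 128 read, roughly 8.5:1), which is worth keeping in mind when reading the accuracy numbers later. One common mitigation, not used in this run and shown here only as a sketch, is to up-weight the positive class in the loss:

# Optional: offset the ~8.5:1 class imbalance by up-weighting the positive (read) class.
# A common heuristic is pos_weight = (#negatives / #positives).
pos_weight = torch.tensor([1090.0 / 128.0])
weighted_loss = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)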
class Tokenizer():
    def __init__(self, token_to_index, maxlen=10):
        self.token_to_index = token_to_index
        self.index_to_token = {index: token for token, index in token_to_index.items()}
        self.maxlen = maxlen

    @staticmethod
    def tokenize(text):
        # Lowercase and split on whitespace.
        text = text.lower()
        tokens = text.split()
        return tokens

    def tokens_to_tensor(self, tokens):
        # Fixed-length vector: index 0 (<PAD>) fills unused slots,
        # index 1 (<UNK>) stands in for out-of-vocabulary tokens.
        tensor = np.zeros((self.maxlen,), dtype=np.int64)
        for i in range(min(len(tokens), self.maxlen)):
            tensor[i] = self.token_to_index.get(tokens[i], 1)
        return tensor

    def text_to_tensor(self, text):
        tokens = self.tokenize(text)
        indexes = self.tokens_to_tensor(tokens)
        return indexes

    @classmethod
    def create_tokenizer_from_df(cls, df):
        # Reserve index 0 for padding and index 1 for unknown tokens.
        token_to_index = {'<PAD>': 0, '<UNK>': 1}
        lengths = []
        for subject in df.subject.values:
            if type(subject) is str:
                tokens = cls.tokenize(subject)
                lengths.append(len(tokens))
                for token in tokens:
                    if token not in token_to_index:
                        token_to_index[token] = len(token_to_index)
        # Cap the sequence length at the 90th percentile of subject lengths.
        lengths.sort()
        max_len = lengths[int(len(lengths) * 0.9)]
        print('maxlen = ', max_len)
        return cls(token_to_index, max_len)
%time tokenizer = Tokenizer.create_tokenizer_from_df(df)
print('vocab len = ', len(tokenizer.token_to_index))
tokenizer.text_to_tensor('Hello , how are you ?')
maxlen = 15
CPU times: user 3.46 ms, sys: 0 ns, total: 3.46 ms
Wall time: 3.24 ms
vocab len = 2518
array([ 1, 1, 79, 325, 218, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0])
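The mapping can be sanity-checked by decoding the tensor back through index_to_token; a quick sketch, showing that out-of-vocabulary words ('hello', ',', '?') collapse to index 1 (<UNK>) while the trailing zeros decode to <PAD>:

# Round-trip the tensor above back into tokens (sketch).
tensor = tokenizer.text_to_tensor('Hello , how are you ?')
print([tokenizer.index_to_token[int(i)] for i in tensor])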
Dataset
class EmailDataSets(Dataset):
    def __init__(self, df, tokenizer):
        super(EmailDataSets, self).__init__()
        self.df = df
        self.active_df = df[df['split'] == 'train']
        self.tokenizer = tokenizer

    def set_split(self, split):
        # Use the stored frame, not the global df.
        self.active_df = self.df[self.df['split'] == split]

    def __getitem__(self, index):
        row = self.active_df.iloc[index]
        subject_tensor = self.tokenizer.text_to_tensor(row['subject'])
        label = np.array(1 if row['isread'] else 0)
        return subject_tensor, label

    def __len__(self):
        return len(self.active_df)
dataset = EmailDataSets(df, tokenizer)
print(dataset[0])
dataloader = DataLoader(dataset,batch_size=10)
for x, y in dataloader:
    print('x = ', x.shape, 'y = ', y.shape)
    break
(array([ 2, 3, 4, 5, 6, 7, 8, 3, 9, 10, 11, 12, 13, 0, 0]), array(0))
x = torch.Size([10, 15]) y = torch.Size([10])
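set_split switches which rows the dataset serves, so the same object can back both the training and validation loaders; a quick check of the split sizes (sketch):

dataset.set_split('val')
print(len(dataset))    # 488 validation rows
dataset.set_split('train')
print(len(dataset))    # 730 training rows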
Check for GPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device('cpu')
print(device)
cuda:0
Classifier
class IsReadClassifier_RNN(nn.Module):
    def __init__(self, vocab_len, seq_len, hidden_dim, embedding_dim, num_layers=1):
        super(IsReadClassifier_RNN, self).__init__()
        self.embed = nn.Embedding(vocab_len, embedding_dim)
        self.GRU = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True,
                          bidirectional=False, num_layers=num_layers)
        self.fc1 = nn.Linear(hidden_dim, 1)

    def forward(self, x, apply_sigmoid=False):
        '''
        args
            x = shape (batch, seq_len)
        Returns logits by default; BCEWithLogitsLoss applies the sigmoid itself.
        '''
        yhat = self.embed(x)
        _, yhat = self.GRU(yhat)        # h_n: (num_layers * num_directions, batch, hidden_size)
        yhat = yhat.permute(1, 0, 2)    # (batch, num_layers, hidden_size)
        yhat = yhat[:, -1, :]           # final hidden state of the top GRU layer
        yhat = self.fc1(yhat)
        if apply_sigmoid:
            yhat = torch.sigmoid(yhat)
        return yhat.squeeze()
vocab_len = len(tokenizer.token_to_index)
c = IsReadClassifier_RNN(vocab_len, tokenizer.maxlen, 16, 64, 2)
print(c)
c.to(device)
for x, y in dataloader:
    x = x.to(device)
    y = y.to(device)
    print(x.dtype)
    yhat = c(x, apply_sigmoid=True)
    print(yhat.dtype)
    print(yhat, y)
    break
IsReadClassifier_RNN(
(embed): Embedding(2518, 64)
(GRU): GRU(64, 16, num_layers=2, batch_first=True)
(fc1): Linear(in_features=16, out_features=1, bias=True)
)
torch.int64
torch.float32
tensor([0.5130, 0.5123, 0.5351, 0.5406, 0.5145, 0.5351, 0.5476, 0.5396, 0.5351,
0.5395], device='cuda:0', grad_fn=<SqueezeBackward0>) tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
def calculate_val_loss(model, dataset):
    model.eval()
    dataset.set_split('val')
    dataloader = DataLoader(dataset, batch_size=64)
    loss = torch.nn.BCEWithLogitsLoss()
    total, correct, losses = 0, 0, []
    for x, y in dataloader:
        with torch.no_grad():
            x = x.to(device)
            y = y.to(device, dtype=torch.float32)
            yhat = model(x)
            losses.append(loss(yhat, y).item())
            # The model emits logits; logit > 0 is equivalent to probability > 0.5.
            preds = (yhat > 0).float()
            correct += torch.sum(y == preds).item()
            total += len(y)
    loss_avg = sum(losses) / len(losses)
    dataset.set_split('train')
    model.train()
    return loss_avg, correct / total
Start training
%%time
model = IsReadClassifier_RNN(vocab_len, tokenizer.maxlen, 16, 64, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 30
losses = []
loader = DataLoader(dataset, batch_size=16)
loss = torch.nn.BCEWithLogitsLoss()
model.to(device)
model.train()
for e in range(epochs):
    for x, y in loader:
        x = x.to(device)
        y = y.to(device, dtype=torch.float32)   # convert labels to float for BCEWithLogitsLoss
        optimizer.zero_grad()
        yhat = model(x)
        output = loss(yhat, y)
        output.backward()
        optimizer.step()
        losses.append(output.item())
    vloss, vacc = calculate_val_loss(model, dataset)
    print('loss = {}, vloss = {}, vacc = {}'.format(losses[-1], vloss, vacc))
loss = 0.14965474605560303, vloss = 0.41250869259238243, vacc = 0.8770491803278688
loss = 0.12384742498397827, vloss = 0.38864874467253685, vacc = 0.8790983606557377
loss = 0.11042094230651855, vloss = 0.3658701665699482, vacc = 0.8790983606557377
loss = 0.09510580450296402, vloss = 0.34096507355570793, vacc = 0.8790983606557377
loss = 0.0777280330657959, vloss = 0.2963421009480953, vacc = 0.8790983606557377
loss = 0.05483933165669441, vloss = 0.2588787507265806, vacc = 0.8790983606557377
loss = 0.030554382130503654, vloss = 0.2376303467899561, vacc = 0.9262295081967213
loss = 0.020488834008574486, vloss = 0.2503366619348526, vacc = 0.9139344262295082
loss = 0.014973670244216919, vloss = 0.23572852555662394, vacc = 0.9344262295081968
loss = 0.012415190227329731, vloss = 0.25379848945885897, vacc = 0.930327868852459
loss = 0.009942229837179184, vloss = 0.2555901985615492, vacc = 0.9323770491803278
loss = 0.00861263182014227, vloss = 0.26605359092354774, vacc = 0.9323770491803278
loss = 0.007612465415149927, vloss = 0.28402069583535194, vacc = 0.930327868852459
loss = 0.0064257122576236725, vloss = 0.28632022719830275, vacc = 0.930327868852459
loss = 0.005462944973260164, vloss = 0.27554639894515276, vacc = 0.9364754098360656
loss = 0.005146912299096584, vloss = 0.3095714058727026, vacc = 0.9282786885245902
loss = 0.004429324995726347, vloss = 0.2790329046547413, vacc = 0.9405737704918032
loss = 0.003884026547893882, vloss = 0.28687122091650963, vacc = 0.9385245901639344
loss = 0.0036064968444406986, vloss = 0.3291930612176657, vacc = 0.9262295081967213
loss = 0.003162527456879616, vloss = 0.30498973093926907, vacc = 0.9385245901639344
loss = 0.002883601002395153, vloss = 0.3186510391533375, vacc = 0.9405737704918032
loss = 0.0029135930817574263, vloss = 0.30434839613735676, vacc = 0.9385245901639344
loss = 0.0032340672332793474, vloss = 0.3205806640908122, vacc = 0.9262295081967213
loss = 0.0027146299835294485, vloss = 0.31844725366681814, vacc = 0.9364754098360656
loss = 0.0024211949203163385, vloss = 0.3328682454302907, vacc = 0.9323770491803278
loss = 0.00218278169631958, vloss = 0.34755814447999, vacc = 0.9282786885245902
loss = 0.0019903674256056547, vloss = 0.36618472915142775, vacc = 0.9241803278688525
loss = 0.00183000264223665, vloss = 0.3771845642477274, vacc = 0.9241803278688525
loss = 0.0016959089552983642, vloss = 0.3852106425911188, vacc = 0.9241803278688525
loss = 0.0015792440390214324, vloss = 0.391275723464787, vacc = 0.9241803278688525
CPU times: user 15 s, sys: 256 ms, total: 15.3 s
Wall time: 15.3 s
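Note that the flat validation accuracy of ~0.879 over the first six epochs is exactly the majority-class baseline: 429 of the 488 validation emails are unread, and 429/488 ≈ 0.879, so the model only starts beating "predict everything unread" around epoch 7. A quick check (sketch):

val_df = df[df['split'] == 'val']
print((~val_df.isread).mean())   # ~0.879: accuracy of always predicting "not read"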
def predict(model, text, tokenizer):
    model.to(torch.device('cpu')).eval()
    with torch.no_grad():
        x = tokenizer.text_to_tensor(text)
        x = torch.from_numpy(x).unsqueeze(0)    # add a batch dimension
        yhat = model(x, apply_sigmoid=True)
    return yhat.item()
predict(model, 'hi', dataset.tokenizer)
for index, row in df[df['split'] == 'val'].head(20).iterrows():
    print('{}\t{:f}\t{}'.format(row.isread, predict(model, row.subject, dataset.tokenizer), row.subject + ' ...'))
True 0.992817 Your video has finished processing - "PreTraining AML-BERT code on Matrix" ...
False 0.001732 FS: Mega bloks table and building bag. ...
False 0.002356 <WTS> Kid's bike (12inch wheel size) ...
False 0.001922 Everlast 100lb. Heavy Bag ...
True 0.993263 INFORM: INC22717582 training was reset abruptly ...
False 0.003949 Re: Kenmore Coldspot 58289891 refrigerator ...
False 0.002657 Google home ...
True 0.990992 RE: Code for BERT large training? ...
False 0.001544 RE: Double Double: ML.NET and Auto ML ...
False 0.001425 RE: [PROD] Sev 3: ID 129379890: [WpA] [PROD] Tenants Not Delivered ...
False 0.990373 Desk - $40 OBO ...
False 0.001392 PR - Updating package LockBoxClient from version 16.... - MARS 403278 (MPU Build Account) ...
False 0.001342 Your scheduled experiment submission ( 6bf8dba3-09c1-42a3-8cf9-0ebf120b845e ) is skipped. ...
False 0.001565 RE: FS: 6.5kW Diesel Generator, ~100 hours, needs new battery - $500 ...
False 0.001382 PR - Pyspark script for Tenant Details - MARS 404211 (Sneha Saran) ...
True 0.002787 6/24-6/25 full day workshop 6/27 OOF ...
True 0.003230 Need training data? ...
False 0.001369 PR - Log improvement - MARS 404645 (Raj Srivastava) ...
False 0.001389 PR - Updating package InventoryEntities from version... - MARS 404508 (MPU Build Account) ...
False 0.001522 PR - Read TLC batch object from config file and Anan... - MARS 400619 (Komal Gyanani) ...
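This sample already shows what accuracy hides: two genuinely read emails score near zero (the workshop and "Need training data?" subjects) and one unread marketplace listing scores 0.99. With only 59 positives among the 488 validation rows, per-class precision and recall are more informative than accuracy; a minimal sketch, assuming scikit-learn is installed:

from sklearn.metrics import classification_report

val_df = df[df['split'] == 'val']
preds = [predict(model, s, dataset.tokenizer) > 0.5 for s in val_df.subject]
print(classification_report(val_df.isread.values, preds))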