Get model working (basically)

2023-12-13 11:57:48 +02:00
parent 5dd850d1cb
commit fe7870e9d4
4 changed files with 305 additions and 50 deletions
--- a/africat/categorise.py
+++ b/africat/categorise.py
@@ -13,6 +13,7 @@ import csv
 import random
 import pandas as pd
 import numpy as np
 import itertools
 #from pandarallel import pandarallel
 from tqdm import tqdm
 # torch
@@ -23,11 +24,14 @@ from torchtext.vocab import build_vocab_from_iterator
 from torch.utils.data import Dataset, DataLoader
 from torch import nn
-story_num = 40 # XXX None for all
+from models.rnn import RNN
 story_num = 64 # XXX None for all
 # Hyperparameters
 EPOCHS = 10       # epoch
-LR = 5            # learning rate
+#LR = 5            # learning rate
 LR = 0.005 # initial learning rate; too small may result in a long training process that could get stuck, whereas a value too large may result in learning a sub-optimal set of weights too fast or an unstable training process -- perhaps the most important hyperparameter. If you have time to tune only one hyperparameter, tune the learning rate
 BATCH_SIZE = 64   # batch size for training
 def read_csv(input_csv, rows=None, verbose=0):
@@ -126,7 +130,7 @@ class TextCategoriesDataset(Dataset):
    self.text_vocab = build_vocab_from_iterator(
      [self.textTokens(text) for i, text in self.df[text_column].items()],
      min_freq=2,
-      specials= self.itos.values(),
+      specials=self.itos.values(),
      special_first=True
    )
    self.text_vocab.set_default_index(self.text_vocab['<unk>'])
@@ -135,7 +139,7 @@ class TextCategoriesDataset(Dataset):
    self.cats_vocab = build_vocab_from_iterator(
      [self.catTokens(cats) for i, cats in self.df[cats_column].items()],
      min_freq=1,
-      specials= self.itos.values(),
+      specials=['<unk>'],
      special_first=True
    )
    self.cats_vocab.set_default_index(self.cats_vocab['<unk>'])
@@ -162,8 +166,8 @@ class TextCategoriesDataset(Dataset):
    # Numericalise by applying transforms
    return (
-      self.getTransform(self.text_vocab)(self.textTokens(text)),
+      self.getTransform(self.text_vocab, "text")(self.textTokens(text)),
-      self.getTransform(self.cats_vocab)(self.catTokens(cats)),
+      self.getTransform(self.cats_vocab, "cats")(self.catTokens(cats)),
    )
  @staticmethod
@@ -178,26 +182,32 @@ class TextCategoriesDataset(Dataset):
    elif isinstance(cats, list):
      return [cat for cat in cats]
-  def getTransform(self, vocab):
+  def getTransform(self, vocab, vType):
    '''
    Create transforms based on given vocabulary. The returned transform
    is applied to a sequence of tokens.
    '''
-    return T.Sequential(
+    if vType == "text":
-      # converts the sentences to indices based on given vocabulary
+      return T.Sequential(
-      T.VocabTransform(vocab=vocab),
+        # converts the sentences to indices based on given vocabulary
-      # Add <sos> at beginning of each sentence. 1 because the index
+        T.VocabTransform(vocab=vocab),
-      # for <sos> in vocabulary is 1 as seen in previous section
+        # Add <sos> at beginning of each sentence. 1 because the index
-      T.AddToken(1, begin=True),
+        # for <sos> in vocabulary is 1 as seen in previous section
-      # Add <eos> at beginning of each sentence. 2 because the index
+        T.AddToken(self.text_vocab['<sos>'], begin=True),
-      # for <eos> in vocabulary is 2 as seen in previous section
+        # Add <eos> at end of each sentence. 2 because the index
-      T.AddToken(2, begin=False)
+        # for <eos> in vocabulary is 2 as seen in previous section
-    )
+        T.AddToken(self.text_vocab['<eos>'], begin=False)
      )
    else:
      return T.Sequential(
        # converts the sentences to indices based on given vocabulary
        T.VocabTransform(vocab=vocab),
      )
 '''
-  Now that we have a dataset, let's create dataloader,
+  Now that we have a dataset, let's create a dataloader callback;
-  which can batch, shuffle, and load the data in parallel
+  the dataloader can batch, shuffle, and load the data in parallel
 '''
 class CollateBatch:
@@ -207,41 +217,105 @@ class CollateBatch:
  which returns a tensor
  '''
  def __init__(self, pad_idx):
    '''
      pad_idx (int):  the index of the "<pad>" token in the vocabulary.
    '''
    self.pad_idx = pad_idx
  def __call__(self, batch):
-    # T.ToTensor(0) returns a transform that converts the sequence
+    '''
-    # to a torch.tensor and also applies padding.
+      batch: a list of tuples with (text, cats), each of which
-    #
+             is a list of tokens
-    # pad_idx is passed to the constructor to specify the index of
+    '''
-    # the "<pad>" token in the vocabulary.
+    batch_text, batch_cats = zip(*batch)
    #for i in range(len(batch)):
    #  print(batch[i])
    #max_text_len = len(max(batch_text, key=len))
    #max_cats_len = len(max(batch_cats, key=len))
    #text_tensor = T.ToTensor(self.pad_idx)(batch_text)
    #cats_tensor = T.ToTensor(self.pad_idx)(batch_cats)
    # Pad text to the longest
    text_tensor = torch.nn.utils.rnn.pad_sequence(
      [torch.LongTensor(s) for s in batch_text],
      batch_first=True, padding_value=self.pad_idx
    )
    text_lengths = torch.tensor([t.shape[0] for t in text_tensor])
    #cats_tensor = torch.nn.utils.rnn.pad_sequence(
    #  [torch.LongTensor(s) for s in batch_cats],
    #  batch_first=True, padding_value=self.pad_idx
    #)
    #cats_lengths = torch.LongTensor(list(map(len, batch_cats)))
    # Pad cats_tensor to all possible categories
    # TODO will this be necessary with larger training sets, that should
    # encompass all categories? Best to be safe...
    all_cats = list(set(itertools.chain(*batch_cats)))
    num_cats = len(all_cats)
    # if there's no 0, there was no <unk>, so increment to allow for it to be a possible category
    if 0 not in all_cats:
      num_cats += 1
    cats_tensor = torch.full((len(batch_cats), num_cats), self.pad_idx).long()
    cats_lengths = torch.LongTensor(list(map(len, batch_cats)))
    for idx, (c, clen) in enumerate(zip(batch_cats, cats_lengths)):
        cats_tensor[idx, :clen] = torch.LongTensor(c)
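    # Sort the batch by descending text length: pack_padded_sequence() (used by
    # the RNN model) expects its input ordered that way unless it is called
    # with enforce_sorted=False.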
    text_lengths, perm_idx = text_lengths.sort(0, descending=True)
    text_tensor = text_tensor[perm_idx]
    cats_tensor = cats_tensor[perm_idx]
    #print(text_tensor)
    #print("text shape:", text_tensor.shape)
    #print(cats_tensor)
    #print("cats shape:", cats_tensor.shape)
    #print(text_lengths)
    #print("text_lengths shape:", text_lengths.shape)
    #sys.exit(0)
    return (
-      T.ToTensor(self.pad_idx)(list(batch[0])),
+      text_tensor,
-      T.ToTensor(self.pad_idx)(list(batch[1])),
+      cats_tensor,
      text_lengths,
    )
-class TextClassificationModel(nn.Module):
+def train(dataloader, model, optimizer, criterion):
  def __init__(self, input_size, output_size, verbose):
    super().__init__()
  def forward(self, x):
    return x
 def train(dataloader):
  model.train()
  total_acc, total_count = 0, 0
  log_interval = 500
  start_time = time.time()
-  for idx, (label, text) in enumerate(dataloader):
+  for idx, (text, cats, text_lengths) in enumerate(dataloader):
    optimizer.zero_grad()
-    predicted_label = model(text)
+
-    loss = criterion(predicted_label, label)
+    print("text_lengths shape", text_lengths.shape)
    print("input shape", text.shape)
    print("target", cats)
    print("target shape", cats.shape)
    output = model(text, text_lengths)
    print("output", output)
    print("output shape", output.shape)
    # reshape output and target for cross entropy loss
 #    output = output.reshape(output.size(0)*output.size(1), -1)  # (batch * seq_len x classes)
 #    cats = cats.reshape(-1)  # (batch * seq_len), class index
 #    print("output", output)
 #    print("output shape", output.shape)
 #    print("target shape", cats.shape)
 #    print()
    loss = criterion(input=output, target=cats)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
    optimizer.step()
    total_acc += (predicted_label.argmax(1) == label).sum().item()
    total_count += label.size(0)
    if idx % log_interval == 0 and idx > 0:
@@ -256,7 +330,7 @@ def train(dataloader):
      start_time = time.time()
-def evaluate(dataloader):
+def evaluate(dataloader, model, criterion):
  model.eval()
  total_acc, total_count = 0, 0
@@ -324,7 +398,8 @@ def main():
    lang_column="language",
    verbose=args.verbose,
  )
-  #print(dataset[2])
+  #for text, cat in enumerate(train_dataset):
  #  print(text, cat)
  #for text, cat in enumerate(valid_dataset):
  #  print(text, cat)
  #sys.exit(0)
@@ -361,24 +436,59 @@ def main():
  )
  #for i_batch, sample_batched in enumerate(dataloader):
  #  print(i_batch, sample_batched[0], sample_batched[1])
  #for i_batch, sample_batched in enumerate(train_dataloader):
    #print(i_batch, sample_batched[0], sample_batched[1])
    #print(i_batch)
    #print("batch elements:")
    #for i in sample_batched:
    #  print(i)
    #  print(i.shape)
    #  print("\n")
  #sys.exit(0)
  num_class = len(set([cats for key, cats, text, lang in train_data.values]))
  input_size = len(train_dataset.text_vocab)
-  output_size = len(train_dataset.cats_vocab)
+  output_size = len(train_dataset.cats_vocab) # every output item is the likelihood of a particular category
  emsize = 64
  model = TextClassificationModel(input_size, output_size, args.verbose).to(device)
  embed = torch.empty(input_size, len(train_dataset)) # tokens per sample x samples
  embedding_size = embed.size(1) # was 64 (should be: samples)
  num_layers = 2 # 2-3 layers should be enough for LSTM
  hidden_size = 128 # hidden size of rnn module, should be tweaked manually
  mean_seq = True # use mean of rnn output
  weight_decay = 1e-4 # helps the neural networks to learn smoother / simpler functions which most of the time generalizes better compared to spiky, noisy ones ; try 1e-3, 1e-4
  #for i in train_dataset.text_vocab.get_itos():
  #  print(i)
  print("input_size: ", input_size)
  print("output_size:", output_size)
  print("embed shape:", embed.shape)
  print("embedding_size:", embedding_size, " (that is, number of samples)")
  model = RNN(
    #rnn_model='GRU',
    rnn_model='LSTM',
    vocab_size=input_size,
    embed_size=embedding_size,
    num_output=output_size,
    use_last=(not mean_seq),
    hidden_size=hidden_size,
    embedding_tensor=embed,
    num_layers=num_layers,
    batch_first=True
  )
  print(model)
  # optimizer and loss
  #optimizer = torch.optim.SGD(model.parameters(), lr=LR)
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LR, weight_decay=weight_decay)
  print(criterion)
  print(optimizer)
  criterion = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.SGD(model.parameters(), lr=LR)
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
  total_accu = None
  for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
-    train(train_dataloader)
+    train(train_dataloader, model, optimizer, criterion)
-    accu_val = evaluate(valid_dataloader)
+    accu_val = evaluate(valid_dataloader, model, criterion)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
--- a/africat/models/classifier.py
+++ b/africat/models/classifier.py
@@ -0,0 +1,47 @@
 import torch
 import torch.nn as nn
 class RNN(nn.Module):
   """LSTM text classifier that emits one sigmoid score per output unit.

   Pipeline: embedding -> (optionally bidirectional, multi-layer) LSTM ->
   linear layer on the final hidden state of the top layer -> sigmoid.
   """

   def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                n_layers, bidirectional, dropout):
     """
     Args:
       vocab_size:    number of rows in the embedding table
       embedding_dim: size of each embedding vector
       hidden_dim:    hidden size of the LSTM (per direction)
       output_dim:    number of output units
       n_layers:      number of stacked LSTM layers
       bidirectional: whether the LSTM runs in both directions
       dropout:       dropout between LSTM layers (only used when
                      n_layers > 1)
     """
     super().__init__()
     # embedding layer
     self.embedding = nn.Embedding(vocab_size, embedding_dim)
     # lstm layer; nn.LSTM only applies dropout between stacked layers and
     # warns when it is non-zero for a single layer, so disable it there
     self.lstm = nn.LSTM(embedding_dim,
                         hidden_dim,
                         num_layers=n_layers,
                         bidirectional=bidirectional,
                         dropout=dropout if n_layers > 1 else 0.0,
                         batch_first=True)
     # dense layer: its input is the concatenation of one final hidden
     # state per direction, so size it from the actual direction count
     num_directions = 2 if bidirectional else 1
     self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
     # activation function
     self.act = nn.Sigmoid()

   def forward(self, text, text_lengths):
     """
     Args:
       text:         LongTensor [batch size, sent_len] of token indices
       text_lengths: unpadded length of each row of `text` (tensor or
                     sequence of ints); does not need to be sorted
     Returns:
       Tensor [batch size, output_dim] of sigmoid activations
     """
     # embedded = [batch size, sent_len, emb dim]
     embedded = self.embedding(text)
     # pack_padded_sequence wants the lengths on the CPU;
     # enforce_sorted=False lets callers pass batches in any order
     lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
     packed_embedded = nn.utils.rnn.pack_padded_sequence(
       embedded, lengths, batch_first=True, enforce_sorted=False)
     _, (hidden, _) = self.lstm(packed_embedded)
     # hidden = [num layers * num directions, batch size, hid dim]
     # (batch_first does not apply to the hidden/cell states)
     if self.lstm.bidirectional:
       # concat the final forward and backward hidden state of the top layer
       hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
     else:
       hidden = hidden[-1, :, :]
     # hidden = [batch size, hid dim * num directions]
     dense_outputs = self.fc(hidden)
     # final activation function
     return self.act(dense_outputs)
--- a/africat/models/multiclass.py
+++ b/africat/models/multiclass.py
@@ -0,0 +1,14 @@
 import torch
 import torch.nn as nn
 class Multiclass(nn.Module):
   """Single-hidden-layer perceptron: Linear -> ReLU -> Linear.

   forward() returns the output layer's raw values; no softmax or other
   normalisation is applied here.
   """

   def __init__(self, in_features=4, hidden_features=8, num_classes=3):
     """
     Args:
       in_features:     size of each input sample (default 4)
       hidden_features: width of the hidden layer (default 8)
       num_classes:     number of output units (default 3)
     The defaults reproduce the previously hard-coded 4 -> 8 -> 3 network.
     """
     super().__init__()
     self.hidden = nn.Linear(in_features, hidden_features)
     self.act = nn.ReLU()
     self.output = nn.Linear(hidden_features, num_classes)

   def forward(self, x):
     """Map x of shape (..., in_features) to shape (..., num_classes)."""
     x = self.act(self.hidden(x))
     x = self.output(x)
     return x
--- a/africat/models/rnn.py
+++ b/africat/models/rnn.py
@@ -0,0 +1,84 @@
 import torch
 import torch.nn as nn
 from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 class RNN(nn.Module):
   """Bidirectional LSTM/GRU classifier over padded token-index batches.

   Pipeline: embedding -> dropout -> bidirectional RNN -> pooling over time
   (last valid step, or mean over the valid steps) -> batch norm -> linear.
   """

   def __init__(self, vocab_size, embed_size, num_output, rnn_model='LSTM', use_last=True, embedding_tensor=None,
                padding_index=0, hidden_size=64, num_layers=1, batch_first=True):
     """
     Args:
       vocab_size: vocab size
       embed_size: embedding size
       num_output: number of output (classes)
       rnn_model:  'LSTM' or 'GRU'
       use_last:   if True, pool with the RNN output at each sequence's last
                   valid time step; otherwise use the mean over valid steps
       embedding_tensor: optional (vocab_size, embed_size) tensor of
                   pre-trained embeddings; when given, the embedding layer
                   is initialised from it and frozen
       padding_index: index of the padding token in the vocabulary
       hidden_size: hidden size of rnn module (per direction)
       num_layers:  number of layers in rnn module
       batch_first: accepted for API compatibility but not used; this
                   module always works with batch-first tensors
     Raises:
       LookupError: if rnn_model is neither 'LSTM' nor 'GRU'
       ValueError:  if embedding_tensor does not have shape
                    (vocab_size, embed_size)
     """
     super(RNN, self).__init__()
     self.use_last = use_last

     # embedding
     if torch.is_tensor(embedding_tensor):
       if tuple(embedding_tensor.shape) != (vocab_size, embed_size):
         raise ValueError(
           'embedding_tensor must have shape (vocab_size, embed_size)')
       # public API for loading frozen pre-trained weights
       self.encoder = nn.Embedding.from_pretrained(
         embedding_tensor, freeze=True, padding_idx=padding_index)
     else:
       self.encoder = nn.Embedding(vocab_size, embed_size, padding_idx=padding_index)
     self.drop_en = nn.Dropout(p=0.6)

     # rnn module
     rnn_classes = {'LSTM': nn.LSTM, 'GRU': nn.GRU}
     if rnn_model not in rnn_classes:
       raise LookupError(' only support LSTM and GRU')
     # inter-layer dropout is meaningless (and warned about) for one layer
     self.rnn = rnn_classes[rnn_model](
       input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers,
       dropout=0.5 if num_layers > 1 else 0.0,
       batch_first=True, bidirectional=True)

     # both directions are concatenated, hence hidden_size*2
     self.bn2 = nn.BatchNorm1d(hidden_size*2)
     self.fc = nn.Linear(hidden_size*2, num_output)

   def forward(self, x, seq_lengths):
     '''
     Args:
       x: (batch, time_step) LongTensor of token indices, padded
       seq_lengths: (batch,) unpadded length of each row of x; every
                    length must be >= 1, order does not matter
     Returns:
       (batch, num_output) tensor
     '''
     x_embed = self.encoder(x)
     x_embed = self.drop_en(x_embed)

     # pack_padded_sequence wants the lengths on the CPU;
     # enforce_sorted=False lets callers pass batches in any order
     lengths_cpu = torch.as_tensor(seq_lengths, dtype=torch.int64).cpu()
     packed_input = pack_padded_sequence(
       x_embed, lengths_cpu, batch_first=True, enforce_sorted=False)

     # None is for initial hidden state
     packed_output, _ = self.rnn(packed_input, None)
     # out_rnn shape (batch, max(seq_lengths), hidden_size*2); steps past a
     # sequence's own length are zero-filled
     out_rnn, _ = pad_packed_sequence(packed_output, batch_first=True)
     lengths = lengths_cpu.to(out_rnn.device)

     if self.use_last:
       # NOTE: the backward half of this vector has only processed the
       # final token, since the backward direction starts from the end
       row_indices = torch.arange(out_rnn.size(0), device=out_rnn.device)
       last_tensor = out_rnn[row_indices, lengths - 1, :]
     else:
       # use mean over the valid steps only: padded steps are zeros, so
       # summing is safe, but dividing by the padded length would make a
       # sample's result depend on the longest sequence in its batch
       last_tensor = out_rnn.sum(dim=1) / lengths.unsqueeze(1).to(out_rnn.dtype)

     fc_input = self.bn2(last_tensor)
     out = self.fc(fc_input)
     return out