Cleanup, and device-aware training

This commit is contained in:
Timothy Allen 2023-12-21 11:29:59 +02:00
parent c9a9e24619
commit 61d32c5286
3 changed files with 46 additions and 33 deletions

.gitignore vendored
View File

@@ -1,2 +1,2 @@
data/
data/*
__pycache__

View File

@@ -1,4 +1,4 @@
This is a multi-class, multi-label network that categorises text into one of ~160 categories, mostly relating to the African continent.
This is a multi-class, multi-label NLP network that categorises text into one of ~160 categories, mostly relating to the African continent.
The training dataset is a proprietary dataset from allAfrica.com, consisting of stories that have been manually categorised according to AllAfrica's in-house categorisation scheme.
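
As a rough sketch of what this multi-label setup implies (illustrative only; the layer sizes, names, and batch values below are invented, not taken from this repository): each story can carry several of the ~160 categories at once, so the target is a multi-hot vector and a per-category sigmoid loss such as BCEWithLogitsLoss fits better than a single softmax.

```python
import torch
from torch import nn

NUM_CATEGORIES = 160   # approximate size of the AllAfrica category scheme
EMBED_DIM = 64         # illustrative value only

head = nn.Linear(EMBED_DIM, NUM_CATEGORIES)   # toy multi-label head: one output per category
criterion = nn.BCEWithLogitsLoss()            # independent binary loss for each category

features = torch.randn(8, EMBED_DIM)          # a batch of 8 encoded stories
targets = torch.zeros(8, NUM_CATEGORIES)      # multi-hot labels: several categories per story
targets[0, [3, 17, 42]] = 1.0                 # story 0 carries three categories at once

loss = criterion(head(features), targets)
```
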

View File

@@ -19,9 +19,9 @@ import tqdm
import torch
import torchdata.datapipes as dp
import torchtext.transforms as T
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from models.rnn import RNN
@@ -29,22 +29,11 @@ all_categories = list()
# XXX None for all stories
#story_num = 128
#story_num = 256
story_num = 512
#story_num = 512
#story_num = 1024
story_num = 4096
#story_num = None
# Hyperparameters
EPOCHS = 10 # epoch
#EPOCHS = 2 # epoch
#LR = 5 # learning rate
#LR = 0.5
LR = 0.05
#LR = 0.005 # initial learning rate; too small may result in a long training process that could get stuck, whereas a value too large may result in learning a sub-optimal set of weights too fast or an unstable training process -- perhaps the most important hyperparameter. If you have time to tune only one hyperparameter, tune the learning rate
#BATCH_SIZE = 64 # batch size for training
#BATCH_SIZE = 16 # batch size for training
BATCH_SIZE = 8 # batch size for training
#BATCH_SIZE = 4 # batch size for training
def read_csv(input_csv, rows=None, verbose=0):
if verbose > 0:
with open(input_csv, 'r', encoding="utf-8") as f:
@@ -349,9 +338,9 @@ def tensor2cat(vocab, tensor):
for idx, pred in enumerate(tensor):
if idx >= len(all_cats):
print(f"Idx {idx} not in {len(all_cats)} categories")
elif pred > 0: # XXX
#elif pred > 0: # XXX
#print(idx, len(all_cats))
chance[all_cats[idx]] = pred.item()
chance[all_cats[idx]] = pred.item()
#print(chance)
return chance
else:
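
The hunk above drops the `pred > 0` guard so that tensor2cat reports a score for every category index. A self-contained sketch of the same mapping, with invented category names (the real function presumably takes them from the categories vocabulary built during training, e.g. via get_itos()):

```python
import torch

# Invented category names; the real code derives them from the categories vocabulary.
all_cats = ["africa", "business", "climate", "health"]

def scores_to_dict(predictions):
    """Map each position of a per-category score tensor to its category name."""
    chance = {}
    for idx, pred in enumerate(predictions):
        if idx >= len(all_cats):
            print(f"Idx {idx} not in {len(all_cats)} categories")
        else:
            chance[all_cats[idx]] = pred.item()
    return chance

print(scores_to_dict(torch.tensor([0.9, 0.1, 0.0, 0.7])))
# {'africa': 0.9, 'business': 0.1, 'climate': 0.0, 'health': 0.7} (up to float rounding)
```
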
@@ -383,15 +372,15 @@ def train(dataloader, dataset, model, optimizer, criterion, epoch=0):
optimizer.step()
#print("train loss",loss)
print("train loss", loss)
##predicted = np.round(output)
##total_acc += (predicted == cats).sum().item()
predictions = torch.zeros(output.shape)
predictions[output >= 0.25] = True
#predictions[output >= 0.5] = True
#predictions[output < 0.5] = False ## assign 0 label to those with less than 0.5
#predictions[output >= 0.25] = True
predictions[output >= 0.5] = True
predictions[output < 0.5] = False ## assign 0 label to those with less than 0.5
batch.clear()
for target, out, pred in list(zip(cats, output, predictions)):
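
The threshold edit above (0.25 back to 0.5) is what turns the model output into hard multi-label predictions. Since the loss configured later in this diff is BCEWithLogitsLoss, the output is presumably raw logits; a common pattern, sketched here with made-up numbers rather than this repository's exact behaviour, is to map logits to probabilities with a sigmoid before thresholding at 0.5:

```python
import torch

logits = torch.tensor([[2.1, -0.3, 0.8, -1.7]])   # made-up model output for one sample
probs = torch.sigmoid(logits)                      # per-category probabilities
predictions = (probs >= 0.5).float()               # 1.0 where the category is predicted
print(predictions)                                 # tensor([[1., 0., 1., 0.]])
```

If the output really is a logit, a 0.5 cut-off applied directly to it corresponds to a probability cut-off of roughly 0.62.
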
@@ -548,6 +537,28 @@ def main():
)
print(f"Using {device} device")
# Hyperparameters
#epochs = 10 # epoch
epochs = 4 # epoch
#lr = 5 # learning rate
#lr = 0.5
#lr = 0.05
#lr = 0.005 # initial learning rate; too small may result in a long training process that could get stuck, whereas a value too large may result in learning a sub-optimal set of weights too fast or an unstable training process -- perhaps the most important hyperparameter. If you have time to tune only one hyperparameter, tune the learning rate
lr = 0.0001
batch_size = 64 # batch size for training
#batch_size = 16 # batch size for training
#batch_size = 8 # batch size for training
#batch_size = 4 # batch size for training
#num_layers = 2 # 2-3 layers should be enough for LSTM
num_layers = 3 # 2-3 layers should be enough for LSTM
hidden_size = 128 # hidden size of rnn module, should be tweaked manually
#hidden_size = 8 # hidden size of rnn module, should be tweaked manually
mean_seq = True # use mean of rnn output
#mean_seq = False # use mean of rnn output
weight_decay = 1e-4 # helps the neural network learn smoother / simpler functions, which usually generalise better than spiky, noisy ones; try 1e-3, 1e-4
#weight_decay = 1e-3 # helps the neural network learn smoother / simpler functions, which usually generalise better than spiky, noisy ones; try 1e-3, 1e-4
'''
dataloader = DataLoader(dataset,
batch_size=4,
@@ -558,14 +569,14 @@ def main():
)
'''
train_dataloader = DataLoader(train_dataset,
batch_size=BATCH_SIZE,
batch_size=batch_size,
drop_last=True,
shuffle=True,
num_workers=0,
collate_fn=CollateBatch(cats=train_dataset.cats_vocab.get_stoi(), pad_idx=train_dataset.stoi['<pad>']),
)
valid_dataloader = DataLoader(valid_dataset,
batch_size=BATCH_SIZE,
batch_size=batch_size,
drop_last=True,
shuffle=True,
num_workers=0,
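
CollateBatch itself is not part of this diff; the sketch below is an assumption of what such a collate function typically does for this kind of dataset: pad each token sequence to the longest in the batch using the `<pad>` index and stack the multi-hot category labels. All field names are invented.

```python
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch, pad_idx, num_cats):
    """Pad token sequences and stack multi-hot category labels.

    `batch` is assumed to be a list of (token_ids, category_ids) pairs.
    """
    texts, labels = [], []
    for token_ids, category_ids in batch:
        texts.append(torch.tensor(token_ids, dtype=torch.long))
        multi_hot = torch.zeros(num_cats)
        multi_hot[torch.tensor(category_ids, dtype=torch.long)] = 1.0
        labels.append(multi_hot)
    padded = pad_sequence(texts, batch_first=True, padding_value=pad_idx)
    return padded, torch.stack(labels)

# Two samples of different lengths, padded with pad_idx=0:
padded, labels = collate_batch([([5, 7, 9], [2]), ([4, 11], [0, 3])], pad_idx=0, num_cats=4)
# padded.shape == (2, 3); labels.shape == (2, 4)
```

The drop_last=True flag in the DataLoader calls above simply discards the final short batch so that every batch the collate function sees has exactly batch_size samples.
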
@@ -582,10 +593,6 @@ def main():
embed = torch.empty(input_size, len(train_dataset)) # tokens per sample x samples
embedding_size = embed.size(1) # was 64 (should be: samples)
num_layers = 2 # 2-3 layers should be enough for LSTM
hidden_size = 128 # hidden size of rnn module, should be tweaked manually
mean_seq = True # use mean of rnn output
weight_decay = 1e-4 # helps the neural network learn smoother / simpler functions, which usually generalise better than spiky, noisy ones; try 1e-3, 1e-4
if args.verbose:
#for i in train_dataset.text_vocab.get_itos():
@@ -611,22 +618,28 @@
print(model)
# optimizer and loss
#optimizer = torch.optim.SGD(model.parameters(), lr=LR)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LR, weight_decay=weight_decay)
#optimizer = torch.optim.SGD(model.parameters(), lr=lr)
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=weight_decay)
if args.verbose:
print(criterion)
print(optimizer)
total_accu = None
#for epoch in range(1, EPOCHS + 1):
e = tqdm.tqdm(range(1, EPOCHS + 1), unit="epoch")
#for epoch in range(1, epochs + 1):
e = tqdm.tqdm(range(1, epochs + 1), unit="epoch")
for epoch in e:
e.set_description(f"Epoch {epoch}")
train_dataset.to(device)
valid_dataset.to(device)
model.to(device)
model.train()
train(train_dataloader, train_dataset, model, optimizer, criterion, epoch)
accu_val = evaluate(valid_dataloader, valid_dataset, model, criterion, epoch)
if total_accu is not None and total_accu > accu_val:
optimizer.step()
else:
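
The "device-aware training" half of this commit comes down to the pattern visible in this last hunk: choose a device once, move the model and the data onto it, and keep everything there for the whole loop. A minimal sketch of that pattern (the device selection mirrors the `Using {device} device` message printed earlier; the model and tensors are placeholders):

```python
import torch
from torch import nn

# Prefer CUDA, then Apple's MPS backend, then fall back to the CPU.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

model = nn.Linear(16, 4).to(device)       # placeholder model
inputs = torch.randn(8, 16).to(device)    # inputs must live on the same device as the model
targets = torch.randn(8, 4).to(device)

criterion = nn.MSELoss()
loss = criterion(model(inputs), targets)  # the whole forward/backward pass runs on `device`
loss.backward()
```
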