From c1e3ffdc0b941ac760f54ec9976b59878be23852 Mon Sep 17 00:00:00 2001
From: Timothy Allen
Date: Sat, 30 Dec 2023 15:19:52 +0200
Subject: [PATCH] Switch to SentencePiece for tokenisation and Roberta for the
 model

---
 africat/categorise.py | 146 ++++++++++++++++++++++++------------------
 1 file changed, 85 insertions(+), 61 deletions(-)

diff --git a/africat/categorise.py b/africat/categorise.py
index fd44fee..d20246e 100755
--- a/africat/categorise.py
+++ b/africat/categorise.py
@@ -2,10 +2,10 @@
 import argparse
 import os
-import sys
-import pprint
 import re
+import pprint
 import string
+import sys
 import time
 import warnings
 # data manupulation
@@ -22,14 +22,22 @@ import torchtext.transforms as T
 import torchtext.vocab as vocab
 from torch import nn
 from torch.utils.data import Dataset, DataLoader
+from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER
+
+# Check for TPU availability in notebook environment
+tpu_available = os.environ.get('COLAB_TPU_ADDR') is not None
+
+if tpu_available:
+    import torch_xla
+    import torch_xla_py.xla_model as xm

 xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
 xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

 # XXX None for all stories
-story_num = 128
+#story_num = 128
 #story_num = 256
-#story_num = 512
+story_num = 512
 #story_num = 1024
 #story_num = 4096
 #story_num = None
@@ -115,8 +123,12 @@ class TextCategoriesDataset(Dataset):
         self.lang = self.df[lang_column]
         self.text = self.df[text_column]
         self.cats = self.df.iloc[:, first_cats_column:].sort_index(axis="columns")
+        self.cats_vocab = self.cats.columns
+        self.text_length = self.text.str.len().max()
+        self.num_cats = len(self.cats_vocab)
+
         # index-to-token dict
         # <pad>: padding, used for padding the shorter sentences in a batch
         #        to match the length of longest sentence in the batch
@@ -145,8 +157,9 @@
         cats = self.cats.iloc[idx]

         #print(self.textTransform()(text))
-        #print(cats)
-        #print(cats.fillna(0).values)
+        #print(type(cats.fillna(0).values.tolist()))
+        #print(cats.fillna(0).values.tolist())
+        #sys.exit(0)

         if self.transform:
             text, cats = self.transform(text, cats)
@@ -155,7 +168,7 @@
         # NaN to zeros and stripping the index
         return (
             self.textTransform()(text),
-            cats.fillna(0).values,
+            cats.fillna(0).values.tolist(),
         )

     def textTransform(self):
@@ -167,6 +180,8 @@
             # converts the sentences to indices based on given vocabulary using SentencePiece
             T.SentencePieceTokenizer(xlmr_spm_model_path),
             T.VocabTransform(torch.hub.load_state_dict_from_url(xlmr_vocab_path)),
+            #T.Truncate(self.text_length - 2), # XXX
+            T.Truncate(256 - 3), # XXX
             # Add <sos> at beginning of each sentence. 1 because the index
             # for <sos> in vocabulary is 1 as seen in previous section
             T.AddToken(self.stoi['<sos>'], begin=True),
@@ -221,7 +236,6 @@
         return (
             text_tensor,
             cats_tensor,
-            text_lengths,
         )

 def tensor2cat(dataset, tensor):
@@ -233,6 +247,7 @@
             for idx, pred in enumerate(result):
                 if pred > 0: # XXX
                     chance[cats[idx]] = pred.item()
+            chance = dict(sorted(chance.items(), key=lambda x : x[1], reverse=True))
             batch.append(chance)
         return batch
     elif tensor.ndimension() == 1:
@@ -242,24 +257,27 @@
                 print(f"Idx {idx} not in {len(cats)} categories")
             elif pred > 0: # XXX
                 chance[cats[idx]] = pred.item()
+        chance = dict(sorted(chance.items(), key=lambda x : x[1], reverse=True))
         return chance
     else:
         raise ValueError("Only tensors with 1 dimension or batches with 2 dimensions are supported")

 def train(dataloader, dataset, model, optimizer, criterion, epoch=0):
-    total_acc, total_count = 0, 0
+    total_acc, total_count = 0, 1 # XXX
     log_interval = 500
     torch.set_printoptions(precision=2)

+    model.train()
+
     batch = tqdm.tqdm(dataloader, unit="batch")
     for idx, data in enumerate(batch):
         batch.set_description(f"Train {epoch}.{idx}")
-        text, cats, text_lengths = data
+        text, cats = data

         optimizer.zero_grad()
-        output = model(text, text_lengths)
+        output = model(text)

         #print("output", output)
         #print("output shape", output.shape)
@@ -282,9 +300,9 @@
                 batch.clear()
                 for target, out, pred in list(zip(cats, output, predictions)):
-                    expect = tensor2cat(dataset.cats_vocab, target)
-                    raw = tensor2cat(dataset.cats_vocab, out)
-                    predict = tensor2cat(dataset.cats_vocab, pred)
+                    expect = tensor2cat(dataset, target)
+                    raw = tensor2cat(dataset, out)
+                    predict = tensor2cat(dataset, pred)
                     print("Expected: ", expect)
                     print("Predicted: ", predict)
                     print("Raw output:", raw)
@@ -307,16 +325,17 @@

 def evaluate(dataloader, dataset, model, criterion, epoch=0):
+    total_acc, total_count = 0, 1 # XXX
+
     model.eval()
-    total_acc, total_count = 0, 0

     with torch.no_grad():
         batch = tqdm.tqdm(dataloader, unit="batch")
         for idx, data in enumerate(batch):
             batch.set_description(f"Evaluate {epoch}.{idx}")
-            text, cats, text_lengths = data
+            text, cats = data

-            output = model(text, text_lengths)
+            output = model(text)
             #print("eval predicted", output)

             loss = criterion(output, cats)
@@ -328,9 +347,9 @@
                 batch.clear()
                 for target, out, pred in list(zip(cats, output, predictions)):
-                    expect = tensor2cat(dataset.cats_vocab, target)
-                    raw = tensor2cat(dataset.cats_vocab, out)
-                    predict = tensor2cat(dataset.cats_vocab, pred)
+                    expect = tensor2cat(dataset, target)
+                    raw = tensor2cat(dataset, out)
+                    predict = tensor2cat(dataset, pred)
                     print("Evaluate expected: ", expect)
                     print("Evaluate predicted: ", predict)
                     print("Evaluate raw output:", raw)
@@ -374,7 +393,10 @@
             help='path of CSV file containing dataset')
     parser.add_argument('--model', '-m',
             #required=True, # XXX
-            help='path to training model')
+            help='path to load training model')
+    parser.add_argument('--out', '-o',
+            #required=True, # XXX
+            help='path to save training model')
     parser.add_argument('--verbose', '-v',
             type=int, nargs='?',
             const=1, # Default value if -v is supplied
@@ -386,7 +408,10 @@
         print("ERROR: train or classify data")
         sys.exit(1)

-    if args.action == 'classify' and s.path.isfile(model_storage) is None:
+    model_in = args.model
+    model_out = args.out
+
+    if args.action == 'classify' and (model_in is None or not os.path.isfile(model_in)):
         print("No model found for classification; running training instead")
         args.action = 'train'
@@ -423,29 +448,34 @@
     #print("-" * 20)
     #for text, cat in enumerate(valid_dataset):
     #    print(text, cat)
-    #print(tensor2cat(train_dataset, torch.tensor([0, 0, 0, 1., 0.9])))
+    #print(tensor2cat(train_dataset, torch.tensor([0, 0, 0, 1., 0.9, 1, 0.5, .6])))
     #sys.exit(0)

+    # Make everything a bit more reproducible
+    seed_everything(111)
+
     # Get cpu, gpu or mps device for training.
     # Move tensor to the NVIDIA GPU if available
     device = (
-        "cuda" if torch.cuda.is_available()
-        else "xps" if hasattr(torch, "xpu") and torch.xpu.is_available()
-        else "mps" if torch.backends.mps.is_available()
+        xm.xla_device() if tpu_available # google
+        else "cuda" if torch.cuda.is_available() # nvidia
+        else "xps" if hasattr(torch, "xpu") and torch.xpu.is_available() # intel
+        else "mps" if torch.backends.mps.is_available() # mac
         else "cpu"
     )
     print(f"Using {device} device")

     # Hyperparameters
     #epochs = 10 # epoch
-    epochs = 4 # epoch
+    epochs = 6 # epoch
+    #epochs = 4 # epoch
     #lr = 5 # learning rate
     #lr = 0.5
     #lr = 0.05
     #lr = 0.005 # initial learning rate; too small may result in a long training process that could get stuck, whereas a value too large may result in learning a sub-optimal set of weights too fast or an unstable training process -- perhaps the most important hyperparameter. If you have time to tune only one hyperparameter, tune the learning rate
     lr = 0.0001
-    batch_size = 64 # batch size for training
-    #batch_size = 16 # batch size for training
+    #batch_size = 64 # batch size for training
+    batch_size = 16 # batch size for training
     #batch_size = 8 # batch size for training
     #batch_size = 4 # batch size for training
@@ -460,10 +490,10 @@
     '''
     dataloader = DataLoader(dataset,
-            batch_size=4,
+            batch_size=batch_size,
             drop_last=True,
             shuffle=True,
-            num_workers=0,
+            num_workers=4,
             collate_fn=CollateBatch(pad_idx=train_dataset.stoi['<pad>']),
             )
     '''
@@ -471,48 +501,44 @@
             batch_size=batch_size,
             drop_last=True,
             shuffle=True,
-            num_workers=0,
+            num_workers=4,
             collate_fn=CollateBatch(pad_idx=train_dataset.stoi['<pad>']),
             )
     valid_dataloader = DataLoader(valid_dataset,
             batch_size=batch_size,
             drop_last=True,
             shuffle=True,
-            num_workers=0,
+            num_workers=4,
             collate_fn=CollateBatch(pad_idx=train_dataset.stoi['<pad>']),
             )

     #for i_batch, sample_batched in enumerate(dataloader):
     #    print(i_batch, sample_batched[0], sample_batched[1])
-    for i_batch, sample_batched in enumerate(train_dataloader):
-        print(i_batch, sample_batched[0], sample_batched[1])
-    sys.exit(0)
+    #for i_batch, sample_batched in enumerate(train_dataloader):
+    #    print(i_batch, sample_batched[0], sample_batched[1])
+    #sys.exit(0)

-    input_size = len(train_dataset.text_vocab)
-    output_size = len(train_dataset.cats_vocab) # every output item is the likelihood of a particular category
-
-    embed = torch.empty(input_size, len(train_dataset)) # tokens per sample x samples
-    embedding_size = embed.size(1) # was 64 (should be: samples)
+    #input_size = len(train_dataset.text_vocab)
+    #output_size = len(train_dataset.cats_vocab) # every output item is the likelihood of a particular category
+    #embed = torch.empty(input_size, len(train_dataset)) # tokens per sample x samples
+    #embedding_size = embed.size(1) # was 64 (should be: samples)
+    #input_size = train_dataset.text_length
+    input_size = 768
+    output_size = train_dataset.num_cats

     if args.verbose:
         #for i in train_dataset.text_vocab.get_itos():
         #    print(i)
         print("input_size: ", input_size)
         print("output_size:", output_size)
-        print("embed shape:", embed.shape)
-        print("embedding_size:", embedding_size, " (that is, number of samples)")
+        #print("embed shape:", embed.shape)
+        #print("embedding_size:", embedding_size, " (that is, number of samples)")
+
+    classifier_head = RobertaClassificationHead(num_classes=output_size, input_dim=input_size)
+    model = XLMR_BASE_ENCODER.get_model(head=classifier_head)
+    if model_in is not None and os.path.isfile(model_in):
+        model.load_state_dict(torch.load(model_in))
+    model.to(device)

-    model = RNN(
-            #rnn_model='GRU',
-            rnn_model='LSTM',
-            vocab_size=input_size,
-            embed_size=embedding_size,
-            num_output=output_size,
-            use_last=(not mean_seq),
-            hidden_size=hidden_size,
-            embedding_tensor=embed,
-            num_layers=num_layers,
-            batch_first=True
-            )
     if args.verbose:
         print(model)
@@ -530,11 +556,6 @@
     for epoch in e:
         e.set_description(f"Epoch {epoch}")

-        train_dataset.to(device)
-        valid_dataset.to(device)
-        model.to(device)
-
-        model.train()
         train(train_dataloader, train_dataset, model, optimizer, criterion, epoch)
         accu_val = evaluate(valid_dataloader, valid_dataset, model, criterion, epoch)
@@ -544,13 +565,16 @@
         else:
             total_accu = accu_val
         e.set_postfix({
-            "accuracy": accu_val.int().item(),
+            "accuracy": accu_val,
         })

 #    print("Checking the results of test dataset.")
 #    accu_test = evaluate(test_dataloader, test_dataset)
 #    print("test accuracy {:8.3f}".format(accu_test))

+    if model_out is not None:
+        torch.save(model.state_dict(), model_out)
+
     return

 if __name__ == "__main__":
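
Editor's note (not part of the patch): the stand-alone sketch below shows how the pieces this commit introduces, the SentencePiece/XLM-R text transform and the XLMR_BASE_ENCODER with a RobertaClassificationHead, fit together for multi-label category scoring. NUM_CATS, MAX_SEQ_LEN, the BOS/EOS/pad token ids (0, 2, 1) and the sigmoid over the logits are illustrative assumptions; the patched script drives the same model through its own TextCategoriesDataset, stoi mapping and CollateBatch instead.

import torch
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url
from torchtext.functional import to_tensor
from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER

xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

NUM_CATS = 8       # placeholder: one output per category column in the dataset
MAX_SEQ_LEN = 256  # mirrors the hard-coded T.Truncate(256 - 3) in the patch

# SentencePiece tokenisation -> vocabulary ids -> truncation -> BOS/EOS ids.
# 0 and 2 are the <s> and </s> ids in the XLM-R SentencePiece vocabulary; 1 is <pad>.
text_transform = T.Sequential(
    T.SentencePieceTokenizer(xlmr_spm_model_path),
    T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
    T.Truncate(MAX_SEQ_LEN - 2),
    T.AddToken(token=0, begin=True),
    T.AddToken(token=2, begin=False),
)

# XLM-R base encoder (768-dim hidden size) with a classification head producing
# one logit per category, as the patch constructs it in main().
classifier_head = RobertaClassificationHead(num_classes=NUM_CATS, input_dim=768)
model = XLMR_BASE_ENCODER.get_model(head=classifier_head)
model.eval()

texts = ["A short example news story.", "Another story in a different language."]
batch = to_tensor([text_transform(t) for t in texts], padding_value=1)

with torch.no_grad():
    logits = model(batch)           # shape: (len(texts), NUM_CATS)
    scores = torch.sigmoid(logits)  # independent per-category scores (assumes a
                                    # multi-label, BCE-style training objective)
print(scores)

The same encoder/head pair can be fine-tuned the way the patched train() does it: feed padded token-id batches through model(text), compute the criterion on the per-category outputs, and backpropagate.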