diff --git a/.gitignore b/.gitignore
index 773b1fc..899420d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
-data/
+data/*
 __pycache__
diff --git a/README.md b/README.md
index d4c11c8..0b4765d 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-This is a multi-class, multi-label network that categorises text into one of ~160 categories, mostly relating to the African continent.
+This is a multi-class, multi-label NLP network that categorises text into one of ~160 categories, mostly relating to the African continent.

 The training dataset is a proprietary dataset from allAfrica.com, consisting of stories that have been manually categorised according to AllAfrica's in-house categorisation scheme.

diff --git a/africat/categorise.py b/africat/categorise.py
index 8e1dd97..650f5b1 100755
--- a/africat/categorise.py
+++ b/africat/categorise.py
@@ -19,9 +19,9 @@ import tqdm
 import torch
 import torchdata.datapipes as dp
 import torchtext.transforms as T
-from torchtext.vocab import build_vocab_from_iterator
-from torch.utils.data import Dataset, DataLoader
 from torch import nn
+from torch.utils.data import Dataset, DataLoader
+from torchtext.vocab import build_vocab_from_iterator

 from models.rnn import RNN

@@ -29,22 +29,11 @@ all_categories = list()
 # XXX None for all stories
 #story_num = 128
 #story_num = 256
-story_num = 512
+#story_num = 512
 #story_num = 1024
+story_num = 4096
 #story_num = None

-# Hyperparameters
-EPOCHS = 10 # epoch
-#EPOCHS = 2 # epoch
-#LR = 5 # learning rate
-#LR = 0.5
-LR = 0.05
-#LR = 0.005 # initial learning rate; too small may result in a long training process that could get stuck, whereas a value too large may result in learning a sub-optimal set of weights too fast or an unstable training process -- perhaps the most important hyperparameter. If you have time to tune only one hyperparameter, tune the learning rate
-#BATCH_SIZE = 64 # batch size for training
-#BATCH_SIZE = 16 # batch size for training
-BATCH_SIZE = 8 # batch size for training
-#BATCH_SIZE = 4 # batch size for training
-
 def read_csv(input_csv, rows=None, verbose=0):
     if verbose > 0:
         with open(input_csv, 'r', encoding="utf-8") as f:
@@ -349,9 +338,9 @@ def tensor2cat(vocab, tensor):
         for idx, pred in enumerate(tensor):
             if idx >= len(all_cats):
                 print(f"Idx {idx} not in {len(all_cats)} categories")
-            elif pred > 0: # XXX
+            #elif pred > 0: # XXX
                 #print(idx, len(all_cats))
-                chance[all_cats[idx]] = pred.item()
+            chance[all_cats[idx]] = pred.item()
         #print(chance)
         return chance
     else:
@@ -383,15 +372,15 @@ def train(dataloader, dataset, model, optimizer, criterion, epoch=0):
         optimizer.step()

-        #print("train loss",loss)
+        print("train loss", loss)

         ##predicted = np.round(output)
         ##total_acc += (predicted == cats).sum().item()

         predictions = torch.zeros(output.shape)
-        predictions[output >= 0.25] = True
-        #predictions[output >= 0.5] = True
-        #predictions[output < 0.5] = False ## assign 0 label to those with less than 0.5
+        #predictions[output >= 0.25] = True
+        predictions[output >= 0.5] = True
+        predictions[output < 0.5] = False ## assign 0 label to those with less than 0.5

         batch.clear()
         for target, out, pred in list(zip(cats, output, predictions)):
@@ -548,6 +537,28 @@ def main():
         )
     print(f"Using {device} device")

+    # Hyperparameters
+    #epochs = 10 # epoch
+    epochs = 4 # epoch
+    #lr = 5 # learning rate
+    #lr = 0.5
+    #lr = 0.05
+    #lr = 0.005 # initial learning rate; too small may result in a long training process that could get stuck, whereas a value too large may result in learning a sub-optimal set of weights too fast or an unstable training process -- perhaps the most important hyperparameter. If you have time to tune only one hyperparameter, tune the learning rate
+    lr = 0.0001
+    batch_size = 64 # batch size for training
+    #batch_size = 16 # batch size for training
+    #batch_size = 8 # batch size for training
+    #batch_size = 4 # batch size for training
+
+    #num_layers = 2 # 2-3 layers should be enough for LSTM
+    num_layers = 3 # 2-3 layers should be enough for LSTM
+    hidden_size = 128 # hidden size of rnn module, should be tweaked manually
+    #hidden_size = 8 # hidden size of rnn module, should be tweaked manually
+    mean_seq = True # use mean of rnn output
+    #mean_seq = False # use mean of rnn output
+    weight_decay = 1e-4 # helps the neural networks to learn smoother / simpler functions which most of the time generalizes better compared to spiky, noisy ones ; try 1e-3, 1e-4
+    #weight_decay = 1e-3 # helps the neural networks to learn smoother / simpler functions which most of the time generalizes better compared to spiky, noisy ones ; try 1e-3, 1e-4
+
     '''
     dataloader = DataLoader(dataset,
             batch_size=4,
@@ -558,14 +569,14 @@
             )
     '''
     train_dataloader = DataLoader(train_dataset,
-            batch_size=BATCH_SIZE,
+            batch_size=batch_size,
             drop_last=True,
             shuffle=True,
             num_workers=0,
             collate_fn=CollateBatch(cats=train_dataset.cats_vocab.get_stoi(), pad_idx=train_dataset.stoi['']),
             )
     valid_dataloader = DataLoader(valid_dataset,
-            batch_size=BATCH_SIZE,
+            batch_size=batch_size,
             drop_last=True,
             shuffle=True,
             num_workers=0,
@@ -582,10 +593,6 @@ def main():
     embed = torch.empty(input_size, len(train_dataset)) # tokens per sample x samples
     embedding_size = embed.size(1) # was 64 (should be: samples)

-    num_layers = 2 # 2-3 layers should be enough for LSTM
-    hidden_size = 128 # hidden size of rnn module, should be tweaked manually
-    mean_seq = True # use mean of rnn output
-    weight_decay = 1e-4 # helps the neural networks to learn smoother / simpler functions which most of the time generalizes better compared to spiky, noisy ones ; try 1e-3, 1e-4

     if args.verbose:
         #for i in train_dataset.text_vocab.get_itos():
@@ -611,22 +618,28 @@ def main():
         print(model)

     # optimizer and loss
-    #optimizer = torch.optim.SGD(model.parameters(), lr=LR)
     criterion = nn.BCEWithLogitsLoss()
-    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LR, weight_decay=weight_decay)
+    #optimizer = torch.optim.SGD(model.parameters(), lr=lr)
+    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=weight_decay)
     if args.verbose:
         print(criterion)
         print(optimizer)

     total_accu = None
-    #for epoch in range(1, EPOCHS + 1):
-    e = tqdm.tqdm(range(1, EPOCHS + 1), unit="epoch")
+    #for epoch in range(1, epochs + 1):
+    e = tqdm.tqdm(range(1, epochs + 1), unit="epoch")
     for epoch in e:
         e.set_description(f"Epoch {epoch}")
+
+        train_dataset.to(device)
+        valid_dataset.to(device)
+        model.to(device)
+
         model.train()
         train(train_dataloader, train_dataset, model, optimizer, criterion, epoch)
         accu_val = evaluate(valid_dataloader, valid_dataset, model, criterion, epoch)
+
         if total_accu is not None and total_accu > accu_val:
             optimizer.step()
         else:
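The last few hunks pair nn.BCEWithLogitsLoss with torch.optim.Adam (lr plus weight_decay) and move the prediction threshold to 0.5. For reference, below is a minimal, self-contained sketch of how that combination is commonly wired up for a multi-label classifier; the stand-in linear model, the tensor shapes, and the sigmoid-before-threshold step are illustrative assumptions, not lines taken from categorise.py.

    import torch
    from torch import nn

    # Stand-in for the RNN classifier: 32 input features -> 8 category logits.
    model = nn.Linear(32, 8)

    # BCEWithLogitsLoss works on raw logits; Adam's weight_decay adds the
    # L2-style regularisation the comments in the patch describe.
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=0.0001,
        weight_decay=1e-4,
    )

    # One dummy batch: 4 samples, multi-hot targets (a story may carry several labels).
    text_features = torch.randn(4, 32)
    targets = torch.randint(0, 2, (4, 8)).float()

    optimizer.zero_grad()
    logits = model(text_features)
    loss = criterion(logits, targets)
    loss.backward()
    optimizer.step()

    # Turn logits into per-category probabilities, then threshold at 0.5.
    predictions = (torch.sigmoid(logits) >= 0.5).float()
    print(loss.item(), predictions.sum().item())

One thing to note about the threshold: BCEWithLogitsLoss consumes raw logits, and a probability cut-off of 0.5 corresponds to a logit cut-off of 0, so if output holds raw logits then comparing it directly against 0.5 is stricter than a 50% probability.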