#!/usr/bin/python
import argparse
import os
import sys
import pprint
import re
import string
import time
import warnings
# data manipulation
import csv
import random
import pandas as pd
import numpy as np
import itertools
import tqdm
# torch
import torch
import torchdata.datapipes as dp
import torchtext.transforms as T
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from models.rnn import RNN

all_categories = list()
# XXX None for all stories
#story_num = 128
#story_num = 256
#story_num = 512
#story_num = 1024
story_num = 4096
#story_num = None

def read_csv(input_csv, rows=None, verbose=0):
  if verbose > 0:
    with open(input_csv, 'r', encoding="utf-8") as f:
      data = pd.concat(
        [chunk for chunk in tqdm.tqdm(
          pd.read_csv(f,
            encoding="utf-8",
            quoting=csv.QUOTE_ALL,
            nrows=rows,
            chunksize=50,
          ),
          desc='Loading data'
        )])
  else:
    with open(input_csv, 'r', encoding="utf-8") as f:
      data = pd.read_csv(f,
        encoding="utf-8",
        quoting=csv.QUOTE_ALL,
        nrows=rows,
      )
  data.dropna(axis='index', inplace=True)
  #print(data)
  #sys.exit(0)
  return data
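
# Illustrative usage (hypothetical file name; the CSV is expected to provide
# the "content", "categories" and "language" columns accessed in main()):
#   data = read_csv("articles.csv", rows=story_num, verbose=1)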

'''
Create Training and Validation sets
'''
def split_dataset(data, verbose=0):
  # Create a shuffled list of all row indices
  data_idx = list(range(len(data)))
  np.random.shuffle(data_idx)

  # Split the indices into validation and train
  split_percent = 0.05
  num_valid = int(len(data) * split_percent)
  #num_tests = int(len(data) * split_percent)
  #train_idx = data_idx[num_valid:-num_tests]
  train_idx = data_idx[num_valid:]
  valid_idx = data_idx[:num_valid]
  #tests_idx = data_idx[-num_tests:]
  if verbose > 0:
    print("Length of train_data: {}".format(len(train_idx)))
    print("Length of valid_data: {}".format(len(valid_idx)))
    #print("Length of tests_data: {}".format(len(tests_idx)))

  # Create the training and validation sets, as dataframes
  train_data = data.iloc[train_idx].reset_index().drop('index', axis=1)
  valid_data = data.iloc[valid_idx].reset_index().drop('index', axis=1)
  #tests_data = data.iloc[tests_idx].reset_index().drop('index', axis=1)
  #return (train_data, valid_data, tests_data)
  return (train_data, valid_data)
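
# Worked example: with the default story_num of 4096 (and assuming no rows
# are dropped by dropna), num_valid = int(4096 * 0.05) = 204 validation rows,
# leaving 3892 training rows.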

'''
Create a dataset that builds a tokenised vocabulary and then, as each row is
accessed, transforms it into sequences of numerical token and category indices.
'''
class TextCategoriesDataset(Dataset):
  ''' Dataset of Text and Categories '''
  def __init__(self, df, text_column, cats_column, lang_column, transform=None, verbose=0):
    '''
    Arguments:
      df (pandas.DataFrame): csv content, loaded as dataframe
      text_column (str): the name of the column containing the text
      cats_column (str): the name of the column containing
        semicolon-separated categories
      lang_column (str): the name of the column containing the language
      transform (callable, optional): optional transform to be
        applied on a sample
    '''
    self.df = df
    self.transform = transform
    self.verbose = verbose

    self.text = self.df[text_column]
    self.cats = self.df[cats_column]
    self.lang = self.df[lang_column]

    # index-to-token dict
    # <pad> : padding, used for padding the shorter sentences in a batch
    #         to match the length of the longest sentence in the batch
    # <sos> : start of sentence token
    # <eos> : end of sentence token
    # <unk> : unknown token: words which are not found in the vocab are
    #         replaced by this token
    self.itos = {0: '<pad>', 1: '<sos>', 2: '<eos>', 3: '<unk>'}
    # token-to-index dict
    self.stoi = {k: j for j, k in self.itos.items()}

    # Create vocabularies upon initialisation
    self.text_vocab = build_vocab_from_iterator(
      [self.textTokens(text) for i, text in self.df[text_column].items()],
      min_freq=2,
      specials=self.itos.values(),
      special_first=True
    )
    self.text_vocab.set_default_index(self.text_vocab['<unk>'])
    #print(self.text_vocab.get_itos())

    self.cats_vocab = build_vocab_from_iterator(
      #[self.catTokens(cats) for i, cats in self.df[cats_column].items()],
      [self.catTokens(all_categories)],
      min_freq=1,
      specials=['<unk>'],
      special_first=True
    )
    self.cats_vocab.set_default_index(self.cats_vocab['<unk>'])
    #print(self.cats_vocab.get_itos())

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    # Enable use as a plain iterator
    if idx not in self.df.index:
      raise StopIteration
    if torch.is_tensor(idx):
      idx = idx.tolist()

    # Get the raw data
    text = self.text[idx]
    cats = self.cats[idx]
    lang = self.lang[idx]

    if self.transform:
      text, cats = self.transform(text, cats)
    #print(cats)
    #print(self.catTokens(cats))
    #print(self.getTransform(self.cats_vocab, "cats")(self.catTokens(cats)))

    # Numericalise by applying transforms
    return (
      self.getTransform(self.text_vocab, "text")(self.textTokens(text)),
      self.getTransform(self.cats_vocab, "cats")(self.catTokens(cats)),
    )

  @staticmethod
  def textTokens(text):
    if isinstance(text, str):
      return [word for word in text.split()]

  @staticmethod
  def catTokens(cats):
    if isinstance(cats, str):
      return [cat for cat in cats.split(';')]
    elif isinstance(cats, list):
      return [cat for cat in cats]

  def getTransform(self, vocab, vType):
    '''
    Create transforms based on the given vocabulary. The returned transform
    is applied to a sequence of tokens.
    '''
    if vType == "text":
      return T.Sequential(
        # convert the tokens to indices based on the given vocabulary
        T.VocabTransform(vocab=vocab),
        # Add <sos> at the beginning of each sentence; its index in the
        # vocabulary is 1, as defined in self.itos
        T.AddToken(self.text_vocab['<sos>'], begin=True),
        # Add <eos> at the end of each sentence; its index in the
        # vocabulary is 2, as defined in self.itos
        T.AddToken(self.text_vocab['<eos>'], begin=False)
      )
    elif vType == "cats":
      return T.Sequential(
        # convert the categories to indices based on the given vocabulary
        T.VocabTransform(vocab=vocab),
      )
    else:
      raise ValueError('wrong transformation type: {}'.format(vType))
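
# Illustrative sketch of what __getitem__ returns (token indices below are
# hypothetical, not taken from a real vocabulary): a row with text
# "rain in nairobi" and categories "weather;kenya" comes back roughly as
#   ([1, 87, 10, 1543, 2], [4, 12])
# where <sos>=1 and <eos>=2 wrap the text indices, and each category is a
# bare index into cats_vocab.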

'''
Now that we have a dataset, let's create a dataloader callback;
the dataloader can batch, shuffle, and load the data in parallel
'''
class CollateBatch:
  '''
  We need to pad the shorter sentences in a batch so that all the sequences
  in a batch are of equal length. We can do this with a collate_fn callback
  class, which returns tensors.
  '''
  def __init__(self, pad_idx, cats):
    '''
    pad_idx (int): the index of the "<pad>" token in the vocabulary.
    cats (dict): the category string-to-index mapping of the dataset.
    '''
    self.pad_idx = pad_idx
    self.cats = cats

  def __call__(self, batch):
    '''
    batch: a list of tuples with (text, cats), each of which
    is a list of tokens
    '''
    batch_text, batch_cats = zip(*batch)

    # Pad each text sequence to the longest one in the batch
    text_tensor = nn.utils.rnn.pad_sequence(
      [torch.LongTensor(s) for s in batch_text],
      batch_first=True, padding_value=self.pad_idx
    )
    # Record the unpadded length of each text sequence
    text_lengths = torch.tensor([len(s) for s in batch_text])

    # Convert cats to a multi-label one-hot representation;
    # add one to all_categories to account for <unk>
    cats_tensor = torch.full((len(batch_cats), len(all_categories)+1), self.pad_idx).float()
    for idx, cats in enumerate(batch_cats):
      #print("\nsample", idx, cats)
      for c in cats:
        cats_tensor[idx][c] = 1
      #print(cats_tensor[idx])

    # Sort the batch by descending text length (packed/padded RNN inputs
    # typically expect this ordering), permuting the category tensor to match
    text_lengths, perm_idx = text_lengths.sort(0, descending=True)
    text_tensor = text_tensor[perm_idx]
    cats_tensor = cats_tensor[perm_idx]

    #print("text", text_tensor)
    #print("text shape:", text_tensor.shape)
    #print(cats_tensor)
    #print("cats shape:", cats_tensor.shape)
    #print(text_lengths)
    #print("text_lengths shape:", text_lengths.shape)
    return (
      text_tensor,
      cats_tensor,
      text_lengths,
    )
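
# Illustrative sketch of the collate output, assuming a toy batch of two
# samples with 5 and 3 text tokens and pad_idx == 0:
#   text_tensor  -> shape (2, 5), with the shorter row right-padded with 0
#   cats_tensor  -> shape (2, len(all_categories)+1), 1.0 where a category
#                   applies and 0.0 elsewhere
#   text_lengths -> tensor([5, 3]), sorted to match the reordered batch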

def cat2tensor(label_vocab, labels, pad_idx: int):
  all_labels = label_vocab.get_itos()
  num_labels = len(all_labels)
  # add a slot for <unk> if the vocabulary does not already contain it
  if '<unk>' not in all_labels:
    num_labels += 1

  labels_tensor = torch.full((len(labels), num_labels), pad_idx).float()
  labels_lengths = torch.LongTensor(list(map(len, labels)))
  for idx, sample_labels in enumerate(labels):
    #print("\nsample", idx, sample_labels)
    for l in sample_labels:
      labels_tensor[idx][l] = 1
    #print(labels_tensor[idx])
  return labels_tensor


def tensor2cat(vocab, tensor):
  all_cats = vocab.get_itos()
  if tensor.ndimension() == 2:
    batch = list()
    for result in tensor:
      chance = dict()
      for idx, pred in enumerate(result):
        if pred > 0: # XXX
          chance[all_cats[idx]] = pred.item()
      #print(chance)
      batch.append(chance)
    return batch
  elif tensor.ndimension() == 1:
    chance = dict()
    for idx, pred in enumerate(tensor):
      if idx >= len(all_cats):
        print(f"Idx {idx} not in {len(all_cats)} categories")
        continue
      #elif pred > 0: # XXX
      chance[all_cats[idx]] = pred.item()
    #print(chance)
    return chance
  else:
    raise ValueError("Only tensors with 1 or 2 dimensions are supported")
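
# Illustrative sketch (hypothetical category names and scores): with a
# cats_vocab of ['<unk>', 'kenya', 'sport', 'weather'], a 2-D row
# [0.0, 0.9, 0.0, 0.2] maps back to {'kenya': 0.9, 'weather': 0.2}, since the
# 2-D branch filters out entries that are not > 0; the 1-D branch keeps every
# category, zeros included.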

def train(dataloader, dataset, model, optimizer, criterion, epoch=0):
  total_acc, total_count = 0, 0
  log_interval = 500
  torch.set_printoptions(precision=2)

  batch = tqdm.tqdm(dataloader, unit="batch")
  for idx, data in enumerate(batch):
    batch.set_description(f"Train {epoch}.{idx}")
    text, cats, text_lengths = data

    optimizer.zero_grad()
    output = model(text, text_lengths)
    #print("output", output)
    #print("output shape", output.shape)

    loss = criterion(input=output, target=cats)
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 0.1)
    optimizer.step()
    print("train loss", loss)

    # The criterion is BCEWithLogitsLoss, so the model outputs raw logits;
    # threshold their sigmoid at 0.5 to get 0/1 labels
    predictions = torch.zeros(output.shape)
    predictions[torch.sigmoid(output) >= 0.5] = True
    predictions[torch.sigmoid(output) < 0.5] = False

    batch.clear()
    for target, out, pred in list(zip(cats, output, predictions)):
      expect = tensor2cat(dataset.cats_vocab, target)
      raw = tensor2cat(dataset.cats_vocab, out)
      predict = tensor2cat(dataset.cats_vocab, pred)
      print("Expected:  ", expect)
      print("Predicted: ", predict)
      print("Raw output:", raw)
      print("\n")
    batch.refresh()

    # Element-wise accuracy over all N samples and C categories in the batch
    N, C = cats.shape
    accuracy = (predictions == cats).sum() / (N*C) * 100
    total_acc += accuracy
    total_count += cats.size(0)

    batch.set_postfix({
      "accuracy": int(total_acc / total_count),
    })
    total_acc, total_count = 0, 0

def evaluate(dataloader, dataset, model, criterion, epoch=0):
  model.eval()
  total_acc, total_count = 0, 0

  with torch.no_grad():
    batch = tqdm.tqdm(dataloader, unit="batch")
    for idx, data in enumerate(batch):
      batch.set_description(f"Evaluate {epoch}.{idx}")
      text, cats, text_lengths = data

      output = model(text, text_lengths)
      #print("eval predicted", output)
      loss = criterion(output, cats)
      #print("eval loss", loss)

      # As in train(): the outputs are raw logits, so threshold their sigmoid
      predictions = torch.zeros(output.shape)
      predictions[torch.sigmoid(output) >= 0.5] = True
      predictions[torch.sigmoid(output) < 0.5] = False

      batch.clear()
      for target, out, pred in list(zip(cats, output, predictions)):
        expect = tensor2cat(dataset.cats_vocab, target)
        raw = tensor2cat(dataset.cats_vocab, out)
        predict = tensor2cat(dataset.cats_vocab, pred)
        print("Evaluate expected:  ", expect)
        print("Evaluate predicted: ", predict)
        print("Evaluate raw output:", raw)
        print("\n")
      batch.refresh()

      # Element-wise accuracy over all N samples and C categories in the batch
      N, C = cats.shape
      accuracy = (predictions == cats).sum() / (N*C) * 100
      total_acc += accuracy
      total_count += cats.size(0)

      batch.set_postfix({
        "accuracy": int(total_acc / total_count),
      })

  return total_acc / total_count
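
# Note on the accuracy above: it is element-wise over every (sample, category)
# cell. With sparse labels (say 2 true categories out of 50), an all-zero
# prediction already scores 48/50 = 96%, so the figure mostly tracks label
# sparsity rather than how well the true categories are recovered.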

def main():
  parser = argparse.ArgumentParser(
    description='Classify text data according to categories',
    add_help=True,
  )
  parser.add_argument('action',
    help='train or classify')
  parser.add_argument('--input', '-i',
    required=True,
    help='path of CSV file containing dataset')
  parser.add_argument('--model', '-m',
    #required=True, # XXX
    help='path to training model')
  parser.add_argument('--verbose', '-v',
    type=int, nargs='?',
    const=1, # Default value if -v is supplied
    default=0, # Default value if -v is not supplied
    help='print debugging')
  args = parser.parse_args()

  if args.action not in ('train', 'classify'):
    print("ERROR: action must be either train or classify")
    sys.exit(1)
  if args.action == 'classify' and (args.model is None or not os.path.isfile(args.model)):
    print("No model found for classification; running training instead")
    args.action = 'train'
  if not os.path.isfile(args.input):
    print(f"{args.input} is not a valid file")
    sys.exit(1)

  data = read_csv(input_csv=args.input, rows=story_num, verbose=args.verbose)

  # create a sorted list of all categories seen in the data
  global all_categories
  for cats in data.categories:
    for c in cats.split(";"):
      if c not in all_categories:
        all_categories.append(c)
  all_categories = sorted(all_categories)
  #print(all_categories)
  #print(len(all_categories))

  train_data, valid_data = split_dataset(data, verbose=args.verbose)

  train_dataset = TextCategoriesDataset(df=train_data,
    text_column="content",
    cats_column="categories",
    lang_column="language",
    verbose=args.verbose,
  )
  valid_dataset = TextCategoriesDataset(df=valid_data,
    text_column="content",
    cats_column="categories",
    lang_column="language",
    verbose=args.verbose,
  )
  #for text, cat in enumerate(train_dataset):
  #  print(text, cat)
  #print("-" * 20)
  #for text, cat in enumerate(valid_dataset):
  #  print(text, cat)

  # Pick the best available device (cuda, xpu, mps or cpu) for training
  device = (
    "cuda" if torch.cuda.is_available()
    else "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
  )
  print(f"Using {device} device")

  # Hyperparameters
  #epochs = 10
  epochs = 4        # number of training epochs
  # Initial learning rate: a value too small may result in a long training
  # process that could get stuck, whereas a value too large may result in
  # learning a sub-optimal set of weights too fast or an unstable training
  # process -- perhaps the most important hyperparameter. If you have time to
  # tune only one hyperparameter, tune the learning rate.
  #lr = 5
  #lr = 0.5
  #lr = 0.05
  #lr = 0.005
  lr = 0.0001
  batch_size = 64   # batch size for training
  #batch_size = 16
  #batch_size = 8
  #batch_size = 4
  #num_layers = 2   # 2-3 layers should be enough for an LSTM
  num_layers = 3    # 2-3 layers should be enough for an LSTM
  hidden_size = 128 # hidden size of the rnn module, should be tweaked manually
  #hidden_size = 8
  mean_seq = True   # use the mean of the rnn output
  #mean_seq = False
  # Weight decay helps the neural network learn smoother / simpler functions,
  # which most of the time generalise better than spiky, noisy ones; try 1e-3
  # or 1e-4.
  weight_decay = 1e-4
  #weight_decay = 1e-3
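
  # For reference, this is how weight_decay enters torch.optim.Adam (used
  # below): the decay is applied as an L2 penalty folded into the gradient,
  # roughly grad <- grad + weight_decay * param, before the Adam moment
  # updates (AdamW would instead decay the weights in a decoupled step).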

  train_dataloader = DataLoader(train_dataset,
    batch_size=batch_size,
    drop_last=True,
    shuffle=True,
    num_workers=0,
    collate_fn=CollateBatch(cats=train_dataset.cats_vocab.get_stoi(), pad_idx=train_dataset.stoi['<pad>']),
  )
  valid_dataloader = DataLoader(valid_dataset,
    batch_size=batch_size,
    drop_last=True,
    shuffle=True,
    num_workers=0,
    collate_fn=CollateBatch(cats=train_dataset.cats_vocab.get_stoi(), pad_idx=train_dataset.stoi['<pad>']),
  )
  #for i_batch, sample_batched in enumerate(train_dataloader):
  #  print(i_batch, sample_batched[0], sample_batched[1])

  input_size = len(train_dataset.text_vocab)
  # every output item is the likelihood of a particular category
  output_size = len(train_dataset.cats_vocab)
  embed = torch.empty(input_size, len(train_dataset)) # tokens per sample x samples
  embedding_size = embed.size(1) # was 64 (should be: samples)
  if args.verbose:
    #for i in train_dataset.text_vocab.get_itos():
    #  print(i)
    print("input_size: ", input_size)
    print("output_size:", output_size)
    print("embed shape:", embed.shape)
    print("embedding_size:", embedding_size, " (that is, number of samples)")

  model = RNN(
    #rnn_model='GRU',
    rnn_model='LSTM',
    vocab_size=input_size,
    embed_size=embedding_size,
    num_output=output_size,
    use_last=(not mean_seq),
    hidden_size=hidden_size,
    embedding_tensor=embed,
    num_layers=num_layers,
    batch_first=True
  )
  if args.verbose:
    print(model)

  # optimizer and loss
  criterion = nn.BCEWithLogitsLoss()
  #optimizer = torch.optim.SGD(model.parameters(), lr=lr)
  optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=weight_decay)
  if args.verbose:
    print(criterion)
    print(optimizer)

  total_accu = None
  e = tqdm.tqdm(range(1, epochs + 1), unit="epoch")
  for epoch in e:
    e.set_description(f"Epoch {epoch}")
    # Note: the datasets themselves have no .to() method; only the model is
    # moved to the selected device
    #train_dataset.to(device)
    #valid_dataset.to(device)
    model.to(device)
    model.train()
    train(train_dataloader, train_dataset, model, optimizer, criterion, epoch)
    accu_val = evaluate(valid_dataloader, valid_dataset, model, criterion, epoch)
    if total_accu is not None and total_accu > accu_val:
      optimizer.step()
    else:
      total_accu = accu_val
    e.set_postfix({
      "accuracy": accu_val.int().item(),
    })

  #print("Checking the results of test dataset.")
  #accu_test = evaluate(test_dataloader, test_dataset)
  #print("test accuracy {:8.3f}".format(accu_test))
  return


if __name__ == "__main__":
  main()

# vim: set expandtab shiftwidth=2 softtabstop=2: