#!/usr/bin/python
import argparse
import os
import sys
import pprint
import re
import string
import time
import warnings
# data manipulation
import csv
import random
import pandas as pd
import numpy as np
import itertools
import tqdm
# torch
import torch
import torchdata.datapipes as dp
import torchtext.transforms as T
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from models.rnn import RNN

all_categories = list()
# XXX None for all stories
#story_num = 128
#story_num = 256
#story_num = 512
#story_num = 1024
story_num = 4096
#story_num = None

def read_csv(input_csv, rows=None, verbose=0):
  if verbose > 0:
    with open(input_csv, 'r', encoding="utf-8") as f:
      data = pd.concat(
        [chunk for chunk in tqdm.tqdm(
          pd.read_csv(f,
            encoding="utf-8",
            quoting=csv.QUOTE_ALL,
            nrows=rows,
            chunksize=50,
          ),
          desc='Loading data'
        )])
  else:
    with open(input_csv, 'r', encoding="utf-8") as f:
      data = pd.read_csv(f,
        encoding="utf-8",
        quoting=csv.QUOTE_ALL,
        nrows=rows,
      )
  data.dropna(axis='index', inplace=True)
  #print(data)
  #sys.exit(0)
  return data
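
# Illustrative usage (hypothetical file name; the CSV is expected to provide
# the "content", "categories" and "language" columns accessed in main()):
#   data = read_csv("articles.csv", rows=story_num, verbose=1)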

'''
Create Training and Validation sets
'''
def split_dataset(data, verbose=0):
  # Create a shuffled list of all row indices
  data_idx = list(range(len(data)))
  np.random.shuffle(data_idx)

  # Split the indices into validation and train
  split_percent = 0.05
  num_valid = int(len(data) * split_percent)
  #num_tests = int(len(data) * split_percent)
  #train_idx = data_idx[num_valid:-num_tests]
  train_idx = data_idx[num_valid:]
  valid_idx = data_idx[:num_valid]
  #tests_idx = data_idx[-num_tests:]
  if verbose > 0:
    print("Length of train_data: {}".format(len(train_idx)))
    print("Length of valid_data: {}".format(len(valid_idx)))
    #print("Length of tests_data: {}".format(len(tests_idx)))

  # Create the training and validation sets, as dataframes
  train_data = data.iloc[train_idx].reset_index().drop('index', axis=1)
  valid_data = data.iloc[valid_idx].reset_index().drop('index', axis=1)
  #tests_data = data.iloc[tests_idx].reset_index().drop('index', axis=1)
  #return (train_data, valid_data, tests_data)
  return (train_data, valid_data)
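
# Worked example: with the default story_num of 4096 (and assuming no rows
# are dropped by dropna), num_valid = int(4096 * 0.05) = 204 validation rows,
# leaving 3892 training rows.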

'''
Create a dataset that builds a tokenised vocabulary and then, as each row is
accessed, transforms it into sequences of numerical token and category indices.
'''
class TextCategoriesDataset(Dataset):
  ''' Dataset of Text and Categories '''
  def __init__(self, df, text_column, cats_column, lang_column, transform=None, verbose=0):
    '''
    Arguments:
      df (pandas.DataFrame): csv content, loaded as dataframe
      text_column (str): the name of the column containing the text
      cats_column (str): the name of the column containing
        semicolon-separated categories
      lang_column (str): the name of the column containing the language
      transform (callable, optional): optional transform to be
        applied on a sample
    '''
    self.df = df
    self.transform = transform
    self.verbose = verbose

    self.text = self.df[text_column]
    self.cats = self.df[cats_column]
    self.lang = self.df[lang_column]

    # index-to-token dict
    # <pad> : padding, used for padding the shorter sentences in a batch
    #         to match the length of the longest sentence in the batch
    # <sos> : start of sentence token
    # <eos> : end of sentence token
    # <unk> : unknown token: words which are not found in the vocab are
    #         replaced by this token
    self.itos = {0: '<pad>', 1: '<sos>', 2: '<eos>', 3: '<unk>'}
    # token-to-index dict
    self.stoi = {k: j for j, k in self.itos.items()}

    # Create vocabularies upon initialisation
    self.text_vocab = build_vocab_from_iterator(
      [self.textTokens(text) for i, text in self.df[text_column].items()],
      min_freq=2,
      specials=self.itos.values(),
      special_first=True
    )
    self.text_vocab.set_default_index(self.text_vocab['<unk>'])
    #print(self.text_vocab.get_itos())

    self.cats_vocab = build_vocab_from_iterator(
      #[self.catTokens(cats) for i, cats in self.df[cats_column].items()],
      [self.catTokens(all_categories)],
      min_freq=1,
      specials=['<unk>'],
      special_first=True
    )
    self.cats_vocab.set_default_index(self.cats_vocab['<unk>'])
    #print(self.cats_vocab.get_itos())

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    # Enable use as a plain iterator
    if idx not in self.df.index:
      raise StopIteration
    if torch.is_tensor(idx):
      idx = idx.tolist()

    # Get the raw data
    text = self.text[idx]
    cats = self.cats[idx]
    lang = self.lang[idx]

    if self.transform:
      text, cats = self.transform(text, cats)
    #print(cats)
    #print(self.catTokens(cats))
    #print(self.getTransform(self.cats_vocab, "cats")(self.catTokens(cats)))

    # Numericalise by applying transforms
    return (
      self.getTransform(self.text_vocab, "text")(self.textTokens(text)),
      self.getTransform(self.cats_vocab, "cats")(self.catTokens(cats)),
    )

  @staticmethod
  def textTokens(text):
    if isinstance(text, str):
      return [word for word in text.split()]

  @staticmethod
  def catTokens(cats):
    if isinstance(cats, str):
      return [cat for cat in cats.split(';')]
    elif isinstance(cats, list):
      return [cat for cat in cats]

  def getTransform(self, vocab, vType):
    '''
    Create transforms based on the given vocabulary. The returned transform
    is applied to a sequence of tokens.
    '''
    if vType == "text":
      return T.Sequential(
        # convert the tokens to indices based on the given vocabulary
        T.VocabTransform(vocab=vocab),
        # Add <sos> at the beginning of each sentence; its index in the
        # vocabulary is 1, as defined in self.itos
        T.AddToken(self.text_vocab['<sos>'], begin=True),
        # Add <eos> at the end of each sentence; its index in the
        # vocabulary is 2, as defined in self.itos
        T.AddToken(self.text_vocab['<eos>'], begin=False)
      )
    elif vType == "cats":
      return T.Sequential(
        # convert the categories to indices based on the given vocabulary
        T.VocabTransform(vocab=vocab),
      )
    else:
      raise ValueError('wrong transformation type: {}'.format(vType))
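
# Illustrative sketch of what __getitem__ returns (token indices below are
# hypothetical, not taken from a real vocabulary): a row with text
# "rain in nairobi" and categories "weather;kenya" comes back roughly as
#   ([1, 87, 10, 1543, 2], [4, 12])
# where <sos>=1 and <eos>=2 wrap the text indices, and each category is a
# bare index into cats_vocab.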

'''
Now that we have a dataset, let's create a dataloader callback;
the dataloader can batch, shuffle, and load the data in parallel
'''
class CollateBatch:
  '''
  We need to pad the shorter sentences in a batch so that all the sequences
  in a batch are of equal length. We can do this with a collate_fn callback
  class, which returns tensors.
  '''
  def __init__(self, pad_idx, cats):
    '''
    pad_idx (int): the index of the "<pad>" token in the vocabulary.
    cats (dict): the category string-to-index mapping of the dataset.
    '''
    self.pad_idx = pad_idx
    self.cats = cats

  def __call__(self, batch):
    '''
    batch: a list of tuples with (text, cats), each of which
    is a list of tokens
    '''
    batch_text, batch_cats = zip(*batch)

    # Pad each text sequence to the longest one in the batch
    text_tensor = nn.utils.rnn.pad_sequence(
      [torch.LongTensor(s) for s in batch_text],
      batch_first=True, padding_value=self.pad_idx
    )
    # Record the unpadded length of each text sequence
    text_lengths = torch.tensor([len(s) for s in batch_text])

    # Convert cats to a multi-label one-hot representation;
    # add one to all_categories to account for <unk>
    cats_tensor = torch.full((len(batch_cats), len(all_categories)+1), self.pad_idx).float()
    for idx, cats in enumerate(batch_cats):
      #print("\nsample", idx, cats)
      for c in cats:
        cats_tensor[idx][c] = 1
      #print(cats_tensor[idx])

    # Sort the batch by descending text length (packed/padded RNN inputs
    # typically expect this ordering), permuting the category tensor to match
    text_lengths, perm_idx = text_lengths.sort(0, descending=True)
    text_tensor = text_tensor[perm_idx]
    cats_tensor = cats_tensor[perm_idx]

    #print("text", text_tensor)
    #print("text shape:", text_tensor.shape)
    #print(cats_tensor)
    #print("cats shape:", cats_tensor.shape)
    #print(text_lengths)
    #print("text_lengths shape:", text_lengths.shape)
    return (
      text_tensor,
      cats_tensor,
      text_lengths,
    )
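
# Illustrative sketch of the collate output, assuming a toy batch of two
# samples with 5 and 3 text tokens and pad_idx == 0:
#   text_tensor  -> shape (2, 5), with the shorter row right-padded with 0
#   cats_tensor  -> shape (2, len(all_categories)+1), 1.0 where a category
#                   applies and 0.0 elsewhere
#   text_lengths -> tensor([5, 3]), sorted to match the reordered batch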

def cat2tensor(label_vocab, labels, pad_idx: int):
  all_labels = label_vocab.get_itos()
  num_labels = len(all_labels)
  # add a slot for <unk> if the vocabulary does not already contain it
  if '<unk>' not in all_labels:
    num_labels += 1

  labels_tensor = torch.full((len(labels), num_labels), pad_idx).float()
  labels_lengths = torch.LongTensor(list(map(len, labels)))
  for idx, sample_labels in enumerate(labels):
    #print("\nsample", idx, sample_labels)
    for l in sample_labels:
      labels_tensor[idx][l] = 1
    #print(labels_tensor[idx])
  return labels_tensor


def tensor2cat(vocab, tensor):
  all_cats = vocab.get_itos()
  if tensor.ndimension() == 2:
    batch = list()
    for result in tensor:
      chance = dict()
      for idx, pred in enumerate(result):
        if pred > 0: # XXX
          chance[all_cats[idx]] = pred.item()
      #print(chance)
      batch.append(chance)
    return batch
  elif tensor.ndimension() == 1:
    chance = dict()
    for idx, pred in enumerate(tensor):
      if idx >= len(all_cats):
        print(f"Idx {idx} not in {len(all_cats)} categories")
        continue
      #elif pred > 0: # XXX
      chance[all_cats[idx]] = pred.item()
    #print(chance)
    return chance
  else:
    raise ValueError("Only tensors with 1 or 2 dimensions are supported")
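
# Illustrative sketch (hypothetical category names and scores): with a
# cats_vocab of ['<unk>', 'kenya', 'sport', 'weather'], a 2-D row
# [0.0, 0.9, 0.0, 0.2] maps back to {'kenya': 0.9, 'weather': 0.2}, since the
# 2-D branch filters out entries that are not > 0; the 1-D branch keeps every
# category, zeros included.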

def train(dataloader, dataset, model, optimizer, criterion, epoch=0):
  total_acc, total_count = 0, 0
  log_interval = 500
  torch.set_printoptions(precision=2)

  batch = tqdm.tqdm(dataloader, unit="batch")
  for idx, data in enumerate(batch):
    batch.set_description(f"Train {epoch}.{idx}")
    text, cats, text_lengths = data

    optimizer.zero_grad()
    output = model(text, text_lengths)
    #print("output", output)
    #print("output shape", output.shape)

    loss = criterion(input=output, target=cats)
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 0.1)
    optimizer.step()
    print("train loss", loss)

    # The criterion is BCEWithLogitsLoss, so the model outputs raw logits;
    # threshold their sigmoid at 0.5 to get 0/1 labels
    predictions = torch.zeros(output.shape)
    predictions[torch.sigmoid(output) >= 0.5] = True
    predictions[torch.sigmoid(output) < 0.5] = False

    batch.clear()
    for target, out, pred in list(zip(cats, output, predictions)):
      expect = tensor2cat(dataset.cats_vocab, target)
      raw = tensor2cat(dataset.cats_vocab, out)
      predict = tensor2cat(dataset.cats_vocab, pred)
      print("Expected:  ", expect)
      print("Predicted: ", predict)
      print("Raw output:", raw)
      print("\n")
    batch.refresh()

    # Element-wise accuracy over all N samples and C categories in the batch
    N, C = cats.shape
    accuracy = (predictions == cats).sum() / (N*C) * 100
    total_acc += accuracy
    total_count += cats.size(0)

    batch.set_postfix({
      "accuracy": int(total_acc / total_count),
    })
    total_acc, total_count = 0, 0

def evaluate(dataloader, dataset, model, criterion, epoch=0):
  model.eval()
  total_acc, total_count = 0, 0

  with torch.no_grad():
    batch = tqdm.tqdm(dataloader, unit="batch")
    for idx, data in enumerate(batch):
      batch.set_description(f"Evaluate {epoch}.{idx}")
      text, cats, text_lengths = data

      output = model(text, text_lengths)
      #print("eval predicted", output)
      loss = criterion(output, cats)
      #print("eval loss", loss)

      # As in train(): the outputs are raw logits, so threshold their sigmoid
      predictions = torch.zeros(output.shape)
      predictions[torch.sigmoid(output) >= 0.5] = True
      predictions[torch.sigmoid(output) < 0.5] = False

      batch.clear()
      for target, out, pred in list(zip(cats, output, predictions)):
        expect = tensor2cat(dataset.cats_vocab, target)
        raw = tensor2cat(dataset.cats_vocab, out)
        predict = tensor2cat(dataset.cats_vocab, pred)
        print("Evaluate expected:  ", expect)
        print("Evaluate predicted: ", predict)
        print("Evaluate raw output:", raw)
        print("\n")
      batch.refresh()

      # Element-wise accuracy over all N samples and C categories in the batch
      N, C = cats.shape
      accuracy = (predictions == cats).sum() / (N*C) * 100
      total_acc += accuracy
      total_count += cats.size(0)

      batch.set_postfix({
        "accuracy": int(total_acc / total_count),
      })

  return total_acc / total_count
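
# Note on the accuracy above: it is element-wise over every (sample, category)
# cell. With sparse labels (say 2 true categories out of 50), an all-zero
# prediction already scores 48/50 = 96%, so the figure mostly tracks label
# sparsity rather than how well the true categories are recovered.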

def main():
  parser = argparse.ArgumentParser(
    description='Classify text data according to categories',
    add_help=True,
  )
  parser.add_argument('action',
    help='train or classify')
  parser.add_argument('--input', '-i',
    required=True,
    help='path of CSV file containing dataset')
  parser.add_argument('--model', '-m',
    #required=True, # XXX
    help='path to training model')
  parser.add_argument('--verbose', '-v',
    type=int, nargs='?',
    const=1, # Default value if -v is supplied
    default=0, # Default value if -v is not supplied
    help='print debugging')
  args = parser.parse_args()

  if args.action not in ('train', 'classify'):
    print("ERROR: action must be either train or classify")
    sys.exit(1)
  if args.action == 'classify' and (args.model is None or not os.path.isfile(args.model)):
    print("No model found for classification; running training instead")
    args.action = 'train'
  if not os.path.isfile(args.input):
    print(f"{args.input} is not a valid file")
    sys.exit(1)

  data = read_csv(input_csv=args.input, rows=story_num, verbose=args.verbose)

  # create a sorted list of all categories seen in the data
  global all_categories
  for cats in data.categories:
    for c in cats.split(";"):
      if c not in all_categories:
        all_categories.append(c)
  all_categories = sorted(all_categories)
  #print(all_categories)
  #print(len(all_categories))

  train_data, valid_data = split_dataset(data, verbose=args.verbose)

  train_dataset = TextCategoriesDataset(df=train_data,
    text_column="content",
    cats_column="categories",
    lang_column="language",
    verbose=args.verbose,
  )
  valid_dataset = TextCategoriesDataset(df=valid_data,
    text_column="content",
    cats_column="categories",
    lang_column="language",
    verbose=args.verbose,
  )
  #for text, cat in enumerate(train_dataset):
  #  print(text, cat)
  #print("-" * 20)
  #for text, cat in enumerate(valid_dataset):
  #  print(text, cat)

  # Pick the best available device (cuda, xpu, mps or cpu) for training
  device = (
    "cuda" if torch.cuda.is_available()
    else "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
  )
  print(f"Using {device} device")

  # Hyperparameters
  #epochs = 10
  epochs = 4        # number of training epochs
  # Initial learning rate: a value too small may result in a long training
  # process that could get stuck, whereas a value too large may result in
  # learning a sub-optimal set of weights too fast or an unstable training
  # process -- perhaps the most important hyperparameter. If you have time to
  # tune only one hyperparameter, tune the learning rate.
  #lr = 5
  #lr = 0.5
  #lr = 0.05
  #lr = 0.005
  lr = 0.0001
  batch_size = 64   # batch size for training
  #batch_size = 16
  #batch_size = 8
  #batch_size = 4
  #num_layers = 2   # 2-3 layers should be enough for an LSTM
  num_layers = 3    # 2-3 layers should be enough for an LSTM
  hidden_size = 128 # hidden size of the rnn module, should be tweaked manually
  #hidden_size = 8
  mean_seq = True   # use the mean of the rnn output
  #mean_seq = False
  # Weight decay helps the neural network learn smoother / simpler functions,
  # which most of the time generalise better than spiky, noisy ones; try 1e-3
  # or 1e-4.
  weight_decay = 1e-4
  #weight_decay = 1e-3
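
  # For reference, this is how weight_decay enters torch.optim.Adam (used
  # below): the decay is applied as an L2 penalty folded into the gradient,
  # roughly grad <- grad + weight_decay * param, before the Adam moment
  # updates (AdamW would instead decay the weights in a decoupled step).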

  train_dataloader = DataLoader(train_dataset,
    batch_size=batch_size,
    drop_last=True,
    shuffle=True,
    num_workers=0,
    collate_fn=CollateBatch(cats=train_dataset.cats_vocab.get_stoi(), pad_idx=train_dataset.stoi['<pad>']),
  )
  valid_dataloader = DataLoader(valid_dataset,
    batch_size=batch_size,
    drop_last=True,
    shuffle=True,
    num_workers=0,
    collate_fn=CollateBatch(cats=train_dataset.cats_vocab.get_stoi(), pad_idx=train_dataset.stoi['<pad>']),
  )
  #for i_batch, sample_batched in enumerate(train_dataloader):
  #  print(i_batch, sample_batched[0], sample_batched[1])

  input_size = len(train_dataset.text_vocab)
  # every output item is the likelihood of a particular category
  output_size = len(train_dataset.cats_vocab)
  embed = torch.empty(input_size, len(train_dataset)) # tokens per sample x samples
  embedding_size = embed.size(1) # was 64 (should be: samples)
  if args.verbose:
    #for i in train_dataset.text_vocab.get_itos():
    #  print(i)
    print("input_size: ", input_size)
    print("output_size:", output_size)
    print("embed shape:", embed.shape)
    print("embedding_size:", embedding_size, " (that is, number of samples)")

  model = RNN(
    #rnn_model='GRU',
    rnn_model='LSTM',
    vocab_size=input_size,
    embed_size=embedding_size,
    num_output=output_size,
    use_last=(not mean_seq),
    hidden_size=hidden_size,
    embedding_tensor=embed,
    num_layers=num_layers,
    batch_first=True
  )
  if args.verbose:
    print(model)

  # optimizer and loss
  criterion = nn.BCEWithLogitsLoss()
  #optimizer = torch.optim.SGD(model.parameters(), lr=lr)
  optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=weight_decay)
  if args.verbose:
    print(criterion)
    print(optimizer)

  total_accu = None
  e = tqdm.tqdm(range(1, epochs + 1), unit="epoch")
  for epoch in e:
    e.set_description(f"Epoch {epoch}")
    # Note: the datasets themselves have no .to() method; only the model is
    # moved to the selected device
    #train_dataset.to(device)
    #valid_dataset.to(device)
    model.to(device)
    model.train()
    train(train_dataloader, train_dataset, model, optimizer, criterion, epoch)
    accu_val = evaluate(valid_dataloader, valid_dataset, model, criterion, epoch)
    if total_accu is not None and total_accu > accu_val:
      optimizer.step()
    else:
      total_accu = accu_val
    e.set_postfix({
      "accuracy": accu_val.int().item(),
    })

  #print("Checking the results of test dataset.")
  #accu_test = evaluate(test_dataloader, test_dataset)
  #print("test accuracy {:8.3f}".format(accu_test))
  return


if __name__ == "__main__":
  main()

# vim: set expandtab shiftwidth=2 softtabstop=2: