#!/usr/bin/python

import argparse
import os
import sys
import pprint
import re
import string
import time
import warnings

# data manipulation
import csv
import random
import pandas as pd
import numpy as np
import itertools
import tqdm

# torch
import torch
import torchdata.datapipes as dp
import torchtext.transforms as T
import torchtext.vocab as vocab
from torch import nn
from torch.utils.data import Dataset, DataLoader


xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

# XXX None for all stories
story_num = 128
#story_num = 256
#story_num = 512
#story_num = 1024
#story_num = 4096
#story_num = None

def read_csv(input_csv, rows=None, verbose=0):
  if verbose > 0:
    with open(input_csv, 'r', encoding="utf-8") as f:
      data = pd.concat(
        [chunk for chunk in tqdm.tqdm(
          pd.read_csv(f,
                      encoding="utf-8",
                      quoting=csv.QUOTE_ALL,
                      index_col=0,
                      nrows=rows,
                      chunksize=50,
                      ),
          desc='Loading data'
        )])
  else:
    with open(input_csv, 'r', encoding="utf-8") as f:
      data = pd.read_csv(f,
                         encoding="utf-8",
                         quoting=csv.QUOTE_ALL,
                         index_col=0,
                         nrows=rows,
                         )

  #print(data)
  #sys.exit(0)
  return data

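# read_csv() expects the CSV layout used by main() below: column 0 is the
# index, followed by a "language" column, a "content" column holding the text,
# and then one column per category. `rows` caps how many stories are loaded
# (see story_num above).
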
'''
Create Training and Validation sets
'''
def split_dataset(data, verbose=0):
  # Create a list of ints till len of data
  data_idx = list(range(len(data)))
  np.random.shuffle(data_idx)

  # Get indexes for validation and train
  split_percent = 0.05
  num_valid = int(len(data) * split_percent)
  #num_tests = int(len(data) * split_percent)
  #train_idx = data_idx[num_valid:-num_tests]
  train_idx = data_idx[num_valid:]
  valid_idx = data_idx[:num_valid]
  #tests_idx = data_idx[-num_tests:]
  if verbose > 0:
    print("Length of train_data: {}".format(len(train_idx)))
    print("Length of valid_data: {}".format(len(valid_idx)))
    #print("Length of tests_data: {}".format(len(tests_idx)))

  # Create the training and validation sets, as dataframes
  train_data = data.iloc[train_idx].reset_index()
  valid_data = data.iloc[valid_idx].reset_index()
  #tests_data = data.iloc[tests_idx].reset_index()
  #return(train_data, valid_data, tests_data)
  return (train_data, valid_data)

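# Example split with the default story_num = 128:
#   num_valid = int(128 * 0.05) = 6 rows for validation, 122 rows for training.
# The shuffle uses numpy's global RNG, so call seed_everything() (defined
# below, still a TODO in main()) first if the split needs to be reproducible.
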
'''
Create a dataset that builds a tokenised vocabulary, and then, as each row
is accessed, transforms it into a sequence of token indices plus a vector of
category labels
'''
class TextCategoriesDataset(Dataset):
  ''' Dataset of Text and Categories '''
  def __init__(self, df, lang_column, text_column, first_cats_column=0, transform=None, verbose=0):
    '''
    Arguments:
      df (pandas.DataFrame): csv content, loaded as dataframe
      lang_column (str): the name of the column containing the language
      text_column (str): the name of the column containing the text
      first_cats_column (int): the index of the first column containing
                               a category
      transform (callable, optional): Optional transform to be applied
                                      on a sample.
    '''
    self.df = df
    self.transform = transform
    self.verbose = verbose

    self.lang = self.df[lang_column]
    self.text = self.df[text_column]
    self.cats = self.df.iloc[:, first_cats_column:].sort_index(axis="columns")
    self.cats_vocab = self.cats.columns

    # Pre-trained XLM-R vocabulary, loaded once and reused by textTransform();
    # exposing it here also lets callers query its size (e.g. for the model's
    # input dimension)
    self.text_vocab = torch.hub.load_state_dict_from_url(xlmr_vocab_path)

    # index-to-token dict
    # <pad> : padding, used for padding the shorter sentences in a batch
    #         to match the length of longest sentence in the batch
    # <sos> : start of sentence token
    # <eos> : end of sentence token
    # <unk> : unknown token: words which are not found in the vocab are
    #         replaced by this token
    # The indices follow the XLM-R SentencePiece vocabulary convention:
    # <s>=0, <pad>=1, </s>=2, <unk>=3
    self.itos = {0: '<sos>', 1: '<pad>', 2: '<eos>', 3: '<unk>'}
    # token-to-index dict
    self.stoi = {k: j for j, k in self.itos.items()}

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    # Enable use as a plain iterator
    if idx not in self.df.index:
      raise StopIteration

    if torch.is_tensor(idx):
      idx = idx.tolist()

    # Get the raw data
    lang = self.lang[idx]
    text = self.text[idx]
    cats = self.cats.iloc[idx]

    #print(self.textTransform()(text))
    #print(cats)
    #print(cats.fillna(0).values)

    if self.transform:
      text, cats = self.transform(text, cats)

    # Numericalise text by applying transforms, and cats by converting
    # NaN to zeros and stripping the index
    return (
      self.textTransform()(text),
      cats.fillna(0).values,
    )

  def textTransform(self):
    '''
    Create transforms based on the given vocabulary. The returned transform
    is applied to a sequence of tokens.
    '''
    return T.Sequential(
      # converts the sentences to indices based on given vocabulary using SentencePiece
      T.SentencePieceTokenizer(xlmr_spm_model_path),
      T.VocabTransform(self.text_vocab),
      # Add <sos> at the beginning of each sentence, using its index in the
      # vocabulary (0 for XLM-R's <s>)
      T.AddToken(self.stoi['<sos>'], begin=True),
      # Add <eos> at the end of each sentence, using its index in the
      # vocabulary (2 for XLM-R's </s>)
      T.AddToken(self.stoi['<eos>'], begin=False)
    )

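# Each item from TextCategoriesDataset is a (token_ids, category_vector) pair,
# roughly like (illustrative values only):
#   ([0, 87, 2501, ..., 2], array([0., 1., 0., ..., 1.]))
# where 0 and 2 are the <sos>/<eos> ids added by textTransform() and the array
# has one entry per column in cats_vocab (NaN replaced by 0).
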
'''
Now that we have a dataset, let's create a dataloader callback;
the dataloader can batch, shuffle, and load the data in parallel
'''

class CollateBatch:
  '''
  We need to pad shorter sentences in a batch to make all the sequences
  in a batch of equal length. We can do this with a collate_fn callback
  class, which returns a tensor
  '''
  def __init__(self, pad_idx):
    '''
    pad_idx (int): the index of the "<pad>" token in the vocabulary.
    '''
    self.pad_idx = pad_idx

  def __call__(self, batch):
    '''
    batch: a list of tuples with (text, cats), each of which
           is a list of tokens
    '''
    batch_text, batch_cats = zip(*batch)

    # Record the original (unpadded) length of each text, so the model can
    # pack the padded sequences later
    text_lengths = torch.tensor([len(s) for s in batch_text])

    # Pad text to the longest
    text_tensor = nn.utils.rnn.pad_sequence(
      [torch.LongTensor(s) for s in batch_text],
      batch_first=True, padding_value=self.pad_idx
    )

    cats_tensor = torch.tensor(batch_cats, dtype=torch.float32)

    #print("text", text_tensor)
    #print("text shape:", text_tensor.shape)
    #print(cats_tensor)
    #print("cats shape:", cats_tensor.shape)
    #print(text_lengths)
    #print("text_lengths shape:", text_lengths.shape)
    #sys.exit(0)

    return (
      text_tensor,
      cats_tensor,
      text_lengths,
    )

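# CollateBatch is passed to DataLoader as collate_fn (see main() below); for a
# batch of size B with C categories and a longest text of T tokens it yields
#   text_tensor:  LongTensor of shape (B, T), padded with pad_idx
#   cats_tensor:  FloatTensor of shape (B, C)
#   text_lengths: LongTensor of shape (B,) holding the unpadded lengths
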
def tensor2cat(dataset, tensor):
  '''
  Map a tensor of per-category scores back to a dict of category name ->
  score, using the category vocabulary of the given dataset. Accepts either
  a single 1-D result or a 2-D batch of results.
  '''
  cats = dataset.cats_vocab
  if tensor.ndimension() == 2:
    batch = list()
    for result in tensor:
      chance = dict()
      for idx, pred in enumerate(result):
        if pred > 0:  # XXX
          chance[cats[idx]] = pred.item()
      batch.append(chance)
    return batch
  elif tensor.ndimension() == 1:
    chance = dict()
    for idx, pred in enumerate(tensor):
      if idx >= len(cats):
        print(f"Idx {idx} not in {len(cats)} categories")
      elif pred > 0:  # XXX
        chance[cats[idx]] = pred.item()
    return chance
  else:
    raise ValueError("Only tensors with 1 dimension or batches with 2 dimensions are supported")

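# Illustrative example (actual category names depend on the CSV's columns):
# with cats_vocab == ['cat1', ..., 'cat5'],
#   tensor2cat(dataset, torch.tensor([0, 0, 0, 1., 0.9]))
# returns {'cat4': 1.0, 'cat5': 0.9}; zero and negative scores are dropped.
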
def train(dataloader, dataset, model, optimizer, criterion, epoch=0):
  total_acc, total_count = 0, 0
  log_interval = 500

  torch.set_printoptions(precision=2)

  # Run on whichever device the model was moved to in main()
  device = next(model.parameters()).device

  batch = tqdm.tqdm(dataloader, unit="batch")
  for idx, data in enumerate(batch):
    batch.set_description(f"Train {epoch}.{idx}")
    text, cats, text_lengths = data
    text, cats = text.to(device), cats.to(device)
    optimizer.zero_grad()

    output = model(text, text_lengths)
    #print("output", output)
    #print("output shape", output.shape)

    loss = criterion(input=output, target=cats)
    loss.backward()

    nn.utils.clip_grad_norm_(model.parameters(), 0.1)

    optimizer.step()

    print("train loss", loss)

    ##predicted = np.round(output)
    ##total_acc += (predicted == cats).sum().item()

    # criterion is BCEWithLogitsLoss, so output holds raw logits; apply a
    # sigmoid before thresholding the per-category probabilities at 0.5
    probs = torch.sigmoid(output)
    predictions = torch.zeros(output.shape, device=output.device)
    #predictions[probs >= 0.25] = True
    predictions[probs >= 0.5] = True
    predictions[probs < 0.5] = False  ## assign 0 label to those with less than 0.5

    batch.clear()
    for target, out, pred in list(zip(cats, output, predictions)):
      expect = tensor2cat(dataset, target)
      raw = tensor2cat(dataset, out)
      predict = tensor2cat(dataset, pred)
      print("Expected:  ", expect)
      print("Predicted: ", predict)
      print("Raw output:", raw)
      print("\n")
    batch.refresh()

    N, C = cats.shape
    #print("eq", (predictions == cats))
    #print("sum", (predictions == cats).sum())
    #print("accuracy", (predictions == cats).sum() / (N*C) * 100)
    accuracy = (predictions == cats).sum() / (N*C) * 100
    total_acc += accuracy
    #print("train accuracy", accuracy)
    #print("train total_acc", total_acc)
    total_count += cats.size(0)
    batch.set_postfix({
      "accuracy": int(total_acc / total_count),
    })
    total_acc, total_count = 0, 0

def evaluate(dataloader, dataset, model, criterion, epoch=0):
  model.eval()
  total_acc, total_count = 0, 0

  # Run on whichever device the model was moved to in main()
  device = next(model.parameters()).device

  with torch.no_grad():
    batch = tqdm.tqdm(dataloader, unit="batch")
    for idx, data in enumerate(batch):
      batch.set_description(f"Evaluate {epoch}.{idx}")
      text, cats, text_lengths = data
      text, cats = text.to(device), cats.to(device)

      output = model(text, text_lengths)
      #print("eval predicted", output)

      loss = criterion(output, cats)
      #print("eval loss", loss)

      # As in train(), output holds raw logits; threshold the sigmoid at 0.5
      probs = torch.sigmoid(output)
      predictions = torch.zeros(output.shape, device=output.device)
      predictions[probs >= 0.5] = True
      predictions[probs < 0.5] = False  ## assign 0 label to those with less than 0.5

      batch.clear()
      for target, out, pred in list(zip(cats, output, predictions)):
        expect = tensor2cat(dataset, target)
        raw = tensor2cat(dataset, out)
        predict = tensor2cat(dataset, pred)
        print("Evaluate expected:  ", expect)
        print("Evaluate predicted: ", predict)
        print("Evaluate raw output:", raw)
        print("\n")
      batch.refresh()

      ##total_acc += (predicted_cats.argmax(1) == cats).sum().item()
      N, C = cats.shape
      accuracy = (predictions == cats).sum() / (N*C) * 100
      total_acc += accuracy
      #print("eval accuracy", accuracy)
      #print("eval total_acc", total_acc)
      total_count += cats.size(0)

    batch.set_postfix({
      "accuracy": int(total_acc / total_count),
    })

  return total_acc / total_count

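# The accuracy above is element-wise over the N x C multi-label matrix: each
# of the N samples contributes C per-category hits or misses. For example, a
# batch of N=2 samples with C=4 categories and 6 matching entries scores
# 6 / 8 * 100 = 75%.
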
# TODO seeding:
def seed_everything(seed=42):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  # Some cudnn methods can be random even after fixing the seed
  # unless you tell it to be deterministic
  torch.backends.cudnn.deterministic = True

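# ---------------------------------------------------------------------------
# NOTE: main() below instantiates RNN(...), but no RNN class is defined or
# imported in this file. The class below is a minimal sketch that matches the
# keyword arguments and the forward(text, text_lengths) call used in main();
# it is an assumption, not the original model, and should be replaced by (or
# reconciled with) the real implementation.
# ---------------------------------------------------------------------------
class RNN(nn.Module):
  def __init__(self, vocab_size, embed_size, num_output, rnn_model='LSTM',
               use_last=True, embedding_tensor=None, hidden_size=64,
               num_layers=1, batch_first=True):
    super().__init__()
    self.use_last = use_last
    if embedding_tensor is not None:
      # main() passes an uninitialised torch.empty tensor, so only its shape
      # is trusted here; use nn.Embedding.from_pretrained() for real vectors
      embed_size = embedding_tensor.size(1)
    self.encoder = nn.Embedding(vocab_size, embed_size)
    self.dropout = nn.Dropout(p=0.5)
    rnn_cls = nn.LSTM if rnn_model == 'LSTM' else nn.GRU
    self.rnn = rnn_cls(input_size=embed_size, hidden_size=hidden_size,
                       num_layers=num_layers, batch_first=batch_first,
                       bidirectional=True)
    self.fc = nn.Linear(hidden_size * 2, num_output)

  def forward(self, text, text_lengths):
    # text: (batch, seq_len) token ids; text_lengths: (batch,) unpadded lengths
    embedded = self.dropout(self.encoder(text))
    packed = nn.utils.rnn.pack_padded_sequence(
      embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
    packed_out, _ = self.rnn(packed)
    out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

    if self.use_last:
      # hidden state at each sequence's last real (unpadded) time step
      idx = (text_lengths - 1).clamp(min=0).to(out.device)
      feats = out[torch.arange(out.size(0), device=out.device), idx]
    else:
      # simple mean over time steps (mean_seq=True in main() selects this
      # branch; padded positions count as zeros)
      feats = out.mean(dim=1)

    # raw logits, one per category; main() pairs this with BCEWithLogitsLoss
    return self.fc(feats)
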
def main():
  parser = argparse.ArgumentParser(
    description='Classify text data according to categories',
    add_help=True,
  )
  parser.add_argument('action',
                      help='train or classify')
  parser.add_argument('--input', '-i',
                      required=True,
                      help='path of CSV file containing dataset')
  parser.add_argument('--model', '-m',
                      #required=True, # XXX
                      help='path to training model')
  parser.add_argument('--verbose', '-v',
                      type=int, nargs='?',
                      const=1,   # Default value if -v is supplied
                      default=0, # Default value if -v is not supplied
                      help='print debugging')
  args = parser.parse_args()

  if args.action != 'train' and args.action != 'classify':
    print("ERROR: train or classify data")
    sys.exit(1)

  if args.action == 'classify' and (args.model is None or not os.path.isfile(args.model)):
    print("No model found for classification; running training instead")
    args.action = 'train'

  if not os.path.isfile(args.input):
    print(f"{args.input} is not a valid file")
    sys.exit(1)

  data = read_csv(input_csv=args.input, rows=story_num, verbose=args.verbose)

  train_data, valid_data = split_dataset(data, verbose=args.verbose)

  '''
  dataset = TextCategoriesDataset(df=data,
                                  lang_column="language",
                                  text_column="content",
                                  first_cats_column=data.columns.get_loc("content")+1,
                                  verbose=args.verbose,
                                  )
  '''
  train_dataset = TextCategoriesDataset(df=train_data,
                                        lang_column="language",
                                        text_column="content",
                                        first_cats_column=train_data.columns.get_loc("content")+1,
                                        verbose=args.verbose,
                                        )
  valid_dataset = TextCategoriesDataset(df=valid_data,
                                        lang_column="language",
                                        text_column="content",
                                        first_cats_column=valid_data.columns.get_loc("content")+1,
                                        verbose=args.verbose,
                                        )
  #for text, cat in enumerate(train_dataset):
  #  print(text, cat)
  #print("-" * 20)
  #for text, cat in enumerate(valid_dataset):
  #  print(text, cat)
  #print(tensor2cat(train_dataset, torch.tensor([0, 0, 0, 1., 0.9])))
  #sys.exit(0)

  # Get cpu, gpu or mps device for training.
  # Move tensors to the NVIDIA GPU if available
  device = (
    "cuda" if torch.cuda.is_available()
    else "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
  )
  print(f"Using {device} device")

  # Hyperparameters
  #epochs = 10 # epoch
  epochs = 4 # epoch
  # Initial learning rate; too small may result in a long training process
  # that could get stuck, whereas a value too large may result in learning a
  # sub-optimal set of weights too fast or an unstable training process --
  # perhaps the most important hyperparameter. If you have time to tune only
  # one hyperparameter, tune the learning rate.
  #lr = 5
  #lr = 0.5
  #lr = 0.05
  #lr = 0.005
  lr = 0.0001
  batch_size = 64 # batch size for training
  #batch_size = 16 # batch size for training
  #batch_size = 8 # batch size for training
  #batch_size = 4 # batch size for training

  #num_layers = 2 # 2-3 layers should be enough for LSTM
  num_layers = 3 # 2-3 layers should be enough for LSTM
  hidden_size = 128 # hidden size of rnn module, should be tweaked manually
  #hidden_size = 8 # hidden size of rnn module, should be tweaked manually
  mean_seq = True # use mean of rnn output
  #mean_seq = False # use mean of rnn output
  # Weight decay helps the neural network to learn smoother / simpler
  # functions, which most of the time generalise better than spiky, noisy
  # ones; try 1e-3, 1e-4
  weight_decay = 1e-4
  #weight_decay = 1e-3

  '''
  dataloader = DataLoader(dataset,
                          batch_size=4,
                          drop_last=True,
                          shuffle=True,
                          num_workers=0,
                          collate_fn=CollateBatch(pad_idx=train_dataset.stoi['<pad>']),
                          )
  '''
  train_dataloader = DataLoader(train_dataset,
                                batch_size=batch_size,
                                drop_last=True,
                                shuffle=True,
                                num_workers=0,
                                collate_fn=CollateBatch(pad_idx=train_dataset.stoi['<pad>']),
                                )
  # Note: with the default story_num = 128 and batch_size = 64, the validation
  # split (6 rows) is smaller than one batch, so drop_last=True leaves this
  # loader empty
  valid_dataloader = DataLoader(valid_dataset,
                                batch_size=batch_size,
                                drop_last=True,
                                shuffle=True,
                                num_workers=0,
                                collate_fn=CollateBatch(pad_idx=train_dataset.stoi['<pad>']),
                                )
  #for i_batch, sample_batched in enumerate(dataloader):
  #  print(i_batch, sample_batched[0], sample_batched[1])
  # Debugging only: print the first training batch and stop. Left commented
  # out so that training below can actually run.
  #for i_batch, sample_batched in enumerate(train_dataloader):
  #  print(i_batch, sample_batched[0], sample_batched[1])
  #sys.exit(0)

  input_size = len(train_dataset.text_vocab)
  output_size = len(train_dataset.cats_vocab) # every output item is the likelihood of a particular category

  embed = torch.empty(input_size, len(train_dataset)) # vocabulary size x samples
  embedding_size = embed.size(1) # was 64 (should be: samples)

  if args.verbose:
    #for i in train_dataset.text_vocab.get_itos():
    #  print(i)
    print("input_size: ", input_size)
    print("output_size:", output_size)
    print("embed shape:", embed.shape)
    print("embedding_size:", embedding_size, " (that is, number of samples)")

  model = RNN(
    #rnn_model='GRU',
    rnn_model='LSTM',
    vocab_size=input_size,
    embed_size=embedding_size,
    num_output=output_size,
    use_last=(not mean_seq),
    hidden_size=hidden_size,
    embedding_tensor=embed,
    num_layers=num_layers,
    batch_first=True
  )
  if args.verbose:
    print(model)

  # optimizer and loss
  criterion = nn.BCEWithLogitsLoss()
  #optimizer = torch.optim.SGD(model.parameters(), lr=lr)
  optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=weight_decay)
  if args.verbose:
    print(criterion)
    print(optimizer)

  total_accu = None
  #for epoch in range(1, epochs + 1):
  e = tqdm.tqdm(range(1, epochs + 1), unit="epoch")
  for epoch in e:
    e.set_description(f"Epoch {epoch}")

    # Datasets have no .to(); batches are moved to the device inside train()
    # and evaluate(), so only the model needs moving here
    model.to(device)

    model.train()
    train(train_dataloader, train_dataset, model, optimizer, criterion, epoch)

    accu_val = evaluate(valid_dataloader, valid_dataset, model, criterion, epoch)

    if total_accu is not None and total_accu > accu_val:
      # NOTE: stepping the optimizer outside the batch loop is probably meant
      # for an LR scheduler; none is defined here
      optimizer.step()
    else:
      total_accu = accu_val
    e.set_postfix({
      "accuracy": accu_val.int().item(),
    })

  # print("Checking the results of test dataset.")
  # accu_test = evaluate(test_dataloader, test_dataset)
  # print("test accuracy {:8.3f}".format(accu_test))

  return


if __name__ == "__main__":
  main()

# vim: set expandtab shiftwidth=2 softtabstop=2: