From 9052767750240990e974496959518cb4f5ead7bf Mon Sep 17 00:00:00 2001
From: Timothy Allen
Date: Sat, 30 Dec 2023 12:30:43 +0200
Subject: [PATCH] Convert to a multi-hot index in the CSV, to simplify our
 DataSets and DataLoaders

---
 africat/aa_create_dataset.py |  22 ++-
 africat/categorise.py        | 255 +++++++++++------------------------
 2 files changed, 92 insertions(+), 185 deletions(-)

diff --git a/africat/aa_create_dataset.py b/africat/aa_create_dataset.py
index 9564244..1a036eb 100755
--- a/africat/aa_create_dataset.py
+++ b/africat/aa_create_dataset.py
@@ -96,11 +96,7 @@ def parse_and_extract(input_dir, verbose):
 
             cats = list()
             for cat in doc.findall('./category'):
-                # TODO check against a list of current categories,
-                # and strip any non-current categories
                 cats.append(cat.text)
-            #entry["categories"] = cats # if you want a list
-            entry["categories"] = ";".join(cats) # if you want a string
 
             text = list()
             lang = ""
@@ -115,10 +111,19 @@
             except Exception as e:
                 print(f"{xml_file} : {e}")
 
-            if text is not None and len(cats) > 1:
-                entry["content"] = "\n".join(text)
+            if text is not None and len(cats) >= 1:
                 entry["language"] = lang
+                entry["content"] = "\n".join(text)
+                for cat in cats:
+                    entry[cat] = 1
                 articles.append(entry)
+            else:
+                if len(cats) < 1:
+                    print(f"No article added for key {key} due to lack of categories")
+                elif text is None:
+                    print(f"No article added for key {key} due to lack of text")
+                else:
+                    print(f"No article added for key {key} due to unknown error")
 
         except ET.ParseError as e:
             if verbose > 1:
@@ -158,7 +163,10 @@ def scrub_data(articles, verbose):
 
     data['content'] = data.content.parallel_apply(lambda x: x.strip())
    data['content'] = data.content.parallel_apply(lambda x: re.sub(" +", " ", x))
-    # TODO: lemmas? See spaCy
+    # Any remaining text processing can be done by training/inference step
+
+    # sort category columns: lowercase first (key, language, content), then title-cased categories
+    data = data.reindex(columns=sorted(data.columns, key=lambda x: (x.casefold(), x.swapcase())))
 
     return data
 
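For reference, a minimal sketch (not taken from the patch) of the multi-hot rows that the `entry[cat] = 1` loop produces once the entries are assembled into a DataFrame. The keys, texts and category names here are invented; a category an article does not have is simply never set, so it shows up as NaN until the Dataset later applies fillna(0):

import pandas as pd

# hypothetical parsed entries; "Politics" and "Sport" stand in for real category names
articles = [
    {"key": "a1", "language": "en", "content": "first article text", "Politics": 1},
    {"key": "a2", "language": "fr", "content": "second article text", "Politics": 1, "Sport": 1},
]
data = pd.DataFrame(articles)
print(data["Politics"].tolist())   # [1, 1]        -- present in both articles
print(data["Sport"].tolist())      # [nan, 1.0]    -- absent categories stay NaN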
diff --git a/africat/categorise.py b/africat/categorise.py
index 650f5b1..fd44fee 100755
--- a/africat/categorise.py
+++ b/africat/categorise.py
@@ -19,19 +19,19 @@
 import tqdm
 import torch
 import torchdata.datapipes as dp
 import torchtext.transforms as T
+import torchtext.vocab as vocab
 from torch import nn
 from torch.utils.data import Dataset, DataLoader
-from torchtext.vocab import build_vocab_from_iterator
 
-from models.rnn import RNN
+xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
+xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"
 
-all_categories = list()
 # XXX None for all stories
-#story_num = 128
+story_num = 128
 #story_num = 256
 #story_num = 512
 #story_num = 1024
-story_num = 4096
+#story_num = 4096
 #story_num = None
 
@@ -42,6 +42,7 @@ def read_csv(input_csv, rows=None, verbose=0):
             pd.read_csv(f,
                 encoding="utf-8",
                 quoting=csv.QUOTE_ALL,
+                index_col=0,
                 nrows=rows,
                 chunksize=50,
             ),
@@ -52,10 +53,10 @@ def read_csv(input_csv, rows=None, verbose=0):
         data = pd.read_csv(f,
             encoding="utf-8",
             quoting=csv.QUOTE_ALL,
+            index_col=0,
             nrows=rows,
         )
-        data.dropna(axis='index', inplace=True)
 
     #print(data)
     #sys.exit(0)
     return data
@@ -83,9 +84,9 @@ def split_dataset(data, verbose=0):
     #print("Length of tests_data: {}".format(len(tests_idx)))
 
     # Create the training and validation sets, as dataframes
-    train_data = data.iloc[train_idx].reset_index().drop('index', axis=1)
-    valid_data = data.iloc[valid_idx].reset_index().drop('index', axis=1)
-    #tests_data = data.iloc[tests_idx].reset_index().drop('index', axis=1)
+    train_data = data.iloc[train_idx].reset_index()
+    valid_data = data.iloc[valid_idx].reset_index()
+    #tests_data = data.iloc[tests_idx].reset_index()
 
     #return(train_data, valid_data, tests_data)
     return(train_data, valid_data)
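A rough illustration (toy frame, invented keys) of why the .drop('index', axis=1) calls can go away: assuming the article key is the column recovered by index_col=0, reset_index() now turns a meaningful key back into a column rather than leaving behind a throwaway RangeIndex column named 'index':

import pandas as pd

data = pd.DataFrame(
    {"language": ["en", "fr"], "content": ["a", "b"], "Politics": [1, None]},
    index=pd.Index(["a1", "a2"], name="key"),   # what index_col=0 would reconstruct
)
train_data = data.iloc[[1, 0]].reset_index()
print(list(train_data.columns))   # ['key', 'language', 'content', 'Politics']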
@@ -96,24 +97,25 @@
 '''
 class TextCategoriesDataset(Dataset):
     ''' Dataset of Text and Categories '''
-    def __init__(self, df, text_column, cats_column, lang_column, transform=None, verbose=0):
+    def __init__(self, df, lang_column, text_column, first_cats_column=0, transform=None, verbose=0):
         '''
         Arguments:
             df (panda.Dataframe): csv content, loaded as dataframe
+            lang_column (str): the name of the column containing the language
             text_column (str): the name of the column containing the text
-            cats_column (str): the name of the column containing
-                semicolon-separated categories
-            text_column (str): the name of the column containing the language
-            transform (callable, optional): Optional transform to be
-                applied on a sample.
+            first_cats_column (int): the index of the first column containing
+                a category
+            transform (callable, optional): Optional transform to be applied
+                on a sample.
         '''
         self.df = df
         self.transform = transform
         self.verbose = verbose
 
-        self.text = self.df[text_column]
-        self.cats = self.df[cats_column]
         self.lang = self.df[lang_column]
+        self.text = self.df[text_column]
+        self.cats = self.df.iloc[:, first_cats_column:].sort_index(axis="columns")
+        self.cats_vocab = self.cats.columns
 
         # index-to-token dict
         # <pad>: padding, used for padding the shorter sentences in a batch
@@ -126,26 +128,6 @@ class TextCategoriesDataset(Dataset):
         # token-to-index dict
         self.stoi = {k:j for j, k in self.itos.items()}
 
-        # Create vocabularies upon initialisation
-        self.text_vocab = build_vocab_from_iterator(
-            [self.textTokens(text) for i, text in self.df[text_column].items()],
-            min_freq=2,
-            specials=self.itos.values(),
-            special_first=True
-        )
-        self.text_vocab.set_default_index(self.text_vocab['<unk>'])
-        #print(self.text_vocab.get_itos())
-
-        self.cats_vocab = build_vocab_from_iterator(
-            #[self.catTokens(cats) for i, cats in self.df[cats_column].items()],
-            [self.catTokens(all_categories)],
-            min_freq=1,
-            specials=['<unk>'],
-            special_first=True
-        )
-        self.cats_vocab.set_default_index(self.cats_vocab['<unk>'])
-        #print(self.cats_vocab.get_itos())
-
     def __len__(self):
         return len(self.df)
 
@@ -158,58 +140,41 @@ class TextCategoriesDataset(Dataset):
             idx = idx.tolist()
 
         # Get the raw data
-        text = self.text[idx]
-        cats = self.cats[idx]
         lang = self.lang[idx]
+        text = self.text[idx]
+        cats = self.cats.iloc[idx]
+
+        #print(self.textTransform()(text))
+        #print(cats)
+        #print(cats.fillna(0).values)
 
         if self.transform:
             text, cats = self.transform(text, cats)
 
-        #print(cats)
-        #print(self.catTokens(cats))
-        #print(self.getTransform(self.cats_vocab, "cats")(self.catTokens(cats)))
-
-        # Numericalise by applying transforms
+        # Numericalise text by applying transforms, and cats by converting
+        # NaN to zeros and stripping the index
         return (
-            self.getTransform(self.text_vocab, "text")(self.textTokens(text)),
-            self.getTransform(self.cats_vocab, "cats")(self.catTokens(cats)),
+            self.textTransform()(text),
+            cats.fillna(0).values,
        )
 
-    @staticmethod
-    def textTokens(text):
-        if isinstance(text, str):
-            return [word for word in text.split()]
-
-    @staticmethod
-    def catTokens(cats):
-        if isinstance(cats, str):
-            return [cat for cat in cats.split(';')]
-        elif isinstance(cats, list):
-            return [cat for cat in cats]
-
-    def getTransform(self, vocab, vType):
+    def textTransform(self):
         '''
         Create transforms based on given vocabulary. The returned transform
         is applied to a sequence of tokens.
         '''
-        if vType == "text":
-            return T.Sequential(
-                # converts the sentences to indices based on given vocabulary
-                T.VocabTransform(vocab=vocab),
-                # Add <bos> at beginning of each sentence. 1 because the index
-                # for <bos> in vocabulary is 1 as seen in previous section
-                T.AddToken(self.text_vocab['<bos>'], begin=True),
-                # Add <eos> at end of each sentence. 2 because the index
-                # for <eos> in vocabulary is 2 as seen in previous section
-                T.AddToken(self.text_vocab['<eos>'], begin=False)
-            )
-        elif vType == "cats":
-            return T.Sequential(
-                # converts the sentences to indices based on given vocabulary
-                T.VocabTransform(vocab=vocab),
-            )
-        else:
-            raise Exception('wrong transformation type')
+        return T.Sequential(
+            # converts the sentences to indices based on given vocabulary using SentencePiece
+            T.SentencePieceTokenizer(xlmr_spm_model_path),
+            T.VocabTransform(torch.hub.load_state_dict_from_url(xlmr_vocab_path)),
+            # Add <bos> at beginning of each sentence. 1 because the index
+            # for <bos> in vocabulary is 1 as seen in previous section
+            T.AddToken(self.stoi['<bos>'], begin=True),
+            # Add <eos> at end of each sentence. 2 because the index
+            # for <eos> in vocabulary is 2 as seen in previous section
+            T.AddToken(self.stoi['<eos>'], begin=False)
+        )
+
 
 
 '''
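The new textTransform() replaces the hand-built vocabulary with the published XLM-R SentencePiece model and vocabulary behind the two URLs defined at the top of the file. A standalone sketch of the same pipeline (it downloads the model and vocab on first use; the sample sentence and printed ids are invented, and the hard-coded 1/2 here are the <bos>/<eos> indices the patch reads from its own stoi dict):

import torch
import torchtext.transforms as T

xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

text_transform = T.Sequential(
    T.SentencePieceTokenizer(xlmr_spm_model_path),                  # string -> subword tokens
    T.VocabTransform(torch.hub.load_state_dict_from_url(xlmr_vocab_path)),  # tokens -> indices
    T.AddToken(1, begin=True),                                      # <bos> is index 1
    T.AddToken(2, begin=False),                                     # <eos> is index 2
)
print(text_transform("A short example headline"))
# something like [1, 62, 4942, ..., 2] -- a plain Python list of token ids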
@@ -223,12 +188,11 @@ class CollateBatch:
     in a batch of equal length. We can do this a collate_fn callback class,
     which returns a tensor
     '''
-    def __init__(self, pad_idx, cats):
+    def __init__(self, pad_idx):
         '''
         pad_idx (int): the index of the "<pad>" token in the vocabulary.
         '''
         self.pad_idx = pad_idx
-        self.cats = cats
 
     def __call__(self, batch):
         '''
@@ -236,13 +200,6 @@ class CollateBatch:
         batch: a list of tuples with (text, cats), each of which
         is a list of tokens
         '''
         batch_text, batch_cats = zip(*batch)
-        #for i in range(len(batch)):
-        #    print(batch[i])
-        #max_text_len = len(max(batch_text, key=len))
-        #max_cats_len = len(max(batch_cats, key=len))
-
-        #text_tensor = T.ToTensor(self.pad_idx)(batch_text)
-        #cats_tensor = T.ToTensor(self.pad_idx)(batch_cats)
 
         # Pad text to the longest
         text_tensor = nn.utils.rnn.pad_sequence(
@@ -251,44 +208,7 @@ class CollateBatch:
         )
         text_lengths = torch.tensor([t.shape[0] for t in text_tensor])
 
-        #cats_tensor = torch.nn.utils.rnn.pad_sequence(
-        #    [torch.LongTensor(s) for s in batch_cats],
-        #    batch_first=True, padding_value=self.pad_idx
-        #)
-        #cats_lengths = torch.LongTensor(list(map(len, batch_cats)))
-
-        '''
-        # Pad cats_tensor to all possible categories
-        num_cats = len(all_categories)
-
-        # Convert cats to multi-label one-hot representation
-        cats_tensor = torch.full((len(batch_cats), num_cats), self.pad_idx).float()
-        cats_lengths = torch.LongTensor(list(map(len, batch_cats)))
-        for idx, cats in enumerate(batch_cats):
-            #print("\nsample", idx, cats)
-            for c in cats:
-                #print(c)
-                cats_tensor[idx][c] = 1
-            #print(cats_tensor[idx])
-        '''
-        # Convert cats to multi-label one-hot representation
-        # add one to all_categories to account for <unk>
-        cats_tensor = torch.full((len(batch_cats), len(all_categories)+1), self.pad_idx).float()
-        for idx, cats in enumerate(batch_cats):
-            #print("\nsample", idx, cats)
-            for c in cats:
-                cats_tensor[idx][c] = 1
-            #print(cats_tensor[idx])
-        #sys.exit(0)
-
-
-        '''
-        # XXX why??
-        ## SORT YOUR TENSORS BY LENGTH!
-        text_lengths, perm_idx = text_lengths.sort(0, descending=True)
-        text_tensor = text_tensor[perm_idx]
-        cats_tensor = cats_tensor[perm_idx]
-        '''
+        cats_tensor = torch.tensor(batch_cats, dtype=torch.float32)
 
         #print("text", text_tensor)
         #print("text shape:", text_tensor.shape)
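Since each Dataset item now already carries a full multi-hot row, collation reduces to padding the token ids and stacking the label rows. A toy check of the shapes involved (values made up; in the patch itself batch_cats arrives as the NumPy rows produced by cats.fillna(0).values):

import torch
from torch import nn

batch_text = [torch.LongTensor([1, 11, 12, 13, 2]), torch.LongTensor([1, 14, 2])]
batch_cats = [[0., 1., 0., 1.], [1., 0., 0., 0.]]   # one multi-hot row per sample

text_tensor = nn.utils.rnn.pad_sequence(batch_text, batch_first=True, padding_value=0)
cats_tensor = torch.tensor(batch_cats, dtype=torch.float32)
print(text_tensor.shape)   # torch.Size([2, 5])
print(cats_tensor.shape)   # torch.Size([2, 4]) -- batch size x number of categories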
@@ -296,7 +216,6 @@ class CollateBatch:
         #print("cats", cats_tensor)
         #print("cats shape:", cats_tensor.shape)
         #print(text_lengths)
         #print("text_lengths shape:", text_lengths.shape)
-        #sys.exit(0)
 
         return (
@@ -305,48 +224,27 @@
             text_tensor,
             cats_tensor,
             text_lengths,
         )
 
-def cat2tensor(label_vocab, labels, pad_idx: int):
-    all_labels = vocab.get_itos()
-    num_labels = len(all_labels)
-    # add <unk>
-    if 0 not in all_labels:
-        num_labels += 1
-
-    labels_tensor = torch.full((len(labels), num_labels), pad_idx).float()
-    labels_lengths = torch.LongTensor(list(map(len, labels)))
-    for idx, labels in enumerate(labels):
-        #print("\nsample", idx, labels)
-        for l in labels:
-            labels_tensor[idx][l] = 1
-        #print(labels_tensor[idx])
-    return labels_tensor
-
-def tensor2cat(vocab, tensor):
-    all_cats = vocab.get_itos()
+def tensor2cat(dataset, tensor):
+    cats = dataset.cats_vocab
     if tensor.ndimension() == 2:
         batch = list()
         for result in tensor:
             chance = dict()
             for idx, pred in enumerate(result):
                 if pred > 0: # XXX
-                    chance[all_cats[idx]] = pred.item()
-            #print(chance)
+                    chance[cats[idx]] = pred.item()
             batch.append(chance)
         return batch
     elif tensor.ndimension() == 1:
         chance = dict()
         for idx, pred in enumerate(tensor):
-            if idx >= len(all_cats):
-                print(f"Idx {idx} not in {len(all_cats)} categories")
-            #elif pred > 0: # XXX
-            #print(idx, len(all_cats))
-            chance[all_cats[idx]] = pred.item()
-            #print(chance)
+            if idx >= len(cats):
+                print(f"Idx {idx} not in {len(cats)} categories")
+            elif pred > 0: # XXX
+                chance[cats[idx]] = pred.item()
         return chance
     else:
-        raise ValueError("Only tensors with 2 dimensions are supported")
-
-    return vocab.get_itos(cat)
+        raise ValueError("Only tensors with 1 dimension or batches with 2 dimensions are supported")
 
 
 def train(dataloader, dataset, model, optimizer, criterion, epoch=0):
@@ -452,6 +350,17 @@ def evaluate(dataloader, dataset, model, criterion, epoch=0):
             })
     return total_acc / total_count
 
+# TODO seeding:
+def seed_everything(seed=42):
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    # Some cudnn methods can be random even after fixing the seed
+    # unless you tell it to be deterministic
+    torch.backends.cudnn.deterministic = True
 
 def main():
     parser = argparse.ArgumentParser(
@@ -487,37 +396,26 @@ def main():
 
     data = read_csv(input_csv=args.input, rows=story_num, verbose=args.verbose)
 
-    # create list of all categories
-    global all_categories
-    for cats in data.categories:
-        for c in cats.split(";"):
-            if c not in all_categories:
-                all_categories.append(c)
-    all_categories = sorted(all_categories)
-    #print(all_categories)
-    #print(len(all_categories))
-    #sys.exit(0)
-
     train_data, valid_data, = split_dataset(data, verbose=args.verbose)
 
     '''
     dataset = TextCategoriesDataset(df=data,
-        text_column="content",
-        cats_column="categories",
         lang_column="language",
+        text_column="content",
+        first_cats_column=data.columns.get_loc("content")+1,
         verbose=args.verbose,
     )
     '''
     train_dataset = TextCategoriesDataset(df=train_data,
-        text_column="content",
-        cats_column="categories",
         lang_column="language",
+        text_column="content",
+        first_cats_column=train_data.columns.get_loc("content")+1,
         verbose=args.verbose,
     )
     valid_dataset = TextCategoriesDataset(df=valid_data,
-        text_column="content",
-        cats_column="categories",
         lang_column="language",
+        text_column="content",
+        first_cats_column=valid_data.columns.get_loc("content")+1,
         verbose=args.verbose,
     )
 
     #for text, cat in enumerate(train_dataset):
     #    print(text, cat)
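tensor2cat() now decodes predictions directly against the dataset's category columns, which is what the commented-out debug print in the next hunk exercises. A hedged sketch with invented category names and scores, using a stand-in object in place of a real TextCategoriesDataset (assumes categorise.py's tensor2cat is in scope):

import torch
from types import SimpleNamespace

# stand-in for a dataset; cats_vocab is just the ordered category column names
fake_dataset = SimpleNamespace(cats_vocab=["Business", "Politics", "Sport"])
print(tensor2cat(fake_dataset, torch.tensor([0.0, 0.8, 0.3])))
# roughly {'Politics': 0.8, 'Sport': 0.3} -- only scores above zero are kept;
# a 2-D batch of scores returns one such dict per row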
@@ -525,6 +423,7 @@ def main():
     #print("-" * 20)
     #for text, cat in enumerate(valid_dataset):
     #    print(text, cat)
+    #print(tensor2cat(train_dataset, torch.tensor([0, 0, 0, 1., 0.9])))
     #sys.exit(0)
 
     # Get cpu, gpu or mps device for training.
@@ -544,7 +443,7 @@ def main():
     #lr = 0.5
     #lr = 0.05
     #lr = 0.005 # initial learning rate; too small may result in a long training process that could get stuck, whereas a value too large may result in learning a sub-optimal set of weights too fast or an unstable training process -- perhaps the most important hyperparameter. If you have time to tune only one hyperparameter, tune the learning rate
-    lr = 0.0001 
+    lr = 0.0001
     batch_size = 64 # batch size for training
     #batch_size = 16 # batch size for training
     #batch_size = 8 # batch size for training
@@ -565,7 +464,7 @@ def main():
         drop_last=True,
         shuffle=True,
         num_workers=0,
-        collate_fn=CollateBatch(cats=train_dataset.cats_vocab.get_stoi(), pad_idx=train_dataset.stoi['<pad>']),
+        collate_fn=CollateBatch(pad_idx=train_dataset.stoi['<pad>']),
     )
     '''
     train_dataloader = DataLoader(train_dataset,
@@ -573,20 +472,20 @@ def main():
         batch_size=batch_size,
         drop_last=True,
         shuffle=True,
         num_workers=0,
-        collate_fn=CollateBatch(cats=train_dataset.cats_vocab.get_stoi(), pad_idx=train_dataset.stoi['<pad>']),
+        collate_fn=CollateBatch(pad_idx=train_dataset.stoi['<pad>']),
     )
     valid_dataloader = DataLoader(valid_dataset,
         batch_size=batch_size,
         drop_last=True,
         shuffle=True,
         num_workers=0,
-        collate_fn=CollateBatch(cats=train_dataset.cats_vocab.get_stoi(), pad_idx=train_dataset.stoi['<pad>']),
+        collate_fn=CollateBatch(pad_idx=train_dataset.stoi['<pad>']),
     )
 
     #for i_batch, sample_batched in enumerate(dataloader):
     #    print(i_batch, sample_batched[0], sample_batched[1])
-    #for i_batch, sample_batched in enumerate(train_dataloader):
-    #    print(i_batch, sample_batched[0], sample_batched[1])
-    #sys.exit(0)
+    for i_batch, sample_batched in enumerate(train_dataloader):
+        print(i_batch, sample_batched[0], sample_batched[1])
+    sys.exit(0)
 
     input_size = len(train_dataset.text_vocab)
     output_size = len(train_dataset.cats_vocab) # every output item is the likelihood of a particular category