#!/usr/bin/python
import argparse
import os
import sys
import pprint
import re
import string
import time
import warnings

# data manipulation
import csv
import random
import pandas as pd
import numpy as np
#from pandarallel import pandarallel
from tqdm import tqdm

# torch
import torch
import torchdata.datapipes as dp
import torchtext.transforms as T
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from torch import nn

story_num = 40  # XXX None for all

# Hyperparameters
EPOCHS = 10      # epoch
LR = 5           # learning rate
BATCH_SIZE = 64  # batch size for training


def read_csv(input_csv, rows=None, verbose=0):
    if verbose > 0:
        with open(input_csv, 'r', encoding="utf-8") as f:
            data = pd.concat(
                [chunk for chunk in tqdm(
                    pd.read_csv(f,
                                encoding="utf-8",
                                quoting=csv.QUOTE_ALL,
                                nrows=rows,
                                chunksize=50,
                                ),
                    desc='Loading data'
                )])
    else:
        with open(input_csv, 'r', encoding="utf-8") as f:
            data = pd.read_csv(f,
                               encoding="utf-8",
                               quoting=csv.QUOTE_ALL,
                               nrows=rows,
                               )

    data.dropna(axis='index', inplace=True)
    #print(data)
    #sys.exit(0)
    return data


'''
Create Training and Validation sets
'''
def split_dataset(data, verbose=0):
    # Create a list of ints till len of data
    data_idx = list(range(len(data)))
    np.random.shuffle(data_idx)

    # Get indexes for validation and train
    split_percent = 0.05
    num_valid = int(len(data) * split_percent)
    #num_tests = int(len(data) * split_percent)
    #train_idx = data_idx[num_valid:-num_tests]
    train_idx = data_idx[num_valid:]
    valid_idx = data_idx[:num_valid]
    #tests_idx = data_idx[-num_tests:]
    if verbose > 0:
        print("Length of train_data: {}".format(len(train_idx)))
        print("Length of valid_data: {}".format(len(valid_idx)))
        #print("Length of tests_data: {}".format(len(tests_idx)))

    # Create the training and validation sets, as dataframes
    train_data = data.iloc[train_idx].reset_index().drop('index', axis=1)
    valid_data = data.iloc[valid_idx].reset_index().drop('index', axis=1)
    #tests_data = data.iloc[tests_idx].reset_index().drop('index', axis=1)
    #return(train_data, valid_data, tests_data)
    return(train_data, valid_data)


'''
Create a dataset that builds a tokenised vocabulary, and then, as each row
is accessed, transforms it into tensors of token indices
'''
class TextCategoriesDataset(Dataset):
    ''' Dataset of Text and Categories '''
    def __init__(self, df, text_column, cats_column, lang_column,
                 transform=None, verbose=0):
        '''
        Arguments:
            df (pandas.DataFrame): csv content, loaded as dataframe
            text_column (str): the name of the column containing the text
            cats_column (str): the name of the column containing
                               semicolon-separated categories
            lang_column (str): the name of the column containing the language
            transform (callable, optional): Optional transform to be applied
                                            on a sample.
        '''
        self.df = df
        self.transform = transform
        self.verbose = verbose

        self.text = self.df[text_column]
        self.cats = self.df[cats_column]
        self.lang = self.df[lang_column]

        # index-to-token dict
        # <pad>: padding, used for padding the shorter sentences in a batch
        #        to match the length of longest sentence in the batch
        # <sos>: start of sentence token
        # <eos>: end of sentence token
        # <unk>: unknown token: words which are not found in the vocab are
        #        replaced by this token
        self.itos = {0: '<pad>', 1: '<sos>', 2: '<eos>', 3: '<unk>'}
        # token-to-index dict
        self.stoi = {k: j for j, k in self.itos.items()}

        # Create vocabularies upon initialisation
        self.text_vocab = build_vocab_from_iterator(
            [self.textTokens(text) for i, text in self.df[text_column].items()],
            min_freq=2,
            specials=list(self.itos.values()),
            special_first=True
        )
        self.text_vocab.set_default_index(self.text_vocab['<unk>'])
        #print(self.text_vocab.get_itos())

        self.cats_vocab = build_vocab_from_iterator(
            [self.catTokens(cats) for i, cats in self.df[cats_column].items()],
            min_freq=1,
            specials=list(self.itos.values()),
            special_first=True
        )
        self.cats_vocab.set_default_index(self.cats_vocab['<unk>'])
        #print(self.cats_vocab.get_itos())

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Enable use as a plain iterator
        if idx not in self.df.index:
            raise StopIteration

        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Get the raw data
        text = self.text[idx]
        cats = self.cats[idx]
        lang = self.lang[idx]

        if self.transform:
            text, cats = self.transform(text, cats)

        # Numericalise by applying transforms
        return (
            self.getTransform(self.text_vocab)(self.textTokens(text)),
            self.getTransform(self.cats_vocab)(self.catTokens(cats)),
        )

    @staticmethod
    def textTokens(text):
        if isinstance(text, str):
            return [word for word in text.split()]

    @staticmethod
    def catTokens(cats):
        if isinstance(cats, str):
            return [cat for cat in cats.split(';')]
        elif isinstance(cats, list):
            return [cat for cat in cats]

    def getTransform(self, vocab):
        '''
        Create transforms based on given vocabulary. The returned transform
        is applied to a sequence of tokens.
        '''
        return T.Sequential(
            # converts the sentences to indices based on given vocabulary
            T.VocabTransform(vocab=vocab),
            # Add <sos> at beginning of each sentence. 1 because the index
            # for <sos> in vocabulary is 1 as seen in previous section
            T.AddToken(1, begin=True),
            # Add <eos> at end of each sentence. 2 because the index
            # for <eos> in vocabulary is 2 as seen in previous section
            T.AddToken(2, begin=False)
        )


'''
Now that we have a dataset, let's create a dataloader, which can batch,
shuffle, and load the data in parallel
'''
class CollateBatch:
    '''
    We need to pad shorter sentences in a batch to make all the sequences
    in a batch of equal length. We can do this with a collate_fn callback
    class, which returns a tensor
    '''
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        # T.ToTensor(0) returns a transform that converts the sequence
        # to a torch.tensor and also applies padding.
        #
        # pad_idx is passed to the constructor to specify the index of
        # the "<pad>" token in the vocabulary.
        # Separate the text and category sequences before padding; batch is
        # a list of (text_indices, cat_indices) tuples from the dataset.
        texts, cats = zip(*batch)
        return (
            T.ToTensor(self.pad_idx)(list(texts)),
            T.ToTensor(self.pad_idx)(list(cats)),
        )


class TextClassificationModel(nn.Module):
    def __init__(self, input_size, output_size, verbose):
        super().__init__()
        self.verbose = verbose
        # Minimal placeholder architecture: mean-pooled embeddings feeding a
        # linear classifier; replace with a real model as needed
        self.embedding = nn.Embedding(input_size, 64, padding_idx=0)
        self.fc = nn.Linear(64, output_size)

    def forward(self, x):
        # x: (batch, seq_len) tensor of token indices
        return self.fc(self.embedding(x).mean(dim=1))


def train(dataloader, model, optimizer, criterion, epoch):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()


def evaluate(dataloader, model, criterion):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count


def main():
    parser = argparse.ArgumentParser(
        description='Classify text data according to categories',
        add_help=True,
    )
    parser.add_argument('action',
                        help='train or classify')
    parser.add_argument('--input', '-i',
                        required=True,
                        help='path of CSV file containing dataset')
    parser.add_argument('--model', '-m',
                        #required=True, # XXX
                        help='path to training model')
    parser.add_argument('--verbose', '-v',
                        type=int, nargs='?',
                        const=1,    # Default value if -v is supplied
                        default=0,  # Default value if -v is not supplied
                        help='print debugging')
    args = parser.parse_args()

    if args.action != 'train' and args.action != 'classify':
        print("ERROR: train or classify data")
        sys.exit(1)

    if args.action == 'classify' and (args.model is None or not os.path.isfile(args.model)):
        print("No model found for classification; running training instead")
        args.action = 'train'

    if not os.path.isfile(args.input):
        print(f"{args.input} is not a valid file")
        sys.exit(1)

    data = read_csv(input_csv=args.input, rows=story_num, verbose=args.verbose)

    train_data, valid_data = split_dataset(data, verbose=args.verbose)

    '''
    dataset = TextCategoriesDataset(df=data,
                                    text_column="content",
                                    cats_column="categories",
                                    lang_column="language",
                                    verbose=args.verbose,
                                    )
    '''
    train_dataset = TextCategoriesDataset(df=train_data,
                                          text_column="content",
                                          cats_column="categories",
                                          lang_column="language",
                                          verbose=args.verbose,
                                          )
    valid_dataset = TextCategoriesDataset(df=valid_data,
                                          text_column="content",
                                          cats_column="categories",
                                          lang_column="language",
                                          verbose=args.verbose,
                                          )
    #print(dataset[2])
    #for text, cat in enumerate(valid_dataset):
    #    print(text, cat)
    #sys.exit(0)

    # Get cpu, gpu or mps device for training.
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "xpu"
        if hasattr(torch, "xpu") and torch.xpu.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )
    print(f"Using {device} device")

    '''
    dataloader = DataLoader(dataset,
                            batch_size=4,
                            shuffle=True,
                            num_workers=0,
                            collate_fn=CollateBatch(pad_idx=dataset.stoi['<pad>']),
                            )
    '''
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=0,
                                  collate_fn=CollateBatch(pad_idx=train_dataset.stoi['<pad>']),
                                  )
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=0,
                                  collate_fn=CollateBatch(pad_idx=valid_dataset.stoi['<pad>']),
                                  )
    #for i_batch, sample_batched in enumerate(dataloader):
    #    print(i_batch, sample_batched[0], sample_batched[1])
    #sys.exit(0)

    num_class = len(set(train_data["categories"].values))
    input_size = len(train_dataset.text_vocab)
    output_size = len(train_dataset.cats_vocab)
    emsize = 64
    model = TextClassificationModel(input_size, output_size, args.verbose).to(device)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    total_accu = None

    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader, model, optimizer, criterion, epoch)
        accu_val = evaluate(valid_dataloader, model, criterion)
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
        print("-" * 59)
        print(
            "| end of epoch {:3d} | time: {:5.2f}s | "
            "valid accuracy {:8.3f} ".format(
                epoch, time.time() - epoch_start_time, accu_val
            )
        )
        print("-" * 59)

    print("Checking the results of test dataset.")
    # No separate test split yet (see split_dataset), so reuse the
    # validation set here.
    accu_test = evaluate(valid_dataloader, model, criterion)
    print("test accuracy {:8.3f}".format(accu_test))

    return


if __name__ == "__main__":
    main()
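

# Example usage (a sketch; the script filename and data file names below are
# hypothetical). It assumes a CSV whose columns include "content",
# "categories" (semicolon-separated) and "language", as read by main():
#
#   ./classify.py train -i stories.csv -v 1
#   ./classify.py classify -i stories.csv -m model.pt
#
# Note that saving and loading a trained model for the "classify" action is
# not implemented yet, so classification currently falls back to training.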