Add possible split between training and validation data

parent da6f0142e0
commit 235c58f3c5
@@ -63,6 +63,30 @@ data.dropna(axis='index', inplace=True)
 #print(data)
 #sys.exit(0)
 
+'''
+#######################################################
+# Create Training and Validation sets
+#######################################################
+
+# create a list of ints up to the length of the data
+data_idx = list(range(len(data)))
+np.random.shuffle(data_idx)
+
+# get indexes for validation and train
+val_frac = 0.1  # fraction of the data in the validation set
+val_split_idx = int(len(data)*val_frac)  # index at which to split (10% of the data)
+val_idx, train_idx = data_idx[:val_split_idx], data_idx[val_split_idx:]
+print('len of train: ', len(train_idx))
+print('len of val: ', len(val_idx))
+
+# create the training and validation sets, as dataframes
+train_data = data.iloc[train_idx].reset_index().drop('index', axis=1)
+valid_data = data.iloc[val_idx].reset_index().drop('index', axis=1)
+
+# Next, we create PyTorch Datasets and Dataloaders for these dataframes
+'''
+
+
 '''
 Create a dataset that builds a tokenised vocabulary,
 and then, as each row is accessed, transforms it into
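The added block (kept commented out for now, hence the enclosing ''' markers) is a plain shuffle-and-slice split. A minimal, self-contained sketch of the same technique follows; toy_df and its column values are illustrative stand-ins for the script's data, not taken from the repository:

import numpy as np
import pandas as pd

# Illustrative stand-in for the script's `data` DataFrame
toy_df = pd.DataFrame({'content': [f'doc {i}' for i in range(10)],
                       'categories': ['a', 'b'] * 5})

val_frac = 0.1                               # fraction of rows held out for validation
data_idx = list(range(len(toy_df)))          # one positional index per row
np.random.shuffle(data_idx)                  # shuffle in place so the split is random

val_split_idx = int(len(toy_df) * val_frac)  # number of validation rows
val_idx, train_idx = data_idx[:val_split_idx], data_idx[val_split_idx:]

# .iloc selects rows by position; reset_index(drop=True) renumbers them 0..n-1,
# equivalent to the reset_index().drop('index', axis=1) idiom in the diff
train_data = toy_df.iloc[train_idx].reset_index(drop=True)
valid_data = toy_df.iloc[val_idx].reset_index(drop=True)
print(len(train_data), len(valid_data))      # 9 1

Calling np.random.seed(...) before the shuffle would make the split reproducible across runs.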
@@ -167,6 +191,16 @@ dataset = TextCategoriesDataset(df=data,
                                 text_column="content",
                                 cats_column="categories",
                                 )
+'''
+train_dataset = TextCategoriesDataset(df=train_data,
+                                      text_column="content",
+                                      cats_column="categories",
+                                      )
+valid_dataset = TextCategoriesDataset(df=valid_data,
+                                      text_column="content",
+                                      cats_column="categories",
+                                      )
+'''
 #print(dataset[2])
 #for text, cat in dataset:
 # print(text, cat)
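These commented-out additions mirror the existing full-data call, instantiating one TextCategoriesDataset per split. The class's internals are not shown in this diff; a minimal stand-in illustrating the one-dataset-per-split pattern might look like the sketch below (DataFrameTextDataset is a hypothetical name, and the real class also builds a tokenised vocabulary, which this sketch omits):

from torch.utils.data import Dataset

class DataFrameTextDataset(Dataset):
    """Hypothetical, simplified stand-in for TextCategoriesDataset:
    returns the raw (text, categories) pair for each row."""
    def __init__(self, df, text_column, cats_column):
        self.texts = df[text_column].tolist()
        self.cats = df[cats_column].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.cats[idx]

# One dataset per split, as in the additions above;
# train_data / valid_data come from the split in the previous hunk
train_dataset = DataFrameTextDataset(train_data, text_column="content", cats_column="categories")
valid_dataset = DataFrameTextDataset(valid_data, text_column="content", cats_column="categories")

One caveat worth flagging: since the docstring says the dataset builds a tokenised vocabulary, constructing it independently on each split would give train and validation different vocabularies; the dataloader changes in the next hunk sidestep this by still taking the padding index from the full-data dataset.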
@@ -197,14 +231,26 @@ class Collate:
             T.ToTensor(self.pad_idx)(list(batch[1])),
         )
 
-
-pad_idx = dataset.stoi['<pad>']
 dataloader = DataLoader(dataset,
                         batch_size=4,
                         shuffle=True,
                         num_workers=0,
-                        collate_fn=Collate(pad_idx=pad_idx),
+                        collate_fn=Collate(pad_idx=dataset.stoi['<pad>']),
                         )
+'''
+train_dataloader = DataLoader(train_dataset,
+                              batch_size=4,
+                              shuffle=True,
+                              num_workers=0,
+                              collate_fn=Collate(pad_idx=dataset.stoi['<pad>']),
+                              )
+valid_dataloader = DataLoader(valid_dataset,
+                              batch_size=4,
+                              shuffle=True,
+                              num_workers=0,
+                              collate_fn=Collate(pad_idx=dataset.stoi['<pad>']),
+                              )
+'''
 #for i_batch, sample_batched in enumerate(dataloader):
 # print(i_batch, sample_batched[0], sample_batched[1])
 #sys.exit(0)
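The inlined pad_idx lookup and the commented-out per-split loaders both read the padding id from the full-data dataset (dataset.stoi['<pad>']), which keeps padding consistent across splits. The Collate class itself appears only partially in this hunk; below is a simplified, self-contained sketch of a padding collate of this shape, using torch.nn.utils.rnn.pad_sequence instead of the torchtext transforms the original appears to use (PadCollate is a hypothetical name):

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

class PadCollate:
    """Hypothetical stand-in for the script's Collate: pads the
    variable-length sequences in a batch to one common length."""
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx  # vocabulary id used for padding

    def __call__(self, batch):
        # batch is a list of (token_ids, category_ids) pairs from the dataset
        texts, cats = zip(*batch)
        return (
            pad_sequence([torch.tensor(t) for t in texts],
                         batch_first=True, padding_value=self.pad_idx),
            pad_sequence([torch.tensor(c) for c in cats],
                         batch_first=True, padding_value=self.pad_idx),
        )

# Usage mirroring the commented-out additions (names assume the earlier hunks):
# train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True,
#                               num_workers=0,
#                               collate_fn=PadCollate(pad_idx=dataset.stoi['<pad>']))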