-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathValidationSet.py
More file actions
48 lines (35 loc) · 1.59 KB
/
Copy pathValidationSet.py
File metadata and controls
48 lines (35 loc) · 1.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
'''
Generate a validation dataset from the training dataset; the split ratio is train : val = 9 : 1.
'''
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import torch
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole process.
pd.options.mode.chained_assignment = None
# Select matplotlib's non-interactive 'agg' backend.
# NOTE(review): no plotting call is visible in this part of the file - confirm
# whether the backend switch is still needed.
plt.switch_backend('agg')
def build_split(source_df, source_labels, row_ids):
    """Collect the rows and labels belonging to one split.

    Args:
        source_df: Object indexable as ``source_df[column][row_id]`` for the
            columns ``'text'`` and ``'text_b'`` (the script passes whatever
            was loaded from ``train_tok.pt``).
        source_labels: Object indexable as ``source_labels[row_id]``.
        row_ids: Iterable of row identifiers; the output keeps this order.

    Returns:
        A ``(text_df, label_list)`` tuple. ``text_df`` is a DataFrame with a
        fresh 0..n-1 index and object-dtype columns ``'text'`` and
        ``'text_b'``; ``label_list`` is a plain list holding
        ``source_labels[row_id]`` for every id, in the same order.
    """
    # Look the columns up once instead of once per row.
    text_col = source_df['text']
    text_b_col = source_df['text_b']

    texts, texts_b, picked_labels = [], [], []
    for row_id in row_ids:
        texts.append(text_col[row_id])
        texts_b.append(text_b_col[row_id])
        picked_labels.append(source_labels[row_id])

    # Build the frame in one step. Filling a pre-allocated frame cell by cell
    # with chained assignment (df['text'][i] = ...) may write to a temporary
    # copy and is never applied under pandas Copy-on-Write.
    text_df = pd.DataFrame(
        {
            'text': pd.Series(texts, dtype=object),
            'text_b': pd.Series(texts_b, dtype=object),
        },
        columns=['text', 'text_b'],
    )
    return text_df, picked_labels


if __name__ == '__main__':
    # NOTE: the train split is written back to the same paths that are read
    # here, so every run replaces the input files with the reduced training
    # set. Re-running the script splits the already-reduced data again.
    # NOTE(security): torch.load unpickles arbitrary Python objects; only use
    # it on trusted files. Recent PyTorch releases default to
    # weights_only=True, which may reject a pickled DataFrame - confirm
    # against the installed version.
    train_df = torch.load('train_tok.pt')
    labels = torch.load('train_labels.pt')

    print('.......Validation Set Generating........')
    X = range(len(train_df))
    # splitting of dataset; labels are still passed so that sklearn checks
    # that both inputs have the same length, but only the ids are used below.
    X_train, X_val, _, _ = train_test_split(
        X, labels, test_size=0.1, random_state=42
    )

    # tqdm wraps the id lists so each split keeps its own progress bar.
    train_text_df, train_label = build_split(train_df, labels, tqdm(X_train))
    val_text_df, val_label = build_split(train_df, labels, tqdm(X_val))

    print('.......Validation Set Saving........')
    torch.save(train_text_df, 'train_tok.pt')
    torch.save(val_text_df, 'val_tok.pt')
    torch.save(train_label, 'train_labels.pt')
    torch.save(val_label, 'val_labels.pt')
    print('=== END ===')