-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdataset_to_codefiles.py
More file actions
71 lines (55 loc) · 2.24 KB
/
dataset_to_codefiles.py
File metadata and controls
71 lines (55 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import argparse
import pandas as pd
import shutil
import os
import numpy as np
from sklearn.model_selection import train_test_split
def create_dir(path):
if not os.path.exists(path):
os.makedirs(path)
parser = argparse.ArgumentParser()
parser.add_argument('--chunks_dir', dest='chunks_dir', type=str, required=True)
args = parser.parse_args()
df_train = pd.read_csv('datasets/train.csv', index_col=0)
df_test = pd.read_csv('datasets/test.csv', index_col=0)
df_train.drop_duplicates(subset=['code_block'], inplace=True)
df_test.drop_duplicates(subset=['code_block'], inplace=True)
df_train = df_train[df_train.groupby('target')['target'].transform('size') > 9]
print('\nTrain dataset target statistics:')
print(df_train['target'].value_counts())
print('\nTest dataset target statistics:')
print(df_test['target'].value_counts(), end='\n\n')
X_train, X_test, y_train, y_test = train_test_split(
df_train['code_block'],
df_train['target'],
test_size=0.1,
random_state=42,
stratify=df_train['target'].tolist()
)
if os.path.exists(args.chunks_dir):
shutil.rmtree(args.chunks_dir)
train_path = os.path.join(args.chunks_dir, 'train')
val_path = os.path.join(args.chunks_dir, 'val')
test_path = os.path.join(args.chunks_dir, 'test')
create_dir(train_path)
for i, code in X_train.items():
filepath = train_path + '/' + str(i) + '|' + y_train.loc[i] + '.py'
print(code[1:-1], file=open(filepath, 'w+'))
create_dir(val_path)
for i, code in X_test.items():
filepath = val_path + '/' + str(i) + '|' + y_test.loc[i] + '.py'
print(code[1:-1], file=open(filepath, 'w+'))
test_codeblocks_to_filepaths_dict = dict()
create_dir(test_path)
for i, code in df_test['code_block'].items():
filepath = test_path + '/' + str(i) + '|' + df_test['target'].loc[i] + '.py'
test_codeblocks_to_filepaths_dict[code] = filepath
print(code[1:-1], file=open(filepath, 'w+'))
test_codeblocks_to_filepaths = (
pd.DataFrame
.from_dict(test_codeblocks_to_filepaths_dict, orient='index', columns=['filepath'])
.reset_index()
.rename(columns={'index': 'code_block'})
)
with open('nl2ml/test_codeblocks_to_filepaths.csv', 'w+') as f:
test_codeblocks_to_filepaths.to_csv(f, index=False) # TODO: get data folder from upper level