-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathData_Balancing.py
More file actions
62 lines (48 loc) · 1.73 KB
/
Data_Balancing.py
File metadata and controls
62 lines (48 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd
import os
import json
import random
from sklearn.model_selection import train_test_split
PATH = './'
max_one_emotion = 4000
max_emotion = 6000
nclass = 7
# 파일 읽기
json_path = os.path.join(PATH, 'data', 'total_data.json')
with open(json_path,'r') as file:
base_json = json.load(file)
# 감정 클래스 분포 확인
Final_data = {"data" : []}
emo_count = {}
del_list = []
for idx, item in enumerate(base_json['data']):
if not (os.path.isfile(os.path.join(PATH, 'TOTAL', item['wav']))):
del_list.append(idx)
for emo in item['Emotion']:
emo_count[emo] = emo_count.get(emo, 0) + 1
while (del_list):
idx = del_list.pop()
item = base_json['data'][idx]
print('파일없음 삭제 :', os.path.join(PATH, 'TOTAL', item['wav']))
del base_json['data'][idx]
print("감정 클래스 분포 확인")
print(emo_count)
# 감정 데이터 셔플
random.shuffle(base_json['data'])
print("데이터 저장")
# save preprocessed data
json_path = os.path.join(PATH, 'data', 'preprocessed_data.json')
with open(json_path,'w') as j:
json.dump(base_json,j,ensure_ascii=False, indent=4)
# 훈련, 테스트 데이터 split
train_data, test_data = train_test_split(base_json['data'], train_size=0.8, test_size=0.2, random_state=123, shuffle=True)
train_path = os.path.join(PATH, 'data', 'train_preprocessed_data.json')
print("Train 데이터셋 저장")
train_json = {'data' : train_data}
with open(train_path,'w') as j:
json.dump(train_json,j,ensure_ascii=False, indent=4)
test_path = os.path.join(PATH, 'data', 'test_preprocessed_data.json')
print("Test 데이터셋 저장")
test_json = {'data' : test_data}
with open(test_path,'w') as j:
json.dump(test_json,j,ensure_ascii=False, indent=4)