# tio2-sentiment-analysis/omni_script.py at main · pfizer-opensource/tio2-sentiment-analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
### This script is designed to reproduce the model built in the
### Sentiment Analysis publication on your labeled/annotated dataset.
### For the trained model in the publication,
### please contact the authors, chiril.calin@pfizer.com, maria.deligianni@pfizer.com.


import os
import pickle as pkl

from sklearn.model_selection import train_test_split

from scripts.eda_utils import clean_and_save_csv, build_dictionary, eda_mining_from_dict, data_augmentation
from scripts.train import launch_training


# Path to your annotated dataset.
# At the minimum, the starting dataset must have these fields:
#   'text'               - the review to analyse.
#   'primary_qualifiers' - categorizations of the text with the qualifier structure,
#                          e.g. ['ROUGH'+'SIZE'].
#   'final_category'     - categorizations of the text with the category structure,
#                          e.g. ['SWALLOWABILITY'], ['PALATABILITY'].
dataset = "data/processed_final.csv"

# Directory that holds any decoding dictionaries produced, plus any intermediate
# datasets you wish to save.
output_dir = "output/"

# Directory the final model is written to.
model_output_dir = "models/"

# Directory of the base model that will be tuned.
base_model_path = "/home/calinc/sentiment_analysis/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411"


# Clean the annotated dataset. Replace the reviews_column value 'text' with whatever
# the name of the review column is in your dataset.
cleaned_df = clean_and_save_csv(input_file=dataset, output_dir=output_dir, reviews_column='text', save=False)


# Build a dictionary with the label encoding/decoding.
dictionary = build_dictionary(cleaned_df)

# Save the dictionary to the output folder as a decoding reference.
# The cleaning step above runs with save=False, so the output folder may not exist
# yet; create it before writing. An empty output_dir means "current directory" and
# needs no creation (os.makedirs('') would raise).
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
# os.path.join keeps the path valid whether or not output_dir ends with a separator.
with open(os.path.join(output_dir, 'dictionary.pkl'), 'wb') as f:
    pkl.dump(dictionary, f)


# Find the primary qualifiers that have a low number of examples. Per the original
# note, a primary qualifier category with fewer than 16 examples is marked for EDA
# in the data_augmentation step.
cleaned_df, small_categoried = eda_mining_from_dict(
    input_file=cleaned_df,
    enc_dict=dictionary,
)


# Hold out 20% of the rows as the test set, stratified on the 'label' column with a
# fixed random_state. The split is done before EDA to prevent train-test overlap.
training_set, test_set = train_test_split(
    cleaned_df,
    test_size=0.2,
    random_state=0,
    stratify=cleaned_df['label'],
)


# With the split fixed, apply EDA to the train set and to the test set separately.
# This creates many examples for the qualifiers that only have a small number of items.
# The process takes a long time for very large datasets or high num_aug values
# (default 9), so having an output dir to restart model training from is recommended.
# Both sets are augmented with the same settings, so the shared keyword arguments
# are collected once.
_eda_kwargs = {
    'small_cats': small_categoried,
    'reviews_column': 'text',
    'output_dir': output_dir,
    'save': False,
}
training_set = data_augmentation(data=training_set, **_eda_kwargs)
test_set = data_augmentation(data=test_set, **_eda_kwargs)

# Hint: to save the augmented sets, use these calls instead:
# training_set = data_augmentation(data=training_set, small_cats=small_categoried, reviews_column='text', output_dir=output_dir, data_name=dataset, label='train', save=True)
# test_set = data_augmentation(data=test_set, small_cats=small_categoried, reviews_column='text', output_dir=output_dir,  data_name=dataset, label='test', save=True)

# Hint: to load previously saved sets and begin training
# (this needs `import pandas as pd`, which this script does not import by default):
# training_set = pd.read_csv(output_dir + dataset.split('/')[-1].split('.')[0] + 'train.csv')
# test_set = pd.read_csv(output_dir + dataset.split('/')[-1].split('.')[0] + 'test.csv')


# Finally, run the model training:
launch_training(
    training_set,
    test_set,
    reviews_column='text',
    model_path=base_model_path,
    model_output_dir=model_output_dir,
)