-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel_builder.py
More file actions
121 lines (96 loc) · 3.96 KB
/
model_builder.py
File metadata and controls
121 lines (96 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
# ---------------------------------------------------------------------------
# Device selection and data preparation.
# Produces: `device`, `df`, `emoji_tokenizer`, `emoji_padded`,
# `emoji_label_classes`, `emoji_labels` (1-D, contiguous ids) and
# `class_weights` (one weight per label id, on `device`).
# ---------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

df = pd.read_csv('output_file.csv')

# Fill missing values BEFORE casting to str: casting first can turn NaN into
# the literal string "nan", after which fillna('') has nothing left to fill.
df['cleaned_text'] = df['cleaned_text'].fillna('').astype(str)
df['emojis'] = df['emojis'].fillna('').astype(str)

# filters='' keeps every character, so emoji symbols are not stripped.
emoji_tokenizer = Tokenizer(num_words=500, filters='')
emoji_tokenizer.fit_on_texts(df['emojis'])
emoji_sequences = emoji_tokenizer.texts_to_sequences(df['emojis'])
emoji_padded = pad_sequences(emoji_sequences, padding='post')

# One target per row: the first emoji token id. With padding='post' the first
# column holds the first token, or 0 when the row produced no tokens.
if emoji_padded.ndim == 2 and emoji_padded.shape[1] > 0:
    first_emoji_ids = emoji_padded[:, 0]
else:
    # No row produced any token; every row falls back to the padding id 0.
    first_emoji_ids = np.zeros(len(df), dtype=np.int64)

# Remap the (possibly sparse) tokenizer ids to contiguous labels 0..K-1 so
# they are valid targets for a K-way classifier and line up index-for-index
# with `class_weights`. `emoji_label_classes[label]` recovers the tokenizer id.
emoji_label_classes, emoji_labels = np.unique(first_emoji_ids, return_inverse=True)
emoji_labels = emoji_labels.reshape(-1)

class_weights = compute_class_weight('balanced', classes=np.unique(emoji_labels), y=emoji_labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
class EmojiDataset(Dataset):
    """Map-style dataset pairing each text with its label.

    Every item is tokenized on access with ``tokenizer.encode_plus`` and
    padded/truncated to ``max_len`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        """Store the indexable texts/labels, the tokenizer and the length cap."""
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``label``.

        The two encoded tensors are flattened to 1-D; ``label`` is a
        ``torch.long`` tensor built from ``labels[idx]``.
        """
        sentence = self.texts[idx]
        target = self.labels[idx]
        encode_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(sentence, **encode_options)
        # The tokenizer returns batched tensors; drop the batch dimension.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample
# ---------------------------------------------------------------------------
# Training setup: tokenizer, dataset/loader, model, optimizer, loss.
# ---------------------------------------------------------------------------
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 128  # token length every example is padded/truncated to

texts = df['cleaned_text'].tolist()
train_dataset = EmojiDataset(
    texts=texts,
    labels=emoji_labels,
    tokenizer=bert_tokenizer,
    max_len=max_len,
)
train_loader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=32,
)

# One output logit per distinct label value present in the data.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(np.unique(emoji_labels)),
)
model = model.to(device)

# transformers.AdamW is deprecated upstream in favour of torch.optim.AdamW.
# eps and weight_decay are pinned to the values the transformers
# implementation defaulted to, so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

# Class-weighted cross entropy to counter label imbalance.
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
epochs = 4
# ---------------------------------------------------------------------------
# Training loop: `epochs` passes over `train_loader`, reporting the running
# accuracy every 5 steps and the mean loss/accuracy per epoch, then saving
# the fine-tuned model.
# ---------------------------------------------------------------------------
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # `labels` is deliberately NOT passed to the model: doing so makes it
        # compute its own unweighted loss, which is never used here because
        # the class-weighted `loss_fn` below is what gets optimised.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        preds = torch.argmax(logits, dim=1)
        # Accumulate plain Python ints so the counters never hold device
        # tensors and remain valid even if no batch is ever processed.
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.size(0)

        if step % 5 == 0:
            epoch_accuracy = correct_predictions / total_predictions
            print(f"Step {step}, Loss: {batch_loss}, Epoch Accuracy So Far: {epoch_accuracy}")

    # max(..., 1) guards against an empty loader (no batches, no predictions).
    avg_loss = total_loss / max(len(train_loader), 1)
    epoch_accuracy = correct_predictions / max(total_predictions, 1)
    print(f"Loss: {avg_loss}, Accuracy: {epoch_accuracy}")

model.save_pretrained('emoji_prediction_model_torch_revised')