-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgetEmbeddings.py
More file actions
71 lines (50 loc) · 2.15 KB
/
getEmbeddings.py
File metadata and controls
71 lines (50 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import torch
from torch import nn
import allennlp
from allennlp.commands.elmo import ElmoEmbedder
import numpy as np
# Device used when moving tensors in this module. Prefer the GPU, but fall
# back to the CPU so the module still works on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def get_elmo_embedder(
    options_file='./data/ELMo/options_128.json',
    weight_file='./data/ELMo/elmo_weights_128.hdf5',
):
    """Construct an ``ElmoEmbedder`` from an options/weights file pair.

    Args:
        options_file: Location of the ELMo options JSON, passed straight to
            ``ElmoEmbedder``. Defaults to the local file this project ships.
        weight_file: Location of the matching ELMo weights (HDF5), passed
            straight to ``ElmoEmbedder``. Defaults to the local file.

    Returns:
        The ``ElmoEmbedder`` instance built from the two files.
    """
    # NOTE: an alternative pair that was previously kept here as commented-out
    # arguments (the published 5.5B model) can be supplied by the caller:
    #   options_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
    #   weight_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    return ElmoEmbedder(options_file=options_file, weight_file=weight_file)
def get_elmo_embedding(elmo, tokens):
    """Embed a batch of tokenized sentences and zero-pad them to equal length.

    Args:
        elmo: Object exposing ``embed_sentence(sentence)``; element ``[2]`` of
            its result is used as the per-token embedding matrix
            (presumably the top ELMo layer, shaped (num_tokens, dim) --
            confirm against the AllenNLP version in use).
        tokens: Sequence of sentences, each a sequence of tokens.

    Returns:
        A ``torch.Tensor`` built from the stacked per-sentence matrices, each
        padded with zero rows up to the longest sentence in the batch. An
        empty batch yields an empty tensor instead of raising.
    """
    # default=0 keeps an empty batch from raising ValueError in max().
    max_len = max((len(sentence) for sentence in tokens), default=0)

    batch_embeddings = []
    for sentence in tokens:
        emb = elmo.embed_sentence(sentence)[2]
        missing_rows = max_len - len(sentence)
        if missing_rows:
            # Pad along the token axis with zero vectors of the same width.
            padding = np.zeros((missing_rows, emb.shape[1]), dtype=emb.dtype)
            emb = np.concatenate([emb, padding], axis=0)
        batch_embeddings.append(emb)

    return torch.Tensor(np.array(batch_embeddings))
def get_glove_embedding(weights_path='./data/Glove/glove_embeddings.npy'):
    """Load a saved embedding matrix into a frozen ``nn.Embedding``.

    Args:
        weights_path: ``.npy`` file holding the 2-D weight matrix (one row per
            embedding). Defaults to the project's GloVe matrix.

    Returns:
        An ``nn.Embedding`` created with ``from_pretrained(..., freeze=True)``
        whose weights were moved to the module-level ``device``.

    Raises:
        ValueError: If the loaded array is not 2-dimensional.
    """
    weights_matrix = np.load(weights_path)
    # Explicit replacement for the old implicit check (tuple-unpacking of
    # ``weights_matrix.shape`` into two otherwise unused names).
    if weights_matrix.ndim != 2:
        raise ValueError(
            'expected a 2-D embedding matrix, got shape %s'
            % (weights_matrix.shape,)
        )
    weights = torch.FloatTensor(weights_matrix).to(device)
    return nn.Embedding.from_pretrained(weights, freeze=True)
def get_one_hot_vector(dictionary, listOfTags, pad_value=100):
    """One-hot encode a batch of tag sequences, leaving padding rows all-zero.

    Args:
        dictionary: Mapping from tag to its class index (converted with
            ``int``); ``len(dictionary)`` is the one-hot width.
        listOfTags: Batch of tag sequences. Everything from the first
            occurrence of ``pad_value`` to the end of a sequence is treated
            as padding.
        pad_value: Sentinel marking padding inside a tag sequence.

    Returns:
        ``np.array`` of the per-sentence matrices, each of shape
        ``(len(sentence), len(dictionary))``. Rows for real tags hold a single
        1 at the tag's index; padding rows are all zeros.
    """
    num_classes = len(dictionary)
    batch_tags = []
    for tag_sent in listOfTags:
        # Collect class indices up to (not including) the first pad marker.
        # The marker is detected on the raw tags, so a genuine tag whose class
        # index happens to equal pad_value is not mistaken for padding. Tags
        # after the first pad are ignored and never looked up.
        indices = []
        for tag in tag_sent:
            if tag == pad_value:
                break
            indices.append(int(dictionary[tag]))

        one_hot = np.zeros((len(tag_sent), num_classes))
        # Integer dtype is forced so an empty prefix (sentence that is all
        # padding) is still a valid index array.
        columns = np.asarray(indices, dtype=np.intp)
        one_hot[np.arange(len(indices)), columns] = 1
        batch_tags.append(one_hot)
    return np.array(batch_tags)