word2vec-pytorch/utils.py at master · cmsflash/word2vec-pytorch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import numpy as np
import scipy.stats
from torch import nn


def init_embedding_(weight, dimension):
    """Fill ``weight`` in place with small, zero-centred uniform values.

    Values are drawn uniformly from ``[-0.5 / dimension, 0.5 / dimension]``.

    Args:
        weight: Tensor to initialise; modified in place.
        dimension: Divisor that sets the half-width of the sampling
            interval (``0.5 / dimension``).
    """
    bound = 0.5 / dimension
    nn.init.uniform_(weight, a=-bound, b=bound)


def dump_embedding(embedding, dimension, word_from_id, path):
    """Write an embedding matrix to ``path`` as plain text.

    The first line holds the number of words and ``dimension`` separated by
    a space. Every following line holds one word followed by the components
    of its vector, all separated by single spaces.

    Args:
        embedding: Tensor indexed by word ID along its first axis
            (presumably a ``torch.Tensor`` or parameter).
        dimension: Value written to the header line as the vector size.
        word_from_id: Mapping from word ID to word; its iteration order is
            the order of the output lines.
        path: Destination file; it is overwritten if it already exists.
    """
    # detach() drops autograd tracking, so a tensor that requires grad can
    # still be converted to a NumPy array.
    embedding_array = embedding.detach().cpu().numpy()
    # Pin the encoding: words may contain non-ASCII characters, and the
    # platform default encoding is not guaranteed to be able to represent
    # them (or to be the same on the machine that reads the file back).
    with open(path, 'w', encoding='utf-8') as file_:
        print(f'{len(word_from_id)} {dimension}', file=file_)
        for id_, word in word_from_id.items():
            vector_string = ' '.join(str(x) for x in embedding_array[id_])
            print(f'{word} {vector_string}', file=file_)


def get_word_embedding(word, embedding, id_from_word):
    """Look up the vector for ``word``, falling back to its characters.

    Args:
        word: The word to embed.
        embedding: Array of vectors indexed by ID along its first axis.
        id_from_word: Mapping from a word (or single character) to its ID.

    Returns:
        The row of ``embedding`` for ``word`` when it is in ``id_from_word``.
        Otherwise the element-wise mean of one vector per character of
        ``word``: the character's own row when it is in ``id_from_word``,
        and a zero vector shaped like ``embedding[0]`` when it is not.
    """
    # Fast path: the whole word is in the vocabulary.
    if word in id_from_word:
        return embedding[id_from_word[word]]

    # Fallback: average the per-character vectors, using zeros for any
    # character that is itself unknown.
    pieces = [
        embedding[id_from_word[char]]
        if char in id_from_word
        else np.zeros_like(embedding[0])
        for char in word
    ]
    return np.array(pieces).mean(axis=0)


def compute_wordsim_rho(embedding, wordsim_tuples, id_from_word):
    """Return Spearman's rho between predicted and reference similarities.

    The predicted similarity of a word pair is the cosine similarity of the
    two vectors returned by ``get_word_embedding``.

    A pair whose cosine similarity is undefined (the product of the two
    vector norms is zero or NaN, e.g. when ``get_word_embedding`` falls back
    to all-zero character vectors) is left out of the correlation instead of
    contributing a NaN that would turn the whole result into NaN. The rho
    computed on the remaining pairs is then scaled by the fraction of pairs
    that were scored, so missing coverage lowers the reported value.

    Args:
        embedding: Array of vectors indexed by ID along its first axis.
        wordsim_tuples: Iterable of ``(word0, word1, actual_similarity)``.
        id_from_word: Mapping from a word (or single character) to its ID.

    Returns:
        The coverage-scaled Spearman rho, or NaN when fewer than two pairs
        could be scored (a rank correlation is undefined in that case).
    """
    # Materialise once so iterators/generators are accepted and the total
    # pair count is available for the coverage scaling below.
    wordsim_tuples = list(wordsim_tuples)
    predicted_similarities = []
    actual_similarities = []
    for word0, word1, actual_similarity in wordsim_tuples:
        embedding0 = get_word_embedding(word0, embedding, id_from_word)
        embedding1 = get_word_embedding(word1, embedding, id_from_word)
        norm_product = np.linalg.norm(embedding0) * np.linalg.norm(embedding1)
        # `not (x > 0)` is true for both zero and NaN: in either case the
        # cosine similarity is undefined, so the pair is skipped.
        if not norm_product > 0:
            continue
        predicted_similarities.append(
            np.dot(embedding0, embedding1) / norm_product
        )
        actual_similarities.append(actual_similarity)
    if len(actual_similarities) < 2:
        return float('nan')
    spearman_rho, _ = scipy.stats.spearmanr(
        actual_similarities, predicted_similarities
    )
    # Penalise pairs that could not be scored.
    return spearman_rho * len(actual_similarities) / len(wordsim_tuples)