-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprocess_sentence_corpus_st.py
More file actions
87 lines (63 loc) · 2.87 KB
/
process_sentence_corpus_st.py
File metadata and controls
87 lines (63 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import argparse
import numpy as np
from sentence_transformers import SentenceTransformer, models
import read_files as read
from Pooling_custom import Pooling as Pooling
from transformer_custom import Transformer
def main(model_path, model_type, start, sentence_corpus, output_path, *,
         max_seq_length=16, batch_size=1024, num_workers=8):
    """Embed every sentence of a TSV corpus and save the result with NumPy.

    Args:
        model_path: Model location, handed either to the custom
            ``Transformer`` or directly to ``SentenceTransformer``.
        model_type: ``"bert"`` (case-insensitive) builds the embedder from the
            custom ``Transformer`` + ``Pooling`` modules; any other value
            loads ``model_path`` directly with ``SentenceTransformer``.
        start: Forwarded unchanged as the second argument of the custom
            ``Transformer``; only used on the ``"bert"`` path.
        sentence_corpus: Path given to ``read.read_from_tsv``.
        output_path: Destination given to ``np.save`` (NumPy appends ``.npy``
            when the name does not already end with it).
        max_seq_length: Value assigned to ``embedder.max_seq_length`` before
            encoding. Defaults to the previously hard-coded 16.
        batch_size: Batch size passed to ``embedder.encode``. Defaults to the
            previously hard-coded 1024.
        num_workers: ``num_workers`` passed to ``embedder.encode``. Defaults
            to the previously hard-coded 8.
    """
    # read_from_tsv yields rows (iterables of cells); flatten them into one
    # list of sentences.
    rows = read.read_from_tsv(sentence_corpus)
    sentences = [item for row in rows for item in row]
    print(sentences[:10])

    if model_type.lower() == "bert":
        # Plain BERT checkpoint: assemble the sentence encoder from the
        # project's custom transformer and pooling modules, with only the
        # mean-over-mark-tokens pooling mode switched on.
        word_embedding_model = Transformer(model_path, start)
        pooling_model = Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=False,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False,
            pooling_mode_mean_mark_tokens=True)
        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        # Anything else is loaded as a ready-made sentence-transformers model.
        embedder = SentenceTransformer(model_path)

    # NOTE(review): the length cap is applied to both model types here;
    # confirm this matches the intended behaviour for the "bert" path.
    embedder.max_seq_length = max_seq_length
    sentences_embedding = embedder.encode(sentences,
                                          batch_size=batch_size,
                                          show_progress_bar=True,
                                          num_workers=num_workers)

    # create_folder is a project helper; presumably it makes sure the
    # destination directory exists before saving -- verify in read_files.
    read.create_folder(output_path)
    np.save(output_path, sentences_embedding)
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser(
        description=
        'Generate sentence embedding for each sentence in the sentence corpus '
    )
    parser.add_argument('--model',
                        help='the directory of the model',
                        required=True)
    parser.add_argument(
        '--model_type',
        help='the type of the model: "bert" builds the encoder from the '
        'custom Transformer and Pooling modules, any other value (e.g. '
        'sentence_bert) loads the model with SentenceTransformer',
        required=True)
    parser.add_argument('--sentences',
                        help='the path of the sentence corpus (TSV file)',
                        required=True)
    parser.add_argument('--output',
                        help='the path where the sentence embeddings are '
                        'saved (NumPy .npy file)',
                        required=True)
    # Boolean switch: absent -> False, present -> True.
    parser.add_argument('--start',
                        help='flag passed to the custom Transformer; only '
                        'used when --model_type is bert',
                        default=False,
                        action='store_true')
    args = parser.parse_args()

    model_path = args.model
    model_type = args.model_type
    sentence_corpus = args.sentences
    output_path = args.output
    start = args.start

    main(model_path, model_type, start, sentence_corpus, output_path)