-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcrf_text.py
More file actions
91 lines (69 loc) · 2.92 KB
/
crf_text.py
File metadata and controls
91 lines (69 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import numpy
from collections import Counter
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras.preprocessing.sequence import pad_sequences
from keras_contrib.layers import CRF
from keras_contrib.datasets import conll2000
# Number of training epochs passed to `model.fit` for both experiments.
EPOCHS = 10
# Second argument of the `Embedding` layer (size of each token embedding).
EMBED_DIM = 200
# Total BiLSTM width: each direction's LSTM gets `BiRNN_UNITS // 2` units.
BiRNN_UNITS = 200
# File used by `model.save` / `model.load_weights`; both experiments write to
# this same path, so the second run overwrites the first run's file.
save_path="crf_test.h5"
def classification_report(y_true, y_pred, labels):
    """Print per-class recall, precision, F1 score and support.

    Similar to the report in ``sklearn.metrics``. Class index ``i`` found in
    ``y_true`` / ``y_pred`` is displayed under the name ``labels[i]``.

    Args:
        y_true: Array-like of ground-truth class indices (flattened before use).
        y_pred: Array-like of predicted class indices (flattened before use).
        labels: Sequence of display names, one per class index.

    Raises:
        ValueError: If the flattened inputs do not have the same length.
    """
    # ravel() flattens like flatten(), but returns a view instead of a copy
    # whenever possible.
    y_true = numpy.asarray(y_true).ravel()
    y_pred = numpy.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        # zip() below would silently truncate to the shorter input and report
        # misleading recall/support figures.
        raise ValueError(
            'y_true and y_pred must have the same number of elements, '
            'got {} and {}'.format(y_true.size, y_pred.size))

    corrects = Counter(yt for yt, yp in zip(y_true, y_pred) if yt == yp)
    y_true_counts = Counter(y_true)
    y_pred_counts = Counter(y_pred)

    rows = []
    for i, lab in enumerate(labels):
        # max(1, ...) avoids division by zero for classes that never occur.
        recall = corrects[i] / max(1, y_true_counts[i])
        precision = corrects[i] / max(1, y_pred_counts[i])
        # max(1e-9, ...) keeps F1 at 0 when recall and precision are both 0.
        f1 = 2 * recall * precision / max(1e-9, recall + precision)
        rows.append((lab, recall, precision, f1, y_true_counts[i]))

    print('{:<15}{:>10}{:>10}{:>10}{:>10}\n'.format('', 'recall', 'precision', 'f1-score', 'support'))
    formatter = '{:<15}{:>10.2f}{:>10.2f}{:>10.2f}{:>10d}'.format
    for row in rows:
        print(formatter(*row))
    print('')
# ----
# Data
# ----
# CoNLL-2000 has two different targets; only the IOB-like chunking labels are
# used here as an example (the other target is discarded via `_`).
(train_x, _, train_y), (test_x, _, test_y), (vocab, _, class_labels) = conll2000.load_data()


def _fit_save_predict(model, crf):
    """Compile, train, save and reload `model`, then predict on the test set.

    Shared by both experiments below. Reads the module-level data
    (`train_x`, `train_y`, `test_x`, `test_y`) and settings (`EPOCHS`,
    `save_path`).

    Args:
        model: The model to train; its last layer is `crf`.
        crf: The CRF layer supplying the loss function and accuracy metric.

    Returns:
        A `(y_true, y_pred)` pair restricted to positions where `test_x > 0`.
    """
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    # Keras documents `validation_data` as a tuple; a list may be interpreted
    # as inputs only by some Keras versions.
    model.fit(train_x, train_y, epochs=EPOCHS, validation_data=(test_x, test_y))
    # Round-trip through disk to exercise saving and loading of the CRF model.
    model.save(save_path)
    model.load_weights(save_path)
    # The Embedding layers use mask_zero=True, so only positions with a
    # non-zero token index are scored (index 0 is presumably padding).
    mask = test_x > 0
    y_pred = model.predict(test_x).argmax(-1)[mask]
    y_true = test_y[mask]
    return y_true, y_pred


# --------------
# 1. Regular CRF
# --------------
print('==== training CRF ====')
model = Sequential()
model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
crf = CRF(len(class_labels), sparse_target=True)
model.add(crf)
test_y_true, test_y_pred = _fit_save_predict(model, crf)
print('\n---- Result of CRF ----\n')
classification_report(test_y_true, test_y_pred, class_labels)

# -------------
# 2. BiLSTM-CRF
# -------------
print('==== training BiLSTM-CRF ====')
model = Sequential()
model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
# Each direction gets half of BiRNN_UNITS.
model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
crf = CRF(len(class_labels), sparse_target=True)
model.add(crf)
test_y_true, test_y_pred = _fit_save_predict(model, crf)
print('\n---- Result of BiLSTM-CRF ----\n')
classification_report(test_y_true, test_y_pred, class_labels)