Add sequence_tagging example #302
Conversation
Codecov Report
@@            Coverage Diff            @@
##           master     #302   +/-   ##
=======================================
  Coverage   79.76%   79.76%
=======================================
  Files         133      133
  Lines       11122    11122
=======================================
  Hits         8872     8872
  Misses       2250     2250
=======================================

Continue to review full report at Codecov.
import texar.torch as tx

# pylint: disable=redefined-outer-name, unused-variable
Why are these necessary? I can imagine unused-variable being required for the constants, but why redefined-outer-name? It's also better practice to add a corresponding pylint: enable comment after the point where the suppression is no longer needed.
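For reference, a scoped suppression might look like this (the constant names below are placeholders, not taken from the PR):

```python
# pylint: disable=unused-variable
UNK_WORD = "<unk>"   # hypothetical constants kept for readers of the data format
PAD_WORD = "<pad>"
# pylint: enable=unused-variable
```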
DIGIT_RE = re.compile(r"\d")

def create_vocabs(train_path, dev_path, test_path, normalize_digits=True,
Please add type annotations for functions.
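For example, the signature could be annotated roughly as follows; only the first line of the definition is quoted above, so the remaining parameters and the exact return types here are assumptions (glove_dict matches the call site shown further down):

```python
from typing import Dict, Optional, Tuple
import numpy as np

def create_vocabs(
    train_path: str,
    dev_path: str,
    test_path: str,
    normalize_digits: bool = True,
    # Assumed remaining parameter, inferred from the call site below.
    glove_dict: Optional[Dict[str, np.ndarray]] = None,
) -> Tuple[Tuple[Dict[str, int], Dict[str, int], Dict[str, int]],
           Tuple[Dict[int, str], Dict[int, str]]]:
    ...
```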
# Prepares/loads data
if config.load_glove:
    print('loading GloVe embedding...')
    glove_dict = load_glove(embedding_path, EMBEDD_DIM)
else:
    glove_dict = None

(word_vocab, char_vocab, ner_vocab), (i2w, i2n) = create_vocabs(
    train_path, dev_path, test_path, glove_dict=glove_dict)

data_train = read_data(train_path, word_vocab, char_vocab, ner_vocab)
data_dev = read_data(dev_path, word_vocab, char_vocab, ner_vocab)
data_test = read_data(test_path, word_vocab, char_vocab, ner_vocab)

scale = np.sqrt(3.0 / EMBEDD_DIM)
word_vecs = np.random.uniform(
    -scale, scale, [len(word_vocab), EMBEDD_DIM]).astype(np.float32)
if config.load_glove:
    word_vecs = construct_init_word_vecs(word_vocab, word_vecs, glove_dict)

scale = np.sqrt(3.0 / CHAR_DIM)
char_vecs = np.random.uniform(
    -scale, scale, [len(char_vocab), CHAR_DIM]).astype(np.float32)
Consider moving these into main as well.
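A rough sketch of that refactor, with the module-level preparation gathered into functions (the function names prepare_data and main are illustrative, not from the PR):

```python
def prepare_data():
    # GloVe is only loaded when the config asks for it.
    glove_dict = (load_glove(embedding_path, EMBEDD_DIM)
                  if config.load_glove else None)
    (word_vocab, char_vocab, ner_vocab), (i2w, i2n) = create_vocabs(
        train_path, dev_path, test_path, glove_dict=glove_dict)

    # Embedding initialization moves out of module scope as well.
    scale = np.sqrt(3.0 / EMBEDD_DIM)
    word_vecs = np.random.uniform(
        -scale, scale, [len(word_vocab), EMBEDD_DIM]).astype(np.float32)
    if config.load_glove:
        word_vecs = construct_init_word_vecs(word_vocab, word_vecs, glove_dict)
    return (word_vocab, char_vocab, ner_vocab), (i2w, i2n), word_vecs


def main():
    vocabs, index_maps, word_vecs = prepare_data()
    ...  # build the model and run training with these


if __name__ == "__main__":
    main()
```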
return word_vecs


class CoNLLReader:
Can we modify this to use tx.data.Dataset? This would eliminate the need for separate read_data and iterate_batch methods.
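A minimal sketch of that direction, assuming the custom-dataset interface in texar-pytorch (a DatasetBase subclass with process and collate hooks, fed by a data source); the class names, hook names, and the way CoNLL sentences are grouped are assumptions here, and the actual parsing is elided:

```python
import texar.torch as tx

class CoNLLDataset(tx.data.DatasetBase):
    def __init__(self, file_path, word_vocab, char_vocab, ner_vocab,
                 hparams=None, device=None):
        # Assumes a line-based source; grouping lines into sentences is elided.
        source = tx.data.TextLineDataSource(file_path)
        self._vocabs = (word_vocab, char_vocab, ner_vocab)
        super().__init__(source, hparams, device)

    def process(self, raw_example):
        # Map one raw sentence to word ids, char ids, and NER ids (elided).
        ...

    def collate(self, examples):
        # Pad the processed examples and return a batch object (elided).
        ...
```

With something like this, tx.data.DataIterator could presumably take over batching and shuffling, which is what would make the separate read_data and iterate_batch methods unnecessary.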
ner_tags, ner_ids)


class NERInstance:
This and Sentence below can be changed to NamedTuples, which also support custom methods.
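For example (the field names below are guesses based on the surrounding excerpts, not the actual attributes in the PR):

```python
from typing import List, NamedTuple

class NERInstance(NamedTuple):
    word_ids: List[int]
    char_ids: List[List[int]]
    ner_ids: List[int]

    # typing.NamedTuple subclasses can still define custom methods.
    def length(self) -> int:
        return len(self.word_ids)
```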
yield wid_inputs, cid_inputs, nid_inputs, masks, lengths


def load_glove(filename, emb_dim, normalize_digits=True):
I thought we had a load_glove method inside tx.data? What are the differences here?
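For context, a standalone loader of this shape usually just parses the text-format GloVe file into a dict. A minimal sketch of what the local helper presumably does (this is a guess at the example's implementation, not the tx.data API; DIGIT_RE is the module-level pattern shown earlier):

```python
from typing import Dict
import numpy as np

def load_glove(filename: str, emb_dim: int,
               normalize_digits: bool = True) -> Dict[str, np.ndarray]:
    glove_dict = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            if normalize_digits:
                word = DIGIT_RE.sub('0', word)  # same digit normalization as the vocab
            vec = np.asarray(tokens[1:], dtype=np.float32)
            if vec.shape[0] == emb_dim:         # skip malformed lines
                glove_dict[word] = vec
    return glove_dict
```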
self.dense_2 = nn.Linear(in_features=config.tag_space,
                         out_features=len(ner_vocab))

def forward(self, inputs, chars, targets, masks, seq_lengths, mode):
def start(self, file_path):
    self.__source_file = open(file_path, 'w', encoding='utf-8')

def close(self):
    self.__source_file.close()
We could change these to __enter__ and __exit__ and use the with writer.open(path) as f context manager pattern. It's also fine to keep it as is if you're more comfortable with this.
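A minimal sketch of that pattern; the class name, constructor arguments, and the write method below are assumptions, not the PR's actual writer:

```python
class CoNLLWriter:
    def __init__(self, i2w, i2n):
        self._i2w, self._i2n = i2w, i2n
        self._file = None

    def open(self, file_path):
        self._file = open(file_path, 'w', encoding='utf-8')
        return self  # lets "with writer.open(path) as f:" work directly

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self._file is not None:
            self._file.close()
            self._file = None


# Usage: the file is closed automatically, even if writing raises.
writer = CoNLLWriter(i2w, i2n)
with writer.open(output_path) as f:
    f.write_sentence(...)  # hypothetical write method
```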
Adapted from sequence_tagging in texar-tf.