
Commit dba5855

Bugfixes for EvaluateModel and MatFileSampler (#168)
* prelim changes to the evaluatemodel class, need to remove print statements
* bugfix evalmodel for use_features_ord parameter
* address off-by-1 error in mat file sampler
* adjust the if statement and remove sorting on test performance.txt feature scores
* fix format issue in metrics
* update use_features_ord documentation with more information
1 parent 0da6430 commit dba5855

3 files changed

Lines changed: 41 additions & 32 deletions


selene_sdk/evaluate_model.py

Lines changed: 27 additions & 17 deletions
@@ -59,7 +59,12 @@ class EvaluateModel(object):
         Default is None. Specify an ordered list of features for which to
         run the evaluation. The features in this list must be identical to or
         a subset of `features`, and in the order you want the resulting
-        `test_targets.npz` and `test_predictions.npz` to be saved.
+        `test_targets.npz` and `test_predictions.npz` to be saved. If using
+        a FileSampler or H5DataLoader for the evaluation, you can pass in
+        a dataset with the targets matrix only containing these features, but
+        note that this subsetted targets matrix MUST be ordered the same
+        way as `features`, and the predictions and targets .npz output
+        will be reordered according to `use_features_ord`.
 
     Attributes
     ----------
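After evaluation, the saved arrays line up column-for-column with the features written to `use_features_ord.txt`. A minimal sketch of reading them back (the `data` key and file names match those used later in this diff; the output directory name is hypothetical):

    import numpy as np

    # "evaluation/" is a hypothetical output_dir
    predictions = np.load("evaluation/test_predictions.npz")["data"]
    targets = np.load("evaluation/test_targets.npz")["data"]
    with open("evaluation/use_features_ord.txt") as file_handle:
        ordered_features = [line.strip() for line in file_handle]

    # column i of both arrays corresponds to ordered_features[i]
    assert predictions.shape[1] == targets.shape[1] == len(ordered_features)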
@@ -117,17 +122,14 @@ def __init__(self,
         self.output_dir = output_dir
         os.makedirs(output_dir, exist_ok=True)
 
-        self.features = features
+        self.features = np.array(features)
         self._use_ixs = list(range(len(features)))
         if use_features_ord is not None:
             feature_ixs = {f: ix for (ix, f) in enumerate(features)}
             self._use_ixs = []
-            self.features = []
-
             for f in use_features_ord:
                 if f in feature_ixs:
                     self._use_ixs.append(feature_ixs[f])
-                    self.features.append(f)
                 else:
                     warnings.warn(("Feature {0} in `use_features_ord` "
                                    "does not match any features in the list "
@@ -157,11 +159,23 @@ def __init__(self,
157159

158160
self._test_data, self._all_test_targets = \
159161
self.sampler.get_data_and_targets(self.batch_size, n_test_samples)
160-
# TODO: we should be able to do this on the sampler end instead of
161-
# here. the current workaround is problematic, since
162-
# self._test_data still has the full featureset in it, and we
163-
# select the subset during `evaluate`
164-
self._all_test_targets = self._all_test_targets[:, self._use_ixs]
162+
163+
self._use_testmat_ixs = self._use_ixs[:]
164+
# if the targets shape is the same as the subsetted features,
165+
# reindex based on the subsetted list
166+
if self._all_test_targets.shape[1] == len(self._use_ixs):
167+
subset_features = {self.features[ix]: i for (i, ix) in
168+
enumerate(sorted(self._use_ixs))}
169+
self._use_testmat_ixs = [
170+
subset_features[f] for f in self.features[self._use_ixs]]
171+
172+
self._all_test_targets = self._all_test_targets[
173+
:, self._use_testmat_ixs]
174+
175+
# save the targets dataset now
176+
np.savez_compressed(
177+
os.path.join(self.output_dir, "test_targets.npz"),
178+
data=self._all_test_targets)
165179

166180
# reset Genome base ordering when applicable.
167181
if (hasattr(self.sampler, "reference_sequence") and
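The new branch covers samplers whose targets matrix already contains only the requested features: those columns follow `features` order, so the indices are remapped before the matrix is subsetted and saved, and the output ends up in `use_features_ord` order. A minimal sketch of the remapping with hypothetical values:

    import numpy as np

    features = np.array(["CTCF", "DNase", "H3K27ac", "H3K4me3"])  # hypothetical
    use_ixs = [2, 0]  # i.e. use_features_ord = ["H3K27ac", "CTCF"]

    # targets already subsetted to the used features, columns in `features`
    # order: column 0 = CTCF, column 1 = H3K27ac
    all_test_targets = np.array([[1, 0],
                                 [0, 1],
                                 [1, 1]])

    use_testmat_ixs = use_ixs[:]
    if all_test_targets.shape[1] == len(use_ixs):
        # column position of each used feature within the subsetted matrix
        subset_features = {features[ix]: i
                           for (i, ix) in enumerate(sorted(use_ixs))}
        use_testmat_ixs = [subset_features[f] for f in features[use_ixs]]

    # use_testmat_ixs == [1, 0]: the saved matrix follows use_features_ord
    reordered = all_test_targets[:, use_testmat_ixs]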
@@ -179,7 +193,7 @@ def _write_features_ordered_to_file(self):
         """
         fp = os.path.join(self.output_dir, 'use_features_ord.txt')
         with open(fp, 'w+') as file_handle:
-            for f in self.features:
+            for f in self.features[self._use_ixs]:
                 file_handle.write('{0}\n'.format(f))
 
     def _get_feature_from_index(self, index):
@@ -196,7 +210,7 @@ def _get_feature_from_index(self, index):
             The name of the feature/target at the specified index.
 
         """
-        return self.features[index]
+        return self.features[self._use_ixs][index]
 
     def evaluate(self):
         """
@@ -216,7 +230,7 @@ def evaluate(self):
         all_predictions = []
         for (inputs, targets) in self._test_data:
             inputs = torch.Tensor(inputs)
-            targets = torch.Tensor(targets[:, self._use_ixs])
+            targets = torch.Tensor(targets[:, self._use_testmat_ixs])
 
             if self.use_cuda:
                 inputs = inputs.cuda()
@@ -246,10 +260,6 @@ def evaluate(self):
246260
os.path.join(self.output_dir, "test_predictions.npz"),
247261
data=all_predictions)
248262

249-
np.savez_compressed(
250-
os.path.join(self.output_dir, "test_targets.npz"),
251-
data=self._all_test_targets)
252-
253263
loss = np.average(batch_losses)
254264
logger.info("test loss: {0}".format(loss))
255265
for name, score in average_scores.items():

selene_sdk/samplers/file_samplers/mat_file_sampler.py

Lines changed: 8 additions & 10 deletions
@@ -105,8 +105,7 @@ def __init__(self,
         self._tgts_batch_axis = targets_batch_axis
         self.n_samples = self._sample_seqs.shape[self._seq_batch_axis]
 
-        self._sample_indices = np.arange(
-            self.n_samples).tolist()
+        self._sample_indices = np.arange(self.n_samples).tolist()
         self._sample_next = 0
 
         self._shuffle = shuffle
@@ -138,7 +137,7 @@ def sample(self, batch_size=1):
         """
         sample_up_to = self._sample_next + batch_size
         use_indices = None
-        if sample_up_to >= len(self._sample_indices):
+        if sample_up_to > len(self._sample_indices):
             if self._shuffle:
                 np.random.shuffle(self._sample_indices)
             self._sample_next = 0
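The comparison change matters when the remaining indices exactly fill the requested batch: with `>=`, the reset (and optional reshuffle) fired even though the batch could still be served from the current position, so the last indices of a pass were skipped. A simplified sketch of that boundary case under the fixed check (not the full `sample` implementation):

    import numpy as np

    sample_indices = np.arange(10).tolist()  # 10 samples, hypothetical
    sample_next, batch_size = 8, 2

    sample_up_to = sample_next + batch_size  # 10 == len(sample_indices)
    # old check `sample_up_to >= len(sample_indices)` would reset here,
    # so indices 8 and 9 were never served on this pass
    if sample_up_to > len(sample_indices):   # fixed check: no reset needed
        sample_next, sample_up_to = 0, batch_size
    use_indices = sample_indices[sample_next:sample_up_to]  # [8, 9]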
@@ -237,19 +236,18 @@ def get_data_and_targets(self, batch_size, n_samples=None):
                 "initialization. Please use `get_data` instead.")
         if not n_samples:
             n_samples = self.n_samples
+
         sequences_and_targets = []
         targets_mat = []
 
-        count = batch_size
+        count = 0
         while count < n_samples:
-            seqs, tgts = self.sample(batch_size=batch_size)
+            sample_size = min(n_samples - count, batch_size)
+            seqs, tgts = self.sample(batch_size=sample_size)
             sequences_and_targets.append((seqs, tgts))
             targets_mat.append(tgts)
-            count += batch_size
-        remainder = batch_size - (count - n_samples)
-        seqs, tgts = self.sample(batch_size=remainder)
-        sequences_and_targets.append((seqs, tgts))
-        targets_mat.append(tgts)
+            count += sample_size
+
         # TODO: should not assume targets are always integers
         targets_mat = np.vstack(targets_mat).astype(float)
         return sequences_and_targets, targets_mat
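The rewritten accumulation starts `count` at zero and caps each request at the number of samples still needed, replacing the separate "remainder" draw after the loop. A standalone sketch of the new logic with a stubbed-out sampler (the stub is hypothetical, not part of the module):

    import numpy as np

    def fake_sample(batch_size):
        # hypothetical stand-in for MatFileSampler.sample: returns
        # `batch_size` one-hot sequences and targets
        return np.zeros((batch_size, 100, 4)), np.zeros((batch_size, 3))

    n_samples, batch_size = 10, 4
    sequences_and_targets = []
    targets_mat = []

    count = 0
    while count < n_samples:
        # never request more than the number of samples still needed
        sample_size = min(n_samples - count, batch_size)
        seqs, tgts = fake_sample(batch_size=sample_size)
        sequences_and_targets.append((seqs, tgts))
        targets_mat.append(tgts)
        count += sample_size

    targets_mat = np.vstack(targets_mat).astype(float)
    assert targets_mat.shape[0] == n_samples  # batches of 4, 4, and 2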

selene_sdk/utils/performance_metrics.py

Lines changed: 6 additions & 5 deletions
@@ -250,7 +250,7 @@ def get_feature_specific_scores(data, get_feature_from_index_fn):
 
 def auc_u_test(labels, predictions):
     """
-    Outputs the area under the the ROC curve associated with a certain 
+    Outputs the area under the the ROC curve associated with a certain
     set of labels and the predictions given by the training model.
     Computed from the U statistic.
 

(This hunk and the next strip trailing whitespace only; the visible text is unchanged.)
@@ -265,8 +265,8 @@ def auc_u_test(labels, predictions):
     Returns
     -------
     float
-        AUC value of given label, prediction pairs 
-
+        AUC value of given label, prediction pairs
+
     """
     len_pos = int(np.sum(labels))
     len_neg = len(labels) - len_pos
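For reference, the U-statistic formulation that `auc_u_test` relies on: rank all predictions, sum the ranks of the positive examples, and normalize by the number of positive/negative pairs. A minimal sketch in plain SciPy/NumPy (not the function body from this file):

    import numpy as np
    from scipy.stats import rankdata

    def auc_from_u(labels, predictions):
        # AUC = (R_pos - n_pos*(n_pos + 1)/2) / (n_pos * n_neg),
        # where R_pos is the rank sum of the positive examples
        labels = np.asarray(labels)
        predictions = np.asarray(predictions)
        n_pos = int(np.sum(labels))
        n_neg = len(labels) - n_pos
        ranks = rankdata(predictions)          # average ranks handle ties
        rank_sum_pos = np.sum(ranks[labels == 1])
        u = rank_sum_pos - n_pos * (n_pos + 1) / 2
        return u / (n_pos * n_neg)

    # small sanity check
    print(auc_from_u([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8]))  # 0.75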
@@ -316,7 +316,8 @@ class PerformanceMetrics(object):
     def __init__(self,
                  get_feature_from_index_fn,
                  report_gt_feature_n_positives=10,
-                 metrics=dict(roc_auc=roc_auc_score, average_precision=average_precision_score)):
+                 metrics=dict(roc_auc=roc_auc_score,
+                              average_precision=average_precision_score)):
         """
         Creates a new object of the `PerformanceMetrics` class.
         """
@@ -467,7 +468,7 @@ def write_feature_scores_to_file(self, output_path):
         cols = '\t'.join(["class"] + metric_cols)
         with open(output_path, 'w+') as file_handle:
             file_handle.write("{0}\n".format(cols))
-            for feature, metric_scores in sorted(feature_scores.items()):
+            for feature, metric_scores in feature_scores.items():
                 if not metric_scores:
                     file_handle.write("{0}\t{1}\n".format(feature, "\t".join(["NA"] * len(metric_cols))))
                 else:
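Dropping `sorted()` means the rows of the feature scores file follow the insertion order of `feature_scores` (dicts preserve insertion order in Python 3.7+), which lines up with `use_features_ord` rather than alphabetical order. A quick illustration with hypothetical scores:

    # hypothetical values, inserted in use_features_ord order
    feature_scores = {"H3K27ac": {"roc_auc": 0.91},
                      "CTCF": {"roc_auc": 0.88}}

    # sorted(feature_scores.items()) would emit CTCF before H3K27ac;
    # plain .items() keeps the requested order
    for feature, metric_scores in feature_scores.items():
        print(feature, metric_scores["roc_auc"])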
