Skip to content

Commit e17b1bb

Browse files
committed
add: expected_error_reduction implementation finished, tests added
1 parent d61b500 commit e17b1bb

File tree

4 files changed

+191
-144
lines changed

4 files changed

+191
-144
lines changed

modAL/expected_error.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""
2+
Expected error reduction framework for active learning.
3+
"""
4+
5+
from typing import Tuple
6+
7+
import numpy as np
8+
9+
from scipy.stats import entropy
10+
11+
from sklearn.base import clone
12+
from sklearn.exceptions import NotFittedError
13+
14+
from modAL.models import ActiveLearner
15+
from modAL.utils.data import modALinput, data_vstack
16+
from modAL.utils.selection import multi_argmax
17+
18+
19+
def expected_error_reduction(learner: ActiveLearner, X: modALinput,
                             p_subsample: float = 1.0, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
    """
    Expected error reduction query strategy.

    For every candidate sample and every possible label, the estimator is
    refitted on the training data extended with that (sample, label) pair, and
    the resulting pool uncertainty is accumulated, weighted by the current
    predicted probability of the label. The samples with the *smallest*
    expected error are the most informative and are selected for labelling.

    References:
        Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf)

    Args:
        learner: The ActiveLearner object for which the expected error is to be estimated.
        X: The samples.
        p_subsample: Probability of keeping a sample from the pool when calculating expected error.
            Significantly improves runtime for large sample pools.
        n_instances: The number of instances to be sampled.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'

    expected_error = np.zeros(shape=(len(X), ))
    possible_labels = np.unique(learner.y_training)

    try:
        X_proba = learner.predict_proba(X)
    except NotFittedError:
        # TODO: implement a proper cold-start
        return 0, X[0]

    for x_idx, x in enumerate(X):
        # subsample the pool if needed
        if np.random.rand() <= p_subsample:
            # estimate the expected error after labelling x with each possible
            # label, weighted by the current probability of that label
            for y_idx, y in enumerate(possible_labels):
                X_new = data_vstack((learner.X_training, x.reshape(1, -1)))
                y_new = data_vstack((learner.y_training, np.array(y).reshape(1, )))

                refitted_estimator = clone(learner.estimator).fit(X_new, y_new)
                uncertainty = 1 - np.max(refitted_estimator.predict_proba(X), axis=1)

                expected_error[x_idx] += np.sum(uncertainty)*X_proba[x_idx, y_idx]

        else:
            # mark subsampled-away instances with infinite error so they can
            # never be selected (the original `expected_error[x_idx] -np.nan`
            # was a no-op expression statement)
            expected_error[x_idx] = np.inf

    # the *lowest* expected error is the most informative choice, so maximize
    # the negated errors
    query_idx = multi_argmax(-expected_error, n_instances)

    return query_idx, X[query_idx]

modAL/expected_error_reduction.py

Lines changed: 0 additions & 37 deletions
This file was deleted.

modAL/utils/data.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Union
1+
from typing import Union, Container
22
from itertools import chain
33

44
import numpy as np
@@ -8,7 +8,7 @@
88
modALinput = Union[list, np.ndarray, sp.csr_matrix]
99

1010

11-
def data_vstack(blocks: modALinput) -> modALinput:
11+
def data_vstack(blocks: Container) -> modALinput:
1212
"""
1313
Stack vertically both sparse and dense arrays.
1414

tests/core_tests.py

Lines changed: 119 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@
55
import mock
66
import modAL.models.base
77
import modAL.models.learners
8-
import modAL.uncertainty
9-
import modAL.disagreement
10-
import modAL.density
118
import modAL.utils.selection
129
import modAL.utils.validation
1310
import modAL.utils.combination
11+
import modAL.acquisition
12+
import modAL.batch
13+
import modAL.density
14+
import modAL.disagreement
15+
import modAL.expected_error
1416
import modAL.multilabel
17+
import modAL.uncertainty
1518

1619
from copy import deepcopy
1720
from itertools import chain, product
@@ -280,6 +283,119 @@ def test_selection(self):
280283
modAL.acquisition.max_UCB(optimizer, X, beta=np.random.rand(), n_instances=n_instances)
281284

282285

286+
class TestDensity(unittest.TestCase):
    """Tests for the information-density utilities in modAL.density."""

    def test_similarize_distance(self):
        from scipy.spatial.distance import cosine
        similarity = modAL.density.similarize_distance(cosine)
        # the similarity transform must equal 1/(1 + d) for the wrapped distance
        for _, dim in product(range(100), range(1, 10)):
            vec_a, vec_b = np.random.rand(dim), np.random.rand(dim)
            expected = 1/(1 + cosine(vec_a, vec_b))
            np.testing.assert_almost_equal(similarity(vec_a, vec_b), expected)

    def test_information_density(self):
        # one density value is returned per pool sample, whatever the dimension
        for pool_size, dim in product(range(1, 10), range(1, 10)):
            pool = np.random.rand(pool_size, dim)
            densities = modAL.density.information_density(pool)
            np.testing.assert_equal(len(densities), pool_size)
class TestDisagreements(unittest.TestCase):
    """Tests for the committee disagreement measures in modAL.disagreement."""

    def test_vote_entropy(self):
        for n_samples in range(1, 10):
            for n_classes in range(1, 10):
                for true_query_idx in range(n_samples):
                    # 1. fitted committee: exactly one sample gets maximally
                    # disagreeing votes, all others get unanimous label 0
                    vote_return = np.zeros(shape=(n_samples, n_classes), dtype=np.int16)
                    vote_return[true_query_idx] = np.asarray(range(n_classes), dtype=np.int16)
                    committee = mock.MockCommittee(classes_=np.asarray(range(n_classes)), vote_return=vote_return)
                    vote_entr = modAL.disagreement.vote_entropy(
                        committee, np.random.rand(n_samples, n_classes)
                    )
                    true_entropy = np.zeros(shape=(n_samples, ))
                    true_entropy[true_query_idx] = entropy(np.ones(n_classes)/n_classes)
                    np.testing.assert_array_almost_equal(vote_entr, true_entropy)

                    # 2. unfitted committee: disagreement must be all zeros
                    committee = mock.MockCommittee(fitted=False)
                    true_entropy = np.zeros(shape=(n_samples,))
                    vote_entr = modAL.disagreement.vote_entropy(
                        committee, np.random.rand(n_samples, n_classes)
                    )
                    np.testing.assert_almost_equal(vote_entr, true_entropy)

    def test_consensus_entropy(self):
        for n_samples in range(1, 10):
            for n_classes in range(2, 10):
                for true_query_idx in range(n_samples):
                    # 1. fitted committee: one sample has a uniform consensus
                    # distribution, all others are fully certain of class 0
                    proba = np.zeros(shape=(n_samples, n_classes))
                    proba[:, 0] = 1.0
                    proba[true_query_idx] = np.ones(n_classes)/n_classes
                    committee = mock.MockCommittee(predict_proba_return=proba)
                    consensus_entropy = modAL.disagreement.consensus_entropy(
                        committee, np.random.rand(n_samples, n_classes)
                    )
                    true_entropy = np.zeros(shape=(n_samples,))
                    true_entropy[true_query_idx] = entropy(np.ones(n_classes) / n_classes)
                    np.testing.assert_array_almost_equal(consensus_entropy, true_entropy)

                    # 2. unfitted committee: disagreement must be all zeros
                    committee = mock.MockCommittee(fitted=False)
                    true_entropy = np.zeros(shape=(n_samples,))
                    consensus_entropy = modAL.disagreement.consensus_entropy(
                        committee, np.random.rand(n_samples, n_classes)
                    )
                    np.testing.assert_almost_equal(consensus_entropy, true_entropy)

    def test_KL_max_disagreement(self):
        for n_samples in range(1, 10):
            for n_classes in range(2, 10):
                for n_learners in range(2, 10):
                    # 1. fitted committee: all learners agree perfectly on
                    # class 0, so the KL disagreement should vanish
                    vote_proba = np.zeros(shape=(n_samples, n_learners, n_classes))
                    vote_proba[:, :, 0] = 1.0
                    committee = mock.MockCommittee(
                        n_learners=n_learners, classes_=range(n_classes),
                        vote_proba_return=vote_proba
                    )

                    true_KL_disagreement = np.zeros(shape=(n_samples, ))

                    try:
                        np.testing.assert_array_almost_equal(
                            true_KL_disagreement,
                            modAL.disagreement.KL_max_disagreement(committee, np.random.rand(n_samples, 1))
                        )
                    except AssertionError:
                        # numerical noise can spoil the exact-zero comparison;
                        # fall back to checking that the call at least runs
                        # cleanly (the original bare `except:` swallowed every
                        # error, hiding real failures)
                        modAL.disagreement.KL_max_disagreement(committee, np.random.rand(n_samples, 1))

                    # 2. unfitted committee: disagreement must be all zeros
                    committee = mock.MockCommittee(fitted=False)
                    true_KL_disagreement = np.zeros(shape=(n_samples,))
                    returned_KL_disagreement = modAL.disagreement.KL_max_disagreement(
                        committee, np.random.rand(n_samples, n_classes)
                    )
                    np.testing.assert_almost_equal(returned_KL_disagreement, true_KL_disagreement)
class TestEER(unittest.TestCase):
    """Tests for the expected error reduction query strategy."""

    def test_eer(self):
        for n_pool, n_features, n_classes in product(range(1, 10), range(1, 5), range(2, 5)):
            X_training = np.random.rand(10, n_features)
            y_training = np.random.randint(0, n_classes, size=10)
            # pool labels are not needed by the strategy, so none are generated
            # (the original built an unused y_pool)
            X_pool = np.random.rand(n_pool, n_features)

            learner = modAL.models.ActiveLearner(RandomForestClassifier(n_estimators=2),
                                                 X_training=X_training, y_training=y_training)

            # the strategy must run on a fitted learner and return one query
            # index with the corresponding sample (default n_instances=1)
            query_idx, query_sample = modAL.expected_error.expected_error_reduction(learner, X_pool)
            self.assertEqual(len(query_idx), 1)
283399
class TestUncertainties(unittest.TestCase):
284400

285401
def test_classifier_uncertainty(self):
@@ -383,107 +499,6 @@ def test_entropy_sampling(self):
383499
np.testing.assert_array_equal(query_idx, true_query_idx)
384500

385501

386-
class TestDensity(unittest.TestCase):
387-
388-
def test_similarize_distance(self):
389-
from scipy.spatial.distance import cosine
390-
sim = modAL.density.similarize_distance(cosine)
391-
for _ in range(100):
392-
for n_dim in range(1, 10):
393-
X_1, X_2 = np.random.rand(n_dim), np.random.rand(n_dim)
394-
np.testing.assert_almost_equal(
395-
sim(X_1, X_2),
396-
1/(1 + cosine(X_1, X_2))
397-
)
398-
399-
def test_information_density(self):
400-
for n_samples in range(1, 10):
401-
for n_dim in range(1, 10):
402-
X_pool = np.random.rand(n_samples, n_dim)
403-
similarities = modAL.density.information_density(X_pool)
404-
np.testing.assert_equal(len(similarities), n_samples)
405-
406-
407-
class TestDisagreements(unittest.TestCase):
408-
409-
def test_vote_entropy(self):
410-
for n_samples in range(1, 10):
411-
for n_classes in range(1, 10):
412-
for true_query_idx in range(n_samples):
413-
# 1. fitted committee
414-
vote_return = np.zeros(shape=(n_samples, n_classes), dtype=np.int16)
415-
vote_return[true_query_idx] = np.asarray(range(n_classes), dtype=np.int16)
416-
committee = mock.MockCommittee(classes_=np.asarray(range(n_classes)), vote_return=vote_return)
417-
vote_entr = modAL.disagreement.vote_entropy(
418-
committee, np.random.rand(n_samples, n_classes)
419-
)
420-
true_entropy = np.zeros(shape=(n_samples, ))
421-
true_entropy[true_query_idx] = entropy(np.ones(n_classes)/n_classes)
422-
np.testing.assert_array_almost_equal(vote_entr, true_entropy)
423-
424-
# 2. unfitted committee
425-
committee = mock.MockCommittee(fitted=False)
426-
true_entropy = np.zeros(shape=(n_samples,))
427-
vote_entr = modAL.disagreement.vote_entropy(
428-
committee, np.random.rand(n_samples, n_classes)
429-
)
430-
np.testing.assert_almost_equal(vote_entr, true_entropy)
431-
432-
def test_consensus_entropy(self):
433-
for n_samples in range(1, 10):
434-
for n_classes in range(2, 10):
435-
for true_query_idx in range(n_samples):
436-
# 1. fitted committee
437-
proba = np.zeros(shape=(n_samples, n_classes))
438-
proba[:, 0] = 1.0
439-
proba[true_query_idx] = np.ones(n_classes)/n_classes
440-
committee = mock.MockCommittee(predict_proba_return=proba)
441-
consensus_entropy = modAL.disagreement.consensus_entropy(
442-
committee, np.random.rand(n_samples, n_classes)
443-
)
444-
true_entropy = np.zeros(shape=(n_samples,))
445-
true_entropy[true_query_idx] = entropy(np.ones(n_classes) / n_classes)
446-
np.testing.assert_array_almost_equal(consensus_entropy, true_entropy)
447-
448-
# 2. unfitted committee
449-
committee = mock.MockCommittee(fitted=False)
450-
true_entropy = np.zeros(shape=(n_samples,))
451-
consensus_entropy = modAL.disagreement.consensus_entropy(
452-
committee, np.random.rand(n_samples, n_classes)
453-
)
454-
np.testing.assert_almost_equal(consensus_entropy, true_entropy)
455-
456-
def test_KL_max_disagreement(self):
457-
for n_samples in range(1, 10):
458-
for n_classes in range(2, 10):
459-
for n_learners in range (2, 10):
460-
# 1. fitted committee
461-
vote_proba = np.zeros(shape=(n_samples, n_learners, n_classes))
462-
vote_proba[:, :, 0] = 1.0
463-
committee = mock.MockCommittee(
464-
n_learners=n_learners, classes_=range(n_classes),
465-
vote_proba_return=vote_proba
466-
)
467-
468-
true_KL_disagreement = np.zeros(shape=(n_samples, ))
469-
470-
try:
471-
np.testing.assert_array_almost_equal(
472-
true_KL_disagreement,
473-
modAL.disagreement.KL_max_disagreement(committee, np.random.rand(n_samples, 1))
474-
)
475-
except:
476-
modAL.disagreement.KL_max_disagreement(committee, np.random.rand(n_samples, 1))
477-
478-
# 2. unfitted committee
479-
committee = mock.MockCommittee(fitted=False)
480-
true_KL_disagreement = np.zeros(shape=(n_samples,))
481-
returned_KL_disagreement = modAL.disagreement.KL_max_disagreement(
482-
committee, np.random.rand(n_samples, n_classes)
483-
)
484-
np.testing.assert_almost_equal(returned_KL_disagreement, true_KL_disagreement)
485-
486-
487502
class TestQueries(unittest.TestCase):
488503

489504
def test_multi_argmax(self):
@@ -963,7 +978,6 @@ def test_strategies(self):
963978
modAL.multilabel.avg_score(classifier, X_pool, n_query_instances)
964979

965980

966-
967981
class TestExamples(unittest.TestCase):
968982

969983
def test_examples(self):

0 commit comments

Comments
 (0)